마이크로서비스 계측, AI 서비스 관측, SLO 알림을 종합하여 전체 관측 가능성 플랫폼을 구축하고, 운영 체크리스트와 비용 최적화 전략을 정리합니다.
이번 프로젝트에서 구축할 전체 관측 가능성 플랫폼의 아키텍처입니다.
from opentelemetry import trace, metrics
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import BatchSpanProcessor
from opentelemetry.sdk.metrics import MeterProvider
from opentelemetry.sdk.metrics.export import PeriodicExportingMetricReader
from opentelemetry.sdk._logs import LoggerProvider, LoggingHandler
from opentelemetry.sdk._logs.export import BatchLogRecordProcessor
from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter
from opentelemetry.exporter.otlp.proto.grpc.metric_exporter import OTLPMetricExporter
from opentelemetry.exporter.otlp.proto.grpc._log_exporter import OTLPLogExporter
from opentelemetry.sdk.resources import Resource
from opentelemetry.instrumentation.fastapi import FastAPIInstrumentor
from opentelemetry.instrumentation.httpx import HTTPXClientInstrumentor
import logging
import os
def init_gateway_telemetry(app):
    """Initialise OpenTelemetry traces, metrics and logs for the API gateway.

    All three signals are exported over OTLP/gRPC to the collector endpoint
    given by OTEL_EXPORTER_OTLP_ENDPOINT (defaults to a local collector).
    Also auto-instruments the FastAPI app and outgoing httpx calls.

    Args:
        app: the FastAPI application instance to instrument.
    """
    endpoint = os.environ.get("OTEL_EXPORTER_OTLP_ENDPOINT", "http://localhost:4317")
    # Shared resource attributes identify this service in every backend.
    resource = Resource.create({
        "service.name": "api-gateway",
        "service.version": "1.0.0",
        "deployment.environment": os.environ.get("ENV", "development"),
    })
    # Traces
    tp = TracerProvider(resource=resource)
    tp.add_span_processor(
        BatchSpanProcessor(OTLPSpanExporter(endpoint=endpoint))
    )
    trace.set_tracer_provider(tp)
    # Metrics: pushed every 15 seconds.
    mr = PeriodicExportingMetricReader(
        OTLPMetricExporter(endpoint=endpoint),
        export_interval_millis=15000,
    )
    mp = MeterProvider(resource=resource, metric_readers=[mr])
    metrics.set_meter_provider(mp)
    # Logs: bridge the stdlib root logger into the OTLP log pipeline.
    lp = LoggerProvider(resource=resource)
    lp.add_log_record_processor(
        BatchLogRecordProcessor(OTLPLogExporter(endpoint=endpoint))
    )
    logging.getLogger().addHandler(
        LoggingHandler(level=logging.INFO, logger_provider=lp)
    )
    # Auto-instrumentation: inbound FastAPI requests, outbound httpx calls.
    FastAPIInstrumentor.instrument_app(app)
    HTTPXClientInstrumentor().instrument()
from fastapi import FastAPI, Request
from telemetry import init_gateway_telemetry
from opentelemetry import trace
import httpx
import logging
import time
# Application bootstrap: telemetry must be initialised before serving traffic
# so that auto-instrumentation wraps the app and the global providers are set.
app = FastAPI(title="API Gateway")
init_gateway_telemetry(app)
tracer = trace.get_tracer("api-gateway.routes")  # manual spans for routing logic
logger = logging.getLogger("api-gateway")
@app.post("/api/orders")
async def create_order(request: Request):
    """Forward an order-creation request to the order service.

    Wraps the proxy call in a routing span carrying the route and the
    number of order items, logs the outcome, and relays the upstream JSON.
    """
    payload = await request.json()
    item_count = len(payload.get("items", []))
    with tracer.start_as_current_span("gateway.route-order") as span:
        span.set_attribute("gateway.route", "/api/orders")
        span.set_attribute("order.item_count", item_count)
        async with httpx.AsyncClient() as client:
            upstream = await client.post(
                "http://order-service:8081/orders",
                json=payload,
                timeout=30.0,
            )
            logger.info("주문 요청 라우팅 완료", extra={
                "status_code": upstream.status_code,
                "order_items": item_count,
            })
            return upstream.json()
@app.get("/api/ai/recommend")
async def get_recommendation(query: str):
    """Proxy a recommendation query to the AI service inside a routing span."""
    with tracer.start_as_current_span("gateway.route-ai") as span:
        span.set_attribute("gateway.route", "/api/ai/recommend")
        span.set_attribute("ai.query_length", len(query))
        async with httpx.AsyncClient() as client:
            # Pass the query via params so httpx URL-encodes it; interpolating
            # it into the URL string broke queries containing '&', '#', '?',
            # spaces or non-ASCII characters.
            response = await client.get(
                "http://ai-service:8082/recommend",
                params={"query": query},
                timeout=60.0,  # LLM-backed calls can be slow; allow a long timeout
            )
            return response.json()
from flask import Flask, jsonify, request
from opentelemetry import trace, metrics
import logging
import time
app = Flask(__name__)
# (Telemetry initialisation follows the same pattern as the gateway.)
tracer = trace.get_tracer("order-service.handlers")
meter = metrics.get_meter("order-service.business")
logger = logging.getLogger("order-service")
# SLI metrics: request totals, good-request counts and latency per endpoint.
# Availability is later computed as good / total by the Prometheus SLO rules.
sli_request_total = meter.create_counter("sli.request.total")
sli_request_good = meter.create_counter("sli.request.good")
sli_request_duration = meter.create_histogram(
    "sli.request.duration", unit="s"
)
# Business metrics: order volume and order amounts in KRW.
order_counter = meter.create_counter("orders.created.total")
order_amount = meter.create_histogram("orders.amount", unit="KRW")
@app.route("/orders", methods=["POST"])
def create_order():
    """Create an order: validate, check inventory, then process payment.

    Each stage runs in its own child span. On success, records business
    metrics (order count / amount) and SLI metrics, and returns 201 with the
    payment result. On any failure, records the SLI total and duration only
    (so availability = good / total drops) and returns 500 with the error.
    """
    start = time.time()
    data = request.json
    endpoint = "/orders"
    try:
        with tracer.start_as_current_span("create-order") as span:
            span.set_attribute("order.item_count", len(data.get("items", [])))
            # Order validation
            with tracer.start_as_current_span("validate-order"):
                validate(data)
            # Inventory check
            with tracer.start_as_current_span("check-inventory"):
                check_inventory(data["items"])
            # Payment processing
            with tracer.start_as_current_span("process-payment") as pay_span:
                pay_span.set_attribute("payment.method", data.get("payment_method", "card"))
                pay_span.set_attribute("payment.amount", data["total"])
                result = process_payment(data)
            # Business metrics
            order_counter.add(1, {"payment.method": data.get("payment_method", "card")})
            order_amount.record(data["total"])
            logger.info("주문 생성 완료", extra={"order_id": result["order_id"]})
            # SLI bookkeeping: every request counts toward total; only
            # successful ones count toward good.
            sli_request_total.add(1, {"endpoint": endpoint})
            sli_request_good.add(1, {"endpoint": endpoint})
            sli_request_duration.record(time.time() - start, {"endpoint": endpoint})
            return jsonify(result), 201
    except Exception as e:
        sli_request_total.add(1, {"endpoint": endpoint})
        sli_request_duration.record(time.time() - start, {"endpoint": endpoint})
        # 5xx responses are deliberately NOT added to the "good" counter.
        logger.error("주문 생성 실패", extra={"error": str(e)})
        return jsonify({"error": str(e)}), 500
from flask import Flask, jsonify, request
from opentelemetry import trace, metrics
from opentelemetry.trace import SpanKind
import openai
import logging
import time
app = Flask(__name__)
tracer = trace.get_tracer("ai-service.handlers")
meter = metrics.get_meter("ai-service.llm")
logger = logging.getLogger("ai-service")
# LLM metrics: token usage, request latency and accumulated USD cost.
token_counter = meter.create_counter("llm.token.usage", unit="tokens")
llm_duration = meter.create_histogram("llm.request.duration", unit="s")
llm_cost = meter.create_counter("llm.request.cost", unit="USD")
# Published per-1K-token prices converted to per-token USD rates.
MODEL_PRICING = {
    "gpt-4o": {"input": 0.0025 / 1000, "output": 0.01 / 1000},
    "gpt-4o-mini": {"input": 0.00015 / 1000, "output": 0.0006 / 1000},
}
client = openai.OpenAI()  # reads OPENAI_API_KEY from the environment
@app.route("/recommend")
def recommend():
query = request.args.get("query", "")
model = "gpt-4o-mini"
with tracer.start_as_current_span("ai.recommend") as root_span:
root_span.set_attribute("ai.query", query[:200])
root_span.set_attribute("ai.model", model)
# 프롬프트 구성
with tracer.start_as_current_span("ai.build-prompt"):
messages = [
{"role": "system", "content": "기술 관련 질문에 대해 간결하게 답변합니다."},
{"role": "user", "content": query},
]
# LLM 호출
with tracer.start_as_current_span(
"llm.chat.completion",
kind=SpanKind.CLIENT,
) as llm_span:
llm_span.set_attribute("llm.model", model)
start = time.time()
try:
response = client.chat.completions.create(
model=model,
messages=messages,
temperature=0.3,
max_tokens=1000,
)
duration = time.time() - start
usage = response.usage
llm_span.set_attribute("llm.usage.input_tokens", usage.prompt_tokens)
llm_span.set_attribute("llm.usage.output_tokens", usage.completion_tokens)
llm_span.set_attribute("llm.usage.total_tokens", usage.total_tokens)
# 메트릭 기록
attrs = {"llm.model": model}
token_counter.add(usage.prompt_tokens, {**attrs, "token.type": "input"})
token_counter.add(usage.completion_tokens, {**attrs, "token.type": "output"})
llm_duration.record(duration, attrs)
pricing = MODEL_PRICING[model]
cost = usage.prompt_tokens * pricing["input"] + usage.completion_tokens * pricing["output"]
llm_cost.add(cost, attrs)
logger.info("LLM 호출 완료", extra={
"model": model,
"tokens": usage.total_tokens,
"duration_ms": int(duration * 1000),
})
return jsonify({
"recommendation": response.choices[0].message.content,
"model": model,
"tokens_used": usage.total_tokens,
})
except Exception as e:
llm_span.set_status(trace.StatusCode.ERROR, str(e))
llm_span.record_exception(e)
logger.error("LLM 호출 실패", extra={"model": model, "error": str(e)})
return jsonify({"error": "AI 서비스 일시 장애"}), 503receivers:
otlp:
protocols:
grpc:
endpoint: "0.0.0.0:4317"
http:
endpoint: "0.0.0.0:4318"
processors:
memory_limiter:
check_interval: 1s
limit_mib: 256
spike_limit_mib: 64
batch:
timeout: 1s
send_batch_size: 512
exporters:
otlp/gateway:
endpoint: "otel-gateway:4317"
tls:
insecure: true
service:
pipelines:
traces:
receivers: [otlp]
processors: [memory_limiter, batch]
exporters: [otlp/gateway]
metrics:
receivers: [otlp]
processors: [memory_limiter, batch]
exporters: [otlp/gateway]
logs:
receivers: [otlp]
processors: [memory_limiter, batch]
exporters: [otlp/gateway]receivers:
otlp:
protocols:
grpc:
endpoint: "0.0.0.0:4317"
processors:
memory_limiter:
check_interval: 1s
limit_mib: 1024
spike_limit_mib: 256
filter/healthcheck:
traces:
span:
- 'attributes["http.target"] == "/healthz"'
- 'attributes["http.target"] == "/readyz"'
logs:
log_record:
- 'IsMatch(body, ".*healthcheck.*")'
attributes/environment:
actions:
- key: deployment.cluster
value: "production-kr"
action: insert
- key: user.email
action: hash
tail_sampling:
decision_wait: 30s
num_traces: 50000
policies:
- name: errors
type: status_code
status_code:
status_codes: [ERROR]
- name: slow-requests
type: latency
latency:
threshold_ms: 1000
- name: ai-service
type: string_attribute
string_attribute:
key: service.name
values: [ai-service]
- name: default
type: probabilistic
probabilistic:
sampling_percentage: 10
batch:
timeout: 2s
send_batch_size: 2048
exporters:
otlp/tempo:
endpoint: "tempo:4317"
tls:
insecure: true
prometheusremotewrite:
endpoint: "http://prometheus:9090/api/v1/write"
resource_to_telemetry_conversion:
enabled: true
otlp/loki:
endpoint: "http://loki:3100/otlp"
tls:
insecure: true
service:
pipelines:
traces:
receivers: [otlp]
processors: [memory_limiter, filter/healthcheck, attributes/environment, tail_sampling, batch]
exporters: [otlp/tempo]
metrics:
receivers: [otlp]
processors: [memory_limiter, attributes/environment, batch]
exporters: [prometheusremotewrite]
logs:
receivers: [otlp]
processors: [memory_limiter, filter/healthcheck, attributes/environment, batch]
exporters: [otlp/loki]services:
# --- 애플리케이션 ---
api-gateway:
build: ./gateway
ports: ["8080:8080"]
environment:
- OTEL_EXPORTER_OTLP_ENDPOINT=http://collector-agent:4317
- ENV=production
depends_on: [collector-agent]
order-service:
build: ./order-service
ports: ["8081:8081"]
environment:
- OTEL_EXPORTER_OTLP_ENDPOINT=http://collector-agent:4317
depends_on: [collector-agent]
ai-service:
build: ./ai-service
ports: ["8082:8082"]
environment:
- OTEL_EXPORTER_OTLP_ENDPOINT=http://collector-agent:4317
- OPENAI_API_KEY=${OPENAI_API_KEY}
depends_on: [collector-agent]
# --- OTel 계층 ---
collector-agent:
image: otel/opentelemetry-collector-contrib:0.100.0
volumes:
- ./config/agent-collector.yaml:/etc/otelcol/config.yaml
ports: ["4317:4317", "4318:4318"]
depends_on: [collector-gateway]
collector-gateway:
image: otel/opentelemetry-collector-contrib:0.100.0
volumes:
- ./config/gateway-collector.yaml:/etc/otelcol/config.yaml
depends_on: [tempo, prometheus, loki]
# --- 백엔드 ---
tempo:
image: grafana/tempo:2.4.1
volumes:
- ./config/tempo.yaml:/etc/tempo/config.yaml
command: ["-config.file=/etc/tempo/config.yaml"]
prometheus:
image: prom/prometheus:v2.51.0
volumes:
- ./config/prometheus.yaml:/etc/prometheus/prometheus.yml
- ./config/slo-alert-rules.yaml:/etc/prometheus/rules/slo-rules.yaml
command:
- "--config.file=/etc/prometheus/prometheus.yml"
- "--web.enable-remote-write-receiver"
- "--enable-feature=exemplar-storage"
loki:
image: grafana/loki:2.9.6
command: ["-config.file=/etc/loki/local-config.yaml"]
grafana:
image: grafana/grafana:10.4.1
volumes:
- ./config/grafana-datasources.yaml:/etc/grafana/provisioning/datasources/datasources.yaml
- ./config/grafana-dashboards.yaml:/etc/grafana/provisioning/dashboards/dashboards.yaml
- ./dashboards:/var/lib/grafana/dashboards
ports: ["3000:3000"]
environment:
- GF_AUTH_ANONYMOUS_ENABLED=true
- GF_AUTH_ANONYMOUS_ORG_ROLE=Admin
depends_on: [tempo, prometheus, loki]10장에서 설계한 SLO 알림을 프로젝트에 적용합니다.
groups:
  - name: project-slo
    rules:
      # Availability recording rule: good / total requests per endpoint, 1h window.
      - record: sli:availability:1h
        expr: |
          sum(rate(sli_request_good_total[1h])) by (endpoint)
          / sum(rate(sli_request_total_total[1h])) by (endpoint)
      # Fast-burn alert (99.9% SLO): error rate exceeds 14.4x the error budget.
      - alert: OrderService_SLO_FastBurn
        expr: |
          (1 - sli:availability:1h{endpoint="/orders"}) > (14.4 * 0.001)
        for: 2m
        labels:
          severity: critical
          service: order-service
      # AI service cost alert: projected daily spend from the hourly USD rate.
      - alert: AIService_HighCost
        expr: |
          sum(rate(llm_request_cost_USD_total[1h])) * 3600 * 24 > 100
        for: 15m
        labels:
          severity: warning
          service: ai-service
        annotations:
          summary: "AI 서비스 일일 예상 비용이 $100를 초과합니다"
      # AI service latency alert: p99 LLM request duration above 10 seconds.
      - alert: AIService_HighLatency
        expr: |
          histogram_quantile(0.99,
            rate(llm_request_duration_seconds_bucket[5m])
          ) > 10
        for: 5m
        labels:
          severity: warning
          service: ai-service
프로덕션 환경에서 관측 가능성 플랫폼을 운영하기 위한 체크리스트입니다.
service.name, service.version, deployment.environment 리소스 속성이 설정되어 있는가
memory_limiter가 적절히 설정되어 있는가

| 전략 | 대상 | 예상 절감 |
|---|---|---|
| Tail Sampling (10%) | 트레이스 | 저장 비용 80-90% 절감 |
| 로그 레벨 조정 (WARN 이상) | 로그 | 볼륨 50-70% 절감 |
| 메트릭 카디널리티 관리 | 메트릭 | 시계열 수 대폭 감소 |
| 헬스 체크 필터링 | 전체 | 불필요 데이터 제거 |
| 로그 티어링 (Hot/Warm/Cold) | 로그 | 장기 저장 비용 절감 |
# Spans per second accepted by the Collector
sum(rate(otelcol_receiver_accepted_spans[5m]))
# Metric data points per second
sum(rate(otelcol_receiver_accepted_metric_points[5m]))
# Log records per second
sum(rate(otelcol_receiver_accepted_log_records[5m]))
# Tail-sampling efficiency (fraction of evaluated traces that were kept)
sum(rate(otelcol_processor_tail_sampling_count_traces_sampled[5m]))
/ sum(rate(otelcol_processor_tail_sampling_count_traces_evaluated[5m]))
관측 가능성 비용은 데이터 볼륨에 비례합니다. "모든 것을 수집"하는 것이 아니라, "문제를 진단하는 데 필요한 것만 수집"하는 것이 올바른 접근입니다. 정상 트래픽은 샘플링하고, 비정상 트래픽(에러, 느린 응답)은 100% 보존하는 전략이 비용과 관측 가능성의 최적 균형점입니다.
11장에 걸쳐 OpenTelemetry의 이론과 실전을 종합적으로 다루었습니다. 이 시리즈에서 학습한 내용을 요약합니다.
OpenTelemetry는 벤더 중립적 관측 가능성의 표준으로 자리잡았으며, CNCF 졸업 프로젝트로서 지속적으로 발전하고 있습니다. "한 번 계측하면, 어디로든 보낼 수 있다"는 원칙을 기억하고, 이 시리즈에서 학습한 내용을 실무에 적용해 보시기 바랍니다.
관측 가능성은 시스템의 복잡도가 증가할수록 그 가치가 커집니다. 마이크로서비스, AI 서비스, 클라우드 네이티브 환경에서 OpenTelemetry 기반의 관측 가능성 플랫폼은 안정적인 서비스 운영의 핵심 기반이 될 것입니다.
이 글이 도움이 되셨나요?
관련 주제 더 보기
SLI/SLO/에러 버짓의 개념을 정립하고, 번 레이트 알림(fast-burn/slow-burn), Prometheus 알림 규칙, Grafana 알림 채널을 설계합니다.
LLM 호출 추적, 토큰 사용량/비용 모니터링, AI 에이전트 행동 추적, LangChain/LlamaIndex OTel 통합을 통한 AI 관측 가능성을 학습합니다.
Jaeger로 분산 추적을 시각화하고, Prometheus로 메트릭을 저장/쿼리하며, Grafana로 통합 대시보드를 구성합니다. Docker Compose로 전체 스택을 실습합니다.