novai-api/
  proto/
    inference.proto           # 추론 서비스 정의
    embedding.proto           # 임베딩 서비스 정의
    tool_execution.proto      # 도구 실행 서비스 정의
  
  api/
    main.py                   # FastAPI 앱 진입점
    config.py                 # 설정 관리
    dependencies.py           # 의존성 주입
    
    routers/
      completions.py          # /v1/chat/completions
      embeddings.py           # /v1/embeddings
      models.py               # /v1/models
      batches.py              # /v1/batches
      usage.py                # /v1/usage
    
    middleware/
      auth.py                 # 인증 미들웨어
      rate_limit.py           # 레이트 리미팅
      logging_mw.py           # 요청 로깅
      cors.py                 # CORS 설정
    
    schemas/
      completions.py          # 요청/응답 스키마
      embeddings.py
      models.py
      errors.py               # 에러 스키마
    
    services/
      inference.py            # gRPC 추론 클라이언트
      embedding.py            # gRPC 임베딩 클라이언트
      streaming.py            # SSE 스트리밍 핸들러
      tool_executor.py        # 도구 실행
      batch_processor.py      # 배치 작업 처리
      cost_tracker.py         # 비용 추적
  
  internal/
    inference_server.py       # gRPC 추론 서버
    embedding_server.py       # gRPC 임베딩 서버
  
  openapi/
    spec.yaml                 # OpenAPI 3.1 스펙
  
  sdk/
    python/                   # 생성된 Python SDK
    typescript/               # 생성된 TypeScript SDK
  
  tests/
    test_completions.py
    test_streaming.py
    test_rate_limit.py
  
  docker-compose.yml
  Dockerfile

OpenAPI 스펙 작성

openapi/spec.yaml

yaml

# OpenAPI document header: spec version, API metadata, server list,
# default security requirement, and the tag catalogue used by `paths`.
openapi: '3.1.0'
info:
  title: 'NovAI API'
  # Literal block scalar: the line break between the two sentences is kept.
  description: |
    멀티 프로바이더 AI 추론 서비스 API.
    OpenAI 호환 인터페이스를 제공합니다.
  version: '1.0.0'
  contact:
    email: 'api@novai.example.com'

# Base URL already contains the /v1 prefix, so path keys omit it.
servers:
  - url: 'https://api.novai.example.com/v1'
    description: 'Production'

# Default requirement for every operation that does not override `security`.
security:
  - bearerAuth: []

# One tag per functional area.
tags:
  - name: 'Chat'
    description: '대화형 텍스트 완성'
  - name: 'Embeddings'
    description: '벡터 임베딩 생성'
  - name: 'Models'
    description: '모델 관리 및 조회'
  - name: 'Batches'
    description: '배치 작업 처리'
  - name: 'Usage'
    description: '사용량 및 비용 추적'
 
# Public REST operations. Paths are relative to the server URL (which ends in /v1).
# Authentication is required globally (`security`) and the rate-limit middleware
# is registered for the whole app, so every operation documents 401 and 429.
paths:
  /chat/completions:
    post:
      operationId: createChatCompletion
      summary: 대화형 텍스트 완성
      tags: [Chat]
      requestBody:
        required: true
        content:
          application/json:
            schema:
              $ref: "#/components/schemas/ChatCompletionRequest"
      responses:
        "200":
          description: 완성 결과
          content:
            # Returned when `stream` is false.
            application/json:
              schema:
                $ref: "#/components/schemas/ChatCompletionResponse"
            # Returned when `stream` is true.
            text/event-stream:
              schema:
                type: string
                description: SSE 스트리밍 응답
        "401":
          $ref: "#/components/responses/AuthError"
        "429":
          $ref: "#/components/responses/RateLimitError"

  /embeddings:
    post:
      operationId: createEmbedding
      summary: 벡터 임베딩 생성
      tags: [Embeddings]
      requestBody:
        required: true
        content:
          application/json:
            schema:
              $ref: "#/components/schemas/EmbeddingRequest"
      responses:
        "200":
          description: 임베딩 결과
          content:
            application/json:
              schema:
                $ref: "#/components/schemas/EmbeddingResponse"
        "401":
          $ref: "#/components/responses/AuthError"
        "429":
          $ref: "#/components/responses/RateLimitError"

  /models:
    get:
      operationId: listModels
      summary: 사용 가능한 모델 목록
      tags: [Models]
      responses:
        "200":
          description: 모델 목록
          content:
            application/json:
              schema:
                $ref: "#/components/schemas/ModelList"
        "401":
          $ref: "#/components/responses/AuthError"
        "429":
          $ref: "#/components/responses/RateLimitError"

  /batches:
    post:
      operationId: createBatch
      summary: 배치 작업 생성
      tags: [Batches]
      requestBody:
        required: true
        content:
          application/json:
            schema:
              $ref: "#/components/schemas/BatchRequest"
      responses:
        # 202: the batch is accepted and processed asynchronously.
        "202":
          description: 배치 생성 완료
          content:
            application/json:
              schema:
                $ref: "#/components/schemas/BatchResponse"
        "401":
          $ref: "#/components/responses/AuthError"
        "429":
          $ref: "#/components/responses/RateLimitError"

  /batches/{batch_id}:
    get:
      operationId: getBatch
      summary: 배치 상태 조회
      tags: [Batches]
      parameters:
        - name: batch_id
          in: path
          required: true
          schema:
            type: string
      responses:
        "200":
          description: 배치 상태
          content:
            application/json:
              schema:
                $ref: "#/components/schemas/BatchResponse"
        "401":
          $ref: "#/components/responses/AuthError"
        "429":
          $ref: "#/components/responses/RateLimitError"

  /usage:
    get:
      operationId: getUsage
      summary: 사용량 조회
      tags: [Usage]
      parameters:
        # Both bounds are mandatory calendar dates (format: date).
        - name: start_date
          in: query
          required: true
          schema:
            type: string
            format: date
        - name: end_date
          in: query
          required: true
          schema:
            type: string
            format: date
      responses:
        "200":
          description: 사용량 데이터
          content:
            application/json:
              schema:
                $ref: "#/components/schemas/UsageResponse"
        "401":
          $ref: "#/components/responses/AuthError"
        "429":
          $ref: "#/components/responses/RateLimitError"
 
components:
  securitySchemes:
    # HTTP bearer authentication; referenced by the global `security` section.
    bearerAuth:
      type: http
      scheme: bearer

  # Reusable error responses; both carry the shared ErrorResponse body.
  responses:
    RateLimitError:
      description: 레이트 리밋 초과
      headers:
        # An integer Retry-After value is a delay in seconds (RFC 9110).
        Retry-After:
          description: 다음 요청까지 대기해야 하는 시간(초)
          schema:
            type: integer
            minimum: 0
      content:
        application/json:
          schema:
            $ref: "#/components/schemas/ErrorResponse"
    AuthError:
      description: 인증 실패
      content:
        application/json:
          schema:
            $ref: "#/components/schemas/ErrorResponse"
 
  schemas:
    # Request body of POST /chat/completions. Only `model` and `messages`
    # are mandatory; every other property is optional.
    ChatCompletionRequest:
      type: object
      required: [model, messages]
      properties:
        model:
          type: string
          description: 사용할 모델 ID
        # Conversation history, one Message per entry.
        messages:
          type: array
          items:
            $ref: "#/components/schemas/Message"
        # Optional number constrained to [0, 2]; defaults to 1.0.
        temperature:
          type: number
          default: 1.0
          minimum: 0
          maximum: 2
        # Optional positive integer (at least 1 when supplied).
        max_tokens:
          type: integer
          minimum: 1
        # When true the endpoint answers with text/event-stream instead of JSON.
        stream:
          type: boolean
          default: false
        # Tool definitions offered with the request.
        tools:
          type: array
          items:
            $ref: "#/components/schemas/Tool"
        response_format:
          $ref: "#/components/schemas/ResponseFormat"
 
    # A single chat message. Used both in requests (`messages`) and in
    # responses (`Choice.message`).
    Message:
      type: object
      required: [role, content]
      properties:
        role:
          type: string
          enum: [system, user, assistant, tool]
        # Plain text, a list of content parts, or null. Null must be accepted
        # because an OpenAI-compatible assistant message that only carries
        # `tool_calls` has `content: null`; without it such responses fail
        # schema validation in generated SDKs.
        content:
          oneOf:
            - type: string
            - type: array
              items:
                $ref: "#/components/schemas/ContentPart"
            - type: "null"
        # Tool invocations attached to the message.
        tool_calls:
          type: array
          items:
            $ref: "#/components/schemas/ToolCall"
        # Presumably links a `tool` role message to the ToolCall it answers
        # (not enforced by this schema; confirm against the server).
        tool_call_id:
          type: string
 
    # One element of an array-form message `content`. `type` is mandatory and
    # is either `text` or `image_url`.
    # NOTE(review): the schema does not require `text` / `image_url` to match
    # the chosen `type`; presumably validated server-side, so confirm.
    ContentPart:
      type: object
      required: [type]
      properties:
        type:
          type: string
          enum: [text, image_url]
        text:
          type: string
        image_url:
          type: object
          properties:
            url:
              type: string
 
    # A tool the caller offers with a chat request. `function` is the only
    # allowed `type`, and the function needs a name and a parameters object.
    Tool:
      type: object
      required: [type, function]
      properties:
        type:
          type: string
          enum: [function]
        function:
          type: object
          required: [name, parameters]
          properties:
            name:
              type: string
            description:
              type: string
            # Free-form object; presumably a JSON Schema describing the
            # function arguments (confirm against the server).
            parameters:
              type: object
 
    # A tool invocation attached to a message. No property is marked required.
    ToolCall:
      type: object
      properties:
        id:
          type: string
        type:
          type: string
        function:
          type: object
          properties:
            name:
              type: string
            # Carried as a string; presumably JSON-encoded arguments (confirm).
            arguments:
              type: string
 
    # Output format selector for a chat request.
    ResponseFormat:
      type: object
      properties:
        type:
          type: string
          enum: [text, json_object, json_schema]
        # Free-form object; presumably only meaningful when `type` is
        # `json_schema` (not enforced here, so confirm).
        json_schema:
          type: object
 
    # JSON body returned by POST /chat/completions for non-streaming requests.
    ChatCompletionResponse:
      type: object
      properties:
        id:
          type: string
        # Always the literal `chat.completion`.
        object:
          type: string
          enum: [chat.completion]
        # Integer timestamp; presumably Unix epoch seconds (confirm).
        created:
          type: integer
        model:
          type: string
        choices:
          type: array
          items:
            $ref: "#/components/schemas/Choice"
        usage:
          $ref: "#/components/schemas/TokenUsage"
 
    # One generated alternative inside a ChatCompletionResponse.
    Choice:
      type: object
      properties:
        index:
          type: integer
        message:
          $ref: "#/components/schemas/Message"
        # Why generation stopped; limited to the four listed values.
        finish_reason:
          type: string
          enum: [stop, length, tool_calls, content_filter]
 
    # Integer token counters, shared by chat and embedding responses.
    TokenUsage:
      type: object
      properties:
        prompt_tokens:
          type: integer
        completion_tokens:
          type: integer
        # Presumably prompt_tokens + completion_tokens (not enforced here).
        total_tokens:
          type: integer
 
    # Request body of POST /embeddings; both properties are mandatory.
    EmbeddingRequest:
      type: object
      required: [model, input]
      properties:
        model:
          type: string
        # Either a single string or a list of strings.
        input:
          oneOf:
            - type: string
            - type: array
              items:
                type: string
 
    # Response body of POST /embeddings.
    EmbeddingResponse:
      type: object
      properties:
        # One entry per embedding; each has an integer index and a numeric vector.
        data:
          type: array
          items:
            type: object
            properties:
              index:
                type: integer
              embedding:
                type: array
                items:
                  type: number
        model:
          type: string
        usage:
          $ref: "#/components/schemas/TokenUsage"
 
    # Response body of GET /models: a `data` array of Model objects.
    ModelList:
      type: object
      properties:
        data:
          type: array
          items:
            $ref: "#/components/schemas/Model"
 
    # Description of one model entry in ModelList. No property is required.
    Model:
      type: object
      properties:
        id:
          type: string
        object:
          type: string
        # Integer timestamp; presumably Unix epoch seconds (confirm).
        created:
          type: integer
        owned_by:
          type: string
 
    # Request body of POST /batches; `inputs` is mandatory.
    BatchRequest:
      type: object
      required: [inputs]
      properties:
        # Array of free-form objects; the item shape is not constrained here.
        inputs:
          type: array
          items:
            type: object
        # String with default "24h"; the accepted values are not enumerated.
        completion_window:
          type: string
          default: "24h"
 
    # Batch state returned by POST /batches and GET /batches/{batch_id}.
    BatchResponse:
      type: object
      properties:
        id:
          type: string
        status:
          type: string
          enum: [pending, running, completed, failed, cancelled]
        # Integer counters; presumably item counts for the batch (confirm).
        total:
          type: integer
        completed:
          type: integer
        failed:
          type: integer
 
    # Response body of GET /usage: a `data` array of usage rows.
    UsageResponse:
      type: object
      properties:
        data:
          type: array
          items:
            type: object
            properties:
              # NOTE(review): plain string here, while the start_date/end_date
              # query parameters declare `format: date`; confirm whether this
              # field should declare the same format.
              date:
                type: string
              model:
                type: string
              total_tokens:
                type: integer
              cost_usd:
                type: number
 
    # Shared error envelope used by the RateLimitError and AuthError responses.
    # All three fields of `error` are strings and none is marked required.
    ErrorResponse:
      type: object
      properties:
        error:
          type: object
          properties:
            type:
              type: string
            message:
              type: string
            code:
              type: string

FastAPI 핵심 구현

앱 진입점

api/main.py

python

from contextlib import asynccontextmanager
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
 
from api.routers import completions, embeddings, models, batches, usage
from api.middleware.auth import AuthMiddleware
from api.middleware.rate_limit import RateLimitMiddleware
from api.middleware.logging_mw import RequestLoggingMiddleware
from api.config import settings
 
 
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Run application start-up and shutdown hooks around the serving phase.

    ``startup()`` is awaited before control is handed to the application and
    ``shutdown()`` is awaited when the context exits.

    Args:
        app: The FastAPI application this lifespan is attached to (unused).

    Yields:
        None. The application serves requests while suspended at the yield.
    """
    # On start: gRPC connection, Redis connection, model list load
    # (per the original author's note; the helper is defined elsewhere).
    await startup()
    try:
        yield
    finally:
        # On exit: clean up connections. Placed in ``finally`` so clean-up
        # still runs when an exception or cancellation is thrown into the
        # context manager at the yield point.
        await shutdown()
 
 
# Application object. The interactive documentation routes are only mounted
# when ``settings.debug`` is truthy.
_app_options = {
    "title": "NovAI API",
    "version": "1.0.0",
    "lifespan": lifespan,
    "docs_url": "/docs" if settings.debug else None,
    "redoc_url": "/redoc" if settings.debug else None,
}
app = FastAPI(**_app_options)

# Middleware. Starlette places the most recently added middleware outermost,
# so requests pass through these in the reverse of the order written here:
# CORS -> auth -> rate limit -> request logging.
app.add_middleware(RequestLoggingMiddleware)
app.add_middleware(RateLimitMiddleware, redis_url=settings.redis_url)
app.add_middleware(AuthMiddleware)

_cors_options = {
    "allow_origins": settings.cors_origins,
    "allow_methods": ["GET", "POST", "DELETE"],
    "allow_headers": ["Authorization", "Content-Type", "X-API-Version"],
    # Response headers that browser clients are allowed to read.
    "expose_headers": [
        "X-Request-Id",
        "X-RateLimit-Limit-RPM",
        "X-RateLimit-Remaining-RPM",
        "X-RateLimit-Limit-TPM",
        "X-RateLimit-Remaining-TPM",
        "Retry-After",
    ],
}
app.add_middleware(CORSMiddleware, **_cors_options)

# Routers: every feature router is mounted under the /v1 prefix.
for _router_module in (completions, embeddings, models, batches, usage):
    app.include_router(_router_module.router, prefix="/v1")

완성 라우터 (스트리밍 통합)

api/routers/completions.py

python

from fastapi import APIRouter, Request, Depends
from fastapi.responses import StreamingResponse, JSONResponse
 
from api.schemas.completions import (
    ChatCompletionRequest,
    ChatCompletionResponse,
)
from api.services.inference import InferenceService
from api.services.streaming import create_sse_stream
from api.services.cost_tracker import CostTracker
from api.dependencies import get_inference_service, get_cost_tracker
 
router = APIRouter(tags=["Chat"])
 
 
@router.post("/chat/completions")
async def create_chat_completion(
    request: ChatCompletionRequest,
    http_request: Request,
    inference: InferenceService = Depends(get_inference_service),
    cost_tracker: CostTracker = Depends(get_cost_tracker),
):
    """Handle POST /chat/completions for streaming and non-streaming requests.

    A cost reservation is made up front. For streaming requests the
    reservation is handed to ``create_sse_stream``. For non-streaming
    requests it is settled here with the token counts reported by the
    inference result.

    Args:
        request: Parsed chat completion request body.
        http_request: Incoming HTTP request; ``state.user`` and
            ``state.request_id`` are read from it.
        inference: Inference service used to run the completion.
        cost_tracker: Tracker used to reserve, settle and release cost.

    Returns:
        A ``StreamingResponse`` (``text/event-stream``) when ``request.stream``
        is truthy, otherwise a ``ChatCompletionResponse``.

    Raises:
        Exception: Any error raised while handling the request is re-raised.
            The reservation is released first unless it was already settled.
    """
    user = http_request.state.user

    # Reserve cost based on max_tokens (4096 when the caller gave none).
    reservation = await cost_tracker.reserve(
        user_id=user.id,
        model=request.model,
        estimated_input_tokens=estimate_input_tokens(request),
        max_output_tokens=request.max_tokens or 4096,
    )

    # Tracks whether settle() completed, so a later failure does not also
    # release a reservation that has already been settled.
    settled = False
    try:
        if request.stream:
            return StreamingResponse(
                create_sse_stream(
                    inference=inference,
                    request=request,
                    http_request=http_request,
                    reservation=reservation,
                    cost_tracker=cost_tracker,
                    user=user,
                ),
                media_type="text/event-stream",
                headers={
                    "Cache-Control": "no-cache",
                    "Connection": "keep-alive",
                    # Presumably disables proxy buffering (e.g. nginx) so
                    # events reach the client immediately; confirm.
                    "X-Accel-Buffering": "no",
                    "X-Request-Id": http_request.state.request_id,
                },
            )

        # Non-streaming response.
        result = await inference.complete(request)

        # Settle the reservation against actual usage.
        await cost_tracker.settle(
            reservation=reservation,
            actual_input_tokens=result.usage.prompt_tokens,
            actual_output_tokens=result.usage.completion_tokens,
        )
        settled = True

        return ChatCompletionResponse(
            id=result.id,
            object="chat.completion",
            created=result.created,
            model=result.model,
            choices=result.choices,
            usage=result.usage,
        )
    except Exception:
        # On failure, give the reserved cost back, but only while the
        # reservation is still open.
        # NOTE(review): asyncio.CancelledError is not an Exception subclass,
        # so a cancelled request skips this handler; confirm whether
        # reservations expire on their own.
        if not settled:
            await cost_tracker.release(reservation)
        raise

스트리밍 핸들러

api/services/streaming.py

python

import json
import time
import uuid
 
 
async def create_sse_stream(
    inference: InferenceService,
    request: ChatCompletionRequest,
    http_request: Request,
    reservation: CostReservation,
    cost_tracker: CostTracker,
    user: UserInfo,
):
    """Yield a chat completion as Server-Sent Events frames.

    Each item produced by ``inference.stream(request)`` becomes one
    ``data: <json>`` frame shaped as a ``chat.completion.chunk``. The stream
    ends with ``data: [DONE]``. If an exception occurs, an error frame is
    emitted before ``[DONE]``. Whatever happens, the cost reservation is
    settled and completion metrics are recorded when the generator finishes.

    Args:
        inference: Service whose ``stream`` method yields token data.
        request: Chat completion request being served.
        http_request: Incoming HTTP request, polled for client disconnects.
        reservation: Cost reservation to settle once streaming ends.
        cost_tracker: Tracker used to settle the reservation.
        user: Authenticated user; ``tier`` is attached to the metrics.

    Yields:
        str: SSE frames, each terminated by a blank line.
    """
    request_id = f"chatcmpl-{uuid.uuid4().hex[:12]}"
    total_completion_tokens = 0
    start_time = time.monotonic()
    first_token_time = None
    # Pre-bind the loop variable: if the stream raises or ends before the
    # first item, the ``finally`` block would otherwise hit an unbound name
    # and mask the real error.
    token_data = None

    try:
        async for token_data in inference.stream(request):
            # Stop producing once the client has gone away.
            if await http_request.is_disconnected():
                break

            if first_token_time is None:
                first_token_time = time.monotonic()
                ttft_ms = (first_token_time - start_time) * 1000
                # Record the time-to-first-token metric.
                record_ttft(request.model, ttft_ms)

            # Each streamed item is counted as one completion token.
            total_completion_tokens += 1

            chunk = {
                "id": request_id,
                "object": "chat.completion.chunk",
                "created": int(time.time()),
                "model": request.model,
                "choices": [{
                    "index": 0,
                    "delta": token_data.delta,
                    "finish_reason": token_data.finish_reason,
                }],
            }

            # Attach usage to the final chunk (the one with a finish reason).
            if token_data.finish_reason:
                chunk["usage"] = {
                    "prompt_tokens": token_data.prompt_tokens,
                    "completion_tokens": total_completion_tokens,
                    "total_tokens": (
                        token_data.prompt_tokens + total_completion_tokens
                    ),
                }

            yield f"data: {json.dumps(chunk, ensure_ascii=False)}\n\n"

        yield "data: [DONE]\n\n"

    except Exception as e:
        # NOTE(review): str(e) is sent to the client verbatim; confirm this
        # cannot leak internal details.
        error_chunk = {
            "error": {
                "type": "server_error",
                "message": str(e),
            }
        }
        # ensure_ascii=False for consistency with the data chunks above.
        yield f"data: {json.dumps(error_chunk, ensure_ascii=False)}\n\n"
        yield "data: [DONE]\n\n"

    finally:
        try:
            # Settle cost using the last seen prompt token count (0 when no
            # item was ever received).
            await cost_tracker.settle(
                reservation=reservation,
                actual_input_tokens=(
                    token_data.prompt_tokens if token_data is not None else 0
                ),
                actual_output_tokens=total_completion_tokens,
            )
        finally:
            # Record metrics even when settling fails.
            elapsed = (time.monotonic() - start_time) * 1000
            record_completion_metrics(
                model=request.model,
                latency_ms=elapsed,
                tokens=total_completion_tokens,
                stream=True,
                user_tier=user.tier,
            )

gRPC 내부 서비스

internal/inference_server.py

python

import grpc
from gen import inference_pb2 as pb
from gen import inference_pb2_grpc as service
 
 
class InferenceServicer(service.InferenceServiceServicer):
    """gRPC inference service that forwards requests to the LiteLLM helpers."""

    def __init__(self, litellm_proxy_url: str):
        # Stored on the instance; the methods in this class do not read it.
        self.proxy_url = litellm_proxy_url

    @staticmethod
    def _completion_kwargs(request: pb.CompleteRequest) -> dict:
        """Translate a CompleteRequest into keyword arguments for LiteLLM."""
        return {
            "model": request.model,
            "messages": proto_to_messages(request.messages),
            "temperature": request.temperature,
            "max_tokens": request.max_tokens,
        }

    async def Complete(
        self, request: pb.CompleteRequest, context
    ) -> pb.CompleteResponse:
        """Run a single completion and convert the result to protobuf."""
        litellm_result = await call_litellm(**self._completion_kwargs(request))
        return messages_to_proto(litellm_result)

    async def StreamComplete(
        self, request: pb.CompleteRequest, context
    ):
        """Stream completion chunks, stopping once the RPC is cancelled."""
        upstream = stream_litellm(**self._completion_kwargs(request))
        async for chunk in upstream:
            # Stop relaying when the caller has cancelled the RPC.
            if context.cancelled():
                return

            chunk_id = chunk["id"]
            chunk_model = chunk["model"]

            proto_choices = []
            for choice in chunk["choices"]:
                choice_index = choice["index"]
                # A delta without "content" yields None here.
                delta = pb.MessageDelta(content=choice["delta"].get("content"))
                proto_choices.append(
                    pb.StreamChoice(
                        index=choice_index,
                        delta=delta,
                        finish_reason=choice.get("finish_reason"),
                    )
                )

            yield pb.StreamChunk(
                id=chunk_id,
                model=chunk_model,
                choices=proto_choices,
            )
 
 
async def serve():
    """Start the gRPC inference server and block until it terminates.

    The server listens on port 50051 on all interfaces without TLS. The
    LiteLLM proxy URL is read from the ``LITELLM_PROXY_URL`` environment
    variable and falls back to ``http://litellm:4000`` when it is unset.
    """
    # Local import keeps this block self-contained.
    import os

    # Honour the value the container is started with instead of a constant.
    litellm_proxy_url = os.environ.get(
        "LITELLM_PROXY_URL", "http://litellm:4000"
    )

    server = grpc.aio.server(
        options=[
            # 50 MiB message size limits in both directions.
            ("grpc.max_send_message_length", 50 * 1024 * 1024),
            ("grpc.max_receive_message_length", 50 * 1024 * 1024),
            # Keepalive ping interval: 30 seconds.
            ("grpc.keepalive_time_ms", 30000),
        ],
    )
    service.add_InferenceServiceServicer_to_server(
        InferenceServicer(litellm_proxy_url=litellm_proxy_url),
        server,
    )
    # Plaintext listener.
    server.add_insecure_port("[::]:50051")
    await server.start()
    await server.wait_for_termination()

Docker Compose 구성

docker-compose.yml

yaml

# Local stack: public API, gRPC inference server, LiteLLM proxy, Redis, Postgres.
# POSTGRES_PASSWORD may be supplied via the environment or an .env file; it
# falls back to the previous value "password" so existing setups keep working.
services:
  api:
    build: .
    ports:
      - "8000:8000"
    environment:
      - REDIS_URL=redis://redis:6379
      # NOTE: a password with URL-reserved characters must be percent-encoded.
      - DATABASE_URL=postgresql://novai:${POSTGRES_PASSWORD:-password}@postgres:5432/novai
      - INFERENCE_GRPC_HOST=inference:50051
      - LITELLM_PROXY_URL=http://litellm:4000
    # Wait until the data stores accept connections, not merely until their
    # containers have started.
    depends_on:
      redis:
        condition: service_healthy
      postgres:
        condition: service_healthy
      inference:
        condition: service_started
      litellm:
        condition: service_started

  inference:
    build:
      context: .
      dockerfile: Dockerfile.inference
    ports:
      - "50051:50051"
    environment:
      - LITELLM_PROXY_URL=http://litellm:4000
    # The inference server is configured with the LiteLLM proxy URL.
    depends_on:
      litellm:
        condition: service_started

  litellm:
    # NOTE(review): `main-latest` is a moving tag; consider pinning a release.
    image: ghcr.io/berriai/litellm:main-latest
    ports:
      - "4000:4000"
    volumes:
      - ./litellm_config.yaml:/app/config.yaml
    command: ["--config", "/app/config.yaml"]
    # Provider keys are taken from the host environment, never committed.
    environment:
      - OPENAI_API_KEY=${OPENAI_API_KEY}
      - ANTHROPIC_API_KEY=${ANTHROPIC_API_KEY}

  redis:
    image: redis:7-alpine
    ports:
      - "6379:6379"
    healthcheck:
      test: ["CMD", "redis-cli", "ping"]
      interval: 5s
      timeout: 3s
      retries: 5

  postgres:
    image: postgres:16-alpine
    environment:
      - POSTGRES_USER=novai
      - POSTGRES_PASSWORD=${POSTGRES_PASSWORD:-password}
      - POSTGRES_DB=novai
    ports:
      - "5432:5432"
    volumes:
      - pgdata:/var/lib/postgresql/data
    healthcheck:
      test: ["CMD-SHELL", "pg_isready -U novai -d novai"]
      interval: 5s
      timeout: 3s
      retries: 5

# Named volume that keeps the Postgres data directory across restarts.
volumes:
  pgdata:

SDK 생성

OpenAPI 스펙에서 SDK를 자동 생성합니다.

generate-sdks.sh

bash

#!/usr/bin/env bash
# Generate the Python and TypeScript SDKs from openapi/spec.yaml using the
# openapi-generator CLI Docker image. Run from the repository root, because
# the current directory is mounted into the container as /local.

# Abort on the first failing command, on unset variables, and on pipeline
# errors, so a failed generation is not silently ignored.
set -euo pipefail

# Python SDK (uses openapi-generator)
docker run --rm \
  -v "${PWD}:/local" \
  openapitools/openapi-generator-cli generate \
  -i /local/openapi/spec.yaml \
  -g python \
  -o /local/sdk/python \
  --additional-properties=packageName=novai,projectName=novai-python

# TypeScript SDK
docker run --rm \
  -v "${PWD}:/local" \
  openapitools/openapi-generator-cli generate \
  -i /local/openapi/spec.yaml \
  -g typescript-fetch \
  -o /local/sdk/typescript \
  --additional-properties=npmName=novai,supportsES6=true

SDK 사용 예시

sdk-usage-python.py

python

from novai import NovAI

# Client for the NovAI API. Replace the placeholder key with a real one.
client = NovAI(
    api_key="sk-novai-...",
    base_url="https://api.novai.example.com/v1",
)

# Non-streaming completion: the full response arrives in one object.
response = client.chat.completions.create(
    model="claude-4",
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "API 설계 체크리스트를 알려주세요"},
    ],
    max_tokens=2048,
)

print(response.choices[0].message.content)
print(f"토큰: {response.usage.total_tokens}")

# Streaming completion: iterate over chunks as they arrive.
stream = client.chat.completions.create(
    model="claude-4",
    messages=[{"role": "user", "content": "Hello"}],
    stream=True,
)

for chunk in stream:
    # Skip chunks that carry no choices rather than failing on index 0.
    if not chunk.choices:
        continue
    piece = chunk.choices[0].delta.content
    if piece:
        # flush=True makes each piece appear immediately; without it stdout
        # buffering delays output that contains no newline.
        print(piece, end="", flush=True)

# Finish the streamed line.
print()

API 설계 체크리스트

프로젝트를 마무리하며, 프로덕션 AI API 설계 시 확인해야 할 체크리스트입니다.

설계 단계

OpenAPI/Protobuf 스펙을 먼저 정의했는가 (스펙 퍼스트)
리소스 중심 URL 설계를 따르는가
HTTP 메서드와 상태 코드를 올바르게 사용하는가
요청/응답 스키마가 명확하게 정의되어 있는가
에러 응답 형식이 일관적인가
페이지네이션과 필터링이 표준 패턴을 따르는가
버전 관리 전략이 결정되어 있는가

AI 특화 기능

스트리밍 응답이 OpenAI 호환 SSE 형식을 따르는가
도구 호출 프로토콜이 올바르게 구현되어 있는가
멀티모달 입력을 처리할 수 있는가
구조화된 출력(JSON Schema)을 지원하는가
배치 API가 비동기 패턴으로 구현되어 있는가
토큰 사용량이 응답에 포함되는가

보안과 안정성

API Key 또는 JWT 인증이 적용되어 있는가
RPM + TPM 이중 레이트 리미팅이 구현되어 있는가
비용 캡이 설정되어 있는가
TLS가 적용되어 있는가
CORS가 올바르게 설정되어 있는가
입력 검증이 철저한가

개발자 경험

SDK가 자동 생성되어 있는가
API 문서가 최신 상태인가
코드 예제가 실행 가능한가
에러 메시지가 문제 해결에 도움이 되는가
레이트 리밋 헤더가 응답에 포함되는가
폐기 예정 기능에 경고가 있는가

운영

구조화된 로깅이 적용되어 있는가
핵심 메트릭(지연시간, TTFT, 토큰, 비용)이 수집되는가
분산 트레이싱이 설정되어 있는가
모델 폴백이 구현되어 있는가
게이트웨이 헬스 체크가 있는가
알림 임계값이 설정되어 있는가

Tip

이 체크리스트를 팀의 API 리뷰 프로세스에 통합하세요. 새로운 엔드포인트를 추가하거나 기존 API를 변경할 때마다 이 목록을 확인하면, 일관된 품질의 API를 유지할 수 있습니다.

정리

이 장에서는 시리즈 전체의 내용을 하나의 프로젝트로 통합했습니다. REST 공개 API와 gRPC 내부 통신을 결합한 하이브리드 아키텍처 위에, OpenAPI 스펙 정의, FastAPI 구현, SSE 스트리밍, 인증/레이트 리미팅, LiteLLM 프록시, 그리고 SDK 자동 생성까지 AI 서비스 API의 전체 파이프라인을 구축했습니다.

API 설계는 단순히 엔드포인트를 정의하는 것이 아닙니다. 개발자 경험, 비용 효율성, 운영 안정성, 보안을 모두 아우르는 종합적인 설계 활동입니다. 이 시리즈에서 다룬 패턴과 원칙을 실무에 적용하여, 견고하고 확장 가능한 AI 서비스 API를 구축하시기 바랍니다.

시리즈 회고

11장에 걸쳐 API 설계의 기초부터 AI 서비스 특화 패턴, 프로덕션 인프라까지 살펴보았습니다. 핵심 메시지를 정리하면 다음과 같습니다.

프로토콜 선택은 맥락에 따라 -- REST(공개), gRPC(내부), GraphQL(BFF)을 사용 시나리오에 맞게 조합합니다.

AI 서비스는 새로운 설계 요구 -- 비결정적 출력, 스트리밍, 토큰 과금, 도구 호출은 기존 API 패턴의 확장을 필요로 합니다.

스펙 퍼스트 접근법 -- OpenAPI나 Protobuf 스펙을 먼저 정의하면 팀 협업, SDK 생성, 테스트 자동화가 자연스럽게 따라옵니다.

DX가 채택률을 결정 -- 타입 안전 SDK, 인터랙티브 문서, 명확한 에러 메시지가 API의 성패를 좌우합니다.

운영까지 설계의 범위 -- 레이트 리미팅, 비용 캡, 게이트웨이, 관측 가능성은 설계 단계부터 고려해야 합니다.

이 글이 도움이 되셨나요?

아키텍처

10장: API 게이트웨이와 프로덕션 인프라

LLM 게이트웨이를 활용한 멀티 프로바이더 라우팅, 모델 폴백, 인증/인가, 캐싱, 관측 가능성 등 프로덕션 API 인프라를 학습합니다.

2026년 2월 22일·16분

아키텍처

9장: SDK 자동 생성과 개발자 경험

OpenAPI 스펙에서 타입 안전 SDK를 자동 생성하고, API 문서화, 인터랙티브 플레이그라운드로 개발자 경험을 최적화하는 방법을 학습합니다.

2026년 2월 20일·13분

아키텍처

8장: 레이트 리미팅과 비용 제어

토큰 기반 레이트 리미팅, 토큰 버킷과 슬라이딩 윈도우 알고리즘, 사용자별 한도 설정, 비용 캡, Redis 기반 구현을 학습합니다.

2026년 2월 18일·16분

2026년 2월 24일 · 아키텍처

11장: 실전 프로젝트 — AI 서비스 API 설계

REST 공개 API와 gRPC 내부 통신을 결합한 AI 서비스 API를 설계하고, OpenAPI 스펙, FastAPI 구현, 스트리밍, 인증, SDK 생성까지 전체를 구축합니다.

20분 · 2,042자 · 12개 섹션

api-design graphql architecture

api-design 11 / 11

1 2 3 4 5 6 7 8 9 10 11

이전: 10장: API 게이트웨이와 프로덕션 인프라

학습 목표

REST 공개 API + gRPC 내부 통신의 하이브리드 아키텍처를 설계합니다
OpenAPI 스펙을 작성하고 FastAPI로 구현합니다
스트리밍, 배치, 도구 호출 엔드포인트를 통합합니다
인증, 레이트 리미팅, SDK 생성까지 전체 파이프라인을 구축합니다

프로젝트 개요

기능 요구사항

텍스트 완성 (동기/스트리밍)
도구 호출 (Function Calling)
벡터 임베딩 생성
배치 작업 처리
모델 관리 및 조회
토큰 사용량/비용 추적

비기능 요구사항

P99 지연시간: TTFT 500ms 이내
가용성: 99.9%
인증: API Key + JWT
레이트 리미팅: RPM + TPM 이중 제한
관측 가능성: 구조화된 로깅 + 메트릭

전체 아키텍처

프로젝트 구조

project-structure.txt

text

novai-api/
  proto/
    inference.proto           # 추론 서비스 정의
    embedding.proto           # 임베딩 서비스 정의
    tool_execution.proto      # 도구 실행 서비스 정의
  
  api/
    main.py                   # FastAPI 앱 진입점
    config.py                 # 설정 관리
    dependencies.py           # 의존성 주입
    
    routers/
      completions.py          # /v1/chat/completions
      embeddings.py           # /v1/embeddings
      models.py               # /v1/models
      batches.py              # /v1/batches
      usage.py                # /v1/usage
    
    middleware/
      auth.py                 # 인증 미들웨어
      rate_limit.py           # 레이트 리미팅
      logging_mw.py           # 요청 로깅
      cors.py                 # CORS 설정
    
    schemas/
      completions.py          # 요청/응답 스키마
      embeddings.py
      models.py
      errors.py               # 에러 스키마
    
    services/
      inference.py            # gRPC 추론 클라이언트
      embedding.py            # gRPC 임베딩 클라이언트
      streaming.py            # SSE 스트리밍 핸들러
      tool_executor.py        # 도구 실행
      batch_processor.py      # 배치 작업 처리
      cost_tracker.py         # 비용 추적
  
  internal/
    inference_server.py       # gRPC 추론 서버
    embedding_server.py       # gRPC 임베딩 서버
  
  openapi/
    spec.yaml                 # OpenAPI 3.1 스펙
  
  sdk/
    python/                   # 생성된 Python SDK
    typescript/               # 생성된 TypeScript SDK
  
  tests/
    test_completions.py
    test_streaming.py
    test_rate_limit.py
  
  docker-compose.yml
  Dockerfile

OpenAPI 스펙 작성

openapi/spec.yaml

yaml

# OpenAPI document header: spec version, API metadata, server list,
# default security requirement, and the tag catalogue used by `paths`.
openapi: '3.1.0'
info:
  title: 'NovAI API'
  # Literal block scalar: the line break between the two sentences is kept.
  description: |
    멀티 프로바이더 AI 추론 서비스 API.
    OpenAI 호환 인터페이스를 제공합니다.
  version: '1.0.0'
  contact:
    email: 'api@novai.example.com'

# Base URL already contains the /v1 prefix, so path keys omit it.
servers:
  - url: 'https://api.novai.example.com/v1'
    description: 'Production'

# Default requirement for every operation that does not override `security`.
security:
  - bearerAuth: []

# One tag per functional area.
tags:
  - name: 'Chat'
    description: '대화형 텍스트 완성'
  - name: 'Embeddings'
    description: '벡터 임베딩 생성'
  - name: 'Models'
    description: '모델 관리 및 조회'
  - name: 'Batches'
    description: '배치 작업 처리'
  - name: 'Usage'
    description: '사용량 및 비용 추적'
 
# Public REST operations. Paths are relative to the server URL (which ends in /v1).
# Authentication is required globally (`security`) and the rate-limit middleware
# is registered for the whole app, so every operation documents 401 and 429.
paths:
  /chat/completions:
    post:
      operationId: createChatCompletion
      summary: 대화형 텍스트 완성
      tags: [Chat]
      requestBody:
        required: true
        content:
          application/json:
            schema:
              $ref: "#/components/schemas/ChatCompletionRequest"
      responses:
        "200":
          description: 완성 결과
          content:
            # Returned when `stream` is false.
            application/json:
              schema:
                $ref: "#/components/schemas/ChatCompletionResponse"
            # Returned when `stream` is true.
            text/event-stream:
              schema:
                type: string
                description: SSE 스트리밍 응답
        "401":
          $ref: "#/components/responses/AuthError"
        "429":
          $ref: "#/components/responses/RateLimitError"

  /embeddings:
    post:
      operationId: createEmbedding
      summary: 벡터 임베딩 생성
      tags: [Embeddings]
      requestBody:
        required: true
        content:
          application/json:
            schema:
              $ref: "#/components/schemas/EmbeddingRequest"
      responses:
        "200":
          description: 임베딩 결과
          content:
            application/json:
              schema:
                $ref: "#/components/schemas/EmbeddingResponse"
        "401":
          $ref: "#/components/responses/AuthError"
        "429":
          $ref: "#/components/responses/RateLimitError"

  /models:
    get:
      operationId: listModels
      summary: 사용 가능한 모델 목록
      tags: [Models]
      responses:
        "200":
          description: 모델 목록
          content:
            application/json:
              schema:
                $ref: "#/components/schemas/ModelList"
        "401":
          $ref: "#/components/responses/AuthError"
        "429":
          $ref: "#/components/responses/RateLimitError"

  /batches:
    post:
      operationId: createBatch
      summary: 배치 작업 생성
      tags: [Batches]
      requestBody:
        required: true
        content:
          application/json:
            schema:
              $ref: "#/components/schemas/BatchRequest"
      responses:
        # 202: the batch is accepted and processed asynchronously.
        "202":
          description: 배치 생성 완료
          content:
            application/json:
              schema:
                $ref: "#/components/schemas/BatchResponse"
        "401":
          $ref: "#/components/responses/AuthError"
        "429":
          $ref: "#/components/responses/RateLimitError"

  /batches/{batch_id}:
    get:
      operationId: getBatch
      summary: 배치 상태 조회
      tags: [Batches]
      parameters:
        - name: batch_id
          in: path
          required: true
          schema:
            type: string
      responses:
        "200":
          description: 배치 상태
          content:
            application/json:
              schema:
                $ref: "#/components/schemas/BatchResponse"
        "401":
          $ref: "#/components/responses/AuthError"
        "429":
          $ref: "#/components/responses/RateLimitError"

  /usage:
    get:
      operationId: getUsage
      summary: 사용량 조회
      tags: [Usage]
      parameters:
        # Both bounds are mandatory calendar dates (format: date).
        - name: start_date
          in: query
          required: true
          schema:
            type: string
            format: date
        - name: end_date
          in: query
          required: true
          schema:
            type: string
            format: date
      responses:
        "200":
          description: 사용량 데이터
          content:
            application/json:
              schema:
                $ref: "#/components/schemas/UsageResponse"
        "401":
          $ref: "#/components/responses/AuthError"
        "429":
          $ref: "#/components/responses/RateLimitError"
 
components:
  securitySchemes:
    bearerAuth:
      type: http
      scheme: bearer
 
  responses:
    RateLimitError:
      description: 레이트 리밋 초과
      headers:
        Retry-After:
          schema:
            type: integer
      content:
        application/json:
          schema:
            $ref: "#/components/schemas/ErrorResponse"
    AuthError:
      description: 인증 실패
      content:
        application/json:
          schema:
            $ref: "#/components/schemas/ErrorResponse"
 
  schemas:
    ChatCompletionRequest:
      type: object
      required: [model, messages]
      properties:
        model:
          type: string
          description: 사용할 모델 ID
        messages:
          type: array
          items:
            $ref: "#/components/schemas/Message"
        temperature:
          type: number
          default: 1.0
          minimum: 0
          maximum: 2
        max_tokens:
          type: integer
          minimum: 1
        stream:
          type: boolean
          default: false
        tools:
          type: array
          items:
            $ref: "#/components/schemas/Tool"
        response_format:
          $ref: "#/components/schemas/ResponseFormat"
 
    Message:
      type: object
      required: [role, content]
      properties:
        role:
          type: string
          enum: [system, user, assistant, tool]
        content:
          oneOf:
            - type: string
            - type: array
              items:
                $ref: "#/components/schemas/ContentPart"
        tool_calls:
          type: array
          items:
            $ref: "#/components/schemas/ToolCall"
        tool_call_id:
          type: string
 
    # One element of an array-valued Message.content; `type` selects the kind.
    ContentPart:
      type: object
      required:
        - type
      properties:
        type:
          type: string
          enum:
            - text
            - image_url
        text: {type: string}
        image_url:
          type: object
          properties:
            url: {type: string}
 
    # A tool offered to the model; `function` is the only declared tool type.
    Tool:
      type: object
      required:
        - type
        - function
      properties:
        type:
          type: string
          enum:
            - function
        function:
          type: object
          required:
            - name
            - parameters
          properties:
            name: {type: string}
            description: {type: string}
            # Free-form object; no inner structure is constrained here.
            parameters: {type: object}
 
    # A tool invocation attached to a message; no field is marked required.
    ToolCall:
      type: object
      properties:
        id: {type: string}
        type: {type: string}
        function:
          type: object
          properties:
            name: {type: string}
            # Declared as a string, not an object.
            arguments: {type: string}
 
    # Requested output format for a chat completion.
    ResponseFormat:
      type: object
      properties:
        type:
          type: string
          enum:
            - text
            - json_object
            - json_schema
        # Free-form object; its structure is not constrained by this spec.
        json_schema: {type: object}
 
    # Non-streaming chat completion result.
    ChatCompletionResponse:
      type: object
      properties:
        id: {type: string}
        # Single-valued enum: always the literal "chat.completion".
        object:
          type: string
          enum:
            - chat.completion
        created: {type: integer}
        model: {type: string}
        choices:
          type: array
          items:
            $ref: "#/components/schemas/Choice"
        usage:
          $ref: "#/components/schemas/TokenUsage"
 
    # One entry of ChatCompletionResponse.choices.
    Choice:
      type: object
      properties:
        index: {type: integer}
        message:
          $ref: "#/components/schemas/Message"
        finish_reason:
          type: string
          enum:
            - stop
            - length
            - tool_calls
            - content_filter
 
    # Token counters, referenced by the chat and embedding responses.
    TokenUsage:
      type: object
      properties:
        prompt_tokens: {type: integer}
        completion_tokens: {type: integer}
        total_tokens: {type: integer}
 
    # Embedding request body; `input` is one string or a list of strings.
    EmbeddingRequest:
      type: object
      required:
        - model
        - input
      properties:
        model: {type: string}
        input:
          oneOf:
            - type: string
            - type: array
              items: {type: string}
 
    # Embedding result: a list of indexed vectors plus model and usage info.
    EmbeddingResponse:
      type: object
      properties:
        data:
          type: array
          items:
            type: object
            properties:
              index: {type: integer}
              # The vector itself, as an array of numbers.
              embedding:
                type: array
                items: {type: number}
        model: {type: string}
        usage:
          $ref: "#/components/schemas/TokenUsage"
 
    # Wrapper object whose `data` array holds Model entries.
    ModelList:
      properties:
        data:
          items:
            $ref: "#/components/schemas/Model"
          type: array
      type: object
 
    # Description of a single model entry.
    Model:
      type: object
      properties:
        id: {type: string}
        object: {type: string}
        created: {type: integer}
        owned_by: {type: string}
 
    # Batch submission body; `inputs` is the only mandatory field.
    BatchRequest:
      type: object
      required:
        - inputs
      properties:
        # Each input is a free-form object; its shape is not constrained here.
        inputs:
          type: array
          items: {type: object}
        completion_window: {type: string, default: "24h"}
 
    # Batch job state: an id, a status and three integer counters.
    BatchResponse:
      type: object
      properties:
        id: {type: string}
        status:
          type: string
          enum:
            - pending
            - running
            - completed
            - failed
            - cancelled
        total: {type: integer}
        completed: {type: integer}
        failed: {type: integer}
 
    # Usage report: one row per entry in `data`.
    UsageResponse:
      type: object
      properties:
        data:
          type: array
          items:
            type: object
            properties:
              # Same `format: date` annotation as the date query parameter of
              # the usage operation, so generated clients use one date type
              # on both sides of the call.
              date:
                type: string
                format: date
              model:
                type: string
              total_tokens:
                type: integer
              cost_usd:
                type: number
 
    # Error envelope used by the shared RateLimitError / AuthError responses.
    ErrorResponse:
      type: object
      properties:
        error:
          type: object
          properties:
            type: {type: string}
            message: {type: string}
            code: {type: string}

FastAPI 핵심 구현

앱 진입점

api/main.py

python

from contextlib import asynccontextmanager
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
 
from api.routers import completions, embeddings, models, batches, usage
from api.middleware.auth import AuthMiddleware
from api.middleware.rate_limit import RateLimitMiddleware
from api.middleware.logging_mw import RequestLoggingMiddleware
from api.config import settings
 
 
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Application lifespan hook: run ``startup()`` before serving, ``shutdown()`` after.

    Args:
        app: The FastAPI application this lifespan is attached to (unused here).

    Yields:
        Nothing; control returns to the framework while the app is serving.
    """
    # On startup: gRPC connection, Redis connection, model list load
    await startup()
    try:
        yield
    finally:
        # On shutdown: clean up connections. The ``finally`` makes sure this
        # also runs when an exception or cancellation is thrown into the
        # generator at the ``yield``, not only on a clean exit.
        await shutdown()
 
 
# Application object. The interactive documentation endpoints are mounted
# only when settings.debug is truthy; otherwise they are disabled (None).
app = FastAPI(
    title="NovAI API",
    version="1.0.0",
    lifespan=lifespan,
    docs_url=("/docs" if settings.debug else None),
    redoc_url=("/redoc" if settings.debug else None),
)

# Middleware (executed in reverse order of registration): the one added last
# wraps all the others, so a request passes through
# CORS -> Auth -> RateLimit -> RequestLogging.
app.add_middleware(RequestLoggingMiddleware)
app.add_middleware(
    RateLimitMiddleware,
    redis_url=settings.redis_url,
)
app.add_middleware(AuthMiddleware)
app.add_middleware(
    CORSMiddleware,
    allow_origins=settings.cors_origins,
    allow_methods=["GET", "POST", "DELETE"],
    allow_headers=["Authorization", "Content-Type", "X-API-Version"],
    # Response headers that browser clients are allowed to read.
    expose_headers=[
        "X-Request-Id",
        "X-RateLimit-Limit-RPM",
        "X-RateLimit-Remaining-RPM",
        "X-RateLimit-Limit-TPM",
        "X-RateLimit-Remaining-TPM",
        "Retry-After",
    ],
)

# Register every router under the shared "/v1" prefix, in the same order as
# the individual include_router() calls this loop replaces.
for _router_module in (completions, embeddings, models, batches, usage):
    app.include_router(_router_module.router, prefix="/v1")

완성 라우터 (스트리밍 통합)

api/routers/completions.py

python

from fastapi import APIRouter, Request, Depends
from fastapi.responses import StreamingResponse, JSONResponse
 
from api.schemas.completions import (
    ChatCompletionRequest,
    ChatCompletionResponse,
)
from api.services.inference import InferenceService
from api.services.streaming import create_sse_stream
from api.services.cost_tracker import CostTracker
from api.dependencies import get_inference_service, get_cost_tracker
 
router = APIRouter(tags=["Chat"])
 
 
@router.post("/chat/completions")
async def create_chat_completion(
    request: ChatCompletionRequest,
    http_request: Request,
    inference: InferenceService = Depends(get_inference_service),
    cost_tracker: CostTracker = Depends(get_cost_tracker),
):
    """Handle ``POST /chat/completions`` in streaming or non-streaming mode.

    A cost reservation is taken before any inference work. For streaming
    requests the reservation is handed to ``create_sse_stream``, which
    settles it itself; for non-streaming requests it is settled here with
    the token counts reported by the inference service.

    Args:
        request: Parsed chat completion request body.
        http_request: Raw request; ``state.user`` and ``state.request_id``
            are read from it (they must have been set upstream).
        inference: Inference client (injected).
        cost_tracker: Cost reservation/settlement service (injected).

    Returns:
        A ``StreamingResponse`` of server-sent events when ``request.stream``
        is truthy, otherwise a ``ChatCompletionResponse``.

    Raises:
        Whatever the inference or cost-tracking calls raise; an unsettled
        reservation is released before the error propagates.
    """
    user = http_request.state.user

    # Reserve cost up front (based on max_tokens; 4096 when not given).
    # NOTE(review): estimate_input_tokens is not among the imports visible
    # in this module - confirm where it is defined.
    reservation = await cost_tracker.reserve(
        user_id=user.id,
        model=request.model,
        estimated_input_tokens=estimate_input_tokens(request),
        max_output_tokens=request.max_tokens or 4096,
    )

    # Set once settle() has completed, so that a failure afterwards does not
    # additionally release a reservation that has already been settled.
    settled = False
    try:
        if request.stream:
            return StreamingResponse(
                create_sse_stream(
                    inference=inference,
                    request=request,
                    http_request=http_request,
                    reservation=reservation,
                    cost_tracker=cost_tracker,
                    user=user,
                ),
                media_type="text/event-stream",
                headers={
                    "Cache-Control": "no-cache",
                    "Connection": "keep-alive",
                    "X-Accel-Buffering": "no",
                    "X-Request-Id": http_request.state.request_id,
                },
            )

        # Non-streaming response
        result = await inference.complete(request)

        # Settle the reservation against actual usage
        await cost_tracker.settle(
            reservation=reservation,
            actual_input_tokens=result.usage.prompt_tokens,
            actual_output_tokens=result.usage.completion_tokens,
        )
        settled = True

        return ChatCompletionResponse(
            id=result.id,
            object="chat.completion",
            created=result.created,
            model=result.model,
            choices=result.choices,
            usage=result.usage,
        )
    except BaseException:
        # BaseException rather than Exception: asyncio.CancelledError (raised
        # when the request task is cancelled) is not an Exception subclass,
        # and the reservation must not leak in that case either.
        if not settled:
            await cost_tracker.release(reservation)
        raise

스트리밍 핸들러

api/services/streaming.py

python

import json
import time
import uuid
 
 
async def create_sse_stream(
    inference: InferenceService,
    request: ChatCompletionRequest,
    http_request: Request,
    reservation: CostReservation,
    cost_tracker: CostTracker,
    user: UserInfo,
):
    """Relay ``inference.stream(request)`` as server-sent ``data:`` events.

    Each item from the inference stream becomes one ``chat.completion.chunk``
    event; the chunk whose ``finish_reason`` is set also carries a ``usage``
    object. The stream ends with ``data: [DONE]``. On an exception an error
    event is emitted before ``[DONE]``. The cost reservation is settled and
    completion metrics are recorded however the stream ends.

    Args:
        inference: Inference client providing ``stream(request)``.
        request: The chat completion request being served.
        http_request: Raw request, polled for client disconnects.
        reservation: Cost reservation to settle when the stream ends.
        cost_tracker: Service used to settle the reservation.
        user: Authenticated user; ``user.tier`` is attached to the metrics.

    Yields:
        SSE-formatted strings (``"data: ...\\n\\n"``).
    """
    request_id = f"chatcmpl-{uuid.uuid4().hex[:12]}"
    total_completion_tokens = 0
    start_time = time.monotonic()
    first_token_time = None
    # Bound before the loop so the ``finally`` block can read it even when the
    # stream raises, or ends, before producing a single item. Without this the
    # cleanup itself failed with UnboundLocalError and nothing was settled.
    token_data = None

    try:
        async for token_data in inference.stream(request):
            # Stop producing once the client has gone away
            if await http_request.is_disconnected():
                break

            if first_token_time is None:
                first_token_time = time.monotonic()
                ttft_ms = (first_token_time - start_time) * 1000
                # Record the time-to-first-token metric
                record_ttft(request.model, ttft_ms)

            # NOTE(review): counts one token per stream item - assumes the
            # inference stream yields exactly one token per item; confirm.
            total_completion_tokens += 1

            chunk = {
                "id": request_id,
                "object": "chat.completion.chunk",
                "created": int(time.time()),
                "model": request.model,
                "choices": [{
                    "index": 0,
                    "delta": token_data.delta,
                    "finish_reason": token_data.finish_reason,
                }],
            }

            # Attach usage to the final chunk
            if token_data.finish_reason:
                chunk["usage"] = {
                    "prompt_tokens": token_data.prompt_tokens,
                    "completion_tokens": total_completion_tokens,
                    "total_tokens": (
                        token_data.prompt_tokens + total_completion_tokens
                    ),
                }

            yield f"data: {json.dumps(chunk, ensure_ascii=False)}\n\n"

        yield "data: [DONE]\n\n"

    except Exception as e:
        # NOTE(review): str(e) exposes the internal exception text to the
        # client; consider a generic message if that is a concern.
        error_chunk = {
            "error": {
                "type": "server_error",
                "message": str(e),
            }
        }
        # ensure_ascii=False for consistency with the regular chunks above,
        # so non-ASCII error text is not escaped differently.
        yield f"data: {json.dumps(error_chunk, ensure_ascii=False)}\n\n"
        yield "data: [DONE]\n\n"

    finally:
        try:
            # Settle the reservation with what was actually produced; prompt
            # tokens come from the last item seen, or 0 if there was none.
            await cost_tracker.settle(
                reservation=reservation,
                actual_input_tokens=(
                    token_data.prompt_tokens if token_data is not None else 0
                ),
                actual_output_tokens=total_completion_tokens,
            )
        finally:
            # Record metrics even when settlement itself fails
            elapsed = (time.monotonic() - start_time) * 1000
            record_completion_metrics(
                model=request.model,
                latency_ms=elapsed,
                tokens=total_completion_tokens,
                stream=True,
                user_tier=user.tier,
            )

gRPC 내부 서비스

internal/inference_server.py

python

import grpc
from gen import inference_pb2 as pb
from gen import inference_pb2_grpc as service
 
 
class InferenceServicer(service.InferenceServiceServicer):
    """gRPC inference service implementation."""

    def __init__(self, litellm_proxy_url: str):
        # Stored on the instance; the RPC methods below do not read it.
        self.proxy_url = litellm_proxy_url

    @staticmethod
    def _litellm_kwargs(request: pb.CompleteRequest) -> dict:
        """Map a CompleteRequest onto the keyword arguments of the LiteLLM helpers."""
        # NOTE(review): if these proto fields are plain proto3 scalars, an
        # unset temperature/max_tokens arrives here as 0 - confirm the .proto.
        return {
            "model": request.model,
            "messages": proto_to_messages(request.messages),
            "temperature": request.temperature,
            "max_tokens": request.max_tokens,
        }

    async def Complete(
        self, request: pb.CompleteRequest, context
    ) -> pb.CompleteResponse:
        """Unary RPC: run one completion and convert the result to a proto."""
        completion = await call_litellm(**self._litellm_kwargs(request))
        return messages_to_proto(completion)

    async def StreamComplete(
        self, request: pb.CompleteRequest, context
    ):
        """Server-streaming RPC: yield one StreamChunk per upstream chunk.

        Stops as soon as ``context.cancelled()`` reports true.
        """
        async for chunk in stream_litellm(**self._litellm_kwargs(request)):
            if context.cancelled():
                return

            chunk_id = chunk["id"]
            chunk_model = chunk["model"]

            stream_choices = []
            for choice in chunk["choices"]:
                index = choice["index"]
                delta = pb.MessageDelta(
                    content=choice["delta"].get("content"),
                )
                stream_choices.append(
                    pb.StreamChoice(
                        index=index,
                        delta=delta,
                        finish_reason=choice.get("finish_reason"),
                    )
                )

            yield pb.StreamChunk(
                id=chunk_id,
                model=chunk_model,
                choices=stream_choices,
            )
 
 
async def serve():
    """Start the asyncio gRPC inference server and block until it terminates.

    The server listens without TLS on ``[::]:50051`` with 50 MiB send and
    receive message limits and a 30 s keepalive interval. The LiteLLM proxy
    URL is taken from the ``LITELLM_PROXY_URL`` environment variable (which
    docker-compose sets for this service) and falls back to the previous
    hard-coded value when the variable is unset or empty.
    """
    # Function-scope import so this block brings its own dependency into scope.
    import os

    proxy_url = os.environ.get("LITELLM_PROXY_URL") or "http://litellm:4000"

    server = grpc.aio.server(
        options=[
            ("grpc.max_send_message_length", 50 * 1024 * 1024),
            ("grpc.max_receive_message_length", 50 * 1024 * 1024),
            ("grpc.keepalive_time_ms", 30000),
        ],
    )
    service.add_InferenceServiceServicer_to_server(
        InferenceServicer(litellm_proxy_url=proxy_url),
        server,
    )
    server.add_insecure_port("[::]:50051")
    await server.start()
    await server.wait_for_termination()

Docker Compose 구성

docker-compose.yml

yaml

# Local stack: public API, gRPC inference server, LiteLLM proxy, Redis and
# PostgreSQL.
#
# The database password is read from POSTGRES_PASSWORD in the environment or
# an .env file. It falls back to the former literal value when unset, so the
# existing setup keeps working; override it outside local development.
services:
  api:
    build: .
    ports:
      - "8000:8000"
    environment:
      - REDIS_URL=redis://redis:6379
      # Must use the same password as the postgres service below.
      - DATABASE_URL=postgresql://novai:${POSTGRES_PASSWORD:-password}@postgres:5432/novai
      - INFERENCE_GRPC_HOST=inference:50051
      - LITELLM_PROXY_URL=http://litellm:4000
    depends_on:
      - redis
      - postgres
      - inference
      - litellm

  inference:
    build:
      context: .
      dockerfile: Dockerfile.inference
    ports:
      - "50051:50051"
    environment:
      - LITELLM_PROXY_URL=http://litellm:4000
    # The inference server calls the LiteLLM proxy, so start that first.
    depends_on:
      - litellm

  litellm:
    # NOTE(review): `main-latest` is a moving tag; consider pinning a version.
    image: ghcr.io/berriai/litellm:main-latest
    ports:
      - "4000:4000"
    volumes:
      - ./litellm_config.yaml:/app/config.yaml
    command: ["--config", "/app/config.yaml"]
    environment:
      # Provider keys are passed through from the host environment.
      - OPENAI_API_KEY=${OPENAI_API_KEY}
      - ANTHROPIC_API_KEY=${ANTHROPIC_API_KEY}

  redis:
    image: redis:7-alpine
    ports:
      - "6379:6379"

  postgres:
    image: postgres:16-alpine
    environment:
      - POSTGRES_USER=novai
      - POSTGRES_PASSWORD=${POSTGRES_PASSWORD:-password}
      - POSTGRES_DB=novai
    ports:
      - "5432:5432"
    volumes:
      - pgdata:/var/lib/postgresql/data

volumes:
  # Named volume holding the PostgreSQL data directory.
  pgdata:

SDK 생성

OpenAPI 스펙에서 SDK를 자동 생성합니다.

generate-sdks.sh

bash

#!/usr/bin/env bash
# Generate the Python and TypeScript SDKs from openapi/spec.yaml using the
# openapi-generator CLI Docker image. Run from the repository root: the
# current directory is mounted into the container as /local.

# Fail fast: without this a failed Python generation is masked, because the
# script carries on and exits with the status of the TypeScript run.
set -euo pipefail

# generate_sdk GENERATOR OUT_DIR ADDITIONAL_PROPERTIES
#   GENERATOR              openapi-generator generator name (-g)
#   OUT_DIR                output directory, relative to the repository root
#   ADDITIONAL_PROPERTIES  comma-separated key=value list
generate_sdk() {
  local generator="$1" out_dir="$2" properties="$3"

  docker run --rm \
    -v "${PWD}:/local" \
    openapitools/openapi-generator-cli generate \
    -i /local/openapi/spec.yaml \
    -g "${generator}" \
    -o "/local/${out_dir}" \
    "--additional-properties=${properties}"
}

# Python SDK (using openapi-generator)
generate_sdk python sdk/python packageName=novai,projectName=novai-python

# TypeScript SDK
generate_sdk typescript-fetch sdk/typescript npmName=novai,supportsES6=true

SDK 사용 예시

sdk-usage-python.py

python

from novai import NovAI
 
# Create a client. "sk-novai-..." is a placeholder for a real API key.
client = NovAI(
    api_key="sk-novai-...",
    base_url="https://api.novai.example.com/v1",
)

# Synchronous (non-streaming) completion
response = client.chat.completions.create(
    model="claude-4",
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "API 설계 체크리스트를 알려주세요"},
    ],
    max_tokens=2048,
)

print(response.choices[0].message.content)
print(f"토큰: {response.usage.total_tokens}")

# Streaming
stream = client.chat.completions.create(
    model="claude-4",
    messages=[{"role": "user", "content": "Hello"}],
    stream=True,
)

for chunk in stream:
    if chunk.choices[0].delta.content:
        # flush=True: the pieces contain no newline, so without an explicit
        # flush the buffered output would not appear as it arrives.
        print(chunk.choices[0].delta.content, end="", flush=True)

# Finish the streamed line so later output starts on a fresh line.
print()