from dataclasses import dataclass
from enum import Enum
 
 
class GateStatus(Enum):
    """Outcome of a quality-gate check.

    Used both for a single condition's result and for the overall verdict
    returned by ``QualityGate.evaluate``.
    """
    PASSED = "passed"
    FAILED = "failed"
    WARNING = "warning"
 
 
@dataclass
class GateCondition:
    """A single condition checked by a quality gate."""
    # Human-readable name of the condition.
    name: str
    # Key used to look the value up in the metrics dictionaries.
    metric: str
    operator: str       # "gte", "lte", "eq", "delta_gte", "delta_lte"
    # Value the metric (or its delta, for "delta_*" operators) is compared against.
    threshold: float
    blocking: bool       # if True, a failure blocks deployment
    # Free-text explanation of what the condition enforces.
    description: str
 
 
@dataclass
class GateResult:
    """Result of evaluating one quality-gate condition."""
    # The condition that was evaluated.
    condition: GateCondition
    # The value that was compared: the current metric value, or the
    # current-minus-baseline delta for "delta_*" operators with a baseline.
    actual_value: float
    # Outcome of this single condition.
    status: GateStatus
    # Human-readable summary of the comparison.
    message: str
 
 
class QualityGate:
    """Evaluate metrics against an ordered list of gate conditions.

    The overall verdict is FAILED when any blocking condition fails, WARNING
    when the only problems are non-blocking failures or checks that could not
    be performed, and PASSED otherwise.
    """

    # Display symbols for the base comparison operators (after "delta_" is stripped).
    _OPERATOR_SYMBOLS = {"gte": ">=", "lte": "<=", "eq": "=="}

    def __init__(self, conditions: list[GateCondition]):
        """Store the conditions; they are evaluated in the given order."""
        self.conditions = conditions

    def evaluate(
        self,
        current_metrics: dict[str, float],
        baseline_metrics: dict[str, float] | None = None,
    ) -> tuple[GateStatus, list[GateResult]]:
        """Evaluate every condition and derive the overall gate status.

        Args:
            current_metrics: Metric name -> value for the candidate being gated.
            baseline_metrics: Metric name -> value of the reference run, or
                None when no baseline is available ("delta_*" conditions then
                produce a WARNING instead of being compared).

        Returns:
            A ``(overall_status, results)`` tuple with one result per
            condition, in condition order.
        """
        results = [
            self._check_condition(condition, current_metrics, baseline_metrics)
            for condition in self.conditions
        ]

        overall_status = GateStatus.PASSED
        for result in results:
            if result.status == GateStatus.PASSED:
                continue
            if result.status == GateStatus.FAILED and result.condition.blocking:
                overall_status = GateStatus.FAILED
            elif overall_status == GateStatus.PASSED:
                # A non-blocking failure or an unperformed check must still be
                # visible in the verdict, but it never overrides FAILED.
                overall_status = GateStatus.WARNING

        return overall_status, results

    def _check_condition(
        self,
        condition: GateCondition,
        current: dict[str, float],
        baseline: dict[str, float] | None,
    ) -> GateResult:
        """Check a single condition against the current (and baseline) metrics.

        A metric missing from ``current`` fails the condition outright rather
        than being treated as 0.0, which would silently satisfy "lte"
        conditions. A missing baseline (or a metric missing from it) yields a
        WARNING because the delta cannot be computed.
        """
        metric = condition.metric

        if metric not in current:
            return GateResult(
                condition=condition,
                actual_value=0.0,
                status=GateStatus.FAILED,
                message=f"{metric}: 현재 메트릭 없음 - 조건 평가 불가",
            )
        current_value = current[metric]

        if condition.operator.startswith("delta_"):
            if baseline is None:
                return GateResult(
                    condition=condition,
                    actual_value=current_value,
                    status=GateStatus.WARNING,
                    message="베이스라인 없음 - 델타 비교 불가",
                )
            if metric not in baseline:
                return GateResult(
                    condition=condition,
                    actual_value=current_value,
                    status=GateStatus.WARNING,
                    message=f"{metric}: 베이스라인에 메트릭 없음 - 델타 비교 불가",
                )
            actual = current_value - baseline[metric]
            label = f"{metric} (delta)"
        else:
            actual = current_value
            label = metric

        passed = self._compare(actual, condition.operator, condition.threshold)

        # Show the real comparison symbol; fall back to the raw operator name
        # when it is not one of the known operators.
        base_op = condition.operator.removeprefix("delta_")
        symbol = self._OPERATOR_SYMBOLS.get(base_op, base_op)

        return GateResult(
            condition=condition,
            actual_value=actual,
            status=GateStatus.PASSED if passed else GateStatus.FAILED,
            message=f"{label}: {actual:.4f} {symbol} {condition.threshold}",
        )

    @staticmethod
    def _compare(value: float, operator: str, threshold: float) -> bool:
        """Return whether ``value <operator> threshold`` holds.

        A leading "delta_" is ignored. "eq" uses an absolute tolerance of
        1e-6. Unknown operators never pass.
        """
        op = operator.removeprefix("delta_")
        if op == "gte":
            return value >= threshold
        if op == "lte":
            return value <= threshold
        if op == "eq":
            return abs(value - threshold) < 1e-6
        return False
 
 
# Example production quality gate.
# Each spec below is expanded into a GateCondition; order is preserved.
PRODUCTION_GATE = QualityGate(
    conditions=[
        GateCondition(**spec)
        for spec in (
            {
                # Absolute floor on accuracy.
                "name": "최소 정확도",
                "metric": "accuracy",
                "operator": "gte",
                "threshold": 0.85,
                "blocking": True,
                "description": "정확도가 85% 미만이면 배포를 차단합니다.",
            },
            {
                # Accuracy regression relative to the baseline.
                "name": "정확도 회귀",
                "metric": "accuracy",
                "operator": "delta_gte",
                "threshold": -0.02,
                "blocking": True,
                "description": "이전 대비 정확도가 2%p 이상 하락하면 배포를 차단합니다.",
            },
            {
                # Zero tolerance for PII leaks.
                "name": "PII 노출 없음",
                "metric": "pii_leak_rate",
                "operator": "lte",
                "threshold": 0.0,
                "blocking": True,
                "description": "PII 노출이 하나라도 발생하면 배포를 차단합니다.",
            },
            {
                # Latency budget; non-blocking.
                "name": "지연시간",
                "metric": "p99_latency_ms",
                "operator": "lte",
                "threshold": 3000,
                "blocking": False,
                "description": "P99 지연시간 3초 초과 시 경고합니다.",
            },
        )
    ]
)

Info

품질 게이트를 처음 도입할 때는 모든 조건을 비차단(non-blocking)으로 시작하세요. 2-4주간 데이터를 수집하면서 적절한 임계값을 파악한 후, 핵심 조건만 차단(blocking)으로 전환합니다. 처음부터 엄격한 게이트를 설정하면 팀의 배포 속도가 급격히 저하될 수 있습니다.

GitHub Actions 기반 평가 워크플로우

프롬프트 변경 감지 + 자동 평가

.github/workflows/llm-eval.yml

yaml

name: LLM Evaluation Pipeline

on:
  pull_request:
    paths:
      - 'prompts/**'
      - 'src/llm/**'
      - 'config/model_config.yaml'

# Least-privilege token: read the repository and changed files, post the PR comment.
# NOTE(review): on pull requests from forks the token stays read-only, so the
# comment step cannot post there.
permissions:
  contents: read
  pull-requests: write

jobs:
  detect-changes:
    runs-on: ubuntu-latest
    outputs:
      prompts_changed: ${{ steps.changes.outputs.prompts }}
      model_changed: ${{ steps.changes.outputs.model }}
      code_changed: ${{ steps.changes.outputs.code }}
    steps:
      - uses: actions/checkout@v4
      - uses: dorny/paths-filter@v3
        id: changes
        with:
          filters: |
            prompts:
              - 'prompts/**'
            model:
              - 'config/model_config.yaml'
            code:
              - 'src/llm/**'

  run-evaluation:
    needs: detect-changes
    # 'src/llm/**' triggers this workflow, so it must also trigger the evaluation;
    # otherwise code-only changes start the workflow and then skip this job.
    if: needs.detect-changes.outputs.prompts_changed == 'true' || needs.detect-changes.outputs.model_changed == 'true' || needs.detect-changes.outputs.code_changed == 'true'
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.12'

      - name: Install dependencies
        run: pip install -r requirements-eval.txt

      - name: Load baseline metrics
        id: baseline
        env:
          # Pass the ref through the environment instead of interpolating it
          # into the script text, so it is never parsed as shell code.
          BASE_REF: ${{ github.base_ref }}
        run: |
          python scripts/load_baseline.py \
            --branch "$BASE_REF" \
            --output baseline_metrics.json

      - name: Run evaluation
        env:
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
        run: |
          python scripts/run_eval.py \
            --config eval_config.yaml \
            --output current_metrics.json \
            --golden-dataset data/golden_v1.2.json

      - name: Check quality gate
        id: gate
        run: |
          python scripts/check_gate.py \
            --current current_metrics.json \
            --baseline baseline_metrics.json \
            --gate-config quality_gate.yaml \
            --output gate_result.json

      - name: Post PR comment
        uses: actions/github-script@v7
        with:
          script: |
            const fs = require('fs');
            const result = JSON.parse(fs.readFileSync('gate_result.json', 'utf8'));

            let body = `## LLM 평가 결과\n\n`;
            body += `상태: **${result.overall_status}**\n\n`;
            body += `| 조건 | 결과 | 값 | 임계값 |\n`;
            body += `|------|------|-----|--------|\n`;

            for (const r of result.details) {
              const icon = r.status === 'passed' ? '[PASS]' : r.status === 'failed' ? '[FAIL]' : '[WARN]';
              // Guard against a missing/non-numeric value so one bad row cannot abort the comment.
              const value = typeof r.actual_value === 'number' ? r.actual_value.toFixed(4) : 'n/a';
              body += `| ${r.name} | ${icon} | ${value} | ${r.threshold} |\n`;
            }

            await github.rest.issues.createComment({
              issue_number: context.issue.number,
              owner: context.repo.owner,
              repo: context.repo.repo,
              body: body
            });

      # NOTE(review): this condition only fires if scripts/check_gate.py writes
      # `status=<value>` to $GITHUB_OUTPUT; the script is not visible here - confirm,
      # otherwise the gate never blocks.
      - name: Enforce gate
        if: steps.gate.outputs.status == 'failed'
        run: |
          echo "품질 게이트 실패 - 차단 조건이 충족되지 않았습니다."
          exit 1

promptfoo를 활용한 간소화

promptfoo는 GitHub Actions 통합을 네이티브로 지원합니다.

.github/workflows/promptfoo-eval.yml

yaml

name: Prompt Evaluation

on:
  pull_request:
    paths:
      - 'prompts/**'

jobs:
  evaluate:
    runs-on: ubuntu-latest
    # Posting the results comment on the pull request needs write access to PRs.
    permissions:
      contents: read
      pull-requests: write
    steps:
      - uses: actions/checkout@v4

      - name: Run promptfoo evaluation
        uses: promptfoo/promptfoo-action@v1
        with:
          config: promptfooconfig.yaml
          # NOTE(review): confirm `cache` is a supported input of promptfoo-action@v1;
          # the published examples configure caching through `cache-path` instead.
          cache: false
        env:
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}

      # NOTE(review): verify that the `promptfoo/promptfoo-action/comment` sub-action
      # exists. The published examples pass `github-token` to promptfoo-action itself,
      # which then posts the PR comment.
      - name: Comment results
        uses: promptfoo/promptfoo-action/comment@v1
        with:
          github-token: ${{ secrets.GITHUB_TOKEN }}

Tip

CI에서 LLM 평가를 실행할 때 비용 관리가 중요합니다. 골든 데이터셋의 크기를 50-100개로 제한하고, 비용이 저렴한 모델(GPT-4o-mini 등)을 LLM-as-Judge로 사용하면 PR당 평가 비용을 $1 이하로 유지할 수 있습니다.

회귀 테스트 자동화

베이스라인 관리

baseline_manager.py

python

import json
from pathlib import Path
from datetime import datetime
 
 
class BaselineManager:
    """평가 베이스라인을 관리합니다."""
    
    def __init__(self, storage_dir: str):
        self.storage_dir = Path(storage_dir)
        self.storage_dir.mkdir(parents=True, exist_ok=True)
    
    def save_baseline(
        self,
        metrics: dict[str, float],
        branch: str,
        commit_sha: str,
    ) -> None:
        """현재 메트릭을 베이스라인으로 저장합니다."""
        baseline = {
            "metrics": metrics,
            "branch": branch,
            "commit_sha": commit_sha,
            "timestamp": datetime.now().isoformat(),
        }
        
        # 브랜치별 최신 베이스라인
        path = self.storage_dir / f"{branch}_latest.json"
        with open(path, "w") as f:
            json.dump(baseline, f, indent=2)
        
        # 이력 보관
        history_path = self.storage_dir / f"{branch}_{commit_sha[:8]}.json"
        with open(history_path, "w") as f:
            json.dump(baseline, f, indent=2)
    
    def load_baseline(self, branch: str) -> dict[str, float] | None:
        """브랜치의 최신 베이스라인을 로드합니다."""
        path = self.storage_dir / f"{branch}_latest.json"
        if not path.exists():
            return None
        
        with open(path) as f:
            data = json.load(f)
        return data["metrics"]
    
    def compare_with_baseline(
        self,
        current: dict[str, float],
        branch: str,
    ) -> dict[str, dict]:
        """현재 메트릭을 베이스라인과 비교합니다."""
        baseline = self.load_baseline(branch)
        if baseline is None:
            return {"status": "no_baseline", "details": {}}
        
        comparison = {}
        for metric, value in current.items():
            baseline_value = baseline.get(metric)
            if baseline_value is not None:
                delta = value - baseline_value
                comparison[metric] = {
                    "current": value,
                    "baseline": baseline_value,
                    "delta": delta,
                    "relative_change": delta / baseline_value if baseline_value != 0 else 0,
                    "regression": delta < -0.01,  # 1%p 이상 하락
                }
        
        return {"status": "compared", "details": comparison}

프롬프트 변경 감지

프롬프트 파일이 변경되었을 때 자동으로 영향 범위를 분석하고 관련 평가를 트리거합니다.

prompt_change_detector.py

python

import subprocess
from pathlib import Path
 
 
class PromptChangeDetector:
    """Detect changed prompt files and map them to the evaluations they require."""

    def __init__(self, prompt_dir: str = "prompts/"):
        """Remember the prompt directory and set up the file -> tasks mapping."""
        self.prompt_dir = Path(prompt_dir)
        # Mapping from prompt file to the evaluation tasks it affects.
        self.prompt_task_mapping: dict[str, list[str]] = {
            "prompts/customer_support.txt": ["support_accuracy", "support_tone"],
            "prompts/summarization.txt": ["summary_quality", "summary_faithfulness"],
            "prompts/system_common.txt": ["all"],  # shared prompt: run every evaluation
        }

    def get_changed_prompts(self, base_ref: str = "main") -> list[str]:
        """Return the prompt files that differ from *base_ref*.

        Raises:
            subprocess.CalledProcessError: If ``git diff`` fails (for example
                when *base_ref* is unknown). Failing loudly prevents a broken
                diff from being mistaken for "nothing changed".
        """
        result = subprocess.run(
            # -z emits NUL-separated, unquoted paths so non-ASCII file names
            # come back verbatim.
            ["git", "diff", "--name-only", "-z", base_ref, "--", str(self.prompt_dir)],
            capture_output=True,
            text=True,
            encoding="utf-8",
            check=True,
        )
        return self._parse_changed_files(result.stdout)

    @staticmethod
    def _parse_changed_files(output: str) -> list[str]:
        """Split NUL-separated ``git diff -z`` output into a list of paths."""
        return [name for name in output.split("\0") if name.strip()]

    def get_required_evaluations(self, changed_files: list[str]) -> set[str]:
        """Return the evaluation tasks required by *changed_files*.

        Files without a mapping contribute ``"unknown_prompt_eval"``. As soon
        as any file maps to ``"all"``, the result is exactly ``{"all"}``.
        """
        tasks: set[str] = set()

        for file in changed_files:
            mapped_tasks = self.prompt_task_mapping.get(file, ["unknown_prompt_eval"])
            if "all" in mapped_tasks:
                return {"all"}
            tasks.update(mapped_tasks)

        return tasks

드리프트 모니터링 통합

CI/CD는 배포 전 검증을 담당하지만, 배포 후 모니터링도 필수적입니다.

drift_monitor.py

python

from datetime import datetime, timedelta
 
 
class DriftMonitor:
    """프로덕션 환경의 품질 드리프트를 모니터링합니다."""
    
    def __init__(self, alert_threshold: float = 0.05):
        self.alert_threshold = alert_threshold
        self.metric_history: list[dict] = []
    
    def record_metric(self, metric_name: str, value: float, timestamp: datetime | None = None) -> None:
        """메트릭 값을 기록합니다."""
        self.metric_history.append({
            "metric": metric_name,
            "value": value,
            "timestamp": (timestamp or datetime.now()).isoformat(),
        })
    
    def check_drift(self, metric_name: str, window_days: int = 7) -> dict:
        """최근 윈도우 기간의 메트릭 변화를 분석합니다."""
        cutoff = datetime.now() - timedelta(days=window_days)
        
        recent = [
            entry["value"]
            for entry in self.metric_history
            if entry["metric"] == metric_name
            and datetime.fromisoformat(entry["timestamp"]) >= cutoff
        ]
        
        older = [
            entry["value"]
            for entry in self.metric_history
            if entry["metric"] == metric_name
            and datetime.fromisoformat(entry["timestamp"]) < cutoff
        ]
        
        if not recent or not older:
            return {"status": "insufficient_data"}
        
        import numpy as np
        recent_mean = np.mean(recent)
        older_mean = np.mean(older)
        drift = recent_mean - older_mean
        
        return {
            "status": "drift_detected" if abs(drift) > self.alert_threshold else "stable",
            "recent_mean": recent_mean,
            "baseline_mean": older_mean,
            "drift": drift,
            "sample_sizes": {"recent": len(recent), "baseline": len(older)},
        }

종합 프로젝트: 전체 파이프라인

시리즈에서 다룬 모든 개념을 통합한 프로젝트 구조입니다.

llm-eval-pipeline/
  config/
    quality_gate.yaml       # 품질 게이트 설정
    eval_config.yaml        # 평가 설정
    model_config.yaml       # 모델 설정
  data/
    golden/                 # 골든 데이터셋 (버전별)
    baselines/              # 베이스라인 메트릭
  prompts/
    customer_support.txt    # 프롬프트 파일
    summarization.txt
  src/
    eval/
      harness.py            # 평가 하네스 코어
      metrics.py            # 메트릭 정의
      llm_judge.py          # LLM-as-Judge
      quality_gate.py       # 품질 게이트
    monitor/
      drift.py              # 드리프트 모니터링
      alerting.py           # 알림 시스템
  scripts/
    run_eval.py             # 평가 실행 스크립트
    check_gate.py           # 게이트 검증 스크립트
    load_baseline.py        # 베이스라인 로드
  .github/
    workflows/
      llm-eval.yml          # 평가 CI 워크플로우
  tests/
    test_metrics.py         # 메트릭 단위 테스트
    test_gate.py            # 게이트 로직 테스트

Warning

CI/CD 평가 파이프라인에서 API 키와 같은 시크릿은 반드시 GitHub Secrets 또는 Vault 등의 안전한 저장소를 통해 관리해야 합니다. 평가 결과 로그에 API 키나 민감한 데이터가 노출되지 않도록 주의하세요.

시리즈를 마치며

이 시리즈를 통해 AI 평가 하네스와 벤치마킹 시스템의 전체 스펙트럼을 살펴보았습니다.

1장에서 평가의 필요성과 생태계를 조망하고, 2장에서 아키텍처의 핵심 개념을 이해했습니다. 3-5장에서는 lm-evaluation-harness, HELM, Inspect AI를 심층 분석했고, 6장에서 실무 도구 생태계를 비교했습니다. 7장에서 커스텀 하네스를 구축하고, 8장에서 벤치마크 설계 원칙을 배웠으며, 9장에서 모델 비교 파이프라인을 자동화했습니다. 마지막으로 이 장에서 CI/CD 통합까지 완성했습니다.

AI 시스템의 품질은 평가의 품질에 의해 결정됩니다. 좋은 평가 없이는 좋은 시스템을 만들 수 없습니다. 이 시리즈가 여러분의 AI 시스템에 견고한 평가 체계를 구축하는 데 도움이 되기를 바랍니다.

핵심 요약

CI/CD에 LLM 평가를 통합하면 프롬프트 변경이나 모델 교체의 영향을 배포 전에 자동으로 감지할 수 있습니다.
품질 게이트는 처음에 비차단으로 시작하여 데이터를 수집한 후 핵심 조건만 차단으로 전환하는 것이 실전적입니다.
GitHub Actions와 promptfoo를 조합하면 프롬프트 변경 시 자동 평가와 PR 코멘트를 빠르게 구축할 수 있습니다.
배포 전 CI/CD 검증과 배포 후 드리프트 모니터링을 결합하면 LLM 시스템의 품질을 지속적으로 보장할 수 있습니다.
LLM-as-Judge를 CI 차단 조건으로 사용하려면 인간 평가자와 80% 이상의 일치율이 전제되어야 합니다.

이 글이 도움이 되셨나요?

AI / ML

9장: 자동화된 모델 비교 파이프라인

ELO 레이팅과 리더보드 구현, A/B 테스트 자동화, 비용/지연시간/품질 트레이드오프 분석, 모델 선택 자동화, 비교 리포트 자동 생성까지 모델 비교 파이프라인을 구축합니다.

2026년 3월 18일·17분

AI / ML

8장: 벤치마크 스위트 설계 원칙과 실전

벤치마크 오염 문제, 좋은 벤치마크의 조건, 다차원 평가 설계, 도메인별 벤치마크 구축, 데이터셋 버전 관리, 통계적 유의성 검증까지 벤치마크 스위트 설계의 전체를 다룹니다.

2026년 3월 16일·18분

AI / ML

7장: 커스텀 평가 하네스 설계와 구축

도메인 특화 평가 하네스를 처음부터 설계하고 구축합니다. 평가 태스크 설계, 메트릭 정의, LLM-as-Judge 구현, 인간 평가 통합, Golden Dataset 관리를 코드와 함께 실습합니다.

2026년 3월 14일·19분

2026년 3월 20일·AI / ML·

10장: CI/CD 통합과 품질 게이트 구축

18분 · 1,361자 · 10개 섹션

ai evaluation mlops

ai-eval-harness 10 / 10

1 2 3 4 5 6 7 8 9 10

이전 글: 9장: 자동화된 모델 비교 파이프라인

이 장에서 배울 내용

CI/CD 파이프라인에 LLM 평가를 통합하는 아키텍처
품질 게이트(Quality Gate)의 설계 원칙과 구현
GitHub Actions 기반 평가 워크플로우 구축
프롬프트 변경 감지와 자동 회귀 테스트
프로덕션 드리프트 모니터링 통합
시리즈 종합 프로젝트: 전체 파이프라인 구축

python

from dataclasses import dataclass
from enum import Enum
 
 
class GateStatus(Enum):
    """Outcome of a quality-gate check.

    Used both for a single condition's result and for the overall verdict
    returned by ``QualityGate.evaluate``.
    """
    PASSED = "passed"
    FAILED = "failed"
    WARNING = "warning"
 
 
@dataclass
class GateCondition:
    """A single condition checked by a quality gate."""
    # Human-readable name of the condition.
    name: str
    # Key used to look the value up in the metrics dictionaries.
    metric: str
    operator: str       # "gte", "lte", "eq", "delta_gte", "delta_lte"
    # Value the metric (or its delta, for "delta_*" operators) is compared against.
    threshold: float
    blocking: bool       # if True, a failure blocks deployment
    # Free-text explanation of what the condition enforces.
    description: str
 
 
@dataclass
class GateResult:
    """Result of evaluating one quality-gate condition."""
    # The condition that was evaluated.
    condition: GateCondition
    # The value that was compared: the current metric value, or the
    # current-minus-baseline delta for "delta_*" operators with a baseline.
    actual_value: float
    # Outcome of this single condition.
    status: GateStatus
    # Human-readable summary of the comparison.
    message: str
 
 
class QualityGate:
    """Evaluate metrics against an ordered list of gate conditions.

    The overall verdict is FAILED when any blocking condition fails, WARNING
    when the only problems are non-blocking failures or checks that could not
    be performed, and PASSED otherwise.
    """

    # Display symbols for the base comparison operators (after "delta_" is stripped).
    _OPERATOR_SYMBOLS = {"gte": ">=", "lte": "<=", "eq": "=="}

    def __init__(self, conditions: list[GateCondition]):
        """Store the conditions; they are evaluated in the given order."""
        self.conditions = conditions

    def evaluate(
        self,
        current_metrics: dict[str, float],
        baseline_metrics: dict[str, float] | None = None,
    ) -> tuple[GateStatus, list[GateResult]]:
        """Evaluate every condition and derive the overall gate status.

        Args:
            current_metrics: Metric name -> value for the candidate being gated.
            baseline_metrics: Metric name -> value of the reference run, or
                None when no baseline is available ("delta_*" conditions then
                produce a WARNING instead of being compared).

        Returns:
            A ``(overall_status, results)`` tuple with one result per
            condition, in condition order.
        """
        results = [
            self._check_condition(condition, current_metrics, baseline_metrics)
            for condition in self.conditions
        ]

        overall_status = GateStatus.PASSED
        for result in results:
            if result.status == GateStatus.PASSED:
                continue
            if result.status == GateStatus.FAILED and result.condition.blocking:
                overall_status = GateStatus.FAILED
            elif overall_status == GateStatus.PASSED:
                # A non-blocking failure or an unperformed check must still be
                # visible in the verdict, but it never overrides FAILED.
                overall_status = GateStatus.WARNING

        return overall_status, results

    def _check_condition(
        self,
        condition: GateCondition,
        current: dict[str, float],
        baseline: dict[str, float] | None,
    ) -> GateResult:
        """Check a single condition against the current (and baseline) metrics.

        A metric missing from ``current`` fails the condition outright rather
        than being treated as 0.0, which would silently satisfy "lte"
        conditions. A missing baseline (or a metric missing from it) yields a
        WARNING because the delta cannot be computed.
        """
        metric = condition.metric

        if metric not in current:
            return GateResult(
                condition=condition,
                actual_value=0.0,
                status=GateStatus.FAILED,
                message=f"{metric}: 현재 메트릭 없음 - 조건 평가 불가",
            )
        current_value = current[metric]

        if condition.operator.startswith("delta_"):
            if baseline is None:
                return GateResult(
                    condition=condition,
                    actual_value=current_value,
                    status=GateStatus.WARNING,
                    message="베이스라인 없음 - 델타 비교 불가",
                )
            if metric not in baseline:
                return GateResult(
                    condition=condition,
                    actual_value=current_value,
                    status=GateStatus.WARNING,
                    message=f"{metric}: 베이스라인에 메트릭 없음 - 델타 비교 불가",
                )
            actual = current_value - baseline[metric]
            label = f"{metric} (delta)"
        else:
            actual = current_value
            label = metric

        passed = self._compare(actual, condition.operator, condition.threshold)

        # Show the real comparison symbol; fall back to the raw operator name
        # when it is not one of the known operators.
        base_op = condition.operator.removeprefix("delta_")
        symbol = self._OPERATOR_SYMBOLS.get(base_op, base_op)

        return GateResult(
            condition=condition,
            actual_value=actual,
            status=GateStatus.PASSED if passed else GateStatus.FAILED,
            message=f"{label}: {actual:.4f} {symbol} {condition.threshold}",
        )

    @staticmethod
    def _compare(value: float, operator: str, threshold: float) -> bool:
        """Return whether ``value <operator> threshold`` holds.

        A leading "delta_" is ignored. "eq" uses an absolute tolerance of
        1e-6. Unknown operators never pass.
        """
        op = operator.removeprefix("delta_")
        if op == "gte":
            return value >= threshold
        if op == "lte":
            return value <= threshold
        if op == "eq":
            return abs(value - threshold) < 1e-6
        return False
 
 
# Example production quality gate.
# Each spec below is expanded into a GateCondition; order is preserved.
PRODUCTION_GATE = QualityGate(
    conditions=[
        GateCondition(**spec)
        for spec in (
            {
                # Absolute floor on accuracy.
                "name": "최소 정확도",
                "metric": "accuracy",
                "operator": "gte",
                "threshold": 0.85,
                "blocking": True,
                "description": "정확도가 85% 미만이면 배포를 차단합니다.",
            },
            {
                # Accuracy regression relative to the baseline.
                "name": "정확도 회귀",
                "metric": "accuracy",
                "operator": "delta_gte",
                "threshold": -0.02,
                "blocking": True,
                "description": "이전 대비 정확도가 2%p 이상 하락하면 배포를 차단합니다.",
            },
            {
                # Zero tolerance for PII leaks.
                "name": "PII 노출 없음",
                "metric": "pii_leak_rate",
                "operator": "lte",
                "threshold": 0.0,
                "blocking": True,
                "description": "PII 노출이 하나라도 발생하면 배포를 차단합니다.",
            },
            {
                # Latency budget; non-blocking.
                "name": "지연시간",
                "metric": "p99_latency_ms",
                "operator": "lte",
                "threshold": 3000,
                "blocking": False,
                "description": "P99 지연시간 3초 초과 시 경고합니다.",
            },
        )
    ]
)

Info

GitHub Actions 기반 평가 워크플로우

프롬프트 변경 감지 + 자동 평가

.github/workflows/llm-eval.yml

yaml

name: LLM Evaluation Pipeline

on:
  pull_request:
    paths:
      - 'prompts/**'
      - 'src/llm/**'
      - 'config/model_config.yaml'

# Least-privilege token: read the repository and changed files, post the PR comment.
# NOTE(review): on pull requests from forks the token stays read-only, so the
# comment step cannot post there.
permissions:
  contents: read
  pull-requests: write

jobs:
  detect-changes:
    runs-on: ubuntu-latest
    outputs:
      prompts_changed: ${{ steps.changes.outputs.prompts }}
      model_changed: ${{ steps.changes.outputs.model }}
      code_changed: ${{ steps.changes.outputs.code }}
    steps:
      - uses: actions/checkout@v4
      - uses: dorny/paths-filter@v3
        id: changes
        with:
          filters: |
            prompts:
              - 'prompts/**'
            model:
              - 'config/model_config.yaml'
            code:
              - 'src/llm/**'

  run-evaluation:
    needs: detect-changes
    # 'src/llm/**' triggers this workflow, so it must also trigger the evaluation;
    # otherwise code-only changes start the workflow and then skip this job.
    if: needs.detect-changes.outputs.prompts_changed == 'true' || needs.detect-changes.outputs.model_changed == 'true' || needs.detect-changes.outputs.code_changed == 'true'
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.12'

      - name: Install dependencies
        run: pip install -r requirements-eval.txt

      - name: Load baseline metrics
        id: baseline
        env:
          # Pass the ref through the environment instead of interpolating it
          # into the script text, so it is never parsed as shell code.
          BASE_REF: ${{ github.base_ref }}
        run: |
          python scripts/load_baseline.py \
            --branch "$BASE_REF" \
            --output baseline_metrics.json

      - name: Run evaluation
        env:
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
        run: |
          python scripts/run_eval.py \
            --config eval_config.yaml \
            --output current_metrics.json \
            --golden-dataset data/golden_v1.2.json

      - name: Check quality gate
        id: gate
        run: |
          python scripts/check_gate.py \
            --current current_metrics.json \
            --baseline baseline_metrics.json \
            --gate-config quality_gate.yaml \
            --output gate_result.json

      - name: Post PR comment
        uses: actions/github-script@v7
        with:
          script: |
            const fs = require('fs');
            const result = JSON.parse(fs.readFileSync('gate_result.json', 'utf8'));

            let body = `## LLM 평가 결과\n\n`;
            body += `상태: **${result.overall_status}**\n\n`;
            body += `| 조건 | 결과 | 값 | 임계값 |\n`;
            body += `|------|------|-----|--------|\n`;

            for (const r of result.details) {
              const icon = r.status === 'passed' ? '[PASS]' : r.status === 'failed' ? '[FAIL]' : '[WARN]';
              // Guard against a missing/non-numeric value so one bad row cannot abort the comment.
              const value = typeof r.actual_value === 'number' ? r.actual_value.toFixed(4) : 'n/a';
              body += `| ${r.name} | ${icon} | ${value} | ${r.threshold} |\n`;
            }

            await github.rest.issues.createComment({
              issue_number: context.issue.number,
              owner: context.repo.owner,
              repo: context.repo.repo,
              body: body
            });

      # NOTE(review): this condition only fires if scripts/check_gate.py writes
      # `status=<value>` to $GITHUB_OUTPUT; the script is not visible here - confirm,
      # otherwise the gate never blocks.
      - name: Enforce gate
        if: steps.gate.outputs.status == 'failed'
        run: |
          echo "품질 게이트 실패 - 차단 조건이 충족되지 않았습니다."
          exit 1

promptfoo를 활용한 간소화

promptfoo는 GitHub Actions 통합을 네이티브로 지원합니다.

.github/workflows/promptfoo-eval.yml

yaml

name: Prompt Evaluation

on:
  pull_request:
    paths:
      - 'prompts/**'

jobs:
  evaluate:
    runs-on: ubuntu-latest
    # Posting the results comment on the pull request needs write access to PRs.
    permissions:
      contents: read
      pull-requests: write
    steps:
      - uses: actions/checkout@v4

      - name: Run promptfoo evaluation
        uses: promptfoo/promptfoo-action@v1
        with:
          config: promptfooconfig.yaml
          # NOTE(review): confirm `cache` is a supported input of promptfoo-action@v1;
          # the published examples configure caching through `cache-path` instead.
          cache: false
        env:
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}

      # NOTE(review): verify that the `promptfoo/promptfoo-action/comment` sub-action
      # exists. The published examples pass `github-token` to promptfoo-action itself,
      # which then posts the PR comment.
      - name: Comment results
        uses: promptfoo/promptfoo-action/comment@v1
        with:
          github-token: ${{ secrets.GITHUB_TOKEN }}

Tip

회귀 테스트 자동화

베이스라인 관리

baseline_manager.py

python

import json
from pathlib import Path
from datetime import datetime
 
 
class BaselineManager:
    """평가 베이스라인을 관리합니다."""
    
    def __init__(self, storage_dir: str):
        self.storage_dir = Path(storage_dir)
        self.storage_dir.mkdir(parents=True, exist_ok=True)
    
    def save_baseline(
        self,
        metrics: dict[str, float],
        branch: str,
        commit_sha: str,
    ) -> None:
        """현재 메트릭을 베이스라인으로 저장합니다."""
        baseline = {
            "metrics": metrics,
            "branch": branch,
            "commit_sha": commit_sha,
            "timestamp": datetime.now().isoformat(),
        }
        
        # 브랜치별 최신 베이스라인
        path = self.storage_dir / f"{branch}_latest.json"
        with open(path, "w") as f:
            json.dump(baseline, f, indent=2)
        
        # 이력 보관
        history_path = self.storage_dir / f"{branch}_{commit_sha[:8]}.json"
        with open(history_path, "w") as f:
            json.dump(baseline, f, indent=2)
    
    def load_baseline(self, branch: str) -> dict[str, float] | None:
        """브랜치의 최신 베이스라인을 로드합니다."""
        path = self.storage_dir / f"{branch}_latest.json"
        if not path.exists():
            return None
        
        with open(path) as f:
            data = json.load(f)
        return data["metrics"]
    
    def compare_with_baseline(
        self,
        current: dict[str, float],
        branch: str,
    ) -> dict[str, dict]:
        """현재 메트릭을 베이스라인과 비교합니다."""
        baseline = self.load_baseline(branch)
        if baseline is None:
            return {"status": "no_baseline", "details": {}}
        
        comparison = {}
        for metric, value in current.items():
            baseline_value = baseline.get(metric)
            if baseline_value is not None:
                delta = value - baseline_value
                comparison[metric] = {
                    "current": value,
                    "baseline": baseline_value,
                    "delta": delta,
                    "relative_change": delta / baseline_value if baseline_value != 0 else 0,
                    "regression": delta < -0.01,  # 1%p 이상 하락
                }
        
        return {"status": "compared", "details": comparison}

프롬프트 변경 감지

프롬프트 파일이 변경되었을 때 자동으로 영향 범위를 분석하고 관련 평가를 트리거합니다.

prompt_change_detector.py

python

import subprocess
from pathlib import Path
 
 
class PromptChangeDetector:
    """Detect changed prompt files and map them to the evaluations they require."""

    def __init__(self, prompt_dir: str = "prompts/"):
        """Remember the prompt directory and set up the file -> tasks mapping."""
        self.prompt_dir = Path(prompt_dir)
        # Mapping from prompt file to the evaluation tasks it affects.
        self.prompt_task_mapping: dict[str, list[str]] = {
            "prompts/customer_support.txt": ["support_accuracy", "support_tone"],
            "prompts/summarization.txt": ["summary_quality", "summary_faithfulness"],
            "prompts/system_common.txt": ["all"],  # shared prompt: run every evaluation
        }

    def get_changed_prompts(self, base_ref: str = "main") -> list[str]:
        """Return the prompt files that differ from *base_ref*.

        Raises:
            subprocess.CalledProcessError: If ``git diff`` fails (for example
                when *base_ref* is unknown). Failing loudly prevents a broken
                diff from being mistaken for "nothing changed".
        """
        result = subprocess.run(
            # -z emits NUL-separated, unquoted paths so non-ASCII file names
            # come back verbatim.
            ["git", "diff", "--name-only", "-z", base_ref, "--", str(self.prompt_dir)],
            capture_output=True,
            text=True,
            encoding="utf-8",
            check=True,
        )
        return self._parse_changed_files(result.stdout)

    @staticmethod
    def _parse_changed_files(output: str) -> list[str]:
        """Split NUL-separated ``git diff -z`` output into a list of paths."""
        return [name for name in output.split("\0") if name.strip()]

    def get_required_evaluations(self, changed_files: list[str]) -> set[str]:
        """Return the evaluation tasks required by *changed_files*.

        Files without a mapping contribute ``"unknown_prompt_eval"``. As soon
        as any file maps to ``"all"``, the result is exactly ``{"all"}``.
        """
        tasks: set[str] = set()

        for file in changed_files:
            mapped_tasks = self.prompt_task_mapping.get(file, ["unknown_prompt_eval"])
            if "all" in mapped_tasks:
                return {"all"}
            tasks.update(mapped_tasks)

        return tasks

드리프트 모니터링 통합

CI/CD는 배포 전 검증을 담당하지만, 배포 후 모니터링도 필수적입니다.

drift_monitor.py

python

from datetime import datetime, timedelta
 
 
class DriftMonitor:
    """프로덕션 환경의 품질 드리프트를 모니터링합니다."""
    
    def __init__(self, alert_threshold: float = 0.05):
        self.alert_threshold = alert_threshold
        self.metric_history: list[dict] = []
    
    def record_metric(self, metric_name: str, value: float, timestamp: datetime | None = None) -> None:
        """메트릭 값을 기록합니다."""
        self.metric_history.append({
            "metric": metric_name,
            "value": value,
            "timestamp": (timestamp or datetime.now()).isoformat(),
        })
    
    def check_drift(self, metric_name: str, window_days: int = 7) -> dict:
        """최근 윈도우 기간의 메트릭 변화를 분석합니다."""
        cutoff = datetime.now() - timedelta(days=window_days)
        
        recent = [
            entry["value"]
            for entry in self.metric_history
            if entry["metric"] == metric_name
            and datetime.fromisoformat(entry["timestamp"]) >= cutoff
        ]
        
        older = [
            entry["value"]
            for entry in self.metric_history
            if entry["metric"] == metric_name
            and datetime.fromisoformat(entry["timestamp"]) < cutoff
        ]
        
        if not recent or not older:
            return {"status": "insufficient_data"}
        
        import numpy as np
        recent_mean = np.mean(recent)
        older_mean = np.mean(older)
        drift = recent_mean - older_mean
        
        return {
            "status": "drift_detected" if abs(drift) > self.alert_threshold else "stable",
            "recent_mean": recent_mean,
            "baseline_mean": older_mean,
            "drift": drift,
            "sample_sizes": {"recent": len(recent), "baseline": len(older)},
        }

종합 프로젝트: 전체 파이프라인

시리즈에서 다룬 모든 개념을 통합한 프로젝트 구조입니다.

llm-eval-pipeline/
  config/
    quality_gate.yaml       # 품질 게이트 설정
    eval_config.yaml        # 평가 설정
    model_config.yaml       # 모델 설정
  data/
    golden/                 # 골든 데이터셋 (버전별)
    baselines/              # 베이스라인 메트릭
  prompts/
    customer_support.txt    # 프롬프트 파일
    summarization.txt
  src/
    eval/
      harness.py            # 평가 하네스 코어
      metrics.py            # 메트릭 정의
      llm_judge.py          # LLM-as-Judge
      quality_gate.py       # 품질 게이트
    monitor/
      drift.py              # 드리프트 모니터링
      alerting.py           # 알림 시스템
  scripts/
    run_eval.py             # 평가 실행 스크립트
    check_gate.py           # 게이트 검증 스크립트
    load_baseline.py        # 베이스라인 로드
  .github/
    workflows/
      llm-eval.yml          # 평가 CI 워크플로우
  tests/
    test_metrics.py         # 메트릭 단위 테스트
    test_gate.py            # 게이트 로직 테스트

Warning

시리즈를 마치며

이 시리즈를 통해 AI 평가 하네스와 벤치마킹 시스템의 전체 스펙트럼을 살펴보았습니다.

핵심 요약

CI/CD에 LLM 평가를 통합하면 프롬프트 변경이나 모델 교체의 영향을 배포 전에 자동으로 감지할 수 있습니다.
품질 게이트는 처음에 비차단으로 시작하여 데이터를 수집한 후 핵심 조건만 차단으로 전환하는 것이 실전적입니다.
GitHub Actions와 promptfoo를 조합하면 프롬프트 변경 시 자동 평가와 PR 코멘트를 빠르게 구축할 수 있습니다.
배포 전 CI/CD 검증과 배포 후 드리프트 모니터링을 결합하면 LLM 시스템의 품질을 지속적으로 보장할 수 있습니다.
LLM-as-Judge를 CI 차단 조건으로 사용하려면 인간 평가자와 80% 이상의 일치율이 전제되어야 합니다.

이 글이 도움이 되셨나요?

AI / ML

10장: CI/CD 통합과 품질 게이트 구축

이 장에서 배울 내용

왜 CI/CD에 평가를 통합하는가

품질 게이트 설계

품질 게이트의 구성요소

GitHub Actions 기반 평가 워크플로우

프롬프트 변경 감지 + 자동 평가

promptfoo를 활용한 간소화

회귀 테스트 자동화

베이스라인 관리

프롬프트 변경 감지

드리프트 모니터링 통합

종합 프로젝트: 전체 파이프라인

시리즈를 마치며

핵심 요약

관련 글

9장: 자동화된 모델 비교 파이프라인

8장: 벤치마크 스위트 설계 원칙과 실전

7장: 커스텀 평가 하네스 설계와 구축

댓글

10장: CI/CD 통합과 품질 게이트 구축

이 장에서 배울 내용

왜 CI/CD에 평가를 통합하는가

품질 게이트 설계

품질 게이트의 구성요소

GitHub Actions 기반 평가 워크플로우

프롬프트 변경 감지 + 자동 평가

promptfoo를 활용한 간소화

회귀 테스트 자동화

베이스라인 관리

프롬프트 변경 감지

드리프트 모니터링 통합

종합 프로젝트: 전체 파이프라인

시리즈를 마치며

핵심 요약

관련 글

9장: 자동화된 모델 비교 파이프라인

8장: 벤치마크 스위트 설계 원칙과 실전

7장: 커스텀 평가 하네스 설계와 구축

댓글