2026년 1월 19일·AI / ML·

3장: 자동 평가 파이프라인 구축

코드 기반 메트릭과 벤치마크 자동화로 LLM 애플리케이션의 품질을 체계적으로 측정하는 평가 파이프라인을 구축합니다.

20분 · 1,258자 · 9개 섹션

llm evaluation monitoring observability testing

llm-evaluation3 / 10

1 2 3 4 5 6 7 8 9 10

이전: 2장: 평가 메트릭 설계 - 정확성, 관련성, 안전성 | 다음: 4장: LLM-as-Judge - LLM으로 LLM 평가하기

자동 평가 파이프라인의 구조

수동으로 LLM 출력을 하나씩 확인하는 방식은 확장할 수 없습니다. 프롬프트를 변경할 때마다, 모델을 교체할 때마다, 데이터 소스를 업데이트할 때마다 체계적인 평가를 실행하려면 자동화된 파이프라인이 필수입니다.

자동 평가 파이프라인은 평가 데이터셋 구축, 코드 기반 메트릭 계산, 평가 실행, 결과 분석 및 리포팅의 네 단계로 구성됩니다.

평가 데이터셋 구축

데이터셋 구조 설계

평가 데이터셋은 단순한 입출력 쌍을 넘어, 평가에 필요한 모든 맥락 정보를 포함해야 합니다.

python

from dataclasses import dataclass, field
from typing import Optional
 
@dataclass
class EvalCase:
    """A single evaluation case.

    ``id``, ``input_text`` and ``expected_output`` must be supplied by the
    caller; ``context`` and ``metadata`` have defaults.
    """
    id: str                          # unique identifier
    input_text: str                  # input passed to the LLM
    expected_output: Optional[str]   # expected (reference) output
    context: Optional[str] = None    # RAG context, when applicable
    metadata: dict = field(default_factory=dict)  # e.g. category, difficulty
 
@dataclass
class EvalDataset:
    """A named, versioned collection of evaluation cases."""
    name: str
    version: str
    cases: list  # list of EvalCase
    description: str = ""

    def filter_by_category(self, category: str) -> list:
        """Return the cases whose ``metadata["category"]`` equals *category*."""
        return [c for c in self.cases if c.metadata.get("category") == category]

    def sample(self, n: int, seed: int = 42) -> list:
        """Return up to *n* cases drawn reproducibly, without replacement.

        A private ``random.Random(seed)`` is used instead of reseeding the
        module-level generator, so calling this method does not disturb
        random state used elsewhere in the process. The selection is the
        same one that seeding the global generator with *seed* would give.
        When *n* exceeds the number of cases, all cases are returned (in
        shuffled order).
        """
        import random

        rng = random.Random(seed)
        return rng.sample(self.cases, min(n, len(self.cases)))

데이터셋 소싱 전략

평가 데이터셋을 구축하는 방법은 여러 가지가 있습니다.

프로덕션 로그에서 추출: 실제 사용자 질문을 수집하고, 전문가가 기대 답변을 작성합니다. 가장 현실적인 데이터를 확보할 수 있습니다.

도메인 전문가 작성: 해당 분야 전문가가 대표적인 질문과 답변을 작성합니다. 품질이 높지만 다양성이 부족할 수 있습니다.

합성 데이터 생성: LLM을 활용하여 다양한 질문-답변 쌍을 생성하고, 인간이 검증합니다.

python

def generate_synthetic_eval_data(
    domain: str,
    num_cases: int,
    difficulty_distribution: dict
) -> list:
    """Generate synthetic evaluation data with an LLM.

    One request is sent per entry of *difficulty_distribution* (difficulty
    label -> number of items to ask for). The parsed replies are
    concatenated in iteration order and the result is cut to at most
    *num_cases* items.
    """
    template = """다음 도메인에 대한 질문-답변 쌍을 생성하세요.
 
도메인: {domain}
난이도: {difficulty}
생성 개수: {count}
 
각 항목은 다음 형식으로 작성하세요:
질문: [질문 내용]
기대 답변: [모범 답변]
카테고리: [세부 카테고리]
 
질문은 실제 사용자가 할 법한 자연스러운 표현으로 작성하세요."""

    collected = []
    for level in difficulty_distribution:
        request = template.format(
            domain=domain,
            difficulty=level,
            count=difficulty_distribution[level],
        )
        reply = call_llm("claude-sonnet-4-20250514", request)
        collected += parse_qa_pairs(reply)

    return collected[:num_cases]

Warning

합성 데이터만으로 평가 데이터셋을 구성하면 편향이 발생할 수 있습니다. LLM이 생성한 데이터는 LLM이 잘 다루는 패턴에 편중되며, 실제 사용자의 다양한 표현과 엣지 케이스를 반영하지 못합니다. 합성 데이터는 보조 수단으로 활용하고, 핵심 데이터셋은 실제 사용 데이터와 전문가 작성을 기반으로 구축하세요.

데이터셋 버전 관리

평가 데이터셋은 코드와 마찬가지로 버전 관리가 필요합니다. 데이터셋이 변경되면 이전 결과와 비교할 수 없게 됩니다.

python

import json
import hashlib
from datetime import datetime
 
def save_dataset(dataset: "EvalDataset", path: str) -> str:
    """Write *dataset* as a JSON file inside directory *path*.

    The file is named ``<name>_v<version>_<checksum>.json``. The checksum is
    the first 8 hex digits of the SHA-256 of the dataset content (name,
    version, description and cases). The ``created_at`` timestamp is stored
    in the file but deliberately left out of the checksum, so saving
    unchanged data always produces the same file name.

    Returns:
        The path of the file that was written.
    """
    import os

    cases = [vars(c) for c in dataset.cases]

    # Hash only the dataset itself, with sorted keys for a stable digest.
    fingerprint = json.dumps(
        {
            "name": dataset.name,
            "version": dataset.version,
            "description": dataset.description,
            "cases": cases,
        },
        ensure_ascii=False,
        sort_keys=True,
    )
    checksum = hashlib.sha256(fingerprint.encode("utf-8")).hexdigest()[:8]

    data = {
        "name": dataset.name,
        "version": dataset.version,
        "description": dataset.description,
        "created_at": datetime.now().isoformat(),
        "num_cases": len(cases),
        "cases": cases,
    }
    content = json.dumps(data, ensure_ascii=False, indent=2)

    filename = os.path.join(
        path,
        dataset.name + "_v" + dataset.version + "_" + checksum + ".json",
    )
    # ensure_ascii=False leaves non-ASCII text raw, so the encoding must be
    # explicit rather than whatever the platform default happens to be.
    with open(filename, "w", encoding="utf-8") as f:
        f.write(content)

    return filename

코드 기반 메트릭 구현

ROUGE 점수

ROUGE(Recall-Oriented Understudy for Gisting Evaluation)는 요약 평가에 널리 사용되는 메트릭입니다.

python

from rouge_score import rouge_scorer
 
class _UnicodeWordTokenizer:
    """Tokenizer for ``RougeScorer`` that keeps non-ASCII words.

    The library's default tokenizer discards every character outside
    ``[a-z0-9]``, which leaves Korean text with no tokens at all and makes
    every ROUGE score 0. This one lowercases the text and splits it into
    Unicode word runs instead.
    """

    def tokenize(self, text: str) -> list:
        """Return the lowercase word tokens of *text*."""
        import re

        return re.findall(r"\w+", text.lower())


def compute_rouge(prediction: str, reference: str) -> dict:
    """Compute ROUGE-1, ROUGE-2 and ROUGE-L F-measures.

    Args:
        prediction: Text produced by the system under evaluation.
        reference: Reference text to compare against.

    Returns:
        A dict with keys ``rouge1_f``, ``rouge2_f`` and ``rougeL_f``, each
        rounded to 4 decimal places.
    """
    scorer = rouge_scorer.RougeScorer(
        ["rouge1", "rouge2", "rougeL"],
        use_stemmer=False,  # stemmer disabled for Korean
        tokenizer=_UnicodeWordTokenizer(),
    )
    # RougeScorer.score takes the target (reference) first.
    scores = scorer.score(reference, prediction)

    return {
        "rouge1_f": round(scores["rouge1"].fmeasure, 4),
        "rouge2_f": round(scores["rouge2"].fmeasure, 4),
        "rougeL_f": round(scores["rougeL"].fmeasure, 4),
    }
 
# Usage example: score a short prediction against a longer reference.
reference = "파이썬은 범용 프로그래밍 언어로 간결한 문법과 풍부한 라이브러리가 특징입니다"
prediction = "파이썬은 간결한 문법을 가진 프로그래밍 언어입니다"
print(compute_rouge(prediction, reference))

의미적 유사도 (Semantic Similarity)

토큰 겹침 기반 메트릭의 한계를 보완하기 위해, 임베딩 기반 의미적 유사도를 측정합니다.

python

import numpy as np
 
def cosine_similarity(vec_a: np.ndarray, vec_b: np.ndarray) -> float:
    """Return the cosine similarity of two vectors.

    If either vector has zero length the similarity is defined as 0.0
    rather than dividing by zero.
    """
    numerator = np.dot(vec_a, vec_b)
    magnitudes = [np.linalg.norm(vec) for vec in (vec_a, vec_b)]
    if not all(magnitudes):
        return 0.0
    return float(numerator / (magnitudes[0] * magnitudes[1]))
 
def semantic_similarity(
    prediction: str,
    reference: str,
    model: str = "text-embedding-3-small"
) -> float:
    """Return the embedding-based semantic similarity of two texts.

    Both texts are embedded with *model* via ``get_embedding`` and the
    cosine similarity of the two embeddings is returned.
    """
    raw_vectors = [get_embedding(text, model) for text in (prediction, reference)]
    pred_vec, ref_vec = (np.array(raw) for raw in raw_vectors)
    return cosine_similarity(pred_vec, ref_vec)

정규식 기반 형식 검증

구조화된 출력을 요구하는 경우, 형식이 올바른지 자동으로 검증합니다.

python

import json
import re
 
def validate_json_output(response: str) -> dict:
    """Check whether *response* is valid JSON.

    Returns:
        A dict with ``valid`` (bool), ``data`` (the parsed value, or None
        on failure) and ``error`` (the decoder's message, or None on
        success).
    """
    outcome = {"valid": False, "data": None, "error": None}
    try:
        outcome["data"] = json.loads(response)
    except json.JSONDecodeError as exc:
        outcome["error"] = str(exc)
    else:
        outcome["valid"] = True
    return outcome
 
def validate_format(
    response: str,
    expected_pattern: str,
    full_match: bool = False
) -> bool:
    """Check *response* against a regular expression.

    By default the pattern only has to match at the START of the response
    (``re.match``), so trailing text is accepted. Pass ``full_match=True``
    to require the whole response to match, which is usually what strict
    format validation needs. ``re.DOTALL`` is always set, so ``.`` also
    matches newlines.

    Args:
        response: Text to validate.
        expected_pattern: Regular expression describing the format.
        full_match: When True, use ``re.fullmatch`` instead of ``re.match``.
    """
    matcher = re.fullmatch if full_match else re.match
    return bool(matcher(expected_pattern, response, re.DOTALL))
 
# Usage example: validating a date format
is_valid = validate_format("2026-04-04", r"\d{4}-\d{2}-\d{2}")
print(is_valid)  # True

평가 실행 엔진 구축

기본 실행 엔진

python

import asyncio
import time
from typing import Callable
 
class EvalRunner:
    """Execution engine of the evaluation pipeline.

    Runs an application callable over evaluation cases, scores each
    response with every configured metric and collects per-case results.
    Metric objects are expected to expose a ``name`` attribute and an
    ``evaluate(prediction=, reference=, input_text=, context=)`` method.
    """

    def __init__(self, app_fn: Callable, metrics: list, concurrency: int = 5):
        """Store the configuration.

        Args:
            app_fn: The LLM application under evaluation; called with the
                case's input text. May be a coroutine function or a plain
                function.
            metrics: Metric objects to apply to every response.
            concurrency: Maximum number of cases processed at once.

        Raises:
            ValueError: If *concurrency* is less than 1. A zero-sized
                semaphore would make every case wait forever.
        """
        if concurrency < 1:
            raise ValueError("concurrency must be at least 1")
        self.app_fn = app_fn
        self.metrics = metrics
        self.concurrency = concurrency
        self.results = []

    async def run_single(self, case: "EvalCase") -> dict:
        """Run one evaluation case and return its result record."""
        import inspect

        start_time = time.perf_counter()

        # 1. Generate the response; synchronous callables are supported by
        #    awaiting only when the call actually returned an awaitable.
        response = self.app_fn(case.input_text)
        if inspect.isawaitable(response):
            response = await response
        latency = time.perf_counter() - start_time

        # 2. Compute every metric for this response.
        metric_results = {}
        for metric in self.metrics:
            score = metric.evaluate(
                prediction=response,
                reference=case.expected_output,
                input_text=case.input_text,
                context=case.context
            )
            metric_results[metric.name] = score

        return {
            "case_id": case.id,
            "input": case.input_text,
            "output": response,
            "expected": case.expected_output,
            "metrics": metric_results,
            "latency": latency,
            "metadata": case.metadata,
        }

    async def run_dataset(self, dataset: "EvalDataset") -> list:
        """Evaluate every case of *dataset*, at most ``concurrency`` at a time.

        Results are returned in the order of ``dataset.cases`` and are also
        stored on ``self.results``.
        """
        semaphore = asyncio.Semaphore(self.concurrency)

        async def bounded_run(case):
            async with semaphore:
                return await self.run_single(case)

        tasks = [bounded_run(case) for case in dataset.cases]
        self.results = await asyncio.gather(*tasks)
        return self.results

재시도 및 에러 처리

LLM API 호출은 네트워크 오류, 속도 제한(Rate Limit), 타임아웃 등 다양한 이유로 실패할 수 있습니다.

python

import asyncio
import logging
 
logger = logging.getLogger("eval")


async def retry_with_backoff(
    func,
    max_retries: int = 3,
    base_delay: float = 1.0,
    max_delay: float = 60.0
):
    """Await ``func()`` and retry with exponential backoff on failure.

    Args:
        func: Zero-argument callable returning an awaitable.
        max_retries: Number of retries after the first attempt, so
            ``func`` is called at most ``max_retries + 1`` times.
        base_delay: Delay in seconds before the first retry; it doubles
            after every failed attempt.
        max_delay: Upper bound for a single delay, in seconds.

    Returns:
        Whatever ``func()`` returns on the first successful attempt.

    Raises:
        ValueError: If *max_retries* is negative. Without this check the
            loop would not run and None would be returned without ever
            calling ``func``.
        Exception: The last exception raised by ``func`` once all attempts
            are used up.
    """
    if max_retries < 0:
        raise ValueError("max_retries must be >= 0")

    for attempt in range(max_retries + 1):
        try:
            return await func()
        except Exception as e:
            if attempt == max_retries:
                logger.error(
                    "최대 재시도 횟수 초과: " + str(e)
                )
                raise
            # 1x, 2x, 4x, ... the base delay, capped at max_delay.
            delay = min(base_delay * (2 ** attempt), max_delay)
            logger.warning(
                "시도 " + str(attempt + 1) + " 실패, "
                + str(delay) + "초 후 재시도: " + str(e)
            )
            await asyncio.sleep(delay)

DeepEval을 활용한 평가 자동화

DeepEval은 pytest와 통합되어 기존 테스트 인프라에 자연스럽게 녹아드는 LLM 평가 프레임워크입니다.

설치 및 기본 설정

bash

pip install deepeval

테스트 작성

tests/test_llm_quality.py

python

import pytest
from deepeval import assert_test
from deepeval.test_case import LLMTestCase
from deepeval.metrics import (
    AnswerRelevancyMetric,
    FaithfulnessMetric,
    HallucinationMetric,
)
 
def get_llm_response(question: str) -> str:
    """Call the LLM application under evaluation and return its answer."""
    # Placeholder: replace call_your_app with the real application call.
    return call_your_app(question)
 
@pytest.mark.parametrize("test_input,expected,context", [
    (
        "파이썬의 GIL이란 무엇인가요?",
        "GIL은 Global Interpreter Lock의 약자로...",
        "파이썬의 GIL은 한 번에 하나의 스레드만 바이트코드를 실행하도록 제한하는 뮤텍스입니다."
    ),
    (
        "REST API와 GraphQL의 차이점은?",
        "REST는 리소스 중심의 엔드포인트를...",
        "REST API는 리소스별 엔드포인트를 노출하고, GraphQL은 단일 엔드포인트에서 클라이언트가 필요한 데이터를 쿼리합니다."
    ),
])
def test_answer_quality(test_input, expected, context):
    """Evaluate answer quality with relevancy, faithfulness and hallucination.

    Each parametrized case supplies the question, the expected answer and
    the supporting context passage.
    """
    actual_output = get_llm_response(test_input)

    test_case = LLMTestCase(
        input=test_input,
        actual_output=actual_output,
        expected_output=expected,
        # HallucinationMetric reads `context`; FaithfulnessMetric reads
        # `retrieval_context`. Both are needed for the metrics below.
        context=[context],
        retrieval_context=[context],
    )

    relevancy = AnswerRelevancyMetric(threshold=0.7)
    faithfulness = FaithfulnessMetric(threshold=0.8)
    hallucination = HallucinationMetric(threshold=0.5)

    assert_test(test_case, [relevancy, faithfulness, hallucination])

실행

bash

# 기본 실행
deepeval test run tests/test_llm_quality.py
 
# 상세 결과 확인
deepeval test run tests/test_llm_quality.py -v
 
# 특정 테스트만 실행
deepeval test run tests/test_llm_quality.py -k "test_answer_quality"

Tip

DeepEval은 pytest 플러그인으로 동작하므로, 기존 pytest 옵션을 모두 활용할 수 있습니다. CI/CD 파이프라인에서 pytest로 실행하면 일반 단위 테스트와 LLM 평가를 하나의 파이프라인에서 관리할 수 있습니다.

Promptfoo를 활용한 프롬프트 비교

Promptfoo는 여러 프롬프트 변형을 동시에 테스트하고 비교하는 데 특화된 도구입니다.

설정 파일 작성

promptfooconfig.yaml

yaml

description: "QA 시스템 프롬프트 비교"
 
prompts:
  - id: baseline
    raw: |
      다음 질문에 답변하세요.
      질문: {{question}}
 
  - id: cot
    raw: |
      다음 질문에 단계적으로 사고하여 답변하세요.
      먼저 질문을 분석하고, 관련 지식을 정리한 다음, 최종 답변을 제시하세요.
      질문: {{question}}
 
  - id: structured
    raw: |
      다음 질문에 답변하세요. 응답은 반드시 다음 구조를 따르세요.
      핵심 답변: [1-2문장 요약]
      상세 설명: [구체적 설명]
      예시: [해당 시 예시]
      질문: {{question}}
 
providers:
  - id: openai:gpt-4o
  - id: anthropic:messages:claude-sonnet-4-20250514
 
tests:
  - vars:
      question: "파이썬에서 GIL이란 무엇인가요?"
    assert:
      - type: contains
        value: "Global Interpreter Lock"
      - type: llm-rubric
        value: "답변이 GIL의 정의, 목적, 영향을 모두 설명하는가?"
 
  - vars:
      question: "REST API와 GraphQL의 차이점은?"
    assert:
      - type: llm-rubric
        value: "REST와 GraphQL의 핵심 차이를 3가지 이상 정확히 설명하는가?"
      - type: cost
        threshold: 0.01

실행 및 결과 확인

bash

# 평가 실행
npx promptfoo eval
 
# 웹 UI로 결과 비교
npx promptfoo view

Promptfoo는 결과를 테이블 형태로 보여주어, 프롬프트와 모델 조합별 성능 차이를 한눈에 파악할 수 있습니다.

평가 결과 분석과 리포팅

집계 리포트 생성

python

import statistics
 
def generate_report(results: list, thresholds: dict) -> dict:
    """Aggregate evaluation results into a report.

    Args:
        results: Result records, each with ``case_id``, ``input`` and a
            ``metrics`` mapping of metric name to score.
        thresholds: Mapping of metric name to ``{"minimum": value}``;
            metrics without an entry use a minimum of 0.

    Returns:
        A dict with per-metric statistics under ``metrics``, the
        ``overall_pass`` flag (every metric's mean reaches its minimum),
        the list of individual ``failures`` (scores below the minimum) and
        ``failure_count``.
    """
    def minimum_for(name):
        return thresholds.get(name, {}).get("minimum", 0)

    # Collect every score per metric, preserving first-seen metric order.
    scores_by_metric = {}
    for entry in results:
        for name, value in entry["metrics"].items():
            scores_by_metric.setdefault(name, []).append(value)

    per_metric = {}
    for name, values in scores_by_metric.items():
        mean_score = statistics.mean(values)
        spread = statistics.stdev(values) if len(values) > 1 else 0
        floor = minimum_for(name)
        per_metric[name] = {
            "mean": round(mean_score, 4),
            "std": round(spread, 4),
            "min": round(min(values), 4),
            "max": round(max(values), 4),
            "count": len(values),
            "threshold": floor,
            "passed": mean_score >= floor,
        }

    report = {
        "metrics": per_metric,
        "overall_pass": all(stats["passed"] for stats in per_metric.values()),
    }

    # Individual cases that fell below the minimum for some metric.
    below_minimum = []
    for entry in results:
        for name, value in entry["metrics"].items():
            floor = minimum_for(name)
            if not value < floor:
                continue
            below_minimum.append({
                "case_id": entry["case_id"],
                "metric": name,
                "score": value,
                "threshold": floor,
                "input": entry["input"][:100],
            })

    report["failures"] = below_minimum
    report["failure_count"] = len(below_minimum)
    return report

슬라이스별 분석

전체 평균 점수가 양호하더라도, 특정 카테고리나 난이도에서 성능이 떨어질 수 있습니다. 슬라이스별(Slice-based) 분석으로 약점을 파악합니다.

python

def slice_analysis(results: list, slice_key: str) -> dict:
    """Break results down by a metadata field.

    Results are grouped by ``result["metadata"][slice_key]`` (records
    without that key go to the ``"unknown"`` group). For every group the
    number of results and the mean of each metric, rounded to 4 decimal
    places, are reported.
    """
    grouped = {}
    for entry in results:
        label = entry["metadata"].get(slice_key, "unknown")
        grouped.setdefault(label, []).append(entry)

    summary = {}
    for label, members in grouped.items():
        scores_by_metric = {}
        for entry in members:
            for name, value in entry["metrics"].items():
                scores_by_metric.setdefault(name, []).append(value)

        averages = {}
        for name, values in scores_by_metric.items():
            averages[name] = round(statistics.mean(values), 4)

        summary[label] = {"count": len(members), "metrics": averages}

    return summary
 
# Usage example: analysis per category
category_analysis = slice_analysis(results, "category")
# Analysis per difficulty level
difficulty_analysis = slice_analysis(results, "difficulty")

평가 파이프라인 전체 흐름

지금까지 다룬 내용을 하나의 파이프라인으로 통합하면 다음과 같습니다.

python

async def run_evaluation_pipeline(
    app_fn,
    dataset_path: str,
    metrics: list,
    thresholds: dict,
    output_path: str
) -> bool:
    """Run the whole evaluation pipeline and return whether it passed.

    Steps: load the dataset, evaluate it with an ``EvalRunner`` (10 cases
    at a time), build the aggregate report, attach the per-category and
    per-difficulty slice analyses, save the report, and print progress and
    the verdict along the way.
    """
    # Load
    loaded = load_dataset(dataset_path)
    print("데이터셋 로드 완료: " + str(len(loaded.cases)) + "건")

    # Evaluate
    engine = EvalRunner(app_fn, metrics, concurrency=10)
    outcomes = await engine.run_dataset(loaded)
    print("평가 실행 완료")

    # Aggregate, then add one slice analysis per metadata key
    summary = generate_report(outcomes, thresholds)
    summary["slice_analysis"] = {
        key: slice_analysis(outcomes, key)
        for key in ("category", "difficulty")
    }

    # Persist
    save_report(summary, output_path)
    print("리포트 저장 완료: " + output_path)

    # Verdict
    print(
        "평가 통과"
        if summary["overall_pass"]
        else "평가 실패 - " + str(summary["failure_count"]) + "건 기준 미달"
    )
    return summary["overall_pass"]

정리

자동 평가 파이프라인은 평가 데이터셋, 코드 기반 메트릭, 실행 엔진, 결과 리포팅의 네 요소로 구성됩니다. DeepEval과 Promptfoo 같은 도구를 활용하면 적은 코드로 체계적인 평가를 구축할 수 있으며, 슬라이스별 분석을 통해 시스템의 약점을 정확히 파악할 수 있습니다.

다음 장에서는 코드 기반 메트릭으로 포착하기 어려운 미묘한 품질 차이를 평가하는 LLM-as-Judge 기법을 다룹니다.

이 글이 도움이 되셨나요?

AI / ML

4장: LLM-as-Judge - LLM으로 LLM 평가하기

LLM을 평가자로 활용하는 LLM-as-Judge 기법의 원리, 프롬프트 설계, 편향 완화 전략을 체계적으로 다룹니다.

2026년 1월 21일·17분

AI / ML

2장: 평가 메트릭 설계 - 정확성, 관련성, 안전성

LLM 애플리케이션의 품질을 수치화하는 핵심 메트릭을 설계하고, 작업 유형별로 적절한 메트릭을 선택하는 방법을 다룹니다.

2026년 1월 17일·19분

AI / ML

5장: 인간 평가와 어노테이션 설계

LLM 평가에서 인간 평가의 역할, 어노테이션 가이드라인 설계, 평가자 간 일치도 관리 방법을 체계적으로 다룹니다.

2026년 1월 23일·20분

2026년 1월 19일·AI / ML·

3장: 자동 평가 파이프라인 구축

코드 기반 메트릭과 벤치마크 자동화로 LLM 애플리케이션의 품질을 체계적으로 측정하는 평가 파이프라인을 구축합니다.

20분 · 1,258자 · 9개 섹션

llm evaluation monitoring observability testing

llm-evaluation3 / 10

1 2 3 4 5 6 7 8 9 10

이전: 2장: 평가 메트릭 설계 - 정확성, 관련성, 안전성 | 다음: 4장: LLM-as-Judge - LLM으로 LLM 평가하기

from dataclasses import dataclass, field
from typing import Optional
 
@dataclass
class EvalCase:
    """A single evaluation case.

    ``id``, ``input_text`` and ``expected_output`` must be supplied by the
    caller; ``context`` and ``metadata`` have defaults.
    """
    id: str                          # unique identifier
    input_text: str                  # input passed to the LLM
    expected_output: Optional[str]   # expected (reference) output
    context: Optional[str] = None    # RAG context, when applicable
    metadata: dict = field(default_factory=dict)  # e.g. category, difficulty
 
@dataclass
class EvalDataset:
    """A named, versioned collection of evaluation cases."""
    name: str
    version: str
    cases: list  # list of EvalCase
    description: str = ""

    def filter_by_category(self, category: str) -> list:
        """Return the cases whose ``metadata["category"]`` equals *category*."""
        return [c for c in self.cases if c.metadata.get("category") == category]

    def sample(self, n: int, seed: int = 42) -> list:
        """Return up to *n* cases drawn reproducibly, without replacement.

        A private ``random.Random(seed)`` is used instead of reseeding the
        module-level generator, so calling this method does not disturb
        random state used elsewhere in the process. The selection is the
        same one that seeding the global generator with *seed* would give.
        When *n* exceeds the number of cases, all cases are returned (in
        shuffled order).
        """
        import random

        rng = random.Random(seed)
        return rng.sample(self.cases, min(n, len(self.cases)))

데이터셋 소싱 전략

평가 데이터셋을 구축하는 방법은 여러 가지가 있습니다.

프로덕션 로그에서 추출: 실제 사용자 질문을 수집하고, 전문가가 기대 답변을 작성합니다. 가장 현실적인 데이터를 확보할 수 있습니다.

도메인 전문가 작성: 해당 분야 전문가가 대표적인 질문과 답변을 작성합니다. 품질이 높지만 다양성이 부족할 수 있습니다.

합성 데이터 생성: LLM을 활용하여 다양한 질문-답변 쌍을 생성하고, 인간이 검증합니다.

python

def generate_synthetic_eval_data(
    domain: str,
    num_cases: int,
    difficulty_distribution: dict
) -> list:
    """Generate synthetic evaluation data with an LLM.

    One request is sent per entry of *difficulty_distribution* (difficulty
    label -> number of items to ask for). The parsed replies are
    concatenated in iteration order and the result is cut to at most
    *num_cases* items.
    """
    template = """다음 도메인에 대한 질문-답변 쌍을 생성하세요.
 
도메인: {domain}
난이도: {difficulty}
생성 개수: {count}
 
각 항목은 다음 형식으로 작성하세요:
질문: [질문 내용]
기대 답변: [모범 답변]
카테고리: [세부 카테고리]
 
질문은 실제 사용자가 할 법한 자연스러운 표현으로 작성하세요."""

    collected = []
    for level in difficulty_distribution:
        request = template.format(
            domain=domain,
            difficulty=level,
            count=difficulty_distribution[level],
        )
        reply = call_llm("claude-sonnet-4-20250514", request)
        collected += parse_qa_pairs(reply)

    return collected[:num_cases]

Warning

데이터셋 버전 관리

평가 데이터셋은 코드와 마찬가지로 버전 관리가 필요합니다. 데이터셋이 변경되면 이전 결과와 비교할 수 없게 됩니다.

python

import json
import hashlib
from datetime import datetime
 
def save_dataset(dataset: "EvalDataset", path: str) -> str:
    """Write *dataset* as a JSON file inside directory *path*.

    The file is named ``<name>_v<version>_<checksum>.json``. The checksum is
    the first 8 hex digits of the SHA-256 of the dataset content (name,
    version, description and cases). The ``created_at`` timestamp is stored
    in the file but deliberately left out of the checksum, so saving
    unchanged data always produces the same file name.

    Returns:
        The path of the file that was written.
    """
    import os

    cases = [vars(c) for c in dataset.cases]

    # Hash only the dataset itself, with sorted keys for a stable digest.
    fingerprint = json.dumps(
        {
            "name": dataset.name,
            "version": dataset.version,
            "description": dataset.description,
            "cases": cases,
        },
        ensure_ascii=False,
        sort_keys=True,
    )
    checksum = hashlib.sha256(fingerprint.encode("utf-8")).hexdigest()[:8]

    data = {
        "name": dataset.name,
        "version": dataset.version,
        "description": dataset.description,
        "created_at": datetime.now().isoformat(),
        "num_cases": len(cases),
        "cases": cases,
    }
    content = json.dumps(data, ensure_ascii=False, indent=2)

    filename = os.path.join(
        path,
        dataset.name + "_v" + dataset.version + "_" + checksum + ".json",
    )
    # ensure_ascii=False leaves non-ASCII text raw, so the encoding must be
    # explicit rather than whatever the platform default happens to be.
    with open(filename, "w", encoding="utf-8") as f:
        f.write(content)

    return filename

코드 기반 메트릭 구현

ROUGE 점수

ROUGE(Recall-Oriented Understudy for Gisting Evaluation)는 요약 평가에 널리 사용되는 메트릭입니다.

python

from rouge_score import rouge_scorer
 
class _UnicodeWordTokenizer:
    """Tokenizer for ``RougeScorer`` that keeps non-ASCII words.

    The library's default tokenizer discards every character outside
    ``[a-z0-9]``, which leaves Korean text with no tokens at all and makes
    every ROUGE score 0. This one lowercases the text and splits it into
    Unicode word runs instead.
    """

    def tokenize(self, text: str) -> list:
        """Return the lowercase word tokens of *text*."""
        import re

        return re.findall(r"\w+", text.lower())


def compute_rouge(prediction: str, reference: str) -> dict:
    """Compute ROUGE-1, ROUGE-2 and ROUGE-L F-measures.

    Args:
        prediction: Text produced by the system under evaluation.
        reference: Reference text to compare against.

    Returns:
        A dict with keys ``rouge1_f``, ``rouge2_f`` and ``rougeL_f``, each
        rounded to 4 decimal places.
    """
    scorer = rouge_scorer.RougeScorer(
        ["rouge1", "rouge2", "rougeL"],
        use_stemmer=False,  # stemmer disabled for Korean
        tokenizer=_UnicodeWordTokenizer(),
    )
    # RougeScorer.score takes the target (reference) first.
    scores = scorer.score(reference, prediction)

    return {
        "rouge1_f": round(scores["rouge1"].fmeasure, 4),
        "rouge2_f": round(scores["rouge2"].fmeasure, 4),
        "rougeL_f": round(scores["rougeL"].fmeasure, 4),
    }
 
# Usage example: score a short prediction against a longer reference.
reference = "파이썬은 범용 프로그래밍 언어로 간결한 문법과 풍부한 라이브러리가 특징입니다"
prediction = "파이썬은 간결한 문법을 가진 프로그래밍 언어입니다"
print(compute_rouge(prediction, reference))

의미적 유사도 (Semantic Similarity)

토큰 겹침 기반 메트릭의 한계를 보완하기 위해, 임베딩 기반 의미적 유사도를 측정합니다.

python

import numpy as np
 
def cosine_similarity(vec_a: np.ndarray, vec_b: np.ndarray) -> float:
    """Return the cosine similarity of two vectors.

    If either vector has zero length the similarity is defined as 0.0
    rather than dividing by zero.
    """
    numerator = np.dot(vec_a, vec_b)
    magnitudes = [np.linalg.norm(vec) for vec in (vec_a, vec_b)]
    if not all(magnitudes):
        return 0.0
    return float(numerator / (magnitudes[0] * magnitudes[1]))
 
def semantic_similarity(
    prediction: str,
    reference: str,
    model: str = "text-embedding-3-small"
) -> float:
    """Return the embedding-based semantic similarity of two texts.

    Both texts are embedded with *model* via ``get_embedding`` and the
    cosine similarity of the two embeddings is returned.
    """
    raw_vectors = [get_embedding(text, model) for text in (prediction, reference)]
    pred_vec, ref_vec = (np.array(raw) for raw in raw_vectors)
    return cosine_similarity(pred_vec, ref_vec)

정규식 기반 형식 검증

구조화된 출력을 요구하는 경우, 형식이 올바른지 자동으로 검증합니다.

python

import json
import re
 
def validate_json_output(response: str) -> dict:
    """Check whether *response* is valid JSON.

    Returns:
        A dict with ``valid`` (bool), ``data`` (the parsed value, or None
        on failure) and ``error`` (the decoder's message, or None on
        success).
    """
    outcome = {"valid": False, "data": None, "error": None}
    try:
        outcome["data"] = json.loads(response)
    except json.JSONDecodeError as exc:
        outcome["error"] = str(exc)
    else:
        outcome["valid"] = True
    return outcome
 
def validate_format(
    response: str,
    expected_pattern: str,
    full_match: bool = False
) -> bool:
    """Check *response* against a regular expression.

    By default the pattern only has to match at the START of the response
    (``re.match``), so trailing text is accepted. Pass ``full_match=True``
    to require the whole response to match, which is usually what strict
    format validation needs. ``re.DOTALL`` is always set, so ``.`` also
    matches newlines.

    Args:
        response: Text to validate.
        expected_pattern: Regular expression describing the format.
        full_match: When True, use ``re.fullmatch`` instead of ``re.match``.
    """
    matcher = re.fullmatch if full_match else re.match
    return bool(matcher(expected_pattern, response, re.DOTALL))
 
# Usage example: validating a date format
is_valid = validate_format("2026-04-04", r"\d{4}-\d{2}-\d{2}")
print(is_valid)  # True

평가 실행 엔진 구축

기본 실행 엔진

python

import asyncio
import time
from typing import Callable
 
class EvalRunner:
    """Execution engine of the evaluation pipeline.

    Runs an application callable over evaluation cases, scores each
    response with every configured metric and collects per-case results.
    Metric objects are expected to expose a ``name`` attribute and an
    ``evaluate(prediction=, reference=, input_text=, context=)`` method.
    """

    def __init__(self, app_fn: Callable, metrics: list, concurrency: int = 5):
        """Store the configuration.

        Args:
            app_fn: The LLM application under evaluation; called with the
                case's input text. May be a coroutine function or a plain
                function.
            metrics: Metric objects to apply to every response.
            concurrency: Maximum number of cases processed at once.

        Raises:
            ValueError: If *concurrency* is less than 1. A zero-sized
                semaphore would make every case wait forever.
        """
        if concurrency < 1:
            raise ValueError("concurrency must be at least 1")
        self.app_fn = app_fn
        self.metrics = metrics
        self.concurrency = concurrency
        self.results = []

    async def run_single(self, case: "EvalCase") -> dict:
        """Run one evaluation case and return its result record."""
        import inspect

        start_time = time.perf_counter()

        # 1. Generate the response; synchronous callables are supported by
        #    awaiting only when the call actually returned an awaitable.
        response = self.app_fn(case.input_text)
        if inspect.isawaitable(response):
            response = await response
        latency = time.perf_counter() - start_time

        # 2. Compute every metric for this response.
        metric_results = {}
        for metric in self.metrics:
            score = metric.evaluate(
                prediction=response,
                reference=case.expected_output,
                input_text=case.input_text,
                context=case.context
            )
            metric_results[metric.name] = score

        return {
            "case_id": case.id,
            "input": case.input_text,
            "output": response,
            "expected": case.expected_output,
            "metrics": metric_results,
            "latency": latency,
            "metadata": case.metadata,
        }

    async def run_dataset(self, dataset: "EvalDataset") -> list:
        """Evaluate every case of *dataset*, at most ``concurrency`` at a time.

        Results are returned in the order of ``dataset.cases`` and are also
        stored on ``self.results``.
        """
        semaphore = asyncio.Semaphore(self.concurrency)

        async def bounded_run(case):
            async with semaphore:
                return await self.run_single(case)

        tasks = [bounded_run(case) for case in dataset.cases]
        self.results = await asyncio.gather(*tasks)
        return self.results

재시도 및 에러 처리

LLM API 호출은 네트워크 오류, 속도 제한(Rate Limit), 타임아웃 등 다양한 이유로 실패할 수 있습니다.

python

import asyncio
import logging
 
logger = logging.getLogger("eval")


async def retry_with_backoff(
    func,
    max_retries: int = 3,
    base_delay: float = 1.0,
    max_delay: float = 60.0
):
    """Await ``func()`` and retry with exponential backoff on failure.

    Args:
        func: Zero-argument callable returning an awaitable.
        max_retries: Number of retries after the first attempt, so
            ``func`` is called at most ``max_retries + 1`` times.
        base_delay: Delay in seconds before the first retry; it doubles
            after every failed attempt.
        max_delay: Upper bound for a single delay, in seconds.

    Returns:
        Whatever ``func()`` returns on the first successful attempt.

    Raises:
        ValueError: If *max_retries* is negative. Without this check the
            loop would not run and None would be returned without ever
            calling ``func``.
        Exception: The last exception raised by ``func`` once all attempts
            are used up.
    """
    if max_retries < 0:
        raise ValueError("max_retries must be >= 0")

    for attempt in range(max_retries + 1):
        try:
            return await func()
        except Exception as e:
            if attempt == max_retries:
                logger.error(
                    "최대 재시도 횟수 초과: " + str(e)
                )
                raise
            # 1x, 2x, 4x, ... the base delay, capped at max_delay.
            delay = min(base_delay * (2 ** attempt), max_delay)
            logger.warning(
                "시도 " + str(attempt + 1) + " 실패, "
                + str(delay) + "초 후 재시도: " + str(e)
            )
            await asyncio.sleep(delay)

DeepEval을 활용한 평가 자동화

DeepEval은 pytest와 통합되어 기존 테스트 인프라에 자연스럽게 녹아드는 LLM 평가 프레임워크입니다.

설치 및 기본 설정

bash

pip install deepeval

테스트 작성

tests/test_llm_quality.py

python

import pytest
from deepeval import assert_test
from deepeval.test_case import LLMTestCase
from deepeval.metrics import (
    AnswerRelevancyMetric,
    FaithfulnessMetric,
    HallucinationMetric,
)
 
def get_llm_response(question: str) -> str:
    """Call the LLM application under evaluation and return its answer."""
    # Placeholder: replace call_your_app with the real application call.
    return call_your_app(question)
 
@pytest.mark.parametrize("test_input,expected,context", [
    (
        "파이썬의 GIL이란 무엇인가요?",
        "GIL은 Global Interpreter Lock의 약자로...",
        "파이썬의 GIL은 한 번에 하나의 스레드만 바이트코드를 실행하도록 제한하는 뮤텍스입니다."
    ),
    (
        "REST API와 GraphQL의 차이점은?",
        "REST는 리소스 중심의 엔드포인트를...",
        "REST API는 리소스별 엔드포인트를 노출하고, GraphQL은 단일 엔드포인트에서 클라이언트가 필요한 데이터를 쿼리합니다."
    ),
])
def test_answer_quality(test_input, expected, context):
    """Evaluate answer quality with relevancy, faithfulness and hallucination.

    Each parametrized case supplies the question, the expected answer and
    the supporting context passage.
    """
    actual_output = get_llm_response(test_input)

    test_case = LLMTestCase(
        input=test_input,
        actual_output=actual_output,
        expected_output=expected,
        # HallucinationMetric reads `context`; FaithfulnessMetric reads
        # `retrieval_context`. Both are needed for the metrics below.
        context=[context],
        retrieval_context=[context],
    )

    relevancy = AnswerRelevancyMetric(threshold=0.7)
    faithfulness = FaithfulnessMetric(threshold=0.8)
    hallucination = HallucinationMetric(threshold=0.5)

    assert_test(test_case, [relevancy, faithfulness, hallucination])

실행

bash

# 기본 실행
deepeval test run tests/test_llm_quality.py
 
# 상세 결과 확인
deepeval test run tests/test_llm_quality.py -v
 
# 특정 테스트만 실행
deepeval test run tests/test_llm_quality.py -k "test_answer_quality"

Tip

Promptfoo를 활용한 프롬프트 비교

Promptfoo는 여러 프롬프트 변형을 동시에 테스트하고 비교하는 데 특화된 도구입니다.

설정 파일 작성

promptfooconfig.yaml

yaml

description: "QA 시스템 프롬프트 비교"
 
prompts:
  - id: baseline
    raw: |
      다음 질문에 답변하세요.
      질문: {{question}}
 
  - id: cot
    raw: |
      다음 질문에 단계적으로 사고하여 답변하세요.
      먼저 질문을 분석하고, 관련 지식을 정리한 다음, 최종 답변을 제시하세요.
      질문: {{question}}
 
  - id: structured
    raw: |
      다음 질문에 답변하세요. 응답은 반드시 다음 구조를 따르세요.
      핵심 답변: [1-2문장 요약]
      상세 설명: [구체적 설명]
      예시: [해당 시 예시]
      질문: {{question}}
 
providers:
  - id: openai:gpt-4o
  - id: anthropic:messages:claude-sonnet-4-20250514
 
tests:
  - vars:
      question: "파이썬에서 GIL이란 무엇인가요?"
    assert:
      - type: contains
        value: "Global Interpreter Lock"
      - type: llm-rubric
        value: "답변이 GIL의 정의, 목적, 영향을 모두 설명하는가?"
 
  - vars:
      question: "REST API와 GraphQL의 차이점은?"
    assert:
      - type: llm-rubric
        value: "REST와 GraphQL의 핵심 차이를 3가지 이상 정확히 설명하는가?"
      - type: cost
        threshold: 0.01

실행 및 결과 확인

bash

# 평가 실행
npx promptfoo eval
 
# 웹 UI로 결과 비교
npx promptfoo view

Promptfoo는 결과를 테이블 형태로 보여주어, 프롬프트와 모델 조합별 성능 차이를 한눈에 파악할 수 있습니다.

평가 결과 분석과 리포팅

집계 리포트 생성

python

import statistics
 
def generate_report(results: list, thresholds: dict) -> dict:
    """Aggregate evaluation results into a report.

    Args:
        results: Result records, each with ``case_id``, ``input`` and a
            ``metrics`` mapping of metric name to score.
        thresholds: Mapping of metric name to ``{"minimum": value}``;
            metrics without an entry use a minimum of 0.

    Returns:
        A dict with per-metric statistics under ``metrics``, the
        ``overall_pass`` flag (every metric's mean reaches its minimum),
        the list of individual ``failures`` (scores below the minimum) and
        ``failure_count``.
    """
    def minimum_for(name):
        return thresholds.get(name, {}).get("minimum", 0)

    # Collect every score per metric, preserving first-seen metric order.
    scores_by_metric = {}
    for entry in results:
        for name, value in entry["metrics"].items():
            scores_by_metric.setdefault(name, []).append(value)

    per_metric = {}
    for name, values in scores_by_metric.items():
        mean_score = statistics.mean(values)
        spread = statistics.stdev(values) if len(values) > 1 else 0
        floor = minimum_for(name)
        per_metric[name] = {
            "mean": round(mean_score, 4),
            "std": round(spread, 4),
            "min": round(min(values), 4),
            "max": round(max(values), 4),
            "count": len(values),
            "threshold": floor,
            "passed": mean_score >= floor,
        }

    report = {
        "metrics": per_metric,
        "overall_pass": all(stats["passed"] for stats in per_metric.values()),
    }

    # Individual cases that fell below the minimum for some metric.
    below_minimum = []
    for entry in results:
        for name, value in entry["metrics"].items():
            floor = minimum_for(name)
            if not value < floor:
                continue
            below_minimum.append({
                "case_id": entry["case_id"],
                "metric": name,
                "score": value,
                "threshold": floor,
                "input": entry["input"][:100],
            })

    report["failures"] = below_minimum
    report["failure_count"] = len(below_minimum)
    return report

슬라이스별 분석

전체 평균 점수가 양호하더라도, 특정 카테고리나 난이도에서 성능이 떨어질 수 있습니다. 슬라이스별(Slice-based) 분석으로 약점을 파악합니다.

python

def slice_analysis(results: list, slice_key: str) -> dict:
    """Break results down by a metadata field.

    Results are grouped by ``result["metadata"][slice_key]`` (records
    without that key go to the ``"unknown"`` group). For every group the
    number of results and the mean of each metric, rounded to 4 decimal
    places, are reported.
    """
    grouped = {}
    for entry in results:
        label = entry["metadata"].get(slice_key, "unknown")
        grouped.setdefault(label, []).append(entry)

    summary = {}
    for label, members in grouped.items():
        scores_by_metric = {}
        for entry in members:
            for name, value in entry["metrics"].items():
                scores_by_metric.setdefault(name, []).append(value)

        averages = {}
        for name, values in scores_by_metric.items():
            averages[name] = round(statistics.mean(values), 4)

        summary[label] = {"count": len(members), "metrics": averages}

    return summary
 
# Usage example: analysis per category
category_analysis = slice_analysis(results, "category")
# Analysis per difficulty level
difficulty_analysis = slice_analysis(results, "difficulty")

평가 파이프라인 전체 흐름

지금까지 다룬 내용을 하나의 파이프라인으로 통합하면 다음과 같습니다.

python

async def run_evaluation_pipeline(
    app_fn,
    dataset_path: str,
    metrics: list,
    thresholds: dict,
    output_path: str
) -> bool:
    """Run the whole evaluation pipeline and return whether it passed.

    Steps: load the dataset, evaluate it with an ``EvalRunner`` (10 cases
    at a time), build the aggregate report, attach the per-category and
    per-difficulty slice analyses, save the report, and print progress and
    the verdict along the way.
    """
    # Load
    loaded = load_dataset(dataset_path)
    print("데이터셋 로드 완료: " + str(len(loaded.cases)) + "건")

    # Evaluate
    engine = EvalRunner(app_fn, metrics, concurrency=10)
    outcomes = await engine.run_dataset(loaded)
    print("평가 실행 완료")

    # Aggregate, then add one slice analysis per metadata key
    summary = generate_report(outcomes, thresholds)
    summary["slice_analysis"] = {
        key: slice_analysis(outcomes, key)
        for key in ("category", "difficulty")
    }

    # Persist
    save_report(summary, output_path)
    print("리포트 저장 완료: " + output_path)

    # Verdict
    print(
        "평가 통과"
        if summary["overall_pass"]
        else "평가 실패 - " + str(summary["failure_count"]) + "건 기준 미달"
    )
    return summary["overall_pass"]