2026년 2월 23일·AI / ML·

9장: 멀티모달 에이전트 구축

시각적 이해 능력을 갖춘 AI 에이전트의 설계와 구현 — 화면 상호작용 에이전트, 멀티모달 도구 호출, Computer Use, 그리고 실전 에이전트 패턴을 다룹니다.

12분 · 677자 · 6개 섹션

이전: 8장: 멀티모달 RAG 시스템 설계 · 다음: 10장: 프로덕션 아키텍처와 최적화

8장에서 멀티모달 RAG를 설계했습니다. 이 장에서는 시각적 이해 능력을 갖춘 AI 에이전트를 구축합니다. 화면을 보고 상호작용하는 에이전트, 이미지와 텍스트를 함께 처리하는 멀티모달 도구 호출, 그리고 Claude의 Computer Use 기능을 활용한 실전 패턴을 다룹니다.

멀티모달 에이전트란

전통적인 AI 에이전트는 텍스트 입출력만 처리했습니다. 멀티모달 에이전트는 여기에 시각적 인식, 음성 처리, 시각적 상호작용 능력을 추가합니다.

전통적 에이전트:
  텍스트 → [LLM + 도구] → 텍스트

멀티모달 에이전트:
  텍스트 + 이미지 + 음성 → [VLM + 시각 도구 + 음성 도구] → 텍스트 + 행동

멀티모달 에이전트의 능력

시각적 관찰: 스크린샷, 카메라 피드, 문서 이미지를 이해
시각적 행동: 클릭, 드래그, 타이핑 등 UI 상호작용
멀티모달 추론: 시각 정보와 텍스트 정보를 결합한 의사결정
도구 활용: 카메라, OCR, 이미지 생성 등 멀티모달 도구 호출

Computer Use: 화면 상호작용 에이전트

Claude Computer Use

Anthropic의 Claude는 Computer Use 기능을 제공하여, 스크린샷을 보고 마우스/키보드 행동을 지시할 수 있습니다.

Computer Use 기본 구조

python

import anthropic
 
client = anthropic.Anthropic()

# Computer Use tool definitions (Anthropic-defined tool types).
# NOTE(review): the text editor tool type/name is model-version specific;
# confirm "text_editor_20250124" is accepted by the model used below.
tools = [
    {
        "type": "computer_20250124",
        "name": "computer",
        "display_width_px": 1920,
        "display_height_px": 1080,
        "display_number": 1,
    },
    {
        "type": "text_editor_20250124",
        "name": "str_replace_editor",
    },
    {
        "type": "bash_20250124",
        "name": "bash",
    },
]

# Conversation state: starts with the user's task and grows by one assistant
# turn plus one tool-result turn per loop iteration.
messages = [{
    "role": "user",
    "content": "Chrome 브라우저를 열고 'AI agent patterns'를 검색해주세요.",
}]

while True:
    # Computer Use is a beta feature, so the request goes through the beta
    # endpoint with the matching beta flag.
    response = client.beta.messages.create(
        model="claude-sonnet-4-20250514",
        max_tokens=4096,
        tools=tools,
        messages=messages,
        betas=["computer-use-2025-01-24"],
    )

    tool_calls = [block for block in response.content if block.type == "tool_use"]

    # Stop when the model finished its turn, and also when it stopped for any
    # other reason (e.g. max_tokens) without requesting a tool; otherwise the
    # loop would resend the same conversation forever.
    if response.stop_reason == "end_turn" or not tool_calls:
        break

    # Record the assistant turn exactly once, even if it holds several tool
    # calls; appending it per tool call would duplicate the turn.
    messages.append({"role": "assistant", "content": response.content})

    # Run every requested tool (screenshot, click, typing, ...) and answer all
    # of them in a single user turn.
    tool_results = []
    for block in tool_calls:
        result = execute_tool(block)
        # tool_result content is a string or a list of content blocks, so a
        # single block dict is wrapped in a list.
        if isinstance(result, dict):
            result = [result]
        tool_results.append({
            "type": "tool_result",
            "tool_use_id": block.id,
            "content": result,
        })

    messages.append({"role": "user", "content": tool_results})

Computer Use의 행동 유형

행동 처리 구현

python

def execute_computer_action(action: dict) -> dict:
    """Execute one Computer Use action and return a content block for it.

    Args:
        action: Tool input. ``action["action"]`` names the operation; the
            other keys read depend on it: ``coordinate`` (click, scroll),
            ``text`` (type, key), ``key`` (legacy key input),
            ``scroll_direction`` / ``scroll_amount`` (scroll).

    Returns:
        An ``image`` content block for ``screenshot``; a ``text`` content
        block describing the outcome for every other action, including
        unsupported or malformed ones (so the caller always gets a dict).
    """
    action_type = action.get("action")

    if action_type == "screenshot":
        # Capture the current screen and return it as a base64 PNG block.
        screenshot = capture_screenshot()
        return {
            "type": "image",
            "source": {
                "type": "base64",
                "media_type": "image/png",
                "data": encode_screenshot(screenshot),
            },
        }

    # "left_click" is the name the computer tool emits; "click" is kept for
    # callers that already use the shorter name.
    if action_type in ("click", "left_click"):
        x, y = action["coordinate"]
        pyautogui.click(x, y)
        return {"type": "text", "text": f"Clicked at ({x}, {y})"}

    if action_type == "type":
        # NOTE(review): confirm pyautogui actually types non-ASCII text
        # (e.g. Korean); it may only cover keys in its own keyboard map.
        pyautogui.typewrite(action["text"], interval=0.05)
        return {"type": "text", "text": f"Typed: {action['text']}"}

    if action_type == "key":
        # The computer tool sends the key combo in "text"; "key" is the
        # field this function originally read, so both are accepted.
        combo = action.get("key") or action.get("text")
        if not combo:
            return {"type": "text", "text": "Key action is missing the key combination"}
        pyautogui.hotkey(*combo.split("+"))
        return {"type": "text", "text": f"Pressed: {combo}"}

    if action_type == "scroll":
        # The coordinate is where to scroll, not how far: direction and
        # amount come from their own fields.
        direction = action.get("scroll_direction")
        amount = int(action.get("scroll_amount", 1))
        x, y = action.get("coordinate") or (None, None)
        if direction in ("up", "down"):
            # pyautogui scrolls up for positive values, down for negative.
            pyautogui.scroll(amount if direction == "up" else -amount, x=x, y=y)
        elif direction in ("left", "right"):
            pyautogui.hscroll(amount if direction == "right" else -amount, x=x, y=y)
        else:
            return {"type": "text", "text": f"Unsupported scroll direction: {direction}"}
        return {"type": "text", "text": f"Scrolled: {direction} {amount}"}

    # Report unknown actions instead of silently returning None.
    return {"type": "text", "text": f"Unsupported action: {action_type}"}

Warning

Computer Use 에이전트는 강력하지만 보안에 주의가 필요합니다. 에이전트가 시스템의 실제 UI를 제어하므로, 의도하지 않은 작업을 수행할 수 있습니다. 프로덕션 환경에서는 반드시 샌드박스 환경(Docker, VM)에서 실행하고, 작업 범위를 제한하세요.

멀티모달 도구 호출 에이전트

도구 정의

멀티모달 도구 세트

python

def _tool_spec(name, description, properties, required=None):
    """Build one custom tool definition with an object-typed input schema.

    ``required`` is only written into the schema when it is given, so tools
    without required parameters carry no "required" key at all.
    """
    schema = {"type": "object", "properties": properties}
    if required is not None:
        schema["required"] = required
    return {"name": name, "description": description, "input_schema": schema}


# Multimodal tool set: screenshot capture, image analysis, OCR, image search.
tools = [
    _tool_spec(
        "capture_screenshot",
        "현재 화면의 스크린샷을 캡처합니다.",
        {
            "region": {
                "type": "string",
                "description": "캡처 영역: full, active_window, 또는 x,y,w,h 좌표",
            },
        },
    ),
    _tool_spec(
        "analyze_image",
        "이미지를 분석하여 내용을 설명합니다.",
        {
            "image_path": {"type": "string"},
            "question": {"type": "string"},
        },
        required=["image_path"],
    ),
    _tool_spec(
        "extract_text_from_image",
        "이미지에서 텍스트를 추출합니다 (OCR).",
        {
            "image_path": {"type": "string"},
            "language": {"type": "string", "default": "ko"},
        },
        required=["image_path"],
    ),
    _tool_spec(
        "search_similar_images",
        "텍스트 설명으로 유사한 이미지를 검색합니다.",
        {
            "query": {"type": "string"},
            "top_k": {"type": "integer", "default": 5},
        },
        required=["query"],
    ),
]

에이전트 루프 구현

멀티모달 에이전트 루프

python

class MultimodalAgent:
    def __init__(self, client, tools, max_iterations: int = 10):
        self.client = client
        self.tools = tools
        self.max_iterations = max_iterations
        self.tool_handlers = {}
 
    def register_handler(self, name: str, handler):
        self.tool_handlers[name] = handler
 
    async def run(self, user_message: str, images: list[dict] | None = None) -> str:
        """에이전트 실행"""
        # 초기 메시지 구성
        content = []
        if images:
            content.extend(images)
        content.append({"type": "text", "text": user_message})
 
        messages = [{"role": "user", "content": content}]
 
        for _ in range(self.max_iterations):
            response = self.client.messages.create(
                model="claude-sonnet-4-20250514",
                max_tokens=4096,
                tools=self.tools,
                messages=messages,
            )
 
            # 최종 응답
            if response.stop_reason == "end_turn":
                return self._extract_text(response.content)
 
            # 도구 호출 처리
            messages.append({"role": "assistant", "content": response.content})
            tool_results = []
 
            for block in response.content:
                if block.type == "tool_use":
                    handler = self.tool_handlers.get(block.name)
                    if handler:
                        result = await handler(block.input)
                        tool_results.append({
                            "type": "tool_result",
                            "tool_use_id": block.id,
                            "content": result,
                        })
 
            messages.append({"role": "user", "content": tool_results})
 
        return "최대 반복 횟수에 도달했습니다."
 
    def _extract_text(self, content) -> str:
        return "\n".join(
            block.text for block in content if hasattr(block, "text")
        )

실전 패턴

시각적 QA 에이전트

시각적 질의응답 에이전트

python

# 사용자가 이미지를 업로드하고 질문하면,
# 에이전트가 이미지를 분석하고 추가 도구를 활용하여 답변
 
agent = MultimodalAgent(client, tools)
 
# 도구 핸들러 등록
agent.register_handler("analyze_image", analyze_image_handler)
agent.register_handler("search_similar_images", search_handler)
agent.register_handler("extract_text_from_image", ocr_handler)
 
# 이미지와 함께 질문
result = await agent.run(
    user_message="이 아키텍처 다이어그램에서 데이터베이스 병목 지점을 찾고, AWS에서 사용할 수 있는 대안 서비스를 추천해주세요.",
    images=[{
        "type": "image",
        "source": {"type": "base64", "media_type": "image/png", "data": diagram_b64},
    }],
)

문서 처리 에이전트

python

# Document-processing tool set. Custom tools need an "input_schema" so the
# model knows which arguments to send. The parameter names below are
# illustrative; align them with the real handler signatures.
document_tools = [
    {
        "name": "extract_tables",
        "description": "PDF에서 표를 추출합니다.",
        "input_schema": {
            "type": "object",
            "properties": {
                "pdf_path": {"type": "string"},
                "page": {"type": "integer"},
            },
            "required": ["pdf_path"],
        },
    },
    {
        "name": "extract_charts",
        "description": "PDF에서 차트 데이터를 추출합니다.",
        "input_schema": {
            "type": "object",
            "properties": {
                "pdf_path": {"type": "string"},
                "page": {"type": "integer"},
            },
            "required": ["pdf_path"],
        },
    },
    {
        "name": "search_document",
        "description": "문서 내용을 검색합니다.",
        "input_schema": {
            "type": "object",
            "properties": {
                "query": {"type": "string"},
            },
            "required": ["query"],
        },
    },
    {
        "name": "compare_documents",
        "description": "두 문서를 비교합니다.",
        "input_schema": {
            "type": "object",
            "properties": {
                "document_a": {"type": "string"},
                "document_b": {"type": "string"},
            },
            "required": ["document_a", "document_b"],
        },
    },
]
 
# 사용 예시
result = await agent.run(
    "이 두 분기 보고서를 비교하여 매출 변화를 분석하고, "
    "가장 큰 변화가 있는 부문을 차트 데이터와 함께 설명해주세요.",
    images=[q3_report_image, q4_report_image],
)

웹 브라우징 에이전트

웹 브라우징 에이전트 개념

python

# Web-browsing tool set. Custom tools need an "input_schema" so the model
# knows which arguments to send. The parameter names below are illustrative;
# align them with the real handler signatures.
web_tools = [
    {
        "name": "navigate",
        "description": "URL로 이동합니다.",
        "input_schema": {
            "type": "object",
            "properties": {
                "url": {"type": "string"},
            },
            "required": ["url"],
        },
    },
    {
        "name": "screenshot",
        "description": "현재 페이지 스크린샷을 캡처합니다.",
        "input_schema": {
            "type": "object",
            "properties": {},
        },
    },
    {
        "name": "click_element",
        "description": "페이지의 특정 요소를 클릭합니다.",
        "input_schema": {
            "type": "object",
            "properties": {
                "selector": {"type": "string"},
            },
            "required": ["selector"],
        },
    },
    {
        "name": "fill_form",
        "description": "폼 필드에 값을 입력합니다.",
        "input_schema": {
            "type": "object",
            "properties": {
                "selector": {"type": "string"},
                "value": {"type": "string"},
            },
            "required": ["selector", "value"],
        },
    },
    {
        "name": "extract_content",
        "description": "페이지 콘텐츠를 추출합니다.",
        "input_schema": {
            "type": "object",
            "properties": {
                "selector": {"type": "string"},
            },
        },
    },
]

# The agent decides each next step from a screenshot:
# 1. Load the page -> take a screenshot
# 2. Analyze the screenshot -> choose the element to click
# 3. Click -> take a new screenshot
# 4. Repeat...

에이전트 안전성

가드레일 설계

멀티모달 에이전트 가드레일

python

class AgentGuardrails:
    """Pre-execution checks for agent actions: URL allow list, blocked
    action names, and a per-run cost ceiling."""

    def __init__(self):
        """Set the default allow list, block list, and cost limit."""
        # Glob patterns matched against the URL's host name.
        self.allowed_domains = ["*.example.com"]
        self.blocked_actions = ["delete", "format", "shutdown"]
        self.max_cost_per_run = 1.0  # USD

    def validate_action(self, action: dict) -> bool:
        """Return True when ``action`` may be executed.

        An action is rejected when it carries a ``url`` outside the allow
        list, or when its ``type`` or ``action`` field names a blocked
        action.
        """
        # URL allow-list check.
        if "url" in action and not self._is_allowed_url(action["url"]):
            return False

        # Block dangerous actions. Both keys are checked because action
        # dicts name the operation under either "type" or "action".
        if any(action.get(key) in self.blocked_actions for key in ("type", "action")):
            return False

        return True

    def _is_allowed_url(self, url: str) -> bool:
        """Return True when ``url`` is http(s) and its host matches a pattern
        in ``allowed_domains``.

        Only the parsed host name is matched, so an allowed domain that
        appears in the path, query, or user-info part does not pass.
        """
        from fnmatch import fnmatchcase
        from urllib.parse import urlsplit

        if not isinstance(url, str):
            return False
        try:
            parts = urlsplit(url)
            host = parts.hostname
        except ValueError:
            # Malformed URL (e.g. broken IPv6 literal): treat as not allowed.
            return False
        if parts.scheme not in ("http", "https") or not host:
            return False
        return any(
            fnmatchcase(host, pattern.lower()) for pattern in self.allowed_domains
        )

    def validate_visual_output(self, screenshot: bytes) -> bool:
        """Check whether a screenshot may be passed on.

        Placeholder: no detection is implemented yet, so every screenshot
        is accepted. Detection of personal data, password fields, and the
        like is meant to go here.
        """
        return True

    def check_cost(self, current_cost: float) -> bool:
        """Return True while ``current_cost`` is below the per-run limit."""
        return current_cost < self.max_cost_per_run

Tip

멀티모달 에이전트는 텍스트 전용 에이전트보다 더 넓은 행동 범위를 가지므로, 보안과 안전성에 더 많은 주의가 필요합니다. 최소 권한 원칙을 적용하고, 모든 행동을 로깅하며, 위험한 작업에는 인간 승인(Human-in-the-Loop)을 요구하세요.

정리

멀티모달 에이전트는 시각적 이해와 행동 능력을 결합하여, 화면 상호작용, 문서 처리, 웹 브라우징 등 인간에 가까운 작업을 수행합니다. Computer Use, 멀티모달 도구 호출, 시각적 추론을 결합하면 강력한 자동화 시스템을 구축할 수 있습니다. 보안 가드레일과 비용 관리가 프로덕션 배포의 핵심입니다.

다음 장에서는 멀티모달 AI 시스템의 프로덕션 아키텍처와 최적화를 다룹니다. 서빙 인프라, 비용 관리, 지연 시간 최적화, 모니터링을 배웁니다.

이 글이 도움이 되셨나요?

AI / ML

10장: 프로덕션 아키텍처와 최적화

멀티모달 AI 시스템의 프로덕션 배포 전략 — 서빙 인프라, 비용 관리, 지연 시간 최적화, 캐싱, 모니터링, 그리고 확장성 설계를 다룹니다.

2026년 2월 25일·11분

AI / ML

8장: 멀티모달 RAG 시스템 설계

텍스트, 이미지, 표, 차트 등 다양한 모달리티를 통합하는 멀티모달 RAG 시스템의 설계와 구현을 다룹니다. ColPali, 비전 기반 검색, 문서 파싱 전략을 배웁니다.

2026년 2월 21일·13분

AI / ML

11장: 실전 프로젝트 — 멀티모달 AI 애플리케이션 구축

시리즈 전체의 기법을 종합하여 멀티모달 문서 분석 시스템을 설계하고 구현합니다. 이미지, 표, 차트를 이해하는 RAG 기반 Q&A 시스템을 구축합니다.

2026년 2월 27일·11분

2026년 2월 23일·AI / ML·

9장: 멀티모달 에이전트 구축

12분 · 677자 · 6개 섹션

llm multimodal embedding

multimodal-ai 9 / 11

1 2 3 4 5 6 7 8 9 10 11

이전: 8장: 멀티모달 RAG 시스템 설계 · 다음: 10장: 프로덕션 아키텍처와 최적화

멀티모달 에이전트란

전통적 에이전트:
  텍스트 → [LLM + 도구] → 텍스트

멀티모달 에이전트:
  텍스트 + 이미지 + 음성 → [VLM + 시각 도구 + 음성 도구] → 텍스트 + 행동

멀티모달 에이전트의 능력

시각적 관찰: 스크린샷, 카메라 피드, 문서 이미지를 이해
시각적 행동: 클릭, 드래그, 타이핑 등 UI 상호작용
멀티모달 추론: 시각 정보와 텍스트 정보를 결합한 의사결정
도구 활용: 카메라, OCR, 이미지 생성 등 멀티모달 도구 호출

Computer Use: 화면 상호작용 에이전트

Claude Computer Use

Anthropic의 Claude는 Computer Use 기능을 제공하여, 스크린샷을 보고 마우스/키보드 행동을 지시할 수 있습니다.

Computer Use 기본 구조

python

import anthropic
 
client = anthropic.Anthropic()

# Computer Use tool definitions (Anthropic-defined tool types).
# NOTE(review): the text editor tool type/name is model-version specific;
# confirm "text_editor_20250124" is accepted by the model used below.
tools = [
    {
        "type": "computer_20250124",
        "name": "computer",
        "display_width_px": 1920,
        "display_height_px": 1080,
        "display_number": 1,
    },
    {
        "type": "text_editor_20250124",
        "name": "str_replace_editor",
    },
    {
        "type": "bash_20250124",
        "name": "bash",
    },
]

# Conversation state: starts with the user's task and grows by one assistant
# turn plus one tool-result turn per loop iteration.
messages = [{
    "role": "user",
    "content": "Chrome 브라우저를 열고 'AI agent patterns'를 검색해주세요.",
}]

while True:
    # Computer Use is a beta feature, so the request goes through the beta
    # endpoint with the matching beta flag.
    response = client.beta.messages.create(
        model="claude-sonnet-4-20250514",
        max_tokens=4096,
        tools=tools,
        messages=messages,
        betas=["computer-use-2025-01-24"],
    )

    tool_calls = [block for block in response.content if block.type == "tool_use"]

    # Stop when the model finished its turn, and also when it stopped for any
    # other reason (e.g. max_tokens) without requesting a tool; otherwise the
    # loop would resend the same conversation forever.
    if response.stop_reason == "end_turn" or not tool_calls:
        break

    # Record the assistant turn exactly once, even if it holds several tool
    # calls; appending it per tool call would duplicate the turn.
    messages.append({"role": "assistant", "content": response.content})

    # Run every requested tool (screenshot, click, typing, ...) and answer all
    # of them in a single user turn.
    tool_results = []
    for block in tool_calls:
        result = execute_tool(block)
        # tool_result content is a string or a list of content blocks, so a
        # single block dict is wrapped in a list.
        if isinstance(result, dict):
            result = [result]
        tool_results.append({
            "type": "tool_result",
            "tool_use_id": block.id,
            "content": result,
        })

    messages.append({"role": "user", "content": tool_results})

Computer Use의 행동 유형

행동 처리 구현

python

def execute_computer_action(action: dict) -> dict:
    """Execute one Computer Use action and return a content block for it.

    Args:
        action: Tool input. ``action["action"]`` names the operation; the
            other keys read depend on it: ``coordinate`` (click, scroll),
            ``text`` (type, key), ``key`` (legacy key input),
            ``scroll_direction`` / ``scroll_amount`` (scroll).

    Returns:
        An ``image`` content block for ``screenshot``; a ``text`` content
        block describing the outcome for every other action, including
        unsupported or malformed ones (so the caller always gets a dict).
    """
    action_type = action.get("action")

    if action_type == "screenshot":
        # Capture the current screen and return it as a base64 PNG block.
        screenshot = capture_screenshot()
        return {
            "type": "image",
            "source": {
                "type": "base64",
                "media_type": "image/png",
                "data": encode_screenshot(screenshot),
            },
        }

    # "left_click" is the name the computer tool emits; "click" is kept for
    # callers that already use the shorter name.
    if action_type in ("click", "left_click"):
        x, y = action["coordinate"]
        pyautogui.click(x, y)
        return {"type": "text", "text": f"Clicked at ({x}, {y})"}

    if action_type == "type":
        # NOTE(review): confirm pyautogui actually types non-ASCII text
        # (e.g. Korean); it may only cover keys in its own keyboard map.
        pyautogui.typewrite(action["text"], interval=0.05)
        return {"type": "text", "text": f"Typed: {action['text']}"}

    if action_type == "key":
        # The computer tool sends the key combo in "text"; "key" is the
        # field this function originally read, so both are accepted.
        combo = action.get("key") or action.get("text")
        if not combo:
            return {"type": "text", "text": "Key action is missing the key combination"}
        pyautogui.hotkey(*combo.split("+"))
        return {"type": "text", "text": f"Pressed: {combo}"}

    if action_type == "scroll":
        # The coordinate is where to scroll, not how far: direction and
        # amount come from their own fields.
        direction = action.get("scroll_direction")
        amount = int(action.get("scroll_amount", 1))
        x, y = action.get("coordinate") or (None, None)
        if direction in ("up", "down"):
            # pyautogui scrolls up for positive values, down for negative.
            pyautogui.scroll(amount if direction == "up" else -amount, x=x, y=y)
        elif direction in ("left", "right"):
            pyautogui.hscroll(amount if direction == "right" else -amount, x=x, y=y)
        else:
            return {"type": "text", "text": f"Unsupported scroll direction: {direction}"}
        return {"type": "text", "text": f"Scrolled: {direction} {amount}"}

    # Report unknown actions instead of silently returning None.
    return {"type": "text", "text": f"Unsupported action: {action_type}"}

Warning

멀티모달 도구 호출 에이전트

도구 정의

멀티모달 도구 세트

python

def _tool_spec(name, description, properties, required=None):
    """Build one custom tool definition with an object-typed input schema.

    ``required`` is only written into the schema when it is given, so tools
    without required parameters carry no "required" key at all.
    """
    schema = {"type": "object", "properties": properties}
    if required is not None:
        schema["required"] = required
    return {"name": name, "description": description, "input_schema": schema}


# Multimodal tool set: screenshot capture, image analysis, OCR, image search.
tools = [
    _tool_spec(
        "capture_screenshot",
        "현재 화면의 스크린샷을 캡처합니다.",
        {
            "region": {
                "type": "string",
                "description": "캡처 영역: full, active_window, 또는 x,y,w,h 좌표",
            },
        },
    ),
    _tool_spec(
        "analyze_image",
        "이미지를 분석하여 내용을 설명합니다.",
        {
            "image_path": {"type": "string"},
            "question": {"type": "string"},
        },
        required=["image_path"],
    ),
    _tool_spec(
        "extract_text_from_image",
        "이미지에서 텍스트를 추출합니다 (OCR).",
        {
            "image_path": {"type": "string"},
            "language": {"type": "string", "default": "ko"},
        },
        required=["image_path"],
    ),
    _tool_spec(
        "search_similar_images",
        "텍스트 설명으로 유사한 이미지를 검색합니다.",
        {
            "query": {"type": "string"},
            "top_k": {"type": "integer", "default": 5},
        },
        required=["query"],
    ),
]

에이전트 루프 구현

멀티모달 에이전트 루프

python

class MultimodalAgent:
    def __init__(self, client, tools, max_iterations: int = 10):
        self.client = client
        self.tools = tools
        self.max_iterations = max_iterations
        self.tool_handlers = {}
 
    def register_handler(self, name: str, handler):
        self.tool_handlers[name] = handler
 
    async def run(self, user_message: str, images: list[dict] | None = None) -> str:
        """에이전트 실행"""
        # 초기 메시지 구성
        content = []
        if images:
            content.extend(images)
        content.append({"type": "text", "text": user_message})
 
        messages = [{"role": "user", "content": content}]
 
        for _ in range(self.max_iterations):
            response = self.client.messages.create(
                model="claude-sonnet-4-20250514",
                max_tokens=4096,
                tools=self.tools,
                messages=messages,
            )
 
            # 최종 응답
            if response.stop_reason == "end_turn":
                return self._extract_text(response.content)
 
            # 도구 호출 처리
            messages.append({"role": "assistant", "content": response.content})
            tool_results = []
 
            for block in response.content:
                if block.type == "tool_use":
                    handler = self.tool_handlers.get(block.name)
                    if handler:
                        result = await handler(block.input)
                        tool_results.append({
                            "type": "tool_result",
                            "tool_use_id": block.id,
                            "content": result,
                        })
 
            messages.append({"role": "user", "content": tool_results})
 
        return "최대 반복 횟수에 도달했습니다."
 
    def _extract_text(self, content) -> str:
        return "\n".join(
            block.text for block in content if hasattr(block, "text")
        )

실전 패턴

시각적 QA 에이전트

시각적 질의응답 에이전트

python

# 사용자가 이미지를 업로드하고 질문하면,
# 에이전트가 이미지를 분석하고 추가 도구를 활용하여 답변
 
agent = MultimodalAgent(client, tools)
 
# 도구 핸들러 등록
agent.register_handler("analyze_image", analyze_image_handler)
agent.register_handler("search_similar_images", search_handler)
agent.register_handler("extract_text_from_image", ocr_handler)
 
# 이미지와 함께 질문
result = await agent.run(
    user_message="이 아키텍처 다이어그램에서 데이터베이스 병목 지점을 찾고, AWS에서 사용할 수 있는 대안 서비스를 추천해주세요.",
    images=[{
        "type": "image",
        "source": {"type": "base64", "media_type": "image/png", "data": diagram_b64},
    }],
)

문서 처리 에이전트

python

# Document-processing tool set. Custom tools need an "input_schema" so the
# model knows which arguments to send. The parameter names below are
# illustrative; align them with the real handler signatures.
document_tools = [
    {
        "name": "extract_tables",
        "description": "PDF에서 표를 추출합니다.",
        "input_schema": {
            "type": "object",
            "properties": {
                "pdf_path": {"type": "string"},
                "page": {"type": "integer"},
            },
            "required": ["pdf_path"],
        },
    },
    {
        "name": "extract_charts",
        "description": "PDF에서 차트 데이터를 추출합니다.",
        "input_schema": {
            "type": "object",
            "properties": {
                "pdf_path": {"type": "string"},
                "page": {"type": "integer"},
            },
            "required": ["pdf_path"],
        },
    },
    {
        "name": "search_document",
        "description": "문서 내용을 검색합니다.",
        "input_schema": {
            "type": "object",
            "properties": {
                "query": {"type": "string"},
            },
            "required": ["query"],
        },
    },
    {
        "name": "compare_documents",
        "description": "두 문서를 비교합니다.",
        "input_schema": {
            "type": "object",
            "properties": {
                "document_a": {"type": "string"},
                "document_b": {"type": "string"},
            },
            "required": ["document_a", "document_b"],
        },
    },
]
 
# 사용 예시
result = await agent.run(
    "이 두 분기 보고서를 비교하여 매출 변화를 분석하고, "
    "가장 큰 변화가 있는 부문을 차트 데이터와 함께 설명해주세요.",
    images=[q3_report_image, q4_report_image],
)

웹 브라우징 에이전트

웹 브라우징 에이전트 개념

python

# Web-browsing tool set. Custom tools need an "input_schema" so the model
# knows which arguments to send. The parameter names below are illustrative;
# align them with the real handler signatures.
web_tools = [
    {
        "name": "navigate",
        "description": "URL로 이동합니다.",
        "input_schema": {
            "type": "object",
            "properties": {
                "url": {"type": "string"},
            },
            "required": ["url"],
        },
    },
    {
        "name": "screenshot",
        "description": "현재 페이지 스크린샷을 캡처합니다.",
        "input_schema": {
            "type": "object",
            "properties": {},
        },
    },
    {
        "name": "click_element",
        "description": "페이지의 특정 요소를 클릭합니다.",
        "input_schema": {
            "type": "object",
            "properties": {
                "selector": {"type": "string"},
            },
            "required": ["selector"],
        },
    },
    {
        "name": "fill_form",
        "description": "폼 필드에 값을 입력합니다.",
        "input_schema": {
            "type": "object",
            "properties": {
                "selector": {"type": "string"},
                "value": {"type": "string"},
            },
            "required": ["selector", "value"],
        },
    },
    {
        "name": "extract_content",
        "description": "페이지 콘텐츠를 추출합니다.",
        "input_schema": {
            "type": "object",
            "properties": {
                "selector": {"type": "string"},
            },
        },
    },
]

# The agent decides each next step from a screenshot:
# 1. Load the page -> take a screenshot
# 2. Analyze the screenshot -> choose the element to click
# 3. Click -> take a new screenshot
# 4. Repeat...

에이전트 안전성

가드레일 설계

멀티모달 에이전트 가드레일

python

class AgentGuardrails:
    """Pre-execution checks for agent actions: URL allow list, blocked
    action names, and a per-run cost ceiling."""

    def __init__(self):
        """Set the default allow list, block list, and cost limit."""
        # Glob patterns matched against the URL's host name.
        self.allowed_domains = ["*.example.com"]
        self.blocked_actions = ["delete", "format", "shutdown"]
        self.max_cost_per_run = 1.0  # USD

    def validate_action(self, action: dict) -> bool:
        """Return True when ``action`` may be executed.

        An action is rejected when it carries a ``url`` outside the allow
        list, or when its ``type`` or ``action`` field names a blocked
        action.
        """
        # URL allow-list check.
        if "url" in action and not self._is_allowed_url(action["url"]):
            return False

        # Block dangerous actions. Both keys are checked because action
        # dicts name the operation under either "type" or "action".
        if any(action.get(key) in self.blocked_actions for key in ("type", "action")):
            return False

        return True

    def _is_allowed_url(self, url: str) -> bool:
        """Return True when ``url`` is http(s) and its host matches a pattern
        in ``allowed_domains``.

        Only the parsed host name is matched, so an allowed domain that
        appears in the path, query, or user-info part does not pass.
        """
        from fnmatch import fnmatchcase
        from urllib.parse import urlsplit

        if not isinstance(url, str):
            return False
        try:
            parts = urlsplit(url)
            host = parts.hostname
        except ValueError:
            # Malformed URL (e.g. broken IPv6 literal): treat as not allowed.
            return False
        if parts.scheme not in ("http", "https") or not host:
            return False
        return any(
            fnmatchcase(host, pattern.lower()) for pattern in self.allowed_domains
        )

    def validate_visual_output(self, screenshot: bytes) -> bool:
        """Check whether a screenshot may be passed on.

        Placeholder: no detection is implemented yet, so every screenshot
        is accepted. Detection of personal data, password fields, and the
        like is meant to go here.
        """
        return True

    def check_cost(self, current_cost: float) -> bool:
        """Return True while ``current_cost`` is below the per-run limit."""
        return current_cost < self.max_cost_per_run