RAG 시스템 구축하기

개요

RAG (Retrieval-Augmented Generation) 시스템은 외부 문서를 검색하여 더 정확하고 맥락에 맞는 답변을 생성합니다. 이 가이드에서는 Clova Studio GOV의 RAG42 API를 사용하여 완전한 RAG 시스템을 구축하는 방법을 다룹니다.

사용 가능한 모델의 종류와 특징은 「언어 모델 종류」 문서를 참고하세요.

RAG 시스템 구조

컬렉션 생성

문서를 저장할 컬렉션을 생성합니다.

문서 업로드 및 처리

문서를 업로드하고 자동으로 청크로 분할합니다.

문서 검색

사용자 질문과 관련된 문서를 검색합니다.

답변 생성

검색된 문서를 컨텍스트로 사용하여 답변을 생성합니다.

완전한 RAG 시스템 구현

Python 예제

import requests
from typing import List, Dict, Any

class ClovaRAGSystem:
    """Thin client for the Clova Studio GOV RAG42 API plus chat completion.

    Wraps the HTTP calls of a RAG pipeline: create a collection, upload
    documents into it, search it, and generate an answer from the search
    results. Every request method calls ``raise_for_status()``, so an error
    status from the server surfaces as ``requests.HTTPError``.
    """

    # Spelled out in full because this endpoint does not share base_url's
    # path prefix (".../api/v1/..." versus ".../v1").
    CHAT_COMPLETIONS_URL = "https://api.clovastudio.go.kr/api/v1/chat/completions"

    def __init__(self, api_key: str):
        """Store the API key and prepare the default JSON request headers."""
        self.api_key = api_key
        self.base_url = "https://api.clovastudio.go.kr/v1"
        self.headers = {
            "Authorization": f"Bearer {api_key}",
            "Content-Type": "application/json"
        }

    def create_collection(self, name: str, description: str = "") -> str:
        """Create a collection for storing documents and return its ID."""
        data = {
            "name": name,
            "description": description
        }

        response = requests.post(
            f"{self.base_url}/rag42/collections",
            headers=self.headers,
            json=data
        )
        response.raise_for_status()

        collection_id = response.json()['id']
        print(f"컬렉션 생성 완료: {collection_id}")
        return collection_id

    @staticmethod
    def _build_upload_form(collection_id: str,
                           metadata: Dict[str, Any] = None) -> Dict[str, str]:
        """Build the non-file form fields of a document upload request."""
        import json  # local import keeps this snippet self-contained

        form = {'collection_id': collection_id}
        if metadata:
            # A multipart form field must be a scalar. Passing the dict
            # itself makes requests iterate it and send only the keys, so
            # the values would be lost; send it as one JSON string instead.
            # NOTE(review): assumes the API accepts JSON-encoded metadata —
            # confirm against the RAG42 API reference.
            form['metadata'] = json.dumps(metadata, ensure_ascii=False)
        return form

    def upload_document(self, collection_id: str, file_path: str,
                       metadata: Dict[str, Any] = None) -> str:
        """Upload a file into a collection and return the new document ID.

        Args:
            collection_id: ID of the target collection.
            file_path: Path of the file to upload; opened in binary mode.
            metadata: Optional metadata sent along as a JSON-encoded field.

        Raises:
            OSError: If ``file_path`` cannot be opened.
            requests.HTTPError: If the server responds with an error status.
        """
        form = self._build_upload_form(collection_id, metadata)

        # Sent as multipart/form-data, so self.headers is not reused: its
        # JSON Content-Type would override the multipart one requests sets.
        auth_headers = {"Authorization": f"Bearer {self.api_key}"}

        with open(file_path, 'rb') as f:
            response = requests.post(
                f"{self.base_url}/rag42/documents",
                headers=auth_headers,
                files={'file': f},
                data=form
            )
        response.raise_for_status()

        doc_id = response.json()['id']
        print(f"문서 업로드 완료: {doc_id}")
        return doc_id

    def search_documents(self, collection_id: str, query: str,
                        top_k: int = 5) -> List[Dict[str, Any]]:
        """Search a collection and return the ``results`` list of the response.

        Args:
            collection_id: ID of the collection to search.
            query: Search query text.
            top_k: Number of results requested from the API.
        """
        data = {
            "collection_id": collection_id,
            "query": query,
            "top_k": top_k
        }

        response = requests.post(
            f"{self.base_url}/rag42/search",
            headers=self.headers,
            json=data
        )
        response.raise_for_status()

        results = response.json()['results']
        print(f"검색 완료: {len(results)}개 문서 발견")
        return results

    @staticmethod
    def _build_messages(query: str,
                        context_docs: List[Dict[str, Any]]) -> List[Dict[str, str]]:
        """Build the chat messages: a system prompt plus the documents and question."""
        # Number each document so it is clearly delimited in the prompt.
        context = "\n\n".join(
            f"[문서 {i+1}]\n{doc['content']}"
            for i, doc in enumerate(context_docs)
        )

        return [
            {
                "role": "system",
                "content": "당신은 제공된 문서를 기반으로 정확하게 답변하는 AI 어시스턴트입니다. "
                          "문서에 없는 내용은 추측하지 말고, '제공된 문서에서 해당 정보를 찾을 수 없습니다'라고 답변하세요."
            },
            {
                "role": "user",
                "content": f"다음 문서들을 참고하여 질문에 답변해주세요.\n\n"
                          f"## 참고 문서\n{context}\n\n## 질문\n{query}"
            }
        ]

    def generate_answer(self, query: str, context_docs: List[Dict[str, Any]]) -> str:
        """Generate an answer to ``query`` using the retrieved documents as context.

        Args:
            query: The user's question.
            context_docs: Retrieved documents; each must have a ``content`` key.

        Returns:
            The content of the first choice in the chat completion response.

        Raises:
            requests.HTTPError: If the server responds with an error status.
        """
        response = requests.post(
            self.CHAT_COMPLETIONS_URL,
            headers=self.headers,
            json={
                "model": "HCX-GOV-THINK",
                "messages": self._build_messages(query, context_docs),
                "temperature": 0.3,  # low temperature for more consistent answers
                # The body is parsed with response.json() below, which needs a
                # single JSON document, so streaming must stay disabled.
                "stream": False
            }
        )
        response.raise_for_status()

        answer = response.json()['choices'][0]['message']['content']
        return answer

    def ask(self, collection_id: str, question: str, top_k: int = 5) -> Dict[str, Any]:
        """Answer a question end to end: search the collection, then generate.

        Returns:
            A dict with ``answer`` (the generated text) and ``sources`` (the
            search results the answer was generated from).
        """
        # 1. Retrieve the relevant documents.
        search_results = self.search_documents(collection_id, question, top_k)

        # 2. Generate the answer from them.
        answer = self.generate_answer(question, search_results)

        return {
            "answer": answer,
            "sources": search_results
        }

# Usage example: create a collection, upload one document, then ask a question.
if __name__ == "__main__":
    rag = ClovaRAGSystem("YOUR_API_KEY")

    # Step 1: create the collection that will hold the documents.
    collection_id = rag.create_collection("회사 문서", "회사 규정 및 정책 문서")

    # Step 2: upload a document together with its metadata.
    policy_metadata = {"category": "policy", "department": "HR"}
    doc_id = rag.upload_document(collection_id, "./company_policy.pdf", policy_metadata)

    # Step 3: ask a question against the collection.
    result = rag.ask(collection_id, "연차 사용 규정은 어떻게 되나요?")

    answer_text = result['answer']
    print(f"\n답변: {answer_text}")
    source_count = len(result['sources'])
    print(f"\n참고 문서: {source_count}개")

고급 기능: 하이브리드 검색

키워드 검색과 의미 검색을 결합하여 더 정확한 결과를 얻을 수 있습니다. 아래 두 함수는 ClovaRAGSystem 클래스에 메서드로 추가하여 사용합니다.

def hybrid_search(self, collection_id: str, query: str,
                  top_k: int = 10, rerank: bool = True,
                  alpha: float = 0.5) -> List[Dict[str, Any]]:
    """Run a hybrid (keyword + semantic) search, optionally reranking the hits.

    Written as a method: it uses ``self.base_url``, ``self.headers`` and
    ``self.rerank_documents``.

    Args:
        collection_id: ID of the collection to search.
        query: Search query text.
        top_k: Number of results requested from the search endpoint.
        rerank: When true, pass the hits through ``self.rerank_documents``.
            That call uses the reranker's own default ``top_k``, so the
            returned list can be shorter than ``top_k``.
        alpha: Balance between the two search modes: 0 is keyword only,
            1 is semantic only, 0.5 weights them equally.

    Returns:
        The ``results`` list of the search response, reranked if requested.

    Raises:
        ValueError: If ``alpha`` is outside the range 0 to 1.
        requests.HTTPError: If the server responds with an error status.
    """
    # Validate before any network traffic so a bad weight fails fast.
    if not 0.0 <= alpha <= 1.0:
        raise ValueError(f"alpha must be between 0 and 1, got {alpha!r}")

    data = {
        "collection_id": collection_id,
        "query": query,
        "top_k": top_k,
        "search_type": "hybrid",
        "alpha": alpha
    }

    response = requests.post(
        f"{self.base_url}/rag42/search",
        headers=self.headers,
        json=data
    )
    response.raise_for_status()

    results = response.json()['results']

    # Nothing to reorder when the search came back empty, so skip the
    # extra rerank request in that case.
    if rerank and results:
        results = self.rerank_documents(query, results)

    return results

def rerank_documents(self, query: str, documents: List[Dict[str, Any]],
                     top_k: int = 5) -> List[Dict[str, Any]]:
    """Reorder documents by relevance to ``query`` using the rerank endpoint.

    Written as a method: it uses ``self.base_url`` and ``self.headers``.

    Args:
        query: Query text the documents are ranked against.
        documents: Documents to rerank; each must have a ``content`` key.
        top_k: Number of documents requested back from the reranker.

    Returns:
        The input documents selected and ordered by the response's
        ``rankings`` field; an empty list when ``documents`` is empty.

    Raises:
        requests.HTTPError: If the server responds with an error status.
    """
    # An empty input has nothing to rank; return without an HTTP request.
    if not documents:
        return []

    data = {
        "query": query,
        "documents": [doc['content'] for doc in documents],
        "top_k": top_k
    }

    response = requests.post(
        f"{self.base_url}/text/rerank",
        headers=self.headers,
        json=data
    )
    response.raise_for_status()

    # NOTE(review): 'rankings' is used as a list of integer indexes into
    # `documents` — confirm against the rerank API reference.
    reranked_indices = response.json()['rankings']
    return [documents[i] for i in reranked_indices]

문서 전처리 파이프라인

문서를 업로드하기 전에 전처리하여 검색 품질을 향상시킬 수 있습니다.

class DocumentProcessor:
    """Pre-processing helpers to run on a document before it is uploaded.

    Wraps the parse, chunk and PII-masking endpoints, and combines them in
    ``process_and_upload``. Every request method calls ``raise_for_status()``,
    so an error status surfaces as ``requests.HTTPError``.
    """

    def __init__(self, api_key: str):
        """Store the API key and prepare the default JSON request headers."""
        self.api_key = api_key
        self.base_url = "https://api.clovastudio.go.kr/v1"
        self.headers = {
            "Authorization": f"Bearer {api_key}",
            "Content-Type": "application/json"
        }

    def parse_document(self, file_path: str) -> str:
        """Send a file to the parse endpoint and return the ``text`` field."""
        # Multipart upload: only the auth header is sent so that requests can
        # set the multipart Content-Type itself.
        headers = {"Authorization": f"Bearer {self.api_key}"}

        with open(file_path, 'rb') as f:
            response = requests.post(
                f"{self.base_url}/rag42/document/parse",
                headers=headers,
                files={'file': f}
            )
        response.raise_for_status()

        return response.json()['text']

    def chunk_text(self, text: str, chunk_size: int = 512,
                   overlap: int = 50) -> List[str]:
        """Split text through the chunk endpoint and return the ``chunks`` field.

        Args:
            text: Text to split.
            chunk_size: Chunk size passed to the API.
            overlap: Overlap between adjacent chunks passed to the API.
        """
        data = {
            "text": text,
            "chunk_size": chunk_size,
            "overlap": overlap
        }

        response = requests.post(
            f"{self.base_url}/text/chunk",
            headers=self.headers,
            json=data
        )
        response.raise_for_status()

        return response.json()['chunks']

    def mask_pii(self, text: str) -> str:
        """Mask PII through the pii-mask endpoint and return ``masked_text``."""
        data = {"text": text}

        response = requests.post(
            f"{self.base_url}/text/pii-mask",
            headers=self.headers,
            json=data
        )
        response.raise_for_status()

        return response.json()['masked_text']

    def process_and_upload(self, rag_system: "ClovaRAGSystem",
                          collection_id: str, file_path: str) -> List[str]:
        """Parse, mask and chunk a document, then upload every chunk.

        Args:
            rag_system: Client whose ``upload_document(collection_id, path)``
                is called once per chunk.
            collection_id: ID of the collection that receives the chunks.
            file_path: Path of the source document.

        Returns:
            The document IDs returned by ``upload_document``, in chunk order.
        """
        import os
        import tempfile

        # 1. Parse the document into plain text.
        text = self.parse_document(file_path)

        # 2. Mask PII before the text is stored anywhere.
        text = self.mask_pii(text)

        # 3. Split into chunks.
        chunks = self.chunk_text(text)

        # 4. Upload each chunk as its own document. upload_document takes a
        #    file path, so each chunk goes through a temporary UTF-8 file.
        doc_ids = []
        for i, chunk in enumerate(chunks):
            print(f"청크 {i+1}/{len(chunks)} 업로드 중...")

            # delete=False: the file is closed before upload_document reopens
            # it by path, and removed explicitly afterwards.
            tmp = tempfile.NamedTemporaryFile(
                "w", suffix=".txt", encoding="utf-8", delete=False
            )
            try:
                with tmp:
                    tmp.write(chunk)
                doc_ids.append(rag_system.upload_document(collection_id, tmp.name))
            finally:
                os.remove(tmp.name)

        return doc_ids

# Usage example: pre-process a document and load it into a new collection.
rag = ClovaRAGSystem("YOUR_API_KEY")
processor = DocumentProcessor("YOUR_API_KEY")

collection_id = rag.create_collection(name="처리된 문서")
processor.process_and_upload(
    rag_system=rag,
    collection_id=collection_id,
    file_path="./document.pdf",
)

대화형 RAG 챗봇

대화 컨텍스트를 유지하면서 RAG를 사용하는 챗봇을 구현합니다.

class RAGChatBot:
    """Multi-turn chatbot that answers from documents retrieved per question.

    Keeps the plain user/assistant turns in ``conversation_history`` and, for
    each new message, retrieves documents and sends them with the history to
    the chat completion endpoint.
    """

    def __init__(self, api_key: str, collection_id: str):
        """Create the underlying RAG client and start with an empty history."""
        self.rag = ClovaRAGSystem(api_key)
        self.collection_id = collection_id
        self.conversation_history = []

    def chat(self, user_message: str) -> str:
        """Answer ``user_message`` using retrieved documents and prior turns.

        Returns:
            The content of the first choice in the chat completion response.

        Raises:
            requests.HTTPError: If the server responds with an error status.
        """
        # 1. Derive the search query from the message and the history.
        search_query = self._generate_search_query(user_message)

        # 2. Retrieve the relevant documents.
        docs = self.rag.search_documents(self.collection_id, search_query)
        context = "\n\n".join([doc['content'] for doc in docs])

        # 3. Ask the model, giving it the earlier turns plus the documents.
        response = requests.post(
            "https://api.clovastudio.go.kr/api/v1/chat/completions",
            headers=self.rag.headers,
            json={
                "model": "HCX-GOV-THINK",
                "messages": self._build_messages(user_message, context),
                # The body is parsed with response.json() below, which needs a
                # single JSON document, so streaming must stay disabled.
                "stream": False
            }
        )
        response.raise_for_status()

        answer = response.json()['choices'][0]['message']['content']

        # Record the turn only after a successful reply, so a failed request
        # does not leave a user message without an assistant answer.
        self.conversation_history.append({
            "role": "user",
            "content": user_message
        })
        self.conversation_history.append({
            "role": "assistant",
            "content": answer
        })

        return answer

    def _build_messages(self, user_message: str, context: str) -> list:
        """Build the request messages: system prompt, prior turns, current question.

        The history holds the plain user messages; only the current question
        is sent together with the retrieved documents.
        """
        return [
            {"role": "system", "content": "당신은 문서 기반 질의응답 어시스턴트입니다."},
            *self.conversation_history,
            {
                "role": "user",
                "content": f"참고 문서:\n{context}\n\n질문: {user_message}"
            }
        ]

    def _generate_search_query(self, current_message: str) -> str:
        """Return the search query to use for ``current_message``.

        Currently returns the message unchanged whether or not there is
        history; rewriting the query from earlier turns is left as a
        placeholder.
        """
        if not self.conversation_history:
            return current_message

        # Placeholder: a real implementation would refine the query using the
        # earlier turns.
        return current_message

# Usage example: two consecutive questions on the same bot.
bot = RAGChatBot("YOUR_API_KEY", "collection_id_here")

# The second question is asked with the first exchange kept in the history.
for question in ("연차 규정에 대해 알려주세요.", "그럼 병가는 어떻게 되나요?"):
    print(bot.chat(question))

성능 최적화 팁

청킹 전략

문서 타입에 따라 적절한 청크 크기 선택
의미 단위로 분할 (문단, 섹션)
청크 간 오버랩 설정으로 컨텍스트 손실 방지

검색 최적화

하이브리드 검색 사용
재순위(rerank) 적용으로 정확도 향상
적절한 top_k 값 설정

답변 생성 최적화

낮은 temperature로 일관성 향상
명확한 시스템 프롬프트 사용
검색된 문서 개수 최적화 (너무 많으면 컨텍스트 혼란)

다음 단계

문서 처리 파이프라인

고급 문서 처리 기법

API Reference

RAG42 API 상세 문서

Cookbook

API 활용 예제

개요

RAG 시스템 구조

완전한 RAG 시스템 구현

Python 예제

고급 기능: 하이브리드 검색

문서 전처리 파이프라인

대화형 RAG 챗봇

성능 최적화 팁

다음 단계

문서 처리 파이프라인

API Reference

Cookbook

API 활용 예제

개요

RAG 시스템 구조

완전한 RAG 시스템 구현

Python 예제

고급 기능: 하이브리드 검색

문서 전처리 파이프라인

대화형 RAG 챗봇

성능 최적화 팁

다음 단계

문서 처리 파이프라인

API Reference

개요

RAG 시스템 구조

완전한 RAG 시스템 구현

Python 예제

고급 기능: 하이브리드 검색

문서 전처리 파이프라인

대화형 RAG 챗봇

성능 최적화 팁

다음 단계