RAG 시스템 실전 구축 (v21)
1. RAG 기초 개념
Retrieval-Augmented Generation (RAG)์ ๊ฒ์ ๊ธฐ๋ฐ ์์ฑ ์์คํ
์ผ๋ก, LLM์ ์ง์ ๋ฒ์๋ฅผ ํ์ฅํ๋ ๋ฐ ํจ๊ณผ์ ์
๋๋ค. ํต์ฌ ๋ฃจํ๋ ๋ค์๊ณผ ๊ฐ์ต๋๋ค:
입력 질의 → 검색기 → 문서 조각 → 증강된 프롬프트 → LLM 생성 → 출력
RAG 시스템은 3가지 주요 구성 요소로 이루어집니다:
- Retriever: 관련 문서 찾기
- Generator: 증강된 프롬프트로 생성
- Embedding: 문서 및 질의 벡터화
2. 청킹 전략
2.1 의미적 청킹
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans
import numpy as np
class SemanticChunker:
    """Group the sentences of a text into chunks by clustering their embeddings."""

    def __init__(self, model_name='all-MiniLM-L6-v2'):
        """Load the sentence-embedding model identified by ``model_name``."""
        self.model = SentenceTransformer(model_name)

    def chunk_by_semantic(self, text, threshold=0.7):
        """Split ``text`` into sentences and merge them per K-means cluster.

        Args:
            text: Input text. Sentences are obtained by splitting on ``'. '``,
                so that separator is removed and not restored.
            threshold: Accepted for interface compatibility; the current
                implementation does not read it.

        Returns:
            A list of chunk strings, one per cluster. Sentences of a cluster
            are joined with single spaces in their original order, and chunks
            are ordered by the first appearance of each cluster label.
        """
        parts = text.split('. ')
        # Roughly one cluster per three sentences, but never fewer than one.
        n_groups = max(1, len(parts) // 3)

        vectors = self.model.encode(parts)
        # Use K-means clustering to form the semantic groups.
        clusterer = KMeans(n_clusters=n_groups)
        clusterer.fit(vectors)

        grouped = {}
        for sentence, label in zip(parts, clusterer.labels_):
            grouped.setdefault(label, []).append(sentence)

        return [' '.join(members) for members in grouped.values()]
2.2 재귀적 청킹
import re
class RecursiveChunker:
    """Split text into size-bounded, overlapping chunks, preferring sentence ends."""

    def __init__(self, chunk_size=512, overlap=50):
        """Store the chunking parameters.

        Args:
            chunk_size: Maximum number of characters per chunk (at least 1).
            overlap: Number of trailing characters of a chunk that are
                repeated at the start of the next one (must not be negative).

        Raises:
            ValueError: If ``chunk_size`` is below 1 or ``overlap`` is negative.
        """
        if chunk_size < 1:
            raise ValueError("chunk_size must be at least 1")
        if overlap < 0:
            raise ValueError("overlap must not be negative")
        self.chunk_size = chunk_size
        self.overlap = overlap

    def chunk_recursive(self, text):
        """Return the list of chunks covering ``text`` from start to end.

        Each chunk holds at most ``chunk_size`` characters. When a chunk does
        not reach the end of the text and contains a ``'. '`` separator beyond
        30% of ``chunk_size``, the chunk is cut right after that period so it
        ends on a sentence boundary. Consecutive chunks share ``overlap``
        characters whenever that still lets the window move forward.

        Args:
            text: The string to split.

        Returns:
            A list of chunk strings; empty when ``text`` is empty.
        """
        chunks = []
        text_len = len(text)
        start = 0
        while start < text_len:
            end = min(start + self.chunk_size, text_len)
            if end < text_len:
                # Look for a sentence boundary inside the current window.
                separator = text[start:end].rfind('. ')
                if separator > self.chunk_size * 0.3:
                    # Keep the period, drop the following space.
                    end = start + separator + 1
            chunks.append(text[start:end])

            if end >= text_len:
                # The final chunk is emitted; stepping back by `overlap` here
                # would re-emit the tail of the text forever.
                break

            next_start = end - self.overlap
            if next_start <= start:
                # Overlap is at least as long as this chunk: skip the overlap
                # so the window always advances.
                next_start = end
            start = next_start
        return chunks
3. 임베딩 모델 선택
3.1 모델 비교 테스트
from sentence_transformers import SentenceTransformer
import time
def compare_embedding_models():
    """Time several sentence-transformers models on a small fixed sample.

    Every model is loaded first and only the ``encode`` call is timed, so the
    reported duration excludes model loading.

    Returns:
        A dict keyed by model name. Each value is a dict with:
        ``'speed'`` (elapsed seconds for encoding the sample sentences),
        ``'dimensions'`` (length of one embedding vector) and
        ``'speed_category'`` (the label assigned to the model below).
    """
    # Identifiers must match published sentence-transformers model names,
    # otherwise SentenceTransformer(...) cannot load them.
    models = {
        'all-MiniLM-L6-v2': 'fast',
        'all-mpnet-base-v2': 'medium',
        'multi-qa-MiniLM-L6-cos-v1': 'fast',
        'paraphrase-multilingual-MiniLM-L12-v2': 'fast',
    }
    test_sentences = [
        "The quick brown fox jumps over the lazy dog.",
        "Machine learning is a subset of artificial intelligence.",
        "Natural language processing enables computers to understand text.",
    ]

    results = {}
    for model_name, speed in models.items():
        model = SentenceTransformer(model_name)

        # Measure encoding time only.
        start_time = time.time()
        embeddings = model.encode(test_sentences)
        end_time = time.time()

        results[model_name] = {
            'speed': end_time - start_time,
            'dimensions': len(embeddings[0]),
            'speed_category': speed,
        }
    return results
# Usage example: run the benchmark and print one summary line per model.
model_results = compare_embedding_models()
# The printed labels were mis-encoded; restored to the intended Korean
# ("model performance comparison" / "dimensions").
print("모델 성능 비교:")
for name, metrics in model_results.items():
    print(f"{name}: {metrics['speed']:.2f}s, {metrics['dimensions']}차원")
4. 벡터 데이터베이스 비교
4.1 Chroma vs Qdrant vs pgvector
import chromadb
from qdrant_client import QdrantClient
import psycopg2
import numpy as np
class VectorDBComparison:
    """Insert helpers for comparing Chroma, Qdrant and PostgreSQL/pgvector.

    Creating an instance opens a client or connection for all three backends.
    """

    def __init__(self):
        """Connect to Chroma (in-process), Qdrant and PostgreSQL."""
        # Chroma
        self.chroma_client = chromadb.Client()
        self.chroma_collection = self.chroma_client.get_or_create_collection(
            name="rag_test"
        )
        # Qdrant
        self.qdrant_client = QdrantClient(host='localhost', port=6333)
        # pgvector (PostgreSQL)
        # NOTE(review): connection settings and credentials are hard-coded;
        # load them from configuration before using this outside a demo.
        self.pg_conn = psycopg2.connect(
            host="localhost",
            database="rag_db",
            user="user",
            password="password"
        )

    @staticmethod
    def _to_vector_literal(embedding):
        """Render ``embedding`` as a pgvector text literal such as ``[0.1,0.2]``."""
        values = np.asarray(embedding, dtype=float).ravel()
        return "[" + ",".join(repr(float(value)) for value in values) + "]"

    def chroma_insert(self, documents, embeddings):
        """Add ``documents`` with their ``embeddings`` to the Chroma collection.

        Ids are ``doc_0``, ``doc_1``, ... in input order.
        """
        self.chroma_collection.add(
            documents=documents,
            embeddings=embeddings,
            ids=[f"doc_{i}" for i in range(len(documents))]
        )

    def qdrant_insert(self, documents, embeddings):
        """Upsert one point per document into the Qdrant collection ``rag_test``.

        Each point uses the input position as id and stores the document under
        the ``text`` payload key.
        """
        # NOTE(review): this class never creates the Qdrant collection;
        # presumably it is created elsewhere -- confirm.
        self.qdrant_client.upsert(
            collection_name="rag_test",
            points=[
                {
                    "id": i,
                    # np.asarray lets plain lists work as well as NumPy rows.
                    "vector": np.asarray(emb).tolist(),
                    "payload": {"text": doc}
                }
                for i, (doc, emb) in enumerate(zip(documents, embeddings))
            ]
        )

    def pgvector_insert(self, documents, embeddings):
        """Insert documents and embeddings into the ``documents`` table.

        Rows are written in one transaction: it is committed when every insert
        succeeds and rolled back (and the error re-raised) otherwise.
        """
        # Assumes documents.embedding is a pgvector `vector` column (schema is
        # not shown here -- confirm). pgvector takes the text form
        # '[x1,x2,...]'; raw ndarray bytes would be sent as bytea instead.
        try:
            with self.pg_conn.cursor() as cursor:
                for i, (doc, emb) in enumerate(zip(documents, embeddings)):
                    cursor.execute(
                        "INSERT INTO documents (id, content, embedding) VALUES (%s, %s, %s::vector)",
                        (i, doc, self._to_vector_literal(emb))
                    )
            self.pg_conn.commit()
        except Exception:
            # Leave the connection usable for the caller after a failure.
            self.pg_conn.rollback()
            raise
5. 전체 RAG 파이프라인 코드
import chromadb
from sentence_transformers import SentenceTransformer
from typing import List, Dict
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
class SimpleRAGPipeline:
    """Minimal RAG pipeline: chunk, embed, store in Chroma, retrieve, answer."""

    def __init__(self, chunk_size: int = 512):
        """Create the chunker, the embedding model and the Chroma collection.

        Args:
            chunk_size: Maximum chunk length passed to ``RecursiveChunker``.
        """
        self.chunker = RecursiveChunker(chunk_size=chunk_size)
        self.embedder = SentenceTransformer('all-MiniLM-L6-v2')
        self.chroma_client = chromadb.Client()
        self.collection = self.chroma_client.get_or_create_collection(
            name="documents"
        )

    def add_documents(self, documents: List[str]) -> None:
        """Chunk and embed ``documents``, then store the chunks in Chroma.

        Args:
            documents: Raw document strings. Nothing is stored when this
                yields no chunks.
        """
        # Chunk every document.
        all_chunks = []
        for doc in documents:
            all_chunks.extend(self.chunker.chunk_recursive(doc))

        if not all_chunks:
            # Avoid sending an empty batch to the collection.
            return

        # Create embeddings.
        embeddings = self.embedder.encode(all_chunks)

        # Continue numbering after what is already stored so that a second
        # call does not reuse ids from an earlier batch.
        first_id = self.collection.count()

        # Store in Chroma.
        self.collection.add(
            documents=all_chunks,
            embeddings=embeddings.tolist(),
            ids=[f"chunk_{first_id + i}" for i in range(len(all_chunks))]
        )

    def retrieve(self, query: str, top_k: int = 3) -> List[Dict]:
        """Return the stored chunks closest to ``query``.

        Args:
            query: Search text.
            top_k: Number of results requested from the collection.

        Returns:
            A list of dicts with ``'text'`` (the chunk) and ``'score'`` (the
            value Chroma reports under ``'distances'`` for that chunk).
        """
        query_embedding = self.embedder.encode([query])
        results = self.collection.query(
            query_embeddings=query_embedding.tolist(),
            n_results=top_k
        )
        return [
            {
                'text': text,
                'score': score
            }
            for text, score in zip(results['documents'][0], results['distances'][0])
        ]

    def generate_response(self, query: str, retrieved_docs: List[Dict]) -> str:
        """Build the augmented prompt and return a placeholder answer.

        Args:
            query: The user question.
            retrieved_docs: Dicts with a ``'text'`` key, as returned by
                ``retrieve``.

        Returns:
            A fixed-format Korean sentence naming the query and the number of
            retrieved documents. No LLM is called.
        """
        context = "\n".join([doc['text'] for doc in retrieved_docs])
        # Mis-encoded Korean restored. The prompt is built to show the
        # augmented-prompt layout, but this stub does not send it anywhere.
        prompt = (
            "\n"
            "주어진 컨텍스트를 기반으로 질문에 답변해주세요.\n"
            "컨텍스트:\n"
            f"{context}\n"
            f"질문: {query}\n"
            "답변:\n"
        )
        # Simple stand-in for an LLM call (a real implementation would pass
        # `prompt` to an actual LLM).
        return f"검색된 컨텍스트 기반 답변: {query}에 대한 정보는 {len(retrieved_docs)}개 문서에 포함되어 있습니다."
# ์ฌ์ฉ ์์
rag = SimpleRAGPipeline()
# ๋ฌธ์ ์ถ๊ฐ
documents = [
"Python์ ๊ณ ๊ธ ํ๋ก๊ทธ๋๋ฐ ์ธ์ด๋ก, ๊ฐ๊ฒฐํ๊ณ ์ฝ๊ธฐ ์ฌ์ด ๋ฌธ๋ฒ์ ๊ฐ์ถ๊ณ ์์ต๋๋ค.",
"Django๋ ํ์ด์ฌ ๊ธฐ๋ฐ ์น ํ๋ ์์ํฌ๋ก, ๋น ๋ฅด๊ณ ์์ ํ ์น ๊ฐ๋ฐ์ ์ง์ํฉ๋๋ค.",
"React๋ JavaScript ๋ผ์ด๋ธ๋ฌ๋ฆฌ๋ก, ์ฌ์ฉ์ ์ธํฐํ์ด์ค ๊ฐ๋ฐ์ ์ต์ ํ๋์ด ์์ต๋๋ค."
]
rag.add_documents(documents)
# ๊ฒ์ ๋ฐ ์๋ต ์์ฑ
query = "Python ํ๋ก๊ทธ๋๋ฐ ์ธ์ด์ ํน์ง์?"
retrieved = rag.retrieve(query)
response = rag.generate_response(query, retrieved)
print(f"์ง์: {query}")
print(f"์๋ต: {response}")
6. 고급 기능
6.1 질의 변환
python
class QueryTransformer:
def __init__(self):
self.embedder = SentenceTransformer('all-MiniLM-L6-v2')
def transform_query(self, original_query: str) -> List[str]:
"""์ง์ ๋ณํ
---
📥 **Get the full guide on Gumroad**: https://gumroad.com/l/auto ($7)













