# RAG优化技巧

基本的RAG可能效果不理想,本节介绍多种优化技巧,提升检索准确性和生成质量。

常见问题

问题1:检索不到相关内容
- 用户问法和文档表述不一致
- 切分太细,丢失上下文

问题2:检索到无关内容
- 向量相似但语义不相关
- 检索结果太多噪声

问题3:答案不准确
- 上下文太长,模型理解困难
- 多个文档内容冲突

优化策略一:查询重写

把用户问题改写成更适合检索的形式:

python
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate

def rewrite_query(original_query: str) -> str:
    """Rewrite a user question into a retrieval-friendly form.

    The system prompt instructs the model to expand abbreviations and
    pronouns and add related keywords while keeping the original meaning.

    Args:
        original_query: The user's raw question.

    Returns:
        The rewritten question text.
    """
    llm = ChatOpenAI(model="gpt-4o")

    rewrite_prompt = ChatPromptTemplate.from_messages([
        ("system", """你是一个查询优化助手。把用户的问题改写成更适合检索的形式。

规则:
1. 展开缩写和代词
2. 添加相关关键词
3. 保持原意不变"""),
        ("user", "原问题:{query}\n\n改写后的问题:"),
    ])

    response = (rewrite_prompt | llm).invoke({"query": original_query})
    return response.content

# Example: a pronoun-only question gets expanded into a self-contained one
original = "它是什么时候发布的?"
rewritten = rewrite_query(original)
# Example output: "LangChain框架是什么时候发布的?发布日期是?"

优化策略二:多查询检索

从不同角度检索,提高召回率:

python
from langchain.retrievers.multi_query import MultiQueryRetriever

# Automatically generates multiple query variants from the original question
retriever = MultiQueryRetriever.from_llm(
    retriever=vectorstore.as_retriever(),
    llm=model
)

# Retrieval fans out over the generated variants and merges the results
docs = retriever.invoke("什么是RAG?")

手动实现多查询

python
def multi_query_search(query: str, vectorstore, model, n_queries: int = 3):
    """Retrieve documents using several phrasings of the same question.

    Generates ``n_queries`` semantically-equivalent rewrites of ``query``
    with the LLM, searches the vector store with the original query plus
    every rewrite, and returns the de-duplicated union.

    Args:
        query: The user's original question.
        vectorstore: A vector store exposing ``similarity_search``.
        model: A chat model usable in an LCEL chain.
        n_queries: Number of query variants to generate.

    Returns:
        A list of at most 10 unique documents.
    """
    # Generate multiple query variants.
    # NOTE: the original template used "{ n}" (leading space inside the
    # braces), which does not match the "n" key passed to invoke() and
    # raises at format time — fixed to "{n}".
    prompt = ChatPromptTemplate.from_template("""
    生成{n}个与以下问题语义相同但表述不同的问题:
    
    原问题:{query}
    
    输出格式(每行一个):
    1. ...
    2. ...
    3. ...
    """)
    
    chain = prompt | model
    variations = chain.invoke({"n": n_queries, "query": query}).content
    
    # Parse the model's numbered list ("1. ...") into plain question
    # strings; previously delegated to an undefined parse_variations().
    parsed = []
    for line in variations.splitlines():
        line = line.strip()
        if not line:
            continue
        # Drop a leading "1." style marker if present.
        if line[0].isdigit():
            line = line.split(".", 1)[-1].strip()
        if line:
            parsed.append(line)
    
    # Search with the original query plus every variant, de-duplicating
    # across the merged result lists.
    all_docs = []
    seen = set()
    
    queries = [query] + parsed
    for q in queries:
        docs = vectorstore.similarity_search(q, k=3)
        for doc in docs:
            # Fall back to a content prefix when no explicit id is set.
            doc_id = doc.metadata.get("id", doc.page_content[:50])
            if doc_id not in seen:
                all_docs.append(doc)
                seen.add(doc_id)
    
    return all_docs[:10]  # cap at the first 10 unique documents

优化策略三:重排序(Reranking)

先用向量检索召回候选,再用精细模型重排:

python
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import FlashrankRerank

# Rerank with Flashrank: vector search recalls 10 candidates first,
# then the reranker reorders/filters them
compressor = FlashrankRerank()
compression_retriever = ContextualCompressionRetriever(
    base_compressor=compressor,
    base_retriever=vectorstore.as_retriever(search_kwargs={"k": 10})
)

# Results are automatically reranked after retrieval
docs = compression_retriever.invoke("查询")

使用Cohere Rerank

python
from langchain.retrievers.document_compressors import CohereRerank

# Cohere's hosted reranker. The model id must carry a version suffix —
# bare "rerank-multilingual" is not a valid Cohere model name.
compressor = CohereRerank(
    model="rerank-multilingual-v3.0",
    top_n=5
)

# Recall 20 candidates by vector similarity, keep the top 5 after rerank
compression_retriever = ContextualCompressionRetriever(
    base_compressor=compressor,
    base_retriever=vectorstore.as_retriever(search_kwargs={"k": 20})
)

优化策略四:混合检索

结合关键词检索和向量检索:

python
from langchain.retrievers import EnsembleRetriever
from langchain_community.retrievers import BM25Retriever

# Dense (vector) retriever
vector_retriever = vectorstore.as_retriever(search_kwargs={"k": 5})

# Sparse (keyword, BM25) retriever
bm25_retriever = BM25Retriever.from_documents(documents)
bm25_retriever.k = 5

# Hybrid retrieval: merge both result lists with weighted scores
ensemble_retriever = EnsembleRetriever(
    retrievers=[bm25_retriever, vector_retriever],
    weights=[0.4, 0.6]  # 40% BM25, 60% vector
)

docs = ensemble_retriever.invoke("查询")

优化策略五:文档切分优化

父文档检索

检索小块,返回大块上下文:

python
from langchain.retrievers import ParentDocumentRetriever
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.storage import InMemoryStore

# Child splitter: small chunks that get embedded and searched
child_splitter = RecursiveCharacterTextSplitter(chunk_size=400)

# Parent splitter: larger chunks that are actually returned
parent_splitter = RecursiveCharacterTextSplitter(chunk_size=2000)

# In-memory mapping from child chunks back to their parent documents
store = InMemoryStore()

retriever = ParentDocumentRetriever(
    vectorstore=vectorstore,
    docstore=store,
    child_splitter=child_splitter,
    parent_splitter=parent_splitter
)

# Retrieval matches on small chunks but returns the parent documents
# (more complete context for the generator)
docs = retriever.invoke("查询")

语义切分

按语义边界切分:

python
from langchain_experimental.text_splitter import SemanticChunker
from langchain_openai import OpenAIEmbeddings

# Split on semantic boundaries: a breakpoint is inserted where the
# embedding distance between adjacent sentences exceeds a percentile
# threshold, instead of cutting at a fixed character count
splitter = SemanticChunker(
    OpenAIEmbeddings(),
    breakpoint_threshold_type="percentile"
)

chunks = splitter.split_text(long_document)

优化策略六:上下文压缩

压缩检索结果,只保留相关部分:

python
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainExtractor

# Use an LLM to extract only the query-relevant parts of each document
compressor = LLMChainExtractor.from_llm(model)

compression_retriever = ContextualCompressionRetriever(
    base_compressor=compressor,
    base_retriever=vectorstore.as_retriever()
)

docs = compression_retriever.invoke("查询")
# Each returned document keeps only the passages relevant to the query

优化策略七:回答验证

让模型自我验证答案:

python
def rag_with_verification(question: str, retriever, model):
    """RAG pipeline with a self-verification pass.

    Retrieves context, generates an answer from it, then asks the model
    to check that answer against the same context.

    Args:
        question: The user's question.
        retriever: Any retriever supporting ``invoke``.
        model: A chat model usable in an LCEL chain.

    Returns:
        Dict with the raw ``answer`` and the model's ``verification``.
    """
    # Step 1: retrieve supporting documents and join them into one context.
    retrieved = retriever.invoke(question)
    context = "\n\n".join(doc.page_content for doc in retrieved)

    # Step 2: answer the question strictly from the retrieved context.
    answer_prompt = ChatPromptTemplate.from_messages([
        ("system", "根据上下文回答问题:\n{context}"),
        ("user", "{question}"),
    ])
    answer_chain = answer_prompt | model
    answer = answer_chain.invoke(
        {"context": context, "question": question}
    ).content

    # Step 3: have the model verify its own answer against the context.
    verify_prompt = ChatPromptTemplate.from_messages([
        ("system", """验证答案是否正确。

上下文:{context}
问题:{question}
答案:{answer}

请检查:
1. 答案是否来自上下文
2. 答案是否完整
3. 是否有矛盾

如果答案有问题,请给出修正后的答案。如果答案正确,回复"答案正确"。"""),
        ("user", ""),
    ])
    verify_chain = verify_prompt | model
    verification = verify_chain.invoke(
        {"context": context, "question": question, "answer": answer}
    ).content

    return {"answer": answer, "verification": verification}

性能评估

使用Ragas评估

python
from ragas import evaluate
from ragas.metrics import faithfulness, answer_relevancy
from datasets import Dataset

# Prepare evaluation data (one row per question)
eval_data = {
    "question": ["问题1", "问题2"],
    "answer": ["答案1", "答案2"],
    "contexts": [["文档1"], ["文档2"]],
    "ground_truth": ["标准答案1", "标准答案2"]
}

# Evaluate. Note two fixes versus the common mistake:
# - ragas expects a HuggingFace Dataset, not a plain dict
# - the metric is named answer_relevancy (not answer_relevance)
result = evaluate(Dataset.from_dict(eval_data), metrics=[faithfulness, answer_relevancy])
print(result)

综合优化示例

python
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_chroma import Chroma
from langchain.retrievers import EnsembleRetriever
from langchain_community.retrievers import BM25Retriever
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import FlashrankRerank

class OptimizedRAG:
    """RAG pipeline combining hybrid retrieval, reranking, and query rewriting."""

    def __init__(self, documents, persist_dir="./optimized_rag"):
        """Build the full retrieval stack over the given documents.

        Args:
            documents: Documents to index.
            persist_dir: Directory for the persistent Chroma store.
        """
        self.model = ChatOpenAI(model="gpt-4o")
        self.embeddings = OpenAIEmbeddings()

        # Persistent vector store over the input documents.
        self.vectorstore = Chroma.from_documents(
            documents,
            self.embeddings,
            persist_directory=persist_dir,
        )

        # Dense retriever: top 10 by embedding similarity.
        dense = self.vectorstore.as_retriever(search_kwargs={"k": 10})

        # Sparse retriever: top 10 by BM25 keyword match.
        sparse = BM25Retriever.from_documents(documents)
        sparse.k = 10

        # Hybrid retrieval: 30% BM25, 70% vector.
        hybrid = EnsembleRetriever(
            retrievers=[sparse, dense],
            weights=[0.3, 0.7],
        )

        # Final retriever: rerank the hybrid candidates down to the best 5.
        self.retriever = ContextualCompressionRetriever(
            base_compressor=FlashrankRerank(top_n=5),
            base_retriever=hybrid,
        )

    def query(self, question: str) -> str:
        """Answer ``question`` via rewrite -> retrieve -> generate."""
        # Rewrite the question into a retrieval-friendly form.
        rewritten = self._rewrite_query(question)

        # Retrieve (hybrid + rerank) using the rewritten query.
        context = "\n\n".join(
            d.page_content for d in self.retriever.invoke(rewritten)
        )

        # Generate from the retrieved context. Note the ORIGINAL question
        # (not the rewrite) is what the model is asked to answer.
        prompt = f"""根据上下文回答问题:

上下文:
{context}

问题:{question}

答案:"""

        return self.model.invoke(prompt).content

    def _rewrite_query(self, query: str) -> str:
        """Lightweight single-call query rewrite."""
        prompt = f"把以下问题改写成更适合检索的形式,只输出改写后的问题:\n{query}"
        return self.model.invoke(prompt).content

小结

| 优化策略 | 效果 | 适用场景 |
| --- | --- | --- |
| 查询重写 | 提高召回率 | 用户问题模糊 |
| 多查询 | 提高召回率 | 需要全面信息 |
| 重排序 | 提高精准度 | 检索结果噪声多 |
| 混合检索 | 平衡召回和精准 | 通用场景 |
| 父文档检索 | 保留上下文 | 需要完整信息 |

下一步

继续学习 Agent开发,让AI能够自主调用工具完成任务。