
Building a Knowledge Base

With a vector database in place, the next step is to turn raw documents into a searchable knowledge base. This section walks through the full pipeline: loading, splitting, and indexing documents.

The construction pipeline

Raw documents (PDF, Word, web pages, databases, ...) → Load → Split → Embed → Store in the vector database

Loading Documents

Loading text files

```python
from langchain_community.document_loaders import TextLoader

loader = TextLoader("document.txt", encoding="utf-8")
documents = loader.load()
```

Loading PDFs

```python
from langchain_community.document_loaders import PyPDFLoader

loader = PyPDFLoader("document.pdf")
pages = loader.load()  # one Document object per page

print(f"Loaded {len(pages)} pages")
print(pages[0].page_content[:200])
```
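Each page Document also carries metadata; with PyPDFLoader this typically includes the source file path and the page number, which is handy later for citing where an answer came from (the exact keys vary by version):

```python
# Inspect the metadata attached by the loader
print(pages[0].metadata)
# e.g. {'source': 'document.pdf', 'page': 0}
```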

Loading Word documents

```python
from langchain_community.document_loaders import Docx2txtLoader

loader = Docx2txtLoader("document.docx")
documents = loader.load()
```

Loading web pages

```python
from langchain_community.document_loaders import WebBaseLoader

loader = WebBaseLoader("https://example.com/article")
documents = loader.load()
```

Loading Markdown

```python
from langchain_community.document_loaders import UnstructuredMarkdownLoader

loader = UnstructuredMarkdownLoader("document.md")
documents = loader.load()
```

Loading a whole directory

```python
from langchain_community.document_loaders import DirectoryLoader

loader = DirectoryLoader(
    "./docs",
    glob="**/*.md",  # match all Markdown files
    show_progress=True
)
documents = loader.load()
```
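By default DirectoryLoader parses every matched file with the Unstructured loader. You can pin a specific loader with loader_cls and pass its arguments via loader_kwargs; a sketch, assuming UTF-8 text files:

```python
from langchain_community.document_loaders import DirectoryLoader, TextLoader

# Use TextLoader for every matched file instead of the default
loader = DirectoryLoader(
    "./docs",
    glob="**/*.txt",
    loader_cls=TextLoader,
    loader_kwargs={"encoding": "utf-8"},
)
documents = loader.load()
```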

Loading code files

```python
from langchain_community.document_loaders import PythonLoader

loader = PythonLoader("script.py")
documents = loader.load()
```

Splitting Documents

Why split?

The problems:
- Long documents exceed the model's context window
- Retrieval needs to pinpoint the relevant passage, not a whole file
- Smaller units make retrieval faster and more precise

The solution:
- Split long documents into small chunks
- Index each chunk independently
- Return only the relevant chunks at retrieval time
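Before looking at the individual splitters, here is a minimal sketch of what chunking and overlap actually do, using split_text on an in-memory string (the sample text and sizes are made up for illustration):

```python
from langchain_text_splitters import RecursiveCharacterTextSplitter

text = "LangChain is a framework for building LLM applications. " * 20

splitter = RecursiveCharacterTextSplitter(chunk_size=200, chunk_overlap=40)
chunks = splitter.split_text(text)  # returns a list of strings

print(f"{len(chunks)} chunks")
# Adjacent chunks share up to 40 characters, so a sentence cut at a
# chunk boundary still appears intact in at least one chunk
print(chunks[0][-40:])
print(chunks[1][:40])
```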

Splitting by character

```python
from langchain_text_splitters import CharacterTextSplitter

splitter = CharacterTextSplitter(
    chunk_size=500,    # max 500 characters per chunk
    chunk_overlap=50,  # 50-character overlap between chunks
    separator="\n\n"   # split on paragraph breaks
)

chunks = splitter.split_documents(documents)
```

Recursive character splitting (recommended)

```python
from langchain_text_splitters import RecursiveCharacterTextSplitter

splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    # Separators are tried in order: paragraphs first, then lines, then
    # Chinese sentence-ending punctuation, then spaces, then any character
    separators=["\n\n", "\n", "。", "!", "?", " ", ""]
)

chunks = splitter.split_documents(documents)
```

Splitting by token

```python
from langchain_text_splitters import TokenTextSplitter

splitter = TokenTextSplitter(
    chunk_size=500,     # 500 tokens per chunk
    chunk_overlap=50
)

chunks = splitter.split_documents(documents)
```
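TokenTextSplitter measures chunk size in tokens via tiktoken (install it with pip install tiktoken; it defaults to the gpt2 tokenizer unless you pass encoding_name). To sanity-check how many tokens a chunk actually uses, a quick sketch assuming the cl100k_base encoding used by many OpenAI models:

```python
import tiktoken

enc = tiktoken.get_encoding("cl100k_base")
n_tokens = len(enc.encode(chunks[0].page_content))
print(f"First chunk: {n_tokens} tokens")
```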

Splitting code

```python
from langchain_text_splitters import PythonCodeTextSplitter

splitter = PythonCodeTextSplitter(chunk_size=500)
chunks = splitter.split_documents(documents)
```
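PythonCodeTextSplitter is a shortcut for the more general RecursiveCharacterTextSplitter.from_language, which supports many languages and tends to break chunks at function and class boundaries:

```python
from langchain_text_splitters import Language, RecursiveCharacterTextSplitter

# Language-aware splitting: separators are derived from Python syntax
splitter = RecursiveCharacterTextSplitter.from_language(
    language=Language.PYTHON,
    chunk_size=500,
    chunk_overlap=0,
)
chunks = splitter.split_documents(documents)
```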

Splitting Markdown by header

```python
from langchain_text_splitters import MarkdownHeaderTextSplitter

markdown_document = """
# Heading 1

Content 1

## Subheading 1

Content 2
"""

headers_to_split_on = [
    ("#", "header1"),
    ("##", "header2"),
]

splitter = MarkdownHeaderTextSplitter(headers_to_split_on)
chunks = splitter.split_text(markdown_document)
```
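Each resulting chunk carries its headers in metadata (here under the keys header1 and header2), and that metadata survives into the vector store, where it can be used for filtering. Since a single header section can still be long, a common follow-up is a second, size-based pass; a sketch:

```python
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Second pass: enforce a size limit while keeping the header metadata
char_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
final_chunks = char_splitter.split_documents(chunks)
```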

Building the Knowledge Base

A complete example

```python
from langchain_community.document_loaders import PyPDFLoader, DirectoryLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_chroma import Chroma

# 1. Load documents (PyPDFLoader handles the PDFs matched by the glob)
loader = DirectoryLoader("./knowledge", glob="**/*.pdf", loader_cls=PyPDFLoader)
documents = loader.load()
print(f"Loaded {len(documents)} documents")

# 2. Split documents
splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200
)
chunks = splitter.split_documents(documents)
print(f"Split into {len(chunks)} chunks")

# 3. Create the vector store
embeddings = OpenAIEmbeddings()
vectorstore = Chroma.from_documents(
    documents=chunks,
    embedding=embeddings,
    persist_directory="./knowledge_db"
)
print("Knowledge base built!")

# 4. Test retrieval
results = vectorstore.similarity_search("What is machine learning?", k=3)
for i, doc in enumerate(results):
    print(f"\nResult {i+1}:")
    print(doc.page_content[:200])
```
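To see how close each hit actually is, Chroma also exposes similarity_search_with_score, which returns (Document, score) pairs; with Chroma's default distance metric the score is a distance, so lower means more similar:

```python
# Retrieve with scores: lower distance = more similar
results = vectorstore.similarity_search_with_score("What is machine learning?", k=3)
for doc, score in results:
    print(f"{score:.4f}  {doc.page_content[:80]}")
```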

Adding metadata

```python
from langchain_core.documents import Document

# Create documents with metadata attached
docs = [
    Document(
        page_content="Content...",
        metadata={
            "source": "manual.pdf",
            "page": 1,
            "category": "tech-docs"
        }
    )
]

vectorstore.add_documents(docs)

# Filter search results by metadata
results = vectorstore.similarity_search(
    "query",
    filter={"category": "tech-docs"}
)
```
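The filter dict is handed through to Chroma's where clause, so Chroma's query operators should also work for compound conditions. A sketch, with the operator syntax ($and, $eq, $gte) taken from Chroma and therefore version-dependent:

```python
# Combine conditions using Chroma's where-clause operators
results = vectorstore.similarity_search(
    "query",
    filter={"$and": [
        {"category": {"$eq": "tech-docs"}},
        {"page": {"$gte": 1}},
    ]},
)
```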

Incremental updates

```python
from langchain_chroma import Chroma
from langchain_openai import OpenAIEmbeddings

# Load the existing vector store
vectorstore = Chroma(
    persist_directory="./knowledge_db",
    embedding_function=OpenAIEmbeddings()
)

# Add new documents
new_docs = [...]  # list of new Document objects
vectorstore.add_documents(new_docs)

# Delete old documents by ID
vectorstore.delete(["doc_id_1", "doc_id_2"])
```
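add_documents also accepts an ids argument. Supplying stable, content-derived IDs makes re-ingestion idempotent and gives you known handles for later deletion. A sketch, where the hashing scheme is an assumption, not a library convention:

```python
import hashlib

def chunk_id(doc):
    # Hypothetical helper: stable ID derived from source + content
    key = f"{doc.metadata.get('source', '')}:{doc.page_content}"
    return hashlib.sha256(key.encode("utf-8")).hexdigest()

ids = [chunk_id(d) for d in new_docs]
# With recent langchain_chroma versions, re-adding an existing ID
# replaces that entry instead of creating a duplicate
vectorstore.add_documents(new_docs, ids=ids)
```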

A knowledge base management class

```python
from langchain_community.document_loaders import PyPDFLoader, TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_chroma import Chroma

class KnowledgeBase:
    def __init__(self, persist_directory: str):
        self.embeddings = OpenAIEmbeddings()
        self.vectorstore = Chroma(
            persist_directory=persist_directory,
            embedding_function=self.embeddings
        )
        self.splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=200
        )

    def add_pdf(self, file_path: str):
        """Add a PDF document."""
        loader = PyPDFLoader(file_path)
        documents = loader.load()
        chunks = self.splitter.split_documents(documents)
        self.vectorstore.add_documents(chunks)
        return len(chunks)

    def add_text(self, file_path: str):
        """Add a plain-text document."""
        loader = TextLoader(file_path)
        documents = loader.load()
        chunks = self.splitter.split_documents(documents)
        self.vectorstore.add_documents(chunks)
        return len(chunks)

    def search(self, query: str, k: int = 5):
        """Search for relevant content."""
        return self.vectorstore.similarity_search(query, k=k)

    def get_retriever(self):
        """Return a retriever over the store."""
        return self.vectorstore.as_retriever()

# Usage
kb = KnowledgeBase("./knowledge_db")
kb.add_pdf("document.pdf")
results = kb.search("your query")
```
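get_retriever returns the store's default retriever (top 4 results). as_retriever accepts search parameters when you need different behavior; a sketch of two common configurations:

```python
# Return the top 3 results instead of the default 4
retriever = kb.vectorstore.as_retriever(search_kwargs={"k": 3})

# Only return results above a similarity threshold
retriever = kb.vectorstore.as_retriever(
    search_type="similarity_score_threshold",
    search_kwargs={"score_threshold": 0.5, "k": 5},
)
docs = retriever.invoke("your query")
```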

Summary

| Step | Description |
|------|-------------|
| Load | Supports PDF, Word, web pages, and more |
| Split | By character, by token, or by semantics |
| Embed | Uses an embedding model |
| Store | Written to a vector database |
| Update | Incremental adds and deletes |

Next Steps

Continue with Retrieval-Augmented Generation (RAG), which combines the knowledge base with a large language model.