试玩RAG
主要使用的库及版本
langchain 0.1.20
langchain-chroma 0.1.1
langchain-community 0.0.38
langchain-openai 0.1.7
chat
import os

from langchain_openai import ChatOpenAI
from langchain_core.messages import (
    SystemMessage,
    HumanMessage,
    AIMessage,
)

# NOTE(review): never hard-code a real API key in source; prefer reading it
# from the environment. setdefault keeps an externally supplied key intact.
os.environ.setdefault("OPENAI_API_KEY", "sk-xxx")

# ChatOpenAI reads OPENAI_API_KEY from the environment automatically,
# so passing it explicitly is redundant.
chat = ChatOpenAI(model='gpt-3.5-turbo')

# Seed the conversation: a system prompt plus an in-progress joke exchange.
messages = [
    SystemMessage(content="You are a helpful assistant."),
    HumanMessage(content="Knock knock."),
    AIMessage(content="Who's there?"),
    HumanMessage(content="Orange"),
]

# FIX: calling the model as `chat(messages)` (__call__) is deprecated in
# langchain 0.1.x — the Runnable interface uses .invoke() instead.
res = chat.invoke(messages)
print(res)

# Append the model's reply and continue the dialogue with the full history.
messages.append(res)
res = chat.invoke(messages)
print(res.content)
1.创建一个RAG对话模型
from langchain_community.document_loaders import PyPDFLoader

# Load the PDF and split it into one Document per page.
pdf_loader = PyPDFLoader("C:\\Users\\abc\\Desktop\\xxx.pdf")
pages = pdf_loader.load_and_split()

# Quick sanity check: page count and a peek at the first page.
print(len(pages))
print(pages[0])
2.知识切片,将文档分割成均匀的块
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Chop the page documents into ~500-character chunks with a 50-character
# overlap so context is not lost at chunk boundaries.
splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
docs = splitter.split_documents(pages)
print(len(docs))
3.利用embedding模型对每个文本片段进行向量化,并储存到向量数据库中
使用 OpenAI 的在线 embedding 模型,通过 API 调用
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma

# Embed every chunk with OpenAI's hosted embedding model and index the
# resulting vectors in a Chroma collection.
embed_model = OpenAIEmbeddings(model='text-embedding-3-small')
vectorstore = Chroma.from_documents(
    documents=docs,
    embedding=embed_model,
    collection_name="openai_embed",
)

# Retrieve the three chunks most similar to the question.
query = "How many dimensions is each grid feature represented as a vector in Implementation Details?"
result = vectorstore.similarity_search(query, k=3)
print(result)
使用离线 embedding 模型,本地调用(模型权重从 Hugging Face 下载)
# Some gated models require an authenticated huggingface_hub session to download.
from huggingface_hub import login
login(token='hf_xxx')

from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma

# Local (offline) embedding model; runs on GPU with normalized outputs.
model_name = "maidalun1020/bce-embedding-base_v1"
model_kwargs = {'device': 'cuda'}
encode_kwargs = {'batch_size': 64, 'normalize_embeddings': True}

# FIX: the embedding model was constructed twice (duplicated line).
# Build it exactly once — loading the model weights is expensive.
embedding = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs,
)

# Index the same chunks in a separate Chroma collection and run the same query.
vectorstore_hf = Chroma.from_documents(documents=docs, embedding=embedding, collection_name="huggingface_embed")
result = vectorstore_hf.similarity_search(query, k=3)
print(result)
最后编辑于:2024 年 07 月 19 日 23:33