""" | |
This file es responsible for the ingestion of the data (langchain documentation). | |
It embedds the data into vectors, and stores it in the pinecone vectorstore. | |
""" | |
import os

import pinecone
from langchain.document_loaders import ReadTheDocsLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Pinecone

from consts import INDEX_NAME
# Initialize the Pinecone client from environment variables.
pinecone.init(
    api_key=os.environ["PINECONE_API_KEY"],
    environment=os.environ["PINECONE_ENVIRONMENT"],
)

# The ingestion process is divided into 3 steps:
#   1. Load the documents from the source (ReadTheDocsLoader)
#   2. Split the documents into chunks (RecursiveCharacterTextSplitter)
#   3. Embed the chunks into vectors and store them in the vectorstore (Pinecone.from_documents)
def ingest_docs() -> None:
    # The ReadTheDocsLoader takes the dump produced by scraping the ReadTheDocs
    # site and loads it into Document objects.
    loader = ReadTheDocsLoader(
        "langchain-docs/langchain.readthedocs.io/en/latest/"
    )
    # loader.load() -> [Document] (each Document carries page_content and metadata)
    raw_documents = loader.load()
    print(f"Loaded {len(raw_documents)} documents")
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000, chunk_overlap=100, separators=["\n\n", "\n", " ", ""]
    )
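    # Note: with the splitter's default length function, chunk_size and
    # chunk_overlap are measured in characters, so each chunk is roughly
    # 1000 characters with a 100-character overlap between neighbours.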
    # Split the raw documents into overlapping chunks that are small enough
    # to embed and to retrieve as focused pieces of context later.
    documents = text_splitter.split_documents(documents=raw_documents)
    print(f"Split documents into {len(documents)} chunks")
    # Simple metadata manipulation: change each document's source path to the
    # corresponding public LangChain docs URL. This lets us link back to the
    # "relevant" context later, when chunks are retrieved via similarity search.
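    # For example, given the loader path used above, a source like
    # "langchain-docs/langchain.readthedocs.io/en/latest/index.html" becomes
    # "https://langchain.readthedocs.io/en/latest/index.html".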
    for doc in documents:
        old_path = doc.metadata["source"]
        new_url = old_path.replace("langchain-docs/", "https://")
        doc.metadata.update({"source": new_url})
print(f"Uploading {len(documents)} documents to vectorstore (pinecone)") | |
    # The embeddings object is in charge of embedding the documents into vectors.
    embeddings = OpenAIEmbeddings()
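    # Note: OpenAIEmbeddings reads its credentials from the environment
    # (typically the OPENAI_API_KEY variable) unless a key is passed in explicitly.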
    # Take the chunks, embed them into vectors, and store them in the Pinecone index.
    Pinecone.from_documents(documents, embeddings, index_name=INDEX_NAME)
    print("*********Added documents to Pinecone*********")


if __name__ == '__main__':
    ingest_docs()
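
# The rewritten "source" URLs pay off at query time: a retriever built on top of
# the same index can return both the matching chunks and links back to the docs.
# A minimal sketch of that query side (hypothetical, not part of this ingestion
# script; it assumes the index created above and the same embedding model):
#
#     from langchain.embeddings import OpenAIEmbeddings
#     from langchain.vectorstores import Pinecone
#     from consts import INDEX_NAME
#
#     docsearch = Pinecone.from_existing_index(
#         index_name=INDEX_NAME, embedding=OpenAIEmbeddings()
#     )
#     for doc in docsearch.similarity_search("How do I use a text splitter?", k=4):
#         print(doc.metadata["source"])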