langchain-docs-chatbot / ingestion.py
Petermoyano's picture
Remove links and add debug file
43fee5b
"""
This file es responsible for the ingestion of the data (langchain documentation).
It embedds the data into vectors, and stores it in the pinecone vectorstore.
"""
from langchain.document_loaders import ReadTheDocsLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Pinecone
import os
import pinecone
from consts import INDEX_NAME
# initialize pinecone client
pinecone.init(api_key=os.environ["PINECONE_API_KEY"],
environment=os.environ["PINECONE_ENVIRONMENT"])
# The ingestion process is divided into 3 steps:
# 1. Load the documents from the source (ReadTheDocsLoader)
# 2. Split the documents into chunks (RecursiveCharacterTextSplitter)
# 3. Embed the chunks into vectors and store them in the vectorstore (Pinecone.from_documents)
def ingest_docs() -> None:
# The ReadTheDocsLoader is a class that is in charge of taking the dump of some scrapped data-fetching
# process and loading it into the vectorstore.
loader = ReadTheDocsLoader(
"langchain-docs/langchain.readthedocs.io/en/latest/"
)
# loader.load() -> [documents] (documents are just dictionaries)
raw_documents = loader.load()
print(f"Loaded {len(raw_documents)} documents")
text_splitter = RecursiveCharacterTextSplitter(
chunk_size=1000, chunk_overlap=100, separators=["\n\n", "\n", " ", ""])
# Execute splitter, to allow parallelization of the embedding process.
documents = text_splitter.split_documents(documents=raw_documents)
print(f"Split {len(documents)} documents into chunks")
# Simple dictionary manipulation to change the source path of the documents, to a valid langchain docs page.
# This will enable us later to have easy access to the "relevant" context. (proximity search)
for doc in documents:
old_path = doc.metadata["source"]
new_url = old_path.replace(
"langchain-docs/", "https:/")
doc.metadata.update({"source": new_url})
print(f"Uploading {len(documents)} documents to vectorstore (pinecone)")
# The embeddings object is in charge of embedding the documents into vectors.
embeddings = OpenAIEmbeddings()
# Take the chunks, imbed them into vectors and store them in the Pinecone vector database.
Pinecone.from_documents(documents,
embeddings, index_name=INDEX_NAME)
print("*********Added documents to Pinecone*********")
if __name__ == '__main__':
ingest_docs()