"""
This file es responsible for the ingestion of the data (langchain documentation).
It embedds the data into vectors, and stores it in the pinecone vectorstore.
"""
import os

import pinecone
from langchain.document_loaders import ReadTheDocsLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Pinecone
from consts import INDEX_NAME

# Initialize the Pinecone client from environment variables.
pinecone.init(
    api_key=os.environ["PINECONE_API_KEY"],
    environment=os.environ["PINECONE_ENVIRONMENT"],
)
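
# Optional sanity check (a sketch, not part of the original script): make sure the target
# index exists before upserting. The dimension of 1536 assumes OpenAI's default
# text-embedding-ada-002 embedding model.
if INDEX_NAME not in pinecone.list_indexes():
    pinecone.create_index(name=INDEX_NAME, dimension=1536, metric="cosine")
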
# The ingestion process is divided into 3 steps:
# 1. Load the documents from the source (ReadTheDocsLoader)
# 2. Split the documents into chunks (RecursiveCharacterTextSplitter)
# 3. Embed the chunks into vectors and store them in the vectorstore (Pinecone.from_documents)
def ingest_docs() -> None:
    # ReadTheDocsLoader takes the dump produced by scraping the ReadTheDocs site
    # and turns every HTML page into a LangChain Document.
    loader = ReadTheDocsLoader(
        "langchain-docs/langchain.readthedocs.io/en/latest/"
    )
    # loader.load() -> [Document]; each Document carries page_content and metadata
    # (including the "source" path of the file it came from).
    raw_documents = loader.load()
    print(f"Loaded {len(raw_documents)} documents")
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000, chunk_overlap=100, separators=["\n\n", "\n", " ", ""]
    )
    # Split the documents into overlapping ~1000-character chunks, so each piece is small
    # enough to embed and focused enough to be useful as retrieved context later.
    documents = text_splitter.split_documents(documents=raw_documents)
    print(f"Split {len(documents)} documents into chunks")
    # Rewrite each chunk's "source" metadata from the local dump path to the public docs URL,
    # so the chatbot can later cite the relevant page with a working link.
    for doc in documents:
        old_path = doc.metadata["source"]
        new_url = old_path.replace("langchain-docs/", "https://")
        doc.metadata.update({"source": new_url})
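    # Quick sanity check (illustrative addition, not in the original script): print one
    # rewritten source so the path-to-URL mapping can be eyeballed.
    if documents:
        print(f"Example rewritten source: {documents[0].metadata['source']}")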
print(f"Uploading {len(documents)} documents to vectorstore (pinecone)")
# The embeddings object is in charge of embedding the documents into vectors.
embeddings = OpenAIEmbeddings()
# Take the chunks, imbed them into vectors and store them in the Pinecone vector database.
Pinecone.from_documents(documents,
embeddings, index_name=INDEX_NAME)
print("*********Added documents to Pinecone*********")


if __name__ == '__main__':
    ingest_docs()