Spaces:

sajal2692
/

hello-sajal

Sleeping

hello-sajal / src /ingest_data.py

add hfspace files

274be20 12 months ago

1.03 kB

	"""Script to ingest data to a ChromaDB vector store, and persist it to disk"""

	import os
	from dotenv import load_dotenv

	from langchain.text_splitter import MarkdownHeaderTextSplitter
	from langchain_community.vectorstores import Chroma
	from langchain_openai import OpenAIEmbeddings

	# load the environment variables
	load_dotenv()

	# load the data
	markdown_path = "data/source.md"
	# read the markdown file and return the full document as a string
	with open(markdown_path, "r") as file:
	full_markdown_document = file.read()

	# split the data into chunks based on the markdown heading
	headers_to_split_on = [
	("#", "Header 1"),
	("##", "Header 2"),
	("###", "Header 3"),
	]
	markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on, strip_headers=False)
	chunked_documents = markdown_splitter.split_text(full_markdown_document)

	# create a vector store
	embeddings_model = OpenAIEmbeddings()
	db = Chroma.from_documents(chunked_documents, embeddings_model, persist_directory="data/chroma_db")