"""Script to ingest data to a ChromaDB vector store, and persist it to disk"""
from dotenv import load_dotenv
from langchain.text_splitter import MarkdownHeaderTextSplitter
from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings
# load the environment variables (OpenAIEmbeddings reads OPENAI_API_KEY from the environment)
load_dotenv()
# load the data
markdown_path = "data/source.md"
# read the markdown file and return the full document as a string
with open(markdown_path, "r") as file:
    full_markdown_document = file.read()
# split the data into chunks based on the markdown headings
headers_to_split_on = [
    ("#", "Header 1"),
    ("##", "Header 2"),
    ("###", "Header 3"),
]
markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on, strip_headers=False)
chunked_documents = markdown_splitter.split_text(full_markdown_document)
# create a vector store
embeddings_model = OpenAIEmbeddings()
db = Chroma.from_documents(chunked_documents, embeddings_model, persist_directory="data/chroma_db")
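
# Note: with recent chromadb releases (>= 0.4) the collection in persist_directory is
# written to disk automatically; older langchain_community/chromadb versions may
# require an explicit db.persist() call after ingestion.
print(f"Ingested {len(chunked_documents)} chunks into the Chroma store at data/chroma_db")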