File size: 1,030 Bytes
274be20
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
"""Script to ingest data to a ChromaDB vector store, and persist it to disk"""

import os
from dotenv import load_dotenv

from langchain.text_splitter import MarkdownHeaderTextSplitter
from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings

# Load environment variables via python-dotenv before anything reads them
# (presumably the OpenAI credentials used by the embeddings model below;
# confirm against the project's .env).
load_dotenv()

# Path of the markdown document to ingest, relative to the working directory.
markdown_path = "data/source.md"
# Read the whole file into a single string. The encoding is pinned to UTF-8 so
# the text decodes identically on every platform instead of depending on the
# locale's default encoding (e.g. cp1252 on Windows), which can raise
# UnicodeDecodeError or silently garble non-ASCII characters.
with open(markdown_path, "r", encoding="utf-8") as file:
    full_markdown_document = file.read()

# Chunk the document at markdown headings of levels 1 through 3. Each entry
# pairs a heading marker ("#", "##", "###") with the label handed to the
# splitter ("Header 1", "Header 2", "Header 3").
headers_to_split_on = [("#" * level, f"Header {level}") for level in range(1, 4)]
markdown_splitter = MarkdownHeaderTextSplitter(
    headers_to_split_on=headers_to_split_on,
    strip_headers=False,  # do not strip the heading lines from the chunk text
)
chunked_documents = markdown_splitter.split_text(full_markdown_document)

# Build the vector store: embed the chunks with the OpenAI embeddings model
# and hand them to Chroma, using data/chroma_db as its persist directory.
embeddings_model = OpenAIEmbeddings()
db = Chroma.from_documents(
    documents=chunked_documents,
    embedding=embeddings_model,
    persist_directory="data/chroma_db",
)