Spaces:
Runtime error
Runtime error
removed commented out code, improved variable names, showed instance of using Chroma as a retriever
Browse files
innovation_pathfinder_ai/vector_store/chroma_vector_store.py
CHANGED
@@ -13,11 +13,15 @@ import chromadb.utils.embedding_functions as embedding_functions
|
|
13 |
from langchain.text_splitter import CharacterTextSplitter
|
14 |
from langchain_text_splitters import MarkdownHeaderTextSplitter
|
15 |
from langchain_text_splitters import RecursiveCharacterTextSplitter
|
|
|
|
|
16 |
from langchain.document_loaders import PyPDFLoader
|
|
|
|
|
17 |
from langchain_community.embeddings.sentence_transformer import (
|
18 |
SentenceTransformerEmbeddings,
|
19 |
)
|
20 |
-
|
21 |
import uuid
|
22 |
import dotenv
|
23 |
import os
|
@@ -159,66 +163,44 @@ def add_pdf_to_vector_store(
|
|
159 |
|
160 |
loader = PyPDFLoader(pdf_file_location)
|
161 |
|
162 |
-
# text_splitter = CharacterTextSplitter(
|
163 |
-
# chunk_size=text_chunk_size,
|
164 |
-
# chunk_overlap=text_chunk_overlap,
|
165 |
-
# )
|
166 |
-
|
167 |
-
text_splitter = CharacterTextSplitter(
|
168 |
-
separator="\n\n",
|
169 |
-
chunk_size=1000,
|
170 |
-
chunk_overlap=200,
|
171 |
-
length_function=len,
|
172 |
-
is_separator_regex=False,
|
173 |
-
)
|
174 |
-
|
175 |
documents.extend(loader.load())
|
176 |
|
177 |
split_docs:list[Document] = []
|
178 |
|
179 |
-
for
|
180 |
sub_docs = split_by_intervals(
|
181 |
-
|
182 |
text_chunk_size,
|
183 |
text_chunk_overlap
|
184 |
)
|
185 |
|
186 |
-
for
|
187 |
-
|
188 |
-
|
189 |
-
split_docs.append(fg)
|
190 |
-
|
191 |
|
192 |
|
193 |
-
# texts = text_splitter.create_documents([state_of_the_union])
|
194 |
-
|
195 |
client = chromadb.PersistentClient(
|
196 |
# path=persist_directory,
|
197 |
)
|
198 |
|
199 |
-
|
200 |
collection = client.get_or_create_collection(
|
201 |
name=collection_name,
|
202 |
)
|
203 |
|
204 |
embed_data = embedding_functions.HuggingFaceEmbeddingFunction(
|
205 |
-
api_key= os.getenv("HUGGINGFACEHUB_API_TOKEN"),
|
|
|
206 |
)
|
207 |
|
208 |
# create the open-source embedding function
|
209 |
# embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
|
210 |
|
211 |
-
|
212 |
-
|
213 |
-
chunked_documents = text_splitter.split_documents(documents)
|
214 |
|
215 |
|
216 |
-
|
217 |
-
|
218 |
-
|
219 |
-
for i in range(0, len(documents)):
|
220 |
-
data = documents[i]
|
221 |
-
print(i)
|
222 |
collection.add(
|
223 |
ids=[generate_uuid()], # give each document a uuid
|
224 |
documents=documents_page_content[i], # contents of document
|
@@ -258,6 +240,34 @@ if __name__ == "__main__":
|
|
258 |
pdf_file_location=pdf_file_location,
|
259 |
)
|
260 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
261 |
# pdf_file_location = "mydir/181000551.pdf"
|
262 |
# pdf_file_location = "/workspaces/InnovationPathfinderAI/2402.17764.pdf"
|
263 |
|
|
|
13 |
from langchain.text_splitter import CharacterTextSplitter
|
14 |
from langchain_text_splitters import MarkdownHeaderTextSplitter
|
15 |
from langchain_text_splitters import RecursiveCharacterTextSplitter
|
16 |
+
|
17 |
+
from langchain_core.documents import Document
|
18 |
from langchain.document_loaders import PyPDFLoader
|
19 |
+
from langchain_community.vectorstores import Chroma
|
20 |
+
|
21 |
from langchain_community.embeddings.sentence_transformer import (
|
22 |
SentenceTransformerEmbeddings,
|
23 |
)
|
24 |
+
|
25 |
import uuid
|
26 |
import dotenv
|
27 |
import os
|
|
|
163 |
|
164 |
loader = PyPDFLoader(pdf_file_location)
|
165 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
166 |
documents.extend(loader.load())
|
167 |
|
168 |
split_docs:list[Document] = []
|
169 |
|
170 |
+
for document in documents:
|
171 |
sub_docs = split_by_intervals(
|
172 |
+
document.page_content,
|
173 |
text_chunk_size,
|
174 |
text_chunk_overlap
|
175 |
)
|
176 |
|
177 |
+
for sub_doc in sub_docs:
|
178 |
+
loaded_doc = Document(sub_doc, metadata=document.metadata)
|
179 |
+
split_docs.append(loaded_doc)
|
|
|
|
|
180 |
|
181 |
|
|
|
|
|
182 |
client = chromadb.PersistentClient(
|
183 |
# path=persist_directory,
|
184 |
)
|
185 |
|
|
|
186 |
collection = client.get_or_create_collection(
|
187 |
name=collection_name,
|
188 |
)
|
189 |
|
190 |
embed_data = embedding_functions.HuggingFaceEmbeddingFunction(
|
191 |
+
api_key = os.getenv("HUGGINGFACEHUB_API_TOKEN"),
|
192 |
+
model_name= "sentence-transformers/all-MiniLM-L6-v2" # added model name for clarity
|
193 |
)
|
194 |
|
195 |
# create the open-source embedding function
|
196 |
# embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
|
197 |
|
198 |
+
documents_page_content:list = [i.page_content for i in split_docs]
|
|
|
|
|
199 |
|
200 |
|
201 |
+
for i in range(0, len(split_docs)):
|
202 |
+
data = split_docs[i]
|
203 |
+
|
|
|
|
|
|
|
204 |
collection.add(
|
205 |
ids=[generate_uuid()], # give each document a uuid
|
206 |
documents=documents_page_content[i], # contents of document
|
|
|
240 |
pdf_file_location=pdf_file_location,
|
241 |
)
|
242 |
|
243 |
+
# create the client using Chroma's library
|
244 |
+
client = chromadb.PersistentClient(
|
245 |
+
# path=persist_directory,
|
246 |
+
)
|
247 |
+
|
248 |
+
# This is an example collection name
|
249 |
+
collection_name="ArxivPapers"
|
250 |
+
|
251 |
+
# create the open-source embedding function
|
252 |
+
embedding_function = SentenceTransformerEmbeddings(
|
253 |
+
model_name="all-MiniLM-L6-v2",
|
254 |
+
)
|
255 |
+
|
256 |
+
#method of integrating Chroma and Langchain
|
257 |
+
vector_db = Chroma(
|
258 |
+
client=client, # client for Chroma
|
259 |
+
collection_name=collection_name,
|
260 |
+
embedding_function=embedding_function,
|
261 |
+
)
|
262 |
+
|
263 |
+
query = "ai" # your query
|
264 |
+
|
265 |
+
# using your Chromadb as a retriever for langchain
|
266 |
+
retriever = vector_db.as_retriever()
|
267 |
+
|
268 |
+
# returning a list of documents
|
269 |
+
docs = retriever.get_relevant_documents(query)
|
270 |
+
|
271 |
# pdf_file_location = "mydir/181000551.pdf"
|
272 |
# pdf_file_location = "/workspaces/InnovationPathfinderAI/2402.17764.pdf"
|
273 |
|