isayahc committed on
Commit
cc8fd4e
1 Parent(s): 1a930c9

removed commented-out code, improved variable names, and showed an instance of using Chroma as a retriever

innovation_pathfinder_ai/vector_store/chroma_vector_store.py CHANGED
@@ -13,11 +13,15 @@ import chromadb.utils.embedding_functions as embedding_functions
 from langchain.text_splitter import CharacterTextSplitter
 from langchain_text_splitters import MarkdownHeaderTextSplitter
 from langchain_text_splitters import RecursiveCharacterTextSplitter
+
+from langchain_core.documents import Document
 from langchain.document_loaders import PyPDFLoader
+from langchain_community.vectorstores import Chroma
+
 from langchain_community.embeddings.sentence_transformer import (
     SentenceTransformerEmbeddings,
 )
-from langchain_core.documents import Document
+
 import uuid
 import dotenv
 import os
@@ -159,66 +163,44 @@ def add_pdf_to_vector_store(
 
     loader = PyPDFLoader(pdf_file_location)
 
-    # text_splitter = CharacterTextSplitter(
-    #     chunk_size=text_chunk_size,
-    #     chunk_overlap=text_chunk_overlap,
-    # )
-
-    text_splitter = CharacterTextSplitter(
-        separator="\n\n",
-        chunk_size=1000,
-        chunk_overlap=200,
-        length_function=len,
-        is_separator_regex=False,
-    )
-
     documents.extend(loader.load())
 
     split_docs:list[Document] = []
 
-    for i in documents:
+    for document in documents:
         sub_docs = split_by_intervals(
-            i.page_content,
+            document.page_content,
             text_chunk_size,
             text_chunk_overlap
         )
 
-        for ii in sub_docs:
-            # Document()
-            fg = Document(ii, metadata=i.metadata)
-            split_docs.append(fg)
-
+        for sub_doc in sub_docs:
+            loaded_doc = Document(sub_doc, metadata=document.metadata)
+            split_docs.append(loaded_doc)
 
 
-    # texts = text_splitter.create_documents([state_of_the_union])
-
     client = chromadb.PersistentClient(
         # path=persist_directory,
     )
 
-
     collection = client.get_or_create_collection(
         name=collection_name,
     )
 
     embed_data = embedding_functions.HuggingFaceEmbeddingFunction(
-        api_key= os.getenv("HUGGINGFACEHUB_API_TOKEN"),
+        api_key = os.getenv("HUGGINGFACEHUB_API_TOKEN"),
+        model_name="sentence-transformers/all-MiniLM-L6-v2" # added model name for clarity
     )
 
     # create the open-source embedding function
     # embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
 
-    docs = text_splitter.split_documents(documents)
-
-    chunked_documents = text_splitter.split_documents(documents)
+    documents_page_content:list = [i.page_content for i in split_docs]
 
 
-    documents_page_content:list = [i.page_content for i in documents]
-
-
-    for i in range(0, len(documents)):
-        data = documents[i]
-        print(i)
+    for i in range(0, len(split_docs)):
+        data = split_docs[i]
+
     collection.add(
         ids=[generate_uuid()], # give each document a uuid
         documents=documents_page_content[i], # contents of document
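
The rewritten loop relies on split_by_intervals, a helper defined elsewhere in this module and not shown in this diff. A minimal sketch of what fixed-interval splitting with overlap could look like, assuming the helper takes the raw text plus a chunk size and overlap and returns plain strings (the project's actual implementation may differ):

    # Hypothetical sketch of split_by_intervals -- not the project's actual code.
    # Slices text into fixed-size character windows, each overlapping the
    # previous window by chunk_overlap characters.
    def split_by_intervals(text: str, chunk_size: int, chunk_overlap: int) -> list[str]:
        step = max(chunk_size - chunk_overlap, 1)  # guard against a non-positive step
        return [text[i:i + chunk_size] for i in range(0, len(text), step)]

Each returned string is then wrapped in a Document that carries over the source page's metadata, which is what the new "for sub_doc in sub_docs" loop does.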
@@ -258,6 +240,34 @@ if __name__ == "__main__":
         pdf_file_location=pdf_file_location,
     )
 
+    # create the client using Chroma's library
+    client = chromadb.PersistentClient(
+        # path=persist_directory,
+    )
+
+    # this is an example collection name
+    collection_name = "ArxivPapers"
+
+    # create the open-source embedding function
+    embedding_function = SentenceTransformerEmbeddings(
+        model_name="all-MiniLM-L6-v2",
+    )
+
+    # method of integrating Chroma and LangChain
+    vector_db = Chroma(
+        client=client,  # client for Chroma
+        collection_name=collection_name,
+        embedding_function=embedding_function,
+    )
+
+    query = "ai"  # your query
+
+    # using your Chroma collection as a retriever for LangChain
+    retriever = vector_db.as_retriever()
+
+    # returns a list of relevant documents
+    docs = retriever.get_relevant_documents(query)
+
     # pdf_file_location = "mydir/181000551.pdf"
     # pdf_file_location = "/workspaces/InnovationPathfinderAI/2402.17764.pdf"
 
 
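A note on the retriever example above: as_retriever() wraps the Chroma vector store in LangChain's generic retriever interface, and get_relevant_documents(query) runs a similarity search against the collection (newer LangChain releases expose the same call as retriever.invoke(query)). A short sketch of consuming the result, assuming each document was indexed with a "source" entry in its metadata:

    # Inspect the retrieved documents: each one is a langchain_core Document
    # carrying page_content plus whatever metadata was stored at indexing time.
    for doc in docs:
        source = doc.metadata.get("source", "unknown")  # the "source" key is an assumption
        print(f"{source}: {doc.page_content[:100]}")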