isayahc committed
Commit 1a930c9
1 Parent(s): 63822d9

creating custom chunking for PDFs

innovation_pathfinder_ai/vector_store/chroma_vector_store.py CHANGED
@@ -3,6 +3,9 @@
 # https://stackoverflow.com/questions/76482987/chroma-database-embeddings-none-when-using-get
 # https://docs.trychroma.com/embeddings/hugging-face?lang=py
 # https://www.datacamp.com/tutorial/chromadb-tutorial-step-by-step-guide
+# https://python.langchain.com/docs/modules/data_connection/retrievers/self_query
+# https://python.langchain.com/docs/integrations/vectorstores/chroma#update-and-delete
+# https://python.langchain.com/docs/modules/data_connection/retrievers/vectorstore
 
 import chromadb
 import chromadb.utils.embedding_functions as embedding_functions
@@ -11,7 +14,10 @@ from langchain.text_splitter import CharacterTextSplitter
 from langchain_text_splitters import MarkdownHeaderTextSplitter
 from langchain_text_splitters import RecursiveCharacterTextSplitter
 from langchain.document_loaders import PyPDFLoader
-
+from langchain_community.embeddings.sentence_transformer import (
+    SentenceTransformerEmbeddings,
+)
+from langchain_core.documents import Document
 import uuid
 import dotenv
 import os
@@ -88,9 +94,6 @@ def add_markdown_to_collection(
         # path=persist_directory,
     )
 
-    # client.delete_collection(
-    #     name=collection_name,
-    # )
 
     # If the collection already exists, we just return it. This allows us to add more
     # data to an existing collection.
@@ -113,6 +116,23 @@ def add_markdown_to_collection(
             embeddings=embed_data.embed_with_retries(documents_page_content[i]),
             metadatas=data.metadata, # type: ignore
         )
+
+def split_by_intervals(s: str, interval: int, overlapped: int = 0) -> list:
+    """
+    Split a string into intervals of a given length, with optional overlapping.
+
+    Args:
+        s: The input string.
+        interval: The length of each interval.
+        overlapped: The number of characters to overlap between intervals. Default is 0.
+
+    Returns:
+        A list of substrings, each containing 'interval' characters from the input string.
+    """
+    result = []
+    for i in range(0, len(s), interval - overlapped):
+        result.append(s[i:i + interval])
+    return result
 
 
 def add_pdf_to_vector_store(
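As a quick sanity check of the new helper (the sample strings are invented), this standalone sketch mirrors the committed logic. Note that the loop steps by `interval - overlapped`, so it assumes `overlapped < interval`; otherwise `range()` gets a non-positive step:

```python
def split_by_intervals(s: str, interval: int, overlapped: int = 0) -> list:
    # Same logic as the committed helper: fixed-width slices with optional overlap.
    result = []
    for i in range(0, len(s), interval - overlapped):
        result.append(s[i:i + interval])
    return result

print(split_by_intervals("abcdefghij", 4))     # ['abcd', 'efgh', 'ij']
print(split_by_intervals("abcdefghij", 4, 2))  # ['abcd', 'cdef', 'efgh', 'ghij', 'ij']
```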
@@ -139,20 +159,43 @@ def add_pdf_to_vector_store(
 
     loader = PyPDFLoader(pdf_file_location)
 
+    # text_splitter = CharacterTextSplitter(
+    #     chunk_size=text_chunk_size,
+    #     chunk_overlap=text_chunk_overlap,
+    # )
+
     text_splitter = CharacterTextSplitter(
-        chunk_size=text_chunk_size,
-        chunk_overlap=text_chunk_overlap,
-    )
+        separator="\n\n",
+        chunk_size=1000,
+        chunk_overlap=200,
+        length_function=len,
+        is_separator_regex=False,
+    )
 
     documents.extend(loader.load())
 
+    split_docs:list[Document] = []
+
+    for i in documents:
+        sub_docs = split_by_intervals(
+            i.page_content,
+            text_chunk_size,
+            text_chunk_overlap
+        )
+
+        for ii in sub_docs:
+            fg = Document(ii, metadata=i.metadata)
+            split_docs.append(fg)
+
+    # texts = text_splitter.create_documents([state_of_the_union])
+
     client = chromadb.PersistentClient(
         # path=persist_directory,
     )
 
-    # client.delete_collection(
-    #     name=collection_name,
-    # )
 
     collection = client.get_or_create_collection(
         name=collection_name,
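The new loop is the custom chunking the commit message refers to: each PDF page is sliced by `split_by_intervals`, and each slice is re-wrapped as a `Document` carrying the source page's metadata. A minimal self-contained sketch of the same idea (the text, metadata, and the helper name `chunk_page` are illustrative, not part of the commit):

```python
from langchain_core.documents import Document

def chunk_page(page_text: str, metadata: dict,
               chunk_size: int = 1000, overlap: int = 200) -> list[Document]:
    # Fixed-width slices with overlap, each keeping the original page metadata,
    # mirroring the split_by_intervals() loop in the commit.
    step = chunk_size - overlap
    return [Document(page_text[i:i + chunk_size], metadata=metadata)
            for i in range(0, len(page_text), step)]

chunks = chunk_page("Lorem ipsum dolor sit amet. " * 200,
                    {"source": "example.pdf", "page": 0})
print(len(chunks), chunks[0].metadata)
```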
@@ -162,9 +205,14 @@ def add_pdf_to_vector_store(
         api_key= os.getenv("HUGGINGFACEHUB_API_TOKEN"),
     )
 
+    # create the open-source embedding function
+    # embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
+
+    docs = text_splitter.split_documents(documents)
 
     chunked_documents = text_splitter.split_documents(documents)
 
+
     documents_page_content:list = [i.page_content for i in documents]
 
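The commented-out `SentenceTransformerEmbeddings` line sketches a local, open-source alternative to the hosted Hugging Face embedding function used above. One way it could be wired up, assuming the `all-MiniLM-L6-v2` model named in the comment (not something this commit actually enables):

```python
from langchain_community.embeddings.sentence_transformer import (
    SentenceTransformerEmbeddings,
)

# Local embedding function; avoids the HUGGINGFACEHUB_API_TOKEN dependency.
embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
vectors = embedding_function.embed_documents(["a short test sentence"])
print(len(vectors[0]))  # 384 dimensions for all-MiniLM-L6-v2
```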
 
@@ -181,14 +229,13 @@ def add_pdf_to_vector_store(
 
 if __name__ == "__main__":
 
-    # vector_db = load_vector_store()
-
     collection_name="ArxivPapers"
 
     client = chromadb.PersistentClient(
         # path=persist_directory,
     )
 
+    # delete existing collection
     # client.delete_collection(
     #     name=collection_name,
     # )
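If uncommented, the newly labelled reset path drops the collection before rebuilding it. A short sketch of that flow (destructive, which is presumably why it stays commented out):

```python
import chromadb

client = chromadb.PersistentClient()
# Destructive: removes the collection and its embeddings before a full rebuild.
client.delete_collection(name="ArxivPapers")
collection = client.get_or_create_collection(name="ArxivPapers")
```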
@@ -215,10 +262,10 @@ if __name__ == "__main__":
     # pdf_file_location = "/workspaces/InnovationPathfinderAI/2402.17764.pdf"
 
 
-    # example query
+    # example query using Chroma
 
-    results = collection.query(
-        query_texts=["benchmark"],
-        n_results=3,
-        include=['embeddings', 'documents', 'metadatas'],
-    )
+    # results = collection.query(
+    #     query_texts=["benchmark"],
+    #     n_results=3,
+    #     include=['embeddings', 'documents', 'metadatas'],
+    # )
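Although the example query is now commented out, it still documents the intended retrieval path. A runnable sketch against a populated collection (the query text "benchmark" comes from the commit; the result handling is illustrative):

```python
# Assumes `collection` was populated as in __main__ above.
results = collection.query(
    query_texts=["benchmark"],
    n_results=3,
    include=["documents", "metadatas"],
)
for doc, meta in zip(results["documents"][0], results["metadatas"][0]):
    print(meta.get("source", "?"), "->", doc[:80])
```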
 