creating custom chunking for pdfs
innovation_pathfinder_ai/vector_store/chroma_vector_store.py
CHANGED
@@ -3,6 +3,9 @@
 # https://stackoverflow.com/questions/76482987/chroma-database-embeddings-none-when-using-get
 # https://docs.trychroma.com/embeddings/hugging-face?lang=py
 # https://www.datacamp.com/tutorial/chromadb-tutorial-step-by-step-guide
+# https://python.langchain.com/docs/modules/data_connection/retrievers/self_query
+# https://python.langchain.com/docs/integrations/vectorstores/chroma#update-and-delete
+# https://python.langchain.com/docs/modules/data_connection/retrievers/vectorstore
 
 import chromadb
 import chromadb.utils.embedding_functions as embedding_functions
@@ -11,7 +14,10 @@ from langchain.text_splitter import CharacterTextSplitter
 from langchain_text_splitters import MarkdownHeaderTextSplitter
 from langchain_text_splitters import RecursiveCharacterTextSplitter
 from langchain.document_loaders import PyPDFLoader
-
+from langchain_community.embeddings.sentence_transformer import (
+    SentenceTransformerEmbeddings,
+)
+from langchain_core.documents import Document
 import uuid
 import dotenv
 import os
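Note on the new imports: in this diff, SentenceTransformerEmbeddings is only exercised in a commented-out line further down. A minimal sketch of what that embedding function does, not part of the commit, assuming the sentence-transformers package is installed and using the model name from that commented line:

    from langchain_community.embeddings.sentence_transformer import (
        SentenceTransformerEmbeddings,
    )

    # "all-MiniLM-L6-v2" is the model named in the commented-out line below
    embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
    vector = embedding_function.embed_query("custom chunking for pdfs")
    print(len(vector))  # 384 dimensions for this model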
@@ -88,9 +94,6 @@ def add_markdown_to_collection(
         # path=persist_directory,
     )
 
-    # client.delete_collection(
-    #     name=collection_name,
-    # )
 
     # If the collection already exists, we just return it. This allows us to add more
     # data to an existing collection.
@@ -113,6 +116,23 @@ def add_markdown_to_collection(
             embeddings=embed_data.embed_with_retries(documents_page_content[i]),
             metadatas=data.metadata, # type: ignore
         )
+
+def split_by_intervals(s: str, interval: int, overlapped: int = 0) -> list:
+    """
+    Split a string into intervals of a given length, with optional overlapping.
+
+    Args:
+        s: The input string.
+        interval: The length of each interval.
+        overlapped: The number of characters to overlap between intervals. Default is 0.
+
+    Returns:
+        A list of substrings, each containing 'interval' characters from the input string.
+    """
+    result = []
+    for i in range(0, len(s), interval - overlapped):
+        result.append(s[i:i + interval])
+    return result
 
 
 def add_pdf_to_vector_store(
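A quick behavioral sketch of the split_by_intervals helper added above (standalone, with hypothetical inputs). Two edge cases worth noting: the step interval - overlapped must stay positive, so overlapped equal to interval raises ValueError from range(), and overlapped greater than interval silently returns an empty list; the final chunk may also be shorter than interval.

    # Standalone copy of the helper above, exercised with example values.
    def split_by_intervals(s: str, interval: int, overlapped: int = 0) -> list:
        result = []
        for i in range(0, len(s), interval - overlapped):
            result.append(s[i:i + interval])
        return result

    print(split_by_intervals("abcdefghij", 4, 2))
    # ['abcd', 'cdef', 'efgh', 'ghij', 'ij']  <- last chunk shorter than interval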
@@ -139,20 +159,43 @@ def add_pdf_to_vector_store(
 
     loader = PyPDFLoader(pdf_file_location)
 
+    # text_splitter = CharacterTextSplitter(
+    #     chunk_size=text_chunk_size,
+    #     chunk_overlap=text_chunk_overlap,
+    # )
+
     text_splitter = CharacterTextSplitter(
-        chunk_size=text_chunk_size,
-        chunk_overlap=text_chunk_overlap,
-    )
+        separator="\n\n",
+        chunk_size=1000,
+        chunk_overlap=200,
+        length_function=len,
+        is_separator_regex=False,
+    )
 
     documents.extend(loader.load())
 
+    split_docs:list[Document] = []
+
+    for i in documents:
+        sub_docs = split_by_intervals(
+            i.page_content,
+            text_chunk_size,
+            text_chunk_overlap
+        )
+
+        for ii in sub_docs:
+            # Document()
+            fg = Document(ii, metadata=i.metadata)
+            split_docs.append(fg)
+
+
+
+    # texts = text_splitter.create_documents([state_of_the_union])
+
     client = chromadb.PersistentClient(
         # path=persist_directory,
     )
 
-    # client.delete_collection(
-    #     name=collection_name,
-    # )
 
     collection = client.get_or_create_collection(
         name=collection_name,
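As the hunk above stands, split_docs is built via split_by_intervals but nothing downstream consumes it, and the new CharacterTextSplitter hardcodes chunk_size=1000 / chunk_overlap=200 instead of the function's text_chunk_size / text_chunk_overlap parameters. A hedged sketch of wiring the custom chunking through end to end, assuming the parameter names from this diff and split_by_intervals as defined above; this is not the commit's code:

    from langchain.document_loaders import PyPDFLoader
    from langchain_core.documents import Document

    def chunk_pdf_custom(pdf_file_location: str,
                         text_chunk_size: int = 1000,
                         text_chunk_overlap: int = 200) -> list:
        # load one Document per PDF page, then re-chunk each page by character count
        pages = PyPDFLoader(pdf_file_location).load()
        split_docs: list[Document] = []
        for page in pages:
            for piece in split_by_intervals(page.page_content,
                                            text_chunk_size,
                                            text_chunk_overlap):
                # Document takes page_content positionally; keep the page's metadata
                split_docs.append(Document(piece, metadata=page.metadata))
        return split_docs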
@@ -162,9 +205,14 @@ def add_pdf_to_vector_store(
         api_key= os.getenv("HUGGINGFACEHUB_API_TOKEN"),
     )
 
+    # create the open-source embedding function
+    # embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
+
+    docs = text_splitter.split_documents(documents)
 
     chunked_documents = text_splitter.split_documents(documents)
 
+
     documents_page_content:list = [i.page_content for i in documents]
 
 
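In this hunk, docs and chunked_documents come from identical split_documents calls, and documents_page_content is taken from the unsplit documents list, so the character-level chunks built above never reach the collection. A minimal sketch of adding the custom chunks instead, using only documented chromadb calls; collection and split_docs are assumed from the surrounding code:

    import uuid

    for doc in split_docs:
        collection.add(
            ids=[str(uuid.uuid4())],       # one unique id per chunk
            documents=[doc.page_content],  # raw text; Chroma embeds it with the
                                           # collection's embedding function
            metadatas=[doc.metadata],
        )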
@@ -181,14 +229,13 @@
 
 if __name__ == "__main__":
 
-    # vector_db = load_vector_store()
-
     collection_name="ArxivPapers"
 
     client = chromadb.PersistentClient(
         # path=persist_directory,
     )
 
+    # delete existing collection
     # client.delete_collection(
     #     name=collection_name,
     # )
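The delete_collection block above is kept commented out as a development reset. If it is ever re-enabled, note that chromadb raises when the collection does not exist, so a guard is needed; a hedged sketch, using the collection name from this script:

    import chromadb

    client = chromadb.PersistentClient()
    try:
        client.delete_collection(name="ArxivPapers")
    except Exception:
        # the exact exception type varies across chromadb versions
        pass
    collection = client.get_or_create_collection(name="ArxivPapers")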
@@ -215,10 +262,10 @@ if __name__ == "__main__":
     # pdf_file_location = "/workspaces/InnovationPathfinderAI/2402.17764.pdf"
 
 
-    # example query
+    # example query using Chroma
 
-    results = collection.query(
-        query_texts=["benchmark"],
-        n_results=3,
-        include=['embeddings', 'documents', 'metadatas'],
-    )
+    # results = collection.query(
+    #     query_texts=["benchmark"],
+    #     n_results=3,
+    #     include=['embeddings', 'documents', 'metadatas'],
+    # )
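For reference, an uncommented version of the example query above, assuming the collection was populated as in this script; the "source" metadata key comes from PyPDFLoader and is an assumption here:

    results = collection.query(
        query_texts=["benchmark"],
        n_results=3,
        include=["embeddings", "documents", "metadatas"],
    )
    # results is a dict of lists, one inner list per query text
    for text, meta in zip(results["documents"][0], results["metadatas"][0]):
        print(meta.get("source"), "->", text[:80])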
|