Spaces:
Runtime error
Runtime error
freQuensy23
committed on
Commit
•
566eb82
1
Parent(s):
3e243df
[FIX] dependency hell
Browse files
- main.py +11 -5
- requirements.txt +4 -4
main.py
CHANGED
@@ -12,14 +12,17 @@ embeddings = SentenceTransformerEmbeddings(model_name=model)
|
|
12 |
prev_files = None
|
13 |
retriever = None
|
14 |
|
|
|
15 |
def handle_files_and_query(query, files):
|
16 |
results = ""
|
17 |
global prev_files, retriever
|
|
|
18 |
if files is not None and files != prev_files:
|
19 |
documents = []
|
20 |
prev_files = files
|
21 |
for file in files:
|
22 |
-
documents.extend(
|
|
|
23 |
retriever = BM25Retriever.from_documents(documents, k=100)
|
24 |
results += "Index created successfully!\n"
|
25 |
print("Index created successfully!")
|
@@ -31,19 +34,22 @@ def handle_files_and_query(query, files):
|
|
31 |
print(f"Query: {query}")
|
32 |
if query:
|
33 |
search_results = retriever.get_relevant_documents(query)
|
34 |
-
pattern = r'[^\\/]+$'
|
35 |
-
reranked_results = FAISS.from_documents(search_results, embeddings,
|
|
|
|
|
36 |
results = "\n".join([
|
37 |
f"Source: {re.search(pattern, result.metadata['file_path']).group(0)}\nPage: {result.metadata['page']}\nContent:\n{result.page_content}\n"
|
38 |
for result in reranked_results
|
39 |
])
|
40 |
return results
|
41 |
|
|
|
42 |
interface = gr.Interface(
|
43 |
fn=handle_files_and_query,
|
44 |
inputs=[
|
45 |
-
gr.Textbox(lines
|
46 |
-
gr.File(file_count="multiple", type="
|
47 |
],
|
48 |
outputs="text",
|
49 |
title="Similarity Search for PDFs"
|
|
|
12 |
prev_files = None
|
13 |
retriever = None
|
14 |
|
15 |
+
|
16 |
def handle_files_and_query(query, files):
|
17 |
results = ""
|
18 |
global prev_files, retriever
|
19 |
+
files = [f.name for f in files]
|
20 |
if files is not None and files != prev_files:
|
21 |
documents = []
|
22 |
prev_files = files
|
23 |
for file in files:
|
24 |
+
documents.extend(
|
25 |
+
PyMuPDFLoader(file).load_and_split(SentenceTransformersTokenTextSplitter(model_name=model)))
|
26 |
retriever = BM25Retriever.from_documents(documents, k=100)
|
27 |
results += "Index created successfully!\n"
|
28 |
print("Index created successfully!")
|
|
|
34 |
print(f"Query: {query}")
|
35 |
if query:
|
36 |
search_results = retriever.get_relevant_documents(query)
|
37 |
+
pattern = r'[^\\/]+$' # pattern to get filename from filepath
|
38 |
+
reranked_results = FAISS.from_documents(search_results, embeddings,
|
39 |
+
distance_strategy=DistanceStrategy.COSINE).similarity_search(query,
|
40 |
+
k=25)
|
41 |
results = "\n".join([
|
42 |
f"Source: {re.search(pattern, result.metadata['file_path']).group(0)}\nPage: {result.metadata['page']}\nContent:\n{result.page_content}\n"
|
43 |
for result in reranked_results
|
44 |
])
|
45 |
return results
|
46 |
|
47 |
+
|
48 |
interface = gr.Interface(
|
49 |
fn=handle_files_and_query,
|
50 |
inputs=[
|
51 |
+
gr.Textbox(lines=1, label="Enter your search query here..."),
|
52 |
+
gr.File(file_count="multiple", type="file", file_types=[".pdf"], label="Upload a file here.")
|
53 |
],
|
54 |
outputs="text",
|
55 |
title="Similarity Search for PDFs"
|
requirements.txt
CHANGED
@@ -26,7 +26,7 @@ emoji==2.8.0
|
|
26 |
et-xmlfile==1.1.0
|
27 |
exceptiongroup==1.1.1
|
28 |
faiss-cpu==1.7.4
|
29 |
-
fastapi
|
30 |
ffmpy==0.3.0
|
31 |
filelock==3.12.2
|
32 |
filetype==1.2.0
|
@@ -34,7 +34,7 @@ flatbuffers==23.5.26
|
|
34 |
fonttools==4.40.0
|
35 |
frozenlist==1.3.3
|
36 |
fsspec==2023.6.0
|
37 |
-
gradio
|
38 |
gradio_client==0.7.3
|
39 |
h11==0.14.0
|
40 |
httpcore==0.17.2
|
@@ -49,7 +49,7 @@ Jinja2==3.1.2
|
|
49 |
joblib==1.3.2
|
50 |
jsonschema==4.17.3
|
51 |
kiwisolver==1.4.4
|
52 |
-
langchain
|
53 |
langchainplus-sdk==0.0.16
|
54 |
langdetect==1.0.9
|
55 |
layoutparser==0.3.4
|
@@ -90,7 +90,7 @@ portalocker==2.8.2
|
|
90 |
protobuf==4.25.1
|
91 |
pycocotools==2.0.7
|
92 |
pycparser==2.21
|
93 |
-
pydantic
|
94 |
pydantic_core==2.14.5
|
95 |
pydub==0.25.1
|
96 |
Pygments==2.15.1
|
|
|
26 |
et-xmlfile==1.1.0
|
27 |
exceptiongroup==1.1.1
|
28 |
faiss-cpu==1.7.4
|
29 |
+
fastapi
|
30 |
ffmpy==0.3.0
|
31 |
filelock==3.12.2
|
32 |
filetype==1.2.0
|
|
|
34 |
fonttools==4.40.0
|
35 |
frozenlist==1.3.3
|
36 |
fsspec==2023.6.0
|
37 |
+
gradio
|
38 |
gradio_client==0.7.3
|
39 |
h11==0.14.0
|
40 |
httpcore==0.17.2
|
|
|
49 |
joblib==1.3.2
|
50 |
jsonschema==4.17.3
|
51 |
kiwisolver==1.4.4
|
52 |
+
langchain
|
53 |
langchainplus-sdk==0.0.16
|
54 |
langdetect==1.0.9
|
55 |
layoutparser==0.3.4
|
|
|
90 |
protobuf==4.25.1
|
91 |
pycocotools==2.0.7
|
92 |
pycparser==2.21
|
93 |
+
pydantic
|
94 |
pydantic_core==2.14.5
|
95 |
pydub==0.25.1
|
96 |
Pygments==2.15.1
|