Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -76,7 +76,7 @@ CSS = """
|
|
76 |
|
77 |
|
78 |
# def get_vectordb(text, images, tables):
|
79 |
-
def get_vectordb(text, images):
|
80 |
client = chromadb.EphemeralClient()
|
81 |
loader = ImageLoader()
|
82 |
sentence_transformer_ef = embedding_functions.SentenceTransformerEmbeddingFunction(
|
@@ -99,9 +99,9 @@ def get_vectordb(text, images):
|
|
99 |
metadata={"hnsw:space": "cosine"},
|
100 |
)
|
101 |
descs = []
|
102 |
-
for
|
103 |
try:
|
104 |
-
descs.append(get_image_description(
|
105 |
except:
|
106 |
descs.append("Could not generate image description due to some error")
|
107 |
|
@@ -152,6 +152,7 @@ def extract_data_from_pdfs(
|
|
152 |
all_text = ""
|
153 |
|
154 |
images = []
|
|
|
155 |
for doc in docs:
|
156 |
if do_ocr == "Get Text With OCR":
|
157 |
pdf_doc = DocumentFile.from_pdf(doc)
|
@@ -163,11 +164,12 @@ def extract_data_from_pdfs(
|
|
163 |
|
164 |
if include_images == "Include Images":
|
165 |
images.extend(extract_images([doc]))
|
|
|
166 |
|
167 |
progress(
|
168 |
0.6, "Generating image descriptions and inserting everything into vectorDB"
|
169 |
)
|
170 |
-
vectordb = get_vectordb(all_text, images)
|
171 |
|
172 |
progress(1, "Completed")
|
173 |
session["processed"] = True
|
@@ -466,4 +468,4 @@ with gr.Blocks(css=CSS, theme=gr.themes.Soft(text_size=sizes.text_md)) as demo:
|
|
466 |
|
467 |
next_p1.click(check_validity_and_llm, session_states, tabs)
|
468 |
if __name__ == "__main__":
|
469 |
-
demo.launch()
|
|
|
76 |
|
77 |
|
78 |
# def get_vectordb(text, images, tables):
|
79 |
+
def get_vectordb(text, images, img_doc_files):
|
80 |
client = chromadb.EphemeralClient()
|
81 |
loader = ImageLoader()
|
82 |
sentence_transformer_ef = embedding_functions.SentenceTransformerEmbeddingFunction(
|
|
|
99 |
metadata={"hnsw:space": "cosine"},
|
100 |
)
|
101 |
descs = []
|
102 |
+
for i in range(len(images)):
|
103 |
try:
|
104 |
+
descs.append(img_doc_files[i]+"\n"+get_image_description(images[i])[0])
|
105 |
except:
|
106 |
descs.append("Could not generate image description due to some error")
|
107 |
|
|
|
152 |
all_text = ""
|
153 |
|
154 |
images = []
|
155 |
+
img_docs=[]
|
156 |
for doc in docs:
|
157 |
if do_ocr == "Get Text With OCR":
|
158 |
pdf_doc = DocumentFile.from_pdf(doc)
|
|
|
164 |
|
165 |
if include_images == "Include Images":
|
166 |
images.extend(extract_images([doc]))
|
167 |
+
img_docs.append(doc.split("/")[-1])
|
168 |
|
169 |
progress(
|
170 |
0.6, "Generating image descriptions and inserting everything into vectorDB"
|
171 |
)
|
172 |
+
vectordb = get_vectordb(all_text, images, img_docs)
|
173 |
|
174 |
progress(1, "Completed")
|
175 |
session["processed"] = True
|
|
|
468 |
|
469 |
next_p1.click(check_validity_and_llm, session_states, tabs)
|
470 |
if __name__ == "__main__":
|
471 |
+
demo.launch()
|