anand004 committed on
Commit
ec030eb
·
verified ·
1 Parent(s): cf4a2da

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +7 -5
app.py CHANGED
@@ -76,7 +76,7 @@ CSS = """
76
 
77
 
78
  # def get_vectordb(text, images, tables):
79
- def get_vectordb(text, images):
80
  client = chromadb.EphemeralClient()
81
  loader = ImageLoader()
82
  sentence_transformer_ef = embedding_functions.SentenceTransformerEmbeddingFunction(
@@ -99,9 +99,9 @@ def get_vectordb(text, images):
99
  metadata={"hnsw:space": "cosine"},
100
  )
101
  descs = []
102
- for image in images:
103
  try:
104
- descs.append(get_image_description(image)[0])
105
  except:
106
  descs.append("Could not generate image description due to some error")
107
 
@@ -152,6 +152,7 @@ def extract_data_from_pdfs(
152
  all_text = ""
153
 
154
  images = []
 
155
  for doc in docs:
156
  if do_ocr == "Get Text With OCR":
157
  pdf_doc = DocumentFile.from_pdf(doc)
@@ -163,11 +164,12 @@ def extract_data_from_pdfs(
163
 
164
  if include_images == "Include Images":
165
  images.extend(extract_images([doc]))
 
166
 
167
  progress(
168
  0.6, "Generating image descriptions and inserting everything into vectorDB"
169
  )
170
- vectordb = get_vectordb(all_text, images)
171
 
172
  progress(1, "Completed")
173
  session["processed"] = True
@@ -466,4 +468,4 @@ with gr.Blocks(css=CSS, theme=gr.themes.Soft(text_size=sizes.text_md)) as demo:
466
 
467
  next_p1.click(check_validity_and_llm, session_states, tabs)
468
  if __name__ == "__main__":
469
- demo.launch()
 
76
 
77
 
78
  # def get_vectordb(text, images, tables):
79
+ def get_vectordb(text, images, img_doc_files):
80
  client = chromadb.EphemeralClient()
81
  loader = ImageLoader()
82
  sentence_transformer_ef = embedding_functions.SentenceTransformerEmbeddingFunction(
 
99
  metadata={"hnsw:space": "cosine"},
100
  )
101
  descs = []
102
+ for i in range(len(images)):
103
  try:
104
+ descs.append(img_doc_files[i]+"\n"+get_image_description(images[i])[0])
105
  except:
106
  descs.append("Could not generate image description due to some error")
107
 
 
152
  all_text = ""
153
 
154
  images = []
155
+ img_docs=[]
156
  for doc in docs:
157
  if do_ocr == "Get Text With OCR":
158
  pdf_doc = DocumentFile.from_pdf(doc)
 
164
 
165
  if include_images == "Include Images":
166
  images.extend(extract_images([doc]))
167
+ img_docs.append(doc.split("/")[-1])
168
 
169
  progress(
170
  0.6, "Generating image descriptions and inserting everything into vectorDB"
171
  )
172
+ vectordb = get_vectordb(all_text, images, img_docs)
173
 
174
  progress(1, "Completed")
175
  session["processed"] = True
 
468
 
469
  next_p1.click(check_validity_and_llm, session_states, tabs)
470
  if __name__ == "__main__":
471
+ demo.launch()