lfoppiano committed on
Commit
848c18f
·
1 Parent(s): 4e9cd1b

update dependencies, remove biblio from search space

Browse files
document_qa/document_qa_engine.py CHANGED
@@ -7,7 +7,6 @@ import tiktoken
7
  from langchain.chains import create_extraction_chain
8
  from langchain.chains.question_answering import load_qa_chain, stuff_prompt, refine_prompts, map_reduce_prompt, \
9
  map_rerank_prompt
10
- from langchain.evaluation import PairwiseEmbeddingDistanceEvalChain, load_evaluator, EmbeddingDistance
11
  from langchain.prompts import SystemMessagePromptTemplate, HumanMessagePromptTemplate, ChatPromptTemplate
12
  from langchain.retrievers import MultiQueryRetriever
13
  from langchain.schema import Document
@@ -273,7 +272,7 @@ class DocumentQAEngine:
273
  """
274
  db = self.data_storage.embeddings_dict[doc_id]
275
  retriever = db.as_retriever(search_kwargs={"k": context_size}, search_type="similarity_with_embeddings")
276
- relevant_documents = retriever.get_relevant_documents(query)
277
 
278
  return relevant_documents
279
 
@@ -284,7 +283,7 @@ class DocumentQAEngine:
284
  # search_type="similarity_score_threshold"
285
  # )
286
  retriever = db.as_retriever(search_kwargs={"k": context_size}, search_type="similarity_with_embeddings")
287
- relevant_documents = retriever.get_relevant_documents(query)
288
  relevant_document_coordinates = [doc.metadata['coordinates'].split(";") if 'coordinates' in doc.metadata else []
289
  for doc in
290
  relevant_documents]
@@ -338,7 +337,7 @@ class DocumentQAEngine:
338
  def _get_context(self, doc_id, query, context_size=4) -> (List[Document], list):
339
  db = self.data_storage.embeddings_dict[doc_id]
340
  retriever = db.as_retriever(search_kwargs={"k": context_size})
341
- relevant_documents = retriever.get_relevant_documents(query)
342
  relevant_document_coordinates = [doc.metadata['coordinates'].split(";") if 'coordinates' in doc.metadata else []
343
  for doc in
344
  relevant_documents]
@@ -361,7 +360,7 @@ class DocumentQAEngine:
361
  def _get_context_multiquery(self, doc_id, query, context_size=4):
362
  db = self.data_storage.embeddings_dict[doc_id].as_retriever(search_kwargs={"k": context_size})
363
  multi_query_retriever = MultiQueryRetriever.from_llm(retriever=db, llm=self.llm)
364
- relevant_documents = multi_query_retriever.get_relevant_documents(query)
365
  return relevant_documents
366
 
367
  def get_text_from_document(self, pdf_file_path, chunk_size=-1, perc_overlap=0.1, verbose=False):
 
7
  from langchain.chains import create_extraction_chain
8
  from langchain.chains.question_answering import load_qa_chain, stuff_prompt, refine_prompts, map_reduce_prompt, \
9
  map_rerank_prompt
 
10
  from langchain.prompts import SystemMessagePromptTemplate, HumanMessagePromptTemplate, ChatPromptTemplate
11
  from langchain.retrievers import MultiQueryRetriever
12
  from langchain.schema import Document
 
272
  """
273
  db = self.data_storage.embeddings_dict[doc_id]
274
  retriever = db.as_retriever(search_kwargs={"k": context_size}, search_type="similarity_with_embeddings")
275
+ relevant_documents = retriever.invoke(query)
276
 
277
  return relevant_documents
278
 
 
283
  # search_type="similarity_score_threshold"
284
  # )
285
  retriever = db.as_retriever(search_kwargs={"k": context_size}, search_type="similarity_with_embeddings")
286
+ relevant_documents = retriever.invoke(query)
287
  relevant_document_coordinates = [doc.metadata['coordinates'].split(";") if 'coordinates' in doc.metadata else []
288
  for doc in
289
  relevant_documents]
 
337
  def _get_context(self, doc_id, query, context_size=4) -> (List[Document], list):
338
  db = self.data_storage.embeddings_dict[doc_id]
339
  retriever = db.as_retriever(search_kwargs={"k": context_size})
340
+ relevant_documents = retriever.invoke(query)
341
  relevant_document_coordinates = [doc.metadata['coordinates'].split(";") if 'coordinates' in doc.metadata else []
342
  for doc in
343
  relevant_documents]
 
360
  def _get_context_multiquery(self, doc_id, query, context_size=4):
361
  db = self.data_storage.embeddings_dict[doc_id].as_retriever(search_kwargs={"k": context_size})
362
  multi_query_retriever = MultiQueryRetriever.from_llm(retriever=db, llm=self.llm)
363
+ relevant_documents = multi_query_retriever.invoke(query)
364
  return relevant_documents
365
 
366
  def get_text_from_document(self, pdf_file_path, chunk_size=-1, perc_overlap=0.1, verbose=False):
document_qa/grobid_processors.py CHANGED
@@ -148,15 +148,15 @@ class GrobidProcessor(BaseProcessor):
148
  soup = BeautifulSoup(text, 'xml')
149
  blocks_header = get_xml_nodes_header(soup, use_paragraphs=True)
150
 
151
- passages.append({
152
- "text": f"authors: {biblio['authors']}",
153
- "type": passage_type,
154
- "section": "<header>",
155
- "subSection": "<authors>",
156
- "passage_id": "hauthors",
157
- "coordinates": ";".join([node['coords'] if coordinates and node.has_attr('coords') else "" for node in
158
- blocks_header['authors']])
159
- })
160
 
161
  passages.append({
162
  "text": self.post_process(" ".join([node.text for node in blocks_header['title']])),
 
148
  soup = BeautifulSoup(text, 'xml')
149
  blocks_header = get_xml_nodes_header(soup, use_paragraphs=True)
150
 
151
+ # passages.append({
152
+ # "text": f"authors: {biblio['authors']}",
153
+ # "type": passage_type,
154
+ # "section": "<header>",
155
+ # "subSection": "<authors>",
156
+ # "passage_id": "hauthors",
157
+ # "coordinates": ";".join([node['coords'] if coordinates and node.has_attr('coords') else "" for node in
158
+ # blocks_header['authors']])
159
+ # })
160
 
161
  passages.append({
162
  "text": self.post_process(" ".join([node.text for node in blocks_header['title']])),
requirements.txt CHANGED
@@ -16,14 +16,17 @@ dateparser
16
 
17
  # LLM
18
  chromadb==0.4.24
19
- tiktoken==0.6.0
20
- openai==1.16.2
21
- langchain==0.1.14
22
- langchain-core==0.1.40
 
 
 
23
  typing-inspect==0.9.0
24
  typing_extensions==4.11.0
25
  pydantic==2.6.4
26
  sentence_transformers==2.6.1
27
- streamlit-pdf-viewer==0.0.17
28
  umap-learn
29
  plotly
 
16
 
17
  # LLM
18
  chromadb==0.4.24
19
+ tiktoken==0.7.0
20
+ openai==1.42.0
21
+ langchain==0.2.14
22
+ langchain-core==0.2.34
23
+ langchain-openai==0.1.22
24
+ langchain-huggingface==0.0.3
25
+ langchain-community==0.2.12
26
  typing-inspect==0.9.0
27
  typing_extensions==4.11.0
28
  pydantic==2.6.4
29
  sentence_transformers==2.6.1
30
+ streamlit-pdf-viewer==0.0.18-dev1
31
  umap-learn
32
  plotly
streamlit_app.py CHANGED
@@ -6,10 +6,11 @@ from tempfile import NamedTemporaryFile
6
  import dotenv
7
  from grobid_quantities.quantities import QuantitiesAPI
8
  from langchain.memory import ConversationBufferWindowMemory
9
- from langchain_community.chat_models.openai import ChatOpenAI
10
- from langchain_community.embeddings.huggingface import HuggingFaceEmbeddings
11
- from langchain_community.embeddings.openai import OpenAIEmbeddings
12
  from langchain_community.llms.huggingface_endpoint import HuggingFaceEndpoint
 
 
13
  from streamlit_pdf_viewer import pdf_viewer
14
 
15
  from document_qa.ner_client_generic import NERClientGeneric
@@ -97,6 +98,9 @@ if 'pdf' not in st.session_state:
97
  if 'embeddings' not in st.session_state:
98
  st.session_state['embeddings'] = None
99
 
 
 
 
100
  st.set_page_config(
101
  page_title="Scientific Document Insights Q/A",
102
  page_icon="📝",
@@ -169,7 +173,8 @@ def init_qa(model, embeddings_name=None, api_key=None):
169
  repo_id=OPEN_MODELS[model],
170
  temperature=0.01,
171
  max_new_tokens=4092,
172
- model_kwargs={"max_length": 8192}
 
173
  )
174
  embeddings = HuggingFaceEmbeddings(
175
  model_name=OPEN_EMBEDDINGS[embeddings_name])
@@ -233,8 +238,8 @@ def play_old_messages(container):
233
  # is_api_key_provided = st.session_state['api_key']
234
 
235
  with st.sidebar:
236
- st.title("📝 Scientific Document Insights Q/A")
237
- st.subheader("Upload a scientific article in PDF, ask questions, get insights.")
238
  st.markdown(
239
  ":warning: [Usage disclaimer](https://github.com/lfoppiano/document-qa?tab=readme-ov-file#disclaimer-on-data-security-and-privacy-%EF%B8%8F) :warning: ")
240
 
@@ -301,14 +306,14 @@ with st.sidebar:
301
  # help="Clear the conversational memory. Currently implemented to retrain the 4 most recent messages.",
302
  # disabled=model in st.session_state['rqa'] and st.session_state['rqa'][model].memory is None)
303
 
304
- left_column, right_column = st.columns([1, 1])
305
  right_column = right_column.container(border=True)
306
  left_column = left_column.container(border=True)
307
 
308
  with right_column:
309
  uploaded_file = st.file_uploader(
310
- "Upload an article",
311
- type=("pdf", "txt"),
312
  on_change=new_file,
313
  disabled=st.session_state['model'] is not None and st.session_state['model'] not in
314
  st.session_state['api_keys'],
@@ -343,6 +348,10 @@ with st.sidebar:
343
  "relevant paragraphs to the question in the paper. "
344
  "Question coefficient attempt to estimate how effective the question will be answered."
345
  )
 
 
 
 
346
  st.session_state['ner_processing'] = st.checkbox(
347
  "Identify materials and properties.",
348
  help='The LLM responses undergo post-processing to extract physical quantities, measurements, and materials mentions.'
@@ -415,7 +424,6 @@ def generate_color_gradient(num_elements):
415
 
416
  with right_column:
417
  if st.session_state.loaded_embeddings and question and len(question) > 0 and st.session_state.doc_id:
418
- # messages.chat_message("user").markdown(question)
419
  st.session_state.messages.append({"role": "user", "mode": mode, "content": question})
420
 
421
  for message in st.session_state.messages:
@@ -491,5 +499,6 @@ with left_column:
491
  input=st.session_state['binary'],
492
  annotation_outline_size=2,
493
  annotations=st.session_state['annotations'],
494
- render_text=True
 
495
  )
 
6
  import dotenv
7
  from grobid_quantities.quantities import QuantitiesAPI
8
  from langchain.memory import ConversationBufferWindowMemory
9
+ from langchain_community.callbacks import PromptLayerCallbackHandler
10
+ from langchain_community.chat_models import ChatOpenAI
 
11
  from langchain_community.llms.huggingface_endpoint import HuggingFaceEndpoint
12
+ from langchain_huggingface import HuggingFaceEmbeddings
13
+ from langchain_openai import OpenAIEmbeddings
14
  from streamlit_pdf_viewer import pdf_viewer
15
 
16
  from document_qa.ner_client_generic import NERClientGeneric
 
98
  if 'embeddings' not in st.session_state:
99
  st.session_state['embeddings'] = None
100
 
101
+ if 'scroll_to_first_annotation' not in st.session_state:
102
+ st.session_state['scroll_to_first_annotation'] = False
103
+
104
  st.set_page_config(
105
  page_title="Scientific Document Insights Q/A",
106
  page_icon="📝",
 
173
  repo_id=OPEN_MODELS[model],
174
  temperature=0.01,
175
  max_new_tokens=4092,
176
+ model_kwargs={"max_length": 8192},
177
+ callbacks=[PromptLayerCallbackHandler(pl_tags=[model, "document-qa"])]
178
  )
179
  embeddings = HuggingFaceEmbeddings(
180
  model_name=OPEN_EMBEDDINGS[embeddings_name])
 
238
  # is_api_key_provided = st.session_state['api_key']
239
 
240
  with st.sidebar:
241
+ st.title("📝 Document Q/A")
242
+ st.markdown("Upload a scientific article in PDF, ask questions, get insights.")
243
  st.markdown(
244
  ":warning: [Usage disclaimer](https://github.com/lfoppiano/document-qa?tab=readme-ov-file#disclaimer-on-data-security-and-privacy-%EF%B8%8F) :warning: ")
245
 
 
306
  # help="Clear the conversational memory. Currently implemented to retrain the 4 most recent messages.",
307
  # disabled=model in st.session_state['rqa'] and st.session_state['rqa'][model].memory is None)
308
 
309
+ left_column, right_column = st.columns([5, 4])
310
  right_column = right_column.container(border=True)
311
  left_column = left_column.container(border=True)
312
 
313
  with right_column:
314
  uploaded_file = st.file_uploader(
315
+ "Upload a scientific article",
316
+ type=("pdf"),
317
  on_change=new_file,
318
  disabled=st.session_state['model'] is not None and st.session_state['model'] not in
319
  st.session_state['api_keys'],
 
348
  "relevant paragraphs to the question in the paper. "
349
  "Question coefficient attempt to estimate how effective the question will be answered."
350
  )
351
+ st.session_state['scroll_to_first_annotation'] = st.checkbox(
352
+ "Scroll to context",
353
+ help='The PDF viewer will automatically scroll to the first relevant passage in the document.'
354
+ )
355
  st.session_state['ner_processing'] = st.checkbox(
356
  "Identify materials and properties.",
357
  help='The LLM responses undergo post-processing to extract physical quantities, measurements, and materials mentions.'
 
424
 
425
  with right_column:
426
  if st.session_state.loaded_embeddings and question and len(question) > 0 and st.session_state.doc_id:
 
427
  st.session_state.messages.append({"role": "user", "mode": mode, "content": question})
428
 
429
  for message in st.session_state.messages:
 
499
  input=st.session_state['binary'],
500
  annotation_outline_size=2,
501
  annotations=st.session_state['annotations'],
502
+ render_text=True,
503
+ scroll_to_annotation=1 if (st.session_state['annotations'] and st.session_state['scroll_to_first_annotation']) else None
504
  )