kevin-pek committed on
Commit
3e243df
1 Parent(s): e6dc9f0

edit number of results to retrieve

Browse files
Files changed (1) hide show
  1. main.py +7 -11
main.py CHANGED
@@ -7,7 +7,7 @@ from langchain.vectorstores import FAISS
7
  import gradio as gr
8
  import re
9
 
10
- model = "msmarco-distilbert-base-v4"
11
  embeddings = SentenceTransformerEmbeddings(model_name=model)
12
  prev_files = None
13
  retriever = None
@@ -20,7 +20,7 @@ def handle_files_and_query(query, files):
20
  prev_files = files
21
  for file in files:
22
  documents.extend(PyMuPDFLoader(file).load_and_split(SentenceTransformersTokenTextSplitter(model_name=model)))
23
- retriever = BM25Retriever.from_documents(documents, k=10)
24
  results += "Index created successfully!\n"
25
  print("Index created successfully!")
26
  elif files is None:
@@ -30,17 +30,13 @@ def handle_files_and_query(query, files):
30
 
31
  print(f"Query: {query}")
32
  if query:
33
- search_results = retriever.get_relevant_documents(query, k=25)
34
  pattern = r'[^\\/]+$' # pattern to get filename from filepath
35
- reranked_results = FAISS.from_documents(search_results, embeddings, distance_strategy=DistanceStrategy.COSINE).similarity_search(query, k=1)
36
- print([
37
- f"Source: {re.search(pattern, result.metadata['file_path']).group(0)}\nPage: {result.metadata['page']}"
38
  for result in reranked_results
39
- ][0])
40
- results = [
41
- f"Source: {re.search(pattern, result.metadata['file_path']).group(0)}\nPage: {result.metadata['page']}\nContent:\n{result.page_content}"
42
- for result in reranked_results
43
- ][0]
44
  return results
45
 
46
  interface = gr.Interface(
 
7
  import gradio as gr
8
  import re
9
 
10
+ model = "msmarco-distilbert-base-tas-b"
11
  embeddings = SentenceTransformerEmbeddings(model_name=model)
12
  prev_files = None
13
  retriever = None
 
20
  prev_files = files
21
  for file in files:
22
  documents.extend(PyMuPDFLoader(file).load_and_split(SentenceTransformersTokenTextSplitter(model_name=model)))
23
+ retriever = BM25Retriever.from_documents(documents, k=100)
24
  results += "Index created successfully!\n"
25
  print("Index created successfully!")
26
  elif files is None:
 
30
 
31
  print(f"Query: {query}")
32
  if query:
33
+ search_results = retriever.get_relevant_documents(query)
34
  pattern = r'[^\\/]+$' # pattern to get filename from filepath
35
+ reranked_results = FAISS.from_documents(search_results, embeddings, distance_strategy=DistanceStrategy.COSINE).similarity_search(query, k=25)
36
+ results = "\n".join([
37
+ f"Source: {re.search(pattern, result.metadata['file_path']).group(0)}\nPage: {result.metadata['page']}\nContent:\n{result.page_content}\n"
38
  for result in reranked_results
39
+ ])
 
 
 
 
40
  return results
41
 
42
  interface = gr.Interface(