freQuensy23 committed on
Commit
388ab15
1 Parent(s): 566eb82

[IMP] additional parameters

Browse files
Files changed (1) hide show
  1. main.py +12 -5
main.py CHANGED
@@ -13,7 +13,7 @@ prev_files = None
13
  retriever = None
14
 
15
 
16
- def handle_files_and_query(query, files):
17
  results = ""
18
  global prev_files, retriever
19
  files = [f.name for f in files]
@@ -22,8 +22,11 @@ def handle_files_and_query(query, files):
22
  prev_files = files
23
  for file in files:
24
  documents.extend(
25
- PyMuPDFLoader(file).load_and_split(SentenceTransformersTokenTextSplitter(model_name=model)))
26
- retriever = BM25Retriever.from_documents(documents, k=100)
 
 
 
27
  results += "Index created successfully!\n"
28
  print("Index created successfully!")
29
  elif files is None:
@@ -49,10 +52,14 @@ interface = gr.Interface(
49
  fn=handle_files_and_query,
50
  inputs=[
51
  gr.Textbox(lines=1, label="Enter your search query here..."),
52
- gr.File(file_count="multiple", type="file", file_types=[".pdf"], label="Upload a file here.")
 
 
 
 
53
  ],
54
  outputs="text",
55
- title="Similarity Search for PDFs"
56
  )
57
 
58
  interface.launch()
 
13
  retriever = None
14
 
15
 
16
+ def handle_files_and_query(query, files, chunk_overlap=50, token_per_chunk=256, bm_25_answers=200):
17
  results = ""
18
  global prev_files, retriever
19
  files = [f.name for f in files]
 
22
  prev_files = files
23
  for file in files:
24
  documents.extend(
25
+ PyMuPDFLoader(file).
26
+ load_and_split(SentenceTransformersTokenTextSplitter(model_name=model,
27
+ chunk_overlap=chunk_overlap,
28
+ tokens_per_chunk=token_per_chunk)))
29
+ retriever = BM25Retriever.from_documents(documents, k=bm_25_answers)
30
  results += "Index created successfully!\n"
31
  print("Index created successfully!")
32
  elif files is None:
 
52
  fn=handle_files_and_query,
53
  inputs=[
54
  gr.Textbox(lines=1, label="Enter your search query here..."),
55
+ gr.File(file_count="multiple", type="file", file_types=[".pdf"], label="Upload a file here."),
56
+ gr.Slider(minimum=1, maximum=100, value=50, label="Chunk Overlap"),
57
+ gr.Slider(minimum=64, maximum=512, value=256, label="Tokens Per Chunk (чем больше - тем бОльшие куски книги "
58
+ "сможем находить)"),
59
+ gr.Slider(minimum=1, maximum=1000, value=200, label="BM25 Answers (чем больше - тем больше будем учитывать неявные смысловые сравнения слов)")
60
  ],
61
  outputs="text",
62
+ title="Similarity Search for eksmo books"
63
  )
64
 
65
  interface.launch()