Spaces:
Runtime error
Runtime error
File size: 3,891 Bytes
6a3c13a 82d9634 6a3c13a e6dc9f0 6a3c13a e6dc9f0 6a3c13a 108bb17 e6dc9f0 108bb17 6a3c13a 3e243df e6dc9f0 108bb17 566eb82 82d9634 e6dc9f0 566eb82 e6dc9f0 566eb82 388ab15 e6dc9f0 108bb17 e6dc9f0 3e243df 566eb82 82d9634 e6dc9f0 108bb17 566eb82 108bb17 e6dc9f0 91855c2 566eb82 388ab15 82d9634 91855c2 108bb17 388ab15 108bb17 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 |
import warnings
from langchain_core._api import LangChainDeprecationWarning
import translator
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=LangChainDeprecationWarning)
from langchain_community.document_loaders import PyMuPDFLoader
from langchain.text_splitter import SentenceTransformersTokenTextSplitter
from langchain_community.embeddings import SentenceTransformerEmbeddings
from langchain_community.retrievers import BM25Retriever
from langchain_community.vectorstores.utils import DistanceStrategy
from langchain_community.vectorstores import FAISS
import gradio as gr
import re
print('All imports are successful')
# Name of the sentence-transformers model. It is used for the embeddings below
# and for the token text splitter inside handle_files_and_query.
model = "msmarco-distilbert-base-tas-b"
# Embedding function handed to FAISS when re-ranking the BM25 candidates.
embeddings = SentenceTransformerEmbeddings(model_name=model)
# Module-level cache maintained by handle_files_and_query (declared `global`
# there): `prev_files` remembers what the current index was built from and
# `retriever` holds the BM25 retriever; both stay None until the first build.
prev_files = None
retriever = None
def _uploaded_paths(files):
    """Return the uploaded files as a list of path strings.

    Accepts ``None``/an empty list (-> ``[]``), objects exposing a ``.name``
    path attribute, or plain path strings.
    """
    if not files:
        return []
    return [str(getattr(f, "name", f)) for f in files]


def _format_result(result, translate_to_ru):
    """Render one retrieved document as a ``Source / Page / Content`` text block."""
    metadata = result.metadata
    file_path = metadata['file_path']
    # Keep only the last path component; accepts both "/" and "\" separators.
    match = re.search(r'[^\\/]+$', file_path)
    source = match.group(0) if match else file_path
    content = result.page_content
    if translate_to_ru:
        content = translator.translate(content, 'russian')
    return f"Source: {source}\nPage: {metadata['page']}\nContent:\n{content}\n"


def handle_files_and_query(query, files, chunk_overlap=50, token_per_chunk=256, bm_25_answers=200, translate_to_ru=False):
    """Index the uploaded PDFs (when needed) and answer a search query.

    The PDFs are split into token chunks and indexed with BM25. For a
    non-empty query the BM25 candidates are re-ranked with a FAISS cosine
    similarity search over sentence-transformer embeddings and the top 25 are
    returned as text.

    Args:
        query: Search string; when empty only the indexing step runs.
        files: Uploaded files (objects with ``.name`` or path strings), or
            ``None``/empty when nothing was uploaded.
        chunk_overlap: Token overlap between neighbouring chunks.
        token_per_chunk: Maximum number of tokens per chunk.
        bm_25_answers: Number of BM25 candidates passed to the re-ranker.
        translate_to_ru: Translate each result's content to Russian.

    Returns:
        The formatted results for a query; otherwise a short status message
        (or an empty string when the existing index was simply reused).
    """
    global prev_files, retriever

    paths = _uploaded_paths(files)
    if not paths:
        # Previously this case crashed on `f.name for f in None`.
        print("No files uploaded.")
        return "No files uploaded."

    # UI sliders may deliver floats; the splitter and retriever expect ints.
    chunk_overlap = int(chunk_overlap)
    token_per_chunk = int(token_per_chunk)
    bm_25_answers = int(bm_25_answers)

    results = ""
    # The index depends on the chunking/retrieval settings as well as on the
    # files, so all of them are part of the cache key kept in `prev_files`.
    index_key = (tuple(paths), chunk_overlap, token_per_chunk, bm_25_answers)
    if retriever is None or index_key != prev_files:
        # Build the splitter once instead of once per file.
        splitter = SentenceTransformersTokenTextSplitter(model_name=model,
                                                         chunk_overlap=chunk_overlap,
                                                         tokens_per_chunk=token_per_chunk)
        documents = []
        for path in paths:
            documents.extend(PyMuPDFLoader(path).load_and_split(splitter))
        if not documents:
            # BM25 cannot be built over an empty corpus.
            print("No text could be extracted from the uploaded files.")
            return "No text could be extracted from the uploaded files."
        retriever = BM25Retriever.from_documents(documents, k=bm_25_answers)
        # Record the key only after a successful build so that a failed build
        # is retried on the next call instead of being treated as cached.
        prev_files = index_key
        results += "Index created successfully!\n"
        print("Index created successfully!")
    else:
        print("Reusing index since no files changed.")

    print(f"Query: {query}")
    if not query:
        return results

    search_results = retriever.get_relevant_documents(query)
    reranked_results = FAISS.from_documents(
        search_results, embeddings, distance_strategy=DistanceStrategy.COSINE
    ).similarity_search(query, k=25)
    return "\n".join(_format_result(result, translate_to_ru) for result in reranked_results)
# Gradio UI. The inputs are listed in the same order as the positional
# parameters of handle_files_and_query (query, files, chunk_overlap,
# token_per_chunk, bm_25_answers, translate_to_ru).
interface = gr.Interface(
    fn=handle_files_and_query,
    inputs=[
        gr.Textbox(lines=1, label="Enter your search query here..."),
        # `type` is intentionally left at the library default: Gradio 4 rejects
        # type="file" with a ValueError at construction time, while on Gradio 3
        # "file" already is the default, so omitting it works on both.
        gr.File(file_count="multiple", file_types=[".pdf"], label="Upload a file here."),
        gr.Slider(minimum=1, maximum=100, value=50, label="Chunk Overlap"),
        gr.Slider(minimum=64, maximum=512, value=256, label="Tokens Per Chunk (чем больше - тем бОльшие куски книги "
                                                            "сможем находить)"),
        gr.Slider(minimum=1, maximum=1000, value=200, label="BM25 Answers (чем больше - тем больше будем учитывать неявные смысловые сравнения слов)"),
        gr.Checkbox(label="Translate to Russian", value=False),
    ],
    outputs="text",
    title="Similarity Search for eksmo books"
)
# Start the web server for the app.
interface.launch()
|