File size: 3,891 Bytes
6a3c13a
 
 
 
82d9634
 
6a3c13a
 
 
 
 
e6dc9f0
6a3c13a
 
e6dc9f0
6a3c13a
108bb17
e6dc9f0
108bb17
6a3c13a
 
3e243df
e6dc9f0
 
 
108bb17
566eb82
82d9634
e6dc9f0
 
566eb82
e6dc9f0
 
 
 
566eb82
388ab15
 
 
 
 
e6dc9f0
 
 
 
 
 
108bb17
e6dc9f0
 
3e243df
566eb82
 
 
 
82d9634
 
 
 
 
 
 
 
 
 
e6dc9f0
108bb17
566eb82
108bb17
e6dc9f0
91855c2
566eb82
388ab15
 
 
 
82d9634
 
91855c2
108bb17
388ab15
108bb17
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
import warnings

from langchain_core._api import LangChainDeprecationWarning

import translator

# Install the warning filters before the remaining third-party imports run.
# NOTE(review): presumably this keeps import-time deprecation/user warnings
# from the libraries below out of the logs — keep this ordering.
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=LangChainDeprecationWarning)

from langchain_community.document_loaders import PyMuPDFLoader
from langchain.text_splitter import SentenceTransformersTokenTextSplitter
from langchain_community.embeddings import SentenceTransformerEmbeddings
from langchain_community.retrievers import BM25Retriever
from langchain_community.vectorstores.utils import DistanceStrategy
from  langchain_community.vectorstores import FAISS
import gradio as gr
import re

# Startup marker: reaching this line means every import above succeeded.
print('All imports are successful')

# Name of the sentence-transformers model; used both for the embeddings
# below and for token-based chunking in handle_files_and_query.
model = "msmarco-distilbert-base-tas-b"

# Embedding function used for the FAISS re-ranking step.
embeddings = SentenceTransformerEmbeddings(model_name=model)

# Index cache shared across calls of handle_files_and_query: what the
# current index was built from, and the BM25 retriever over it.
# Both stay None until the first successful upload.
prev_files = retriever = None


def handle_files_and_query(query, files, chunk_overlap=50, token_per_chunk=256, bm_25_answers=200, translate_to_ru=False):
    """Index the uploaded PDFs (when needed) and run a two-stage search.

    Stage one retrieves up to ``bm_25_answers`` chunks with BM25; stage two
    re-ranks those chunks with a cosine-distance FAISS index built from the
    module-level ``embeddings`` and keeps the 25 closest.

    The BM25 index is cached in the module globals ``retriever`` and
    ``prev_files``. It is rebuilt whenever the uploaded files or the
    chunking settings change.

    Args:
        query: Search text. When empty or None no search is performed.
        files: Uploaded files, or None / empty when nothing was uploaded.
            Each entry may be an object exposing ``.name`` (a path) or a
            plain path string.
        chunk_overlap: Token overlap between consecutive chunks.
        token_per_chunk: Maximum number of tokens per chunk.
        bm_25_answers: Number of BM25 candidates passed on to re-ranking.
        translate_to_ru: When true, each result's content is passed through
            ``translator.translate(..., 'russian')``.

    Returns:
        The formatted search results when a query was given; otherwise a
        status message about index creation (possibly an empty string).
        A human-readable message is returned instead of raising when there
        is nothing to search.
    """
    global prev_files, retriever
    results = ""

    # Slider widgets may deliver floats; the splitter and retriever work
    # with whole numbers of tokens / results.
    chunk_overlap = int(chunk_overlap)
    token_per_chunk = int(token_per_chunk)
    bm_25_answers = int(bm_25_answers)

    # Nothing uploaded arrives as None (or an empty list). Resolve paths
    # only after that check so a missing upload cannot raise.
    paths = [getattr(f, "name", f) for f in files] if files else None

    if paths is None:
        print("No files uploaded.")
    else:
        # prev_files holds the full cache key, not just the paths, so that
        # changing the chunking settings also invalidates the index.
        index_key = (paths, chunk_overlap, token_per_chunk)
        if index_key != prev_files:
            # One splitter is enough for all files; building it per file
            # would repeat its model setup for every upload.
            splitter = SentenceTransformersTokenTextSplitter(model_name=model,
                                                             chunk_overlap=chunk_overlap,
                                                             tokens_per_chunk=token_per_chunk)
            documents = []
            for path in paths:
                documents.extend(PyMuPDFLoader(path).load_and_split(splitter))
            if not documents:
                # E.g. image-only PDFs: there is no text to index or search.
                print("No text could be extracted from the uploaded files.")
                return "No text could be extracted from the uploaded files."
            retriever = BM25Retriever.from_documents(documents, k=bm_25_answers)
            # Record the key only after a successful build, so a failed
            # build is retried instead of being treated as cached.
            prev_files = index_key
            results += "Index created successfully!\n"
            print("Index created successfully!")
        else:
            print("Reusing index since no files changed.")

    print(f"Query: {query}")
    if not query:
        return results
    if retriever is None:
        # A query was entered before any index exists.
        return "Please upload at least one PDF file before searching."

    # Apply the current candidate count even when the index is reused.
    retriever.k = bm_25_answers
    search_results = retriever.get_relevant_documents(query)
    reranked_results = FAISS.from_documents(
        search_results, embeddings, distance_strategy=DistanceStrategy.COSINE
    ).similarity_search(query, k=25)

    # Last path component, accepting both '/' and '\\' separators.
    filename_pattern = re.compile(r'[^\\/]+$')
    formatted = []
    for result in reranked_results:
        file_path = result.metadata['file_path']
        match = filename_pattern.search(file_path)
        # Fall back to the raw path if it has no trailing file name.
        source = match.group(0) if match else file_path
        page = result.metadata['page']
        content = result.page_content
        if translate_to_ru:
            content = translator.translate(content, 'russian')
        formatted.append(f"Source: {source}\nPage: {page}\nContent:\n{content}\n")
    # Search results replace any index status message, as before.
    return "\n".join(formatted)


# Gradio UI. The input components are passed positionally to
# handle_files_and_query, so their order must match its parameters:
# query, files, chunk_overlap, token_per_chunk, bm_25_answers, translate_to_ru.
interface = gr.Interface(
    fn=handle_files_and_query,
    inputs=[
        gr.Textbox(lines=1, label="Enter your search query here..."),
        gr.File(file_count="multiple", type="file", file_types=[".pdf"], label="Upload a file here."),
        # step=1 on every slider: the values feed integer-only settings
        # (token counts and a result count), so fractional positions must
        # not be selectable.
        gr.Slider(minimum=1, maximum=100, value=50, step=1, label="Chunk Overlap"),
        gr.Slider(minimum=64, maximum=512, value=256, step=1, label="Tokens Per Chunk (чем больше - тем бОльшие куски книги "
                                                                    "сможем находить)"),
        gr.Slider(minimum=1, maximum=1000, value=200, step=1, label="BM25 Answers (чем больше - тем больше будем учитывать неявные смысловые сравнения слов)"),
        gr.Checkbox(label="Translate to Russian", value=False),
    ],
    outputs="text",
    title="Similarity Search for eksmo books"
)

# Start the web server as soon as the script is executed.
interface.launch()