File size: 2,044 Bytes
91855c2
108bb17
91855c2
108bb17
 
 
 
 
 
 
 
 
 
 
91855c2
 
108bb17
91855c2
 
108bb17
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
91855c2
108bb17
 
 
 
 
 
 
 
 
91855c2
 
 
 
108bb17
91855c2
 
108bb17
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
from haystack.nodes import PreProcessor, PDFToTextConverter, EmbeddingRetriever, TransformersReader
from haystack.document_stores import InMemoryDocumentStore
from haystack.pipelines import DocumentSearchPipeline, ExtractiveQAPipeline
import gradio as gr

# Checkpoint used to embed both the document chunks and the incoming query.
EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
# Checkpoint used by the extractive reader. It has to be a model fine-tuned
# for question answering: a plain sentence-embedding checkpoint has no QA head,
# so loading one as a reader yields randomly initialised span predictions.
READER_MODEL = "deepset/minilm-uncased-squad2"

# Cleans converted text and splits it into chunks of roughly 100 words with a
# 3-word overlap, keeping sentences intact.
preprocessor = PreProcessor(
    clean_empty_lines=True,
    clean_whitespace=True,
    clean_header_footer=True,
    split_by="word",
    split_length=100,
    split_respect_sentence_boundary=True,
    split_overlap=3,
)
# embedding_dim has to equal the vector size produced by EMBEDDING_MODEL
# (384 for all-MiniLM-L6-v2).
document_store = InMemoryDocumentStore(embedding_dim=384)
reader = TransformersReader(READER_MODEL)
retriever = EmbeddingRetriever(document_store=document_store, embedding_model=EMBEDDING_MODEL)
# Retriever narrows the store down to candidate chunks; reader extracts answers from them.
pipeline = ExtractiveQAPipeline(reader, retriever)
converter = PDFToTextConverter(remove_numeric_tables=True)

def print_answers(results):
    """Reduce the pipeline output to a list of plain answer dictionaries.

    Args:
        results: Mapping with an "answers" entry holding objects that expose
            ``answer`` and ``score`` attributes.

    Returns:
        One dict per answer, in the original order, containing the "answer"
        and "score" attributes whose value is not None.
    """
    wanted = ("answer", "score")  # "context"
    return [
        {
            name: getattr(candidate, name)
            for name in wanted
            if getattr(candidate, name) is not None
        }
        for candidate in results["answers"]
    ]

def write_pdf(pdf_file):
    """Convert an uploaded PDF, split it into chunks, and index the chunks.

    Args:
        pdf_file: Uploaded-file object whose ``name`` attribute is the path
            of the PDF on disk.
    """
    document = converter.convert(file_path=pdf_file.name, meta=None)[0]
    # Pass a list: handing PreProcessor.process a bare document is deprecated.
    preprocessed_docs = preprocessor.process([document])
    document_store.write_documents(preprocessed_docs)
    # Embed only chunks that have no vector yet instead of re-embedding the
    # whole store on every call.
    document_store.update_embeddings(retriever, update_existing_embeddings=False)

def predict(question, pdf_file):
    """Index the uploaded PDF and answer ``question`` with the QA pipeline.

    Args:
        question: Query string forwarded to the pipeline.
        pdf_file: Uploaded-file object handed to ``write_pdf``.

    Returns:
        The filtered answer dictionaries produced by ``print_answers``.
    """
    write_pdf(pdf_file)
    # The retriever passes its two best-matching chunks on to the reader.
    retriever_settings = {"top_k": 2}
    qa_output = pipeline.run(query=question, params={"Retriever": retriever_settings})
    return print_answers(qa_output)

# Input widgets: a one-line query box and a single-file upload.
query_box = gr.components.Textbox(lines=1, label="Enter your search query here...")
pdf_upload = gr.components.File(file_count="single", type="file", label="Upload a file here.")

# Web UI that passes the query and the uploaded file to predict() and shows
# its return value as text.
interface = gr.Interface(
    fn=predict,
    inputs=[query_box, pdf_upload],
    outputs="text",
    title="Search",
    interpretation=None,
    theme="default",  # other names listed here previously: "huggingface", "dark-grass", "peach"
)

interface.launch()