# kevin-pek — sbert gradio interface (Hugging Face file-viewer header; commit 91855c2)
from haystack.nodes import PreProcessor, PDFToTextConverter, EmbeddingRetriever, TransformersReader
from haystack.document_stores import InMemoryDocumentStore
from haystack.pipelines import DocumentSearchPipeline, ExtractiveQAPipeline
import gradio as gr
# --- Module-level pipeline components (shared by every request) ---

# Splits converted PDF text into ~100-word chunks (3-word overlap),
# cleaning whitespace/headers and respecting sentence boundaries.
preprocessor = PreProcessor(
    clean_empty_lines=True,
    clean_whitespace=True,
    clean_header_footer=True,
    split_by="word",
    split_length=100,
    split_respect_sentence_boundary=True,
    split_overlap=3
)

# embedding_dim=384 matches the output size of all-MiniLM-L6-v2.
document_store = InMemoryDocumentStore(embedding_dim=384)

# NOTE(review): all-MiniLM-L6-v2 is a sentence-embedding model, not a
# span-prediction QA model — confirm TransformersReader works with it.
reader = TransformersReader("sentence-transformers/all-MiniLM-L6-v2")

# Dense retriever; embeds queries/documents with the same MiniLM model.
retriever = EmbeddingRetriever(document_store=document_store, embedding_model="sentence-transformers/all-MiniLM-L6-v2")

# Retrieve candidate chunks, then extract answer spans from them.
pipeline = ExtractiveQAPipeline(reader, retriever)

# PDF -> text converter used when indexing uploads.
converter = PDFToTextConverter(remove_numeric_tables=True)
def print_answers(results):
    """Project each answer in a pipeline result down to the reported fields.

    Args:
        results: Pipeline output dict containing an ``"answers"`` list of
            objects with ``answer`` and ``score`` attributes.

    Returns:
        A list of dicts, one per answer, keeping only the fields in
        ``fields`` whose values are present and not ``None``.
    """
    fields = ["answer", "score"]  # add "context" here to surface snippets
    filtered_answers = []
    for ans in results["answers"]:
        # Single getattr with a default: avoids the original's double
        # lookup and no longer raises AttributeError when an answer
        # object lacks one of the fields.
        filtered_ans = {}
        for field in fields:
            value = getattr(ans, field, None)
            if value is not None:
                filtered_ans[field] = value
        filtered_answers.append(filtered_ans)
    return filtered_answers
def write_pdf(pdf_file):
    """Convert an uploaded PDF, chunk it, and index it with embeddings.

    Args:
        pdf_file: Uploaded file object exposing a ``.name`` path on disk.

    Side effects: writes chunks into the module-level ``document_store``
    and recomputes embeddings via the module-level ``retriever``.
    """
    # convert() returns a list of documents; this file holds one PDF.
    doc = converter.convert(file_path=pdf_file.name, meta=None)[0]
    chunks = preprocessor.process(doc)
    document_store.write_documents(chunks)
    document_store.update_embeddings(retriever)
def predict(question, pdf_file):
    """Index the uploaded PDF, then answer the question against it.

    Args:
        question: Free-text query string from the UI.
        pdf_file: Uploaded PDF file object.

    Returns:
        List of answer dicts as produced by ``print_answers``.
    """
    write_pdf(pdf_file)
    run_result = pipeline.run(query=question, params={"Retriever": {"top_k": 2}})
    return print_answers(run_result)
# Gradio UI: one text query plus one PDF upload, plain-text answer output.
interface = gr.Interface(
    fn=predict,
    inputs=[
        gr.components.Textbox(lines = 1, label="Enter your search query here..."),
        gr.components.File(file_count="single", type="file", label="Upload a file here.")
    ],
    outputs="text",
    title="Search",
    interpretation=None,  # no input-attribution panel
    theme="default"  # "default", "huggingface", "dark-grass", "peach"
)
# Blocks here serving the app; re-indexes the PDF on every query.
interface.launch()