kevin-pek
gradio interface with haystack
108bb17
raw
history blame
1.98 kB
from haystack.nodes import PreProcessor, PDFToTextConverter, EmbeddingRetriever
from haystack.document_stores import InMemoryDocumentStore
from haystack.pipelines import DocumentSearchPipeline
import gradio as gr
preprocessor = PreProcessor(
clean_empty_lines=True,
clean_whitespace=True,
clean_header_footer=True,
split_by="word",
split_length=100,
split_respect_sentence_boundary=True,
split_overlap=3
)
document_store = InMemoryDocumentStore()
retriever = EmbeddingRetriever(document_store=document_store, embedding_model="sentence-transformers/all-MiniLM-L6-v2")
pipeline = DocumentSearchPipeline(retriever)
def print_answers(results):
fields = ["answer", "score"] # "context"
answers = results["answers"]
filtered_answers = []
for ans in answers:
filtered_ans = {
field: getattr(ans, field) for field in fields if getattr(ans, field) is not None
}
filtered_answers.append(filtered_ans)
return filtered_answers
def write_pdf(pdf_file):
converter = PDFToTextConverter(remove_numeric_tables=True, valid_languages=["en"])
document = converter.convert(file_path=pdf_file.name, meta=None)[0]
preprocessed_docs = preprocessor.process(document)
document_store.write_documents(preprocessed_docs)
def predict(question, pdf_file):
print("Start processing pdf")
write_pdf(pdf_file)
print("Processing done.")
result = pipeline.run(query=question, params={"Retriever": { "top_k": 2 }})
answers = print_answers(result)
return answers
title = "Search"
interface = gr.Interface(
fn=predict,
inputs=[gr.components.Textbox(lines = 3, label="Ask an open question!"),gr.components.File(file_count="single", type="file", label="Upload a pdf")],
outputs="text",
title=title,
flagging_options=["top", "medium", "bad"],
interpretation="default",
theme="default" # “default", “huggingface", “dark-grass", “peach"
)
interface.launch()