Spaces:
Runtime error
Runtime error
from haystack.nodes import PreProcessor, PDFToTextConverter, EmbeddingRetriever | |
from haystack.document_stores import InMemoryDocumentStore | |
from haystack.pipelines import DocumentSearchPipeline | |
import gradio as gr | |
preprocessor = PreProcessor( | |
clean_empty_lines=True, | |
clean_whitespace=True, | |
clean_header_footer=True, | |
split_by="word", | |
split_length=100, | |
split_respect_sentence_boundary=True, | |
split_overlap=3 | |
) | |
document_store = InMemoryDocumentStore() | |
retriever = EmbeddingRetriever(document_store=document_store, embedding_model="sentence-transformers/all-MiniLM-L6-v2") | |
pipeline = DocumentSearchPipeline(retriever) | |
def print_answers(results): | |
fields = ["answer", "score"] # "context" | |
answers = results["answers"] | |
filtered_answers = [] | |
for ans in answers: | |
filtered_ans = { | |
field: getattr(ans, field) for field in fields if getattr(ans, field) is not None | |
} | |
filtered_answers.append(filtered_ans) | |
return filtered_answers | |
def write_pdf(pdf_file): | |
converter = PDFToTextConverter(remove_numeric_tables=True, valid_languages=["en"]) | |
document = converter.convert(file_path=pdf_file.name, meta=None)[0] | |
preprocessed_docs = preprocessor.process(document) | |
document_store.write_documents(preprocessed_docs) | |
def predict(question, pdf_file): | |
print("Start processing pdf") | |
write_pdf(pdf_file) | |
print("Processing done.") | |
result = pipeline.run(query=question, params={"Retriever": { "top_k": 2 }}) | |
answers = print_answers(result) | |
return answers | |
title = "Search" | |
interface = gr.Interface( | |
fn=predict, | |
inputs=[gr.components.Textbox(lines = 3, label="Ask an open question!"),gr.components.File(file_count="single", type="file", label="Upload a pdf")], | |
outputs="text", | |
title=title, | |
flagging_options=["top", "medium", "bad"], | |
interpretation="default", | |
theme="default" # “default", “huggingface", “dark-grass", “peach" | |
) | |
interface.launch() | |