# Spaces: Runtime error
# (Hosting-page status header captured together with the source; not part of the program.)
import io
import os

import gradio as gr
import PyPDF2
import requests
import torch
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
# Download (on first run) and load the pre-trained extractive question-answering
# checkpoint and its matching tokenizer. Both are created once at import time
# and shared by every request handled by the app.
model_name = "distilbert-base-cased-distilled-squad"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForQuestionAnswering.from_pretrained(model_name)
# Questions offered in the UI dropdown; the selected one is passed verbatim to
# the question-answering model together with the text of the uploaded PDF.
predefined_questions = [
    "What is the purpose of this document?",
    "What is the main topic of the document?",
    "Who is the target audience?",
    "What is the author's main argument?",
    "What is the conclusion of the document?",
]
# Token budget for one model input (question + context window) and the overlap
# between consecutive context windows. Both are well below the 512-position
# limit of DistilBERT so that every window fits in a single forward pass.
_MAX_WINDOW_TOKENS = 384
_WINDOW_STRIDE = 128


def _read_pdf_bytes(pdf_file):
    """Return the raw bytes of an uploaded PDF, whatever form the upload takes.

    Accepts raw bytes, a filesystem path, or a file-like object.
    NOTE(review): which of these the UI framework passes depends on its
    version and on the File component's configuration -- confirm against the
    installed Gradio release.
    """
    if isinstance(pdf_file, (bytes, bytearray)):
        return bytes(pdf_file)
    if isinstance(pdf_file, (str, os.PathLike)):
        with open(pdf_file, "rb") as handle:
            return handle.read()

    # File-like upload: prefer re-opening by path so the result does not
    # depend on the handle's current position or open mode.
    path = getattr(pdf_file, "name", None)
    if isinstance(path, str) and os.path.isfile(path):
        with open(path, "rb") as handle:
            return handle.read()

    # Pure in-memory stream: rewind when possible, then read everything.
    if getattr(pdf_file, "seekable", lambda: False)():
        pdf_file.seek(0)
    return pdf_file.read()


def _extract_pdf_text(pdf_bytes):
    """Return the concatenated text of every page in the PDF given as bytes."""
    # PyPDF2 3.0 removed PdfFileReader / getPage / extractText in favour of
    # PdfReader / .pages / extract_text; fall back to the legacy names so
    # older installs keep working.
    reader_cls = getattr(PyPDF2, "PdfReader", None) or PyPDF2.PdfFileReader
    reader = reader_cls(io.BytesIO(pdf_bytes))

    page_texts = []
    for page in reader.pages:
        extract = getattr(page, "extract_text", None) or page.extractText
        page_texts.append(extract() or "")
    # Join with newlines so the last word of one page is not glued to the
    # first word of the next.
    return "\n".join(page_texts).strip()


def answer_questions(pdf_file, question):
    """Answer ``question`` using the text of the uploaded PDF.

    The document is split into overlapping token windows so that PDFs longer
    than the model's maximum input length are handled instead of crashing;
    the highest-scoring answer span across all windows is returned.

    Args:
        pdf_file: The uploaded PDF as bytes, a path, or a file-like object.
            ``None`` means nothing was uploaded.
        question: The question text to answer.

    Returns:
        The extracted answer string, or a short human-readable message when
        there is no upload, no question, no extractable text, or no answer.
    """
    if pdf_file is None:
        return "Please upload a PDF document."
    if not question:
        return "Please select a question."

    text = _extract_pdf_text(_read_pdf_bytes(pdf_file))
    if not text:
        return "No extractable text was found in this PDF."

    # Truncate only the context (second sequence) and emit the overflow as
    # additional windows that overlap by _WINDOW_STRIDE tokens.
    encoded = tokenizer(
        question,
        text,
        truncation="only_second",
        max_length=_MAX_WINDOW_TOKENS,
        stride=_WINDOW_STRIDE,
        return_overflowing_tokens=True,
    )
    windows = encoded["input_ids"]
    # A tokenizer that does not split into windows returns one flat id list;
    # normalise to a list of windows.
    if windows and isinstance(windows[0], int):
        windows = [windows]

    best_score = float("-inf")
    best_answer = ""
    # Inference only: skip autograd bookkeeping to save memory and time.
    with torch.no_grad():
        for window_ids in windows:
            outputs = model(torch.tensor([window_ids]), return_dict=True)
            start_logits = outputs.start_logits[0]
            end_logits = outputs.end_logits[0]
            answer_start = int(start_logits.argmax())
            answer_end = int(end_logits.argmax())
            if answer_end < answer_start:
                # Inverted span: this window holds no usable answer.
                continue
            score = float(start_logits[answer_start] + end_logits[answer_end])
            if score <= best_score:
                continue
            candidate = tokenizer.decode(
                window_ids[answer_start:answer_end + 1],
                skip_special_tokens=True,
            ).strip()
            if candidate:
                best_score = score
                best_answer = candidate

    return best_answer or "No answer could be found in the document."
# UI wiring: one file upload and one question dropdown in, one text box out.
# The components are taken from the top-level ``gr`` namespace because the
# ``gr.inputs`` / ``gr.outputs`` sub-modules were deprecated in Gradio 3 and
# removed in Gradio 4.
inputs = [
    gr.File(label="PDF document"),
    gr.Dropdown(label="Question", choices=predefined_questions),
]
outputs = gr.Textbox(label="Answer")

# Build the interface and start serving immediately, as the script did before.
gr.Interface(
    fn=answer_questions,
    inputs=inputs,
    outputs=outputs,
    title="PDF Question Answering Tool",
    description="Upload a PDF document and select a question from the dropdown. The app will use a pre-trained model to find the answer.",
).launch()