import re
import tempfile

import torch
import soundfile as sf
import gradio as gr
import sentencepiece  # noqa: F401 -- the SpeechT5 tokenizer needs it installed
from pdfminer.high_level import extract_pages
from pdfminer.layout import LTTextContainer, LTChar
from transformers import pipeline
from datasets import load_dataset
description = """**SpeechAbstractor**\n | |
This app enables users to upload academic articles in PDF format, specifically focusing on abstracts. | |
It efficiently summarizes the abstract and provides an audio playback of the summarized content. | |
Below are some example PDFs for you to experiment with. Feel free to explore the functionality of SpeechAbstractor!""" | |
examples = [ | |
["Article_7.pdf"],["Article_11.pdf"] | |
] | |
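# Processing pipeline: read_pdf -> clean_text -> extract_abstract -> summarization
# -> text-to-speech, all wired together by main_function below.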
# Functions created for part 1: layout-aware text extraction with pdfminer
def text_extraction(element):
    # full text of the pdfminer text container
    line_text = element.get_text()
    # collect the font name and size of every character, then deduplicate
    line_formats = []
    for text_line in element:
        if isinstance(text_line, LTTextContainer):
            for character in text_line:
                if isinstance(character, LTChar):
                    line_formats.append(character.fontname)
                    line_formats.append(character.size)
    format_per_line = list(set(line_formats))
    return (line_text, format_per_line)
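# Illustrative only: for a bold 11 pt heading line, text_extraction would return
# something like ("Abstract\n", ["Times-Bold", 11.0]); the exact font names and
# sizes depend on the PDF being parsed.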
def read_pdf(pdf_pathy):
    # map each page ("Page_0", "Page_1", ...) to its extracted text and formats
    text_per_pagy = {}
    for pagenum, page in enumerate(extract_pages(pdf_pathy)):
        print("Elaborating Page_" + str(pagenum))
        page_text = []
        line_format = []
        page_content = []
        # sort the page elements top-to-bottom by their vertical position (y1)
        page_elements = [(element.y1, element) for element in page._objs]
        page_elements.sort(key=lambda a: a[0], reverse=True)
        for i, component in enumerate(page_elements):
            pos = component[0]
            element = component[1]
            if isinstance(element, LTTextContainer):
                (line_text, format_per_line) = text_extraction(element)
                page_text.append(line_text)
                line_format.append(format_per_line)
                page_content.append(line_text)
        dctkey = 'Page_' + str(pagenum)
        text_per_pagy[dctkey] = [page_text, line_format, page_content]
    return text_per_pagy
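# The returned dictionary has one entry per page, e.g.
#   {"Page_0": [page_text, line_format, page_content], "Page_1": [...], ...}
# where page_text is the list of raw text lines, line_format the per-line font
# info, and page_content a copy of the text lines kept for later processing.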
def clean_text(text):
    # remove extra spaces: collapse any run of whitespace into a single space
    text = re.sub(r'\s+', ' ', text)
    return text.strip()
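# For example: clean_text("Deep   learning\nfor  NLP ") -> "Deep learning for NLP"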
def extract_abstract(text_per_pagy):
    abstract_text = ""
    for page_num, page_text in text_per_pagy.items():
        if page_text:
            # undo hyphenation introduced by line breaks
            page_text = page_text.replace("- ", "")
            # the abstract starts right after the "Abstract" heading ...
            start_index = page_text.find("Abstract")
            if start_index != -1:
                start_index += len("Abstract") + 1
                # ... and ends at the first of these section headings, if any is present
                end_markers = ["Introduction", "Summary", "Overview", "Background", "Contents"]
                end_index = -1
                for marker in end_markers:
                    temp_index = page_text.find(marker, start_index)
                    if temp_index != -1:
                        end_index = temp_index
                        break
                if end_index == -1:
                    end_index = len(page_text)
                abstract = page_text[start_index:end_index].strip()
                abstract_text += " " + abstract
                # stop at the first page that contains an abstract
                break
    return abstract_text
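# Illustrative only: given a page whose cleaned text reads
#   "... Abstract We propose a new model ... 1 Introduction Recent work ..."
# extract_abstract returns the text between the "Abstract" heading and the first
# end marker found (here "Introduction").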
# Main function: takes the uploaded PDF and returns the summary plus its audio
def main_function(uploaded_filepath):
    # make sure a file was actually uploaded
    if uploaded_filepath is None:
        return "No file loaded", None

    # read and process the file page by page with read_pdf
    text_per_pagy = read_pdf(uploaded_filepath)

    # clean each page's text, then pull out the abstract with the two helpers above
    for key, value in text_per_pagy.items():
        cleaned_text = clean_text(' '.join(value[0]))
        text_per_pagy[key] = cleaned_text
    abstract_text = extract_abstract(text_per_pagy)

    # summarize the abstract with the chosen pipeline and model, capping the length
    summarizer = pipeline("summarization", model="pszemraj/long-t5-tglobal-base-sci-simplify")
    summary = summarizer(abstract_text, max_length=65, do_sample=False)[0]['summary_text']

    # generate speech from the summary, using a fixed speaker embedding
    synthesiser = pipeline("text-to-speech", model="microsoft/speecht5_tts")
    embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
    speaker_embedding = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
    speech = synthesiser(summary, forward_params={"speaker_embeddings": speaker_embedding})

    # save the audio to a temporary .wav file that Gradio can serve
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        audio_file_path = tmp.name
    sf.write(audio_file_path, speech["audio"], samplerate=speech["sampling_rate"])

    # the function returns the two pieces the interface needs
    return summary, audio_file_path
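# Quick local sanity check (illustrative; assumes one of the bundled example PDFs,
# e.g. "Article_7.pdf", sits next to this script):
#   summary, audio_path = main_function("Article_7.pdf")
#   print(summary)
#   print(audio_path)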
#let's communicate with gradio what it has to put in | |
iface = gr.Interface( | |
fn=main_function, | |
inputs=gr.File(type="filepath"), | |
outputs=[gr.Textbox(label="Summary Text"), gr.Audio(label="Summary Audio", type="filepath")], | |
description=description, | |
examples=examples | |
) | |
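# Note: gr.File(type="filepath") hands main_function the uploaded file's path as a
# string, which is what read_pdf expects.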
# Launch the app
if __name__ == "__main__":
    iface.launch()