# -*- coding: utf-8 -*-
"""app.py.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1zk7xuWSf7ii7zowOqNVLy0FwXYVHYE2V

Gradio app: extract the abstract from an article PDF, summarize it with a
Hugging Face summarization pipeline, and speak the summary with Bark.
"""

# https://huggingface.co/spaces/eHemink/assessment3_part2

# Here are the imports
import functools
import re

import PyPDF2
import transformers
import scipy
from transformers import pipeline
from bark import SAMPLE_RATE, generate_audio, preload_models
from scipy.io.wavfile import write as write_wav
from IPython.display import Audio
import gradio as gr

# Model identifiers used by the two pipelines below.
SUMMARIZATION_MODEL = "lidiya/bart-large-xsum-samsum"
TTS_MODEL = "suno/bark"

# The "Abstract" heading is matched case-insensitively; the "Introduction"
# heading is matched case-sensitively so that a lowercase "introduction"
# inside the abstract's own prose does not cut the abstract short.
_ABSTRACT_HEADING = re.compile(r'\bAbstract\b', re.IGNORECASE)
_INTRODUCTION_HEADING = re.compile(r'\bIntroduction\b')


# Here is the code
def _abstract_from_page(text):
    """Return the text following the "Abstract" heading on one page.

    The result stops at the next "Introduction" heading when there is one,
    otherwise it runs to the end of the page text. Returns ``None`` when the
    page contains no "Abstract" heading at all.
    """
    heading = _ABSTRACT_HEADING.search(text)
    if heading is None:
        return None
    body = text[heading.end():]
    next_section = _INTRODUCTION_HEADING.search(body)
    if next_section is not None:
        body = body[:next_section.start()]
    return body


def extract_abstract(pdf_file):
    """Extract the abstract text from the article PDF at path *pdf_file*.

    Pages are scanned in order and the first page containing an "Abstract"
    heading wins. Returns the stripped abstract, or an empty string when no
    page contains the heading.
    """
    # Open the PDF file in read-binary mode
    with open(pdf_file, 'rb') as file:
        pdf_reader = PyPDF2.PdfReader(file)
        for page in pdf_reader.pages:
            # Guard against pages that yield no text (e.g. scanned images).
            text = page.extract_text() or ''
            abstract_text = _abstract_from_page(text)
            if abstract_text is not None:
                return abstract_text.strip()
    return ''


@functools.lru_cache(maxsize=None)
def _summarizer():
    """Build the summarization pipeline once and reuse it across requests."""
    return pipeline(task="summarization", model=SUMMARIZATION_MODEL)


@functools.lru_cache(maxsize=None)
def _speech_synthesizer():
    """Build the Bark text-to-speech pipeline once and reuse it."""
    return pipeline("text-to-speech", TTS_MODEL)


def abstract_to_audio(insert_pdf):
    """Turn the abstract of an uploaded PDF into a spoken one-line summary.

    Args:
        insert_pdf: The value Gradio passes for the ``'file'`` input. This is
            treated as either a filesystem path or an object exposing the
            path as ``.name`` (NOTE(review): which one arrives depends on the
            installed Gradio version -- confirm against the deployed one).

    Returns:
        A ``(sampling_rate, audio_array)`` tuple for the ``'audio'`` output.

    Raises:
        gr.Error: If no file was uploaded or no abstract could be found.
    """
    if insert_pdf is None:
        raise gr.Error("Please upload a PDF file.")

    # Accept both a plain path and a temp-file wrapper carrying `.name`.
    pdf_path = getattr(insert_pdf, "name", insert_pdf)

    abstract = extract_abstract(pdf_path)
    if not abstract:
        # Summarizing an empty string is meaningless; tell the user instead.
        raise gr.Error("No abstract could be found in the uploaded PDF.")

    # Summarizing the extracted abstract. Truncate over-long input (e.g. when
    # no "Introduction" heading was found and the rest of the page was taken)
    # so it fits the model's maximum input length.
    summarized = _summarizer()(abstract, truncation=True)
    tss_prompt = summarized[0]['summary_text']
    print(tss_prompt)

    # Generate audio that speaks the generated sentence using Bark.
    speech = _speech_synthesizer()(tss_prompt, forward_params={"do_sample": True})

    # NOTE(review): the pipeline appears to return mono audio with a leading
    # channel axis, i.e. shape (1, samples); squeeze it to (samples,) so it
    # is not interpreted as one sample with many channels -- confirm.
    audio = speech["audio"].squeeze()
    return speech["sampling_rate"], audio


my_app = gr.Interface(
    fn=abstract_to_audio,
    inputs='file',
    outputs='audio',
    title="PDF Abstract Summarizer",
    description="Extracts abstracts from PDFs and generates audio summaries. This app only accepts PDFs with abstracts.",
)

if __name__ == "__main__":
    my_app.launch()