Spaces:
Runtime error
Runtime error
File size: 3,279 Bytes
72abb26 3ba7fb6 72abb26 8da2e1b 72abb26 be2266d 72abb26 5b59b5e 6bf30eb b54ebb1 72abb26 3ba7fb6 72abb26 5b59b5e 72abb26 5b59b5e be2266d 5b59b5e 72abb26 9cbc782 72abb26 b54ebb1 eec83af cf97f39 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 |
# -*- coding: utf-8 -*-
"""app.py.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1zk7xuWSf7ii7zowOqNVLy0FwXYVHYE2V
"""
# https://huggingface.co/spaces/eHemink/assessment3_part2
# Here are the imports
import PyPDF2
import re
#import numpy as np
import transformers
import scipy
from transformers import pipeline
#from bark import SAMPLE_RATE, generate_audio, preload_models
#from scipy.io.wavfile import write as write_wav
#from IPython.display import Audio
import gradio as gr
# Here is the code
# Pipelines already built in this process, keyed by (task, model name).
# Building one loads the model weights, so it is done at most once per
# process instead of on every request.
_PIPELINE_CACHE = {}


def _get_pipeline(task, model):
    """Return the transformers pipeline for *task*/*model*, creating it on first use."""
    key = (task, model)
    if key not in _PIPELINE_CACHE:
        _PIPELINE_CACHE[key] = pipeline(task=task, model=model)
    return _PIPELINE_CACHE[key]


def _resolve_pdf_path(uploaded):
    """Return a filesystem path for the value received from the file input.

    Strings, bytes and path-like objects are returned unchanged. Any other
    object is assumed to be an upload wrapper that exposes the path of its
    temporary file as ``.name`` (NOTE(review): this is how Gradio's file
    input is understood to behave in some versions -- confirm against the
    installed version).
    """
    if isinstance(uploaded, (str, bytes)) or hasattr(uploaded, "__fspath__"):
        return uploaded
    return getattr(uploaded, "name", uploaded)


def _locate_abstract(page_text):
    """Return the abstract contained in one page's text.

    The abstract is everything after the first "Abstract" heading
    (case-insensitive) up to the next "Introduction" heading, or up to the
    end of the page when no such heading follows. Returns ``None`` when
    the page has no "Abstract" heading at all.
    """
    heading = re.search(r'\bAbstract\b', page_text, re.IGNORECASE)
    if heading is None:
        return None
    start = heading.end()
    next_section = re.search(r'\bIntroduction\b', page_text[start:])
    if next_section is None:
        return page_text[start:]
    return page_text[start:start + next_section.start()]


def _extract_abstract(pdf_path):
    """Return the stripped abstract text of the PDF at *pdf_path*.

    Pages are scanned in order and the search stops at the first page that
    contains an "Abstract" heading. Returns an empty string when no page
    has one.
    """
    with open(pdf_path, 'rb') as file:
        pdf_reader = PyPDF2.PdfReader(file)
        for page in pdf_reader.pages:
            # Guard against pages that yield no text (e.g. scanned images).
            page_text = page.extract_text() or ''
            abstract_text = _locate_abstract(page_text)
            if abstract_text is not None:
                return abstract_text.strip()
    return ''


def abstract_to_audio(insert_pdf):
    """Summarize a PDF's abstract and return the summary as spoken audio.

    Args:
        insert_pdf: The uploaded PDF, either as a path or as an upload
            object whose ``.name`` attribute is the path.

    Returns:
        A ``(sampling_rate, audio)`` tuple taken from the text-to-speech
        pipeline output, which is the form the 'audio' output consumes.

    Raises:
        gr.Error: If no file was supplied or no abstract text was found.
        FileNotFoundError: If the resolved path does not exist.
    """
    if insert_pdf is None:
        raise gr.Error("Please upload a PDF file.")

    abstract = _extract_abstract(_resolve_pdf_path(insert_pdf))
    if not abstract:
        # Summarizing an empty string would only produce noise or an error.
        raise gr.Error("No abstract could be found in this PDF.")

    # Summarize the extracted abstract. truncation=True keeps over-long
    # text (e.g. when no "Introduction" heading bounded the abstract)
    # within the model's input limit.
    summarizer = _get_pipeline("summarization", "lidiya/bart-large-xsum-samsum")
    summarized = summarizer(abstract, truncation=True)
    tts_prompt = summarized[0]['summary_text']
    print(tts_prompt)

    # Speak the summary with Bark.
    tts_pipeline = _get_pipeline("text-to-speech", "suno/bark")
    speech = tts_pipeline(tts_prompt)
    # NOTE(review): the waveform may carry a leading batch axis of size 1;
    # squeeze() drops it and is a no-op on an already 1-D array.
    return (speech["sampling_rate"], speech["audio"].squeeze())
# Build the web UI: one uploaded file in, one audio clip out.
my_app = gr.Interface(
    fn=abstract_to_audio,
    inputs='file',
    outputs='audio',
    title="PDF Abstract Summarizer",
    description="Extracts abstracts from PDFs and generates audio summaries. This app only accepts PDFs with abstracts.",
    # NOTE(review): the previous example was the Hub ".../blob/main/..." URL,
    # which is the HTML viewer page rather than the PDF itself. This assumes
    # the PDF is stored next to this script in the Space repository -- confirm.
    examples=["Hidden_Technical_Debt_in_MLSystems.pdf"],
)
my_app.launch()