# Spaces:
# Running
# Running
import gradio as gr | |
import time | |
import os | |
import zipfile | |
import torch | |
import librosa | |
import soundfile as sf | |
from transformers import pipeline | |
from typing import List, Tuple, Generator | |
import datetime | |
from pydub import AudioSegment | |
# Initial model name (small German Whisper variant; the UI lets the user
# switch to a larger model at transcription time).
MODEL_NAME = "primeline/whisper-tiny-german-1224"
# Module-level ASR pipeline, loaded eagerly at import time. It is
# reassigned inside process_files_with_live_updates whenever the user
# selects a different model from the dropdown.
speech_to_text = pipeline("automatic-speech-recognition", model=MODEL_NAME)
# Initial status message shown in the progress Markdown component.
STANDARD_OUTPUT_TEXT = "**Status:**<br>"
def get_file_creation_date(file_path: str) -> str:
    """
    Return a file's creation timestamp as a human-readable string.

    Args:
        file_path (str): The path to the file.

    Returns:
        str: Timestamp formatted as "YYYY-MM-DD HH:MM:SS", or the literal
        string "File not found." when the path does not exist.
    """
    try:
        # Only the stat call can raise FileNotFoundError; keep it in the try.
        ctime = os.stat(file_path).st_ctime
    except FileNotFoundError:
        return "File not found."
    # NOTE(review): on POSIX st_ctime is the inode-change time, not true
    # creation time — acceptable here since it only feeds a status display.
    return datetime.datetime.fromtimestamp(ctime).strftime("%Y-%m-%d %H:%M:%S")
def load_model(model_name: str):
    """
    Load a Hugging Face ASR pipeline, caching previously loaded models.

    The original implementation rebuilt the pipeline on every call, so each
    click of "Start Transcription" re-initialised (and possibly
    re-downloaded) the model. Loaded pipelines are now memoised on a
    function attribute, keyed by model name.

    Args:
        model_name (str): The name of the Hugging Face model to load.

    Returns:
        pipeline: The loaded (possibly cached) ASR pipeline.
    """
    cache = getattr(load_model, "_cache", None)
    if cache is None:
        cache = {}
        load_model._cache = cache
    if model_name not in cache:
        cache[model_name] = pipeline("automatic-speech-recognition", model=model_name)
    return cache[model_name]
def convert_to_wav(file_path: str) -> str:
    """
    Convert .m4a/.aac audio files to WAV; other formats pass through.

    The extension check is case-insensitive, so uploads like "VOICE.M4A"
    are converted too (the original `endswith` check missed upper-case
    extensions and such files then failed downstream decoding).

    Args:
        file_path (str): Path to the uploaded audio file.

    Returns:
        str: Path to the WAV file, or the original path when no
        conversion was needed.
    """
    if file_path.lower().endswith((".m4a", ".aac")):
        audio = AudioSegment.from_file(file_path)
        # Write the WAV next to the source, swapping only the extension.
        wav_path = os.path.splitext(file_path)[0] + ".wav"
        audio.export(wav_path, format="wav")
        return wav_path
    return file_path
def preprocess_audio(file_path: str) -> str:
    """
    Preprocess an audio file for the ASR model: convert to WAV if needed,
    resample to 16 kHz, and write the result as "<stem>_processed.wav".

    Fixes the original output-name logic, which chained
    .replace(".mp3", "_processed.wav").replace(".wav", "_processed.wav")
    and therefore produced "<stem>_processed_processed.wav" for .mp3
    inputs (the first replace introduced a ".wav" that the second replace
    matched again), and silently overwrote the input file for any other
    extension (no replace matched, so the output path equalled the input).

    Args:
        file_path (str): Path to the uploaded audio file.

    Returns:
        str: Path to the preprocessed 16 kHz WAV file.
    """
    file_path = convert_to_wav(file_path)  # Convert to WAV if necessary
    y, sr = librosa.load(file_path, sr=16000)  # Resample audio to 16 kHz
    # Derive the output name from the stem, independent of the extension.
    processed_path = os.path.splitext(file_path)[0] + "_processed.wav"
    sf.write(processed_path, y, sr)  # Save the resampled audio
    return processed_path
def process_files_with_live_updates(
    files: List[gr.File],
    model_option: str,
    output_format: str
) -> Generator[Tuple[str, List[str]], None, None]:
    """
    Process a list of uploaded files, transcribe audio, and provide live updates.

    Args:
        files (List[gr.File]): List of files uploaded by the user.
        model_option (str): Selected Hugging Face model name.
        output_format (str): Selected output format option.

    Yields:
        Tuple[str, List[str]]: Updated status Markdown and the list of
        generated output file paths (per-file transcripts, then a zip
        archive of all of them on the final yield).
    """
    global speech_to_text
    speech_to_text = load_model(model_option)

    # Guard: with no uploads the original code divided by zero in the
    # progress computation and referenced an undefined loop variable
    # (`idx`) in the final yield.
    if not files:
        yield (STANDARD_OUTPUT_TEXT + "No files uploaded.<br>", [])
        return

    file_details = []
    total_files = len(files)
    output_files = []

    # Folder that temporarily stores the generated transcripts and zip.
    output_dir = "output_files"
    os.makedirs(output_dir, exist_ok=True)

    for idx, file in enumerate(files):
        # Preprocess audio file (convert/resample to 16 kHz WAV).
        preprocessed_path = preprocess_audio(file.name)

        # Transcribe audio using the AI model with timestamp support.
        transcription_result = speech_to_text(preprocessed_path, return_timestamps=True)
        transcription = transcription_result["text"]

        # Save transcription to a .txt file named after the upload.
        # os.path.basename/splitext instead of '/'-splitting: portable
        # across OSes and keeps dots inside the stem intact.
        base_name = os.path.splitext(os.path.basename(file.name))[0]
        txt_filename = os.path.join(output_dir, f"transcription_{base_name}.txt")
        with open(txt_filename, "w", encoding="utf-8") as txt_file:
            txt_file.write(transcription)
        output_files.append(txt_filename)

        # Collect per-file details for the status Markdown.
        detail = (
            f"**File Name**: {os.path.basename(file.name)}<br>"
            # Pass the path, not the gr.File wrapper: the original passed
            # `file` itself, which os.stat() cannot handle.
            f"**File Date**: {get_file_creation_date(file.name)}<br>"
            f"**Options**: {model_option} - {output_format}<br>"
            f"**Transcription**: {transcription}<br><br>"
        )
        file_details.append(detail)

        # Update progress bar and yield the updated Markdown.
        yield (
            f"**Status: {int(((idx + 1) / total_files) * 100)}%**<br>" + "".join(file_details),
            output_files,
        )

    # Bundle all transcripts into a single zip archive for download.
    zip_filename = os.path.join(output_dir, "output_files.zip")
    with zipfile.ZipFile(zip_filename, "w") as zipf:
        for file_path in output_files:
            zipf.write(file_path, os.path.basename(file_path))
    output_files.append(zip_filename)

    # Final yield: all files processed.
    yield (
        "**Status: 100%**<br>" + "".join(file_details),
        output_files,
    )
# Gradio app layout: the whole UI is declared inside this Blocks context.
with gr.Blocks() as demo:
    # Title and Description
    gr.Markdown("# Speech-to-Text Batch Processor (German)")
    gr.Markdown(
        """
        Upload multiple audio files (.wav, .mp3, .m4a, .aac), select desired processing options (i.e. the model), and view real-time updates as files are transcribed.
        The application uses advanced AI models for sequential speech-to-text translation.
        """
    )
    # Input section: file upload on the left, model/format options on the right.
    with gr.Row():
        with gr.Column():
            file_input = gr.Files(file_types=[".wav", ".mp3", ".m4a", ".aac"], label="Upload your audio files")
        with gr.Column():
            # ASR model choice; forwarded to process_files_with_live_updates.
            model_dropdown = gr.Dropdown(
                choices=[
                    "primeline/whisper-large-v3-german",
                    "primeline/whisper-tiny-german-1224",
                    "primeline/whisper-tiny-german"
                ],
                label="Select Model",
                value="primeline/whisper-large-v3-german",
            )
            # Only one output format is currently offered.
            dropdown_2 = gr.Dropdown(
                choices=["Format: Plain Text"],
                label="Select Output Format",
                value="Format: Plain Text",
            )
    # Buttons
    with gr.Row():
        submit_button = gr.Button("Start Transcription")
        clear_button = gr.Button("Clear")
    # Output section: live status Markdown plus downloadable result files.
    output_md = gr.Markdown(label="Transcription Progress", value=STANDARD_OUTPUT_TEXT)
    output_files = gr.Files(label="Generated Output Files")
    # Button actions.
    # process_files_with_live_updates is a generator, so Gradio streams its
    # (status_markdown, files) yields into the two output components.
    submit_button.click(
        process_files_with_live_updates,
        inputs=[file_input, model_dropdown, dropdown_2],
        outputs=[output_md, output_files],
    )
    # Reset every input/output component back to its initial value.
    clear_button.click(
        lambda: (None, "primeline/whisper-large-v3-german", "Format: Plain Text", STANDARD_OUTPUT_TEXT, None),
        inputs=[],  # No inputs
        outputs=[file_input, model_dropdown, dropdown_2, output_md, output_files],
    )
    # Logo image; file is expected next to this script — TODO confirm path.
    gr.Image("Fraunhofer-IPA-Logo.jpg", show_label=False)
    # Centered Footer with Logo and Licensing Text
    with gr.Row():
        gr.Markdown(
            """
            **Fraunhofer IPA**
            This application is provided under a basic licensing agreement for non-commercial use only.
            For inquiries, visit [Fraunhofer IPA](https://www.ipa.fraunhofer.de).
            """,
            elem_id="footer-markdown",
        )
# CSS to center the footer content (targets elem_id="footer-markdown" above).
demo.css = """
#footer-markdown {
    text-align: center;
    margin-top: 20px;
    padding-top: 10px;
    border-top: 1px solid #ccc;
}
"""
# Launch app
demo.launch()