import datetime
import os
import zipfile
from typing import Generator, List, Tuple

import gradio as gr
import librosa
import soundfile as sf
from pydub import AudioSegment
from transformers import pipeline

# Initial model name
MODEL_NAME = "primeline/whisper-tiny-german-1224"
speech_to_text = pipeline("automatic-speech-recognition", model=MODEL_NAME)

# Initial status message
STANDARD_OUTPUT_TEXT = "**Status:**\n"


def get_file_creation_date(file_path: str) -> str:
    """
    Returns the creation date of a file.

    Args:
        file_path (str): The path to the file.

    Returns:
        str: The creation date in a human-readable format.
    """
    try:
        # Get file statistics
        file_stats = os.stat(file_path)
        # Retrieve and format creation time (for uploads this reflects the time
        # the temporary copy was written, not the original recording date)
        creation_time = datetime.datetime.fromtimestamp(file_stats.st_ctime)
        return creation_time.strftime("%Y-%m-%d %H:%M:%S")
    except FileNotFoundError:
        return "File not found."


def load_model(model_name: str):
    """
    Loads the selected Hugging Face model.

    Args:
        model_name (str): The name of the Hugging Face model to load.

    Returns:
        pipeline: The loaded model pipeline.
    """
    return pipeline("automatic-speech-recognition", model=model_name)


def convert_to_wav(file_path: str) -> str:
    """
    Converts audio files to WAV format if necessary.

    Args:
        file_path (str): Path to the uploaded audio file.

    Returns:
        str: Path to the converted WAV file.
    """
    if file_path.endswith((".m4a", ".aac")):
        audio = AudioSegment.from_file(file_path)
        wav_path = file_path.rsplit(".", 1)[0] + ".wav"
        audio.export(wav_path, format="wav")
        return wav_path
    return file_path


def preprocess_audio(file_path: str) -> str:
    """
    Preprocesses the audio file to ensure compatibility with the AI model.

    Args:
        file_path (str): Path to the uploaded audio file.

    Returns:
        str: Path to the preprocessed audio file.
    """
    file_path = convert_to_wav(file_path)  # Convert to WAV if necessary
    y, sr = librosa.load(file_path, sr=16000)  # Resample audio to 16 kHz
    processed_path = os.path.splitext(file_path)[0] + "_processed.wav"
    sf.write(processed_path, y, sr)  # Save the resampled audio
    return processed_path


def process_files_with_live_updates(
    files: List[gr.File], model_option: str, output_format: str
) -> Generator[Tuple[str, List[str]], None, None]:
    """
    Processes a list of uploaded files, transcribes audio, and provides live updates.

    Args:
        files (List[gr.File]): List of files uploaded by the user.
        model_option (str): Selected model option.
        output_format (str): Selected output format option.

    Yields:
        Tuple[str, List[str]]: Updated status message and list of processed file paths.
    """
    global speech_to_text
    speech_to_text = load_model(model_option)

    file_details = []
    total_files = len(files)
    output_files = []

    # Create a folder to temporarily store output files
    output_dir = "output_files"
    os.makedirs(output_dir, exist_ok=True)

    for idx, file in enumerate(files):
        # Preprocess audio file
        preprocessed_path = preprocess_audio(file.name)

        # Transcribe audio using the AI model with timestamp support
        transcription_result = speech_to_text(preprocessed_path, return_timestamps=True)
        transcription = transcription_result["text"]

        # Save transcription to file
        base_name = os.path.splitext(os.path.basename(file.name))[0]
        txt_filename = os.path.join(output_dir, f"transcription_{base_name}.txt")
        with open(txt_filename, "w", encoding="utf-8") as txt_file:
            txt_file.write(transcription)
        output_files.append(txt_filename)

        # Add to file details
        detail = (
            f"**File Name**: {os.path.basename(file.name)}\n"
            f"**File Date**: {get_file_creation_date(file.name)}\n"
            f"**Options**: {model_option} - {output_format}\n"
            f"**Transcription**: {transcription}\n\n"
        )
        file_details.append(detail)

        # Update progress and yield the updated Markdown
        yield (
            f"**Status: {int(((idx + 1) / total_files) * 100)}%**\n" + "".join(file_details),
            output_files,
        )

    # Create a zip archive of all transcriptions
    zip_filename = os.path.join(output_dir, "output_files.zip")
    with zipfile.ZipFile(zip_filename, "w") as zipf:
        for file_path in output_files:
            zipf.write(file_path, os.path.basename(file_path))
    output_files.append(zip_filename)

    # Final yield
    yield (
        "**Status: 100%**\n" + "".join(file_details),
        output_files,
    )


# Gradio app layout
with gr.Blocks() as demo:
    # Title and description
    gr.Markdown("# Speech-to-Text Batch Processor (German)")
    gr.Markdown(
        """
        Upload multiple audio files (.wav, .mp3, .m4a, .aac), select the desired
        processing options (i.e. the model), and view real-time updates as the
        files are transcribed. The application uses advanced AI models for
        sequential speech-to-text transcription.
        """
    )

    # Input section
    with gr.Row():
        with gr.Column():
            file_input = gr.Files(
                file_types=[".wav", ".mp3", ".m4a", ".aac"],
                label="Upload your audio files",
            )
        with gr.Column():
            model_dropdown = gr.Dropdown(
                choices=[
                    "primeline/whisper-large-v3-german",
                    "primeline/whisper-tiny-german-1224",
                    "primeline/whisper-tiny-german",
                ],
                label="Select Model",
                value="primeline/whisper-large-v3-german",
            )
            dropdown_2 = gr.Dropdown(
                choices=["Format: Plain Text"],
                label="Select Output Format",
                value="Format: Plain Text",
            )

    # Buttons
    with gr.Row():
        submit_button = gr.Button("Start Transcription")
        clear_button = gr.Button("Clear")

    # Output section
    output_md = gr.Markdown(label="Transcription Progress", value=STANDARD_OUTPUT_TEXT)
    output_files = gr.Files(label="Generated Output Files")

    # Button actions
    submit_button.click(
        process_files_with_live_updates,
        inputs=[file_input, model_dropdown, dropdown_2],
        outputs=[output_md, output_files],
    )
    clear_button.click(
        lambda: (
            None,
            "primeline/whisper-large-v3-german",
            "Format: Plain Text",
            STANDARD_OUTPUT_TEXT,
            None,
        ),
        inputs=[],  # No inputs
        outputs=[file_input, model_dropdown, dropdown_2, output_md, output_files],
    )

    gr.Image("Fraunhofer-IPA-Logo.jpg", show_label=False)

    # Centered footer with logo and licensing text
    with gr.Row():
        gr.Markdown(
            """
            **Fraunhofer IPA**
            This application is provided under a basic licensing agreement for
            non-commercial use only. For inquiries, visit
            [Fraunhofer IPA](https://www.ipa.fraunhofer.de).
            """,
            elem_id="footer-markdown",
        )

# CSS to center the footer content
demo.css = """
#footer-markdown {
    text-align: center;
    margin-top: 20px;
    padding-top: 10px;
    border-top: 1px solid #ccc;
}
"""

# Launch app
demo.launch()