# speech-to-text / app.py
# pawipa's picture
# removed preprocessed now.
# 8813f41
import gradio as gr
import time
import os
import zipfile
import torch
import librosa
import soundfile as sf
from transformers import pipeline
from typing import List, Tuple, Generator
import datetime
from pydub import AudioSegment
# Initial model name
MODEL_NAME = "primeline/whisper-tiny-german-1224"
speech_to_text = pipeline("automatic-speech-recognition", model=MODEL_NAME)
# Initial status message
STANDARD_OUTPUT_TEXT = "**Status:**<br>"
def get_file_creation_date(file_path: str) -> str:
"""
Returns the creation date of a file.
Args:
file_path (str): The path to the file.
Returns:
str: The creation date in a human-readable format.
"""
try:
# Get file statistics
file_stats = os.stat(file_path)
# Retrieve and format creation time
creation_time = datetime.datetime.fromtimestamp(file_stats.st_ctime)
return creation_time.strftime("%Y-%m-%d %H:%M:%S")
except FileNotFoundError:
return "File not found."
def load_model(model_name: str):
"""
Loads the selected Hugging Face model.
Args:
model_name (str): The name of the Hugging Face model to load.
Returns:
pipeline: The loaded model pipeline.
"""
return pipeline("automatic-speech-recognition", model=model_name)
def convert_to_wav(file_path: str) -> str:
"""
Converts audio files to WAV format if necessary.
Args:
file_path (str): Path to the uploaded audio file.
Returns:
str: Path to the converted WAV file.
"""
if file_path.endswith(".m4a") or file_path.endswith(".aac"):
audio = AudioSegment.from_file(file_path)
wav_path = file_path.rsplit('.', 1)[0] + ".wav"
audio.export(wav_path, format="wav")
return wav_path
return file_path
def preprocess_audio(file_path: str) -> str:
"""
Preprocesses the audio file to ensure compatibility with the AI model.
Args:
file_path (str): Path to the uploaded audio file.
Returns:
str: Path to the preprocessed audio file.
"""
file_path = convert_to_wav(file_path) # Convert to WAV if necessary
y, sr = librosa.load(file_path, sr=16000) # Resample audio to 16kHz
processed_path = file_path.replace(".mp3", "_processed.wav").replace(".wav", "_processed.wav")
sf.write(processed_path, y, sr) # Save the resampled audio
return processed_path
def process_files_with_live_updates(
files: List[gr.File],
model_option: str,
output_format: str
) -> Generator[Tuple[str, List[str]], None, None]:
"""
Processes a list of uploaded files, transcribes audio, and provides live updates.
Args:
files (List[gr.File]): List of files uploaded by the user.
model_option (str): Selected model option.
output_format (str): Selected output format option.
Yields:
Tuple[str, List[str]]: Updated status message and list of processed file paths.
"""
global speech_to_text
speech_to_text = load_model(model_option)
file_details = []
total_files = len(files)
output_files = []
# Create a folder to temporarily store output files
output_dir = "output_files"
os.makedirs(output_dir, exist_ok=True)
for idx, file in enumerate(files):
# Preprocess audio file
preprocessed_path = preprocess_audio(file.name)
# Transcribe audio using the AI model with timestamp support
transcription_result = speech_to_text(preprocessed_path, return_timestamps=True)
transcription = transcription_result["text"]
# Save transcription to file
txt_filename = os.path.join(output_dir, f"transcription_{file.name.split('/')[-1].split('.')[0]}.txt")
with open(txt_filename, "w", encoding="utf-8") as txt_file:
txt_file.write(transcription)
output_files.append(txt_filename)
# Add to file details
detail = (
f"**File Name**: {file.name.split('/')[-1]}<br>"
f"**File Date**: {get_file_creation_date(file)}<br>"
f"**Options**: {model_option} - {output_format}<br>"
f"**Transcription**: {transcription}<br><br>"
)
file_details.append(detail)
# Update progress bar and yield the updated Markdown
yield (
f"**Status: {int(((idx + 1) / total_files) * 100)}%**<br>" + "".join(file_details),
output_files,
)
# Create a zip archive
zip_filename = os.path.join(output_dir, "output_files.zip")
with zipfile.ZipFile(zip_filename, "w") as zipf:
for file_path in output_files:
zipf.write(file_path, os.path.basename(file_path))
output_files.append(zip_filename)
# Final yield
yield (
f"**Status: {int(((idx + 1) / total_files) * 100)}%**<br>" + "".join(file_details),
output_files,
)
# Gradio app layout
with gr.Blocks() as demo:
# Title and Description
gr.Markdown("# Speech-to-Text Batch Processor (German)")
gr.Markdown(
"""
Upload multiple audio files (.wav, .mp3, .m4a, .aac), select desired processing options (i.e. the model), and view real-time updates as files are transcribed.
The application uses advanced AI models for sequential speech-to-text translation.
"""
)
# Input section
with gr.Row():
with gr.Column():
file_input = gr.Files(file_types=[".wav", ".mp3", ".m4a", ".aac"], label="Upload your audio files")
with gr.Column():
model_dropdown = gr.Dropdown(
choices=[
"primeline/whisper-large-v3-german",
"primeline/whisper-tiny-german-1224",
"primeline/whisper-tiny-german"
],
label="Select Model",
value="primeline/whisper-large-v3-german",
)
dropdown_2 = gr.Dropdown(
choices=["Format: Plain Text"],
label="Select Output Format",
value="Format: Plain Text",
)
# Buttons
with gr.Row():
submit_button = gr.Button("Start Transcription")
clear_button = gr.Button("Clear")
# Output section
output_md = gr.Markdown(label="Transcription Progress", value=STANDARD_OUTPUT_TEXT)
output_files = gr.Files(label="Generated Output Files")
# Button actions
submit_button.click(
process_files_with_live_updates,
inputs=[file_input, model_dropdown, dropdown_2],
outputs=[output_md, output_files],
)
clear_button.click(
lambda: (None, "primeline/whisper-large-v3-german", "Format: Plain Text", STANDARD_OUTPUT_TEXT, None),
inputs=[], # No inputs
outputs=[file_input, model_dropdown, dropdown_2, output_md, output_files],
)
gr.Image("Fraunhofer-IPA-Logo.jpg", show_label=False)
# Centered Footer with Logo and Licensing Text
with gr.Row():
gr.Markdown(
"""
**Fraunhofer IPA**
This application is provided under a basic licensing agreement for non-commercial use only.
For inquiries, visit [Fraunhofer IPA](https://www.ipa.fraunhofer.de).
""",
elem_id="footer-markdown",
)
# CSS to center the footer content
demo.css = """
#footer-markdown {
text-align: center;
margin-top: 20px;
padding-top: 10px;
border-top: 1px solid #ccc;
}
"""
# Launch app
demo.launch()