# Spaces:
# Running
# Running
import gradio as gr | |
import time | |
import os | |
import zipfile | |
import torch | |
import librosa | |
import soundfile as sf | |
from transformers import pipeline | |
from typing import List, Tuple, Generator | |
import datetime | |
from pydub import AudioSegment | |
# Initial model name (small German Whisper variant; the UI lets the user
# switch to a larger model at transcription time).
MODEL_NAME = "primeline/whisper-tiny-german-1224"
# Module-level ASR pipeline, loaded eagerly at import time. It is
# reassigned inside process_files_with_live_updates whenever the user
# selects a different model from the dropdown.
speech_to_text = pipeline("automatic-speech-recognition", model=MODEL_NAME)
# Initial status message shown in the progress Markdown component.
STANDARD_OUTPUT_TEXT = "**Status:**<br>"
def get_file_creation_date(file_path: str) -> str:
    """
    Return a file's creation timestamp as a human-readable string.

    Args:
        file_path (str): The path to the file.

    Returns:
        str: Timestamp formatted as "YYYY-MM-DD HH:MM:SS", or the literal
        string "File not found." when the path does not exist.
    """
    try:
        # Only the stat call can raise FileNotFoundError; keep it in the try.
        ctime = os.stat(file_path).st_ctime
    except FileNotFoundError:
        return "File not found."
    # NOTE(review): on POSIX st_ctime is the inode-change time, not true
    # creation time — acceptable here since it only feeds a status display.
    return datetime.datetime.fromtimestamp(ctime).strftime("%Y-%m-%d %H:%M:%S")
def load_model(model_name: str):
    """
    Load a Hugging Face ASR pipeline, caching previously loaded models.

    The original implementation rebuilt the pipeline on every call, so each
    click of "Start Transcription" re-initialised (and possibly
    re-downloaded) the model. Loaded pipelines are now memoised on a
    function attribute, keyed by model name.

    Args:
        model_name (str): The name of the Hugging Face model to load.

    Returns:
        pipeline: The loaded (possibly cached) ASR pipeline.
    """
    cache = getattr(load_model, "_cache", None)
    if cache is None:
        cache = {}
        load_model._cache = cache
    if model_name not in cache:
        cache[model_name] = pipeline("automatic-speech-recognition", model=model_name)
    return cache[model_name]
def convert_to_wav(file_path: str) -> str:
    """
    Convert .m4a/.aac audio files to WAV; other formats pass through.

    The extension check is case-insensitive, so uploads like "VOICE.M4A"
    are converted too (the original `endswith` check missed upper-case
    extensions and such files then failed downstream decoding).

    Args:
        file_path (str): Path to the uploaded audio file.

    Returns:
        str: Path to the WAV file, or the original path when no
        conversion was needed.
    """
    if file_path.lower().endswith((".m4a", ".aac")):
        audio = AudioSegment.from_file(file_path)
        # Write the WAV next to the source, swapping only the extension.
        wav_path = os.path.splitext(file_path)[0] + ".wav"
        audio.export(wav_path, format="wav")
        return wav_path
    return file_path
def preprocess_audio(file_path: str) -> str:
    """
    Preprocess an audio file for the ASR model: convert to WAV if needed,
    resample to 16 kHz, and write the result as "<stem>_processed.wav".

    Fixes the original output-name logic, which chained
    .replace(".mp3", "_processed.wav").replace(".wav", "_processed.wav")
    and therefore produced "<stem>_processed_processed.wav" for .mp3
    inputs (the first replace introduced a ".wav" that the second replace
    matched again), and silently overwrote the input file for any other
    extension (no replace matched, so the output path equalled the input).

    Args:
        file_path (str): Path to the uploaded audio file.

    Returns:
        str: Path to the preprocessed 16 kHz WAV file.
    """
    file_path = convert_to_wav(file_path)  # Convert to WAV if necessary
    y, sr = librosa.load(file_path, sr=16000)  # Resample audio to 16 kHz
    # Derive the output name from the stem, independent of the extension.
    processed_path = os.path.splitext(file_path)[0] + "_processed.wav"
    sf.write(processed_path, y, sr)  # Save the resampled audio
    return processed_path
def process_files_with_live_updates(
    files: List[gr.File],
    model_option: str,
    output_format: str
) -> Generator[Tuple[str, List[str]], None, None]:
    """
    Process a list of uploaded files, transcribe audio, and provide live updates.

    Args:
        files (List[gr.File]): List of files uploaded by the user.
        model_option (str): Selected Hugging Face model name.
        output_format (str): Selected output format option.

    Yields:
        Tuple[str, List[str]]: Updated status Markdown and the list of
        generated output file paths (per-file transcripts, then a zip
        archive of all of them on the final yield).
    """
    global speech_to_text
    speech_to_text = load_model(model_option)

    # Guard: with no uploads the original code divided by zero in the
    # progress computation and referenced an undefined loop variable
    # (`idx`) in the final yield.
    if not files:
        yield (STANDARD_OUTPUT_TEXT + "No files uploaded.<br>", [])
        return

    file_details = []
    total_files = len(files)
    output_files = []

    # Folder that temporarily stores the generated transcripts and zip.
    output_dir = "output_files"
    os.makedirs(output_dir, exist_ok=True)

    for idx, file in enumerate(files):
        # Preprocess audio file (convert/resample to 16 kHz WAV).
        preprocessed_path = preprocess_audio(file.name)

        # Transcribe audio using the AI model with timestamp support.
        transcription_result = speech_to_text(preprocessed_path, return_timestamps=True)
        transcription = transcription_result["text"]

        # Save transcription to a .txt file named after the upload.
        # os.path.basename/splitext instead of '/'-splitting: portable
        # across OSes and keeps dots inside the stem intact.
        base_name = os.path.splitext(os.path.basename(file.name))[0]
        txt_filename = os.path.join(output_dir, f"transcription_{base_name}.txt")
        with open(txt_filename, "w", encoding="utf-8") as txt_file:
            txt_file.write(transcription)
        output_files.append(txt_filename)

        # Collect per-file details for the status Markdown.
        detail = (
            f"**File Name**: {os.path.basename(file.name)}<br>"
            # Pass the path, not the gr.File wrapper: the original passed
            # `file` itself, which os.stat() cannot handle.
            f"**File Date**: {get_file_creation_date(file.name)}<br>"
            f"**Options**: {model_option} - {output_format}<br>"
            f"**Transcription**: {transcription}<br><br>"
        )
        file_details.append(detail)

        # Update progress bar and yield the updated Markdown.
        yield (
            f"**Status: {int(((idx + 1) / total_files) * 100)}%**<br>" + "".join(file_details),
            output_files,
        )

    # Bundle all transcripts into a single zip archive for download.
    zip_filename = os.path.join(output_dir, "output_files.zip")
    with zipfile.ZipFile(zip_filename, "w") as zipf:
        for file_path in output_files:
            zipf.write(file_path, os.path.basename(file_path))
    output_files.append(zip_filename)

    # Final yield: all files processed.
    yield (
        "**Status: 100%**<br>" + "".join(file_details),
        output_files,
    )
# Gradio app layout: the whole UI is declared inside this Blocks context.
with gr.Blocks() as demo:
    # Title and Description
    gr.Markdown("# Speech-to-Text Batch Processor (German)")
    gr.Markdown(
        """
        Upload multiple audio files (.wav, .mp3, .m4a, .aac), select desired processing options (i.e. the model), and view real-time updates as files are transcribed.
        The application uses advanced AI models for sequential speech-to-text translation.
        """
    )
    # Input section: file upload on the left, model/format options on the right.
    with gr.Row():
        with gr.Column():
            file_input = gr.Files(file_types=[".wav", ".mp3", ".m4a", ".aac"], label="Upload your audio files")
        with gr.Column():
            # ASR model choice; forwarded to process_files_with_live_updates.
            model_dropdown = gr.Dropdown(
                choices=[
                    "primeline/whisper-large-v3-german",
                    "primeline/whisper-tiny-german-1224",
                    "primeline/whisper-tiny-german"
                ],
                label="Select Model",
                value="primeline/whisper-large-v3-german",
            )
            # Only one output format is currently offered.
            dropdown_2 = gr.Dropdown(
                choices=["Format: Plain Text"],
                label="Select Output Format",
                value="Format: Plain Text",
            )
    # Buttons
    with gr.Row():
        submit_button = gr.Button("Start Transcription")
        clear_button = gr.Button("Clear")
    # Output section: live status Markdown plus downloadable result files.
    output_md = gr.Markdown(label="Transcription Progress", value=STANDARD_OUTPUT_TEXT)
    output_files = gr.Files(label="Generated Output Files")
    # Button actions.
    # process_files_with_live_updates is a generator, so Gradio streams its
    # (status_markdown, files) yields into the two output components.
    submit_button.click(
        process_files_with_live_updates,
        inputs=[file_input, model_dropdown, dropdown_2],
        outputs=[output_md, output_files],
    )
    # Reset every input/output component back to its initial value.
    clear_button.click(
        lambda: (None, "primeline/whisper-large-v3-german", "Format: Plain Text", STANDARD_OUTPUT_TEXT, None),
        inputs=[],  # No inputs
        outputs=[file_input, model_dropdown, dropdown_2, output_md, output_files],
    )
    # Logo image; file is expected next to this script — TODO confirm path.
    gr.Image("Fraunhofer-IPA-Logo.jpg", show_label=False)
    # Centered Footer with Logo and Licensing Text
    with gr.Row():
        gr.Markdown(
            """
            **Fraunhofer IPA**
            This application is provided under a basic licensing agreement for non-commercial use only.
            For inquiries, visit [Fraunhofer IPA](https://www.ipa.fraunhofer.de).
            """,
            elem_id="footer-markdown",
        )
# CSS to center the footer content (targets elem_id="footer-markdown" above).
demo.css = """
#footer-markdown {
    text-align: center;
    margin-top: 20px;
    padding-top: 10px;
    border-top: 1px solid #ccc;
}
"""
# Launch app
demo.launch()