import json

import gradio as gr
import librosa
import numpy as np
import spaces
import torch
from accelerate.utils.imports import is_cuda_available
from ctc_forced_aligner import (
    generate_emissions,
    get_alignments,
    get_spans,
    load_alignment_model,
    postprocess_results,
    preprocess_text,
)
from iso639 import iter_langs

# Use GPU with half precision when available; fall back to CPU/float32.
device = "cuda" if is_cuda_available() else "cpu"
dtype = torch.float16 if is_cuda_available() else torch.float32

# Load the alignment model once at import time and reuse it per request.
alignment_model, alignment_tokenizer = load_alignment_model(
    device,
    dtype=dtype,
)


def process_alignment(audio_waveform, text, language="hin"):
    """Force-align ``text`` to ``audio_waveform`` and return word timestamps.

    Parameters
    ----------
    audio_waveform : torch.Tensor
        Mono waveform already on the model's device/dtype
        (callers resample to 16 kHz before passing it in).
    text : str
        Transcript to align against the audio.
    language : str
        ISO 639-3 language code (default ``"hin"``, Hindi).

    Returns
    -------
    list[dict]
        Per-word entries as produced by ``postprocess_results`` (start/end
        timing spans and scores).
    """
    print(f"{audio_waveform.shape=}, {text=}, {language=}")

    # Acoustic emissions from the alignment model.
    emissions, stride = generate_emissions(
        alignment_model, audio_waveform, batch_size=16
    )

    # Romanized, star-tokenized text for the CTC aligner.
    tokens_starred, text_starred = preprocess_text(
        text,
        romanize=True,
        language=language,
    )

    # Frame-level alignment between emissions and tokens.
    segments, scores, blank_id = get_alignments(
        emissions,
        tokens_starred,
        alignment_tokenizer,
    )

    # Collapse frame alignments into word-level time spans.
    spans = get_spans(tokens_starred, segments, blank_id)
    word_timestamps = postprocess_results(text_starred, spans, stride, scores)

    return word_timestamps


def trim_audio(audio_array, sample_rate, word_timestamps):
    """Trim ``audio_array`` to the span covered by ``word_timestamps``.

    Parameters
    ----------
    audio_array : np.ndarray
        1-D sample array.
    sample_rate : int
        Samples per second of ``audio_array``.
    word_timestamps : list[dict]
        Word entries with ``"start"``/``"end"`` times in seconds.

    Returns
    -------
    tuple[int, np.ndarray]
        ``(sample_rate, trimmed_samples)``. When ``word_timestamps`` is
        empty the audio is returned unchanged (the previous code raised
        IndexError in that case).
    """
    if not word_timestamps:
        return (sample_rate, audio_array)

    start_time = int(word_timestamps[0]["start"] * sample_rate)
    end_time = int(word_timestamps[-1]["end"] * sample_rate)
    print(f"{start_time=}, {end_time=}")

    trimmed_audio = audio_array[start_time:end_time]
    return (sample_rate, trimmed_audio)


def get_language_choices():
    """Return dropdown labels like ``"hin - Hindi"`` for all ISO 639-3 codes."""
    return [f"{lang.pt3} - {lang.name}" for lang in iter_langs() if lang.pt3]


@spaces.GPU
def align_result_only(audio, text, language="hin - Hindi"):
    """Align ``text`` against ``audio`` and return the result as a JSON string.

    Parameters
    ----------
    audio : tuple[int, np.ndarray]
        ``(sample_rate, samples)`` as delivered by ``gr.Audio``.
    text : str
        Transcript to align.
    language : str
        Dropdown value of the form ``"<iso639-3> - <Name>"``.

    Returns
    -------
    str
        Pretty-printed JSON with the input text, word timestamps, and language.
    """
    # Extract the ISO 639-3 code from the selected language label.
    iso_code = language.split(" - ")[0]

    sample_rate, audio_array = audio
    # Normalize to float32 in [-1, 1]. The previous code hard-coded
    # "/ 32768.0", assuming int16 input; gr.Audio may also deliver int32
    # or float arrays, so scale by the dtype's actual signed range
    # (-min == 32768 for int16, matching the original behavior exactly)
    # and leave float input unscaled.
    if np.issubdtype(audio_array.dtype, np.signedinteger):
        scale = -float(np.iinfo(audio_array.dtype).min)
        audio_array = audio_array.astype(np.float32) / scale
    else:
        audio_array = audio_array.astype(np.float32)
    print(f"{sample_rate=}, {audio_array.shape=}")

    # Convert to mono if stereo.
    if len(audio_array.shape) > 1:
        audio_array = audio_array.mean(axis=1)

    # The alignment model expects 16 kHz input.
    audio_array = librosa.resample(audio_array, orig_sr=sample_rate, target_sr=16000)

    # Move onto the model's device/dtype.
    audio_waveform = torch.from_numpy(audio_array).to(device=device, dtype=dtype)

    word_timestamps = process_alignment(audio_waveform, text, iso_code)

    output_json = {
        "input_text": text,
        "word_timestamps": word_timestamps,
        "language": language,
    }
    return json.dumps(output_json, indent=2)


# Build the Gradio UI.
with gr.Blocks() as demo:
    gr.Markdown("Align")
    with gr.Row():
        with gr.Column():
            audio_input = gr.Audio(label="Input Audio")
            text_input = gr.Textbox(label="Input Text")
            language_input = gr.Dropdown(
                choices=get_language_choices(), label="Language", value="hin - Hindi"
            )
            submit_button_result_only = gr.Button(
                "Get Alignment", variant="secondary"
            )
        with gr.Column():
            json_output = gr.JSON(label="Alignment Results")

    submit_button_result_only.click(
        fn=align_result_only,
        inputs=[audio_input, text_input, language_input],
        outputs=[json_output],
    )


# Launch the demo.
if __name__ == "__main__":
    demo.launch()