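"""Gradio demo for CTC forced alignment.

Given an audio clip and its transcript, the app computes word-level timestamps
with ctc_forced_aligner and can trim the audio to the aligned span.
"""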
import gradio as gr
import json
import torch
import numpy as np
import librosa
from accelerate.utils.imports import is_cuda_available
from iso639 import iter_langs
from ctc_forced_aligner import (
    load_alignment_model,
    generate_emissions,
    preprocess_text,
    get_alignments,
    get_spans,
    postprocess_results,
)
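
# Use half precision on the GPU; fall back to float32 on the CPU.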
device = "cuda" if is_cuda_available() else "cpu"
dtype = torch.float16 if is_cuda_available() else torch.float32

# Load the alignment model and tokenizer once at startup
alignment_model, alignment_tokenizer = load_alignment_model(
    device,
    dtype=dtype,
)


def process_alignment(audio_waveform, text, language="eng"):
    """Align `text` against `audio_waveform` and return word-level timestamps."""
    print(f"{audio_waveform.shape=}, {text=}, {language=}")

    # Generate frame-level emissions from the alignment model
    emissions, stride = generate_emissions(
        alignment_model, audio_waveform, batch_size=16
    )

    # Preprocess (and romanize) the text into alignment tokens
    tokens_starred, text_starred = preprocess_text(
        text,
        romanize=True,
        language=language,
    )

    # Align the token sequence with the emissions
    segments, scores, blank_id = get_alignments(
        emissions,
        tokens_starred,
        alignment_tokenizer,
    )

    # Merge aligned segments into spans and word timestamps
    spans = get_spans(tokens_starred, segments, blank_id)
    word_timestamps = postprocess_results(text_starred, spans, stride, scores)
    return word_timestamps
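

# Each word timestamp is a dict carrying at least "start" and "end" times in
# seconds; trim_audio() below relies on those fields.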
def trim_audio(audio_array, sample_rate, word_timestamps):
    """Cut the audio to the span between the first and last aligned word."""
    start_time = int(word_timestamps[0]["start"] * sample_rate)
    end_time = int(word_timestamps[-1]["end"] * sample_rate)
    print(f"{start_time=}, {end_time=}")
    trimmed_audio = audio_array[start_time:end_time]
    return (sample_rate, trimmed_audio)


def get_language_choices():
    # lang.pt3 is the ISO 639-3 code; skip entries that do not have one
    return [f"{lang.pt3} - {lang.name}" for lang in iter_langs() if lang.pt3]


def align(audio, text, language="eng - English"):
    # Extract the ISO 639-3 code from the selected language
    iso_code = language.split(" - ")[0]

    # Convert the input audio to 16 kHz mono float32
    sample_rate, audio_array = audio
    audio_array = audio_array.astype(np.float32) / 32768.0  # int16 -> [-1, 1]
    print(f"{sample_rate=}, {audio_array.shape=}")
    if len(audio_array.shape) > 1:
        audio_array = audio_array.mean(axis=1)  # Downmix stereo to mono
    audio_array = librosa.resample(audio_array, orig_sr=sample_rate, target_sr=16000)

    # Convert to a torch tensor on the model's device and dtype
    audio_waveform = torch.from_numpy(audio_array).to(device=device, dtype=dtype)

    # Run the forced alignment
    word_timestamps = process_alignment(audio_waveform, text, iso_code)

    # Trim the audio to the aligned span
    trimmed_audio = trim_audio(audio_array, 16000, word_timestamps)

    # Build the JSON output
    output_json = {
        "input_text": text,
        "word_timestamps": word_timestamps,
        "language": language,
    }
    return trimmed_audio, json.dumps(output_json, indent=2)


def align_result_only(audio, text, language="eng - English"):
    # Same preprocessing as align(), but returns only the JSON alignment results
    iso_code = language.split(" - ")[0]

    # Convert the input audio to 16 kHz mono float32
    sample_rate, audio_array = audio
    audio_array = audio_array.astype(np.float32) / 32768.0  # int16 -> [-1, 1]
    print(f"{sample_rate=}, {audio_array.shape=}")
    if len(audio_array.shape) > 1:
        audio_array = audio_array.mean(axis=1)  # Downmix stereo to mono
    audio_array = librosa.resample(audio_array, orig_sr=sample_rate, target_sr=16000)

    # Convert to a torch tensor on the model's device and dtype
    audio_waveform = torch.from_numpy(audio_array).to(device=device, dtype=dtype)

    # Run the forced alignment
    word_timestamps = process_alignment(audio_waveform, text, iso_code)

    # Build the JSON output
    output_json = {
        "input_text": text,
        "word_timestamps": word_timestamps,
        "language": language,
    }
    return json.dumps(output_json, indent=2)
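

# Shape of the JSON the two handlers return (values here are illustrative):
# {
#   "input_text": "...",
#   "word_timestamps": [{"start": 0.12, "end": 0.45, ...}, ...],
#   "language": "eng - English"
# }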


# Create Gradio blocks
with gr.Blocks() as demo:
    gr.Markdown("# Forced Alignment")
    gr.Markdown(
        """
        This tool aligns audio with text and provides word-level timestamps.

        ## How to use:
        1. Upload an audio file or record audio
        2. Enter the corresponding text
        3. Select the language
        4. Click 'Get Alignment and Trimmed Audio' (or 'Get Alignment Only') to get the alignment results
        """
    )
    with gr.Row():
        with gr.Column():
            audio_input = gr.Audio(label="Input Audio")
            text_input = gr.Textbox(label="Input Text")
            language_input = gr.Dropdown(
                choices=get_language_choices(), label="Language", value="eng - English"
            )
            submit_button = gr.Button(
                "Get Alignment and Trimmed Audio", variant="primary"
            )
            submit_button_result_only = gr.Button(
                "Get Alignment Only", variant="secondary"
            )
        with gr.Column():
            audio_output = gr.Audio(label="Trimmed Output Audio")
            json_output = gr.JSON(label="Alignment Results")

    submit_button.click(
        fn=align,
        inputs=[audio_input, text_input, language_input],
        outputs=[audio_output, json_output],
    )
    submit_button_result_only.click(
        fn=align_result_only,
        inputs=[audio_input, text_input, language_input],
        outputs=[json_output],
    )
gr.Markdown("## Examples")
gr.Examples(
examples=[
["examples/example1.mp3", "我們搭上公車要回台北了", "zho - Chinese"],
[
"examples/example2.wav",
"ON SATURDAY MORNINGS WHEN THE SODALITY MET IN THE CHAPEL TO RECITE THE LITTLE OFFICE HIS PLACE WAS A CUSHIONED KNEELING DESK AT THE RIGHT OF THE ALTAR FROM WHICH HE LED HIS WING OF BOYS THROUGH THE RESPONSES",
"eng - English",
],
],
inputs=[audio_input, text_input, language_input],
)


# Launch the demo (Gradio serves at http://localhost:7860 by default)
if __name__ == "__main__":
    demo.launch()