Spaces:
Running
Running
import numpy as np | |
import streamlit as st | |
from constants import WHISPER_MODELS, language_dict | |
import streamlit as st | |
from utils import ( | |
translate_to_english, | |
detect_language, | |
write, | |
read, | |
get_key, | |
) | |
import subprocess | |
import whisperx as whisper | |
import json | |
import pandas as pd | |
from pydub import AudioSegment | |
import os | |
import uuid | |
# Seed the two button flags once per browser session; values already stored in
# the session state are left untouched on reruns.
for _flag in ("btn1", "btn2"):
    if _flag not in st.session_state:
        st.session_state[_flag] = False
class ByteEncoder(json.JSONEncoder):
    """JSON encoder that serialises ``bytes`` values as hexadecimal strings.

    Any other object the stock encoder cannot handle is delegated to the
    parent class, which raises ``TypeError``.
    """

    def default(self, obj):
        """Return the hex string for ``bytes``; defer everything else.

        Raises:
            TypeError: from the parent encoder when ``obj`` is not
                JSON-serialisable.
        """
        if isinstance(obj, bytes):
            return obj.hex()
        # super() keeps the fallback correct if this class is ever used as a
        # mixin or subclassed, unlike a hard-coded JSONEncoder.default call.
        return super().default(obj)
def disable_btn2():
    """Set the ``btn2`` flag in Streamlit's session state to ``True``."""
    flags = st.session_state
    flags["btn2"] = True
def disable_btn1():
    """Set the ``btn1`` flag in Streamlit's session state to ``True``."""
    flags = st.session_state
    flags["btn1"] = True
st.set_page_config(page_title="Whisper-X", layout="wide")

import torch

# Device string handed to the alignment model further down. torch names NVIDIA
# GPUs "cuda"; "gpu" is not a device type torch recognises and would fail as
# soon as a model is moved onto it.
device = "cuda" if torch.cuda.is_available() else "cpu"
# Two-column layout: request widgets on the left, results on the right.
# The left handle is named `input_col` so the builtin `input` is not shadowed;
# `output` is consumed by the output section that follows.
input_col, output = st.columns(2, gap="medium")

with input_col:
    st.header("Input")

    # Bundled sample clip. The handle is deliberately left open: the submit
    # handler falls back to `audio_file` when nothing has been uploaded.
    audio_file = open("audio.wav", "rb")
    audio_bytes = audio_file.read()
    st.audio(audio_bytes, format="audio/wav")

    # NOTE: the btn1/btn2 session flags and the disable_btn* callbacks are
    # currently not wired to either uploader.
    audio_uploaded = st.file_uploader(
        label="Upload your file",
        type=["mp3", "wav"],
        help="Your input file",
    )
    text_json = st.file_uploader(
        label="Aligned JSON",
        type=["json"],
        help="Your aligned json file (Only if you need to skip transcribe)",
    )

    model_name = st.selectbox(
        label="Choose your model",
        options=WHISPER_MODELS,
        help="Choose a Whisper model.",
    )
    model_name = "base" if model_name == "" else model_name

    transcription = st.selectbox(
        "transcription",
        options=["plain text", "srt", "vtt", "ass", "tsv"],
        help="Choose the format for the transcription",
    )
    translate = st.checkbox(
        "translate", help="Translate the text to English when set to True"
    )
    language = st.selectbox(
        label="language",
        options=list(language_dict.keys()) + list(language_dict.values()),
        # The previous help text was a copy of the `translate` checkbox's.
        # An empty selection triggers language detection in the submit handler.
        help="Language spoken in the audio; leave empty to detect it automatically",
    )
    patience = st.number_input(
        label="patience",
        step=0.01,
        value=1.0,
        help="optional patience value to use in beam decoding, as in https://arxiv.org/abs/2204.05424, the default (1.0) is equivalent to conventional beam search",
    )
    temperature = st.number_input(
        label="temperature",
        step=0.01,
        value=1.0,
        help="temperature to use for sampling",
    )
    suppress_tokens = st.text_input(
        "suppress_tokens",
        value="-1",
        help="comma-separated list of token ids to suppress during sampling; '-1' will suppress most special characters except common punctuations",
    )
    initial_prompt = st.text_area(
        label="initial_prompt",
        help="optional text to provide as a prompt for the first window.",
    )
    condition_on_previous_text = st.checkbox(
        "condition_on_previous_text",
        help="if True, provide the previous output of the model as a prompt for the next window; disabling may make the text inconsistent across windows, but the model becomes less prone to getting stuck in a failure loop",
    )
    temperature_increment_on_fallback = st.number_input(
        label="temperature_increment_on_fallback",
        step=0.01,
        value=0.2,
        help="temperature to increase when falling back when the decoding fails to meet either of the thresholds below",
    )
    compression_ratio_threshold = st.number_input(
        label="compression_ratio_threshold",
        value=2.4,
        step=0.01,
        help="if the gzip compression ratio is higher than this value, treat the decoding as failed",
    )
    logprob_threshold = st.number_input(
        label="logprob_threshold",
        value=-1.0,
        step=0.01,
        help="if the average log probability is lower than this value, treat the decoding as failed",
    )
    no_speech_threshold = st.number_input(
        label="no_speech_threshold",
        value=0.6,
        step=0.01,
        help="if the probability of the <|nospeech|> token is higher than this value AND the decoding has failed due to `logprob_threshold`, consider the segment as silence",
    )

    # Build the fallback temperature schedule: start, start + inc, ... up to 1.0.
    # The widget always returns a number, so the old `is not None` test never
    # selected the single-temperature branch; a zero step then made np.arange
    # raise, and a start above 1.0 produced an empty schedule.
    start_temperature = temperature
    if temperature_increment_on_fallback > 0:
        temperature = tuple(
            np.arange(
                start_temperature, 1.0 + 1e-6, temperature_increment_on_fallback
            )
        )
    else:
        temperature = ()
    if not temperature:
        # No usable schedule: decode once at the requested temperature.
        temperature = (start_temperature,)

    submit = st.button("Submit", type="primary")
with output:
    st.header("Output")
    # Empty placeholders, created in on-screen order; the submit handler fills
    # them once results are available.
    (
        segments_pre,
        segments_post,
        segments_post_json,
        segments_post2,
        trans,
        lang,
    ) = (st.empty() for _ in range(6))
def _tables_to_records(aligned_segments):
    """Replace each segment's alignment tables with lists of plain records.

    Every segment's "word-segments" / "char-segments" entry is rewritten in
    place as ``{key: [record, ...]}`` with missing values turned into "".
    """
    for segment in aligned_segments:
        for key in ("word-segments", "char-segments"):
            # presumably pandas DataFrames (they expose fillna/to_dict) -
            # confirm against the installed whisperx version
            segment[key] = {key: segment[key].fillna("").to_dict(orient="records")}


def _align_and_render(segments, language_code, shown_language, audio_path):
    """Align ``segments`` against the audio and fill the output placeholders.

    Args:
        segments: transcription segments, either freshly transcribed or
            loaded from the uploaded JSON file.
        language_code: language code used to pick the alignment model.
        shown_language: key looked up in ``language_dict`` for display.
        audio_path: path of the temporary audio file on disk.
    """
    with st.spinner("Running alignment model ..."):
        model_a, metadata = whisper.load_align_model(
            language_code=language_code, device=device
        )
        result_aligned = whisper.align(
            segments,
            model_a,
            metadata,
            audio_path,
            device=device,
        )

    # The transcript file is written before the tables are converted, exactly
    # as before, so `write` still receives the aligner's native output.
    write(audio_path, dtype=transcription, result_aligned=result_aligned)
    trans_text = read(audio_path, transcription)

    _tables_to_records(result_aligned["segments"])

    trans.text_area(
        "transcription", trans_text, height=None, max_chars=None, key=None
    )
    segments_pre.text_area(
        "Segments before alignment",
        segments,
        height=None,
        max_chars=None,
        key=None,
    )
    segments_post.text_area(
        "Word Segments after alignment",
        result_aligned["word_segments"],
        height=None,
        max_chars=None,
        key=None,
    )
    # The JSON path used to pass `expanded=False` here, which text_area does
    # not accept and which raised a TypeError.
    segments_post2.text_area(
        "Segments after alignment",
        result_aligned["segments"],
        height=None,
        max_chars=None,
        key=None,
    )
    lang.text_input(
        "detected language", language_dict.get(shown_language), disabled=True
    )


# Unique stem for this run's temporary audio file.
name = str(uuid.uuid1())

if submit:
    if audio_uploaded is None:
        # Nothing uploaded: fall back to the bundled sample clip.
        audio_uploaded = audio_file

    audio_path = None
    try:
        lowered_name = audio_uploaded.name.lower()
        if lowered_name.endswith(".wav"):
            audio_path = f"{name}.wav"
            # The sample clip's handle has already been read to the end.
            audio_uploaded.seek(0)
            # export() defaults to format="mp3"; request WAV explicitly so the
            # file content matches its extension.
            AudioSegment.from_wav(audio_uploaded).export(audio_path, format="wav")
        elif lowered_name.endswith(".mp3"):
            audio_path = f"{name}.mp3"
            with open(audio_path, "wb") as f:
                f.write(audio_uploaded.getbuffer())
        else:
            st.error("Unsupported audio format: please upload an mp3 or wav file.")
            st.stop()

        if language == "":
            # No language chosen: let the model detect it.
            model = whisper.load_model(model_name)
            with st.spinner("Detecting language..."):
                detection = detect_language(audio_path, model)
                language = detection.get("detected_language")
            del model
        if len(language) > 2:
            # Anything longer than a two-letter code is mapped back via get_key.
            language = get_key(language)

        if text_json is None:
            with st.spinner("Running ... "):
                decode = {"suppress_tokens": suppress_tokens, "beam_size": 5}
                model = whisper.load_model(model_name)
                with st.container(), st.spinner(f"Running with {model_name} model"):
                    result = model.transcribe(
                        audio_path,
                        language=language,
                        patience=patience,
                        initial_prompt=initial_prompt,
                        condition_on_previous_text=condition_on_previous_text,
                        temperature=temperature,
                        compression_ratio_threshold=compression_ratio_threshold,
                        logprob_threshold=logprob_threshold,
                        no_speech_threshold=no_speech_threshold,
                        **decode,
                    )
            if translate:
                result = translate_to_english(result, json=False)
            with open("transcription.json", "w") as f:
                json.dump(result["segments"], f, indent=4, cls=ByteEncoder)
            _align_and_render(
                result["segments"], result["language"], language, audio_path
            )
        else:
            # Pre-transcribed segments were supplied: skip transcription. The
            # unused model load and the temp-file round trip of the JSON that
            # used to happen here are gone; the parsed data is used directly.
            with st.spinner("Running ... "):
                cont = json.load(text_json)
                text_json.close()
            _align_and_render(cont, language, language, audio_path)
    finally:
        # Remove the temp file actually created (it may be .mp3, not .wav),
        # including when processing fails part-way.
        if audio_path is not None and os.path.exists(audio_path):
            os.remove(audio_path)