import os
import time
from pathlib import Path

import gradio as gr
import pandas as pd
import pysrt

# On first start-up, clone and build whisper.cpp (pinned to a known commit)
# and download the Finnish fine-tuned Whisper models in ggml format.
if os.path.isdir(os.path.join(os.getcwd(), 'whisper.cpp')):
    print("Models already loaded")
else:
    os.system('git clone https://github.com/ggerganov/whisper.cpp.git')
    os.system('git -C ./whisper.cpp reset --hard 3163090d89c47933d7c2a080b224f0d2e842b468')
    os.system('git clone https://huggingface.co/Finnish-NLP/Finnish-finetuned-whisper-models-ggml-format')
    os.system('make -C ./whisper.cpp')

whisper_models = ["medium", "large"]
whisper_modelpath_translator = {
    "medium": "./Finnish-finetuned-whisper-models-ggml-format/ggml-model-fi-medium.bin",
    "large": "./Finnish-finetuned-whisper-models-ggml-format/ggml-model-fi-large-v3.bin"
}


def speech_to_text(audio_path, whisper_model):
    # Gradio can hand over the recording a moment late, so poll a few times
    # before giving up on a missing audio path.
    if audio_path is None:
        for retry_cnt in range(3):
            if audio_path is not None:
                break
            print(f'Retrying, retry counter: {retry_cnt + 1}')
            time.sleep(0.5)
        else:
            raise ValueError("Error: no audio input")
    print(audio_path)

    # Convert the input to 16 kHz mono 16-bit PCM WAV, the format whisper.cpp
    # expects, retrying a few times on transient failures.
    try:
        for retry_cnt in range(3):
            try:
                _, file_ending = os.path.splitext(f'{audio_path}')
                print(f'file ending is {file_ending}')
                print("starting conversion to wav")
                new_path = audio_path.replace(file_ending, "_converted.wav")
                os.system(f'ffmpeg -i "{audio_path}" -ar 16000 -y -ac 1 -c:a pcm_s16le "{new_path}"')
                print("conversion to wav ready")
                break
            except Exception:
                time.sleep(0.5)
        else:
            raise RuntimeError("Conversion to wav failed")
    except Exception as e:
        raise RuntimeError(f'Error running inference with local model: {e}') from e

    # Run the whisper.cpp CLI: -osrt writes an SRT file next to the input,
    # -l fi forces Finnish decoding.
    try:
        print("starting whisper c++")
        srt_path = new_path + ".srt"
        os.system(f'rm -f {srt_path}')
        os.system(f'./whisper.cpp/main "{new_path}" -t 4 -m {whisper_modelpath_translator.get(whisper_model)} -osrt -l fi')
        print("done with whisper")
    except Exception as e:
        raise RuntimeError(f'Error running Whisper cpp model: {e}') from e

    # Parse the generated SRT into a dataframe of zero-padded HH:MM:SS.mmm
    # start/end timestamps and the transcribed text.
    try:
        subs = pysrt.open(srt_path)
        rows = []
        for sub in subs:
            start = f'{sub.start.hours:02d}:{sub.start.minutes:02d}:{sub.start.seconds:02d}.{sub.start.milliseconds:03d}'
            end = f'{sub.end.hours:02d}:{sub.end.minutes:02d}:{sub.end.seconds:02d}.{sub.end.milliseconds:03d}'
            rows.append({'start': start, 'end': end, 'text': sub.text})
        df = pd.DataFrame(rows, columns=['start', 'end', 'text'])
    except Exception as e:
        print(f"Error creating srt df with error: {e}")
        df = pd.DataFrame(columns=['start', 'end', 'text'])
    return df


def output_to_files(df):
    df.reset_index(inplace=True)
    print(df.head())

    # WEBVTT output: a "WEBVTT" header, a blank line, then one numbered cue
    # per dataframe row with "start --> end" timestamps.
    with open('subtitles.vtt', 'w', encoding="utf-8") as file:
        print("Starting WEBVTT-file creation")
        for i in range(len(df)):
            if i == 0:
                # Write the header before the first cue; the header line must
                # be followed by a blank line.
                file.write('WEBVTT')
                file.write('\n\n')
            file.write(str(i + 1))
            file.write('\n')
            start = df.iloc[i]['start']
            file.write(f"{start.strip()}")
            stop = df.iloc[i]['end']
            file.write(' --> ')
            file.write(f"{stop}")
            file.write('\n')
            file.writelines(df.iloc[i]['text'])
            if int(i) != len(df) - 1:
                file.write('\n\n')
        print("WEBVTT DONE")

    # SRT output: same cue layout, but SRT timestamps use a comma before the
    # milliseconds, so convert the stored HH:MM:SS.mmm values on the way out.
    with open('subtitles.srt', 'w', encoding="utf-8") as file:
        print("Starting SRT-file creation")
        for i in range(len(df)):
            file.write(str(i + 1))
            file.write('\n')
            start = df.iloc[i]['start']
            file.write(f"{start.strip().replace('.', ',')}")
            stop = df.iloc[i]['end']
            file.write(' --> ')
            file.write(f"{stop.replace('.', ',')}")
            file.write('\n')
            file.writelines(df.iloc[i]['text'])
            if int(i) != len(df) - 1:
                file.write('\n\n')
        print("SRT DONE")

    subtitle_files_out = ['subtitles.vtt', 'subtitles.srt']
    return subtitle_files_out


# ---- Gradio Layout -----
demo = gr.Blocks(css='''
    #cut_btn, #reset_btn { align-self: stretch; }
    #\\31 3 { max-width: 540px; }
    .output-markdown { max-width: 65ch !important; }
''')
demo.encrypt = False

with demo:
    with gr.Row():
        with gr.Column():
            gr.Markdown('''
# Simple Finnish Audio --> Text app

### This space allows you to:
1. Insert an audio file or record with the microphone
2. Run the audio through the transcription process using the speech recognition models
3. Download the generated transcriptions in .vtt and .srt formats
''')

    with gr.Row():
        with gr.Column():
            audio_in = gr.Audio(label="Audio file", type='filepath')
            transcribe_btn = gr.Button("Step 1. Transcribe audio")
            selected_whisper_model = gr.Dropdown(choices=whisper_models,
                                                 type="value",
                                                 value="large",
                                                 label="Selected Whisper model",
                                                 interactive=True)

    with gr.Row():
        with gr.Column():
            transcription_df = gr.DataFrame(headers=['start', 'end', 'text'],
                                            label="Transcription dataframe")

    with gr.Row():
        with gr.Column():
            translate_transcriptions_button = gr.Button("Step 2. Create subtitle files")

    with gr.Row():
        with gr.Column():
            gr.Markdown('''##### From here you can download subtitles in .srt or .vtt format''')
            subtitle_files = gr.File(
                label="Download files",
                file_count="multiple",
                type="filepath",
                interactive=False,
            )

    # Wire the buttons to the two processing steps.
    transcribe_btn.click(speech_to_text,
                         [audio_in, selected_whisper_model],
                         [transcription_df])
    translate_transcriptions_button.click(output_to_files,
                                          transcription_df,
                                          [subtitle_files])

demo.launch()
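
# A minimal usage sketch (not part of the app) for driving the same two-step
# pipeline without the Gradio UI, assuming whisper.cpp has already been built,
# the ggml models downloaded, and a local recording named "sample.mp3"
# (hypothetical file) sits next to this script:
#
#   df = speech_to_text("sample.mp3", "large")
#   subtitle_paths = output_to_files(df)   # ['subtitles.vtt', 'subtitles.srt']
#   print(df.head(), subtitle_paths)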