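# app.py: Gradio Space demo for Konkani ("gom") speech recognition,
# built around the fine-tuned Whisper-small model thak123/gom-stt-v3.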
import os

import gradio as gr
import torch
import torchaudio
from transformers import WhisperTokenizer, pipeline

# Reuse the tokenizer of the base checkpoint the model was fine-tuned from.
tokenizer = WhisperTokenizer.from_pretrained(
    "openai/whisper-small", task="transcribe"
)  # language="marathi" can also be passed here

# ASR pipeline around the fine-tuned Konkani model; also used below to
# transcribe long recordings chunk by chunk.
pipe = pipeline(
    task="automatic-speech-recognition",
    model="thak123/gom-stt-v3",  # alternatives tried: thak123/whisper-small-LDC-V1, thak123/whisper-small-gom
    tokenizer=tokenizer,
)
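# Quick sanity check (hypothetical; assumes the bundled sample clip is present):
#   print(pipe("audio/chalyaami.mp3")["text"])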
# pipe.model.config.forced_decoder_ids = (
# pipe.tokenizer.get_decoder_prompt_ids(
# language="marathi", task="transcribe"
# )
# )
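# The original pipeline-based version of transcribe_speech, left commented out for reference: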
# def transcribe_speech(filepath):
# # waveform, sample_rate = torchaudio.load(filepath)
# # Resample the audio signal to 16k sampling rate
# # resampler = torchaudio.transforms.Resample(sample_rate, 16000)
# # waveform_16k = resampler(waveform)
# # Save the resampled audio signal to a new file
# # torchaudio.save(filepath, waveform_16k, 16000)
# output = pipe(
# filepath,
# max_new_tokens=3,
# generate_kwargs={
# "task": "transcribe",
# # "language": "konkani",
# }, # update with the language you've fine-tuned on
# chunk_length_s=30,
# batch_size=8,
# # sampling_rate=16000,
# # padding=True
# )
# print(output)
# return output["text"]
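# Active implementation: clips that fit Whisper's 30-second window are decoded
# directly with model.generate(); longer clips are split into 30-second chunks
# and transcribed with the pipeline defined above.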
def transcribe_speech(filepath):
    from transformers import WhisperProcessor, WhisperForConditionalGeneration
    import librosa

    # Load model and processor (reloaded on every call; could be hoisted to module level).
    model = WhisperForConditionalGeneration.from_pretrained("thak123/gom-stt-v3")
    processor = WhisperProcessor.from_pretrained("openai/whisper-small")

    output = ""

    # Load the audio and resample to the 16 kHz rate Whisper expects.
    audio, sr = librosa.load(filepath, sr=16000)
    input_features = processor(
        audio,
        sampling_rate=16000,
        return_tensors="pt",
        truncation=False,
        padding="max_length",
    ).input_features

    # Whisper's encoder accepts at most 3000 feature frames (about 30 seconds of audio).
    if input_features.shape[-1] > 3000:
        print("Splitting audio required")
        from pydub import AudioSegment

        def split_audio(file_path, chunk_length_ms=30000):  # 30-second chunks
            segment = AudioSegment.from_file(file_path)
            return [segment[i:i + chunk_length_ms] for i in range(0, len(segment), chunk_length_ms)]

        # Split, transcribe each chunk with the pipeline, and concatenate the text.
        for i, chunk in enumerate(split_audio(filepath)):
            chunk_path = f"chunk_{i}.wav"
            chunk.export(chunk_path, format="wav")
            result = pipe(chunk_path)
            output += result["text"] + " "
            print(f"Chunk {i}: {result['text']}")
        output = output.strip()
    else:
        predicted_ids = model.generate(input_features)
        transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
        output = transcription[0]  # batch_decode returns a list of strings
        print(transcription)

    return output
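# Example (hypothetical local check): print(transcribe_speech("audio/ekdonteen.flac"))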
# Build the UI: one tab for microphone input, one for uploaded files.
demo = gr.Blocks()

mic_transcribe = gr.Interface(
    fn=transcribe_speech,
    inputs=gr.Audio(sources="microphone", type="filepath"),
    outputs=gr.components.Textbox(),
)

file_transcribe = gr.Interface(
    fn=transcribe_speech,
    inputs=gr.Audio(sources="upload", type="filepath"),
    outputs=gr.components.Textbox(),
    examples=[
        [os.path.join(os.path.dirname("."), "audio/chalyaami.mp3")],
        [os.path.join(os.path.dirname("."), "audio/ekdonteen.flac")],
        [os.path.join(os.path.dirname("."), "audio/heyatachadjaale.mp3")],
        [os.path.join(os.path.dirname("."), "audio/panaji1920-9.mp3")],
    ],
)

with demo:
    gr.TabbedInterface(
        [mic_transcribe, file_transcribe],
        ["Transcribe Microphone", "Transcribe Audio File"],
    )

demo.launch(debug=True)
# Earlier, single-interface version of this Space (combined microphone/upload
# input through the ASR pipeline), kept commented out for reference.
# from transformers import WhisperTokenizer, pipeline
# import gradio as gr
# import os
#
# tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-small", language="marathi", task="transcribe")
# pipe = pipeline(model="thak123/gom-stt-v3", task="automatic-speech-recognition", tokenizer=tokenizer)
#
# def transcribe(audio):
#     result = pipe(audio)
#     text = result["text"]  # the ASR pipeline returns a dict with a "text" key
#     print("op", text)
#     return text
#
# iface = gr.Interface(
#     fn=transcribe,
#     inputs=[gr.Audio(sources=["microphone", "upload"])],
#     outputs="text",
#     examples=[
#         [os.path.join(os.path.dirname("."), "audio/chalyaami.mp3")],
#         [os.path.join(os.path.dirname("."), "audio/ekdonteen.flac")],
#         [os.path.join(os.path.dirname("."), "audio/heyatachadjaale.mp3")],
#     ],
#     title="Whisper Konkani",
#     description="Realtime demo for Konkani speech recognition using a fine-tuned Whisper small model.",
# )
# iface.launch()