# Whisper Konkani: realtime demo for Konkani speech recognition using a fine-tuned Whisper small model.

from transformers import WhisperTokenizer, WhisperProcessor, WhisperForConditionalGeneration, pipeline
import gradio as gr
import torch
import torchaudio
import librosa
import os

# Tokenizer from the base checkpoint; the language can also be pinned here,
# e.g. WhisperTokenizer.from_pretrained("openai/whisper-small", language="marathi", task="transcribe").
tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-small", task="transcribe")

# Fine-tuned Konkani model (earlier candidates: "thak123/whisper-small-LDC-V1", "thak123/whisper-small-gom").
pipe = pipeline(
    model="thak123/gom-stt-v3",
    task="automatic-speech-recognition",
    tokenizer=tokenizer,
)

# To force the decoder prompt for a specific language/task, uncomment:
# pipe.model.config.forced_decoder_ids = pipe.tokenizer.get_decoder_prompt_ids(
#     language="marathi", task="transcribe"
# )

# Model and processor for single-pass decoding of short clips, loaded once at
# start-up instead of inside every call.
model = WhisperForConditionalGeneration.from_pretrained("thak123/gom-stt-v3")
processor = WhisperProcessor.from_pretrained("openai/whisper-small")


def transcribe_speech(filepath):
    """Transcribe an audio file; recordings longer than 30 seconds are split into chunks."""
    output = ""

    # Load the audio and resample it to Whisper's expected 16 kHz.
    audio, sr = librosa.load(filepath, sr=16000)
    input_features = processor(
        audio,
        sampling_rate=16000,
        return_tensors="pt",
        truncation=False,
        padding="max_length",
    ).input_features

    # 3000 log-mel frames correspond to 30 seconds, Whisper's maximum context.
    if input_features.shape[-1] > 3000:
        # Long recording: split into 30-second chunks with pydub and
        # transcribe each chunk with the pipeline.
        from pydub import AudioSegment

        def split_audio(file_path, chunk_length_ms=30000):
            segment = AudioSegment.from_file(file_path)
            return [
                segment[i:i + chunk_length_ms]
                for i in range(0, len(segment), chunk_length_ms)
            ]

        for i, chunk in enumerate(split_audio(filepath)):
            chunk_path = f"chunk_{i}.wav"
            chunk.export(chunk_path, format="wav")
            result = pipe(chunk_path)
            output += result["text"] + " "
            print(f"Chunk {i}: {result['text']}")
        output = output.strip()
    else:
        # Short recording: a single forward pass is enough.
        predicted_ids = model.generate(input_features)
        transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
        output = transcription[0]
        print(transcription)

    return output


demo = gr.Blocks()

mic_transcribe = gr.Interface(
    fn=transcribe_speech,
    inputs=gr.Audio(sources="microphone", type="filepath"),
    outputs=gr.components.Textbox(),
)

file_transcribe = gr.Interface(
    fn=transcribe_speech,
    inputs=gr.Audio(sources="upload", type="filepath"),
    outputs=gr.components.Textbox(),
    examples=[
        [os.path.join(os.path.dirname("."), "audio/chalyaami.mp3")],
        [os.path.join(os.path.dirname("."), "audio/ekdonteen.flac")],
        [os.path.join(os.path.dirname("."), "audio/heyatachadjaale.mp3")],
        [os.path.join(os.path.dirname("."), "audio/panaji1920-9.mp3")],
    ],
)

with demo:
    gr.TabbedInterface(
        [mic_transcribe, file_transcribe],
        ["Transcribe Microphone", "Transcribe Audio File"],
        title="Whisper Konkani",
    )

demo.launch(debug=True)
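# Alternative long-audio handling (a sketch, not used above): the ASR pipeline can
# chunk long recordings itself via chunk_length_s, so the manual pydub splitting in
# transcribe_speech could be replaced with a single call. The function name below is
# hypothetical, and the parameter values (chunk_length_s=30, batch_size=8) follow the
# earlier commented-out experiments with this model rather than tuned settings.
#
# def transcribe_speech_chunked(filepath):
#     result = pipe(
#         filepath,
#         chunk_length_s=30,   # let the pipeline split audio into 30 s windows
#         batch_size=8,        # decode several windows per forward pass
#         generate_kwargs={"task": "transcribe"},
#     )
#     return result["text"]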