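# Gradio demo for Konkani (gom) speech recognition using a Whisper-small model
# fine-tuned as "thak123/gom-stt-v3". Short clips (up to ~30 s) are transcribed
# directly with the model; longer clips fall back to the chunking ASR pipeline.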
import os

import gradio as gr
import librosa
from transformers import (
    WhisperForConditionalGeneration,
    WhisperProcessor,
    WhisperTokenizer,
    pipeline,
)

# Tokenizer from the base checkpoint; the fine-tuned model reuses its vocabulary.
tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-small", task="transcribe")

# ASR pipeline built on the fine-tuned Konkani checkpoint.
# (Earlier experiments: "thak123/whisper-small-LDC-V1", "thak123/whisper-small-gom".)
pipe = pipeline(
    model="thak123/gom-stt-v3",
    task="automatic-speech-recognition",
    tokenizer=tokenizer,
)

# pipe.model.config.forced_decoder_ids = (
#         pipe.tokenizer.get_decoder_prompt_ids(
#             language="marathi", task="transcribe"
#         )
#     )

# def transcribe_speech(filepath):
#     # waveform, sample_rate = torchaudio.load(filepath)

#     # Resample the audio signal to 16k sampling rate
#     # resampler = torchaudio.transforms.Resample(sample_rate, 16000)
#     # waveform_16k = resampler(waveform)

#     # Save the resampled audio signal to a new file
#     # torchaudio.save(filepath, waveform_16k, 16000)    
#     output = pipe(
#         filepath,
#         max_new_tokens=3,
#         generate_kwargs={
#             "task": "transcribe",
#             # "language": "konkani",
#         },  # update with the language you've fine-tuned on
#         chunk_length_s=30,
#         batch_size=8,
#          # sampling_rate=16000,
#         # padding=True
#     )
#     print(output)
#     return output["text"]


def transcribe_speech(filepath):
    """Transcribe a single audio file and return the recognised text."""
    # Load the fine-tuned model and the base processor (re-loaded on each call, as
    # in the original script; they could also be created once at module level).
    model = WhisperForConditionalGeneration.from_pretrained("thak123/gom-stt-v3")
    processor = WhisperProcessor.from_pretrained("openai/whisper-small")

    # Load the audio and resample it to the 16 kHz rate Whisper expects.
    audio, _ = librosa.load(filepath, sr=16000)
    input_features = processor(
        audio,
        sampling_rate=16000,
        return_tensors="pt",
        truncation=False,
        padding="max_length",
    ).input_features

    # Whisper's encoder takes at most 3000 mel frames (about 30 s of audio).
    if input_features.shape[-1] > 3000:
        # Longer recordings: fall back to the chunking pipeline, which splits the
        # audio into 30 s windows and stitches the per-chunk transcriptions together.
        print("Splitting audio required")
        result = pipe(filepath, chunk_length_s=30, batch_size=8)
        output = result["text"]
    else:
        # Short recordings: run the model directly on the padded features.
        predicted_ids = model.generate(input_features)
        transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
        output = transcription[0]
        print(transcription)

    return output
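
# Quick local sanity check (hypothetical invocation; assumes one of the bundled
# example clips from the audio/ folder below sits next to this script):
# print(transcribe_speech("audio/ekdonteen.flac"))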
    
demo = gr.Blocks()

# Tab 1: record speech from the microphone and transcribe it.
mic_transcribe = gr.Interface(
    fn=transcribe_speech,
    inputs=gr.Audio(sources=["microphone"], type="filepath"),
    outputs=gr.components.Textbox(),
)

# Tab 2: upload an audio file; a few bundled Konkani clips are provided as examples.
file_transcribe = gr.Interface(
    fn=transcribe_speech,
    inputs=gr.Audio(sources=["upload"], type="filepath"),
    outputs=gr.components.Textbox(),
    examples=[
        [os.path.join(os.path.dirname("."), "audio/chalyaami.mp3")],
        [os.path.join(os.path.dirname("."), "audio/ekdonteen.flac")],
        [os.path.join(os.path.dirname("."), "audio/heyatachadjaale.mp3")],
        [os.path.join(os.path.dirname("."), "audio/panaji1920-9.mp3")],
    ],
)

with demo:
    gr.TabbedInterface(
        [mic_transcribe, file_transcribe],
        ["Transcribe Microphone", "Transcribe Audio File"],
    )

demo.launch(debug=True)
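
# When run locally (e.g. `python app.py`), Gradio prints a local URL to open in the
# browser; debug=True keeps the process in the foreground and surfaces tracebacks
# from transcribe_speech in the console. If deployed as a Hugging Face Space, the
# same script is picked up and served automatically (assumed deployment target).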

# # def transcribe(audio):
# #     # text = pipe(audio)["text"]
# #     # pipe(audio)
# #     text = pipe(audio)
# #     print("op",text)
# #     return text#pipe(audio) #text

# # iface = gr.Interface(
# #     fn=transcribe, 
# #     inputs=[gr.Audio(sources=["microphone", "upload"])], 
# #     outputs="text",
# #     examples=[
# #         [os.path.join(os.path.dirname("."),"audio/chalyaami.mp3")],
# #         [os.path.join(os.path.dirname("."),"audio/ekdonteen.flac")],
# #         [os.path.join(os.path.dirname("."),"audio/heyatachadjaale.mp3")],
# #     ],
# #     title="Whisper Konkani",
# #     description="Realtime demo for Konkani speech recognition using a fine-tuned Whisper small model.",
# # )


# # iface.launch()


# from transformers import WhisperTokenizer, pipeline
# import gradio as gr
# import os

# tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-small", language="marathi", task="transcribe")

# pipe = pipeline(model="thak123/gom-stt-v3", task="automatic-speech-recognition", tokenizer=tokenizer)

# def transcribe(audio):
#     result = pipe(audio)
#     text = result[0]['text']
#     print("op", text)
#     return text

# iface = gr.Interface(
#     fn=transcribe,
#     inputs=[gr.Audio(sources=["microphone", "upload"])],
#     outputs="text",
#     examples=[
#         [os.path.join(os.path.dirname("."), "audio/chalyaami.mp3")],
#         [os.path.join(os.path.dirname("."), "audio/ekdonteen.flac")],
#         [os.path.join(os.path.dirname("."), "audio/heyatachadjaale.mp3")],
#     ],
#     title="Whisper Konkani",
#     description="Realtime demo for Konkani speech recognition using a fine-tuned Whisper small model.",
# )

# iface.launch()