# Whisper Konkani: realtime demo for Konkani speech recognition using a fine-tuned Whisper small model.

from transformers import WhisperTokenizer, WhisperProcessor, WhisperForConditionalGeneration, pipeline
import gradio as gr
import torch
import torchaudio
import librosa
import os

# Tokenizer from the base checkpoint; the language can also be pinned here,
# e.g. WhisperTokenizer.from_pretrained("openai/whisper-small", language="marathi", task="transcribe").
tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-small", task="transcribe")

# Fine-tuned Konkani model (earlier candidates: "thak123/whisper-small-LDC-V1", "thak123/whisper-small-gom").
pipe = pipeline(
    model="thak123/gom-stt-v3",
    task="automatic-speech-recognition",
    tokenizer=tokenizer,
)

# To force the decoder prompt for a specific language/task, uncomment:
# pipe.model.config.forced_decoder_ids = pipe.tokenizer.get_decoder_prompt_ids(
#     language="marathi", task="transcribe"
# )

# Model and processor for single-pass decoding of short clips, loaded once at
# start-up instead of inside every call.
model = WhisperForConditionalGeneration.from_pretrained("thak123/gom-stt-v3")
processor = WhisperProcessor.from_pretrained("openai/whisper-small")


def transcribe_speech(filepath):
    """Transcribe an audio file; recordings longer than 30 seconds are split into chunks."""
    output = ""

    # Load the audio and resample it to Whisper's expected 16 kHz.
    audio, sr = librosa.load(filepath, sr=16000)
    input_features = processor(
        audio,
        sampling_rate=16000,
        return_tensors="pt",
        truncation=False,
        padding="max_length",
    ).input_features

    # 3000 log-mel frames correspond to 30 seconds, Whisper's maximum context.
    if input_features.shape[-1] > 3000:
        # Long recording: split into 30-second chunks with pydub and
        # transcribe each chunk with the pipeline.
        from pydub import AudioSegment

        def split_audio(file_path, chunk_length_ms=30000):
            segment = AudioSegment.from_file(file_path)
            return [
                segment[i:i + chunk_length_ms]
                for i in range(0, len(segment), chunk_length_ms)
            ]

        for i, chunk in enumerate(split_audio(filepath)):
            chunk_path = f"chunk_{i}.wav"
            chunk.export(chunk_path, format="wav")
            result = pipe(chunk_path)
            output += result["text"] + " "
            print(f"Chunk {i}: {result['text']}")
        output = output.strip()
    else:
        # Short recording: a single forward pass is enough.
        predicted_ids = model.generate(input_features)
        transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
        output = transcription[0]
        print(transcription)

    return output


demo = gr.Blocks()

mic_transcribe = gr.Interface(
    fn=transcribe_speech,
    inputs=gr.Audio(sources="microphone", type="filepath"),
    outputs=gr.components.Textbox(),
)

file_transcribe = gr.Interface(
    fn=transcribe_speech,
    inputs=gr.Audio(sources="upload", type="filepath"),
    outputs=gr.components.Textbox(),
    examples=[
        [os.path.join(os.path.dirname("."), "audio/chalyaami.mp3")],
        [os.path.join(os.path.dirname("."), "audio/ekdonteen.flac")],
        [os.path.join(os.path.dirname("."), "audio/heyatachadjaale.mp3")],
        [os.path.join(os.path.dirname("."), "audio/panaji1920-9.mp3")],
    ],
)

with demo:
    gr.TabbedInterface(
        [mic_transcribe, file_transcribe],
        ["Transcribe Microphone", "Transcribe Audio File"],
        title="Whisper Konkani",
    )

demo.launch(debug=True)
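# Alternative long-audio handling (a sketch, not used above): the ASR pipeline can
# chunk long recordings itself via chunk_length_s, so the manual pydub splitting in
# transcribe_speech could be replaced with a single call. The function name below is
# hypothetical, and the parameter values (chunk_length_s=30, batch_size=8) follow the
# earlier commented-out experiments with this model rather than tuned settings.
#
# def transcribe_speech_chunked(filepath):
#     result = pipe(
#         filepath,
#         chunk_length_s=30,   # let the pipeline split audio into 30 s windows
#         batch_size=8,        # decode several windows per forward pass
#         generate_kwargs={"task": "transcribe"},
#     )
#     return result["text"]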