import gradio as gr
from faster_whisper import WhisperModel
import torch
import numpy as np

import os
import wave

def model_init(model_size: str = "large-v3"):
    """Create the faster-whisper model on the best available device.

    Args:
        model_size: Model name passed as the first argument to
            ``WhisperModel``. Defaults to ``"large-v3"``.

    Returns:
        A ``WhisperModel`` built with ``device="cuda"`` and
        ``compute_type="float16"`` when ``torch.cuda.is_available()`` is
        true, otherwise with ``device="cpu"`` and ``compute_type="int8"``.
    """
    if torch.cuda.is_available():
        # Run on GPU with FP16.
        # compute_type="int8_float16" is the INT8 alternative on GPU.
        model = WhisperModel(model_size, device="cuda", compute_type="float16")
        print("--------------")
        print("Model runs on GPU")
        print("--------------")
    else:
        # Run on CPU with INT8
        model = WhisperModel(model_size, device="cpu", compute_type="int8")

    return model

# Load the model once at import time; transcribe() and transcribe_moon()
# both read this shared module-level instance.
model = model_init()

import time

def transcribe_moon(stream, new_chunk):
    """Transcribe microphone audio accumulated across streaming callbacks.

    Each call appends the newest chunk to the running buffer and transcribes
    the whole buffer again.

    Args:
        stream: Audio accumulated by earlier calls (1-D float32 samples at
            16 kHz), or ``None`` on the first call.
        new_chunk: ``(sample_rate, samples)`` pair for the newest chunk, or
            ``None`` when no audio was delivered.

    Returns:
        ``(stream, transcript, language_info, execution_time)``: the updated
        buffer to carry into the next call, followed by three display
        strings. When ``new_chunk`` is ``None`` the buffer is returned
        untouched and the three strings are empty.
    """
    if new_chunk is None:
        # Nothing new to add; keep the buffer as it is and show no text.
        return stream, "", "", ""

    start_time = time.time()  # Start timing
    sr, y = new_chunk
    y = np.asarray(y, dtype=np.float32)

    if y.ndim > 1:
        # Down-mix to mono so the buffer stays a single 1-D waveform.
        # Assumes the (samples, channels) layout used by gr.Audio.
        y = y.mean(axis=1)

    # Peak-normalise, but skip empty or all-zero (silent) chunks: dividing by
    # a zero peak would fill the buffer with NaNs for the rest of the session.
    peak = float(np.max(np.abs(y))) if y.size else 0.0
    if peak > 0.0:
        y = y / peak

    # faster-whisper interprets a raw waveform as 16 kHz audio, while the
    # microphone delivers whatever rate `sr` reports, so resample first.
    # Linear interpolation keeps this dependency-free (no anti-alias filter).
    target_sr = 16000
    if sr != target_sr and y.size:
        n_out = max(1, int(round(y.size * target_sr / sr)))
        src_pos = np.arange(y.size) / sr
        dst_pos = np.arange(n_out) / target_sr
        y = np.interp(dst_pos, src_pos, y).astype(np.float32)

    if stream is not None:
        stream = np.concatenate([stream, y])
    else:
        stream = y

    # Decoding options are left at the library defaults here.
    segments, info = model.transcribe(stream)

    # Compile the transcript with timestamps
    transcript = "\n".join(
        f"[{seg.start:.2f}s -> {seg.end:.2f}s] {seg.text}" for seg in segments
    )
    language_info = f"Detected language: {info.language} with probability {info.language_probability:.2f}"

    end_time = time.time()  # End timing
    execution_time = f"Execution time: {end_time - start_time:.2f} seconds"  # Calculate execution time

    return stream, transcript, language_info, execution_time

def transcribe(audio_file):
    """Transcribe an audio file and report the detected language and timing.

    Args:
        audio_file: Audio input handed straight to ``model.transcribe``, or
            ``None`` when no audio is present.

    Returns:
        A ``(transcript, language_info, execution_time)`` tuple of strings.
        All three are empty when ``audio_file`` is ``None``.
    """
    if audio_file is None:
        # A live interface can invoke the callback with no audio (for example
        # after the input is cleared); there is nothing to transcribe.
        return "", "", ""

    start_time = time.time()  # Start timing

    # Perform the transcription using the specified model and settings
    segments, info = model.transcribe(
        audio_file,
        beam_size=5,
        vad_filter=True,
        vad_parameters={"min_silence_duration_ms": 500},
    )

    # Compile the transcript with timestamps. faster-whisper yields segments
    # lazily, so this join is where decoding runs; it must finish before the
    # timer is stopped.
    transcript = "\n".join(
        f"[{seg.start:.2f}s -> {seg.end:.2f}s] {seg.text}" for seg in segments
    )
    language_info = f"Detected language: {info.language} with probability {info.language_probability:.2f}"

    end_time = time.time()  # End timing
    execution_time = f"Execution time: {end_time - start_time:.2f} seconds"  # Calculate execution time

    return transcript, language_info, execution_time


# File tab: accepts an uploaded file or a microphone recording and passes
# `transcribe` a file path (type="filepath").
input_audio = gr.Audio(sources=["upload", "microphone"], type="filepath", label="Upload or Record Audio")

file_upload_interface = gr.Interface(
    fn=transcribe,
    inputs=input_audio,
    outputs=["text", "text", "text"],
    live=True,
    title="Speak to Transcript",
    description="Upload an audio file to transcribe and detect the spoken language."
)

# Live tab: streams microphone chunks into `transcribe_moon`. The "state"
# input/output pair carries the accumulated audio from one call to the next.
input_audio_mic = gr.Audio(sources=["microphone"], label="Record Audio", streaming=True)
streaming_interface = gr.Interface(
    transcribe_moon,
    # Reuse the labelled component above rather than building a second,
    # unlabelled microphone input inline.
    ["state", input_audio_mic],
    ["state", "text", "text", "text"],
    live=True,
)

# Combine both interfaces in a single Gradio app using Tabs
tabbed_interface = gr.TabbedInterface(
    interface_list=[file_upload_interface, streaming_interface],
    tab_names=["Upload File", "Live Stream"]
)

tabbed_interface.launch(debug=True)