import json

import gradio as gr
import librosa
import numpy as np
import spaces
import torch
from accelerate.utils.imports import is_cuda_available
from ctc_forced_aligner import (
    generate_emissions,
    get_alignments,
    get_spans,
    load_alignment_model,
    postprocess_results,
    preprocess_text,
)
from iso639 import iter_langs

# Use GPU with half precision when available; fall back to CPU/float32.
device = "cuda" if is_cuda_available() else "cpu"
dtype = torch.float16 if is_cuda_available() else torch.float32

# Load the alignment model once at import time and reuse it per request.
alignment_model, alignment_tokenizer = load_alignment_model(
    device,
    dtype=dtype,
)


def process_alignment(audio_waveform, text, language="hin"):
    """Force-align ``text`` to ``audio_waveform`` and return word timestamps.

    Parameters
    ----------
    audio_waveform : torch.Tensor
        Mono waveform already on the model's device/dtype
        (callers resample to 16 kHz before passing it in).
    text : str
        Transcript to align against the audio.
    language : str
        ISO 639-3 language code (default ``"hin"``, Hindi).

    Returns
    -------
    list[dict]
        Per-word entries as produced by ``postprocess_results`` (start/end
        timing spans and scores).
    """
    print(f"{audio_waveform.shape=}, {text=}, {language=}")

    # Acoustic emissions from the alignment model.
    emissions, stride = generate_emissions(
        alignment_model, audio_waveform, batch_size=16
    )

    # Romanized, star-tokenized text for the CTC aligner.
    tokens_starred, text_starred = preprocess_text(
        text,
        romanize=True,
        language=language,
    )

    # Frame-level alignment between emissions and tokens.
    segments, scores, blank_id = get_alignments(
        emissions,
        tokens_starred,
        alignment_tokenizer,
    )

    # Collapse frame alignments into word-level time spans.
    spans = get_spans(tokens_starred, segments, blank_id)
    word_timestamps = postprocess_results(text_starred, spans, stride, scores)

    return word_timestamps


def trim_audio(audio_array, sample_rate, word_timestamps):
    """Trim ``audio_array`` to the span covered by ``word_timestamps``.

    Parameters
    ----------
    audio_array : np.ndarray
        1-D sample array.
    sample_rate : int
        Samples per second of ``audio_array``.
    word_timestamps : list[dict]
        Word entries with ``"start"``/``"end"`` times in seconds.

    Returns
    -------
    tuple[int, np.ndarray]
        ``(sample_rate, trimmed_samples)``. When ``word_timestamps`` is
        empty the audio is returned unchanged (the previous code raised
        IndexError in that case).
    """
    if not word_timestamps:
        return (sample_rate, audio_array)

    start_time = int(word_timestamps[0]["start"] * sample_rate)
    end_time = int(word_timestamps[-1]["end"] * sample_rate)
    print(f"{start_time=}, {end_time=}")

    trimmed_audio = audio_array[start_time:end_time]
    return (sample_rate, trimmed_audio)


def get_language_choices():
    """Return dropdown labels like ``"hin - Hindi"`` for all ISO 639-3 codes."""
    return [f"{lang.pt3} - {lang.name}" for lang in iter_langs() if lang.pt3]


@spaces.GPU
def align_result_only(audio, text, language="hin - Hindi"):
    """Align ``text`` against ``audio`` and return the result as a JSON string.

    Parameters
    ----------
    audio : tuple[int, np.ndarray]
        ``(sample_rate, samples)`` as delivered by ``gr.Audio``.
    text : str
        Transcript to align.
    language : str
        Dropdown value of the form ``"<iso639-3> - <Name>"``.

    Returns
    -------
    str
        Pretty-printed JSON with the input text, word timestamps, and language.
    """
    # Extract the ISO 639-3 code from the selected language label.
    iso_code = language.split(" - ")[0]

    sample_rate, audio_array = audio
    # Normalize to float32 in [-1, 1]. The previous code hard-coded
    # "/ 32768.0", assuming int16 input; gr.Audio may also deliver int32
    # or float arrays, so scale by the dtype's actual signed range
    # (-min == 32768 for int16, matching the original behavior exactly)
    # and leave float input unscaled.
    if np.issubdtype(audio_array.dtype, np.signedinteger):
        scale = -float(np.iinfo(audio_array.dtype).min)
        audio_array = audio_array.astype(np.float32) / scale
    else:
        audio_array = audio_array.astype(np.float32)
    print(f"{sample_rate=}, {audio_array.shape=}")

    # Convert to mono if stereo.
    if len(audio_array.shape) > 1:
        audio_array = audio_array.mean(axis=1)

    # The alignment model expects 16 kHz input.
    audio_array = librosa.resample(audio_array, orig_sr=sample_rate, target_sr=16000)

    # Move onto the model's device/dtype.
    audio_waveform = torch.from_numpy(audio_array).to(device=device, dtype=dtype)

    word_timestamps = process_alignment(audio_waveform, text, iso_code)

    output_json = {
        "input_text": text,
        "word_timestamps": word_timestamps,
        "language": language,
    }
    return json.dumps(output_json, indent=2)


# Build the Gradio UI.
with gr.Blocks() as demo:
    gr.Markdown("Align")
    with gr.Row():
        with gr.Column():
            audio_input = gr.Audio(label="Input Audio")
            text_input = gr.Textbox(label="Input Text")
            language_input = gr.Dropdown(
                choices=get_language_choices(), label="Language", value="hin - Hindi"
            )
            submit_button_result_only = gr.Button(
                "Get Alignment", variant="secondary"
            )
        with gr.Column():
            json_output = gr.JSON(label="Alignment Results")

    submit_button_result_only.click(
        fn=align_result_only,
        inputs=[audio_input, text_input, language_input],
        outputs=[json_output],
    )


# Launch the demo.
if __name__ == "__main__":
    demo.launch()