"""Gradio app that discovers DPO datasets on the Hugging Face Hub, groups
them by language, and maintains one Hub collection per language."""

import os
from collections import defaultdict

import gradio as gr
from dotenv import load_dotenv
from huggingface_hub import (
    add_collection_item,
    create_collection,
    get_collection,
    list_datasets,
    login,
    update_collection_item,
)
from toolz import unique
from tqdm.auto import tqdm

load_dotenv()
login(token=os.getenv("HF_TOKEN"))


def extract_languages(dataset_info):
    """Return the language codes found in a dataset's ``language:xx`` tags."""
    return [
        tag.split(":")[1]
        for tag in dataset_info.tags
        if tag.startswith("language:")
    ]


def create_dataset_info():
    """Fetch all Hub datasets, keep the DPO ones that declare a language,
    and group them by language code.

    Returns:
        defaultdict[str, list]: language code -> list of DatasetInfo objects.
    """
    all_datasets = list(tqdm(list_datasets(full=True)))
    # The open-llm-leaderboard org hosts eval-result dumps, not DPO data.
    all_datasets = [d for d in all_datasets if "open-llm-leaderboard" not in d.id]

    # A dataset counts as DPO if its id mentions dpo or it carries a "dpo" tag.
    dpo_in_name = [
        dataset
        for dataset in all_datasets
        if "_dpo" in dataset.id or "dpo_" in dataset.id
    ]
    dpo_in_tags = [dataset for dataset in all_datasets if "dpo" in dataset.tags]

    # De-duplicate datasets matched by both the name and the tag heuristic.
    dpo_datasets = list(unique(dpo_in_name + dpo_in_tags, key=lambda d: d.id))
    dpo_datasets = [d for d in dpo_datasets if d.card_data is not None]
    dpo_datasets_with_languages = [
        dataset
        for dataset in dpo_datasets
        if dataset.card_data.get("language") is not None
    ]

    language_groups = defaultdict(list)
    for dataset in dpo_datasets_with_languages:
        # A multilingual dataset is listed under every language it declares.
        for language in extract_languages(dataset):
            language_groups[language].append(dataset)
    return language_groups


def create_update_collections(language_groups):
    """Create (or reuse) one Hub collection per language and add any
    datasets that are not already members.

    Args:
        language_groups: mapping of language code -> list of DatasetInfo.

    Returns:
        dict[str, Collection]: language code -> Hub Collection object.
    """
    collections = {}
    for language, dataset_list in language_groups.items():
        try:
            # exists_ok=True returns the existing collection instead of
            # raising when it was already created on a previous run.
            collection = create_collection(
                title=f"DPO datasets for {language.upper()}",
                description=f"A collection of DPO datasets for the {language.upper()} language.",
                exists_ok=True,
            )
        except Exception:
            # Last-resort fallback: guess the slug from the title.
            # NOTE(review): real collection slugs include a namespace and a
            # random suffix, so this guess is unlikely to resolve — confirm
            # against the Hub before relying on it.
            collection = get_collection(f"DPO-datasets-for-{language.upper()}")

        # Only add datasets that are not already in the collection.
        existing_items = {item.item_id for item in collection.items}
        for dataset in dataset_list:
            if dataset.id not in existing_items:
                add_collection_item(
                    collection.slug, item_id=dataset.id, item_type="dataset"
                )
        collections[language] = collection
    return collections


def display_datasets(language):
    """Render a markdown listing of the datasets for *language*."""
    if language not in datasets:
        return "No datasets found for the selected language."

    dataset_list = datasets[language]
    collection = collections[language]

    output = f"## Datasets for {language.upper()}\n\n"
    output += f"Total datasets: {len(dataset_list)}\n\n"
    output += (
        f"[View Collection](https://huggingface.co/collections/{collection.slug})\n\n"
    )
    for dataset in dataset_list:
        output += f"- [{dataset.id}](https://huggingface.co/datasets/{dataset.id})\n"
    return output


def display_overview():
    """Render a markdown summary of all languages and their collections."""
    # Note: a dataset declaring several languages is counted once per language.
    total_datasets = sum(len(group) for group in datasets.values())
    total_languages = len(datasets)

    overview = "## Dataset Overview\n\n"
    overview += f"- Total number of datasets: {total_datasets}\n"
    overview += f"- Total number of languages covered: {total_languages}\n\n"
    overview += "### Datasets per Language\n\n"
    for language, dataset_list in datasets.items():
        collection = collections[language]
        overview += f"- {language.upper()}: {len(dataset_list)} datasets ([View Collection](https://huggingface.co/collections/{collection.slug}))\n"
    return overview


# Create the dataset information
datasets = create_dataset_info()

# Create/update collections for each language
collections = create_update_collections(datasets)

# Get the list of available languages
languages = list(datasets.keys())

with gr.Blocks() as iface:
    gr.Markdown("# DPO Datasets by Language")
    gr.Markdown("Explore DPO datasets grouped by language.")
    with gr.Row():
        with gr.Column():
            language_dropdown = gr.Dropdown(languages, label="Select Language")
            dataset_info = gr.Markdown()
        with gr.Column():
            overview = gr.Markdown(display_overview())
    language_dropdown.change(
        display_datasets, inputs=language_dropdown, outputs=dataset_info
    )

iface.launch()