"""Gradio app that discovers DPO datasets on the Hugging Face Hub, groups
them by language, and maintains one Hub collection per language."""

import os
from collections import defaultdict

import gradio as gr
from dotenv import load_dotenv
from huggingface_hub import (
    add_collection_item,
    create_collection,
    get_collection,
    list_datasets,
    login,
    update_collection_item,
)
from toolz import unique
from tqdm.auto import tqdm

load_dotenv()
login(token=os.getenv("HF_TOKEN"))


def extract_languages(dataset_info):
    """Return the language codes found in a dataset's ``language:xx`` tags."""
    return [
        tag.split(":")[1]
        for tag in dataset_info.tags
        if tag.startswith("language:")
    ]


def create_dataset_info():
    """Fetch all Hub datasets, keep the DPO ones that declare a language,
    and group them by language code.

    Returns:
        defaultdict[str, list]: language code -> list of DatasetInfo objects.
    """
    all_datasets = list(tqdm(list_datasets(full=True)))
    # The open-llm-leaderboard org hosts eval-result dumps, not DPO data.
    all_datasets = [d for d in all_datasets if "open-llm-leaderboard" not in d.id]

    # A dataset counts as DPO if its id mentions dpo or it carries a "dpo" tag.
    dpo_in_name = [
        dataset
        for dataset in all_datasets
        if "_dpo" in dataset.id or "dpo_" in dataset.id
    ]
    dpo_in_tags = [dataset for dataset in all_datasets if "dpo" in dataset.tags]

    # De-duplicate datasets matched by both the name and the tag heuristic.
    dpo_datasets = list(unique(dpo_in_name + dpo_in_tags, key=lambda d: d.id))
    dpo_datasets = [d for d in dpo_datasets if d.card_data is not None]
    dpo_datasets_with_languages = [
        dataset
        for dataset in dpo_datasets
        if dataset.card_data.get("language") is not None
    ]

    language_groups = defaultdict(list)
    for dataset in dpo_datasets_with_languages:
        # A multilingual dataset is listed under every language it declares.
        for language in extract_languages(dataset):
            language_groups[language].append(dataset)
    return language_groups


def create_update_collections(language_groups):
    """Create (or reuse) one Hub collection per language and add any
    datasets that are not already members.

    Args:
        language_groups: mapping of language code -> list of DatasetInfo.

    Returns:
        dict[str, Collection]: language code -> Hub Collection object.
    """
    collections = {}
    for language, dataset_list in language_groups.items():
        try:
            # exists_ok=True returns the existing collection instead of
            # raising when it was already created on a previous run.
            collection = create_collection(
                title=f"DPO datasets for {language.upper()}",
                description=f"A collection of DPO datasets for the {language.upper()} language.",
                exists_ok=True,
            )
        except Exception:
            # Last-resort fallback: guess the slug from the title.
            # NOTE(review): real collection slugs include a namespace and a
            # random suffix, so this guess is unlikely to resolve — confirm
            # against the Hub before relying on it.
            collection = get_collection(f"DPO-datasets-for-{language.upper()}")

        # Only add datasets that are not already in the collection.
        existing_items = {item.item_id for item in collection.items}
        for dataset in dataset_list:
            if dataset.id not in existing_items:
                add_collection_item(
                    collection.slug, item_id=dataset.id, item_type="dataset"
                )
        collections[language] = collection
    return collections


def display_datasets(language):
    """Render a markdown listing of the datasets for *language*."""
    if language not in datasets:
        return "No datasets found for the selected language."

    dataset_list = datasets[language]
    collection = collections[language]

    output = f"## Datasets for {language.upper()}\n\n"
    output += f"Total datasets: {len(dataset_list)}\n\n"
    output += (
        f"[View Collection](https://huggingface.co/collections/{collection.slug})\n\n"
    )
    for dataset in dataset_list:
        output += f"- [{dataset.id}](https://huggingface.co/datasets/{dataset.id})\n"
    return output


def display_overview():
    """Render a markdown summary of all languages and their collections."""
    # Note: a dataset declaring several languages is counted once per language.
    total_datasets = sum(len(group) for group in datasets.values())
    total_languages = len(datasets)

    overview = "## Dataset Overview\n\n"
    overview += f"- Total number of datasets: {total_datasets}\n"
    overview += f"- Total number of languages covered: {total_languages}\n\n"
    overview += "### Datasets per Language\n\n"
    for language, dataset_list in datasets.items():
        collection = collections[language]
        overview += f"- {language.upper()}: {len(dataset_list)} datasets ([View Collection](https://huggingface.co/collections/{collection.slug}))\n"
    return overview


# Create the dataset information
datasets = create_dataset_info()

# Create/update collections for each language
collections = create_update_collections(datasets)

# Get the list of available languages
languages = list(datasets.keys())

with gr.Blocks() as iface:
    gr.Markdown("# DPO Datasets by Language")
    gr.Markdown("Explore DPO datasets grouped by language.")
    with gr.Row():
        with gr.Column():
            language_dropdown = gr.Dropdown(languages, label="Select Language")
            dataset_info = gr.Markdown()
        with gr.Column():
            overview = gr.Markdown(display_overview())
    language_dropdown.change(
        display_datasets, inputs=language_dropdown, outputs=dataset_info
    )

iface.launch()