Spaces:
Runtime error
Runtime error
import gradio as gr | |
from huggingface_hub import ( | |
list_datasets, | |
create_collection, | |
get_collection, | |
add_collection_item, | |
update_collection_item, | |
) | |
from tqdm.auto import tqdm | |
from toolz import unique | |
from collections import defaultdict | |
from huggingface_hub import login | |
import os | |
from dotenv import load_dotenv | |
# Load variables from a local .env file (if present) into the environment.
load_dotenv()

# Authenticate only when a token is actually configured. Calling login() with
# token=None falls back to an interactive prompt, which cannot be answered
# when the app runs headless (e.g. as a hosted Space) and aborts start-up.
_hf_token = os.getenv("HF_TOKEN")
if _hf_token:
    login(token=_hf_token)
def extract_languages(dataset_info):
    """Return the language codes declared in a dataset's tags.

    Args:
        dataset_info: Object with a ``tags`` attribute holding an iterable of
            tag strings (for example ``"language:en"``), or ``None``.

    Returns:
        list[str]: For every tag starting with ``"language:"``, the field that
        follows the first colon, in tag order. Empty when there are no tags.
    """
    # NOTE(review): `tags` appears to be optional on Hub metadata objects, so
    # treat a missing/None value as "no tags" instead of raising TypeError.
    tags = getattr(dataset_info, "tags", None) or ()
    return [tag.split(":")[1] for tag in tags if tag.startswith("language:")]
def create_dataset_info():
    """Group DPO-related Hub datasets by the languages they declare.

    Lists every dataset (with full metadata), drops ids containing
    ``"open-llm-leaderboard"``, and keeps datasets whose id contains ``"_dpo"``
    or ``"dpo_"`` or whose tags include ``"dpo"``. Datasets without card data,
    or whose card data has no ``"language"`` entry, are skipped.

    Returns:
        defaultdict[str, list]: language code -> dataset objects. A dataset
        appears under every language returned by ``extract_languages``.
    """
    candidates = [
        info
        for info in tqdm(list_datasets(full=True))
        if "open-llm-leaderboard" not in info.id
    ]

    # Name matches come first so that, after de-duplication by id, they keep
    # their position ahead of datasets that only match through the tag.
    matched_by_name = [
        info for info in candidates if "_dpo" in info.id or "dpo_" in info.id
    ]
    matched_by_tag = [info for info in candidates if "dpo" in info.tags]

    by_language = defaultdict(list)
    for info in unique(matched_by_name + matched_by_tag, key=lambda d: d.id):
        card = info.card_data
        if card is None or card.get("language") is None:
            continue
        for code in extract_languages(info):
            by_language[code].append(info)
    return by_language
def create_update_collections(language_groups):
    """Ensure a Hub collection exists per language and contains its datasets.

    Args:
        language_groups: Mapping of language code -> list of dataset objects;
            each dataset's ``id`` is used as the collection item id.

    Returns:
        dict: language code -> collection object as returned by
        ``create_collection`` (i.e. fetched before this call added any items).
    """
    collections_by_language = {}
    for language, dataset_list in language_groups.items():
        label = language.upper()
        # exists_ok=True returns the already-existing collection instead of
        # raising. The earlier fallback looked the collection up by a guessed
        # slug with no owner namespace or id suffix, which cannot resolve, and
        # its broad `except Exception` also masked unrelated failures.
        collection = create_collection(
            title=f"DPO datasets for {label}",
            description=f"A collection of DPO datasets for the {label} language.",
            exists_ok=True,
        )

        existing_items = {item.item_id for item in collection.items}
        for dataset in dataset_list:
            if dataset.id in existing_items:
                continue
            add_collection_item(
                collection.slug,
                item_id=dataset.id,
                item_type="dataset",
                # Tolerate an item added concurrently since the snapshot above.
                exists_ok=True,
            )
            existing_items.add(dataset.id)

        collections_by_language[language] = collection
    return collections_by_language
def display_datasets(language):
    """Build the Markdown listing for one language.

    Reads the module-level ``datasets`` and ``collections`` mappings.

    Args:
        language: Language key selected in the UI.

    Returns:
        str: A heading, the dataset count, a link to the language's collection
        and one bullet link per dataset; or a short notice when the language
        has no entry in ``datasets``.
    """
    if language not in datasets:
        return "No datasets found for the selected language."

    entries = datasets[language]
    slug = collections[language].slug

    parts = [
        f"## Datasets for {language.upper()}\n\n",
        f"Total datasets: {len(entries)}\n\n",
        f"[View Collection](https://huggingface.co/collections/{slug})\n\n",
    ]
    parts.extend(
        f"- [{entry.id}](https://huggingface.co/datasets/{entry.id})\n"
        for entry in entries
    )
    return "".join(parts)
def display_overview():
    """Build the Markdown summary shown next to the language selector.

    Reads the module-level ``datasets`` and ``collections`` mappings.

    Returns:
        str: Totals (sum of all per-language group sizes, and number of
        languages) followed by one bullet per language with its dataset count
        and a link to its collection.
    """
    dataset_count = sum(len(group) for group in datasets.values())
    language_count = len(datasets)

    lines = [
        "## Dataset Overview\n\n",
        f"- Total number of datasets: {dataset_count}\n",
        f"- Total number of languages covered: {language_count}\n\n",
        "### Datasets per Language\n\n",
    ]
    for language, group in datasets.items():
        slug = collections[language].slug
        lines.append(
            f"- {language.upper()}: {len(group)} datasets "
            f"([View Collection](https://huggingface.co/collections/{slug}))\n"
        )
    return "".join(lines)
# Build the language -> datasets mapping from the Hub listing.
datasets = create_dataset_info()
# Make sure every language has a collection containing its datasets.
collections = create_update_collections(datasets)
# Dropdown choices: one entry per language key.
languages = list(datasets)

# NOTE(review): the nesting below is reconstructed from statement order;
# confirm it matches the intended layout.
with gr.Blocks() as iface:
    gr.Markdown("# DPO Datasets by Language")
    gr.Markdown("Explore DPO datasets grouped by language.")
    with gr.Row():
        # First column: language selector and the per-language listing.
        with gr.Column():
            language_dropdown = gr.Dropdown(
                choices=languages, label="Select Language"
            )
            dataset_info = gr.Markdown()
        # Second column: overview rendered once when the UI is built.
        with gr.Column():
            overview = gr.Markdown(value=display_overview())
    # Re-render the listing whenever the selected language changes.
    language_dropdown.change(
        fn=display_datasets,
        inputs=language_dropdown,
        outputs=dataset_info,
    )

iface.launch()