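# Scraped Hugging Face file-viewer header (not part of the program):
# author: davanstrien (HF staff) | commit message: "working" | commit: 809b033
# viewer links: raw, history, blame | file size: 4.37 kB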
import gradio as gr
from huggingface_hub import (
list_datasets,
create_collection,
get_collection,
add_collection_item,
update_collection_item,
)
from tqdm.auto import tqdm
from toolz import unique
from collections import defaultdict
from huggingface_hub import login
import os
from dotenv import load_dotenv
# Pull variables from a local .env file (if any) into the process environment,
# then authenticate against the Hugging Face Hub with the HF_TOKEN value.
load_dotenv()
login(token=os.environ.get("HF_TOKEN"))
def extract_languages(dataset_info):
    """Return the language codes declared in a dataset's tags.

    Args:
        dataset_info: Object exposing a ``tags`` attribute that is either
            ``None`` or an iterable of tag strings such as ``"language:en"``
            (presumably a Hub ``DatasetInfo`` -- confirm against caller).

    Returns:
        A list with the value of every ``language:`` tag, in tag order.
        Empty when ``tags`` is ``None`` or contains no language tags.
    """
    # Tags are optional on Hub metadata; treat a missing list like an empty
    # one instead of failing with "NoneType is not iterable".
    tags = dataset_info.tags or []
    return [tag.split(":")[1] for tag in tags if tag.startswith("language:")]
def create_dataset_info():
    """Group DPO datasets from the Hub by declared language.

    Lists every dataset (with full metadata), drops ids containing
    "open-llm-leaderboard", and keeps datasets whose id contains "_dpo" or
    "dpo_" or whose tags include "dpo". Duplicates are removed by id, with
    name matches ordered before tag-only matches. Only datasets that have
    card data with a "language" entry are grouped.

    Returns:
        A ``defaultdict(list)`` mapping each language code (taken from the
        dataset's ``language:`` tags) to the list of matching datasets.
    """
    hub_datasets = [
        info
        for info in tqdm(list_datasets(full=True))
        if "open-llm-leaderboard" not in info.id
    ]

    matched_by_name = [
        info for info in hub_datasets if "_dpo" in info.id or "dpo_" in info.id
    ]
    matched_by_tag = [info for info in hub_datasets if "dpo" in info.tags]

    language_groups = defaultdict(list)
    # unique() keeps the first occurrence per id, so name matches win over
    # tag matches when a dataset qualifies both ways.
    for info in unique(matched_by_name + matched_by_tag, key=lambda d: d.id):
        card = info.card_data
        if card is None or card.get("language") is None:
            continue
        for language in extract_languages(info):
            language_groups[language].append(info)
    return language_groups
def create_update_collections(language_groups):
    """Ensure a Hub collection exists per language and contains its datasets.

    Args:
        language_groups: Mapping of language code to a list of dataset
            objects exposing an ``id`` attribute.

    Returns:
        A dict mapping each language code to the collection object returned
        by ``create_collection`` (i.e. its state before any items were added
        by this call).

    Raises:
        Whatever ``create_collection`` / ``add_collection_item`` raise for
        failures other than "already exists" (e.g. authentication errors).
    """
    collections = {}
    for language, dataset_list in language_groups.items():
        language_label = language.upper()
        # exists_ok=True asks the Hub client to return the already-existing
        # collection instead of raising. This replaces the previous fallback,
        # which looked the collection up by a title-like string that is not a
        # valid collection slug and hid every other error behind a broad
        # `except Exception`.
        collection = create_collection(
            title=f"DPO datasets for {language_label}",
            description=f"A collection of DPO datasets for the {language_label} language.",
            exists_ok=True,
        )

        # Skip datasets already present to avoid needless API calls.
        existing_items = {item.item_id for item in collection.items}
        for dataset in dataset_list:
            if dataset.id in existing_items:
                continue
            add_collection_item(
                collection.slug,
                item_id=dataset.id,
                item_type="dataset",
                # Tolerate an item added concurrently since the listing above.
                exists_ok=True,
            )
        collections[language] = collection
    return collections
def display_datasets(language):
    """Render the Markdown listing for one language.

    Reads the module-level ``datasets`` and ``collections`` mappings.

    Args:
        language: Language code selected in the UI.

    Returns:
        A Markdown string with a heading, the dataset count, a link to the
        language's collection and one bullet per dataset; or a fixed
        "no datasets" message when the language is not a key of ``datasets``.
    """
    # Membership test (not indexing) so an unknown key never gets inserted
    # into the defaultdict.
    if language not in datasets:
        return "No datasets found for the selected language."

    entries = datasets[language]
    slug = collections[language].slug
    parts = [
        f"## Datasets for {language.upper()}\n\n",
        f"Total datasets: {len(entries)}\n\n",
        f"[View Collection](https://huggingface.co/collections/{slug})\n\n",
    ]
    parts.extend(
        f"- [{entry.id}](https://huggingface.co/datasets/{entry.id})\n"
        for entry in entries
    )
    return "".join(parts)
def display_overview():
    """Render the Markdown overview shown next to the language selector.

    Reads the module-level ``datasets`` and ``collections`` mappings.

    Returns:
        A Markdown string with the number of distinct datasets, the number
        of languages, and one bullet per language with its dataset count and
        a link to its collection.
    """
    # A dataset tagged with several languages appears in several groups;
    # count ids once so the total is not inflated by summing group sizes.
    unique_ids = {
        dataset.id for group in datasets.values() for dataset in group
    }
    lines = [
        "## Dataset Overview\n\n",
        f"- Total number of datasets: {len(unique_ids)}\n",
        f"- Total number of languages covered: {len(datasets)}\n\n",
        "### Datasets per Language\n\n",
    ]
    for language, dataset_list in datasets.items():
        collection = collections[language]
        lines.append(
            f"- {language.upper()}: {len(dataset_list)} datasets ([View Collection](https://huggingface.co/collections/{collection.slug}))\n"
        )
    return "".join(lines)
# Fetch DPO datasets from the Hub, grouped by language code.
datasets = create_dataset_info()
# Make sure every language has a collection holding its datasets.
collections = create_update_collections(datasets)
# Language codes offered in the dropdown, in grouping order.
languages = list(datasets)
# Two-column layout: language picker with its dataset list on the left,
# the global overview on the right.
with gr.Blocks() as iface:
    gr.Markdown("# DPO Datasets by Language")
    gr.Markdown("Explore DPO datasets grouped by language.")
    with gr.Row():
        with gr.Column():
            language_dropdown = gr.Dropdown(
                choices=languages, label="Select Language"
            )
            dataset_info = gr.Markdown()
        with gr.Column():
            # The overview is rendered once, when the UI is built.
            overview = gr.Markdown(value=display_overview())
    # Re-render the dataset list whenever another language is selected.
    language_dropdown.change(
        fn=display_datasets,
        inputs=language_dropdown,
        outputs=dataset_info,
    )

iface.launch()