Spaces:

DIBT
/

preference_data_by_language

Runtime error

App Files Files Community

preference_data_by_language / app.py

davanstrien HF staff

working

809b033 7 months ago

raw

history blame

4.37 kB

	import gradio as gr
	from huggingface_hub import (
	list_datasets,
	create_collection,
	get_collection,
	add_collection_item,
	update_collection_item,
	)
	from tqdm.auto import tqdm
	from toolz import unique
	from collections import defaultdict
	from huggingface_hub import login
	import os
	from dotenv import load_dotenv

	# Load variables from a local .env file (when present) into the process
	# environment, then authenticate against the Hugging Face Hub with HF_TOKEN.
	load_dotenv()
	login(token=os.environ.get("HF_TOKEN"))


	def extract_languages(dataset_info):
	    """Return the language codes declared in a dataset's hub tags.

	    Only tags starting with ``language:`` are considered; for each one the
	    text between the first and second colon is returned (for example
	    ``"language:en"`` yields ``"en"``).

	    Args:
	        dataset_info: Object exposing a ``tags`` attribute that is either an
	            iterable of strings or ``None``.

	    Returns:
	        A list of language codes in tag order. Empty when ``tags`` is
	        ``None``, empty, or holds no language tags.
	    """
	    # ``tags`` may be None on hub metadata objects; treat that as "no tags"
	    # instead of raising TypeError while iterating.
	    tags = dataset_info.tags or ()
	    return [tag.split(":")[1] for tag in tags if tag.startswith("language:")]


	def create_dataset_info():
	    """Group DPO datasets from the Hugging Face Hub by language.

	    Lists every dataset on the hub, drops entries whose id contains
	    ``open-llm-leaderboard``, and keeps those recognised as DPO datasets
	    either by name (id contains ``_dpo`` or ``dpo_``) or by carrying a
	    ``dpo`` tag. Only datasets whose card metadata declares a ``language``
	    are grouped.

	    Returns:
	        A ``defaultdict(list)`` mapping each language code to the list of
	        dataset info objects tagged with that language.
	    """
	    all_datasets = [
	        d
	        for d in tqdm(list_datasets(full=True))
	        if "open-llm-leaderboard" not in d.id
	    ]

	    dpo_in_name = [d for d in all_datasets if "_dpo" in d.id or "dpo_" in d.id]
	    # ``tags`` may be None; fall back to an empty tuple so one dataset
	    # without tags cannot abort the whole listing with a TypeError.
	    dpo_in_tags = [d for d in all_datasets if "dpo" in (d.tags or ())]

	    # Name matches come first so de-duplication by id keeps the same order
	    # as before (name matches, then tag-only matches).
	    dpo_datasets = unique(dpo_in_name + dpo_in_tags, key=lambda d: d.id)

	    language_groups = defaultdict(list)
	    for dataset in dpo_datasets:
	        card_data = dataset.card_data
	        if card_data is None or card_data.get("language") is None:
	            continue
	        # A dataset without tags contributes no languages; skipping it here
	        # also avoids handing None tags to extract_languages.
	        if not dataset.tags:
	            continue
	        for language in extract_languages(dataset):
	            language_groups[language].append(dataset)

	    return language_groups


	def create_update_collections(language_groups):
	    """Ensure a hub collection exists per language and contains its datasets.

	    Args:
	        language_groups: Mapping of language code to a list of dataset info
	            objects (each exposing ``id``).

	    Returns:
	        A dict mapping each language code to its collection object. The
	        object is the one obtained before new items were added, so its
	        ``items`` may be stale; ``slug`` remains valid.
	    """
	    collections = {}
	    for language, dataset_list in language_groups.items():
	        language_label = language.upper()
	        # exists_ok=True makes the hub hand back the already-existing
	        # collection instead of raising. The previous fallback called
	        # get_collection() with a hand-built "DPO-datasets-for-XX" string,
	        # which is not a real slug (no namespace, no id suffix), and hid the
	        # original error behind a broad ``except Exception``.
	        collection = create_collection(
	            title=f"DPO datasets for {language_label}",
	            description=f"A collection of DPO datasets for the {language_label} language.",
	            exists_ok=True,
	        )

	        # Only add datasets that are not already part of the collection.
	        existing_items = {item.item_id for item in collection.items}
	        for dataset in dataset_list:
	            if dataset.id not in existing_items:
	                add_collection_item(
	                    collection.slug, item_id=dataset.id, item_type="dataset"
	                )

	        collections[language] = collection

	    return collections


	def display_datasets(language):
	    """Render a Markdown listing of the datasets for one language.

	    Reads the module-level ``datasets`` and ``collections`` mappings.

	    Args:
	        language: Language code selected in the UI.

	    Returns:
	        A Markdown string with a heading, the dataset count, a link to the
	        language's collection and one bullet per dataset, or a fixed
	        message when ``language`` is not a key of ``datasets``.
	    """
	    if language not in datasets:
	        return "No datasets found for the selected language."

	    entries = datasets[language]
	    collection = collections[language]

	    # Collect the pieces first and join once at the end.
	    parts = [
	        f"## Datasets for {language.upper()}\n\n",
	        f"Total datasets: {len(entries)}\n\n",
	        f"[View Collection](https://huggingface.co/collections/{collection.slug})\n\n",
	    ]
	    parts.extend(
	        f"- [{entry.id}](https://huggingface.co/datasets/{entry.id})\n"
	        for entry in entries
	    )
	    return "".join(parts)


	def display_overview():
	    """Render a Markdown overview of all languages and their dataset counts.

	    Reads the module-level ``datasets`` and ``collections`` mappings.

	    Returns:
	        A Markdown string with the summed per-language dataset count, the
	        number of languages, and one bullet per language linking to its
	        collection.
	    """
	    # Sum of the per-language list lengths; a dataset listed under several
	    # languages is counted once for each of them.
	    dataset_count = sum(map(len, datasets.values()))
	    language_count = len(datasets)

	    lines = [
	        "## Dataset Overview\n\n",
	        f"- Total number of datasets: {dataset_count}\n",
	        f"- Total number of languages covered: {language_count}\n\n",
	        "### Datasets per Language\n\n",
	    ]
	    for language, entries in datasets.items():
	        collection = collections[language]
	        lines.append(
	            f"- {language.upper()}: {len(entries)} datasets ([View Collection](https://huggingface.co/collections/{collection.slug}))\n"
	        )

	    return "".join(lines)


	# Build the language -> datasets mapping once at startup.
	datasets = create_dataset_info()

	# Make sure every language has a hub collection holding its datasets.
	collections = create_update_collections(datasets)

	# Choices offered by the language selector, in mapping order.
	languages = list(datasets)

	with gr.Blocks() as iface:
	    gr.Markdown("# DPO Datasets by Language")
	    gr.Markdown("Explore DPO datasets grouped by language.")

	    with gr.Row():
	        # Left column: language selector and the listing for the selection.
	        with gr.Column():
	            language_dropdown = gr.Dropdown(choices=languages, label="Select Language")
	            dataset_info = gr.Markdown()

	        # Right column: overview rendered once when the UI is built.
	        with gr.Column():
	            overview = gr.Markdown(value=display_overview())

	    # Re-render the listing whenever a different language is chosen.
	    language_dropdown.change(
	        fn=display_datasets,
	        inputs=language_dropdown,
	        outputs=dataset_info,
	    )

	iface.launch()