Spaces:
Runtime error
Runtime error
Commit
β’
809b033
1
Parent(s):
0fa257c
working
Browse files
app.py
CHANGED
@@ -1,84 +1,115 @@
|
|
1 |
import gradio as gr
|
2 |
-
from huggingface_hub import
|
|
|
|
|
|
|
|
|
|
|
|
|
3 |
from tqdm.auto import tqdm
|
4 |
from toolz import unique
|
5 |
from collections import defaultdict
|
6 |
from huggingface_hub import login
|
7 |
import os
|
|
|
8 |
|
9 |
-
|
|
|
10 |
|
11 |
|
12 |
def extract_languages(dataset_info):
|
13 |
-
return [
|
|
|
|
|
|
|
14 |
|
15 |
def create_dataset_info():
|
16 |
all_datasets = list(tqdm(list_datasets(full=True)))
|
17 |
-
all_datasets = [d for d in all_datasets if
|
18 |
-
|
19 |
-
dpo_in_name = [
|
20 |
-
|
21 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
22 |
all_dpo_datasets = dpo_in_name + dpo_in_tags
|
23 |
dpo_datasets = list(unique(all_dpo_datasets, key=lambda x: x.id))
|
24 |
dpo_datasets = [d for d in dpo_datasets if d.card_data is not None]
|
25 |
-
dpo_datasets_with_languages = [
|
26 |
-
|
|
|
|
|
|
|
|
|
27 |
language_groups = defaultdict(list)
|
28 |
for dataset in dpo_datasets_with_languages:
|
29 |
languages = extract_languages(dataset)
|
30 |
for language in languages:
|
31 |
language_groups[language].append(dataset)
|
32 |
-
|
33 |
return language_groups
|
34 |
|
|
|
35 |
def create_update_collections(language_groups):
|
36 |
collections = {}
|
37 |
for language, dataset_list in language_groups.items():
|
38 |
collection_title = f"DPO datasets for {language.upper()}"
|
39 |
try:
|
40 |
-
collection = create_collection(
|
41 |
-
|
|
|
|
|
|
|
42 |
collection = get_collection(f"DPO-datasets-for-{language.upper()}")
|
43 |
-
|
44 |
-
existing_items =
|
45 |
-
|
46 |
for dataset in dataset_list:
|
47 |
if dataset.id not in existing_items:
|
48 |
-
add_collection_item(
|
49 |
-
|
|
|
|
|
50 |
collections[language] = collection
|
51 |
-
|
52 |
return collections
|
53 |
|
|
|
54 |
def display_datasets(language):
|
55 |
-
if language in datasets:
|
56 |
-
dataset_list = datasets[language]
|
57 |
-
collection = collections[language]
|
58 |
-
output = f"## Datasets for {language.upper()}\n\n"
|
59 |
-
output += f"Total datasets: {len(dataset_list)}\n\n"
|
60 |
-
output += f"[View Collection](https://huggingface.co/collections/{collection.slug})\n\n"
|
61 |
-
for dataset in dataset_list:
|
62 |
-
output += f"- [{dataset.id}](https://huggingface.co/datasets/{dataset.id})\n"
|
63 |
-
return output
|
64 |
-
else:
|
65 |
return "No datasets found for the selected language."
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
66 |
|
67 |
def display_overview():
|
68 |
total_datasets = sum(len(datasets) for datasets in datasets.values())
|
69 |
total_languages = len(datasets)
|
70 |
-
|
71 |
overview = "## Dataset Overview\n\n"
|
72 |
overview += f"- Total number of datasets: {total_datasets}\n"
|
73 |
overview += f"- Total number of languages covered: {total_languages}\n\n"
|
74 |
-
|
75 |
overview += "### Datasets per Language\n\n"
|
76 |
for language, dataset_list in datasets.items():
|
77 |
collection = collections[language]
|
78 |
overview += f"- {language.upper()}: {len(dataset_list)} datasets ([View Collection](https://huggingface.co/collections/{collection.slug}))\n"
|
79 |
-
|
80 |
return overview
|
81 |
|
|
|
82 |
# Create the dataset information
|
83 |
datasets = create_dataset_info()
|
84 |
|
@@ -91,15 +122,17 @@ languages = list(datasets.keys())
|
|
91 |
with gr.Blocks() as iface:
|
92 |
gr.Markdown("# DPO Datasets by Language")
|
93 |
gr.Markdown("Explore DPO datasets grouped by language.")
|
94 |
-
|
95 |
with gr.Row():
|
96 |
with gr.Column():
|
97 |
language_dropdown = gr.Dropdown(languages, label="Select Language")
|
98 |
dataset_info = gr.Markdown()
|
99 |
-
|
100 |
with gr.Column():
|
101 |
overview = gr.Markdown(display_overview())
|
102 |
-
|
103 |
-
language_dropdown.change(display_datasets, inputs=language_dropdown, outputs=dataset_info)
|
104 |
|
105 |
-
|
|
|
|
|
|
|
|
|
|
1 |
import gradio as gr
|
2 |
+
from huggingface_hub import (
|
3 |
+
list_datasets,
|
4 |
+
create_collection,
|
5 |
+
get_collection,
|
6 |
+
add_collection_item,
|
7 |
+
update_collection_item,
|
8 |
+
)
|
9 |
from tqdm.auto import tqdm
|
10 |
from toolz import unique
|
11 |
from collections import defaultdict
|
12 |
from huggingface_hub import login
|
13 |
import os
|
14 |
+
from dotenv import load_dotenv
|
# Load environment variables from a local .env file (no-op if absent) and
# authenticate with the Hugging Face Hub; a valid token is required for the
# collection create/update calls below.
# NOTE(review): if HF_TOKEN is unset, os.getenv returns None — confirm
# login(token=None) falls back to cached credentials as intended.
load_dotenv()
login(token=os.getenv("HF_TOKEN"))
def extract_languages(dataset_info):
    """Return the language codes declared in *dataset_info*'s tags.

    Hub datasets declare languages as ``language:<code>`` tags; this pulls
    out the ``<code>`` part of each such tag, preserving tag order.
    """
    codes = []
    for tag in dataset_info.tags:
        if tag.startswith("language:"):
            codes.append(tag.split(":")[1])
    return codes
def create_dataset_info():
    """Scan the Hub for DPO datasets and group them by declared language.

    Returns a ``defaultdict(list)`` mapping each language code to the
    dataset-info objects that declare it. Datasets qualify if "dpo" appears
    in their repo name (as ``_dpo``/``dpo_``) or as a bare ``dpo`` tag; they
    must also carry card metadata with a ``language`` field.
    """
    hub_datasets = list(tqdm(list_datasets(full=True)))
    # Exclude open-llm-leaderboard result repositories.
    hub_datasets = [ds for ds in hub_datasets if "open-llm-leaderboard" not in ds.id]

    named_dpo = [
        ds for ds in hub_datasets if "_dpo" in ds.id or "dpo_" in ds.id
    ]
    tagged_dpo = [
        ds for ds in hub_datasets if any(tag == "dpo" for tag in ds.tags)
    ]

    # Merge both candidate lists (name matches first), de-duplicate by repo
    # id, and keep only datasets with usable card metadata.
    candidates = list(unique(named_dpo + tagged_dpo, key=lambda ds: ds.id))
    candidates = [ds for ds in candidates if ds.card_data is not None]
    with_language = [
        ds for ds in candidates if ds.card_data.get("language") is not None
    ]

    grouped = defaultdict(list)
    for ds in with_language:
        for code in extract_languages(ds):
            grouped[code].append(ds)

    return grouped
def create_update_collections(language_groups):
    """Ensure one Hub collection exists per language and add its datasets.

    Args:
        language_groups: mapping of language code -> list of dataset-info
            objects (as produced by ``create_dataset_info``).

    Returns:
        dict mapping language code -> collection object.
    """
    collections = {}
    for language, dataset_list in language_groups.items():
        collection_title = f"DPO datasets for {language.upper()}"
        try:
            collection = create_collection(
                title=collection_title,
                description=f"A collection of DPO datasets for the {language.upper()} language.",
            )
        except Exception:
            # NOTE(review): broad except assumes any failure means the
            # collection already exists — TODO: narrow to the Hub's
            # "already exists" error. Also, get_collection normally takes a
            # full "<namespace>/<title-slug>-<id>" slug; confirm this bare
            # title actually resolves.
            collection = get_collection(f"DPO-datasets-for-{language.upper()}")

        # Skip datasets already present so reruns don't duplicate items.
        existing_items = {item.item_id for item in collection.items}

        for dataset in dataset_list:
            if dataset.id not in existing_items:
                add_collection_item(
                    collection.slug, item_id=dataset.id, item_type="dataset"
                )

        collections[language] = collection

    return collections
def display_datasets(language):
    """Render a markdown summary of the DPO datasets for *language*.

    Reads the module-level ``datasets`` and ``collections`` mappings;
    returns a fallback message for unknown languages.
    """
    if language not in datasets:
        return "No datasets found for the selected language."

    dataset_list = datasets[language]
    collection = collections[language]

    # Assemble the markdown in parts and join once at the end.
    parts = [
        f"## Datasets for {language.upper()}\n\n",
        f"Total datasets: {len(dataset_list)}\n\n",
        f"[View Collection](https://huggingface.co/collections/{collection.slug})\n\n",
    ]
    for ds in dataset_list:
        parts.append(f"- [{ds.id}](https://huggingface.co/datasets/{ds.id})\n")
    return "".join(parts)
def display_overview():
    """Render a markdown overview of all grouped DPO datasets.

    Reads the module-level ``datasets`` (language -> list of dataset infos)
    and ``collections`` (language -> collection) mappings.
    """
    # Fix: the original generator expression reused the name ``datasets``
    # for its loop variable, shadowing the module-level mapping it iterates.
    # Same result, but a distinct name removes the scoping footgun.
    total_datasets = sum(len(dataset_list) for dataset_list in datasets.values())
    total_languages = len(datasets)

    overview = "## Dataset Overview\n\n"
    overview += f"- Total number of datasets: {total_datasets}\n"
    overview += f"- Total number of languages covered: {total_languages}\n\n"

    overview += "### Datasets per Language\n\n"
    for language, dataset_list in datasets.items():
        collection = collections[language]
        overview += f"- {language.upper()}: {len(dataset_list)} datasets ([View Collection](https://huggingface.co/collections/{collection.slug}))\n"

    return overview
# Create the dataset information: one-off full Hub scan at startup (slow,
# network-bound) that builds the language -> datasets mapping used by the UI.
datasets = create_dataset_info()
# Gradio UI: a dropdown on the left selects a language; the right column
# shows the precomputed overview. ``languages`` is defined earlier at module
# level (outside this visible hunk) from the keys of ``datasets``.
with gr.Blocks() as iface:
    gr.Markdown("# DPO Datasets by Language")
    gr.Markdown("Explore DPO datasets grouped by language.")

    with gr.Row():
        with gr.Column():
            language_dropdown = gr.Dropdown(languages, label="Select Language")
            dataset_info = gr.Markdown()

        with gr.Column():
            overview = gr.Markdown(display_overview())

    # Re-render the per-language dataset list whenever the selection changes.
    language_dropdown.change(
        display_datasets, inputs=language_dropdown, outputs=dataset_info
    )

iface.launch()