davanstrien HF staff committed on
Commit
809b033
•
1 Parent(s): 0fa257c
Files changed (1) hide show
  1. app.py +70 -37
app.py CHANGED
@@ -1,84 +1,115 @@
1
  import gradio as gr
2
- from huggingface_hub import list_datasets, create_collection, get_collection, add_collection_item, update_collection_item
 
 
 
 
 
 
3
  from tqdm.auto import tqdm
4
  from toolz import unique
5
  from collections import defaultdict
6
  from huggingface_hub import login
7
  import os
 
8
 
9
- login(token=os.getenv('HF_TOKEN'))
 
10
 
11
 
12
  def extract_languages(dataset_info):
13
- return [tag.split(':')[1] for tag in dataset_info.tags if tag.startswith('language:')]
 
 
 
14
 
15
  def create_dataset_info():
16
  all_datasets = list(tqdm(list_datasets(full=True)))
17
- all_datasets = [d for d in all_datasets if not "open-llm-leaderboard" in d.id]
18
-
19
- dpo_in_name = [dataset for dataset in all_datasets if "_dpo" in dataset.id or "dpo_" in dataset.id]
20
- dpo_in_tags = [dataset for dataset in all_datasets if any(tag == "dpo" for tag in dataset.tags)]
21
-
 
 
 
 
 
 
22
  all_dpo_datasets = dpo_in_name + dpo_in_tags
23
  dpo_datasets = list(unique(all_dpo_datasets, key=lambda x: x.id))
24
  dpo_datasets = [d for d in dpo_datasets if d.card_data is not None]
25
- dpo_datasets_with_languages = [dpo_dataset for dpo_dataset in dpo_datasets if dpo_dataset.card_data.get('language') is not None]
26
-
 
 
 
 
27
  language_groups = defaultdict(list)
28
  for dataset in dpo_datasets_with_languages:
29
  languages = extract_languages(dataset)
30
  for language in languages:
31
  language_groups[language].append(dataset)
32
-
33
  return language_groups
34
 
 
35
  def create_update_collections(language_groups):
36
  collections = {}
37
  for language, dataset_list in language_groups.items():
38
  collection_title = f"DPO datasets for {language.upper()}"
39
  try:
40
- collection = create_collection(title=collection_title, description=f"A collection of DPO datasets for the {language.upper()} language.")
41
- except:
 
 
 
42
  collection = get_collection(f"DPO-datasets-for-{language.upper()}")
43
-
44
- existing_items = set(item.item_id for item in collection.items)
45
-
46
  for dataset in dataset_list:
47
  if dataset.id not in existing_items:
48
- add_collection_item(collection.slug, item_id=dataset.id, item_type="dataset")
49
-
 
 
50
  collections[language] = collection
51
-
52
  return collections
53
 
 
54
  def display_datasets(language):
55
- if language in datasets:
56
- dataset_list = datasets[language]
57
- collection = collections[language]
58
- output = f"## Datasets for {language.upper()}\n\n"
59
- output += f"Total datasets: {len(dataset_list)}\n\n"
60
- output += f"[View Collection](https://huggingface.co/collections/{collection.slug})\n\n"
61
- for dataset in dataset_list:
62
- output += f"- [{dataset.id}](https://huggingface.co/datasets/{dataset.id})\n"
63
- return output
64
- else:
65
  return "No datasets found for the selected language."
 
 
 
 
 
 
 
 
 
 
 
66
 
67
  def display_overview():
68
  total_datasets = sum(len(datasets) for datasets in datasets.values())
69
  total_languages = len(datasets)
70
-
71
  overview = "## Dataset Overview\n\n"
72
  overview += f"- Total number of datasets: {total_datasets}\n"
73
  overview += f"- Total number of languages covered: {total_languages}\n\n"
74
-
75
  overview += "### Datasets per Language\n\n"
76
  for language, dataset_list in datasets.items():
77
  collection = collections[language]
78
  overview += f"- {language.upper()}: {len(dataset_list)} datasets ([View Collection](https://huggingface.co/collections/{collection.slug}))\n"
79
-
80
  return overview
81
 
 
82
  # Create the dataset information
83
  datasets = create_dataset_info()
84
 
@@ -91,15 +122,17 @@ languages = list(datasets.keys())
91
  with gr.Blocks() as iface:
92
  gr.Markdown("# DPO Datasets by Language")
93
  gr.Markdown("Explore DPO datasets grouped by language.")
94
-
95
  with gr.Row():
96
  with gr.Column():
97
  language_dropdown = gr.Dropdown(languages, label="Select Language")
98
  dataset_info = gr.Markdown()
99
-
100
  with gr.Column():
101
  overview = gr.Markdown(display_overview())
102
-
103
- language_dropdown.change(display_datasets, inputs=language_dropdown, outputs=dataset_info)
104
 
105
- iface.launch()
 
 
 
 
 
1
  import gradio as gr
2
+ from huggingface_hub import (
3
+ list_datasets,
4
+ create_collection,
5
+ get_collection,
6
+ add_collection_item,
7
+ update_collection_item,
8
+ )
9
  from tqdm.auto import tqdm
10
  from toolz import unique
11
  from collections import defaultdict
12
  from huggingface_hub import login
13
  import os
14
+ from dotenv import load_dotenv
15
 
16
+ load_dotenv()
17
+ login(token=os.getenv("HF_TOKEN"))
18
 
19
 
20
  def extract_languages(dataset_info):
21
+ return [
22
+ tag.split(":")[1] for tag in dataset_info.tags if tag.startswith("language:")
23
+ ]
24
+
25
 
26
  def create_dataset_info():
27
  all_datasets = list(tqdm(list_datasets(full=True)))
28
+ all_datasets = [d for d in all_datasets if "open-llm-leaderboard" not in d.id]
29
+
30
+ dpo_in_name = [
31
+ dataset
32
+ for dataset in all_datasets
33
+ if "_dpo" in dataset.id or "dpo_" in dataset.id
34
+ ]
35
+ dpo_in_tags = [
36
+ dataset for dataset in all_datasets if any(tag == "dpo" for tag in dataset.tags)
37
+ ]
38
+
39
  all_dpo_datasets = dpo_in_name + dpo_in_tags
40
  dpo_datasets = list(unique(all_dpo_datasets, key=lambda x: x.id))
41
  dpo_datasets = [d for d in dpo_datasets if d.card_data is not None]
42
+ dpo_datasets_with_languages = [
43
+ dpo_dataset
44
+ for dpo_dataset in dpo_datasets
45
+ if dpo_dataset.card_data.get("language") is not None
46
+ ]
47
+
48
  language_groups = defaultdict(list)
49
  for dataset in dpo_datasets_with_languages:
50
  languages = extract_languages(dataset)
51
  for language in languages:
52
  language_groups[language].append(dataset)
53
+
54
  return language_groups
55
 
56
+
57
  def create_update_collections(language_groups):
58
  collections = {}
59
  for language, dataset_list in language_groups.items():
60
  collection_title = f"DPO datasets for {language.upper()}"
61
  try:
62
+ collection = create_collection(
63
+ title=collection_title,
64
+ description=f"A collection of DPO datasets for the {language.upper()} language.",
65
+ )
66
+ except Exception:
67
  collection = get_collection(f"DPO-datasets-for-{language.upper()}")
68
+
69
+ existing_items = {item.item_id for item in collection.items}
70
+
71
  for dataset in dataset_list:
72
  if dataset.id not in existing_items:
73
+ add_collection_item(
74
+ collection.slug, item_id=dataset.id, item_type="dataset"
75
+ )
76
+
77
  collections[language] = collection
78
+
79
  return collections
80
 
81
+
82
  def display_datasets(language):
83
+ if language not in datasets:
 
 
 
 
 
 
 
 
 
84
  return "No datasets found for the selected language."
85
+ dataset_list = datasets[language]
86
+ collection = collections[language]
87
+ output = f"## Datasets for {language.upper()}\n\n"
88
+ output += f"Total datasets: {len(dataset_list)}\n\n"
89
+ output += (
90
+ f"[View Collection](https://huggingface.co/collections/{collection.slug})\n\n"
91
+ )
92
+ for dataset in dataset_list:
93
+ output += f"- [{dataset.id}](https://huggingface.co/datasets/{dataset.id})\n"
94
+ return output
95
+
96
 
97
  def display_overview():
98
  total_datasets = sum(len(datasets) for datasets in datasets.values())
99
  total_languages = len(datasets)
100
+
101
  overview = "## Dataset Overview\n\n"
102
  overview += f"- Total number of datasets: {total_datasets}\n"
103
  overview += f"- Total number of languages covered: {total_languages}\n\n"
104
+
105
  overview += "### Datasets per Language\n\n"
106
  for language, dataset_list in datasets.items():
107
  collection = collections[language]
108
  overview += f"- {language.upper()}: {len(dataset_list)} datasets ([View Collection](https://huggingface.co/collections/{collection.slug}))\n"
109
+
110
  return overview
111
 
112
+
113
  # Create the dataset information
114
  datasets = create_dataset_info()
115
 
 
122
  with gr.Blocks() as iface:
123
  gr.Markdown("# DPO Datasets by Language")
124
  gr.Markdown("Explore DPO datasets grouped by language.")
125
+
126
  with gr.Row():
127
  with gr.Column():
128
  language_dropdown = gr.Dropdown(languages, label="Select Language")
129
  dataset_info = gr.Markdown()
130
+
131
  with gr.Column():
132
  overview = gr.Markdown(display_overview())
 
 
133
 
134
+ language_dropdown.change(
135
+ display_datasets, inputs=language_dropdown, outputs=dataset_info
136
+ )
137
+
138
+ iface.launch()