Spaces:
Runtime error
Runtime error
Commit
β’
809b033
1
Parent(s):
0fa257c
working
Browse files
app.py
CHANGED
@@ -1,84 +1,115 @@
|
|
1 |
import gradio as gr
|
2 |
-
from huggingface_hub import
|
|
|
|
|
|
|
|
|
|
|
|
|
3 |
from tqdm.auto import tqdm
|
4 |
from toolz import unique
|
5 |
from collections import defaultdict
|
6 |
from huggingface_hub import login
|
7 |
import os
|
|
|
8 |
|
9 |
-
|
|
|
10 |
|
11 |
|
12 |
def extract_languages(dataset_info):
|
13 |
-
return [
|
|
|
|
|
|
|
14 |
|
15 |
def create_dataset_info():
|
16 |
all_datasets = list(tqdm(list_datasets(full=True)))
|
17 |
-
all_datasets = [d for d in all_datasets if
|
18 |
-
|
19 |
-
dpo_in_name = [
|
20 |
-
|
21 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
22 |
all_dpo_datasets = dpo_in_name + dpo_in_tags
|
23 |
dpo_datasets = list(unique(all_dpo_datasets, key=lambda x: x.id))
|
24 |
dpo_datasets = [d for d in dpo_datasets if d.card_data is not None]
|
25 |
-
dpo_datasets_with_languages = [
|
26 |
-
|
|
|
|
|
|
|
|
|
27 |
language_groups = defaultdict(list)
|
28 |
for dataset in dpo_datasets_with_languages:
|
29 |
languages = extract_languages(dataset)
|
30 |
for language in languages:
|
31 |
language_groups[language].append(dataset)
|
32 |
-
|
33 |
return language_groups
|
34 |
|
|
|
35 |
def create_update_collections(language_groups):
|
36 |
collections = {}
|
37 |
for language, dataset_list in language_groups.items():
|
38 |
collection_title = f"DPO datasets for {language.upper()}"
|
39 |
try:
|
40 |
-
collection = create_collection(
|
41 |
-
|
|
|
|
|
|
|
42 |
collection = get_collection(f"DPO-datasets-for-{language.upper()}")
|
43 |
-
|
44 |
-
existing_items =
|
45 |
-
|
46 |
for dataset in dataset_list:
|
47 |
if dataset.id not in existing_items:
|
48 |
-
add_collection_item(
|
49 |
-
|
|
|
|
|
50 |
collections[language] = collection
|
51 |
-
|
52 |
return collections
|
53 |
|
|
|
54 |
def display_datasets(language):
|
55 |
-
if language in datasets:
|
56 |
-
dataset_list = datasets[language]
|
57 |
-
collection = collections[language]
|
58 |
-
output = f"## Datasets for {language.upper()}\n\n"
|
59 |
-
output += f"Total datasets: {len(dataset_list)}\n\n"
|
60 |
-
output += f"[View Collection](https://huggingface.co/collections/{collection.slug})\n\n"
|
61 |
-
for dataset in dataset_list:
|
62 |
-
output += f"- [{dataset.id}](https://huggingface.co/datasets/{dataset.id})\n"
|
63 |
-
return output
|
64 |
-
else:
|
65 |
return "No datasets found for the selected language."
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
66 |
|
67 |
def display_overview():
|
68 |
total_datasets = sum(len(datasets) for datasets in datasets.values())
|
69 |
total_languages = len(datasets)
|
70 |
-
|
71 |
overview = "## Dataset Overview\n\n"
|
72 |
overview += f"- Total number of datasets: {total_datasets}\n"
|
73 |
overview += f"- Total number of languages covered: {total_languages}\n\n"
|
74 |
-
|
75 |
overview += "### Datasets per Language\n\n"
|
76 |
for language, dataset_list in datasets.items():
|
77 |
collection = collections[language]
|
78 |
overview += f"- {language.upper()}: {len(dataset_list)} datasets ([View Collection](https://huggingface.co/collections/{collection.slug}))\n"
|
79 |
-
|
80 |
return overview
|
81 |
|
|
|
82 |
# Create the dataset information
|
83 |
datasets = create_dataset_info()
|
84 |
|
@@ -91,15 +122,17 @@ languages = list(datasets.keys())
|
|
91 |
with gr.Blocks() as iface:
|
92 |
gr.Markdown("# DPO Datasets by Language")
|
93 |
gr.Markdown("Explore DPO datasets grouped by language.")
|
94 |
-
|
95 |
with gr.Row():
|
96 |
with gr.Column():
|
97 |
language_dropdown = gr.Dropdown(languages, label="Select Language")
|
98 |
dataset_info = gr.Markdown()
|
99 |
-
|
100 |
with gr.Column():
|
101 |
overview = gr.Markdown(display_overview())
|
102 |
-
|
103 |
-
language_dropdown.change(display_datasets, inputs=language_dropdown, outputs=dataset_info)
|
104 |
|
105 |
-
|
|
|
|
|
|
|
|
|
|
1 |
import gradio as gr
|
2 |
+
from huggingface_hub import (
|
3 |
+
list_datasets,
|
4 |
+
create_collection,
|
5 |
+
get_collection,
|
6 |
+
add_collection_item,
|
7 |
+
update_collection_item,
|
8 |
+
)
|
9 |
from tqdm.auto import tqdm
|
10 |
from toolz import unique
|
11 |
from collections import defaultdict
|
12 |
from huggingface_hub import login
|
13 |
import os
|
14 |
+
from dotenv import load_dotenv
|
# Load environment variables from a local .env file (no-op if absent) and
# authenticate with the Hugging Face Hub; a valid token is required for the
# collection create/update calls below.
# NOTE(review): if HF_TOKEN is unset, os.getenv returns None — confirm
# login(token=None) falls back to cached credentials as intended.
load_dotenv()
login(token=os.getenv("HF_TOKEN"))
def extract_languages(dataset_info):
    """Return the language codes declared in *dataset_info*'s tags.

    Hub datasets declare languages as ``language:<code>`` tags; this pulls
    out the ``<code>`` part of each such tag, preserving tag order.
    """
    codes = []
    for tag in dataset_info.tags:
        if tag.startswith("language:"):
            codes.append(tag.split(":")[1])
    return codes
def create_dataset_info():
    """Scan the Hub for DPO datasets and group them by declared language.

    Returns a ``defaultdict(list)`` mapping each language code to the
    dataset-info objects that declare it. Datasets qualify if "dpo" appears
    in their repo name (as ``_dpo``/``dpo_``) or as a bare ``dpo`` tag; they
    must also carry card metadata with a ``language`` field.
    """
    hub_datasets = list(tqdm(list_datasets(full=True)))
    # Exclude open-llm-leaderboard result repositories.
    hub_datasets = [ds for ds in hub_datasets if "open-llm-leaderboard" not in ds.id]

    named_dpo = [
        ds for ds in hub_datasets if "_dpo" in ds.id or "dpo_" in ds.id
    ]
    tagged_dpo = [
        ds for ds in hub_datasets if any(tag == "dpo" for tag in ds.tags)
    ]

    # Merge both candidate lists (name matches first), de-duplicate by repo
    # id, and keep only datasets with usable card metadata.
    candidates = list(unique(named_dpo + tagged_dpo, key=lambda ds: ds.id))
    candidates = [ds for ds in candidates if ds.card_data is not None]
    with_language = [
        ds for ds in candidates if ds.card_data.get("language") is not None
    ]

    grouped = defaultdict(list)
    for ds in with_language:
        for code in extract_languages(ds):
            grouped[code].append(ds)

    return grouped
def create_update_collections(language_groups):
    """Ensure one Hub collection exists per language and add its datasets.

    Args:
        language_groups: mapping of language code -> list of dataset-info
            objects (as produced by ``create_dataset_info``).

    Returns:
        dict mapping language code -> collection object.
    """
    collections = {}
    for language, dataset_list in language_groups.items():
        collection_title = f"DPO datasets for {language.upper()}"
        try:
            collection = create_collection(
                title=collection_title,
                description=f"A collection of DPO datasets for the {language.upper()} language.",
            )
        except Exception:
            # NOTE(review): broad except assumes any failure means the
            # collection already exists — TODO: narrow to the Hub's
            # "already exists" error. Also, get_collection normally takes a
            # full "<namespace>/<title-slug>-<id>" slug; confirm this bare
            # title actually resolves.
            collection = get_collection(f"DPO-datasets-for-{language.upper()}")

        # Skip datasets already present so reruns don't duplicate items.
        existing_items = {item.item_id for item in collection.items}

        for dataset in dataset_list:
            if dataset.id not in existing_items:
                add_collection_item(
                    collection.slug, item_id=dataset.id, item_type="dataset"
                )

        collections[language] = collection

    return collections
def display_datasets(language):
    """Render a markdown summary of the DPO datasets for *language*.

    Reads the module-level ``datasets`` and ``collections`` mappings;
    returns a fallback message for unknown languages.
    """
    if language not in datasets:
        return "No datasets found for the selected language."

    dataset_list = datasets[language]
    collection = collections[language]

    # Assemble the markdown in parts and join once at the end.
    parts = [
        f"## Datasets for {language.upper()}\n\n",
        f"Total datasets: {len(dataset_list)}\n\n",
        f"[View Collection](https://huggingface.co/collections/{collection.slug})\n\n",
    ]
    for ds in dataset_list:
        parts.append(f"- [{ds.id}](https://huggingface.co/datasets/{ds.id})\n")
    return "".join(parts)
def display_overview():
    """Render a markdown overview of all grouped DPO datasets.

    Reads the module-level ``datasets`` (language -> list of dataset infos)
    and ``collections`` (language -> collection) mappings.
    """
    # Fix: the original generator expression reused the name ``datasets``
    # for its loop variable, shadowing the module-level mapping it iterates.
    # Same result, but a distinct name removes the scoping footgun.
    total_datasets = sum(len(dataset_list) for dataset_list in datasets.values())
    total_languages = len(datasets)

    overview = "## Dataset Overview\n\n"
    overview += f"- Total number of datasets: {total_datasets}\n"
    overview += f"- Total number of languages covered: {total_languages}\n\n"

    overview += "### Datasets per Language\n\n"
    for language, dataset_list in datasets.items():
        collection = collections[language]
        overview += f"- {language.upper()}: {len(dataset_list)} datasets ([View Collection](https://huggingface.co/collections/{collection.slug}))\n"

    return overview
# Create the dataset information: one-off full Hub scan at startup (slow,
# network-bound) that builds the language -> datasets mapping used by the UI.
datasets = create_dataset_info()
# Gradio UI: a dropdown on the left selects a language; the right column
# shows the precomputed overview. ``languages`` is defined earlier at module
# level (outside this visible hunk) from the keys of ``datasets``.
with gr.Blocks() as iface:
    gr.Markdown("# DPO Datasets by Language")
    gr.Markdown("Explore DPO datasets grouped by language.")

    with gr.Row():
        with gr.Column():
            language_dropdown = gr.Dropdown(languages, label="Select Language")
            dataset_info = gr.Markdown()

        with gr.Column():
            overview = gr.Markdown(display_overview())

    # Re-render the per-language dataset list whenever the selection changes.
    language_dropdown.change(
        display_datasets, inputs=language_dropdown, outputs=dataset_info
    )

iface.launch()