Commit
·
2d84a88
1
Parent(s):
e1cb58c
add task categories
Browse files
src/synthetic_dataset_generator/_distiset.py
CHANGED
@@ -81,6 +81,15 @@ class CustomDistisetWithAdditionalTag(distilabel.distiset.Distiset):
|
|
81 |
dataset[0] if not isinstance(dataset, dict) else dataset["train"][0]
|
82 |
)
|
83 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
84 |
readme_metadata = {}
|
85 |
if repo_id and token:
|
86 |
readme_metadata = self._extract_readme_metadata(repo_id, token)
|
@@ -90,6 +99,7 @@ class CustomDistisetWithAdditionalTag(distilabel.distiset.Distiset):
|
|
90 |
"size_categories": size_categories_parser(
|
91 |
max(len(dataset) for dataset in self.values())
|
92 |
),
|
|
|
93 |
"tags": [
|
94 |
"synthetic",
|
95 |
"distilabel",
|
|
|
81 |
dataset[0] if not isinstance(dataset, dict) else dataset["train"][0]
|
82 |
)
|
83 |
|
84 |
+
keys = list(sample_records.keys())
|
85 |
+
if len(keys) != 2 or not (
|
86 |
+
("label" in keys and "text" in keys)
|
87 |
+
or ("labels" in keys and "text" in keys)
|
88 |
+
):
|
89 |
+
task_categories = ["text-classification"]
|
90 |
+
elif "prompt" in keys or "messages" in keys:
|
91 |
+
task_categories = ["text-generation", "text2text-generation"]
|
92 |
+
|
93 |
readme_metadata = {}
|
94 |
if repo_id and token:
|
95 |
readme_metadata = self._extract_readme_metadata(repo_id, token)
|
|
|
99 |
"size_categories": size_categories_parser(
|
100 |
max(len(dataset) for dataset in self.values())
|
101 |
),
|
102 |
+
"task_categories": task_categories,
|
103 |
"tags": [
|
104 |
"synthetic",
|
105 |
"distilabel",
|