from pathlib import Path from typing import Optional, Union import distilabel import distilabel.distiset from distilabel.utils.card.dataset_card import ( DistilabelDatasetCard, size_categories_parser, ) from huggingface_hub import DatasetCardData, HfApi, upload_file class CustomDistisetWithAdditionalTag(distilabel.distiset.Distiset): def _generate_card( self, repo_id: str, token: str, include_script: bool = False, filename_py: Optional[str] = None, ) -> None: """Generates a dataset card and pushes it to the Hugging Face Hub, and if the `pipeline.yaml` path is available in the `Distiset`, uploads that to the same repository. Args: repo_id: The ID of the repository to push to, from the `push_to_hub` method. token: The token to authenticate with the Hugging Face Hub, from the `push_to_hub` method. include_script: Whether to upload the script to the hugging face repository. filename_py: The name of the script. If `include_script` is True, the script will be uploaded to the repository using this name, otherwise it won't be used. """ card = self._get_card( repo_id=repo_id, token=token, include_script=include_script, filename_py=filename_py, ) card.push_to_hub( repo_id, repo_type="dataset", token=token, ) if self.pipeline_path: # If the pipeline.yaml is available, upload it to the Hugging Face Hub as well. HfApi().upload_file( path_or_fileobj=self.pipeline_path, path_in_repo=distilabel.distiset.PIPELINE_CONFIG_FILENAME, repo_id=repo_id, repo_type="dataset", token=token, ) def _get_card( self, repo_id: str, token: Optional[str] = None, include_script: bool = False, filename_py: Optional[str] = None, ) -> DistilabelDatasetCard: """Generates the dataset card for the `Distiset`. Note: If `repo_id` and `token` are provided, it will extract the metadata from the README.md file on the hub. Args: repo_id: Name of the repository to push to, or the path for the distiset if saved to disk. token: The token to authenticate with the Hugging Face Hub. We assume that if it's provided, the dataset will be in the Hugging Face Hub, so the README metadata will be extracted from there. include_script: Whether to upload the script to the hugging face repository. filename_py: The name of the script. If `include_script` is True, the script will be uploaded to the repository using this name, otherwise it won't be used. Returns: The dataset card for the `Distiset`. """ sample_records = {} for name, dataset in self.items(): sample_records[name] = ( dataset[0] if not isinstance(dataset, dict) else dataset["train"][0] ) readme_metadata = {} if repo_id and token: readme_metadata = self._extract_readme_metadata(repo_id, token) metadata = { **readme_metadata, "size_categories": size_categories_parser( max(len(dataset) for dataset in self.values()) ), "tags": [ "synthetic", "distilabel", "rlaif", "datacraft", ], } card = DistilabelDatasetCard.from_template( card_data=DatasetCardData(**metadata), repo_id=repo_id, sample_records=sample_records, include_script=include_script, filename_py=filename_py, references=self.citations, ) return card distilabel.distiset.Distiset = CustomDistisetWithAdditionalTag