SummerTime / dataset /dataset_loaders.py
akhaliq3
spaces demo
546a9ba
from os import path
from tqdm import tqdm
from typing import List, Generator, Optional, Union
from datasets import Dataset
from dataset.st_dataset import SummInstance, SummDataset
# Set directory to load non_huggingface dataset scripts
FILE_DIRECTORY_PATH = path.dirname(path.realpath(__file__))
BASE_NONHUGGINGFACE_DATASETS_PATH = path.join(
FILE_DIRECTORY_PATH, "non_huggingface_datasets_builders"
)
# Huggingface Datasets
class CnndmDataset(SummDataset):
"""
The CNN/DM dataset
"""
dataset_name = "CNN/DailyMail"
is_query_based = False
is_dialogue_based = False
is_multi_document = False
huggingface_dataset = True
huggingface_page = "https://huggingface.co/datasets/cnn_dailymail"
def __init__(self):
super().__init__(
dataset_args=(
"cnn_dailymail",
"3.0.0",
)
)
def _process_data(self, data: Dataset) -> Generator[SummInstance, None, None]:
"""
Overrides the SummDataset '_process_data()' method
This method processes the data contained in the dataset
and puts each data instance into a SummInstance object
:param dataset: a train/validation/test dataset
:rtype: a generator yielding SummInstance objects
"""
for instance in tqdm(data):
article: str = instance["article"]
highlights: str = instance["highlights"]
summ_instance = SummInstance(source=article, summary=highlights)
yield summ_instance
class MultinewsDataset(SummDataset):
"""
The Multi News dataset
"""
dataset_name = "Multinews"
is_query_based = False
is_dialogue_based = False
is_multi_document = True
huggingface_dataset = True
huggingface_page = "https://huggingface.co/datasets/multi_news"
def __init__(self):
super().__init__(dataset_args=("multi_news",))
def _process_data(self, data: Dataset) -> Generator[SummInstance, None, None]:
"""
Overrides the SummDataset '_process_data()' method
This method processes the data contained in the dataset
and puts each data instance into a SummInstance object
:param dataset: a train/validation/test dataset
:rtype: a generator yielding SummInstance objects
"""
for instance in tqdm(data):
document: list = [
doc for doc in instance["document"].split("|||||") if doc
] # removes the empty string generated
# since each doc ends with the delimiting token '|||||'
# the final doc creates an empty string
summary: str = instance["summary"]
summ_instance = SummInstance(source=document, summary=summary)
yield summ_instance
class SamsumDataset(SummDataset):
"""
The SAMsum Dataset
"""
dataset_name = "Samsum"
is_query_based = False
is_dialogue_based = True
is_multi_document = False
huggingface_dataset = True
huggingface_page = "https://huggingface.co/datasets/samsum"
def __init__(self):
super().__init__(dataset_args=("samsum",))
def _process_data(self, data: Dataset) -> Generator[SummInstance, None, None]:
"""
Overrides the SummDataset '_process_data()' method
This method processes the data contained in the dataset
and puts each data instance into a SummInstance object
:param dataset: a train/validation/test dataset
:rtype: a generator yielding SummInstance objects
"""
for instance in tqdm(data):
dialogue: List = instance["dialogue"].split(
"\r\n"
) # split each dialogue into a list of strings such as
# ["speaker1 : utter..", "speaker2 : utter..."]
summary: str = instance["summary"]
summ_instance = SummInstance(source=dialogue, summary=summary)
yield summ_instance
class XsumDataset(SummDataset):
"""
The Xsum Dataset
"""
dataset_name = "Xsum"
huggingface_dataset = True
huggingface_page = "https://huggingface.co/datasets/xsum"
is_query_based = False
is_dialogue_based = False
is_multi_document = False
def __init__(self):
super().__init__(dataset_args=("xsum",))
def _process_data(self, data: Dataset) -> Generator[SummInstance, None, None]:
"""
Overrides the SummDataset '_process_data()' method
This method processes the data contained in the dataset
and puts each data instance into a SummInstance object
:param dataset: a train/validation/test dataset
:rtype: a generator yielding SummInstance objects
"""
for instance in tqdm(data):
document: List = instance["document"]
summary: str = instance["summary"]
summ_instance = SummInstance(source=document, summary=summary)
yield summ_instance
class PubmedqaDataset(SummDataset):
"""
The Pubmed QA dataset
"""
dataset_name = "Pubmedqa"
is_query_based = True
is_dialogue_based = False
is_multi_document = False
huggingface_dataset = True
huggingface_page = "https://huggingface.co/datasets/pubmed_qa"
def __init__(self, seed=None):
super().__init__(
dataset_args=(
"pubmed_qa",
"pqa_artificial",
)
)
def _process_data(self, data: Dataset) -> Generator[SummInstance, None, None]:
"""
Overrides the SummDataset '_process_data()' method
This method processes the data contained in the dataset
and puts each data instance into a SummInstance object
:param dataset: a train/validation/test dataset
:rtype: a generator yielding SummInstance objects
"""
for instance in tqdm(data):
context: str = " ".join(instance["context"]["contexts"])
answer: str = instance["long_answer"]
query: str = instance["question"]
summ_instance = SummInstance(source=context, summary=answer, query=query)
yield summ_instance
class MlsumDataset(SummDataset):
"""
The MLsum Dataset - A multi-lingual dataset featuring 5 languages
Includes 1.5 million news articles and their corresponding summaries
"de" - German
"es" - Spanish
"fr" - French
"ru" - Russian
"tu" - Turkish
"""
dataset_name = "MlSum"
is_query_based = False
is_dialogue_based = False
is_multi_document = False
huggingface_dataset = True
huggingface_page = "https://huggingface.co/datasets/mlsum"
supported_languages = ["de", "es", "fr", "ru", "tu"]
mlsum_instantiation_guide = """The languages supported for the Mlsum Dataset are:
de - German
es - Spanish
fr - French
ru - Russian
tu - Turkish
Examples to instantiate the dataset:
1. Dataset with only one language
dataset = MlsumDataset({language_token})
dataset = MlsumDataset("es")
dataset = MlsumDataset("tu")...
2. Dataset with a multiple languages
dataset = MlsumDataset({list of language_token})
dataset = MlsumDataset(["es","de"])
dataset = MlsumDataset(["es","de", "tu"])...
3. Dataset with all supported languages (default)
dataset = MlsumDataset(all)
dataset = MlsumDataset()
"""
def __init__(self, languages: Optional[Union[str, List[str]]] = "all"):
super().__init__(dataset_args=(languages,))
def _load_dataset_safe(self, languages: Optional[Union[str, List[str]]]):
"""
Overrides the parent class method
Method loads multiple datasets of different languages provided in :param languages:
It then concatenates these datasets into one combined dataset
:rtype: datasetDict containing the combined dataset
:param languages: Optional, either a string or list of strings specifying the languages
to load
"""
print(MlsumDataset.mlsum_instantiation_guide)
# Choose languages to download articles
if languages == "all":
selected_languages = MlsumDataset.supported_languages
elif isinstance(languages, list):
for language in languages:
assert self.is_supported(language)
selected_languages = languages
else:
assert self.is_supported(languages)
selected_languages = [languages]
# Concatenate selected languaeges into one dataset
language_datasets = []
for language in selected_languages:
dataset = super()._load_dataset_safe(
"mlsum",
language,
)
language_datasets.append(dataset)
mlsum_dataset = self._concatenate_dataset_dicts(language_datasets)
return mlsum_dataset
def _process_data(self, data: Dataset) -> Generator[SummInstance, None, None]:
"""
Overrides the SummDataset '_process_data()' method
This method processes the data contained in the dataset
and puts each data instance into a SummInstance object
:param dataset: a train/validation/test dataset
:rtype: a generator yielding SummInstance objects
"""
for instance in tqdm(data):
article: List = instance["text"]
summary: str = instance["summary"]
summ_instance = SummInstance(source=article, summary=summary)
yield summ_instance
def is_supported(self, language: str):
"""
Checks whether the requested langues is supported
:param language: string containing the requested language
:rtype bool:
"""
if language not in MlsumDataset.supported_languages:
print(MlsumDataset.mlsum_instantiation_guide)
raise ValueError(
f"The language(s): '{language}' entered is not supported. See above message for usage info"
)
else:
return True
# Non-huggingface datasets
class ScisummnetDataset(SummDataset):
"""
The SciSummNet dataset. As a dataset not included by huggingface, we need to do manually download, set basic
information for the dataset
"""
dataset_name = "ScisummNet"
version = "1.1.0"
description = (
"A summary of scientific papers should ideally incorporate the impact of the papers on the "
"research community reflected by citations. To facilitate research in citation-aware scientific "
"paper summarization (Scisumm), the CL-Scisumm shared task has been organized since 2014 for "
"papers in the computational linguistics and NLP domain."
)
is_dialogue_based = False
is_multi_document = False
is_query_based = False
huggingface_dataset = False
builder_script_path = path.join(
BASE_NONHUGGINGFACE_DATASETS_PATH, dataset_name.lower() + ".py"
)
def __init__(self, seed=None):
super().__init__()
def _process_data(self, data: Dataset) -> Generator[SummInstance, None, None]:
"""
Overrides the SummDataset '_process_data()' method
This method processes the data contained in the dataset
and puts each data instance into a SummInstance object
:param dataset: a train/validation/test dataset
:rtype: a generator yielding SummInstance objects
"""
for instance in tqdm(data):
docs: List = [
instance["document_xml"],
instance["citing_sentences_annotated.json"],
]
summary: str = instance["summary"]
summ_instance = SummInstance(source=docs, summary=summary)
yield summ_instance
class SummscreenDataset(SummDataset):
"""
The SummScreen dataset. As a dataset not included by huggingface, we need to do manually download, set basic
information for the dataset
"""
dataset_name = "Summscreen"
version = "1.1.0"
is_dialogue_based = True
is_multi_document = False
is_query_based = False
huggingface_dataset = False
builder_script_path = path.join(
BASE_NONHUGGINGFACE_DATASETS_PATH, dataset_name.lower() + ".py"
)
def __init__(self, seed=None):
super().__init__()
def _process_data(self, data: Dataset) -> Generator[SummInstance, None, None]:
"""
Overrides the SummDataset '_process_data()' method
This method processes the data contained in the dataset
and puts each data instance into a SummInstance object
:param dataset: a train/validation/test dataset
:rtype: a generator yielding SummInstance objects
"""
for instance in tqdm(data):
transcript: List = instance[
"transcript"
] # convert string into a list of string dialogues
recap: str = instance["recap"]
summ_instance = SummInstance(source=transcript, summary=recap)
yield summ_instance
class QMsumDataset(SummDataset):
"""
QMSum Dataset
"""
dataset_name = "QMsum"
description = """
QMSum is a new human-annotated benchmark for query-based multi-domain meeting summarization task,
which consists of 1,808 query-summary pairs over 232 meetings in multiple domains.
"""
is_dialogue_based = True
is_multi_document = False
is_query_based = True
huggingface_dataset = False
builder_script_path = path.join(
BASE_NONHUGGINGFACE_DATASETS_PATH, dataset_name.lower() + ".py"
)
def __init__(self):
super().__init__()
def _process_data(self, data: Dataset) -> Generator[SummInstance, None, None]:
"""
Overrides the SummDataset '_process_data()' method
This method processes the data contained in the dataset
and puts each data instance into a SummInstance object
:param dataset: a train/validation/test dataset
:rtype: a generator yielding SummInstance objects
"""
for instance in tqdm(data):
for query_set in (
instance["general_query_list"] + instance["specific_query_list"]
):
meeting: List = [
utterance["speaker"] + " : " + utterance["content"]
for utterance in instance["meeting_transcripts"]
]
query: str = query_set["query"]
summary: str = query_set["answer"]
summ_instance = SummInstance(
source=meeting, summary=summary, query=query
)
yield summ_instance
class ArxivDataset(SummDataset):
"""
The Arxiv Dataset
"""
dataset_name = "Arxiv_longsummarization"
description = """
A summarization dataset comprised of pairs of scientific papers.
The dataset provides a challenging testbed for abstractive summarization.
It contains papers and their abstracts.
"""
is_dialogue_based = False
is_multi_document = False
is_query_based = False
huggingface_dataset = False
builder_script_path = path.join(
BASE_NONHUGGINGFACE_DATASETS_PATH, dataset_name.lower() + ".py"
)
def __init__(self):
print(
"*****************",
"***Attention***",
"This dataset is quite large (approx 5Gb and will need about 15 Gb for the extraction process",
"Cancel/interrupt the download if size and time constraints will not be met",
"*****************",
sep="\n",
)
super().__init__()
def _process_data(self, data: Dataset) -> Generator[SummInstance, None, None]:
"""
Overrides the SummDataset '_process_data()' method
This method processes the data contained in the dataset
and puts each data instance into a SummInstance object
:param dataset: a train/validation/test dataset
:rtype: a generator yielding SummInstance objects
"""
for instance in tqdm(data):
article: List = instance["article_text"]
abstract: str = " ".join(instance["abstract_text"])
summ_instance = SummInstance(source=article, summary=abstract)
yield summ_instance