asahi417 committed on
Commit
95dbd3a
1 Parent(s): 0e4ed70

Delete reazon_custom_loader.py

Files changed (1)
  1. reazon_custom_loader.py +0 -81
reazon_custom_loader.py DELETED
@@ -1,81 +0,0 @@
- """Custom HF data loader to load a large audio dataset from local storage.
- - run `reazon_downloader.py` first to download the desired data type (["tiny", "small", "medium", "large", "all"]) locally.
- - credit: https://huggingface.co/datasets/reazon-research/reazonspeech/blob/main/reazonspeech.py
-
- Example:
- ```
- import os
- from datasets import load_dataset
-
- dataset = load_dataset(
-     f"{os.getcwd()}/reazon_custom_loader.py",
-     "tiny",
-     split="train",
-     trust_remote_code=True
- )
- ```
- """
- import os
- from glob import glob
-
- import datasets
- from datasets.tasks import AutomaticSpeechRecognition
-
- _SIZE = ["tiny", "small", "medium", "large", "all"]
-
-
- class ReazonSpeechConfig(datasets.BuilderConfig):
-
-     def __init__(self, *args, **kwargs):
-         super().__init__(*args, **kwargs)
-
-
- class ReazonSpeech(datasets.GeneratorBasedBuilder):
-     BUILDER_CONFIGS = [ReazonSpeechConfig(name=name) for name in _SIZE]
-     DEFAULT_CONFIG_NAME = "tiny"
-     DEFAULT_WRITER_BATCH_SIZE = 256
-
-     def _info(self):
-         return datasets.DatasetInfo(
-             task_templates=[AutomaticSpeechRecognition()],
-             features=datasets.Features(
-                 {
-                     "name": datasets.Value("string"),
-                     "audio": datasets.Audio(sampling_rate=16000),
-                     "transcription": datasets.Value("string"),
-                 }
-             )
-         )
-
-     def _split_generators(self, dl_manager):
-         data_dir = f"{os.path.expanduser('~')}/.cache/reazon_manual_download/{self.config.name}"
-         audio_files = glob(f"{data_dir}/*.tar")
-         audio = [dl_manager.iter_archive(path) for path in audio_files]
-         transcript_file = f"{data_dir}/{self.config.name}.{self.config.name}.tsv"
-         return [
-             datasets.SplitGenerator(
-                 name=datasets.Split.TRAIN,
-                 gen_kwargs={"audio_files": audio_files, "transcript_file": transcript_file, "audio": audio},
-             ),
-         ]
-
-     def _generate_examples(self, audio_files, transcript_file, audio):
-
-         # hash table mapping each audio file to its transcript
-         meta = {}
-         with open(transcript_file, "r", encoding="utf-8") as fp:
-             for line in fp:
-                 filename, transcription = line.rstrip("\n").split("\t")
-                 meta[filename] = transcription
-
-         # iterate over the audio archives
-         for i, audio_single_dump in enumerate(audio):
-             for filename, file in audio_single_dump:
-                 filename = filename.lstrip("./")
-                 if filename not in meta:  # skip audio without a transcription
-                     continue
-                 yield filename, {
-                     "name": filename,
-                     "audio": {"path": os.path.join(audio_files[i], filename), "bytes": file.read()},
-                     "transcription": meta[filename],
-                 }
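
For reference, the deleted loader read the output of `reazon_downloader.py` from `~/.cache/reazon_manual_download/<size>/`: one or more `*.tar` archives of audio plus a `<size>.<size>.tsv` file of tab-separated filename/transcription pairs. The following is a minimal, hedged sketch of that same TSV-to-tar pairing using only the standard library (the `tiny` config and the `print` preview are assumptions for illustration; paths and the matching logic mirror `_split_generators` / `_generate_examples` above):

```
import os
import tarfile
from glob import glob

# Assumed config; any of tiny/small/medium/large/all from the deleted loader.
size = "tiny"
data_dir = os.path.join(os.path.expanduser("~"), ".cache", "reazon_manual_download", size)

# Build the filename -> transcription table from the TSV, as _generate_examples did.
meta = {}
with open(os.path.join(data_dir, f"{size}.{size}.tsv"), encoding="utf-8") as fp:
    for line in fp:
        filename, transcription = line.rstrip("\n").split("\t")
        meta[filename] = transcription

# Walk each tar archive and pair its members with their transcriptions.
for tar_path in glob(os.path.join(data_dir, "*.tar")):
    with tarfile.open(tar_path) as tar:
        for member in tar.getmembers():
            name = member.name.lstrip("./")
            if name in meta:  # skip audio without a transcription
                print(tar_path, name, meta[name])
```

This sketch only verifies the on-disk layout the loader expected; it does not decode audio or build a `datasets.Dataset`.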