cahya committed
Commit 878a84a · 1 Parent(s): 288b57e

Update README.md

Files changed (1)
  1. README.md +6 -7
README.md CHANGED
```diff
@@ -57,8 +57,8 @@ def load_dataset_sundanese():
 
     dfs = []
 
-    dfs.append(pd.read_csv(filenames[0], sep='\\\\t\\\\t', names=["path", "sentence"]))
-    dfs.append(pd.read_csv(filenames[1], sep='\\\\t\\\\t', names=["path", "sentence"]))
+    dfs.append(pd.read_csv(filenames[0], sep='\\\\\\\\t\\\\\\\\t', names=["path", "sentence"]))
+    dfs.append(pd.read_csv(filenames[1], sep='\\\\\\\\t\\\\\\\\t', names=["path", "sentence"]))
 
     for i, dir in enumerate(data_dirs):
         dfs[i]["path"] = dfs[i].apply(lambda row: str(data_dirs[i]) + "/" + row + ".wav", axis=1)
@@ -124,8 +124,8 @@ def load_dataset_sundanese():
 
     dfs = []
 
-    dfs.append(pd.read_csv(filenames[0], sep='\\\\t\\\\t', names=["path", "sentence"]))
-    dfs.append(pd.read_csv(filenames[1], sep='\\\\t\\\\t', names=["path", "sentence"]))
+    dfs.append(pd.read_csv(filenames[0], sep='\\\\\\\\t\\\\\\\\t', names=["path", "sentence"]))
+    dfs.append(pd.read_csv(filenames[1], sep='\\\\\\\\t\\\\\\\\t', names=["path", "sentence"]))
 
     for i, dir in enumerate(data_dirs):
         dfs[i]["path"] = dfs[i].apply(lambda row: str(data_dirs[i]) + "/" + row + ".wav", axis=1)
```
```diff
@@ -145,7 +145,7 @@ processor = Wav2Vec2Processor.from_pretrained("cahya/wav2vec2-large-xlsr-sundane
 model = Wav2Vec2ForCTC.from_pretrained("cahya/wav2vec2-large-xlsr-sundanese")
 model.to("cuda")
 
-chars_to_ignore_regex = '[\,\?\.\!\-\;\:\"\“\%\‘\'\”_\�]'
+chars_to_ignore_regex = '[\\,\\?\\.\\!\\-\\;\\:\\"\\“\\%\\‘\\'\\”_\\�]'
 resampler = torchaudio.transforms.Resample(48_000, 16_000)
 
 # Preprocessing the datasets.
```
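The line changed in this hunk is the character class of punctuation that the evaluation snippet strips from the reference sentences, and the surrounding context resamples the 48 kHz recordings to the 16 kHz input rate the model expects; the hunk only changes how the backslashes are escaped. A minimal sketch of that preprocessing, with an abbreviated character class and a hypothetical helper name:

```python
import re
import torchaudio

# Abbreviated stand-in for the card's character class of punctuation to ignore.
chars_to_ignore_regex = '[\\,\\?\\.\\!\\-\\;\\:\\"\\%_]'
resampler = torchaudio.transforms.Resample(48_000, 16_000)

def prepare_example(path, sentence):
    # Lower-case the transcript and drop the ignored punctuation.
    cleaned = re.sub(chars_to_ignore_regex, "", sentence).lower()
    # Load the 48 kHz recording and resample it to 16 kHz for Wav2Vec2.
    speech_array, sampling_rate = torchaudio.load(path)
    speech = resampler(speech_array).squeeze().numpy()
    return speech, cleaned

# Hypothetical usage (file name and sentence are made up):
# speech, text = prepare_example("su_id_female/wavs/sunda_0001.wav", "Abdi badé angkat, ka Bandung.")
```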
```diff
@@ -179,7 +179,6 @@ print("WER: {:2f}".format(100 * wer.compute(predictions=result["pred_strings"],
 
 ## Training
 
-The Common Voice `train`, `validation`, and ... datasets were used for training as well as ... and ... # TODO
-
+[OpenSLR High quality TTS data for Sundanese](https://openslr.org/44/) was used for training.
 The script used for training can be found [here](https://github.com/cahya-wirawan/indonesian-speech-recognition/blob/main/XLSR_Wav2Vec2_for_Indonesian_Evaluation-Sundanese.ipynb)
 and to [evaluate it](https://github.com/cahya-wirawan/indonesian-speech-recognition/blob/main/XLSR_Wav2Vec2_for_Indonesian_Evaluation-Sundanese.ipynb)
```
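The hunk header above shows the line where the card reports its result, a word error rate. As a small illustration of that metric call, assuming the card's `wer` object comes from `datasets.load_metric("wer")` as in the standard XLSR evaluation template, with made-up transcripts standing in for `result["pred_strings"]` and the reference sentences:

```python
from datasets import load_metric

wer = load_metric("wer")

# Made-up predictions and references, standing in for the real model outputs
# and ground-truth transcripts.
predictions = ["abdi angkat ka bandung", "hatur nuhun"]
references = ["abdi badé angkat ka bandung", "hatur nuhun"]

print("WER: {:2f}".format(100 * wer.compute(predictions=predictions, references=references)))
```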
 