Update README.md
README.md (CHANGED)
@@ -57,8 +57,8 @@ def load_dataset_sundanese():
 
     dfs = []
 
-    dfs.append(pd.read_csv(filenames[0], sep='
-    dfs.append(pd.read_csv(filenames[1], sep='
+    dfs.append(pd.read_csv(filenames[0], sep='\t\t', names=["path", "sentence"]))
+    dfs.append(pd.read_csv(filenames[1], sep='\t\t', names=["path", "sentence"]))
 
     for i, dir in enumerate(data_dirs):
         dfs[i]["path"] = dfs[i].apply(lambda row: str(data_dirs[i]) + "/" + row + ".wav", axis=1)

@@ -124,8 +124,8 @@ def load_dataset_sundanese():
 
     dfs = []
 
-    dfs.append(pd.read_csv(filenames[0], sep='
-    dfs.append(pd.read_csv(filenames[1], sep='
+    dfs.append(pd.read_csv(filenames[0], sep='\t\t', names=["path", "sentence"]))
+    dfs.append(pd.read_csv(filenames[1], sep='\t\t', names=["path", "sentence"]))
 
     for i, dir in enumerate(data_dirs):
         dfs[i]["path"] = dfs[i].apply(lambda row: str(data_dirs[i]) + "/" + row + ".wav", axis=1)
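The two `read_csv` calls in the hunks above read the OpenSLR transcript files as tab-separated tables with `path` and `sentence` columns, and the loop that follows maps each utterance id to the path of its `.wav` file. A minimal, self-contained sketch of that loading step is shown below; the directory and file names are assumptions for illustration (the notebook linked in the Training section derives them from the extracted OpenSLR archives), and `engine='python'` is added because pandas needs it for a multi-character separator such as `'\t\t'`.

```python
from pathlib import Path
import pandas as pd

# Hypothetical layout, for illustration only: two speaker archives from
# OpenSLR SLR44, each with a transcript index and a folder of wav files.
data_dirs = [Path("su_id_female/wavs"), Path("su_id_male/wavs")]
filenames = [Path("su_id_female/line_index.tsv"), Path("su_id_male/line_index.tsv")]

dfs = []
for filename in filenames:
    # Each line holds "<utterance id><separator><sentence>".
    dfs.append(pd.read_csv(filename, sep='\t\t', engine='python',
                           names=["path", "sentence"]))

for i, data_dir in enumerate(data_dirs):
    # Map the bare utterance id to the full path of its wav file.
    dfs[i]["path"] = dfs[i]["path"].apply(lambda name: f"{data_dir}/{name}.wav")

df = pd.concat(dfs, ignore_index=True)
print(df.head())
```

Applying the lambda to the `path` column directly (rather than row-wise with `axis=1`) keeps the concatenation to plain strings.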
@@ -145,7 +145,7 @@ processor = Wav2Vec2Processor.from_pretrained("cahya/wav2vec2-large-xlsr-sundane
 model = Wav2Vec2ForCTC.from_pretrained("cahya/wav2vec2-large-xlsr-sundanese")
 model.to("cuda")
 
-chars_to_ignore_regex = '[
+chars_to_ignore_regex = '[\,\?\.\!\-\;\:\"\“\%\‘\'\”_\�]'
 resampler = torchaudio.transforms.Resample(48_000, 16_000)
 
 # Preprocessing the datasets.
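In the hunk above, `chars_to_ignore_regex` strips punctuation from the reference transcripts and `resampler` converts the 48 kHz OpenSLR recordings to the 16 kHz sampling rate Wav2Vec2 expects. Below is a hedged sketch of how the two are typically wired together in the preprocessing step of this kind of XLSR evaluation script; the function name `speech_file_to_array_fn` and the `batch` keys follow the usual recipe and are assumed here rather than taken from the diff.

```python
import re
import torchaudio

chars_to_ignore_regex = '[\,\?\.\!\-\;\:\"\“\%\‘\'\”_\�]'
resampler = torchaudio.transforms.Resample(48_000, 16_000)

def speech_file_to_array_fn(batch):
    # Normalise the transcript: drop the ignored punctuation and lowercase it.
    batch["sentence"] = re.sub(chars_to_ignore_regex, "", batch["sentence"]).lower()
    # Load the 48 kHz recording and resample it to 16 kHz for the model.
    speech_array, sampling_rate = torchaudio.load(batch["path"])
    batch["speech"] = resampler(speech_array).squeeze().numpy()
    return batch

# Typically applied with datasets.Dataset.map, e.g.:
# test_dataset = test_dataset.map(speech_file_to_array_fn)
```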
@@ -179,7 +179,6 @@ print("WER: {:2f}".format(100 * wer.compute(predictions=result["pred_strings"],
 
 ## Training
 
-
-
+[OpenSLR High quality TTS data for Sundanese](https://openslr.org/44/) was used for training.
 The script used for training can be found [here](https://github.com/cahya-wirawan/indonesian-speech-recognition/blob/main/XLSR_Wav2Vec2_for_Indonesian_Evaluation-Sundanese.ipynb)
 and to [evaluate it](https://github.com/cahya-wirawan/indonesian-speech-recognition/blob/main/XLSR_Wav2Vec2_for_Indonesian_Evaluation-Sundanese.ipynb)