gagan3012 committed on
Commit
4341bdc
·
1 Parent(s): 95b10f7

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +24 -23
README.md CHANGED
@@ -44,19 +44,19 @@ import torchaudio
44
  from datasets import load_dataset
45
  from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
46
 
47
- !wget https://www.openslr.org/resources/43/ne_np_female.zip
48
- !unzip ne_np_female.zip
49
- !ls ne_np_female
50
 
51
  colnames=['path','sentence']
52
- df = pd.read_csv('/content/ne_np_female/line_index.tsv',sep='\\\\\\\\t',header=None,names = colnames)
53
- df['path'] = '/content/ne_np_female/wavs/'+df['path'] +'.wav'
54
 
55
  train, test = train_test_split(df, test_size=0.1)
56
 
57
- test.to_csv('/content/ne_np_female/line_index_test.csv')
58
 
59
- test_dataset = load_dataset('csv', data_files='/content/ne_np_female/line_index_test.csv',split = 'train')
60
 
61
  processor = Wav2Vec2Processor.from_pretrained("gagan3012/wav2vec2-xlsr-nepali")
62
  model = Wav2Vec2ForCTC.from_pretrained("gagan3012/wav2vec2-xlsr-nepali")
@@ -66,15 +66,15 @@ resampler = torchaudio.transforms.Resample(48_000, 16_000)
66
  # Preprocessing the datasets.
67
  # We need to read the audio files as arrays
68
  def speech_file_to_array_fn(batch):
69
- \\\\tspeech_array, sampling_rate = torchaudio.load(batch["path"])
70
- \\\\tbatch["speech"] = resampler(speech_array).squeeze().numpy()
71
- \\\\treturn batch
72
 
73
  test_dataset = test_dataset.map(speech_file_to_array_fn)
74
  inputs = processor(test_dataset["speech"][:2], sampling_rate=16_000, return_tensors="pt", padding=True)
75
 
76
  with torch.no_grad():
77
- \\\\tlogits = model(inputs.input_values, attention_mask=inputs.attention_mask).logits
78
 
79
  predicted_ids = torch.argmax(logits, dim=-1)
80
 
@@ -120,37 +120,38 @@ processor = Wav2Vec2Processor.from_pretrained("gagan3012/wav2vec2-xlsr-khmer")
120
  model = Wav2Vec2ForCTC.from_pretrained("gagan3012/wav2vec2-xlsr-khmer")
121
  model.to("cuda")
122
 
123
- chars_to_ignore_regex = '[\,\?\.\!\-\;\:\"\“]'
124
  resampler = torchaudio.transforms.Resample(48_000, 16_000)
125
 
126
  # Preprocessing the datasets.
127
  # We need to read the audio files as arrays
128
  def speech_file_to_array_fn(batch):
129
- batch["text"] = re.sub(chars_to_ignore_regex, '', batch["text"]).lower()
130
- speech_array, sampling_rate = torchaudio.load(batch["path"])
131
- batch["speech"] = resampler(speech_array).squeeze().numpy()
132
- return batch
133
 
134
  test_dataset = test_dataset.map(speech_file_to_array_fn)
135
 
136
  # Preprocessing the datasets.
137
  # We need to read the audio files as arrays
138
  def evaluate(batch):
139
- inputs = processor(batch["speech"], sampling_rate=16_000, return_tensors="pt", padding=True)
140
 
141
- with torch.no_grad():
142
- logits = model(inputs.input_values.to("cuda"), attention_mask=inputs.attention_mask.to("cuda")).logits
143
 
144
- pred_ids = torch.argmax(logits, dim=-1)
145
- batch["pred_strings"] = processor.batch_decode(pred_ids)
146
- return batch
147
 
148
  cer = load_metric("cer")
149
 
150
  result = test_dataset.map(evaluate, batched=True, batch_size=8)
151
 
152
  print("WER: {:2f}".format(100 * wer.compute(predictions=result["pred_strings"], references=result["text"])))
153
- print("CER: {:2f}".format(100 * cer.compute(predictions=result["pred_strings"], references=result["text"])))```
 
154
 
155
  **Test Result**: 24.96 %
156
 
 
44
  from datasets import load_dataset
45
  from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
46
 
47
+ !wget https://www.openslr.org/resources/43/km_kh_male.zip
48
+ !unzip km_kh_male.zip
49
+ !ls km_kh_male
50
 
51
  colnames=['path','sentence']
52
+ df = pd.read_csv('/content/km_kh_male/line_index.tsv',sep='\\\\\\\\\\\\\\\\t',header=None,names = colnames)
53
+ df['path'] = '/content/km_kh_male/wavs/'+df['path'] +'.wav'
54
 
55
  train, test = train_test_split(df, test_size=0.1)
56
 
57
+ test.to_csv('/content/km_kh_male/line_index_test.csv')
58
 
59
+ test_dataset = load_dataset('csv', data_files='/content/km_kh_male/line_index_test.csv',split = 'train')
60
 
61
  processor = Wav2Vec2Processor.from_pretrained("gagan3012/wav2vec2-xlsr-nepali")
62
  model = Wav2Vec2ForCTC.from_pretrained("gagan3012/wav2vec2-xlsr-nepali")
 
66
  # Preprocessing the datasets.
67
  # We need to read the audio files as arrays
68
  def speech_file_to_array_fn(batch):
69
+ \\\\\\\\tspeech_array, sampling_rate = torchaudio.load(batch["path"])
70
+ \\\\\\\\tbatch["speech"] = resampler(speech_array).squeeze().numpy()
71
+ \\\\\\\\treturn batch
72
 
73
  test_dataset = test_dataset.map(speech_file_to_array_fn)
74
  inputs = processor(test_dataset["speech"][:2], sampling_rate=16_000, return_tensors="pt", padding=True)
75
 
76
  with torch.no_grad():
77
+ \\\\\\\\tlogits = model(inputs.input_values, attention_mask=inputs.attention_mask).logits
78
 
79
  predicted_ids = torch.argmax(logits, dim=-1)
80
 
 
120
  model = Wav2Vec2ForCTC.from_pretrained("gagan3012/wav2vec2-xlsr-khmer")
121
  model.to("cuda")
122
 
123
+ chars_to_ignore_regex = '[\\,\\?\\.\\!\\-\\;\\:\\"\\“]'
124
  resampler = torchaudio.transforms.Resample(48_000, 16_000)
125
 
126
  # Preprocessing the datasets.
127
  # We need to read the audio files as arrays
128
  def speech_file_to_array_fn(batch):
129
+ \tbatch["text"] = re.sub(chars_to_ignore_regex, '', batch["text"]).lower()
130
+ \tspeech_array, sampling_rate = torchaudio.load(batch["path"])
131
+ \tbatch["speech"] = resampler(speech_array).squeeze().numpy()
132
+ \treturn batch
133
 
134
  test_dataset = test_dataset.map(speech_file_to_array_fn)
135
 
136
  # Preprocessing the datasets.
137
  # We need to read the audio files as arrays
138
  def evaluate(batch):
139
+ \tinputs = processor(batch["speech"], sampling_rate=16_000, return_tensors="pt", padding=True)
140
 
141
+ \twith torch.no_grad():
142
+ \t\tlogits = model(inputs.input_values.to("cuda"), attention_mask=inputs.attention_mask.to("cuda")).logits
143
 
144
+ \tpred_ids = torch.argmax(logits, dim=-1)
145
+ \tbatch["pred_strings"] = processor.batch_decode(pred_ids)
146
+ \treturn batch
147
 
148
  cer = load_metric("cer")
149
 
150
  result = test_dataset.map(evaluate, batched=True, batch_size=8)
151
 
152
  print("WER: {:2f}".format(100 * wer.compute(predictions=result["pred_strings"], references=result["text"])))
153
+ print("CER: {:2f}".format(100 * cer.compute(predictions=result["pred_strings"], references=result["text"])))
154
+ ```
155
 
156
  **Test Result**: 24.96 %
157