Files changed (1) hide show
  1. README.md +22 -14
README.md CHANGED
@@ -85,16 +85,24 @@ To transcribe audio files the model can be used as a standalone acoustic model a
85
  transcription = processor.batch_decode(predicted_ids)
86
  ```
87
 
88
- ## Evaluation
89
-
90
- This code snippet shows how to evaluate **facebook/wav2vec2-large-960h-lv60-self** on LibriSpeech's "clean" and "other" test data.
91
 
 
 
 
 
 
 
 
 
 
 
 
92
  ```python
 
93
  from datasets import load_dataset
94
  from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
95
- import torch
96
- from jiwer import wer
97
-
98
 
99
  librispeech_eval = load_dataset("librispeech_asr", "clean", split="test")
100
 
@@ -102,21 +110,21 @@ model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-large-960h-lv60-self")
102
  processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-large-960h-lv60-self")
103
 
104
  def map_to_pred(batch):
105
- inputs = processor(batch["audio"]["array"], return_tensors="pt", padding="longest")
106
- input_values = inputs.input_values.to("cuda")
107
- attention_mask = inputs.attention_mask.to("cuda")
108
-
109
  with torch.no_grad():
110
- logits = model(input_values, attention_mask=attention_mask).logits
111
 
112
  predicted_ids = torch.argmax(logits, dim=-1)
113
  transcription = processor.batch_decode(predicted_ids)
114
- batch["transcription"] = transcription
115
  return batch
116
 
117
- result = librispeech_eval.map(map_to_pred, remove_columns=["audio"])
 
118
 
119
- print("WER:", wer(result["text"], result["transcription"]))
120
  ```
121
 
122
  *Result (WER)*:
 
85
  transcription = processor.batch_decode(predicted_ids)
86
  ```
87
 
88
+ ## Evaluation
 
 
89
 
90
+ First, make sure the required Python packages are installed. You will need `transformers` for running the Wav2Vec2 model,
91
+ `datasets` for loading the LibriSpeech dataset, and `evaluate` plus `jiwer` for computing the word-error rate (WER):
92
+
93
+ ```
94
+ pip install --upgrade pip
95
+ pip install --upgrade transformers datasets evaluate jiwer
96
+ ```
97
+
98
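+ The following code snippet shows how to evaluate **facebook/wav2vec2-large-960h-lv60-self** on LibriSpeech's "clean" and "other" test data. The snippet loads the "clean" configuration; pass "other" instead of "clean" to `load_dataset` to evaluate on the "other" test set.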
99
+ The batch size can be adjusted to fit your device's memory; it is set to `8` in this example:
100
+
101
  ```python
102
+ import torch
103
  from datasets import load_dataset
104
  from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
105
+ from evaluate import load
 
 
106
 
107
  librispeech_eval = load_dataset("librispeech_asr", "clean", split="test")
108
 
 
110
  processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-large-960h-lv60-self")
111
 
112
  def map_to_pred(batch):
113
+ audios = [audio["array"] for audio in batch["audio"]]
114
+ sampling_rate = batch["audio"][0]["sampling_rate"]
115
+ input_values = processor(audios, sampling_rate=sampling_rate, return_tensors="pt", padding="longest").input_values
 
116
  with torch.no_grad():
117
+ logits = model(input_values.to("cuda")).logits
118
 
119
  predicted_ids = torch.argmax(logits, dim=-1)
120
  transcription = processor.batch_decode(predicted_ids)
121
+ batch["transcription"] = [t for t in transcription]
122
  return batch
123
 
124
+ result = librispeech_eval.map(map_to_pred, batched=True, batch_size=8, remove_columns=["audio"])
125
+ wer = load("wer")
126
 
127
+ print("WER:", wer.compute(references=result["text"], predictions=result["transcription"]))
128
  ```
129
 
130
  *Result (WER)*: