Update README.md
Fix the code snippets with the correct sampling rate.
README.md CHANGED
@@ -124,7 +124,7 @@ class to transcribe short-form audio files (< 30-seconds) as follows:
 ```python
 import torch
 from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
-from datasets import load_dataset
+from datasets import load_dataset, Audio
 
 # config
 model_id = "kotoba-tech/kotoba-whisper-v1.0"
@@ -145,8 +145,9 @@ pipe = pipeline(
     device=device,
 )
 
-# load sample audio
-dataset = load_dataset("japanese-asr/ja_asr.
+# load sample audio & downsample to 16kHz
+dataset = load_dataset("japanese-asr/ja_asr.reazonspeech_test", split="test")
+dataset = dataset.cast_column("audio", Audio(sampling_rate=16000))
 sample = dataset[0]["audio"]
 
 # run inference
@@ -154,7 +155,7 @@ result = pipe(sample)
 print(result["text"])
 ```
 
-- To transcribe a local audio file, simply pass the path to your audio file when you call the pipeline:
+- To transcribe a local audio file, simply pass the path to your audio file when you call the pipeline (make sure the audio is sampled in 16kHz):
 ```diff
 - result = pipe(sample)
 + result = pipe("audio.mp3")
@@ -205,7 +206,8 @@ pipe = pipeline(
 )
 
 # load sample audio (concatenate instances to creaete a long audio)
-dataset = load_dataset("japanese-asr/ja_asr.
+dataset = load_dataset("japanese-asr/ja_asr.reazonspeech_test", split="test")
+dataset = dataset.cast_column("audio", Audio(sampling_rate=16000))
 sample = {"array": np.concatenate([i["array"] for i in dataset[:20]["audio"]]), "sampling_rate": dataset[0]['audio']['sampling_rate'], "path": "tmp"}
 
 # run inference
@@ -247,7 +249,8 @@ pipe = pipeline(
 )
 
 # load sample audio (concatenate instances to creaete a long audio)
-dataset = load_dataset("japanese-asr/ja_asr.
+dataset = load_dataset("japanese-asr/ja_asr.reazonspeech_test", split="test")
+dataset = dataset.cast_column("audio", Audio(sampling_rate=16000))
 sample = {"array": np.concatenate([i["array"] for i in dataset[:20]["audio"]]), "sampling_rate": dataset[0]['audio']['sampling_rate'], "path": "tmp"}
 
 # run inference
@@ -318,14 +321,14 @@ Evaluation can then be run end-to-end with the following example:
 
 ```python
 from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor
-from datasets import load_dataset,
+from datasets import load_dataset, Audio
 from evaluate import load
 import torch
 from tqdm import tqdm
 
 # config
 model_id = "kotoba-tech/kotoba-whisper-v1.0"
-dataset_name = "japanese-asr/ja_asr.
+dataset_name = "japanese-asr/ja_asr.reazonspeech_test"
 torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
 device = "cuda:0" if torch.cuda.is_available() else "cpu"
 audio_column = 'audio'
@@ -339,8 +342,7 @@ processor = AutoProcessor.from_pretrained(model_id)
 
 # load the dataset and sample the audio with 16kHz
 dataset = load_dataset(dataset_name, split="test")
-dataset = dataset.cast_column(audio_column,
-dataset = dataset.select([0, 1, 2, 3, 4, 5, 6])
+dataset = dataset.cast_column(audio_column, Audio(sampling_rate=processor.feature_extractor.sampling_rate))
 
 # preprocess and batch the dataset
 
@@ -379,7 +381,7 @@ The huggingface links to the major Japanese ASR datasets for evaluation are summ
 For example, to evaluate the model on JSUT Basic5000, change the `dataset_name`:
 
 ```diff
-- dataset_name = "japanese-asr/ja_asr.
+- dataset_name = "japanese-asr/ja_asr.reazonspeech_test"
 + dataset_name = "japanese-asr/ja_asr.jsut_basic5000"
 ```
 
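The change above leans on `datasets.Audio` to resample lazily: casting the audio column makes each example decode at the requested rate when it is accessed, so no separate preprocessing pass is needed. A minimal sketch of the pattern the new snippets use (dataset name and column taken from the diff; the prints are illustrative only):

```python
from datasets import load_dataset, Audio

# load the test split and ask `datasets` to decode audio at 16kHz on access
dataset = load_dataset("japanese-asr/ja_asr.reazonspeech_test", split="test")
dataset = dataset.cast_column("audio", Audio(sampling_rate=16000))

sample = dataset[0]["audio"]      # decoding (and resampling) happens here
print(sample["sampling_rate"])    # 16000, whatever the source rate was
print(sample["array"].shape)      # 1-D float waveform, ready for the pipeline
```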
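The note about local files being sampled at 16kHz can be met by resampling before calling the pipeline. A hedged sketch, not part of the README: it assumes `librosa` is installed and that `audio.mp3` (the placeholder path from the snippet above) exists, and it sets up the pipeline with the short one-liner rather than the README's explicit model/processor loading.

```python
import librosa
from transformers import pipeline

# assumption: kotoba-whisper loaded via the pipeline shorthand instead of the README's full setup
pipe = pipeline("automatic-speech-recognition", model="kotoba-tech/kotoba-whisper-v1.0")

# librosa resamples while loading when `sr` is given explicitly
array, sampling_rate = librosa.load("audio.mp3", sr=16000)

# the ASR pipeline accepts the same {"array", "sampling_rate"} dict as the dataset samples in the diff
result = pipe({"array": array, "sampling_rate": sampling_rate})
print(result["text"])
```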