update model

Browse files

Files changed (5) hide show

README.md +54 -41
config.json +9 -9
pytorch_model.bin +2 -2
special_tokens_map.json +1 -1
vocab.json +1 -1

README.md CHANGED Viewed

@@ -4,6 +4,7 @@ datasets:
 - common_voice
 metrics:
 - wer
 tags:
 - audio
 - automatic-speech-recognition
@@ -23,53 +24,68 @@ model-index:
     metrics:
        - name: Test WER
          type: wer
-         value: 62.39
 ---
 # Wav2Vec2-Large-XLSR-53-Finnish
-Fine-tuned [facebook/wav2vec2-large-xlsr-53](https://huggingface.co/facebook/wav2vec2-large-xlsr-53) on Finnish using the [Common Voice](https://huggingface.co/datasets/common_voice).
 When using this model, make sure that your speech input is sampled at 16kHz.
 ## Usage
 The model can be used directly (without a language model) as follows:
 ```python
 import torch
-import torchaudio
 from datasets import load_dataset
 from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
 LANG_ID = "fi"
 MODEL_ID = "jonatasgrosman/wav2vec2-large-xlsr-53-finnish"
-test_dataset = load_dataset("common_voice", LANG_ID, split="test[:2%]")
 processor = Wav2Vec2Processor.from_pretrained(MODEL_ID)
 model = Wav2Vec2ForCTC.from_pretrained(MODEL_ID)
-resampler = torchaudio.transforms.Resample(48_000, 16_000)
 # Preprocessing the datasets.
 # We need to read the audio files as arrays
 def speech_file_to_array_fn(batch):
-\tspeech_array, sampling_rate = torchaudio.load(batch["path"])
-\tbatch["speech"] = resampler(speech_array).squeeze().numpy()
-\treturn batch
 test_dataset = test_dataset.map(speech_file_to_array_fn)
-inputs = processor(test_dataset["speech"][:2], sampling_rate=16_000, return_tensors="pt", padding=True)
 with torch.no_grad():
-\tlogits = model(inputs.input_values, attention_mask=inputs.attention_mask).logits
 predicted_ids = torch.argmax(logits, dim=-1)
-print("Prediction:", processor.batch_decode(predicted_ids))
-print("Reference:", test_dataset["sentence"][:2])
 ```
 ## Evaluation
@@ -77,45 +93,38 @@ The model can be evaluated as follows on the Finnish test data of Common Voice.
 ```python
 import torch
-import torchaudio
 from datasets import load_dataset, load_metric
 from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
-import re
-import homoglyphs as hg
 LANG_ID = "fi"
 MODEL_ID = "jonatasgrosman/wav2vec2-large-xlsr-53-finnish"
 DEVICE = "cuda"
-CHARS_TO_IGNORE = [",", "?", ".", "!", "-", ";", ":", '""', "%", "'", '"', "�", "·", "჻", "¿", "¡", "~", "՞", "؟", "،", "।", "॥", "«", "»", "„", "“", "”", "「", "」", "‘", "’", "《", "》"]
-CURRENCY_SYMBOLS = ["$", "£", "€", "¥", "₩", "₹", "₽", "₱", "₦", "₼", "ლ", "₭", "₴", "₲", "₫", "₡", "₵", "₿", "฿", "¢"]
 test_dataset = load_dataset("common_voice", LANG_ID, split="test")
-wer = load_metric("wer")
-unk_regex = None
-if LANG_ID in hg.Languages.get_all():
-    # creating regex to match language specific non valid characters
-    alphabet = list(hg.Languages.get_alphabet([LANG_ID]))
-    valid_chars = alphabet + CURRENCY_SYMBOLS
-    unk_regex = "[^"+re.escape("".join(valid_chars))+"\\s\\d]"
-chars_to_ignore_regex = f'[{re.escape("".join(CHARS_TO_IGNORE))}]'
 processor = Wav2Vec2Processor.from_pretrained(MODEL_ID)
 model = Wav2Vec2ForCTC.from_pretrained(MODEL_ID)
 model.to(DEVICE)
-resampler = torchaudio.transforms.Resample(48_000, 16_000)
 # Preprocessing the datasets.
 # We need to read the audio files as arrays
 def speech_file_to_array_fn(batch):
-    batch["sentence"] = re.sub(chars_to_ignore_regex, '', batch["sentence"]).lower()
-    if unk_regex is not None:
-        batch["sentence"] = re.sub(unk_regex, "[UNK]", batch["sentence"])
-    speech_array, sampling_rate = torchaudio.load(batch["path"])
-    batch["speech"] = resampler(speech_array).squeeze().numpy()
     return batch
 test_dataset = test_dataset.map(speech_file_to_array_fn)
@@ -123,18 +132,22 @@ test_dataset = test_dataset.map(speech_file_to_array_fn)
 # Preprocessing the datasets.
 # We need to read the audio files as arrays
 def evaluate(batch):
-\tinputs = processor(batch["speech"], sampling_rate=16_000, return_tensors="pt", padding=True)
-\twith torch.no_grad():
-\t\tlogits = model(inputs.input_values.to(DEVICE), attention_mask=inputs.attention_mask.to(DEVICE)).logits
-\tpred_ids = torch.argmax(logits, dim=-1)
-\tbatch["pred_strings"] = processor.batch_decode(pred_ids)
-\treturn batch
 result = test_dataset.map(evaluate, batched=True, batch_size=8)
-print("WER: {:2f}".format(100 * wer.compute(predictions=result["pred_strings"], references=result["sentence"])))
 ```
-**Test Result**: 62.39%

 - common_voice
 metrics:
 - wer
+- cer
 tags:
 - audio
 - automatic-speech-recognition
     metrics:
        - name: Test WER
          type: wer
+         value: 41.60
+       - name: Test CER
+         type: cer
+         value: 8.23
 ---
 # Wav2Vec2-Large-XLSR-53-Finnish
+Fine-tuned [facebook/wav2vec2-large-xlsr-53](https://huggingface.co/facebook/wav2vec2-large-xlsr-53) on Finnish using the [Common Voice](https://huggingface.co/datasets/common_voice) and [CSS10](https://github.com/Kyubyong/css10) datasets.
 When using this model, make sure that your speech input is sampled at 16kHz.
+The script used for training can be found here: https://github.com/jonatasgrosman/wav2vec2-sprint
 ## Usage
 The model can be used directly (without a language model) as follows:
 ```python
 import torch
+import librosa
 from datasets import load_dataset
 from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
 LANG_ID = "fi"
 MODEL_ID = "jonatasgrosman/wav2vec2-large-xlsr-53-finnish"
+SAMPLES = 5
+test_dataset = load_dataset("common_voice", LANG_ID, split=f"test[:{SAMPLES}]")
 processor = Wav2Vec2Processor.from_pretrained(MODEL_ID)
 model = Wav2Vec2ForCTC.from_pretrained(MODEL_ID)
 # Preprocessing the datasets.
 # We need to read the audio files as arrays
 def speech_file_to_array_fn(batch):
+    speech_array, sampling_rate = librosa.load(batch["path"], sr=16_000)
+    batch["speech"] = speech_array
+    batch["sentence"] = batch["sentence"].upper()
+    return batch
 test_dataset = test_dataset.map(speech_file_to_array_fn)
+inputs = processor(test_dataset["speech"], sampling_rate=16_000, return_tensors="pt", padding=True)
 with torch.no_grad():
+    logits = model(inputs.input_values, attention_mask=inputs.attention_mask).logits
 predicted_ids = torch.argmax(logits, dim=-1)
+predicted_sentences = processor.batch_decode(predicted_ids)
+for i, predicted_sentence in enumerate(predicted_sentences):
+    print("-" * 100)
+    print("Reference:", test_dataset[i]["sentence"])
+    print("Prediction:", predicted_sentence)
 ```
+| Reference  | Prediction |
+| ------------- | ------------- |
+| MYSTEERIMIES OLI OPPINUT MORAALINSA TARUISTA, ELOKUVISTA JA PELEISTÄ. | MYSTEERIMIES OLI OPPINUT MORALINSA TARUISTA ELOKUVISTA JA PELEISTÄ |
+| ÄÄNESTIN MIETINNÖN PUOLESTA! | ÄÄNESTIN MIETINNÖN PUOLESTA |
+| VAIN TUNTIA AIKAISEMMIN OLIMME MIEHENI KANSSA TUNTENEET SUURINTA ILOA. | PAIN TUNTIA AIKAISEMMIN OLIN MIEHENI KANSSA TUNTENEET SUURINTA ILAA |
+| ENSIMMÄISELLE MIEHELLE SAI KOLME LASTA. | ENSIMMÄISELLE MIEHELLE SAI KOLME LASTA |
+| ÄÄNESTIN MIETINNÖN PUOLESTA, SILLÄ POHJIMMILTAAN SIINÄ VASTUSTETAAN TÄTÄ SUUNTAUSTA. | ÄÄNESTIN MIETINNÖN PUOLESTA SILLÄ POHJIMMILTAAN SIINÄ VASTOTTETAAN TÄTÄ SUUNTAUSTA |
 ## Evaluation
 ```python
 import torch
+import re
+import warnings
+import librosa
 from datasets import load_dataset, load_metric
 from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
 LANG_ID = "fi"
 MODEL_ID = "jonatasgrosman/wav2vec2-large-xlsr-53-finnish"
 DEVICE = "cuda"
+CHARS_TO_IGNORE = [",", "?", "¿", ".", "!", "¡", ";", ":", '""', "%", '"', "�", "ʿ", "·", "჻", "~", "՞",
+                   "؟", "،", "।", "॥", "«", "»", "„", "“", "”", "「", "」", "‘", "’", "《", "》", "(", ")", "[", "]",
+                   "=", "`", "_", "+", "<", ">", "…", "–", "°", "´", "ʾ", "‹", "›", "©", "®", "—", "→", "。"]
 test_dataset = load_dataset("common_voice", LANG_ID, split="test")
+wer = load_metric("wer.py") # https://github.com/jonatasgrosman/wav2vec2-sprint/blob/main/wer.py
+cer = load_metric("cer.py") # https://github.com/jonatasgrosman/wav2vec2-sprint/blob/main/cer.py
+chars_to_ignore_regex = f"[{re.escape(''.join(CHARS_TO_IGNORE))}]"
 processor = Wav2Vec2Processor.from_pretrained(MODEL_ID)
 model = Wav2Vec2ForCTC.from_pretrained(MODEL_ID)
 model.to(DEVICE)
 # Preprocessing the datasets.
 # We need to read the audio files as arrays
 def speech_file_to_array_fn(batch):
+    with warnings.catch_warnings():
+        warnings.simplefilter("ignore")
+        speech_array, sampling_rate = librosa.load(batch["path"], sr=16_000)
+    batch["speech"] = speech_array
+    batch["sentence"] = re.sub(chars_to_ignore_regex, "", batch["sentence"]).upper()
     return batch
 test_dataset = test_dataset.map(speech_file_to_array_fn)
 # Running inference on the preprocessed dataset.
 # We feed each batch of audio arrays to the model and decode the predicted token IDs into text
 def evaluate(batch):
+	inputs = processor(batch["speech"], sampling_rate=16_000, return_tensors="pt", padding=True)
+	with torch.no_grad():
+		logits = model(inputs.input_values.to(DEVICE), attention_mask=inputs.attention_mask.to(DEVICE)).logits
+	pred_ids = torch.argmax(logits, dim=-1)
+	batch["pred_strings"] = processor.batch_decode(pred_ids)
+	return batch
 result = test_dataset.map(evaluate, batched=True, batch_size=8)
+print("WER: {:2f}".format(100 * wer.compute(predictions=result["pred_strings"], references=result["sentence"], chunk_size=1000)))
+print("CER: {:2f}".format(100 * cer.compute(predictions=result["pred_strings"], references=result["sentence"], chunk_size=1000)))
 ```
+**Test Result**:
+- WER: 41.60%
+- CER: 8.23%

config.json CHANGED Viewed

@@ -1,11 +1,11 @@
 {
-  "_name_or_path": "../models-fi/wav2vec2-large-xlsr-fi-sweep/checkpoint-500",
-  "activation_dropout": 0.0,
   "apply_spec_augment": true,
   "architectures": [
     "Wav2Vec2ForCTC"
   ],
-  "attention_dropout": 0.2,
   "bos_token_id": 1,
   "conv_bias": true,
   "conv_dim": [
@@ -42,16 +42,16 @@
   "feat_extract_activation": "gelu",
   "feat_extract_dropout": 0.0,
   "feat_extract_norm": "layer",
-  "feat_proj_dropout": 0.2,
   "final_dropout": 0.0,
   "gradient_checkpointing": true,
   "hidden_act": "gelu",
-  "hidden_dropout": 0.0,
   "hidden_size": 1024,
   "initializer_range": 0.02,
   "intermediate_size": 4096,
   "layer_norm_eps": 1e-05,
-  "layerdrop": 0.0,
   "mask_channel_length": 10,
   "mask_channel_min_space": 1,
   "mask_channel_other": 0.0,
@@ -62,7 +62,7 @@
   "mask_time_length": 10,
   "mask_time_min_space": 1,
   "mask_time_other": 0.0,
-  "mask_time_prob": 0.2,
   "mask_time_selection": "static",
   "model_type": "wav2vec2",
   "num_attention_heads": 16,
@@ -70,7 +70,7 @@
   "num_conv_pos_embeddings": 128,
   "num_feat_extract_layers": 7,
   "num_hidden_layers": 24,
-  "pad_token_id": 29,
   "transformers_version": "4.5.0.dev0",
-  "vocab_size": 30
 }

 {
+  "_name_or_path": "facebook/wav2vec2-large-xlsr-53",
+  "activation_dropout": 0.05,
   "apply_spec_augment": true,
   "architectures": [
     "Wav2Vec2ForCTC"
   ],
+  "attention_dropout": 0.1,
   "bos_token_id": 1,
   "conv_bias": true,
   "conv_dim": [
   "feat_extract_activation": "gelu",
   "feat_extract_dropout": 0.0,
   "feat_extract_norm": "layer",
+  "feat_proj_dropout": 0.05,
   "final_dropout": 0.0,
   "gradient_checkpointing": true,
   "hidden_act": "gelu",
+  "hidden_dropout": 0.05,
   "hidden_size": 1024,
   "initializer_range": 0.02,
   "intermediate_size": 4096,
   "layer_norm_eps": 1e-05,
+  "layerdrop": 0.05,
   "mask_channel_length": 10,
   "mask_channel_min_space": 1,
   "mask_channel_other": 0.0,
   "mask_time_length": 10,
   "mask_time_min_space": 1,
   "mask_time_other": 0.0,
+  "mask_time_prob": 0.05,
   "mask_time_selection": "static",
   "model_type": "wav2vec2",
   "num_attention_heads": 16,
   "num_conv_pos_embeddings": 128,
   "num_feat_extract_layers": 7,
   "num_hidden_layers": 24,
+  "pad_token_id": 0,
   "transformers_version": "4.5.0.dev0",
+  "vocab_size": 34
 }

pytorch_model.bin CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:9067ba6beee4c6ad2692651ff18d9e95c522143d8fa38e5f155af833118e156d
-size 1262056855

 version https://git-lfs.github.com/spec/v1
+oid sha256:b3293144121790976a21ddd565d25aef7024c94309d9638e12c4e77106eb5ac2
+size 1262073239

special_tokens_map.json CHANGED Viewed

	@@ -1 +1 @@
1	- {"bos_token": "<s>", "eos_token": "</s>", "unk_token": "[UNK]", "pad_token": "[PAD]"}


1	+ {"bos_token": "<s>", "eos_token": "</s>", "unk_token": "<unk>", "pad_token": "<pad>"}

vocab.json CHANGED Viewed

	@@ -1 +1 @@
1	- {"r": 0, "h": 1, "k": 2, "g": 3, "u": 4, "m": 5, "t": 6, "z": 7, "s": 8, "i": 9, "ö": 10, "v": 11, "l": 12, "q": 13, "b": 14, "e": 15, "p": 16, "y": 17, "f": 18, "d": 19, "ä": 21, "j": 22, "x": 23, "a": 24, "c": 25, "n": 26, "o": 27, "|": 20, "[UNK]": 28, "[PAD]": 29}


1	+ {"<pad>": 0, "<s>": 1, "</s>": 2, "<unk>": 3, "|": 4, "J": 5, "Q": 6, "B": 7, "X": 8, "I": 9, "D": 10, "R": 11, "U": 12, "-": 13, "K": 14, "T": 15, "L": 17, "V": 18, "Ä": 19, "A": 20, "F": 21, "S": 22, "'": 23, "G": 24, "N": 25, "Y": 26, "M": 27, "C": 28, "E": 29, "Ö": 30, "O": 31, "H": 32, "P": 33, "Z": 34}