AlexN committed on
Commit
f1b4b39
1 Parent(s): 885990e

corrected tokenizer chars

Browse files
config.json CHANGED
@@ -6,7 +6,7 @@
6
  "add_adapter": false,
7
  "apply_spec_augment": true,
8
  "architectures": [
9
- "Wav2Vec2ForCTC"
10
  ],
11
  "attention_dropout": 0.0,
12
  "bos_token_id": 1,
@@ -61,10 +61,10 @@
61
  "layerdrop": 0.0,
62
  "mask_feature_length": 10,
63
  "mask_feature_min_masks": 0,
64
- "mask_feature_prob": 0.3,
65
  "mask_time_length": 10,
66
  "mask_time_min_masks": 2,
67
- "mask_time_prob": 0.15,
68
  "model_type": "wav2vec2",
69
  "num_adapter_layers": 3,
70
  "num_attention_heads": 16,
@@ -100,7 +100,7 @@
100
  1
101
  ],
102
  "torch_dtype": "float32",
103
- "transformers_version": "4.16.0.dev0",
104
  "use_weighted_layer_sum": false,
105
  "vocab_size": 218,
106
  "xvector_output_dim": 512
 
6
  "add_adapter": false,
7
  "apply_spec_augment": true,
8
  "architectures": [
9
+ "Wav2Vec2ForPreTraining"
10
  ],
11
  "attention_dropout": 0.0,
12
  "bos_token_id": 1,
 
61
  "layerdrop": 0.0,
62
  "mask_feature_length": 10,
63
  "mask_feature_min_masks": 0,
64
+ "mask_feature_prob": 0.33,
65
  "mask_time_length": 10,
66
  "mask_time_min_masks": 2,
67
+ "mask_time_prob": 0.05,
68
  "model_type": "wav2vec2",
69
  "num_adapter_layers": 3,
70
  "num_attention_heads": 16,
 
100
  1
101
  ],
102
  "torch_dtype": "float32",
103
+ "transformers_version": "4.17.0.dev0",
104
  "use_weighted_layer_sum": false,
105
  "vocab_size": 218,
106
  "xvector_output_dim": 512
run.sh CHANGED
@@ -4,6 +4,7 @@ python run_speech_recognition_ctc.py \
4
  --model_name_or_path="facebook/wav2vec2-xls-r-300m" \
5
  --dataset_config_name="fr" \
6
  --output_dir="./" \
 
7
  --overwrite_output_dir \
8
  --num_train_epochs="5" \
9
  --per_device_train_batch_size="64" \
 
4
  --model_name_or_path="facebook/wav2vec2-xls-r-300m" \
5
  --dataset_config_name="fr" \
6
  --output_dir="./" \
7
+ --tokenizer_name_or_path="./" \
8
  --overwrite_output_dir \
9
  --num_train_epochs="5" \
10
  --per_device_train_batch_size="64" \
run_speech_recognition_ctc.py CHANGED
@@ -643,7 +643,7 @@ def main():
643
 
644
  pred.label_ids[pred.label_ids == -100] = tokenizer.pad_token_id
645
 
646
- pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
647
  # we do not want to group tokens when computing the metrics
648
  label_str = tokenizer.batch_decode(pred.label_ids, group_tokens=False)
649
 
 
643
 
644
  pred.label_ids[pred.label_ids == -100] = tokenizer.pad_token_id
645
 
646
+ pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)#being sure to remove <s> from the output
647
  # we do not want to group tokens when computing the metrics
648
  label_str = tokenizer.batch_decode(pred.label_ids, group_tokens=False)
649
 
tokenizer_config.json CHANGED
@@ -1 +1 @@
1
- {"unk_token": "[UNK]", "bos_token": "<s>", "eos_token": "</s>", "pad_token": "[PAD]", "do_lower_case": false, "word_delimiter_token": "|", "special_tokens_map_file": null, "tokenizer_file": null, "name_or_path": "./xls-r-300m-fr/", "tokenizer_class": "Wav2Vec2CTCTokenizer"}
 
1
+ {"unk_token": "[UNK]", "bos_token": null, "eos_token": null, "pad_token": "[PAD]", "do_lower_case": false, "word_delimiter_token": "|", "special_tokens_map_file": null, "tokenizer_file": null, "name_or_path": "./", "tokenizer_class": "Wav2Vec2CTCTokenizer"}
vocab.json CHANGED
@@ -1 +1 @@
1
- {"'": 1, "(": 2, ")": 3, "*": 4, ".": 5, "/": 6, "1": 7, "2": 8, "=": 9, "C": 10, "E": 11, "N": 12, "Q": 13, "R": 14, "Z": 15, "`": 16, "a": 17, "b": 18, "c": 19, "d": 20, "e": 21, "f": 22, "g": 23, "h": 24, "i": 25, "j": 26, "k": 27, "l": 28, "m": 29, "n": 30, "o": 31, "p": 32, "q": 33, "r": 34, "s": 35, "t": 36, "u": 37, "v": 38, "w": 39, "x": 40, "y": 41, "z": 42, "{": 43, "|": 0, "}": 45, "~": 46, "§": 47, "«": 48, "®": 49, "°": 50, "±": 51, "·": 52, "»": 53, "×": 54, "ß": 55, "æ": 56, "ç": 57, "ð": 58, "ø": 59, "þ": 60, "đ": 61, "ħ": 62, "ı": 63, "ł": 64, "œ": 65, "ǀ": 66, "ǃ": 67, "ɑ": 68, "ə": 69, "ɨ": 70, "ʉ": 71, "ʔ": 72, "ʻ": 73, "ʼ": 74, "ʽ": 75, "ʾ": 76, "ʿ": 77, "ː": 78, "α": 79, "β": 80, "γ": 81, "δ": 82, "ε": 83, "ζ": 84, "η": 85, "θ": 86, "ι": 87, "κ": 88, "λ": 89, "μ": 90, "ν": 91, "ο": 92, "π": 93, "ρ": 94, "ς": 95, "σ": 96, "τ": 97, "υ": 98, "φ": 99, "χ": 100, "ψ": 101, "ω": 102, "а": 103, "г": 104, "е": 105, "з": 106, "и": 107, "к": 108, "м": 109, "н": 110, "о": 111, "п": 112, "р": 113, "ц": 114, "ч": 115, "э": 116, "я": 117, "є": 118, "і": 119, "ј": 120, "џ": 121, "ҫ": 122, "ӌ": 123, "գ": 124, "զ": 125, "ا": 126, "ب": 127, "ة": 128, "د": 129, "ر": 130, "ل": 131, "م": 132, "ن": 133, "و": 134, "ي": 135, "ᄀ": 136, "ᄆ": 137, "ᄉ": 138, "ᄌ": 139, "ᅡ": 140, "ᅢ": 141, "ᅥ": 142, "ᅩ": 143, "ᅵ": 144, "ᆨ": 145, "ᆷ": 146, "ᆸ": 147, "ᆼ": 148, "ቀ": 149, "ከ": 150, "ወ": 151, "ደ": 152, "ጀ": 153, "‐": 154, "–": 155, "—": 156, "―": 157, "’": 158, "„": 159, "†": 160, "′": 161, "‹": 162, "›": 163, "⁄": 164, "₽": 165, "→": 166, "↔": 167, "∅": 168, "∆": 169, "∈": 170, "−": 171, "∞": 172, "∨": 173, "∼": 174, "≥": 175, "⊨": 176, "⋅": 177, "─": 178, "☉": 179, "ⱅ": 180, "ⱎ": 181, "い": 182, "う": 183, "た": 184, "つ": 185, "の": 186, "ひ": 187, "へ": 188, "ま": 189, "む": 190, "め": 191, "も": 192, "や": 193, "三": 194, "丹": 195, "乃": 196, "京": 197, "保": 198, "北": 199, "厳": 200, "宇": 201, "扬": 202, "文": 203, "星": 204, "术": 205, "杜": 206, "津": 207, "牡": 208, "甌": 209, "美": 210, "西": 211, "貴": 212, "青": 213, "馆": 214, "ꝑ": 215, "[UNK]": 215, "[PAD]": 216}
 
1
+ {"'": 1, "(": 2, ")": 3, "*": 4, ".": 5, "/": 6, "1": 7, "2": 8, "=": 9, "C": 10, "E": 11, "N": 12, "Q": 13, "R": 14, "Z": 15, "`": 16, "a": 17, "b": 18, "c": 19, "d": 20, "e": 21, "f": 22, "g": 23, "h": 24, "i": 25, "j": 26, "k": 27, "l": 28, "m": 29, "n": 30, "o": 31, "p": 32, "q": 33, "r": 34, "s": 35, "t": 36, "u": 37, "v": 38, "w": 39, "x": 40, "y": 41, "z": 42, "{": 43, "|": 0, "}": 45, "~": 46, "\u00a7": 47, "\u00ab": 48, "\u00ae": 49, "\u00b0": 50, "\u00b1": 51, "\u00b7": 52, "\u00bb": 53, "\u00d7": 54, "\u00df": 55, "\u00e6": 56, "\u00e7": 57, "\u00f0": 58, "\u00f8": 59, "\u00fe": 60, "\u0111": 61, "\u0127": 62, "\u0131": 63, "\u0142": 64, "\u0153": 65, "\u01c0": 66, "\u01c3": 67, "\u0251": 68, "\u0259": 69, "\u0268": 70, "\u0289": 71, "\u0294": 72, "\u02bb": 73, "\u02bc": 74, "\u02bd": 75, "\u02be": 76, "\u02bf": 77, "\u02d0": 78, "\u03b1": 79, "\u03b2": 80, "\u03b3": 81, "\u03b4": 82, "\u03b5": 83, "\u03b6": 84, "\u03b7": 85, "\u03b8": 86, "\u03b9": 87, "\u03ba": 88, "\u03bb": 89, "\u03bc": 90, "\u03bd": 91, "\u03bf": 92, "\u03c0": 93, "\u03c1": 94, "\u03c2": 95, "\u03c3": 96, "\u03c4": 97, "\u03c5": 98, "\u03c6": 99, "\u03c7": 100, "\u03c8": 101, "\u03c9": 102, "\u0430": 103, "\u0433": 104, "\u0435": 105, "\u0437": 106, "\u0438": 107, "\u043a": 108, "\u043c": 109, "\u043d": 110, "\u043e": 111, "\u043f": 112, "\u0440": 113, "\u0446": 114, "\u0447": 115, "\u044d": 116, "\u044f": 117, "\u0454": 118, "\u0456": 119, "\u0458": 120, "\u045f": 121, "\u04ab": 122, "\u04cc": 123, "\u0563": 124, "\u0566": 125, "\u0627": 126, "\u0628": 127, "\u0629": 128, "\u062f": 129, "\u0631": 130, "\u0644": 131, "\u0645": 132, "\u0646": 133, "\u0648": 134, "\u064a": 135, "\u1100": 136, "\u1106": 137, "\u1109": 138, "\u110c": 139, "\u1161": 140, "\u1162": 141, "\u1165": 142, "\u1169": 143, "\u1175": 144, "\u11a8": 145, "\u11b7": 146, "\u11b8": 147, "\u11bc": 148, "\u1240": 149, "\u12a8": 150, "\u12c8": 151, "\u12f0": 152, "\u1300": 153, "\u2010": 154, "\u2013": 155, "\u2014": 
156, "\u2015": 157, "\u2019": 158, "\u201e": 159, "\u2020": 160, "\u2032": 161, "\u2039": 162, "\u203a": 163, "\u2044": 164, "\u20bd": 165, "\u2192": 166, "\u2194": 167, "\u2205": 168, "\u2206": 169, "\u2208": 170, "\u2212": 171, "\u221e": 172, "\u2228": 173, "\u223c": 174, "\u2265": 175, "\u22a8": 176, "\u22c5": 177, "\u2500": 178, "\u2609": 179, "\u2c45": 180, "\u2c4e": 181, "\u3044": 182, "\u3046": 183, "\u305f": 184, "\u3064": 185, "\u306e": 186, "\u3072": 187, "\u3078": 188, "\u307e": 189, "\u3080": 190, "\u3081": 191, "\u3082": 192, "\u3084": 193, "\u4e09": 194, "\u4e39": 195, "\u4e43": 196, "\u4eac": 197, "\u4fdd": 198, "\u5317": 199, "\u53b3": 200, "\u5b87": 201, "\u626c": 202, "\u6587": 203, "\u661f": 204, "\u672f": 205, "\u675c": 206, "\u6d25": 207, "\u7261": 208, "\u750c": 209, "\u7f8e": 210, "\u897f": 211, "\u8cb4": 212, "\u9752": 213, "\u9986": 214, "\ua751": 215, "[UNK]": 215, "[PAD]": 216}