{ "cells": [ { "cell_type": "code", "execution_count": 1, "id": "85541f92", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "02/07/2022 11:34:12 - WARNING - __main__ - Process rank: -1, device: cuda:0, n_gpu: 1distributed training: False, 16-bits training: True\n", "02/07/2022 11:34:12 - INFO - __main__ - Training/evaluation parameters TrainingArguments(\n", "_n_gpu=1,\n", "adafactor=False,\n", "adam_beta1=0.9,\n", "adam_beta2=0.999,\n", "adam_epsilon=1e-08,\n", "bf16=False,\n", "bf16_full_eval=False,\n", "dataloader_drop_last=False,\n", "dataloader_num_workers=0,\n", "dataloader_pin_memory=True,\n", "ddp_bucket_cap_mb=None,\n", "ddp_find_unused_parameters=None,\n", "debug=[],\n", "deepspeed=None,\n", "disable_tqdm=False,\n", "do_eval=True,\n", "do_predict=False,\n", "do_train=True,\n", "eval_accumulation_steps=None,\n", "eval_steps=400,\n", "evaluation_strategy=IntervalStrategy.STEPS,\n", "fp16=True,\n", "fp16_backend=auto,\n", "fp16_full_eval=False,\n", "fp16_opt_level=O1,\n", "gradient_accumulation_steps=1,\n", "gradient_checkpointing=True,\n", "greater_is_better=None,\n", "group_by_length=True,\n", "half_precision_backend=auto,\n", "hub_model_id=None,\n", "hub_strategy=HubStrategy.EVERY_SAVE,\n", "hub_token=,\n", "ignore_data_skip=False,\n", "label_names=None,\n", "label_smoothing_factor=0.0,\n", "learning_rate=0.0003,\n", "length_column_name=input_length,\n", "load_best_model_at_end=False,\n", "local_rank=-1,\n", "log_level=-1,\n", "log_level_replica=-1,\n", "log_on_each_node=True,\n", "logging_dir=./runs/Feb07_11-34-12_job-7e123c9a-c8eb-4ec4-9153-164c740ace86,\n", "logging_first_step=False,\n", "logging_nan_inf_filter=True,\n", "logging_steps=500,\n", "logging_strategy=IntervalStrategy.STEPS,\n", "lr_scheduler_type=SchedulerType.LINEAR,\n", "max_grad_norm=1.0,\n", "max_steps=8000,\n", "metric_for_best_model=None,\n", "mp_parameters=,\n", "no_cuda=False,\n", "num_train_epochs=3.0,\n", "optim=OptimizerNames.ADAMW_HF,\n", "output_dir=./,\n", "overwrite_output_dir=True,\n", "past_index=-1,\n", "per_device_eval_batch_size=8,\n", "per_device_train_batch_size=16,\n", "prediction_loss_only=False,\n", "push_to_hub=True,\n", "push_to_hub_model_id=None,\n", "push_to_hub_organization=None,\n", "push_to_hub_token=,\n", "remove_unused_columns=True,\n", "report_to=[],\n", "resume_from_checkpoint=None,\n", "run_name=./,\n", "save_on_each_node=False,\n", "save_steps=200,\n", "save_strategy=IntervalStrategy.STEPS,\n", "save_total_limit=3,\n", "seed=42,\n", "sharded_ddp=[],\n", "skip_memory_metrics=True,\n", "tf32=None,\n", "tpu_metrics_debug=False,\n", "tpu_num_cores=None,\n", "use_legacy_prediction_loop=False,\n", "warmup_ratio=0.0,\n", "warmup_steps=500,\n", "weight_decay=0.0,\n", "xpu_backend=None,\n", ")\n", "02/07/2022 11:34:14 - WARNING - datasets.builder - Reusing dataset common_voice (/workspace/.cache/huggingface/datasets/mozilla-foundation___common_voice/hi/7.0.0/fe20cac47c166e25b1f096ab661832e3da7cf298ed4a91dcaa1343ad972d175b)\n", "02/07/2022 11:34:17 - WARNING - datasets.builder - Reusing dataset common_voice (/workspace/.cache/huggingface/datasets/mozilla-foundation___common_voice/hi/7.0.0/fe20cac47c166e25b1f096ab661832e3da7cf298ed4a91dcaa1343ad972d175b)\n", "02/07/2022 11:34:17 - WARNING - datasets.arrow_dataset - Loading cached processed dataset at /workspace/.cache/huggingface/datasets/mozilla-foundation___common_voice/hi/7.0.0/fe20cac47c166e25b1f096ab661832e3da7cf298ed4a91dcaa1343ad972d175b/cache-56c362c60f5a4e8d.arrow\n", "02/07/2022 11:34:17 - 
WARNING - datasets.arrow_dataset - Loading cached processed dataset at /workspace/.cache/huggingface/datasets/mozilla-foundation___common_voice/hi/7.0.0/fe20cac47c166e25b1f096ab661832e3da7cf298ed4a91dcaa1343ad972d175b/cache-c2794d9326d1e793.arrow\n", "loading configuration file https://huggingface.co/facebook/wav2vec2-xls-r-300m/resolve/main/config.json from cache at /workspace/.cache/huggingface/transformers/dabc27df63e37bd2a7a221c7774e35f36a280fbdf917cf54cadfc7df8c786f6f.a3e4c3c967d9985881e0ae550a5f6f668f897db5ab2e0802f9b97973b15970e6\n", "Model config Wav2Vec2Config {\n", " \"_name_or_path\": \"facebook/wav2vec2-xls-r-300m\",\n", " \"activation_dropout\": 0.0,\n", " \"adapter_kernel_size\": 3,\n", " \"adapter_stride\": 2,\n", " \"add_adapter\": false,\n", " \"apply_spec_augment\": true,\n", " \"architectures\": [\n", " \"Wav2Vec2ForPreTraining\"\n", " ],\n", " \"attention_dropout\": 0.1,\n", " \"bos_token_id\": 1,\n", " \"classifier_proj_size\": 256,\n", " \"codevector_dim\": 768,\n", " \"contrastive_logits_temperature\": 0.1,\n", " \"conv_bias\": true,\n", " \"conv_dim\": [\n", " 512,\n", " 512,\n", " 512,\n", " 512,\n", " 512,\n", " 512,\n", " 512\n", " ],\n", " \"conv_kernel\": [\n", " 10,\n", " 3,\n", " 3,\n", " 3,\n", " 3,\n", " 2,\n", " 2\n", " ],\n", " \"conv_stride\": [\n", " 5,\n", " 2,\n", " 2,\n", " 2,\n", " 2,\n", " 2,\n", " 2\n", " ],\n", " \"ctc_loss_reduction\": \"sum\",\n", " \"ctc_zero_infinity\": false,\n", " \"diversity_loss_weight\": 0.1,\n", " \"do_stable_layer_norm\": true,\n", " \"eos_token_id\": 2,\n", " \"feat_extract_activation\": \"gelu\",\n", " \"feat_extract_dropout\": 0.0,\n", " \"feat_extract_norm\": \"layer\",\n", " \"feat_proj_dropout\": 0.1,\n", " \"feat_quantizer_dropout\": 0.0,\n", " \"final_dropout\": 0.0,\n", " \"gradient_checkpointing\": false,\n", " \"hidden_act\": \"gelu\",\n", " \"hidden_dropout\": 0.1,\n", " \"hidden_size\": 1024,\n", " \"initializer_range\": 0.02,\n", " \"intermediate_size\": 4096,\n", " \"layer_norm_eps\": 1e-05,\n", " \"layerdrop\": 0.1,\n", " \"mask_feature_length\": 10,\n", " \"mask_feature_min_masks\": 0,\n", " \"mask_feature_prob\": 0.0,\n", " \"mask_time_length\": 10,\n", " \"mask_time_min_masks\": 2,\n", " \"mask_time_prob\": 0.075,\n", " \"model_type\": \"wav2vec2\",\n", " \"num_adapter_layers\": 3,\n", " \"num_attention_heads\": 16,\n", " \"num_codevector_groups\": 2,\n", " \"num_codevectors_per_group\": 320,\n", " \"num_conv_pos_embedding_groups\": 16,\n", " \"num_conv_pos_embeddings\": 128,\n", " \"num_feat_extract_layers\": 7,\n", " \"num_hidden_layers\": 24,\n", " \"num_negatives\": 100,\n", " \"output_hidden_size\": 1024,\n", " \"pad_token_id\": 0,\n", " \"proj_codevector_dim\": 768,\n", " \"tdnn_dilation\": [\n", " 1,\n", " 2,\n", " 3,\n", " 1,\n", " 1\n", " ],\n", " \"tdnn_dim\": [\n", " 512,\n", " 512,\n", " 512,\n", " 512,\n", " 1500\n", " ],\n", " \"tdnn_kernel\": [\n", " 5,\n", " 3,\n", " 3,\n", " 1,\n", " 1\n", " ],\n", " \"torch_dtype\": \"float32\",\n", " \"transformers_version\": \"4.16.2\",\n", " \"use_weighted_layer_sum\": false,\n", " \"vocab_size\": 32,\n", " \"xvector_output_dim\": 512\n", "}\n", "\n", "100%|█████████████████████████████████████████████| 1/1 [00:00<00:00, 3.97ba/s]\n", "100%|█████████████████████████████████████████████| 1/1 [00:00<00:00, 15.14ba/s]\n", "{'j', 'y', 'x', 'l', 'm', 'ज़', 'क', 'ऐ', 'ऊ', 'त', 'ञ', 'p', 'n', 'u', 'ी', 'ऋ', 'ठ', 'छ', 'ा', 'क़', '&', 'c', 'ण', 'ढ़', 'w', 'अ', 'r', ' ', 'ष', 'ट', \"'\", 'ग़', 'f', 'k', 't', 'ृ', 'v', 'भ', 'g', 'स', 'ऑ', 'े', 'झ', 'z', 
'ो', 'इ', '|', '।', 'ु', 'ड़', 'ए', 'h', 'ब', 'ध', 'ग', 'ः', 'i', 'श', 'औ', 'र', 'e', 'य', 'ड', 'प', 'ि', 's', 'म', 'b', 'ख', '़', 'ल', 'ई', 'उ', 'द', 'ज', 'ढ', 'ओ', 'ॉ', 'ं', 'च', 'a', 'न', 'ै', 'घ', 'थ', 'o', 'फ', 'ँ', 'आ', 'd', 'ौ', 'ॅ', 'व', 'ू', '्', 'ह'} 96 {' ': 0, '&': 1, \"'\": 2, 'a': 3, 'b': 4, 'c': 5, 'd': 6, 'e': 7, 'f': 8, 'g': 9, 'h': 10, 'i': 11, 'j': 12, 'k': 13, 'l': 14, 'm': 15, 'n': 16, 'o': 17, 'p': 18, 'r': 19, 's': 20, 't': 21, 'u': 22, 'v': 23, 'w': 24, 'x': 25, 'y': 26, 'z': 27, '|': 28, 'ँ': 29, 'ं': 30, 'ः': 31, 'अ': 32, 'आ': 33, 'इ': 34, 'ई': 35, 'उ': 36, 'ऊ': 37, 'ऋ': 38, 'ए': 39, 'ऐ': 40, 'ऑ': 41, 'ओ': 42, 'औ': 43, 'क': 44, 'ख': 45, 'ग': 46, 'घ': 47, 'च': 48, 'छ': 49, 'ज': 50, 'झ': 51, 'ञ': 52, 'ट': 53, 'ठ': 54, 'ड': 55, 'ढ': 56, 'ण': 57, 'त': 58, 'थ': 59, 'द': 60, 'ध': 61, 'न': 62, 'प': 63, 'फ': 64, 'ब': 65, 'भ': 66, 'म': 67, 'य': 68, 'र': 69, 'ल': 70, 'व': 71, 'श': 72, 'ष': 73, 'स': 74, 'ह': 75, '़': 76, 'ा': 77, 'ि': 78, 'ी': 79, 'ु': 80, 'ू': 81, 'ृ': 82, 'ॅ': 83, 'े': 84, 'ै': 85, 'ॉ': 86, 'ो': 87, 'ौ': 88, '्': 89, 'क़': 90, 'ग़': 91, 'ज़': 92, 'ड़': 93, 'ढ़': 94, '।': 95}\n", "{'&': 1, \"'\": 2, 'a': 3, 'b': 4, 'c': 5, 'd': 6, 'e': 7, 'f': 8, 'g': 9, 'h': 10, 'i': 11, 'j': 12, 'k': 13, 'l': 14, 'm': 15, 'n': 16, 'o': 17, 'p': 18, 'r': 19, 's': 20, 't': 21, 'u': 22, 'v': 23, 'w': 24, 'x': 25, 'y': 26, 'z': 27, '|': 28, 'ँ': 29, 'ं': 30, 'ः': 31, 'अ': 32, 'आ': 33, 'इ': 34, 'ई': 35, 'उ': 36, 'ऊ': 37, 'ऋ': 38, 'ए': 39, 'ऐ': 40, 'ऑ': 41, 'ओ': 42, 'औ': 43, 'क': 44, 'ख': 45, 'ग': 46, 'घ': 47, 'च': 48, 'छ': 49, 'ज': 50, 'झ': 51, 'ञ': 52, 'ट': 53, 'ठ': 54, 'ड': 55, 'ढ': 56, 'ण': 57, 'त': 58, 'थ': 59, 'द': 60, 'ध': 61, 'न': 62, 'प': 63, 'फ': 64, 'ब': 65, 'भ': 66, 'म': 67, 'य': 68, 'र': 69, 'ल': 70, 'व': 71, 'श': 72, 'ष': 73, 'स': 74, 'ह': 75, '़': 76, 'ा': 77, 'ि': 78, 'ी': 79, 'ु': 80, 'ू': 81, 'ृ': 82, 'ॅ': 83, 'े': 84, 'ै': 85, 'ॉ': 86, 'ो': 87, 'ौ': 88, '्': 89, 'क़': 90, 'ग़': 91, 'ज़': 92, 'ड़': 93, 'ढ़': 94, '।': 95, '$': 0, '[UNK]': 96, '[PAD]': 97}\n", "Didn't find file ./tokenizer_config.json. We won't load it.\n", "Didn't find file ./added_tokens.json. We won't load it.\n", "Didn't find file ./special_tokens_map.json. We won't load it.\n", "Didn't find file ./tokenizer.json. 
We won't load it.\n", "loading file ./vocab.json\n", "loading file None\n", "loading file None\n", "loading file None\n", "loading file None\n", "file ./config.json not found\n", "Adding to the vocabulary\n", "Adding to the vocabulary\n", "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n", "loading configuration file https://huggingface.co/facebook/wav2vec2-xls-r-300m/resolve/main/config.json from cache at /workspace/.cache/huggingface/transformers/dabc27df63e37bd2a7a221c7774e35f36a280fbdf917cf54cadfc7df8c786f6f.a3e4c3c967d9985881e0ae550a5f6f668f897db5ab2e0802f9b97973b15970e6\n", "Model config Wav2Vec2Config {\n", " \"_name_or_path\": \"facebook/wav2vec2-xls-r-300m\",\n", " \"activation_dropout\": 0.0,\n", " \"adapter_kernel_size\": 3,\n", " \"adapter_stride\": 2,\n", " \"add_adapter\": false,\n", " \"apply_spec_augment\": true,\n", " \"architectures\": [\n", " \"Wav2Vec2ForPreTraining\"\n", " ],\n", " \"attention_dropout\": 0.1,\n", " \"bos_token_id\": 1,\n", " \"classifier_proj_size\": 256,\n", " \"codevector_dim\": 768,\n", " \"contrastive_logits_temperature\": 0.1,\n", " \"conv_bias\": true,\n", " \"conv_dim\": [\n", " 512,\n", " 512,\n", " 512,\n", " 512,\n", " 512,\n", " 512,\n", " 512\n", " ],\n", " \"conv_kernel\": [\n", " 10,\n", " 3,\n", " 3,\n", " 3,\n", " 3,\n", " 2,\n", " 2\n", " ],\n", " \"conv_stride\": [\n", " 5,\n", " 2,\n", " 2,\n", " 2,\n", " 2,\n", " 2,\n", " 2\n", " ],\n", " \"ctc_loss_reduction\": \"sum\",\n", " \"ctc_zero_infinity\": false,\n", " \"diversity_loss_weight\": 0.1,\n", " \"do_stable_layer_norm\": true,\n", " \"eos_token_id\": 2,\n", " \"feat_extract_activation\": \"gelu\",\n", " \"feat_extract_dropout\": 0.0,\n", " \"feat_extract_norm\": \"layer\",\n", " \"feat_proj_dropout\": 0.1,\n", " \"feat_quantizer_dropout\": 0.0,\n", " \"final_dropout\": 0.0,\n", " \"gradient_checkpointing\": false,\n", " \"hidden_act\": \"gelu\",\n", " \"hidden_dropout\": 0.1,\n", " \"hidden_size\": 1024,\n", " \"initializer_range\": 0.02,\n", " \"intermediate_size\": 4096,\n", " \"layer_norm_eps\": 1e-05,\n", " \"layerdrop\": 0.1,\n", " \"mask_feature_length\": 10,\n", " \"mask_feature_min_masks\": 0,\n", " \"mask_feature_prob\": 0.0,\n", " \"mask_time_length\": 10,\n", " \"mask_time_min_masks\": 2,\n", " \"mask_time_prob\": 0.075,\n", " \"model_type\": \"wav2vec2\",\n", " \"num_adapter_layers\": 3,\n", " \"num_attention_heads\": 16,\n", " \"num_codevector_groups\": 2,\n", " \"num_codevectors_per_group\": 320,\n", " \"num_conv_pos_embedding_groups\": 16,\n", " \"num_conv_pos_embeddings\": 128,\n", " \"num_feat_extract_layers\": 7,\n", " \"num_hidden_layers\": 24,\n", " \"num_negatives\": 100,\n", " \"output_hidden_size\": 1024,\n", " \"pad_token_id\": 0,\n", " \"proj_codevector_dim\": 768,\n", " \"tdnn_dilation\": [\n", " 1,\n", " 2,\n", " 3,\n", " 1,\n", " 1\n", " ],\n", " \"tdnn_dim\": [\n", " 512,\n", " 512,\n", " 512,\n", " 512,\n", " 1500\n", " ],\n", " \"tdnn_kernel\": [\n", " 5,\n", " 3,\n", " 3,\n", " 1,\n", " 1\n", " ],\n", " \"torch_dtype\": \"float32\",\n", " \"transformers_version\": \"4.16.2\",\n", " \"use_weighted_layer_sum\": false,\n", " \"vocab_size\": 32,\n", " \"xvector_output_dim\": 512\n", "}\n", "\n", "loading feature extractor configuration file https://huggingface.co/facebook/wav2vec2-xls-r-300m/resolve/main/preprocessor_config.json from cache at 
/workspace/.cache/huggingface/transformers/6fb028b95b394059e7d3b367bbca2382b576c66aebe896f04d2cd34e1b575f5b.d4484dc1c81456a2461485e7168b04347a7b9a4e3b1ef3aba723323b33e12326\n", "Feature extractor Wav2Vec2FeatureExtractor {\n", " \"do_normalize\": true,\n", " \"feature_extractor_type\": \"Wav2Vec2FeatureExtractor\",\n", " \"feature_size\": 1,\n", " \"padding_side\": \"right\",\n", " \"padding_value\": 0,\n", " \"return_attention_mask\": true,\n", " \"sampling_rate\": 16000\n", "}\n", "\n", "loading weights file https://huggingface.co/facebook/wav2vec2-xls-r-300m/resolve/main/pytorch_model.bin from cache at /workspace/.cache/huggingface/transformers/1e6a6507f3b689035cd4b247e2a37c154e27f39143f31357a49b4e38baeccc36.1edb32803799e27ed554eb7dd935f6745b1a0b17b0ea256442fe24db6eb546cd\n", "^C\n", "Traceback (most recent call last):\n", " File \"run_speech_recognition_ctc.py\", line 765, in \n", " main()\n", " File \"run_speech_recognition_ctc.py\", line 572, in main\n", " model = AutoModelForCTC.from_pretrained(\n", " File \"/workspace/.local/lib/python3.8/site-packages/transformers/models/auto/auto_factory.py\", line 447, in from_pretrained\n", " return model_class.from_pretrained(pretrained_model_name_or_path, *model_args, config=config, **kwargs)\n", " File \"/workspace/.local/lib/python3.8/site-packages/transformers/modeling_utils.py\", line 1489, in from_pretrained\n", " model = cls(config, *model_args, **model_kwargs)\n", " File \"/workspace/.local/lib/python3.8/site-packages/transformers/models/wav2vec2/modeling_wav2vec2.py\", line 1680, in __init__\n", " self.wav2vec2 = Wav2Vec2Model(config)\n", " File \"/workspace/.local/lib/python3.8/site-packages/transformers/models/wav2vec2/modeling_wav2vec2.py\", line 1234, in __init__\n", " self.encoder = Wav2Vec2EncoderStableLayerNorm(config)\n", " File \"/workspace/.local/lib/python3.8/site-packages/transformers/models/wav2vec2/modeling_wav2vec2.py\", line 859, in __init__\n", " [Wav2Vec2EncoderLayerStableLayerNorm(config) for _ in range(config.num_hidden_layers)]\n", " File \"/workspace/.local/lib/python3.8/site-packages/transformers/models/wav2vec2/modeling_wav2vec2.py\", line 859, in \n", " [Wav2Vec2EncoderLayerStableLayerNorm(config) for _ in range(config.num_hidden_layers)]\n", " File \"/workspace/.local/lib/python3.8/site-packages/transformers/models/wav2vec2/modeling_wav2vec2.py\", line 737, in __init__\n", " self.attention = Wav2Vec2Attention(\n", " File \"/workspace/.local/lib/python3.8/site-packages/transformers/models/wav2vec2/modeling_wav2vec2.py\", line 559, in __init__\n", " self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias)\n", " File \"/opt/conda/lib/python3.8/site-packages/torch/nn/modules/linear.py\", line 90, in __init__\n", " self.reset_parameters()\n", " File \"/opt/conda/lib/python3.8/site-packages/torch/nn/modules/linear.py\", line 96, in reset_parameters\n", " init.kaiming_uniform_(self.weight, a=math.sqrt(5))\n", " File \"/opt/conda/lib/python3.8/site-packages/torch/nn/init.py\", line 395, in kaiming_uniform_\n", " return tensor.uniform_(-bound, bound)\n", "KeyboardInterrupt\n" ] } ], "source": [ "!. 
run.sh" ] }, { "cell_type": "code", "execution_count": null, "id": "5c2d3236", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.8" } }, "nbformat": 4, "nbformat_minor": 5 }