# HuggingFace challenge - Debugger notebook
Run this notebook to verify your library versions, check the GPU configuration, and run a quick training job.

In [1]:
import platform
import multiprocessing

import torch
import transformers
import datasets

import soundfile

## Print the main environment information

In [2]:
# Environment report: one entry per line, emitted with a single print call.
print(
    f"Platform: {platform.platform()}",
    f"CPU cores: {multiprocessing.cpu_count()}",
    f"Python version: {platform.python_version()}",
    f"PyTorch version: {torch.__version__}",
    f"GPU is visible: {torch.cuda.is_available()}",
    f"Transformers version: {transformers.__version__}",
    f"Datasets version: {datasets.__version__}",
    f"soundfile version: {soundfile.__version__}",
    sep="\n",
)

Platform: Linux-5.11.0-37-generic-x86_64-with-glibc2.10
CPU cores: 60
Python version: 3.8.8
PyTorch version: 1.10.1+cu102
GPU is visible: True
Transformers version: 4.16.0.dev0
Datasets version: 1.17.1.dev0
soundfile version: 0.10.3


## Check your GPU information (if any)
If you launched an AI Training job with GPU resources, they should be listed below (Tesla V100s 32GB).
The driver and CUDA versions are shown in the header row of the `nvidia-smi` output.

In [3]:
!nvidia-smi

Wed Jan 26 10:18:27 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.57.02    Driver Version: 470.57.02    CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100S-PCI...  Off  | 00000000:00:06.0 Off |                    0 |
| N/A   35C    P0    26W / 250W |      4MiB / 32510MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [4]:
from huggingface_hub import notebook_login

# Displays the Hugging Face login widget in the cell output.
# NOTE(review): later cells pass use_auth_token=True and push to the Hub;
# presumably they rely on the credentials entered here — confirm.
notebook_login()

VBox(children=(HTML(value='<center>\n<img src=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

## Quick training run with a dummy model and data
For more information, see https://github.com/huggingface/transformers/tree/master/examples/pytorch/speech-recognition

In [5]:
!wget -O run_speech_recognition_ctc.py https://raw.githubusercontent.com/huggingface/transformers/master/examples/pytorch/speech-recognition/run_speech_recognition_ctc.py

--2022-01-22 15:01:09--  https://raw.githubusercontent.com/huggingface/transformers/master/examples/pytorch/speech-recognition/run_speech_recognition_ctc.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.108.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 30348 (30K) [text/plain]
Saving to: ‘run_speech_recognition_ctc.py’


2022-01-22 15:01:09 (20.1 MB/s) - ‘run_speech_recognition_ctc.py’ saved [30348/30348]



In [None]:
# 	--learning_rate="7.5e-5" \
# 84.5

In [None]:
!python run_speech_recognition_ctc.py \
	--dataset_name="mozilla-foundation/common_voice_7_0" \
	--model_name_or_path="facebook/wav2vec2-xls-r-300m" \
	--dataset_config_name="bg" \
	--output_dir="./wav2vec2-large-xls-r-300m-bulgarian" \
	--overwrite_output_dir \
	--num_train_epochs="100" \
	--per_device_train_batch_size="32" \
	--per_device_eval_batch_size="32" \
	--gradient_accumulation_steps="1" \
	--learning_rate="7e-5" \
	--warmup_steps="500" \
	--length_column_name="input_length" \
	--evaluation_strategy="steps" \
	--text_column_name="sentence" \
	--chars_to_ignore , ? . ! \- \; \: \" “ % ‘ ” � — ’ … \( \) \` \
	--save_steps="500" \
	--eval_steps="500" \
	--logging_steps="100" \
	--layerdrop="0.0" \
	--activation_dropout="0.1" \
	--save_total_limit="2" \
	--freeze_feature_encoder \
	--feat_proj_dropout="0.0" \
	--mask_time_prob="0.75" \
	--mask_time_length="10" \
	--mask_feature_prob="0.25" \
	--mask_feature_length="64" \
	--gradient_checkpointing \
	--use_auth_token \
	--fp16 \
	--group_by_length \
	--do_train --do_eval \
    --push_to_hub

01/26/2022 10:35:27 - INFO - __main__ - Training/evaluation parameters TrainingArguments(
_n_gpu=1,
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
bf16=False,
bf16_full_eval=False,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_pin_memory=True,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
debug=[],
deepspeed=None,
disable_tqdm=False,
do_eval=True,
do_predict=False,
do_train=True,
eval_accumulation_steps=None,
eval_steps=500,
evaluation_strategy=IntervalStrategy.STEPS,
fp16=True,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
gradient_accumulation_steps=1,
gradient_checkpointing=True,
greater_is_better=None,
group_by_length=True,
half_precision_backend=auto,
hub_model_id=None,
hub_strategy=HubStrategy.EVERY_SAVE,
hub_token=<HUB_TOKEN>,
ignore_data_skip=False,
label_names=None,
label_smoothing_factor=0.0,
learning_rate=7e-05,
length_column_name=input_length,
load_best_model_at_end=False,
local_rank=-1,
log_level=-1,
log_l

In [25]:
# !rm -rf wav2vec2-large-xls-r-300m-bashkir

In [None]:
!ls -ltr

In [23]:
!df -h

Filesystem      Size  Used Avail Use% Mounted on
overlay         3.5T  963G  2.4T  29% /
tmpfs            64M     0   64M   0% /dev
tmpfs            87G     0   87G   0% /sys/fs/cgroup
tmpfs            87G  8.0K   87G   1% /dev/shm
/dev/md0        3.5T  963G  2.4T  29% /etc/group
tmpfs            87G   12K   87G   1% /proc/driver/nvidia
/dev/vda1        49G  6.4G   42G  14% /usr/bin/nvidia-smi
udev             87G     0   87G   0% /dev/nvidia0
tmpfs            87G     0   87G   0% /proc/acpi
tmpfs            87G     0   87G   0% /proc/scsi
tmpfs            87G     0   87G   0% /sys/firmware


In [50]:
from datasets import load_dataset, load_metric, Audio

# Load the "bg" configuration of Common Voice 7.0. The "train" and "validation"
# splits are concatenated into one training set; "test" is kept separate.
common_voice_train = load_dataset("mozilla-foundation/common_voice_7_0", "bg", use_auth_token=True, split="train+validation")
common_voice_test = load_dataset("mozilla-foundation/common_voice_7_0", "bg", use_auth_token=True, split="test")

# Number of examples in the combined train+validation set.
print(len(common_voice_train))

Reusing dataset common_voice (/workspace/.cache/huggingface/datasets/mozilla-foundation___common_voice/bg/7.0.0/33e08856cfa0d0665e837bcad73ffd920a0bc713ce8c5fffb55dbdf1c084d5ba)
Reusing dataset common_voice (/workspace/.cache/huggingface/datasets/mozilla-foundation___common_voice/bg/7.0.0/33e08856cfa0d0665e837bcad73ffd920a0bc713ce8c5fffb55dbdf1c084d5ba)


2498


In [51]:
# NOTE(review): presumably a rough estimate of the total number of training steps:
# examples * 100 / 32, where 100 and 32 match --num_train_epochs and
# --per_device_train_batch_size in the training command above — confirm.
len(common_voice_train) * 100 / 32

7806.25

In [52]:
# Metadata columns dropped from both splits; listed once so the two calls stay in sync.
columns_to_drop = ["accent", "age", "client_id", "down_votes", "gender", "locale", "segment", "up_votes"]

common_voice_train = common_voice_train.remove_columns(columns_to_drop)
common_voice_test = common_voice_test.remove_columns(columns_to_drop)

In [53]:
from datasets import ClassLabel
import random
import pandas as pd
from IPython.display import display, HTML

def show_random_elements(dataset, num_examples=10):
    """Render a random selection of distinct rows from ``dataset`` as an HTML table.

    Args:
        dataset: Object that supports ``len()`` and indexing with a list of
            integer positions; the indexed result is passed to ``pd.DataFrame``.
        num_examples: Number of distinct rows to display.

    Raises:
        AssertionError: If ``num_examples`` is larger than ``len(dataset)``.
    """
    assert num_examples <= len(dataset), "Can't pick more elements than there are in the dataset."
    # random.sample draws without replacement, so the indices are unique
    # without a retry-until-unused loop.
    picks = random.sample(range(len(dataset)), num_examples)

    df = pd.DataFrame(dataset[picks])
    display(HTML(df.to_html()))

In [54]:
# Drop the "path" and "audio" columns for the preview so that only the remaining
# text column is rendered in the table.
show_random_elements(common_voice_train.remove_columns(["path", "audio"]), num_examples=10)

Unnamed: 0,sentence
0,При тия страдания се прибавяха и грозните безпокойствия за изхода на делото.
1,В неподвижните очи на тая бронзова нощ сякаш блясва живот.
2,"Който се е заловил, не е патил добро."
3,От очите на хубавицата почнаха да падат бързо едри сълзи.
4,Отведнъж те паднаха на земята и се вцепениха като умрели.
5,"И ето оттогаз Василена все се бави, все отлага."
6,"Огнянову се сви от болка сърцето, като че клъцнато от змия."
7,Вънка стана голяма бъркотия и после всичко утихна.
8,Сълзите му капеха върху огледалния под и тупкаха като зърна от разсипана леща.
9,И ти лижеш еминиите на онези хайдутяги!


In [55]:
import re

# Raw string: sequences such as "\," or "\?" are not valid Python string escapes,
# so a normal string literal triggers an invalid-escape warning. Inside the regex
# character class each backslash-escaped symbol still matches the literal symbol.
chars_to_remove_regex = r'[\,\?\.\!\-\;\:\"\“\%\‘\”\�\—\’\…\(\)\`]'


def remove_special_characters(batch):
    """Strip the punctuation in ``chars_to_remove_regex`` and lower-case the text.

    Args:
        batch: Mapping with a string under the ``"sentence"`` key.

    Returns:
        The same ``batch`` object, with ``batch["sentence"]`` replaced by the
        cleaned, lower-cased sentence.
    """
    batch["sentence"] = re.sub(chars_to_remove_regex, '', batch["sentence"]).lower()
    return batch

In [56]:
# Clean the "sentence" field of every example in both splits (punctuation removed,
# text lower-cased); the cleaned datasets replace the previous ones under the same names.
common_voice_train = common_voice_train.map(remove_special_characters)
common_voice_test = common_voice_test.map(remove_special_characters)

  0%|          | 0/2498 [00:00<?, ?ex/s]

  0%|          | 0/953 [00:00<?, ?ex/s]

In [57]:
# start_with_ar = common_voice_train.filter(lambda example: '„' in example['sentence'])
# start_with_ar[0]

In [58]:
# start_with_ar

In [59]:
def replace_hatted_characters(batch):
    """Return ``batch`` unchanged.

    All circumflex-letter substitutions in this function are commented out,
    so it currently acts as an identity function.
    """
    # Disabled substitutions, kept for reference:
    #     batch["sentence"] = re.sub('[â]', 'a', batch["sentence"])
    #     batch["sentence"] = re.sub('[î]', 'i', batch["sentence"])
    #     batch["sentence"] = re.sub('[ô]', 'o', batch["sentence"])
    #     batch["sentence"] = re.sub('[û]', 'u', batch["sentence"])
    return batch

In [60]:
# replace_hatted_characters currently returns each example unchanged (all of its
# substitutions are commented out), so this pass does not alter the sentences.
common_voice_train = common_voice_train.map(replace_hatted_characters)
common_voice_test = common_voice_test.map(replace_hatted_characters)

  0%|          | 0/2498 [00:00<?, ?ex/s]

  0%|          | 0/953 [00:00<?, ?ex/s]

In [61]:
def extract_all_chars(batch):
  """Collect the distinct characters used in a batch of sentences.

  Args:
    batch: Mapping whose ``"sentence"`` entry is an iterable of strings.

  Returns:
    A dict with two single-element lists: ``"vocab"`` wraps the list of distinct
    characters (in no particular order) and ``"all_text"`` wraps the sentences
    joined with single spaces.
  """
  joined_text = " ".join(batch["sentence"])
  return {"vocab": [list(set(joined_text))], "all_text": [joined_text]}

In [62]:
# Run extract_all_chars in batched mode with batch_size=-1; the progress bars recorded
# for this cell show a single batch per split. The original columns are removed so the
# result only holds the "vocab" and "all_text" fields returned by the function.
vocab_train = common_voice_train.map(extract_all_chars, batched=True, batch_size=-1, keep_in_memory=True, remove_columns=common_voice_train.column_names)
vocab_test = common_voice_test.map(extract_all_chars, batched=True, batch_size=-1, keep_in_memory=True, remove_columns=common_voice_test.column_names)

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [63]:
# Union of the characters found in the training and test splits (order is arbitrary).
vocab_list = list(set(vocab_train["vocab"][0]) | set(vocab_test["vocab"][0]))

In [65]:
# The two look-alike letters are different characters (the recorded result is False);
# the vocabulary built in the next cell lists both an 'a' next to 'e' and an 'а'
# among the Cyrillic letters.
'a' == 'а'

False

In [66]:
# Map every character to a consecutive integer id, assigned in sorted order.
vocab_dict = {v: k for k, v in enumerate(sorted(vocab_list))}
# Display the mapping as the cell output.
vocab_dict

{' ': 0,
 'a': 1,
 'e': 2,
 '«': 3,
 '»': 4,
 'а': 5,
 'б': 6,
 'в': 7,
 'г': 8,
 'д': 9,
 'е': 10,
 'ж': 11,
 'з': 12,
 'и': 13,
 'й': 14,
 'к': 15,
 'л': 16,
 'м': 17,
 'н': 18,
 'о': 19,
 'п': 20,
 'р': 21,
 'с': 22,
 'т': 23,
 'у': 24,
 'ф': 25,
 'х': 26,
 'ц': 27,
 'ч': 28,
 'ш': 29,
 'щ': 30,
 'ъ': 31,
 'ь': 32,
 'ю': 33,
 'я': 34,
 'ѝ': 35,
 '„': 36}

In [67]:
import json

from transformers import Wav2Vec2CTCTokenizer

# Use "|" as the word delimiter in place of the space character, keeping its id.
# Guarded so that re-running the cell does not raise KeyError once " " has been renamed.
if " " in vocab_dict:
    vocab_dict["|"] = vocab_dict.pop(" ")

# Append the special tokens after the existing ids. setdefault leaves ids from a
# previous run of this cell untouched, so the cell can be re-run safely.
vocab_dict.setdefault("[UNK]", len(vocab_dict))
vocab_dict.setdefault("[PAD]", len(vocab_dict))
print(len(vocab_dict))

# Write the vocabulary to ./vocab.json, the directory the tokenizer is loaded from below.
with open('./vocab.json', 'w') as vocab_file:
    json.dump(vocab_dict, vocab_file)

tokenizer = Wav2Vec2CTCTokenizer.from_pretrained("./", unk_token="[UNK]", pad_token="[PAD]", word_delimiter_token="|")

repo_name = "wav2vec2-large-xls-r-300m-bulgarian"

# Upload the tokenizer files; the returned value is shown as the cell output.
tokenizer.push_to_hub(repo_name)

file ./config.json not found
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


39


Cloning https://huggingface.co/infinitejoy/wav2vec2-large-xls-r-300m-bulgarian into local empty directory.
To https://huggingface.co/infinitejoy/wav2vec2-large-xls-r-300m-bulgarian
   70cec1c..d1a190e  main -> main



'https://huggingface.co/infinitejoy/wav2vec2-large-xls-r-300m-bulgarian/commit/d1a190ef83537b70975ae7e7943d9697632dcf43'

In [27]:
!wget -O eval.py https://raw.githubusercontent.com/huggingface/transformers/master/examples/research_projects/robust-speech-event/eval.py
!cp eval.py wav2vec2-large-xls-r-300m-bashkir
!ls -ltr wav2vec2-large-xls-r-300m-bashkir

--2022-01-25 05:51:53--  https://raw.githubusercontent.com/huggingface/transformers/master/examples/research_projects/robust-speech-event/eval.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.110.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4421 (4.3K) [text/plain]
Saving to: ‘eval.py’


2022-01-25 05:51:53 (11.6 MB/s) - ‘eval.py’ saved [4421/4421]

total 1232556
-rw-r--r-- 1 ovh ovh        272 Jan 25 02:49 vocab.json
-rw-r--r-- 1 ovh ovh        260 Jan 25 02:49 tokenizer_config.json
-rw-r--r-- 1 ovh ovh        309 Jan 25 02:49 special_tokens_map.json
-rw-r--r-- 1 ovh ovh         23 Jan 25 02:49 added_tokens.json
drwxr-xr-x 2 ovh ovh       4096 Jan 25 05:21 checkpoint-5500
drwxr-xr-x 2 ovh ovh       4096 Jan 25 05:35 checkpoint-6000
-rw-r--r-- 1 ovh ovh        197 Jan 25 05:46 train_results.json
-rw-r--r

In [29]:
!cd wav2vec2-large-xls-r-300m-bashkir; python eval.py --model_id ./ --dataset mozilla-foundation/common_voice_7_0 --config ba --split test --log_outputs

Reusing dataset common_voice (/workspace/.cache/huggingface/datasets/mozilla-foundation___common_voice/bas/7.0.0/33e08856cfa0d0665e837bcad73ffd920a0bc713ce8c5fffb55dbdf1c084d5ba)
100%|█████████████████████████████████████████| 375/375 [03:03<00:00,  2.04ex/s]
WER: 1.0408274360370169
CER: 2.2848350566223536
100%|██████████████████████████████████████| 375/375 [00:00<00:00, 20474.93ex/s]


In [1]:
from transformers import AutoModelForCTC, Wav2Vec2Processor

# NOTE(review): this loads the "bashkir" checkpoint, while the tokenizer pushed earlier
# in this notebook targets "wav2vec2-large-xls-r-300m-bulgarian" — confirm which
# repository is intended.
model = AutoModelForCTC.from_pretrained("infinitejoy/wav2vec2-large-xls-r-300m-bashkir")
processor = Wav2Vec2Processor.from_pretrained("infinitejoy/wav2vec2-large-xls-r-300m-bashkir")



Downloading:   0%|          | 0.00/260 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/574 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/23.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/309 [00:00<?, ?B/s]

In [23]:
from transformers import AutoModelForCTC, AutoProcessor
from datasets import load_dataset

# NOTE(review): both objects come from the "bashkir" repository, whereas
# common_voice_test was loaded with the "bg" configuration and the tokenizer pushed
# earlier targets the "bulgarian" repository — confirm which model is intended.
model = AutoModelForCTC.from_pretrained("infinitejoy/wav2vec2-large-xls-r-300m-bashkir")
processor = AutoProcessor.from_pretrained("infinitejoy/wav2vec2-large-xls-r-300m-bashkir")

# Encode the first test example, declaring its sampling rate as 16 kHz.
# NOTE(review): no visible cell casts common_voice_test's "audio" column to 16 kHz —
# confirm the array really is sampled at that rate.
input_values = processor(common_voice_test[0]["audio"]["array"], return_tensors="pt", sampling_rate=16_000).input_values
# input_values = input_values.to("cuda")

logits = model(input_values).logits

# Sanity check on the size of the last logits dimension; the failure message is the
# actual size. The recorded run of this cell failed with "AssertionError: 55".
assert logits.shape[-1] == 32, logits.shape[-1]

Downloading:   0%|          | 0.00/1.99k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.18G [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/212 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/260 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/520 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/23.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/309 [00:00<?, ?B/s]

AssertionError: 55

In [None]:
from datasets import Audio, Dataset, load_dataset, load_metric
from transformers import AutoFeatureExtractor, pipeline

# Load the combined train+validation split of the "bg" configuration.
dataset = load_dataset("mozilla-foundation/common_voice_7_0", "bg", use_auth_token=True, split="train+validation")

# for testing: only process the first ten examples
dataset = dataset.select(range(10))

repo_name = 'infinitejoy/wav2vec2-large-xls-r-300m-bulgarian'

# load the feature extractor and read the sampling rate it is configured with
feature_extractor = AutoFeatureExtractor.from_pretrained(repo_name)
# feature_extractor = processor_with_lm.feature_extractor
sampling_rate = feature_extractor.sampling_rate

# resample audio to the feature extractor's sampling rate
dataset = dataset.cast_column("audio", Audio(sampling_rate=sampling_rate))

# load eval pipeline
asr = pipeline("automatic-speech-recognition", model=repo_name, feature_extractor=feature_extractor)

# map function to decode audio
def map_to_pred(batch):
    """Transcribe one example with the ``asr`` pipeline.

    Args:
        batch: A single example with an ``"audio"`` entry (whose ``"array"``
            holds the waveform) and a ``"sentence"`` entry.

    Returns:
        The same ``batch`` with two added keys: ``"prediction"`` (the ``"text"``
        field of the pipeline output) and ``"target"`` (a copy of ``"sentence"``).
    """
    waveform = batch["audio"]["array"]
    batch["prediction"] = asr(waveform)["text"]
    batch["target"] = batch["sentence"]
    return batch

# run inference on every selected example; the original columns are removed, so the
# result keeps only the "prediction" and "target" fields added by map_to_pred
result = dataset.map(map_to_pred, remove_columns=dataset.column_names)
print(result["prediction"])

# show the reference transcription of the first example as the cell output
result[0]['target']

Reusing dataset common_voice (/workspace/.cache/huggingface/datasets/mozilla-foundation___common_voice/bg/7.0.0/33e08856cfa0d0665e837bcad73ffd920a0bc713ce8c5fffb55dbdf1c084d5ba)


Downloading:   0%|          | 0.00/1.99k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/212 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.18G [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/260 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/384 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/23.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/309 [00:00<?, ?B/s]

  0%|          | 0/10 [00:00<?, ?ex/s]