Annanay committed
Commit 2f77abd · Parent(s): f4813a7

Upload copy_of_dl_project.py

Files changed (1):
  1. copy_of_dl_project.py +442 -0
copy_of_dl_project.py ADDED
@@ -0,0 +1,442 @@
+ # -*- coding: utf-8 -*-
+ """Copy_of_dl_project.ipynb
+
+ Automatically generated by Colaboratory.
+
+ Original file is located at
+     https://colab.research.google.com/drive/1SAMY64pTPqfF7T0Slzr5K_Xeu7l43Kjj
+
+ ## IGNORE EVERYTHING ABOVE THIS. IT'S TOO SLOW AND PAINFUL.
+
+ ## ONLY RUN CELLS BELOW THIS.
+ """
+
+ # using an alternate download method
+
+ !pip install datasets
+
+ from datasets import load_dataset, Audio
+
+ # use your own HuggingFace access token here -- never commit a real token
+ access_token = "hf_..."
+ gs = load_dataset("speechcolab/gigaspeech", "xs", use_auth_token=access_token)
+ gs['train'] = gs['train'].cast_column("audio", Audio(sampling_rate=16_000))
+
+ # see structure
+ print(gs)
+
+ # load audio sample on the fly
+ audio_input = gs["train"][0]["audio"]  # first decoded audio sample
+ transcription = gs["train"][0]["text"]  # first transcription
+
+ print(len(gs["train"]), len(gs["test"]), len(gs["validation"]))
+
+ """## phew, that took 25 mins to run!! we can't keep doing this every time we need to run an experiment
+ ### (^ temporarily switched to a smaller split, but it's still slow)
+
+ ## anyway, for now let's investigate the dataset
+
+ """
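+
+ """One way to skip the long download next time (a sketch, not run here): iterate over the split lazily. `streaming=True` is a standard `load_dataset` option, assuming the gigaspeech loader supports it."""
+
+ # stream examples on demand instead of downloading the whole split up front
+ gs_stream = load_dataset("speechcolab/gigaspeech", "xs", split="train",
+                          streaming=True, use_auth_token=access_token)
+ first_example = next(iter(gs_stream))  # fetches a single example over the network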
+
+ print("# training datapoints", len(gs["train"]), "# test datapoints", len(gs["test"]))
+
+ categories = ["People and Blogs", "Business", "Nonprofits and Activism", "Crime", "History", "Pets and Animals", "News and Politics", "Travel and Events", "Kids and Family", "Leisure", "N/A", "Comedy", "News and Politics", "Sports", "Arts", "Science and Technology", "Autos and Vehicles", "Science and Technology", "People and Blogs", "Music", "Society and Culture", "Education", "Howto and Style", "Film and Animation", "Gaming", "Entertainment", "Travel and Events", "Health and Fitness", "audiobook"]
+ print("number of categories", len(categories))
+
+ print("check if categories are zero indexed")
+ for i in range(0, 30):
+     print(gs["train"][i]["category"], categories[gs["train"][i]["category"] - 1])
+
+ print("i see a zero in there somewhere, so yes, they are.")
+
+ print("Category", gs["train"][0]["category"], "is", categories[gs["train"][0]["category"]])
+
+ """Ignore the next cell"""
+
+ # trying to play an audio sample
+ # (note: IPython's Audio shadows the datasets Audio imported above)
+ from IPython.display import Audio, display
+
+ # play the decoded waveform; the "path" field may not point to a local file,
+ # so use the array + sampling rate instead
+ display(Audio(gs["train"][0]["audio"]["array"],
+               rate=gs["train"][0]["audio"]["sampling_rate"], autoplay=True))
+
+ """# Keyur's implementation
+
+ ## Ok now let's load and use the wav2vec model
+ """
+
+ # see https://huggingface.co/docs/transformers/model_doc/wav2vec2#transformers.Wav2Vec2CTCTokenizer.decode.example
+
+ !pip install transformers torch
+
+ from transformers import AutoTokenizer, AutoFeatureExtractor, AutoModelForCTC
+ import torch
+
+ # import model, feature extractor, tokenizer
+ model = AutoModelForCTC.from_pretrained("facebook/wav2vec2-base-960h").to("cuda")
+ tokenizer = AutoTokenizer.from_pretrained("facebook/wav2vec2-base-960h")
+ feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-base-960h")
+
+ # load the first 10 training samples
+ first10 = [a['array'] for a in gs['train'][0:10]['audio']]
+
+ # forward the samples through the model to get greedily predicted transcription ids
+ input_values = feature_extractor(
+     first10,
+     return_tensors="pt",
+     sampling_rate=16_000,
+     padding=True
+ ).input_values.to("cuda")
+ with torch.no_grad():  # inference only, no gradients needed
+     logits = model(input_values).logits
+
+ pred_ids = torch.argmax(logits, dim=-1)
+
+ # Output word_offsets (i.e. recognized words and their timestamps).
+ # For recognized characters and their timestamps, use the analogous
+ # output_char_offsets instead.
+ outputs = tokenizer.batch_decode(pred_ids, output_word_offsets=True)
+
+ # `time_offset` in seconds is the downsampling ratio divided by the sampling rate
+ time_offset = model.config.inputs_to_logits_ratio / feature_extractor.sampling_rate
+
+ for i, (text, row) in enumerate(zip(outputs.text, outputs.word_offsets)):
+     print(f"#{i}: {text}")
+
+     for word_offset in row:
+         print({
+             "word": word_offset["word"],
+             "start_time": round(word_offset["start_offset"] * time_offset, 2),
+             "end_time": round(word_offset["end_offset"] * time_offset, 2),
+         })
+
+ """## We need to deal with this error message from loading the weights though. Are there fully-trained weights elsewhere?
+
+ ```
+ Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
+ You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
+ ```
+
+ ## (otherwise maybe it doesn't matter since we'll be finetuning anyway -- `masked_spec_embed` should only be used when SpecAugment masks inputs during training, so it likely doesn't affect inference)
+ """
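+
+ """To see exactly which weights were newly initialized (a sketch; `output_loading_info=True` is a standard `from_pretrained` flag):"""
+
+ # returns the model plus a dict of missing/unexpected keys from the checkpoint
+ _, loading_info = AutoModelForCTC.from_pretrained(
+     "facebook/wav2vec2-base-960h", output_loading_info=True
+ )
+ print(loading_info["missing_keys"])  # expected: ['wav2vec2.masked_spec_embed']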
+
+ # sample package for computing WER; there may be a faster one for batches though
+ !pip install torchmetrics
+ from torchmetrics import WordErrorRate
+ wer = WordErrorRate()
+
+ # remove punctuation / special marks (GigaSpeech encodes these as <...> tokens),
+ # keeping only the 10 samples we actually transcribed above
+ trues = [
+     " ".join(
+         token
+         for token in transcript.split()
+         if not (token.startswith("<") and token.endswith(">"))
+     ).upper()
+     for transcript in gs['train']['text'][:10]
+ ]
+ print(trues)
+ predicteds = [t.upper() for t in outputs.text]
+ print(predicteds)
+ print(wer(predicteds, trues).item())
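+
+ """A per-row variant (a sketch), in case we want the individual WER of each sample -- see the question in the next cell:"""
+
+ # a fresh metric instance per pair avoids accumulating state across rows
+ for i, (p, t) in enumerate(zip(predicteds, trues)):
+     print(i, WordErrorRate()([p], [t]).item())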
+
+ """## This WER is aggregated over all the samples at once. Do we want the individual WER of each row instead?
+
+ # Split Datasets into categories
+ """
+
+ len(gs["train"])
+
+ """### I'd recommend not running the dataset split (the cell below) again and again. It takes too long!"""
+
+ # split the dataset by category
+ for i in range(len(categories)):
+     print(i, categories[i])
+
+ # per-category datasets
+ audiobooks = []
+ comedy = []
+ people_blogs = []
+ education = []
+ gaming = []
+
+ for i in range(len(gs["train"])):
+     if gs['train'][i]['category'] == 28:
+         audiobooks.append(gs['train'][i])
+     elif gs['train'][i]['category'] == 11:
+         comedy.append(gs['train'][i])
+     elif (gs['train'][i]['category'] == 0) or (gs['train'][i]['category'] == 18):
+         people_blogs.append(gs['train'][i])
+     elif gs['train'][i]['category'] == 21:
+         education.append(gs['train'][i])
+     elif gs['train'][i]['category'] == 24:
+         gaming.append(gs['train'][i])
+
+ # could have written code for the split, but I did it manually
+ # (a generic sketch follows the size printouts below)
+
+ # train dataset (70% of the length of each category)
+ audiobooks_train = audiobooks[:1648]
+ comedy_train = comedy[:89]
+ people_blogs_train = people_blogs[:244]
+ education_train = education[:978]
+ gaming_train = gaming[:713]
+
+ # test dataset (the remaining 30%)
+ audiobooks_test = audiobooks[1648:]
+ comedy_test = comedy[89:]
+ people_blogs_test = people_blogs[244:]
+ education_test = education[978:]
+ gaming_test = gaming[713:]
+
+ print(len(audiobooks_train), len(comedy_train), len(people_blogs_train), len(education_train), len(gaming_train))
+
+ print(len(audiobooks_test), len(comedy_test), len(people_blogs_test), len(education_test), len(gaming_test))
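+
+ """The manual 70/30 cut above, written as a generic helper (a sketch; the hard-coded indices stay above for reproducibility):"""
+
+ def split_70_30(rows):
+     # first 70% -> train, remaining 30% -> test
+     cut = int(len(rows) * 0.7)
+     return rows[:cut], rows[cut:]
+
+ # e.g. audiobooks_train, audiobooks_test = split_70_30(audiobooks)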
+
+ audiobooks_train[:2]
+
+ from datasets import Dataset, DatasetDict
+ import pandas as pd
+
+ audiobooks_train_df = pd.DataFrame(audiobooks_train)
+ audiobooks_test_df = pd.DataFrame(audiobooks_test)
+
+ audiobooks_train_data = Dataset.from_pandas(audiobooks_train_df)
+ audiobooks_test_data = Dataset.from_pandas(audiobooks_test_df)
+
+ # create a DatasetDict with train and test splits
+ DataDict_audiobooks = DatasetDict({
+     "train": audiobooks_train_data,
+     "test": audiobooks_test_data
+ })
+
+ DataDict_audiobooks
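+
+ """Per the earlier note about the split being slow to re-run: `save_to_disk` / `load_from_disk` can cache the assembled DatasetDict (a sketch; the path is arbitrary):"""
+
+ DataDict_audiobooks.save_to_disk("/content/datadict_audiobooks")
+ # in a fresh session:
+ # from datasets import load_from_disk
+ # DataDict_audiobooks = load_from_disk("/content/datadict_audiobooks")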
+
+ DataDict_audiobooks = DataDict_audiobooks.remove_columns(['segment_id', 'speaker', 'begin_time', 'end_time', 'audio_id', 'title', 'url', 'source', 'original_full_path'])
+
+ DataDict_audiobooks["train"][0].keys()
+
+ DataDict_audiobooks["train"][0]["audio"]
+
+ """# Text Preprocessing"""
+
+ import re
+ # strip whole <...> tags (e.g. <COMMA>) as well as stray punctuation characters
+ chars_to_ignore_regex = r'<[^>]*>|[\,\?\.\!\-\;\:\"]'
+
+ def remove_special_characters(batch):
+     batch["text"] = re.sub(chars_to_ignore_regex, '', batch["text"]).lower()
+     return batch
+
+ DataDict_audiobooks = DataDict_audiobooks.map(remove_special_characters)
+
+ import json
+
+ def extract_chars(batch):
+     all_text = " ".join(batch["text"])
+     vocab = list(set(all_text))
+     return {"vocab": [vocab], "all_text": [all_text]}
+
+ vocabs = DataDict_audiobooks.map(extract_chars, batched=True, batch_size=-1, keep_in_memory=True, remove_columns=DataDict_audiobooks.column_names["train"])
+
+ vocab_list = list(set(vocabs["train"]["vocab"][0]) | set(vocabs["test"]["vocab"][0]))
+ vocab_dict = {v: k for k, v in enumerate(vocab_list)}
+ vocab_dict["|"] = vocab_dict[" "]
+ del vocab_dict[" "]
+
+ vocab_dict["[UNK]"] = len(vocab_dict)  # add an "unknown" token
+ vocab_dict["[PAD]"] = len(vocab_dict)  # add a padding token that serves as CTC's "blank token"
+
+ with open('vocab.json', 'w') as vocab_file:
+     json.dump(vocab_dict, vocab_file)
+
+ vocab_dict
+
+ DataDict_audiobooks
+
+ import random
+ import numpy as np
+ # randint is inclusive on both ends, so subtract 1 to stay in range
+ rand_int = random.randint(0, len(DataDict_audiobooks["train"]) - 1)
+
+ print("Target text:", DataDict_audiobooks["train"][rand_int]["text"])
+ print("Input array shape:", np.asarray(DataDict_audiobooks["train"][rand_int]["audio"]["array"]).shape)
+ print("Sampling rate:", DataDict_audiobooks["train"][rand_int]["audio"]["sampling_rate"])
+
+ """# Time to fine-tune"""
+
+ !pip install transformers
+
+ from transformers import Wav2Vec2CTCTokenizer, Wav2Vec2FeatureExtractor, Wav2Vec2Processor
+
+ tokenizer = Wav2Vec2CTCTokenizer("./vocab.json", unk_token="[UNK]", pad_token="[PAD]", word_delimiter_token="|")
+ feature_extractor = Wav2Vec2FeatureExtractor(feature_size=1, sampling_rate=16000, padding_value=0.0, do_normalize=True, return_attention_mask=False)
+ processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)
+
+ DataDict_audiobooks
+
+ DataDict_audiobooks["train"][0]['audio']['sampling_rate']
+
+ print(type(DataDict_audiobooks["train"][0]["audio"]))
+
+ # flatten the audio column down to just the raw waveform array
+ def modify_sample(sample):
+     sample['audio'] = sample['audio']['array']
+     return sample
+
+ temp = DataDict_audiobooks.map(modify_sample)
+
+ temp['train'][0]['audio']
+
+ def prepare_dataset(batch):
+     # batch["audio"] is a list of raw waveform arrays (see modify_sample above)
+     batch["input_values"] = processor(batch["audio"], sampling_rate=16000).input_values
+
+     with processor.as_target_processor():
+         batch["labels"] = processor(batch["text"]).input_ids
+     return batch
+
+ processor.tokenizer.is_fast
+
+ DataDict_audiobooks_prepared = temp.map(prepare_dataset, batch_size=8, batched=True)
+
+ temp
+
+ # DataDict_audiobooks_prepared = DataDict_audiobooks.map(prepare_dataset, batch_size=8, num_proc=4, batched=True)
+
+ """# Training and Evaluation"""
+
+ import torch
+
+ from dataclasses import dataclass
+ from typing import Dict, List, Optional, Union
+
+ @dataclass
+ class DataCollatorCTCWithPadding:
+     processor: Wav2Vec2Processor
+     padding: Union[bool, str] = True
+     max_length: Optional[int] = None
+     max_length_labels: Optional[int] = None
+     pad_to_multiple_of: Optional[int] = None
+     pad_to_multiple_of_labels: Optional[int] = None
+
+     def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
+         # split inputs and labels since they have to be of different lengths and need
+         # different padding methods
+         input_features = [{"input_values": feature["input_values"]} for feature in features]
+         label_features = [{"input_ids": feature["labels"]} for feature in features]
+
+         batch = self.processor.pad(
+             input_features,
+             padding=self.padding,
+             max_length=self.max_length,
+             pad_to_multiple_of=self.pad_to_multiple_of,
+             return_tensors="pt",
+         )
+         with self.processor.as_target_processor():
+             labels_batch = self.processor.pad(
+                 label_features,
+                 padding=self.padding,
+                 max_length=self.max_length_labels,
+                 pad_to_multiple_of=self.pad_to_multiple_of_labels,
+                 return_tensors="pt",
+             )
+
+         # replace padding with -100 so these positions are ignored by the loss
+         labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)
+
+         batch["labels"] = labels
+
+         return batch
+
+ data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)
+
+ !pip install jiwer
+
+ from datasets import load_dataset, load_metric
+
+ wer_metric = load_metric("wer")
+
+ def compute_metrics(pred):
+     pred_logits = pred.predictions
+     pred_ids = np.argmax(pred_logits, axis=-1)
+
+     pred.label_ids[pred.label_ids == -100] = processor.tokenizer.pad_token_id
+
+     pred_str = processor.batch_decode(pred_ids)
+     # we do not want to group tokens when computing the metrics
+     label_str = processor.batch_decode(pred.label_ids, group_tokens=False)
+
+     wer = wer_metric.compute(predictions=pred_str, references=label_str)
+
+     return {"wer": wer}
+
+ from transformers import Wav2Vec2ForCTC
+
+ model = Wav2Vec2ForCTC.from_pretrained(
+     "facebook/wav2vec2-base",
+     gradient_checkpointing=True,
+     ctc_loss_reduction="mean",
+     pad_token_id=processor.tokenizer.pad_token_id,
+     vocab_size=len(processor.tokenizer),  # size the CTC head to match our custom vocab
+ )
+
+ model.freeze_feature_extractor()
+
+ from transformers import TrainingArguments
+
+ training_args = TrainingArguments(
+     output_dir="./wav2vec2-base-speechcolab-demo",
+     group_by_length=True,
+     per_device_train_batch_size=32,
+     evaluation_strategy="steps",
+     num_train_epochs=30,
+     fp16=True,
+     save_steps=500,
+     eval_steps=500,
+     logging_steps=100,
+     learning_rate=1e-4,
+     weight_decay=0.005,
+     warmup_steps=1000,
+     save_total_limit=2,
+ )
+
+ from transformers import Trainer
+
+ trainer = Trainer(
+     model=model,
+     data_collator=data_collator,
+     args=training_args,
+     compute_metrics=compute_metrics,
+     train_dataset=DataDict_audiobooks_prepared["train"],
+     eval_dataset=DataDict_audiobooks_prepared["test"],
+     tokenizer=processor.feature_extractor,
+ )
+
+ trainer.train()
+
+ """# Evaluate"""
+
+ processor = Wav2Vec2Processor.from_pretrained("/content/wav2vec2-base-speechcolab-demo")
+ model = Wav2Vec2ForCTC.from_pretrained("/content/wav2vec2-base-speechcolab-demo")
+
+ def map_to_result(batch):
+     model.to("cuda")
+     # "audio" holds the raw waveform array (flattened by modify_sample above)
+     input_values = processor(
+         batch["audio"],
+         sampling_rate=16000,
+         return_tensors="pt"
+     ).input_values.to("cuda")
+
+     with torch.no_grad():
+         logits = model(input_values).logits
+
+     pred_ids = torch.argmax(logits, dim=-1)
+     batch["pred_str"] = processor.batch_decode(pred_ids)[0]
+
+     return batch
+
+ results = DataDict_audiobooks_prepared["test"].map(map_to_result)
+
+ # Open issues:
+ # - Preprocess text?
+ # - Test dataset is empty
+ # - Something wrong in the input I'm sending vs. what the model expects; trying to fix that
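+
+ """Quick sanity checks for the open issues above (a sketch):"""
+
+ # is the test split actually empty?
+ print("test size:", len(DataDict_audiobooks_prepared["test"]))
+ # what does one prepared example look like (which keys does the model get)?
+ if len(DataDict_audiobooks_prepared["test"]) > 0:
+     print(DataDict_audiobooks_prepared["test"][0].keys())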