"""DeepSpeech2-style CTC speech-recognition pipeline for LibriSpeech.

Contains the text<->label codec, the spectrogram-augmentation transforms,
the model (residual-CNN front-end + bidirectional-GRU stack + classifier),
and the training hyperparameters.
"""
import os
from pathlib import Path

import torch
import torch.nn.functional as F
import torchaudio
from torch import nn


class TextTransform:
    """Bidirectional mapping between transcript text and integer labels.

    Label 0 is the apostrophe, 1 is the word space, 2-27 are a-z.
    Label 28 (not listed in the map) is reserved for the CTC blank,
    which is why ``n_class`` below is 29.
    """

    def __init__(self):
        # NOTE(review): the pasted source had lost every "<SPACE>" token
        # (probably stripped as an HTML tag), which broke construction
        # (a bare "1" cannot unpack into (ch, index)), made text_to_int
        # look up char_map[''], and turned int_to_text's replace into
        # ``.replace('', ' ')`` (inserts a space between every char).
        # Restored here.
        char_map_str = """
        ' 0
        <SPACE> 1
        a 2
        b 3
        c 4
        d 5
        e 6
        f 7
        g 8
        h 9
        i 10
        j 11
        k 12
        l 13
        m 14
        n 15
        o 16
        p 17
        q 18
        r 19
        s 20
        t 21
        u 22
        v 23
        w 24
        x 25
        y 26
        z 27
        """
        self.char_map = {}
        self.index_map = {}
        for line in char_map_str.strip().split('\n'):
            ch, index = line.split()
            self.char_map[ch] = int(index)
            self.index_map[int(index)] = ch
        # Decode label 1 straight to a real space character.
        self.index_map[1] = ' '

    def text_to_int(self, text):
        """Convert a transcript string into a list of integer labels."""
        int_sequence = []
        for c in text:
            if c == ' ':
                ch = self.char_map['<SPACE>']
            else:
                ch = self.char_map[c]
            int_sequence.append(ch)
        return int_sequence

    def int_to_text(self, labels):
        """Convert a sequence of integer labels back into a string."""
        string = []
        for i in labels:
            string.append(self.index_map[i])
        # index_map[1] already decodes to ' ', so this replace is a
        # safety net kept for parity with the reference implementation.
        return ''.join(string).replace('<SPACE>', ' ')


# Mel spectrogram + SpecAugment-style masking applied to training audio.
trainaudio_transforms = nn.Sequential(
    torchaudio.transforms.MelSpectrogram(sample_rate=16000, n_mels=128),
    torchaudio.transforms.FrequencyMasking(freq_mask_param=15),
    torchaudio.transforms.TimeMasking(time_mask_param=35),
)

text_transform = TextTransform()


class CNNLayerNorm(nn.Module):
    """LayerNorm over the feature axis of a (batch, channel, feature, time) tensor."""

    def __init__(self, n_feats):
        super(CNNLayerNorm, self).__init__()
        self.layer_norm = nn.LayerNorm(n_feats)

    def forward(self, x):
        # nn.LayerNorm normalizes the last dimension, so swap
        # (feature, time) -> (time, feature), normalize, swap back.
        x = x.transpose(2, 3).contiguous()
        x = self.layer_norm(x)
        return x.transpose(2, 3).contiguous()


class ResidualCNN(nn.Module):
    """Residual block: two convolutions, each preceded by norm/dropout/GELU.

    The residual addition requires stride == 1 and
    in_channels == out_channels (as instantiated below).
    """

    def __init__(self, in_channels, out_channels, kernel, stride, dropout, n_feats):
        super(ResidualCNN, self).__init__()
        self.cnn1 = nn.Conv2d(in_channels, out_channels, kernel, stride,
                              padding=kernel // 2)
        self.cnn2 = nn.Conv2d(out_channels, out_channels, kernel, stride,
                              padding=kernel // 2)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)
        self.layernorm1 = CNNLayerNorm(n_feats)
        self.layernorm2 = CNNLayerNorm(n_feats)

    def forward(self, x):
        residual = x
        x = self.layernorm1(x)
        x = self.dropout1(x)
        x = F.gelu(x)
        x = self.cnn1(x)
        x = self.layernorm2(x)
        x = self.dropout2(x)
        x = F.gelu(x)
        x = self.cnn2(x)
        x += residual
        return x


class BiDirectionalGRU(nn.Module):
    """Single-layer bidirectional GRU preceded by LayerNorm + GELU.

    Output feature size is 2 * hidden_size (forward + backward states).
    """

    def __init__(self, rnn_dim, hidden_size, dropout, batch_first):
        super(BiDirectionalGRU, self).__init__()
        self.BiGRU = nn.GRU(
            input_size=rnn_dim,
            hidden_size=hidden_size,
            num_layers=1,
            batch_first=batch_first,
            bidirectional=True,
        )
        self.layernorm = nn.LayerNorm(rnn_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        x = self.layernorm(x)
        x = F.gelu(x)
        x, _ = self.BiGRU(x)
        x = self.dropout(x)
        return x


class SpeechRecognitionModel(nn.Module):
    """Residual-CNN front-end + BiGRU stack + per-frame classifier.

    Assumes input of shape (batch, 1, n_feats, time) — a mel spectrogram
    with a singleton channel dim (TODO confirm against the data loader).
    Returns per-frame class logits with n_class in the last dimension.
    """

    def __init__(self, n_cnn_layers, n_rnn_layers, rnn_dim, n_class, n_feats,
                 stride=2, dropout=0.1):
        super(SpeechRecognitionModel, self).__init__()
        n_feats = n_feats // 2  # the stride-2 entry conv halves the feature axis
        self.cnn = nn.Conv2d(1, 32, 3, stride=stride, padding=3 // 2)
        self.rescnn_layers = nn.Sequential(*[
            ResidualCNN(32, 32, kernel=3, stride=1,
                        dropout=dropout, n_feats=n_feats)
            for _ in range(n_cnn_layers)
        ])
        self.fully_connected = nn.Linear(n_feats * 32, rnn_dim)
        # NOTE(review): only the first GRU is batch_first, yet it feeds the
        # later (seq-first) GRUs batch-first output, so they treat the
        # batch axis as time. Left as-is to preserve training behavior —
        # flagged for review.
        self.birnn_layers = nn.Sequential(*[
            BiDirectionalGRU(
                rnn_dim=rnn_dim if i == 0 else rnn_dim * 2,
                hidden_size=rnn_dim,
                dropout=dropout,
                batch_first=i == 0,
            )
            for i in range(n_rnn_layers)
        ])
        self.classifier = nn.Sequential(
            nn.Linear(rnn_dim * 2, rnn_dim),  # BiGRU output is 2*rnn_dim wide
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(rnn_dim, n_class),
        )

    def forward(self, x):
        x = self.cnn(x)
        x = self.rescnn_layers(x)
        sizes = x.size()
        # Fold (channel, feature) into one axis, then make time the
        # sequence dimension: (batch, time, channel * feature).
        x = x.view(sizes[0], sizes[1] * sizes[2], sizes[3])
        x = x.transpose(1, 2)
        x = self.fully_connected(x)
        x = self.birnn_layers(x)
        x = self.classifier(x)
        return x


# ---- training configuration ----
learning_rate = 5e-4
batch_size = 16
epochs = 5
libri_train_set = "train-clean-100"
libri_test_set = "test-clean"

hparams = {
    "n_cnn_layers": 3,
    "n_rnn_layers": 5,
    "rnn_dim": 512,
    "n_class": 29,  # 28 text labels + 1 CTC blank
    "n_feats": 128,
    "stride": 2,
    "dropout": 0.1,
    "learning_rate": learning_rate,
    "batch_size": batch_size,
    "epochs": epochs,
}