NeerAbhy committed on
Commit
8bf30cb
·
verified ·
1 Parent(s): ed28acd

Create Notebook.py

Browse files
Files changed (1) hide show
  1. Notebook.py +178 -0
Notebook.py ADDED
@@ -0,0 +1,178 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torchaudio
2
+
3
class TextTransform:
    """Two-way lookup between transcript characters and integer labels.

    The vocabulary is the apostrophe (0), the space character (1, written
    as ``<SPACE>`` in the lookup table) and the lowercase letters ``a``
    through ``z`` (2-27).
    """

    def __init__(self):
        """Parse the symbol table into ``char_map`` and ``index_map``."""
        # One "<symbol> <index>" pair per line. Leading/trailing whitespace
        # is irrelevant because each line is split on whitespace.
        char_map_str = """
        ' 0
        <SPACE> 1
        a 2
        b 3
        c 4
        d 5
        e 6
        f 7
        g 8
        h 9
        i 10
        j 11
        k 12
        l 13
        m 14
        n 15
        o 16
        p 17
        q 18
        r 19
        s 20
        t 21
        u 22
        v 23
        w 24
        x 25
        y 26
        z 27
        """

        pairs = [entry.split() for entry in char_map_str.strip().split('\n')]
        # symbol -> label
        self.char_map = {symbol: int(label) for symbol, label in pairs}
        # label -> symbol
        self.index_map = {int(label): symbol for symbol, label in pairs}
        # Decode label 1 directly to a literal space rather than "<SPACE>".
        self.index_map[1] = ' '

    def text_to_int(self, text):
        """Return the list of integer labels for each character of *text*.

        A space is looked up under the ``<SPACE>`` key; any character that
        is not in the table raises ``KeyError``.
        """
        return [self.char_map['<SPACE>' if c == ' ' else c] for c in text]

    def int_to_text(self, labels):
        """Return the string spelled by the iterable of integer *labels*.

        A label that is not in the table raises ``KeyError``.
        """
        decoded = ''.join(self.index_map[i] for i in labels)
        return decoded.replace('<SPACE>', ' ')
59
+
60
+
61
+ from torch import nn
62
# Audio pipeline applied in order: a 128-bin mel spectrogram computed for
# 16 kHz input, followed by frequency masking (freq_mask_param=15) and time
# masking (time_mask_param=35).
# NOTE(review): the name suggests this is the training-time augmentation
# pipeline - confirm no masking is applied at inference.
trainaudio_transforms = nn.Sequential(
    torchaudio.transforms.MelSpectrogram(sample_rate=16000, n_mels=128),
    torchaudio.transforms.FrequencyMasking(freq_mask_param=15),
    torchaudio.transforms.TimeMasking(time_mask_param=35),
)


# Shared character <-> integer label converter.
text_transform = TextTransform()
69
+
70
+ import torch.nn.functional as F
71
class CNNLayerNorm(nn.Module):
    """Layer normalisation applied along dimension 2 of the input.

    ``nn.LayerNorm`` normalises the last dimension, so the input's
    dimensions 2 and 3 are swapped before normalising and swapped back
    afterwards.
    """

    def __init__(self, n_feats):
        """Create a ``LayerNorm`` over *n_feats* elements."""
        super().__init__()
        self.layer_norm = nn.LayerNorm(n_feats)

    def forward(self, x):
        """Normalise *x* along dimension 2 and return a contiguous tensor."""
        # Move dimension 2 to the last position so LayerNorm acts on it.
        swapped = x.transpose(2, 3).contiguous()
        normalised = self.layer_norm(swapped)
        # Restore the original dimension order.
        return normalised.transpose(2, 3).contiguous()
80
+
81
+
82
class ResidualCNN(nn.Module):
    """Residual block of two (layer norm -> dropout -> GELU -> conv) stages.

    The block input is added to the output of the second convolution, so
    the convolutions must produce a tensor that is shape-compatible with
    the input.
    """

    def __init__(self, in_channels, out_channels, kernel, stride, dropout, n_feats):
        """Build the two convolution stages.

        *kernel* is used both as the kernel size and, halved, as the
        padding; *n_feats* is the size handed to each ``CNNLayerNorm``.
        """
        super().__init__()

        half_kernel = kernel // 2
        self.cnn1 = nn.Conv2d(in_channels, out_channels, kernel, stride, padding=half_kernel)
        self.cnn2 = nn.Conv2d(out_channels, out_channels, kernel, stride, padding=half_kernel)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)
        self.layernorm1 = CNNLayerNorm(n_feats)
        self.layernorm2 = CNNLayerNorm(n_feats)

    def forward(self, x):
        """Run both stages on *x* and add *x* back onto the result."""
        skip = x
        # Stage 1: normalise, drop out, activate, convolve.
        out = self.cnn1(F.gelu(self.dropout1(self.layernorm1(x))))
        # Stage 2: same sequence with the second set of layers.
        out = self.cnn2(F.gelu(self.dropout2(self.layernorm2(out))))
        # In-place add of the skip connection onto the conv output.
        out += skip
        return out
105
+
106
class BiDirectionalGRU(nn.Module):
    """Layer norm and GELU followed by a single-layer bidirectional GRU.

    Dropout is applied to the GRU's output sequence; the final hidden
    state is discarded.
    """

    def __init__(self, rnn_dim, hidden_size, dropout, batch_first):
        """Build the GRU, its input layer norm and the output dropout.

        *rnn_dim* is both the GRU input size and the normalised dimension;
        *batch_first* is forwarded to ``nn.GRU`` unchanged.
        """
        super().__init__()
        self.BiGRU = nn.GRU(
            input_size=rnn_dim,
            hidden_size=hidden_size,
            num_layers=1,
            batch_first=batch_first,
            bidirectional=True,
        )
        self.layernorm = nn.LayerNorm(rnn_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Return the dropped-out GRU output sequence for *x*."""
        activated = F.gelu(self.layernorm(x))
        sequence_out, _final_hidden = self.BiGRU(activated)
        return self.dropout(sequence_out)
121
+
122
+
123
class SpeechRecognitionModel(nn.Module):
    """Convolutional + bidirectional-GRU acoustic model.

    Pipeline: one strided stem convolution, ``n_cnn_layers`` residual CNN
    blocks, a linear projection to ``rnn_dim``, ``n_rnn_layers``
    bidirectional GRU blocks and a two-layer classifier producing
    ``n_class`` scores per time step.

    Args:
        n_cnn_layers: number of ``ResidualCNN`` blocks.
        n_rnn_layers: number of ``BiDirectionalGRU`` blocks.
        rnn_dim: GRU hidden size and width of the projection layer.
        n_class: size of the classifier output.
        n_feats: size of the input's feature axis (dimension 2).
        stride: stride of the stem convolution (int, or a pair whose first
            element applies to the feature axis).
        dropout: dropout probability used throughout the model.
    """

    def __init__(self, n_cnn_layers, n_rnn_layers, rnn_dim, n_class, n_feats, stride=2, dropout=0.1):
        super(SpeechRecognitionModel, self).__init__()
        # Size of the feature axis after the stem convolution (kernel 3,
        # padding 1): floor((n_feats + 2*1 - 3) / stride) + 1. For the
        # default stride of 2 and an even n_feats this equals n_feats // 2,
        # which the code previously hard-coded; other strides or an odd
        # n_feats then produced mismatched LayerNorm/Linear sizes.
        feat_stride = stride[0] if isinstance(stride, (tuple, list)) else stride
        n_feats = (n_feats - 1) // feat_stride + 1
        self.cnn = nn.Conv2d(1, 32, 3, stride=stride, padding=3 // 2)

        self.rescnn_layers = nn.Sequential(*[
            ResidualCNN(32, 32, kernel=3, stride=1, dropout=dropout, n_feats=n_feats)
            for _ in range(n_cnn_layers)
        ])
        # Channels and features are flattened together before projection.
        self.fully_connected = nn.Linear(n_feats * 32, rnn_dim)
        # The first GRU block sees rnn_dim inputs; later blocks see the
        # concatenated forward/backward outputs (2 * rnn_dim).
        # NOTE(review): only the first block is built with batch_first=True
        # although every block appears to receive the same tensor layout -
        # left unchanged so existing behaviour and checkpoints are
        # preserved; confirm this is intended.
        self.birnn_layers = nn.Sequential(*[
            BiDirectionalGRU(rnn_dim=rnn_dim if i == 0 else rnn_dim * 2,
                             hidden_size=rnn_dim, dropout=dropout, batch_first=i == 0)
            for i in range(n_rnn_layers)
        ])
        self.classifier = nn.Sequential(
            nn.Linear(rnn_dim * 2, rnn_dim),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(rnn_dim, n_class))

    def forward(self, x):
        """Return per-time-step class scores for a batch of spectrograms.

        Assumes *x* is laid out as (batch, 1, n_feats, time) - the stem
        convolution takes one input channel and dimension 2 is the axis
        flattened into the projection layer; TODO confirm against callers.
        """
        x = self.cnn(x)
        x = self.rescnn_layers(x)
        sizes = x.size()
        # Merge the channel and feature axes: (batch, channels*feats, time).
        x = x.view(sizes[0], sizes[1] * sizes[2], sizes[3])
        # Put time before features: (batch, time, channels*feats).
        x = x.transpose(1, 2)
        x = self.fully_connected(x)
        x = self.birnn_layers(x)
        x = self.classifier(x)
        return x
155
+
156
+ import torch
157
+ import os
158
+ from pathlib import Path
159
# Optimisation settings, also mirrored into ``hparams`` below.
learning_rate = 5e-4
batch_size = 16
epochs = 5

# Dataset split identifiers.
# NOTE(review): presumably LibriSpeech subset names - confirm against the
# data-loading code.
libri_train_set = "train-clean-100"
libri_test_set = "test-clean"

# Model and training hyper-parameters collected in one mapping.
# NOTE(review): n_class=29 presumably covers the 28 text symbols plus one
# extra (blank) label - confirm against the loss/decoder.
hparams = dict(
    n_cnn_layers=3,
    n_rnn_layers=5,
    rnn_dim=512,
    n_class=29,
    n_feats=128,
    stride=2,
    dropout=0.1,
    learning_rate=learning_rate,
    batch_size=batch_size,
    epochs=epochs,
)
178
+