NeerAbhy committed on
Commit
5153766
·
verified ·
1 Parent(s): 9e3d73f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +157 -0
app.py CHANGED
@@ -1,4 +1,161 @@
1
  import gradio
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
 
3
 
4
  learning_rate=5e-4
 
1
  import gradio
2
+ import torchaudio
3
+ import torch
4
+ import os
5
+ from pathlib import Path
6
+
7
class TextTransform:
    """Two-way lookup between characters and integer labels.

    The vocabulary table below lists the apostrophe, a ``<SPACE>`` token and
    the lowercase letters ``a``-``z`` with labels 0 through 27.
    """

    def __init__(self):
        """Parse the vocabulary table into ``char_map`` and ``index_map``."""
        char_map_str = """
        ' 0
        <SPACE> 1
        a 2
        b 3
        c 4
        d 5
        e 6
        f 7
        g 8
        h 9
        i 10
        j 11
        k 12
        l 13
        m 14
        n 15
        o 16
        p 17
        q 18
        r 19
        s 20
        t 21
        u 22
        v 23
        w 24
        x 25
        y 26
        z 27
        """

        self.char_map = {}
        self.index_map = {}
        for entry in char_map_str.strip().split('\n'):
            symbol, raw_index = entry.split()
            label = int(raw_index)
            self.char_map[symbol] = label
            self.index_map[label] = symbol
        # Label 1 decodes to a literal space instead of the "<SPACE>" token.
        self.index_map[1] = ' '

    def text_to_int(self, text):
        """Return the list of integer labels for each character of *text*.

        A space is looked up through the ``<SPACE>`` entry; any character
        missing from the table raises ``KeyError``.
        """
        return [self.char_map['<SPACE>' if c == ' ' else c] for c in text]

    def int_to_text(self, labels):
        """Return the string spelled by the integer *labels*.

        Any ``<SPACE>`` token left in the joined text is replaced by a space.
        """
        decoded = ''.join(self.index_map[i] for i in labels)
        return decoded.replace('<SPACE>', ' ')
63
+
64
+
65
+ from torch import nn
66
# Audio feature pipeline: a 128-bin mel spectrogram at a 16 kHz sample rate,
# followed by frequency masking (param 15) and time masking (param 35).
trainaudio_transforms = nn.Sequential(
    torchaudio.transforms.MelSpectrogram(sample_rate=16000, n_mels=128),
    torchaudio.transforms.FrequencyMasking(freq_mask_param=15),
    torchaudio.transforms.TimeMasking(time_mask_param=35),
)

# Module-level character <-> label codec instance.
text_transform = TextTransform()
73
+
74
+ import torch.nn.functional as F
75
class CNNLayerNorm(nn.Module):
    """Layer normalization over dimension 2 of the input.

    ``nn.LayerNorm`` is applied after swapping dimensions 2 and 3, and the
    swap is undone afterwards, so the output keeps the input's layout.
    Presumably the input is (batch, channel, feature, time) -- TODO confirm.
    """

    def __init__(self, n_feats):
        """Create the wrapped ``nn.LayerNorm`` of size *n_feats*."""
        super().__init__()
        self.layer_norm = nn.LayerNorm(n_feats)

    def forward(self, x):
        """Normalize *x* along dimension 2 and return a contiguous tensor."""
        swapped = x.transpose(2, 3).contiguous()
        normed = self.layer_norm(swapped)
        return normed.transpose(2, 3).contiguous()
84
+
85
+
86
class ResidualCNN(nn.Module):
    """Two-convolution residual block.

    Each stage runs layer norm, dropout, GELU and then a convolution; the
    block input is added back onto the result of the second stage.
    """

    def __init__(self, in_channels, out_channels, kernel, stride, dropout, n_feats):
        """Build both stages.

        Args:
            in_channels: input channels of the first convolution.
            out_channels: output channels of both convolutions.
            kernel: convolution kernel size; padding is ``kernel // 2``.
            stride: stride passed to both convolutions.
            dropout: probability for both dropout layers.
            n_feats: size given to both ``CNNLayerNorm`` layers.
        """
        super().__init__()

        pad = kernel // 2
        self.cnn1 = nn.Conv2d(in_channels, out_channels, kernel, stride, padding=pad)
        self.cnn2 = nn.Conv2d(out_channels, out_channels, kernel, stride, padding=pad)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)
        self.layernorm1 = CNNLayerNorm(n_feats)
        self.layernorm2 = CNNLayerNorm(n_feats)

    def forward(self, x):
        """Run both stages on *x* and add the skip connection."""
        residual = x
        stages = (
            (self.layernorm1, self.dropout1, self.cnn1),
            (self.layernorm2, self.dropout2, self.cnn2),
        )
        for norm, drop, conv in stages:
            x = conv(F.gelu(drop(norm(x))))
        # The in-place add requires the conv output to match the input shape.
        x += residual
        return x
109
+
110
class BiDirectionalGRU(nn.Module):
    """Layer norm and GELU, then a one-layer bidirectional GRU with dropout."""

    def __init__(self, rnn_dim, hidden_size, dropout, batch_first):
        """Build the layer.

        Args:
            rnn_dim: input feature size for the layer norm and the GRU.
            hidden_size: GRU hidden size per direction.
            dropout: probability for the dropout applied to the GRU output.
            batch_first: forwarded to ``nn.GRU``.
        """
        super().__init__()
        self.BiGRU = nn.GRU(
            input_size=rnn_dim,
            hidden_size=hidden_size,
            num_layers=1,
            batch_first=batch_first,
            bidirectional=True,
        )
        self.layernorm = nn.LayerNorm(rnn_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Return the GRU output sequence for *x*; the final hidden state is dropped."""
        activated = F.gelu(self.layernorm(x))
        outputs, _hidden = self.BiGRU(activated)
        return self.dropout(outputs)
125
+
126
+
127
class SpeechRecognitionModel(nn.Module):
    """Conv stem, residual CNN stack, bidirectional GRU stack and classifier."""

    def __init__(self, n_cnn_layers, n_rnn_layers, rnn_dim, n_class, n_feats, stride=2, dropout=0.1):
        """Build the network.

        Args:
            n_cnn_layers: number of ``ResidualCNN`` blocks.
            n_rnn_layers: number of ``BiDirectionalGRU`` layers.
            rnn_dim: GRU hidden size per direction and width of the linear layers.
            n_class: size of the classifier output.
            n_feats: input feature count; it is halved before use.
            stride: stride of the stem convolution.
            dropout: dropout probability used throughout.
        """
        super().__init__()
        # The halving is unconditional; presumably it mirrors the default
        # stride of 2 in the stem convolution -- TODO confirm for other strides.
        n_feats = n_feats // 2
        self.cnn = nn.Conv2d(1, 32, 3, stride=stride, padding=3 // 2)

        residual_blocks = []
        for _ in range(n_cnn_layers):
            residual_blocks.append(
                ResidualCNN(32, 32, kernel=3, stride=1, dropout=dropout, n_feats=n_feats)
            )
        self.rescnn_layers = nn.Sequential(*residual_blocks)

        self.fully_connected = nn.Linear(n_feats * 32, rnn_dim)

        # NOTE(review): only the first GRU layer gets batch_first=True; later
        # layers get batch_first=False. Kept as-is -- confirm this is intended
        # before changing it, since it affects how saved weights behave.
        recurrent_layers = []
        for i in range(n_rnn_layers):
            is_first = i == 0
            recurrent_layers.append(
                BiDirectionalGRU(
                    rnn_dim=rnn_dim if is_first else rnn_dim * 2,
                    hidden_size=rnn_dim,
                    dropout=dropout,
                    batch_first=is_first,
                )
            )
        self.birnn_layers = nn.Sequential(*recurrent_layers)

        self.classifier = nn.Sequential(
            nn.Linear(rnn_dim * 2, rnn_dim),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(rnn_dim, n_class),
        )

    def forward(self, x):
        """Return the classifier output for the 4-D input *x*."""
        x = self.rescnn_layers(self.cnn(x))
        # Merge dimensions 1 and 2, then move the last dimension to position 1.
        dim0, dim1, dim2, dim3 = x.size()
        x = x.view(dim0, dim1 * dim2, dim3).transpose(1, 2)
        x = self.fully_connected(x)
        x = self.birnn_layers(x)
        return self.classifier(x)
159
 
160
 
161
  learning_rate=5e-4