Update app.py
app.py CHANGED
@@ -1,4 +1,161 @@
 import gradio
+import torchaudio
+import torch
+import os
+from pathlib import Path
+
+class TextTransform:
+    def __init__(self):
+        char_map_str = """
+        ' 0
+        <SPACE> 1
+        a 2
+        b 3
+        c 4
+        d 5
+        e 6
+        f 7
+        g 8
+        h 9
+        i 10
+        j 11
+        k 12
+        l 13
+        m 14
+        n 15
+        o 16
+        p 17
+        q 18
+        r 19
+        s 20
+        t 21
+        u 22
+        v 23
+        w 24
+        x 25
+        y 26
+        z 27
+        """
+
+        self.char_map = {}
+        self.index_map = {}
+        for line in char_map_str.strip().split('\n'):
+            ch, index = line.split()
+            self.char_map[ch] = int(index)
+            self.index_map[int(index)] = ch
+        self.index_map[1] = ' '
+
+    def text_to_int(self, text):
+        int_sequence = []
+        for c in text:
+            if c == ' ':
+                ch = self.char_map['<SPACE>']
+            else:
+                ch = self.char_map[c]
+            int_sequence.append(ch)
+        return int_sequence
+
+    def int_to_text(self, labels):
+        string = []
+        for i in labels:
+            string.append(self.index_map[i])
+        return ''.join(string).replace('<SPACE>', ' ')
+
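For reference, TextTransform round-trips between characters and label indices; a minimal check (hypothetical usage, not part of this commit):

    tt = TextTransform()
    ids = tt.text_to_int("hello world")   # [9, 6, 13, 13, 16, 1, 24, 16, 19, 13, 5]
    assert tt.int_to_text(ids) == "hello world"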
+
+from torch import nn
+trainaudio_transforms = nn.Sequential(
+    torchaudio.transforms.MelSpectrogram(sample_rate=16000, n_mels=128),
+    torchaudio.transforms.FrequencyMasking(freq_mask_param=15),
+    torchaudio.transforms.TimeMasking(time_mask_param=35))
+
+
+text_transform = TextTransform()
+
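The pipeline above is a mel spectrogram followed by random frequency and time masking, i.e. SpecAugment-style training augmentation. A sketch of applying it, assuming a 16 kHz mono file sample.wav (hypothetical filename):

    waveform, sr = torchaudio.load("sample.wav")   # expects 16 kHz audio to match sample_rate above
    spec = trainaudio_transforms(waveform)         # shape (channel, n_mels=128, time)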
+import torch.nn.functional as F
+class CNNLayerNorm(nn.Module):
+    def __init__(self, n_feats):
+        super(CNNLayerNorm, self).__init__()
+        self.layer_norm = nn.LayerNorm(n_feats)
+
+    def forward(self, x):
+        x = x.transpose(2, 3).contiguous()
+        x = self.layer_norm(x)
+        return x.transpose(2, 3).contiguous()
+
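CNNLayerNorm exists because nn.LayerNorm normalizes over the trailing dimension(s): the transposes move the feature axis of a (batch, channel, feature, time) tensor into last place and back, so the shape is preserved. For example (hypothetical sizes):

    ln = CNNLayerNorm(n_feats=64)
    x = torch.randn(8, 32, 64, 100)   # (batch, channel, feature, time)
    assert ln(x).shape == x.shape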
+
+class ResidualCNN(nn.Module):
+    def __init__(self, in_channels, out_channels, kernel, stride, dropout, n_feats):
+        super(ResidualCNN, self).__init__()
+
+        self.cnn1 = nn.Conv2d(in_channels, out_channels, kernel, stride, padding=kernel//2)
+        self.cnn2 = nn.Conv2d(out_channels, out_channels, kernel, stride, padding=kernel//2)
+        self.dropout1 = nn.Dropout(dropout)
+        self.dropout2 = nn.Dropout(dropout)
+        self.layernorm1 = CNNLayerNorm(n_feats)
+        self.layernorm2 = CNNLayerNorm(n_feats)
+
+    def forward(self, x):
+        residual = x
+        x = self.layernorm1(x)
+        x = self.dropout1(x)
+        x = F.gelu(x)
+        x = self.cnn1(x)
+        x = self.layernorm2(x)
+        x = self.dropout2(x)
+        x = F.gelu(x)
+        x = self.cnn2(x)
+        x += residual
+        return x
+
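ResidualCNN is a pre-activation residual block (norm, dropout, GELU, conv, twice) with an identity skip. The skip only type-checks when the convolutions preserve shape, which holds for the stride=1, kernel=3, padding=kernel//2 configuration the model uses below; e.g. (hypothetical sizes):

    block = ResidualCNN(32, 32, kernel=3, stride=1, dropout=0.1, n_feats=64)
    assert block(torch.randn(8, 32, 64, 100)).shape == (8, 32, 64, 100)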
+class BiDirectionalGRU(nn.Module):
+    def __init__(self, rnn_dim, hidden_size, dropout, batch_first):
+        super(BiDirectionalGRU, self).__init__()
+        self.BiGRU = nn.GRU(
+            input_size=rnn_dim, hidden_size=hidden_size,
+            num_layers=1, batch_first=batch_first, bidirectional=True)
+        self.layernorm = nn.LayerNorm(rnn_dim)
+        self.dropout = nn.Dropout(dropout)
+
+    def forward(self, x):
+        x = self.layernorm(x)
+        x = F.gelu(x)
+        x, _ = self.BiGRU(x)
+        x = self.dropout(x)
+        return x
+
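With bidirectional=True the GRU concatenates forward and backward states, so the output feature size is 2*hidden_size; that is why every layer after the first is built with rnn_dim*2 inputs. E.g. (hypothetical sizes):

    gru = BiDirectionalGRU(rnn_dim=512, hidden_size=512, dropout=0.1, batch_first=True)
    out = gru(torch.randn(8, 100, 512))   # (batch, time, features)
    assert out.shape == (8, 100, 1024)    # features doubled by the bidirectional GRU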
+
+class SpeechRecognitionModel(nn.Module):
+    def __init__(self, n_cnn_layers, n_rnn_layers, rnn_dim, n_class, n_feats, stride=2, dropout=0.1):
+        super(SpeechRecognitionModel, self).__init__()
+        n_feats = n_feats//2
+        self.cnn = nn.Conv2d(1, 32, 3, stride=stride, padding=3//2)
+
+        self.rescnn_layers = nn.Sequential(*[
+            ResidualCNN(32, 32, kernel=3, stride=1, dropout=dropout, n_feats=n_feats)
+            for _ in range(n_cnn_layers)
+        ])
+        self.fully_connected = nn.Linear(n_feats*32, rnn_dim)
+        self.birnn_layers = nn.Sequential(*[
+            BiDirectionalGRU(rnn_dim=rnn_dim if i==0 else rnn_dim*2,
+                             hidden_size=rnn_dim, dropout=dropout, batch_first=i==0)
+            for i in range(n_rnn_layers)
+        ])
+        self.classifier = nn.Sequential(
+            nn.Linear(rnn_dim*2, rnn_dim),
+            nn.GELU(),
+            nn.Dropout(dropout),
+            nn.Linear(rnn_dim, n_class))
+
+    def forward(self, x):
+        x = self.cnn(x)
+        x = self.rescnn_layers(x)
+        sizes = x.size()
+        x = x.view(sizes[0], sizes[1]*sizes[2], sizes[3])
+        x = x.transpose(1, 2)
+        x = self.fully_connected(x)
+        x = self.birnn_layers(x)
+        x = self.classifier(x)
+        return x
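End to end, the model maps a (batch, 1, n_mels, time) spectrogram to per-frame class logits, halving both the feature and time axes in the stride-2 entry convolution. A forward-pass sketch with plausible hyperparameters (hypothetical values; the diff does not fix them):

    model = SpeechRecognitionModel(n_cnn_layers=3, n_rnn_layers=5, rnn_dim=512,
                                   n_class=29, n_feats=128)   # 28 chars + 1 CTC blank, assumed
    logits = model(torch.randn(8, 1, 128, 100))
    print(logits.shape)   # torch.Size([8, 50, 29])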
 
 
 learning_rate=5e-4
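learning_rate is declared but not consumed anywhere in this diff; for a CTC-trained model of this shape, a typical pairing (assumed here, not shown in the commit) would be something like:

    criterion = nn.CTCLoss(blank=28)   # assuming label 28 is reserved for the CTC blank
    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)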