Update app.py
app.py CHANGED
@@ -1,4 +1,161 @@
 import gradio
+import torchaudio
+import torch
+import os
+from pathlib import Path
+
+class TextTransform:
+    def __init__(self):
+        char_map_str = """
+        ' 0
+        <SPACE> 1
+        a 2
+        b 3
+        c 4
+        d 5
+        e 6
+        f 7
+        g 8
+        h 9
+        i 10
+        j 11
+        k 12
+        l 13
+        m 14
+        n 15
+        o 16
+        p 17
+        q 18
+        r 19
+        s 20
+        t 21
+        u 22
+        v 23
+        w 24
+        x 25
+        y 26
+        z 27
+        """
+
+        self.char_map = {}
+        self.index_map = {}
+        for line in char_map_str.strip().split('\n'):
+            ch, index = line.split()
+            self.char_map[ch] = int(index)
+            self.index_map[int(index)] = ch
+        self.index_map[1] = ' '
+
+    def text_to_int(self, text):
+        int_sequence = []
+        for c in text:
+            if c == ' ':
+                ch = self.char_map['<SPACE>']
+            else:
+                ch = self.char_map[c]
+            int_sequence.append(ch)
+        return int_sequence
+
+    def int_to_text(self, labels):
+        string = []
+        for i in labels:
+            string.append(self.index_map[i])
+        return ''.join(string).replace('<SPACE>', ' ')
+
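For reference, TextTransform round-trips between characters and label indices; a minimal check (hypothetical usage, not part of this commit):

    tt = TextTransform()
    ids = tt.text_to_int("hello world")   # [9, 6, 13, 13, 16, 1, 24, 16, 19, 13, 5]
    assert tt.int_to_text(ids) == "hello world"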
+
+from torch import nn
+trainaudio_transforms = nn.Sequential(
+    torchaudio.transforms.MelSpectrogram(sample_rate=16000, n_mels=128),
+    torchaudio.transforms.FrequencyMasking(freq_mask_param=15),
+    torchaudio.transforms.TimeMasking(time_mask_param=35))
+
+
+text_transform = TextTransform()
+
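The pipeline above is a mel spectrogram followed by random frequency and time masking, i.e. SpecAugment-style training augmentation. A sketch of applying it, assuming a 16 kHz mono file sample.wav (hypothetical filename):

    waveform, sr = torchaudio.load("sample.wav")   # expects 16 kHz audio to match sample_rate above
    spec = trainaudio_transforms(waveform)         # shape (channel, n_mels=128, time)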
+import torch.nn.functional as F
+class CNNLayerNorm(nn.Module):
+    def __init__(self, n_feats):
+        super(CNNLayerNorm, self).__init__()
+        self.layer_norm = nn.LayerNorm(n_feats)
+
+    def forward(self, x):
+        x = x.transpose(2, 3).contiguous()
+        x = self.layer_norm(x)
+        return x.transpose(2, 3).contiguous()
+
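CNNLayerNorm exists because nn.LayerNorm normalizes over the trailing dimension(s): the transposes move the feature axis of a (batch, channel, feature, time) tensor into last place and back, so the shape is preserved. For example (hypothetical sizes):

    ln = CNNLayerNorm(n_feats=64)
    x = torch.randn(8, 32, 64, 100)   # (batch, channel, feature, time)
    assert ln(x).shape == x.shape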
+
+class ResidualCNN(nn.Module):
+    def __init__(self, in_channels, out_channels, kernel, stride, dropout, n_feats):
+        super(ResidualCNN, self).__init__()
+
+        self.cnn1 = nn.Conv2d(in_channels, out_channels, kernel, stride, padding=kernel//2)
+        self.cnn2 = nn.Conv2d(out_channels, out_channels, kernel, stride, padding=kernel//2)
+        self.dropout1 = nn.Dropout(dropout)
+        self.dropout2 = nn.Dropout(dropout)
+        self.layernorm1 = CNNLayerNorm(n_feats)
+        self.layernorm2 = CNNLayerNorm(n_feats)
+
+    def forward(self, x):
+        residual = x
+        x = self.layernorm1(x)
+        x = self.dropout1(x)
+        x = F.gelu(x)
+        x = self.cnn1(x)
+        x = self.layernorm2(x)
+        x = self.dropout2(x)
+        x = F.gelu(x)
+        x = self.cnn2(x)
+        x += residual
+        return x
+
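ResidualCNN is a pre-activation residual block (norm, dropout, GELU, conv, twice) with an identity skip. The skip only type-checks when the convolutions preserve shape, which holds for the stride=1, kernel=3, padding=kernel//2 configuration the model uses below; e.g. (hypothetical sizes):

    block = ResidualCNN(32, 32, kernel=3, stride=1, dropout=0.1, n_feats=64)
    assert block(torch.randn(8, 32, 64, 100)).shape == (8, 32, 64, 100)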
+class BiDirectionalGRU(nn.Module):
+    def __init__(self, rnn_dim, hidden_size, dropout, batch_first):
+        super(BiDirectionalGRU, self).__init__()
+        self.BiGRU = nn.GRU(
+            input_size=rnn_dim, hidden_size=hidden_size,
+            num_layers=1, batch_first=batch_first, bidirectional=True)
+        self.layernorm = nn.LayerNorm(rnn_dim)
+        self.dropout = nn.Dropout(dropout)
+
+    def forward(self, x):
+        x = self.layernorm(x)
+        x = F.gelu(x)
+        x, _ = self.BiGRU(x)
+        x = self.dropout(x)
+        return x
+
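With bidirectional=True the GRU concatenates forward and backward states, so the output feature size is 2*hidden_size; that is why every layer after the first is built with rnn_dim*2 inputs. E.g. (hypothetical sizes):

    gru = BiDirectionalGRU(rnn_dim=512, hidden_size=512, dropout=0.1, batch_first=True)
    out = gru(torch.randn(8, 100, 512))   # (batch, time, features)
    assert out.shape == (8, 100, 1024)    # features doubled by the bidirectional GRU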
+
+class SpeechRecognitionModel(nn.Module):
+    def __init__(self, n_cnn_layers, n_rnn_layers, rnn_dim, n_class, n_feats, stride=2, dropout=0.1):
+        super(SpeechRecognitionModel, self).__init__()
+        n_feats = n_feats//2
+        self.cnn = nn.Conv2d(1, 32, 3, stride=stride, padding=3//2)
+
+        self.rescnn_layers = nn.Sequential(*[
+            ResidualCNN(32, 32, kernel=3, stride=1, dropout=dropout, n_feats=n_feats)
+            for _ in range(n_cnn_layers)
+        ])
+        self.fully_connected = nn.Linear(n_feats*32, rnn_dim)
+        self.birnn_layers = nn.Sequential(*[
+            BiDirectionalGRU(rnn_dim=rnn_dim if i==0 else rnn_dim*2,
+                             hidden_size=rnn_dim, dropout=dropout, batch_first=i==0)
+            for i in range(n_rnn_layers)
+        ])
+        self.classifier = nn.Sequential(
+            nn.Linear(rnn_dim*2, rnn_dim),
+            nn.GELU(),
+            nn.Dropout(dropout),
+            nn.Linear(rnn_dim, n_class))
+
+    def forward(self, x):
+        x = self.cnn(x)
+        x = self.rescnn_layers(x)
+        sizes = x.size()
+        x = x.view(sizes[0], sizes[1]*sizes[2], sizes[3])
+        x = x.transpose(1, 2)
+        x = self.fully_connected(x)
+        x = self.birnn_layers(x)
+        x = self.classifier(x)
+        return x
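End to end, the model maps a (batch, 1, n_mels, time) spectrogram to per-frame class logits, halving both the feature and time axes in the stride-2 entry convolution. A forward-pass sketch with plausible hyperparameters (hypothetical values; the diff does not fix them):

    model = SpeechRecognitionModel(n_cnn_layers=3, n_rnn_layers=5, rnn_dim=512,
                                   n_class=29, n_feats=128)   # 28 chars + 1 CTC blank, assumed
    logits = model(torch.randn(8, 1, 128, 100))
    print(logits.shape)   # torch.Size([8, 50, 29])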
 
 
 learning_rate=5e-4
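learning_rate is declared but not consumed anywhere in this diff; for a CTC-trained model of this shape, a typical pairing (assumed here, not shown in the commit) would be something like:

    criterion = nn.CTCLoss(blank=28)   # assuming label 28 is reserved for the CTC blank
    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)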