# Speech / app.py
import spaces
from huggingface_hub import from_pretrained_fastai
import gradio as gr
import torch
import torchaudio
import torchaudio.functional as Ft
import torchaudio.transforms as T
import os
from pathlib import Path
repo_id = "NeerAbhy/Speech_recognotion_model/Speech_recognition_model_2.pth"
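# Character/label mapping used for CTC training and decoding:
# indices 0-27 cover the apostrophe, <SPACE> and the letters a-z;
# index 28 is reserved for the CTC blank (hence n_class = 29 below).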
class TextTransform:
    def __init__(self):
        char_map_str = """
        ' 0
        <SPACE> 1
        a 2
        b 3
        c 4
        d 5
        e 6
        f 7
        g 8
        h 9
        i 10
        j 11
        k 12
        l 13
        m 14
        n 15
        o 16
        p 17
        q 18
        r 19
        s 20
        t 21
        u 22
        v 23
        w 24
        x 25
        y 26
        z 27
        """
        self.char_map = {}
        self.index_map = {}
        for line in char_map_str.strip().split('\n'):
            ch, index = line.split()
            self.char_map[ch] = int(index)
            self.index_map[int(index)] = ch
        self.index_map[1] = ' '

    def text_to_int(self, text):
        int_sequence = []
        for c in text:
            if c == ' ':
                ch = self.char_map['<SPACE>']
            else:
                ch = self.char_map[c]
            int_sequence.append(ch)
        return int_sequence

    def int_to_text(self, labels):
        string = []
        for i in labels:
            string.append(self.index_map[i])
        return ''.join(string).replace('<SPACE>', ' ')
from torch import nn
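# SpecAugment-style training transforms (mel spectrogram + frequency/time masking).
# Kept here for reference; inference below uses a plain MelSpectrogram only.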
trainaudio_transforms = nn.Sequential(
    torchaudio.transforms.MelSpectrogram(sample_rate=16000, n_mels=128),
    torchaudio.transforms.FrequencyMasking(freq_mask_param=15),
    torchaudio.transforms.TimeMasking(time_mask_param=35))
text_transform = TextTransform()
import torch.nn.functional as F
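# LayerNorm over the feature axis of a (batch, channel, feature, time) tensor,
# applied by swapping the feature and time dimensions around nn.LayerNorm.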
class CNNLayerNorm(nn.Module):
    def __init__(self, n_feats):
        super(CNNLayerNorm, self).__init__()
        self.layer_norm = nn.LayerNorm(n_feats)

    def forward(self, x):
        x = x.transpose(2, 3).contiguous()
        x = self.layer_norm(x)
        return x.transpose(2, 3).contiguous()
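# Residual block: two (LayerNorm -> Dropout -> GELU -> Conv2d) stages with a skip connection.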
class ResidualCNN(nn.Module):
    def __init__(self, in_channels, out_channels, kernel, stride, dropout, n_feats):
        super(ResidualCNN, self).__init__()
        self.cnn1 = nn.Conv2d(in_channels, out_channels, kernel, stride, padding=kernel // 2)
        self.cnn2 = nn.Conv2d(out_channels, out_channels, kernel, stride, padding=kernel // 2)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)
        self.layernorm1 = CNNLayerNorm(n_feats)
        self.layernorm2 = CNNLayerNorm(n_feats)

    def forward(self, x):
        residual = x
        x = self.layernorm1(x)
        x = self.dropout1(x)
        x = F.gelu(x)
        x = self.cnn1(x)
        x = self.layernorm2(x)
        x = self.dropout2(x)
        x = F.gelu(x)
        x = self.cnn2(x)
        x += residual
        return x
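# Single bidirectional GRU layer preceded by LayerNorm + GELU and followed by dropout.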
class BiDirectionalGRU(nn.Module):
    def __init__(self, rnn_dim, hidden_size, dropout, batch_first):
        super(BiDirectionalGRU, self).__init__()
        self.BiGRU = nn.GRU(
            input_size=rnn_dim, hidden_size=hidden_size,
            num_layers=1, batch_first=batch_first, bidirectional=True)
        self.layernorm = nn.LayerNorm(rnn_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        x = self.layernorm(x)
        x = F.gelu(x)
        x, _ = self.BiGRU(x)
        x = self.dropout(x)
        return x
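# Acoustic model: an initial Conv2d downsamples the mel spectrogram (stride 2 halves n_feats),
# a stack of ResidualCNN blocks extracts features, a linear layer projects to rnn_dim,
# stacked bidirectional GRUs model the sequence, and the classifier emits per-frame
# scores over the 29 output classes.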
class SpeechRecognitionModel(nn.Module):
    def __init__(self, n_cnn_layers, n_rnn_layers, rnn_dim, n_class, n_feats, stride=2, dropout=0.1):
        super(SpeechRecognitionModel, self).__init__()
        n_feats = n_feats // 2
        self.cnn = nn.Conv2d(1, 32, 3, stride=stride, padding=3 // 2)
        self.rescnn_layers = nn.Sequential(*[
            ResidualCNN(32, 32, kernel=3, stride=1, dropout=dropout, n_feats=n_feats)
            for _ in range(n_cnn_layers)
        ])
        self.fully_connected = nn.Linear(n_feats * 32, rnn_dim)
        self.birnn_layers = nn.Sequential(*[
            BiDirectionalGRU(rnn_dim=rnn_dim if i == 0 else rnn_dim * 2,
                             hidden_size=rnn_dim, dropout=dropout, batch_first=i == 0)
            for i in range(n_rnn_layers)
        ])
        self.classifier = nn.Sequential(
            nn.Linear(rnn_dim * 2, rnn_dim),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(rnn_dim, n_class))

    def forward(self, x):
        x = self.cnn(x)
        x = self.rescnn_layers(x)
        sizes = x.size()
        x = x.view(sizes[0], sizes[1] * sizes[2], sizes[3])
        x = x.transpose(1, 2)
        x = self.fully_connected(x)
        x = self.birnn_layers(x)
        x = self.classifier(x)
        return x
learning_rate=5e-4
batch_size=16
epochs=5
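# Hyperparameters matching the trained checkpoint: 128 mel bins as input features
# and 29 output classes (28 character labels + CTC blank).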
hparams = {
    "n_cnn_layers": 3,
    "n_rnn_layers": 5,
    "rnn_dim": 512,
    "n_class": 29,
    "n_feats": 128,
    "stride": 2,
    "dropout": 0.1,
    "learning_rate": learning_rate,
    "batch_size": batch_size,
    "epochs": epochs
}
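# Instantiate the network and load the locally stored checkpoint on CPU.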
model = SpeechRecognitionModel(
    hparams['n_cnn_layers'], hparams['n_rnn_layers'], hparams['rnn_dim'],
    hparams['n_class'], hparams['n_feats'], hparams['stride'], hparams['dropout']
)
model.load_state_dict(torch.load("./Speech_recognition_model_2.pth", map_location=torch.device('cpu')))
#model = from_pretrained_fastai(repo_id)
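# Inference: load the audio file, resample to 16 kHz, compute a mel spectrogram,
# run the model, and greedily decode the per-frame argmax (CTC-style: drop blanks
# at index 28 and collapse consecutive repeats).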
def prediction(audio):
    valid_audio_transforms = torchaudio.transforms.MelSpectrogram(sample_rate=16000)
    spectrogram = []
    waveform, freq = torchaudio.load(audio)
    resampler = T.Resample(freq, 16000, dtype=waveform.dtype)
    resampled_waveform = resampler(waveform)
    spec = valid_audio_transforms(resampled_waveform).squeeze(0).transpose(0, 1)
    spectrogram.append(spec)
    spec = nn.utils.rnn.pad_sequence(spectrogram, batch_first=True).unsqueeze(1).transpose(2, 3)
    model.eval()
    with torch.no_grad():
        output = model(spec)
        output = F.log_softmax(output, dim=2)
        arg_maxes = torch.argmax(output, dim=2)
    decodes = []
    for i, args in enumerate(arg_maxes):
        decode = []
        for j, index in enumerate(args):
            if index != 28:
                if j != 0 and index == args[j - 1]:
                    continue
                decode.append(index.item())
        decodes.append(text_transform.int_to_text(decode))
    # a single audio file goes in, so return the single decoded transcription
    return decodes[0]
#microphone_input = gr.Audio( type="filepath", label="Record")
#upload_input = gr.Audio( type="filepath", label="Upload File")
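# Gradio UI: a single audio-file input, a text output for the transcription,
# and a few bundled example recordings.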
audio_input = gr.Audio(type='filepath', label="Upload an audio file")
transcription_output = gr.Textbox(label="Transcription")
examples = [["2902-9008-0000.flac"], ["2902-9008-0001.flac"], ["2902-9008-0002.flac"], ["2902-9008-0003.flac"], ["2902-9008-0004.flac"]]
interface = gr.Interface(fn=prediction, inputs=audio_input, outputs=transcription_output, examples=examples)
interface.launch(share=True)