import spaces
from huggingface_hub import from_pretrained_fastai
import gradio as gr
import torch
import torchaudio
import torchaudio.functional as Ft
import torchaudio.transforms as T
import os
from pathlib import Path

repo_id = "NeerAbhy/Speech_recognotion_model/Speech_recognition_model_2.pth"


class TextTransform:
    """Maps characters to integer labels and back for CTC decoding."""

    def __init__(self):
        char_map_str = """
        ' 0
        <SPACE> 1
        a 2
        b 3
        c 4
        d 5
        e 6
        f 7
        g 8
        h 9
        i 10
        j 11
        k 12
        l 13
        m 14
        n 15
        o 16
        p 17
        q 18
        r 19
        s 20
        t 21
        u 22
        v 23
        w 24
        x 25
        y 26
        z 27
        """
        self.char_map = {}
        self.index_map = {}
        for line in char_map_str.strip().split('\n'):
            ch, index = line.split()
            self.char_map[ch] = int(index)
            self.index_map[int(index)] = ch
        self.index_map[1] = ' '

    def text_to_int(self, text):
        """Convert a text string into a sequence of integer labels."""
        int_sequence = []
        for c in text:
            if c == ' ':
                ch = self.char_map['<SPACE>']
            else:
                ch = self.char_map[c]
            int_sequence.append(ch)
        return int_sequence

    def int_to_text(self, labels):
        """Convert a sequence of integer labels back into a text string."""
        string = []
        for i in labels:
            string.append(self.index_map[i])
        return ''.join(string).replace('<SPACE>', ' ')


from torch import nn

trainaudio_transforms = nn.Sequential(
    torchaudio.transforms.MelSpectrogram(sample_rate=16000, n_mels=128),
    torchaudio.transforms.FrequencyMasking(freq_mask_param=15),
    torchaudio.transforms.TimeMasking(time_mask_param=35))

text_transform = TextTransform()

import torch.nn.functional as F


class CNNLayerNorm(nn.Module):
    """Layer norm over the feature dimension of a (batch, channel, feature, time) tensor."""

    def __init__(self, n_feats):
        super(CNNLayerNorm, self).__init__()
        self.layer_norm = nn.LayerNorm(n_feats)

    def forward(self, x):
        # x: (batch, channel, feature, time)
        x = x.transpose(2, 3).contiguous()     # (batch, channel, time, feature)
        x = self.layer_norm(x)
        return x.transpose(2, 3).contiguous()  # back to (batch, channel, feature, time)


class ResidualCNN(nn.Module):
    """Residual convolutional block with layer norm, GELU and dropout."""

    def __init__(self, in_channels, out_channels, kernel, stride, dropout, n_feats):
        super(ResidualCNN, self).__init__()
        self.cnn1 = nn.Conv2d(in_channels, out_channels, kernel, stride, padding=kernel // 2)
        self.cnn2 = nn.Conv2d(out_channels, out_channels, kernel, stride, padding=kernel // 2)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)
        self.layernorm1 = CNNLayerNorm(n_feats)
        self.layernorm2 = CNNLayerNorm(n_feats)

    def forward(self, x):
        residual = x
        x = self.layernorm1(x)
        x = self.dropout1(x)
        x = F.gelu(x)
        x = self.cnn1(x)
        x = self.layernorm2(x)
        x = self.dropout2(x)
        x = F.gelu(x)
        x = self.cnn2(x)
        x += residual
        return x


class BiDirectionalGRU(nn.Module):
    """Single bidirectional GRU layer preceded by layer norm and GELU."""

    def __init__(self, rnn_dim, hidden_size, dropout, batch_first):
        super(BiDirectionalGRU, self).__init__()
        self.BiGRU = nn.GRU(
            input_size=rnn_dim, hidden_size=hidden_size,
            num_layers=1, batch_first=batch_first, bidirectional=True)
        self.layernorm = nn.LayerNorm(rnn_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        x = self.layernorm(x)
        x = F.gelu(x)
        x, _ = self.BiGRU(x)
        x = self.dropout(x)
        return x


class SpeechRecognitionModel(nn.Module):
    """Residual CNN front-end followed by bidirectional GRU layers and a character classifier."""

    def __init__(self, n_cnn_layers, n_rnn_layers, rnn_dim, n_class, n_feats, stride=2, dropout=0.1):
        super(SpeechRecognitionModel, self).__init__()
        n_feats = n_feats // 2
        # Initial conv halves the feature dimension (stride 2)
        self.cnn = nn.Conv2d(1, 32, 3, stride=stride, padding=3 // 2)
        self.rescnn_layers = nn.Sequential(*[
            ResidualCNN(32, 32, kernel=3, stride=1, dropout=dropout, n_feats=n_feats)
            for _ in range(n_cnn_layers)
        ])
        self.fully_connected = nn.Linear(n_feats * 32, rnn_dim)
        self.birnn_layers = nn.Sequential(*[
            BiDirectionalGRU(rnn_dim=rnn_dim if i == 0 else rnn_dim * 2,
                             hidden_size=rnn_dim, dropout=dropout, batch_first=i == 0)
            for i in range(n_rnn_layers)
        ])
        self.classifier = nn.Sequential(
            nn.Linear(rnn_dim * 2, rnn_dim),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(rnn_dim, n_class))

    def forward(self, x):
        x = self.cnn(x)
        x = self.rescnn_layers(x)
        sizes = x.size()
        x = x.view(sizes[0], sizes[1] * sizes[2], sizes[3])  # (batch, channel * feature, time)
        x = x.transpose(1, 2)                                # (batch, time, feature)
        x = self.fully_connected(x)
        x = self.birnn_layers(x)
        x = self.classifier(x)
        return x


learning_rate = 5e-4
batch_size = 16
epochs = 5

hparams = {
    "n_cnn_layers": 3,
    "n_rnn_layers": 5,
    "rnn_dim": 512,
    "n_class": 29,   # 28 characters + CTC blank
    "n_feats": 128,
    "stride": 2,
    "dropout": 0.1,
    "learning_rate": learning_rate,
    "batch_size": batch_size,
    "epochs": epochs,
}

model = SpeechRecognitionModel(
    hparams['n_cnn_layers'], hparams['n_rnn_layers'], hparams['rnn_dim'],
    hparams['n_class'], hparams['n_feats'], hparams['stride'], hparams['dropout'])

model.load_state_dict(torch.load("./Speech_recognition_model_2.pth", map_location=torch.device('cpu')))
# model = from_pretrained_fastai(repo_id)


def prediction(audio):
    """Transcribe an audio file using the model and greedy CTC decoding."""
    valid_audio_transforms = torchaudio.transforms.MelSpectrogram(sample_rate=16000)
    spectrogram = []
    waveform, freq = torchaudio.load(audio)  # expects a mono recording
    resampler = T.Resample(freq, 16000, dtype=waveform.dtype)
    resampled_waveform = resampler(waveform)
    spec = valid_audio_transforms(resampled_waveform).squeeze(0).transpose(0, 1)
    spectrogram.append(spec)
    spec = nn.utils.rnn.pad_sequence(spectrogram, batch_first=True).unsqueeze(1).transpose(2, 3)

    model.eval()
    with torch.no_grad():
        output = model(spec)
        output = F.log_softmax(output, dim=2)
        arg_maxes = torch.argmax(output, dim=2)

    # Greedy CTC decoding: skip the blank label (index 28) and collapse repeated labels.
    decodes = []
    for i, args in enumerate(arg_maxes):
        decode = []
        for j, index in enumerate(args):
            if index != 28:
                if j != 0 and index == args[j - 1]:
                    continue
                decode.append(index.item())
        decodes.append(text_transform.int_to_text(decode))
    return decodes


# microphone_input = gr.Audio(type="filepath", label="Record")
# upload_input = gr.Audio(type="filepath", label="Upload File")
audio_input = gr.Audio(type='filepath', label="Upload an audio file")
transcription_output = gr.Textbox(label="Predicted transcription")

examples = [
    ["2902-9008-0000.flac"],
    ["2902-9008-0001.flac"],
    ["2902-9008-0002.flac"],
    ["2902-9008-0003.flac"],
    ["2902-9008-0004.flac"],
]

interface = gr.Interface(fn=prediction, inputs=audio_input, outputs=transcription_output, examples=examples)
interface.launch(share=True)
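
# Hypothetical local sanity check (not part of the Space UI): with the interface stopped,
# one of the bundled example clips can be transcribed directly, e.g.
#   print(prediction("2902-9008-0000.flac"))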