Spaces:

NeerAbhy
/

Speech

Sleeping

File size: 6,844 Bytes

import spaces
from huggingface_hub import from_pretrained_fastai

import gradio as gr

import torch
import torchaudio
import torchaudio.functional as Ft
import torchaudio.transforms as T

import os
from pathlib import Path

# Identifier passed to the commented-out `from_pretrained_fastai(repo_id)` call
# further down. The active code path does not read it: the weights are loaded
# from a local .pth file instead.
# NOTE(review): looks like a Hub location of the checkpoint — confirm.
repo_id = "NeerAbhy/Speech_recognotion_model/Speech_recognition_model_2.pth"
class TextTransform:
    """Two-way lookup between characters and integer labels.

    The embedded table lists the apostrophe, the ``<SPACE>`` token and the
    lowercase letters ``a``-``z`` with the indices 0-27.
    """

    def __init__(self):
        """Parse the embedded table into ``char_map`` and ``index_map``."""
        char_map_str = """
 ' 0
 <SPACE> 1
 a 2
 b 3
 c 4
 d 5
 e 6
 f 7
 g 8
 h 9
 i 10
 j 11
 k 12
 l 13
 m 14
 n 15
 o 16
 p 17
 q 18
 r 19
 s 20
 t 21
 u 22
 v 23
 w 24
 x 25
 y 26
 z 27
 """
        # Every row of the table reads "<symbol> <index>".
        rows = [row.split() for row in char_map_str.strip().split('\n')]
        self.char_map = {symbol: int(code) for symbol, code in rows}
        self.index_map = {int(code): symbol for symbol, code in rows}
        # Label 1 decodes directly to a real blank rather than the token text.
        self.index_map[1] = ' '

    def text_to_int(self, text):
        """Return the list of integer labels for ``text``.

        A blank is looked up through the ``<SPACE>`` entry; any character that
        is not in the table raises ``KeyError``.
        """
        return [self.char_map['<SPACE>' if c == ' ' else c] for c in text]

    def int_to_text(self, labels):
        """Return the string spelled by the integer ``labels``.

        A label that is not in the table raises ``KeyError``.
        """
        decoded = ''.join(self.index_map[label] for label in labels)
        return decoded.replace('<SPACE>', ' ')


from torch import nn
# Mel-spectrogram pipeline followed by frequency and time masking.
# It is built here but nothing else in the visible code references it
# (prediction() creates its own MelSpectrogram) — presumably kept from
# training; confirm before removing.
trainaudio_transforms = nn.Sequential(
torchaudio.transforms.MelSpectrogram(sample_rate = 16000, n_mels = 128),
torchaudio.transforms.FrequencyMasking(freq_mask_param = 15),
torchaudio.transforms.TimeMasking(time_mask_param = 35))


# Shared character <-> label table; prediction() uses it to turn label
# sequences back into text.
text_transform = TextTransform()

import torch.nn.functional as F
class CNNLayerNorm(nn.Module):
    """Layer normalisation over dimension 2 of a convolutional feature map.

    ``nn.LayerNorm`` normalises the last dimension, so dimensions 2 and 3 are
    swapped before the call and swapped back afterwards.
    """

    def __init__(self, n_feats):
        """Create the normaliser for a dimension-2 size of ``n_feats``."""
        super().__init__()
        self.layer_norm = nn.LayerNorm(n_feats)

    def forward(self, x):
        """Normalise ``x`` along dimension 2 and return a contiguous tensor.

        Assumes ``x`` is laid out as (batch, channel, feature, time) — TODO
        confirm against callers; only the swap of dims 2 and 3 is fixed here.
        """
        feature_last = x.transpose(2, 3).contiguous()
        normalised = self.layer_norm(feature_last)
        return normalised.transpose(2, 3).contiguous()
        

class ResidualCNN(nn.Module):
    """Residual block of two (layer norm -> dropout -> GELU -> conv) stages.

    The block input is added to the output of the second convolution, so both
    convolutions have to keep the tensor shape (equal channel counts and
    stride 1, which is how SpeechRecognitionModel builds it).
    """

    def __init__(self, in_channels, out_channels, kernel, stride, dropout, n_feats):
        """Build both stages.

        Args:
            in_channels: channels entering the first convolution.
            out_channels: channels produced by both convolutions.
            kernel: square kernel size; padding is ``kernel // 2``.
            stride: stride of both convolutions.
            dropout: dropout probability of both stages.
            n_feats: size of dimension 2, normalised by ``CNNLayerNorm``.
        """
        super().__init__()

        half_kernel = kernel // 2
        self.cnn1 = nn.Conv2d(in_channels, out_channels, kernel, stride, padding=half_kernel)
        self.cnn2 = nn.Conv2d(out_channels, out_channels, kernel, stride, padding=half_kernel)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)
        self.layernorm1 = CNNLayerNorm(n_feats)
        self.layernorm2 = CNNLayerNorm(n_feats)

    def forward(self, x):
        """Run both stages on ``x`` and add ``x`` back onto the result."""
        out = self.cnn1(F.gelu(self.dropout1(self.layernorm1(x))))
        out = self.cnn2(F.gelu(self.dropout2(self.layernorm2(out))))
        # In-place add on the fresh convolution output; the input is untouched.
        out += x
        return out
    
class BiDirectionalGRU(nn.Module):
    """Layer norm and GELU followed by a one-layer bidirectional GRU and dropout."""

    def __init__(self, rnn_dim, hidden_size, dropout, batch_first):
        """Build the block.

        Args:
            rnn_dim: size of the last input dimension (normalised, then fed to the GRU).
            hidden_size: hidden size per direction; the output is twice as wide.
            dropout: dropout probability applied to the GRU output.
            batch_first: forwarded unchanged to ``nn.GRU``.
        """
        super().__init__()
        self.BiGRU = nn.GRU(
            input_size=rnn_dim,
            hidden_size=hidden_size,
            num_layers=1,
            batch_first=batch_first,
            bidirectional=True,
        )
        self.layernorm = nn.LayerNorm(rnn_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Return the GRU outputs for ``x``; the final hidden state is discarded."""
        activated = F.gelu(self.layernorm(x))
        outputs, _ = self.BiGRU(activated)
        return self.dropout(outputs)
    
    
class SpeechRecognitionModel(nn.Module):
    """Convolutional front end, bidirectional GRU stack and per-frame classifier.

    Args:
        n_cnn_layers: number of ``ResidualCNN`` blocks.
        n_rnn_layers: number of ``BiDirectionalGRU`` blocks.
        rnn_dim: hidden size per GRU direction and width of the linear layers.
        n_class: number of scores produced for every frame.
        n_feats: size of dimension 2 of the input (the feature axis).
        stride: stride of the first convolution (int, or a pair whose first
            entry applies to the feature axis).
        dropout: dropout probability used in every sub-block.
    """

    def __init__(self, n_cnn_layers, n_rnn_layers, rnn_dim, n_class, n_feats, stride = 2, dropout = 0.1):
        super().__init__()
        # Length of the feature axis after the first convolution
        # (kernel 3, padding 1): floor((n_feats - 1) / stride) + 1.
        # For the default stride of 2 and an even n_feats this equals
        # n_feats // 2; deriving it keeps the layer sizes consistent for other
        # strides and odd feature counts as well.
        feat_stride = stride[0] if isinstance(stride, (tuple, list)) else stride
        n_feats = (n_feats - 1) // feat_stride + 1
        self.cnn = nn.Conv2d(1, 32, 3, stride=stride, padding=3 // 2)

        residual_blocks = [
            ResidualCNN(32, 32, kernel=3, stride=1, dropout=dropout, n_feats=n_feats)
            for _ in range(n_cnn_layers)
        ]
        self.rescnn_layers = nn.Sequential(*residual_blocks)
        self.fully_connected = nn.Linear(n_feats * 32, rnn_dim)

        # The first GRU block reads rnn_dim features; the following ones read
        # the doubled width emitted by a bidirectional GRU.
        # NOTE(review): only the first block is batch_first, so the later
        # blocks treat dimension 0 as the sequence axis. Left as is because
        # the saved weights were presumably trained with this wiring —
        # confirm before changing.
        recurrent_blocks = [
            BiDirectionalGRU(rnn_dim=rnn_dim if i == 0 else rnn_dim * 2,
                             hidden_size=rnn_dim, dropout=dropout, batch_first=i == 0)
            for i in range(n_rnn_layers)
        ]
        self.birnn_layers = nn.Sequential(*recurrent_blocks)
        self.classifier = nn.Sequential(
            nn.Linear(rnn_dim * 2, rnn_dim),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(rnn_dim, n_class),
        )

    def forward(self, x):
        """Return raw per-frame class scores.

        Args:
            x: tensor of shape (batch, 1, n_feats, time).

        Returns:
            Tensor of shape (batch, frames, n_class), where ``frames`` is the
            length of the time axis after the first convolution.
        """
        x = self.cnn(x)
        x = self.rescnn_layers(x)
        batch, channels, feats, frames = x.size()
        # Fold channels and features into one axis, then put time second.
        x = x.view(batch, channels * feats, frames)
        x = x.transpose(1, 2)
        x = self.fully_connected(x)
        x = self.birnn_layers(x)
        return self.classifier(x)


# These three values are only stored in `hparams` below; nothing in the
# visible code reads them back (presumably kept from training — confirm).
learning_rate=5e-4
batch_size=16
epochs=5

# Settings used to rebuild the network. The architecture entries must match
# the checkpoint loaded below, otherwise load_state_dict() rejects the weights.
hparams = {
    "n_cnn_layers": 3,
    "n_rnn_layers": 5,
    "rnn_dim": 512,
    # 28 symbols of TextTransform (labels 0-27) plus label 28, which
    # prediction() skips as the blank.
    "n_class": 29,
    # prediction() builds its MelSpectrogram without n_mels; assumes the
    # torchaudio default equals this value — TODO confirm.
    "n_feats": 128,
    "stride":2,
    "dropout": 0.1,
    "learning_rate": learning_rate,
    "batch_size": batch_size,
    "epochs": epochs
}

# Build the network once at import time; prediction() reuses this instance.
model = SpeechRecognitionModel(
        hparams['n_cnn_layers'], hparams['n_rnn_layers'], hparams['rnn_dim'],
        hparams['n_class'], hparams['n_feats'], hparams['stride'], hparams['dropout']
        )
# The weights file is resolved relative to the current working directory and
# all tensors are mapped onto the CPU.
model.load_state_dict(torch.load("./Speech_recognition_model_2.pth",map_location=torch.device('cpu')))
#model = from_pretrained_fastai(repo_id)
def _greedy_decode(label_rows, blank_label=28, collapse_repeated=True):
    """Turn per-frame argmax labels into text, one string per row.

    Frames equal to ``blank_label`` are dropped. With ``collapse_repeated`` a
    frame is also dropped when it carries the same label as the frame right
    before it.
    """
    transcripts = []
    for row in label_rows:
        labels = row.tolist()
        kept = []
        for pos, label in enumerate(labels):
            if label == blank_label:
                continue
            if collapse_repeated and pos != 0 and label == labels[pos - 1]:
                continue
            kept.append(label)
        transcripts.append(text_transform.int_to_text(kept))
    return transcripts


def prediction(audio):
    """Transcribe one audio file with the module-level ``model``.

    Args:
        audio: path of the audio file to transcribe.

    Returns:
        A list containing one decoded string (the batch holds a single item).

    Raises:
        ValueError: if no file path is given.
    """
    if not audio:
        raise ValueError("No audio file was provided.")

    waveform, freq = torchaudio.load(audio)
    # torchaudio.load returns (channels, frames). The network takes exactly
    # one input channel, so multi-channel recordings are averaged to mono;
    # otherwise the spectrogram reaches the model with the wrong layout.
    if waveform.size(0) > 1:
        waveform = waveform.mean(dim=0, keepdim=True)

    resampler = T.Resample(freq, 16000, dtype=waveform.dtype)
    resampled_waveform = resampler(waveform)

    valid_audio_transforms = torchaudio.transforms.MelSpectrogram(sample_rate=16000)
    # (1, n_mels, frames) -> (1, 1, n_mels, frames): a batch of one
    # single-channel spectrogram, the layout the first Conv2d consumes.
    spec = valid_audio_transforms(resampled_waveform).unsqueeze(1)

    model.eval()
    with torch.no_grad():
        output = model(spec)
        output = F.log_softmax(output, dim=2)

    arg_maxes = torch.argmax(output, dim=2)
    return _greedy_decode(arg_maxes)
# Gradio UI: one audio file in, the text decoded by prediction() out.
audio_input = gr.Audio(type='filepath', label="Upload an audio file")
# The variable name is kept so existing references keep working; the box shows
# the decoded transcript, so it is labelled accordingly.
emotion_output = gr.Textbox(label="Transcription")
examples = [
    ["2902-9008-0000.flac"],
    ["2902-9008-0001.flac"],
    ["2902-9008-0002.flac"],
    ["2902-9008-0003.flac"],
    ["2902-9008-0004.flac"],
]
interface = gr.Interface(fn=prediction, inputs=audio_input, outputs=emotion_output, examples=examples)

# NOTE(review): share=True asks Gradio for a public share link when the app is
# run locally — confirm that this is wanted.
interface.launch(share=True)