# Speech / app.py
import spaces
from huggingface_hub import from_pretrained_fastai
import gradio as gr
import torch
import torchaudio
import torchaudio.functional as Ft
import torchaudio.transforms as T
import os
from pathlib import Path
repo_id = "NeerAbhy/Speech_recognotion_model/Speech_recognition_model_2.pth"
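# Character/label mapping used for CTC training and decoding:
# indices 0-27 cover the apostrophe, <SPACE> and the letters a-z;
# index 28 is reserved for the CTC blank (hence n_class = 29 below).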
class TextTransform:
    def __init__(self):
        char_map_str = """
        ' 0
        <SPACE> 1
        a 2
        b 3
        c 4
        d 5
        e 6
        f 7
        g 8
        h 9
        i 10
        j 11
        k 12
        l 13
        m 14
        n 15
        o 16
        p 17
        q 18
        r 19
        s 20
        t 21
        u 22
        v 23
        w 24
        x 25
        y 26
        z 27
        """
        self.char_map = {}
        self.index_map = {}
        for line in char_map_str.strip().split('\n'):
            ch, index = line.split()
            self.char_map[ch] = int(index)
            self.index_map[int(index)] = ch
        self.index_map[1] = ' '

    def text_to_int(self, text):
        int_sequence = []
        for c in text:
            if c == ' ':
                ch = self.char_map['<SPACE>']
            else:
                ch = self.char_map[c]
            int_sequence.append(ch)
        return int_sequence

    def int_to_text(self, labels):
        string = []
        for i in labels:
            string.append(self.index_map[i])
        return ''.join(string).replace('<SPACE>', ' ')
from torch import nn
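# SpecAugment-style training transforms (mel spectrogram + frequency/time masking).
# Kept here for reference; inference below uses a plain MelSpectrogram only.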
trainaudio_transforms = nn.Sequential(
    torchaudio.transforms.MelSpectrogram(sample_rate=16000, n_mels=128),
    torchaudio.transforms.FrequencyMasking(freq_mask_param=15),
    torchaudio.transforms.TimeMasking(time_mask_param=35))
text_transform = TextTransform()
import torch.nn.functional as F
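# LayerNorm over the feature axis of a (batch, channel, feature, time) tensor,
# applied by swapping the feature and time dimensions around nn.LayerNorm.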
class CNNLayerNorm(nn.Module):
    def __init__(self, n_feats):
        super(CNNLayerNorm, self).__init__()
        self.layer_norm = nn.LayerNorm(n_feats)

    def forward(self, x):
        x = x.transpose(2, 3).contiguous()
        x = self.layer_norm(x)
        return x.transpose(2, 3).contiguous()
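# Residual block: two (LayerNorm -> Dropout -> GELU -> Conv2d) stages with a skip connection.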
class ResidualCNN(nn.Module):
    def __init__(self, in_channels, out_channels, kernel, stride, dropout, n_feats):
        super(ResidualCNN, self).__init__()
        self.cnn1 = nn.Conv2d(in_channels, out_channels, kernel, stride, padding=kernel // 2)
        self.cnn2 = nn.Conv2d(out_channels, out_channels, kernel, stride, padding=kernel // 2)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)
        self.layernorm1 = CNNLayerNorm(n_feats)
        self.layernorm2 = CNNLayerNorm(n_feats)

    def forward(self, x):
        residual = x
        x = self.layernorm1(x)
        x = self.dropout1(x)
        x = F.gelu(x)
        x = self.cnn1(x)
        x = self.layernorm2(x)
        x = self.dropout2(x)
        x = F.gelu(x)
        x = self.cnn2(x)
        x += residual
        return x
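# Single bidirectional GRU layer preceded by LayerNorm + GELU and followed by dropout.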
class BiDirectionalGRU(nn.Module):
    def __init__(self, rnn_dim, hidden_size, dropout, batch_first):
        super(BiDirectionalGRU, self).__init__()
        self.BiGRU = nn.GRU(
            input_size=rnn_dim, hidden_size=hidden_size,
            num_layers=1, batch_first=batch_first, bidirectional=True)
        self.layernorm = nn.LayerNorm(rnn_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        x = self.layernorm(x)
        x = F.gelu(x)
        x, _ = self.BiGRU(x)
        x = self.dropout(x)
        return x
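# Acoustic model: an initial Conv2d downsamples the mel spectrogram (stride 2 halves n_feats),
# a stack of ResidualCNN blocks extracts features, a linear layer projects to rnn_dim,
# stacked bidirectional GRUs model the sequence, and the classifier emits per-frame
# scores over the 29 output classes.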
class SpeechRecognitionModel(nn.Module):
    def __init__(self, n_cnn_layers, n_rnn_layers, rnn_dim, n_class, n_feats, stride=2, dropout=0.1):
        super(SpeechRecognitionModel, self).__init__()
        n_feats = n_feats // 2
        self.cnn = nn.Conv2d(1, 32, 3, stride=stride, padding=3 // 2)
        self.rescnn_layers = nn.Sequential(*[
            ResidualCNN(32, 32, kernel=3, stride=1, dropout=dropout, n_feats=n_feats)
            for _ in range(n_cnn_layers)
        ])
        self.fully_connected = nn.Linear(n_feats * 32, rnn_dim)
        self.birnn_layers = nn.Sequential(*[
            BiDirectionalGRU(rnn_dim=rnn_dim if i == 0 else rnn_dim * 2,
                             hidden_size=rnn_dim, dropout=dropout, batch_first=i == 0)
            for i in range(n_rnn_layers)
        ])
        self.classifier = nn.Sequential(
            nn.Linear(rnn_dim * 2, rnn_dim),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(rnn_dim, n_class))

    def forward(self, x):
        x = self.cnn(x)
        x = self.rescnn_layers(x)
        sizes = x.size()
        x = x.view(sizes[0], sizes[1] * sizes[2], sizes[3])
        x = x.transpose(1, 2)
        x = self.fully_connected(x)
        x = self.birnn_layers(x)
        x = self.classifier(x)
        return x
learning_rate=5e-4
batch_size=16
epochs=5
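# Hyperparameters matching the trained checkpoint: 128 mel bins as input features
# and 29 output classes (28 character labels + CTC blank).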
hparams = {
    "n_cnn_layers": 3,
    "n_rnn_layers": 5,
    "rnn_dim": 512,
    "n_class": 29,
    "n_feats": 128,
    "stride": 2,
    "dropout": 0.1,
    "learning_rate": learning_rate,
    "batch_size": batch_size,
    "epochs": epochs
}
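# Instantiate the network and load the locally stored checkpoint on CPU.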
model = SpeechRecognitionModel(
    hparams['n_cnn_layers'], hparams['n_rnn_layers'], hparams['rnn_dim'],
    hparams['n_class'], hparams['n_feats'], hparams['stride'], hparams['dropout']
)
model.load_state_dict(torch.load("./Speech_recognition_model_2.pth", map_location=torch.device('cpu')))
#model = from_pretrained_fastai(repo_id)
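# Inference: load the audio file, resample to 16 kHz, compute a mel spectrogram,
# run the model, and greedily decode the per-frame argmax (CTC-style: drop blanks
# at index 28 and collapse consecutive repeats).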
def prediction(audio):
    valid_audio_transforms = torchaudio.transforms.MelSpectrogram(sample_rate=16000)
    spectrogram = []
    waveform, freq = torchaudio.load(audio)
    resampler = T.Resample(freq, 16000, dtype=waveform.dtype)
    resampled_waveform = resampler(waveform)
    spec = valid_audio_transforms(resampled_waveform).squeeze(0).transpose(0, 1)
    spectrogram.append(spec)
    spec = nn.utils.rnn.pad_sequence(spectrogram, batch_first=True).unsqueeze(1).transpose(2, 3)
    model.eval()
    with torch.no_grad():
        output = model(spec)
        output = F.log_softmax(output, dim=2)
        arg_maxes = torch.argmax(output, dim=2)
    decodes = []
    for i, args in enumerate(arg_maxes):
        decode = []
        for j, index in enumerate(args):
            if index != 28:
                if j != 0 and index == args[j - 1]:
                    continue
                decode.append(index.item())
        decodes.append(text_transform.int_to_text(decode))
    # a single audio file goes in, so return the single decoded transcription
    return decodes[0]
#microphone_input = gr.Audio( type="filepath", label="Record")
#upload_input = gr.Audio( type="filepath", label="Upload File")
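# Gradio UI: a single audio-file input, a text output for the transcription,
# and a few bundled example recordings.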
audio_input = gr.Audio(type='filepath', label="Upload an audio file")
transcription_output = gr.Textbox(label="Transcription")
examples = [["2902-9008-0000.flac"], ["2902-9008-0001.flac"], ["2902-9008-0002.flac"], ["2902-9008-0003.flac"], ["2902-9008-0004.flac"]]
interface = gr.Interface(fn=prediction, inputs=audio_input, outputs=transcription_output, examples=examples)
interface.launch(share=True)