import gradio as gr
import spaces ## For ZeroGPU
import torch
import torchaudio
from transformers import Wav2Vec2FeatureExtractor, Wav2Vec2ForSequenceClassification

device = "cuda" if torch.cuda.is_available() else "cpu"

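# Pretrained wav2vec2-based emotion classifier and its matching feature extractor from the Hugging Face Hub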
model_name = "Hatman/audio-emotion-detection"
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(model_name)
model = Wav2Vec2ForSequenceClassification.from_pretrained(model_name).to(device)

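# Load an audio file from disk and resample it to the 16 kHz rate the model expects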
def preprocess_audio(audio):
    waveform, sampling_rate = torchaudio.load(audio)
    resampled_waveform = torchaudio.transforms.Resample(orig_freq=sampling_rate, new_freq=16000)(waveform)
    return {'speech': resampled_waveform.numpy().flatten(), 'sampling_rate': 16000}

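# Full inference: returns the predicted emotion label plus the raw logits and class id for inspection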
@spaces.GPU ## For ZeroGPU
def inference(audio):
    example = preprocess_audio(audio)
    inputs = feature_extractor(example['speech'], sampling_rate=16000, return_tensors="pt", padding=True)
    inputs = {k: v.to(device) for k, v in inputs.items()}  # Keep inputs on the same device as the model
    with torch.no_grad():
        logits = model(**inputs).logits
    predicted_ids = torch.argmax(logits, dim=-1)
    return model.config.id2label[predicted_ids.item()], logits, predicted_ids   
    
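# Label-only inference used by the simpler tab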
@spaces.GPU ## For ZeroGPU
def inference_label(audio):
    example = preprocess_audio(audio)
    inputs = feature_extractor(example['speech'], sampling_rate=16000, return_tensors="pt", padding=True)
    inputs = {k: v.to(device) for k, v in inputs.items()}  # Keep inputs on the same device as the model
    with torch.no_grad():
        logits = model(**inputs).logits
    predicted_ids = torch.argmax(logits, dim=-1)
    return model.config.id2label[predicted_ids.item()]

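# Two-tab Gradio UI: one tab returns just the label, the other also exposes the raw model outputs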
with gr.Blocks() as demo:
    gr.Markdown("# Audio Sentiment Analysis")
    with gr.Tab("Label Only Inference"):
        gr.Interface(
            fn=inference_label,
            inputs=gr.Audio(type="filepath"),
            outputs=gr.Label(label="Predicted Sentiment"),
            title="Audio Sentiment Analysis",
            description="Upload an audio file or record one to get the predicted sentiment label."
        )

    with gr.Tab("Full Inference"):
        gr.Interface(
            fn=inference,
            inputs=gr.Audio(type="filepath"),
            outputs=[gr.Label(label="Predicted Sentiment"), gr.Textbox(label="Logits"), gr.Textbox(label="Predicted IDs")],
            title="Audio Sentiment Analysis (Full)",
            description="Upload an audio file or record one to analyze sentiment and get detailed results."
        )

demo.launch(share=True)