# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
# This file is copied & modified from UNESCO/MMS/blob/main/asr.py
import json

import gradio as gr
import numpy as np
import plotly.graph_objects as go
import torch
import torchaudio

from audiobox_aesthetics import infer as aes_infer

# Example clip(s) shown under the demo input (list of file paths).
ASR_EXAMPLES = ["assets/bach.wav"]

# Load the aesthetics predictor once at import time; `None` selects the
# package's default checkpoint.
aes_predictor = aes_infer.initialize_model(None)
def transcribe(audio_data=None):
    """Score an audio clip with the Audiobox Aesthetics predictor.

    Args:
        audio_data: Either a ``(sample_rate, samples)`` tuple from the Gradio
            microphone widget, or a temp-file path string from a file upload.

    Returns:
        A Plotly bar figure with the four aesthetics scores, or an error
        string when the input is missing or of an unexpected type.
    """
    if not audio_data:
        return "<<ERROR: Empty Audio Input>>"
    if isinstance(audio_data, tuple):
        # Microphone input: (sample_rate, ndarray of shape (T,) or (T, C)).
        sr, audio_samples = audio_data
        # Normalize 16-bit PCM to [-1, 1); float input is assumed to be
        # normalized already (the old unconditional division corrupted it).
        if audio_samples.dtype == np.int16:
            audio_samples = audio_samples / 32768.0
        audio_samples = torch.tensor(audio_samples.astype(np.float32))
        if audio_samples.ndim == 1:
            audio_samples = audio_samples[:, None]
        assert audio_samples.ndim == 2
        # torchaudio convention: (channels, time).
        audio_samples = audio_samples.t()
    else:
        # File upload: Gradio passes the file path as a string.
        if not isinstance(audio_data, str):
            return "<<ERROR: Invalid Audio Input Instance: {}>>".format(
                type(audio_data)
            )
        audio_samples, sr = torchaudio.load(audio_data)
    # The predictor accepts an in-memory waveform under the "path" key and
    # returns one JSON string per input item.
    scores = json.loads(
        aes_predictor.forward([{"path": audio_samples, "sample_rate": sr}])[0]
    )
    # Relabel and reorder the axes to match the paper's figures.
    scores = {
        "Production Quality": scores["PQ"],
        "Production Complexity": scores["PC"],
        "Content Enjoyment": scores["CE"],
        "Content Usefulness": scores["CU"],
    }
    # Bar plot of the four scores.
    fig = go.Figure()
    colors = ["#b1d8ff", "#fee2f5", "#cefac4", "#d2d3ff"]
    values = list(scores.values())
    keys = list(scores.keys())
    fig.add_trace(
        go.Bar(
            x=keys,
            y=values,
            text=[f"{v:.2f}" for v in values],  # two-decimal labels
            textposition="outside",  # place labels outside the bars
            marker=dict(color=colors),
        )
    )
    # Scores are on a 0-10 scale.
    fig.update_layout(
        yaxis=dict(range=[0, 10]),
        xaxis_title="Metrics",
        yaxis_title="Scores",
    )
    return fig
# Core interface: one audio input (microphone or upload) -> score bar plot.
main_interface = gr.Interface(
    fn=transcribe,
    inputs=[
        gr.Audio(),
    ],
    outputs=gr.Plot(),
    examples=ASR_EXAMPLES,
    title="Audiobox Aesthetics Demo Prediction",
    description="Play some audio through microphone or upload the file.",
    article="",
    allow_flagging="never",
)

# Disclaimer text; currently not rendered (see commented-out row below).
disclaimer = """
## Disclaimer
"""
# Assemble the page: title, paper/repo links, duplicate-space badge, the
# main interface, and a footer.
with gr.Blocks() as demo:
    gr.HTML(
        """
        <div style="text-align: center;">
          <h1>
            Meta Audiobox Aesthetics: Unified Automatic Quality Assessment for Speech, Music, and Sound
          </h1>
        </div>
        """
    )
    gr.Markdown(
        "<p align='center' style='font-size: 20px;'>See our <a href='https://arxiv.org/abs/2502.05139'>paper</a>, Github <a href='https://github.com/facebookresearch/audiobox-aesthetics'>repo</a> and HuggingFace <a href='https://huggingface.co/facebook/audiobox-aesthetics'>repo</a> </p>"
    )
    gr.HTML(
        """<center><a href="https://huggingface.co/spaces/facebook/audiobox-aesthetics?duplicate=true" style="display: inline-block;margin-top: .5em;margin-right: .25em;" target="_blank"><img style="margin-bottom: 0em;display: inline;margin-top: -.25em;" src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a> for more control and no queue.</center>"""
    )
    main_interface.render()
    gr.HTML(
        """
        <div class="footer" style="text-align:center">
          <p>
            Model by <a href="https://ai.facebook.com" style="text-decoration: underline;" target="_blank">Meta AI</a> - Gradio Demo by 🤗 Hugging Face
          </p>
        </div>
        """
    )
    # with gr.Row():
    #     gr.Markdown(disclaimer)
if __name__ == "__main__":
    # Enable request queueing (needed for long-running inference), then serve.
    demo.queue()
    demo.launch()