androstj's picture
initial commit
4126fb2
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
# This file is copied & modified from UNESCO/MMS/blob/main/asr.py
import gradio as gr
import numpy as np
import torch
import torchaudio
import json
from audiobox_aesthetics import infer as aes_infer
import plotly.graph_objects as go
ASR_EXAMPLES = [("assets/bach.wav")]
aes_predictor = aes_infer.initialize_model(None)
def transcribe(audio_data=None):
if not audio_data:
return "<<ERROR: Empty Audio Input>>"
if isinstance(audio_data, tuple):
# microphone
sr, audio_samples = audio_data
audio_samples = (audio_samples / 32768.0).astype(np.float32)
audio_samples = torch.tensor(audio_samples)
if audio_samples.ndim == 1:
audio_samples = audio_samples[:, None]
assert audio_samples.ndim == 2
audio_samples = audio_samples.t()
else:
# file upload
if not isinstance(audio_data, str):
return "<<ERROR: Invalid Audio Input Instance: {}>>".format(
type(audio_data)
)
audio_samples, sr = torchaudio.load(audio_data)
transcription = json.loads(
aes_predictor.forward([{"path": audio_samples, "sample_rate": sr}])[0]
)
# reorder like paper figures
transcription = {
"Production Quality": transcription["PQ"],
"Production Complexity": transcription["PC"],
"Content Enjoyment": transcription["CE"],
"Content Usefulness": transcription["CU"],
}
# Create a Plotly bar plot
fig = go.Figure()
colors = ["#b1d8ff", "#fee2f5", "#cefac4", "#d2d3ff"]
values = list(transcription.values())
keys = list(transcription.keys())
fig.add_trace(
go.Bar(
x=keys,
y=values,
text=[f"{v:.2f}" for v in values], # Format text to 2 decimal places
textposition="outside", # Position text outside the bars
marker=dict(color=colors),
)
)
# Set the range for the y-axis
fig.update_layout(
yaxis=dict(range=[0, 10]),
xaxis_title="Metrics",
yaxis_title="Scores",
)
return fig
main_interface = gr.Interface(
fn=transcribe,
inputs=[
gr.Audio(),
],
# outputs="text",
outputs=gr.Plot(),
examples=ASR_EXAMPLES,
title="Audiobox Aesthetics Demo Prediction",
description=("Play some audio through microphone or upload the file."),
article="",
allow_flagging="never",
)
disclaimer = """
## Disclaimer
"""
with gr.Blocks() as demo:
gr.HTML(
"""
<div style="text-align: center;">
<h1>
Meta Audiobox Aesthetics: Unified Automatic Quality Assessment for Speech, Music, and Sound
</h1>
</div>
"""
)
gr.Markdown(
"<p align='center' style='font-size: 20px;'>See our <a href='https://arxiv.org/abs/2502.05139'>paper</a>, Github <a href='https://github.com/facebookresearch/audiobox-aesthetics'>repo</a> and HuggingFace <a href='https://huggingface.co/facebook/audiobox-aesthetics'>repo</a> </p>"
)
gr.HTML(
"""<center><a href="https://huggingface.co/spaces/facebook/audiobox-aesthetics?duplicate=true" style="display: inline-block;margin-top: .5em;margin-right: .25em;" target="_blank"><img style="margin-bottom: 0em;display: inline;margin-top: -.25em;" src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a> for more control and no queue.</center>"""
)
main_interface.render()
gr.HTML(
"""
<div class="footer" style="text-align:center">
<p>
Model by <a href="https://ai.facebook.com" style="text-decoration: underline;" target="_blank">Meta AI</a> - Gradio Demo by πŸ€— Hugging Face
</p>
</div>
"""
)
# with gr.Row():
# gr.Markdown(disclaimer)
if __name__ == "__main__":
demo.queue()
demo.launch()