Spaces:
Running
Running
"""Set up the Gradio interface""" | |
import gradio as gr | |
from transformers import pipeline | |
from TTS.api import TTS | |
# Load pre-trained emotion detection model | |
emotion_classifier = pipeline("text-classification", model="bhadresh-savani/distilbert-base-uncased-emotion") | |
# Load TTS model | |
tts_model = TTS(model_name="tts_models/en/ljspeech/tacotron2-DDC") | |
# Emotion-specific settings for pitch and speed | |
emotion_settings = { | |
"neutral": {"pitch": 1.0, "speed": 1.0}, | |
"joy": {"pitch": 1.3, "speed": 1.2}, | |
"sadness": {"pitch": 0.8, "speed": 0.9}, | |
"anger": {"pitch": 1.6, "speed": 1.4}, | |
"fear": {"pitch": 1.2, "speed": 0.95}, | |
"surprise": {"pitch": 1.5, "speed": 1.3}, | |
"disgust": {"pitch": 0.9, "speed": 0.95}, | |
"shame": {"pitch": 0.8, "speed": 0.85}, | |
} | |
import librosa | |
import soundfile as sf | |
def adjust_audio_speed(audio_path, speed_factor): | |
y, sr = librosa.load(audio_path) | |
y_speeded = librosa.effects.time_stretch(y, speed_factor) | |
sf.write(audio_path, y_speeded, sr) | |
def adjust_audio_pitch(audio_path, pitch_factor): | |
y, sr = librosa.load(audio_path) | |
y_shifted = librosa.effects.pitch_shift(y, sr, n_steps=pitch_factor) | |
sf.write(audio_path, y_shifted, sr) | |
def emotion_aware_tts_pipeline(input_text=None, file_input=None): | |
try: | |
# Get text from input or file | |
if file_input: | |
with open(file_input.name, 'r') as file: | |
input_text = file.read() | |
if input_text: | |
# Detect emotion | |
emotion_data = emotion_classifier(input_text)[0] | |
emotion = emotion_data['label'] | |
confidence = emotion_data['score'] | |
# Adjust pitch and speed | |
settings = emotion_settings.get(emotion.lower(), {"pitch": 1.0, "speed": 1.0}) | |
pitch = settings["pitch"] | |
speed = settings["speed"] | |
# Generate audio | |
audio_path = "output.wav" | |
tts_model.tts_to_file(text=input_text, file_path=audio_path) | |
# Adjust pitch and speed using librosa | |
if pitch != 1.0: | |
adjust_audio_pitch(audio_path, pitch) | |
if speed != 1.0: | |
adjust_audio_speed(audio_path, speed) | |
return f"Detected Emotion: {emotion} (Confidence: {confidence:.2f})", audio_path | |
else: | |
return "Please provide input text or file", None | |
except Exception as e: | |
return f"Error: {str(e)}", None | |
# Define Gradio interface | |
iface = gr.Interface( | |
fn=emotion_aware_tts_pipeline, | |
inputs=[ | |
gr.Textbox(label="Input Text", placeholder="Enter text here"), | |
gr.File(label="Upload a Text File") | |
], | |
outputs=[ | |
gr.Textbox(label="Detected Emotion"), | |
gr.Audio(label="Generated Audio") | |
], | |
title="Emotion-Aware Text-to-Speech", | |
description="Input text or upload a text file to detect the emotion and generate audio with emotion-aware modulation." | |
) | |
# Launch Gradio interface | |
iface.launch() |