File size: 3,517 Bytes
99705b8
 
 
 
 
 
 
7dbebe4
 
 
3696a79
 
 
7dbebe4
 
3696a79
 
 
 
 
3919a75
3696a79
99705b8
3696a79
 
 
 
 
 
 
 
 
99705b8
3696a79
 
 
 
 
 
99705b8
3696a79
 
 
 
 
 
 
 
 
 
 
 
99705b8
3696a79
 
 
99705b8
3696a79
 
 
 
 
99705b8
3696a79
 
 
99705b8
3696a79
 
 
 
 
99705b8
3696a79
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
99705b8
 
3696a79
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
import spaces
import torch
import torchaudio
from einops import rearrange
from stable_audio_tools import get_pretrained_model
from stable_audio_tools.inference.generation import generate_diffusion_cond
import gradio as gr
import os
from huggingface_hub import login

# Create the folder that receives the generated audio files.
os.makedirs("static", exist_ok=True)

# Authentication: log in to the Hugging Face Hub only when a non-empty
# token is provided through the environment.
if os.environ.get("HUGGING_FACE_HUB_TOKEN", None):
    # Surrounding whitespace is removed before the token is handed to login().
    token = os.environ["HUGGING_FACE_HUB_TOKEN"].strip()
    try:
        login(add_to_git_credential=True, token=token)
    except Exception as e:
        # Best effort: a failed login is reported as a warning and start-up continues.
        print(f"Warnung: Login fehlgeschlagen - {str(e)}")

@spaces.GPU(duration=300)
def generate_audio(prompt, duration=10, steps=50, cfg_scale=7):
    """Generate an audio clip from a text prompt and save it as a WAV file.

    Loads the "stabilityai/stable-audio-open-1.0" model, runs conditional
    diffusion sampling, peak-normalises the result, converts it to 16-bit
    integers and writes it to ``static/generated_audio.wav``.

    Args:
        prompt: Text description used as the conditioning prompt.
        duration: Passed to the model as ``seconds_total``.
        steps: Number of sampling steps passed to ``generate_diffusion_cond``.
        cfg_scale: Value passed through as ``cfg_scale`` to the sampler.

    Returns:
        The path of the written WAV file.

    Raises:
        Exception: Any error raised while loading, sampling or saving is
            printed and then re-raised unchanged.
    """
    try:
        device = "cuda" if torch.cuda.is_available() else "cpu"

        # Load the model and move it to the selected device.
        model, model_config = get_pretrained_model("stabilityai/stable-audio-open-1.0")
        model = model.to(device)

        sample_rate = model_config["sample_rate"]
        sample_size = model_config["sample_size"]

        # Set up the conditioning for the requested prompt and length.
        conditioning = [{
            "prompt": prompt,
            "seconds_start": 0,
            "seconds_total": duration,
        }]

        # Generate audio with the caller-supplied sampling parameters.
        output = generate_diffusion_cond(
            model,
            # NOTE(review): cast defensively in case the UI slider delivers a
            # float; presumably the sampler needs an integer step count.
            steps=int(steps),
            cfg_scale=cfg_scale,
            conditioning=conditioning,
            sample_size=sample_size,
            sigma_min=0.3,
            sigma_max=500,
            sampler_type="dpmpp-3m-sde",
            device=device,
        )

        # Merge the batch axis into the last axis, then work in float32.
        output = rearrange(output, "b d n -> d (b n)").to(torch.float32)

        # Peak-normalise, but skip the division for an all-zero signal:
        # dividing by a zero peak would turn every sample into NaN.
        peak = output.abs().max()
        if peak > 0:
            output = output / peak
        output = output.clamp(-1, 1).mul(32767).to(torch.int16).cpu()

        # Save the audio; make sure the target directory exists first.
        output_path = "static/generated_audio.wav"
        os.makedirs(os.path.dirname(output_path), exist_ok=True)
        torchaudio.save(output_path, output, sample_rate)

        return output_path

    except Exception as e:
        print(f"Fehler bei der Audiogenerierung: {str(e)}")
        # Bare raise re-raises the active exception unchanged.
        raise

# Custom CSS for a nicer look: light page background and a centred,
# width-limited Gradio container.
custom_css = (
    "\n"
    "body { background-color: #f6f6f6; }\n"
    ".gradio-container { max-width: 800px; margin: auto; }\n"
)

# Gradio interface built with Blocks: inputs on the left, audio player on the right.
with gr.Blocks(css=custom_css) as demo:
    gr.Markdown("# Stable Audio Generator")
    gr.Markdown("Generieren Sie Audio aus Textbeschreibungen mit Stable Audio 1.0")

    with gr.Row():
        # Input column: prompt text plus the three generation parameters.
        with gr.Column():
            prompt = gr.Textbox(
                placeholder="Beschreiben Sie den gewünschten Sound...",
                label="Prompt",
            )
            duration = gr.Slider(label="Dauer (Sekunden)", minimum=1, maximum=30, step=1, value=10)
            steps = gr.Slider(label="Anzahl der Schritte", minimum=20, maximum=100, step=5, value=50)
            cfg_scale = gr.Slider(label="CFG Scale", minimum=1, maximum=15, step=0.5, value=7)
            generate_btn = gr.Button("Generieren")

        # Output column: the audio component receives the file path returned
        # by generate_audio.
        with gr.Column():
            output = gr.Audio(type="filepath", label="Generiertes Audio")

    # The inputs are passed to generate_audio in the order listed here.
    generate_btn.click(
        generate_audio,
        inputs=[prompt, duration, steps, cfg_scale],
        outputs=output,
    )

# Launch the app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()