EconLabAI committed
Commit 3696a79 · verified · 1 Parent(s): 043880b

Update app.py

Files changed (1):
  app.py (+86 -51)
app.py CHANGED
@@ -8,67 +8,102 @@ import gradio as gr
 import os
 from huggingface_hub import login

 # Authentication
 if os.environ.get("HUGGING_FACE_HUB_TOKEN"):
-    login(token=os.environ["HUGGING_FACE_HUB_TOKEN"])
-

-@spaces.GPU(duration=180)
 def generate_audio(prompt, duration=10, steps=50, cfg_scale=7):
-    device = "cuda" if torch.cuda.is_available() else "cpu"
-
-    # Load the model and move it to the device
-    model, model_config = get_pretrained_model("stabilityai/stable-audio-open-1.0")
-    model = model.to(device)
-
-    sample_rate = model_config["sample_rate"]
-    sample_size = model_config["sample_size"]

-    # Set up the conditioning
-    conditioning = [{
-        "prompt": prompt,
-        "seconds_start": 0,
-        "seconds_total": duration
-    }]

-    # Generate audio with adjustable parameters
-    output = generate_diffusion_cond(
-        model,
-        steps=steps,
-        cfg_scale=cfg_scale,
-        conditioning=conditioning,
-        sample_size=sample_size,
-        sigma_min=0.3,
-        sigma_max=500,
-        sampler_type="dpmpp-3m-sde",  # better sampler
-        device=device
-    )

-    # Collapse the audio batch into a single sequence
-    output = rearrange(output, "b d n -> d (b n)")

-    # Peak normalization, clipping, conversion to int16
-    output = output.to(torch.float32).div(torch.max(torch.abs(output))).clamp(-1, 1).mul(32767).to(torch.int16).cpu()

-    return output, sample_rate

-def generate(prompt, duration=10, steps=50, cfg_scale=7):
-    audio, sr = generate_audio(prompt, duration, steps, cfg_scale)
-    return (sr, audio.numpy())

-# Improved user interface
-iface = gr.Interface(
-    fn=generate,
-    inputs=[
-        gr.Textbox(label="Prompt", placeholder="Describe the desired sound..."),
-        gr.Slider(minimum=1, maximum=30, value=10, step=1, label="Duration (seconds)"),
-        gr.Slider(minimum=20, maximum=100, value=50, step=5, label="Number of steps"),
-        gr.Slider(minimum=1, maximum=15, value=7, step=0.5, label="CFG Scale"),
-    ],
-    outputs=gr.Audio(label="Generated audio"),
-    title="Stable Audio Generator",
-    description="Generate audio from text descriptions with Stable Audio 1.0",
-)

 if __name__ == "__main__":
-    iface.launch()
 
 import os
 from huggingface_hub import login

+# Create a folder for temporary files
+os.makedirs('static', exist_ok=True)
+
 # Authentication
 if os.environ.get("HUGGING_FACE_HUB_TOKEN"):
+    token = os.environ["HUGGING_FACE_HUB_TOKEN"].strip()
+    try:
+        login(token=token, add_to_git_credential=True)
+    except Exception as e:
+        print(f"Warning: login failed - {str(e)}")

+@spaces.GPU(duration=300)
 def generate_audio(prompt, duration=10, steps=50, cfg_scale=7):
+    try:
+        device = "cuda" if torch.cuda.is_available() else "cpu"
+
+        # Load the model and move it to the device
+        model, model_config = get_pretrained_model("stabilityai/stable-audio-open-1.0")
+        model = model.to(device)
+
+        sample_rate = model_config["sample_rate"]
+        sample_size = model_config["sample_size"]

+        # Set up the conditioning
+        conditioning = [{
+            "prompt": prompt,
+            "seconds_start": 0,
+            "seconds_total": duration  # no longer capped
+        }]

+        # Generate audio with adjustable parameters
+        output = generate_diffusion_cond(
+            model,
+            steps=steps,  # no longer capped
+            cfg_scale=cfg_scale,
+            conditioning=conditioning,
+            sample_size=sample_size,
+            sigma_min=0.3,
+            sigma_max=500,
+            sampler_type="dpmpp-3m-sde",
+            device=device
+        )

+        # Process the audio
+        output = rearrange(output, "b d n -> d (b n)")
+        output = output.to(torch.float32).div(torch.max(torch.abs(output))).clamp(-1, 1).mul(32767).to(torch.int16).cpu()

+        # Save the audio
+        output_path = "static/generated_audio.wav"
+        torchaudio.save(output_path, output, model_config["sample_rate"])
+
+        return output_path

+    except Exception as e:
+        print(f"Error during audio generation: {str(e)}")
+        raise e

+# Custom CSS for a nicer look
+custom_css = """
+body { background-color: #f6f6f6; }
+.gradio-container { max-width: 800px; margin: auto; }
+"""

+# Gradio interface built with Blocks
+with gr.Blocks(css=custom_css) as demo:
+    gr.Markdown("# Stable Audio Generator")
+    gr.Markdown("Generate audio from text descriptions with Stable Audio 1.0")
+
+    with gr.Row():
+        with gr.Column():
+            prompt = gr.Textbox(
+                label="Prompt",
+                placeholder="Describe the desired sound..."
+            )
+            duration = gr.Slider(
+                minimum=1, maximum=30, value=10,
+                step=1, label="Duration (seconds)"
+            )
+            steps = gr.Slider(
+                minimum=20, maximum=100, value=50,
+                step=5, label="Number of steps"
+            )
+            cfg_scale = gr.Slider(
+                minimum=1, maximum=15, value=7,
+                step=0.5, label="CFG Scale"
+            )
+            generate_btn = gr.Button("Generate")
+
+        with gr.Column():
+            output = gr.Audio(label="Generated audio", type="filepath")
+
+    generate_btn.click(
+        fn=generate_audio,
+        inputs=[prompt, duration, steps, cfg_scale],
+        outputs=output
+    )

 if __name__ == "__main__":
+    demo.launch()
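
The hunk starts at line 8 of app.py, so the file's import header is not shown. Judging from the names the code uses (spaces.GPU, torch, rearrange, get_pretrained_model, generate_diffusion_cond, and the new torchaudio.save call), the lines above the hunk presumably follow the standard stable-audio-tools setup. The sketch below is for orientation only, not the commit's actual content; note in particular that torchaudio must already be imported up there, since this commit adds a torchaudio.save call without adding the import.

# Presumed head of app.py (lines 1-7, not shown in the diff) - a sketch
import spaces                    # Hugging Face ZeroGPU decorator
import torch
import torchaudio                # required by the new torchaudio.save call
from einops import rearrange
from stable_audio_tools import get_pretrained_model
from stable_audio_tools.inference.generation import generate_diffusion_cond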