Manjushri committed on
Commit
3dbaeed
·
1 Parent(s): 477fcd5

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +341 -42
app.py CHANGED
@@ -1,44 +1,343 @@
1
- import torch #needed only for GPU
2
- from PIL import Image
3
- from io import BytesIO
4
- import numpy as np
5
- from diffusers import StableDiffusionLatentUpscalePipeline, StableDiffusionUpscalePipeline
6
- import gradio as gr
 
 
 
 
 
 
 
 
 
 
7
  import modin.pandas as pd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
 
9
- # load model for CPU or GPU
10
-
11
- model_2x = "stabilityai/sd-x2-latent-upscaler"
12
- model_4x = "stabilityai/stable-diffusion-x4-upscaler"
13
- device = "cuda" if torch.cuda.is_available() else "cpu"
14
- upscaler2x = StableDiffusionLatentUpscalePipeline.from_pretrained(model_2x, torch_dtype=torch.float16) if torch.cuda.is_available() else StableDiffusionLatentUpscalePipeline.from_pretrained(model_2x)
15
- upscaler4x = StableDiffusionUpscalePipeline.from_pretrained(model_4x, torch_dtype=torch.float16, revision="fp16") if torch.cuda.is_available() else StableDiffusionUpscalePipeline.from_pretrained(model_4x)
16
- upscaler2x = upscaler2x.to(device)
17
- upscaler4x = upscaler4x.to(device)
18
-
19
- #define interface
20
-
21
- def upscale(raw_img, model, prompt, negative_prompt, scale, steps, Seed):
22
- generator = torch.manual_seed(Seed)
23
- raw_img = Image.open(raw_img).convert("RGB")
24
- if model == "Upscaler 4x":
25
- low_res_img = raw_img.resize((128, 128))
26
- upscaled_image = upscaler4x(prompt=prompt, negative_prompt=negative_prompt, image=low_res_img, guidance_scale=scale, num_inference_steps=steps).images[0]
27
- else:
28
- upscaled_image = upscaler2x(prompt=prompt, negative_prompt=negative_prompt, image=raw_img, guidance_scale=scale, num_inference_steps=steps).images[0]
29
- return upscaled_image
30
-
31
- #launch interface
32
-
33
- gr.Interface(fn=upscale, inputs=[
34
- gr.Image(type="filepath", label='Lower Resolution Image'),
35
- gr.Radio(['Upscaler 2x','Upscaler 4x'], label="Models"),
36
- gr.Textbox(label="Optional: Enter a Prompt to Guide the AI's Enhancement, this can have an Img2Img Effect"),
37
- gr.Textbox(label='Experimental: Influence What you do not want the AI to Enhance. Such as Blur, Smudges, or Pixels'),
38
- gr.Slider(1, 15, 1, step=1, label='Guidance Scale: How much the AI influences the Upscaling.'),
39
- gr.Slider(5, 50, 5, step=1, label='Number of Iterations'),
40
- gr.Slider(minimum=1, maximum=999999999999999999, randomize=True, step=1)],
41
- outputs=gr.Image(type="filepath", label = 'Upscaled Image'),
42
- title='SD Upscaler',
43
- description='2x Latent Upscaler using SD 2.0 And 4x Upscaler using SD 2.1. This version runs on CPU or GPU and is currently running on a T4 GPU. For 4x Upscaling use images lower than 512x512, ideally 128x128 or smaller for 512x512 output. For 2x Upscaling use up to 512x512 images for 1024x1024 output.<br><br><b>Notice: Largest Accepted Resolution is 512x512',
44
- article = "Code Monkey: <a href=\"https://huggingface.co/Manjushri\">Manjushri</a>").launch(max_threads=True, debug=True)
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ # Updated to account for UI changes from https://github.com/rkfg/audiocraft/blob/long/app.py
8
+ # also released under the MIT license.
9
+
10
+ import argparse
11
+ from concurrent.futures import ProcessPoolExecutor
12
+ import os
13
+ import subprocess as sp
14
+ from tempfile import NamedTemporaryFile
15
+ import time
16
+ import warnings
17
  import modin.pandas as pd
18
+ import torch
19
+ import gradio as gr
20
+
21
+ from audiocraft.data.audio_utils import convert_audio
22
+ from audiocraft.data.audio import audio_write
23
+ from audiocraft.models import MusicGen
24
+
25
+
26
MODEL = None  # Last used model
# True when the SPACE_ID environment variable contains "facebook/MusicGen";
# selects the batched UI at start-up.
IS_BATCHED = "facebook/MusicGen" in os.environ.get('SPACE_ID', '')
MAX_BATCH_SIZE = 6  # passed as max_batch_size to the batched submit handler
BATCHED_DURATION = 15  # duration handed to _do_predictions in batched mode
# Set by interrupt(), cleared at the start of predict_full(), and polled by
# the progress callback that predict_full() installs.
INTERRUPTING = True
# We have to wrap subprocess call to clean a bit the log when using gr.make_waveform
_old_call = sp.call


def _call_nostderr(*args, **kwargs):
    """Run the original ``subprocess.call`` with stdout and stderr discarded.

    Any ``stdout``/``stderr`` supplied by the caller is overridden with
    ``subprocess.DEVNULL``.

    Returns:
        The value returned by the wrapped ``subprocess.call`` (the child's
        return code), so the patched function stays a drop-in replacement.
    """
    # Avoid ffmpeg vomiting on the logs.
    kwargs['stderr'] = sp.DEVNULL
    kwargs['stdout'] = sp.DEVNULL
    return _old_call(*args, **kwargs)


# Monkey-patch: every later `subprocess.call` in this process goes through
# the silencing wrapper.
sp.call = _call_nostderr
# Pool of worker processes that _do_predictions submits make_waveform jobs to.
pool = ProcessPoolExecutor(3)
pool.__enter__()
46
+
47
+
48
def interrupt():
    """Ask the generation currently in progress to stop.

    Only raises the module-level ``INTERRUPTING`` flag; the progress
    callback installed by ``predict_full`` reads that flag and aborts.
    """
    global INTERRUPTING
    INTERRUPTING = True
51
+
52
+
53
def make_waveform(*args, **kwargs):
    """Call ``gr.make_waveform`` with Python warnings silenced.

    All positional and keyword arguments are forwarded unchanged. The
    elapsed wall-clock time is printed once the call returns.

    Returns:
        Whatever ``gr.make_waveform`` returns.
    """
    started = time.time()
    with warnings.catch_warnings():
        # Further remove some warnings.
        warnings.simplefilter('ignore')
        video = gr.make_waveform(*args, **kwargs)
        print("Make a video took", time.time() - started)
    return video
61
+
62
+
63
def load_model(version='melody'):
    """Ensure the global ``MODEL`` holds the requested MusicGen checkpoint.

    Nothing happens when the cached model already has the requested name.
    Otherwise the previous model is released before the new one is fetched
    with ``MusicGen.get_pretrained``.

    Args:
        version: Checkpoint name passed to ``MusicGen.get_pretrained`` and
            compared against ``MODEL.name``.
    """
    global MODEL
    if MODEL is not None and MODEL.name == version:
        return
    # Log only when a load really happens, so the message is not misleading.
    print("Loading model", version)
    # Drop the reference to the previous model first, so the old and the new
    # weights do not have to be resident at the same time.
    MODEL = None
    torch.cuda.empty_cache()
    MODEL = MusicGen.get_pretrained(version)
68
+
69
+
70
def _do_predictions(texts, melodies, duration, progress=False, **gen_kwargs):
    """Generate one clip per description and render each as a waveform video.

    Args:
        texts: Text descriptions, one per requested clip.
        melodies: One entry per description; each is ``None`` or a pair
            ``(sample_rate, numpy_array)``.
            NOTE(review): array layout is presumably (samples, channels) as
            delivered by ``gr.Audio(type="numpy")`` -- confirm.
        duration: Passed to ``MODEL.set_generation_params``; also used to
            truncate each melody to ``sample_rate * duration`` samples.
        progress: Forwarded to the model's generate call.
        **gen_kwargs: Extra parameters for ``MODEL.set_generation_params``.

    Returns:
        A list with the result of ``make_waveform`` for every generated clip,
        in generation order.
    """
    MODEL.set_generation_params(duration=duration, **gen_kwargs)
    print("new batch", len(texts), texts, [None if m is None else (m[0], m[1].shape) for m in melodies])
    be = time.time()
    processed_melodies = []
    target_sr = 32000
    target_ac = 1
    for melody in melodies:
        if melody is None:
            processed_melodies.append(None)
            continue
        sr, melody = melody[0], torch.from_numpy(melody[1]).to(MODEL.device).float().t()
        if melody.dim() == 1:
            # 1-D input: add a leading dimension so the tensor is 2-D.
            melody = melody[None]
        melody = melody[..., :int(sr * duration)]
        melody = convert_audio(melody, sr, target_sr, target_ac)
        processed_melodies.append(melody)

    if any(m is not None for m in processed_melodies):
        outputs = MODEL.generate_with_chroma(
            descriptions=texts,
            melody_wavs=processed_melodies,
            melody_sample_rate=target_sr,
            progress=progress,
        )
    else:
        outputs = MODEL.generate(texts, progress=progress)

    outputs = outputs.detach().cpu().float()
    wav_paths = []
    out_files = []
    try:
        for output in outputs:
            # delete=False: the file has to outlive this `with` so the worker
            # process can read it by name.
            with NamedTemporaryFile("wb", suffix=".wav", delete=False) as file:
                wav_paths.append(file.name)
                audio_write(
                    file.name, output, MODEL.sample_rate, strategy="loudness",
                    loudness_headroom_db=16, loudness_compressor=True, add_suffix=False)
                out_files.append(pool.submit(make_waveform, file.name))
        res = [out_file.result() for out_file in out_files]
    finally:
        # The intermediate .wav files are only an input to make_waveform;
        # remove them so they do not pile up in the temp directory.
        for wav_path in wav_paths:
            try:
                os.remove(wav_path)
            except OSError:
                pass  # best effort: already gone or not removable
    print("batch finished", len(texts), time.time() - be)
    return res
109
+
110
+
111
def predict_batched(texts, melodies):
    """Batched handler: generate clips with the ``melody`` model.

    Each description is cut to at most 512 characters before generation,
    and every clip uses ``BATCHED_DURATION`` as its duration.

    Returns:
        A one-element list wrapping the list returned by ``_do_predictions``.
    """
    max_text_length = 512
    clipped = [prompt[:max_text_length] for prompt in texts]
    load_model('melody')
    return [_do_predictions(clipped, melodies, BATCHED_DURATION)]
117
+
118
+
119
def predict_full(model, text, melody, duration, topk, topp, temperature, cfg_coef, progress=gr.Progress()):
    """Single-request handler used by the full UI.

    Clears the interrupt flag, loads the requested model, installs a progress
    callback that also honours interruption, and generates one clip.

    Args:
        model: Checkpoint name forwarded to ``load_model``.
        text: Description of the music to generate.
        melody: Optional melody condition, forwarded to ``_do_predictions``.
        duration: Requested duration, forwarded to ``_do_predictions``.
        topk: Top-k sampling value; converted to ``int`` before use.
        topp: Top-p sampling value.
        temperature: Sampling temperature.
        cfg_coef: Classifier-free guidance coefficient.
        progress: Gradio progress tracker, called with
            ``(generated, to_generate)`` on every callback.

    Returns:
        The first (and only) element of the ``_do_predictions`` result.

    Raises:
        gr.Error: From the progress callback once ``INTERRUPTING`` is set.
    """
    global INTERRUPTING
    INTERRUPTING = False
    sampling = {'top_k': int(topk), 'top_p': topp, 'temperature': temperature, 'cfg_coef': cfg_coef}
    load_model(model)

    def _report(generated, to_generate):
        # Update the UI first, then abort if an interrupt was requested.
        progress((generated, to_generate))
        if INTERRUPTING:
            raise gr.Error("Interrupted.")

    MODEL.set_custom_progress_callback(_report)

    results = _do_predictions([text], [melody], duration, progress=True, **sampling)
    return results[0]
135
+
136
+
137
def ui_full(launch_kwargs):
    """Build and launch the full (non-batched) MusicGen interface.

    The UI exposes the model choice, duration and sampling parameters, an
    Interrupt button wired to ``interrupt``, and a set of examples.

    Args:
        launch_kwargs: Keyword arguments forwarded to ``launch``.
    """
    with gr.Blocks() as interface:
        gr.Markdown(
            """
            # MusicGen
            This is a demo for [MusicGen](https://github.com/facebookresearch/audiocraft), a simple and controllable model for music generation
            presented at: ["Simple and Controllable Music Generation"](https://huggingface.co/papers/2306.05284)
            """
        )
        with gr.Row():
            with gr.Column():
                with gr.Row():
                    text = gr.Text(label="Input Text", interactive=True)
                    melody = gr.Audio(source="upload", type="numpy", label="Melody Condition (optional)", interactive=True)
                with gr.Row():
                    submit = gr.Button("Submit")
                    # Adapted from https://github.com/rkfg/audiocraft/blob/long/app.py, MIT license.
                    # queue=False: the click is not put on the request queue.
                    gr.Button("Interrupt").click(fn=interrupt, queue=False)
                with gr.Row():
                    model = gr.Radio(["melody", "large", "medium", "small"], label="Model", value="melody", interactive=True)
                with gr.Row():
                    duration = gr.Slider(minimum=1, maximum=120, value=16, label="Duration", interactive=True)
                with gr.Row():
                    topk = gr.Number(label="Top-k", value=250, interactive=True)
                    topp = gr.Number(label="Top-p", value=0, interactive=True)
                    temperature = gr.Number(label="Temperature", value=1.0, interactive=True)
                    cfg_coef = gr.Number(label="Classifier Free Guidance", value=3.0, interactive=True)
            with gr.Column():
                output = gr.Video(label="Generated Music")
        submit.click(predict_full, inputs=[model, text, melody, duration, topk, topp, temperature, cfg_coef], outputs=[output])
        gr.Examples(
            fn=predict_full,
            examples=[
                [
                    "An 80s driving pop song with heavy drums and synth pads in the background",
                    "./assets/bach.mp3",
                    "melody"
                ],
                [
                    "A cheerful country song with acoustic guitars",
                    "./assets/bolero_ravel.mp3",
                    "melody"
                ],
                [
                    "90s rock song with electric guitar and heavy drums",
                    None,
                    "medium"
                ],
                [
                    "a light and cheerly EDM track, with syncopated drums, aery pads, and strong emotions",
                    "./assets/bach.mp3",
                    "melody"
                ],
                [
                    "lofi slow bpm electro chill with organic samples",
                    None,
                    "medium",
                ],
            ],
            inputs=[text, melody, model],
            outputs=[output]
        )
        gr.Markdown(
            """
            ### More details

            The model will generate a short music extract based on the description you provided.
            The model can generate up to 30 seconds of audio in one pass. It is now possible
            to extend the generation by feeding back the end of the previous chunk of audio.
            This can take a long time, and the model might lose consistency. The model might also
            decide at arbitrary positions that the song ends.

            **WARNING:** Choosing long durations will take a long time to generate (2min might take ~10min). An overlap of 12 seconds
            is kept with the previously generated chunk, and 18 "new" seconds are generated each time.

            We present 4 model variations:
            1. Melody -- a music generation model capable of generating music conditioned on text and melody inputs. **Note**, you can also use text only.
            2. Small -- a 300M transformer decoder conditioned on text only.
            3. Medium -- a 1.5B transformer decoder conditioned on text only.
            4. Large -- a 3.3B transformer decoder conditioned on text only (might OOM for the longest sequences.)

            When using `melody`, you can optionally provide a reference audio from
            which a broad melody will be extracted. The model will then try to follow both the description and melody provided.

            You can also use your own GPU or a Google Colab by following the instructions on our repo.
            See [github.com/facebookresearch/audiocraft](https://github.com/facebookresearch/audiocraft)
            for more details.
            """
        )

        interface.queue(max_size=2).launch(**launch_kwargs)
228
+
229
+
230
def ui_batched(launch_kwargs):
    """Build and launch the batched MusicGen interface.

    Requests are handled by ``predict_batched`` in batches of up to
    ``MAX_BATCH_SIZE``; only a description and an optional melody are exposed.

    Args:
        launch_kwargs: Keyword arguments forwarded to ``launch``.
    """
    with gr.Blocks() as demo:
        gr.Markdown(
            """
            # MusicGen

            This is the demo for [MusicGen](https://github.com/facebookresearch/audiocraft), a simple and controllable model for music generation
            presented at: ["Simple and Controllable Music Generation"](https://huggingface.co/papers/2306.05284).
            <br/>
            <a href="https://huggingface.co/spaces/facebook/MusicGen?duplicate=true" style="display: inline-block;margin-top: .5em;margin-right: .25em;" target="_blank">
            <img style="margin-bottom: 0em;display: inline;margin-top: -.25em;" src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a>
            for longer sequences, more control and no queue.
            """
        )
        with gr.Row():
            with gr.Column():
                with gr.Row():
                    text = gr.Text(label="Describe your music", lines=2, interactive=True)
                    melody = gr.Audio(source="upload", type="numpy", label="Condition on a melody (optional)", interactive=True)
                with gr.Row():
                    submit = gr.Button("Generate")
            with gr.Column():
                output = gr.Video(label="Generated Music")
        submit.click(predict_batched, inputs=[text, melody], outputs=[output], batch=True, max_batch_size=MAX_BATCH_SIZE)
        gr.Examples(
            fn=predict_batched,
            examples=[
                [
                    "An 80s driving pop song with heavy drums and synth pads in the background",
                    "./assets/bach.mp3",
                ],
                [
                    "A cheerful country song with acoustic guitars",
                    "./assets/bolero_ravel.mp3",
                ],
                [
                    "90s rock song with electric guitar and heavy drums",
                    None,
                ],
                [
                    "a light and cheerly EDM track, with syncopated drums, aery pads, and strong emotions bpm: 130",
                    "./assets/bach.mp3",
                ],
                [
                    "lofi slow bpm electro chill with organic samples",
                    None,
                ],
            ],
            inputs=[text, melody],
            outputs=[output]
        )
        # The advertised length is taken from BATCHED_DURATION so the help
        # text cannot drift away from what predict_batched really requests.
        gr.Markdown(f"""
            ### More details

            The model will generate {BATCHED_DURATION} seconds of audio based on the description you provided.
            You can optionally provide a reference audio from which a broad melody will be extracted.
            The model will then try to follow both the description and melody provided.
            All samples are generated with the `melody` model.

            You can also use your own GPU or a Google Colab by following the instructions on our repo.

            See [github.com/facebookresearch/audiocraft](https://github.com/facebookresearch/audiocraft)
            for more details.
            """)

        demo.queue(max_size=3).launch(**launch_kwargs)
296
+
297
+
298
if __name__ == "__main__":
    # Command-line options; each one maps onto a Gradio launch() argument.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--listen',
        type=str,
        default='0.0.0.0' if 'SPACE_ID' in os.environ else '127.0.0.1',
        help='IP to listen on for connections to Gradio',
    )
    parser.add_argument('--username', type=str, default='', help='Username for authentication')
    parser.add_argument('--password', type=str, default='', help='Password for authentication')
    parser.add_argument(
        '--server_port',
        type=int,
        default=0,
        help='Port to run the server listener on',
    )
    parser.add_argument('--inbrowser', action='store_true', help='Open in browser')
    parser.add_argument('--share', action='store_true', help='Share the gradio UI')

    args = parser.parse_args()

    launch_kwargs = {'server_name': args.listen}

    # Authentication is enabled only when both credentials are given.
    if args.username and args.password:
        launch_kwargs['auth'] = (args.username, args.password)
    # The remaining options are forwarded only when set to a truthy value,
    # so launch() keeps its own defaults otherwise.
    for option in ('server_port', 'inbrowser', 'share'):
        value = getattr(args, option)
        if value:
            launch_kwargs[option] = value

    # Show the interface
    build_ui = ui_batched if IS_BATCHED else ui_full
    build_ui(launch_kwargs)