Tom Auger committed
Commit 4d91e6f · unverified · 1 parent: b58645b

initial commit

Files changed (3):
  1. README.md +3 -3
  2. app.py +367 -0
  3. requirements.txt +10 -0
README.md CHANGED
@@ -1,8 +1,8 @@
  ---
- title: Stable Diffusion Music Vid Gen
+ title: Stable Diffusion Video Gen
  emoji: 💻
- colorFrom: yellow
- colorTo: yellow
+ colorFrom: purple
+ colorTo: blue
  sdk: gradio
  sdk_version: 3.35.2
  app_file: app.py
app.py ADDED
@@ -0,0 +1,367 @@
# Experimental app to help with the process of generating music videos
# Requires youtube-dl to be installed
# pip install youtube-dl

import gradio as gr
import librosa
from pathlib import Path
import numpy as np
import random
from io import BytesIO
import soundfile as sf
from matplotlib import pyplot as plt

from stable_diffusion_videos import StableDiffusionWalkPipeline, generate_images, get_timesteps_arr

from diffusers.models import AutoencoderKL
from diffusers.schedulers import LMSDiscreteScheduler
from diffusers.utils.import_utils import is_xformers_available
import torch
import youtube_dl
import os

pipe = StableDiffusionWalkPipeline.from_pretrained(
    'runwayml/stable-diffusion-v1-5',
    torch_dtype=torch.float16,
    safety_checker=None,
    vae=AutoencoderKL.from_pretrained("stabilityai/sd-vae-ft-mse", torch_dtype=torch.float16).to("cuda"),
    scheduler=LMSDiscreteScheduler(
        beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear"
    )
).to("cuda")

if is_xformers_available():
    pipe.enable_xformers_memory_efficient_attention()


def download_example_clip(url, output_dir='./', output_filename='%(title)s.%(ext)s'):
    if (Path(output_dir) / output_filename).exists():
        return str(Path(output_dir) / output_filename)

    files_before = os.listdir(output_dir) if os.path.exists(output_dir) else []
    ydl_opts = {
        'outtmpl': str(Path(output_dir) / output_filename),
        'format': 'bestaudio',
        'extract-audio': True,
        'audio-format': 'mp3',
        'audio-quality': 0,
    }
    with youtube_dl.YoutubeDL(ydl_opts) as ydl:
        ydl.download([url])

    files_after = os.listdir(output_dir)
    return str(Path(output_dir) / list(set(files_after) - set(files_before))[0])


def audio_data_to_buffer(y, sr):
    audio_filepath = BytesIO()
    audio_filepath.name = 'audio.wav'
    sf.write(audio_filepath, y, samplerate=sr, format='WAV')
    audio_filepath.seek(0)
    return audio_filepath


def plot_array(y):
    fig = plt.figure()
    x = np.arange(y.shape[0])
    plt.title("Line graph")
    plt.xlabel("X axis")
    plt.ylabel("Y axis")
    plt.plot(x, y, color="red")
    plt.savefig('timesteps_chart.png')
    return fig


def on_slice_btn_click(audio, audio_start_sec, duration, fps, smooth, margin):
    if audio is None:
        return [
            gr.update(visible=False),
            gr.update(visible=False),
        ]

    y, sr = librosa.load(audio, offset=audio_start_sec, duration=duration)
    T = get_timesteps_arr(
        audio_data_to_buffer(y, sr),
        0,
        duration,
        fps=fps,
        margin=margin,
        smooth=smooth,
    )
    return [gr.update(value=(sr, y), visible=True), gr.update(value=plot_array(T), visible=True)]


def on_audio_change_or_clear(audio):
    if audio is None:
        return [
            gr.update(visible=False),
            gr.update(visible=False)
        ]

    duration = librosa.get_duration(filename=audio)
    return [
        gr.update(maximum=int(duration), visible=True),
        gr.update(maximum=int(min(10, duration)), visible=True)
    ]


def on_update_weight_settings_btn_click(sliced_audio, duration, fps, smooth, margin):
    if sliced_audio is None:
        return gr.update(visible=False)

    T = get_timesteps_arr(
        sliced_audio,
        0,
        duration,
        fps=fps,
        margin=margin,
        smooth=smooth,
    )
    return gr.update(value=plot_array(T), visible=True)


def on_generate_images_btn_click(
    prompt_a,
    prompt_b,
    seed_a,
    seed_b,
    output_dir,
    num_inference_steps,
    guidance_scale,
    height,
    width,
    upsample,
):
    output_dir = Path(output_dir) / 'images'

    if seed_a == -1:
        seed_a = random.randint(0, 9999999)
    if seed_b == -1:
        seed_b = random.randint(0, 9999999)

    image_a_fpath = generate_images(
        pipe,
        prompt_a,
        seeds=[seed_a],
        num_inference_steps=num_inference_steps,
        guidance_scale=guidance_scale,
        height=height,
        width=width,
        upsample=upsample,
        output_dir=output_dir
    )[0]
    image_b_fpath = generate_images(
        pipe,
        prompt_b,
        seeds=[seed_b],
        num_inference_steps=num_inference_steps,
        guidance_scale=guidance_scale,
        height=height,
        width=width,
        upsample=upsample,
        output_dir=output_dir
    )[0]

    return [
        gr.update(value=image_a_fpath, visible=True),
        gr.update(value=image_b_fpath, visible=True),
        gr.update(value=seed_a),
        gr.update(value=seed_b),
    ]


def on_generate_music_video_btn_click(
    audio_filepath,
    audio_start_sec,
    duration,
    fps,
    smooth,
    margin,
    prompt_a,
    prompt_b,
    seed_a,
    seed_b,
    batch_size,
    output_dir,
    num_inference_steps,
    guidance_scale,
    height,
    width,
    upsample,
):
    if audio_filepath is None:
        return gr.update(visible=False)

    video_filepath = pipe.walk(
        prompts=[prompt_a, prompt_b],
        seeds=[seed_a, seed_b],
        num_interpolation_steps=int(duration * fps),
        output_dir=output_dir,
        fps=fps,
        num_inference_steps=num_inference_steps,
        guidance_scale=guidance_scale,
        height=height,
        width=width,
        upsample=upsample,
        batch_size=batch_size,
        audio_filepath=audio_filepath,
        audio_start_sec=audio_start_sec,
        margin=margin,
        smooth=smooth,
    )
    return gr.update(value=video_filepath, visible=True)


audio_start_sec = gr.Slider(0, 10, 0, step=1, label="Start (sec)", interactive=True)
duration = gr.Slider(0, 10, 1, step=1, label="Duration (sec)", interactive=True)
slice_btn = gr.Button("Slice Audio")

sliced_audio = gr.Audio(type='filepath')
wav_plot = gr.Plot(label="Interpolation Weights Per Frame")

fps = gr.Slider(1, 60, 12, step=1, label="FPS", interactive=True)
smooth = gr.Slider(0, 1, 0.0, label="Smoothing", interactive=True)
margin = gr.Slider(1.0, 20.0, 1.0, step=0.5, label="Margin Max", interactive=True)
update_weight_settings_btn = gr.Button("Update Interpolation Weights")

prompt_a = gr.Textbox(value='blueberry spaghetti', label="Prompt A")
prompt_b = gr.Textbox(value='strawberry spaghetti', label="Prompt B")
seed_a = gr.Number(-1, label="Seed A", precision=0, interactive=True)
seed_b = gr.Number(-1, label="Seed B", precision=0, interactive=True)
generate_images_btn = gr.Button("Generate Images")
image_a = gr.Image(visible=False, label="Image A")
image_b = gr.Image(visible=False, label="Image B")

batch_size = gr.Slider(1, 32, 1, step=1, label="Batch Size", interactive=True)
generate_music_video_btn = gr.Button("Generate Music Video")
video = gr.Video(visible=False, label="Video")

STEP_1_MARKDOWN = """
## 1. Upload Some Audio

Upload an audio file to use as the source for the music video.
"""

STEP_2_MARKDOWN = """
## 2. Slice Portion of Audio for Generated Clip

Here you can slice a portion of the audio to use for the generated music video. The longer the audio, the more frames will be generated (which will take longer).

I suggest using this app to make music videos in segments of 5-10 seconds at a time. You can then stitch the segments together later with a video editor or ffmpeg.

**Warning**: If your audio file is short, I do not check whether the duration you chose exceeds the length of the audio. That may cause some issues, so just be mindful of it.
"""

STEP_3_MARKDOWN = """
## 3. Set Interpolation Weight Settings

This section lets you play with the settings used to configure how we move through the latent space given the audio you sliced.

If you look at the graph on the right, the X-axis is the frame index and the Y-axis is the weight of Image A as we move through the latent space.

If you listen to the audio slice and look at the graph, you should see bumps at points where the audio energy is high (in our case, percussive energy).
"""

STEP_4_MARKDOWN = """
## 4. Select Prompts, Seeds, Settings, and Generate Images

Here you can select the settings for image generation.

Then, you can select prompts and seeds for generating images.

- Image A will be the first frame of the generated video.
- Image B will be the last frame of the generated video.
- The video will be generated by interpolating between the two images using the audio you provided.

If you set the seeds to -1, a random seed will be used and saved for you, so you can explore different images given the same prompt.
"""


with gr.Blocks() as demo:
    gr.Markdown(STEP_1_MARKDOWN)
    audio = gr.Audio(type='filepath', interactive=True)
    gr.Examples(
        [
            download_example_clip(
                url='https://soundcloud.com/nateraw/thoughts',
                output_dir='./music',
                output_filename='thoughts.mp3'
            )
        ],
        inputs=audio,
        outputs=[audio_start_sec, duration],
        fn=on_audio_change_or_clear,
        cache_examples=False
    )
    audio.change(on_audio_change_or_clear, audio, [audio_start_sec, duration])
    audio.clear(on_audio_change_or_clear, audio, [audio_start_sec, duration])

    gr.Markdown(STEP_2_MARKDOWN)
    audio_start_sec.render()
    duration.render()
    slice_btn.render()

    slice_btn.click(on_slice_btn_click, [audio, audio_start_sec, duration, fps, smooth, margin], [sliced_audio, wav_plot])
    sliced_audio.render()

    gr.Markdown(STEP_3_MARKDOWN)

    with gr.Row():
        with gr.Column(scale=4):
            fps.render()
            smooth.render()
            margin.render()
            update_weight_settings_btn.render()
            update_weight_settings_btn.click(
                on_update_weight_settings_btn_click,
                [sliced_audio, duration, fps, smooth, margin],
                wav_plot
            )
        with gr.Column(scale=3):
            wav_plot.render()

    gr.Markdown(STEP_4_MARKDOWN)

    with gr.Accordion("Additional Settings", open=False):
        output_dir = gr.Textbox(value='./dreams', label="Output Directory")
        num_inference_steps = gr.Slider(1, 200, 50, step=10, label="Diffusion Inference Steps", interactive=True)
        guidance_scale = gr.Slider(1.0, 25.0, 7.5, step=0.5, label="Guidance Scale", interactive=True)
        height = gr.Slider(512, 1024, 512, step=64, label="Height", interactive=True)
        width = gr.Slider(512, 1024, 512, step=64, label="Width", interactive=True)
        upsample = gr.Checkbox(value=False, label="Upsample with Real-ESRGAN")

    with gr.Row():
        with gr.Column(scale=4):
            prompt_a.render()
        with gr.Column(scale=1):
            seed_a.render()

    with gr.Row():
        with gr.Column(scale=4):
            prompt_b.render()
        with gr.Column(scale=1):
            seed_b.render()

    generate_images_btn.render()

    with gr.Row():
        with gr.Column(scale=1):
            image_a.render()
        with gr.Column(scale=1):
            image_b.render()

    generate_images_btn.click(
        on_generate_images_btn_click,
        [prompt_a, prompt_b, seed_a, seed_b, output_dir, num_inference_steps, guidance_scale, height, width, upsample],
        [image_a, image_b, seed_a, seed_b]
    )

    gr.Markdown("## 5. Generate Music Video")
    # TODO - add equivalent code snippet to generate music video
    batch_size.render()
    generate_music_video_btn.render()
    generate_music_video_btn.click(
        on_generate_music_video_btn_click,
        [audio, audio_start_sec, duration, fps, smooth, margin, prompt_a, prompt_b, seed_a, seed_b, batch_size, output_dir, num_inference_steps, guidance_scale, height, width, upsample],
        video
    )
    video.render()


if __name__ == '__main__':
    demo.launch(debug=True)
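
Editor's note: the "Generate Music Video" button above ultimately calls pipe.walk with the settings collected by the UI. The sketch below shows roughly how the same call could be made outside Gradio; it assumes the stable-diffusion-videos package from requirements.txt, omits the custom VAE/scheduler swap for brevity, and the prompts, seeds, duration, and audio path are illustrative values, not part of this commit.

import torch
from stable_diffusion_videos import StableDiffusionWalkPipeline

# Load roughly the same pipeline app.py builds at import time
pipe = StableDiffusionWalkPipeline.from_pretrained(
    'runwayml/stable-diffusion-v1-5',
    torch_dtype=torch.float16,
    safety_checker=None,
).to('cuda')

# Mirrors the call made in on_generate_music_video_btn_click; values are illustrative
video_filepath = pipe.walk(
    prompts=['blueberry spaghetti', 'strawberry spaghetti'],
    seeds=[42, 1337],
    num_interpolation_steps=5 * 12,          # duration (sec) * fps
    output_dir='./dreams',
    fps=12,
    num_inference_steps=50,
    guidance_scale=7.5,
    height=512,
    width=512,
    upsample=False,
    batch_size=1,
    audio_filepath='./music/thoughts.mp3',   # hypothetical path to a local audio clip
    audio_start_sec=0,
    margin=1.0,
    smooth=0.0,
)
print(video_filepath)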
requirements.txt ADDED
@@ -0,0 +1,10 @@
# A10G needs cuda 11.3
--extra-index-url https://download.pytorch.org/whl/cu113
torch==1.12.0+cu113
torchvision==0.13.0+cu113

# Stable diffusion videos repo with the realesrgan extra for upsampling
stable-diffusion-videos[realesrgan]

# Youtube-dl for downloading examples
youtube-dl
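
Editor's note: step 2 of the app suggests generating 5-10 second segments and stitching them together afterwards with a video editor or ffmpeg. As a minimal sketch of the ffmpeg route (the helper name and clip paths are illustrative, and ffmpeg must be installed and on PATH), segments that share codec, resolution, and fps can be concatenated without re-encoding:

import subprocess
from pathlib import Path

def stitch_clips(clip_paths, output_path='music_video.mp4'):
    # ffmpeg's concat demuxer reads one "file '<path>'" line per clip, in order
    list_file = Path('clips.txt')
    list_file.write_text('\n'.join(f"file '{Path(p).resolve()}'" for p in clip_paths))
    # -c copy concatenates streams without re-encoding; clips must share codec/resolution/fps
    subprocess.run(
        ['ffmpeg', '-y', '-f', 'concat', '-safe', '0',
         '-i', str(list_file), '-c', 'copy', output_path],
        check=True,
    )
    return output_path

# Example usage with hypothetical per-segment outputs:
# stitch_clips(['./dreams/segment_00.mp4', './dreams/segment_01.mp4'])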