ylacombe committed on
Commit
bda8ed2
·
verified ·
1 Parent(s): a204cc2

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +63 -45
app.py CHANGED
@@ -24,7 +24,7 @@ from typing import TYPE_CHECKING, Any, Dict, Optional, Tuple, Union
24
 
25
  import copy
26
  import torch
27
- import inspect
28
 
29
  from demucs import pretrained
30
  from demucs.apply import apply_model
@@ -34,7 +34,7 @@ logger = logging.get_logger(__name__)
34
 
35
 
36
  class MusicgenMelodyForLongFormConditionalGeneration(MusicgenMelodyForConditionalGeneration):
37
- stride_longform = 500
38
 
39
 
40
  def _prepare_audio_encoder_kwargs_for_longform_generation(
@@ -65,20 +65,14 @@ class MusicgenMelodyForLongFormConditionalGeneration(MusicgenMelodyForConditiona
65
  **kwargs,
66
  ):
67
  """
68
-
69
  Generates sequences of token ids for models with a language modeling head.
70
-
71
  <Tip warning={true}>
72
-
73
  Most generation-controlling parameters are set in `generation_config` which, if not passed, will be set to the
74
  model's default generation configuration. You can override any `generation_config` by passing the corresponding
75
  parameters to generate(), e.g. `.generate(inputs, num_beams=4, do_sample=True)`.
76
-
77
  For an overview of generation strategies and code examples, check out the [following
78
  guide](./generation_strategies).
79
-
80
  </Tip>
81
-
82
  Parameters:
83
  inputs (`torch.Tensor` of varying shape depending on the modality, *optional*):
84
  The sequence used as a prompt for the generation or as model inputs to the encoder. If `None` the
@@ -109,20 +103,15 @@ class MusicgenMelodyForLongFormConditionalGeneration(MusicgenMelodyForConditiona
109
  Ad hoc parametrization of `generate_config` and/or additional model-specific kwargs that will be
110
  forwarded to the `forward` function of the model. If the model is an encoder-decoder model, encoder
111
  specific kwargs should not be prefixed and decoder specific kwargs should be prefixed with *decoder_*.
112
-
113
  Return:
114
  [`~utils.ModelOutput`] or `torch.LongTensor`: A [`~utils.ModelOutput`] (if `return_dict_in_generate=True`
115
  or when `config.return_dict_in_generate=True`) or a `torch.FloatTensor`.
116
-
117
  If the model is *not* an encoder-decoder model (`model.config.is_encoder_decoder=False`), the possible
118
  [`~utils.ModelOutput`] types are:
119
-
120
  - [`~generation.GenerateDecoderOnlyOutput`],
121
  - [`~generation.GenerateBeamDecoderOnlyOutput`]
122
-
123
  If the model is an encoder-decoder model (`model.config.is_encoder_decoder=True`), the possible
124
  [`~utils.ModelOutput`] types are:
125
-
126
  - [`~generation.GenerateEncoderDecoderOutput`],
127
  - [`~generation.GenerateBeamEncoderDecoderOutput`]
128
  """
@@ -272,7 +261,10 @@ class MusicgenMelodyForLongFormConditionalGeneration(MusicgenMelodyForConditiona
272
  # the first timestamps corresponds to decoder_start_token
273
  current_generated_length = input_ids.shape[1] - 1
274
 
275
- while current_generated_length <= self.max_longform_generation_length:
 
 
 
276
  if is_greedy_gen_mode:
277
  if generation_config.num_return_sequences > 1:
278
  raise ValueError(
@@ -343,12 +335,12 @@ class MusicgenMelodyForLongFormConditionalGeneration(MusicgenMelodyForConditiona
343
  generated_tokens.append(output_ids[:, :, self.stride_longform:])
344
  else:
345
  generated_tokens.append(output_ids)
346
-
347
  current_generated_length += generated_tokens[-1].shape[-1]
348
 
349
  # append the frame dimension back to the audio codes
350
  # use last generated tokens as beginning of the newest generation
351
- output_ids = output_ids[None, :, :, (output_ids.shape[-1] - self.stride_longform):]
352
 
353
  model_kwargs = self._prepare_audio_encoder_kwargs_for_longform_generation(output_ids, model_kwargs)
354
 
@@ -417,18 +409,24 @@ processor = AutoProcessor.from_pretrained("facebook/musicgen-melody", revision="
417
 
418
  demucs = pretrained.get_model('htdemucs')
419
 
420
- title = "MusicGen Streaming"
421
 
422
  description = """
423
- Stream the outputs of the MusicGen text-to-music model by playing the generated audio as soon as the first chunk is ready.
424
- Demo uses [MusicGen Small](https://huggingface.co/facebook/musicgen-small) in the 🤗 Transformers library. Note that the
 
 
 
425
  demo works best on the Chrome browser. If there is no audio output, try switching browser to Chrome.
426
  """
427
 
428
  article = """
429
- ## How Does It Work?
 
 
430
 
431
  MusicGen is an auto-regressive transformer-based model, meaning it generates audio codes (tokens) in a causal fashion.
 
432
  At each decoding step, the model generates a new set of audio codes, conditional on the text input and all previous audio codes. From the
433
  frame rate of the [EnCodec model](https://huggingface.co/facebook/encodec_32khz) used to decode the generated codes to audio waveform,
434
  each set of generated audio codes corresponds to 0.02 seconds. This means we require a total of 1000 decoding steps to generate
@@ -436,18 +434,28 @@ each set of generated audio codes corresponds to 0.02 seconds. This means we req
436
 
437
  Rather than waiting for the entire audio sequence to be generated, which would require the full 1000 decoding steps, we can start
438
  playing the audio after a specified number of decoding steps have been reached, a technique known as [*streaming*](https://huggingface.co/docs/transformers/main/en/generation_strategies#streaming).
 
439
  For example, after 250 steps we have the first 5 seconds of audio ready, and so can play this without waiting for the remaining
440
  750 decoding steps to be complete. As we continue to generate with the MusicGen model, we append new chunks of generated audio
441
  to our output waveform on-the-fly. After the full 1000 decoding steps, the generated audio is complete, and is composed of four
442
  chunks of audio, each corresponding to 250 tokens.
443
 
444
- This method of playing incremental generations reduces the latency of the MusicGen model from the total time to generate 1000 tokens,
445
- to the time taken to play the first chunk of audio (250 tokens). This can result in significant improvements to perceived latency,
446
- particularly when the chunk size is chosen to be small. In practice, the chunk size should be tuned to your device: using a
447
- smaller chunk size will mean that the first chunk is ready faster, but should not be chosen so small that the model generates slower
 
448
  than the time it takes to play the audio.
449
 
450
  For details on how the streaming class works, check out the source code for the [MusicgenStreamer](https://huggingface.co/spaces/sanchit-gandhi/musicgen-streaming/blob/main/app.py#L52).
 
 
 
 
 
 
 
 
451
  """
452
 
453
 
@@ -465,7 +473,6 @@ class MusicgenStreamer(BaseStreamer):
465
  Streamer that stores playback-ready audio in a queue, to be used by a downstream application as an iterator. This is
466
  useful for applications that benefit from accessing the generated audio in a non-blocking way (e.g. in an interactive
467
  Gradio demo).
468
-
469
  Parameters:
470
  model (`MusicgenForConditionalGeneration`):
471
  The MusicGen model used to generate the audio waveform.
@@ -530,12 +537,23 @@ class MusicgenStreamer(BaseStreamer):
530
 
531
  # send the input_ids to the correct device
532
  input_ids = input_ids.to(self.audio_encoder.device)
 
533
 
534
- output_values = self.audio_encoder.decode(
535
- input_ids,
536
- audio_scales=[None],
537
- )
538
- audio_values = output_values.audio_values[0, 0]
 
 
 
 
 
 
 
 
 
 
539
  return audio_values.cpu().float().numpy()
540
 
541
  def put(self, value):
@@ -546,14 +564,13 @@ class MusicgenStreamer(BaseStreamer):
546
  if self.token_cache is None:
547
  self.token_cache = value
548
  else:
 
 
 
549
  self.token_cache = torch.concatenate([self.token_cache, value[:, None]], dim=-1)
550
 
551
  if self.token_cache.shape[-1] % self.play_steps == 0:
552
  audio_values = self.apply_delay_pattern_mask(self.token_cache)
553
- if self.is_longform:
554
- if not self.longform_stride_applied:
555
- self.to_yield = self.to_yield + self.longform_stride
556
- self.longform_stride_applied = True
557
  self.on_finalized_audio(audio_values[self.to_yield : -self.stride])
558
  self.to_yield += len(audio_values) - self.to_yield - self.stride
559
 
@@ -607,13 +624,14 @@ def wave_header_chunk(frame_input=b"", channels=1, sample_width=2, sample_rate=2
607
 
608
  return wav_buf.read()
609
 
610
- @spaces.GPU()
611
  def generate_audio(text_prompt, audio, audio_length_in_s=10.0, play_steps_in_s=2.0, seed=0):
612
  max_new_tokens = int(frame_rate * audio_length_in_s)
613
  play_steps = int(frame_rate * play_steps_in_s)
614
 
615
  if audio is not None:
616
- audio = convert_audio(torch.tensor(audio[1]).float(), audio[0], demucs.samplerate, demucs.audio_channels)
 
617
  audio = apply_model(demucs, audio[None])
618
 
619
  device = "cuda:0" if torch.cuda.is_available() else "cpu"
@@ -666,20 +684,20 @@ demo = gr.Interface(
666
  fn=generate_audio,
667
  inputs=[
668
  gr.Text(label="Prompt", value="80s pop track with synth and instrumentals"),
669
- gr.Audio(type="numpy", label="Conditioning audio"),
670
- gr.Slider(15, 60, value=45, step=5, label="Audio length in seconds"),
671
- gr.Slider(0.5, 2.5, value=1.5, step=0.5, label="Streaming interval in seconds", info="Lower = shorter chunks, lower latency, more codec steps"),
672
- gr.Number(value=5, precision=0, step=1, minimum=0, label="Seed for random generations"),
673
  ],
674
  outputs=[
675
  gr.Audio(label="Generated Music", autoplay=True, interactive=False, streaming=True)
676
  ],
677
  examples=[
678
- ["An 80s driving pop song with heavy drums and synth pads in the background", None, 30, 1.5, 5],
679
- ["A cheerful country song with acoustic guitars", None, 30, 1.5, 5],
680
- ["90s rock song with electric guitar and heavy drums", None, 30, 1.5, 5],
681
- ["a light and cheerly EDM track, with syncopated drums, aery pads, and strong emotions bpm: 130", None, 30, 1.5, 5],
682
- ["lofi slow bpm electro chill with organic samples", None, 30, 1.5, 5],
683
  ],
684
  title=title,
685
  description=description,
 
24
 
25
  import copy
26
  import torch
27
+ import torchaudio
28
 
29
  from demucs import pretrained
30
  from demucs.apply import apply_model
 
34
 
35
 
36
  class MusicgenMelodyForLongFormConditionalGeneration(MusicgenMelodyForConditionalGeneration):
37
+ stride_longform = 750
38
 
39
 
40
  def _prepare_audio_encoder_kwargs_for_longform_generation(
 
65
  **kwargs,
66
  ):
67
  """
 
68
  Generates sequences of token ids for models with a language modeling head.
 
69
  <Tip warning={true}>
 
70
  Most generation-controlling parameters are set in `generation_config` which, if not passed, will be set to the
71
  model's default generation configuration. You can override any `generation_config` by passing the corresponding
72
  parameters to generate(), e.g. `.generate(inputs, num_beams=4, do_sample=True)`.
 
73
  For an overview of generation strategies and code examples, check out the [following
74
  guide](./generation_strategies).
 
75
  </Tip>
 
76
  Parameters:
77
  inputs (`torch.Tensor` of varying shape depending on the modality, *optional*):
78
  The sequence used as a prompt for the generation or as model inputs to the encoder. If `None` the
 
103
  Ad hoc parametrization of `generate_config` and/or additional model-specific kwargs that will be
104
  forwarded to the `forward` function of the model. If the model is an encoder-decoder model, encoder
105
  specific kwargs should not be prefixed and decoder specific kwargs should be prefixed with *decoder_*.
 
106
  Return:
107
  [`~utils.ModelOutput`] or `torch.LongTensor`: A [`~utils.ModelOutput`] (if `return_dict_in_generate=True`
108
  or when `config.return_dict_in_generate=True`) or a `torch.FloatTensor`.
 
109
  If the model is *not* an encoder-decoder model (`model.config.is_encoder_decoder=False`), the possible
110
  [`~utils.ModelOutput`] types are:
 
111
  - [`~generation.GenerateDecoderOnlyOutput`],
112
  - [`~generation.GenerateBeamDecoderOnlyOutput`]
 
113
  If the model is an encoder-decoder model (`model.config.is_encoder_decoder=True`), the possible
114
  [`~utils.ModelOutput`] types are:
 
115
  - [`~generation.GenerateEncoderDecoderOutput`],
116
  - [`~generation.GenerateBeamEncoderDecoderOutput`]
117
  """
 
261
  # the first timestamps corresponds to decoder_start_token
262
  current_generated_length = input_ids.shape[1] - 1
263
 
264
+ max_new_tokens = generation_config.max_new_tokens
265
+
266
+ while current_generated_length + 20 <= max_longform_generation_length:
267
+ generation_config.max_new_tokens = min(max_new_tokens, max_longform_generation_length - current_generated_length)
268
  if is_greedy_gen_mode:
269
  if generation_config.num_return_sequences > 1:
270
  raise ValueError(
 
335
  generated_tokens.append(output_ids[:, :, self.stride_longform:])
336
  else:
337
  generated_tokens.append(output_ids)
338
+
339
  current_generated_length += generated_tokens[-1].shape[-1]
340
 
341
  # append the frame dimension back to the audio codes
342
  # use last generated tokens as beginning of the newest generation
343
+ output_ids = output_ids[None, :, :, - self.stride_longform:]
344
 
345
  model_kwargs = self._prepare_audio_encoder_kwargs_for_longform_generation(output_ids, model_kwargs)
346
 
 
409
 
410
  demucs = pretrained.get_model('htdemucs')
411
 
412
+ title = "Streaming Long-form MusicGen"
413
 
414
  description = """
415
+ Stream the outputs of the MusicGen Melody text-to-music model by playing the generated audio as soon as the first chunk is ready.
416
+
417
+ The generation loop is adapted to perform **long-form** music generation. In this demo, we limit the duration of the music generated, but in theory, it could run **endlessly**.
418
+
419
+ Demo uses [MusicGen Melody](https://huggingface.co/facebook/musicgen-melody) in the 🤗 Transformers library. Note that the
420
  demo works best on the Chrome browser. If there is no audio output, try switching browser to Chrome.
421
  """
422
 
423
  article = """
424
+ ## FAQ
425
+
426
+ ### How Does It Work?
427
 
428
  MusicGen is an auto-regressive transformer-based model, meaning it generates audio codes (tokens) in a causal fashion.
429
+
430
  At each decoding step, the model generates a new set of audio codes, conditional on the text input and all previous audio codes. From the
431
  frame rate of the [EnCodec model](https://huggingface.co/facebook/encodec_32khz) used to decode the generated codes to audio waveform,
432
  each set of generated audio codes corresponds to 0.02 seconds. This means we require a total of 1000 decoding steps to generate
 
434
 
435
  Rather than waiting for the entire audio sequence to be generated, which would require the full 1000 decoding steps, we can start
436
  playing the audio after a specified number of decoding steps have been reached, a technique known as [*streaming*](https://huggingface.co/docs/transformers/main/en/generation_strategies#streaming).
437
+
438
  For example, after 250 steps we have the first 5 seconds of audio ready, and so can play this without waiting for the remaining
439
  750 decoding steps to be complete. As we continue to generate with the MusicGen model, we append new chunks of generated audio
440
  to our output waveform on-the-fly. After the full 1000 decoding steps, the generated audio is complete, and is composed of four
441
  chunks of audio, each corresponding to 250 tokens.
442
 
443
+ This method of playing incremental generations **reduces the latency** of the MusicGen model from the total time to generate 1000 tokens,
444
+ to the time taken to play the first chunk of audio (250 tokens). This can result in **significant improvements** to perceived latency,
445
+ particularly when the chunk size is chosen to be small.
446
+
447
+ In practice, the chunk size should be tuned to your device: using a smaller chunk size will mean that the first chunk is ready faster, but should not be chosen so small that the model generates slower
448
  than the time it takes to play the audio.
449
 
450
  For details on how the streaming class works, check out the source code for the [MusicgenStreamer](https://huggingface.co/spaces/sanchit-gandhi/musicgen-streaming/blob/main/app.py#L52).
451
+
452
+ ### Could this be used for stereo music generation?
453
+
454
+ In theory, yes, but you would have to adapt the current demo a bit and use a checkpoint specifically made for stereo generation, for example, this [one](https://huggingface.co/facebook/musicgen-stereo-melody).
455
+
456
+ ### Why is there a delay between the moment the first chunk is generated and the moment the audio starts playing?
457
+
458
+ This behaviour is specific to gradio and the different components it uses. If you ever adapt this demo for a streaming use-case, you could have lower latency.
459
  """
460
 
461
 
 
473
  Streamer that stores playback-ready audio in a queue, to be used by a downstream application as an iterator. This is
474
  useful for applications that benefit from accessing the generated audio in a non-blocking way (e.g. in an interactive
475
  Gradio demo).
 
476
  Parameters:
477
  model (`MusicgenForConditionalGeneration`):
478
  The MusicGen model used to generate the audio waveform.
 
537
 
538
  # send the input_ids to the correct device
539
  input_ids = input_ids.to(self.audio_encoder.device)
540
+
541
 
542
+ if self.decoder.config.audio_channels == 1:
543
+ output_values = self.audio_encoder.decode(
544
+ input_ids,
545
+ audio_scales=[None],
546
+ ).audio_values
547
+ else:
548
+ codec_outputs_left = self.audio_encoder.decode(input_ids[:, :, ::2, :], audio_scales=[None])
549
+ output_values_left = codec_outputs_left.audio_values
550
+
551
+ codec_outputs_right = self.audio_encoder.decode(input_ids[:, :, 1::2, :], audio_scales=[None])
552
+ output_values_right = codec_outputs_right.audio_values
553
+
554
+ output_values = torch.cat([output_values_left, output_values_right], dim=1)
555
+
556
+ audio_values = output_values[0, 0]
557
  return audio_values.cpu().float().numpy()
558
 
559
  def put(self, value):
 
564
  if self.token_cache is None:
565
  self.token_cache = value
566
  else:
567
+ # if self.is_longform and not self.longform_stride_applied:
568
+ # value = value[self.longform_stride:]
569
+ # self.longform_stride_applied = True
570
  self.token_cache = torch.concatenate([self.token_cache, value[:, None]], dim=-1)
571
 
572
  if self.token_cache.shape[-1] % self.play_steps == 0:
573
  audio_values = self.apply_delay_pattern_mask(self.token_cache)
 
 
 
 
574
  self.on_finalized_audio(audio_values[self.to_yield : -self.stride])
575
  self.to_yield += len(audio_values) - self.to_yield - self.stride
576
 
 
624
 
625
  return wav_buf.read()
626
 
627
+ @spaces.GPU(duration=90)
628
  def generate_audio(text_prompt, audio, audio_length_in_s=10.0, play_steps_in_s=2.0, seed=0):
629
  max_new_tokens = int(frame_rate * audio_length_in_s)
630
  play_steps = int(frame_rate * play_steps_in_s)
631
 
632
  if audio is not None:
633
+ audio = torchaudio.load(audio)
634
+ audio = convert_audio(audio[0], audio[1], demucs.samplerate, demucs.audio_channels)
635
  audio = apply_model(demucs, audio[None])
636
 
637
  device = "cuda:0" if torch.cuda.is_available() else "cpu"
 
684
  fn=generate_audio,
685
  inputs=[
686
  gr.Text(label="Prompt", value="80s pop track with synth and instrumentals"),
687
+ gr.Audio(type="filepath", label="Conditioning audio. Use this for melody-guided generation."),
688
+ gr.Slider(35, 60, value=45, step=5, label="(Approximate) Audio length in seconds."),
689
+ gr.Slider(0.5, 2.5, value=1.5, step=0.5, label="Streaming interval in seconds.", info="Lower = shorter chunks, lower latency, more codec steps."),
690
+ gr.Number(value=5, precision=0, step=1, minimum=0, label="Seed for random generations."),
691
  ],
692
  outputs=[
693
  gr.Audio(label="Generated Music", autoplay=True, interactive=False, streaming=True)
694
  ],
695
  examples=[
696
+ ["An 80s driving pop song with heavy drums and synth pads in the background", None, 45, 1.5, 5],
697
+ ["Bossa nova with guitars and synthesizer", None, 45, 1.5, 5],
698
+ ["90s rock song with electric guitar and heavy drums", None, 45, 1.5, 5],
699
+ ["a light and cheerly EDM track, with syncopated drums, aery pads, and strong emotions bpm: 130", None, 45, 1.5, 5],
700
+ ["lofi slow bpm electro chill with organic samples", None, 45, 1.5, 5],
701
  ],
702
  title=title,
703
  description=description,