sanchit-gandhi committed on
Commit
290deb7
·
1 Parent(s): c8a6713

fix examples

Browse files
Files changed (1) hide show
  1. app.py +15 -6
app.py CHANGED
@@ -24,6 +24,7 @@ model = ParlerTTSForConditionalGeneration.from_pretrained(
24
  jenny_model = ParlerTTSForConditionalGeneration.from_pretrained(
25
  jenny_repo_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True
26
  ).to(device)
 
27
  tokenizer = AutoTokenizer.from_pretrained(repo_id)
28
  feature_extractor = AutoFeatureExtractor.from_pretrained(repo_id)
29
 
@@ -34,41 +35,50 @@ default_text = "Please surprise me and speak in whatever voice you enjoy."
34
  examples = [
35
  [
36
  "Remember - this is only the first iteration of the model! To improve the prosody and naturalness of the speech further, we're scaling up the amount of training data by a factor of five times.",
37
- "A male speaker with a low-pitched voice delivering his words at a fast pace in a small, confined space with a very clear audio and an animated tone."
 
38
  ],
39
  [
40
  "'This is the best time of my life, Bartley,' she said happily.",
41
  "A female speaker with a slightly low-pitched, quite monotone voice delivers her words at a slightly faster-than-average pace in a confined space with very clear audio.",
 
42
  ],
43
  [
44
  "Montrose also, after having experienced still more variety of good and bad fortune, threw down his arms, and retired out of the kingdom.",
45
  "A male speaker with a slightly high-pitched voice delivering his words at a slightly slow pace in a small, confined space with a touch of background noise and a quite monotone tone.",
 
46
  ],
47
  [
48
  "Montrose also, after having experienced still more variety of good and bad fortune, threw down his arms, and retired out of the kingdom.",
49
  "A male speaker with a low-pitched voice delivers his words at a fast pace and an animated tone, in a very spacious environment, accompanied by noticeable background noise.",
 
50
  ],
51
  ]
52
 
53
  jenny_examples = [
54
  [
55
  "Remember - this is only the first iteration of the model! To improve the prosody and naturalness of the speech further, we're scaling up the amount of training data by a factor of five times.",
56
- "Jenny speaks at a fast pace in a small, confined space with a very clear audio and an animated tone."
 
57
  ],
58
  [
59
  "'This is the best time of my life, Bartley,' she said happily.",
60
  "Jenny speaks in quite a monotone voice at a slightly faster-than-average pace in a confined space with very clear audio.",
 
61
  ],
62
  [
63
  "Montrose also, after having experienced still more variety of good and bad fortune, threw down his arms, and retired out of the kingdom.",
64
  "Jenny delivers her words at a slightly slow pace in a small, confined space with a touch of background noise and a quite monotone tone.",
 
65
  ],
66
  [
67
  "Montrose also, after having experienced still more variety of good and bad fortune, threw down his arms, and retired out of the kingdom.",
68
  "Jenny delivers words at a fast pace and an animated tone, in a very spacious environment, accompanied by noticeable background noise.",
 
69
  ],
70
  ]
71
 
 
72
  class ParlerTTSStreamer(BaseStreamer):
73
  def __init__(
74
  self,
@@ -120,7 +130,7 @@ class ParlerTTSStreamer(BaseStreamer):
120
  self.timeout = timeout
121
 
122
  def apply_delay_pattern_mask(self, input_ids):
123
- # build the delay pattern mask for offsetting each codebook prediction by 1 (this behaviour is specific to MusicGen)
124
  _, delay_pattern_mask = self.decoder.build_delay_pattern_mask(
125
  input_ids[:, :1],
126
  bos_token_id=self.generation_config.bos_token_id,
@@ -149,7 +159,7 @@ class ParlerTTSStreamer(BaseStreamer):
149
  def put(self, value):
150
  batch_size = value.shape[0] // self.decoder.num_codebooks
151
  if batch_size > 1:
152
- raise ValueError("MusicgenStreamer only supports batch size 1")
153
 
154
  if self.token_cache is None:
155
  self.token_cache = value
@@ -336,8 +346,7 @@ with gr.Blocks(css=css) as block:
336
  play_seconds = gr.Slider(2.5, 5.0, value=2.5, step=0.5, label="Streaming interval in seconds", info="Lower = shorter chunks, lower latency, more codec steps"),
337
  run_button = gr.Button("Generate Audio", variant="primary")
338
  with gr.Column():
339
- audio_out = gr.Audio(label="Parler-TTS generation", type="numpy", elem_id="audio_out", streaming=True,
340
- autoplay=True)
341
 
342
  inputs = [input_text, description, play_seconds]
343
  outputs = [audio_out]
 
24
  jenny_model = ParlerTTSForConditionalGeneration.from_pretrained(
25
  jenny_repo_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True
26
  ).to(device)
27
+
28
  tokenizer = AutoTokenizer.from_pretrained(repo_id)
29
  feature_extractor = AutoFeatureExtractor.from_pretrained(repo_id)
30
 
 
35
  examples = [
36
  [
37
  "Remember - this is only the first iteration of the model! To improve the prosody and naturalness of the speech further, we're scaling up the amount of training data by a factor of five times.",
38
+ "A male speaker with a low-pitched voice delivering his words at a fast pace in a small, confined space with a very clear audio and an animated tone.",
39
+ 2.5,
40
  ],
41
  [
42
  "'This is the best time of my life, Bartley,' she said happily.",
43
  "A female speaker with a slightly low-pitched, quite monotone voice delivers her words at a slightly faster-than-average pace in a confined space with very clear audio.",
44
+ 2.5,
45
  ],
46
  [
47
  "Montrose also, after having experienced still more variety of good and bad fortune, threw down his arms, and retired out of the kingdom.",
48
  "A male speaker with a slightly high-pitched voice delivering his words at a slightly slow pace in a small, confined space with a touch of background noise and a quite monotone tone.",
49
+ 2.5,
50
  ],
51
  [
52
  "Montrose also, after having experienced still more variety of good and bad fortune, threw down his arms, and retired out of the kingdom.",
53
  "A male speaker with a low-pitched voice delivers his words at a fast pace and an animated tone, in a very spacious environment, accompanied by noticeable background noise.",
54
+ 2.5,
55
  ],
56
  ]
57
 
58
  jenny_examples = [
59
  [
60
  "Remember - this is only the first iteration of the model! To improve the prosody and naturalness of the speech further, we're scaling up the amount of training data by a factor of five times.",
61
+ "Jenny speaks at a fast pace in a small, confined space with a very clear audio and an animated tone.",
62
+ 2.5,
63
  ],
64
  [
65
  "'This is the best time of my life, Bartley,' she said happily.",
66
  "Jenny speaks in quite a monotone voice at a slightly faster-than-average pace in a confined space with very clear audio.",
67
+ 2.5,
68
  ],
69
  [
70
  "Montrose also, after having experienced still more variety of good and bad fortune, threw down his arms, and retired out of the kingdom.",
71
  "Jenny delivers her words at a slightly slow pace in a small, confined space with a touch of background noise and a quite monotone tone.",
72
+ 2.5,
73
  ],
74
  [
75
  "Montrose also, after having experienced still more variety of good and bad fortune, threw down his arms, and retired out of the kingdom.",
76
  "Jenny delivers words at a fast pace and an animated tone, in a very spacious environment, accompanied by noticeable background noise.",
77
+ 2.5,
78
  ],
79
  ]
80
 
81
+
82
  class ParlerTTSStreamer(BaseStreamer):
83
  def __init__(
84
  self,
 
130
  self.timeout = timeout
131
 
132
  def apply_delay_pattern_mask(self, input_ids):
133
+ # build the delay pattern mask for offsetting each codebook prediction by 1 (this behaviour is specific to Parler)
134
  _, delay_pattern_mask = self.decoder.build_delay_pattern_mask(
135
  input_ids[:, :1],
136
  bos_token_id=self.generation_config.bos_token_id,
 
159
  def put(self, value):
160
  batch_size = value.shape[0] // self.decoder.num_codebooks
161
  if batch_size > 1:
162
+ raise ValueError("ParlerTTSStreamer only supports batch size 1")
163
 
164
  if self.token_cache is None:
165
  self.token_cache = value
 
346
  play_seconds = gr.Slider(2.5, 5.0, value=2.5, step=0.5, label="Streaming interval in seconds", info="Lower = shorter chunks, lower latency, more codec steps"),
347
  run_button = gr.Button("Generate Audio", variant="primary")
348
  with gr.Column():
349
+ audio_out = gr.Audio(label="Parler-TTS generation", type="numpy", elem_id="audio_out", streaming=True, autoplay=True)
 
350
 
351
  inputs = [input_text, description, play_seconds]
352
  outputs = [audio_out]