parler-tts-streaming-webrtc

Running on Zero

App Files Files Community

sanchit-gandhi committed on Apr 24, 2024

Commit

290deb7

1 Parent(s): c8a6713

fix examples

Browse files

Files changed (1) hide show

app.py +15 -6

app.py CHANGED Viewed

@@ -24,6 +24,7 @@ model = ParlerTTSForConditionalGeneration.from_pretrained(
 jenny_model = ParlerTTSForConditionalGeneration.from_pretrained(
     jenny_repo_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True
 ).to(device)
 tokenizer = AutoTokenizer.from_pretrained(repo_id)
 feature_extractor = AutoFeatureExtractor.from_pretrained(repo_id)
@@ -34,41 +35,50 @@ default_text = "Please surprise me and speak in whatever voice you enjoy."
 examples = [
     [
         "Remember - this is only the first iteration of the model! To improve the prosody and naturalness of the speech further, we're scaling up the amount of training data by a factor of five times.",
-        "A male speaker with a low-pitched voice delivering his words at a fast pace in a small, confined space with a very clear audio and an animated tone."
     ],
     [
         "'This is the best time of my life, Bartley,' she said happily.",
         "A female speaker with a slightly low-pitched, quite monotone voice delivers her words at a slightly faster-than-average pace in a confined space with very clear audio.",
     ],
     [
         "Montrose also, after having experienced still more variety of good and bad fortune, threw down his arms, and retired out of the kingdom.",
         "A male speaker with a slightly high-pitched voice delivering his words at a slightly slow pace in a small, confined space with a touch of background noise and a quite monotone tone.",
     ],
     [
         "Montrose also, after having experienced still more variety of good and bad fortune, threw down his arms, and retired out of the kingdom.",
         "A male speaker with a low-pitched voice delivers his words at a fast pace and an animated tone, in a very spacious environment, accompanied by noticeable background noise.",
     ],
 ]
 jenny_examples = [
     [
         "Remember - this is only the first iteration of the model! To improve the prosody and naturalness of the speech further, we're scaling up the amount of training data by a factor of five times.",
-        "Jenny speaks at a fast pace in a small, confined space with a very clear audio and an animated tone."
     ],
     [
         "'This is the best time of my life, Bartley,' she said happily.",
         "Jenny speaks in quite a monotone voice at a slightly faster-than-average pace in a confined space with very clear audio.",
     ],
     [
         "Montrose also, after having experienced still more variety of good and bad fortune, threw down his arms, and retired out of the kingdom.",
         "Jenny delivers her words at a slightly slow pace in a small, confined space with a touch of background noise and a quite monotone tone.",
     ],
     [
         "Montrose also, after having experienced still more variety of good and bad fortune, threw down his arms, and retired out of the kingdom.",
         "Jenny delivers words at a fast pace and an animated tone, in a very spacious environment, accompanied by noticeable background noise.",
     ],
 ]
 class ParlerTTSStreamer(BaseStreamer):
     def __init__(
         self,
@@ -120,7 +130,7 @@ class ParlerTTSStreamer(BaseStreamer):
         self.timeout = timeout
     def apply_delay_pattern_mask(self, input_ids):
-        # build the delay pattern mask for offsetting each codebook prediction by 1 (this behaviour is specific to MusicGen)
         _, delay_pattern_mask = self.decoder.build_delay_pattern_mask(
             input_ids[:, :1],
             bos_token_id=self.generation_config.bos_token_id,
@@ -149,7 +159,7 @@ class ParlerTTSStreamer(BaseStreamer):
     def put(self, value):
         batch_size = value.shape[0] // self.decoder.num_codebooks
         if batch_size > 1:
-            raise ValueError("MusicgenStreamer only supports batch size 1")
         if self.token_cache is None:
             self.token_cache = value
@@ -336,8 +346,7 @@ with gr.Blocks(css=css) as block:
                 play_seconds = gr.Slider(2.5, 5.0, value=2.5, step=0.5, label="Streaming interval in seconds", info="Lower = shorter chunks, lower latency, more codec steps"),
                 run_button = gr.Button("Generate Audio", variant="primary")
             with gr.Column():
-                audio_out = gr.Audio(label="Parler-TTS generation", type="numpy", elem_id="audio_out", streaming=True,
-                                     autoplay=True)
         inputs = [input_text, description, play_seconds]
         outputs = [audio_out]

 jenny_model = ParlerTTSForConditionalGeneration.from_pretrained(
     jenny_repo_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True
 ).to(device)
 tokenizer = AutoTokenizer.from_pretrained(repo_id)
 feature_extractor = AutoFeatureExtractor.from_pretrained(repo_id)
 examples = [
     [
         "Remember - this is only the first iteration of the model! To improve the prosody and naturalness of the speech further, we're scaling up the amount of training data by a factor of five times.",
+        "A male speaker with a low-pitched voice delivering his words at a fast pace in a small, confined space with a very clear audio and an animated tone.",
+        2.5,
     ],
     [
         "'This is the best time of my life, Bartley,' she said happily.",
         "A female speaker with a slightly low-pitched, quite monotone voice delivers her words at a slightly faster-than-average pace in a confined space with very clear audio.",
+        2.5,
     ],
     [
         "Montrose also, after having experienced still more variety of good and bad fortune, threw down his arms, and retired out of the kingdom.",
         "A male speaker with a slightly high-pitched voice delivering his words at a slightly slow pace in a small, confined space with a touch of background noise and a quite monotone tone.",
+        2.5,
     ],
     [
         "Montrose also, after having experienced still more variety of good and bad fortune, threw down his arms, and retired out of the kingdom.",
         "A male speaker with a low-pitched voice delivers his words at a fast pace and an animated tone, in a very spacious environment, accompanied by noticeable background noise.",
+        2.5,
     ],
 ]
 jenny_examples = [
     [
         "Remember - this is only the first iteration of the model! To improve the prosody and naturalness of the speech further, we're scaling up the amount of training data by a factor of five times.",
+        "Jenny speaks at a fast pace in a small, confined space with a very clear audio and an animated tone.",
+        2.5,
     ],
     [
         "'This is the best time of my life, Bartley,' she said happily.",
         "Jenny speaks in quite a monotone voice at a slightly faster-than-average pace in a confined space with very clear audio.",
+        2.5,
     ],
     [
         "Montrose also, after having experienced still more variety of good and bad fortune, threw down his arms, and retired out of the kingdom.",
         "Jenny delivers her words at a slightly slow pace in a small, confined space with a touch of background noise and a quite monotone tone.",
+        2.5,
     ],
     [
         "Montrose also, after having experienced still more variety of good and bad fortune, threw down his arms, and retired out of the kingdom.",
         "Jenny delivers words at a fast pace and an animated tone, in a very spacious environment, accompanied by noticeable background noise.",
+        2.5,
     ],
 ]
 class ParlerTTSStreamer(BaseStreamer):
     def __init__(
         self,
         self.timeout = timeout
     def apply_delay_pattern_mask(self, input_ids):
+        # build the delay pattern mask for offsetting each codebook prediction by 1 (this behaviour is specific to Parler)
         _, delay_pattern_mask = self.decoder.build_delay_pattern_mask(
             input_ids[:, :1],
             bos_token_id=self.generation_config.bos_token_id,
     def put(self, value):
         batch_size = value.shape[0] // self.decoder.num_codebooks
         if batch_size > 1:
+            raise ValueError("ParlerTTSStreamer only supports batch size 1")
         if self.token_cache is None:
             self.token_cache = value
                 play_seconds = gr.Slider(2.5, 5.0, value=2.5, step=0.5, label="Streaming interval in seconds", info="Lower = shorter chunks, lower latency, more codec steps"),
                 run_button = gr.Button("Generate Audio", variant="primary")
             with gr.Column():
+                audio_out = gr.Audio(label="Parler-TTS generation", type="numpy", elem_id="audio_out", streaming=True, autoplay=True)
         inputs = [input_text, description, play_seconds]
         outputs = [audio_out]