Spaces:

sagegu
/

gradio_space

Running

App Files Files Community

sagegu committed 11 days ago

Commit

135bd1e

1 Parent(s): 0f5fe38

update

Browse files

Files changed (3) hide show

app.py +132 -13
requirements.txt +4 -3
streamer.py +137 -0

app.py CHANGED Viewed

@@ -1,18 +1,137 @@
 import gradio as gr
-from transformers import pipeline
-pipeline = pipeline(task="image-classification", model="julien-c/hotdog-not-hotdog")
-def predict(input_img):
-    predictions = pipeline(input_img)
-    return input_img, {p["label"]: p["score"] for p in predictions}
-gradio_app = gr.Interface(
-    predict,
-    inputs=gr.Image(label="Select hot dog candidate", sources=['upload', 'webcam'], type="pil"),
-    outputs=[gr.Image(label="Processed Image"), gr.Label(label="Result", num_top_classes=2)],
-    title="Hot Dog? Or Not?",
-)
-if __name__ == "__main__":
-    gradio_app.launch()

+import io
+from threading import Thread
+import random
+import os
+import numpy as np
+import spaces
 import gradio as gr
+import torch
+from parler_tts import ParlerTTSForConditionalGeneration
+from pydub import AudioSegment
+from transformers import AutoTokenizer, AutoFeatureExtractor, set_seed
+from huggingface_hub import InferenceClient
+from streamer import ParlerTTSStreamer
+import time
+device = "cuda:0" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu"
+torch_dtype = torch.float16 if device != "cpu" else torch.float32
+repo_id = "parler-tts/parler_tts_mini_v0.1"
+jenny_repo_id = "ylacombe/parler-tts-mini-jenny-30H"
+model = ParlerTTSForConditionalGeneration.from_pretrained(
+    jenny_repo_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True
+).to(device)
+client = InferenceClient(token=os.getenv("HF_TOKEN"))
+tokenizer = AutoTokenizer.from_pretrained(repo_id)
+feature_extractor = AutoFeatureExtractor.from_pretrained(repo_id)
+SAMPLE_RATE = feature_extractor.sampling_rate
+SEED = 42
+def numpy_to_mp3(audio_array, sampling_rate):
+    # Normalize audio_array if it's floating-point
+    if np.issubdtype(audio_array.dtype, np.floating):
+        max_val = np.max(np.abs(audio_array))
+        audio_array = (audio_array / max_val) * 32767  # Normalize to 16-bit range
+        audio_array = audio_array.astype(np.int16)
+    # Create an audio segment from the numpy array
+    audio_segment = AudioSegment(
+        audio_array.tobytes(),
+        frame_rate=sampling_rate,
+        sample_width=audio_array.dtype.itemsize,
+        channels=1
+    )
+    # Export the audio segment to MP3 bytes - use a high bitrate to maximise quality
+    mp3_io = io.BytesIO()
+    audio_segment.export(mp3_io, format="mp3", bitrate="320k")
+    # Get the MP3 bytes
+    mp3_bytes = mp3_io.getvalue()
+    mp3_io.close()
+    return mp3_bytes
+sampling_rate = model.audio_encoder.config.sampling_rate  # read_response uses it for chunk durations and as the MP3 frame rate
+frame_rate = model.audio_encoder.config.frame_rate  # read_response multiplies it by seconds to get play_steps
+def generate_response(audio):
+    gr.Info("Transcribing Audio", duration=5)
+    question = client.automatic_speech_recognition(audio).text
+    messages = [{"role": "system", "content": ("You are a magic 8 ball."
+                                               "Someone will present to you a situation or question and your job "
+                                               "is to answer with a cryptic addage or proverb such as "
+                                               "'curiosity killed the cat' or 'The early bird gets the worm'."
+                                               "Keep your answers short and do not include the phrase 'Magic 8 Ball' in your response. If the question does not make sense or is off-topic, say 'Foolish questions get foolish answers.'"
+                                               "For example, 'Magic 8 Ball, should I get a dog?', 'A dog is ready for you but are you ready for the dog?'")},
+                {"role": "user", "content": f"Magic 8 Ball please answer this question -  {question}"}]
+    response = client.chat_completion(messages, max_tokens=64, seed=random.randint(1, 5000),
+                                      model="mistralai/Mistral-7B-Instruct-v0.3")
+    response = response.choices[0].message.content.replace("Magic 8 Ball", "")
+    return response, None, None
+@spaces.GPU
+def read_response(answer):
+    play_steps_in_s = 2.0
+    play_steps = int(frame_rate * play_steps_in_s)
+    description = "Jenny speaks at an average pace with a calm delivery in a very confined sounding environment with clear audio quality."
+    description_tokens = tokenizer(description, return_tensors="pt").to(device)
+    streamer = ParlerTTSStreamer(model, device=device, play_steps=play_steps)
+    prompt = tokenizer(answer, return_tensors="pt").to(device)
+    generation_kwargs = dict(
+        input_ids=description_tokens.input_ids,
+        prompt_input_ids=prompt.input_ids,
+        streamer=streamer,
+        do_sample=True,
+        temperature=1.0,
+        min_new_tokens=10,
+    )
+    set_seed(SEED)
+    thread = Thread(target=model.generate, kwargs=generation_kwargs)
+    thread.start()
+    start = time.time()
+    for new_audio in streamer:
+        print(
+            f"Sample of length: {round(new_audio.shape[0] / sampling_rate, 2)} seconds after {time.time() - start} seconds")
+        yield answer, numpy_to_mp3(new_audio, sampling_rate=sampling_rate)
+with gr.Blocks() as block:
+    gr.HTML(
+        f"""
+        <h1 style='text-align: center;'> Magic 8 Ball 🎱 </h1>
+        <h3 style='text-align: center;'> Ask a question and receive wisdom </h3>
+        <p style='text-align: center;'> Powered by <a href="https://github.com/huggingface/parler-tts"> Parler-TTS</a>
+        """
+    )
+    with gr.Group():
+        with gr.Row():
+            audio_out = gr.Audio(label="Spoken Answer", streaming=True, autoplay=True, loop=False)
+            answer = gr.Textbox(label="Answer")
+            state = gr.State()
+        with gr.Row():
+            audio_in = gr.Audio(label="Speak you question", sources="microphone", type="filepath")
+    with gr.Row():
+        gr.HTML(
+            """<h3 style='text-align: center;'> Examples: 'What is the meaning of life?', 'Should I get a dog?' </h3>""")
+    audio_in.stop_recording(generate_response, audio_in, [state, answer, audio_out]).then(fn=read_response,
+                                                                                          inputs=state,
+                                                                                          outputs=[answer, audio_out])
+block.launch()

requirements.txt CHANGED Viewed

@@ -1,3 +1,4 @@
-huggingface_hub==0.25.2
-transformers
-torch

+https://gradio-builds.s3.amazonaws.com/bed454c3d22cfacedc047eb3b0ba987b485ac3fd/gradio-4.40.0-py3-none-any.whl
+git+https://github.com/huggingface/parler-tts.git
+accelerate
+nltk

streamer.py ADDED Viewed

	@@ -0,0 +1,137 @@

+from queue import Queue
+from transformers.generation.streamers import BaseStreamer
+from typing import Optional
+from parler_tts import ParlerTTSForConditionalGeneration
+import numpy as np
+import math
+import torch
+class ParlerTTSStreamer(BaseStreamer):
+    def __init__(
+        self,
+        model: ParlerTTSForConditionalGeneration,
+        device: Optional[str] = None,
+        play_steps: Optional[int] = 10,
+        stride: Optional[int] = None,
+        timeout: Optional[float] = None,
+    ):
+        """
+        Streamer that stores playback-ready audio in a queue, to be used by a downstream application as an iterator. This is
+        useful for applications that benefit from accessing the generated audio in a non-blocking way (e.g. in an interactive
+        Gradio demo).
+        Parameters:
+            model (`ParlerTTSForConditionalGeneration`):
+                The Parler-TTS model used to generate the audio waveform.
+            device (`str`, *optional*):
+                The torch device on which to run the computation. If `None`, will default to the device of the model.
+            play_steps (`int`, *optional*, defaults to 10):
+                The number of generation steps with which to return the generated audio array. Using fewer steps will
+                mean the first chunk is ready faster, but will require more codec decoding steps overall. This value
+                should be tuned to your device and latency requirements.
+            stride (`int`, *optional*):
+                The window (stride) between adjacent audio samples. Using a stride between adjacent audio samples reduces
+                the hard boundary between them, giving smoother playback. If `None`, will default to a value equivalent to
+                play_steps // 6 in the audio space.
+            timeout (`int`, *optional*):
+                The timeout for the audio queue. If `None`, the queue will block indefinitely. Useful to handle exceptions
+                in `.generate()`, when it is called in a separate thread.
+        """
+        self.decoder = model.decoder
+        self.audio_encoder = model.audio_encoder
+        self.generation_config = model.generation_config
+        self.device = device if device is not None else model.device
+        # variables used in the streaming process
+        self.play_steps = play_steps
+        if stride is not None:
+            self.stride = stride
+        else:
+            hop_length = math.floor(self.audio_encoder.config.sampling_rate / self.audio_encoder.config.frame_rate)
+            self.stride = hop_length * (play_steps - self.decoder.num_codebooks) // 6
+        self.token_cache = None
+        self.to_yield = 0
+        # varibles used in the thread process
+        self.audio_queue = Queue()
+        self.stop_signal = None
+        self.timeout = timeout
+    def apply_delay_pattern_mask(self, input_ids):
+        # build the delay pattern mask for offsetting each codebook prediction by 1 (this behaviour is specific to Parler)
+        _, delay_pattern_mask = self.decoder.build_delay_pattern_mask(
+            input_ids[:, :1],
+            bos_token_id=self.generation_config.bos_token_id,
+            pad_token_id=self.generation_config.decoder_start_token_id,
+            max_length=input_ids.shape[-1],
+        )
+        # apply the pattern mask to the input ids
+        input_ids = self.decoder.apply_delay_pattern_mask(input_ids, delay_pattern_mask)
+        # revert the pattern delay mask by filtering the pad token id
+        mask = (delay_pattern_mask != self.generation_config.bos_token_id) & (delay_pattern_mask != self.generation_config.pad_token_id)
+        input_ids = input_ids[mask].reshape(1, self.decoder.num_codebooks, -1)
+        # append the frame dimension back to the audio codes
+        input_ids = input_ids[None, ...]
+        # send the input_ids to the correct device
+        input_ids = input_ids.to(self.audio_encoder.device)
+        decode_sequentially = (
+            self.generation_config.bos_token_id in input_ids
+            or self.generation_config.pad_token_id in input_ids
+            or self.generation_config.eos_token_id in input_ids
+        )
+        if not decode_sequentially:
+            output_values = self.audio_encoder.decode(
+                input_ids,
+                audio_scales=[None],
+            )
+        else:
+            sample = input_ids[:, 0]
+            sample_mask = (sample >= self.audio_encoder.config.codebook_size).sum(dim=(0, 1)) == 0
+            sample = sample[:, :, sample_mask]
+            output_values = self.audio_encoder.decode(sample[None, ...], [None])
+        audio_values = output_values.audio_values[0, 0]
+        return audio_values.cpu().float().numpy()
+    def put(self, value):
+        batch_size = value.shape[0] // self.decoder.num_codebooks
+        if batch_size > 1:
+            raise ValueError("ParlerTTSStreamer only supports batch size 1")
+        if self.token_cache is None:
+            self.token_cache = value
+        else:
+            self.token_cache = torch.concatenate([self.token_cache, value[:, None]], dim=-1)
+        if self.token_cache.shape[-1] % self.play_steps == 0:
+            audio_values = self.apply_delay_pattern_mask(self.token_cache)
+            self.on_finalized_audio(audio_values[self.to_yield : -self.stride])
+            self.to_yield += len(audio_values) - self.to_yield - self.stride
+    def end(self):
+        """Flushes any remaining cache and appends the stop symbol."""
+        if self.token_cache is not None:
+            audio_values = self.apply_delay_pattern_mask(self.token_cache)
+        else:
+            audio_values = np.zeros(self.to_yield)
+        self.on_finalized_audio(audio_values[self.to_yield :], stream_end=True)
+    def on_finalized_audio(self, audio: np.ndarray, stream_end: bool = False):
+        """Put the new audio in the queue. If the stream is ending, also put a stop signal in the queue."""
+        self.audio_queue.put(audio, timeout=self.timeout)
+        if stream_end:
+            self.audio_queue.put(self.stop_signal, timeout=self.timeout)
+    def __iter__(self):
+        return self
+    def __next__(self):
+        value = self.audio_queue.get(timeout=self.timeout)
+        if not isinstance(value, np.ndarray) and value == self.stop_signal:
+            raise StopIteration()
+        else:
+            return value