Spaces: Fabrice-TIERCELIN (runtime error)
Fabrice-TIERCELIN committed: Add new code
app.py CHANGED
@@ -17,7 +17,7 @@ from audioldm.audio.stft import TacotronSTFT
 from audioldm.variational_autoencoder import AutoencoderKL
 from pydub import AudioSegment
 
-# Old
+# Old import
 import numpy as np
 import torch.nn.functional as F
 from torchvision.transforms.functional import normalize
@@ -28,6 +28,144 @@ import PIL
 from PIL import Image
 from typing import Tuple
 
+max_64_bit_int = 2**63 - 1
+
+# Automatic device detection
+if torch.cuda.is_available():
+    device_type = "cuda"
+    device_selection = "cuda:0"
+else:
+    device_type = "cpu"
+    device_selection = "cpu"
+
+class Tango:
+    def __init__(self, name = "declare-lab/tango2", device = device_selection):
+
+        path = snapshot_download(repo_id = name)
+
+        vae_config = json.load(open("{}/vae_config.json".format(path)))
+        stft_config = json.load(open("{}/stft_config.json".format(path)))
+        main_config = json.load(open("{}/main_config.json".format(path)))
+
+        self.vae = AutoencoderKL(**vae_config).to(device)
+        self.stft = TacotronSTFT(**stft_config).to(device)
+        self.model = AudioDiffusion(**main_config).to(device)
+
+        vae_weights = torch.load("{}/pytorch_model_vae.bin".format(path), map_location = device)
+        stft_weights = torch.load("{}/pytorch_model_stft.bin".format(path), map_location = device)
+        main_weights = torch.load("{}/pytorch_model_main.bin".format(path), map_location = device)
+
+        self.vae.load_state_dict(vae_weights)
+        self.stft.load_state_dict(stft_weights)
+        self.model.load_state_dict(main_weights)
+
+        print("Successfully loaded checkpoint from:", name)
+
+        self.vae.eval()
+        self.stft.eval()
+        self.model.eval()
+
+        self.scheduler = DDPMScheduler.from_pretrained(main_config["scheduler_name"], subfolder = "scheduler")
+
+    def chunks(self, lst, n):
+        # Yield successive n-sized chunks from a list
+        for i in range(0, len(lst), n):
+            yield lst[i:i + n]
+
+    def generate(self, prompt, steps = 100, guidance = 3, samples = 1, disable_progress = True):
+        # Generate audio for a single prompt string
+        with torch.no_grad():
+            latents = self.model.inference([prompt], self.scheduler, steps, guidance, samples, disable_progress = disable_progress)
+            mel = self.vae.decode_first_stage(latents)
+            wave = self.vae.decode_to_waveform(mel)
+        return wave
+
+    def generate_for_batch(self, prompts, steps = 200, guidance = 3, samples = 1, batch_size = 8, disable_progress = True):
+        # Generate audio for a list of prompt strings
+        outputs = []
+        for k in tqdm(range(0, len(prompts), batch_size)):
+            batch = prompts[k: k + batch_size]
+            with torch.no_grad():
+                latents = self.model.inference(batch, self.scheduler, steps, guidance, samples, disable_progress = disable_progress)
+                mel = self.vae.decode_first_stage(latents)
+                wave = self.vae.decode_to_waveform(mel)
+                outputs += [item for item in wave]
+        if samples == 1:
+            return outputs
+        return list(self.chunks(outputs, samples))
+
+# Initialize TANGO
+
+tango = Tango(device = "cpu")
+tango.vae.to(device_type)
+tango.stft.to(device_type)
+tango.model.to(device_type)
+
+def update_seed(is_randomize_seed, seed):
+    if is_randomize_seed:
+        return random.randint(0, max_64_bit_int)
+    return seed
+
+def check(
+    prompt,
+    output_number,
+    steps,
+    guidance,
+    is_randomize_seed,
+    seed
+):
+    if prompt is None or prompt == "":
+        raise gr.Error("Please provide a prompt input.")
+    if output_number not in [1, 2, 3]:
+        raise gr.Error("Please ask for 1, 2 or 3 output files.")
+
+def update_output(output_format, output_number):
+    return [
+        gr.update(format = output_format),
+        gr.update(format = output_format, visible = (2 <= output_number)),
+        gr.update(format = output_format, visible = (output_number == 3)),
+        gr.update(visible = False)
+    ]
+
+def text2audio(
+    prompt,
+    output_number,
+    steps,
+    guidance,
+    is_randomize_seed,
+    seed
+):
+    start = time.time()
+
+    if seed is None:
+        seed = random.randint(0, max_64_bit_int)
+
+    random.seed(seed)
+    torch.manual_seed(seed)
+
+    output_wave = tango.generate(prompt, steps, guidance, output_number)
+
+    output_wave_1 = gr.make_waveform((16000, output_wave[0]))
+    output_wave_2 = gr.make_waveform((16000, output_wave[1])) if (2 <= output_number) else None
+    output_wave_3 = gr.make_waveform((16000, output_wave[2])) if (output_number == 3) else None
+
+    end = time.time()
+    secondes = int(end - start)
+    minutes = secondes // 60
+    secondes = secondes - (minutes * 60)
+    hours = minutes // 60
+    minutes = minutes - (hours * 60)
+    return [
+        output_wave_1,
+        output_wave_2,
+        output_wave_3,
+        gr.update(visible = True, value = "Start again to get a different result. The output has been generated in " + ((str(hours) + " h, ") if hours != 0 else "") + ((str(minutes) + " min, ") if hours != 0 or minutes != 0 else "") + str(secondes) + " sec.")
+    ]
+
+if is_space_imported:
+    text2audio = spaces.GPU(text2audio, duration = 420)
+
+# Old code
 net=BriaRMBG()
 # model_path = "./model1.pth"
 #model_path = hf_hub_download("briaai/RMBG-1.4", 'model.pth')
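
For reference, a minimal usage sketch of the Tango wrapper added above, assuming the declare-lab/tango2 checkpoint download succeeds; the soundfile dependency and the output file name are illustrative, not part of the commit. generate() returns a batch of waveforms even for a single prompt (text2audio above indexes output_wave[0] the same way), and TANGO decodes audio at 16 kHz:

import soundfile as sf

# Hypothetical usage, not from this commit: synthesize one prompt and
# write the first returned 16 kHz waveform to disk
wave = tango.generate("An audience cheering and clapping", steps = 100, guidance = 3)
sf.write("output.wav", wave[0], samplerate = 16000)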
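
The elapsed-time split at the end of text2audio can also be written with divmod; this behaviour-preserving sketch (the helper name is invented) yields the same hours, minutes and seconds:

def format_elapsed(elapsed):
    # Same arithmetic as text2audio: floor to whole seconds, then split
    minutes, secondes = divmod(int(elapsed), 60)
    hours, minutes = divmod(minutes, 60)
    return hours, minutes, secondes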
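
The commit only defines the callbacks; the Gradio wiring sits outside this hunk. One plausible sketch, assuming a standard gr.Blocks layout (all component names and default values below are invented; gr.make_waveform renders a video file, hence gr.Video outputs). update_output would attach to output_number.change() in the same fashion and is omitted here:

import gradio as gr

with gr.Blocks() as interface:
    prompt = gr.Textbox(label = "Prompt")
    output_number = gr.Slider(label = "Number of outputs", minimum = 1, maximum = 3, step = 1, value = 1)
    steps = gr.Slider(label = "Steps", minimum = 10, maximum = 200, step = 1, value = 100)
    guidance = gr.Slider(label = "Guidance", minimum = 1, maximum = 10, step = 0.1, value = 3)
    is_randomize_seed = gr.Checkbox(label = "Randomize seed", value = True)
    seed = gr.Number(label = "Seed", value = 123)
    submit = gr.Button("Generate")

    # gr.make_waveform returns each waveform as a video file
    output_1 = gr.Video(label = "Output 1")
    output_2 = gr.Video(label = "Output 2", visible = False)
    output_3 = gr.Video(label = "Output 3", visible = False)
    information = gr.HTML(visible = False)

    # Validate, reseed, then generate; .success() runs only if the
    # previous step did not raise gr.Error
    submit.click(
        fn = check,
        inputs = [prompt, output_number, steps, guidance, is_randomize_seed, seed]
    ).success(
        fn = update_seed,
        inputs = [is_randomize_seed, seed],
        outputs = [seed]
    ).success(
        fn = text2audio,
        inputs = [prompt, output_number, steps, guidance, is_randomize_seed, seed],
        outputs = [output_1, output_2, output_3, information]
    )

interface.launch()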
|