diff --git a/configs/model/codi_2.yaml b/configs/model/codi_2.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..5121589f01c933780421c86cee4d5d2782228db7
--- /dev/null
+++ b/configs/model/codi_2.yaml
@@ -0,0 +1,21 @@
+########
+# CoDi #
+########
+
+codi_2:
+  type: codi_2
+  symbol: codi_2
+  find_unused_parameters: true
+  args:
+    autokl_cfg: MODEL(sd_autoencoder)
+    optimus_cfg: MODEL(optimus_vae)
+    clip_cfg: MODEL(clip_frozen)
+    unet_config: MODEL(openai_unet_codi_2)
+    beta_linear_start: 0.00085
+    beta_linear_end: 0.012
+    timesteps: 1000
+    vision_scale_factor: 0.18215
+    text_scale_factor: 4.3108
+    audio_scale_factor: 0.9228
+    use_ema: false
+    parameterization : "eps"
diff --git a/configs/model/openai_unet.yaml b/configs/model/openai_unet.yaml
index 5be0d88400aface8c99cf294e229595ca574c30b..e5c702d2b1d819629a43bbbce88a9329698d907d 100644
--- a/configs/model/openai_unet.yaml
+++ b/configs/model/openai_unet.yaml
@@ -82,4 +82,6 @@ openai_unet_codi:
   unet_image_cfg: MODEL(openai_unet_2d)
   unet_text_cfg: MODEL(openai_unet_0dmd)
   unet_audio_cfg: MODEL(openai_unet_2d_audio)
-  model_type: ['video', 'image', 'text']
\ No newline at end of file
+  # model_type: ['video', 'image']
+  # model_type: ['text']
+  model_type: ['audio', 'image', 'video', 'text']
\ No newline at end of file
diff --git a/configs/model/openai_unet_2.yaml b/configs/model/openai_unet_2.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..7ffcd8c981d5e7e14df6b824c9486b6ecbd6b9bb
--- /dev/null
+++ b/configs/model/openai_unet_2.yaml
@@ -0,0 +1,87 @@
+openai_unet_sd:
+  type: openai_unet
+  args:
+    image_size: null # no use
+    in_channels: 4
+    out_channels: 4
+    model_channels: 320
+    attention_resolutions: [ 4, 2, 1 ]
+    num_res_blocks: [ 2, 2, 2, 2 ]
+    channel_mult: [ 1, 2, 4, 4 ]
+    num_heads: 8
+    use_spatial_transformer: True
+    transformer_depth: 1
+    context_dim: 768
+    use_checkpoint: True
+    legacy: False
+
+openai_unet_dual_context:
+  super_cfg: openai_unet_sd
+  type: openai_unet_dual_context
+
+########################
+# Code cleaned version #
+########################
+
+openai_unet_2d_audio:
+  type: openai_unet_2d
+  args:
+    input_channels: 8
+    model_channels: 192
+    output_channels: 8
+    num_noattn_blocks: [ 2, 2, 2, 2 ]
+    channel_mult: [ 1, 2, 4, 4 ]
+    with_attn: [true, true, true, false]
+    channel_mult_connector: [1, 2, 4]
+    num_noattn_blocks_connector: [1, 1, 1]
+    with_connector: [True, True, True, False]
+    connector_output_channel: 1280
+    num_heads: 8
+    context_dim: 768
+    use_checkpoint: False
+
+openai_unet_2d:
+  type: openai_unet_2d
+  args:
+    input_channels: 4
+    model_channels: 320
+    output_channels: 4
+    num_noattn_blocks: [ 2, 2, 2, 2 ]
+    channel_mult: [ 1, 2, 4, 4 ]
+    with_attn: [true, true, true, false]
+    channel_mult_connector: [1, 2, 4]
+    num_noattn_blocks_connector: [1, 1, 1]
+    with_connector: [True, True, True, False]
+    connector_output_channel: 1280
+    num_heads: 8
+    context_dim: 768
+    use_checkpoint: True
+    use_video_architecture: True
+
+openai_unet_0dmd:
+  type: openai_unet_0dmd
+  args:
+    input_channels: 768
+    model_channels: 320
+    output_channels: 768
+    num_noattn_blocks: [ 2, 2, 2, 2 ]
+    channel_mult: [ 1, 2, 4, 4 ]
+    second_dim: [ 4, 4, 4, 4 ]
+    with_attn: [true, true, true, false]
+    num_noattn_blocks_connector: [1, 1, 1]
+    second_dim_connector: [4, 4, 4]
+    with_connector: [True, True, True, False]
+    connector_output_channel: 1280
+    num_heads: 8
+    context_dim: 768
+    use_checkpoint: True
+
+openai_unet_codi_2:
+  type: openai_unet_codi_2
+  args:
+    unet_frontal_cfg: MODEL(openai_unet_2d)
+    unet_lateral_cfg: MODEL(openai_unet_2d)
+    unet_text_cfg: MODEL(openai_unet_0dmd)
+    # model_type: ['lateral', 'text']
+    # model_type: ['text']
+    model_type: ['frontal', 'lateral', 'text']
diff --git a/configs/model/optimus.yaml b/configs/model/optimus.yaml
index 96a8692134f2824aa7de7142618ea82775e389f3..f716f6b40419e0eb0c623ddd19d328ab941c7ccc 100644
--- a/configs/model/optimus.yaml
+++ b/configs/model/optimus.yaml
@@ -100,8 +100,7 @@ optimus_vae:
     tokenizer_decoder: MODEL(optimus_gpt2_tokenizer)
   args:
     latent_size: 768
-    beta: 1.0
-    fb_mode: 0
-    length_weighted_loss: false
+    beta : 1.0
+    fb_mode : 0
+    length_weighted_loss : false
     dim_target_kl : 3.0
-
diff --git a/configs/model/prova.yaml b/configs/model/prova.yaml
index 03c7f07a75d95e24f80f0cb51f06ca6e672012ea..871b18a56de97225f5155675f28e1a8c533dde98 100644
--- a/configs/model/prova.yaml
+++ b/configs/model/prova.yaml
@@ -82,4 +82,4 @@ prova:
   unet_frontal_cfg: MODEL(openai_unet_2d)
   unet_lateral_cfg: MODEL(openai_unet_2d)
   unet_text_cfg: MODEL(openai_unet_0dmd)
-  model_type: ['text']
+  model_type: ['frontal', 'lateral', 'text']
diff --git a/core/common/utils.py b/core/common/utils.py
index 31cc40d95adf81a5b2c4c8fdf98f8e523b485a34..38636192b8ed914a93b09c842d455120db672755 100644
--- a/core/common/utils.py
+++ b/core/common/utils.py
@@ -99,7 +99,6 @@ def remove_duplicate_word(tx):
 
 def regularize_image(x, image_size=512):
-    BICUBIC = T.InterpolationMode.BICUBIC
     if isinstance(x, str):
         x = Image.open(x)
         size = min(x.size)
@@ -111,7 +110,6 @@ def regularize_image(x, image_size=512):
         size = min(x.size)
     elif isinstance(x, torch.Tensor):
         # normalize to [0, 1]
-        x = x/255.0
         size = min(x.size()[1:])
     else:
         assert False, 'Unknown image type'
@@ -126,7 +124,6 @@ def regularize_image(x, image_size=512):
         T.ToTensor(),
     ])
     x = transforms(x)
-    assert (x.shape[1] == image_size) & (x.shape[2] == image_size), \
-        'Wrong image size'
     """
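A short sketch (not part of the patch), to make the regularize_image change above concrete: with the x = x/255.0 line removed, tensor inputs are assumed to be float images already scaled to [0, 1], so any uint8 frame must be rescaled by the caller before it reaches the transform pipeline. The tensor below is illustrative only.

import torch

frame_uint8 = torch.randint(0, 256, (3, 600, 480), dtype=torch.uint8)
frame = frame_uint8.float() / 255.0   # the caller now owns the [0, 1] scaling
assert 0.0 <= float(frame.min()) and float(frame.max()) <= 1.0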
diff --git a/core/models/codi.py b/core/models/codi.py
index 9a91b4de008271e11884b7df906e95f24bd0f6e8..445efe5614f0db8a09e756b20df56462d6008a7a 100644
--- a/core/models/codi.py
+++ b/core/models/codi.py
@@ -75,16 +75,16 @@ class CoDi(DDPM):
     @torch.no_grad()
     def optimus_encode(self, text):
         if isinstance(text, List):
-            tokenizer = self.optimus.tokenizer_encoder
-            token = [tokenizer.tokenize(sentence.lower()) for sentence in text]
+            token = [self.optimus.tokenizer_encoder.tokenize(sentence.lower()) for sentence in text]
             token_id = []
             for tokeni in token:
-                token_sentence = [tokenizer._convert_token_to_id(i) for i in tokeni]
-                token_sentence = tokenizer.add_special_tokens_single_sentence(token_sentence)
+                token_sentence = [self.optimus.tokenizer_encoder._convert_token_to_id(i) for i in tokeni]
+                token_sentence = self.optimus.tokenizer_encoder.add_special_tokens_single_sentence(token_sentence)
                 token_id.append(torch.LongTensor(token_sentence))
             token_id = torch._C._nn.pad_sequence(token_id, batch_first=True, padding_value=0.0)[:, :512]
         else:
             token_id = text
+        token_id = token_id.to(self.device)
         z = self.optimus.encoder(token_id, attention_mask=(token_id > 0))[1]
         z_mu, z_logvar = self.optimus.encoder.linear(z).chunk(2, -1)
         return z_mu.squeeze(1) * self.text_scale_factor
@@ -92,6 +92,7 @@ class CoDi(DDPM):
     @torch.no_grad()
     def optimus_decode(self, z, temperature=1.0, max_length=30):
         z = 1.0 / self.text_scale_factor * z
+        z = z.to(self.device)
         return self.optimus.decode(z, temperature, max_length=max_length)
 
     @torch.no_grad()
diff --git a/core/models/codi_2.py b/core/models/codi_2.py
index f81ef6df70a7677899b1ab982e94133f70c051aa..039e4baf9098a3186f5d7db45c82812a489fafb2 100644
--- a/core/models/codi_2.py
+++ b/core/models/codi_2.py
@@ -1,221 +1,226 @@
-from typing import Dict, List
-import os
-
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-import numpy as np
-import numpy.random as npr
-import copy
-from functools import partial
-from contextlib import contextmanager
-
-from .common.get_model import get_model, register
-from .sd import DDPM
-
-version = '0'
-symbol = 'thesis_model'
-
-
-@register('thesis_model', version)
-class CoDi(DDPM):
-    def __init__(self,
-                 autokl_cfg=None,
-                 optimus_cfg=None,
-                 clip_cfg=None,
-                 vision_scale_factor=0.1812,
-                 text_scale_factor=4.3108,
-                 audio_scale_factor=0.9228,
-                 scale_by_std=False,
-                 *args,
-                 **kwargs):
-        super().__init__(*args, **kwargs)
-
-        if autokl_cfg is not None:
-            self.autokl = get_model()(autokl_cfg)
-
-        if optimus_cfg is not None:
-            self.optimus = get_model()(optimus_cfg)
-
-        if clip_cfg is not None:
-            self.clip = get_model()(clip_cfg)
-
-        if not scale_by_std:
-            self.vision_scale_factor = vision_scale_factor
-            self.text_scale_factor = text_scale_factor
-            self.audio_scale_factor = audio_scale_factor
-        else:
-            self.register_buffer("text_scale_factor", torch.tensor(text_scale_factor))
-            self.register_buffer("audio_scale_factor", torch.tensor(audio_scale_factor))
-            self.register_buffer('vision_scale_factor', torch.tensor(vision_scale_factor))
-
-    @property
-    def device(self):
-        return next(self.parameters()).device
-
-    @torch.no_grad()
-    def autokl_encode(self, image):
-        encoder_posterior = self.autokl.encode(image)
-        z = encoder_posterior.sample().to(image.dtype)
-        return self.vision_scale_factor * z
-
-    @torch.no_grad()
-    def autokl_decode(self, z):
-        z = 1. / self.vision_scale_factor * z
-        return self.autokl.decode(z)
-
-    @torch.no_grad()
-    def optimus_encode(self, text):
-        if isinstance(text, List):
-            tokenizer = self.optimus.tokenizer_encoder
-            token = [tokenizer.tokenize(sentence.lower()) for sentence in text]
-            token_id = []
-            for tokeni in token:
-                token_sentence = [tokenizer._convert_token_to_id(i) for i in tokeni]
-                token_sentence = tokenizer.add_special_tokens_single_sentence(token_sentence)
-                token_id.append(torch.LongTensor(token_sentence))
-            token_id = torch._C._nn.pad_sequence(token_id, batch_first=True, padding_value=0.0)[:, :512]
-        else:
-            token_id = text
-        z = self.optimus.encoder(token_id, attention_mask=(token_id > 0))[1]
-        z_mu, z_logvar = self.optimus.encoder.linear(z).chunk(2, -1)
-        return z_mu.squeeze(1) * self.text_scale_factor
-
-    @torch.no_grad()
-    def optimus_decode(self, z, temperature=1.0):
-        z = 1.0 / self.text_scale_factor * z
-        return self.optimus.decode(z, temperature)
-
-    @torch.no_grad()
-    def clip_encode_text(self, text, encode_type='encode_text'):
-        swap_type = self.clip.encode_type
-        self.clip.encode_type = encode_type
-        embedding = self.clip(text, encode_type)
-        self.clip.encode_type = swap_type
-        return embedding
-
-    @torch.no_grad()
-    def clip_encode_vision(self, vision, encode_type='encode_vision'):
-        swap_type = self.clip.encode_type
-        self.clip.encode_type = encode_type
-        embedding = self.clip(vision, encode_type)
-        self.clip.encode_type = swap_type
-        return embedding
-
-    @torch.no_grad()
-    def clap_encode_audio(self, audio):
-        embedding = self.clap(audio)
-        return embedding
-
-    def forward(self, x=None, c=None, noise=None, xtype='frontal', ctype='text', u=None, return_algined_latents=False, env_enc=False):
-        if isinstance(x, list):
-            t = torch.randint(0, self.num_timesteps, (x[0].shape[0],), device=x[0].device).long()
-        else:
-            t = torch.randint(0, self.num_timesteps, (x.shape[0],), device=x.device).long()
-        return self.p_losses(x, c, t, noise, xtype, ctype, u, return_algined_latents, env_enc)
-
-    def apply_model(self, x_noisy, t, cond, xtype='frontal', ctype='text', u=None, return_algined_latents=False, env_enc=False):
-        return self.model.diffusion_model(x_noisy, t, cond, xtype, ctype, u, return_algined_latents, env_enc=env_enc)
-
-    def get_pixel_loss(self, pred, target, mean=True):
-        if self.loss_type == 'l1':
-            loss = (target - pred).abs()
-            if mean:
-                loss = loss.mean()
-        elif self.loss_type == 'l2':
-            if mean:
-                loss = torch.nn.functional.mse_loss(target, pred)
-            else:
-                loss = torch.nn.functional.mse_loss(target, pred, reduction='none')
-        else:
-            raise NotImplementedError(f"unknown loss type '{self.loss_type}'")
-        loss = torch.nan_to_num(loss, nan=0.0, posinf=0.0, neginf=-0.0)
-        return loss
-
-    def get_text_loss(self, pred, target):
-        if self.loss_type == 'l1':
-            loss = (target - pred).abs()
-        elif self.loss_type == 'l2':
-            loss = torch.nn.functional.mse_loss(target, pred, reduction='none')
-        loss = torch.nan_to_num(loss, nan=0.0, posinf=0.0, neginf=0.0)
-        return loss
-
-    def p_losses(self, x_start, cond, t, noise=None, xtype='frontal', ctype='text', u=None,
-                 return_algined_latents=False, env_enc=False):
-        if isinstance(x_start, list):
-            noise = [torch.randn_like(x_start_i) for x_start_i in x_start] if noise is None else noise
-            x_noisy = [self.q_sample(x_start=x_start_i, t=t, noise=noise_i) for x_start_i, noise_i in
-                       zip(x_start, noise)]
-            if not env_enc:
-                model_output = self.apply_model(x_noisy, t, cond, xtype, ctype, u, return_algined_latents, env_enc)
-            else:
-                model_output, h_con = self.apply_model(x_noisy, t, cond, xtype, ctype, u, return_algined_latents, env_enc)
-            if return_algined_latents:
-                return model_output
-
-            loss_dict = {}
-
-            if self.parameterization == "x0":
-                target = x_start
-            elif self.parameterization == "eps":
-                target = noise
-            else:
-                raise NotImplementedError()
-
-            loss = 0.0
-            for model_output_i, target_i, xtype_i in zip(model_output, target, xtype):
-                if xtype_i == 'frontal':
-                    loss_simple = self.get_pixel_loss(model_output_i, target_i, mean=False).mean([1, 2, 3])
-                elif xtype_i == 'text':
-                    loss_simple = self.get_text_loss(model_output_i, target_i).mean([1])
-                elif xtype_i == 'lateral':
-                    loss_simple = self.get_pixel_loss(model_output_i, target_i, mean=False).mean([1, 2, 3])
-                loss += loss_simple.mean()
-
-            # Check whether the model also returned h_con.
-            # If so, we have the latent representations of the two modalities extracted
-            # by the environmental encoders; since these are two tensors of shape
-            # batch_size x 1 x 1280, we can also use them to compute a contrastive loss term (cross-entropy, as in CLIP).
-            if env_enc and h_con is not None:
-                def similarity(z_a, z_b):
-                    return F.cosine_similarity(z_a, z_b)
-
-                z_a, z_b = h_con
-
-                z_a = z_a / z_a.norm(dim=-1, keepdim=True)
-                z_b = z_b / z_b.norm(dim=-1, keepdim=True)
-
-                logits_a = z_a.squeeze() @ z_b.squeeze().t()
-                logits_b = logits_a.t()
-
-                labels = torch.arange(len(z_a)).to(z_a.device)
-
-                loss_a = F.cross_entropy(logits_a, labels)
-                loss_b = F.cross_entropy(logits_b, labels)
-
-                loss_con = (loss_a + loss_b) / 2
-                loss += loss_con
-            return loss / len(xtype)
-
-        else:
-            noise = torch.randn_like(x_start) if noise is None else noise
-            x_noisy = self.q_sample(x_start=x_start, t=t, noise=noise)
-            model_output = self.apply_model(x_noisy, t, cond, xtype, ctype)
-
-            loss_dict = {}
-
-            if self.parameterization == "x0":
-                target = x_start
-            elif self.parameterization == "eps":
-                target = noise
-            else:
-                raise NotImplementedError()
-
-            if xtype == 'frontal':
-                loss_simple = self.get_pixel_loss(model_output, target, mean=False).mean([1, 2, 3])
-            elif xtype == 'text':
-                loss_simple = self.get_text_loss(model_output, target).mean([1])
-            elif xtype == 'lateral':
-                loss_simple = self.get_pixel_loss(model_output, target, mean=False).mean([1, 2, 3])
-            loss = loss_simple.mean()
-            return loss
+from typing import Dict, List
+import os
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import numpy as np
+import numpy.random as npr
+import copy
+from functools import partial
+from contextlib import contextmanager
+
+from .common.get_model import get_model, register
+from .sd import DDPM
+
+version = '0'
+symbol = 'thesis_model'
+
+
+@register('thesis_model', version)
+class CoDi(DDPM):
+    def __init__(self,
+                 autokl_cfg=None,
+                 optimus_cfg=None,
+                 clip_cfg=None,
+                 vision_scale_factor=0.1812,
+                 text_scale_factor=4.3108,
+                 audio_scale_factor=0.9228,
+                 scale_by_std=False,
+                 *args,
+                 **kwargs):
+        super().__init__(*args, **kwargs)
+
+        if autokl_cfg is not None:
+            self.autokl = get_model()(autokl_cfg)
+
+        if optimus_cfg is not None:
+            self.optimus = get_model()(optimus_cfg)
+
+        if clip_cfg is not None:
+            self.clip = get_model()(clip_cfg)
+
+        if not scale_by_std:
+            self.vision_scale_factor = vision_scale_factor
+            self.text_scale_factor = text_scale_factor
+            self.audio_scale_factor = audio_scale_factor
+        else:
+            self.register_buffer("text_scale_factor", torch.tensor(text_scale_factor))
+            self.register_buffer("audio_scale_factor", torch.tensor(audio_scale_factor))
+            self.register_buffer('vision_scale_factor', torch.tensor(vision_scale_factor))
+
+    @property
+    def device(self):
+        return next(self.parameters()).device
+
+    @torch.no_grad()
+    def autokl_encode(self, image):
+        encoder_posterior = self.autokl.encode(image)
+        z = encoder_posterior.sample().to(image.dtype)
+        return self.vision_scale_factor * z
+
+    @torch.no_grad()
+    def autokl_decode(self, z):
+        z = 1. / self.vision_scale_factor * z
+        return self.autokl.decode(z)
+
+    @torch.no_grad()
+    def optimus_encode(self, text):
+        if isinstance(text, List):
+            tokenizer = self.optimus.tokenizer_encoder
+            token = [tokenizer.tokenize(sentence.lower()) for sentence in text]
+            token_id = []
+            for tokeni in token:
+                token_sentence = [tokenizer._convert_token_to_id(i) for i in tokeni]
+                token_sentence = tokenizer.add_special_tokens_single_sentence(token_sentence)
+                token_id.append(torch.LongTensor(token_sentence))
+            token_id = torch._C._nn.pad_sequence(token_id, batch_first=True, padding_value=0.0)[:, :512]
+        else:
+            token_id = text
+        token_id = token_id.to(self.device)
+        z = self.optimus.encoder(token_id, attention_mask=(token_id > 0))[1]
+        z_mu, z_logvar = self.optimus.encoder.linear(z).chunk(2, -1)
+        return z_mu.squeeze(1) * self.text_scale_factor
+
+    @torch.no_grad()
+    def optimus_decode(self, z, temperature=1.0, max_length=30):
+        z = 1.0 / self.text_scale_factor * z
+        z = z.to(self.device)
+        return self.optimus.decode(z, temperature, max_length=max_length)
+
+    @torch.no_grad()
+    def clip_encode_text(self, text, encode_type='encode_text'):
+        swap_type = self.clip.encode_type
+        self.clip.encode_type = encode_type
+        embedding = self.clip(text, encode_type)
+        self.clip.encode_type = swap_type
+        return embedding
+
+    @torch.no_grad()
+    def clip_encode_vision(self, vision, encode_type='encode_vision'):
+        swap_type = self.clip.encode_type
+        self.clip.encode_type = encode_type
+        embedding = self.clip(vision, encode_type)
+        self.clip.encode_type = swap_type
+        return embedding
+
+    @torch.no_grad()
+    def clap_encode_audio(self, audio):
+        embedding = self.clap(audio)
+        return embedding
+
+    def forward(self, x=None, c=None, noise=None, xtype='frontal', ctype='text', u=None, return_algined_latents=False, env_enc=False):
+        if isinstance(x, list):
+            t = torch.randint(0, self.num_timesteps, (x[0].shape[0],), device=x[0].device).long()
+        else:
+            t = torch.randint(0, self.num_timesteps, (x.shape[0],), device=x.device).long()
+        return self.p_losses(x, c, t, noise, xtype, ctype, u, return_algined_latents, env_enc)
+
+    def apply_model(self, x_noisy, t, cond, xtype='frontal', ctype='text', u=None, return_algined_latents=False, env_enc=False):
+        return self.model.diffusion_model(x_noisy, t, cond, xtype, ctype, u, return_algined_latents, env_enc=env_enc)
+
+    def get_pixel_loss(self, pred, target, mean=True):
+        if self.loss_type == 'l1':
+            loss = (target - pred).abs()
+            if mean:
+                loss = loss.mean()
+        elif self.loss_type == 'l2':
+            if mean:
+                loss = torch.nn.functional.mse_loss(target, pred)
+            else:
+                loss = torch.nn.functional.mse_loss(target, pred, reduction='none')
+        else:
+            raise NotImplementedError(f"unknown loss type '{self.loss_type}'")
+        loss = torch.nan_to_num(loss, nan=0.0, posinf=0.0, neginf=-0.0)
+        return loss
+
+    def get_text_loss(self, pred, target):
+        if self.loss_type == 'l1':
+            loss = (target - pred).abs()
+        elif self.loss_type == 'l2':
+            loss = torch.nn.functional.mse_loss(target, pred, reduction='none')
+        loss = torch.nan_to_num(loss, nan=0.0, posinf=0.0, neginf=0.0)
+        return loss
+
+    def p_losses(self, x_start, cond, t, noise=None, xtype='frontal', ctype='text', u=None,
+                 return_algined_latents=False, env_enc=False):
+        if isinstance(x_start, list):
+            noise = [torch.randn_like(x_start_i) for x_start_i in x_start] if noise is None else noise
+            x_noisy = [self.q_sample(x_start=x_start_i, t=t, noise=noise_i) for x_start_i, noise_i in
+                       zip(x_start, noise)]
+            if not env_enc:
+                model_output = self.apply_model(x_noisy, t, cond, xtype, ctype, u, return_algined_latents, env_enc)
+            else:
+                model_output, h_con = self.apply_model(x_noisy, t, cond, xtype, ctype, u, return_algined_latents, env_enc)
+            if return_algined_latents:
+                return model_output
+
+            loss_dict = {}
+
+            if self.parameterization == "x0":
+                target = x_start
+            elif self.parameterization == "eps":
+                target = noise
+            else:
+                raise NotImplementedError()
+
+            loss = 0.0
+            for model_output_i, target_i, xtype_i in zip(model_output, target, xtype):
+                if xtype_i == 'frontal':
+                    loss_simple = self.get_pixel_loss(model_output_i, target_i, mean=False).mean([1, 2, 3])
+                elif xtype_i == 'text':
+                    loss_simple = self.get_text_loss(model_output_i, target_i).mean([1])
+                elif xtype_i == 'lateral':
+                    loss_simple = self.get_pixel_loss(model_output_i, target_i, mean=False).mean([1, 2, 3])
+                loss += loss_simple.mean()
+
+
+            # Check whether the model also returned h_con.
+            # If so, we have the latent representations of the two modalities extracted
+            # by the environmental encoders; since these are two tensors of shape
+            # batch_size x 1 x 1280, we can also use them to compute a contrastive loss term (cross-entropy, as in CLIP).
+            if env_enc and h_con is not None:
+                def similarity(z_a, z_b):
+                    return F.cosine_similarity(z_a, z_b)
+
+                z_a, z_b = h_con
+
+                z_a = z_a / z_a.norm(dim=-1, keepdim=True)
+                z_b = z_b / z_b.norm(dim=-1, keepdim=True)
+
+                logits_a = z_a.squeeze() @ z_b.squeeze().t()
+                logits_b = logits_a.t()
+
+                labels = torch.arange(len(z_a)).to(z_a.device)
+
+                loss_a = F.cross_entropy(logits_a, labels)
+                loss_b = F.cross_entropy(logits_b, labels)
+
+                loss_con = (loss_a + loss_b) / 2
+                loss += loss_con
+
+
+            return loss / len(xtype)
+
+        else:
+            noise = torch.randn_like(x_start) if noise is None else noise
+            x_noisy = self.q_sample(x_start=x_start, t=t, noise=noise)
+            model_output = self.apply_model(x_noisy, t, cond, xtype, ctype)
+
+            loss_dict = {}
+
+            if self.parameterization == "x0":
+                target = x_start
+            elif self.parameterization == "eps":
+                target = noise
+            else:
+                raise NotImplementedError()
+
+            if xtype == 'frontal':
+                loss_simple = self.get_pixel_loss(model_output, target, mean=False).mean([1, 2, 3])
+            elif xtype == 'text':
+                loss_simple = self.get_text_loss(model_output, target).mean([1])
+            elif xtype == 'lateral':
+                loss_simple = self.get_pixel_loss(model_output, target, mean=False).mean([1, 2, 3])
+            loss = loss_simple.mean()
+            return loss
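A standalone sketch (not part of the patch) of the CLIP-style contrastive term added in p_losses above, run on dummy batch_size x 1 x 1280 tensors of the kind the environmental encoders return; shapes and values are illustrative.

import torch
import torch.nn.functional as F

z_a = torch.randn(4, 1, 1280)   # stand-ins for the two h_con latents
z_b = torch.randn(4, 1, 1280)

z_a = z_a / z_a.norm(dim=-1, keepdim=True)
z_b = z_b / z_b.norm(dim=-1, keepdim=True)

logits_a = z_a.squeeze() @ z_b.squeeze().t()   # (4, 4) similarity matrix
logits_b = logits_a.t()                        # the opposite direction

labels = torch.arange(4)                       # matched pairs sit on the diagonal
loss_con = (F.cross_entropy(logits_a, labels) +
            F.cross_entropy(logits_b, labels)) / 2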
diff --git a/core/models/dani_model.py b/core/models/dani_model.py
index edb71feabc36e77f3226e0420d61cd9c16fd3ad0..49e1e9f3690b61512fac3b23cb89fadde99a2fc4 100644
--- a/core/models/dani_model.py
+++ b/core/models/dani_model.py
@@ -160,7 +160,9 @@ class dani_model(pl.LightningModule):
             condition_types=condition_types,
             eta=ddim_eta,
             verbose=False,
-            mix_weight=mix_weight)
+            mix_weight=mix_weight,
+            progress_bar=None
+            )
 
         out_all = []
         for i, xtype_i in enumerate(xtype):
diff --git a/core/models/ddim/ddim.py b/core/models/ddim/ddim.py
index 0beba123b041382c417b5ac6224bb0a0025b2e63..cbb9ac65223acd35695c96f07441024831acf696 100644
--- a/core/models/ddim/ddim.py
+++ b/core/models/ddim/ddim.py
@@ -7,6 +7,7 @@ from functools import partial
 
 from .diffusion_utils import make_ddim_sampling_parameters, make_ddim_timesteps, noise_like
 
+import streamlit as st
 
 class DDIMSampler(object):
     def __init__(self, model, schedule="linear", **kwargs):
@@ -136,7 +137,8 @@
                       score_corrector=None,
                       corrector_kwargs=None,
                       unconditional_guidance_scale=1.,
-                      unconditional_conditioning=None,):
+                      unconditional_conditioning=None,
+                      progress_bar=None,):
         device = self.model.betas.device
         b = shape[0]
         if x_T is None:
@@ -157,7 +159,11 @@
 
         iterator = tqdm(time_range, desc='DDIM Sampler', total=total_steps)
 
+        if progress_bar is not None:
+            progress_bar.text("Generating samples...")
         for i, step in enumerate(iterator):
+            if progress_bar is not None:
+                progress_bar.progress(i/total_steps)
             index = total_steps - i - 1
             ts = torch.full((b,), step, device=device, dtype=torch.long)
 
@@ -180,6 +186,9 @@
             intermediates['x_inter'].append(img)
             intermediates['pred_x0'].append(pred_x0)
 
+        if progress_bar is not None:
+            progress_bar.success("Sampling complete.")
+
         return img, intermediates
 
     @torch.no_grad()
diff --git a/core/models/ddim/ddim_vd.py b/core/models/ddim/ddim_vd.py
index 7e7a0a9c92e21db818ed64792f3973bc7bcc8b9c..f91428ba82810717517a0afd6e7390bbc9a4e6e3 100644
--- a/core/models/ddim/ddim_vd.py
+++ b/core/models/ddim/ddim_vd.py
@@ -184,4 +184,4 @@
             x_prev_i = a_prev.sqrt() * pred_x0_i + dir_xt + noise
             x_prev.append(x_prev_i)
             pred_x0.append(pred_x0_i)
-        return x_prev, pred_x0
\ No newline at end of file
+        return x_prev, pred_x0
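A minimal sketch (not part of the patch) of the contract the new progress_bar argument expects, assuming a Streamlit placeholder: st.empty() returns an object exposing the same .text(), .progress() and .success() methods that ddim_sampling now calls; total_steps is illustrative.

import streamlit as st

progress_bar = st.empty()
total_steps = 50

progress_bar.text("Generating samples...")
for i in range(total_steps):
    progress_bar.progress(i / total_steps)   # the sampler makes this call once per DDIM step
progress_bar.success("Sampling complete.")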
diff --git a/core/models/encoders/clip.py b/core/models/encoders/clip.py
index 864b910c160a25600ccc0bf6f48ca319319bc260..00b828919e7800c9a5658e7ac04b008d17619122 100644
--- a/core/models/encoders/clip.py
+++ b/core/models/encoders/clip.py
@@ -11,6 +11,7 @@ from einops import rearrange
 from transformers import CLIPTokenizer, CLIPTextModel
 from .clip_modules import CLIPProcessor, CLIPModel, CLIPTokenizer, CLIPConfig
 
+
 version = '0'
 symbol = 'clip'
 
@@ -75,6 +76,16 @@ class FrozenCLIP(AbstractEncoder):
         # A trick to get device
         return self.model.text_projection.weight.device
 
+    def freeze(self, modules):
+        for module in modules:
+            for param in module.parameters():
+                param.requires_grad = False
+
+    def unfreeze(self, modules):
+        for module in modules:
+            for param in module.parameters():
+                param.requires_grad = True
+
     def encode_text_pooled(self, text):
         batch_encoding = self.tokenizer(text, truncation=True, max_length=self.max_length, return_length=True,
                                         return_overflowing_tokens=False, padding="max_length", return_tensors="pt")
diff --git a/core/models/encoders/clip_modules/__init__.py b/core/models/encoders/clip_modules/__init__.py
index 1931d894f148d227831fc60e0dc22c704faaa0df..6086b7075ad6f9b6387295e353164095ed5ead4e 100644
--- a/core/models/encoders/clip_modules/__init__.py
+++ b/core/models/encoders/clip_modules/__init__.py
@@ -27,6 +27,7 @@ from transformers.utils import (
     is_vision_available,
 )
 
+
 _import_structure = {
     "configuration_clip": ["CLIP_PRETRAINED_CONFIG_ARCHIVE_MAP", "CLIPConfig", "CLIPTextConfig", "CLIPVisionConfig"],
     "tokenization_clip": ["CLIPTokenizer"],
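A small sketch (not part of the patch) of the freeze/unfreeze helpers added to FrozenCLIP above: they accept any iterable of nn.Module objects and toggle requires_grad on every parameter. The same loop is reproduced standalone on plain layers, since the real CLIP submodules are heavyweight to load.

import torch.nn as nn

def freeze(modules):
    # identical to FrozenCLIP.freeze, minus the instance binding
    for module in modules:
        for param in module.parameters():
            param.requires_grad = False

encoder_parts = [nn.Linear(768, 768), nn.Embedding(49408, 768)]
freeze(encoder_parts)
assert all(not p.requires_grad for m in encoder_parts for p in m.parameters())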
diff --git a/core/models/latent_diffusion/vae/optimus.py b/core/models/latent_diffusion/vae/optimus.py
index 1e1ee4cd0d776f2b5f7fbefabfe17ae732d8eb46..e112c1243bd94bb7ecf6b93392adf92a376e4976 100644
--- a/core/models/latent_diffusion/vae/optimus.py
+++ b/core/models/latent_diffusion/vae/optimus.py
@@ -111,9 +111,7 @@ class optimus_vae(nn.Module):
         return mu_expd + torch.mul(eps, std_expd)
 
     def forward(self, inputs, labels):
-
-        # pdb.set_trace()
-
+        # pdb.set_trace()
         attention_mask=(inputs > 0).float()
         # logger.info(inputs)
         # logger.info(attention_mask)
@@ -300,13 +298,10 @@ class optimus_vae(nn.Module):
             # pdb.set_trace()
 
             log_gen = log_gen.unsqueeze(0).contiguous().view(z.shape[0],-1)
-            # pdb.set_trace()
 
             rc_tmp.append(log_gen)
             ll_tmp.append(log_gen + log_prior - log_infer)
-
-
         log_prob_iw = log_sum_exp(torch.cat(ll_tmp, dim=-1), dim=-1) - math.log(nsamples)
         log_gen_iw = torch.mean(torch.cat(rc_tmp, dim=-1), dim=-1)
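The second hunk above tidies the importance-weighted (IW) log-likelihood estimate in optimus_vae. For readers of the diff, here is a minimal sketch of the quantity being computed, assuming log_sum_exp is the usual numerically stable helper (the repo defines its own; the shapes and names below are illustrative, not the repo's exact API):

    import math
    import torch

    def log_sum_exp(value, dim):
        # Numerically stable log(sum(exp(value), dim)): subtract the max
        # before exponentiating so large log-weights do not overflow.
        m, _ = torch.max(value, dim=dim, keepdim=True)
        return (m + (value - m).exp().sum(dim=dim, keepdim=True).log()).squeeze(dim)

    # ll collects log p(x|z_s) + log p(z_s) - log q(z_s|x) for S posterior draws.
    batch, nsamples = 4, 16
    ll = torch.randn(batch, nsamples)
    # IW estimate of log p(x): log(1/S * sum_s exp(ll_s))
    log_prob_iw = log_sum_exp(ll, dim=-1) - math.log(nsamples)

This mirrors the `log_prob_iw = log_sum_exp(torch.cat(ll_tmp, dim=-1), dim=-1) - math.log(nsamples)` line retained by the hunk.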
diff --git a/core/models/latent_diffusion/vae/optimus_modules/modeling_utils.py b/core/models/latent_diffusion/vae/optimus_modules/modeling_utils.py
index d16725f2138f3f0bb33cfe3ddb544063c5e6ee55..cb28ebe17bf57c0cb36f166e04874dbaa4b1e6ea 100644
--- a/core/models/latent_diffusion/vae/optimus_modules/modeling_utils.py
+++ b/core/models/latent_diffusion/vae/optimus_modules/modeling_utils.py
@@ -51,7 +51,6 @@ except ImportError:
         def forward(self, input):
             return input
 
-
 class PreTrainedModel(nn.Module):
     r""" Base class for all models.
diff --git a/core/models/latent_diffusion/vae/optimus_modules/optimus_bert.py b/core/models/latent_diffusion/vae/optimus_modules/optimus_bert.py
index b4f9137d1597165575b44c1dc08f7cb1d32dd557..c64db9b93ed75dfde507555cac6539a37214c054 100644
--- a/core/models/latent_diffusion/vae/optimus_modules/optimus_bert.py
+++ b/core/models/latent_diffusion/vae/optimus_modules/optimus_bert.py
@@ -52,6 +52,7 @@ BERT_PRETRAINED_MODEL_ARCHIVE_MAP = {
     'bert-base-cased-finetuned-mrpc': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-pytorch_model.bin",
 }
 
+
 def load_tf_weights_in_bert(model, config, tf_checkpoint_path):
     """ Load tf checkpoints in a pytorch model.
     """
@@ -61,7 +62,7 @@ def load_tf_weights_in_bert(model, config, tf_checkpoint_path):
         import tensorflow as tf
     except ImportError:
         logger.error("Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see "
-            "https://www.tensorflow.org/install/ for installation instructions.")
+                     "https://www.tensorflow.org/install/ for installation instructions.")
         raise
     tf_path = os.path.abspath(tf_checkpoint_path)
     logger.info("Converting TensorFlow checkpoint from {}".format(tf_path))
diff --git a/core/models/latent_diffusion/vae/optimus_modules/tokenization_utils.py b/core/models/latent_diffusion/vae/optimus_modules/tokenization_utils.py
index 3d0c7916bc2efb93b6526f1ee0e19c58208a008c..1e2cd59648d764d43f65073dba6c34b318dd4a6b 100644
--- a/core/models/latent_diffusion/vae/optimus_modules/tokenization_utils.py
+++ b/core/models/latent_diffusion/vae/optimus_modules/tokenization_utils.py
@@ -417,6 +417,7 @@ class PreTrainedTokenizer(object):
 
         return tokenizer
 
+
     def save_pretrained(self, save_directory):
         """ Save the tokenizer vocabulary files together with:
             - added tokens,
@@ -458,6 +459,7 @@ class PreTrainedTokenizer(object):
 
         return vocab_files + (special_tokens_map_file, added_tokens_file)
 
+
     def save_vocabulary(self, save_directory):
         """ Save the tokenizer vocabulary to a directory. This method does *NOT* save added tokens
             and special token mappings.
@@ -466,14 +468,17 @@ class PreTrainedTokenizer(object):
         """
         raise NotImplementedError
 
+
     def vocab_size(self):
         """ Size of the base vocabulary (without the added tokens) """
         raise NotImplementedError
 
+
     def __len__(self):
         """ Size of the full vocabulary with the added tokens """
         return self.vocab_size + len(self.added_tokens_encoder)
 
+
     def add_tokens(self, new_tokens):
         """ Add a list of new tokens to the tokenizer class.
            If the new tokens are not in the
@@ -513,6 +518,7 @@ class PreTrainedTokenizer(object):
 
         return len(to_add_tokens)
 
+
     def add_special_tokens(self, special_tokens_dict):
         """ Add a dictionary of special tokens (eos, pad, cls...) to the encoder and link them
diff --git a/core/models/sd.py b/core/models/sd.py
index d73577ebd2302b17b53a16b4e30b96cc54fff8be..e00e40d908c2b69e7fb14e12b7ab149374cc8e3f 100644
--- a/core/models/sd.py
+++ b/core/models/sd.py
@@ -277,6 +277,7 @@ class DDPM(nn.Module):
             loss = torch.nn.functional.mse_loss(target, pred, reduction='none')
         else:
             raise NotImplementedError("unknown loss type '{loss_type}'")
+
         return loss
 
     def p_losses(self, x_start, t, noise=None):
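Two notes on the sd.py hunk. First, the quoted error message lacks an f-string prefix upstream, so '{loss_type}' would be printed literally rather than interpolated; since that line is unchanged diff context, it is left as-is here. Second, for where the per-element MSE flows: with parameterization "eps" (as in the codi_2 config earlier in this diff), p_losses trains the UNet to regress the injected noise. A minimal sketch under that assumption; the function name, buffer names, and the model(x_noisy, t) call signature are illustrative, not the repo's exact API:

    import torch

    def p_losses_sketch(model, x_start, t, sqrt_ac, sqrt_omac):
        # q(x_t | x_0): corrupt clean latents with Gaussian noise at step t.
        noise = torch.randn_like(x_start)
        a = sqrt_ac[t].view(-1, 1, 1, 1)    # sqrt(alpha_cumprod_t)
        s = sqrt_omac[t].view(-1, 1, 1, 1)  # sqrt(1 - alpha_cumprod_t)
        x_noisy = a * x_start + s * noise
        # eps-parameterization: the network predicts the noise itself; the
        # per-element MSE (reduction='none', as in the hunk) is then reduced.
        pred = model(x_noisy, t)
        loss = torch.nn.functional.mse_loss(noise, pred, reduction='none')
        return loss.mean()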