Upload 225 files
This view is limited to 50 files because it contains too many changes.
- configs/model/audioldm.yaml +24 -0
- configs/model/clap.yaml +10 -0
- configs/model/clip.yaml +22 -0
- configs/model/codi.yaml +23 -0
- configs/model/openai_unet.yaml +85 -0
- configs/model/optimus.yaml +107 -0
- configs/model/prova.yaml +85 -0
- configs/model/sd.yaml +20 -0
- configs/model/thesis_model.yaml +21 -0
- core/__init__.py +0 -0
- core/__pycache__/__init__.cpython-38.pyc +0 -0
- core/__pycache__/cfg_helper.cpython-38.pyc +0 -0
- core/__pycache__/cfg_holder.cpython-38.pyc +0 -0
- core/__pycache__/sync.cpython-38.pyc +0 -0
- core/cfg_helper.py +665 -0
- core/cfg_holder.py +33 -0
- core/common/__pycache__/utils.cpython-38.pyc +0 -0
- core/common/registry.py +86 -0
- core/common/utils.py +412 -0
- core/models/__init__.py +4 -0
- core/models/__pycache__/__init__.cpython-38.pyc +0 -0
- core/models/__pycache__/codi.cpython-38.pyc +0 -0
- core/models/__pycache__/codi_2.cpython-38.pyc +0 -0
- core/models/__pycache__/dani_model.cpython-38.pyc +0 -0
- core/models/__pycache__/ema.cpython-38.pyc +0 -0
- core/models/__pycache__/model_module_infer.cpython-38.pyc +0 -0
- core/models/__pycache__/sd.cpython-38.pyc +0 -0
- core/models/codi.py +227 -0
- core/models/codi_2.py +221 -0
- core/models/common/__pycache__/get_model.cpython-38.pyc +0 -0
- core/models/common/__pycache__/get_optimizer.cpython-38.pyc +0 -0
- core/models/common/__pycache__/get_scheduler.cpython-38.pyc +0 -0
- core/models/common/__pycache__/utils.cpython-38.pyc +0 -0
- core/models/common/get_model.py +88 -0
- core/models/common/get_optimizer.py +50 -0
- core/models/common/get_scheduler.py +273 -0
- core/models/common/utils.py +310 -0
- core/models/dani_model.py +170 -0
- core/models/ddim/__pycache__/ddim.cpython-38.pyc +0 -0
- core/models/ddim/__pycache__/ddim_vd.cpython-38.pyc +0 -0
- core/models/ddim/__pycache__/diffusion_utils.cpython-38.pyc +0 -0
- core/models/ddim/ddim.py +224 -0
- core/models/ddim/ddim_vd.py +175 -0
- core/models/ddim/diffusion_utils.py +273 -0
- core/models/ema.py +76 -0
- core/models/encoders/__pycache__/clap.cpython-311.pyc +0 -0
- core/models/encoders/__pycache__/clap.cpython-38.pyc +0 -0
- core/models/encoders/__pycache__/clip.cpython-311.pyc +0 -0
- core/models/encoders/__pycache__/clip.cpython-38.pyc +0 -0
- core/models/encoders/clap.py +134 -0
configs/model/audioldm.yaml
ADDED
@@ -0,0 +1,24 @@
+########################
+# audioldm autoencoder #
+########################
+
+
+audioldm_autoencoder:
+  type: audioldm_autoencoder
+  args:
+    embed_dim: 8
+    monitor: val/rec_loss
+    ddconfig:
+      double_z: True
+      z_channels: 8
+      resolution: 256
+      downsample_time: False
+      in_channels: 1
+      out_ch: 1
+      ch: 128
+      ch_mult: [1, 2, 4]
+      num_res_blocks: 2
+      attn_resolutions: []
+      dropout: 0.0
+    lossconfig:
+      target: torch.nn.Identity

configs/model/clap.yaml
ADDED
@@ -0,0 +1,10 @@
+######################
+# clap audio encoder #
+######################
+
+
+clap_audio:
+  type: clap_audio
+  args:
+    amodel: "HTSAT-large"
+    joint_embed_shape: 768

configs/model/clip.yaml
ADDED
@@ -0,0 +1,22 @@
+##############################
+# clip vision & text encoder #
+##############################
+
+clip:
+  symbol: clip
+  args: {}
+
+clip_frozen:
+  super_cfg: clip
+  type: clip_frozen
+  args: {}
+
+clip_text:
+  super_cfg: clip
+  type: clip_text
+  args: {}
+
+clip_vision:
+  super_cfg: clip
+  type: clip_vision
+  args: {}

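The `clip_frozen`, `clip_text`, and `clip_vision` entries above inherit from the base `clip` entry through `super_cfg`; the merge itself is implemented by `model_cfg_bank` in core/cfg_helper.py further down in this diff. A minimal sketch of what the resolved entry looks like (illustrative only, not part of the commit):

# Hypothetical sketch: the dict returned by model_cfg_bank()('clip_frozen')
# after super_cfg resolution (field names follow the YAML above).
resolved_clip_frozen = {
    'symbol': 'clip',        # inherited from the 'clip' base entry
    'type': 'clip_frozen',   # set by the child entry
    'args': {},              # child args are update()d over the parent args
    'name': 'clip_frozen',   # added by model_cfg_bank at load time
}
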
configs/model/codi.yaml
ADDED
@@ -0,0 +1,23 @@
+########
+# CoDi #
+########
+
+codi:
+  type: codi
+  symbol: codi
+  find_unused_parameters: true
+  args:
+    audioldm_cfg: MODEL(audioldm_autoencoder)
+    autokl_cfg: MODEL(sd_autoencoder)
+    optimus_cfg: MODEL(optimus_vae)
+    clip_cfg: MODEL(clip_frozen)
+    clap_cfg: MODEL(clap_audio)
+    unet_config: MODEL(openai_unet_codi)
+    beta_linear_start: 0.00085
+    beta_linear_end: 0.012
+    timesteps: 1000
+    vision_scale_factor: 0.18215
+    text_scale_factor: 4.3108
+    audio_scale_factor: 0.9228
+    use_ema: false
+    parameterization: "eps"

configs/model/openai_unet.yaml
ADDED
@@ -0,0 +1,85 @@
+openai_unet_sd:
+  type: openai_unet
+  args:
+    image_size: null # no use
+    in_channels: 4
+    out_channels: 4
+    model_channels: 320
+    attention_resolutions: [ 4, 2, 1 ]
+    num_res_blocks: [ 2, 2, 2, 2 ]
+    channel_mult: [ 1, 2, 4, 4 ]
+    num_heads: 8
+    use_spatial_transformer: True
+    transformer_depth: 1
+    context_dim: 768
+    use_checkpoint: True
+    legacy: False
+
+openai_unet_dual_context:
+  super_cfg: openai_unet_sd
+  type: openai_unet_dual_context
+
+########################
+# Code cleaned version #
+########################
+
+openai_unet_2d_audio:
+  type: openai_unet_2d
+  args:
+    input_channels: 8
+    model_channels: 192
+    output_channels: 8
+    num_noattn_blocks: [ 2, 2, 2, 2 ]
+    channel_mult: [ 1, 2, 4, 4 ]
+    with_attn: [true, true, true, false]
+    channel_mult_connector: [1, 2, 4]
+    num_noattn_blocks_connector: [1, 1, 1]
+    with_connector: [True, True, True, False]
+    connector_output_channel: 1280
+    num_heads: 8
+    context_dim: 768
+    use_checkpoint: False
+
+openai_unet_2d:
+  type: openai_unet_2d
+  args:
+    input_channels: 4
+    model_channels: 320
+    output_channels: 4
+    num_noattn_blocks: [ 2, 2, 2, 2 ]
+    channel_mult: [ 1, 2, 4, 4 ]
+    with_attn: [true, true, true, false]
+    channel_mult_connector: [1, 2, 4]
+    num_noattn_blocks_connector: [1, 1, 1]
+    with_connector: [True, True, True, False]
+    connector_output_channel: 1280
+    num_heads: 8
+    context_dim: 768
+    use_checkpoint: True
+    use_video_architecture: True
+
+openai_unet_0dmd:
+  type: openai_unet_0dmd
+  args:
+    input_channels: 768
+    model_channels: 320
+    output_channels: 768
+    num_noattn_blocks: [ 2, 2, 2, 2 ]
+    channel_mult: [ 1, 2, 4, 4 ]
+    second_dim: [ 4, 4, 4, 4 ]
+    with_attn: [true, true, true, false]
+    num_noattn_blocks_connector: [1, 1, 1]
+    second_dim_connector: [4, 4, 4]
+    with_connector: [True, True, True, False]
+    connector_output_channel: 1280
+    num_heads: 8
+    context_dim: 768
+    use_checkpoint: True
+
+openai_unet_codi:
+  type: openai_unet_codi
+  args:
+    unet_image_cfg: MODEL(openai_unet_2d)
+    unet_text_cfg: MODEL(openai_unet_0dmd)
+    unet_audio_cfg: MODEL(openai_unet_2d_audio)
+    model_type: ['video', 'image', 'text']

configs/model/optimus.yaml
ADDED
@@ -0,0 +1,107 @@
+
+optimus:
+  symbol: optimus
+  find_unused_parameters: false
+  args: {}
+
+optimus_bert_encoder:
+  super_cfg: optimus
+  type: optimus_bert_connector
+  # pth: pretrained/optimus_bert_encoder.pth
+  args:
+    config:
+      architectures:
+        - BertForMaskedLM
+      attention_probs_dropout_prob: 0.1
+      finetuning_task: null
+      hidden_act: gelu
+      hidden_dropout_prob: 0.1
+      hidden_size: 768
+      initializer_range: 0.02
+      intermediate_size: 3072
+      layer_norm_eps: 1.e-12
+      max_position_embeddings: 512
+      num_attention_heads: 12
+      num_hidden_layers: 12
+      num_labels: 2
+      output_attentions: false
+      output_hidden_states: false
+      pruned_heads: {}
+      torchscript: false
+      type_vocab_size: 2
+      vocab_size: 28996
+    latent_size: 768
+
+optimus_bert_tokenizer:
+  super_cfg: optimus
+  type: optimus_bert_tokenizer
+  args:
+    do_lower_case: false
+    max_len: 512
+    vocab_file: core/models/latent_diffusion/vae/optimus_modules/vocab/bert-base-cased-vocab.txt
+
+optimus_gpt2_decoder:
+  super_cfg: optimus
+  type: optimus_gpt2_connector
+  # pth: pretrained/optimus_gpt2_decoder.pth
+  args:
+    config:
+      architectures:
+        - GPT2LMHeadModel
+      attn_pdrop: 0.1
+      embd_pdrop: 0.1
+      finetuning_task: null
+      hidden_size: 768
+      initializer_range: 0.02
+      latent_size: 768
+      layer_norm_epsilon: 1.e-05
+      max_position_embeddings: 1024
+      n_ctx: 1024
+      n_embd: 768
+      n_head: 12
+      n_layer: 12
+      n_positions: 1024
+      num_attention_heads: 12
+      num_hidden_layers: 12
+      num_labels: 1
+      output_attentions: false
+      output_hidden_states: false
+      pretrained_config_archive_map:
+        gpt2: https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-config.json
+        gpt2-medium: https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-config.json
+        gpt2-large: https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-config.json
+      pruned_heads: {}
+      resid_pdrop: 0.1
+      summary_activation: null
+      summary_first_dropout: 0.1
+      summary_proj_to_labels: true
+      summary_type: cls_index
+      summary_use_proj: true
+      torchscript: false
+      vocab_size: 50260
+
+optimus_gpt2_tokenizer:
+  super_cfg: optimus
+  type: optimus_gpt2_tokenizer
+  args:
+    do_lower_case: false
+    max_len: 1024
+    vocab_file: core/models/latent_diffusion/vae/optimus_modules/vocab/gpt2-vocab.json
+    merges_file: core/models/latent_diffusion/vae/optimus_modules/vocab/gpt2-merges.txt
+
+optimus_vae:
+  super_cfg: optimus
+  type: optimus_vae
+  pth: pretrained/optimus-vae.pth
+  args:
+    encoder: MODEL(optimus_bert_encoder)
+    decoder: MODEL(optimus_gpt2_decoder)
+    tokenizer_encoder: MODEL(optimus_bert_tokenizer)
+    tokenizer_decoder: MODEL(optimus_gpt2_tokenizer)
+    args:
+      latent_size: 768
+      beta: 1.0
+      fb_mode: 0
+      length_weighted_loss: false
+      dim_target_kl: 3.0
+

configs/model/prova.yaml
ADDED
@@ -0,0 +1,85 @@
+openai_unet_sd:
+  type: openai_unet
+  args:
+    image_size: null # no use
+    in_channels: 4
+    out_channels: 4
+    model_channels: 320
+    attention_resolutions: [ 4, 2, 1 ]
+    num_res_blocks: [ 2, 2, 2, 2 ]
+    channel_mult: [ 1, 2, 4, 4 ]
+    num_heads: 8
+    use_spatial_transformer: True
+    transformer_depth: 1
+    context_dim: 768
+    use_checkpoint: True
+    legacy: False
+
+openai_unet_dual_context:
+  super_cfg: openai_unet_sd
+  type: openai_unet_dual_context
+
+########################
+# Code cleaned version #
+########################
+
+openai_unet_2d_audio:
+  type: openai_unet_2d
+  args:
+    input_channels: 8
+    model_channels: 192
+    output_channels: 8
+    num_noattn_blocks: [ 2, 2, 2, 2 ]
+    channel_mult: [ 1, 2, 4, 4 ]
+    with_attn: [true, true, true, false]
+    channel_mult_connector: [1, 2, 4]
+    num_noattn_blocks_connector: [1, 1, 1]
+    with_connector: [True, True, True, False]
+    connector_output_channel: 1280
+    num_heads: 8
+    context_dim: 768
+    use_checkpoint: False
+
+openai_unet_2d:
+  type: openai_unet_2d
+  args:
+    input_channels: 4
+    model_channels: 320
+    output_channels: 4
+    num_noattn_blocks: [ 2, 2, 2, 2 ]
+    channel_mult: [ 1, 2, 4, 4 ]
+    with_attn: [true, true, true, false]
+    channel_mult_connector: [1, 2, 4]
+    num_noattn_blocks_connector: [1, 1, 1]
+    with_connector: [True, True, True, False]
+    connector_output_channel: 1280
+    num_heads: 8
+    context_dim: 768
+    use_checkpoint: True
+    use_video_architecture: True
+
+openai_unet_0dmd:
+  type: openai_unet_0dmd
+  args:
+    input_channels: 768
+    model_channels: 320
+    output_channels: 768
+    num_noattn_blocks: [ 2, 2, 2, 2 ]
+    channel_mult: [ 1, 2, 4, 4 ]
+    second_dim: [ 4, 4, 4, 4 ]
+    with_attn: [true, true, true, false]
+    num_noattn_blocks_connector: [1, 1, 1]
+    second_dim_connector: [4, 4, 4]
+    with_connector: [True, True, True, False]
+    connector_output_channel: 1280
+    num_heads: 8
+    context_dim: 768
+    use_checkpoint: True
+
+prova:
+  type: prova
+  args:
+    unet_frontal_cfg: MODEL(openai_unet_2d)
+    unet_lateral_cfg: MODEL(openai_unet_2d)
+    unet_text_cfg: MODEL(openai_unet_0dmd)
+    model_type: ['text']

configs/model/sd.yaml
ADDED
@@ -0,0 +1,20 @@
+sd_autoencoder:
+  type: autoencoderkl
+  args:
+    embed_dim: 4
+    monitor: val/rec_loss
+    ddconfig:
+      double_z: true
+      z_channels: 4
+      resolution: 256
+      in_channels: 3
+      out_ch: 3
+      ch: 128
+      ch_mult: [1, 2, 4, 4]
+      num_res_blocks: 2
+      attn_resolutions: []
+      dropout: 0.0
+      use_video_arch: true
+    lossconfig:
+      target: torch.nn.Identity
+  pth: pretrained/kl-f8.pth

configs/model/thesis_model.yaml
ADDED
@@ -0,0 +1,21 @@
+########
+# CoDi #
+########
+
+thesis_model:
+  type: thesis_model
+  symbol: thesis_model
+  find_unused_parameters: true
+  args:
+    autokl_cfg: MODEL(sd_autoencoder)
+    optimus_cfg: MODEL(optimus_vae)
+    clip_cfg: MODEL(clip_frozen)
+    unet_config: MODEL(prova)
+    beta_linear_start: 0.00085
+    beta_linear_end: 0.012
+    timesteps: 1000
+    vision_scale_factor: 0.18215
+    text_scale_factor: 4.3108
+    audio_scale_factor: 0.9228
+    use_ema: false
+    parameterization: "eps"

core/__init__.py
ADDED
File without changes

core/__pycache__/__init__.cpython-38.pyc
ADDED
Binary file (149 Bytes).

core/__pycache__/cfg_helper.cpython-38.pyc
ADDED
Binary file (13 kB).

core/__pycache__/cfg_holder.cpython-38.pyc
ADDED
Binary file (1.21 kB).

core/__pycache__/sync.cpython-38.pyc
ADDED
Binary file (6.24 kB).

core/cfg_helper.py
ADDED
@@ -0,0 +1,665 @@
+import os
+import os.path as osp
+import shutil
+import copy
+import time
+import pprint
+import numpy as np
+import torch
+import argparse
+import json
+import yaml
+from easydict import EasyDict as edict
+
+from core.models import get_model
+
+############
+# cfg_bank #
+############
+
+
+def cfg_solvef(cmd, root):
+    if not isinstance(cmd, str):
+        return cmd
+
+    if cmd.find('SAME')==0:
+        zoom = root
+        p = cmd[len('SAME'):].strip('()').split('.')
+        p = [pi.strip() for pi in p]
+        for pi in p:
+            try:
+                pi = int(pi)
+            except:
+                pass
+
+            try:
+                zoom = zoom[pi]
+            except:
+                return cmd
+        return cfg_solvef(zoom, root)
+
+    if cmd.find('SEARCH')==0:
+        zoom = root
+        p = cmd[len('SEARCH'):].strip('()').split('.')
+        p = [pi.strip() for pi in p]
+        find = True
+        # Depth first search
+        for pi in p:
+            try:
+                pi = int(pi)
+            except:
+                pass
+
+            try:
+                zoom = zoom[pi]
+            except:
+                find = False
+                break
+
+        if find:
+            return cfg_solvef(zoom, root)
+        else:
+            if isinstance(root, dict):
+                for ri in root:
+                    rv = cfg_solvef(cmd, root[ri])
+                    if rv != cmd:
+                        return rv
+            if isinstance(root, list):
+                for ri in root:
+                    rv = cfg_solvef(cmd, ri)
+                    if rv != cmd:
+                        return rv
+            return cmd
+
+    if cmd.find('MODEL')==0:
+        goto = cmd[len('MODEL'):].strip('()')
+        return model_cfg_bank()(goto)
+
+    if cmd.find('DATASET')==0:
+        goto = cmd[len('DATASET'):].strip('()')
+        return dataset_cfg_bank()(goto)
+
+    return cmd
+
+
+def cfg_solve(cfg, cfg_root):
+    # The function solve cfg element such that
+    # all sorrogate input are settled.
+    # (i.e. SAME(***) )
+    if isinstance(cfg, list):
+        for i in range(len(cfg)):
+            if isinstance(cfg[i], (list, dict)):
+                cfg[i] = cfg_solve(cfg[i], cfg_root)
+            else:
+                cfg[i] = cfg_solvef(cfg[i], cfg_root)
+    if isinstance(cfg, dict):
+        for k in cfg:
+            if isinstance(cfg[k], (list, dict)):
+                cfg[k] = cfg_solve(cfg[k], cfg_root)
+            else:
+                cfg[k] = cfg_solvef(cfg[k], cfg_root)
+    return cfg
+
+
+class model_cfg_bank(object):
+    def __init__(self):
+        self.cfg_dir = osp.join('configs', 'model')
+        self.cfg_bank = edict()
+
+    def __call__(self, name):
+        if name not in self.cfg_bank:
+            cfg_path = self.get_yaml_path(name)
+            with open(cfg_path, 'r') as f:
+                cfg_new = yaml.load(
+                    f, Loader=yaml.FullLoader)
+            cfg_new = edict(cfg_new)
+            self.cfg_bank.update(cfg_new)
+
+        cfg = self.cfg_bank[name]
+        cfg.name = name
+        if 'super_cfg' not in cfg:
+            cfg = cfg_solve(cfg, cfg)
+            self.cfg_bank[name] = cfg
+            return copy.deepcopy(cfg)
+
+        super_cfg = self.__call__(cfg.super_cfg)
+        # unlike other field,
+        # args will not be replaced but update.
+        if 'args' in cfg:
+            if 'args' in super_cfg:
+                super_cfg.args.update(cfg.args)
+            else:
+                super_cfg.args = cfg.args
+            cfg.pop('args')
+
+        super_cfg.update(cfg)
+        super_cfg.pop('super_cfg')
+        cfg = super_cfg
+        try:
+            delete_args = cfg.pop('delete_args')
+        except:
+            delete_args = []
+
+        for dargs in delete_args:
+            cfg.args.pop(dargs)
+
+        cfg = cfg_solve(cfg, cfg)
+        self.cfg_bank[name] = cfg
+        return copy.deepcopy(cfg)
+
+    def get_yaml_path(self, name):
+        if name.find('openai_unet')==0:
+            return osp.join(
+                self.cfg_dir, 'openai_unet.yaml')
+        elif name.find('prova')==0:
+            return osp.join(
+                self.cfg_dir, 'prova.yaml')
+        elif name.find('audioldm')==0:
+            return osp.join(
+                self.cfg_dir, 'audioldm.yaml')
+        elif name.find('clip')==0:
+            return osp.join(
+                self.cfg_dir, 'clip.yaml')
+        elif name.find('sd')==0:
+            return osp.join(
+                self.cfg_dir, 'sd.yaml')
+        elif name.find('codi')==0:
+            return osp.join(
+                self.cfg_dir, 'codi.yaml')
+        elif name.find('thesis_model')==0:
+            return osp.join(
+                self.cfg_dir, 'thesis_model.yaml')
+        elif name.find('clap')==0:
+            return osp.join(
+                self.cfg_dir, 'clap.yaml')
+        elif name.find('optimus')==0:
+            return osp.join(
+                self.cfg_dir, 'optimus.yaml')
+        else:
+            raise ValueError
+
+
+class dataset_cfg_bank(object):
+    def __init__(self):
+        self.cfg_dir = osp.join('configs', 'dataset')
+        self.cfg_bank = edict()
+
+    def __call__(self, name):
+        if name not in self.cfg_bank:
+            cfg_path = self.get_yaml_path(name)
+            with open(cfg_path, 'r') as f:
+                cfg_new = yaml.load(
+                    f, Loader=yaml.FullLoader)
+            cfg_new = edict(cfg_new)
+            self.cfg_bank.update(cfg_new)
+
+        cfg = self.cfg_bank[name]
+        cfg.name = name
+        if cfg.get('super_cfg', None) is None:
+            cfg = cfg_solve(cfg, cfg)
+            self.cfg_bank[name] = cfg
+            return copy.deepcopy(cfg)
+
+        super_cfg = self.__call__(cfg.super_cfg)
+        super_cfg.update(cfg)
+        cfg = super_cfg
+        cfg.super_cfg = None
+        try:
+            delete = cfg.pop('delete')
+        except:
+            delete = []
+
+        for dargs in delete:
+            cfg.pop(dargs)
+
+        cfg = cfg_solve(cfg, cfg)
+        self.cfg_bank[name] = cfg
+        return copy.deepcopy(cfg)
+
+    def get_yaml_path(self, name):
+        if name.find('cityscapes')==0:
+            return osp.join(
+                self.cfg_dir, 'cityscapes.yaml')
+        elif name.find('div2k')==0:
+            return osp.join(
+                self.cfg_dir, 'div2k.yaml')
+        elif name.find('gandiv2k')==0:
+            return osp.join(
+                self.cfg_dir, 'gandiv2k.yaml')
+        elif name.find('srbenchmark')==0:
+            return osp.join(
+                self.cfg_dir, 'srbenchmark.yaml')
+        elif name.find('imagedir')==0:
+            return osp.join(
+                self.cfg_dir, 'imagedir.yaml')
+        elif name.find('places2')==0:
+            return osp.join(
+                self.cfg_dir, 'places2.yaml')
+        elif name.find('ffhq')==0:
+            return osp.join(
+                self.cfg_dir, 'ffhq.yaml')
+        elif name.find('imcpt')==0:
+            return osp.join(
+                self.cfg_dir, 'imcpt.yaml')
+        elif name.find('texture')==0:
+            return osp.join(
+                self.cfg_dir, 'texture.yaml')
+        elif name.find('openimages')==0:
+            return osp.join(
+                self.cfg_dir, 'openimages.yaml')
+        elif name.find('laion2b')==0:
+            return osp.join(
+                self.cfg_dir, 'laion2b.yaml')
+        elif name.find('laionart')==0:
+            return osp.join(
+                self.cfg_dir, 'laionart.yaml')
+        elif name.find('celeba')==0:
+            return osp.join(
+                self.cfg_dir, 'celeba.yaml')
+        elif name.find('coyo')==0:
+            return osp.join(
+                self.cfg_dir, 'coyo.yaml')
+        elif name.find('pafc')==0:
+            return osp.join(
+                self.cfg_dir, 'pafc.yaml')
+        elif name.find('coco')==0:
+            return osp.join(
+                self.cfg_dir, 'coco.yaml')
+        else:
+            raise ValueError
+
+
+class experiment_cfg_bank(object):
+    def __init__(self):
+        self.cfg_dir = osp.join('configs', 'experiment')
+        self.cfg_bank = edict()
+
+    def __call__(self, name):
+        if name not in self.cfg_bank:
+            cfg_path = self.get_yaml_path(name)
+            with open(cfg_path, 'r') as f:
+                cfg = yaml.load(
+                    f, Loader=yaml.FullLoader)
+            cfg = edict(cfg)
+
+        cfg = cfg_solve(cfg, cfg)
+        cfg = cfg_solve(cfg, cfg)
+        # twice for SEARCH
+        self.cfg_bank[name] = cfg
+        return copy.deepcopy(cfg)
+
+    def get_yaml_path(self, name):
+        return osp.join(
+            self.cfg_dir, name+'.yaml')
+
+
+def load_cfg_yaml(path):
+    if osp.isfile(path):
+        cfg_path = path
+    elif osp.isfile(osp.join('configs', 'experiment', path)):
+        cfg_path = osp.join('configs', 'experiment', path)
+    elif osp.isfile(osp.join('configs', 'experiment', path+'.yaml')):
+        cfg_path = osp.join('configs', 'experiment', path+'.yaml')
+    else:
+        assert False, 'No such config!'
+
+    with open(cfg_path, 'r') as f:
+        cfg = yaml.load(f, Loader=yaml.FullLoader)
+    cfg = edict(cfg)
+    cfg = cfg_solve(cfg, cfg)
+    cfg = cfg_solve(cfg, cfg)
+    return cfg
+
+##############
+# cfg_helper #
+##############
+
+
+def get_experiment_id(ref=None):
+    if ref is None:
+        time.sleep(0.5)
+        return int(time.time()*100)
+    else:
+        try:
+            return int(ref)
+        except:
+            pass
+
+        _, ref = osp.split(ref)
+        ref = ref.split('_')[0]
+        try:
+            return int(ref)
+        except:
+            assert False, 'Invalid experiment ID!'
+
+
+def record_resume_cfg(path):
+    cnt = 0
+    while True:
+        if osp.exists(path+'.{:04d}'.format(cnt)):
+            cnt += 1
+            continue
+        shutil.copyfile(path, path+'.{:04d}'.format(cnt))
+        break
+
+
+def get_command_line_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--debug', action='store_true', default=False)
+    parser.add_argument('--config', type=str)
+    parser.add_argument('--gpu', nargs='+', type=int)
+
+    parser.add_argument('--node_rank', type=int, default=0)
+    parser.add_argument('--nodes', type=int, default=1)
+    parser.add_argument('--addr', type=str, default='127.0.0.1')
+    parser.add_argument('--port', type=int, default=11233)
+
+    parser.add_argument('--signature', nargs='+', type=str)
+    parser.add_argument('--seed', type=int)
+
+    parser.add_argument('--eval', type=str)
+    parser.add_argument('--eval_subdir', type=str)
+    parser.add_argument('--pretrained', type=str)
+
+    parser.add_argument('--resume_dir', type=str)
+    parser.add_argument('--resume_step', type=int)
+    parser.add_argument('--resume_weight', type=str)
+
+    args = parser.parse_args()
+
+    # Special handling the resume
+    if args.resume_dir is not None:
+        cfg = edict()
+        cfg.env = edict()
+        cfg.env.debug = args.debug
+        cfg.env.resume = edict()
+        cfg.env.resume.dir = args.resume_dir
+        cfg.env.resume.step = args.resume_step
+        cfg.env.resume.weight = args.resume_weight
+        return cfg
+
+    cfg = load_cfg_yaml(args.config)
+    cfg.env.debug = args.debug
+    cfg.env.gpu_device = [0] if args.gpu is None else list(args.gpu)
+    cfg.env.master_addr = args.addr
+    cfg.env.master_port = args.port
+    cfg.env.dist_url = 'tcp://{}:{}'.format(args.addr, args.port)
+    cfg.env.node_rank = args.node_rank
+    cfg.env.nodes = args.nodes
+
+    istrain = False if args.eval is not None else True
+    isdebug = cfg.env.debug
+
+    if istrain:
+        if isdebug:
+            cfg.env.experiment_id = 999999999999
+            cfg.train.signature = ['debug']
+        else:
+            cfg.env.experiment_id = get_experiment_id()
+            if args.signature is not None:
+                cfg.train.signature = args.signature
+    else:
+        if 'train' in cfg:
+            cfg.pop('train')
+        cfg.env.experiment_id = get_experiment_id(args.eval)
+        if args.signature is not None:
+            cfg.eval.signature = args.signature
+
+        if isdebug and (args.eval is None):
+            cfg.env.experiment_id = 999999999999
+            cfg.eval.signature = ['debug']
+
+    if args.eval_subdir is not None:
+        if isdebug:
+            cfg.eval.eval_subdir = 'debug'
+        else:
+            cfg.eval.eval_subdir = args.eval_subdir
+    if args.pretrained is not None:
+        cfg.eval.pretrained = args.pretrained
+        # The override pretrained over the setting in cfg.model
+    if args.seed is not None:
+        cfg.env.rnd_seed = args.seed
+    return cfg
+
+
+def cfg_initiates(cfg):
+    cfge = cfg.env
+    isdebug = cfge.debug
+    isresume = 'resume' in cfge
+    istrain = 'train' in cfg
+    haseval = 'eval' in cfg
+    cfgt = cfg.train if istrain else None
+    cfgv = cfg.eval if haseval else None
+
+    ###############################
+    # get some environment params #
+    ###############################
+
+    cfge.computer = os.uname()
+    cfge.torch_version = str(torch.__version__)
+
+    ##########
+    # resume #
+    ##########
+
+    if isresume:
+        resume_cfg_path = osp.join(cfge.resume.dir, 'config.yaml')
+        record_resume_cfg(resume_cfg_path)
+        with open(resume_cfg_path, 'r') as f:
+            cfg_resume = yaml.load(f, Loader=yaml.FullLoader)
+        cfg_resume = edict(cfg_resume)
+        cfg_resume.env.update(cfge)
+        cfg = cfg_resume
+        cfge = cfg.env
+        log_file = cfg.train.log_file
+
+        print('')
+        print('##########')
+        print('# resume #')
+        print('##########')
+        print('')
+        with open(log_file, 'a') as f:
+            print('', file=f)
+            print('##########', file=f)
+            print('# resume #', file=f)
+            print('##########', file=f)
+            print('', file=f)
+
+        pprint.pprint(cfg)
+        with open(log_file, 'a') as f:
+            pprint.pprint(cfg, f)
+
+    ####################
+    # node distributed #
+    ####################
+
+    if cfg.env.master_addr!='127.0.0.1':
+        os.environ['MASTER_ADDR'] = cfge.master_addr
+        os.environ['MASTER_PORT'] = '{}'.format(cfge.master_port)
+        if cfg.env.dist_backend=='nccl':
+            os.environ['NCCL_SOCKET_FAMILY'] = 'AF_INET'
+        if cfg.env.dist_backend=='gloo':
+            os.environ['GLOO_SOCKET_FAMILY'] = 'AF_INET'
+
+    #######################
+    # cuda visible device #
+    #######################
+
+    os.environ["CUDA_VISIBLE_DEVICES"] = ','.join(
+        [str(gid) for gid in cfge.gpu_device])
+
+    #####################
+    # return resume cfg #
+    #####################
+
+    if isresume:
+        return cfg
+
+    #############################################
+    # some misc setting that not need in resume #
+    #############################################
+
+    cfgm = cfg.model
+    cfge.gpu_count = len(cfge.gpu_device)
+
+    ##########################################
+    # align batch size and num worker config #
+    ##########################################
+
+    gpu_n = cfge.gpu_count * cfge.nodes
+
+    def align_batch_size(bs, bs_per_gpu):
+        assert (bs is not None) or (bs_per_gpu is not None)
+        bs = bs_per_gpu * gpu_n if bs is None else bs
+        bs_per_gpu = bs // gpu_n if bs_per_gpu is None else bs_per_gpu
+        assert (bs == bs_per_gpu * gpu_n)
+        return bs, bs_per_gpu
+
+    if istrain:
+        cfgt.batch_size, cfgt.batch_size_per_gpu = \
+            align_batch_size(cfgt.batch_size, cfgt.batch_size_per_gpu)
+        cfgt.dataset_num_workers, cfgt.dataset_num_workers_per_gpu = \
+            align_batch_size(cfgt.dataset_num_workers, cfgt.dataset_num_workers_per_gpu)
+    if haseval:
+        cfgv.batch_size, cfgv.batch_size_per_gpu = \
+            align_batch_size(cfgv.batch_size, cfgv.batch_size_per_gpu)
+        cfgv.dataset_num_workers, cfgv.dataset_num_workers_per_gpu = \
+            align_batch_size(cfgv.dataset_num_workers, cfgv.dataset_num_workers_per_gpu)
+
+    ##################
+    # create log dir #
+    ##################
+
+    if istrain:
+        if not isdebug:
+            sig = cfgt.get('signature', [])
+            version = get_model().get_version(cfgm.type)
+            sig = sig + ['v{}'.format(version), 's{}'.format(cfge.rnd_seed)]
+        else:
+            sig = ['debug']
+
+        log_dir = [
+            cfge.log_root_dir,
+            '{}_{}'.format(cfgm.symbol, cfgt.dataset.symbol),
+            '_'.join([str(cfge.experiment_id)] + sig)
+        ]
+        log_dir = osp.join(*log_dir)
+        log_file = osp.join(log_dir, 'train.log')
+        if not osp.exists(log_file):
+            os.makedirs(osp.dirname(log_file))
+        cfgt.log_dir = log_dir
+        cfgt.log_file = log_file
+
+        if haseval:
+            cfgv.log_dir = log_dir
+            cfgv.log_file = log_file
+    else:
+        model_symbol = cfgm.symbol
+        if cfgv.get('dataset', None) is None:
+            dataset_symbol = 'nodataset'
+        else:
+            dataset_symbol = cfgv.dataset.symbol
+
+        log_dir = osp.join(cfge.log_root_dir, '{}_{}'.format(model_symbol, dataset_symbol))
+        exp_dir = search_experiment_folder(log_dir, cfge.experiment_id)
+        if exp_dir is None:
+            if not isdebug:
+                sig = cfgv.get('signature', []) + ['evalonly']
+            else:
+                sig = ['debug']
+            exp_dir = '_'.join([str(cfge.experiment_id)] + sig)
+
+        eval_subdir = cfgv.get('eval_subdir', None)
+        # override subdir in debug mode (if eval_subdir is set)
+        eval_subdir = 'debug' if (eval_subdir is not None) and isdebug else eval_subdir
+
+        if eval_subdir is not None:
+            log_dir = osp.join(log_dir, exp_dir, eval_subdir)
+        else:
+            log_dir = osp.join(log_dir, exp_dir)
+
+        disable_log_override = cfgv.get('disable_log_override', False)
+        if osp.isdir(log_dir):
+            if disable_log_override:
+                assert False, 'Override an exsited log_dir is disabled at [{}]'.format(log_dir)
+        else:
+            os.makedirs(log_dir)
+
+        log_file = osp.join(log_dir, 'eval.log')
+        cfgv.log_dir = log_dir
+        cfgv.log_file = log_file
+
+    ######################
+    # print and save cfg #
+    ######################
+
+    pprint.pprint(cfg)
+    with open(log_file, 'w') as f:
+        pprint.pprint(cfg, f)
+    with open(osp.join(log_dir, 'config.yaml'), 'w') as f:
+        yaml.dump(edict_2_dict(cfg), f)
+
+    #############
+    # save code #
+    #############
+
+    save_code = False
+    if istrain:
+        save_code = cfgt.get('save_code', False)
+    elif haseval:
+        save_code = cfgv.get('save_code', False)
+
+    if save_code:
+        codedir = osp.join(log_dir, 'code')
+        if osp.exists(codedir):
+            shutil.rmtree(codedir)
+        for d in ['configs', 'lib']:
+            fromcodedir = d
+            tocodedir = osp.join(codedir, d)
+            shutil.copytree(
+                fromcodedir, tocodedir,
+                ignore=shutil.ignore_patterns(
+                    '*__pycache__*', '*build*'))
+        for codei in os.listdir('.'):
+            if osp.splitext(codei)[1] == 'py':
+                shutil.copy(codei, codedir)
+
+    #######################
+    # set matplotlib mode #
+    #######################
+
+    if 'matplotlib_mode' in cfge:
+        try:
+            matplotlib.use(cfge.matplotlib_mode)
+        except:
+            print('Warning: matplotlib mode [{}] failed to be set!'.format(cfge.matplotlib_mode))
+
+    return cfg
+
+
+def edict_2_dict(x):
+    if isinstance(x, dict):
+        xnew = {}
+        for k in x:
+            xnew[k] = edict_2_dict(x[k])
+        return xnew
+    elif isinstance(x, list):
+        xnew = []
+        for i in range(len(x)):
+            xnew.append( edict_2_dict(x[i]) )
+        return xnew
+    else:
+        return x
+
+
+def search_experiment_folder(root, exid):
+    target = None
+    for fi in os.listdir(root):
+        if not osp.isdir(osp.join(root, fi)):
+            continue
+        if int(fi.split('_')[0]) == exid:
+            if target is not None:
+                return None # duplicated
+            elif target is None:
+                target = fi
+    return target

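To make the macro machinery above concrete, here is a minimal usage sketch (not part of the commit) of how `model_cfg_bank` resolves a config entry: `MODEL(...)` placeholders are expanded by `cfg_solvef`, and `super_cfg` chains are merged before `cfg_solve` runs over the result. It assumes the package's dependencies are installed and the snippet is run from the repository root so that configs/model/*.yaml can be found.

# Hypothetical usage sketch of the config bank.
from core.cfg_helper import model_cfg_bank

cfg = model_cfg_bank()('codi')       # loads and resolves configs/model/codi.yaml
# The 'MODEL(clip_frozen)' placeholder has been replaced by the resolved
# clip_frozen entry, which in turn inherited its remaining fields from 'clip'.
print(cfg.args.clip_cfg.type)        # -> 'clip_frozen'
print(cfg.args.unet_config.type)     # -> 'openai_unet_codi'
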
core/cfg_holder.py
ADDED
@@ -0,0 +1,33 @@
+import copy
+
+
+def singleton(class_):
+    instances = {}
+
+    def getinstance(*args, **kwargs):
+        if class_ not in instances:
+            instances[class_] = class_(*args, **kwargs)
+        return instances[class_]
+    return getinstance
+
+##############
+# cfg_holder #
+##############
+
+
+@singleton
+class cfg_unique_holder(object):
+    def __init__(self):
+        self.cfg = None
+        # this is use to track the main codes.
+        self.code = set()
+
+    def save_cfg(self, cfg):
+        self.cfg = copy.deepcopy(cfg)
+
+    def add_code(self, code):
+        """
+        A new main code is reached and
+        its name is added.
+        """
+        self.code.add(code)

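Because of the `singleton` decorator, every call to `cfg_unique_holder()` returns the same object, so a config saved once is visible everywhere in the process. A minimal usage sketch (not part of the commit):

# Hypothetical usage sketch of the process-wide config holder.
from core.cfg_holder import cfg_unique_holder

cfg_unique_holder().save_cfg({'model': 'codi'})       # store a deep-copied config
assert cfg_unique_holder().cfg == {'model': 'codi'}   # same instance everywhere
cfg_unique_holder().add_code('train')                 # record that a main code ran
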
core/common/__pycache__/utils.cpython-38.pyc
ADDED
Binary file (11.3 kB).

core/common/registry.py
ADDED
@@ -0,0 +1,86 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from argparse import Namespace
+from typing import Union
+
+from hydra.core.config_store import ConfigStore
+from omegaconf import DictConfig
+
+REGISTRIES = {}
+
+
+def setup_registry(registry_name: str,
+                   base_class=None,
+                   default=None,
+                   required=False):
+    assert registry_name.startswith('--')
+    registry_name = registry_name[2:].replace('-', '_')
+
+    REGISTRY = {}
+    REGISTRY_CLASS_NAMES = set()
+    DATACLASS_REGISTRY = {}
+
+    # maintain a registry of all registries
+    if registry_name in REGISTRIES:
+        return  # registry already exists
+    REGISTRIES[registry_name] = {
+        'registry': REGISTRY,
+        'default': default,
+        'dataclass_registry': DATACLASS_REGISTRY,
+    }
+
+    def build_x(cfg: Union[DictConfig, str, Namespace], *extra_args,
+                **extra_kwargs):
+
+        assert isinstance(cfg, str)
+        choice = cfg
+        if choice in DATACLASS_REGISTRY:
+            cfg = DATACLASS_REGISTRY[choice]()
+
+        if choice is None:
+            if required:
+                raise ValueError('{} is required!'.format(registry_name))
+            return None
+
+        cls = REGISTRY[choice]
+        if hasattr(cls, 'build_' + registry_name):
+            builder = getattr(cls, 'build_' + registry_name)
+        else:
+            builder = cls
+        return builder(cfg, *extra_args, **extra_kwargs)
+
+    def register_x(name, dataclass=None):
+        def register_x_cls(cls):
+            if name in REGISTRY:
+                raise ValueError('Cannot register duplicate {} ({})'.format(
+                    registry_name, name))
+            if cls.__name__ in REGISTRY_CLASS_NAMES:
+                raise ValueError(
+                    'Cannot register {} with duplicate class name ({})'.format(
+                        registry_name, cls.__name__))
+            if base_class is not None and not issubclass(cls, base_class):
+                raise ValueError('{} must extend {}'.format(
+                    cls.__name__, base_class.__name__))
+
+            cls.__dataclass = dataclass
+            if cls.__dataclass is not None:
+                DATACLASS_REGISTRY[name] = cls.__dataclass
+
+                cs = ConfigStore.instance()
+                node = dataclass()
+                node._name = name
+                cs.store(name=name,
+                         group=registry_name,
+                         node=node,
+                         provider='layoutlmft')
+
+            REGISTRY[name] = cls
+
+            return cls
+
+        return register_x_cls
+
+    return build_x, register_x, REGISTRY, DATACLASS_REGISTRY

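`setup_registry` builds one named registry and returns a builder plus a registration decorator. A minimal usage sketch (not part of the commit; the `ToyEncoder` class and the `'--encoder'` registry name are made up for illustration):

# Hypothetical usage sketch of setup_registry.
from core.common.registry import setup_registry

build_encoder, register_encoder, ENCODER_REGISTRY, _ = setup_registry('--encoder')

@register_encoder('toy')
class ToyEncoder:
    def __init__(self, cfg):
        self.cfg = cfg   # receives the choice string, since no dataclass is registered

enc = build_encoder('toy')           # looks up 'toy' and instantiates ToyEncoder
assert isinstance(enc, ToyEncoder)
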
core/common/utils.py
ADDED
@@ -0,0 +1,412 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import random
|
2 |
+
import torch
|
3 |
+
from collections import OrderedDict
|
4 |
+
|
5 |
+
import numpy as np
|
6 |
+
from PIL import Image
|
7 |
+
import torchvision.transforms as T
|
8 |
+
from torchvision.transforms import Compose, Resize, CenterCrop, ToTensor
|
9 |
+
from torchvision import transforms as tvtrans
|
10 |
+
|
11 |
+
from decord import VideoReader, cpu, gpu
|
12 |
+
|
13 |
+
|
14 |
+
###############
|
15 |
+
# text helper #
|
16 |
+
###############
|
17 |
+
|
18 |
+
|
19 |
+
def remove_duplicate_word(tx):
|
20 |
+
def combine_words(input, length):
|
21 |
+
combined_inputs = []
|
22 |
+
if len(splitted_input) > 1:
|
23 |
+
for i in range(len(input) - 1):
|
24 |
+
combined_inputs.append(input[i] + " " + last_word_of(splitted_input[i + 1],
|
25 |
+
length)) # add the last word of the right-neighbour (overlapping) sequence (before it has expanded), which is the next word in the original sentence
|
26 |
+
return combined_inputs, length + 1
|
27 |
+
|
28 |
+
def remove_duplicates(input, length):
|
29 |
+
bool_broke = False #this means we didn't find any duplicates here
|
30 |
+
for i in range(len(input) - length):
|
31 |
+
if input[i] == input[i + length]: #found a duplicate piece of sentence!
|
32 |
+
for j in range(0, length): #remove the overlapping sequences in reverse order
|
33 |
+
del input[i + length - j]
|
34 |
+
bool_broke = True
|
35 |
+
break #break the for loop as the loop length does not matches the length of splitted_input anymore as we removed elements
|
36 |
+
if bool_broke:
|
37 |
+
return remove_duplicates(input,
|
38 |
+
length) #if we found a duplicate, look for another duplicate of the same length
|
39 |
+
return input
|
40 |
+
|
41 |
+
def last_word_of(input, length):
|
42 |
+
splitted = input.split(" ")
|
43 |
+
if len(splitted) == 0:
|
44 |
+
return input
|
45 |
+
else:
|
46 |
+
return splitted[length - 1]
|
47 |
+
|
48 |
+
def split_and_puncsplit(text):
|
49 |
+
tx = text.split(" ")
|
50 |
+
txnew = []
|
51 |
+
for txi in tx:
|
52 |
+
txqueue = []
|
53 |
+
while True:
|
54 |
+
if txi[0] in '([{':
|
55 |
+
txqueue.extend([txi[:1], '<puncnext>'])
|
56 |
+
txi = txi[1:]
|
57 |
+
if len(txi) == 0:
|
58 |
+
break
|
59 |
+
else:
|
60 |
+
break
|
61 |
+
txnew += txqueue
|
62 |
+
txstack = []
|
63 |
+
if len(txi) == 0:
|
64 |
+
continue
|
65 |
+
while True:
|
66 |
+
if txi[-1] in '?!.,:;}])':
|
67 |
+
txstack = ['<puncnext>', txi[-1:]] + txstack
|
68 |
+
txi = txi[:-1]
|
69 |
+
if len(txi) == 0:
|
70 |
+
break
|
71 |
+
else:
|
72 |
+
break
|
73 |
+
if len(txi) != 0:
|
74 |
+
txnew += [txi]
|
75 |
+
txnew += txstack
|
76 |
+
return txnew
|
77 |
+
|
78 |
+
if tx == '':
|
79 |
+
return tx
|
80 |
+
|
81 |
+
splitted_input = split_and_puncsplit(tx)
|
82 |
+
word_length = 1
|
83 |
+
intermediate_output = False
|
84 |
+
while len(splitted_input) > 1:
|
85 |
+
splitted_input = remove_duplicates(splitted_input, word_length)
|
86 |
+
if len(splitted_input) > 1:
|
87 |
+
splitted_input, word_length = combine_words(splitted_input, word_length)
|
88 |
+
if intermediate_output:
|
89 |
+
print(splitted_input)
|
90 |
+
print(word_length)
|
91 |
+
output = splitted_input[0]
|
92 |
+
output = output.replace(' <puncnext> ', '')
|
93 |
+
return output
|
94 |
+
|
95 |
+
|
96 |
+
#################
|
97 |
+
# vision helper #
|
98 |
+
#################
|
99 |
+
|
100 |
+
|
101 |
+
def regularize_image(x, image_size=512):
|
102 |
+
BICUBIC = T.InterpolationMode.BICUBIC
|
103 |
+
if isinstance(x, str):
|
104 |
+
x = Image.open(x)
|
105 |
+
size = min(x.size)
|
106 |
+
elif isinstance(x, Image.Image):
|
107 |
+
x = x.convert('RGB')
|
108 |
+
size = min(x.size)
|
109 |
+
elif isinstance(x, np.ndarray):
|
110 |
+
x = Image.fromarray(x).convert('RGB')
|
111 |
+
size = min(x.size)
|
112 |
+
elif isinstance(x, torch.Tensor):
|
113 |
+
# normalize to [0, 1]
|
114 |
+
x = x/255.0
|
115 |
+
size = min(x.size()[1:])
|
116 |
+
else:
|
117 |
+
assert False, 'Unknown image type'
|
118 |
+
|
119 |
+
"""transforms = T.Compose([
|
120 |
+
T.RandomCrop(size),
|
121 |
+
T.Resize(
|
122 |
+
(image_size, image_size),
|
123 |
+
interpolation=BICUBIC,
|
124 |
+
),
|
125 |
+
T.RandomHorizontalFlip(),
|
126 |
+
T.ToTensor(),
|
127 |
+
])
|
128 |
+
x = transforms(x)
|
129 |
+
|
130 |
+
assert (x.shape[1] == image_size) & (x.shape[2] == image_size), \
|
131 |
+
'Wrong image size'
|
132 |
+
"""
|
133 |
+
x = x * 2 - 1
|
134 |
+
return x
|
135 |
+
|
136 |
+
|
137 |
+
def center_crop(img, new_width=None, new_height=None):
|
138 |
+
width = img.shape[2]
|
139 |
+
height = img.shape[1]
|
140 |
+
|
141 |
+
if new_width is None:
|
142 |
+
new_width = min(width, height)
|
143 |
+
|
144 |
+
if new_height is None:
|
145 |
+
new_height = min(width, height)
|
146 |
+
|
147 |
+
left = int(np.ceil((width - new_width) / 2))
|
148 |
+
right = width - int(np.floor((width - new_width) / 2))
|
149 |
+
|
150 |
+
top = int(np.ceil((height - new_height) / 2))
|
151 |
+
bottom = height - int(np.floor((height - new_height) / 2))
|
152 |
+
if len(img.shape) == 3:
|
153 |
+
center_cropped_img = img[:, top:bottom, left:right]
|
154 |
+
else:
|
155 |
+
center_cropped_img = img[:, top:bottom, left:right, ...]
|
156 |
+
|
157 |
+
return center_cropped_img
|
158 |
+
|
159 |
+
|
160 |
+
def _transform(n_px):
|
161 |
+
return Compose([
|
162 |
+
Resize([n_px, n_px], interpolation=T.InterpolationMode.BICUBIC), ])
|
163 |
+
|
164 |
+
|
165 |
+
def regularize_video(video, image_size=256):
|
166 |
+
min_shape = min(video.shape[1:3])
|
167 |
+
video = center_crop(video, min_shape, min_shape)
|
168 |
+
video = torch.from_numpy(video).permute(0, 3, 1, 2)
|
169 |
+
video = _transform(image_size)(video)
|
170 |
+
video = video / 255.0 * 2.0 - 1.0
|
171 |
+
return video.permute(1, 0, 2, 3)
|
172 |
+
|
173 |
+
|
174 |
+
def time_to_indices(video_reader, time):
|
175 |
+
times = video_reader.get_frame_timestamp(range(len(video_reader))).mean(-1)
|
176 |
+
indices = np.searchsorted(times, time)
|
177 |
+
# Use `np.bitwise_or` so it works both with scalars and numpy arrays.
|
178 |
+
return np.where(np.bitwise_or(indices == 0, times[indices] - time <= time - times[indices - 1]), indices,
|
179 |
+
indices - 1)
|
180 |
+
|
181 |
+
|
182 |
+
def load_video(video_path, sample_duration=8.0, num_frames=8):
|
183 |
+
sample_duration = 4.0
|
184 |
+
num_frames = 4
|
185 |
+
|
186 |
+
vr = VideoReader(video_path, ctx=cpu(0))
|
187 |
+
framerate = vr.get_avg_fps()
|
188 |
+
video_frame_len = len(vr)
|
189 |
+
video_len = video_frame_len / framerate
|
190 |
+
sample_duration = min(sample_duration, video_len)
|
191 |
+
|
192 |
+
if video_len > sample_duration:
|
193 |
+
s = random.random() * (video_len - sample_duration)
|
194 |
+
t = s + sample_duration
|
195 |
+
start, end = time_to_indices(vr, [s, t])
|
196 |
+
end = min(video_frame_len - 1, end)
|
197 |
+
start = min(start, end - 1)
|
198 |
+
downsamlp_indices = np.linspace(start, end, num_frames, endpoint=True).astype(int).tolist()
|
199 |
+
else:
|
200 |
+
downsamlp_indices = np.linspace(0, video_frame_len - 1, num_frames, endpoint=True).astype(int).tolist()
|
201 |
+
|
202 |
+
video = vr.get_batch(downsamlp_indices).asnumpy()
|
203 |
+
return video
|
204 |
+
|
205 |
+
|
206 |
+
###############
|
207 |
+
# some helper #
|
208 |
+
###############
|
209 |
+
|
210 |
+
def atomic_save(cfg, net, opt, step, path):
|
211 |
+
if isinstance(net, (torch.nn.DataParallel,
|
212 |
+
torch.nn.parallel.DistributedDataParallel)):
|
213 |
+
netm = net.module
|
214 |
+
else:
|
215 |
+
netm = net
|
216 |
+
sd = netm.state_dict()
|
217 |
+
slimmed_sd = [(ki, vi) for ki, vi in sd.items()
|
218 |
+
if ki.find('first_stage_model') != 0 and ki.find('cond_stage_model') != 0]
|
219 |
+
|
220 |
+
checkpoint = {
|
221 |
+
"config": cfg,
|
222 |
+
"state_dict": OrderedDict(slimmed_sd),
|
223 |
+
"step": step}
|
224 |
+
if opt is not None:
|
225 |
+
checkpoint['optimizer_states'] = opt.state_dict()
|
226 |
+
import io
|
227 |
+
import fsspec
|
228 |
+
bytesbuffer = io.BytesIO()
|
229 |
+
torch.save(checkpoint, bytesbuffer)
|
230 |
+
with fsspec.open(path, "wb") as f:
|
231 |
+
f.write(bytesbuffer.getvalue())
|
232 |
+
|
233 |
+
|
234 |
+
def load_state_dict(net, cfg):
|
235 |
+
pretrained_pth_full = cfg.get('pretrained_pth_full', None)
|
236 |
+
pretrained_ckpt_full = cfg.get('pretrained_ckpt_full', None)
|
237 |
+
pretrained_pth = cfg.get('pretrained_pth', None)
|
238 |
+
pretrained_ckpt = cfg.get('pretrained_ckpt', None)
|
239 |
+
pretrained_pth_dm = cfg.get('pretrained_pth_dm', None)
|
240 |
+
pretrained_pth_ema = cfg.get('pretrained_pth_ema', None)
|
241 |
+
strict_sd = cfg.get('strict_sd', False)
|
242 |
+
errmsg = "Overlapped model state_dict! This is undesired behavior!"
|
243 |
+
|
244 |
+
if pretrained_pth_full is not None or pretrained_ckpt_full is not None:
|
245 |
+
assert (pretrained_pth is None) and \
|
246 |
+
(pretrained_ckpt is None) and \
|
247 |
+
(pretrained_pth_dm is None) and \
|
248 |
+
(pretrained_pth_ema is None), errmsg
|
249 |
+
if pretrained_pth_full is not None:
|
250 |
+
target_file = pretrained_pth_full
|
251 |
+
sd = torch.load(target_file, map_location='cpu')
|
252 |
+
assert pretrained_ckpt is None, errmsg
|
253 |
+
else:
|
254 |
+
target_file = pretrained_ckpt_full
|
255 |
+
sd = torch.load(target_file, map_location='cpu')['state_dict']
|
256 |
+
print('Load full model from [{}] strict [{}].'.format(
|
257 |
+
target_file, strict_sd))
|
258 |
+
net.load_state_dict(sd, strict=strict_sd)
|
259 |
+
|
260 |
+
if pretrained_pth is not None or pretrained_ckpt is not None:
|
261 |
+
assert (pretrained_ckpt_full is None) and \
|
262 |
+
(pretrained_pth_full is None) and \
|
263 |
+
(pretrained_pth_dm is None) and \
|
264 |
+
(pretrained_pth_ema is None), errmsg
|
265 |
+
if pretrained_pth is not None:
|
266 |
+
target_file = pretrained_pth
|
267 |
+
sd = torch.load(target_file, map_location='cpu')
|
268 |
+
assert pretrained_ckpt is None, errmsg
|
269 |
+
else:
|
270 |
+
target_file = pretrained_ckpt
|
271 |
+
sd = torch.load(target_file, map_location='cpu')['state_dict']
|
272 |
+
print('Load model from [{}] strict [{}].'.format(
|
273 |
+
target_file, strict_sd))
|
274 |
+
sd_extra = [(ki, vi) for ki, vi in net.state_dict().items() \
|
275 |
+
if ki.find('first_stage_model') == 0 or ki.find('cond_stage_model') == 0]
|
276 |
+
sd.update(OrderedDict(sd_extra))
|
277 |
+
net.load_state_dict(sd, strict=strict_sd)
|
278 |
+
|
279 |
+
if pretrained_pth_dm is not None:
|
280 |
+
assert (pretrained_ckpt_full is None) and \
|
281 |
+
(pretrained_pth_full is None) and \
|
282 |
+
(pretrained_pth is None) and \
|
283 |
+
(pretrained_ckpt is None), errmsg
|
284 |
+
print('Load diffusion model from [{}] strict [{}].'.format(
|
285 |
+
pretrained_pth_dm, strict_sd))
|
286 |
+
sd = torch.load(pretrained_pth_dm, map_location='cpu')
|
287 |
+
net.model.diffusion_model.load_state_dict(sd, strict=strict_sd)
|
288 |
+
|
289 |
+
if pretrained_pth_ema is not None:
|
290 |
+
assert (pretrained_ckpt_full is None) and \
|
291 |
+
(pretrained_pth_full is None) and \
|
292 |
+
(pretrained_pth is None) and \
|
293 |
+
(pretrained_ckpt is None), errmsg
|
294 |
+
print('Load unet ema model from [{}] strict [{}].'.format(
|
295 |
+
pretrained_pth_ema, strict_sd))
|
296 |
+
sd = torch.load(pretrained_pth_ema, map_location='cpu')
|
297 |
+
net.model_ema.load_state_dict(sd, strict=strict_sd)
|
298 |
+
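load_state_dict above treats the pretrained_* entries as mutually exclusive families: a full checkpoint, a slimmed model checkpoint, a diffusion-UNet-only file, or an EMA-only file. A minimal sketch of a config selecting the full-checkpoint path (a plain dict works with the .get calls above; the file name is hypothetical):

    cfg = {
        'pretrained_ckpt_full': 'checkpoints/full_model.ckpt',  # full model, read from its 'state_dict'
        'pretrained_pth_full': None,
        'pretrained_pth': None,
        'pretrained_ckpt': None,
        'pretrained_pth_dm': None,    # diffusion UNet only
        'pretrained_pth_ema': None,   # EMA weights only
        'strict_sd': False,
    }
    # load_state_dict(net, cfg) then loads the full checkpoint with strict=False;
    # setting a second family at the same time trips the asserts above.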
|
299 |
+
|
300 |
+
def auto_merge_imlist(imlist, max=64):
|
301 |
+
imlist = imlist[0:max]
|
302 |
+
h, w = imlist[0].shape[0:2]
|
303 |
+
num_images = len(imlist)
|
304 |
+
num_row = int(np.sqrt(num_images))
|
305 |
+
num_col = num_images // num_row + 1 if num_images % num_row != 0 else num_images // num_row
|
306 |
+
canvas = np.zeros([num_row * h, num_col * w, 3], dtype=np.uint8)
|
307 |
+
for idx, im in enumerate(imlist):
|
308 |
+
hi = (idx // num_col) * h
|
309 |
+
wi = (idx % num_col) * w
|
310 |
+
canvas[hi:hi + h, wi:wi + w, :] = im
|
311 |
+
return canvas
|
312 |
+
|
313 |
+
|
314 |
+
def latent2im(net, latent):
|
315 |
+
single_input = len(latent.shape) == 3
|
316 |
+
if single_input:
|
317 |
+
latent = latent[None]
|
318 |
+
im = net.decode_image(latent.to(net.device))
|
319 |
+
im = torch.clamp((im + 1.0) / 2.0, min=0.0, max=1.0)
|
320 |
+
im = [tvtrans.ToPILImage()(i) for i in im]
|
321 |
+
if single_input:
|
322 |
+
im = im[0]
|
323 |
+
return im
|
324 |
+
|
325 |
+
|
326 |
+
def im2latent(net, im):
|
327 |
+
single_input = not isinstance(im, list)
|
328 |
+
if single_input:
|
329 |
+
im = [im]
|
330 |
+
im = torch.stack([tvtrans.ToTensor()(i) for i in im], dim=0)
|
331 |
+
im = (im * 2 - 1).to(net.device)
|
332 |
+
z = net.encode_image(im)
|
333 |
+
if single_input:
|
334 |
+
z = z[0]
|
335 |
+
return z
|
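im2latent and latent2im above are inverses up to the autoencoder's reconstruction error: images are mapped to [-1, 1], encoded, and decoded back through net.decode_image. A minimal round-trip sketch, assuming a `net` exposing encode_image/decode_image as used above and an existing PIL image `img`:

    z = im2latent(net, img)        # PIL image -> latent tensor on net.device
    img_rec = latent2im(net, z)    # latent tensor -> PIL image
    # img_rec approximates img up to the autoencoder's reconstruction error.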
336 |
+
|
337 |
+
|
338 |
+
class color_adjust(object):
|
339 |
+
def __init__(self, ref_from, ref_to):
|
340 |
+
x0, m0, std0 = self.get_data_and_stat(ref_from)
|
341 |
+
x1, m1, std1 = self.get_data_and_stat(ref_to)
|
342 |
+
self.ref_from_stat = (m0, std0)
|
343 |
+
self.ref_to_stat = (m1, std1)
|
344 |
+
self.ref_from = self.preprocess(x0).reshape(-1, 3)
|
345 |
+
self.ref_to = x1.reshape(-1, 3)
|
346 |
+
|
347 |
+
def get_data_and_stat(self, x):
|
348 |
+
if isinstance(x, str):
|
349 |
+
x = np.array(PIL.Image.open(x))
|
350 |
+
elif isinstance(x, PIL.Image.Image):
|
351 |
+
x = np.array(x)
|
352 |
+
elif isinstance(x, torch.Tensor):
|
353 |
+
x = torch.clamp(x, min=0.0, max=1.0)
|
354 |
+
x = np.array(tvtrans.ToPILImage()(x))
|
355 |
+
elif isinstance(x, np.ndarray):
|
356 |
+
pass
|
357 |
+
else:
|
358 |
+
raise ValueError
|
359 |
+
x = x.astype(float)
|
360 |
+
m = np.reshape(x, (-1, 3)).mean(0)
|
361 |
+
s = np.reshape(x, (-1, 3)).std(0)
|
362 |
+
return x, m, s
|
363 |
+
|
364 |
+
def preprocess(self, x):
|
365 |
+
m0, s0 = self.ref_from_stat
|
366 |
+
m1, s1 = self.ref_to_stat
|
367 |
+
y = ((x - m0) / s0) * s1 + m1
|
368 |
+
return y
|
369 |
+
|
370 |
+
def __call__(self, xin, keep=0, simple=False):
|
371 |
+
xin, _, _ = self.get_data_and_stat(xin)
|
372 |
+
x = self.preprocess(xin)
|
373 |
+
if simple:
|
374 |
+
y = (x * (1 - keep) + xin * keep)
|
375 |
+
y = np.clip(y, 0, 255).astype(np.uint8)
|
376 |
+
return y
|
377 |
+
|
378 |
+
h, w = x.shape[:2]
|
379 |
+
x = x.reshape(-1, 3)
|
380 |
+
y = []
|
381 |
+
for chi in range(3):
|
382 |
+
yi = self.pdf_transfer_1d(self.ref_from[:, chi], self.ref_to[:, chi], x[:, chi])
|
383 |
+
y.append(yi)
|
384 |
+
|
385 |
+
y = np.stack(y, axis=1)
|
386 |
+
y = y.reshape(h, w, 3)
|
387 |
+
y = (y.astype(float) * (1 - keep) + xin.astype(float) * keep)
|
388 |
+
y = np.clip(y, 0, 255).astype(np.uint8)
|
389 |
+
return y
|
390 |
+
|
391 |
+
def pdf_transfer_1d(self, arr_fo, arr_to, arr_in, n=600):
|
392 |
+
arr = np.concatenate((arr_fo, arr_to))
|
393 |
+
min_v = arr.min() - 1e-6
|
394 |
+
max_v = arr.max() + 1e-6
|
395 |
+
min_vto = arr_to.min() - 1e-6
|
396 |
+
max_vto = arr_to.max() + 1e-6
|
397 |
+
xs = np.array(
|
398 |
+
[min_v + (max_v - min_v) * i / n for i in range(n + 1)])
|
399 |
+
hist_fo, _ = np.histogram(arr_fo, xs)
|
400 |
+
hist_to, _ = np.histogram(arr_to, xs)
|
401 |
+
xs = xs[:-1]
|
402 |
+
# compute probability distribution
|
403 |
+
cum_fo = np.cumsum(hist_fo)
|
404 |
+
cum_to = np.cumsum(hist_to)
|
405 |
+
d_fo = cum_fo / cum_fo[-1]
|
406 |
+
d_to = cum_to / cum_to[-1]
|
407 |
+
# transfer
|
408 |
+
t_d = np.interp(d_fo, d_to, xs)
|
409 |
+
t_d[d_fo <= d_to[0]] = min_vto
|
410 |
+
t_d[d_fo >= d_to[-1]] = max_vto
|
411 |
+
arr_out = np.interp(arr_in, xs, t_d)
|
412 |
+
return arr_out
|
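color_adjust above is per-channel histogram matching: pdf_transfer_1d builds the cumulative distributions of the two reference images and maps the input through them, with `keep` blending the original pixels back in. A minimal usage sketch (file names are hypothetical):

    ca = color_adjust(ref_from='generated.png', ref_to='reference.png')
    out = ca('generated.png', keep=0.3)                       # full per-channel CDF matching
    out_simple = ca('generated.png', keep=0.3, simple=True)   # mean/std matching only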
core/models/__init__.py
ADDED
@@ -0,0 +1,4 @@
+from .common.get_model import get_model
+from .common.get_optimizer import get_optimizer
+from .common.get_scheduler import get_scheduler
+from .common.utils import get_unit
core/models/__pycache__/__init__.cpython-38.pyc
ADDED
Binary file (367 Bytes)
core/models/__pycache__/codi.cpython-38.pyc
ADDED
Binary file (7.7 kB)
core/models/__pycache__/codi_2.cpython-38.pyc
ADDED
Binary file (7.12 kB)
core/models/__pycache__/dani_model.cpython-38.pyc
ADDED
Binary file (4.29 kB)
core/models/__pycache__/ema.cpython-38.pyc
ADDED
Binary file (2.99 kB)
core/models/__pycache__/model_module_infer.cpython-38.pyc
ADDED
Binary file (4.31 kB)
core/models/__pycache__/sd.cpython-38.pyc
ADDED
Binary file (9.82 kB)
core/models/codi.py
ADDED
@@ -0,0 +1,227 @@
1 |
+
from typing import Dict, List
|
2 |
+
import os
|
3 |
+
|
4 |
+
import torch
|
5 |
+
import torch.nn as nn
|
6 |
+
import torch.nn.functional as F
|
7 |
+
import numpy as np
|
8 |
+
import numpy.random as npr
|
9 |
+
import copy
|
10 |
+
from functools import partial
|
11 |
+
from contextlib import contextmanager
|
12 |
+
|
13 |
+
from .common.get_model import get_model, register
|
14 |
+
from .sd import DDPM
|
15 |
+
|
16 |
+
version = '0'
|
17 |
+
symbol = 'codi'
|
18 |
+
|
19 |
+
|
20 |
+
@register('codi', version)
|
21 |
+
class CoDi(DDPM):
|
22 |
+
def __init__(self,
|
23 |
+
audioldm_cfg=None,
|
24 |
+
autokl_cfg=None,
|
25 |
+
optimus_cfg=None,
|
26 |
+
clip_cfg=None,
|
27 |
+
clap_cfg=None,
|
28 |
+
vision_scale_factor=0.1812,
|
29 |
+
text_scale_factor=4.3108,
|
30 |
+
audio_scale_factor=0.9228,
|
31 |
+
scale_by_std=False,
|
32 |
+
*args,
|
33 |
+
**kwargs):
|
34 |
+
super().__init__(*args, **kwargs)
|
35 |
+
|
36 |
+
if audioldm_cfg is not None:
|
37 |
+
self.audioldm = get_model()(audioldm_cfg)
|
38 |
+
|
39 |
+
if autokl_cfg is not None:
|
40 |
+
self.autokl = get_model()(autokl_cfg)
|
41 |
+
|
42 |
+
if optimus_cfg is not None:
|
43 |
+
self.optimus = get_model()(optimus_cfg)
|
44 |
+
|
45 |
+
if clip_cfg is not None:
|
46 |
+
self.clip = get_model()(clip_cfg)
|
47 |
+
|
48 |
+
if clap_cfg is not None:
|
49 |
+
self.clap = get_model()(clap_cfg)
|
50 |
+
|
51 |
+
if not scale_by_std:
|
52 |
+
self.vision_scale_factor = vision_scale_factor
|
53 |
+
self.text_scale_factor = text_scale_factor
|
54 |
+
self.audio_scale_factor = audio_scale_factor
|
55 |
+
else:
|
56 |
+
self.register_buffer("text_scale_factor", torch.tensor(text_scale_factor))
|
57 |
+
self.register_buffer("audio_scale_factor", torch.tensor(audio_scale_factor))
|
58 |
+
self.register_buffer('vision_scale_factor', torch.tensor(vision_scale_factor))
|
59 |
+
|
60 |
+
@property
|
61 |
+
def device(self):
|
62 |
+
return next(self.parameters()).device
|
63 |
+
|
64 |
+
@torch.no_grad()
|
65 |
+
def autokl_encode(self, image):
|
66 |
+
encoder_posterior = self.autokl.encode(image)
|
67 |
+
z = encoder_posterior.sample().to(image.dtype)
|
68 |
+
return self.vision_scale_factor * z
|
69 |
+
|
70 |
+
@torch.no_grad()
|
71 |
+
def autokl_decode(self, z):
|
72 |
+
z = 1. / self.vision_scale_factor * z
|
73 |
+
return self.autokl.decode(z)
|
74 |
+
|
75 |
+
@torch.no_grad()
|
76 |
+
def optimus_encode(self, text):
|
77 |
+
if isinstance(text, List):
|
78 |
+
tokenizer = self.optimus.tokenizer_encoder
|
79 |
+
token = [tokenizer.tokenize(sentence.lower()) for sentence in text]
|
80 |
+
token_id = []
|
81 |
+
for tokeni in token:
|
82 |
+
token_sentence = [tokenizer._convert_token_to_id(i) for i in tokeni]
|
83 |
+
token_sentence = tokenizer.add_special_tokens_single_sentence(token_sentence)
|
84 |
+
token_id.append(torch.LongTensor(token_sentence))
|
85 |
+
token_id = torch._C._nn.pad_sequence(token_id, batch_first=True, padding_value=0.0)[:, :512]
|
86 |
+
else:
|
87 |
+
token_id = text
|
88 |
+
z = self.optimus.encoder(token_id, attention_mask=(token_id > 0))[1]
|
89 |
+
z_mu, z_logvar = self.optimus.encoder.linear(z).chunk(2, -1)
|
90 |
+
return z_mu.squeeze(1) * self.text_scale_factor
|
91 |
+
|
92 |
+
@torch.no_grad()
|
93 |
+
def optimus_decode(self, z, temperature=1.0, max_length=30):
|
94 |
+
z = 1.0 / self.text_scale_factor * z
|
95 |
+
return self.optimus.decode(z, temperature, max_length=max_length)
|
96 |
+
|
97 |
+
@torch.no_grad()
|
98 |
+
def audioldm_encode(self, audio, time=2.0):
|
99 |
+
encoder_posterior = self.audioldm.encode(audio, time=time)
|
100 |
+
z = encoder_posterior.sample().to(audio.dtype)
|
101 |
+
return z * self.audio_scale_factor
|
102 |
+
|
103 |
+
@torch.no_grad()
|
104 |
+
def audioldm_decode(self, z):
|
105 |
+
if torch.max(torch.abs(z)) > 1e2:
|
106 |
+
z = torch.clip(z, min=-10, max=10)
|
107 |
+
z = 1.0 / self.audio_scale_factor * z
|
108 |
+
return self.audioldm.decode(z)
|
109 |
+
|
110 |
+
@torch.no_grad()
|
111 |
+
def mel_spectrogram_to_waveform(self, mel):
|
112 |
+
# Mel: [bs, 1, t-steps, fbins]
|
113 |
+
if len(mel.size()) == 4:
|
114 |
+
mel = mel.squeeze(1)
|
115 |
+
mel = mel.permute(0, 2, 1)
|
116 |
+
waveform = self.audioldm.vocoder(mel)
|
117 |
+
waveform = waveform.cpu().detach().numpy()
|
118 |
+
return waveform
|
119 |
+
|
120 |
+
@torch.no_grad()
|
121 |
+
def clip_encode_text(self, text, encode_type='encode_text'):
|
122 |
+
swap_type = self.clip.encode_type
|
123 |
+
self.clip.encode_type = encode_type
|
124 |
+
embedding = self.clip(text, encode_type)
|
125 |
+
self.clip.encode_type = swap_type
|
126 |
+
return embedding
|
127 |
+
|
128 |
+
@torch.no_grad()
|
129 |
+
def clip_encode_vision(self, vision, encode_type='encode_vision'):
|
130 |
+
swap_type = self.clip.encode_type
|
131 |
+
self.clip.encode_type = encode_type
|
132 |
+
embedding = self.clip(vision, encode_type)
|
133 |
+
self.clip.encode_type = swap_type
|
134 |
+
return embedding
|
135 |
+
|
136 |
+
@torch.no_grad()
|
137 |
+
def clap_encode_audio(self, audio):
|
138 |
+
embedding = self.clap(audio)
|
139 |
+
return embedding
|
140 |
+
|
141 |
+
def forward(self, x=None, c=None, noise=None, xtype='image', ctype='prompt', u=None, return_algined_latents=False):
|
142 |
+
if isinstance(x, list):
|
143 |
+
t = torch.randint(0, self.num_timesteps, (x[0].shape[0],), device=x[0].device).long()
|
144 |
+
else:
|
145 |
+
t = torch.randint(0, self.num_timesteps, (x.shape[0],), device=x.device).long()
|
146 |
+
return self.p_losses(x, c, t, noise, xtype, ctype, u, return_algined_latents)
|
147 |
+
|
148 |
+
def apply_model(self, x_noisy, t, cond, xtype='image', ctype='text', u=None, return_algined_latents=False):
|
149 |
+
return self.model.diffusion_model(x_noisy, t, cond, xtype, ctype, u, return_algined_latents)
|
150 |
+
|
151 |
+
def get_pixel_loss(self, pred, target, mean=True):
|
152 |
+
if self.loss_type == 'l1':
|
153 |
+
loss = (target - pred).abs()
|
154 |
+
if mean:
|
155 |
+
loss = loss.mean()
|
156 |
+
elif self.loss_type == 'l2':
|
157 |
+
if mean:
|
158 |
+
loss = torch.nn.functional.mse_loss(target, pred)
|
159 |
+
else:
|
160 |
+
loss = torch.nn.functional.mse_loss(target, pred, reduction='none')
|
161 |
+
else:
|
162 |
+
raise NotImplementedError("unknown loss type '{loss_type}'")
|
163 |
+
loss = torch.nan_to_num(loss, nan=0.0, posinf=0.0, neginf=-0.0)
|
164 |
+
return loss
|
165 |
+
|
166 |
+
def get_text_loss(self, pred, target):
|
167 |
+
if self.loss_type == 'l1':
|
168 |
+
loss = (target - pred).abs()
|
169 |
+
elif self.loss_type == 'l2':
|
170 |
+
loss = torch.nn.functional.mse_loss(target, pred, reduction='none')
|
171 |
+
loss = torch.nan_to_num(loss, nan=0.0, posinf=0.0, neginf=0.0)
|
172 |
+
return loss
|
173 |
+
|
174 |
+
def p_losses(self, x_start, cond, t, noise=None, xtype='image', ctype='prompt', u=None, return_algined_latents=False):
|
175 |
+
if isinstance(x_start, list):
|
176 |
+
noise = [torch.randn_like(x_start_i) for x_start_i in x_start] if noise is None else noise
|
177 |
+
x_noisy = [self.q_sample(x_start=x_start_i, t=t, noise=noise_i) for x_start_i, noise_i in zip(x_start, noise)]
|
178 |
+
model_output = self.apply_model(x_noisy, t, cond, xtype, ctype, u, return_algined_latents)
|
179 |
+
if return_algined_latents:
|
180 |
+
return model_output
|
181 |
+
|
182 |
+
loss_dict = {}
|
183 |
+
|
184 |
+
if self.parameterization == "x0":
|
185 |
+
target = x_start
|
186 |
+
elif self.parameterization == "eps":
|
187 |
+
target = noise
|
188 |
+
else:
|
189 |
+
raise NotImplementedError()
|
190 |
+
|
191 |
+
loss = 0.0
|
192 |
+
for model_output_i, target_i, xtype_i in zip(model_output, target, xtype):
|
193 |
+
if xtype_i == 'image':
|
194 |
+
loss_simple = self.get_pixel_loss(model_output_i, target_i, mean=False).mean([1, 2, 3])
|
195 |
+
elif xtype_i == 'video':
|
196 |
+
loss_simple = self.get_pixel_loss(model_output_i, target_i, mean=False).mean([1, 2, 3, 4])
|
197 |
+
elif xtype_i == 'text':
|
198 |
+
loss_simple = self.get_text_loss(model_output_i, target_i).mean([1])
|
199 |
+
elif xtype_i == 'audio':
|
200 |
+
loss_simple = self.get_pixel_loss(model_output_i, target_i, mean=False).mean([1, 2, 3])
|
201 |
+
loss += loss_simple.mean()
|
202 |
+
return loss / len(xtype)
|
203 |
+
|
204 |
+
else:
|
205 |
+
noise = torch.randn_like(x_start) if noise is None else noise
|
206 |
+
x_noisy = self.q_sample(x_start=x_start, t=t, noise=noise)
|
207 |
+
model_output = self.apply_model(x_noisy, t, cond, xtype, ctype)
|
208 |
+
|
209 |
+
loss_dict = {}
|
210 |
+
|
211 |
+
if self.parameterization == "x0":
|
212 |
+
target = x_start
|
213 |
+
elif self.parameterization == "eps":
|
214 |
+
target = noise
|
215 |
+
else:
|
216 |
+
raise NotImplementedError()
|
217 |
+
|
218 |
+
if xtype == 'image':
|
219 |
+
loss_simple = self.get_pixel_loss(model_output, target, mean=False).mean([1, 2, 3])
|
220 |
+
elif xtype == 'video':
|
221 |
+
loss_simple = self.get_pixel_loss(model_output, target, mean=False).mean([1, 2, 3, 4])
|
222 |
+
elif xtype == 'text':
|
223 |
+
loss_simple = self.get_text_loss(model_output, target).mean([1])
|
224 |
+
elif xtype == 'audio':
|
225 |
+
loss_simple = self.get_pixel_loss(model_output, target, mean=False).mean([1, 2, 3])
|
226 |
+
loss = loss_simple.mean()
|
227 |
+
return loss
|
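For the single-modality branch of p_losses above, the objective is the standard DDPM regression: q_sample adds noise at a random timestep and the network is trained to predict either that noise ("eps") or the clean input ("x0"). A self-contained sketch of the "eps" case with a toy model and explicit schedule tensors (not the repository's UNet):

    import torch
    import torch.nn.functional as F

    def toy_p_losses(model, x_start, sqrt_alphas_cumprod, sqrt_one_minus_alphas_cumprod):
        b = x_start.shape[0]
        t = torch.randint(0, sqrt_alphas_cumprod.shape[0], (b,), device=x_start.device)
        noise = torch.randn_like(x_start)
        a = sqrt_alphas_cumprod[t].view(b, 1, 1, 1)
        s = sqrt_one_minus_alphas_cumprod[t].view(b, 1, 1, 1)
        x_noisy = a * x_start + s * noise      # q_sample
        pred = model(x_noisy, t)               # network predicts the added noise
        return F.mse_loss(pred, noise, reduction='none').mean([1, 2, 3]).mean()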
core/models/codi_2.py
ADDED
@@ -0,0 +1,221 @@
1 |
+
from typing import Dict, List
|
2 |
+
import os
|
3 |
+
|
4 |
+
import torch
|
5 |
+
import torch.nn as nn
|
6 |
+
import torch.nn.functional as F
|
7 |
+
import numpy as np
|
8 |
+
import numpy.random as npr
|
9 |
+
import copy
|
10 |
+
from functools import partial
|
11 |
+
from contextlib import contextmanager
|
12 |
+
|
13 |
+
from .common.get_model import get_model, register
|
14 |
+
from .sd import DDPM
|
15 |
+
|
16 |
+
version = '0'
|
17 |
+
symbol = 'thesis_model'
|
18 |
+
|
19 |
+
|
20 |
+
@register('thesis_model', version)
|
21 |
+
class CoDi(DDPM):
|
22 |
+
def __init__(self,
|
23 |
+
autokl_cfg=None,
|
24 |
+
optimus_cfg=None,
|
25 |
+
clip_cfg=None,
|
26 |
+
vision_scale_factor=0.1812,
|
27 |
+
text_scale_factor=4.3108,
|
28 |
+
audio_scale_factor=0.9228,
|
29 |
+
scale_by_std=False,
|
30 |
+
*args,
|
31 |
+
**kwargs):
|
32 |
+
super().__init__(*args, **kwargs)
|
33 |
+
|
34 |
+
if autokl_cfg is not None:
|
35 |
+
self.autokl = get_model()(autokl_cfg)
|
36 |
+
|
37 |
+
if optimus_cfg is not None:
|
38 |
+
self.optimus = get_model()(optimus_cfg)
|
39 |
+
|
40 |
+
if clip_cfg is not None:
|
41 |
+
self.clip = get_model()(clip_cfg)
|
42 |
+
|
43 |
+
if not scale_by_std:
|
44 |
+
self.vision_scale_factor = vision_scale_factor
|
45 |
+
self.text_scale_factor = text_scale_factor
|
46 |
+
self.audio_scale_factor = audio_scale_factor
|
47 |
+
else:
|
48 |
+
self.register_buffer("text_scale_factor", torch.tensor(text_scale_factor))
|
49 |
+
self.register_buffer("audio_scale_factor", torch.tensor(audio_scale_factor))
|
50 |
+
self.register_buffer('vision_scale_factor', torch.tensor(vision_scale_factor))
|
51 |
+
|
52 |
+
@property
|
53 |
+
def device(self):
|
54 |
+
return next(self.parameters()).device
|
55 |
+
|
56 |
+
@torch.no_grad()
|
57 |
+
def autokl_encode(self, image):
|
58 |
+
encoder_posterior = self.autokl.encode(image)
|
59 |
+
z = encoder_posterior.sample().to(image.dtype)
|
60 |
+
return self.vision_scale_factor * z
|
61 |
+
|
62 |
+
@torch.no_grad()
|
63 |
+
def autokl_decode(self, z):
|
64 |
+
z = 1. / self.vision_scale_factor * z
|
65 |
+
return self.autokl.decode(z)
|
66 |
+
|
67 |
+
@torch.no_grad()
|
68 |
+
def optimus_encode(self, text):
|
69 |
+
if isinstance(text, List):
|
70 |
+
tokenizer = self.optimus.tokenizer_encoder
|
71 |
+
token = [tokenizer.tokenize(sentence.lower()) for sentence in text]
|
72 |
+
token_id = []
|
73 |
+
for tokeni in token:
|
74 |
+
token_sentence = [tokenizer._convert_token_to_id(i) for i in tokeni]
|
75 |
+
token_sentence = tokenizer.add_special_tokens_single_sentence(token_sentence)
|
76 |
+
token_id.append(torch.LongTensor(token_sentence))
|
77 |
+
token_id = torch._C._nn.pad_sequence(token_id, batch_first=True, padding_value=0.0)[:, :512]
|
78 |
+
else:
|
79 |
+
token_id = text
|
80 |
+
z = self.optimus.encoder(token_id, attention_mask=(token_id > 0))[1]
|
81 |
+
z_mu, z_logvar = self.optimus.encoder.linear(z).chunk(2, -1)
|
82 |
+
return z_mu.squeeze(1) * self.text_scale_factor
|
83 |
+
|
84 |
+
@torch.no_grad()
|
85 |
+
def optimus_decode(self, z, temperature=1.0):
|
86 |
+
z = 1.0 / self.text_scale_factor * z
|
87 |
+
return self.optimus.decode(z, temperature)
|
88 |
+
|
89 |
+
@torch.no_grad()
|
90 |
+
def clip_encode_text(self, text, encode_type='encode_text'):
|
91 |
+
swap_type = self.clip.encode_type
|
92 |
+
self.clip.encode_type = encode_type
|
93 |
+
embedding = self.clip(text, encode_type)
|
94 |
+
self.clip.encode_type = swap_type
|
95 |
+
return embedding
|
96 |
+
|
97 |
+
@torch.no_grad()
|
98 |
+
def clip_encode_vision(self, vision, encode_type='encode_vision'):
|
99 |
+
swap_type = self.clip.encode_type
|
100 |
+
self.clip.encode_type = encode_type
|
101 |
+
embedding = self.clip(vision, encode_type)
|
102 |
+
self.clip.encode_type = swap_type
|
103 |
+
return embedding
|
104 |
+
|
105 |
+
@torch.no_grad()
|
106 |
+
def clap_encode_audio(self, audio):
|
107 |
+
embedding = self.clap(audio)
|
108 |
+
return embedding
|
109 |
+
|
110 |
+
def forward(self, x=None, c=None, noise=None, xtype='frontal', ctype='text', u=None, return_algined_latents=False, env_enc=False):
|
111 |
+
if isinstance(x, list):
|
112 |
+
t = torch.randint(0, self.num_timesteps, (x[0].shape[0],), device=x[0].device).long()
|
113 |
+
else:
|
114 |
+
t = torch.randint(0, self.num_timesteps, (x.shape[0],), device=x.device).long()
|
115 |
+
return self.p_losses(x, c, t, noise, xtype, ctype, u, return_algined_latents, env_enc)
|
116 |
+
|
117 |
+
def apply_model(self, x_noisy, t, cond, xtype='frontal', ctype='text', u=None, return_algined_latents=False, env_enc=False):
|
118 |
+
return self.model.diffusion_model(x_noisy, t, cond, xtype, ctype, u, return_algined_latents, env_enc=env_enc)
|
119 |
+
|
120 |
+
def get_pixel_loss(self, pred, target, mean=True):
|
121 |
+
if self.loss_type == 'l1':
|
122 |
+
loss = (target - pred).abs()
|
123 |
+
if mean:
|
124 |
+
loss = loss.mean()
|
125 |
+
elif self.loss_type == 'l2':
|
126 |
+
if mean:
|
127 |
+
loss = torch.nn.functional.mse_loss(target, pred)
|
128 |
+
else:
|
129 |
+
loss = torch.nn.functional.mse_loss(target, pred, reduction='none')
|
130 |
+
else:
|
131 |
+
raise NotImplementedError("unknown loss type '{loss_type}'")
|
132 |
+
loss = torch.nan_to_num(loss, nan=0.0, posinf=0.0, neginf=-0.0)
|
133 |
+
return loss
|
134 |
+
|
135 |
+
def get_text_loss(self, pred, target):
|
136 |
+
if self.loss_type == 'l1':
|
137 |
+
loss = (target - pred).abs()
|
138 |
+
elif self.loss_type == 'l2':
|
139 |
+
loss = torch.nn.functional.mse_loss(target, pred, reduction='none')
|
140 |
+
loss = torch.nan_to_num(loss, nan=0.0, posinf=0.0, neginf=0.0)
|
141 |
+
return loss
|
142 |
+
|
143 |
+
def p_losses(self, x_start, cond, t, noise=None, xtype='frontal', ctype='text', u=None,
|
144 |
+
return_algined_latents=False, env_enc=False):
|
145 |
+
if isinstance(x_start, list):
|
146 |
+
noise = [torch.randn_like(x_start_i) for x_start_i in x_start] if noise is None else noise
|
147 |
+
x_noisy = [self.q_sample(x_start=x_start_i, t=t, noise=noise_i) for x_start_i, noise_i in
|
148 |
+
zip(x_start, noise)]
|
149 |
+
if not env_enc:
|
150 |
+
model_output = self.apply_model(x_noisy, t, cond, xtype, ctype, u, return_algined_latents, env_enc)
h_con = None
|
151 |
+
else:
|
152 |
+
model_output, h_con = self.apply_model(x_noisy, t, cond, xtype, ctype, u, return_algined_latents, env_enc)
|
153 |
+
if return_algined_latents:
|
154 |
+
return model_output
|
155 |
+
|
156 |
+
loss_dict = {}
|
157 |
+
|
158 |
+
if self.parameterization == "x0":
|
159 |
+
target = x_start
|
160 |
+
elif self.parameterization == "eps":
|
161 |
+
target = noise
|
162 |
+
else:
|
163 |
+
raise NotImplementedError()
|
164 |
+
|
165 |
+
loss = 0.0
|
166 |
+
for model_output_i, target_i, xtype_i in zip(model_output, target, xtype):
|
167 |
+
if xtype_i == 'frontal':
|
168 |
+
loss_simple = self.get_pixel_loss(model_output_i, target_i, mean=False).mean([1, 2, 3])
|
169 |
+
elif xtype_i == 'text':
|
170 |
+
loss_simple = self.get_text_loss(model_output_i, target_i).mean([1])
|
171 |
+
elif xtype_i == 'lateral':
|
172 |
+
loss_simple = self.get_pixel_loss(model_output_i, target_i, mean=False).mean([1, 2, 3])
|
173 |
+
loss += loss_simple.mean()
|
174 |
+
|
175 |
+
# Check whether the model also returned h_con.
# If so, we have the latent representations of the two modalities
# extracted by the environmental encoders; since they are two tensors of size batch_size x 1 x 1280,
# we can also use them to compute a contrastive loss term (cross-entropy, as in CLIP).
|
179 |
+
if h_con is not None:
|
180 |
+
def similarity(z_a, z_b):
|
181 |
+
return F.cosine_similarity(z_a, z_b)
|
182 |
+
|
183 |
+
z_a, z_b = h_con
|
184 |
+
|
185 |
+
z_a = z_a / z_a.norm(dim=-1, keepdim=True)
|
186 |
+
z_b = z_b / z_b.norm(dim=-1, keepdim=True)
|
187 |
+
|
188 |
+
logits_a = z_a.squeeze() @ z_b.squeeze().t()
|
189 |
+
logits_b = z_b.squeeze() @ z_a.squeeze().t()
|
190 |
+
|
191 |
+
labels = torch.arange(len(z_a)).to(z_a.device)
|
192 |
+
|
193 |
+
loss_a = F.cross_entropy(logits_a, labels)
|
194 |
+
loss_b = F.cross_entropy(logits_b, labels)
|
195 |
+
|
196 |
+
loss_con = (loss_a + loss_b) / 2
|
197 |
+
loss += loss_con
|
198 |
+
return loss / len(xtype)
|
199 |
+
|
200 |
+
else:
|
201 |
+
noise = torch.randn_like(x_start) if noise is None else noise
|
202 |
+
x_noisy = self.q_sample(x_start=x_start, t=t, noise=noise)
|
203 |
+
model_output = self.apply_model(x_noisy, t, cond, xtype, ctype)
|
204 |
+
|
205 |
+
loss_dict = {}
|
206 |
+
|
207 |
+
if self.parameterization == "x0":
|
208 |
+
target = x_start
|
209 |
+
elif self.parameterization == "eps":
|
210 |
+
target = noise
|
211 |
+
else:
|
212 |
+
raise NotImplementedError()
|
213 |
+
|
214 |
+
if xtype == 'frontal':
|
215 |
+
loss_simple = self.get_pixel_loss(model_output, target, mean=False).mean([1, 2, 3])
|
216 |
+
elif xtype == 'text':
|
217 |
+
loss_simple = self.get_text_loss(model_output, target).mean([1])
|
218 |
+
elif xtype == 'lateral':
|
219 |
+
loss_simple = self.get_pixel_loss(model_output, target, mean=False).mean([1, 2, 3])
|
220 |
+
loss = loss_simple.mean()
|
221 |
+
return loss
|
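The h_con branch above is a CLIP-style symmetric cross-entropy between the two environmental-encoder latents. A standalone sketch of that term with explicit shapes; the temperature argument is an addition here, the code above uses a fixed scale of 1:

    import torch
    import torch.nn.functional as F

    def contrastive_term(z_a, z_b, temperature=1.0):
        # z_a, z_b: [batch, 1, 1280] latents from the two environmental encoders.
        z_a = F.normalize(z_a.squeeze(1), dim=-1)
        z_b = F.normalize(z_b.squeeze(1), dim=-1)
        logits = z_a @ z_b.t() / temperature          # [batch, batch] similarity matrix
        labels = torch.arange(z_a.shape[0], device=z_a.device)
        return (F.cross_entropy(logits, labels) + F.cross_entropy(logits.t(), labels)) / 2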
core/models/common/__pycache__/get_model.cpython-38.pyc
ADDED
Binary file (2.96 kB)
core/models/common/__pycache__/get_optimizer.cpython-38.pyc
ADDED
Binary file (1.94 kB)
core/models/common/__pycache__/get_scheduler.cpython-38.pyc
ADDED
Binary file (9.55 kB)
core/models/common/__pycache__/utils.cpython-38.pyc
ADDED
Binary file (9.75 kB)
core/models/common/get_model.py
ADDED
@@ -0,0 +1,88 @@
1 |
+
from email.policy import strict
|
2 |
+
import torch
|
3 |
+
import torchvision.models
|
4 |
+
import os.path as osp
|
5 |
+
import copy
|
6 |
+
from .utils import \
|
7 |
+
get_total_param, get_total_param_sum, \
|
8 |
+
get_unit
|
9 |
+
|
10 |
+
|
11 |
+
def singleton(class_):
|
12 |
+
instances = {}
|
13 |
+
|
14 |
+
def getinstance(*args, **kwargs):
|
15 |
+
if class_ not in instances:
|
16 |
+
instances[class_] = class_(*args, **kwargs)
|
17 |
+
return instances[class_]
|
18 |
+
return getinstance
|
19 |
+
|
20 |
+
|
21 |
+
def preprocess_model_args(args):
|
22 |
+
# If args has layer_units, get the corresponding
|
23 |
+
# units.
|
24 |
+
# If args get backbone, get the backbone model.
|
25 |
+
args = copy.deepcopy(args)
|
26 |
+
if 'layer_units' in args:
|
27 |
+
layer_units = [
|
28 |
+
get_unit()(i) for i in args.layer_units
|
29 |
+
]
|
30 |
+
args.layer_units = layer_units
|
31 |
+
if 'backbone' in args:
|
32 |
+
args.backbone = get_model()(args.backbone)
|
33 |
+
return args
|
34 |
+
|
35 |
+
@singleton
|
36 |
+
class get_model(object):
|
37 |
+
def __init__(self):
|
38 |
+
self.model = {}
|
39 |
+
self.version = {}
|
40 |
+
|
41 |
+
def register(self, model, name, version='x'):
|
42 |
+
self.model[name] = model
|
43 |
+
self.version[name] = version
|
44 |
+
|
45 |
+
def __call__(self, cfg, verbose=True):
|
46 |
+
"""
|
47 |
+
Construct model based on the config.
|
48 |
+
"""
|
49 |
+
t = cfg.type
|
50 |
+
|
51 |
+
# the register is in each file
|
52 |
+
if t.find('audioldm')==0:
|
53 |
+
from ..latent_diffusion.vae import audioldm
|
54 |
+
elif t.find('autoencoderkl')==0:
|
55 |
+
from ..latent_diffusion.vae import autokl
|
56 |
+
elif t.find('optimus')==0:
|
57 |
+
from ..latent_diffusion.vae import optimus
|
58 |
+
|
59 |
+
elif t.find('clip')==0:
|
60 |
+
from ..encoders import clip
|
61 |
+
elif t.find('clap')==0:
|
62 |
+
from ..encoders import clap
|
63 |
+
|
64 |
+
elif t.find('sd')==0:
|
65 |
+
from .. import sd
|
66 |
+
elif t.find('codi')==0:
|
67 |
+
from .. import codi
|
68 |
+
elif t.find('thesis_model')==0:
|
69 |
+
from .. import codi_2
|
70 |
+
elif t.find('openai_unet')==0:
|
71 |
+
from ..latent_diffusion import diffusion_unet
|
72 |
+
elif t.find('prova')==0:
|
73 |
+
from ..latent_diffusion import diffusion_unet
|
74 |
+
|
75 |
+
args = preprocess_model_args(cfg.args)
|
76 |
+
net = self.model[t](**args)
|
77 |
+
|
78 |
+
return net
|
79 |
+
|
80 |
+
def get_version(self, name):
|
81 |
+
return self.version[name]
|
82 |
+
|
83 |
+
|
84 |
+
def register(name, version='x'):
|
85 |
+
def wrapper(class_):
|
86 |
+
get_model().register(class_, name, version)
|
87 |
+
return class_
|
88 |
+
return wrapper
|
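get_model above is a singleton registry: @register('name') stores a class under that name and get_model()(cfg) instantiates cfg.type with cfg.args. A minimal sketch with a toy class and an in-code config (the real configs come from the yaml files; the class and values here are hypothetical):

    import torch.nn as nn
    from types import SimpleNamespace

    @register('toy_mlp', version='0')
    class ToyMLP(nn.Module):
        def __init__(self, in_dim, out_dim):
            super().__init__()
            self.net = nn.Linear(in_dim, out_dim)

        def forward(self, x):
            return self.net(x)

    cfg = SimpleNamespace(type='toy_mlp', args={'in_dim': 16, 'out_dim': 8})
    # net = get_model()(cfg)   # looks up 'toy_mlp' and calls ToyMLP(**cfg.args)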
core/models/common/get_optimizer.py
ADDED
@@ -0,0 +1,50 @@
1 |
+
import torch
|
2 |
+
import torch.optim as optim
|
3 |
+
import numpy as np
|
4 |
+
import itertools
|
5 |
+
|
6 |
+
|
7 |
+
def singleton(class_):
|
8 |
+
instances = {}
|
9 |
+
|
10 |
+
def getinstance(*args, **kwargs):
|
11 |
+
if class_ not in instances:
|
12 |
+
instances[class_] = class_(*args, **kwargs)
|
13 |
+
return instances[class_]
|
14 |
+
return getinstance
|
15 |
+
|
16 |
+
|
17 |
+
class get_optimizer(object):
|
18 |
+
def __init__(self):
|
19 |
+
self.optimizer = {}
|
20 |
+
self.register(optim.SGD, 'sgd')
|
21 |
+
self.register(optim.Adam, 'adam')
|
22 |
+
self.register(optim.AdamW, 'adamw')
|
23 |
+
|
24 |
+
def register(self, optim, name):
|
25 |
+
self.optimizer[name] = optim
|
26 |
+
|
27 |
+
def __call__(self, net, cfg):
|
28 |
+
if cfg is None:
|
29 |
+
return None
|
30 |
+
t = cfg.type
|
31 |
+
if isinstance(net, (torch.nn.DataParallel,
|
32 |
+
torch.nn.parallel.DistributedDataParallel)):
|
33 |
+
netm = net.module
|
34 |
+
else:
|
35 |
+
netm = net
|
36 |
+
pg = getattr(netm, 'parameter_group', None)
|
37 |
+
|
38 |
+
if pg is not None:
|
39 |
+
params = []
|
40 |
+
for group_name, module_or_para in pg.items():
|
41 |
+
if not isinstance(module_or_para, list):
|
42 |
+
module_or_para = [module_or_para]
|
43 |
+
|
44 |
+
grouped_params = [mi.parameters() if isinstance(mi, torch.nn.Module) else [mi] for mi in module_or_para]
|
45 |
+
grouped_params = itertools.chain(*grouped_params)
|
46 |
+
pg_dict = {'params': grouped_params, 'name': group_name}
|
47 |
+
params.append(pg_dict)
|
48 |
+
else:
|
49 |
+
params = net.parameters()
|
50 |
+
return self.optimizer[t](params, lr=0, **cfg.args)
|
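get_optimizer above checks the unwrapped network for an optional parameter_group attribute; each entry becomes a named torch.optim parameter group, which is what lets a scheduler scale learning rates per group through pg_lrscale. A minimal sketch of wiring that up (group names and values are hypothetical):

    import torch.nn as nn
    from types import SimpleNamespace

    class TwoPartNet(nn.Module):
        def __init__(self):
            super().__init__()
            self.backbone = nn.Linear(32, 32)
            self.head = nn.Linear(32, 4)
            # consumed by get_optimizer above to build named param groups
            self.parameter_group = {'backbone': self.backbone, 'head': self.head}

    net = TwoPartNet()
    opt_cfg = SimpleNamespace(type='adamw', args={'weight_decay': 0.01})
    opt = get_optimizer()(net, opt_cfg)   # lr starts at 0; a scheduler sets it per step
    # a scheduler instance could then call set_lr(opt, 1e-4, pg_lrscale={'backbone': 0.1, 'head': 1.0})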
core/models/common/get_scheduler.py
ADDED
@@ -0,0 +1,273 @@
1 |
+
import torch
|
2 |
+
import torch.optim as optim
|
3 |
+
import numpy as np
|
4 |
+
import copy
|
5 |
+
from ... import sync
|
6 |
+
from ...cfg_holder import cfg_unique_holder as cfguh
|
7 |
+
|
8 |
+
|
9 |
+
def singleton(class_):
|
10 |
+
instances = {}
|
11 |
+
|
12 |
+
def getinstance(*args, **kwargs):
|
13 |
+
if class_ not in instances:
|
14 |
+
instances[class_] = class_(*args, **kwargs)
|
15 |
+
return instances[class_]
|
16 |
+
return getinstance
|
17 |
+
|
18 |
+
|
19 |
+
@singleton
|
20 |
+
class get_scheduler(object):
|
21 |
+
def __init__(self):
|
22 |
+
self.lr_scheduler = {}
|
23 |
+
|
24 |
+
def register(self, lrsf, name):
|
25 |
+
self.lr_scheduler[name] = lrsf
|
26 |
+
|
27 |
+
def __call__(self, cfg):
|
28 |
+
if cfg is None:
|
29 |
+
return None
|
30 |
+
if isinstance(cfg, list):
|
31 |
+
schedulers = []
|
32 |
+
for ci in cfg:
|
33 |
+
t = ci.type
|
34 |
+
schedulers.append(
|
35 |
+
self.lr_scheduler[t](**ci.args))
|
36 |
+
if len(schedulers) == 0:
|
37 |
+
raise ValueError
|
38 |
+
else:
|
39 |
+
return compose_scheduler(schedulers)
|
40 |
+
t = cfg.type
|
41 |
+
return self.lr_scheduler[t](**cfg.args)
|
42 |
+
|
43 |
+
|
44 |
+
def register(name):
|
45 |
+
def wrapper(class_):
|
46 |
+
get_scheduler().register(class_, name)
|
47 |
+
return class_
|
48 |
+
return wrapper
|
49 |
+
|
50 |
+
|
51 |
+
class template_scheduler(object):
|
52 |
+
def __init__(self, step):
|
53 |
+
self.step = step
|
54 |
+
|
55 |
+
def __getitem__(self, idx):
|
56 |
+
raise ValueError
|
57 |
+
|
58 |
+
def set_lr(self, optim, new_lr, pg_lrscale=None):
|
59 |
+
"""
|
60 |
+
Set Each parameter_groups in optim with new_lr
|
61 |
+
New_lr can be find according to the idx.
|
62 |
+
pg_lrscale tells how to scale each pg.
|
63 |
+
"""
|
64 |
+
# new_lr = self.__getitem__(idx)
|
65 |
+
pg_lrscale = copy.deepcopy(pg_lrscale)
|
66 |
+
for pg in optim.param_groups:
|
67 |
+
if pg_lrscale is None:
|
68 |
+
pg['lr'] = new_lr
|
69 |
+
else:
|
70 |
+
pg['lr'] = new_lr * pg_lrscale.pop(pg['name'])
|
71 |
+
assert (pg_lrscale is None) or (len(pg_lrscale)==0), \
|
72 |
+
"pg_lrscale doesn't match pg"
|
73 |
+
|
74 |
+
@register('constant')
|
75 |
+
class constant_scheduler(template_scheduler):
|
76 |
+
def __init__(self, lr, step):
|
77 |
+
super().__init__(step)
|
78 |
+
self.lr = lr
|
79 |
+
|
80 |
+
def __getitem__(self, idx):
|
81 |
+
if idx >= self.step:
|
82 |
+
raise ValueError
|
83 |
+
return self.lr
|
84 |
+
|
85 |
+
|
86 |
+
@register('poly')
|
87 |
+
class poly_scheduler(template_scheduler):
|
88 |
+
def __init__(self, start_lr, end_lr, power, step):
|
89 |
+
super().__init__(step)
|
90 |
+
self.start_lr = start_lr
|
91 |
+
self.end_lr = end_lr
|
92 |
+
self.power = power
|
93 |
+
|
94 |
+
def __getitem__(self, idx):
|
95 |
+
if idx >= self.step:
|
96 |
+
raise ValueError
|
97 |
+
a, b = self.start_lr, self.end_lr
|
98 |
+
p, n = self.power, self.step
|
99 |
+
return b + (a-b)*((1-idx/n)**p)
|
100 |
+
|
101 |
+
|
102 |
+
@register('linear')
|
103 |
+
class linear_scheduler(template_scheduler):
|
104 |
+
def __init__(self, start_lr, end_lr, step):
|
105 |
+
super().__init__(step)
|
106 |
+
self.start_lr = start_lr
|
107 |
+
self.end_lr = end_lr
|
108 |
+
|
109 |
+
def __getitem__(self, idx):
|
110 |
+
if idx >= self.step:
|
111 |
+
raise ValueError
|
112 |
+
a, b, n = self.start_lr, self.end_lr, self.step
|
113 |
+
return b + (a-b)*(1-idx/n)
|
114 |
+
|
115 |
+
|
116 |
+
@register('multistage')
|
117 |
+
class multistage_scheduler(template_scheduler):
|
118 |
+
def __init__(self, start_lr, milestones, gamma, step):
|
119 |
+
super().__init__(step)
|
120 |
+
self.start_lr = start_lr
|
121 |
+
m = [0] + milestones + [step]
|
122 |
+
lr_iter = start_lr
|
123 |
+
self.lr = []
|
124 |
+
for ms, me in zip(m[0:-1], m[1:]):
|
125 |
+
for _ in range(ms, me):
|
126 |
+
self.lr.append(lr_iter)
|
127 |
+
lr_iter *= gamma
|
128 |
+
|
129 |
+
def __getitem__(self, idx):
|
130 |
+
if idx >= self.step:
|
131 |
+
raise ValueError
|
132 |
+
return self.lr[idx]
|
133 |
+
|
134 |
+
|
135 |
+
class compose_scheduler(template_scheduler):
|
136 |
+
def __init__(self, schedulers):
|
137 |
+
self.schedulers = schedulers
|
138 |
+
self.step = [si.step for si in schedulers]
|
139 |
+
self.step_milestone = []
|
140 |
+
acc = 0
|
141 |
+
for i in self.step:
|
142 |
+
acc += i
|
143 |
+
self.step_milestone.append(acc)
|
144 |
+
self.step = sum(self.step)
|
145 |
+
|
146 |
+
def __getitem__(self, idx):
|
147 |
+
if idx >= self.step:
|
148 |
+
raise ValueError
|
149 |
+
ms = self.step_milestone
|
150 |
+
for i, (mi, mj) in enumerate(zip(ms[:-1], ms[1:])):
|
151 |
+
if mi <= idx < mj:
|
152 |
+
return self.schedulers[i][idx - mi]
|
153 |
+
raise ValueError
|
154 |
+
|
155 |
+
####################
|
156 |
+
# lambda scheduler #
|
157 |
+
####################
|
158 |
+
|
159 |
+
|
160 |
+
class LambdaWarmUpCosineScheduler(template_scheduler):
|
161 |
+
"""
|
162 |
+
note: use with a base_lr of 1.0
|
163 |
+
"""
|
164 |
+
def __init__(self,
|
165 |
+
base_lr,
|
166 |
+
warm_up_steps,
|
167 |
+
lr_min, lr_max, lr_start, max_decay_steps, verbosity_interval=0):
|
168 |
+
cfgt = cfguh().cfg.train
|
169 |
+
bs = cfgt.batch_size
|
170 |
+
if 'gradacc_every' not in cfgt:
|
171 |
+
print('Warning: gradacc_every not found in the train config, using 1 as default.')
|
172 |
+
acc = cfgt.get('gradacc_every', 1)
|
173 |
+
self.lr_multi = base_lr * bs * acc
|
174 |
+
self.lr_warm_up_steps = warm_up_steps
|
175 |
+
self.lr_start = lr_start
|
176 |
+
self.lr_min = lr_min
|
177 |
+
self.lr_max = lr_max
|
178 |
+
self.lr_max_decay_steps = max_decay_steps
|
179 |
+
self.last_lr = 0.
|
180 |
+
self.verbosity_interval = verbosity_interval
|
181 |
+
|
182 |
+
def schedule(self, n):
|
183 |
+
if self.verbosity_interval > 0:
|
184 |
+
if n % self.verbosity_interval == 0:
|
185 |
+
print(f"current step: {n}, recent lr-multiplier: {self.last_lr}")
|
186 |
+
if n < self.lr_warm_up_steps:
|
187 |
+
lr = (self.lr_max - self.lr_start) / self.lr_warm_up_steps * n + self.lr_start
|
188 |
+
self.last_lr = lr
|
189 |
+
return lr
|
190 |
+
else:
|
191 |
+
t = (n - self.lr_warm_up_steps) / (self.lr_max_decay_steps - self.lr_warm_up_steps)
|
192 |
+
t = min(t, 1.0)
|
193 |
+
lr = self.lr_min + 0.5 * (self.lr_max - self.lr_min) * (
|
194 |
+
1 + np.cos(t * np.pi))
|
195 |
+
self.last_lr = lr
|
196 |
+
return lr
|
197 |
+
|
198 |
+
def __getitem__(self, idx):
|
199 |
+
return self.schedule(idx) * self.lr_multi
|
200 |
+
|
201 |
+
|
202 |
+
class LambdaWarmUpCosineScheduler2(template_scheduler):
|
203 |
+
"""
|
204 |
+
supports repeated iterations, configurable via lists
|
205 |
+
note: use with a base_lr of 1.0.
|
206 |
+
"""
|
207 |
+
def __init__(self,
|
208 |
+
base_lr,
|
209 |
+
warm_up_steps,
|
210 |
+
f_min, f_max, f_start, cycle_lengths, verbosity_interval=0):
|
211 |
+
cfgt = cfguh().cfg.train
|
212 |
+
# bs = cfgt.batch_size
|
213 |
+
# if 'gradacc_every' not in cfgt:
|
214 |
+
# print('Warning, gradacc_every is not found in xml, use 1 as default.')
|
215 |
+
# acc = cfgt.get('gradacc_every', 1)
|
216 |
+
# self.lr_multi = base_lr * bs * acc
|
217 |
+
self.lr_multi = base_lr
|
218 |
+
assert len(warm_up_steps) == len(f_min) == len(f_max) == len(f_start) == len(cycle_lengths)
|
219 |
+
self.lr_warm_up_steps = warm_up_steps
|
220 |
+
self.f_start = f_start
|
221 |
+
self.f_min = f_min
|
222 |
+
self.f_max = f_max
|
223 |
+
self.cycle_lengths = cycle_lengths
|
224 |
+
self.cum_cycles = np.cumsum([0] + list(self.cycle_lengths))
|
225 |
+
self.last_f = 0.
|
226 |
+
self.verbosity_interval = verbosity_interval
|
227 |
+
|
228 |
+
def find_in_interval(self, n):
|
229 |
+
interval = 0
|
230 |
+
for cl in self.cum_cycles[1:]:
|
231 |
+
if n <= cl:
|
232 |
+
return interval
|
233 |
+
interval += 1
|
234 |
+
|
235 |
+
def schedule(self, n):
|
236 |
+
cycle = self.find_in_interval(n)
|
237 |
+
n = n - self.cum_cycles[cycle]
|
238 |
+
if self.verbosity_interval > 0:
|
239 |
+
if n % self.verbosity_interval == 0: print(f"current step: {n}, recent lr-multiplier: {self.last_f}, "
|
240 |
+
f"current cycle {cycle}")
|
241 |
+
if n < self.lr_warm_up_steps[cycle]:
|
242 |
+
f = (self.f_max[cycle] - self.f_start[cycle]) / self.lr_warm_up_steps[cycle] * n + self.f_start[cycle]
|
243 |
+
self.last_f = f
|
244 |
+
return f
|
245 |
+
else:
|
246 |
+
t = (n - self.lr_warm_up_steps[cycle]) / (self.cycle_lengths[cycle] - self.lr_warm_up_steps[cycle])
|
247 |
+
t = min(t, 1.0)
|
248 |
+
f = self.f_min[cycle] + 0.5 * (self.f_max[cycle] - self.f_min[cycle]) * (
|
249 |
+
1 + np.cos(t * np.pi))
|
250 |
+
self.last_f = f
|
251 |
+
return f
|
252 |
+
|
253 |
+
def __getitem__(self, idx):
|
254 |
+
return self.schedule(idx) * self.lr_multi
|
255 |
+
|
256 |
+
|
257 |
+
@register('stable_diffusion_linear')
|
258 |
+
class LambdaLinearScheduler(LambdaWarmUpCosineScheduler2):
|
259 |
+
def schedule(self, n):
|
260 |
+
cycle = self.find_in_interval(n)
|
261 |
+
n = n - self.cum_cycles[cycle]
|
262 |
+
if self.verbosity_interval > 0:
|
263 |
+
if n % self.verbosity_interval == 0:
|
264 |
+
print(f"current step: {n}, recent lr-multiplier: {self.last_f}, "
|
265 |
+
f"current cycle {cycle}")
|
266 |
+
if n < self.lr_warm_up_steps[cycle]:
|
267 |
+
f = (self.f_max[cycle] - self.f_start[cycle]) / self.lr_warm_up_steps[cycle] * n + self.f_start[cycle]
|
268 |
+
self.last_f = f
|
269 |
+
return f
|
270 |
+
else:
|
271 |
+
f = self.f_min[cycle] + (self.f_max[cycle] - self.f_min[cycle]) * (self.cycle_lengths[cycle] - n) / (self.cycle_lengths[cycle])
|
272 |
+
self.last_f = f
|
273 |
+
return f
|
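LambdaWarmUpCosineScheduler above ramps the multiplier linearly from lr_start to lr_max over warm_up_steps and then follows half a cosine down to lr_min at max_decay_steps. The same formula, stripped of the config holder and batch-size multiplier, as a standalone sketch:

    import numpy as np

    def warmup_cosine(n, warm_up_steps, lr_start, lr_max, lr_min, max_decay_steps):
        if n < warm_up_steps:
            return (lr_max - lr_start) / warm_up_steps * n + lr_start
        t = min((n - warm_up_steps) / (max_decay_steps - warm_up_steps), 1.0)
        return lr_min + 0.5 * (lr_max - lr_min) * (1 + np.cos(t * np.pi))

    # warmup_cosine(0, 1000, 1e-6, 1e-4, 1e-6, 10000)    -> 1e-6 (start of warm-up)
    # warmup_cosine(1000, 1000, 1e-6, 1e-4, 1e-6, 10000)  -> 1e-4 (peak, start of cosine decay)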
core/models/common/utils.py
ADDED
@@ -0,0 +1,310 @@
1 |
+
import torch
|
2 |
+
import torch.nn as nn
|
3 |
+
import torch.nn.functional as F
|
4 |
+
import numpy as np
|
5 |
+
import functools
|
6 |
+
import itertools
|
7 |
+
|
8 |
+
|
9 |
+
########
|
10 |
+
# unit #
|
11 |
+
########
|
12 |
+
|
13 |
+
|
14 |
+
def singleton(class_):
|
15 |
+
instances = {}
|
16 |
+
|
17 |
+
def getinstance(*args, **kwargs):
|
18 |
+
if class_ not in instances:
|
19 |
+
instances[class_] = class_(*args, **kwargs)
|
20 |
+
return instances[class_]
|
21 |
+
|
22 |
+
return getinstance
|
23 |
+
|
24 |
+
|
25 |
+
def str2value(v):
|
26 |
+
v = v.strip()
|
27 |
+
try:
|
28 |
+
return int(v)
|
29 |
+
except:
|
30 |
+
pass
|
31 |
+
try:
|
32 |
+
return float(v)
|
33 |
+
except:
|
34 |
+
pass
|
35 |
+
if v in ('True', 'true'):
|
36 |
+
return True
|
37 |
+
elif v in ('False', 'false'):
|
38 |
+
return False
|
39 |
+
else:
|
40 |
+
return v
|
41 |
+
|
42 |
+
|
43 |
+
@singleton
|
44 |
+
class get_unit(object):
|
45 |
+
def __init__(self):
|
46 |
+
self.unit = {}
|
47 |
+
self.register('none', None)
|
48 |
+
|
49 |
+
# general convolution
|
50 |
+
self.register('conv', nn.Conv2d)
|
51 |
+
self.register('bn', nn.BatchNorm2d)
|
52 |
+
self.register('relu', nn.ReLU)
|
53 |
+
self.register('relu6', nn.ReLU6)
|
54 |
+
self.register('lrelu', nn.LeakyReLU)
|
55 |
+
self.register('dropout', nn.Dropout)
|
56 |
+
self.register('dropout2d', nn.Dropout2d)
|
57 |
+
self.register('sine', Sine)
|
58 |
+
self.register('relusine', ReLUSine)
|
59 |
+
|
60 |
+
def register(self,
|
61 |
+
name,
|
62 |
+
unitf, ):
|
63 |
+
|
64 |
+
self.unit[name] = unitf
|
65 |
+
|
66 |
+
def __call__(self, name):
|
67 |
+
if name is None:
|
68 |
+
return None
|
69 |
+
i = name.find('(')
|
70 |
+
i = len(name) if i == -1 else i
|
71 |
+
t = name[:i]
|
72 |
+
f = self.unit[t]
|
73 |
+
args = name[i:].strip('()')
|
74 |
+
if len(args) == 0:
|
75 |
+
args = {}
|
76 |
+
return f
|
77 |
+
else:
|
78 |
+
args = args.split('=')
|
79 |
+
args = [[','.join(i.split(',')[:-1]), i.split(',')[-1]] for i in args]
|
80 |
+
args = list(itertools.chain.from_iterable(args))
|
81 |
+
args = [i.strip() for i in args if len(i) > 0]
|
82 |
+
kwargs = {}
|
83 |
+
for k, v in zip(args[::2], args[1::2]):
|
84 |
+
if v[0] == '(' and v[-1] == ')':
|
85 |
+
kwargs[k] = tuple([str2value(i) for i in v.strip('()').split(',')])
|
86 |
+
elif v[0] == '[' and v[-1] == ']':
|
87 |
+
kwargs[k] = [str2value(i) for i in v.strip('[]').split(',')]
|
88 |
+
else:
|
89 |
+
kwargs[k] = str2value(v)
|
90 |
+
return functools.partial(f, **kwargs)
|
91 |
+
|
92 |
+
|
93 |
+
def register(name):
|
94 |
+
def wrapper(class_):
|
95 |
+
get_unit().register(name, class_)
|
96 |
+
return class_
|
97 |
+
|
98 |
+
return wrapper
|
99 |
+
|
100 |
+
|
101 |
+
class Sine(object):
|
102 |
+
def __init__(self, freq, gain=1):
|
103 |
+
self.freq = freq
|
104 |
+
self.gain = gain
|
105 |
+
self.repr = 'sine(freq={}, gain={})'.format(freq, gain)
|
106 |
+
|
107 |
+
def __call__(self, x, gain=1):
|
108 |
+
act_gain = self.gain * gain
|
109 |
+
return torch.sin(self.freq * x) * act_gain
|
110 |
+
|
111 |
+
def __repr__(self, ):
|
112 |
+
return self.repr
|
113 |
+
|
114 |
+
|
115 |
+
class ReLUSine(nn.Module):
|
116 |
+
def __init__(self):
|
117 |
+
super().__init__()
|
118 |
+
|
119 |
+
def forward(self, input):
|
120 |
+
a = torch.sin(30 * input)
|
121 |
+
b = nn.ReLU(inplace=False)(input)
|
122 |
+
return a + b
|
123 |
+
|
124 |
+
|
125 |
+
@register('lrelu_agc')
|
126 |
+
class lrelu_agc(object):
|
127 |
+
"""
|
128 |
+
The lrelu layer with alpha, gain and clamp
|
129 |
+
"""
|
130 |
+
|
131 |
+
def __init__(self, alpha=0.1, gain=1, clamp=None):
|
132 |
+
# super().__init__()
|
133 |
+
self.alpha = alpha
|
134 |
+
if gain == 'sqrt_2':
|
135 |
+
self.gain = np.sqrt(2)
|
136 |
+
else:
|
137 |
+
self.gain = gain
|
138 |
+
self.clamp = clamp
|
139 |
+
self.repr = 'lrelu_agc(alpha={}, gain={}, clamp={})'.format(
|
140 |
+
alpha, gain, clamp)
|
141 |
+
|
142 |
+
# def forward(self, x, gain=1):
|
143 |
+
def __call__(self, x, gain=1):
|
144 |
+
x = F.leaky_relu(x, negative_slope=self.alpha, inplace=True)
|
145 |
+
act_gain = self.gain * gain
|
146 |
+
act_clamp = self.clamp * gain if self.clamp is not None else None
|
147 |
+
if act_gain != 1:
|
148 |
+
x = x * act_gain
|
149 |
+
if act_clamp is not None:
|
150 |
+
x = x.clamp(-act_clamp, act_clamp)
|
151 |
+
return x
|
152 |
+
|
153 |
+
def __repr__(self, ):
|
154 |
+
return self.repr
|
155 |
+
|
156 |
+
|
157 |
+
####################
|
158 |
+
# spatial encoding #
|
159 |
+
####################
|
160 |
+
|
161 |
+
|
162 |
+
@register('se')
|
163 |
+
class SpatialEncoding(nn.Module):
|
164 |
+
def __init__(self,
|
165 |
+
in_dim,
|
166 |
+
out_dim,
|
167 |
+
sigma=6,
|
168 |
+
cat_input=True,
|
169 |
+
require_grad=False, ):
|
170 |
+
|
171 |
+
super().__init__()
|
172 |
+
assert out_dim % (2 * in_dim) == 0, "out_dim must be divisible by 2 * in_dim"
|
173 |
+
|
174 |
+
n = out_dim // 2 // in_dim
|
175 |
+
m = 2 ** np.linspace(0, sigma, n)
|
176 |
+
m = np.stack([m] + [np.zeros_like(m)] * (in_dim - 1), axis=-1)
|
177 |
+
m = np.concatenate([np.roll(m, i, axis=-1) for i in range(in_dim)], axis=0)
|
178 |
+
self.emb = torch.FloatTensor(m)
|
179 |
+
if require_grad:
|
180 |
+
self.emb = nn.Parameter(self.emb, requires_grad=True)
|
181 |
+
self.in_dim = in_dim
|
182 |
+
self.out_dim = out_dim
|
183 |
+
self.sigma = sigma
|
184 |
+
self.cat_input = cat_input
|
185 |
+
self.require_grad = require_grad
|
186 |
+
|
187 |
+
def forward(self, x, format='[n x c]'):
|
188 |
+
"""
|
189 |
+
Args:
|
190 |
+
x: [n x m1],
|
191 |
+
m1 usually is 2
|
192 |
+
Outputs:
|
193 |
+
y: [n x m2]
|
194 |
+
m2 is the output dimension
|
195 |
+
:param format:
|
196 |
+
"""
|
197 |
+
if format == '[bs x c x 2D]':
|
198 |
+
xshape = x.shape
|
199 |
+
x = x.permute(0, 2, 3, 1).contiguous()
|
200 |
+
x = x.view(-1, x.size(-1))
|
201 |
+
elif format == '[n x c]':
|
202 |
+
pass
|
203 |
+
else:
|
204 |
+
raise ValueError
|
205 |
+
|
206 |
+
if not self.require_grad:
|
207 |
+
self.emb = self.emb.to(x.device)
|
208 |
+
y = torch.mm(x, self.emb.T)
|
209 |
+
if self.cat_input:
|
210 |
+
z = torch.cat([x, torch.sin(y), torch.cos(y)], dim=-1)
|
211 |
+
else:
|
212 |
+
z = torch.cat([torch.sin(y), torch.cos(y)], dim=-1)
|
213 |
+
|
214 |
+
if format == '[bs x c x 2D]':
|
215 |
+
z = z.view(xshape[0], xshape[2], xshape[3], -1)
|
216 |
+
z = z.permute(0, 3, 1, 2).contiguous()
|
217 |
+
return z
|
218 |
+
|
219 |
+
def extra_repr(self):
|
220 |
+
outstr = 'SpatialEncoding (in={}, out={}, sigma={}, cat_input={}, require_grad={})'.format(
|
221 |
+
self.in_dim, self.out_dim, self.sigma, self.cat_input, self.require_grad)
|
222 |
+
return outstr
|
223 |
+
|
224 |
+
|
225 |
+
@register('rffe')
|
226 |
+
class RFFEncoding(SpatialEncoding):
|
227 |
+
"""
|
228 |
+
Random Fourier Features
|
229 |
+
"""
|
230 |
+
|
231 |
+
def __init__(self,
|
232 |
+
in_dim,
|
233 |
+
out_dim,
|
234 |
+
sigma=6,
|
235 |
+
cat_input=True,
|
236 |
+
require_grad=False, ):
|
237 |
+
super().__init__(in_dim, out_dim, sigma, cat_input, require_grad)
|
238 |
+
n = out_dim // 2
|
239 |
+
m = np.random.normal(0, sigma, size=(n, in_dim))
|
240 |
+
self.emb = torch.FloatTensor(m)
|
241 |
+
if require_grad:
|
242 |
+
self.emb = nn.Parameter(self.emb, requires_grad=True)
|
243 |
+
|
244 |
+
def extra_repr(self):
|
245 |
+
outstr = 'RFFEncoding (in={}, out={}, sigma={}, cat_input={}, require_grad={})'.format(
|
246 |
+
self.in_dim, self.out_dim, self.sigma, self.cat_input, self.require_grad)
|
247 |
+
return outstr
|
248 |
+
|
249 |
+
|
250 |
+
##########
|
251 |
+
# helper #
|
252 |
+
##########
|
253 |
+
|
254 |
+
|
255 |
+
def freeze(net):
|
256 |
+
for m in net.modules():
|
257 |
+
if isinstance(m, (
|
258 |
+
nn.BatchNorm2d,
|
259 |
+
nn.SyncBatchNorm,)):
|
260 |
+
# inplace_abn not supported
|
261 |
+
m.eval()
|
262 |
+
for pi in net.parameters():
|
263 |
+
pi.requires_grad = False
|
264 |
+
return net
|
265 |
+
|
266 |
+
|
267 |
+
def common_init(m):
|
268 |
+
if isinstance(m, (
|
269 |
+
nn.Conv2d,
|
270 |
+
nn.ConvTranspose2d,)):
|
271 |
+
nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
|
272 |
+
if m.bias is not None:
|
273 |
+
nn.init.constant_(m.bias, 0)
|
274 |
+
elif isinstance(m, (
|
275 |
+
nn.BatchNorm2d,
|
276 |
+
nn.SyncBatchNorm,)):
|
277 |
+
nn.init.constant_(m.weight, 1)
|
278 |
+
nn.init.constant_(m.bias, 0)
|
279 |
+
else:
|
280 |
+
pass
|
281 |
+
|
282 |
+
|
283 |
+
def init_module(module):
|
284 |
+
"""
|
285 |
+
Args:
|
286 |
+
module: [nn.module] list or nn.module
|
287 |
+
a list of module to be initialized.
|
288 |
+
"""
|
289 |
+
if isinstance(module, (list, tuple)):
|
290 |
+
module = list(module)
|
291 |
+
else:
|
292 |
+
module = [module]
|
293 |
+
|
294 |
+
for mi in module:
|
295 |
+
for mii in mi.modules():
|
296 |
+
common_init(mii)
|
297 |
+
|
298 |
+
|
299 |
+
def get_total_param(net):
|
300 |
+
if getattr(net, 'parameters', None) is None:
|
301 |
+
return 0
|
302 |
+
return sum(p.numel() for p in net.parameters())
|
303 |
+
|
304 |
+
|
305 |
+
def get_total_param_sum(net):
|
306 |
+
if getattr(net, 'parameters', None) is None:
|
307 |
+
return 0
|
308 |
+
with torch.no_grad():
|
309 |
+
s = sum(p.cpu().detach().numpy().sum().item() for p in net.parameters())
|
310 |
+
return s
|
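get_unit above parses unit spec strings of the form name(k=v, ...) into layer factories via functools.partial, which is how string-valued config entries turn into modules. A minimal sketch of resolving a few specs (argument values are illustrative):

    relu_cls = get_unit()('relu')                                        # -> nn.ReLU (no args, class returned as-is)
    act = get_unit()('lrelu_agc(alpha=0.2, gain=sqrt_2, clamp=256)')()   # partial -> lrelu_agc instance
    conv_factory = get_unit()('conv(kernel_size=3, padding=1)')          # partial around nn.Conv2d
    layer = conv_factory(64, 64)                                         # nn.Conv2d(64, 64, kernel_size=3, padding=1)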
core/models/dani_model.py
ADDED
@@ -0,0 +1,170 @@
1 |
+
import os
|
2 |
+
import numpy as np
|
3 |
+
import torch
|
4 |
+
import torch.nn as nn
|
5 |
+
import torch.nn.functional as F
|
6 |
+
import torchvision.transforms as tvtrans
|
7 |
+
|
8 |
+
from einops import rearrange
|
9 |
+
|
10 |
+
import pytorch_lightning as pl
|
11 |
+
|
12 |
+
from . import get_model
|
13 |
+
from ..cfg_helper import model_cfg_bank
|
14 |
+
from ..common.utils import regularize_image, regularize_video, remove_duplicate_word
|
15 |
+
|
16 |
+
import warnings
|
17 |
+
|
18 |
+
warnings.filterwarnings("ignore")
|
19 |
+
|
20 |
+
|
21 |
+
class dani_model(pl.LightningModule):
|
22 |
+
def __init__(self, model='thesis_model', load_weights=True, data_dir='pretrained', pth=["CoDi_encoders.pth"], fp16=False):
|
23 |
+
super().__init__()
|
24 |
+
# import torch
|
25 |
+
# device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
|
26 |
+
cfgm = model_cfg_bank()(model)
|
27 |
+
net = get_model()(cfgm)
|
28 |
+
if load_weights:
|
29 |
+
for path in pth:
|
30 |
+
net.load_state_dict(torch.load(os.path.join(data_dir, path), map_location='cpu'), strict=False)
|
31 |
+
print('Loaded pretrained weights from {}'.format(pth))
|
32 |
+
|
33 |
+
self.net = net
|
34 |
+
|
35 |
+
from core.models.ddim.ddim_vd import DDIMSampler_VD
|
36 |
+
self.sampler = DDIMSampler_VD(net)
|
37 |
+
|
38 |
+
def decode(self, z, xtype):
|
39 |
+
device = z.device
|
40 |
+
net = self.net
|
41 |
+
z = z.to(device)
|
42 |
+
if xtype == 'image':
|
43 |
+
x = net.autokl_decode(z)
|
44 |
+
x = torch.clamp((x + 1.0) / 2.0, min=0.0, max=1.0)
|
45 |
+
return x
|
46 |
+
|
47 |
+
elif xtype == 'video':
|
48 |
+
num_frames = z.shape[2]
|
49 |
+
z = rearrange(z, 'b c f h w -> (b f) c h w')
|
50 |
+
x = net.autokl_decode(z)
|
51 |
+
x = rearrange(x, '(b f) c h w -> b f c h w', f=num_frames)
|
52 |
+
|
53 |
+
x = torch.clamp((x + 1.0) / 2.0, min=0.0, max=1.0)
|
54 |
+
video_list = []
|
55 |
+
for video in x:
|
56 |
+
video_list.append([tvtrans.ToPILImage()(xi) for xi in video])
|
57 |
+
return video_list
|
58 |
+
|
59 |
+
elif xtype == 'text':
|
60 |
+
prompt_temperature = 1.0
|
61 |
+
prompt_merge_same_adj_word = True
|
62 |
+
x = net.optimus_decode(z, temperature=prompt_temperature)
|
63 |
+
"""
|
64 |
+
if prompt_merge_same_adj_word:
|
65 |
+
xnew = []
|
66 |
+
for xi in x:
|
67 |
+
xi_split = xi.split()
|
68 |
+
xinew = []
|
69 |
+
for idxi, wi in enumerate(xi_split):
|
70 |
+
if idxi!=0 and wi==xi_split[idxi-1]:
|
71 |
+
continue
|
72 |
+
xinew.append(wi)
|
73 |
+
xnew.append(remove_duplicate_word(' '.join(xinew)))
|
74 |
+
x = xnew
|
75 |
+
"""
|
76 |
+
return x
|
77 |
+
|
78 |
+
elif xtype == 'audio':
|
79 |
+
x = net.audioldm_decode(z)
|
80 |
+
x = net.mel_spectrogram_to_waveform(x)
|
81 |
+
return x
|
82 |
+
|
83 |
+
def forward(self, xtype=[], condition=[], condition_types=[], n_samples=1,
|
84 |
+
mix_weight={'video': 1, 'audio': 1, 'text': 1, 'image': 1}, image_size=256, ddim_steps=50, scale=7.5,
|
85 |
+
num_frames=8):
|
86 |
+
# import torch
|
87 |
+
# device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
|
88 |
+
device = self.device
|
89 |
+
net = self.net
|
90 |
+
sampler = self.sampler
|
91 |
+
ddim_eta = 0.0
|
92 |
+
|
93 |
+
conditioning = []
|
94 |
+
assert len(set(condition_types)) == len(condition_types), "we don't support condition with same modalities yet."
|
95 |
+
assert len(condition) == len(condition_types)
|
96 |
+
|
97 |
+
for i, condition_type in enumerate(condition_types):
|
98 |
+
if condition_type == 'image':
|
99 |
+
print(condition[i].shape)
|
100 |
+
ctemp1 = regularize_image(condition[i]).squeeze().to(device)
|
101 |
+
print(ctemp1.shape)
|
102 |
+
ctemp1 = ctemp1[None].repeat(n_samples, 1, 1, 1)
|
103 |
+
cim = net.clip_encode_vision(ctemp1).to(device)
|
104 |
+
uim = None
|
105 |
+
if scale != 1.0:
|
106 |
+
dummy = torch.zeros_like(ctemp1).to(device)
|
107 |
+
uim = net.clip_encode_vision(dummy).to(device)
|
108 |
+
conditioning.append(torch.cat([uim, cim]))
|
109 |
+
|
110 |
+
elif condition_type == 'video':
|
111 |
+
ctemp1 = regularize_video(condition[i]).to(device)
|
112 |
+
ctemp1 = ctemp1[None].repeat(n_samples, 1, 1, 1, 1)
|
113 |
+
cim = net.clip_encode_vision(ctemp1).to(device)
|
114 |
+
uim = None
|
115 |
+
if scale != 1.0:
|
116 |
+
dummy = torch.zeros_like(ctemp1).to(device)
|
117 |
+
uim = net.clip_encode_vision(dummy).to(device)
|
118 |
+
conditioning.append(torch.cat([uim, cim]))
|
119 |
+
|
120 |
+
elif condition_type == 'audio':
|
121 |
+
ctemp = condition[i][None].repeat(n_samples, 1, 1)
|
122 |
+
cad = net.clap_encode_audio(ctemp)
|
123 |
+
uad = None
|
124 |
+
if scale != 1.0:
|
125 |
+
dummy = torch.zeros_like(ctemp)
|
126 |
+
uad = net.clap_encode_audio(dummy)
|
127 |
+
conditioning.append(torch.cat([uad, cad]))
|
128 |
+
|
129 |
+
elif condition_type == 'text':
|
130 |
+
ctx = net.clip_encode_text(n_samples * [condition[i]]).to(device)
|
131 |
+
utx = None
|
132 |
+
if scale != 1.0:
|
133 |
+
utx = net.clip_encode_text(n_samples * [""]).to(device)
|
134 |
+
conditioning.append(torch.cat([utx, ctx]))
|
135 |
+
|
136 |
+
shapes = []
|
137 |
+
for xtype_i in xtype:
|
138 |
+
if xtype_i == 'image':
|
139 |
+
h, w = [image_size, image_size]
|
140 |
+
shape = [n_samples, 4, h // 8, w // 8]
|
141 |
+
elif xtype_i == 'video':
|
142 |
+
h, w = [image_size, image_size]
|
143 |
+
shape = [n_samples, 4, num_frames, h // 8, w // 8]
|
144 |
+
elif xtype_i == 'text':
|
145 |
+
n = 768
|
146 |
+
shape = [n_samples, n]
|
147 |
+
elif xtype_i == 'audio':
|
148 |
+
h, w = [256, 16]
|
149 |
+
shape = [n_samples, 8, h, w]
|
150 |
+
else:
|
151 |
+
raise
|
152 |
+
shapes.append(shape)
|
153 |
+
|
154 |
+
z, _ = sampler.sample(
|
155 |
+
steps=ddim_steps,
|
156 |
+
shape=shapes,
|
157 |
+
condition=conditioning,
|
158 |
+
unconditional_guidance_scale=scale,
|
159 |
+
xtype=xtype,
|
160 |
+
condition_types=condition_types,
|
161 |
+
eta=ddim_eta,
|
162 |
+
verbose=False,
|
163 |
+
mix_weight=mix_weight)
|
164 |
+
|
165 |
+
out_all = []
|
166 |
+
for i, xtype_i in enumerate(xtype):
|
167 |
+
z[i] = z[i].to(device)
|
168 |
+
x_i = self.decode(z[i], xtype_i)
|
169 |
+
out_all.append(x_i)
|
170 |
+
return out_all
|
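An illustrative usage sketch (not part of the uploaded file): a text-to-image call through the wrapper above, assuming the checkpoint really is available as pretrained/CoDi_encoders.pth; the prompt string and device choice are placeholders only.

# Hypothetical inference sketch for dani_model; paths and prompt are illustrative.
import torch
from core.models.dani_model import dani_model

model = dani_model(model='thesis_model', load_weights=True, data_dir='pretrained')
model = model.to('cuda' if torch.cuda.is_available() else 'cpu').eval()

with torch.no_grad():
    outputs = model(
        xtype=['image'],                          # modalities to generate
        condition=['a dog playing in the snow'],  # one entry per condition
        condition_types=['text'],
        n_samples=1,
        image_size=256,
        ddim_steps=50,
        scale=7.5,                                # classifier-free guidance scale
    )
images = outputs[0]                               # [1, 3, 256, 256] tensor in [0, 1]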
core/models/ddim/__pycache__/ddim.cpython-38.pyc
ADDED
Binary file (6.27 kB).

core/models/ddim/__pycache__/ddim_vd.cpython-38.pyc
ADDED
Binary file (4.29 kB).

core/models/ddim/__pycache__/diffusion_utils.cpython-38.pyc
ADDED
Binary file (9.56 kB).
core/models/ddim/ddim.py
ADDED
@@ -0,0 +1,224 @@
"""SAMPLING ONLY."""

import torch
import numpy as np
from tqdm import tqdm
from functools import partial

from .diffusion_utils import make_ddim_sampling_parameters, make_ddim_timesteps, noise_like


class DDIMSampler(object):
    def __init__(self, model, schedule="linear", **kwargs):
        super().__init__()
        self.model = model
        self.ddpm_num_timesteps = model.num_timesteps
        self.schedule = schedule

    def register_buffer(self, name, attr):
        device = self.model.device

        if type(attr) == torch.Tensor:
            if attr.device != device:
                attr = attr.to(device)
        setattr(self, name, attr)

    def make_schedule(self, ddim_num_steps, ddim_discretize="uniform", ddim_eta=0., verbose=True):
        self.ddim_timesteps = make_ddim_timesteps(ddim_discr_method=ddim_discretize,
                                                  num_ddim_timesteps=ddim_num_steps,
                                                  num_ddpm_timesteps=self.ddpm_num_timesteps,
                                                  verbose=verbose)
        alphas_cumprod = self.model.alphas_cumprod
        assert alphas_cumprod.shape[0] == self.ddpm_num_timesteps, 'alphas have to be defined for each timestep'
        to_torch = lambda x: x.clone().detach().to(torch.float32).to(self.model.device)

        self.register_buffer('betas', to_torch(self.model.betas))
        self.register_buffer('alphas_cumprod', to_torch(alphas_cumprod))
        self.register_buffer('alphas_cumprod_prev', to_torch(self.model.alphas_cumprod_prev))

        # calculations for diffusion q(x_t | x_{t-1}) and others
        self.register_buffer('sqrt_alphas_cumprod', to_torch(np.sqrt(alphas_cumprod.cpu())))
        self.register_buffer('sqrt_one_minus_alphas_cumprod', to_torch(np.sqrt(1. - alphas_cumprod.cpu())))
        self.register_buffer('log_one_minus_alphas_cumprod', to_torch(np.log(1. - alphas_cumprod.cpu())))
        self.register_buffer('sqrt_recip_alphas_cumprod', to_torch(np.sqrt(1. / alphas_cumprod.cpu())))
        self.register_buffer('sqrt_recipm1_alphas_cumprod', to_torch(np.sqrt(1. / alphas_cumprod.cpu() - 1)))

        # ddim sampling parameters
        ddim_sigmas, ddim_alphas, ddim_alphas_prev = make_ddim_sampling_parameters(
            alphacums=alphas_cumprod.cpu(),
            ddim_timesteps=self.ddim_timesteps,
            eta=ddim_eta, verbose=verbose)

        self.register_buffer('ddim_sigmas', ddim_sigmas)
        self.register_buffer('ddim_alphas', ddim_alphas)
        self.register_buffer('ddim_alphas_prev', ddim_alphas_prev)
        self.register_buffer('ddim_sqrt_one_minus_alphas', np.sqrt(1. - ddim_alphas))
        sigmas_for_original_sampling_steps = ddim_eta * torch.sqrt(
            (1 - self.alphas_cumprod_prev) / (1 - self.alphas_cumprod) * (
                1 - self.alphas_cumprod / self.alphas_cumprod_prev))
        self.register_buffer('ddim_sigmas_for_original_num_steps', sigmas_for_original_sampling_steps)

    @torch.no_grad()
    def sample(self,
               S,
               batch_size,
               shape,
               conditioning=None,
               callback=None,
               normals_sequence=None,
               img_callback=None,
               quantize_x0=False,
               eta=0.,
               mask=None,
               x0=None,
               temperature=1.,
               noise_dropout=0.,
               score_corrector=None,
               corrector_kwargs=None,
               verbose=True,
               x_T=None,
               log_every_t=100,
               unconditional_guidance_scale=1.,
               unconditional_conditioning=None,
               video_frame_share_noise=False,
               # this has to come in the same format as the conditioning, e.g. as encoded tokens, ...
               **kwargs
               ):
        device = self.model.device

        if conditioning is not None:
            if isinstance(conditioning, dict):
                cbs = conditioning[list(conditioning.keys())[0]].shape[0]
                if cbs != batch_size:
                    print(f"Warning: Got {cbs} conditionings but batch-size is {batch_size}")
            else:
                if conditioning.shape[0] != batch_size:
                    print(f"Warning: Got {conditioning.shape[0]} conditionings but batch-size is {batch_size}")

        self.make_schedule(ddim_num_steps=S, ddim_eta=eta, verbose=verbose)
        # sampling
        C, H, W = shape
        size = (batch_size, C, H, W)
        print(f'Data shape for DDIM sampling is {size}, eta {eta}')

        samples, intermediates = self.ddim_sampling(conditioning, size,
                                                    callback=callback,
                                                    img_callback=img_callback,
                                                    quantize_denoised=quantize_x0,
                                                    mask=mask, x0=x0,
                                                    ddim_use_original_steps=False,
                                                    noise_dropout=noise_dropout,
                                                    temperature=temperature,
                                                    score_corrector=score_corrector,
                                                    corrector_kwargs=corrector_kwargs,
                                                    x_T=x_T,
                                                    log_every_t=log_every_t,
                                                    unconditional_guidance_scale=unconditional_guidance_scale,
                                                    unconditional_conditioning=unconditional_conditioning,
                                                    )
        return samples, intermediates

    @torch.no_grad()
    def ddim_sampling(self,
                      cond, shape,
                      x_T=None,
                      ddim_use_original_steps=False,
                      callback=None,
                      timesteps=None,
                      quantize_denoised=False,
                      mask=None, x0=None,
                      img_callback=None, log_every_t=100,
                      temperature=1.,
                      noise_dropout=0.,
                      score_corrector=None,
                      corrector_kwargs=None,
                      unconditional_guidance_scale=1.,
                      unconditional_conditioning=None,):
        device = self.model.betas.device
        b = shape[0]
        if x_T is None:
            img = torch.randn(shape, device=device)
        else:
            img = x_T

        if timesteps is None:
            timesteps = self.ddpm_num_timesteps if ddim_use_original_steps else self.ddim_timesteps
        elif timesteps is not None and not ddim_use_original_steps:
            subset_end = int(min(timesteps / self.ddim_timesteps.shape[0], 1) * self.ddim_timesteps.shape[0]) - 1
            timesteps = self.ddim_timesteps[:subset_end]

        intermediates = {'x_inter': [img], 'pred_x0': [img]}
        time_range = reversed(range(0, timesteps)) if ddim_use_original_steps else np.flip(timesteps)
        total_steps = timesteps if ddim_use_original_steps else timesteps.shape[0]
        print(f"Running DDIM Sampling with {total_steps} timesteps")

        iterator = tqdm(time_range, desc='DDIM Sampler', total=total_steps)

        for i, step in enumerate(iterator):
            index = total_steps - i - 1
            ts = torch.full((b,), step, device=device, dtype=torch.long)

            if mask is not None:
                assert x0 is not None
                img_orig = self.model.q_sample(x0, ts)  # TODO: deterministic forward pass?
                img = img_orig * mask + (1. - mask) * img

            outs = self.p_sample_ddim(img, cond, ts, index=index, use_original_steps=ddim_use_original_steps,
                                      quantize_denoised=quantize_denoised, temperature=temperature,
                                      noise_dropout=noise_dropout, score_corrector=score_corrector,
                                      corrector_kwargs=corrector_kwargs,
                                      unconditional_guidance_scale=unconditional_guidance_scale,
                                      unconditional_conditioning=unconditional_conditioning)
            img, pred_x0 = outs
            if callback:
                callback(i)
            if img_callback:
                img_callback(pred_x0, i)

            if index % log_every_t == 0 or index == total_steps - 1:
                intermediates['x_inter'].append(img)
                intermediates['pred_x0'].append(pred_x0)

        return img, intermediates

    @torch.no_grad()
    def p_sample_ddim(self, x, c, t, index, repeat_noise=False, use_original_steps=False, quantize_denoised=False,
                      temperature=1., noise_dropout=0., score_corrector=None, corrector_kwargs=None,
                      unconditional_guidance_scale=1., unconditional_conditioning=None):
        b, *_, device = *x.shape, x.device

        if unconditional_conditioning is None or unconditional_guidance_scale == 1.:
            e_t = self.model.apply_model(x, t, c)
        else:
            x_in = torch.cat([x] * 2)
            t_in = torch.cat([t] * 2)
            c_in = torch.cat([unconditional_conditioning, c])
            e_t_uncond, e_t = self.model.apply_model(x_in, t_in, c_in).chunk(2)
            e_t = e_t_uncond + unconditional_guidance_scale * (e_t - e_t_uncond)

        if score_corrector is not None:
            assert self.model.parameterization == "eps"
            e_t = score_corrector.modify_score(self.model, e_t, x, t, c, **corrector_kwargs)

        alphas = self.model.alphas_cumprod if use_original_steps else self.ddim_alphas
        alphas_prev = self.model.alphas_cumprod_prev if use_original_steps else self.ddim_alphas_prev
        sqrt_one_minus_alphas = self.model.sqrt_one_minus_alphas_cumprod if use_original_steps else self.ddim_sqrt_one_minus_alphas
        sigmas = self.model.ddim_sigmas_for_original_num_steps if use_original_steps else self.ddim_sigmas
        # select parameters corresponding to the currently considered timestep
        a_t = torch.full((b, 1, 1, 1), alphas[index], device=device)
        a_prev = torch.full((b, 1, 1, 1), alphas_prev[index], device=device)
        sigma_t = torch.full((b, 1, 1, 1), sigmas[index], device=device)
        sqrt_one_minus_at = torch.full((b, 1, 1, 1), sqrt_one_minus_alphas[index], device=device)

        # current prediction for x_0
        pred_x0 = (x - sqrt_one_minus_at * e_t) / a_t.sqrt()
        if quantize_denoised:
            pred_x0, _, *_ = self.model.first_stage_model.quantize(pred_x0)
        # direction pointing to x_t
        dir_xt = (1. - a_prev - sigma_t**2).sqrt() * e_t
        noise = sigma_t * noise_like(x, repeat_noise) * temperature
        if noise_dropout > 0.:
            noise = torch.nn.functional.dropout(noise, p=noise_dropout)
        x_prev = a_prev.sqrt() * pred_x0 + dir_xt + noise
        return x_prev, pred_x0
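The guidance branch of p_sample_ddim above batches the unconditional and conditional inputs into a single forward pass and then recombines the two noise estimates. A self-contained sketch of just that arithmetic, with a dummy stand-in for model.apply_model, looks like this:

# Standalone sketch of the classifier-free guidance step used in p_sample_ddim.
# fake_eps is a dummy stand-in for self.model.apply_model; shapes follow a 32x32 latent.
import torch

def fake_eps(x, t, c):
    # pretend noise prediction; a real model would use t and cross-attend to c
    return 0.1 * x + 0.01 * c.mean()

b, scale = 2, 7.5
x = torch.randn(b, 4, 32, 32)              # current latent x_t
t = torch.full((b,), 500, dtype=torch.long)
cond = torch.randn(b, 77, 768)             # conditional embedding
uncond = torch.zeros_like(cond)            # unconditional (empty prompt) embedding

x_in = torch.cat([x] * 2)                  # one batch, two halves
t_in = torch.cat([t] * 2)
c_in = torch.cat([uncond, cond])
e_t_uncond, e_t = fake_eps(x_in, t_in, c_in).chunk(2)
e_t = e_t_uncond + scale * (e_t - e_t_uncond)   # guided noise estimate, shape [2, 4, 32, 32]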
core/models/ddim/ddim_vd.py
ADDED
@@ -0,0 +1,175 @@
"""
https://github.com/SHI-Labs/Versatile-Diffusion
"""

import torch
import numpy as np
from tqdm import tqdm
from functools import partial

from .diffusion_utils import make_ddim_sampling_parameters, make_ddim_timesteps, noise_like

from .ddim import DDIMSampler


class DDIMSampler_VD(DDIMSampler):
    @torch.no_grad()
    def sample(self,
               steps,
               shape,
               xt=None,
               condition=None,
               unconditional_guidance_scale=1.,
               xtype='image',
               condition_types=['text'],
               eta=0.,
               temperature=1.,
               mix_weight=None,
               noise_dropout=0.,
               verbose=True,
               log_every_t=100, ):

        self.make_schedule(ddim_num_steps=steps, ddim_eta=eta, verbose=verbose)
        print(f'Data shape for DDIM sampling is {shape}, eta {eta}')
        samples, intermediates = self.ddim_sampling(
            shape,
            xt=xt,
            condition=condition,
            unconditional_guidance_scale=unconditional_guidance_scale,
            xtype=xtype,
            condition_types=condition_types,
            ddim_use_original_steps=False,
            noise_dropout=noise_dropout,
            temperature=temperature,
            log_every_t=log_every_t,
            mix_weight=mix_weight, )
        return samples, intermediates

    @torch.no_grad()
    def ddim_sampling(self,
                      shape,
                      xt=None,
                      condition=None,
                      unconditional_guidance_scale=1.,
                      xtype=['image'],
                      condition_types=['text'],
                      ddim_use_original_steps=False,
                      timesteps=None,
                      noise_dropout=0.,
                      temperature=1.,
                      mix_weight=None,
                      log_every_t=100, ):

        device = self.model.device
        dtype = condition[0][0].dtype

        if isinstance(shape[0], list):
            bs = shape[0][0]
        else:
            bs = shape[0]
        if xt is None:
            if isinstance(shape[0], list):
                xt = [torch.randn(shape_i, device=device, dtype=dtype) for shape_i in shape]
            else:
                xt = torch.randn(shape, device=device, dtype=dtype)

        if timesteps is None:
            timesteps = self.ddpm_num_timesteps if ddim_use_original_steps else self.ddim_timesteps
        elif timesteps is not None and not ddim_use_original_steps:
            subset_end = int(min(timesteps / self.ddim_timesteps.shape[0], 1) * self.ddim_timesteps.shape[0]) - 1
            timesteps = self.ddim_timesteps[:subset_end]

        intermediates = {'pred_xt': [], 'pred_x0': []}
        time_range = reversed(range(0, timesteps)) if ddim_use_original_steps else np.flip(timesteps)
        total_steps = timesteps if ddim_use_original_steps else timesteps.shape[0]
        # print(f"Running DDIM Sampling with {total_steps} timesteps")

        pred_xt = xt
        iterator = tqdm(time_range, desc='DDIM Sampler', total=total_steps)
        for i, step in enumerate(iterator):
            index = total_steps - i - 1
            ts = torch.full((bs,), step, device=device, dtype=torch.long)

            outs = self.p_sample_ddim(
                pred_xt,
                condition,
                ts, index,
                unconditional_guidance_scale=unconditional_guidance_scale,
                xtype=xtype,
                condition_types=condition_types,
                use_original_steps=ddim_use_original_steps,
                noise_dropout=noise_dropout,
                temperature=temperature,
                mix_weight=mix_weight, )
            pred_xt, pred_x0 = outs

            if index % log_every_t == 0 or index == total_steps - 1:
                intermediates['pred_xt'].append(pred_xt)
                intermediates['pred_x0'].append(pred_x0)

        return pred_xt, intermediates

    @torch.no_grad()
    def p_sample_ddim(self, x,
                      condition,
                      t, index,
                      unconditional_guidance_scale=1.,
                      xtype=['image'],
                      condition_types=['text'],
                      repeat_noise=False,
                      use_original_steps=False,
                      noise_dropout=0.,
                      temperature=1.,
                      mix_weight=None, ):

        b, *_, device = *x[0].shape, x[0].device

        x_in = []
        for x_i in x:
            x_in.append(torch.cat([x_i] * 2))
        t_in = torch.cat([t] * 2)

        out = self.model.model.diffusion_model(
            x_in, t_in, condition, xtype=xtype, condition_types=condition_types, mix_weight=mix_weight)
        e_t = []
        for out_i in out:
            e_t_uncond_i, e_t_i = out_i.chunk(2)
            e_t_i = e_t_uncond_i + unconditional_guidance_scale * (e_t_i - e_t_uncond_i)
            e_t_i = e_t_i.to(device)
            e_t.append(e_t_i)

        alphas = self.model.alphas_cumprod if use_original_steps else self.ddim_alphas
        alphas_prev = self.model.alphas_cumprod_prev if use_original_steps else self.ddim_alphas_prev
        sqrt_one_minus_alphas = self.model.sqrt_one_minus_alphas_cumprod if use_original_steps else self.ddim_sqrt_one_minus_alphas
        sigmas = self.model.ddim_sigmas_for_original_num_steps if use_original_steps else self.ddim_sigmas
        # select parameters corresponding to the currently considered timestep

        x_prev = []
        pred_x0 = []
        device = x[0].device
        dtype = x[0].dtype
        for i, xtype_i in enumerate(xtype):
            if xtype_i in ['image', 'frontal', 'lateral']:
                extended_shape = (b, 1, 1, 1)
            elif xtype_i == 'video':
                extended_shape = (b, 1, 1, 1, 1)
            elif xtype_i == 'text':
                extended_shape = (b, 1)
            elif xtype_i == 'audio':
                extended_shape = (b, 1, 1, 1)

            a_t = torch.full(extended_shape, alphas[index], device=device, dtype=dtype)
            a_prev = torch.full(extended_shape, alphas_prev[index], device=device, dtype=dtype)
            sigma_t = torch.full(extended_shape, sigmas[index], device=device, dtype=dtype)
            sqrt_one_minus_at = torch.full(extended_shape, sqrt_one_minus_alphas[index], device=device, dtype=dtype)

            # current prediction for x_0
            pred_x0_i = (x[i] - sqrt_one_minus_at * e_t[i]) / a_t.sqrt()
            dir_xt = (1. - a_prev - sigma_t ** 2).sqrt() * e_t[i]
            noise = sigma_t * noise_like(x[i], repeat_noise) * temperature
            if noise_dropout > 0.:
                noise = torch.nn.functional.dropout(noise, p=noise_dropout)
            x_prev_i = a_prev.sqrt() * pred_x0_i + dir_xt + noise
            x_prev.append(x_prev_i)
            pred_x0.append(pred_x0_i)
        return x_prev, pred_x0
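DDIMSampler_VD generalises the sampler to lists: one latent shape per requested output modality and, per input modality, one conditioning tensor whose first half is the unconditional embedding. A small sketch of that calling convention follows; the embeddings are random placeholders and a real call needs the full network, so the sample call itself is left commented.

# Sketch of the inputs DDIMSampler_VD.sample expects for a joint image + text request.
# Embeddings are random placeholders; shapes mirror those built in dani_model.forward.
import torch

n_samples = 2
condition = [torch.cat([torch.zeros(n_samples, 77, 768),    # unconditional half
                        torch.randn(n_samples, 77, 768)])]  # conditional half (e.g. CLIP text)
shapes = [[n_samples, 4, 32, 32],    # 256x256 image latent (image_size // 8)
          [n_samples, 768]]          # text latent

# With a constructed net this would be:
# sampler = DDIMSampler_VD(net)
# z, _ = sampler.sample(steps=50, shape=shapes, condition=condition,
#                       unconditional_guidance_scale=7.5,
#                       xtype=['image', 'text'], condition_types=['text'],
#                       eta=0.0, verbose=False)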
core/models/ddim/diffusion_utils.py
ADDED
@@ -0,0 +1,273 @@
import os
import math
import torch
import torch.nn as nn
import numpy as np
from einops import repeat


def make_beta_schedule(schedule, n_timestep, linear_start=1e-4, linear_end=2e-2, cosine_s=8e-3):
    if schedule == "linear":
        betas = (
            torch.linspace(linear_start ** 0.5, linear_end ** 0.5, n_timestep, dtype=torch.float64) ** 2
        )

    elif schedule == "cosine":
        timesteps = (
            torch.arange(n_timestep + 1, dtype=torch.float64) / n_timestep + cosine_s
        )
        alphas = timesteps / (1 + cosine_s) * np.pi / 2
        alphas = torch.cos(alphas).pow(2)
        alphas = alphas / alphas[0]
        betas = 1 - alphas[1:] / alphas[:-1]
        betas = np.clip(betas, a_min=0, a_max=0.999)

    elif schedule == "sqrt_linear":
        betas = torch.linspace(linear_start, linear_end, n_timestep, dtype=torch.float64)
    elif schedule == "sqrt":
        betas = torch.linspace(linear_start, linear_end, n_timestep, dtype=torch.float64) ** 0.5
    else:
        raise ValueError(f"schedule '{schedule}' unknown.")
    return betas.numpy()


def make_ddim_timesteps(ddim_discr_method, num_ddim_timesteps, num_ddpm_timesteps, verbose=True):
    if ddim_discr_method == 'uniform':
        c = num_ddpm_timesteps // num_ddim_timesteps
        ddim_timesteps = np.asarray(list(range(0, num_ddpm_timesteps, c)))
    elif ddim_discr_method == 'quad':
        ddim_timesteps = ((np.linspace(0, np.sqrt(num_ddpm_timesteps * .8), num_ddim_timesteps)) ** 2).astype(int)
    else:
        raise NotImplementedError(f'There is no ddim discretization method called "{ddim_discr_method}"')

    # assert ddim_timesteps.shape[0] == num_ddim_timesteps
    # add one to get the final alpha values right (the ones from first scale to data during sampling)
    if num_ddpm_timesteps != 1000:
        steps_out = ddim_timesteps + 1
    else:
        steps_out = ddim_timesteps
    if verbose:
        print(f'Selected timesteps for ddim sampler: {steps_out}')
    return steps_out


def make_ddim_sampling_parameters(alphacums, ddim_timesteps, eta, verbose=True):
    # select alphas for computing the variance schedule
    alphas = alphacums[ddim_timesteps]
    alphas_prev = np.asarray([alphacums[0]] + alphacums[ddim_timesteps[:-1]].tolist())

    # according to the formula provided in https://arxiv.org/abs/2010.02502
    sigmas = eta * np.sqrt((1 - alphas_prev) / (1 - alphas) * (1 - alphas / alphas_prev))
    if verbose:
        print(f'Selected alphas for ddim sampler: a_t: {alphas}; a_(t-1): {alphas_prev}')
        print(f'For the chosen value of eta, which is {eta}, '
              f'this results in the following sigma_t schedule for ddim sampler {sigmas}')
    return sigmas, alphas, alphas_prev


def betas_for_alpha_bar(num_diffusion_timesteps, alpha_bar, max_beta=0.999):
    """
    Create a beta schedule that discretizes the given alpha_t_bar function,
    which defines the cumulative product of (1-beta) over time from t = [0,1].
    :param num_diffusion_timesteps: the number of betas to produce.
    :param alpha_bar: a lambda that takes an argument t from 0 to 1 and
                      produces the cumulative product of (1-beta) up to that
                      part of the diffusion process.
    :param max_beta: the maximum beta to use; use values lower than 1 to
                     prevent singularities.
    """
    betas = []
    for i in range(num_diffusion_timesteps):
        t1 = i / num_diffusion_timesteps
        t2 = (i + 1) / num_diffusion_timesteps
        betas.append(min(1 - alpha_bar(t2) / alpha_bar(t1), max_beta))
    return np.array(betas)


def extract_into_tensor(a, t, x_shape):
    b, *_ = t.shape
    out = a.gather(-1, t)
    return out.reshape(b, *((1,) * (len(x_shape) - 1)))


def checkpoint(func, inputs, params, flag):
    """
    Evaluate a function without caching intermediate activations, allowing for
    reduced memory at the expense of extra compute in the backward pass.
    :param func: the function to evaluate.
    :param inputs: the argument sequence to pass to `func`.
    :param params: a sequence of parameters `func` depends on but does not
                   explicitly take as arguments.
    :param flag: if False, disable gradient checkpointing.
    """
    if flag:
        args = tuple(inputs) + tuple(params)
        return CheckpointFunction.apply(func, len(inputs), *args)
    else:
        return func(*inputs)


class CheckpointFunction(torch.autograd.Function):
    @staticmethod
    def forward(ctx, run_function, length, *args):
        ctx.run_function = run_function
        ctx.input_tensors = list(args[:length])
        ctx.input_params = list(args[length:])

        with torch.no_grad():
            output_tensors = ctx.run_function(*ctx.input_tensors)
        return output_tensors

    @staticmethod
    def backward(ctx, *output_grads):
        ctx.input_tensors = [x.detach().requires_grad_(True) for x in ctx.input_tensors]
        with torch.enable_grad():
            # Fixes a bug where the first op in run_function modifies the
            # Tensor storage in place, which is not allowed for detach()'d
            # Tensors.
            shallow_copies = [x.view_as(x) for x in ctx.input_tensors]
            output_tensors = ctx.run_function(*shallow_copies)
        input_grads = torch.autograd.grad(
            output_tensors,
            ctx.input_tensors + ctx.input_params,
            output_grads,
            allow_unused=True,
        )
        del ctx.input_tensors
        del ctx.input_params
        del output_tensors
        return (None, None) + input_grads


def timestep_embedding(timesteps, dim, max_period=10000, repeat_only=False):
    """
    Create sinusoidal timestep embeddings.
    :param timesteps: a 1-D Tensor of N indices, one per batch element.
                      These may be fractional.
    :param dim: the dimension of the output.
    :param max_period: controls the minimum frequency of the embeddings.
    :return: an [N x dim] Tensor of positional embeddings.
    """
    if not repeat_only:
        half = dim // 2
        freqs = torch.exp(
            -math.log(max_period) * torch.arange(start=0, end=half, dtype=torch.float32) / half
        ).to(device=timesteps.device)
        args = timesteps[:, None].float() * freqs[None]
        embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
        if dim % 2:
            embedding = torch.cat([embedding, torch.zeros_like(embedding[:, :1])], dim=-1)
    else:
        embedding = repeat(timesteps, 'b -> b d', d=dim)
    return embedding


def zero_module(module):
    """
    Zero out the parameters of a module and return it.
    """
    for p in module.parameters():
        p.detach().zero_()
    return module


def scale_module(module, scale):
    """
    Scale the parameters of a module and return it.
    """
    for p in module.parameters():
        p.detach().mul_(scale)
    return module


def mean_flat(tensor):
    """
    Take the mean over all non-batch dimensions.
    """
    return tensor.mean(dim=list(range(1, len(tensor.shape))))


def normalization(channels):
    """
    Make a standard normalization layer.
    :param channels: number of input channels.
    :return: an nn.Module for normalization.
    """
    return GroupNorm32(32, channels)


# PyTorch 1.7 has SiLU, but we support PyTorch 1.5.
class SiLU(nn.Module):
    def forward(self, x):
        return x * torch.sigmoid(x)


class GroupNorm32(nn.GroupNorm):
    def forward(self, x):
        # return super().forward(x.float()).type(x.dtype)
        return super().forward(x)


def conv_nd(dims, *args, **kwargs):
    """
    Create a 1D, 2D, or 3D convolution module.
    """
    if dims == 1:
        return nn.Conv1d(*args, **kwargs)
    elif dims == 2:
        return nn.Conv2d(*args, **kwargs)
    elif dims == 3:
        return nn.Conv3d(*args, **kwargs)
    raise ValueError(f"unsupported dimensions: {dims}")


def linear(*args, **kwargs):
    """
    Create a linear module.
    """
    return nn.Linear(*args, **kwargs)


def avg_pool_nd(dims, *args, **kwargs):
    """
    Create a 1D, 2D, or 3D average pooling module.
    """
    if dims == 1:
        return nn.AvgPool1d(*args, **kwargs)
    elif dims == 2:
        return nn.AvgPool2d(*args, **kwargs)
    elif dims == 3:
        return nn.AvgPool3d(*args, **kwargs)
    raise ValueError(f"unsupported dimensions: {dims}")


class HybridConditioner(nn.Module):

    def __init__(self, c_concat_config, c_crossattn_config):
        super().__init__()
        self.concat_conditioner = instantiate_from_config(c_concat_config)
        self.crossattn_conditioner = instantiate_from_config(c_crossattn_config)

    def forward(self, c_concat, c_crossattn):
        c_concat = self.concat_conditioner(c_concat)
        c_crossattn = self.crossattn_conditioner(c_crossattn)
        return {'c_concat': [c_concat], 'c_crossattn': [c_crossattn]}


def noise_like(x, repeat=False):
    noise = torch.randn_like(x)
    if repeat:
        bs = x.shape[0]
        noise = noise[0:1].repeat(bs, *((1,) * (len(x.shape) - 1)))
    return noise

##########################
# inherit from ldm.utils #
##########################


def count_params(model, verbose=False):
    total_params = sum(p.numel() for p in model.parameters())
    if verbose:
        print(f"{model.__class__.__name__} has {total_params*1.e-6:.2f} M params.")
    return total_params
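A quick, self-contained check of the schedule helpers above (a sketch only; it needs just numpy and torch, and the import path assumes the repository layout of this upload):

# Sketch: exercising the schedule helpers with a 1000-step DDPM schedule
# subsampled to 50 DDIM steps.
import numpy as np
import torch
from core.models.ddim.diffusion_utils import (
    make_beta_schedule, make_ddim_timesteps, make_ddim_sampling_parameters, timestep_embedding)

betas = make_beta_schedule("linear", n_timestep=1000)
alphas_cumprod = np.cumprod(1.0 - betas)

ddim_ts = make_ddim_timesteps("uniform", num_ddim_timesteps=50,
                              num_ddpm_timesteps=1000, verbose=False)
sigmas, alphas, alphas_prev = make_ddim_sampling_parameters(
    alphacums=alphas_cumprod, ddim_timesteps=ddim_ts, eta=0.0, verbose=False)

emb = timestep_embedding(torch.tensor([0, 250, 999]), dim=320)
print(ddim_ts.shape, sigmas.shape, emb.shape)   # (50,) (50,) torch.Size([3, 320])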
core/models/ema.py
ADDED
@@ -0,0 +1,76 @@
import torch
from torch import nn


class LitEma(nn.Module):
    def __init__(self, model, decay=0.9999, use_num_updates=True):
        super().__init__()
        if decay < 0.0 or decay > 1.0:
            raise ValueError('Decay must be between 0 and 1')

        self.m_name2s_name = {}
        self.register_buffer('decay', torch.tensor(decay, dtype=torch.float32))
        self.register_buffer('num_updates', torch.tensor(0, dtype=torch.int) if use_num_updates
                             else torch.tensor(-1, dtype=torch.int))

        for name, p in model.named_parameters():
            if p.requires_grad:
                # remove as '.'-character is not allowed in buffers
                s_name = name.replace('.', '')
                self.m_name2s_name.update({name: s_name})
                self.register_buffer(s_name, p.clone().detach().data)

        self.collected_params = []

    def forward(self, model):
        decay = self.decay

        if self.num_updates >= 0:
            self.num_updates += 1
            decay = min(self.decay, (1 + self.num_updates) / (10 + self.num_updates))

        one_minus_decay = 1.0 - decay

        with torch.no_grad():
            m_param = dict(model.named_parameters())
            shadow_params = dict(self.named_buffers())

            for key in m_param:
                if m_param[key].requires_grad:
                    sname = self.m_name2s_name[key]
                    shadow_params[sname] = shadow_params[sname].type_as(m_param[key])
                    shadow_params[sname].sub_(one_minus_decay * (shadow_params[sname] - m_param[key]))
                else:
                    assert key not in self.m_name2s_name

    def copy_to(self, model):
        m_param = dict(model.named_parameters())
        shadow_params = dict(self.named_buffers())
        for key in m_param:
            if m_param[key].requires_grad:
                m_param[key].data.copy_(shadow_params[self.m_name2s_name[key]].data)
            else:
                assert key not in self.m_name2s_name

    def store(self, parameters):
        """
        Save the current parameters for restoring later.
        Args:
            parameters: Iterable of `torch.nn.Parameter`; the parameters to be
                temporarily stored.
        """
        self.collected_params = [param.clone() for param in parameters]

    def restore(self, parameters):
        """
        Restore the parameters stored with the `store` method.
        Useful to validate the model with EMA parameters without affecting the
        original optimization process. Store the parameters before the
        `copy_to` method. After validation (or model saving), use this to
        restore the former parameters.
        Args:
            parameters: Iterable of `torch.nn.Parameter`; the parameters to be
                updated with the stored parameters.
        """
        for c_param, param in zip(self.collected_params, parameters):
            param.data.copy_(c_param.data)
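A minimal sketch of how the EMA helper above is typically driven during training and evaluation (toy module, illustrative only):

# Sketch: maintaining and evaluating EMA weights with LitEma on a toy model.
import torch
from torch import nn
from core.models.ema import LitEma

model = nn.Linear(4, 4)
ema = LitEma(model, decay=0.999)

opt = torch.optim.SGD(model.parameters(), lr=1e-2)
for _ in range(10):
    loss = model(torch.randn(8, 4)).pow(2).mean()
    opt.zero_grad()
    loss.backward()
    opt.step()
    ema(model)                        # update shadow weights after each optimizer step

ema.store(model.parameters())         # stash current training weights
ema.copy_to(model)                    # evaluate with the EMA weights
ema.restore(model.parameters())       # put the training weights back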
core/models/encoders/__pycache__/clap.cpython-311.pyc
ADDED
Binary file (7.09 kB).

core/models/encoders/__pycache__/clap.cpython-38.pyc
ADDED
Binary file (4.16 kB).

core/models/encoders/__pycache__/clip.cpython-311.pyc
ADDED
Binary file (10.4 kB).

core/models/encoders/__pycache__/clip.cpython-38.pyc
ADDED
Binary file (6 kB).
core/models/encoders/clap.py
ADDED
@@ -0,0 +1,134 @@
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchaudio

from .clap_modules.open_clip import create_model
from .clap_modules.training.data import get_audio_features

from ..common.get_model import register


@register('clap_audio')
class CLAPAudioEmbeddingClassifierFreev2(nn.Module):
    """Uses the CLAP audio encoder"""
    def __init__(
        self,
        pretrained_path="",
        key="waveform",
        sampling_rate=16000,
        embed_mode="audio",
        unconditional_prob=0.1,
        random_mute=False,
        max_random_mute_portion=0.5,
        training_mode=True,
        joint_embed_shape=768,
        embed_shape=512,
        num_layers=12,
        depths=[2, 2, 6, 2],
        amodel="HTSAT-large",
    ):
        super().__init__()

        self.key = key
        self.amodel = amodel  # or 'PANN-14'
        self.tmodel = "roberta"  # the best text encoder in our training
        self.enable_fusion = False  # False if you do not want to use the fusion model
        self.fusion_type = "aff_2d"
        self.pretrained = pretrained_path
        self.embed_mode = embed_mode
        self.embed_mode_orig = embed_mode
        self.sampling_rate = sampling_rate
        self.unconditional_prob = unconditional_prob
        self.random_mute = random_mute
        self.joint_embed_shape = joint_embed_shape
        self.max_random_mute_portion = max_random_mute_portion
        self.training_mode = training_mode
        self.model, self.model_cfg = create_model(
            self.amodel,
            self.tmodel,
            self.pretrained,
            precision="fp32",
            device="cpu",
            enable_fusion=self.enable_fusion,
            fusion_type=self.fusion_type,
            joint_embed_shape=self.joint_embed_shape,
        )

    def get_dtype(self):
        return next(self.model.parameters()).dtype

    def get_unconditional_condition(self, batchsize):
        # NOTE: relies on self.tokenizer being attached externally before this is called.
        self.unconditional_token = self.model.get_text_embedding(
            self.tokenizer(["", ""])
        )[0:1]
        return torch.cat([self.unconditional_token.unsqueeze(0)] * batchsize, dim=0)

    def batch_to_list(self, batch):
        ret = []
        for i in range(batch.size(0)):
            ret.append(batch[i])
        return ret

    def make_decision(self, probability):
        if float(torch.rand(1)) < probability:
            return True
        else:
            return False

    def random_uniform(self, start, end):
        val = torch.rand(1).item()
        return start + (end - start) * val

    def _random_mute(self, waveform):
        # waveform: [bs, t-steps]
        t_steps = waveform.size(-1)
        for i in range(waveform.size(0)):
            mute_size = int(
                self.random_uniform(0, end=int(t_steps * self.max_random_mute_portion))
            )
            mute_start = int(self.random_uniform(0, t_steps - mute_size))
            waveform[i, mute_start : mute_start + mute_size] = 0
        return waveform

    def cos_similarity(self, waveform, text):
        # waveform: [bs, t_steps]
        with torch.no_grad():
            self.embed_mode = "audio"
            audio_emb = self(waveform.cuda())
            self.embed_mode = "text"
            text_emb = self(text)
            similarity = F.cosine_similarity(audio_emb, text_emb, dim=2)
            return similarity.squeeze()

    def forward(self, batch, key=None):

        # the 'fusion' truncate mode can be changed to 'rand_trunc' if run in unfusion mode
        if self.embed_mode == "audio":
            audio_dict_list = []
            assert (
                self.sampling_rate == 16000
            ), "We only support 16000 sampling rate"
            # batch: [bs, 1, t-samples]
            batch = torchaudio.functional.resample(
                batch, orig_freq=self.sampling_rate, new_freq=48000
            )

            for waveform in self.batch_to_list(batch):
                audio_dict = {}
                audio_dict = get_audio_features(
                    audio_dict,
                    waveform.squeeze(),
                    480000,
                    data_truncating="fusion",
                    data_filling="repeatpad",
                    audio_cfg=self.model_cfg["audio_cfg"],
                    dtype=self.get_dtype(),
                )
                audio_dict_list.append(audio_dict)
            # [bs, 768]
            embed = self.model.get_audio_embedding(audio_dict_list)

        embed = embed.unsqueeze(1)

        # [bs, 1, 768]
        return embed
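For reference, the encoder above expects mono 16 kHz waveforms shaped [batch, 1, samples] and returns one 768-dimensional embedding per clip. A hedged sketch of the intended call; the constructor line is left commented because it pulls in the bundled clap_modules:

# Sketch of the I/O contract of CLAPAudioEmbeddingClassifierFreev2.
import torch
# from core.models.encoders.clap import CLAPAudioEmbeddingClassifierFreev2

waveform = torch.randn(2, 1, 16000 * 10)   # batch of two 10-second 16 kHz clips
# encoder = CLAPAudioEmbeddingClassifierFreev2(amodel="HTSAT-large", joint_embed_shape=768)
# embed = encoder(waveform)                # expected shape: [2, 1, 768]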