Spaces: Running on Zero
cavargas10 committed
Commit 1f30907
Parent(s): 69b6a88
Upload 56 files

This view is limited to 50 files because the commit contains too many changes. See the raw diff for the full change set.
- .gitattributes +4 -0
- assets/teaser.jpg +0 -0
- custum_3d_diffusion/custum_modules/attention_processors.py +385 -0
- custum_3d_diffusion/custum_modules/unifield_processor.py +459 -0
- custum_3d_diffusion/custum_pipeline/unifield_pipeline_img2img.py +298 -0
- custum_3d_diffusion/custum_pipeline/unifield_pipeline_img2mvimg.py +296 -0
- custum_3d_diffusion/modules.py +14 -0
- custum_3d_diffusion/trainings/__init__.py +0 -0
- custum_3d_diffusion/trainings/base.py +208 -0
- custum_3d_diffusion/trainings/config_classes.py +35 -0
- custum_3d_diffusion/trainings/image2image_trainer.py +86 -0
- custum_3d_diffusion/trainings/image2mvimage_trainer.py +139 -0
- custum_3d_diffusion/trainings/utils.py +25 -0
- gradio_app/__init__.py +0 -0
- gradio_app/all_models.py +22 -0
- gradio_app/custom_models/image2mvimage.yaml +63 -0
- gradio_app/custom_models/image2normal.yaml +61 -0
- gradio_app/custom_models/mvimg_prediction.py +59 -0
- gradio_app/custom_models/normal_prediction.py +28 -0
- gradio_app/custom_models/utils.py +75 -0
- gradio_app/examples/Groot.png +0 -0
- gradio_app/examples/aaa.png +0 -0
- gradio_app/examples/abma.png +0 -0
- gradio_app/examples/akun.png +0 -0
- gradio_app/examples/anya.png +0 -0
- gradio_app/examples/bag.png +3 -0
- gradio_app/examples/ex1.png +3 -0
- gradio_app/examples/ex2.png +0 -0
- gradio_app/examples/ex3.jpg +0 -0
- gradio_app/examples/ex4.png +0 -0
- gradio_app/examples/generated_1715761545_frame0.png +0 -0
- gradio_app/examples/generated_1715762357_frame0.png +0 -0
- gradio_app/examples/generated_1715763329_frame0.png +0 -0
- gradio_app/examples/hatsune_miku.png +0 -0
- gradio_app/examples/princess-large.png +0 -0
- gradio_app/gradio_3dgen.py +85 -0
- gradio_app/gradio_3dgen_steps.py +87 -0
- gradio_app/gradio_local.py +76 -0
- gradio_app/utils.py +112 -0
- mesh_reconstruction/func.py +133 -0
- mesh_reconstruction/opt.py +190 -0
- mesh_reconstruction/recon.py +59 -0
- mesh_reconstruction/refine.py +80 -0
- mesh_reconstruction/remesh.py +361 -0
- mesh_reconstruction/render.py +159 -0
- package/nvdiffrast-0.3.1.torch-cp310-cp310-linux_x86_64.whl +3 -0
- package/onnxruntime_gpu-1.17.0-cp310-cp310-manylinux_2_28_x86_64.whl +3 -0
- scripts/all_typing.py +42 -0
- scripts/load_onnx.py +48 -0
- scripts/mesh_init.py +132 -0
.gitattributes
CHANGED
@@ -33,3 +33,7 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+gradio_app/examples/bag.png filter=lfs diff=lfs merge=lfs -text
+gradio_app/examples/ex1.png filter=lfs diff=lfs merge=lfs -text
+package/nvdiffrast-0.3.1.torch-cp310-cp310-linux_x86_64.whl filter=lfs diff=lfs merge=lfs -text
+package/onnxruntime_gpu-1.17.0-cp310-cp310-manylinux_2_28_x86_64.whl filter=lfs diff=lfs merge=lfs -text
assets/teaser.jpg
ADDED
custum_3d_diffusion/custum_modules/attention_processors.py
ADDED
@@ -0,0 +1,385 @@
from typing import Any, Dict, Optional
import torch
from diffusers.models.attention_processor import Attention

def construct_pix2pix_attention(hidden_states_dim, norm_type="none"):
    if norm_type == "layernorm":
        norm = torch.nn.LayerNorm(hidden_states_dim)
    else:
        norm = torch.nn.Identity()
    attention = Attention(
        query_dim=hidden_states_dim,
        heads=8,
        dim_head=hidden_states_dim // 8,
        bias=True,
    )
    # NOTE: xformers 0.22 does not support batchsize >= 4096
    attention.xformers_not_supported = True # hacky solution
    return norm, attention

class ExtraAttnProc(torch.nn.Module):
    def __init__(
        self,
        chained_proc,
        enabled=False,
        name=None,
        mode='extract',
        with_proj_in=False,
        proj_in_dim=768,
        target_dim=None,
        pixel_wise_crosspond=False,
        norm_type="none", # none or layernorm
        crosspond_effect_on="all", # all or first
        crosspond_chain_pos="parralle", # before or parralle or after
        simple_3d=False,
        views=4,
    ) -> None:
        super().__init__()
        self.enabled = enabled
        self.chained_proc = chained_proc
        self.name = name
        self.mode = mode
        self.with_proj_in=with_proj_in
        self.proj_in_dim = proj_in_dim
        self.target_dim = target_dim or proj_in_dim
        self.hidden_states_dim = self.target_dim
        self.pixel_wise_crosspond = pixel_wise_crosspond
        self.crosspond_effect_on = crosspond_effect_on
        self.crosspond_chain_pos = crosspond_chain_pos
        self.views = views
        self.simple_3d = simple_3d
        if self.with_proj_in and self.enabled:
            self.in_linear = torch.nn.Linear(self.proj_in_dim, self.target_dim, bias=False)
            if self.target_dim == self.proj_in_dim:
                self.in_linear.weight.data = torch.eye(proj_in_dim)
        else:
            self.in_linear = None
        if self.pixel_wise_crosspond and self.enabled:
            self.crosspond_norm, self.crosspond_attention = construct_pix2pix_attention(self.hidden_states_dim, norm_type=norm_type)

    def do_crosspond_attention(self, hidden_states: torch.FloatTensor, other_states: torch.FloatTensor):
        hidden_states = self.crosspond_norm(hidden_states)

        batch, L, D = hidden_states.shape
        assert hidden_states.shape == other_states.shape, f"got {hidden_states.shape} and {other_states.shape}"
        # to -> batch * L, 1, D
        hidden_states = hidden_states.reshape(batch * L, 1, D)
        other_states = other_states.reshape(batch * L, 1, D)
        hidden_states_catted = other_states
        hidden_states = self.crosspond_attention(
            hidden_states,
            encoder_hidden_states=hidden_states_catted,
        )
        return hidden_states.reshape(batch, L, D)

    def __call__(
        self, attn: Attention, hidden_states, encoder_hidden_states=None, attention_mask=None,
        ref_dict: dict = None, mode=None, **kwargs
    ) -> Any:
        if not self.enabled:
            return self.chained_proc(attn, hidden_states, encoder_hidden_states, attention_mask, **kwargs)
        if encoder_hidden_states is None:
            encoder_hidden_states = hidden_states
        assert ref_dict is not None
        if (mode or self.mode) == 'extract':
            ref_dict[self.name] = hidden_states
            hidden_states1 = self.chained_proc(attn, hidden_states, encoder_hidden_states, attention_mask, **kwargs)
            if self.pixel_wise_crosspond and self.crosspond_chain_pos == "after":
                ref_dict[self.name] = hidden_states1
            return hidden_states1
        elif (mode or self.mode) == 'inject':
            ref_state = ref_dict.pop(self.name)
            if self.with_proj_in:
                ref_state = self.in_linear(ref_state)

            B, L, D = ref_state.shape
            if hidden_states.shape[0] == B:
                modalities = 1
                views = 1
            else:
                modalities = hidden_states.shape[0] // B // self.views
                views = self.views
            if self.pixel_wise_crosspond:
                if self.crosspond_effect_on == "all":
                    ref_state = ref_state[:, None].expand(-1, modalities * views, -1, -1).reshape(-1, *ref_state.shape[-2:])

                    if self.crosspond_chain_pos == "before":
                        hidden_states = hidden_states + self.do_crosspond_attention(hidden_states, ref_state)

                    hidden_states1 = self.chained_proc(attn, hidden_states, encoder_hidden_states, attention_mask, **kwargs)

                    if self.crosspond_chain_pos == "parralle":
                        hidden_states1 = hidden_states1 + self.do_crosspond_attention(hidden_states, ref_state)

                    if self.crosspond_chain_pos == "after":
                        hidden_states1 = hidden_states1 + self.do_crosspond_attention(hidden_states1, ref_state)
                    return hidden_states1
                else:
                    assert self.crosspond_effect_on == "first"
                    # hidden_states [B * modalities * views, L, D]
                    # ref_state [B, L, D]
                    ref_state = ref_state[:, None].expand(-1, modalities, -1, -1).reshape(-1, ref_state.shape[-2], ref_state.shape[-1]) # [B * modalities, L, D]

                    def do_paritial_crosspond(hidden_states, ref_state):
                        first_view_hidden_states = hidden_states.view(-1, views, hidden_states.shape[1], hidden_states.shape[2])[:, 0] # [B * modalities, L, D]
                        hidden_states2 = self.do_crosspond_attention(first_view_hidden_states, ref_state) # [B * modalities, L, D]
                        hidden_states2_padded = torch.zeros_like(hidden_states).reshape(-1, views, hidden_states.shape[1], hidden_states.shape[2])
                        hidden_states2_padded[:, 0] = hidden_states2
                        hidden_states2_padded = hidden_states2_padded.reshape(-1, hidden_states.shape[1], hidden_states.shape[2])
                        return hidden_states2_padded

                    if self.crosspond_chain_pos == "before":
                        hidden_states = hidden_states + do_paritial_crosspond(hidden_states, ref_state)

                    hidden_states1 = self.chained_proc(attn, hidden_states, encoder_hidden_states, attention_mask, **kwargs) # [B * modalities * views, L, D]
                    if self.crosspond_chain_pos == "parralle":
                        hidden_states1 = hidden_states1 + do_paritial_crosspond(hidden_states, ref_state)
                    if self.crosspond_chain_pos == "after":
                        hidden_states1 = hidden_states1 + do_paritial_crosspond(hidden_states1, ref_state)
                    return hidden_states1
            elif self.simple_3d:
                B, L, C = encoder_hidden_states.shape
                mv = self.views
                encoder_hidden_states = encoder_hidden_states.reshape(B // mv, mv, L, C)
                ref_state = ref_state[:, None]
                encoder_hidden_states = torch.cat([encoder_hidden_states, ref_state], dim=1)
                encoder_hidden_states = encoder_hidden_states.reshape(B // mv, 1, (mv+1) * L, C)
                encoder_hidden_states = encoder_hidden_states.repeat(1, mv, 1, 1).reshape(-1, (mv+1) * L, C)
                return self.chained_proc(attn, hidden_states, encoder_hidden_states, attention_mask, **kwargs)
            else:
                ref_state = ref_state[:, None].expand(-1, modalities * views, -1, -1).reshape(-1, ref_state.shape[-2], ref_state.shape[-1])
                encoder_hidden_states = torch.cat([encoder_hidden_states, ref_state], dim=1)
                return self.chained_proc(attn, hidden_states, encoder_hidden_states, attention_mask, **kwargs)
        else:
            raise NotImplementedError("mode or self.mode is required to be 'extract' or 'inject'")

def add_extra_processor(model: torch.nn.Module, enable_filter=lambda x:True, **kwargs):
    return_dict = torch.nn.ModuleDict()
    proj_in_dim = kwargs.get('proj_in_dim', False)
    kwargs.pop('proj_in_dim', None)

    def recursive_add_processors(name: str, module: torch.nn.Module):
        for sub_name, child in module.named_children():
            if "ref_unet" not in (sub_name + name):
                recursive_add_processors(f"{name}.{sub_name}", child)

        if isinstance(module, Attention):
            new_processor = ExtraAttnProc(
                chained_proc=module.get_processor(),
                enabled=enable_filter(f"{name}.processor"),
                name=f"{name}.processor",
                proj_in_dim=proj_in_dim if proj_in_dim else module.cross_attention_dim,
                target_dim=module.cross_attention_dim,
                **kwargs
            )
            module.set_processor(new_processor)
            return_dict[f"{name}.processor".replace(".", "__")] = new_processor

    for name, module in model.named_children():
        recursive_add_processors(name, module)
    return return_dict

def switch_extra_processor(model, enable_filter=lambda x:True):
    def recursive_add_processors(name: str, module: torch.nn.Module):
        for sub_name, child in module.named_children():
            recursive_add_processors(f"{name}.{sub_name}", child)

        if isinstance(module, ExtraAttnProc):
            module.enabled = enable_filter(name)

    for name, module in model.named_children():
        recursive_add_processors(name, module)

class multiviewAttnProc(torch.nn.Module):
    def __init__(
        self,
        chained_proc,
        enabled=False,
        name=None,
        hidden_states_dim=None,
        chain_pos="parralle", # before or parralle or after
        num_modalities=1,
        views=4,
        base_img_size=64,
    ) -> None:
        super().__init__()
        self.enabled = enabled
        self.chained_proc = chained_proc
        self.name = name
        self.hidden_states_dim = hidden_states_dim
        self.num_modalities = num_modalities
        self.views = views
        self.base_img_size = base_img_size
        self.chain_pos = chain_pos
        self.diff_joint_attn = True

    def __call__(
        self,
        attn: Attention,
        hidden_states: torch.FloatTensor,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        **kwargs
    ) -> torch.Tensor:
        if not self.enabled:
            return self.chained_proc(attn, hidden_states, encoder_hidden_states, attention_mask, **kwargs)

        B, L, C = hidden_states.shape
        mv = self.views
        hidden_states = hidden_states.reshape(B // mv, mv, L, C).reshape(-1, mv * L, C)
        hidden_states = self.chained_proc(attn, hidden_states, encoder_hidden_states, attention_mask, **kwargs)
        return hidden_states.reshape(B // mv, mv, L, C).reshape(-1, L, C)

def add_multiview_processor(model: torch.nn.Module, enable_filter=lambda x:True, **kwargs):
    return_dict = torch.nn.ModuleDict()
    def recursive_add_processors(name: str, module: torch.nn.Module):
        for sub_name, child in module.named_children():
            if "ref_unet" not in (sub_name + name):
                recursive_add_processors(f"{name}.{sub_name}", child)

        if isinstance(module, Attention):
            new_processor = multiviewAttnProc(
                chained_proc=module.get_processor(),
                enabled=enable_filter(f"{name}.processor"),
                name=f"{name}.processor",
                hidden_states_dim=module.inner_dim,
                **kwargs
            )
            module.set_processor(new_processor)
            return_dict[f"{name}.processor".replace(".", "__")] = new_processor

    for name, module in model.named_children():
        recursive_add_processors(name, module)

    return return_dict

def switch_multiview_processor(model, enable_filter=lambda x:True):
    def recursive_add_processors(name: str, module: torch.nn.Module):
        for sub_name, child in module.named_children():
            recursive_add_processors(f"{name}.{sub_name}", child)

        if isinstance(module, Attention):
            processor = module.get_processor()
            if isinstance(processor, multiviewAttnProc):
                processor.enabled = enable_filter(f"{name}.processor")

    for name, module in model.named_children():
        recursive_add_processors(name, module)

class NNModuleWrapper(torch.nn.Module):
    def __init__(self, module):
        super().__init__()
        self.module = module

    def forward(self, *args, **kwargs):
        return self.module(*args, **kwargs)

    def __getattr__(self, name: str):
        try:
            return super().__getattr__(name)
        except AttributeError:
            return getattr(self.module, name)

class AttnProcessorSwitch(torch.nn.Module):
    def __init__(
        self,
        proc_dict: dict,
        enabled_proc="default",
        name=None,
        switch_name="default_switch",
    ):
        super().__init__()
        self.proc_dict = torch.nn.ModuleDict({k: (v if isinstance(v, torch.nn.Module) else NNModuleWrapper(v)) for k, v in proc_dict.items()})
        self.enabled_proc = enabled_proc
        self.name = name
        self.switch_name = switch_name
        self.choose_module(enabled_proc)

    def choose_module(self, enabled_proc):
        self.enabled_proc = enabled_proc
        assert enabled_proc in self.proc_dict.keys()

    def __call__(
        self,
        *args,
        **kwargs
    ) -> torch.FloatTensor:
        used_proc = self.proc_dict[self.enabled_proc]
        return used_proc(*args, **kwargs)

def add_switch(model: torch.nn.Module, module_filter=lambda x:True, switch_dict_fn=lambda x: {"default": x}, switch_name="default_switch", enabled_proc="default"):
    return_dict = torch.nn.ModuleDict()
    def recursive_add_processors(name: str, module: torch.nn.Module):
        for sub_name, child in module.named_children():
            if "ref_unet" not in (sub_name + name):
                recursive_add_processors(f"{name}.{sub_name}", child)

        if isinstance(module, Attention):
            processor = module.get_processor()
            if module_filter(processor):
                proc_dict = switch_dict_fn(processor)
                new_processor = AttnProcessorSwitch(
                    proc_dict=proc_dict,
                    enabled_proc=enabled_proc,
                    name=f"{name}.processor",
                    switch_name=switch_name,
                )
                module.set_processor(new_processor)
                return_dict[f"{name}.processor".replace(".", "__")] = new_processor

    for name, module in model.named_children():
        recursive_add_processors(name, module)

    return return_dict

def change_switch(model: torch.nn.Module, switch_name="default_switch", enabled_proc="default"):
    def recursive_change_processors(name: str, module: torch.nn.Module):
        for sub_name, child in module.named_children():
            recursive_change_processors(f"{name}.{sub_name}", child)

        if isinstance(module, Attention):
            processor = module.get_processor()
            if isinstance(processor, AttnProcessorSwitch) and processor.switch_name == switch_name:
                processor.choose_module(enabled_proc)

    for name, module in model.named_children():
        recursive_change_processors(name, module)

########## Hack: Attention fix #############
from diffusers.models.attention import Attention

def forward(
    self,
    hidden_states: torch.FloatTensor,
    encoder_hidden_states: Optional[torch.FloatTensor] = None,
    attention_mask: Optional[torch.FloatTensor] = None,
    **cross_attention_kwargs,
) -> torch.Tensor:
    r"""
    The forward method of the `Attention` class.

    Args:
        hidden_states (`torch.Tensor`):
            The hidden states of the query.
        encoder_hidden_states (`torch.Tensor`, *optional*):
            The hidden states of the encoder.
        attention_mask (`torch.Tensor`, *optional*):
            The attention mask to use. If `None`, no mask is applied.
        **cross_attention_kwargs:
            Additional keyword arguments to pass along to the cross attention.

    Returns:
        `torch.Tensor`: The output of the attention layer.
    """
    # The `Attention` class can call different attention processors / attention functions
    # here we simply pass along all tensors to the selected processor class
    # For standard processors that are defined here, `**cross_attention_kwargs` is empty
    return self.processor(
        self,
        hidden_states,
        encoder_hidden_states=encoder_hidden_states,
        attention_mask=attention_mask,
        **cross_attention_kwargs,
    )

Attention.forward = forward
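Editor's note: the processors above hook into diffusers' set_processor/get_processor mechanism: mode='extract' stashes reference self-attention states in a shared ref_dict, and mode='inject' consumes them in a second UNet. The sketch below (not part of this commit) shows how that wiring could look; the checkpoint name, the attn1 filter, and the commented forward calls are illustrative assumptions, not the Space's actual setup.

import torch
from diffusers import UNet2DConditionModel
from custum_3d_diffusion.custum_modules.attention_processors import (
    add_extra_processor, switch_extra_processor,
)

# Two UNets: one encodes the conditioning image, one denoises the target views.
# (Checkpoint name is an assumption for illustration.)
ref_unet = UNet2DConditionModel.from_pretrained("runwayml/stable-diffusion-v1-5", subfolder="unet")
gen_unet = UNet2DConditionModel.from_pretrained("runwayml/stable-diffusion-v1-5", subfolder="unet")

# Self-attention (attn1) states of the reference UNet are written into ref_dict ...
add_extra_processor(ref_unet, enable_filter=lambda n: n.endswith("attn1.processor"), mode="extract")
# ... and read back as extra key/value context inside the generator UNet.
add_extra_processor(gen_unet, enable_filter=lambda n: n.endswith("attn1.processor"), mode="inject")

ref_dict = {}
# During a denoising step (real tensors omitted here), the dict is threaded
# through cross_attention_kwargs, mirroring unet_forward_hook in unifield_processor.py:
#   ref_unet(cond_latents, t, cond_emb, cross_attention_kwargs=dict(ref_dict=ref_dict))
#   gen_unet(latents, t, emb, cross_attention_kwargs=dict(ref_dict=ref_dict, mode="inject"))

# Processors can later be toggled without rebuilding the UNet:
switch_extra_processor(gen_unet, enable_filter=lambda name: False)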
custum_3d_diffusion/custum_modules/unifield_processor.py
ADDED
@@ -0,0 +1,459 @@
from types import FunctionType
from typing import Any, Dict, List
from diffusers import UNet2DConditionModel
import torch
from diffusers.models.unets.unet_2d_condition import UNet2DConditionModel, ImageProjection
from diffusers.models.attention_processor import Attention, AttnProcessor, AttnProcessor2_0, XFormersAttnProcessor
from dataclasses import dataclass, field
from diffusers.loaders import IPAdapterMixin
from custum_3d_diffusion.custum_modules.attention_processors import add_extra_processor, switch_extra_processor, add_multiview_processor, switch_multiview_processor, add_switch, change_switch

@dataclass
class AttnConfig:
    """
    * CrossAttention: Attention module (inherits knowledge), LoRA module (achieves fine-tuning), IPAdapter module (achieves conceptual control).
    * SelfAttention: Attention module (inherits knowledge), LoRA module (achieves fine-tuning), Reference Attention module (achieves pixel-level control).
    * Multiview Attention module: Multiview Attention module (achieves multi-view consistency).
    * Cross Modality Attention module: Cross Modality Attention module (achieves multi-modality consistency).

    For setups:
        train_xxx_lr is implemented in the U-Net architecture.
        enable_xxx_lora is implemented in the U-Net architecture.
        enable_xxx_ip is implemented in the processor and U-Net architecture.
        enable_xxx_ref_proj_in is implemented in the processor.
    """
    latent_size: int = 64

    train_lr: float = 0
    # for cross attention
    # 0 learning rate for not training
    train_cross_attn_lr: float = 0
    train_cross_attn_lora_lr: float = 0
    train_cross_attn_ip_lr: float = 0 # 0 for not trained
    init_cross_attn_lora: bool = False
    enable_cross_attn_lora: bool = False
    init_cross_attn_ip: bool = False
    enable_cross_attn_ip: bool = False
    cross_attn_lora_rank: int = 64 # 0 for not enabled
    cross_attn_lora_only_kv: bool = False
    ipadapter_pretrained_name: str = "h94/IP-Adapter"
    ipadapter_subfolder_name: str = "models"
    ipadapter_weight_name: str = "ip-adapter-plus_sd15.safetensors"
    ipadapter_effect_on: str = "all" # all, first

    # for self attention
    train_self_attn_lr: float = 0
    train_self_attn_lora_lr: float = 0
    init_self_attn_lora: bool = False
    enable_self_attn_lora: bool = False
    self_attn_lora_rank: int = 64
    self_attn_lora_only_kv: bool = False

    train_self_attn_ref_lr: float = 0
    train_ref_unet_lr: float = 0
    init_self_attn_ref: bool = False
    enable_self_attn_ref: bool = False
    self_attn_ref_other_model_name: str = ""
    self_attn_ref_position: str = "attn1"
    self_attn_ref_pixel_wise_crosspond: bool = False # enable pixel_wise_crosspond in refattn
    self_attn_ref_chain_pos: str = "parralle" # before or parralle or after
    self_attn_ref_effect_on: str = "all" # all or first, for _crosspond attn
    self_attn_ref_zero_init: bool = True
    use_simple3d_attn: bool = False

    # for multiview attention
    init_multiview_attn: bool = False
    enable_multiview_attn: bool = False
    multiview_attn_position: str = "attn1"
    multiview_chain_pose: str = "parralle" # before or parralle or after
    num_modalities: int = 1
    use_mv_joint_attn: bool = False

    # for unet
    init_unet_path: str = "runwayml/stable-diffusion-v1-5"
    init_num_cls_label: int = 0 # for initialize
    cls_labels: List[int] = field(default_factory=lambda: [])
    cls_label_type: str = "embedding"
    cat_condition: bool = False # cat condition to input

class Configurable:
    attn_config: AttnConfig

    def set_config(self, attn_config: AttnConfig):
        raise NotImplementedError()

    def update_config(self, attn_config: AttnConfig):
        self.attn_config = attn_config

    def do_set_config(self, attn_config: AttnConfig):
        self.set_config(attn_config)
        for name, module in self.named_modules():
            if isinstance(module, Configurable):
                if hasattr(module, "do_set_config"):
                    module.do_set_config(attn_config)
                else:
                    print(f"Warning: {name} has no attribute do_set_config, but is an instance of Configurable")
                    module.attn_config = attn_config

    def do_update_config(self, attn_config: AttnConfig):
        self.update_config(attn_config)
        for name, module in self.named_modules():
            if isinstance(module, Configurable):
                if hasattr(module, "do_update_config"):
                    module.do_update_config(attn_config)
                else:
                    print(f"Warning: {name} has no attribute do_update_config, but is an instance of Configurable")
                    module.attn_config = attn_config

from diffusers import ModelMixin # Must import ModelMixin for CompiledUNet
class UnifieldWrappedUNet(UNet2DConditionModel):
    forward_hook: FunctionType

    def forward(self, *args, **kwargs):
        if hasattr(self, 'forward_hook'):
            return self.forward_hook(super().forward, *args, **kwargs)
        return super().forward(*args, **kwargs)


class ConfigurableUNet2DConditionModel(Configurable, IPAdapterMixin):
    unet: UNet2DConditionModel

    cls_embedding_param_dict = {}
    cross_attn_lora_param_dict = {}
    self_attn_lora_param_dict = {}
    cross_attn_param_dict = {}
    self_attn_param_dict = {}
    ipadapter_param_dict = {}
    ref_attn_param_dict = {}
    ref_unet_param_dict = {}
    multiview_attn_param_dict = {}
    other_param_dict = {}

    rev_param_name_mapping = {}

    class_labels = []
    def set_class_labels(self, class_labels: torch.Tensor):
        if self.attn_config.init_num_cls_label != 0:
            self.class_labels = class_labels.to(self.unet.device).long()

    def __init__(self, init_config: AttnConfig, weight_dtype) -> None:
        super().__init__()
        self.weight_dtype = weight_dtype
        self.set_config(init_config)

    def enable_xformers_memory_efficient_attention(self):
        self.unet.enable_xformers_memory_efficient_attention
        def recursive_add_processors(name: str, module: torch.nn.Module):
            for sub_name, child in module.named_children():
                recursive_add_processors(f"{name}.{sub_name}", child)

            if isinstance(module, Attention):
                if hasattr(module, 'xformers_not_supported'):
                    return
                old_processor = module.get_processor()
                if isinstance(old_processor, (AttnProcessor, AttnProcessor2_0)):
                    module.set_use_memory_efficient_attention_xformers(True)

        for name, module in self.unet.named_children():
            recursive_add_processors(name, module)

    def __getattr__(self, name: str) -> Any:
        try:
            return super().__getattr__(name)
        except AttributeError:
            return getattr(self.unet, name)

    # --- for IPAdapterMixin

    def register_modules(self, **kwargs):
        for name, module in kwargs.items():
            # set models
            setattr(self, name, module)

    def register_to_config(self, **kwargs):
        pass

    def unload_ip_adapter(self):
        raise NotImplementedError()

    # --- for Configurable

    def get_refunet(self):
        if self.attn_config.self_attn_ref_other_model_name == "self":
            return self.unet
        else:
            return self.unet.ref_unet

    def set_config(self, attn_config: AttnConfig):
        self.attn_config = attn_config

        unet_type = UnifieldWrappedUNet
        # class_embed_type = "projection" for 'camera'
        # class_embed_type = None for 'embedding'
        unet_kwargs = {}
        if attn_config.init_num_cls_label > 0:
            if attn_config.cls_label_type == "embedding":
                unet_kwargs = {
                    "num_class_embeds": attn_config.init_num_cls_label,
                    "device_map": None,
                    "low_cpu_mem_usage": False,
                    "class_embed_type": None,
                }
            else:
                raise ValueError(f"cls_label_type {attn_config.cls_label_type} is not supported")

        self.unet: UnifieldWrappedUNet = unet_type.from_pretrained(
            attn_config.init_unet_path, subfolder="unet", torch_dtype=self.weight_dtype,
            **unet_kwargs
        )
        assert isinstance(self.unet, UnifieldWrappedUNet)
        self.unet.forward_hook = self.unet_forward_hook

        if self.attn_config.cat_condition:
            # double in_channels
            if self.unet.config.in_channels != 8:
                self.unet.register_to_config(in_channels=self.unet.config.in_channels * 2)
                # repeate unet.conv_in weight twice
                doubled_conv_in = torch.nn.Conv2d(self.unet.conv_in.in_channels * 2, self.unet.conv_in.out_channels, self.unet.conv_in.kernel_size, self.unet.conv_in.stride, self.unet.conv_in.padding)
                doubled_conv_in.weight.data = torch.cat([self.unet.conv_in.weight.data, torch.zeros_like(self.unet.conv_in.weight.data)], dim=1)
                doubled_conv_in.bias.data = self.unet.conv_in.bias.data
                self.unet.conv_in = doubled_conv_in

        used_param_ids = set()

        if attn_config.init_cross_attn_lora:
            # setup lora
            from peft import LoraConfig
            from peft.utils import get_peft_model_state_dict
            if attn_config.cross_attn_lora_only_kv:
                target_modules=["attn2.to_k", "attn2.to_v"]
            else:
                target_modules=["attn2.to_k", "attn2.to_q", "attn2.to_v", "attn2.to_out.0"]
            lora_config: LoraConfig = LoraConfig(
                r=attn_config.cross_attn_lora_rank,
                lora_alpha=attn_config.cross_attn_lora_rank,
                init_lora_weights="gaussian",
                target_modules=target_modules,
            )
            adapter_name="cross_attn_lora"
            self.unet.add_adapter(lora_config, adapter_name=adapter_name)
            # update cross_attn_lora_param_dict
            self.cross_attn_lora_param_dict = {id(param): param for name, param in self.unet.named_parameters() if adapter_name in name and id(param) not in used_param_ids}
            used_param_ids.update(self.cross_attn_lora_param_dict.keys())

        if attn_config.init_self_attn_lora:
            # setup lora
            from peft import LoraConfig
            if attn_config.self_attn_lora_only_kv:
                target_modules=["attn1.to_k", "attn1.to_v"]
            else:
                target_modules=["attn1.to_k", "attn1.to_q", "attn1.to_v", "attn1.to_out.0"]
            lora_config: LoraConfig = LoraConfig(
                r=attn_config.self_attn_lora_rank,
                lora_alpha=attn_config.self_attn_lora_rank,
                init_lora_weights="gaussian",
                target_modules=target_modules,
            )
            adapter_name="self_attn_lora"
            self.unet.add_adapter(lora_config, adapter_name=adapter_name)
            # update cross_self_lora_param_dict
            self.self_attn_lora_param_dict = {id(param): param for name, param in self.unet.named_parameters() if adapter_name in name and id(param) not in used_param_ids}
            used_param_ids.update(self.self_attn_lora_param_dict.keys())

        if attn_config.init_num_cls_label != 0:
            self.cls_embedding_param_dict = {id(param): param for param in self.unet.class_embedding.parameters()}
            used_param_ids.update(self.cls_embedding_param_dict.keys())
            self.set_class_labels(torch.tensor(attn_config.cls_labels).long())

        if attn_config.init_cross_attn_ip:
            self.image_encoder = None
            # setup ipadapter
            self.load_ip_adapter(
                attn_config.ipadapter_pretrained_name,
                subfolder=attn_config.ipadapter_subfolder_name,
                weight_name=attn_config.ipadapter_weight_name
            )
            # warp ip_adapter_attn_proc with switch
            from diffusers.models.attention_processor import IPAdapterAttnProcessor, IPAdapterAttnProcessor2_0
            add_switch(self.unet, module_filter=lambda x: isinstance(x, (IPAdapterAttnProcessor, IPAdapterAttnProcessor2_0)), switch_dict_fn=lambda x: {"ipadapter": x, "default": XFormersAttnProcessor()}, switch_name="ipadapter_switch", enabled_proc="ipadapter")
            # update ipadapter_param_dict
            # weights are in attention processors and unet.encoder_hid_proj
            self.ipadapter_param_dict = {id(param): param for param in self.unet.encoder_hid_proj.parameters() if id(param) not in used_param_ids}
            used_param_ids.update(self.ipadapter_param_dict.keys())
            print("DEBUG: ipadapter_param_dict len in encoder_hid_proj", len(self.ipadapter_param_dict))
            for name, processor in self.unet.attn_processors.items():
                if hasattr(processor, "to_k_ip"):
                    self.ipadapter_param_dict.update({id(param): param for param in processor.parameters()})
            print(f"DEBUG: ipadapter_param_dict len in all", len(self.ipadapter_param_dict))

        ref_unet = None
        if attn_config.init_self_attn_ref:
            # setup reference attention processor
            if attn_config.self_attn_ref_other_model_name == "self":
                raise NotImplementedError("self reference is not fully implemented")
            else:
                ref_unet: UNet2DConditionModel = UNet2DConditionModel.from_pretrained(
                    attn_config.self_attn_ref_other_model_name, subfolder="unet", torch_dtype=self.unet.dtype
                )
                ref_unet.to(self.unet.device)
                if self.attn_config.train_ref_unet_lr == 0:
                    ref_unet.eval()
                    ref_unet.requires_grad_(False)
                else:
                    ref_unet.train()

            add_extra_processor(
                model=ref_unet,
                enable_filter=lambda name: name.endswith(f"{attn_config.self_attn_ref_position}.processor"),
                mode='extract',
                with_proj_in=False,
                pixel_wise_crosspond=False,
            )
            # NOTE: Here require cross_attention_dim in two unet's self attention should be the same
            processor_dict = add_extra_processor(
                model=self.unet,
                enable_filter=lambda name: name.endswith(f"{attn_config.self_attn_ref_position}.processor"),
                mode='inject',
                with_proj_in=False,
                pixel_wise_crosspond=attn_config.self_attn_ref_pixel_wise_crosspond,
                crosspond_effect_on=attn_config.self_attn_ref_effect_on,
                crosspond_chain_pos=attn_config.self_attn_ref_chain_pos,
                simple_3d=attn_config.use_simple3d_attn,
            )
            self.ref_unet_param_dict = {id(param): param for name, param in ref_unet.named_parameters() if id(param) not in used_param_ids and (attn_config.self_attn_ref_position in name)}
            if attn_config.self_attn_ref_chain_pos != "after":
                # pop untrainable paramters
                for name, param in ref_unet.named_parameters():
                    if id(param) in self.ref_unet_param_dict and ('up_blocks.3.attentions.2.transformer_blocks.0.' in name):
                        self.ref_unet_param_dict.pop(id(param))
            used_param_ids.update(self.ref_unet_param_dict.keys())
            # update ref_attn_param_dict
            self.ref_attn_param_dict = {id(param): param for name, param in processor_dict.named_parameters() if id(param) not in used_param_ids}
            used_param_ids.update(self.ref_attn_param_dict.keys())

        if attn_config.init_multiview_attn:
            processor_dict = add_multiview_processor(
                model = self.unet,
                enable_filter = lambda name: name.endswith(f"{attn_config.multiview_attn_position}.processor"),
                num_modalities = attn_config.num_modalities,
                base_img_size = attn_config.latent_size,
                chain_pos = attn_config.multiview_chain_pose,
            )
            # update multiview_attn_param_dict
            self.multiview_attn_param_dict = {id(param): param for name, param in processor_dict.named_parameters() if id(param) not in used_param_ids}
            used_param_ids.update(self.multiview_attn_param_dict.keys())

        # initialize cross_attn_param_dict parameters
        self.cross_attn_param_dict = {id(param): param for name, param in self.unet.named_parameters() if "attn2" in name and id(param) not in used_param_ids}
        used_param_ids.update(self.cross_attn_param_dict.keys())

        # initialize self_attn_param_dict parameters
        self.self_attn_param_dict = {id(param): param for name, param in self.unet.named_parameters() if "attn1" in name and id(param) not in used_param_ids}
        used_param_ids.update(self.self_attn_param_dict.keys())

        # initialize other_param_dict parameters
        self.other_param_dict = {id(param): param for name, param in self.unet.named_parameters() if id(param) not in used_param_ids}

        if ref_unet is not None:
            self.unet.ref_unet = ref_unet

        self.rev_param_name_mapping = {id(param): name for name, param in self.unet.named_parameters()}

        self.update_config(attn_config, force_update=True)
        return self.unet

    _attn_keys_to_update = ["enable_cross_attn_lora", "enable_cross_attn_ip", "enable_self_attn_lora", "enable_self_attn_ref", "enable_multiview_attn", "cls_labels"]

    def update_config(self, attn_config: AttnConfig, force_update=False):
        assert isinstance(self.unet, UNet2DConditionModel), "unet must be an instance of UNet2DConditionModel"

        need_to_update = False
        # update cls_labels
        for key in self._attn_keys_to_update:
            if getattr(self.attn_config, key) != getattr(attn_config, key):
                need_to_update = True
                break
        if not force_update and not need_to_update:
            return

        self.set_class_labels(torch.tensor(attn_config.cls_labels).long())

        # setup loras
        if self.attn_config.init_cross_attn_lora or self.attn_config.init_self_attn_lora:
            if attn_config.enable_cross_attn_lora or attn_config.enable_self_attn_lora:
                cross_attn_lora_weight = 1. if attn_config.enable_cross_attn_lora > 0 else 0
                self_attn_lora_weight = 1. if attn_config.enable_self_attn_lora > 0 else 0
                self.unet.set_adapters(["cross_attn_lora", "self_attn_lora"], weights=[cross_attn_lora_weight, self_attn_lora_weight])
            else:
                self.unet.disable_adapters()

        # setup ipadapter
        if self.attn_config.init_cross_attn_ip:
            if attn_config.enable_cross_attn_ip:
                change_switch(self.unet, "ipadapter_switch", "ipadapter")
            else:
                change_switch(self.unet, "ipadapter_switch", "default")

        # setup reference attention processor
        if self.attn_config.init_self_attn_ref:
            if attn_config.enable_self_attn_ref:
                switch_extra_processor(self.unet, enable_filter=lambda name: name.endswith(f"{attn_config.self_attn_ref_position}.processor"))
            else:
                switch_extra_processor(self.unet, enable_filter=lambda name: False)

        # setup multiview attention processor
        if self.attn_config.init_multiview_attn:
            if attn_config.enable_multiview_attn:
                switch_multiview_processor(self.unet, enable_filter=lambda name: name.endswith(f"{attn_config.multiview_attn_position}.processor"))
            else:
                switch_multiview_processor(self.unet, enable_filter=lambda name: False)

        # update cls_labels
        for key in self._attn_keys_to_update:
            setattr(self.attn_config, key, getattr(attn_config, key))

    def unet_forward_hook(self, raw_forward, sample: torch.FloatTensor, timestep: torch.Tensor, encoder_hidden_states: torch.Tensor, *args, cross_attention_kwargs=None, condition_latents=None, class_labels=None, noisy_condition_input=False, cond_pixels_clip=None, **kwargs):
        if class_labels is None and len(self.class_labels) > 0:
            class_labels = self.class_labels.repeat(sample.shape[0] // self.class_labels.shape[0]).to(sample.device)
        elif self.attn_config.init_num_cls_label != 0:
            assert class_labels is not None, "class_labels should be passed if self.class_labels is empty and self.attn_config.init_num_cls_label is not 0"
        if class_labels is not None:
            if self.attn_config.cls_label_type == "embedding":
                pass
            else:
                raise ValueError(f"cls_label_type {self.attn_config.cls_label_type} is not supported")
        if self.attn_config.init_self_attn_ref and self.attn_config.enable_self_attn_ref:
            # NOTE: extra step, extract condition
            ref_dict = {}
            ref_unet = self.get_refunet().to(sample.device)
            assert condition_latents is not None
            if self.attn_config.self_attn_ref_other_model_name == "self":
                raise NotImplementedError()
            else:
                with torch.no_grad():
                    cond_encoder_hidden_states = encoder_hidden_states.reshape(condition_latents.shape[0], -1, *encoder_hidden_states.shape[1:])[:, 0]
                    if timestep.dim() == 0:
                        cond_timestep = timestep
                    else:
                        cond_timestep = timestep.reshape(condition_latents.shape[0], -1)[:, 0]
                    ref_unet(condition_latents, cond_timestep, cond_encoder_hidden_states, cross_attention_kwargs=dict(ref_dict=ref_dict))
            # NOTE: extra step, inject condition
            # Predict the noise residual and compute loss
            if cross_attention_kwargs is None:
                cross_attention_kwargs = {}
            cross_attention_kwargs.update(ref_dict=ref_dict, mode='inject')
        elif condition_latents is not None:
            if not hasattr(self, 'condition_latents_raised'):
                print("Warning! condition_latents is not None, but self_attn_ref is not enabled! This warning will only be raised once.")
                self.condition_latents_raised = True

        if self.attn_config.init_cross_attn_ip:
            raise NotImplementedError()

        if self.attn_config.cat_condition:
            assert condition_latents is not None
            B = condition_latents.shape[0]
            cat_latents = condition_latents.reshape(B, 1, *condition_latents.shape[1:]).repeat(1, sample.shape[0] // B, 1, 1, 1).reshape(*sample.shape)
            sample = torch.cat([sample, cat_latents], dim=1)

        return raw_forward(sample, timestep, encoder_hidden_states, *args, cross_attention_kwargs=cross_attention_kwargs, class_labels=class_labels, **kwargs)
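Editor's note: a minimal construction sketch (not part of this commit) for the AttnConfig / ConfigurableUNet2DConditionModel pair defined above. Every field value, checkpoint name, and label list below is an illustrative assumption rather than the Space's actual configuration.

import torch
from dataclasses import replace
from custum_3d_diffusion.custum_modules.unifield_processor import (
    AttnConfig, ConfigurableUNet2DConditionModel,
)

# Illustrative config: multiview self-attention plus a frozen reference UNet.
config = AttnConfig(
    init_unet_path="lambdalabs/sd-image-variations-diffusers",          # assumed checkpoint
    latent_size=64,
    init_multiview_attn=True, enable_multiview_attn=True,
    init_self_attn_ref=True, enable_self_attn_ref=True,
    self_attn_ref_other_model_name="lambdalabs/sd-image-variations-diffusers",  # assumed
    init_num_cls_label=8, cls_labels=[0, 1, 2, 3],                      # illustrative class labels
)

wrapper = ConfigurableUNet2DConditionModel(config, weight_dtype=torch.float16)
unet = wrapper.unet  # UnifieldWrappedUNet with unet_forward_hook installed

# Flags listed in _attn_keys_to_update can be flipped later without reloading weights;
# update_config compares against the stored config, so pass a modified copy:
wrapper.update_config(replace(config, enable_multiview_attn=False))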
custum_3d_diffusion/custum_pipeline/unifield_pipeline_img2img.py
ADDED
@@ -0,0 +1,298 @@
# Copyright 2023 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# modified by Wuvin


from typing import Any, Callable, Dict, List, Optional, Tuple, Union

import numpy as np
import torch

from diffusers import AutoencoderKL, UNet2DConditionModel, StableDiffusionImageVariationPipeline
from diffusers.schedulers import KarrasDiffusionSchedulers
from diffusers.pipelines.stable_diffusion import StableDiffusionSafetyChecker, StableDiffusionPipelineOutput
from diffusers.models.unets.unet_2d_condition import UNet2DConditionModel
from PIL import Image
from transformers import CLIPImageProcessor, CLIPVisionModelWithProjection



class StableDiffusionImageCustomPipeline(
    StableDiffusionImageVariationPipeline
):
    def __init__(
        self,
        vae: AutoencoderKL,
        image_encoder: CLIPVisionModelWithProjection,
        unet: UNet2DConditionModel,
        scheduler: KarrasDiffusionSchedulers,
        safety_checker: StableDiffusionSafetyChecker,
        feature_extractor: CLIPImageProcessor,
        requires_safety_checker: bool = True,
        latents_offset=None,
        noisy_cond_latents=False,
    ):
        super().__init__(
            vae=vae,
            image_encoder=image_encoder,
            unet=unet,
            scheduler=scheduler,
            safety_checker=safety_checker,
            feature_extractor=feature_extractor,
            requires_safety_checker=requires_safety_checker
        )
        latents_offset = tuple(latents_offset) if latents_offset is not None else None
        self.latents_offset = latents_offset
        if latents_offset is not None:
            self.register_to_config(latents_offset=latents_offset)
        self.noisy_cond_latents = noisy_cond_latents
        self.register_to_config(noisy_cond_latents=noisy_cond_latents)

    def encode_latents(self, image, device, dtype, height, width):
        # support batchsize > 1
        if isinstance(image, Image.Image):
            image = [image]
        image = [img.convert("RGB") for img in image]
        images = self.image_processor.preprocess(image, height=height, width=width).to(device, dtype=dtype)
        latents = self.vae.encode(images).latent_dist.mode() * self.vae.config.scaling_factor
        if self.latents_offset is not None:
            return latents - torch.tensor(self.latents_offset).to(latents.device)[None, :, None, None]
        else:
            return latents

    def _encode_image(self, image, device, num_images_per_prompt, do_classifier_free_guidance):
        dtype = next(self.image_encoder.parameters()).dtype

        if not isinstance(image, torch.Tensor):
            image = self.feature_extractor(images=image, return_tensors="pt").pixel_values

        image = image.to(device=device, dtype=dtype)
        image_embeddings = self.image_encoder(image).image_embeds
        image_embeddings = image_embeddings.unsqueeze(1)

        # duplicate image embeddings for each generation per prompt, using mps friendly method
        bs_embed, seq_len, _ = image_embeddings.shape
        image_embeddings = image_embeddings.repeat(1, num_images_per_prompt, 1)
        image_embeddings = image_embeddings.view(bs_embed * num_images_per_prompt, seq_len, -1)

        if do_classifier_free_guidance:
            # NOTE: the same as original code
            negative_prompt_embeds = torch.zeros_like(image_embeddings)
            # For classifier free guidance, we need to do two forward passes.
            # Here we concatenate the unconditional and text embeddings into a single batch
            # to avoid doing two forward passes
            image_embeddings = torch.cat([negative_prompt_embeds, image_embeddings])

        return image_embeddings

    @torch.no_grad()
    def __call__(
        self,
        image: Union[Image.Image, List[Image.Image], torch.FloatTensor],
        height: Optional[int] = 1024,
        width: Optional[int] = 1024,
        height_cond: Optional[int] = 512,
        width_cond: Optional[int] = 512,
        num_inference_steps: int = 50,
        guidance_scale: float = 7.5,
        num_images_per_prompt: Optional[int] = 1,
        eta: float = 0.0,
        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
        latents: Optional[torch.FloatTensor] = None,
        output_type: Optional[str] = "pil",
        return_dict: bool = True,
        callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
        callback_steps: int = 1,
        upper_left_feature: bool = False,
    ):
        r"""
        The call function to the pipeline for generation.

        Args:
            image (`Image.Image` or `List[Image.Image]` or `torch.FloatTensor`):
                Image or images to guide image generation. If you provide a tensor, it needs to be compatible with
                [`CLIPImageProcessor`](https://huggingface.co/lambdalabs/sd-image-variations-diffusers/blob/main/feature_extractor/preprocessor_config.json).
            height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
                The height in pixels of the generated image.
            width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
                The width in pixels of the generated image.
            num_inference_steps (`int`, *optional*, defaults to 50):
                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
                expense of slower inference. This parameter is modulated by `strength`.
            guidance_scale (`float`, *optional*, defaults to 7.5):
                A higher guidance scale value encourages the model to generate images closely linked to the text
                `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
            num_images_per_prompt (`int`, *optional*, defaults to 1):
                The number of images to generate per prompt.
            eta (`float`, *optional*, defaults to 0.0):
                Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies
                to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers.
            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
                generation deterministic.
            latents (`torch.FloatTensor`, *optional*):
                Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
                tensor is generated by sampling using the supplied random `generator`.
            output_type (`str`, *optional*, defaults to `"pil"`):
                The output format of the generated image. Choose between `PIL.Image` or `np.array`.
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
                plain tuple.
            callback (`Callable`, *optional*):
                A function that calls every `callback_steps` steps during inference. The function is called with the
                following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
            callback_steps (`int`, *optional*, defaults to 1):
                The frequency at which the `callback` function is called. If not specified, the callback is called at
                every step.

        Returns:
            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
                If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] is returned,
                otherwise a `tuple` is returned where the first element is a list with the generated images and the
                second element is a list of `bool`s indicating whether the corresponding generated image contains
                "not-safe-for-work" (nsfw) content.

        Examples:

        ```py
        from diffusers import StableDiffusionImageVariationPipeline
        from PIL import Image
        from io import BytesIO
        import requests

        pipe = StableDiffusionImageVariationPipeline.from_pretrained(
            "lambdalabs/sd-image-variations-diffusers", revision="v2.0"
        )
        pipe = pipe.to("cuda")

        url = "https://lh3.googleusercontent.com/y-iFOHfLTwkuQSUegpwDdgKmOjRSTvPxat63dQLB25xkTs4lhIbRUFeNBWZzYf370g=s1200"

        response = requests.get(url)
        image = Image.open(BytesIO(response.content)).convert("RGB")

        out = pipe(image, num_images_per_prompt=3, guidance_scale=15)
        out["images"][0].save("result.jpg")
        ```
        """
        # 0. Default height and width to unet
        height = height or self.unet.config.sample_size * self.vae_scale_factor
        width = width or self.unet.config.sample_size * self.vae_scale_factor

        # 1. Check inputs. Raise error if not correct
        self.check_inputs(image, height, width, callback_steps)

        # 2. Define call parameters
        if isinstance(image, Image.Image):
            batch_size = 1
        elif isinstance(image, list):
            batch_size = len(image)
        else:
            batch_size = image.shape[0]
        device = self._execution_device
        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
        # corresponds to doing no classifier free guidance.
        do_classifier_free_guidance = guidance_scale > 1.0

        # 3. Encode input image
        if isinstance(image, Image.Image) and upper_left_feature:
            # only use the first one of four images
            emb_image = image.crop((0, 0, image.size[0] // 2, image.size[1] // 2))
        else:
            emb_image = image

        image_embeddings = self._encode_image(emb_image, device, num_images_per_prompt, do_classifier_free_guidance)
        cond_latents = self.encode_latents(image, image_embeddings.device, image_embeddings.dtype, height_cond, width_cond)

        # 4. Prepare timesteps
        self.scheduler.set_timesteps(num_inference_steps, device=device)
        timesteps = self.scheduler.timesteps

        # 5. Prepare latent variables
        num_channels_latents = self.unet.config.out_channels
+
latents = self.prepare_latents(
|
226 |
+
batch_size * num_images_per_prompt,
|
227 |
+
num_channels_latents,
|
228 |
+
height,
|
229 |
+
width,
|
230 |
+
image_embeddings.dtype,
|
231 |
+
device,
|
232 |
+
generator,
|
233 |
+
latents,
|
234 |
+
)
|
235 |
+
|
236 |
+
# 6. Prepare extra step kwargs.
|
237 |
+
extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
|
238 |
+
|
239 |
+
# 7. Denoising loop
|
240 |
+
num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
|
241 |
+
with self.progress_bar(total=num_inference_steps) as progress_bar:
|
242 |
+
for i, t in enumerate(timesteps):
|
243 |
+
if self.noisy_cond_latents:
|
244 |
+
raise ValueError("Noisy condition latents is not recommended.")
|
245 |
+
else:
|
246 |
+
noisy_cond_latents = cond_latents
|
247 |
+
|
248 |
+
noisy_cond_latents = torch.cat([torch.zeros_like(noisy_cond_latents), noisy_cond_latents]) if do_classifier_free_guidance else noisy_cond_latents
|
249 |
+
# expand the latents if we are doing classifier free guidance
|
250 |
+
latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
|
251 |
+
latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
|
252 |
+
|
253 |
+
# predict the noise residual
|
254 |
+
noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=image_embeddings, condition_latents=noisy_cond_latents).sample
|
255 |
+
|
256 |
+
# perform guidance
|
257 |
+
if do_classifier_free_guidance:
|
258 |
+
noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
|
259 |
+
noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
|
260 |
+
|
261 |
+
# compute the previous noisy sample x_t -> x_t-1
|
262 |
+
latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
|
263 |
+
|
264 |
+
# call the callback, if provided
|
265 |
+
if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
|
266 |
+
progress_bar.update()
|
267 |
+
if callback is not None and i % callback_steps == 0:
|
268 |
+
step_idx = i // getattr(self.scheduler, "order", 1)
|
269 |
+
callback(step_idx, t, latents)
|
270 |
+
|
271 |
+
self.maybe_free_model_hooks()
|
272 |
+
|
273 |
+
if self.latents_offset is not None:
|
274 |
+
latents = latents + torch.tensor(self.latents_offset).to(latents.device)[None, :, None, None]
|
275 |
+
|
276 |
+
if not output_type == "latent":
|
277 |
+
image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
|
278 |
+
image, has_nsfw_concept = self.run_safety_checker(image, device, image_embeddings.dtype)
|
279 |
+
else:
|
280 |
+
image = latents
|
281 |
+
has_nsfw_concept = None
|
282 |
+
|
283 |
+
if has_nsfw_concept is None:
|
284 |
+
do_denormalize = [True] * image.shape[0]
|
285 |
+
else:
|
286 |
+
do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept]
|
287 |
+
|
288 |
+
image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)
|
289 |
+
|
290 |
+
self.maybe_free_model_hooks()
|
291 |
+
|
292 |
+
if not return_dict:
|
293 |
+
return (image, has_nsfw_concept)
|
294 |
+
|
295 |
+
return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
|
296 |
+
|
297 |
+
if __name__ == "__main__":
|
298 |
+
pass
|
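As a usage note: the trainers later in this commit build this `StableDiffusionImageCustomPipeline` via `construct_pipeline`, and `BasicTrainer.save_model` can persist it with `save_pretrained`. Below is a minimal sketch of reloading and calling such a saved pipeline; the directory name, input file names and image sizes are assumptions for illustration, not paths shipped in this commit.

```python
import torch
from PIL import Image
from custum_3d_diffusion.custum_pipeline.unifield_pipeline_img2img import StableDiffusionImageCustomPipeline

# Hypothetical directory previously written by pipeline.save_pretrained(...) / BasicTrainer.save_model.
pipe = StableDiffusionImageCustomPipeline.from_pretrained(
    "./ckpt/saved_img2img_pipeline", safety_checker=None, torch_dtype=torch.bfloat16
).to("cuda")

cond = Image.open("input.png").convert("RGB")  # any RGB conditioning image
out = pipe(cond, height=512, width=512, height_cond=512, width_cond=512, guidance_scale=2.0)
out.images[0].save("result.png")
```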
custum_3d_diffusion/custum_pipeline/unifield_pipeline_img2mvimg.py
ADDED
@@ -0,0 +1,296 @@
# Copyright 2023 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# modified by Wuvin


from typing import Any, Callable, Dict, List, Optional, Tuple, Union

import numpy as np
import torch

from diffusers import AutoencoderKL, UNet2DConditionModel, StableDiffusionImageVariationPipeline
from diffusers.schedulers import KarrasDiffusionSchedulers, DDPMScheduler
from diffusers.pipelines.stable_diffusion import StableDiffusionSafetyChecker, StableDiffusionPipelineOutput
from diffusers.models.unets.unet_2d_condition import UNet2DConditionModel
from PIL import Image
from transformers import CLIPImageProcessor, CLIPVisionModelWithProjection



class StableDiffusionImage2MVCustomPipeline(
    StableDiffusionImageVariationPipeline
):
    def __init__(
        self,
        vae: AutoencoderKL,
        image_encoder: CLIPVisionModelWithProjection,
        unet: UNet2DConditionModel,
        scheduler: KarrasDiffusionSchedulers,
        safety_checker: StableDiffusionSafetyChecker,
        feature_extractor: CLIPImageProcessor,
        requires_safety_checker: bool = True,
        latents_offset=None,
        noisy_cond_latents=False,
        condition_offset=True,
    ):
        super().__init__(
            vae=vae,
            image_encoder=image_encoder,
            unet=unet,
            scheduler=scheduler,
            safety_checker=safety_checker,
            feature_extractor=feature_extractor,
            requires_safety_checker=requires_safety_checker
        )
        latents_offset = tuple(latents_offset) if latents_offset is not None else None
        self.latents_offset = latents_offset
        if latents_offset is not None:
            self.register_to_config(latents_offset=latents_offset)
        if noisy_cond_latents:
            raise NotImplementedError("Noisy condition latents not supported Now.")
        self.condition_offset = condition_offset
        self.register_to_config(condition_offset=condition_offset)

    def encode_latents(self, image: Image.Image, device, dtype, height, width):
        images = self.image_processor.preprocess(image.convert("RGB"), height=height, width=width).to(device, dtype=dtype)
        # NOTE: .mode() for condition
        latents = self.vae.encode(images).latent_dist.mode() * self.vae.config.scaling_factor
        if self.latents_offset is not None and self.condition_offset:
            return latents - torch.tensor(self.latents_offset).to(latents.device)[None, :, None, None]
        else:
            return latents

    def _encode_image(self, image, device, num_images_per_prompt, do_classifier_free_guidance):
        dtype = next(self.image_encoder.parameters()).dtype

        if not isinstance(image, torch.Tensor):
            image = self.feature_extractor(images=image, return_tensors="pt").pixel_values

        image = image.to(device=device, dtype=dtype)
        image_embeddings = self.image_encoder(image).image_embeds
        image_embeddings = image_embeddings.unsqueeze(1)

        # duplicate image embeddings for each generation per prompt, using mps friendly method
        bs_embed, seq_len, _ = image_embeddings.shape
        image_embeddings = image_embeddings.repeat(1, num_images_per_prompt, 1)
        image_embeddings = image_embeddings.view(bs_embed * num_images_per_prompt, seq_len, -1)

        if do_classifier_free_guidance:
            # NOTE: the same as original code
            negative_prompt_embeds = torch.zeros_like(image_embeddings)
            # For classifier free guidance, we need to do two forward passes.
            # Here we concatenate the unconditional and text embeddings into a single batch
            # to avoid doing two forward passes
            image_embeddings = torch.cat([negative_prompt_embeds, image_embeddings])

        return image_embeddings

    @torch.no_grad()
    def __call__(
        self,
        image: Union[Image.Image, List[Image.Image], torch.FloatTensor],
        height: Optional[int] = 1024,
        width: Optional[int] = 1024,
        height_cond: Optional[int] = 512,
        width_cond: Optional[int] = 512,
        num_inference_steps: int = 50,
        guidance_scale: float = 7.5,
        num_images_per_prompt: Optional[int] = 1,
        eta: float = 0.0,
        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
        latents: Optional[torch.FloatTensor] = None,
        output_type: Optional[str] = "pil",
        return_dict: bool = True,
        callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
        callback_steps: int = 1,
    ):
        r"""
        The call function to the pipeline for generation.

        Args:
            image (`Image.Image` or `List[Image.Image]` or `torch.FloatTensor`):
                Image or images to guide image generation. If you provide a tensor, it needs to be compatible with
                [`CLIPImageProcessor`](https://huggingface.co/lambdalabs/sd-image-variations-diffusers/blob/main/feature_extractor/preprocessor_config.json).
            height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
                The height in pixels of the generated image.
            width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
                The width in pixels of the generated image.
            num_inference_steps (`int`, *optional*, defaults to 50):
                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
                expense of slower inference. This parameter is modulated by `strength`.
            guidance_scale (`float`, *optional*, defaults to 7.5):
                A higher guidance scale value encourages the model to generate images closely linked to the text
                `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
            num_images_per_prompt (`int`, *optional*, defaults to 1):
                The number of images to generate per prompt.
            eta (`float`, *optional*, defaults to 0.0):
                Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies
                to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers.
            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
                generation deterministic.
            latents (`torch.FloatTensor`, *optional*):
                Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
                tensor is generated by sampling using the supplied random `generator`.
            output_type (`str`, *optional*, defaults to `"pil"`):
                The output format of the generated image. Choose between `PIL.Image` or `np.array`.
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
                plain tuple.
            callback (`Callable`, *optional*):
                A function that calls every `callback_steps` steps during inference. The function is called with the
                following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
            callback_steps (`int`, *optional*, defaults to 1):
                The frequency at which the `callback` function is called. If not specified, the callback is called at
                every step.

        Returns:
            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
                If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] is returned,
                otherwise a `tuple` is returned where the first element is a list with the generated images and the
                second element is a list of `bool`s indicating whether the corresponding generated image contains
                "not-safe-for-work" (nsfw) content.

        Examples:

        ```py
        from diffusers import StableDiffusionImageVariationPipeline
        from PIL import Image
        from io import BytesIO
        import requests

        pipe = StableDiffusionImageVariationPipeline.from_pretrained(
            "lambdalabs/sd-image-variations-diffusers", revision="v2.0"
        )
        pipe = pipe.to("cuda")

        url = "https://lh3.googleusercontent.com/y-iFOHfLTwkuQSUegpwDdgKmOjRSTvPxat63dQLB25xkTs4lhIbRUFeNBWZzYf370g=s1200"

        response = requests.get(url)
        image = Image.open(BytesIO(response.content)).convert("RGB")

        out = pipe(image, num_images_per_prompt=3, guidance_scale=15)
        out["images"][0].save("result.jpg")
        ```
        """
        # 0. Default height and width to unet
        height = height or self.unet.config.sample_size * self.vae_scale_factor
        width = width or self.unet.config.sample_size * self.vae_scale_factor

        # 1. Check inputs. Raise error if not correct
        self.check_inputs(image, height, width, callback_steps)

        # 2. Define call parameters
        if isinstance(image, Image.Image):
            batch_size = 1
        elif len(image) == 1:
            image = image[0]
            batch_size = 1
        else:
            raise NotImplementedError()
        # elif isinstance(image, list):
        #     batch_size = len(image)
        # else:
        #     batch_size = image.shape[0]
        device = self._execution_device
        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
        # corresponds to doing no classifier free guidance.
        do_classifier_free_guidance = guidance_scale > 1.0

        # 3. Encode input image
        emb_image = image

        image_embeddings = self._encode_image(emb_image, device, num_images_per_prompt, do_classifier_free_guidance)
        cond_latents = self.encode_latents(image, image_embeddings.device, image_embeddings.dtype, height_cond, width_cond)
        cond_latents = torch.cat([torch.zeros_like(cond_latents), cond_latents]) if do_classifier_free_guidance else cond_latents
        image_pixels = self.feature_extractor(images=emb_image, return_tensors="pt").pixel_values
        if do_classifier_free_guidance:
            image_pixels = torch.cat([torch.zeros_like(image_pixels), image_pixels], dim=0)

        # 4. Prepare timesteps
        self.scheduler.set_timesteps(num_inference_steps, device=device)
        timesteps = self.scheduler.timesteps

        # 5. Prepare latent variables
        num_channels_latents = self.unet.config.out_channels
        latents = self.prepare_latents(
            batch_size * num_images_per_prompt,
            num_channels_latents,
            height,
            width,
            image_embeddings.dtype,
            device,
            generator,
            latents,
        )


        # 6. Prepare extra step kwargs.
        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
        # 7. Denoising loop
        num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
        with self.progress_bar(total=num_inference_steps) as progress_bar:
            for i, t in enumerate(timesteps):
                # expand the latents if we are doing classifier free guidance
                latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
                latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)

                # predict the noise residual
                noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=image_embeddings, condition_latents=cond_latents, noisy_condition_input=False, cond_pixels_clip=image_pixels).sample

                # perform guidance
                if do_classifier_free_guidance:
                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
                    noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)

                # compute the previous noisy sample x_t -> x_t-1
                latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample

                # call the callback, if provided
                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
                    progress_bar.update()
                    if callback is not None and i % callback_steps == 0:
                        step_idx = i // getattr(self.scheduler, "order", 1)
                        callback(step_idx, t, latents)

        self.maybe_free_model_hooks()

        if self.latents_offset is not None:
            latents = latents + torch.tensor(self.latents_offset).to(latents.device)[None, :, None, None]

        if not output_type == "latent":
            image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
            image, has_nsfw_concept = self.run_safety_checker(image, device, image_embeddings.dtype)
        else:
            image = latents
            has_nsfw_concept = None

        if has_nsfw_concept is None:
            do_denormalize = [True] * image.shape[0]
        else:
            do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept]

        image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)

        self.maybe_free_model_hooks()

        if not return_dict:
            return (image, has_nsfw_concept)

        return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)


if __name__ == "__main__":
    pass
custum_3d_diffusion/modules.py
ADDED
@@ -0,0 +1,14 @@
__modules__ = {}

def register(name):
    def decorator(cls):
        __modules__[name] = cls
        return cls

    return decorator


def find(name):
    return __modules__[name]

from custum_3d_diffusion.trainings import base, image2mvimage_trainer, image2image_trainer
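A quick illustration of the registry pattern above, using a made-up key (the real keys registered in this commit are "image2mvimage_trainer" and "image2image_trainer"):

```python
from custum_3d_diffusion.modules import register, find

@register("toy_trainer")   # hypothetical key, for illustration only
class ToyTrainer:
    pass

# find() returns the class stored under that key, as gradio_app/custom_models/utils.py
# does with the trainer_type strings from the YAML configs.
assert find("toy_trainer") is ToyTrainer
```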
custum_3d_diffusion/trainings/__init__.py
ADDED
File without changes
custum_3d_diffusion/trainings/base.py
ADDED
@@ -0,0 +1,208 @@
import torch
from accelerate import Accelerator
from accelerate.logging import MultiProcessAdapter
from dataclasses import dataclass, field
from typing import Optional, Union
from datasets import load_dataset
import json
import abc
from diffusers.utils import make_image_grid
import numpy as np
import wandb

from custum_3d_diffusion.trainings.utils import load_config
from custum_3d_diffusion.custum_modules.unifield_processor import ConfigurableUNet2DConditionModel, AttnConfig

class BasicTrainer(torch.nn.Module, abc.ABC):
    accelerator: Accelerator
    logger: MultiProcessAdapter
    unet: ConfigurableUNet2DConditionModel
    train_dataloader: torch.utils.data.DataLoader
    test_dataset: torch.utils.data.Dataset
    attn_config: AttnConfig

    @dataclass
    class TrainerConfig:
        trainer_name: str = "basic"
        pretrained_model_name_or_path: str = ""

        attn_config: dict = field(default_factory=dict)
        dataset_name: str = ""
        dataset_config_name: Optional[str] = None
        resolution: str = "1024"
        dataloader_num_workers: int = 4
        pair_sampler_group_size: int = 1
        num_views: int = 4

        max_train_steps: int = -1   # -1 means infinity, otherwise [0, max_train_steps)
        training_step_interval: int = 1  # train on step i*interval, stop at max_train_steps
        max_train_samples: Optional[int] = None
        seed: Optional[int] = None  # For dataset related operations and validation stuff
        train_batch_size: int = 1

        validation_interval: int = 5000
        debug: bool = False

    cfg: TrainerConfig  # only enable_xxx is used

    def __init__(
        self,
        accelerator: Accelerator,
        logger: MultiProcessAdapter,
        unet: ConfigurableUNet2DConditionModel,
        config: Union[dict, str],
        weight_dtype: torch.dtype,
        index: int,
    ):
        super().__init__()
        self.index = index  # index in all trainers
        self.accelerator = accelerator
        self.logger = logger
        self.unet = unet
        self.weight_dtype = weight_dtype
        self.ext_logs = {}
        self.cfg = load_config(self.TrainerConfig, config)
        self.attn_config = load_config(AttnConfig, self.cfg.attn_config)
        self.test_dataset = None
        self.validate_trainer_config()
        self.configure()

    def get_HW(self):
        resolution = json.loads(self.cfg.resolution)
        if isinstance(resolution, int):
            H = W = resolution
        elif isinstance(resolution, list):
            H, W = resolution
        return H, W

    def unet_update(self):
        self.unet.update_config(self.attn_config)

    def validate_trainer_config(self):
        pass

    def is_train_finished(self, current_step):
        assert isinstance(self.cfg.max_train_steps, int)
        return self.cfg.max_train_steps != -1 and current_step >= self.cfg.max_train_steps

    def next_train_step(self, current_step):
        if self.is_train_finished(current_step):
            return None
        return current_step + self.cfg.training_step_interval

    @classmethod
    def make_image_into_grid(cls, all_imgs, rows=2, columns=2):
        catted = [make_image_grid(all_imgs[i:i+rows * columns], rows=rows, cols=columns) for i in range(0, len(all_imgs), rows * columns)]
        return make_image_grid(catted, rows=1, cols=len(catted))

    def configure(self) -> None:
        pass

    @abc.abstractmethod
    def init_shared_modules(self, shared_modules: dict) -> dict:
        pass

    def load_dataset(self):
        dataset = load_dataset(
            self.cfg.dataset_name,
            self.cfg.dataset_config_name,
            trust_remote_code=True
        )
        return dataset

    @abc.abstractmethod
    def init_train_dataloader(self, shared_modules: dict) -> torch.utils.data.DataLoader:
        """Both init train_dataloader and test_dataset, but returns train_dataloader only"""
        pass

    @abc.abstractmethod
    def forward_step(
        self,
        *args,
        **kwargs
    ) -> torch.Tensor:
        """
        input a batch
        return a loss
        """
        self.unet_update()
        pass

    @abc.abstractmethod
    def construct_pipeline(self, shared_modules, unet):
        pass

    @abc.abstractmethod
    def pipeline_forward(self, pipeline, **pipeline_call_kwargs) -> tuple:
        """
        For inference time forward.
        """
        pass

    @abc.abstractmethod
    def batched_validation_forward(self, pipeline, **pipeline_call_kwargs) -> tuple:
        pass

    def do_validation(
        self,
        shared_modules,
        unet,
        global_step,
    ):
        self.unet_update()
        self.logger.info("Running validation... ")
        pipeline = self.construct_pipeline(shared_modules, unet)
        pipeline.set_progress_bar_config(disable=True)
        titles, images = self.batched_validation_forward(pipeline, guidance_scale=[1., 3.])
        for tracker in self.accelerator.trackers:
            if tracker.name == "tensorboard":
                np_images = np.stack([np.asarray(img) for img in images])
                tracker.writer.add_images("validation", np_images, global_step, dataformats="NHWC")
            elif tracker.name == "wandb":
                [image.thumbnail((512, 512)) for image, title in zip(images, titles) if 'noresize' not in title]  # inplace operation
                tracker.log({"validation": [
                    wandb.Image(image, caption=f"{i}: {titles[i]}", file_type="jpg")
                    for i, image in enumerate(images)]})
            else:
                self.logger.warn(f"image logging not implemented for {tracker.name}")
        del pipeline
        torch.cuda.empty_cache()
        return images


    @torch.no_grad()
    def log_validation(
        self,
        shared_modules,
        unet,
        global_step,
        force=False
    ):
        if self.accelerator.is_main_process:
            for tracker in self.accelerator.trackers:
                if tracker.name == "wandb":
                    tracker.log(self.ext_logs)
                    self.ext_logs = {}
        if (global_step % self.cfg.validation_interval == 0 and not self.is_train_finished(global_step)) or force:
            self.unet_update()
            if self.accelerator.is_main_process:
                self.do_validation(shared_modules, self.accelerator.unwrap_model(unet), global_step)

    def save_model(self, unwrap_unet, shared_modules, save_dir):
        if self.accelerator.is_main_process:
            pipeline = self.construct_pipeline(shared_modules, unwrap_unet)
            pipeline.save_pretrained(save_dir)
            self.logger.info(f"{self.cfg.trainer_name} Model saved at {save_dir}")

    def save_debug_info(self, save_name="debug", **kwargs):
        if self.cfg.debug:
            to_saves = {key: value.detach().cpu() if isinstance(value, torch.Tensor) else value for key, value in kwargs.items()}
            import pickle
            import os
            if os.path.exists(f"{save_name}.pkl"):
                for i in range(100):
                    if not os.path.exists(f"{save_name}_v{i}.pkl"):
                        save_name = f"{save_name}_v{i}"
                        break
            with open(f"{save_name}.pkl", "wb") as f:
                pickle.dump(to_saves, f)
custum_3d_diffusion/trainings/config_classes.py
ADDED
@@ -0,0 +1,35 @@
from dataclasses import dataclass, field
from typing import List, Optional


@dataclass
class TrainerSubConfig:
    trainer_type: str = ""
    trainer: dict = field(default_factory=dict)


@dataclass
class ExprimentConfig:
    trainers: List[dict] = field(default_factory=lambda: [])
    init_config: dict = field(default_factory=dict)
    pretrained_model_name_or_path: str = ""
    pretrained_unet_state_dict_path: str = ""
    # expriments related parameters
    linear_beta_schedule: bool = False
    zero_snr: bool = False
    prediction_type: Optional[str] = None
    seed: Optional[int] = None
    max_train_steps: int = 1000000
    gradient_accumulation_steps: int = 1
    learning_rate: float = 1e-4
    lr_scheduler: str = "constant"
    lr_warmup_steps: int = 500
    use_8bit_adam: bool = False
    adam_beta1: float = 0.9
    adam_beta2: float = 0.999
    adam_weight_decay: float = 1e-2
    adam_epsilon: float = 1e-08
    max_grad_norm: float = 1.0
    mixed_precision: Optional[str] = None  # ["no", "fp16", "bf16", "fp8"]
    skip_training: bool = False
    debug: bool = False
custum_3d_diffusion/trainings/image2image_trainer.py
ADDED
@@ -0,0 +1,86 @@
import json
import torch
from diffusers import EulerAncestralDiscreteScheduler, DDPMScheduler
from dataclasses import dataclass

from custum_3d_diffusion.modules import register
from custum_3d_diffusion.trainings.image2mvimage_trainer import Image2MVImageTrainer
from custum_3d_diffusion.custum_pipeline.unifield_pipeline_img2img import StableDiffusionImageCustomPipeline
from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput

def get_HW(resolution):
    if isinstance(resolution, str):
        resolution = json.loads(resolution)
    if isinstance(resolution, int):
        H = W = resolution
    elif isinstance(resolution, list):
        H, W = resolution
    return H, W


@register("image2image_trainer")
class Image2ImageTrainer(Image2MVImageTrainer):
    """
    Trainer for simple image to multiview images.
    """
    @dataclass
    class TrainerConfig(Image2MVImageTrainer.TrainerConfig):
        trainer_name: str = "image2image"

    cfg: TrainerConfig

    def forward_step(self, batch, unet, shared_modules, noise_scheduler: DDPMScheduler, global_step) -> torch.Tensor:
        raise NotImplementedError()

    def construct_pipeline(self, shared_modules, unet, old_version=False):
        MyPipeline = StableDiffusionImageCustomPipeline
        pipeline = MyPipeline.from_pretrained(
            self.cfg.pretrained_model_name_or_path,
            vae=shared_modules['vae'],
            image_encoder=shared_modules['image_encoder'],
            feature_extractor=shared_modules['feature_extractor'],
            unet=unet,
            safety_checker=None,
            torch_dtype=self.weight_dtype,
            latents_offset=self.cfg.latents_offset,
            noisy_cond_latents=self.cfg.noisy_condition_input,
        )
        pipeline.set_progress_bar_config(disable=True)
        scheduler_dict = {}
        if self.cfg.zero_snr:
            scheduler_dict.update(rescale_betas_zero_snr=True)
        if self.cfg.linear_beta_schedule:
            scheduler_dict.update(beta_schedule='linear')

        pipeline.scheduler = EulerAncestralDiscreteScheduler.from_config(pipeline.scheduler.config, **scheduler_dict)
        return pipeline

    def get_forward_args(self):
        if self.cfg.seed is None:
            generator = None
        else:
            generator = torch.Generator(device=self.accelerator.device).manual_seed(self.cfg.seed)

        H, W = get_HW(self.cfg.resolution)
        H_cond, W_cond = get_HW(self.cfg.condition_image_resolution)

        forward_args = dict(
            num_images_per_prompt=1,
            num_inference_steps=20,
            height=H,
            width=W,
            height_cond=H_cond,
            width_cond=W_cond,
            generator=generator,
        )
        if self.cfg.zero_snr:
            forward_args.update(guidance_rescale=0.7)
        return forward_args

    def pipeline_forward(self, pipeline, **pipeline_call_kwargs) -> StableDiffusionPipelineOutput:
        forward_args = self.get_forward_args()
        forward_args.update(pipeline_call_kwargs)
        return pipeline(**forward_args)

    def batched_validation_forward(self, pipeline, **pipeline_call_kwargs) -> tuple:
        raise NotImplementedError()
custum_3d_diffusion/trainings/image2mvimage_trainer.py
ADDED
@@ -0,0 +1,139 @@
import torch
from diffusers import AutoencoderKL, DDPMScheduler, EulerAncestralDiscreteScheduler, DDIMScheduler
from transformers import CLIPImageProcessor, CLIPVisionModelWithProjection, BatchFeature

import json
from dataclasses import dataclass
from typing import List, Optional

from custum_3d_diffusion.modules import register
from custum_3d_diffusion.trainings.base import BasicTrainer
from custum_3d_diffusion.custum_pipeline.unifield_pipeline_img2mvimg import StableDiffusionImage2MVCustomPipeline
from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput

def get_HW(resolution):
    if isinstance(resolution, str):
        resolution = json.loads(resolution)
    if isinstance(resolution, int):
        H = W = resolution
    elif isinstance(resolution, list):
        H, W = resolution
    return H, W

@register("image2mvimage_trainer")
class Image2MVImageTrainer(BasicTrainer):
    """
    Trainer for simple image to multiview images.
    """
    @dataclass
    class TrainerConfig(BasicTrainer.TrainerConfig):
        trainer_name: str = "image2mvimage"
        condition_image_column_name: str = "conditioning_image"
        image_column_name: str = "image"
        condition_dropout: float = 0.
        condition_image_resolution: str = "512"
        validation_images: Optional[List[str]] = None
        noise_offset: float = 0.1
        max_loss_drop: float = 0.
        snr_gamma: float = 5.0
        log_distribution: bool = False
        latents_offset: Optional[List[float]] = None
        input_perturbation: float = 0.
        noisy_condition_input: bool = False  # whether to add noise for ref unet input
        normal_cls_offset: int = 0
        condition_offset: bool = True
        zero_snr: bool = False
        linear_beta_schedule: bool = False

    cfg: TrainerConfig

    def configure(self) -> None:
        return super().configure()

    def init_shared_modules(self, shared_modules: dict) -> dict:
        if 'vae' not in shared_modules:
            vae = AutoencoderKL.from_pretrained(
                self.cfg.pretrained_model_name_or_path, subfolder="vae", torch_dtype=self.weight_dtype
            )
            vae.requires_grad_(False)
            vae.to(self.accelerator.device, dtype=self.weight_dtype)
            shared_modules['vae'] = vae
        if 'image_encoder' not in shared_modules:
            image_encoder = CLIPVisionModelWithProjection.from_pretrained(
                self.cfg.pretrained_model_name_or_path, subfolder="image_encoder"
            )
            image_encoder.requires_grad_(False)
            image_encoder.to(self.accelerator.device, dtype=self.weight_dtype)
            shared_modules['image_encoder'] = image_encoder
        if 'feature_extractor' not in shared_modules:
            feature_extractor = CLIPImageProcessor.from_pretrained(
                self.cfg.pretrained_model_name_or_path, subfolder="feature_extractor"
            )
            shared_modules['feature_extractor'] = feature_extractor
        return shared_modules

    def init_train_dataloader(self, shared_modules: dict) -> torch.utils.data.DataLoader:
        raise NotImplementedError()

    def loss_rescale(self, loss, timesteps=None):
        raise NotImplementedError()

    def forward_step(self, batch, unet, shared_modules, noise_scheduler: DDPMScheduler, global_step) -> torch.Tensor:
        raise NotImplementedError()

    def construct_pipeline(self, shared_modules, unet, old_version=False):
        MyPipeline = StableDiffusionImage2MVCustomPipeline
        pipeline = MyPipeline.from_pretrained(
            self.cfg.pretrained_model_name_or_path,
            vae=shared_modules['vae'],
            image_encoder=shared_modules['image_encoder'],
            feature_extractor=shared_modules['feature_extractor'],
            unet=unet,
            safety_checker=None,
            torch_dtype=self.weight_dtype,
            latents_offset=self.cfg.latents_offset,
            noisy_cond_latents=self.cfg.noisy_condition_input,
            condition_offset=self.cfg.condition_offset,
        )
        pipeline.set_progress_bar_config(disable=True)
        scheduler_dict = {}
        if self.cfg.zero_snr:
            scheduler_dict.update(rescale_betas_zero_snr=True)
        if self.cfg.linear_beta_schedule:
            scheduler_dict.update(beta_schedule='linear')

        pipeline.scheduler = EulerAncestralDiscreteScheduler.from_config(pipeline.scheduler.config, **scheduler_dict)
        return pipeline

    def get_forward_args(self):
        if self.cfg.seed is None:
            generator = None
        else:
            generator = torch.Generator(device=self.accelerator.device).manual_seed(self.cfg.seed)

        H, W = get_HW(self.cfg.resolution)
        H_cond, W_cond = get_HW(self.cfg.condition_image_resolution)

        sub_img_H = H // 2
        num_imgs = H // sub_img_H * W // sub_img_H

        forward_args = dict(
            num_images_per_prompt=num_imgs,
            num_inference_steps=50,
            height=sub_img_H,
            width=sub_img_H,
            height_cond=H_cond,
            width_cond=W_cond,
            generator=generator,
        )
        if self.cfg.zero_snr:
            forward_args.update(guidance_rescale=0.7)
        return forward_args

    def pipeline_forward(self, pipeline, **pipeline_call_kwargs) -> StableDiffusionPipelineOutput:
        forward_args = self.get_forward_args()
        forward_args.update(pipeline_call_kwargs)
        return pipeline(**forward_args)

    def batched_validation_forward(self, pipeline, **pipeline_call_kwargs) -> tuple:
        raise NotImplementedError()
custum_3d_diffusion/trainings/utils.py
ADDED
@@ -0,0 +1,25 @@
from omegaconf import DictConfig, OmegaConf


def parse_structured(fields, cfg) -> DictConfig:
    scfg = OmegaConf.structured(fields(**cfg))
    return scfg


def load_config(fields, config, extras=None):
    if extras is not None:
        print("Warning! extra parameter in cli is not verified, may cause erros.")
    if isinstance(config, str):
        cfg = OmegaConf.load(config)
    elif isinstance(config, dict):
        cfg = OmegaConf.create(config)
    elif isinstance(config, DictConfig):
        cfg = config
    else:
        raise NotImplementedError(f"Unsupported config type {type(config)}")
    if extras is not None:
        cli_conf = OmegaConf.from_cli(extras)
        cfg = OmegaConf.merge(cfg, cli_conf)
    OmegaConf.resolve(cfg)
    assert isinstance(cfg, DictConfig)
    return parse_structured(fields, cfg)
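`load_config` is how every trainer and attention config in this commit is parsed: a YAML path, plain dict, or DictConfig is resolved and validated against a dataclass via `OmegaConf.structured`. A small sketch with a throwaway dataclass (the dataclass and values are illustrative only, not part of the repo):

```python
from dataclasses import dataclass
from custum_3d_diffusion.trainings.utils import load_config

@dataclass
class ToyConfig:            # hypothetical dataclass, for illustration only
    resolution: str = "256"
    seed: int = 0

cfg = load_config(ToyConfig, {"resolution": "512"})
print(cfg.resolution, cfg.seed)  # -> 512 0  (a DictConfig with the dataclass defaults filled in)
```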
gradio_app/__init__.py
ADDED
File without changes
gradio_app/all_models.py
ADDED
@@ -0,0 +1,22 @@
import torch
from scripts.sd_model_zoo import load_common_sd15_pipe
from diffusers import StableDiffusionControlNetImg2ImgPipeline, StableDiffusionPipeline


class MyModelZoo:
    _pipe_disney_controlnet_lineart_ipadapter_i2i: StableDiffusionControlNetImg2ImgPipeline = None

    base_model = "runwayml/stable-diffusion-v1-5"

    def __init__(self, base_model=None) -> None:
        if base_model is not None:
            self.base_model = base_model

    @property
    def pipe_disney_controlnet_tile_ipadapter_i2i(self):
        return self._pipe_disney_controlnet_lineart_ipadapter_i2i

    def init_models(self):
        self._pipe_disney_controlnet_lineart_ipadapter_i2i = load_common_sd15_pipe(base_model=self.base_model, ip_adapter=True, plus_model=False, controlnet="./ckpt/controlnet-tile", pipeline_class=StableDiffusionControlNetImg2ImgPipeline)

model_zoo = MyModelZoo()
gradio_app/custom_models/image2mvimage.yaml
ADDED
@@ -0,0 +1,63 @@
pretrained_model_name_or_path: "./ckpt/img2mvimg"
mixed_precision: "bf16"

init_config:
  # enable controls
  enable_cross_attn_lora: False
  enable_cross_attn_ip: False
  enable_self_attn_lora: False
  enable_self_attn_ref: False
  enable_multiview_attn: True

  # for cross attention
  init_cross_attn_lora: False
  init_cross_attn_ip: False
  cross_attn_lora_rank: 256 # 0 for not enabled
  cross_attn_lora_only_kv: False
  ipadapter_pretrained_name: "h94/IP-Adapter"
  ipadapter_subfolder_name: "models"
  ipadapter_weight_name: "ip-adapter_sd15.safetensors"
  ipadapter_effect_on: "all" # all, first

  # for self attention
  init_self_attn_lora: False
  self_attn_lora_rank: 256
  self_attn_lora_only_kv: False

  # for self attention ref
  init_self_attn_ref: False
  self_attn_ref_position: "attn1"
  self_attn_ref_other_model_name: "lambdalabs/sd-image-variations-diffusers"
  self_attn_ref_pixel_wise_crosspond: False
  self_attn_ref_effect_on: "all"

  # for multiview attention
  init_multiview_attn: True
  multiview_attn_position: "attn1"
  use_mv_joint_attn: True
  num_modalities: 1

  # for unet
  init_unet_path: "${pretrained_model_name_or_path}"
  cat_condition: True # cat condition to input

  # for cls embedding
  init_num_cls_label: 8 # for initialize
  cls_labels: [0, 1, 2, 3] # for current task

trainers:
  - trainer_type: "image2mvimage_trainer"
    trainer:
      pretrained_model_name_or_path: "${pretrained_model_name_or_path}"
      attn_config:
        cls_labels: [0, 1, 2, 3] # for current task
        enable_cross_attn_lora: False
        enable_cross_attn_ip: False
        enable_self_attn_lora: False
        enable_self_attn_ref: False
        enable_multiview_attn: True
      resolution: "256"
      condition_image_resolution: "256"
      normal_cls_offset: 4
      condition_image_column_name: "conditioning_image"
      image_column_name: "image"
gradio_app/custom_models/image2normal.yaml
ADDED
@@ -0,0 +1,61 @@
pretrained_model_name_or_path: "lambdalabs/sd-image-variations-diffusers"
mixed_precision: "bf16"

init_config:
  # enable controls
  enable_cross_attn_lora: False
  enable_cross_attn_ip: False
  enable_self_attn_lora: False
  enable_self_attn_ref: True
  enable_multiview_attn: False

  # for cross attention
  init_cross_attn_lora: False
  init_cross_attn_ip: False
  cross_attn_lora_rank: 512 # 0 for not enabled
  cross_attn_lora_only_kv: False
  ipadapter_pretrained_name: "h94/IP-Adapter"
  ipadapter_subfolder_name: "models"
  ipadapter_weight_name: "ip-adapter_sd15.safetensors"
  ipadapter_effect_on: "all" # all, first

  # for self attention
  init_self_attn_lora: False
  self_attn_lora_rank: 512
  self_attn_lora_only_kv: False

  # for self attention ref
  init_self_attn_ref: True
  self_attn_ref_position: "attn1"
  self_attn_ref_other_model_name: "lambdalabs/sd-image-variations-diffusers"
  self_attn_ref_pixel_wise_crosspond: True
  self_attn_ref_effect_on: "all"

  # for multiview attention
  init_multiview_attn: False
  multiview_attn_position: "attn1"
  num_modalities: 1

  # for unet
  init_unet_path: "${pretrained_model_name_or_path}"
  init_num_cls_label: 0 # for initialize
  cls_labels: [] # for current task

trainers:
  - trainer_type: "image2image_trainer"
    trainer:
      pretrained_model_name_or_path: "${pretrained_model_name_or_path}"
      attn_config:
        cls_labels: [] # for current task
        enable_cross_attn_lora: False
        enable_cross_attn_ip: False
        enable_self_attn_lora: False
        enable_self_attn_ref: True
        enable_multiview_attn: False
      resolution: "512"
      condition_image_resolution: "512"
      condition_image_column_name: "conditioning_image"
      image_column_name: "image"
gradio_app/custom_models/mvimg_prediction.py
ADDED
@@ -0,0 +1,59 @@
import sys
import torch
import gradio as gr
from PIL import Image
import numpy as np
from rembg import remove
from gradio_app.utils import change_rgba_bg, rgba_to_rgb
from gradio_app.custom_models.utils import load_pipeline
from scripts.all_typing import *
from scripts.utils import session, simple_preprocess

training_config = "gradio_app/custom_models/image2mvimage.yaml"
checkpoint_path = "ckpt/img2mvimg/unet_state_dict.pth"

trainer, pipeline = load_pipeline(training_config, checkpoint_path)

def predict(img_list: List[Image.Image], guidance_scale=2., **kwargs):
    global pipeline
    pipeline = pipeline.to("cuda")
    if isinstance(img_list, Image.Image):
        img_list = [img_list]
    img_list = [rgba_to_rgb(i) if i.mode == 'RGBA' else i for i in img_list]
    ret = []
    for img in img_list:
        images = trainer.pipeline_forward(
            pipeline=pipeline,
            image=img,
            guidance_scale=guidance_scale,
            **kwargs
        ).images
        ret.extend(images)
    return ret


def run_mvprediction(input_image: Image.Image, remove_bg=True, guidance_scale=1.5, seed=1145):
    if input_image.mode == 'RGB' or np.array(input_image)[..., -1].mean() == 255.:
        # still do remove using rembg, since simple_preprocess requires RGBA image
        print("RGB image not RGBA! still remove bg!")
        remove_bg = True

    if remove_bg:
        input_image = remove(input_image, session=session)

    # make front_pil RGBA with white bg
    input_image = change_rgba_bg(input_image, "white")
    single_image = simple_preprocess(input_image)

    generator = torch.Generator(device="cuda").manual_seed(int(seed)) if seed >= 0 else None

    rgb_pils = predict(
        single_image,
        generator=generator,
        guidance_scale=guidance_scale,
        width=256,
        height=256,
        num_inference_steps=30,
    )

    return rgb_pils, single_image
gradio_app/custom_models/normal_prediction.py
ADDED
@@ -0,0 +1,28 @@
import sys
from PIL import Image
from gradio_app.utils import rgba_to_rgb, simple_remove
from gradio_app.custom_models.utils import load_pipeline
from scripts.utils import rotate_normals_torch
from scripts.all_typing import *

training_config = "gradio_app/custom_models/image2normal.yaml"
checkpoint_path = "ckpt/image2normal/unet_state_dict.pth"
trainer, pipeline = load_pipeline(training_config, checkpoint_path)

def predict_normals(image: List[Image.Image], guidance_scale=2., do_rotate=True, num_inference_steps=30, **kwargs):
    global pipeline
    pipeline = pipeline.to("cuda")

    img_list = image if isinstance(image, list) else [image]
    img_list = [rgba_to_rgb(i) if i.mode == 'RGBA' else i for i in img_list]
    images = trainer.pipeline_forward(
        pipeline=pipeline,
        image=img_list,
        num_inference_steps=num_inference_steps,
        guidance_scale=guidance_scale,
        **kwargs
    ).images
    images = simple_remove(images)
    if do_rotate and len(images) > 1:
        images = rotate_normals_torch(images, return_types='pil')
    return images
gradio_app/custom_models/utils.py
ADDED
@@ -0,0 +1,75 @@
import torch
from typing import List
from dataclasses import dataclass
from gradio_app.utils import rgba_to_rgb
from custum_3d_diffusion.trainings.config_classes import ExprimentConfig, TrainerSubConfig
from custum_3d_diffusion import modules
from custum_3d_diffusion.custum_modules.unifield_processor import AttnConfig, ConfigurableUNet2DConditionModel
from custum_3d_diffusion.trainings.base import BasicTrainer
from custum_3d_diffusion.trainings.utils import load_config


@dataclass
class FakeAccelerator:
    device: torch.device = torch.device("cuda")


def init_trainers(cfg_path: str, weight_dtype: torch.dtype, extras: dict):
    accelerator = FakeAccelerator()
    cfg: ExprimentConfig = load_config(ExprimentConfig, cfg_path, extras)
    init_config: AttnConfig = load_config(AttnConfig, cfg.init_config)
    configurable_unet = ConfigurableUNet2DConditionModel(init_config, weight_dtype)
    configurable_unet.enable_xformers_memory_efficient_attention()
    trainer_cfgs: List[TrainerSubConfig] = [load_config(TrainerSubConfig, trainer) for trainer in cfg.trainers]
    trainers: List[BasicTrainer] = [modules.find(trainer.trainer_type)(accelerator, None, configurable_unet, trainer.trainer, weight_dtype, i) for i, trainer in enumerate(trainer_cfgs)]
    return trainers, configurable_unet


from gradio_app.utils import make_image_grid, split_image
def process_image(function, img, guidance_scale=2., merged_image=False, remove_bg=True):
    from rembg import remove
    if remove_bg:
        img = remove(img)
    img = rgba_to_rgb(img)
    if merged_image:
        img = split_image(img, rows=2)
    images = function(
        image=img,
        guidance_scale=guidance_scale,
    )
    if len(images) > 1:
        return make_image_grid(images, rows=2)
    else:
        return images[0]


def process_text(trainer, pipeline, img, guidance_scale=2.):
    pipeline.cfg.validation_prompts = [img]
    titles, images = trainer.batched_validation_forward(pipeline, guidance_scale=[guidance_scale])
    return images[0]


def load_pipeline(config_path, ckpt_path, pipeline_filter=lambda x: True, weight_dtype = torch.bfloat16):
    training_config = config_path
    load_from_checkpoint = ckpt_path
    extras = []
    device = "cuda"
    trainers, configurable_unet = init_trainers(training_config, weight_dtype, extras)
    shared_modules = dict()
    for trainer in trainers:
        shared_modules = trainer.init_shared_modules(shared_modules)

    if load_from_checkpoint is not None:
        state_dict = torch.load(load_from_checkpoint, map_location="cpu")
        configurable_unet.unet.load_state_dict(state_dict, strict=False)
    # Move unet, vae and text_encoder to device and cast to weight_dtype
    configurable_unet.unet.to(device, dtype=weight_dtype)

    pipeline = None
    trainer_out = None
    for trainer in trainers:
        if pipeline_filter(trainer.cfg.trainer_name):
            pipeline = trainer.construct_pipeline(shared_modules, configurable_unet.unet)
            pipeline.set_progress_bar_config(disable=False)
            trainer_out = trainer
    pipeline = pipeline.to(device, dtype=weight_dtype)
    return trainer_out, pipeline
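A sketch of how `load_pipeline` is meant to be called (this mirrors the two predictor modules in this commit; the explicit `pipeline_filter` below is just the default shown for illustration):

# hypothetical usage sketch
import torch
from gradio_app.custom_models.utils import load_pipeline

trainer, pipeline = load_pipeline(
    "gradio_app/custom_models/image2normal.yaml",   # config shipped in this commit
    "ckpt/image2normal/unet_state_dict.pth",        # checkpoint path used by normal_prediction.py
    pipeline_filter=lambda name: True,              # last matching trainer wins; True keeps the last one
    weight_dtype=torch.bfloat16,
)
# trainer.pipeline_forward(pipeline=pipeline, image=..., guidance_scale=...) then runs inference,
# exactly as normal_prediction.py does above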
gradio_app/examples/Groot.png
ADDED
gradio_app/examples/aaa.png
ADDED
gradio_app/examples/abma.png
ADDED
gradio_app/examples/akun.png
ADDED
gradio_app/examples/anya.png
ADDED
gradio_app/examples/bag.png
ADDED
gradio_app/examples/ex1.png
ADDED
gradio_app/examples/ex2.png
ADDED
gradio_app/examples/ex3.jpg
ADDED
gradio_app/examples/ex4.png
ADDED
gradio_app/examples/generated_1715761545_frame0.png
ADDED
gradio_app/examples/generated_1715762357_frame0.png
ADDED
gradio_app/examples/generated_1715763329_frame0.png
ADDED
gradio_app/examples/hatsune_miku.png
ADDED
gradio_app/examples/princess-large.png
ADDED
gradio_app/gradio_3dgen.py
ADDED
@@ -0,0 +1,85 @@
import spaces
import os
import gradio as gr
from PIL import Image
from pytorch3d.structures import Meshes
from gradio_app.utils import clean_up
from gradio_app.custom_models.mvimg_prediction import run_mvprediction
from gradio_app.custom_models.normal_prediction import predict_normals
from scripts.refine_lr_to_sr import run_sr_fast
from scripts.utils import save_glb_and_video
# from scripts.multiview_inference import geo_reconstruct
from scripts.multiview_inference import geo_reconstruct_part1, geo_reconstruct_part2, geo_reconstruct_part3

@spaces.GPU(duration=100)
def run_mv(preview_img, input_processing, seed):
    if preview_img.size[0] <= 512:
        preview_img = run_sr_fast([preview_img])[0]
    rgb_pils, front_pil = run_mvprediction(preview_img, remove_bg=input_processing, seed=int(seed)) # 6s
    return rgb_pils, front_pil

@spaces.GPU(duration=100) # splitting this across multiple GPU calls seems to lead to a `RuntimeError`; until that is fixed, keep the initialization here
def generate3dv2(preview_img, input_processing, seed, render_video=True, do_refine=True, expansion_weight=0.1, init_type="std"):
    if preview_img is None:
        raise gr.Error("The input image is none!")
    if isinstance(preview_img, str):
        preview_img = Image.open(preview_img)

    rgb_pils, front_pil = run_mv(preview_img, input_processing, seed)

    vertices, faces, img_list = geo_reconstruct_part1(rgb_pils, None, front_pil, do_refine=do_refine, predict_normal=True, expansion_weight=expansion_weight, init_type=init_type)

    meshes = geo_reconstruct_part2(vertices, faces)

    new_meshes = geo_reconstruct_part3(meshes, img_list)

    vertices = new_meshes.verts_packed()
    vertices = vertices / 2 * 1.35
    vertices[..., [0, 2]] = - vertices[..., [0, 2]]
    new_meshes = Meshes(verts=[vertices], faces=new_meshes.faces_list(), textures=new_meshes.textures)

    ret_mesh, video = save_glb_and_video("/tmp/gradio/generated", new_meshes, with_timestamp=True, dist=3.5, fov_in_degrees=2 / 1.35, cam_type="ortho", export_video=render_video)
    return ret_mesh, video

#######################################
def create_ui(concurrency_id="wkl"):
    with gr.Row():
        with gr.Column(scale=1):
            input_image = gr.Image(type='pil', image_mode='RGBA', label='Frontview')

            example_folder = os.path.join(os.path.dirname(__file__), "./examples")
            example_fns = sorted([os.path.join(example_folder, example) for example in os.listdir(example_folder)])
            gr.Examples(
                examples=example_fns,
                inputs=[input_image],
                cache_examples=False,
                label='Examples',
                examples_per_page=12
            )


        with gr.Column(scale=1):
            # export mesh display
            output_mesh = gr.Model3D(value=None, label="Mesh Model", show_label=True, height=320, camera_position=(90, 90, 2))
            output_video = gr.Video(label="Preview", show_label=True, show_share_button=True, height=320, visible=False)

            input_processing = gr.Checkbox(
                value=True,
                label='Remove Background',
                visible=True,
            )
            do_refine = gr.Checkbox(value=True, label="Refine Multiview Details", visible=False)
            expansion_weight = gr.Slider(minimum=-1., maximum=1.0, value=0.1, step=0.1, label="Expansion Weight", visible=False)
            init_type = gr.Dropdown(choices=["std", "thin"], label="Mesh Initialization", value="std", visible=False)
            setable_seed = gr.Slider(-1, 1000000000, -1, step=1, visible=True, label="Seed")
            render_video = gr.Checkbox(value=False, visible=False, label="generate video")
            fullrunv2_btn = gr.Button('Generate 3D', variant = "primary", interactive=True)

    fullrunv2_btn.click(
        fn = generate3dv2,
        inputs=[input_image, input_processing, setable_seed, render_video, do_refine, expansion_weight, init_type],
        outputs=[output_mesh, output_video],
        concurrency_id=concurrency_id,
        api_name="generate3dv2",
    ).success(clean_up, api_name=False)
    return input_image
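For completeness, a sketch of driving `generate3dv2` outside the Gradio UI (assumes `model_zoo.init_models()` has been called, as `gradio_local.py` below does before serving; the input path is a placeholder):

# hypothetical usage sketch
from gradio_app.all_models import model_zoo
from gradio_app.gradio_3dgen import generate3dv2

model_zoo.init_models()
mesh_file, video_file = generate3dv2("input.png", True, seed=-1, render_video=False)
# mesh_file / video_file are whatever save_glb_and_video wrote under /tmp/gradio/generated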
gradio_app/gradio_3dgen_steps.py
ADDED
@@ -0,0 +1,87 @@
import gradio as gr
from PIL import Image

from gradio_app.custom_models.mvimg_prediction import run_mvprediction
from gradio_app.utils import make_image_grid, split_image
from scripts.utils import save_glb_and_video

def concept_to_multiview(preview_img, input_processing, seed, guidance=1.):
    seed = int(seed)
    if preview_img is None:
        raise gr.Error("preview_img is none.")
    if isinstance(preview_img, str):
        preview_img = Image.open(preview_img)

    rgb_pils, front_pil = run_mvprediction(preview_img, remove_bg=input_processing, seed=seed, guidance_scale=guidance)
    rgb_pil = make_image_grid(rgb_pils, rows=2)
    return rgb_pil, front_pil

def concept_to_multiview_ui(concurrency_id="wkl"):
    with gr.Row():
        with gr.Column(scale=2):
            preview_img = gr.Image(type='pil', image_mode='RGBA', label='Frontview')
            input_processing = gr.Checkbox(
                value=True,
                label='Remove Background',
            )
            seed = gr.Slider(minimum=-1, maximum=1000000000, value=-1, step=1.0, label="seed")
            guidance = gr.Slider(minimum=1.0, maximum=5.0, value=1.0, label="Guidance Scale", step=0.5)
            run_btn = gr.Button('Generate Multiview', interactive=True)
        with gr.Column(scale=3):
            # export mesh display
            output_rgb = gr.Image(type='pil', label="RGB", show_label=True)
            output_front = gr.Image(type='pil', image_mode='RGBA', label="Frontview", show_label=True)
    run_btn.click(
        fn = concept_to_multiview,
        inputs=[preview_img, input_processing, seed, guidance],
        outputs=[output_rgb, output_front],
        concurrency_id=concurrency_id,
        api_name=False,
    )
    return output_rgb, output_front

from gradio_app.custom_models.normal_prediction import predict_normals
from scripts.multiview_inference import geo_reconstruct
def multiview_to_mesh_v2(rgb_pil, normal_pil, front_pil, do_refine=False, expansion_weight=0.1, init_type="std"):
    rgb_pils = split_image(rgb_pil, rows=2)
    if normal_pil is not None:
        normal_pil = split_image(normal_pil, rows=2)
    if front_pil is None:
        front_pil = rgb_pils[0]
    new_meshes = geo_reconstruct(rgb_pils, normal_pil, front_pil, do_refine=do_refine, predict_normal=normal_pil is None, expansion_weight=expansion_weight, init_type=init_type)
    ret_mesh, video = save_glb_and_video("/tmp/gradio/generated", new_meshes, with_timestamp=True, dist=3.5, fov_in_degrees=2 / 1.35, cam_type="ortho", export_video=False)
    return ret_mesh

def new_multiview_to_mesh_ui(concurrency_id="wkl"):
    with gr.Row():
        with gr.Column(scale=2):
            rgb_pil = gr.Image(type='pil', image_mode='RGB', label='RGB')
            front_pil = gr.Image(type='pil', image_mode='RGBA', label='Frontview (Optional)')
            normal_pil = gr.Image(type='pil', image_mode='RGBA', label='Normal (Optional)')
            do_refine = gr.Checkbox(
                value=False,
                label='Refine rgb',
                visible=False,
            )
            expansion_weight = gr.Slider(minimum=-1.0, maximum=1.0, value=0.1, step=0.1, label="Expansion Weight", visible=False)
            init_type = gr.Dropdown(choices=["std", "thin"], label="Mesh initialization", value="std", visible=False)
            run_btn = gr.Button('Generate 3D', interactive=True)
        with gr.Column(scale=3):
            # export mesh display
            output_mesh = gr.Model3D(value=None, label="mesh model", show_label=True)
    run_btn.click(
        fn = multiview_to_mesh_v2,
        inputs=[rgb_pil, normal_pil, front_pil, do_refine, expansion_weight, init_type],
        outputs=[output_mesh],
        concurrency_id=concurrency_id,
        api_name="multiview_to_mesh",
    )
    return rgb_pil, front_pil, output_mesh


#######################################
def create_step_ui(concurrency_id="wkl"):
    with gr.Tab(label="3D:concept_to_multiview"):
        concept_to_multiview_ui(concurrency_id)
    with gr.Tab(label="3D:new_multiview_to_mesh"):
        new_multiview_to_mesh_ui(concurrency_id)
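The same two steps can be chained programmatically; a sketch (placeholder input path; the normals are predicted internally because `normal_pil` is None):

# hypothetical usage sketch
from PIL import Image
from gradio_app.gradio_3dgen_steps import concept_to_multiview, multiview_to_mesh_v2

grid, front = concept_to_multiview(Image.open("input.png"), True, seed=-1, guidance=1.)
mesh_file = multiview_to_mesh_v2(grid, None, front)   # splits the 2x2 grid back into views, then reconstructs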
gradio_app/gradio_local.py
ADDED
@@ -0,0 +1,76 @@
if __name__ == "__main__":
    import os
    import sys
    sys.path.append(os.curdir)
    if 'CUDA_VISIBLE_DEVICES' not in os.environ:
        os.environ['CUDA_VISIBLE_DEVICES'] = '0'
    os.environ['TRANSFORMERS_OFFLINE']='0'
    os.environ['DIFFUSERS_OFFLINE']='0'
    os.environ['HF_HUB_OFFLINE']='0'
    os.environ['GRADIO_ANALYTICS_ENABLED']='False'
    os.environ['HF_ENDPOINT']='https://hf-mirror.com'
    import torch
    torch.set_float32_matmul_precision('medium')
    torch.backends.cuda.matmul.allow_tf32 = True
    torch.set_grad_enabled(False)

import gradio as gr
import argparse

from gradio_app.gradio_3dgen import create_ui as create_3d_ui
# from app.gradio_3dgen_steps import create_step_ui
from gradio_app.all_models import model_zoo


_TITLE = '''Unique3D: High-Quality and Efficient 3D Mesh Generation from a Single Image'''
_DESCRIPTION = '''
[Project page](https://wukailu.github.io/Unique3D/)

* High-fidelity and diverse textured meshes generated by Unique3D from single-view images.

* The demo is still under construction, and more features are expected to be implemented soon.
'''

def launch(
    port,
    listen=False,
    share=False,
    gradio_root="",
):
    model_zoo.init_models()

    with gr.Blocks(
        title=_TITLE,
        theme=gr.themes.Monochrome(),
    ) as demo:
        with gr.Row():
            with gr.Column(scale=1):
                gr.Markdown('# ' + _TITLE)
                gr.Markdown(_DESCRIPTION)
        create_3d_ui("wkl")

    launch_args = {}
    if listen:
        launch_args["server_name"] = "0.0.0.0"

    demo.queue(default_concurrency_limit=1).launch(
        server_port=None if port == 0 else port,
        share=share,
        root_path=gradio_root if gradio_root != "" else None,  # "/myapp"
        **launch_args,
    )

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    args, extra = parser.parse_known_args()
    parser.add_argument("--listen", action="store_true")
    parser.add_argument("--port", type=int, default=0)
    parser.add_argument("--share", action="store_true")
    parser.add_argument("--gradio_root", default="")
    args = parser.parse_args()
    launch(
        args.port,
        listen=args.listen,
        share=args.share,
        gradio_root=args.gradio_root,
    )
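With the models and checkpoints in place, this entry point starts the demo locally with, for example, `python gradio_app/gradio_local.py --port 7860 --listen`; `--share` additionally requests a public Gradio link, and `--gradio_root` sets the root path when the app is served behind a reverse proxy.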
gradio_app/utils.py
ADDED
@@ -0,0 +1,112 @@
import torch
import numpy as np
from PIL import Image
import gc
import numpy as np
import numpy as np
from PIL import Image
from scripts.refine_lr_to_sr import run_sr_fast

GRADIO_CACHE = "/tmp/gradio/"

def clean_up():
    torch.cuda.empty_cache()
    gc.collect()

def remove_color(arr):
    if arr.shape[-1] == 4:
        arr = arr[..., :3]
    # calc diffs
    base = arr[0, 0]
    diffs = np.abs(arr.astype(np.int32) - base.astype(np.int32)).sum(axis=-1)
    alpha = (diffs <= 80)

    arr[alpha] = 255
    alpha = ~alpha
    arr = np.concatenate([arr, alpha[..., None].astype(np.int32) * 255], axis=-1)
    return arr

def simple_remove(imgs, run_sr=True):
    """Only works for normal"""
    if not isinstance(imgs, list):
        imgs = [imgs]
        single_input = True
    else:
        single_input = False
    if run_sr:
        imgs = run_sr_fast(imgs)
    rets = []
    for img in imgs:
        arr = np.array(img)
        arr = remove_color(arr)
        rets.append(Image.fromarray(arr.astype(np.uint8)))
    if single_input:
        return rets[0]
    return rets

def rgba_to_rgb(rgba: Image.Image, bkgd="WHITE"):
    new_image = Image.new("RGBA", rgba.size, bkgd)
    new_image.paste(rgba, (0, 0), rgba)
    new_image = new_image.convert('RGB')
    return new_image

def change_rgba_bg(rgba: Image.Image, bkgd="WHITE"):
    rgb_white = rgba_to_rgb(rgba, bkgd)
    new_rgba = Image.fromarray(np.concatenate([np.array(rgb_white), np.array(rgba)[:, :, 3:4]], axis=-1))
    return new_rgba

def split_image(image, rows=None, cols=None):
    """
    inverse function of make_image_grid
    """
    # image is in square
    if rows is None and cols is None:
        # image.size [W, H]
        rows = 1
        cols = image.size[0] // image.size[1]
        assert cols * image.size[1] == image.size[0]
        subimg_size = image.size[1]
    elif rows is None:
        subimg_size = image.size[0] // cols
        rows = image.size[1] // subimg_size
        assert rows * subimg_size == image.size[1]
    elif cols is None:
        subimg_size = image.size[1] // rows
        cols = image.size[0] // subimg_size
        assert cols * subimg_size == image.size[0]
    else:
        subimg_size = image.size[1] // rows
        assert cols * subimg_size == image.size[0]
    subimgs = []
    for i in range(rows):
        for j in range(cols):
            subimg = image.crop((j*subimg_size, i*subimg_size, (j+1)*subimg_size, (i+1)*subimg_size))
            subimgs.append(subimg)
    return subimgs

def make_image_grid(images, rows=None, cols=None, resize=None):
    if rows is None and cols is None:
        rows = 1
        cols = len(images)
    if rows is None:
        rows = len(images) // cols
        if len(images) % cols != 0:
            rows += 1
    if cols is None:
        cols = len(images) // rows
        if len(images) % rows != 0:
            cols += 1
    total_imgs = rows * cols
    if total_imgs > len(images):
        images += [Image.new(images[0].mode, images[0].size) for _ in range(total_imgs - len(images))]

    if resize is not None:
        images = [img.resize((resize, resize)) for img in images]

    w, h = images[0].size
    grid = Image.new(images[0].mode, size=(cols * w, rows * h))

    for i, img in enumerate(images):
        grid.paste(img, box=(i % cols * w, i // cols * h))
    return grid
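A small sketch of the grid helpers' round-trip behaviour (tile sizes here are illustrative):

# hypothetical usage sketch: make_image_grid and split_image invert each other for equally sized tiles
from PIL import Image
from gradio_app.utils import make_image_grid, split_image

tiles = [Image.new("RGB", (256, 256), color) for color in ("red", "green", "blue", "white")]
grid = make_image_grid(tiles, rows=2)   # 2x2 grid -> one 512x512 image
back = split_image(grid, rows=2)        # four 256x256 tiles again, in row-major order
assert [t.size for t in back] == [(256, 256)] * 4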
mesh_reconstruction/func.py
ADDED
@@ -0,0 +1,133 @@
# modified from https://github.com/Profactor/continuous-remeshing
import torch
import numpy as np
import trimesh
from typing import Tuple

def to_numpy(*args):
    def convert(a):
        if isinstance(a,torch.Tensor):
            return a.detach().cpu().numpy()
        assert a is None or isinstance(a,np.ndarray)
        return a

    return convert(args[0]) if len(args)==1 else tuple(convert(a) for a in args)

def laplacian(
        num_verts:int,
        edges: torch.Tensor #E,2
        ) -> torch.Tensor: #sparse V,V
    """create sparse Laplacian matrix"""
    V = num_verts
    E = edges.shape[0]

    #adjacency matrix,
    idx = torch.cat([edges, edges.fliplr()], dim=0).type(torch.long).T # (2, 2*E)
    ones = torch.ones(2*E, dtype=torch.float32, device=edges.device)
    A = torch.sparse.FloatTensor(idx, ones, (V, V))

    #degree matrix
    deg = torch.sparse.sum(A, dim=1).to_dense()
    idx = torch.arange(V, device=edges.device)
    idx = torch.stack([idx, idx], dim=0)
    D = torch.sparse.FloatTensor(idx, deg, (V, V))

    return D - A

def _translation(x, y, z, device):
    return torch.tensor([[1., 0, 0, x],
                         [0, 1, 0, y],
                         [0, 0, 1, z],
                         [0, 0, 0, 1]],device=device) #4,4

def _projection(r, device, l=None, t=None, b=None, n=1.0, f=50.0, flip_y=True):
    """
    see https://blog.csdn.net/wodownload2/article/details/85069240/
    """
    if l is None:
        l = -r
    if t is None:
        t = r
    if b is None:
        b = -t
    p = torch.zeros([4,4],device=device)
    p[0,0] = 2*n/(r-l)
    p[0,2] = (r+l)/(r-l)
    p[1,1] = 2*n/(t-b) * (-1 if flip_y else 1)
    p[1,2] = (t+b)/(t-b)
    p[2,2] = -(f+n)/(f-n)
    p[2,3] = -(2*f*n)/(f-n)
    p[3,2] = -1
    return p #4,4

def _orthographic(r, device, l=None, t=None, b=None, n=1.0, f=50.0, flip_y=True):
    if l is None:
        l = -r
    if t is None:
        t = r
    if b is None:
        b = -t
    o = torch.zeros([4,4],device=device)
    o[0,0] = 2/(r-l)
    o[0,3] = -(r+l)/(r-l)
    o[1,1] = 2/(t-b) * (-1 if flip_y else 1)
    o[1,3] = -(t+b)/(t-b)
    o[2,2] = -2/(f-n)
    o[2,3] = -(f+n)/(f-n)
    o[3,3] = 1
    return o #4,4

def make_star_cameras(az_count,pol_count,distance:float=10.,r=None,image_size=[512,512],device='cuda'):
    if r is None:
        r = 1/distance
    A = az_count
    P = pol_count
    C = A * P

    phi = torch.arange(0,A) * (2*torch.pi/A)
    phi_rot = torch.eye(3,device=device)[None,None].expand(A,1,3,3).clone()
    phi_rot[:,0,2,2] = phi.cos()
    phi_rot[:,0,2,0] = -phi.sin()
    phi_rot[:,0,0,2] = phi.sin()
    phi_rot[:,0,0,0] = phi.cos()

    theta = torch.arange(1,P+1) * (torch.pi/(P+1)) - torch.pi/2
    theta_rot = torch.eye(3,device=device)[None,None].expand(1,P,3,3).clone()
    theta_rot[0,:,1,1] = theta.cos()
    theta_rot[0,:,1,2] = -theta.sin()
    theta_rot[0,:,2,1] = theta.sin()
    theta_rot[0,:,2,2] = theta.cos()

    mv = torch.empty((C,4,4), device=device)
    mv[:] = torch.eye(4, device=device)
    mv[:,:3,:3] = (theta_rot @ phi_rot).reshape(C,3,3)
    mv = _translation(0, 0, -distance, device) @ mv

    return mv, _projection(r,device)

def make_star_cameras_orthographic(az_count,pol_count,distance:float=10.,r=None,image_size=[512,512],device='cuda'):
    mv, _ = make_star_cameras(az_count,pol_count,distance,r,image_size,device)
    if r is None:
        r = 1
    return mv, _orthographic(r,device)

def make_sphere(level:int=2,radius=1.,device='cuda') -> Tuple[torch.Tensor,torch.Tensor]:
    sphere = trimesh.creation.icosphere(subdivisions=level, radius=1.0, color=None)
    vertices = torch.tensor(sphere.vertices, device=device, dtype=torch.float32) * radius
    faces = torch.tensor(sphere.faces, device=device, dtype=torch.long)
    return vertices,faces

from pytorch3d.renderer import (
    FoVOrthographicCameras,
    look_at_view_transform,
)

def get_camera(R, T, focal_length=1 / (2**0.5)):
    focal_length = 1 / focal_length
    camera = FoVOrthographicCameras(device=R.device, R=R, T=T, min_x=-focal_length, max_x=focal_length, min_y=-focal_length, max_y=focal_length)
    return camera

def make_star_cameras_orthographic_py3d(azim_list, device, focal=2/1.35, dist=1.1):
    R, T = look_at_view_transform(dist, 0, azim_list)
    focal_length = 1 / focal
    return FoVOrthographicCameras(device=R.device, R=R, T=T, min_x=-focal_length, max_x=focal_length, min_y=-focal_length, max_y=focal_length).to(device)
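A sketch of how these camera helpers are used by the reconstruction code later in this commit (four azimuths, one polar ring, orthographic projection; assumes a CUDA device, since `device='cuda'` is the default):

# hypothetical usage sketch
from mesh_reconstruction.func import make_star_cameras_orthographic, make_sphere

mv, proj = make_star_cameras_orthographic(4, 1)      # mv: (4,4,4) view matrices, proj: (4,4) orthographic matrix
vertices, faces = make_sphere(level=2, radius=0.5)   # icosphere used as a simple initial mesh
print(mv.shape, proj.shape, vertices.shape, faces.shape)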
mesh_reconstruction/opt.py
ADDED
@@ -0,0 +1,190 @@
# modified from https://github.com/Profactor/continuous-remeshing
import time
import torch
import torch_scatter
from typing import Tuple
from mesh_reconstruction.remesh import calc_edge_length, calc_edges, calc_face_collapses, calc_face_normals, calc_vertex_normals, collapse_edges, flip_edges, pack, prepend_dummies, remove_dummies, split_edges

@torch.no_grad()
def remesh(
        vertices_etc:torch.Tensor, #V,D
        faces:torch.Tensor, #F,3 long
        min_edgelen:torch.Tensor, #V
        max_edgelen:torch.Tensor, #V
        flip:bool,
        max_vertices=1e6
        ):

    # dummies
    vertices_etc,faces = prepend_dummies(vertices_etc,faces)
    vertices = vertices_etc[:,:3] #V,3
    nan_tensor = torch.tensor([torch.nan],device=min_edgelen.device)
    min_edgelen = torch.concat((nan_tensor,min_edgelen))
    max_edgelen = torch.concat((nan_tensor,max_edgelen))

    # collapse
    edges,face_to_edge = calc_edges(faces) #E,2 F,3
    edge_length = calc_edge_length(vertices,edges) #E
    face_normals = calc_face_normals(vertices,faces,normalize=False) #F,3
    vertex_normals = calc_vertex_normals(vertices,faces,face_normals) #V,3
    face_collapse = calc_face_collapses(vertices,faces,edges,face_to_edge,edge_length,face_normals,vertex_normals,min_edgelen,area_ratio=0.5)
    shortness = (1 - edge_length / min_edgelen[edges].mean(dim=-1)).clamp_min_(0) #e[0,1] 0...ok, 1...edgelen=0
    priority = face_collapse.float() + shortness
    vertices_etc,faces = collapse_edges(vertices_etc,faces,edges,priority)

    # split
    if vertices.shape[0]<max_vertices:
        edges,face_to_edge = calc_edges(faces) #E,2 F,3
        vertices = vertices_etc[:,:3] #V,3
        edge_length = calc_edge_length(vertices,edges) #E
        splits = edge_length > max_edgelen[edges].mean(dim=-1)
        vertices_etc,faces = split_edges(vertices_etc,faces,edges,face_to_edge,splits,pack_faces=False)

    vertices_etc,faces = pack(vertices_etc,faces)
    vertices = vertices_etc[:,:3]

    if flip:
        edges,_,edge_to_face = calc_edges(faces,with_edge_to_face=True) #E,2 F,3
        flip_edges(vertices,faces,edges,edge_to_face,with_border=False)

    return remove_dummies(vertices_etc,faces)

def lerp_unbiased(a:torch.Tensor,b:torch.Tensor,weight:float,step:int):
    """lerp with adam's bias correction"""
    c_prev = 1-weight**(step-1)
    c = 1-weight**step
    a_weight = weight*c_prev/c
    b_weight = (1-weight)/c
    a.mul_(a_weight).add_(b, alpha=b_weight)


class MeshOptimizer:
    """Use this like a pytorch Optimizer, but after calling opt.step(), do vertices,faces = opt.remesh()."""

    def __init__(self,
            vertices:torch.Tensor, #V,3
            faces:torch.Tensor, #F,3
            lr=0.3, #learning rate
            betas=(0.8,0.8,0), #betas[0:2] are the same as in Adam, betas[2] may be used to time-smooth the relative velocity nu
            gammas=(0,0,0), #optional spatial smoothing for m1,m2,nu, values between 0 (no smoothing) and 1 (max. smoothing)
            nu_ref=0.3, #reference velocity for edge length controller
            edge_len_lims=(.01,.15), #smallest and largest allowed reference edge length
            edge_len_tol=.5, #edge length tolerance for split and collapse
            gain=.2, #gain value for edge length controller
            laplacian_weight=.02, #for laplacian smoothing/regularization
            ramp=1, #learning rate ramp, actual ramp width is ramp/(1-betas[0])
            grad_lim=10., #gradients are clipped to m1.abs()*grad_lim
            remesh_interval=1, #larger intervals are faster but with worse mesh quality
            local_edgelen=True, #set to False to use a global scalar reference edge length instead
            ):
        self._vertices = vertices
        self._faces = faces
        self._lr = lr
        self._betas = betas
        self._gammas = gammas
        self._nu_ref = nu_ref
        self._edge_len_lims = edge_len_lims
        self._edge_len_tol = edge_len_tol
        self._gain = gain
        self._laplacian_weight = laplacian_weight
        self._ramp = ramp
        self._grad_lim = grad_lim
        self._remesh_interval = remesh_interval
        self._local_edgelen = local_edgelen
        self._step = 0

        V = self._vertices.shape[0]
        # prepare continuous tensor for all vertex-based data
        self._vertices_etc = torch.zeros([V,9],device=vertices.device)
        self._split_vertices_etc()
        self.vertices.copy_(vertices) #initialize vertices
        self._vertices.requires_grad_()
        self._ref_len.fill_(edge_len_lims[1])

    @property
    def vertices(self):
        return self._vertices

    @property
    def faces(self):
        return self._faces

    def _split_vertices_etc(self):
        self._vertices = self._vertices_etc[:,:3]
        self._m2 = self._vertices_etc[:,3]
        self._nu = self._vertices_etc[:,4]
        self._m1 = self._vertices_etc[:,5:8]
        self._ref_len = self._vertices_etc[:,8]

        with_gammas = any(g!=0 for g in self._gammas)
        self._smooth = self._vertices_etc[:,:8] if with_gammas else self._vertices_etc[:,:3]

    def zero_grad(self):
        self._vertices.grad = None

    @torch.no_grad()
    def step(self):

        eps = 1e-8

        self._step += 1

        # spatial smoothing
        edges,_ = calc_edges(self._faces) #E,2
        E = edges.shape[0]
        edge_smooth = self._smooth[edges] #E,2,S
        neighbor_smooth = torch.zeros_like(self._smooth) #V,S
        torch_scatter.scatter_mean(src=edge_smooth.flip(dims=[1]).reshape(E*2,-1),index=edges.reshape(E*2,1),dim=0,out=neighbor_smooth)

        #apply optional smoothing of m1,m2,nu
        if self._gammas[0]:
            self._m1.lerp_(neighbor_smooth[:,5:8],self._gammas[0])
        if self._gammas[1]:
            self._m2.lerp_(neighbor_smooth[:,3],self._gammas[1])
        if self._gammas[2]:
            self._nu.lerp_(neighbor_smooth[:,4],self._gammas[2])

        #add laplace smoothing to gradients
        laplace = self._vertices - neighbor_smooth[:,:3]
        grad = torch.addcmul(self._vertices.grad, laplace, self._nu[:,None], value=self._laplacian_weight)

        #gradient clipping
        if self._step>1:
            grad_lim = self._m1.abs().mul_(self._grad_lim)
            grad.clamp_(min=-grad_lim,max=grad_lim)

        # moment updates
        lerp_unbiased(self._m1, grad, self._betas[0], self._step)
        lerp_unbiased(self._m2, (grad**2).sum(dim=-1), self._betas[1], self._step)

        velocity = self._m1 / self._m2[:,None].sqrt().add_(eps) #V,3
        speed = velocity.norm(dim=-1) #V

        if self._betas[2]:
            lerp_unbiased(self._nu,speed,self._betas[2],self._step) #V
        else:
            self._nu.copy_(speed) #V

        # update vertices
        ramped_lr = self._lr * min(1,self._step * (1-self._betas[0]) / self._ramp)
        self._vertices.add_(velocity * self._ref_len[:,None], alpha=-ramped_lr)

        # update target edge length
        if self._step % self._remesh_interval == 0:
            if self._local_edgelen:
                len_change = (1 + (self._nu - self._nu_ref) * self._gain)
            else:
                len_change = (1 + (self._nu.mean() - self._nu_ref) * self._gain)
            self._ref_len *= len_change
            self._ref_len.clamp_(*self._edge_len_lims)

    def remesh(self, flip:bool=True, poisson=False)->Tuple[torch.Tensor,torch.Tensor]:
        min_edge_len = self._ref_len * (1 - self._edge_len_tol)
        max_edge_len = self._ref_len * (1 + self._edge_len_tol)

        self._vertices_etc,self._faces = remesh(self._vertices_etc,self._faces,min_edge_len,max_edge_len,flip, max_vertices=1e6)

        self._split_vertices_etc()
        self._vertices.requires_grad_()

        return self._vertices, self._faces
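As the `MeshOptimizer` docstring says, it is driven like a torch optimizer plus a remesh call after each step; a minimal sketch with a stand-in objective (the real losses live in recon.py and refine.py below):

# hypothetical usage sketch
from mesh_reconstruction.func import make_sphere
from mesh_reconstruction.opt import MeshOptimizer

vertices, faces = make_sphere(level=2, radius=1.0)
opt = MeshOptimizer(vertices, faces, edge_len_lims=(0.01, 0.15))
vertices = opt.vertices
for _ in range(50):
    opt.zero_grad()
    loss = (vertices ** 2).sum(dim=-1).mean()   # placeholder loss: pull the vertices towards the origin
    loss.backward()
    opt.step()
    vertices, faces = opt.remesh()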
mesh_reconstruction/recon.py
ADDED
@@ -0,0 +1,59 @@
from tqdm import tqdm
from PIL import Image
import numpy as np
import torch
from typing import List
from mesh_reconstruction.remesh import calc_vertex_normals
from mesh_reconstruction.opt import MeshOptimizer
from mesh_reconstruction.func import make_star_cameras_orthographic, make_star_cameras_orthographic_py3d
from mesh_reconstruction.render import NormalsRenderer, Pytorch3DNormalsRenderer
from scripts.utils import to_py3d_mesh, init_target

def reconstruct_stage1(pils: List[Image.Image], steps=100, vertices=None, faces=None, start_edge_len=0.15, end_edge_len=0.005, decay=0.995, return_mesh=True, loss_expansion_weight=0.1, gain=0.1):
    vertices, faces = vertices.to("cuda"), faces.to("cuda")
    assert len(pils) == 4
    mv,proj = make_star_cameras_orthographic(4, 1)
    renderer = NormalsRenderer(mv,proj,list(pils[0].size))
    # cameras = make_star_cameras_orthographic_py3d([0, 270, 180, 90], device="cuda", focal=1., dist=4.0)
    # renderer = Pytorch3DNormalsRenderer(cameras, list(pils[0].size), device="cuda")

    target_images = init_target(pils, new_bkgd=(0., 0., 0.)) # 4s
    # 1. no rotate
    target_images = target_images[[0, 3, 2, 1]]

    # 2. init from coarse mesh
    opt = MeshOptimizer(vertices,faces, local_edgelen=False, gain=gain, edge_len_lims=(end_edge_len, start_edge_len))

    vertices = opt.vertices

    mask = target_images[..., -1] < 0.5

    for i in tqdm(range(steps)):
        opt.zero_grad()
        opt._lr *= decay
        normals = calc_vertex_normals(vertices,faces)
        images = renderer.render(vertices,normals,faces)

        loss_expand = 0.5 * ((vertices+normals).detach() - vertices).pow(2).mean()

        t_mask = images[..., -1] > 0.5
        loss_target_l2 = (images[t_mask] - target_images[t_mask]).abs().pow(2).mean()
        loss_alpha_target_mask_l2 = (images[..., -1][mask] - target_images[..., -1][mask]).pow(2).mean()

        loss = loss_target_l2 + loss_alpha_target_mask_l2 + loss_expand * loss_expansion_weight

        # out of box
        loss_oob = (vertices.abs() > 0.99).float().mean() * 10
        loss = loss + loss_oob

        loss.backward()
        opt.step()

        vertices,faces = opt.remesh(poisson=False)

    vertices, faces = vertices.detach().cpu(), faces.detach().cpu()

    if return_mesh:
        return to_py3d_mesh(vertices, faces)
    else:
        return vertices, faces
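A sketch of invoking the coarse stage directly (the four views and their paths are placeholders, and the sphere initialization is an assumption; in the full pipeline both come from earlier steps):

# hypothetical usage sketch
from PIL import Image
from mesh_reconstruction.recon import reconstruct_stage1
from mesh_reconstruction.func import make_sphere

pils = [Image.open(f"view_{i}.png") for i in range(4)]   # exactly four views (placeholder paths)
vertices, faces = make_sphere(level=2, radius=0.5)       # stand-in for the coarse initial mesh
mesh = reconstruct_stage1(pils, steps=100, vertices=vertices, faces=faces, return_mesh=True)
# returns a pytorch3d Meshes object when return_mesh=True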
mesh_reconstruction/refine.py
ADDED
@@ -0,0 +1,80 @@
from tqdm import tqdm
from PIL import Image
import torch
from typing import List
from mesh_reconstruction.remesh import calc_vertex_normals
from mesh_reconstruction.opt import MeshOptimizer
from mesh_reconstruction.func import make_star_cameras_orthographic, make_star_cameras_orthographic_py3d
from mesh_reconstruction.render import NormalsRenderer, Pytorch3DNormalsRenderer
from scripts.project_mesh import multiview_color_projection, get_cameras_list
from scripts.utils import to_py3d_mesh, from_py3d_mesh, init_target

def run_mesh_refine(vertices, faces, pils: List[Image.Image], steps=100, start_edge_len=0.02, end_edge_len=0.005, decay=0.99, update_normal_interval=10, update_warmup=10, return_mesh=True, process_inputs=True, process_outputs=True):
    vertices, faces = vertices.to("cuda"), faces.to("cuda")
    if process_inputs:
        vertices = vertices * 2 / 1.35
        vertices[..., [0, 2]] = - vertices[..., [0, 2]]

    poission_steps = []

    assert len(pils) == 4
    mv,proj = make_star_cameras_orthographic(4, 1)
    renderer = NormalsRenderer(mv,proj,list(pils[0].size))
    # cameras = make_star_cameras_orthographic_py3d([0, 270, 180, 90], device="cuda", focal=1., dist=4.0)
    # renderer = Pytorch3DNormalsRenderer(cameras, list(pils[0].size), device="cuda")

    target_images = init_target(pils, new_bkgd=(0., 0., 0.)) # 4s
    # 1. no rotate
    target_images = target_images[[0, 3, 2, 1]]

    # 2. init from coarse mesh
    opt = MeshOptimizer(vertices,faces, ramp=5, edge_len_lims=(end_edge_len, start_edge_len), local_edgelen=False, laplacian_weight=0.02)

    vertices = opt.vertices
    alpha_init = None

    mask = target_images[..., -1] < 0.5

    for i in tqdm(range(steps)):
        opt.zero_grad()
        opt._lr *= decay
        normals = calc_vertex_normals(vertices,faces)
        images = renderer.render(vertices,normals,faces)
        if alpha_init is None:
            alpha_init = images.detach()

        if i < update_warmup or i % update_normal_interval == 0:
            with torch.no_grad():
                py3d_mesh = to_py3d_mesh(vertices, faces, normals)
                cameras = get_cameras_list(azim_list = [0, 90, 180, 270], device=vertices.device, focal=1.)
                _, _, target_normal = from_py3d_mesh(multiview_color_projection(py3d_mesh, pils, cameras_list=cameras, weights=[2.0, 0.8, 1.0, 0.8], confidence_threshold=0.1, complete_unseen=False, below_confidence_strategy='original', reweight_with_cosangle='linear'))
                target_normal = target_normal * 2 - 1
                target_normal = torch.nn.functional.normalize(target_normal, dim=-1)
                debug_images = renderer.render(vertices,target_normal,faces)

        d_mask = images[..., -1] > 0.5
        loss_debug_l2 = (images[..., :3][d_mask] - debug_images[..., :3][d_mask]).pow(2).mean()

        loss_alpha_target_mask_l2 = (images[..., -1][mask] - target_images[..., -1][mask]).pow(2).mean()

        loss = loss_debug_l2 + loss_alpha_target_mask_l2

        # out of box
        loss_oob = (vertices.abs() > 0.99).float().mean() * 10
        loss = loss + loss_oob

        loss.backward()
        opt.step()

        vertices,faces = opt.remesh(poisson=(i in poission_steps))

    vertices, faces = vertices.detach().cpu(), faces.detach().cpu()

    if process_outputs:
        vertices = vertices / 2 * 1.35
        vertices[..., [0, 2]] = - vertices[..., [0, 2]]

    if return_mesh:
        return to_py3d_mesh(vertices, faces)
    else:
        return vertices, faces
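And a hedged sketch of the refinement stage on its own (placeholder inputs; `process_inputs`/`process_outputs` apply and undo the scale-and-flip between the exported mesh convention and the renderer's convention):

# hypothetical usage sketch
from PIL import Image
from mesh_reconstruction.refine import run_mesh_refine
from mesh_reconstruction.func import make_sphere

pils = [Image.open(f"view_{i}.png") for i in range(4)]          # four target views (placeholder paths)
coarse_vertices, coarse_faces = make_sphere(level=3, radius=0.5)  # stand-in for the stage-1 output tensors
refined = run_mesh_refine(coarse_vertices, coarse_faces, pils, steps=100, return_mesh=True)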
mesh_reconstruction/remesh.py
ADDED
@@ -0,0 +1,361 @@
# modified from https://github.com/Profactor/continuous-remeshing
import torch
import torch.nn.functional as tfunc
import torch_scatter
from typing import Tuple

def prepend_dummies(
        vertices:torch.Tensor, #V,D
        faces:torch.Tensor, #F,3 long
        )->Tuple[torch.Tensor,torch.Tensor]:
    """prepend dummy elements to vertices and faces to enable "masked" scatter operations"""
    V,D = vertices.shape
    vertices = torch.concat((torch.full((1,D),fill_value=torch.nan,device=vertices.device),vertices),dim=0)
    faces = torch.concat((torch.zeros((1,3),dtype=torch.long,device=faces.device),faces+1),dim=0)
    return vertices,faces

def remove_dummies(
        vertices:torch.Tensor, #V,D - first vertex all nan and unreferenced
        faces:torch.Tensor, #F,3 long - first face all zeros
        )->Tuple[torch.Tensor,torch.Tensor]:
    """remove dummy elements added with prepend_dummies()"""
    return vertices[1:],faces[1:]-1


def calc_edges(
        faces: torch.Tensor,  # F,3 long - first face may be dummy with all zeros
        with_edge_to_face: bool = False
        ) -> Tuple[torch.Tensor, ...]:
    """
    returns Tuple of
    - edges E,2 long, 0 for unused, lower vertex index first
    - face_to_edge F,3 long
    - (optional) edge_to_face shape=E,[left,right],[face,side]

    o-<-----e1     e0,e1...edge, e0<e1
    |      /A      L,R....left and right face
    | L   /  |     both triangles ordered counter clockwise
    |    / R |     normals pointing out of screen
    V/       |
    e0---->-o
    """

    F = faces.shape[0]

    # make full edges, lower vertex index first
    face_edges = torch.stack((faces,faces.roll(-1,1)),dim=-1) #F*3,3,2
    full_edges = face_edges.reshape(F*3,2)
    sorted_edges,_ = full_edges.sort(dim=-1) #F*3,2

    # make unique edges
    edges,full_to_unique = torch.unique(input=sorted_edges,sorted=True,return_inverse=True,dim=0) #(E,2),(F*3)
    E = edges.shape[0]
    face_to_edge = full_to_unique.reshape(F,3) #F,3

    if not with_edge_to_face:
        return edges, face_to_edge

    is_right = full_edges[:,0]!=sorted_edges[:,0] #F*3
    edge_to_face = torch.zeros((E,2,2),dtype=torch.long,device=faces.device) #E,LR=2,S=2
    scatter_src = torch.cartesian_prod(torch.arange(0,F,device=faces.device),torch.arange(0,3,device=faces.device)) #F*3,2
    edge_to_face.reshape(2*E,2).scatter_(dim=0,index=(2*full_to_unique+is_right)[:,None].expand(F*3,2),src=scatter_src) #E,LR=2,S=2
    edge_to_face[0] = 0
    return edges, face_to_edge, edge_to_face

def calc_edge_length(
        vertices:torch.Tensor, #V,3 first may be dummy
        edges:torch.Tensor, #E,2 long, lower vertex index first, (0,0) for unused
        )->torch.Tensor: #E

    full_vertices = vertices[edges] #E,2,3
    a,b = full_vertices.unbind(dim=1) #E,3
    return torch.norm(a-b,p=2,dim=-1)

def calc_face_normals(
        vertices:torch.Tensor, #V,3 first vertex may be unreferenced
        faces:torch.Tensor, #F,3 long, first face may be all zero
        normalize:bool=False,
        )->torch.Tensor: #F,3
    """
         n
         |
         c0     corners ordered counterclockwise when
        /  \    looking onto surface (in neg normal direction)
      c1----c2
    """
    full_vertices = vertices[faces] #F,C=3,3
    v0,v1,v2 = full_vertices.unbind(dim=1) #F,3
    face_normals = torch.cross(v1-v0,v2-v0, dim=1) #F,3
    if normalize:
        face_normals = tfunc.normalize(face_normals, eps=1e-6, dim=1)
    return face_normals #F,3

def calc_vertex_normals(
        vertices:torch.Tensor, #V,3 first vertex may be unreferenced
        faces:torch.Tensor, #F,3 long, first face may be all zero
        face_normals:torch.Tensor=None, #F,3, not normalized
        )->torch.Tensor: #F,3

    F = faces.shape[0]

    if face_normals is None:
        face_normals = calc_face_normals(vertices,faces)

    vertex_normals = torch.zeros((vertices.shape[0],3,3),dtype=vertices.dtype,device=vertices.device) #V,C=3,3
    vertex_normals.scatter_add_(dim=0,index=faces[:,:,None].expand(F,3,3),src=face_normals[:,None,:].expand(F,3,3))
    vertex_normals = vertex_normals.sum(dim=1) #V,3
    return tfunc.normalize(vertex_normals, eps=1e-6, dim=1)

def calc_face_ref_normals(
        faces:torch.Tensor, #F,3 long, 0 for unused
        vertex_normals:torch.Tensor, #V,3 first unused
        normalize:bool=False,
        )->torch.Tensor: #F,3
    """calculate reference normals for face flip detection"""
    full_normals = vertex_normals[faces] #F,C=3,3
    ref_normals = full_normals.sum(dim=1) #F,3
    if normalize:
        ref_normals = tfunc.normalize(ref_normals, eps=1e-6, dim=1)
    return ref_normals

def pack(
        vertices:torch.Tensor, #V,3 first unused and nan
        faces:torch.Tensor, #F,3 long, 0 for unused
        )->Tuple[torch.Tensor,torch.Tensor]: #(vertices,faces), keeps first vertex unused
    """removes unused elements in vertices and faces"""
    V = vertices.shape[0]

    # remove unused faces
    used_faces = faces[:,0]!=0
    used_faces[0] = True
    faces = faces[used_faces] #sync

    # remove unused vertices
    used_vertices = torch.zeros(V,3,dtype=torch.bool,device=vertices.device)
    used_vertices.scatter_(dim=0,index=faces,value=True,reduce='add')
    used_vertices = used_vertices.any(dim=1)
    used_vertices[0] = True
    vertices = vertices[used_vertices] #sync

    # update used faces
    ind = torch.zeros(V,dtype=torch.long,device=vertices.device)
    V1 = used_vertices.sum()
    ind[used_vertices] = torch.arange(0,V1,device=vertices.device) #sync
    faces = ind[faces]

    return vertices,faces

def split_edges(
        vertices:torch.Tensor, #V,3 first unused
        faces:torch.Tensor, #F,3 long, 0 for unused
        edges:torch.Tensor, #E,2 long 0 for unused, lower vertex index first
        face_to_edge:torch.Tensor, #F,3 long 0 for unused
        splits, #E bool
        pack_faces:bool=True,
        )->Tuple[torch.Tensor,torch.Tensor]: #(vertices,faces)

    #   c2                    c2               c...corners = faces
    #    . .                   . .             s...side_vert, 0 means no split
    #    .   .                 .N2 .           S...shrunk_face
    #    .     .               .     .         Ni...new_faces
    #   s2      s1           s2|c2...s1|c1
    #    .        .            .     .  .
    #    .          .          . S .      .
    #    .            .        . .     N1  .
    #   c0...(s0=0)....c1    s0|c0...........c1
    #
    # pseudo-code:
    #   S = [s0|c0,s1|c1,s2|c2]        example:[c0,s1,s2]
    #   split = side_vert!=0           example:[False,True,True]
    #   N0 = split[0]*[c0,s0,s2|c2]    example:[0,0,0]
    #   N1 = split[1]*[c1,s1,s0|c0]    example:[c1,s1,c0]
    #   N2 = split[2]*[c2,s2,s1|c1]    example:[c2,s2,s1]

    V = vertices.shape[0]
    F = faces.shape[0]
    S = splits.sum().item() #sync

    if S==0:
        return vertices,faces

    edge_vert = torch.zeros_like(splits, dtype=torch.long) #E
    edge_vert[splits] = torch.arange(V,V+S,dtype=torch.long,device=vertices.device) #E 0 for no split, sync
    side_vert = edge_vert[face_to_edge] #F,3 long, 0 for no split
    split_edges = edges[splits] #S sync

    #vertices
    split_vertices = vertices[split_edges].mean(dim=1) #S,3
    vertices = torch.concat((vertices,split_vertices),dim=0)

    #faces
    side_split = side_vert!=0 #F,3
    shrunk_faces = torch.where(side_split,side_vert,faces) #F,3 long, 0 for no split
    new_faces = side_split[:,:,None] * torch.stack((faces,side_vert,shrunk_faces.roll(1,dims=-1)),dim=-1) #F,N=3,C=3
    faces = torch.concat((shrunk_faces,new_faces.reshape(F*3,3))) #4F,3
    if pack_faces:
        mask = faces[:,0]!=0
        mask[0] = True
        faces = faces[mask] #F',3 sync

    return vertices,faces

def collapse_edges(
        vertices:torch.Tensor, #V,3 first unused
        faces:torch.Tensor, #F,3 long 0 for unused
        edges:torch.Tensor, #E,2 long 0 for unused, lower vertex index first
        priorities:torch.Tensor, #E float
        stable:bool=False, #only for unit testing
        )->Tuple[torch.Tensor,torch.Tensor]: #(vertices,faces)

    V = vertices.shape[0]

    # check spacing
    _,order = priorities.sort(stable=stable) #E
    rank = torch.zeros_like(order)
    rank[order] = torch.arange(0,len(rank),device=rank.device)
    vert_rank = torch.zeros(V,dtype=torch.long,device=vertices.device) #V
    edge_rank = rank #E
    for i in range(3):
        torch_scatter.scatter_max(src=edge_rank[:,None].expand(-1,2).reshape(-1),index=edges.reshape(-1),dim=0,out=vert_rank)
        edge_rank,_ = vert_rank[edges].max(dim=-1) #E
    candidates = edges[(edge_rank==rank).logical_and_(priorities>0)] #E',2

    # check connectivity
    vert_connections = torch.zeros(V,dtype=torch.long,device=vertices.device) #V
    vert_connections[candidates[:,0]] = 1 #start
    edge_connections = vert_connections[edges].sum(dim=-1) #E, edge connected to start
    vert_connections.scatter_add_(dim=0,index=edges.reshape(-1),src=edge_connections[:,None].expand(-1,2).reshape(-1))# one edge from start
    vert_connections[candidates] = 0 #clear start and end
    edge_connections = vert_connections[edges].sum(dim=-1) #E, one or two edges from start
    vert_connections.scatter_add_(dim=0,index=edges.reshape(-1),src=edge_connections[:,None].expand(-1,2).reshape(-1)) #one or two edges from start
    collapses = candidates[vert_connections[candidates[:,1]] <= 2] # E" not more than two connections between start and end

    # mean vertices
    vertices[collapses[:,0]] = vertices[collapses].mean(dim=1)

    # update faces
    dest = torch.arange(0,V,dtype=torch.long,device=vertices.device) #V
    dest[collapses[:,1]] = dest[collapses[:,0]]
    faces = dest[faces] #F,3
    c0,c1,c2 = faces.unbind(dim=-1)
    collapsed = (c0==c1).logical_or_(c1==c2).logical_or_(c0==c2)
    faces[collapsed] = 0

    return vertices,faces

def calc_face_collapses(
        vertices:torch.Tensor, #V,3 first unused
        faces:torch.Tensor, #F,3 long, 0 for unused
        edges:torch.Tensor, #E,2 long 0 for unused, lower vertex index first
        face_to_edge:torch.Tensor, #F,3 long 0 for unused
        edge_length:torch.Tensor, #E
        face_normals:torch.Tensor, #F,3
        vertex_normals:torch.Tensor, #V,3 first unused
        min_edge_length:torch.Tensor=None, #V
        area_ratio = 0.5, #collapse if area < min_edge_length**2 * area_ratio
        shortest_probability = 0.8
        )->torch.Tensor: #E edges to collapse

    E = edges.shape[0]
    F = faces.shape[0]

    # face flips
    ref_normals = calc_face_ref_normals(faces,vertex_normals,normalize=False) #F,3
    face_collapses = (face_normals*ref_normals).sum(dim=-1)<0 #F

    # small faces
    if min_edge_length is not None:
        min_face_length = min_edge_length[faces].mean(dim=-1) #F
        min_area = min_face_length**2 * area_ratio #F
        face_collapses.logical_or_(face_normals.norm(dim=-1) < min_area*2) #F
        face_collapses[0] = False

    # faces to edges
    face_length = edge_length[face_to_edge] #F,3

    if shortest_probability<1:
        #select shortest edge with shortest_probability chance
        randlim = round(2/(1-shortest_probability))
        rand_ind = torch.randint(0,randlim,size=(F,),device=faces.device).clamp_max_(2) #selected edge local index in face
        sort_ind = torch.argsort(face_length,dim=-1,descending=True) #F,3
        local_ind = sort_ind.gather(dim=-1,index=rand_ind[:,None])
    else:
        local_ind = torch.argmin(face_length,dim=-1)[:,None] #F,1 0...2 shortest edge local index in face

    edge_ind = face_to_edge.gather(dim=1,index=local_ind)[:,0] #F 0...E selected edge global index
    edge_collapses = torch.zeros(E,dtype=torch.long,device=vertices.device)
    edge_collapses.scatter_add_(dim=0,index=edge_ind,src=face_collapses.long())

    return edge_collapses.bool()

def flip_edges(
        vertices:torch.Tensor, #V,3 first unused
        faces:torch.Tensor, #F,3 long, first must be 0, 0 for unused
        edges:torch.Tensor, #E,2 long, first must be 0, 0 for unused, lower vertex index first
        edge_to_face:torch.Tensor, #E,[left,right],[face,side]
        with_border:bool=True, #handle border edges (D=4 instead of D=6)
        with_normal_check:bool=True, #check face normal flips
        stable:bool=False, #only for unit testing
        ):
    V = vertices.shape[0]
    E = edges.shape[0]
    device=vertices.device
    vertex_degree = torch.zeros(V,dtype=torch.long,device=device) #V long
    vertex_degree.scatter_(dim=0,index=edges.reshape(E*2),value=1,reduce='add')
    neighbor_corner = (edge_to_face[:,:,1] + 2) % 3 #go from side to corner
    neighbors = faces[edge_to_face[:,:,0],neighbor_corner] #E,LR=2
    edge_is_inside = neighbors.all(dim=-1) #E

    if with_border:
        # inside vertices should have D=6, border edges D=4, so we subtract 2 for all inside vertices
        # need to use float for masks in order to use scatter(reduce='multiply')
        vertex_is_inside = torch.ones(V,2,dtype=torch.float32,device=vertices.device) #V,2 float
        src = edge_is_inside.type(torch.float32)[:,None].expand(E,2) #E,2 float
        vertex_is_inside.scatter_(dim=0,index=edges,src=src,reduce='multiply')
        vertex_is_inside = vertex_is_inside.prod(dim=-1,dtype=torch.long) #V long
        vertex_degree -= 2 * vertex_is_inside #V long

    neighbor_degrees = vertex_degree[neighbors] #E,LR=2
    edge_degrees = vertex_degree[edges] #E,2
    #
    # loss = Sum_over_affected_vertices((new_degree-6)**2)
    # loss_change = Sum_over_neighbor_vertices((degree+1-6)**2-(degree-6)**2)
    #             + Sum_over_edge_vertices((degree-1-6)**2-(degree-6)**2)
    #             = 2 * (2 + Sum_over_neighbor_vertices(degree) - Sum_over_edge_vertices(degree))
    #
    loss_change = 2 + neighbor_degrees.sum(dim=-1) - edge_degrees.sum(dim=-1) #E
    candidates = torch.logical_and(loss_change<0, edge_is_inside) #E
    loss_change = loss_change[candidates] #E'
    if loss_change.shape[0]==0:
        return

    edges_neighbors = torch.concat((edges[candidates],neighbors[candidates]),dim=-1) #E',4
    _,order = loss_change.sort(descending=True, stable=stable) #E'
    rank = torch.zeros_like(order)
    rank[order] = torch.arange(0,len(rank),device=rank.device)
    vertex_rank = torch.zeros((V,4),dtype=torch.long,device=device) #V,4
    torch_scatter.scatter_max(src=rank[:,None].expand(-1,4),index=edges_neighbors,dim=0,out=vertex_rank)
    vertex_rank,_ = vertex_rank.max(dim=-1) #V
    neighborhood_rank,_ = vertex_rank[edges_neighbors].max(dim=-1) #E'
    flip = rank==neighborhood_rank #E'

    if with_normal_check:
        #  cl-<-----e1     e0,e1...edge, e0<e1
        #   |      /A      L,R....left and right face
        #   | L   /  |     both triangles ordered counter clockwise
        #   |    / R |     normals pointing out of screen
|
347 |
+
# V/ |
|
348 |
+
# e0---->-cr
|
349 |
+
v = vertices[edges_neighbors] #E",4,3
|
350 |
+
v = v - v[:,0:1] #make relative to e0
|
351 |
+
e1 = v[:,1]
|
352 |
+
cl = v[:,2]
|
353 |
+
cr = v[:,3]
|
354 |
+
n = torch.cross(e1,cl) + torch.cross(cr,e1) #sum of old normal vectors
|
355 |
+
flip.logical_and_(torch.sum(n*torch.cross(cr,cl),dim=-1)>0) #first new face
|
356 |
+
flip.logical_and_(torch.sum(n*torch.cross(cl-e1,cr-e1),dim=-1)>0) #second new face
|
357 |
+
|
358 |
+
flip_edges_neighbors = edges_neighbors[flip] #E",4
|
359 |
+
flip_edge_to_face = edge_to_face[candidates,:,0][flip] #E",2
|
360 |
+
flip_faces = flip_edges_neighbors[:,[[0,3,2],[1,2,3]]] #E",2,3
|
361 |
+
faces.scatter_(dim=0,index=flip_edge_to_face.reshape(-1,1).expand(-1,3),src=flip_faces.reshape(-1,3))
|
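The two routines above are meant to be driven by an outer remeshing loop. For orientation only (this note and sketch are not part of the uploaded file), the snippet below shows one way they could be chained; the helpers calc_edges, calc_edge_length, calc_face_normals, calc_vertex_normals and collapse_edges and their signatures are assumptions taken from the earlier part of this file and the upstream continuous-remeshing code, so treat it as an illustrative sketch rather than the project's actual loop (see mesh_reconstruction/opt.py for that).

# Illustrative sketch only; every helper call below is an assumption, not verified API.
def one_remesh_pass(vertices, faces):
    edges, face_to_edge, edge_to_face = calc_edges(faces, with_edge_to_face=True)  # assumed helper
    edge_length = calc_edge_length(vertices, edges)                                # assumed helper
    face_normals = calc_face_normals(vertices, faces, normalize=True)              # assumed helper
    vertex_normals = calc_vertex_normals(vertices, faces, face_normals)            # assumed helper

    # boolean mask of edges whose adjacent faces are flipped or degenerate
    collapse_mask = calc_face_collapses(vertices, faces, edges, face_to_edge,
                                        edge_length, face_normals, vertex_normals)
    # collapse the flagged edges (a full implementation would also weight priorities by edge length)
    vertices, faces = collapse_edges(vertices, faces, edges, collapse_mask.float())  # assumed helper

    # recompute connectivity, then flip edges to push vertex degrees toward 6
    edges, face_to_edge, edge_to_face = calc_edges(faces, with_edge_to_face=True)    # assumed helper
    flip_edges(vertices, faces, edges, edge_to_face)
    return vertices, faces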
mesh_reconstruction/render.py
ADDED
@@ -0,0 +1,159 @@
# modified from https://github.com/Profactor/continuous-remeshing
import nvdiffrast.torch as dr
import torch
from typing import Tuple

def _warmup(glctx, device=None):
    device = 'cuda' if device is None else device
    #windows workaround for https://github.com/NVlabs/nvdiffrast/issues/59
    def tensor(*args, **kwargs):
        return torch.tensor(*args, device=device, **kwargs)
    pos = tensor([[[-0.8, -0.8, 0, 1], [0.8, -0.8, 0, 1], [-0.8, 0.8, 0, 1]]], dtype=torch.float32)
    tri = tensor([[0, 1, 2]], dtype=torch.int32)
    dr.rasterize(glctx, pos, tri, resolution=[256, 256])

class NormalsRenderer:

    _glctx:dr.RasterizeCudaContext = None

    def __init__(
            self,
            mv: torch.Tensor, #C,4,4
            proj: torch.Tensor, #C,4,4
            image_size: Tuple[int,int],
            mvp = None,
            device=None,
            ):
        if mvp is None:
            self._mvp = proj @ mv #C,4,4
        else:
            self._mvp = mvp
        self._image_size = image_size
        self._glctx = dr.RasterizeCudaContext(device=device)
        _warmup(self._glctx, device)

    def render(self,
            vertices: torch.Tensor, #V,3 float
            normals: torch.Tensor, #V,3 float in [-1, 1]
            faces: torch.Tensor, #F,3 long
            ) ->torch.Tensor: #C,H,W,4

        V = vertices.shape[0]
        faces = faces.type(torch.int32)
        vert_hom = torch.cat((vertices, torch.ones(V,1,device=vertices.device)),axis=-1) #V,3 -> V,4
        vertices_clip = vert_hom @ self._mvp.transpose(-2,-1) #C,V,4
        rast_out,_ = dr.rasterize(self._glctx, vertices_clip, faces, resolution=self._image_size, grad_db=False) #C,H,W,4
        vert_col = (normals+1)/2 #V,3
        col,_ = dr.interpolate(vert_col, rast_out, faces) #C,H,W,3
        alpha = torch.clamp(rast_out[..., -1:], max=1) #C,H,W,1
        col = torch.concat((col,alpha),dim=-1) #C,H,W,4
        col = dr.antialias(col, rast_out, vertices_clip, faces) #C,H,W,4
        return col #C,H,W,4

from pytorch3d.structures import Meshes
from pytorch3d.renderer.mesh.shader import ShaderBase
from pytorch3d.renderer import (
    RasterizationSettings,
    MeshRendererWithFragments,
    TexturesVertex,
    MeshRasterizer,
    BlendParams,
    FoVOrthographicCameras,
    look_at_view_transform,
    hard_rgb_blend,
)

class VertexColorShader(ShaderBase):
    def forward(self, fragments, meshes, **kwargs) -> torch.Tensor:
        blend_params = kwargs.get("blend_params", self.blend_params)
        texels = meshes.sample_textures(fragments)
        return hard_rgb_blend(texels, fragments, blend_params)

def render_mesh_vertex_color(mesh, cameras, H, W, blur_radius=0.0, faces_per_pixel=1, bkgd=(0., 0., 0.), dtype=torch.float32, device="cuda"):
    if len(mesh) != len(cameras):
        if len(cameras) % len(mesh) == 0:
            mesh = mesh.extend(len(cameras))
        else:
            raise NotImplementedError()

    # render requires everything in float16 or float32
    input_dtype = dtype
    blend_params = BlendParams(1e-4, 1e-4, bkgd)

    # Define the settings for rasterization and shading
    raster_settings = RasterizationSettings(
        image_size=(H, W),
        blur_radius=blur_radius,
        faces_per_pixel=faces_per_pixel,
        clip_barycentric_coords=True,
        bin_size=None,
        max_faces_per_bin=500000,
    )

    # Create a renderer by composing a rasterizer and a shader
    # We simply render vertex colors through the custom VertexColorShader (no lighting or materials are used)
    renderer = MeshRendererWithFragments(
        rasterizer=MeshRasterizer(
            cameras=cameras,
            raster_settings=raster_settings
        ),
        shader=VertexColorShader(
            device=device,
            cameras=cameras,
            blend_params=blend_params
        )
    )

    # render RGB and depth, get mask
    with torch.autocast(dtype=input_dtype, device_type=torch.device(device).type):
        images, _ = renderer(mesh)
    return images # BHW4

class Pytorch3DNormalsRenderer:
    def __init__(self, cameras, image_size, device):
        self.cameras = cameras.to(device)
        self._image_size = image_size
        self.device = device

    def render(self,
               vertices: torch.Tensor, #V,3 float
               normals: torch.Tensor, #V,3 float in [-1, 1]
               faces: torch.Tensor, #F,3 long
               ) ->torch.Tensor: #C,H,W,4
        mesh = Meshes(verts=[vertices], faces=[faces], textures=TexturesVertex(verts_features=[(normals + 1) / 2])).to(self.device)
        return render_mesh_vertex_color(mesh, self.cameras, self._image_size[0], self._image_size[1], device=self.device)

def save_tensor_to_img(tensor, save_dir):
    from PIL import Image
    import numpy as np
    for idx, img in enumerate(tensor):
        img = img[..., :3].cpu().numpy()
        img = (img * 255).astype(np.uint8)
        img = Image.fromarray(img)
        img.save(save_dir + f"{idx}.png")

if __name__ == "__main__":
    import sys
    import os
    sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
    from mesh_reconstruction.func import make_star_cameras_orthographic, make_star_cameras_orthographic_py3d
    cameras = make_star_cameras_orthographic_py3d([0, 270, 180, 90], device="cuda", focal=1., dist=4.0)
    mv,proj = make_star_cameras_orthographic(4, 1)
    resolution = 1024
    renderer1 = NormalsRenderer(mv,proj, [resolution,resolution], device="cuda")
    renderer2 = Pytorch3DNormalsRenderer(cameras, [resolution,resolution], device="cuda")
    vertices = torch.tensor([[0,0,0],[0,0,1],[0,1,0],[1,0,0]], device="cuda", dtype=torch.float32)
    normals = torch.tensor([[-1,-1,-1],[1,-1,-1],[-1,-1,1],[-1,1,-1]], device="cuda", dtype=torch.float32)
    faces = torch.tensor([[0,1,2],[0,1,3],[0,2,3],[1,2,3]], device="cuda", dtype=torch.long)

    import time
    t0 = time.time()
    r1 = renderer1.render(vertices, normals, faces)
    print("time r1:", time.time() - t0)

    t0 = time.time()
    r2 = renderer2.render(vertices, normals, faces)
    print("time r2:", time.time() - t0)

    for i in range(4):
        print((r1[i]-r2[i]).abs().mean(), (r1[i]+r2[i]).abs().mean())
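As a usage illustration (not part of the uploaded file), render_mesh_vertex_color can also be driven by a standard PyTorch3D camera instead of the project's star-camera helpers. The geometry below is random and a CUDA device is assumed; the sketch only exercises the function's interface as defined above:

# Illustrative sketch only: random geometry, one orthographic view, CUDA assumed.
import torch
from pytorch3d.structures import Meshes
from pytorch3d.renderer import FoVOrthographicCameras, TexturesVertex, look_at_view_transform

R, T = look_at_view_transform(dist=4.0, elev=0, azim=0)              # a single view
cameras = FoVOrthographicCameras(R=R, T=T, device="cuda")
verts = torch.rand(100, 3, device="cuda") - 0.5                      # random point cloud in a unit box
faces = torch.randint(0, 100, (50, 3), device="cuda")                # random triangles over those points
colors = torch.rand(100, 3, device="cuda")                           # per-vertex RGB in [0, 1]
mesh = Meshes(verts=[verts], faces=[faces],
              textures=TexturesVertex(verts_features=[colors])).to("cuda")
images = render_mesh_vertex_color(mesh, cameras, H=256, W=256)       # (1, 256, 256, 4) RGBA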
package/nvdiffrast-0.3.1.torch-cp310-cp310-linux_x86_64.whl
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:ff4a35615ed42148c8579622bee6dca88f7f3be683671524a282fafaf7589682
size 3079614
package/onnxruntime_gpu-1.17.0-cp310-cp310-manylinux_2_28_x86_64.whl
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:11e7f7f781fef16c09ec8d03bfb6da84cf61c54fc59e8a4ea047a90c4a24e88f
size 162720703
scripts/all_typing.py
ADDED
@@ -0,0 +1,42 @@
# code from https://github.com/threestudio-project

"""
This module contains type annotations for the project, using
1. Python type hints (https://docs.python.org/3/library/typing.html) for Python objects
2. jaxtyping (https://github.com/google/jaxtyping/blob/main/API.md) for PyTorch tensors

Two types of typing checking can be used:
1. Static type checking with mypy (install with pip and enabled as the default linter in VSCode)
2. Runtime type checking with typeguard (install with pip and triggered at runtime, mainly for tensor dtype and shape checking)
"""

# Basic types
from typing import (
    Any,
    Callable,
    Dict,
    Iterable,
    List,
    Literal,
    NamedTuple,
    NewType,
    Optional,
    Sized,
    Tuple,
    Type,
    TypeVar,
    Union,
)

# Tensor dtype
# for jaxtyping usage, see https://github.com/google/jaxtyping/blob/main/API.md
from jaxtyping import Bool, Complex, Float, Inexact, Int, Integer, Num, Shaped, UInt

# Config type
from omegaconf import DictConfig

# PyTorch Tensor type
from torch import Tensor

# Runtime type checking decorator
from typeguard import typechecked as typechecker
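As an illustration (not part of the uploaded file), the names re-exported above are intended to be used like this, with jaxtyping expressing tensor shapes directly in the annotation:

# Illustrative sketch only, using the Float and Tensor names re-exported by this module.
def normalize(v: Float[Tensor, "*batch 3"]) -> Float[Tensor, "*batch 3"]:
    # unit-length vectors along the last dimension
    return v / v.norm(dim=-1, keepdim=True).clamp_min(1e-8)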
scripts/load_onnx.py
ADDED
@@ -0,0 +1,48 @@
import onnxruntime
import torch

providers = [
    # ('TensorrtExecutionProvider', {
    #     'device_id': 0,
    #     'trt_max_workspace_size': 8 * 1024 * 1024 * 1024,
    #     'trt_fp16_enable': True,
    #     'trt_engine_cache_enable': True,
    # }),
    ('CUDAExecutionProvider', {
        'device_id': 0,
        'arena_extend_strategy': 'kSameAsRequested',
        'gpu_mem_limit': 8 * 1024 * 1024 * 1024,
        'cudnn_conv_algo_search': 'HEURISTIC',
    })
]

def load_onnx(file_path: str):
    assert file_path.endswith(".onnx")
    sess_opt = onnxruntime.SessionOptions()
    ort_session = onnxruntime.InferenceSession(file_path, sess_opt=sess_opt, providers=providers)
    return ort_session


def load_onnx_caller(file_path: str, single_output=False):
    ort_session = load_onnx(file_path)
    def caller(*args):
        torch_input = isinstance(args[0], torch.Tensor)
        if torch_input:
            torch_input_dtype = args[0].dtype
            torch_input_device = args[0].device
            # check all are torch.Tensor and have same dtype and device
            assert all([isinstance(arg, torch.Tensor) for arg in args]), "All inputs should be torch.Tensor, if first input is torch.Tensor"
            assert all([arg.dtype == torch_input_dtype for arg in args]), "All inputs should have same dtype, if first input is torch.Tensor"
            assert all([arg.device == torch_input_device for arg in args]), "All inputs should have same device, if first input is torch.Tensor"
            args = [arg.cpu().float().numpy() for arg in args]

        ort_inputs = {ort_session.get_inputs()[idx].name: args[idx] for idx in range(len(args))}
        ort_outs = ort_session.run(None, ort_inputs)

        if torch_input:
            ort_outs = [torch.tensor(ort_out, dtype=torch_input_dtype, device=torch_input_device) for ort_out in ort_outs]

        if single_output:
            return ort_outs[0]
        return ort_outs
    return caller
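For context (not part of the uploaded file), a minimal usage sketch of load_onnx_caller; the model path is a placeholder and a CUDA float16 input is assumed:

# Illustrative sketch only: "some_model.onnx" is a placeholder path, not a real checkpoint.
import torch
caller = load_onnx_caller("some_model.onnx", single_output=True)
x = torch.rand(1, 3, 256, 256, device="cuda", dtype=torch.float16)
y = caller(x)  # inputs are moved to CPU float32 NumPy for ONNX Runtime;
               # the output comes back with x's dtype and device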
scripts/mesh_init.py
ADDED
@@ -0,0 +1,132 @@
from PIL import Image
import torch
import numpy as np
from pytorch3d.structures import Meshes
from pytorch3d.renderer import TexturesVertex
from scripts.utils import meshlab_mesh_to_py3dmesh, py3dmesh_to_meshlab_mesh
import pymeshlab

_MAX_THREAD = 8

# rgb and depth to mesh
def get_ortho_ray_directions_origins(W, H, use_pixel_centers=True, device="cuda"):
    pixel_center = 0.5 if use_pixel_centers else 0
    i, j = np.meshgrid(
        np.arange(W, dtype=np.float32) + pixel_center,
        np.arange(H, dtype=np.float32) + pixel_center,
        indexing='xy'
    )
    i, j = torch.from_numpy(i).to(device), torch.from_numpy(j).to(device)

    origins = torch.stack([(i/W-0.5)*2, (j/H-0.5)*2 * H / W, torch.zeros_like(i)], dim=-1) # W, H, 3
    directions = torch.stack([torch.zeros_like(i), torch.zeros_like(j), torch.ones_like(i)], dim=-1) # W, H, 3

    return origins, directions

def depth_and_color_to_mesh(rgb_BCHW, pred_HWC, valid_HWC=None, is_back=False):
    if valid_HWC is None:
        valid_HWC = torch.ones_like(pred_HWC).bool()
    H, W = rgb_BCHW.shape[-2:]
    rgb_BCHW = rgb_BCHW.flip(-2)
    pred_HWC = pred_HWC.flip(0)
    valid_HWC = valid_HWC.flip(0)
    rays_o, rays_d = get_ortho_ray_directions_origins(W, H, device=rgb_BCHW.device)
    verts = rays_o + rays_d * pred_HWC # [H, W, 3]
    verts = verts.reshape(-1, 3) # [V, 3]
    indexes = torch.arange(H * W).reshape(H, W).to(rgb_BCHW.device)
    faces1 = torch.stack([indexes[:-1, :-1], indexes[:-1, 1:], indexes[1:, :-1]], dim=-1)
    # faces1_valid = valid_HWC[:-1, :-1] | valid_HWC[:-1, 1:] | valid_HWC[1:, :-1]
    faces1_valid = valid_HWC[:-1, :-1] & valid_HWC[:-1, 1:] & valid_HWC[1:, :-1]
    faces2 = torch.stack([indexes[1:, 1:], indexes[1:, :-1], indexes[:-1, 1:]], dim=-1)
    # faces2_valid = valid_HWC[1:, 1:] | valid_HWC[1:, :-1] | valid_HWC[:-1, 1:]
    faces2_valid = valid_HWC[1:, 1:] & valid_HWC[1:, :-1] & valid_HWC[:-1, 1:]
    faces = torch.cat([faces1[faces1_valid.expand_as(faces1)].reshape(-1, 3), faces2[faces2_valid.expand_as(faces2)].reshape(-1, 3)], dim=0) # (F, 3)
    colors = (rgb_BCHW[0].permute((1,2,0)) / 2 + 0.5).reshape(-1, 3) # (V, 3)
    if is_back:
        verts = verts * torch.tensor([-1, 1, -1], dtype=verts.dtype, device=verts.device)

    used_verts = faces.unique()
    old_to_new_mapping = torch.zeros_like(verts[..., 0]).long()
    old_to_new_mapping[used_verts] = torch.arange(used_verts.shape[0], device=verts.device)
    new_faces = old_to_new_mapping[faces]
    mesh = Meshes(verts=[verts[used_verts]], faces=[new_faces], textures=TexturesVertex(verts_features=[colors[used_verts]]))
    return mesh

def normalmap_to_depthmap(normal_np):
    from scripts.normal_to_height_map import estimate_height_map
    height = estimate_height_map(normal_np, raw_values=True, thread_count=_MAX_THREAD, target_iteration_count=96)
    return height

def transform_back_normal_to_front(normal_pil):
    arr = np.array(normal_pil)  # in [0, 255]
    arr[..., 0] = 255-arr[..., 0]
    arr[..., 2] = 255-arr[..., 2]
    return Image.fromarray(arr.astype(np.uint8))

def calc_w_over_h(normal_pil):
    if isinstance(normal_pil, Image.Image):
        arr = np.array(normal_pil)
    else:
        assert isinstance(normal_pil, np.ndarray)
        arr = normal_pil
    if arr.shape[-1] == 4:
        alpha = arr[..., -1] / 255.
        alpha[alpha >= 0.5] = 1
        alpha[alpha < 0.5] = 0
    else:
        alpha = ~(arr.min(axis=-1) >= 250)
    h_min, w_min = np.min(np.where(alpha), axis=1)
    h_max, w_max = np.max(np.where(alpha), axis=1)
    return (w_max - w_min) / (h_max - h_min)

def build_mesh(normal_pil, rgb_pil, is_back=False, clamp_min=-1, scale=0.3, init_type="std", offset=0):
    if is_back:
        normal_pil = transform_back_normal_to_front(normal_pil)
    normal_img = np.array(normal_pil)
    rgb_img = np.array(rgb_pil)
    if normal_img.shape[-1] == 4:
        valid_HWC = normal_img[..., [3]] / 255
    elif rgb_img.shape[-1] == 4:
        valid_HWC = rgb_img[..., [3]] / 255
    else:
        raise ValueError("invalid input, either normal or rgb should have alpha channel")

    real_height_pix = np.max(np.where(valid_HWC>0.5)[0]) - np.min(np.where(valid_HWC>0.5)[0])

    heights = normalmap_to_depthmap(normal_img)
    rgb_BCHW = torch.from_numpy(rgb_img[..., :3] / 255.).permute((2,0,1))[None]
    valid_HWC[valid_HWC < 0.5] = 0
    valid_HWC[valid_HWC >= 0.5] = 1
    valid_HWC = torch.from_numpy(valid_HWC).bool()
    if init_type == "std":
        # accurate but not stable
        pred_HWC = torch.from_numpy(heights / heights.max() * (real_height_pix / heights.shape[0]) * scale * 2).float()[..., None]
    elif init_type == "thin":
        heights = heights - heights.min()
        heights = (heights / heights.max() * 0.2)
        pred_HWC = torch.from_numpy(heights * scale).float()[..., None]
    else:
        # stable but not accurate
        heights = heights - heights.min()
        heights = (heights / heights.max() * (1-offset)) + offset # to [offset, 1]
        pred_HWC = torch.from_numpy(heights * scale).float()[..., None]

    # set the border pixels to 0 height
    import cv2
    # edge filter
    edge = cv2.Canny((valid_HWC[..., 0] * 255).numpy().astype(np.uint8), 0, 255)
    edge = torch.from_numpy(edge).bool()[..., None]
    pred_HWC[edge] = 0

    valid_HWC[pred_HWC < clamp_min] = False
    return depth_and_color_to_mesh(rgb_BCHW.cuda(), pred_HWC.cuda(), valid_HWC.cuda(), is_back)

def fix_border_with_pymeshlab_fast(meshes: Meshes, poissson_depth=6, simplification=0):
    ms = pymeshlab.MeshSet()
    ms.add_mesh(py3dmesh_to_meshlab_mesh(meshes), "cube_vcolor_mesh")
    if simplification > 0:
        ms.apply_filter('meshing_decimation_quadric_edge_collapse', targetfacenum=simplification, preservetopology=True)
    ms.apply_filter('generate_surface_reconstruction_screened_poisson', threads = 6, depth = poissson_depth, preclean = True)
    if simplification > 0:
        ms.apply_filter('meshing_decimation_quadric_edge_collapse', targetfacenum=simplification, preservetopology=True)
    return meshlab_mesh_to_py3dmesh(ms.current_mesh())
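For context (not part of the uploaded file), a minimal sketch of how build_mesh and fix_border_with_pymeshlab_fast fit together. The image paths are placeholders, the inputs are assumed to be RGBA PIL images (alpha acting as the foreground mask), and a CUDA device is assumed because build_mesh moves its tensors to the GPU:

# Illustrative sketch only: placeholder file names; RGBA inputs and CUDA assumed.
from PIL import Image
front_rgb = Image.open("front_rgb.png")        # RGBA color view
front_normal = Image.open("front_normal.png")  # RGBA normal map with values in [0, 255]
mesh = build_mesh(front_normal, front_rgb, is_back=False, scale=0.3, init_type="std")
mesh = fix_border_with_pymeshlab_fast(mesh, poissson_depth=6)  # close the border via screened Poisson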