suggested patching #1
by multimodalart (HF staff) - opened

Files changed:
- .dockerignore +1 -2
- README.md +3 -3
- app.py +40 -204
- omni_zero.py +15 -152
- predict.py +12 -19
.dockerignore CHANGED
@@ -1,2 +1 @@
-models
-venv
+models
README.md CHANGED
@@ -15,13 +15,13 @@ license: gpl-3.0
 # Omni-Zero-Couples: A diffusion pipeline for zero-shot stylized couples portrait creation.
 
 ## Use Omni-Zero in HuggingFace Spaces ZeroGPU [https://huggingface.co/spaces/okaris/omni-zero-couples](https://huggingface.co/spaces/okaris/omni-zero-couples)
-![Omni-Zero-Couples
+![Omni-Zero-Couples](https://github.com/okaris/omni-zero-couples/assets/1448702/1d4c40e0-41c5-4127-ba06-aec52a2d179d)
 
 ## Run on Replicate [https://replicate.com/okaris/omni-zero-couples](https://replicate.com/okaris/omni-zero-couples)
-![Omni-Zero-Couples
+![Omni-Zero-Couples](https://github.com/okaris/omni-zero-couples/assets/1448702/0d53489b-89eb-4277-907f-4317cc98db74)
 
 ### Multiple Identities and Styles
-![Omni-Zero-Couples](https://github.com/
+![Omni-Zero-Couples](https://github.com/okaris/omni-zero-couples/assets/1448702/c5c20961-83bc-47f7-86ed-5948d5590f07)
 
 ### Single Identity and Style [https://github.com/okaris/omni-zero](https://github.com/okaris/omni-zero)
 ![Omni-Zero](https://github.com/okaris/omni-zero/assets/1448702/2c51fb77-a810-4c0a-9555-791a294455ca)
app.py CHANGED
@@ -1,8 +1,6 @@
-import os
-
 import gradio as gr
 import spaces
-
+import os
 os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
 
 import torch
@@ -11,78 +9,14 @@ import torch
 torch.jit.script = lambda f: f
 ####
 
-import cv2
-import numpy as np
-import PIL
-from controlnet_aux import ZoeDetector
-from diffusers import DPMSolverMultistepScheduler
-from diffusers.image_processor import IPAdapterMaskProcessor
-from diffusers.models import ControlNetModel
-from huggingface_hub import snapshot_download
-from insightface.app import FaceAnalysis
-from pipeline import OmniZeroPipeline
-from transformers import CLIPVisionModelWithProjection
-from utils import align_images, draw_kps, load_and_resize_image
-
-
-def patch_onnx_runtime(
-    inter_op_num_threads: int = 16,
-    intra_op_num_threads: int = 16,
-    omp_num_threads: int = 16,
-):
-    import os
-
-    import onnxruntime as ort
-
-    os.environ["OMP_NUM_THREADS"] = str(omp_num_threads)
-
-    _default_session_options = ort.capi._pybind_state.get_default_session_options()
-
-    def get_default_session_options_new():
-        _default_session_options.inter_op_num_threads = inter_op_num_threads
-        _default_session_options.intra_op_num_threads = intra_op_num_threads
-        return _default_session_options
-
-    ort.capi._pybind_state.get_default_session_options = get_default_session_options_new
-
-
-base_model = "frankjoshua/albedobaseXL_v13"
-
-patch_onnx_runtime()
+from omni_zero import OmniZeroCouple
 
-
-
-
+omni_zero = OmniZeroCouple(
+    base_model="frankjoshua/albedobaseXL_v13",
+    device="cuda",
+)
 
-
-
-ip_adapter_plus_image_encoder = CLIPVisionModelWithProjection.from_pretrained(
-    "h94/IP-Adapter",
-    subfolder="models/image_encoder",
-    torch_dtype=dtype,
-).to("cuda")
-
-zoedepthnet_path = "okaris/zoe-depth-controlnet-xl"
-zoedepthnet = ControlNetModel.from_pretrained(zoedepthnet_path,torch_dtype=dtype).to("cuda")
-
-identitiynet_path = "okaris/face-controlnet-xl"
-identitynet = ControlNetModel.from_pretrained(identitiynet_path, torch_dtype=dtype).to("cuda")
-
-zoe_depth_detector = ZoeDetector.from_pretrained("lllyasviel/Annotators").to("cuda")
-ip_adapter_mask_processor = IPAdapterMaskProcessor()
-
-pipeline = OmniZeroPipeline.from_pretrained(
-    base_model,
-    controlnet=[identitynet, identitynet, zoedepthnet],
-    torch_dtype=dtype,
-    image_encoder=ip_adapter_plus_image_encoder,
-).to("cuda")
-
-config = pipeline.scheduler.config
-config["timestep_spacing"] = "trailing"
-pipeline.scheduler = DPMSolverMultistepScheduler.from_config(config, use_karras_sigmas=True, algorithm_type="sde-dpmsolver++", final_sigmas_type="zero")
-
-pipeline.load_ip_adapter(["okaris/ip-adapter-instantid", "okaris/ip-adapter-instantid", "h94/IP-Adapter"], subfolder=[None, None, "sdxl_models"], weight_name=["ip-adapter-instantid.bin", "ip-adapter-instantid.bin", "ip-adapter-plus_sdxl_vit-h.safetensors"])
+omni_zero.generate = spaces.GPU(omni_zero.generate)
 
 @spaces.GPU()
 def generate(
@@ -106,121 +40,26 @@ def generate(
     mask_guidance_end=1.0,
     progress=gr.Progress(track_tqdm=True)
 ):
-
-
-    if base_image is not None:
-        base_image = load_and_resize_image(base_image, resolution, resolution)
-
-    if depth_image is None:
-        depth_image = zoe_depth_detector(base_image, detect_resolution=resolution, image_resolution=resolution)
-    else:
-        depth_image = load_and_resize_image(depth_image, resolution, resolution)
-
-    base_image, depth_image = align_images(base_image, depth_image)
-
-    if style_image is not None:
-        style_image = load_and_resize_image(style_image, resolution, resolution)
-    else:
-        style_image = base_image
-        # raise ValueError("You must provide a style image")
-
-    if identity_image_1 is not None:
-        identity_image_1 = load_and_resize_image(identity_image_1, resolution, resolution)
-    else:
-        raise ValueError("You must provide an identity image")
-
-    if identity_image_2 is not None:
-        identity_image_2 = load_and_resize_image(identity_image_2, resolution, resolution)
-    else:
-        raise ValueError("You must provide an identity image 2")
-
-    height, width = base_image.size
-
-    face_info_1 = face_analysis.get(cv2.cvtColor(np.array(identity_image_1), cv2.COLOR_RGB2BGR))
-    for i, face in enumerate(face_info_1):
-        print(f"Face 1 -{i}: Age: {face['age']}, Gender: {face['gender']}")
-    face_info_1 = sorted(face_info_1, key=lambda x:(x['bbox'][2]-x['bbox'][0])*x['bbox'][3]-x['bbox'][1])[-1] # only use the maximum face
-    face_emb_1 = torch.tensor(face_info_1['embedding']).to("cuda", dtype=dtype)
-
-    face_info_2 = face_analysis.get(cv2.cvtColor(np.array(identity_image_2), cv2.COLOR_RGB2BGR))
-    for i, face in enumerate(face_info_2):
-        print(f"Face 2 -{i}: Age: {face['age']}, Gender: {face['gender']}")
-    face_info_2 = sorted(face_info_2, key=lambda x:(x['bbox'][2]-x['bbox'][0])*x['bbox'][3]-x['bbox'][1])[-1] # only use the maximum face
-    face_emb_2 = torch.tensor(face_info_2['embedding']).to("cuda", dtype=dtype)
-
-    zero = np.zeros((width, height, 3), dtype=np.uint8)
-    # face_kps_identity_image_1 = draw_kps(zero, face_info_1['kps'])
-    # face_kps_identity_image_2 = draw_kps(zero, face_info_2['kps'])
-
-    face_info_img2img = face_analysis.get(cv2.cvtColor(np.array(base_image), cv2.COLOR_RGB2BGR))
-    faces_info_img2img = sorted(face_info_img2img, key=lambda x:(x['bbox'][2]-x['bbox'][0])*x['bbox'][3]-x['bbox'][1])
-    face_info_a = faces_info_img2img[-1]
-    face_info_b = faces_info_img2img[-2]
-    # face_emb_a = torch.tensor(face_info_a['embedding']).to("cuda", dtype=dtype)
-    # face_emb_b = torch.tensor(face_info_b['embedding']).to("cuda", dtype=dtype)
-    face_kps_identity_image_a = draw_kps(zero, face_info_a['kps'])
-    face_kps_identity_image_b = draw_kps(zero, face_info_b['kps'])
-
-    general_mask = PIL.Image.fromarray(np.ones((width, height, 3), dtype=np.uint8))
-
-    control_mask_1 = zero.copy()
-    x1, y1, x2, y2 = face_info_a["bbox"]
-    x1, y1, x2, y2 = int(x1), int(y1), int(x2), int(y2)
-    control_mask_1[y1:y2, x1:x2] = 255
-    control_mask_1 = PIL.Image.fromarray(control_mask_1.astype(np.uint8))
-
-    control_mask_2 = zero.copy()
-    x1, y1, x2, y2 = face_info_b["bbox"]
-    x1, y1, x2, y2 = int(x1), int(y1), int(x2), int(y2)
-    control_mask_2[y1:y2, x1:x2] = 255
-    control_mask_2 = PIL.Image.fromarray(control_mask_2.astype(np.uint8))
-
-    controlnet_masks = [control_mask_1, control_mask_2, general_mask]
-    ip_adapter_images = [face_emb_1, face_emb_2, style_image, ]
-
-    masks = ip_adapter_mask_processor.preprocess([control_mask_1, control_mask_2, general_mask], height=height, width=width)
-    ip_adapter_masks = [mask.unsqueeze(0) for mask in masks]
-
-    inpaint_mask = torch.logical_or(torch.tensor(np.array(control_mask_1)), torch.tensor(np.array(control_mask_2))).float()
-    inpaint_mask = PIL.Image.fromarray((inpaint_mask.numpy() * 255).astype(np.uint8)).convert("RGB")
-
-    new_ip_adapter_masks = []
-    for ip_img, mask in zip(ip_adapter_images, controlnet_masks):
-        if isinstance(ip_img, list):
-            num_images = len(ip_img)
-            mask = mask.repeat(1, num_images, 1, 1)
-
-        new_ip_adapter_masks.append(mask)
-
-    generator = torch.Generator(device="cpu").manual_seed(seed)
-
-    pipeline.set_ip_adapter_scale([identity_image_strength_1, identity_image_strength_2,
-        {
-            "down": { "block_2": [0.0, 0.0] }, #Composition
-            "up": { "block_0": [0.0, style_image_strength, 0.0] } #Style
-        }
-    ])
-
-    images = pipeline(
+    images = omni_zero.generate(
+        seed=seed,
         prompt=prompt,
-        negative_prompt=negative_prompt,
+        negative_prompt=negative_prompt,
         guidance_scale=guidance_scale,
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    ).images
+        number_of_images=number_of_images,
+        number_of_steps=number_of_steps,
+        base_image=base_image,
+        base_image_strength=base_image_strength,
+        style_image=style_image,
+        style_image_strength=style_image_strength,
+        identity_image_1=identity_image_1,
+        identity_image_strength_1=identity_image_strength_1,
+        identity_image_2=identity_image_2,
+        identity_image_strength_2=identity_image_strength_2,
+        depth_image=depth_image,
+        depth_image_strength=depth_image_strength,
+        mask_guidance_start=mask_guidance_start,
+        mask_guidance_end=mask_guidance_end,
+    )
 
     return images
 
@@ -246,24 +85,24 @@ with gr.Blocks() as demo:
                 base_image = gr.Image(label="Base Image")
             with gr.Row():
                 base_image_strength = gr.Slider(label="Strength",step=0.01, minimum=0.0, maximum=1.0, value=1.0)
+            #with gr.Row():
+        with gr.Column(min_width=140):
+            with gr.Row():
+                style_image = gr.Image(label="Style Image")
+            with gr.Row():
+                style_image_strength = gr.Slider(label="Strength",step=0.01, minimum=0.0, maximum=1.0, value=1.0)
+    with gr.Row():
         with gr.Column(min_width=140):
             with gr.Row():
                 identity_image = gr.Image(label="Identity Image")
             with gr.Row():
-                identity_image_strength = gr.Slider(label="
+                identity_image_strength = gr.Slider(label="Strenght",step=0.01, minimum=0.0, maximum=1.0, value=1.0)
         with gr.Column(min_width=140):
             with gr.Row():
                 identity_image_2 = gr.Image(label="Identity Image 2")
             with gr.Row():
-                identity_image_strength_2 = gr.Slider(label="
-    with gr.Accordion("Advanced options", open=False):
-        with gr.Row():
-            with gr.Column():
-                style_image = gr.Image(label="Style Image")
-                style_image_strength = gr.Slider(label="Style Strength",step=0.01, minimum=0.0, maximum=1.0, value=1.0)
-            with gr.Column():
-                depth_image = gr.Image(label="Depth Image")
-                depth_image_strength = gr.Slider(label="Depth Strength",step=0.01, minimum=0.0, maximum=1.0, value=0.5)
+                identity_image_strength_2 = gr.Slider(label="Strenght",step=0.01, minimum=0.0, maximum=1.0, value=1.0)
+    with gr.Accordion("Advanced options", open=False):
         with gr.Row():
             seed = gr.Slider(label="Seed",step=1, minimum=0, maximum=10000000, value=42)
             number_of_images = gr.Slider(label="Number of Outputs",step=1, minimum=1, maximum=4, value=1)
@@ -282,12 +121,12 @@ with gr.Blocks() as demo:
     submit = gr.Button("Generate")
 
     submit.click(generate, inputs=[
+        prompt,
         base_image,
-        style_image
+        style_image,
         identity_image,
        identity_image_2,
         seed,
-        prompt,
         negative_prompt,
         guidance_scale,
         number_of_images,
@@ -296,8 +135,6 @@ with gr.Blocks() as demo:
         style_image_strength,
         identity_image_strength,
         identity_image_strength_2,
-        depth_image,
-        depth_image_strength,
         mask_guidance_start,
         mask_guidance_end,
     ],
@@ -307,15 +144,14 @@ with gr.Blocks() as demo:
     gr.Examples(
         examples=[
             [
+                "Cinematic still photo of a couple. emotional, harmonious, vignette, 4k epic detailed, shot on kodak, 35mm photo, sharp focus, high budget, cinemascope, moody, epic, gorgeous, film grain, grainy",
                 "https://cdn-prod.styleof.com/inferences/cm1ho5cjl14nh14jec6phg2h8/i6k59e7gpsr45ufc7l8kun0g-medium.jpeg",
                 "https://cdn-prod.styleof.com/inferences/cm1ho5cjl14nh14jec6phg2h8/i6k59e7gpsr45ufc7l8kun0g-medium.jpeg",
                 "https://cdn-prod.styleof.com/inferences/cm1hp4lea14oz14jeoghnex7g/dlgc5xwo0qzey7qaixy45i1o-medium.jpeg",
-                "https://cdn-prod.styleof.com/inferences/cm1ho69ha14np14jesnusqiep/mp3aaktzqz20ujco5i3bi5s1-medium.jpeg"
-                42,
-                "Cinematic still photo of a couple. emotional, harmonious, vignette, 4k epic detailed, shot on kodak, 35mm photo, sharp focus, high budget, cinemascope, moody, epic, gorgeous, film grain, grainy",
+                "https://cdn-prod.styleof.com/inferences/cm1ho69ha14np14jesnusqiep/mp3aaktzqz20ujco5i3bi5s1-medium.jpeg"
             ]
         ],
-        inputs=[base_image, style_image, identity_image, identity_image_2
+        inputs=[prompt, base_image, style_image, identity_image, identity_image_2],
         outputs=[out],
         fn=generate,
         cache_examples="lazy",
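Note on the app.py change: the patch drops the inline setup (ONNX runtime thread patching, face analysis, ControlNets, IP-Adapters, scheduler) and the hand-rolled generate() body, and delegates everything to the OmniZeroCouple wrapper from omni_zero.py. A minimal sketch of what the refactored module boils down to is below; the prompt and image arguments are illustrative placeholders, and passing plain URLs assumes load_and_resize_image inside the wrapper accepts them, as the Space's example URLs suggest.

# Sketch of the refactored app.py flow (argument values are illustrative, not part of the patch).
import spaces
from omni_zero import OmniZeroCouple

omni_zero = OmniZeroCouple(
    base_model="frankjoshua/albedobaseXL_v13",  # same base model the old inline setup used
    device="cuda",
)
# ZeroGPU: request a GPU only for the duration of a generate() call.
omni_zero.generate = spaces.GPU(omni_zero.generate)

images = omni_zero.generate(
    seed=42,
    prompt="Cinematic still photo of a couple.",
    base_image="https://example.com/couple.jpeg",         # hypothetical URL
    identity_image_1="https://example.com/person1.jpeg",  # hypothetical URL
    identity_image_2="https://example.com/person2.jpeg",  # hypothetical URL
)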
omni_zero.py CHANGED
@@ -1,164 +1,31 @@
 import os
-
 os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
 
 import sys
-
 sys.path.insert(0, './diffusers/src')
 
-import cv2
-import numpy as np
-import PIL
 import torch
-
+import torch.nn as nn
+
+from huggingface_hub import snapshot_download
 from diffusers import DPMSolverMultistepScheduler
-from diffusers.image_processor import IPAdapterMaskProcessor
 from diffusers.models import ControlNetModel
-from
-from insightface.app import FaceAnalysis
-from pipeline import OmniZeroPipeline
-from transformers import CLIPVisionModelWithProjection
-from utils import align_images, draw_kps, load_and_resize_image
-import random
-
-class OmniZeroSingle():
-    def __init__(self,
-        base_model="stabilityai/stable-diffusion-xl-base-1.0",
-        device="cuda",
-    ):
-        snapshot_download("okaris/antelopev2", local_dir="./models/antelopev2")
-        self.face_analysis = FaceAnalysis(name='antelopev2', root='./', providers=['CUDAExecutionProvider', 'CPUExecutionProvider'])
-        self.face_analysis.prepare(ctx_id=0, det_size=(640, 640))
-
-        dtype = torch.float16
-
-        ip_adapter_plus_image_encoder = CLIPVisionModelWithProjection.from_pretrained(
-            "h94/IP-Adapter",
-            subfolder="models/image_encoder",
-            torch_dtype=dtype,
-        ).to(device)
-
-        zoedepthnet_path = "okaris/zoe-depth-controlnet-xl"
-        zoedepthnet = ControlNetModel.from_pretrained(zoedepthnet_path,torch_dtype=dtype).to(device)
-
-        identitiynet_path = "okaris/face-controlnet-xl"
-        identitynet = ControlNetModel.from_pretrained(identitiynet_path, torch_dtype=dtype).to(device)
-
-        self.zoe_depth_detector = ZoeDetector.from_pretrained("lllyasviel/Annotators").to(device)
-
-        self.pipeline = OmniZeroPipeline.from_pretrained(
-            base_model,
-            controlnet=[identitynet, zoedepthnet],
-            torch_dtype=dtype,
-            image_encoder=ip_adapter_plus_image_encoder,
-        ).to(device)
-
-        config = self.pipeline.scheduler.config
-        config["timestep_spacing"] = "trailing"
-        self.pipeline.scheduler = DPMSolverMultistepScheduler.from_config(config, use_karras_sigmas=True, algorithm_type="sde-dpmsolver++", final_sigmas_type="zero")
-
-        self.pipeline.load_ip_adapter(["okaris/ip-adapter-instantid", "h94/IP-Adapter", "h94/IP-Adapter"], subfolder=[None, "sdxl_models", "sdxl_models"], weight_name=["ip-adapter-instantid.bin", "ip-adapter-plus_sdxl_vit-h.safetensors", "ip-adapter-plus_sdxl_vit-h.safetensors"])
-
-    def get_largest_face_embedding_and_kps(self, image, target_image=None):
-        face_info = self.face_analysis.get(cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR))
-        if len(face_info) == 0:
-            return None, None
-        largest_face = sorted(face_info, key=lambda x: x['bbox'][2] * x['bbox'][3], reverse=True)[0]
-        face_embedding = torch.tensor(largest_face['embedding']).to("cuda")
-        if target_image is None:
-            target_image = image
-        zeros = np.zeros((target_image.size[1], target_image.size[0], 3), dtype=np.uint8)
-        face_kps_image = draw_kps(zeros, largest_face['kps'])
-        return face_embedding, face_kps_image
-
-    def generate(self,
-        seed=42,
-        prompt="A person",
-        negative_prompt="blurry, out of focus",
-        guidance_scale=3.0,
-        number_of_images=1,
-        number_of_steps=10,
-        base_image=None,
-        base_image_strength=0.15,
-        composition_image=None,
-        composition_image_strength=1.0,
-        style_image=None,
-        style_image_strength=1.0,
-        identity_image=None,
-        identity_image_strength=1.0,
-        depth_image=None,
-        depth_image_strength=0.5,
-    ):
-        resolution = 1024
-
-        if base_image is not None:
-            base_image = load_and_resize_image(base_image, resolution, resolution)
-        else:
-            if composition_image is not None:
-                base_image = load_and_resize_image(composition_image, resolution, resolution)
-            else:
-                raise ValueError("You must provide a base image or a composition image")
-
-        if depth_image is None:
-            depth_image = self.zoe_depth_detector(base_image, detect_resolution=resolution, image_resolution=resolution)
-        else:
-            depth_image = load_and_resize_image(depth_image, resolution, resolution)
-
-        base_image, depth_image = align_images(base_image, depth_image)
-
-        if composition_image is not None:
-            composition_image = load_and_resize_image(composition_image, resolution, resolution)
-        else:
-            composition_image = base_image
+from diffusers.image_processor import IPAdapterMaskProcessor
 
-        if style_image is not None:
-            style_image = load_and_resize_image(style_image, resolution, resolution)
-        else:
-            raise ValueError("You must provide a style image")
-
-        if identity_image is not None:
-            identity_image = load_and_resize_image(identity_image, resolution, resolution)
-        else:
-            raise ValueError("You must provide an identity image")
-
-        face_embedding_identity_image, target_kps = self.get_largest_face_embedding_and_kps(identity_image, base_image)
-        if face_embedding_identity_image is None:
-            raise ValueError("No face found in the identity image, the image might be cropped too tightly or the face is too small")
-
-        face_embedding_base_image, face_kps_base_image = self.get_largest_face_embedding_and_kps(base_image)
-        if face_embedding_base_image is not None:
-            target_kps = face_kps_base_image
+from transformers import CLIPVisionModelWithProjection
 
-
-
-
-
-            },
-            {
-                "down": { "block_2": [0.0, composition_image_strength] },
-                "up": { "block_0": [0.0, 0.0, 0.0] }
-            }
-        ])
+from pipeline import OmniZeroPipeline
+from insightface.app import FaceAnalysis
+from controlnet_aux import ZoeDetector
+from utils import draw_kps, load_and_resize_image, align_images
 
-
+from pydantic import BaseModel, Field
 
-
-
-
-            guidance_scale=guidance_scale,
-            ip_adapter_image=[face_embedding_identity_image, style_image, composition_image],
-            image=base_image,
-            control_image=[target_kps, depth_image],
-            controlnet_conditioning_scale=[identity_image_strength, depth_image_strength],
-            identity_control_indices=[(0,0)],
-            num_inference_steps=number_of_steps,
-            num_images_per_prompt=number_of_images,
-            strength=(1-base_image_strength),
-            generator=generator,
-            seed=seed,
-        ).images
+import cv2
+import numpy as np
+from torchvision.transforms import functional as TVF
 
-
+import PIL
 
 class OmniZeroCouple():
     def __init__(self,
@@ -210,7 +77,7 @@ class OmniZeroCouple():
         number_of_images=1,
         number_of_steps=10,
         base_image=None,
-        base_image_strength=0.
+        base_image_strength=0.15,
         style_image=None,
         style_image_strength=1.0,
         identity_image_1=None,
@@ -223,9 +90,6 @@ class OmniZeroCouple():
         mask_guidance_end=1.0,
     ):
 
-        if seed == -1:
-            seed = random.randint(0, 1000000)
-
         resolution = 1024
 
         if base_image is not None:
@@ -350,7 +214,6 @@ class OmniZeroCouple():
         omp_num_threads: int = 16,
     ):
         import os
-
         import onnxruntime as ort
 
         os.environ["OMP_NUM_THREADS"] = str(omp_num_threads)
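Note on the omni_zero.py change: besides reorganizing the imports and dropping the unused OmniZeroSingle class, the patch removes `import random` and the `if seed == -1: seed = random.randint(0, 1000000)` branch from OmniZeroCouple.generate, so a seed of -1 is no longer randomized inside the wrapper. A minimal caller-side sketch of the old behavior, using a hypothetical helper name:

# Sketch: reproduce the removed "-1 means random" seed handling on the caller side.
import random

def resolve_seed(seed: int) -> int:
    # After this patch, OmniZeroCouple.generate uses the seed exactly as given.
    return random.randint(0, 1000000) if seed == -1 else seed

print(resolve_seed(42))  # -> 42
print(resolve_seed(-1))  # -> some value in [0, 1000000]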
predict.py CHANGED
@@ -4,7 +4,6 @@
 from cog import BasePredictor, Input, Path
 from typing import List
 from omni_zero import OmniZeroCouple
-from PIL import Image
 
 class Predictor(BasePredictor):
     def setup(self):
@@ -14,20 +13,20 @@ class Predictor(BasePredictor):
         )
     def predict(
         self,
+        seed: int = Input(description="Random seed for the model", default=42),
+        prompt: str = Input(description="Prompt for the model", default="Cinematic still photo of a couple. emotional, harmonious, vignette, 4k epic detailed, shot on kodak, 35mm photo, sharp focus, high budget, cinemascope, moody, epic, gorgeous, film grain, grainy"),
+        negative_prompt: str = Input(description="Negative prompt for the model", default="anime, cartoon, graphic, (blur, blurry, bokeh), text, painting, crayon, graphite, abstract, glitch, deformed, mutated, ugly, disfigured"),
+        guidance_scale: float = Input(description="Guidance scale for the model", default=3.0, ge=0.0, le=14.0),
+        number_of_images: int = Input(description="Number of images to generate", default=1, ge=1, le=4),
+        number_of_steps: int = Input(description="Number of steps for the model", default=10, ge=1, le=50),
         base_image: Path = Input(description="Base image for the model", default=None),
-        base_image_strength: float = Input(description="Base image strength for the model", default=0.
+        base_image_strength: float = Input(description="Base image strength for the model", default=0.3, ge=0.0, le=1.0),
         style_image: Path = Input(description="Style image for the model", default=None),
         style_image_strength: float = Input(description="Style image strength for the model", default=1.0, ge=0.0, le=1.0),
         identity_image_1: Path = Input(description="First identity image for the model", default=None),
         identity_image_strength_1: float = Input(description="First identity image strength for the model", default=1.0, ge=0.0, le=1.0),
         identity_image_2: Path = Input(description="Second identity image for the model", default=None),
         identity_image_strength_2: float = Input(description="Second identity image strength for the model", default=1.0, ge=0.0, le=1.0),
-        seed: int = Input(description="Random seed for the model. Use -1 for random", default=-1),
-        prompt: str = Input(description="Prompt for the model", default="Cinematic still photo of a couple. emotional, harmonious, vignette, 4k epic detailed, shot on kodak, 35mm photo, sharp focus, high budget, cinemascope, moody, epic, gorgeous, film grain, grainy"),
-        negative_prompt: str = Input(description="Negative prompt for the model", default="anime, cartoon, graphic, (blur, blurry, bokeh), text, painting, crayon, graphite, abstract, glitch, deformed, mutated, ugly, disfigured"),
-        guidance_scale: float = Input(description="Guidance scale for the model", default=3.0, ge=0.0, le=14.0),
-        number_of_images: int = Input(description="Number of images to generate", default=1, ge=1, le=4),
-        number_of_steps: int = Input(description="Number of steps for the model", default=10, ge=1, le=50),
         depth_image: Path = Input(description="Depth image for the model", default=None),
         depth_image_strength: float = Input(description="Depth image strength for the model", default=0.2, ge=0.0, le=1.0),
         mask_guidance_start: float = Input(description="Mask guidance start value", default=0.0, ge=0.0, le=1.0),
@@ -35,17 +34,11 @@ class Predictor(BasePredictor):
     ) -> List[Path]:
         """Run a single prediction on the model"""
 
-        base_image = Image.open(base_image) if base_image else None
-        style_image = Image.open(style_image) if style_image else None
-        identity_image_1 = Image.open(identity_image_1) if identity_image_1 else None
-        identity_image_2 = Image.open(identity_image_2) if identity_image_2 else None
-        depth_image = Image.open(depth_image) if depth_image else None
-
-        print("base_image", base_image)
-        print("style_image", style_image)
-        print("identity_image_1", identity_image_1)
-        print("identity_image_2", identity_image_2)
-        print("depth_image", depth_image)
+        # base_image = Image.open(base_image) if base_image else None
+        # style_image = Image.open(style_image) if style_image else None
+        # identity_image_1 = Image.open(identity_image_1) if identity_image_1 else None
+        # identity_image_2 = Image.open(identity_image_2) if identity_image_2 else None
+        # depth_image = Image.open(depth_image) if depth_image else None
 
         images = self.omni_zero.generate(
             seed=seed,
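Note on the predict.py change: the seed, prompt, and sampler inputs move to the front of the predict() signature, the seed default changes from -1 (random) to a fixed 42, base_image_strength now defaults to 0.3, and the PIL Image.open conversions are commented out, so the Cog inputs are forwarded to omni_zero.generate as-is (this assumes load_and_resize_image accepts paths or URLs directly). A rough local driver sketch with hypothetical file names, not part of the patch:

# Rough sketch of driving the updated Predictor locally (file names are hypothetical;
# in production Cog supplies these inputs as cog.Path objects).
from predict import Predictor

predictor = Predictor()
predictor.setup()  # builds the OmniZeroCouple wrapper

outputs = predictor.predict(
    seed=42,  # fixed default; the old "-1 means random" input is gone
    prompt="Cinematic still photo of a couple.",
    negative_prompt="blurry, out of focus",
    guidance_scale=3.0,
    number_of_images=1,
    number_of_steps=10,
    base_image="couple.jpeg",
    base_image_strength=0.3,
    style_image="style.jpeg",
    style_image_strength=1.0,
    identity_image_1="person1.jpeg",
    identity_image_strength_1=1.0,
    identity_image_2="person2.jpeg",
    identity_image_strength_2=1.0,
    depth_image=None,
    depth_image_strength=0.2,
    mask_guidance_start=0.0,
    mask_guidance_end=1.0,
)
print(outputs)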