Upload 5 files
Browse files
- src/__init__.py (+0, -0)
- src/core.py (+466, -0)
- src/helper.py (+87, -0)
- src/pipeline_stable_diffusion_controlnet_inpaint.py (+500, -0)
- src/st_style.py (+42, -0)
src/__init__.py
ADDED
File without changes
src/core.py
ADDED
@@ -0,0 +1,466 @@
import base64
import json
import os
import re
import time
import uuid
from io import BytesIO
from pathlib import Path

import cv2

# For inpainting
import numpy as np
import pandas as pd
import streamlit as st
from PIL import Image
# from streamlit_drawable_canvas import st_canvas

import argparse
import io
import multiprocessing
from typing import Union

import torch

try:
    torch._C._jit_override_can_fuse_on_cpu(False)
    torch._C._jit_override_can_fuse_on_gpu(False)
    torch._C._jit_set_texpr_fuser_enabled(False)
    torch._C._jit_set_nvfuser_enabled(False)
except Exception:
    pass

from src.helper import (
    download_model,
    load_img,
    norm_img,
    numpy_to_bytes,
    pad_img_to_modulo,
    resize_max_size,
)

NUM_THREADS = str(multiprocessing.cpu_count())

os.environ["OMP_NUM_THREADS"] = NUM_THREADS
os.environ["OPENBLAS_NUM_THREADS"] = NUM_THREADS
os.environ["MKL_NUM_THREADS"] = NUM_THREADS
os.environ["VECLIB_MAXIMUM_THREADS"] = NUM_THREADS
os.environ["NUMEXPR_NUM_THREADS"] = NUM_THREADS
if os.environ.get("CACHE_DIR"):
    os.environ["TORCH_HOME"] = os.environ["CACHE_DIR"]

# BUILD_DIR = os.environ.get("LAMA_CLEANER_BUILD_DIR", "./lama_cleaner/app/build")

# For seam carving
from scipy import ndimage as ndi

SEAM_COLOR = np.array([255, 200, 200])  # seam visualization color (BGR)
SHOULD_DOWNSIZE = True                  # if True, downsize image for faster carving
DOWNSIZE_WIDTH = 500                    # resized image width if SHOULD_DOWNSIZE is True
ENERGY_MASK_CONST = 100000.0            # large energy value for protective masking
MASK_THRESHOLD = 10                     # minimum pixel intensity for binary mask
USE_FORWARD_ENERGY = True               # if True, use forward energy algorithm

# Load the TorchScript LaMa inpainting model once at import time.
device = torch.device("cpu")
model_path = "./assets/big-lama.pt"
model = torch.jit.load(model_path, map_location="cpu")
model = model.to(device)
model.eval()


########################################
# UTILITY CODE
########################################


def visualize(im, boolmask=None, rotate=False):
    vis = im.astype(np.uint8)
    if boolmask is not None:
        vis[np.where(boolmask == False)] = SEAM_COLOR
    if rotate:
        vis = rotate_image(vis, False)
    cv2.imshow("visualization", vis)
    cv2.waitKey(1)
    return vis


def resize(image, width):
    h, w = image.shape[:2]
    dim = (width, int(h * width / float(w)))
    image = image.astype('float32')
    return cv2.resize(image, dim)


def rotate_image(image, clockwise):
    k = 1 if clockwise else 3
    return np.rot90(image, k)


########################################
# ENERGY FUNCTIONS
########################################


def backward_energy(im):
    """
    Simple gradient magnitude energy map.
    """
    xgrad = ndi.convolve1d(im, np.array([1, 0, -1]), axis=1, mode='wrap')
    ygrad = ndi.convolve1d(im, np.array([1, 0, -1]), axis=0, mode='wrap')

    grad_mag = np.sqrt(np.sum(xgrad**2, axis=2) + np.sum(ygrad**2, axis=2))

    # vis = visualize(grad_mag)
    # cv2.imwrite("backward_energy_demo.jpg", vis)

    return grad_mag


def forward_energy(im):
    """
    Forward energy algorithm as described in "Improved Seam Carving for Video Retargeting"
    by Rubinstein, Shamir, Avidan.
    Vectorized code adapted from https://github.com/axu2/improved-seam-carving.
    """
    h, w = im.shape[:2]
    im = cv2.cvtColor(im.astype(np.uint8), cv2.COLOR_BGR2GRAY).astype(np.float64)

    energy = np.zeros((h, w))
    m = np.zeros((h, w))

    U = np.roll(im, 1, axis=0)
    L = np.roll(im, 1, axis=1)
    R = np.roll(im, -1, axis=1)

    cU = np.abs(R - L)
    cL = np.abs(U - L) + cU
    cR = np.abs(U - R) + cU

    for i in range(1, h):
        mU = m[i - 1]
        mL = np.roll(mU, 1)
        mR = np.roll(mU, -1)

        mULR = np.array([mU, mL, mR])
        cULR = np.array([cU[i], cL[i], cR[i]])
        mULR += cULR

        argmins = np.argmin(mULR, axis=0)
        m[i] = np.choose(argmins, mULR)
        energy[i] = np.choose(argmins, cULR)

    # vis = visualize(energy)
    # cv2.imwrite("forward_energy_demo.jpg", vis)

    return energy


########################################
# SEAM HELPER FUNCTIONS
########################################


def add_seam(im, seam_idx):
    """
    Add a vertical seam to a 3-channel color image at the indices provided
    by averaging the pixel values to the left and right of the seam.
    Code adapted from https://github.com/vivianhylee/seam-carving.
    """
    h, w = im.shape[:2]
    output = np.zeros((h, w + 1, 3))
    for row in range(h):
        col = seam_idx[row]
        for ch in range(3):
            if col == 0:
                p = np.mean(im[row, col: col + 2, ch])
                output[row, col, ch] = im[row, col, ch]
                output[row, col + 1, ch] = p
                output[row, col + 1:, ch] = im[row, col:, ch]
            else:
                p = np.mean(im[row, col - 1: col + 1, ch])
                output[row, : col, ch] = im[row, : col, ch]
                output[row, col, ch] = p
                output[row, col + 1:, ch] = im[row, col:, ch]

    return output


def add_seam_grayscale(im, seam_idx):
    """
    Add a vertical seam to a grayscale image at the indices provided
    by averaging the pixel values to the left and right of the seam.
    """
    h, w = im.shape[:2]
    output = np.zeros((h, w + 1))
    for row in range(h):
        col = seam_idx[row]
        if col == 0:
            p = np.mean(im[row, col: col + 2])
            output[row, col] = im[row, col]
            output[row, col + 1] = p
            output[row, col + 1:] = im[row, col:]
        else:
            p = np.mean(im[row, col - 1: col + 1])
            output[row, : col] = im[row, : col]
            output[row, col] = p
            output[row, col + 1:] = im[row, col:]

    return output


def remove_seam(im, boolmask):
    h, w = im.shape[:2]
    boolmask3c = np.stack([boolmask] * 3, axis=2)
    return im[boolmask3c].reshape((h, w - 1, 3))


def remove_seam_grayscale(im, boolmask):
    h, w = im.shape[:2]
    return im[boolmask].reshape((h, w - 1))


def get_minimum_seam(im, mask=None, remove_mask=None):
    """
    DP algorithm for finding the seam of minimum energy. Code adapted from
    https://karthikkaranth.me/blog/implementing-seam-carving-with-python/
    """
    h, w = im.shape[:2]
    energyfn = forward_energy if USE_FORWARD_ENERGY else backward_energy
    M = energyfn(im)

    if mask is not None:
        M[np.where(mask > MASK_THRESHOLD)] = ENERGY_MASK_CONST

    # give removal mask priority over protective mask by using a larger negative value
    if remove_mask is not None:
        M[np.where(remove_mask > MASK_THRESHOLD)] = -ENERGY_MASK_CONST * 100

    seam_idx, boolmask = compute_shortest_path(M, im, h, w)

    return np.array(seam_idx), boolmask


def compute_shortest_path(M, im, h, w):
    backtrack = np.zeros_like(M, dtype=np.int_)

    # populate DP matrix
    for i in range(1, h):
        for j in range(0, w):
            if j == 0:
                idx = np.argmin(M[i - 1, j:j + 2])
                backtrack[i, j] = idx + j
                min_energy = M[i - 1, idx + j]
            else:
                idx = np.argmin(M[i - 1, j - 1:j + 2])
                backtrack[i, j] = idx + j - 1
                min_energy = M[i - 1, idx + j - 1]

            M[i, j] += min_energy

    # backtrack to find path
    seam_idx = []
    boolmask = np.ones((h, w), dtype=np.bool_)
    j = np.argmin(M[-1])
    for i in range(h - 1, -1, -1):
        boolmask[i, j] = False
        seam_idx.append(j)
        j = backtrack[i, j]

    seam_idx.reverse()
    return seam_idx, boolmask


########################################
# MAIN ALGORITHM
########################################


def seams_removal(im, num_remove, mask=None, vis=False, rot=False):
    for _ in range(num_remove):
        seam_idx, boolmask = get_minimum_seam(im, mask)
        if vis:
            visualize(im, boolmask, rotate=rot)
        im = remove_seam(im, boolmask)
        if mask is not None:
            mask = remove_seam_grayscale(mask, boolmask)
    return im, mask


def seams_insertion(im, num_add, mask=None, vis=False, rot=False):
    seams_record = []
    temp_im = im.copy()
    temp_mask = mask.copy() if mask is not None else None

    for _ in range(num_add):
        seam_idx, boolmask = get_minimum_seam(temp_im, temp_mask)
        if vis:
            visualize(temp_im, boolmask, rotate=rot)

        seams_record.append(seam_idx)
        temp_im = remove_seam(temp_im, boolmask)
        if temp_mask is not None:
            temp_mask = remove_seam_grayscale(temp_mask, boolmask)

    seams_record.reverse()

    for _ in range(num_add):
        seam = seams_record.pop()
        im = add_seam(im, seam)
        if vis:
            visualize(im, rotate=rot)
        if mask is not None:
            mask = add_seam_grayscale(mask, seam)

        # update the remaining seam indices
        for remaining_seam in seams_record:
            remaining_seam[np.where(remaining_seam >= seam)] += 2

    return im, mask


########################################
# MAIN DRIVER FUNCTIONS
########################################


def seam_carve(im, dy, dx, mask=None, vis=False):
    im = im.astype(np.float64)
    h, w = im.shape[:2]
    assert h + dy > 0 and w + dx > 0 and dy <= h and dx <= w

    if mask is not None:
        mask = mask.astype(np.float64)

    output = im

    if dx < 0:
        output, mask = seams_removal(output, -dx, mask, vis)
    elif dx > 0:
        output, mask = seams_insertion(output, dx, mask, vis)

    if dy < 0:
        output = rotate_image(output, True)
        if mask is not None:
            mask = rotate_image(mask, True)
        output, mask = seams_removal(output, -dy, mask, vis, rot=True)
        output = rotate_image(output, False)
    elif dy > 0:
        output = rotate_image(output, True)
        if mask is not None:
            mask = rotate_image(mask, True)
        output, mask = seams_insertion(output, dy, mask, vis, rot=True)
        output = rotate_image(output, False)

    return output


def object_removal(im, rmask, mask=None, vis=False, horizontal_removal=False):
    im = im.astype(np.float64)
    rmask = rmask.astype(np.float64)
    if mask is not None:
        mask = mask.astype(np.float64)
    output = im

    h, w = im.shape[:2]

    if horizontal_removal:
        output = rotate_image(output, True)
        rmask = rotate_image(rmask, True)
        if mask is not None:
            mask = rotate_image(mask, True)

    while len(np.where(rmask > MASK_THRESHOLD)[0]) > 0:
        seam_idx, boolmask = get_minimum_seam(output, mask, rmask)
        if vis:
            visualize(output, boolmask, rotate=horizontal_removal)
        output = remove_seam(output, boolmask)
        rmask = remove_seam_grayscale(rmask, boolmask)
        if mask is not None:
            mask = remove_seam_grayscale(mask, boolmask)

    num_add = (h if horizontal_removal else w) - output.shape[1]
    output, mask = seams_insertion(output, num_add, mask, vis, rot=horizontal_removal)
    if horizontal_removal:
        output = rotate_image(output, False)

    return output


def s_image(im, mask, vs, hs, mode="resize"):
    im = cv2.cvtColor(im, cv2.COLOR_RGBA2RGB)
    mask = 255 - mask[:, :, 3]  # the drawn region is encoded in the alpha channel
    h, w = im.shape[:2]
    if SHOULD_DOWNSIZE and w > DOWNSIZE_WIDTH:
        im = resize(im, width=DOWNSIZE_WIDTH)
        if mask is not None:
            mask = resize(mask, width=DOWNSIZE_WIDTH)

    # image resize mode
    if mode == "resize":
        dy = hs  # reverse
        dx = vs  # reverse
        assert dy is not None and dx is not None
        output = seam_carve(im, dy, dx, mask, False)

    # object removal mode
    elif mode == "remove":
        assert mask is not None
        output = object_removal(im, mask, None, False, True)

    else:
        # guard against an UnboundLocalError on the return below
        raise ValueError(f"unknown mode: {mode!r}")

    return output


##### Inpainting helper code

def run(image, mask):
    """
    image: [C, H, W]
    mask: [1, H, W]
    return: BGR IMAGE
    """
    origin_height, origin_width = image.shape[1:]
    image = pad_img_to_modulo(image, mod=8)
    mask = pad_img_to_modulo(mask, mod=8)

    mask = (mask > 0) * 1
    image = torch.from_numpy(image).unsqueeze(0).to(device)
    mask = torch.from_numpy(mask).unsqueeze(0).to(device)

    start = time.time()
    with torch.no_grad():
        inpainted_image = model(image, mask)

    print(f"process time: {(time.time() - start) * 1000}ms")
    cur_res = inpainted_image[0].permute(1, 2, 0).detach().cpu().numpy()
    cur_res = cur_res[0:origin_height, 0:origin_width, :]
    cur_res = np.clip(cur_res * 255, 0, 255).astype("uint8")
    cur_res = cv2.cvtColor(cur_res, cv2.COLOR_BGR2RGB)
    return cur_res


def get_args_parser():
    parser = argparse.ArgumentParser()
    parser.add_argument("--port", default=8080, type=int)
    parser.add_argument("--device", default="cuda", type=str)
    parser.add_argument("--debug", action="store_true")
    return parser.parse_args()


def process_inpaint(image, mask):
    image = cv2.cvtColor(image, cv2.COLOR_RGBA2RGB)
    original_shape = image.shape
    interpolation = cv2.INTER_CUBIC

    # size_limit: Union[int, str] = request.form.get("sizeLimit", "1080")
    # if size_limit == "Original":
    size_limit = max(image.shape)
    # else:
    #     size_limit = int(size_limit)

    print(f"Origin image shape: {original_shape}")
    image = resize_max_size(image, size_limit=size_limit, interpolation=interpolation)
    print(f"Resized image shape: {image.shape}")
    image = norm_img(image)

    mask = 255 - mask[:, :, 3]
    mask = resize_max_size(mask, size_limit=size_limit, interpolation=interpolation)
    mask = norm_img(mask)

    res_np_img = run(image, mask)

    return cv2.cvtColor(res_np_img, cv2.COLOR_BGR2RGB)
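A minimal driver sketch for the two entry points above (not part of the upload; the file names and the selected rectangle are hypothetical, and core.py must find ./assets/big-lama.pt at import time). Both functions read the user-drawn region from the alpha channel of an RGBA "mask image", where alpha < 255 marks the selection:

import cv2
import numpy as np
from src.core import s_image, process_inpaint

bgr = cv2.imread("photo.jpg")                 # hypothetical input image
rgba = cv2.cvtColor(bgr, cv2.COLOR_BGR2RGBA)
mask_rgba = np.zeros_like(rgba)
mask_rgba[:, :, 3] = 255                      # fully opaque = nothing selected
mask_rgba[100:200, 150:300, 3] = 0            # transparent block = selected region

# Seam-carve 50 columns away while protecting the selection (dy=hs, dx=vs)
carved = s_image(rgba, mask_rgba, vs=-50, hs=0, mode="resize")
cv2.imwrite("carved.png", cv2.cvtColor(carved.astype(np.uint8), cv2.COLOR_RGB2BGR))

# Or erase the selected region with the LaMa inpainting model
inpainted = process_inpaint(rgba, mask_rgba)
cv2.imwrite("inpainted.png", cv2.cvtColor(inpainted, cv2.COLOR_RGB2BGR))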
src/helper.py
ADDED
@@ -0,0 +1,87 @@
import os
import sys

from urllib.parse import urlparse
import cv2
import numpy as np
import torch
from torch.hub import download_url_to_file, get_dir

LAMA_MODEL_URL = os.environ.get(
    "LAMA_MODEL_URL",
    "https://github.com/Sanster/models/releases/download/add_big_lama/big-lama.pt",
)


def download_model(url=LAMA_MODEL_URL):
    parts = urlparse(url)
    hub_dir = get_dir()
    model_dir = os.path.join(hub_dir, "checkpoints")
    if not os.path.isdir(model_dir):
        os.makedirs(os.path.join(model_dir, "hub", "checkpoints"))
    filename = os.path.basename(parts.path)
    cached_file = os.path.join(model_dir, filename)
    if not os.path.exists(cached_file):
        sys.stderr.write('Downloading: "{}" to {}\n'.format(url, cached_file))
        hash_prefix = None
        download_url_to_file(url, cached_file, hash_prefix, progress=True)
    return cached_file


def ceil_modulo(x, mod):
    # round x up to the nearest multiple of mod
    if x % mod == 0:
        return x
    return (x // mod + 1) * mod


def numpy_to_bytes(image_numpy: np.ndarray) -> bytes:
    data = cv2.imencode(".jpg", image_numpy)[1]
    image_bytes = data.tobytes()
    return image_bytes


def load_img(img_bytes, gray: bool = False):
    nparr = np.frombuffer(img_bytes, np.uint8)
    if gray:
        np_img = cv2.imdecode(nparr, cv2.IMREAD_GRAYSCALE)
    else:
        np_img = cv2.imdecode(nparr, cv2.IMREAD_UNCHANGED)
        if len(np_img.shape) == 3 and np_img.shape[2] == 4:
            np_img = cv2.cvtColor(np_img, cv2.COLOR_BGRA2RGB)
        else:
            np_img = cv2.cvtColor(np_img, cv2.COLOR_BGR2RGB)

    return np_img


def norm_img(np_img):
    # HWC uint8 -> CHW float32 in [0, 1]
    if len(np_img.shape) == 2:
        np_img = np_img[:, :, np.newaxis]
    np_img = np.transpose(np_img, (2, 0, 1))
    np_img = np_img.astype("float32") / 255
    return np_img


def resize_max_size(
    np_img, size_limit: int, interpolation=cv2.INTER_CUBIC
) -> np.ndarray:
    # Resize the image so its longer side equals size_limit, if it exceeds size_limit
    h, w = np_img.shape[:2]
    if max(h, w) > size_limit:
        ratio = size_limit / max(h, w)
        new_w = int(w * ratio + 0.5)
        new_h = int(h * ratio + 0.5)
        return cv2.resize(np_img, dsize=(new_w, new_h), interpolation=interpolation)
    else:
        return np_img


def pad_img_to_modulo(img, mod):
    # CHW -> CHW with H and W padded up to multiples of mod
    channels, height, width = img.shape
    out_height = ceil_modulo(height, mod)
    out_width = ceil_modulo(width, mod)
    return np.pad(
        img,
        ((0, 0), (0, out_height - height), (0, out_width - width)),
        mode="symmetric",
    )
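A quick, self-contained sketch of how the array helpers compose (illustrative only; shapes follow the HWC-in, CHW-out convention the functions expect):

import numpy as np
from src.helper import ceil_modulo, norm_img, pad_img_to_modulo

hwc = np.zeros((301, 500, 3), dtype=np.uint8)   # H x W x C image
chw = norm_img(hwc)                             # -> (3, 301, 500), float32 in [0, 1]
padded = pad_img_to_modulo(chw, mod=8)          # -> (3, 304, 504), symmetric padding
assert padded.shape[1] == ceil_modulo(301, 8) == 304
assert padded.shape[2] == ceil_modulo(500, 8) == 504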
src/pipeline_stable_diffusion_controlnet_inpaint.py
ADDED
@@ -0,0 +1,500 @@
import torch
import PIL.Image
import numpy as np

# The star import pulls in the base pipeline plus the typing aliases and
# helpers used below (List, Optional, Callable, replace_example_docstring, ...).
from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_controlnet import *

EXAMPLE_DOC_STRING = """
    Examples:
        ```py
        >>> # !pip install opencv-python transformers accelerate
        >>> from diffusers import StableDiffusionControlNetInpaintPipeline, ControlNetModel, UniPCMultistepScheduler
        >>> from diffusers.utils import load_image
        >>> import numpy as np
        >>> import torch
        >>> import cv2
        >>> from PIL import Image

        >>> # download an image
        >>> image = load_image(
        ...     "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo.png"
        ... )
        >>> image = np.array(image)
        >>> mask_image = load_image(
        ...     "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo_mask.png"
        ... )
        >>> mask_image = np.array(mask_image)

        >>> # get canny image
        >>> canny_image = cv2.Canny(image, 100, 200)
        >>> canny_image = canny_image[:, :, None]
        >>> canny_image = np.concatenate([canny_image, canny_image, canny_image], axis=2)
        >>> canny_image = Image.fromarray(canny_image)

        >>> # load control net and stable diffusion v1-5
        >>> controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-canny", torch_dtype=torch.float16)
        >>> pipe = StableDiffusionControlNetInpaintPipeline.from_pretrained(
        ...     "runwayml/stable-diffusion-inpainting", controlnet=controlnet, torch_dtype=torch.float16
        ... )

        >>> # speed up diffusion process with faster scheduler and memory optimization
        >>> pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config)
        >>> # remove following line if xformers is not installed
        >>> pipe.enable_xformers_memory_efficient_attention()
        >>> pipe.enable_model_cpu_offload()

        >>> # generate image
        >>> generator = torch.manual_seed(0)
        >>> image = pipe(
        ...     "futuristic-looking doggo",
        ...     num_inference_steps=20,
        ...     generator=generator,
        ...     image=image,
        ...     control_image=canny_image,
        ...     mask_image=mask_image,
        ... ).images[0]
        ```
"""


def prepare_mask_and_masked_image(image, mask):
    """
    Prepares a pair (image, mask) to be consumed by the Stable Diffusion pipeline. This means that those inputs will be
    converted to ``torch.Tensor`` with shapes ``batch x channels x height x width`` where ``channels`` is ``3`` for the
    ``image`` and ``1`` for the ``mask``.
    The ``image`` will be converted to ``torch.float32`` and normalized to be in ``[-1, 1]``. The ``mask`` will be
    binarized (``mask > 0.5``) and cast to ``torch.float32`` too.
    Args:
        image (Union[np.array, PIL.Image, torch.Tensor]): The image to inpaint.
            It can be a ``PIL.Image``, or a ``height x width x 3`` ``np.array`` or a ``channels x height x width``
            ``torch.Tensor`` or a ``batch x channels x height x width`` ``torch.Tensor``.
        mask (_type_): The mask to apply to the image, i.e. regions to inpaint.
            It can be a ``PIL.Image``, or a ``height x width`` ``np.array`` or a ``1 x height x width``
            ``torch.Tensor`` or a ``batch x 1 x height x width`` ``torch.Tensor``.
    Raises:
        ValueError: ``torch.Tensor`` images should be in the ``[-1, 1]`` range. ValueError: ``torch.Tensor`` mask
        should be in the ``[0, 1]`` range. ValueError: ``mask`` and ``image`` should have the same spatial dimensions.
        TypeError: ``mask`` is a ``torch.Tensor`` but ``image`` is not
        (or the other way around).
    Returns:
        tuple[torch.Tensor]: The pair (mask, masked_image) as ``torch.Tensor`` with 4
        dimensions: ``batch x channels x height x width``.
    """
    if isinstance(image, torch.Tensor):
        if not isinstance(mask, torch.Tensor):
            raise TypeError(f"`image` is a torch.Tensor but `mask` (type: {type(mask)} is not")

        # Batch single image
        if image.ndim == 3:
            assert image.shape[0] == 3, "Image outside a batch should be of shape (3, H, W)"
            image = image.unsqueeze(0)

        # Batch and add channel dim for single mask
        if mask.ndim == 2:
            mask = mask.unsqueeze(0).unsqueeze(0)

        # Batch single mask or add channel dim
        if mask.ndim == 3:
            # Single batched mask, no channel dim or single mask not batched but channel dim
            if mask.shape[0] == 1:
                mask = mask.unsqueeze(0)

            # Batched masks no channel dim
            else:
                mask = mask.unsqueeze(1)

        assert image.ndim == 4 and mask.ndim == 4, "Image and Mask must have 4 dimensions"
        assert image.shape[-2:] == mask.shape[-2:], "Image and Mask must have the same spatial dimensions"
        assert image.shape[0] == mask.shape[0], "Image and Mask must have the same batch size"

        # Check image is in [-1, 1]
        if image.min() < -1 or image.max() > 1:
            raise ValueError("Image should be in [-1, 1] range")

        # Check mask is in [0, 1]
        if mask.min() < 0 or mask.max() > 1:
            raise ValueError("Mask should be in [0, 1] range")

        # Binarize mask
        mask[mask < 0.5] = 0
        mask[mask >= 0.5] = 1

        # Image as float32
        image = image.to(dtype=torch.float32)
    elif isinstance(mask, torch.Tensor):
        raise TypeError(f"`mask` is a torch.Tensor but `image` (type: {type(image)} is not")
    else:
        # preprocess image
        if isinstance(image, (PIL.Image.Image, np.ndarray)):
            image = [image]

        if isinstance(image, list) and isinstance(image[0], PIL.Image.Image):
            image = [np.array(i.convert("RGB"))[None, :] for i in image]
            image = np.concatenate(image, axis=0)
        elif isinstance(image, list) and isinstance(image[0], np.ndarray):
            image = np.concatenate([i[None, :] for i in image], axis=0)

        image = image.transpose(0, 3, 1, 2)
        image = torch.from_numpy(image).to(dtype=torch.float32) / 127.5 - 1.0

        # preprocess mask
        if isinstance(mask, (PIL.Image.Image, np.ndarray)):
            mask = [mask]

        if isinstance(mask, list) and isinstance(mask[0], PIL.Image.Image):
            mask = np.concatenate([np.array(m.convert("L"))[None, None, :] for m in mask], axis=0)
            mask = mask.astype(np.float32) / 255.0
        elif isinstance(mask, list) and isinstance(mask[0], np.ndarray):
            mask = np.concatenate([m[None, None, :] for m in mask], axis=0)

        mask[mask < 0.5] = 0
        mask[mask >= 0.5] = 1
        mask = torch.from_numpy(mask)

    masked_image = image * (mask < 0.5)

    return mask, masked_image


class StableDiffusionControlNetInpaintPipeline(StableDiffusionControlNetPipeline):
    r"""
    Pipeline for text-guided image inpainting using Stable Diffusion with ControlNet guidance.
    This model inherits from [`StableDiffusionControlNetPipeline`]. Check the superclass documentation for the generic methods the
    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
    Args:
        vae ([`AutoencoderKL`]):
            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
        text_encoder ([`CLIPTextModel`]):
            Frozen text-encoder. Stable Diffusion uses the text portion of
            [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
            the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
        tokenizer (`CLIPTokenizer`):
            Tokenizer of class
            [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
        unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
        controlnet ([`ControlNetModel`]):
            Provides additional conditioning to the unet during the denoising process.
        scheduler ([`SchedulerMixin`]):
            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
        safety_checker ([`StableDiffusionSafetyChecker`]):
            Classification module that estimates whether generated images could be considered offensive or harmful.
            Please, refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for details.
        feature_extractor ([`CLIPFeatureExtractor`]):
            Model that extracts features from generated images to be used as inputs for the `safety_checker`.
    """

    def prepare_mask_latents(
        self, mask, masked_image, batch_size, height, width, dtype, device, generator, do_classifier_free_guidance
    ):
        # resize the mask to latents shape as we concatenate the mask to the latents
        # we do that before converting to dtype to avoid breaking in case we're using cpu_offload
        # and half precision
        mask = torch.nn.functional.interpolate(
            mask, size=(height // self.vae_scale_factor, width // self.vae_scale_factor)
        )
        mask = mask.to(device=device, dtype=dtype)

        masked_image = masked_image.to(device=device, dtype=dtype)

        # encode the mask image into latents space so we can concatenate it to the latents
        if isinstance(generator, list):
            masked_image_latents = [
                self.vae.encode(masked_image[i : i + 1]).latent_dist.sample(generator=generator[i])
                for i in range(batch_size)
            ]
            masked_image_latents = torch.cat(masked_image_latents, dim=0)
        else:
            masked_image_latents = self.vae.encode(masked_image).latent_dist.sample(generator=generator)
        masked_image_latents = self.vae.config.scaling_factor * masked_image_latents

        # duplicate mask and masked_image_latents for each generation per prompt, using mps friendly method
        if mask.shape[0] < batch_size:
            if not batch_size % mask.shape[0] == 0:
                raise ValueError(
                    "The passed mask and the required batch size don't match. Masks are supposed to be duplicated to"
                    f" a total batch size of {batch_size}, but {mask.shape[0]} masks were passed. Make sure the number"
                    " of masks that you pass is divisible by the total requested batch size."
                )
            mask = mask.repeat(batch_size // mask.shape[0], 1, 1, 1)
        if masked_image_latents.shape[0] < batch_size:
            if not batch_size % masked_image_latents.shape[0] == 0:
                raise ValueError(
                    "The passed images and the required batch size don't match. Images are supposed to be duplicated"
                    f" to a total batch size of {batch_size}, but {masked_image_latents.shape[0]} images were passed."
                    " Make sure the number of images that you pass is divisible by the total requested batch size."
                )
            masked_image_latents = masked_image_latents.repeat(batch_size // masked_image_latents.shape[0], 1, 1, 1)

        mask = torch.cat([mask] * 2) if do_classifier_free_guidance else mask
        masked_image_latents = (
            torch.cat([masked_image_latents] * 2) if do_classifier_free_guidance else masked_image_latents
        )

        # aligning device to prevent device errors when concating it with the latent model input
        masked_image_latents = masked_image_latents.to(device=device, dtype=dtype)
        return mask, masked_image_latents

    @torch.no_grad()
    @replace_example_docstring(EXAMPLE_DOC_STRING)
    def __call__(
        self,
        prompt: Union[str, List[str]] = None,
        image: Union[torch.FloatTensor, PIL.Image.Image] = None,
        control_image: Union[torch.FloatTensor, PIL.Image.Image, List[torch.FloatTensor], List[PIL.Image.Image]] = None,
        mask_image: Union[torch.FloatTensor, PIL.Image.Image] = None,
        height: Optional[int] = None,
        width: Optional[int] = None,
        num_inference_steps: int = 50,
        guidance_scale: float = 7.5,
        negative_prompt: Optional[Union[str, List[str]]] = None,
        num_images_per_prompt: Optional[int] = 1,
        eta: float = 0.0,
        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
        latents: Optional[torch.FloatTensor] = None,
        prompt_embeds: Optional[torch.FloatTensor] = None,
        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
        output_type: Optional[str] = "pil",
        return_dict: bool = True,
        callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
        callback_steps: int = 1,
        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
        controlnet_conditioning_scale: float = 1.0,
    ):
        r"""
        Function invoked when calling the pipeline for generation.
        Args:
            prompt (`str` or `List[str]`, *optional*):
                The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`
                instead.
            image (`PIL.Image.Image`):
                `Image`, or tensor representing an image batch which will be inpainted, *i.e.* parts of the image will
                be masked out with `mask_image` and repainted according to `prompt`.
            control_image (`torch.FloatTensor`, `PIL.Image.Image`, `List[torch.FloatTensor]` or `List[PIL.Image.Image]`):
                The ControlNet input condition. ControlNet uses this input condition to generate guidance to Unet. If
                the type is specified as `torch.FloatTensor`, it is passed to ControlNet as is. `PIL.Image.Image` can
                also be accepted as an image. The control image is automatically resized to fit the output image.
            mask_image (`PIL.Image.Image`):
                `Image`, or tensor representing an image batch, to mask `image`. White pixels in the mask will be
                repainted, while black pixels will be preserved. If `mask_image` is a PIL image, it will be converted
                to a single channel (luminance) before use. If it's a tensor, it should contain one color channel (L)
                instead of 3, so the expected shape would be `(B, H, W, 1)`.
            height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
                The height in pixels of the generated image.
            width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
                The width in pixels of the generated image.
            num_inference_steps (`int`, *optional*, defaults to 50):
                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
                expense of slower inference.
            guidance_scale (`float`, *optional*, defaults to 7.5):
                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
                `guidance_scale` is defined as `w` of equation 2. of [Imagen
                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
                1`. Higher guidance scale encourages generating images that are closely linked to the text `prompt`,
                usually at the expense of lower image quality.
            negative_prompt (`str` or `List[str]`, *optional*):
                The prompt or prompts not to guide the image generation. If not defined, one has to pass
                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale`
                is less than `1`).
            num_images_per_prompt (`int`, *optional*, defaults to 1):
                The number of images to generate per prompt.
            eta (`float`, *optional*, defaults to 0.0):
                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
                [`schedulers.DDIMScheduler`], will be ignored for others.
            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
                to make generation deterministic.
            latents (`torch.FloatTensor`, *optional*):
                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
                tensor will be generated by sampling using the supplied random `generator`.
            prompt_embeds (`torch.FloatTensor`, *optional*):
                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
                provided, text embeddings will be generated from `prompt` input argument.
            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
                argument.
            output_type (`str`, *optional*, defaults to `"pil"`):
                The output format of the generated image. Choose between
                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
                plain tuple.
            callback (`Callable`, *optional*):
                A function that will be called every `callback_steps` steps during inference. The function will be
                called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
            callback_steps (`int`, *optional*, defaults to 1):
                The frequency at which the `callback` function will be called. If not specified, the callback will be
                called at every step.
            cross_attention_kwargs (`dict`, *optional*):
                A kwargs dictionary that if specified is passed along to the `AttnProcessor` as defined under
                `self.processor` in
                [diffusers.cross_attention](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/cross_attention.py).
            controlnet_conditioning_scale (`float`, *optional*, defaults to 1.0):
                The outputs of the controlnet are multiplied by `controlnet_conditioning_scale` before they are added
                to the residual in the original unet.
        Examples:
        Returns:
            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a
            `tuple`. When returning a tuple, the first element is a list with the generated images, and the second
            element is a list of `bool`s denoting whether the corresponding generated image likely represents
            "not-safe-for-work" (nsfw) content, according to the `safety_checker`.
        """
        # 0. Default height and width to unet
        height, width = self._default_height_width(height, width, control_image)

        # 1. Check inputs. Raise error if not correct
        self.check_inputs(
            prompt, control_image, height, width, callback_steps, negative_prompt, prompt_embeds, negative_prompt_embeds
        )

        # 2. Define call parameters
        if prompt is not None and isinstance(prompt, str):
            batch_size = 1
        elif prompt is not None and isinstance(prompt, list):
            batch_size = len(prompt)
        else:
            batch_size = prompt_embeds.shape[0]

        device = self._execution_device
        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
        # corresponds to doing no classifier free guidance.
        do_classifier_free_guidance = guidance_scale > 1.0

        # 3. Encode input prompt
        prompt_embeds = self._encode_prompt(
            prompt,
            device,
            num_images_per_prompt,
            do_classifier_free_guidance,
            negative_prompt,
            prompt_embeds=prompt_embeds,
            negative_prompt_embeds=negative_prompt_embeds,
        )

        # 4. Prepare image
        control_image = self.prepare_image(
            control_image,
            width,
            height,
            batch_size * num_images_per_prompt,
            num_images_per_prompt,
            device,
            self.controlnet.dtype,
        )

        if do_classifier_free_guidance:
            control_image = torch.cat([control_image] * 2)

        # 5. Prepare timesteps
        self.scheduler.set_timesteps(num_inference_steps, device=device)
        timesteps = self.scheduler.timesteps

        # 6. Prepare latent variables
        num_channels_latents = self.controlnet.in_channels
        latents = self.prepare_latents(
            batch_size * num_images_per_prompt,
            num_channels_latents,
            height,
            width,
            prompt_embeds.dtype,
            device,
            generator,
            latents,
        )

        # EXTRA: prepare mask latents
        mask, masked_image = prepare_mask_and_masked_image(image, mask_image)
        mask, masked_image_latents = self.prepare_mask_latents(
            mask,
            masked_image,
            batch_size * num_images_per_prompt,
            height,
            width,
            prompt_embeds.dtype,
            device,
            generator,
            do_classifier_free_guidance,
        )

        # 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)

        # 8. Denoising loop
        num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
        with self.progress_bar(total=num_inference_steps) as progress_bar:
            for i, t in enumerate(timesteps):
                # expand the latents if we are doing classifier free guidance
                latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
                latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)

                down_block_res_samples, mid_block_res_sample = self.controlnet(
                    latent_model_input,
                    t,
                    encoder_hidden_states=prompt_embeds,
                    controlnet_cond=control_image,
                    return_dict=False,
                )

                down_block_res_samples = [
                    down_block_res_sample * controlnet_conditioning_scale
                    for down_block_res_sample in down_block_res_samples
                ]
                mid_block_res_sample *= controlnet_conditioning_scale

                # predict the noise residual
                latent_model_input = torch.cat([latent_model_input, mask, masked_image_latents], dim=1)
                noise_pred = self.unet(
                    latent_model_input,
                    t,
                    encoder_hidden_states=prompt_embeds,
                    cross_attention_kwargs=cross_attention_kwargs,
                    down_block_additional_residuals=down_block_res_samples,
                    mid_block_additional_residual=mid_block_res_sample,
                ).sample

                # perform guidance
                if do_classifier_free_guidance:
                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
                    noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)

                # compute the previous noisy sample x_t -> x_t-1
                latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample

                # call the callback, if provided
                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
                    progress_bar.update()
                    if callback is not None and i % callback_steps == 0:
                        callback(i, t, latents)

        # If we do sequential model offloading, let's offload unet and controlnet
        # manually for max memory savings
        if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
            self.unet.to("cpu")
            self.controlnet.to("cpu")
            torch.cuda.empty_cache()

        if output_type == "latent":
            image = latents
            has_nsfw_concept = None
        elif output_type == "pil":
            # 8. Post-processing
            image = self.decode_latents(latents)

            # 9. Run safety checker
            image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)

            # 10. Convert to PIL
            image = self.numpy_to_pil(image)
        else:
            # 8. Post-processing
            image = self.decode_latents(latents)

            # 9. Run safety checker
            image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)

        # Offload last model to CPU
        if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
            self.final_offload_hook.offload()

        if not return_dict:
            return (image, has_nsfw_concept)

        return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
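For reference, a small sketch (not part of the upload) of the tensor shapes `prepare_mask_and_masked_image` produces for PIL inputs; the 512x512 size and the dummy images are arbitrary assumptions:

import PIL.Image
from src.pipeline_stable_diffusion_controlnet_inpaint import prepare_mask_and_masked_image

image = PIL.Image.new("RGB", (512, 512))          # dummy image
mask = PIL.Image.new("L", (512, 512), color=255)  # white = region to repaint

mask_t, masked_t = prepare_mask_and_masked_image(image, mask)
print(mask_t.shape)    # torch.Size([1, 1, 512, 512]), values binarized to 0./1.
print(masked_t.shape)  # torch.Size([1, 3, 512, 512]), image in [-1, 1], masked pixels zeroed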
src/st_style.py
ADDED
@@ -0,0 +1,42 @@
button_style = """
<style>
div.stButton > button:first-child {
    background-color: rgb(255, 75, 75);
    color: rgb(255, 255, 255);
}
div.stButton > button:hover {
    background-color: rgb(255, 75, 75);
    color: rgb(255, 255, 255);
}
div.stButton > button:active {
    background-color: rgb(255, 75, 75);
    color: rgb(255, 255, 255);
}
div.stButton > button:focus {
    background-color: rgb(255, 75, 75);
    color: rgb(255, 255, 255);
}
.css-1cpxqw2:focus:not(:active) {
    background-color: rgb(255, 75, 75);
    border-color: rgb(255, 75, 75);
    color: rgb(255, 255, 255);
}
</style>
"""

style = """
<style>
#MainMenu {
    visibility: hidden;
}
footer {
    visibility: hidden;
}
header {
    visibility: hidden;
}
</style>
"""


def apply_prod_style(st):
    return st.markdown(style, unsafe_allow_html=True)
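A sketch of how a Streamlit app might consume this module (assumed usage: `apply_prod_style` only injects `style`, so `button_style` has to be injected separately; note the `.css-1cpxqw2` selector is tied to a particular Streamlit build and may need updating):

import streamlit as st
from src.st_style import apply_prod_style, button_style

apply_prod_style(st)                               # hides Streamlit's menu, header, and footer
st.markdown(button_style, unsafe_allow_html=True)  # injects the red button styling
if st.button("Run"):
    st.write("clicked")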