Upload 20 files
- .gitattributes +1 -0
- ai_face.png +3 -0
- feature_extractor/preprocessor_config.json +27 -0
- image_encoder.bin +3 -0
- image_encoder.xml +0 -0
- infer_face.py +529 -0
- input.jpg +0 -0
- scheduler/scheduler_config.json +22 -0
- sd_quant_face.py +790 -0
- text_encoder.bin +3 -0
- text_encoder.xml +0 -0
- tokenizer/merges.txt +0 -0
- tokenizer/special_tokens_map.json +30 -0
- tokenizer/tokenizer_config.json +30 -0
- tokenizer/vocab.json +0 -0
- unet.bin +3 -0
- unet.xml +0 -0
- vae_decoder.bin +3 -0
- vae_decoder.xml +0 -0
- vae_encoder.bin +3 -0
- vae_encoder.xml +0 -0
.gitattributes
CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+ai_face.png filter=lfs diff=lfs merge=lfs -text
ai_face.png
ADDED
Git LFS Details
feature_extractor/preprocessor_config.json
ADDED
@@ -0,0 +1,27 @@
{
  "crop_size": {
    "height": 224,
    "width": 224
  },
  "do_center_crop": true,
  "do_convert_rgb": true,
  "do_normalize": true,
  "do_rescale": true,
  "do_resize": true,
  "image_mean": [
    0.48145466,
    0.4578275,
    0.40821073
  ],
  "image_processor_type": "CLIPFeatureExtractor",
  "image_std": [
    0.26862954,
    0.26130258,
    0.27577711
  ],
  "resample": 3,
  "rescale_factor": 0.00392156862745098,
  "size": {
    "shortest_edge": 224
  }
}
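This config drives the pipeline's feature_extractor. A minimal sketch (not part of the uploaded files) of how it is loaded and applied to the IP-Adapter input image, assuming the repository has been downloaded to a local folder named on-canvers-disney-v3.9.1-ov-face (the folder name used in infer_face.py):

from pathlib import Path
from PIL import Image
from transformers import CLIPImageProcessor

models_dir = Path("on-canvers-disney-v3.9.1-ov-face")
feature_extractor = CLIPImageProcessor.from_pretrained(models_dir / "feature_extractor")

# Resizes/crops to 224x224 and normalizes with the CLIP mean/std from this config;
# the resulting pixel_values tensor is what encode_image() feeds to image_encoder.xml.
pixel_values = feature_extractor(Image.open("ai_face.png").convert("RGB"), return_tensors="pt").pixel_values
print(pixel_values.shape)  # expected: torch.Size([1, 3, 224, 224])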
image_encoder.bin
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:92acd3113efe615c1bae76084914e0c95835f2cef5f7d044c7e217ffe813ddac
size 1264153732
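image_encoder.bin and the other *.bin weights in this commit are stored as Git LFS pointers; only this short pointer text lives in git, while the ~1.2 GB payload is in LFS. A sketch (not part of the uploaded files) of fetching the resolved files with huggingface_hub; the repo_id below is a placeholder for this repository's actual id:

from huggingface_hub import snapshot_download

# snapshot_download resolves the LFS pointers and places the real .bin/.xml
# payloads in a local folder that infer_face.py can point at via models_dir.
local_dir = snapshot_download(repo_id="<user>/on-canvers-disney-v3.9.1-ov-face")  # placeholder repo_id
print(local_dir)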
image_encoder.xml
ADDED
The diff for this file is too large to render.
infer_face.py
ADDED
@@ -0,0 +1,529 @@
import inspect
from typing import List, Optional, Union, Dict, Tuple
import numpy as np

from pathlib import Path
from diffusers import AutoPipelineForText2Image
from transformers import CLIPVisionModelWithProjection
from diffusers.utils import load_image
from diffusers import LCMScheduler

import PIL
import cv2
import torch
import openvino as ov

from transformers import CLIPTokenizer, CLIPImageProcessor
from diffusers import DiffusionPipeline
from diffusers.pipelines.stable_diffusion.pipeline_output import (
    StableDiffusionPipelineOutput,
)
from diffusers.schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler
from resampler import Resampler


def scale_fit_to_window(dst_width: int, dst_height: int, image_width: int, image_height: int):
    """
    Preprocessing helper for calculating the resize size while preserving the original aspect ratio
    and fitting the image to a specific window size.

    Parameters:
        dst_width (int): destination window width
        dst_height (int): destination window height
        image_width (int): source image width
        image_height (int): source image height
    Returns:
        result_width (int): calculated width for resize
        result_height (int): calculated height for resize
    """
    im_scale = min(dst_height / image_height, dst_width / image_width)
    return int(im_scale * image_width), int(im_scale * image_height)


def randn_tensor(
    shape: Union[Tuple, List],
    generator: Optional[Union[List["torch.Generator"], "torch.Generator"]] = None,
    dtype: Optional["torch.dtype"] = None,
):
    """A helper function to create random tensors on the desired `device` with the desired `dtype`. When
    passing a list of generators, you can seed each batch size individually.
    """
    batch_size = shape[0]
    rand_device = torch.device("cpu")

    # make sure generator list of length 1 is treated like a non-list
    if isinstance(generator, list) and len(generator) == 1:
        generator = generator[0]

    if isinstance(generator, list):
        shape = (1,) + shape[1:]
        latents = [torch.randn(shape, generator=generator[i], device=rand_device, dtype=dtype) for i in range(batch_size)]
        latents = torch.cat(latents, dim=0)
    else:
        latents = torch.randn(shape, generator=generator, device=rand_device, dtype=dtype)

    return latents


def preprocess(image: PIL.Image.Image, height, width):
    """
    Image preprocessing function. Takes an image in PIL.Image format, resizes it to keep the aspect ratio and fit the model input window 512x512,
    then converts it to np.ndarray and pads with zeros on the right or bottom side (depending on the aspect ratio), after that
    converts data to float32 and rescales values from [0, 255] to [-1, 1], and finally converts the data layout from NHWC to NCHW.
    The function returns the preprocessed input tensor and padding size, which can be used in postprocessing.

    Parameters:
        image (PIL.Image.Image): input image
    Returns:
        image (np.ndarray): preprocessed image tensor
        meta (Dict): dictionary with preprocessing metadata info
    """
    src_width, src_height = image.size
    dst_width, dst_height = scale_fit_to_window(width, height, src_width, src_height)
    image = np.array(image.resize((dst_width, dst_height), resample=PIL.Image.Resampling.LANCZOS))[None, :]
    print(image.shape)
    pad_width = width - dst_width
    pad_height = height - dst_height
    pad = ((0, 0), (0, pad_height), (0, pad_width), (0, 0))
    image = np.pad(image, pad, mode="constant")
    image = image.astype(np.float32) / 255.0
    # image = image.astype(np.float16) / 255.0
    image = 2.0 * image - 1.0
    image = image.transpose(0, 3, 1, 2)
    print(image.shape)
    return image, {"padding": pad, "src_width": src_width, "src_height": src_height}


class OVStableDiffusionPipeline(DiffusionPipeline):
    def __init__(
        self,
        vae_decoder: ov.Model,
        text_encoder: ov.Model,
        tokenizer: CLIPTokenizer,
        unet: ov.Model,
        scheduler: Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler],
        image_encoder: ov.Model,
        feature_extractor: CLIPImageProcessor,
        vae_encoder: ov.Model,
    ):
        """
        Pipeline for text-to-image generation using Stable Diffusion and IP-Adapter with OpenVINO
        Parameters:
            vae_decoder (ov.Model):
                Variational Auto-Encoder (VAE) model to decode images to and from latent representations.
            text_encoder (ov.Model):
                Frozen text-encoder. Stable Diffusion uses the text portion of
                CLIP (https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
                the clip-vit-large-patch14 (https://huggingface.co/openai/clip-vit-large-patch14) variant.
            tokenizer (CLIPTokenizer):
                Tokenizer of class CLIPTokenizer (https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
            unet (ov.Model): Conditional U-Net architecture to denoise the encoded image latents.
            scheduler (SchedulerMixin):
                A scheduler to be used in combination with unet to denoise the encoded image latents.
            image_encoder (ov.Model):
                IP-Adapter image encoder for embedding the input image as an image prompt for generation.
            feature_extractor (CLIPImageProcessor):
                Image processor that prepares the IP-Adapter input image for the image encoder.
        """
        super().__init__()
        self.scheduler = scheduler
        self.vae_decoder = vae_decoder
        self.image_encoder = image_encoder
        self.text_encoder = text_encoder
        self.unet = unet
        self.height = 512
        self.width = 512
        self.vae_scale_factor = 8
        self.tokenizer = tokenizer
        self.vae_encoder = vae_encoder
        self.feature_extractor = feature_extractor

    def __call__(
        self,
        prompt: Union[str, List[str]],
        ip_adapter_image: PIL.Image.Image,
        image: PIL.Image.Image = None,
        num_inference_steps: Optional[int] = 4,
        negative_prompt: Union[str, List[str]] = None,
        guidance_scale: Optional[float] = 0.5,
        eta: Optional[float] = 0.0,
        output_type: Optional[str] = "pil",
        height: Optional[int] = None,
        width: Optional[int] = None,
        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
        latents: Optional[torch.FloatTensor] = None,
        strength: float = 1.0,
        **kwargs,
    ):
        """
        Function invoked when calling the pipeline for generation.
        Parameters:
            prompt (str or List[str]):
                The prompt or prompts to guide the image generation.
            image (PIL.Image.Image, *optional*, None):
                Initial image for generation.
            num_inference_steps (int, *optional*, defaults to 4):
                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
                expense of slower inference.
            negative_prompt (str or List[str]):
                The negative prompt or prompts to guide the image generation.
            guidance_scale (float, *optional*, defaults to 0.5):
                Guidance scale as defined in Classifier-Free Diffusion Guidance (https://arxiv.org/abs/2207.12598).
                guidance_scale is defined as `w` of equation 2.
                Higher guidance scale encourages generating images that are closely linked to the text prompt,
                usually at the expense of lower image quality.
            eta (float, *optional*, defaults to 0.0):
                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
                [DDIMScheduler], will be ignored for others.
            output_type (`str`, *optional*, defaults to "pil"):
                The output format of the generated image. Choose between
                [PIL](https://pillow.readthedocs.io/en/stable/): PIL.Image.Image or np.array.
            height (int, *optional*, 512):
                Generated image height
            width (int, *optional*, 512):
                Generated image width
            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
                generation deterministic.
            latents (`torch.FloatTensor`, *optional*):
                Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
                tensor is generated by sampling using the supplied random `generator`.
        Returns:
            Dictionary with keys:
                sample - the last generated image PIL.Image.Image or np.array
                iterations - *optional* (if gif=True) images for all diffusion steps, List of PIL.Image.Image or np.array.
        """
        do_classifier_free_guidance = guidance_scale > 1.0
        # get prompt text embeddings
        text_embeddings = self._encode_prompt(
            prompt,
            do_classifier_free_guidance=do_classifier_free_guidance,
            negative_prompt=negative_prompt,
        )
        # get ip-adapter image embeddings
        image_embeds, negative_image_embeds = self.encode_image(ip_adapter_image)
        if do_classifier_free_guidance:
            image_embeds = np.concatenate([negative_image_embeds, image_embeds])

        # set timesteps
        accepts_offset = "offset" in set(inspect.signature(self.scheduler.set_timesteps).parameters.keys())
        extra_set_kwargs = {}
        if accepts_offset:
            extra_set_kwargs["offset"] = 1

        self.scheduler.set_timesteps(num_inference_steps, **extra_set_kwargs)

        timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength)
        latent_timestep = timesteps[:1]

        print(num_inference_steps, timesteps)

        # get the initial random noise unless the user supplied it
        latents, meta = self.prepare_latents(
            1,
            4,
            height or self.height,
            width or self.width,
            generator=generator,
            latents=latents,
            image=image,
            latent_timestep=latent_timestep,
        )

        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
        # and should be between [0, 1]
        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
        extra_step_kwargs = {}
        if accepts_eta:
            extra_step_kwargs["eta"] = eta

        for i, t in enumerate(self.progress_bar(timesteps)):
            # expand the latents if you are doing classifier free guidance
            latent_model_input = np.concatenate([latents] * 2) if do_classifier_free_guidance else latents
            latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)

            # predict the noise residual
            noise_pred = self.unet([latent_model_input, t, text_embeddings, image_embeds])[0]
            # perform guidance
            if do_classifier_free_guidance:
                noise_pred_uncond, noise_pred_text = noise_pred[0], noise_pred[1]
                noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)

            # compute the previous noisy sample x_t -> x_t-1
            latents = self.scheduler.step(
                torch.from_numpy(noise_pred),
                t,
                torch.from_numpy(latents),
                **extra_step_kwargs,
            )["prev_sample"].numpy()

        # scale and decode the image latents with vae
        image = self.vae_decoder(latents * (1 / 0.18215))[0]

        image = self.postprocess_image(image, meta, output_type)
        return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=False)

    def _encode_prompt(
        self,
        prompt: Union[str, List[str]],
        num_images_per_prompt: int = 1,
        do_classifier_free_guidance: bool = True,
        negative_prompt: Union[str, List[str]] = None,
    ):
        """
        Encodes the prompt into text encoder hidden states.

        Parameters:
            prompt (str or list(str)): prompt to be encoded
            num_images_per_prompt (int): number of images that should be generated per prompt
            do_classifier_free_guidance (bool): whether to use classifier free guidance or not
            negative_prompt (str or list(str)): negative prompt to be encoded.
        Returns:
            text_embeddings (np.ndarray): text encoder hidden states
        """
        batch_size = len(prompt) if isinstance(prompt, list) else 1

        # tokenize input prompts
        text_inputs = self.tokenizer(
            prompt,
            padding="max_length",
            max_length=self.tokenizer.model_max_length,
            truncation=True,
            return_tensors="np",
        )
        text_input_ids = text_inputs.input_ids

        text_embeddings = self.text_encoder(text_input_ids)[0]

        # duplicate text embeddings for each generation per prompt
        if num_images_per_prompt != 1:
            bs_embed, seq_len, _ = text_embeddings.shape
            text_embeddings = np.tile(text_embeddings, (1, num_images_per_prompt, 1))
            text_embeddings = np.reshape(text_embeddings, (bs_embed * num_images_per_prompt, seq_len, -1))

        # get unconditional embeddings for classifier free guidance
        if do_classifier_free_guidance:
            uncond_tokens: List[str]
            max_length = text_input_ids.shape[-1]
            if negative_prompt is None:
                uncond_tokens = [""] * batch_size
            elif isinstance(negative_prompt, str):
                uncond_tokens = [negative_prompt]
            else:
                uncond_tokens = negative_prompt
            uncond_input = self.tokenizer(
                uncond_tokens,
                padding="max_length",
                max_length=max_length,
                truncation=True,
                return_tensors="np",
            )

            uncond_embeddings = self.text_encoder(uncond_input.input_ids)[0]

            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
            seq_len = uncond_embeddings.shape[1]
            uncond_embeddings = np.tile(uncond_embeddings, (1, num_images_per_prompt, 1))
            uncond_embeddings = np.reshape(uncond_embeddings, (batch_size * num_images_per_prompt, seq_len, -1))

            # For classifier-free guidance, we need to do two forward passes.
            # Here we concatenate the unconditional and text embeddings into a single batch
            # to avoid doing two forward passes
            text_embeddings = np.concatenate([uncond_embeddings, text_embeddings])

        return text_embeddings

    def prepare_latents(
        self,
        batch_size,
        num_channels_latents,
        height,
        width,
        dtype=torch.float16,
        generator=None,
        latents=None,
        image=None,
        latent_timestep=None,
    ):
        shape = (
            batch_size,
            num_channels_latents,
            height // self.vae_scale_factor,
            width // self.vae_scale_factor,
        )
        if isinstance(generator, list) and len(generator) != batch_size:
            raise ValueError(
                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
            )

        if latents is None:
            latents = randn_tensor(shape, generator=generator, dtype=dtype)

        if image is None:
            # scale the initial noise by the standard deviation required by the scheduler
            latents = latents * self.scheduler.init_noise_sigma
            return latents.numpy(), {}
        input_image, meta = preprocess(image, height, width)
        print(input_image.shape)
        image_latents = self.vae_encoder(input_image)[0]
        image_latents = image_latents * 0.18215
        latents = self.scheduler.add_noise(torch.from_numpy(image_latents), latents, latent_timestep).numpy()
        return latents, meta

    def postprocess_image(self, image: np.ndarray, meta: Dict, output_type: str = "pil"):
        """
        Postprocessing for the decoded image. Takes the image generated by the VAE decoder, unpads it to the initial image size (if required),
        normalizes it and converts it to the [0, 255] pixel range. Optionally converts it from np.ndarray to PIL.Image format.

        Parameters:
            image (np.ndarray):
                Generated image
            meta (Dict):
                Metadata obtained on the latents preparation step; can be empty
            output_type (str, *optional*, pil):
                Output format for result, can be pil or numpy
        Returns:
            image (List of np.ndarray or PIL.Image.Image):
                Post-processed images
        """
        if "padding" in meta:
            pad = meta["padding"]
            (_, end_h), (_, end_w) = pad[1:3]
            h, w = image.shape[2:]
            unpad_h = h - end_h
            unpad_w = w - end_w
            image = image[:, :, :unpad_h, :unpad_w]
        image = np.clip(image / 2 + 0.5, 0, 1)
        image = np.transpose(image, (0, 2, 3, 1))

        # 9. Convert to PIL
        if output_type == "pil":
            image = self.numpy_to_pil(image)
            if "src_height" in meta:
                orig_height, orig_width = meta["src_height"], meta["src_width"]
                image = [img.resize((orig_width, orig_height), PIL.Image.Resampling.LANCZOS) for img in image]
        else:
            if "src_height" in meta:
                orig_height, orig_width = meta["src_height"], meta["src_width"]
                image = [cv2.resize(img, (orig_width, orig_height)) for img in image]

        return image

    def encode_image(self, image, num_images_per_prompt=1):
        if not isinstance(image, torch.Tensor):
            image = self.feature_extractor(image, return_tensors="pt").pixel_values

        image_embeds = self.image_encoder(image)[0]
        """
        print(1, image_embeds)
        image_proj_model = Resampler(
            dim=1024,
            depth=2,
            dim_head=64,
            heads=16,
            num_queries=8,
            embedding_dim=1280,
            output_dim=1280,
            ff_mult=2,
            max_seq_len=257,
            apply_pos_emb=True,
            num_latents_mean_pooled=4,
        )

        image_embeds = image_proj_model(image_embeds)
        print(2, image_embeds)
        """

        if num_images_per_prompt > 1:
            image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0)

        uncond_image_embeds = np.zeros(image_embeds.shape)
        return image_embeds, uncond_image_embeds

    def get_timesteps(self, num_inference_steps: int, strength: float):
        """
        Helper function for getting scheduler timesteps for generation.
        In case of image-to-image generation, it updates the number of steps according to strength.

        Parameters:
            num_inference_steps (int):
                number of inference steps for generation
            strength (float):
                value between 0.0 and 1.0 that controls the amount of noise that is added to the input image.
                Values that approach 1.0 allow for lots of variation but will also produce images that are not semantically consistent with the input.
        """
        # get the original timestep using init_timestep
        init_timestep = min(int(num_inference_steps * strength), num_inference_steps)

        t_start = max(num_inference_steps - init_timestep, 0)
        timesteps = self.scheduler.timesteps[t_start:]

        return timesteps, num_inference_steps - t_start


core = ov.Core()
device = "CPU"

models_dir = Path('on-canvers-disney-v3.9.1-ov-face')  # 'on-canvers-real-ov-ref-v3.9.1'
IMAGE_ENCODER_PATH = models_dir / "image_encoder.xml"
UNET_PATH = models_dir / "unet.xml"
VAE_DECODER_PATH = models_dir / "vae_decoder.xml"
VAE_ENCODER_PATH = models_dir / "vae_encoder.xml"
TEXT_ENCODER_PATH = models_dir / "text_encoder.xml"

from transformers import AutoTokenizer
from PIL import Image

# compile the converted OpenVINO IR models on CPU
ov_config = {}  # {"INFERENCE_PRECISION_HINT": "fp16"}
vae_decoder = core.compile_model(VAE_DECODER_PATH, device, ov_config)
vae_encoder = core.compile_model(VAE_ENCODER_PATH, device, ov_config)
text_encoder = core.compile_model(TEXT_ENCODER_PATH, device)
image_encoder = core.compile_model(IMAGE_ENCODER_PATH, device)
unet = core.compile_model(UNET_PATH, device)

scheduler = LCMScheduler.from_pretrained(models_dir / "scheduler")
tokenizer = AutoTokenizer.from_pretrained(models_dir / "tokenizer")
feature_extractor = CLIPImageProcessor.from_pretrained(models_dir / "feature_extractor")

ov_pipe = OVStableDiffusionPipeline(
    vae_decoder,
    text_encoder,
    tokenizer,
    unet,
    scheduler,
    image_encoder,
    feature_extractor,
    vae_encoder,
)

generator = torch.Generator(device="cpu").manual_seed(576)

ip_image = load_image("./input.jpg")
# ip_image.resize((512, 512))

image = Image.open("ai_face.png").convert('RGB')
image = image.resize((512, 512))

# image = load_image("https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/vermeer.jpg")
# ip_image = load_image("https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/river.png")

result = ov_pipe(
    prompt="best quality, high quality, beautiful korean woman is wearing glasses",
    # image=image,
    ip_adapter_image=image,
    height=512,
    width=512,
    guidance_scale=1,
    generator=generator,
    # strength=0.7,
    num_inference_steps=4,
).images[0]

result.save("test7.png")
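The script above runs the pure IP-Adapter text-to-image path (guidance_scale=1 keeps classifier-free guidance off). The pipeline's __call__ also accepts image and strength, which infer_face.py leaves commented out. A minimal sketch (not part of the uploaded file) of that image-to-image variant, reusing the ov_pipe built above; "init.png" is a hypothetical local init image:

from PIL import Image

init_image = Image.open("init.png").convert("RGB")     # hypothetical init image
face_image = Image.open("ai_face.png").convert("RGB")  # identity reference from this repo

result = ov_pipe(
    prompt="best quality, high quality, beautiful korean woman is wearing glasses",
    image=init_image,             # encoded through vae_encoder.xml in prepare_latents()
    ip_adapter_image=face_image,  # embedded through image_encoder.xml in encode_image()
    strength=0.7,                 # with 4 steps, get_timesteps() keeps the last 2 denoising steps
    height=512,
    width=512,
    guidance_scale=1,
    num_inference_steps=4,
).images[0]
result.save("test_img2img.png")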
input.jpg
ADDED
scheduler/scheduler_config.json
ADDED
@@ -0,0 +1,22 @@
{
  "_class_name": "LCMScheduler",
  "_diffusers_version": "0.30.2",
  "beta_end": 0.012,
  "beta_schedule": "scaled_linear",
  "beta_start": 0.00085,
  "clip_sample": false,
  "clip_sample_range": 1.0,
  "dynamic_thresholding_ratio": 0.995,
  "num_train_timesteps": 1000,
  "original_inference_steps": 50,
  "prediction_type": "epsilon",
  "rescale_betas_zero_snr": false,
  "sample_max_value": 1.0,
  "set_alpha_to_one": false,
  "skip_prk_steps": true,
  "steps_offset": 1,
  "thresholding": false,
  "timestep_scaling": 10.0,
  "timestep_spacing": "leading",
  "trained_betas": null
}
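infer_face.py rebuilds the scheduler directly from this folder. A small sketch (not part of the uploaded files) of inspecting the 4-step LCM schedule this config produces, assuming the repository is available locally under on-canvers-disney-v3.9.1-ov-face:

from pathlib import Path
from diffusers import LCMScheduler

scheduler = LCMScheduler.from_pretrained(Path("on-canvers-disney-v3.9.1-ov-face") / "scheduler")
scheduler.set_timesteps(4)
# With original_inference_steps=50 and num_train_timesteps=1000, the 4 selected
# timesteps are spread across the trained range; exact values depend on the diffusers version.
print(scheduler.timesteps)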
sd_quant_face.py
ADDED
@@ -0,0 +1,790 @@
1 |
+
from pathlib import Path
|
2 |
+
from diffusers import AutoPipelineForText2Image
|
3 |
+
from transformers import CLIPVisionModelWithProjection
|
4 |
+
from diffusers.utils import load_image
|
5 |
+
from diffusers import LCMScheduler
|
6 |
+
|
7 |
+
|
8 |
+
stable_diffusion_id = "circulus/canvers-disney-v3.9.1"
|
9 |
+
ip_adapter_id = "h94/IP-Adapter"
|
10 |
+
ip_adapter_weight_name = "ip-adapter-full-face_sd15.bin" #"ip-adapter-full-face_sd15.bin" # "ip-adapter_sd15.bin"
|
11 |
+
lcm_lora_id = "latent-consistency/lcm-lora-sdv1-5"
|
12 |
+
models_dir = Path("on-canvers-disney-v3.9.1-ov-face")
|
13 |
+
int8_model_path = Path("on-canvers-disney-v3.9.1-ov-face-int8")
|
14 |
+
from optimum.intel import OVConfig, OVQuantizer, OVStableDiffusionPipeline, OVWeightQuantizationConfig
|
15 |
+
from optimum.intel.openvino.configuration import OVQuantizationMethod
|
16 |
+
|
17 |
+
load_original_pipeline = not all(
|
18 |
+
[
|
19 |
+
(models_dir / model_name).exists()
|
20 |
+
for model_name in [
|
21 |
+
"text_encoder.xml",
|
22 |
+
"image_encoder.xml",
|
23 |
+
"unet.xml",
|
24 |
+
"vae_decoder.xml",
|
25 |
+
"vae_encoder.xml",
|
26 |
+
]
|
27 |
+
]
|
28 |
+
)
|
29 |
+
|
30 |
+
|
31 |
+
def get_pipeline_components(
|
32 |
+
stable_diffusion_id,
|
33 |
+
ip_adapter_id,
|
34 |
+
ip_adapter_weight_name,
|
35 |
+
lcm_lora_id,
|
36 |
+
ip_adapter_scale=0.65,
|
37 |
+
):
|
38 |
+
image_encoder = CLIPVisionModelWithProjection.from_pretrained("h94/IP-Adapter", subfolder="models/image_encoder")
|
39 |
+
print(image_encoder)
|
40 |
+
pipeline = AutoPipelineForText2Image.from_pretrained(stable_diffusion_id, image_encoder=image_encoder)
|
41 |
+
pipeline.load_lora_weights(lcm_lora_id)
|
42 |
+
pipeline.fuse_lora()
|
43 |
+
pipeline.load_ip_adapter(ip_adapter_id, subfolder="models", weight_name=ip_adapter_weight_name)
|
44 |
+
pipeline.set_ip_adapter_scale(ip_adapter_scale)
|
45 |
+
scheduler = LCMScheduler.from_pretrained(stable_diffusion_id, subfolder="scheduler")
|
46 |
+
return (
|
47 |
+
pipeline.tokenizer,
|
48 |
+
pipeline.feature_extractor,
|
49 |
+
scheduler,
|
50 |
+
pipeline.text_encoder,
|
51 |
+
pipeline.image_encoder,
|
52 |
+
pipeline.unet,
|
53 |
+
pipeline.vae,
|
54 |
+
)
|
55 |
+
|
56 |
+
|
57 |
+
if load_original_pipeline:
|
58 |
+
(
|
59 |
+
tokenizer,
|
60 |
+
feature_extractor,
|
61 |
+
scheduler,
|
62 |
+
text_encoder,
|
63 |
+
image_encoder,
|
64 |
+
unet,
|
65 |
+
vae,
|
66 |
+
) = get_pipeline_components(stable_diffusion_id, ip_adapter_id, ip_adapter_weight_name, lcm_lora_id)
|
67 |
+
scheduler.save_pretrained(models_dir / "scheduler")
|
68 |
+
else:
|
69 |
+
tokenizer, feature_extractor, scheduler, text_encoder, image_encoder, unet, vae = (
|
70 |
+
None,
|
71 |
+
None,
|
72 |
+
None,
|
73 |
+
None,
|
74 |
+
None,
|
75 |
+
None,
|
76 |
+
None,
|
77 |
+
)
|
78 |
+
|
79 |
+
import openvino as ov
|
80 |
+
import torch
|
81 |
+
import gc
|
82 |
+
|
83 |
+
|
84 |
+
def cleanup_torchscript_cache():
|
85 |
+
"""
|
86 |
+
Helper for removing cached model representation
|
87 |
+
"""
|
88 |
+
torch._C._jit_clear_class_registry()
|
89 |
+
torch.jit._recursive.concrete_type_store = torch.jit._recursive.ConcreteTypeStore()
|
90 |
+
torch.jit._state._clear_class_state()
|
91 |
+
|
92 |
+
IMAGE_ENCODER_PATH = models_dir / "image_encoder.xml"
|
93 |
+
UNET_PATH = models_dir / "unet.xml"
|
94 |
+
VAE_DECODER_PATH = models_dir / "vae_decoder.xml"
|
95 |
+
VAE_ENCODER_PATH = models_dir / "vae_encoder.xml"
|
96 |
+
TEXT_ENCODER_PATH = models_dir / "text_encoder.xml"
|
97 |
+
|
98 |
+
if not IMAGE_ENCODER_PATH.exists():
|
99 |
+
with torch.no_grad():
|
100 |
+
ov_model = ov.convert_model(
|
101 |
+
image_encoder,
|
102 |
+
example_input=torch.zeros((1, 3, 224, 224)),
|
103 |
+
input=[-1, 3, 224, 224],
|
104 |
+
)
|
105 |
+
ov.save_model(ov_model, IMAGE_ENCODER_PATH)
|
106 |
+
feature_extractor.save_pretrained(models_dir / "feature_extractor")
|
107 |
+
del ov_model
|
108 |
+
cleanup_torchscript_cache()
|
109 |
+
|
110 |
+
|
111 |
+
if not UNET_PATH.exists():
|
112 |
+
inputs = {
|
113 |
+
"sample": torch.randn((2, 4, 64, 64)),
|
114 |
+
"timestep": torch.tensor(1),
|
115 |
+
"encoder_hidden_states": torch.randn((2, 77, 768)),
|
116 |
+
"added_cond_kwargs": {"image_embeds": torch.ones((2, 1280))}, # 2,1024
|
117 |
+
}
|
118 |
+
|
119 |
+
print(unet)
|
120 |
+
|
121 |
+
with torch.no_grad():
|
122 |
+
ov_model = ov.convert_model(unet, example_input=inputs)
|
123 |
+
# dictionary with added_cond_kwargs will be decomposed during conversion
|
124 |
+
# in some cases decomposition may lead to losing data type and shape information
|
125 |
+
# We need to recover it manually after the conversion
|
126 |
+
ov_model.inputs[-1].get_node().set_element_type(ov.Type.f32)
|
127 |
+
ov_model.validate_nodes_and_infer_types()
|
128 |
+
ov.save_model(ov_model, UNET_PATH)
|
129 |
+
del ov_model
|
130 |
+
cleanup_torchscript_cache()
|
131 |
+
|
132 |
+
if not VAE_DECODER_PATH.exists():
|
133 |
+
|
134 |
+
class VAEDecoderWrapper(torch.nn.Module):
|
135 |
+
def __init__(self, vae):
|
136 |
+
super().__init__()
|
137 |
+
self.vae = vae
|
138 |
+
|
139 |
+
def forward(self, latents):
|
140 |
+
return self.vae.decode(latents)
|
141 |
+
|
142 |
+
vae_decoder = VAEDecoderWrapper(vae)
|
143 |
+
with torch.no_grad():
|
144 |
+
ov_model = ov.convert_model(vae_decoder, example_input=torch.ones([1, 4, 64, 64]))
|
145 |
+
ov.save_model(ov_model, VAE_DECODER_PATH)
|
146 |
+
del ov_model
|
147 |
+
cleanup_torchscript_cache()
|
148 |
+
del vae_decoder
|
149 |
+
|
150 |
+
if not VAE_ENCODER_PATH.exists():
|
151 |
+
|
152 |
+
class VAEEncoderWrapper(torch.nn.Module):
|
153 |
+
def __init__(self, vae):
|
154 |
+
super().__init__()
|
155 |
+
self.vae = vae
|
156 |
+
|
157 |
+
def forward(self, image):
|
158 |
+
return self.vae.encode(x=image)["latent_dist"].sample()
|
159 |
+
|
160 |
+
vae_encoder = VAEEncoderWrapper(vae)
|
161 |
+
vae_encoder.eval()
|
162 |
+
image = torch.zeros((1, 3, 512, 512))
|
163 |
+
with torch.no_grad():
|
164 |
+
ov_model = ov.convert_model(vae_encoder, example_input=image)
|
165 |
+
ov.save_model(ov_model, VAE_ENCODER_PATH)
|
166 |
+
del ov_model
|
167 |
+
cleanup_torchscript_cache()
|
168 |
+
|
169 |
+
|
170 |
+
if not TEXT_ENCODER_PATH.exists():
|
171 |
+
with torch.no_grad():
|
172 |
+
ov_model = ov.convert_model(
|
173 |
+
text_encoder,
|
174 |
+
example_input=torch.ones([1, 77], dtype=torch.long),
|
175 |
+
input=[
|
176 |
+
(1, 77),
|
177 |
+
],
|
178 |
+
)
|
179 |
+
ov.save_model(ov_model, TEXT_ENCODER_PATH)
|
180 |
+
del ov_model
|
181 |
+
cleanup_torchscript_cache()
|
182 |
+
tokenizer.save_pretrained(models_dir / "tokenizer")
|
183 |
+
|
184 |
+
|
185 |
+
import inspect
|
186 |
+
from typing import List, Optional, Union, Dict, Tuple
|
187 |
+
import numpy as np
|
188 |
+
|
189 |
+
from pathlib import Path
|
190 |
+
from diffusers import AutoPipelineForText2Image
|
191 |
+
from transformers import CLIPVisionModelWithProjection
|
192 |
+
from diffusers.utils import load_image
|
193 |
+
from diffusers import LCMScheduler
|
194 |
+
|
195 |
+
import PIL
|
196 |
+
import cv2
|
197 |
+
import torch
|
198 |
+
import openvino as ov
|
199 |
+
|
200 |
+
from transformers import CLIPTokenizer, CLIPImageProcessor
|
201 |
+
from diffusers import DiffusionPipeline
|
202 |
+
from diffusers.pipelines.stable_diffusion.pipeline_output import (
|
203 |
+
StableDiffusionPipelineOutput,
|
204 |
+
)
|
205 |
+
from diffusers.schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler
|
206 |
+
from resampler import Resampler
|
207 |
+
|
208 |
+
|
209 |
+
def scale_fit_to_window(dst_width: int, dst_height: int, image_width: int, image_height: int):
|
210 |
+
"""
|
211 |
+
Preprocessing helper function for calculating image size for resize with peserving original aspect ratio
|
212 |
+
and fitting image to specific window size
|
213 |
+
|
214 |
+
Parameters:
|
215 |
+
dst_width (int): destination window width
|
216 |
+
dst_height (int): destination window height
|
217 |
+
image_width (int): source image width
|
218 |
+
image_height (int): source image height
|
219 |
+
Returns:
|
220 |
+
result_width (int): calculated width for resize
|
221 |
+
result_height (int): calculated height for resize
|
222 |
+
"""
|
223 |
+
im_scale = min(dst_height / image_height, dst_width / image_width)
|
224 |
+
return int(im_scale * image_width), int(im_scale * image_height)
|
225 |
+
|
226 |
+
|
227 |
+
def randn_tensor(
|
228 |
+
shape: Union[Tuple, List],
|
229 |
+
generator: Optional[Union[List["torch.Generator"], "torch.Generator"]] = None,
|
230 |
+
dtype: Optional["torch.dtype"] = None,
|
231 |
+
):
|
232 |
+
"""A helper function to create random tensors on the desired `device` with the desired `dtype`. When
|
233 |
+
passing a list of generators, you can seed each batch size individually.
|
234 |
+
|
235 |
+
"""
|
236 |
+
batch_size = shape[0]
|
237 |
+
rand_device = torch.device("cpu")
|
238 |
+
|
239 |
+
# make sure generator list of length 1 is treated like a non-list
|
240 |
+
if isinstance(generator, list) and len(generator) == 1:
|
241 |
+
generator = generator[0]
|
242 |
+
|
243 |
+
if isinstance(generator, list):
|
244 |
+
shape = (1,) + shape[1:]
|
245 |
+
latents = [torch.randn(shape, generator=generator[i], device=rand_device, dtype=dtype) for i in range(batch_size)]
|
246 |
+
latents = torch.cat(latents, dim=0)
|
247 |
+
else:
|
248 |
+
latents = torch.randn(shape, generator=generator, device=rand_device, dtype=dtype)
|
249 |
+
|
250 |
+
return latents
|
251 |
+
|
252 |
+
|
253 |
+
def preprocess(image: PIL.Image.Image, height, width):
|
254 |
+
"""
|
255 |
+
Image preprocessing function. Takes image in PIL.Image format, resizes it to keep aspect ration and fits to model input window 512x512,
|
256 |
+
then converts it to np.ndarray and adds padding with zeros on right or bottom side of image (depends from aspect ratio), after that
|
257 |
+
converts data to float32 data type and change range of values from [0, 255] to [-1, 1], finally, converts data layout from planar NHWC to NCHW.
|
258 |
+
The function returns preprocessed input tensor and padding size, which can be used in postprocessing.
|
259 |
+
|
260 |
+
Parameters:
|
261 |
+
image (PIL.Image.Image): input image
|
262 |
+
Returns:
|
263 |
+
image (np.ndarray): preprocessed image tensor
|
264 |
+
meta (Dict): dictionary with preprocessing metadata info
|
265 |
+
"""
|
266 |
+
src_width, src_height = image.size
|
267 |
+
dst_width, dst_height = scale_fit_to_window(height, width, src_width, src_height)
|
268 |
+
image = np.array(image.resize((dst_width, dst_height), resample=PIL.Image.Resampling.LANCZOS))[None, :]
|
269 |
+
print(image.shape)
|
270 |
+
pad_width = width - dst_width
|
271 |
+
pad_height = height - dst_height
|
272 |
+
pad = ((0, 0), (0, pad_height), (0, pad_width), (0, 0))
|
273 |
+
image = np.pad(image, pad, mode="constant")
|
274 |
+
image = image.astype(np.float32) / 255.0
|
275 |
+
#image = image.astype(np.float16) / 255.0
|
276 |
+
image = 2.0 * image - 1.0
|
277 |
+
image = image.transpose(0, 3, 1, 2)
|
278 |
+
print(image.shape)
|
279 |
+
return image, {"padding": pad, "src_width": src_width, "src_height": src_height}
|
280 |
+
|
281 |
+
|
282 |
+
class OVStableDiffusionPipeline(DiffusionPipeline):
|
283 |
+
def __init__(
|
284 |
+
self,
|
285 |
+
vae_decoder: ov.Model,
|
286 |
+
text_encoder: ov.Model,
|
287 |
+
tokenizer: CLIPTokenizer,
|
288 |
+
unet: ov.Model,
|
289 |
+
scheduler: Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler],
|
290 |
+
image_encoder: ov.Model,
|
291 |
+
feature_extractor: CLIPImageProcessor,
|
292 |
+
vae_encoder: ov.Model,
|
293 |
+
):
|
294 |
+
"""
|
295 |
+
Pipeline for text-to-image generation using Stable Diffusion and IP-Adapter with OpenVINO
|
296 |
+
Parameters:
|
297 |
+
vae_decoder (ov.Model):
|
298 |
+
Variational Auto-Encoder (VAE) Model to decode images to and from latent representations.
|
299 |
+
text_encoder (ov.Model):CLIPImageProcessor
|
300 |
+
Frozen text-encoder. Stable Diffusion uses the text portion of
|
301 |
+
[CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
|
302 |
+
the clip-vit-large-patch14(https://huggingface.co/openai/clip-vit-large-patch14) variant.
|
303 |
+
tokenizer (CLIPTokenizer):
|
304 |
+
Tokenizer of class CLIPTokenizer(https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
|
305 |
+
unet (ov.Model): Conditional U-Net architecture to denoise the encoded image latents.
|
306 |
+
scheduler (SchedulerMixin):
|
307 |
+
A scheduler to be used in combination with unet to denoise the encoded image latents
|
308 |
+
image_encoder (ov.Model):
|
309 |
+
IP-Adapter image encoder for embedding input image as input prompt for generation
|
310 |
+
feature_extractor :
|
311 |
+
"""
|
312 |
+
super().__init__()
|
313 |
+
self.scheduler = scheduler
|
314 |
+
self.vae_decoder = vae_decoder
|
315 |
+
self.image_encoder = image_encoder
|
316 |
+
self.text_encoder = text_encoder
|
317 |
+
self.unet = unet
|
318 |
+
self.height = 512
|
319 |
+
self.width = 512
|
320 |
+
self.vae_scale_factor = 8
|
321 |
+
self.tokenizer = tokenizer
|
322 |
+
self.vae_encoder = vae_encoder
|
323 |
+
self.feature_extractor = feature_extractor
|
324 |
+
self.register_to_config(unet=unet) # config
|
325 |
+
|
326 |
+
def __call__(
|
327 |
+
self,
|
328 |
+
prompt: Union[str, List[str]],
|
329 |
+
ip_adapter_image: PIL.Image.Image,
|
330 |
+
image: PIL.Image.Image = None,
|
331 |
+
num_inference_steps: Optional[int] = 4,
|
332 |
+
negative_prompt: Union[str, List[str]] = None,
|
333 |
+
guidance_scale: Optional[float] = 0.5,
|
334 |
+
eta: Optional[float] = 0.0,
|
335 |
+
output_type: Optional[str] = "pil",
|
336 |
+
height: Optional[int] = None,
|
337 |
+
width: Optional[int] = None,
|
338 |
+
generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
|
339 |
+
latents: Optional[torch.FloatTensor] = None,
|
340 |
+
strength: float = 1.0,
|
341 |
+
**kwargs,
|
342 |
+
):
|
343 |
+
"""
|
344 |
+
Function invoked when calling the pipeline for generation.
|
345 |
+
Parameters:
|
346 |
+
prompt (str or List[str]):
|
347 |
+
The prompt or prompts to guide the image generation.
|
348 |
+
image (PIL.Image.Image, *optional*, None):
|
349 |
+
Intinal image for generation.
|
350 |
+
num_inference_steps (int, *optional*, defaults to 50):
|
351 |
+
The number of denoising steps. More denoising steps usually lead to a higher quality image at the
|
352 |
+
expense of slower inference.
|
353 |
+
negative_prompt (str or List[str]):https://user-images.githubusercontent.com/29454499/258651862-28b63016-c5ff-4263-9da8-73ca31100165.jpeg
|
354 |
+
The negative prompt or prompts to guide the image generation.
|
355 |
+
guidance_scale (float, *optional*, defaults to 7.5):
|
356 |
+
Guidance scale as defined in Classifier-Free Diffusion Guidance(https://arxiv.org/abs/2207.12598).
|
357 |
+
guidance_scale is defined as `w` of equation 2.
|
358 |
+
Higher guidance scale encourages to generate images that are closely linked to the text prompt,
|
359 |
+
usually at the expense of lower image quality.
|
360 |
+
eta (float, *optional*, defaults to 0.0):
|
361 |
+
Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
|
362 |
+
[DDIMScheduler], will be ignored for others.
|
363 |
+
output_type (`str`, *optional*, defaults to "pil"):
|
364 |
+
The output format of the generate image. Choose between
|
365 |
+
[PIL](https://pillow.readthedocs.io/en/stable/): PIL.Image.Image or np.array.
|
366 |
+
height (int, *optional*, 512):
|
367 |
+
Generated image height
|
368 |
+
width (int, *optional*, 512):
|
369 |
+
Generated image width
|
370 |
+
generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
|
371 |
+
A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
|
372 |
+
generation deterministic.
|
373 |
+
latents (`torch.FloatTensor`, *optional*):
|
374 |
+
Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
|
375 |
+
generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
|
376 |
+
tensor is generated by sampling using the supplied random `generator`.
|
377 |
+
Returns:
|
378 |
+
Dictionary with keys:
|
379 |
+
sample - the last generated image PIL.Image.Image or np.arrayhttps://huggingface.co/latent-consistency/lcm-lora-sdv1-5
|
380 |
+
iterations - *optional* (if gif=True) images for all diffusion steps, List of PIL.Image.Image or np.array.
|
381 |
+
"""
|
382 |
+
do_classifier_free_guidance = guidance_scale > 1.0
|
383 |
+
# get prompt text embeddings
|
384 |
+
text_embeddings = self._encode_prompt(
|
385 |
+
prompt,
|
386 |
+
do_classifier_free_guidance=do_classifier_free_guidance,
|
387 |
+
negative_prompt=negative_prompt,
|
388 |
+
)
|
389 |
+
# get ip-adapter image embeddings
|
390 |
+
image_embeds, negative_image_embeds = self.encode_image(ip_adapter_image)
|
391 |
+
if do_classifier_free_guidance:
|
392 |
+
image_embeds = np.concatenate([negative_image_embeds, image_embeds])
|
393 |
+
|
394 |
+
# set timesteps
|
395 |
+
accepts_offset = "offset" in set(inspect.signature(self.scheduler.set_timesteps).parameters.keys())
|
396 |
+
extra_set_kwargs = {}
|
397 |
+
if accepts_offset:
|
398 |
+
extra_set_kwargs["offset"] = 1
|
399 |
+
|
400 |
+
self.scheduler.set_timesteps(num_inference_steps, **extra_set_kwargs)
|
401 |
+
|
402 |
+
timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength)
|
403 |
+
latent_timestep = timesteps[:1]
|
404 |
+
|
405 |
+
print(num_inference_steps,timesteps)
|
406 |
+
|
407 |
+
# get the initial random noise unless the user supplied it
|
408 |
+
latents, meta = self.prepare_latents(
|
409 |
+
1,
|
410 |
+
4,
|
411 |
+
height or self.height,
|
412 |
+
width or self.width,
|
413 |
+
generator=generator,
|
414 |
+
latents=latents,
|
415 |
+
image=image,
|
416 |
+
latent_timestep=latent_timestep,
|
417 |
+
)
|
418 |
+
|
419 |
+
# prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
|
420 |
+
# eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
|
421 |
+
# eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
|
422 |
+
# and should be between [0, 1]
|
423 |
+
accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
|
424 |
+
extra_step_kwargs = {}
|
425 |
+
if accepts_eta:
|
426 |
+
extra_step_kwargs["eta"] = eta
|
427 |
+
|
428 |
+
for i, t in enumerate(self.progress_bar(timesteps)):
|
429 |
+
# expand the latents if you are doing classifier free guidance
|
430 |
+
latent_model_input = np.concatenate([latents] * 2) if do_classifier_free_guidance else latents
|
431 |
+
latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
|
432 |
+
|
433 |
+
# predict the noise residual
|
434 |
+
|
435 |
+
noise_pred = self.unet([latent_model_input, t, text_embeddings, image_embeds])[0]
|
436 |
+
# perform guidance
|
437 |
+
if do_classifier_free_guidance:
|
438 |
+
noise_pred_uncond, noise_pred_text = noise_pred[0], noise_pred[1]
|
439 |
+
noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
|
440 |
+
|
441 |
+
# compute the previous noisy sample x_t -> x_t-1
|
442 |
+
latents = self.scheduler.step(
|
443 |
+
torch.from_numpy(noise_pred),
|
444 |
+
t,
|
445 |
+
torch.from_numpy(latents),
|
446 |
+
**extra_step_kwargs,
|
447 |
+
)["prev_sample"].numpy()
|
448 |
+
|
449 |
+
# scale and decode the image latents with vae
|
450 |
+
image = self.vae_decoder(latents * (1 / 0.18215))[0]
|
451 |
+
|
452 |
+
image = self.postprocess_image(image, meta, output_type)
|
453 |
+
return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=False)
|
454 |
+
|
455 |
+
def _encode_prompt(
|
456 |
+
self,
|
457 |
+
prompt: Union[str, List[str]],
|
458 |
+
num_images_per_prompt: int = 1,
|
459 |
+
do_classifier_free_guidance: bool = True,
|
460 |
+
negative_prompt: Union[str, List[str]] = None,
|
461 |
+
):
|
462 |
+
"""
|
463 |
+
Encodes the prompt into text encoder hidden states.
|
464 |
+
|
465 |
+
Parameters:
|
466 |
+
prompt (str or list(str)): prompt to be encoded
|
467 |
+
num_images_per_prompt (int): number of images that should be generated per prompt
|
468 |
+
do_classifier_free_guidance (bool): whether to use classifier free guidance or not
|
469 |
+
negative_prompt (str or list(str)): negative prompt to be encoded.
|
470 |
+
Returns:
|
471 |
+
text_embeddings (np.ndarray): text encoder hidden states
|
472 |
+
"""
|
473 |
+
batch_size = len(prompt) if isinstance(prompt, list) else 1
|
474 |
+
|
475 |
+
# tokenize input prompts
|
476 |
+
text_inputs = self.tokenizer(
|
477 |
+
prompt,
|
478 |
+
padding="max_length",
|
479 |
+
max_length=self.tokenizer.model_max_length,
|
480 |
+
truncation=True,
|
481 |
+
return_tensors="np",
|
482 |
+
)
|
483 |
+
text_input_ids = text_inputs.input_ids
|
484 |
+
|
485 |
+
text_embeddings = self.text_encoder(text_input_ids)[0]
|
486 |
+
|
487 |
+
# duplicate text embeddings for each generation per prompt
|
488 |
+
if num_images_per_prompt != 1:
|
489 |
+
bs_embed, seq_len, _ = text_embeddings.shape
|
490 |
+
text_embeddings = np.tile(text_embeddings, (1, num_images_per_prompt, 1))
|
491 |
+
text_embeddings = np.reshape(text_embeddings, (bs_embed * num_images_per_prompt, seq_len, -1))
|
492 |
+
|
493 |
+
# get unconditional embeddings for classifier free guidance
|
494 |
+
if do_classifier_free_guidance:
|
495 |
+
uncond_tokens: List[str]
|
496 |
+
max_length = text_input_ids.shape[-1]
|
497 |
+
if negative_prompt is None:
|
498 |
+
uncond_tokens = [""] * batch_size
|
499 |
+
elif isinstance(negative_prompt, str):
|
500 |
+
uncond_tokens = [negative_prompt]
|
501 |
+
else:
|
502 |
+
uncond_tokens = negative_prompt
|
503 |
+
uncond_input = self.tokenizer(
|
504 |
+
uncond_tokens,
|
505 |
+
padding="max_length",
|
506 |
+
max_length=max_length,
|
507 |
+
truncation=True,
|
508 |
+
return_tensors="np",
|
509 |
+
)
|
510 |
+
|
511 |
+
uncond_embeddings = self.text_encoder(uncond_input.input_ids)[0]
|
512 |
+
|
513 |
+
# duplicate unconditional embeddings for each generation per prompt, using mps friendly method
|
514 |
+
seq_len = uncond_embeddings.shape[1]
|
515 |
+
uncond_embeddings = np.tile(uncond_embeddings, (1, num_images_per_prompt, 1))
|
516 |
+
uncond_embeddings = np.reshape(uncond_embeddings, (batch_size * num_images_per_prompt, seq_len, -1))
|
517 |
+
|
518 |
+
# For classifier-free guidance, we need to do two forward passes.
|
519 |
+
# Here we concatenate the unconditional and text embeddings into a single batch
|
520 |
+
# to avoid doing two forward passes
|
521 |
+
text_embeddings = np.concatenate([uncond_embeddings, text_embeddings])
|
522 |
+
|
523 |
+
return text_embeddings
|
524 |
+
|
525 |
+
def prepare_latents(
|
526 |
+
self,
|
527 |
+
batch_size,
|
528 |
+
num_channels_latents,
|
529 |
+
height,
|
530 |
+
width,
|
531 |
+
dtype=torch.float16,
|
532 |
+
generator=None,
|
533 |
+
latents=None,
|
534 |
+
image=None,
|
535 |
+
latent_timestep=None,
|
536 |
+
):
|
537 |
+
shape = (
|
538 |
+
batch_size,
|
539 |
+
num_channels_latents,
|
540 |
+
height // self.vae_scale_factor,
|
541 |
+
width // self.vae_scale_factor,
|
542 |
+
)
|
543 |
+
if isinstance(generator, list) and len(generator) != batch_size:
|
544 |
+
raise ValueError(
|
545 |
+
f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
|
546 |
+
f" size of {batch_size}. Make sure the batch size matches the length of the generators."
|
547 |
+
)
|
548 |
+
|
549 |
+
if latents is None:
|
550 |
+
latents = randn_tensor(shape, generator=generator, dtype=dtype)
|
551 |
+
|
552 |
+
if image is None:
|
553 |
+
# scale the initial noise by the standard deviation required by the scheduler
|
554 |
+
latents = latents * self.scheduler.init_noise_sigma
|
555 |
+
return latents.numpy(), {}
|
556 |
+
input_image, meta = preprocess(image, height, width)
|
557 |
+
print(input_image.shape)
|
558 |
+
image_latents = self.vae_encoder(input_image)[0]
|
559 |
+
image_latents = image_latents * 0.18215
|
560 |
+
latents = self.scheduler.add_noise(torch.from_numpy(image_latents), latents, latent_timestep).numpy()
|
561 |
+
return latents, meta
|
562 |
+
|
    def postprocess_image(self, image: np.ndarray, meta: Dict, output_type: str = "pil"):
        """
        Postprocessing for the decoded image. Takes the image produced by the VAE decoder, unpads it to the
        initial image size (if required), normalizes it to the [0, 255] pixel range and optionally converts
        it from np.ndarray to PIL.Image format.

        Parameters:
            image (np.ndarray):
                Generated image
            meta (Dict):
                Metadata obtained during latents preparation; can be empty
            output_type (str, *optional*, pil):
                Output format for the result, can be "pil" or "numpy"
        Returns:
            image (List of np.ndarray or PIL.Image.Image):
                Post-processed images
        """
        if "padding" in meta:
            pad = meta["padding"]
            (_, end_h), (_, end_w) = pad[1:3]
            h, w = image.shape[2:]
            unpad_h = h - end_h
            unpad_w = w - end_w
            image = image[:, :, :unpad_h, :unpad_w]
        image = np.clip(image / 2 + 0.5, 0, 1)
        image = np.transpose(image, (0, 2, 3, 1))

        # Convert to PIL
        if output_type == "pil":
            image = self.numpy_to_pil(image)
            if "src_height" in meta:
                orig_height, orig_width = meta["src_height"], meta["src_width"]
                image = [img.resize((orig_width, orig_height), PIL.Image.Resampling.LANCZOS) for img in image]
        else:
            if "src_height" in meta:
                orig_height, orig_width = meta["src_height"], meta["src_width"]
                # cv2.resize expects (width, height); the original passed orig_width twice
                image = [cv2.resize(img, (orig_width, orig_height)) for img in image]

        return image

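    # Added worked example (assuming the preprocess helper pads on the bottom/right):
    # if 16 columns of padding were added to reach 512x512, meta["padding"][2] is (0, 16),
    # so end_w = 16 and the decoded output is cropped back to its unpadded width before
    # being resized to the original src_width x src_height.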
    def encode_image(self, image, num_images_per_prompt=1):
        if not isinstance(image, torch.Tensor):
            image = self.feature_extractor(image, return_tensors="pt").pixel_values

        image_embeds = self.image_encoder(image)[0]
        """
        print(1, image_embeds)
        image_proj_model = Resampler(
            dim=1024,
            depth=2,
            dim_head=64,
            heads=16,
            num_queries=8,
            embedding_dim=1280,
            output_dim=1280,
            ff_mult=2,
            max_seq_len=257,
            apply_pos_emb=True,
            num_latents_mean_pooled=4,
        )

        image_embeds = image_proj_model(image_embeds)
        print(2, image_embeds)
        """

        if num_images_per_prompt > 1:
            # image_embeds comes back from the OpenVINO compiled model as a NumPy array,
            # so duplicate it with np.repeat (the original called torch's repeat_interleave,
            # which is not available on an ndarray)
            image_embeds = np.repeat(image_embeds, num_images_per_prompt, axis=0)

        uncond_image_embeds = np.zeros(image_embeds.shape)
        return image_embeds, uncond_image_embeds

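    # Added note: uncond_image_embeds is simply an all-zeros array with the same shape as
    # image_embeds; presumably it is stacked with image_embeds in __call__ (mirroring the
    # text path in _encode_prompt above) so the "no image" branch of classifier-free
    # guidance can share the same UNet batch.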
    def get_timesteps(self, num_inference_steps: int, strength: float):
        """
        Helper function for getting scheduler timesteps for generation.
        In case of image-to-image generation, it updates the number of steps according to strength.

        Parameters:
            num_inference_steps (int):
                number of inference steps for generation
            strength (float):
                value between 0.0 and 1.0 that controls the amount of noise added to the input image.
                Values approaching 1.0 allow for lots of variation but also produce images that are not
                semantically consistent with the input.
        """
        # get the original timestep using init_timestep
        init_timestep = min(int(num_inference_steps * strength), num_inference_steps)

        t_start = max(num_inference_steps - init_timestep, 0)
        timesteps = self.scheduler.timesteps[t_start:]

        return timesteps, num_inference_steps - t_start

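    # Added worked example: with num_inference_steps=4 and strength=0.5,
    # init_timestep = min(int(4 * 0.5), 4) = 2 and t_start = 4 - 2 = 2, so only the last
    # two scheduler timesteps are run and the method returns (self.scheduler.timesteps[2:], 2).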
core = ov.Core()
device = "GPU"

from transformers import AutoTokenizer
from PIL import Image

ov_config = {"INFERENCE_PRECISION_HINT": "f16"}
vae_decoder = core.compile_model(VAE_DECODER_PATH, device, ov_config)
vae_encoder = core.compile_model(VAE_ENCODER_PATH, device, ov_config)
text_encoder = core.compile_model(TEXT_ENCODER_PATH, device)
image_encoder = core.compile_model(IMAGE_ENCODER_PATH, device)
unet = core.compile_model(UNET_PATH, device)

scheduler = LCMScheduler.from_pretrained(models_dir / "scheduler")
tokenizer = AutoTokenizer.from_pretrained(models_dir / "tokenizer")
feature_extractor = CLIPImageProcessor.from_pretrained(models_dir / "feature_extractor")

ov_pipe = OVStableDiffusionPipeline(
    vae_decoder,
    text_encoder,
    tokenizer,
    unet,
    scheduler,
    image_encoder,
    feature_extractor,
    vae_encoder,
    # safety_checker=None
)

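# Added usage sketch (not in the original script; it mirrors the calibration call in the
# disabled block below). `load_image` would need to be imported from diffusers.utils, and
# the return format depends on the __call__ implementation defined earlier in this file:
#
#   from diffusers.utils import load_image
#   face = load_image("input.jpg")          # sample image shipped with this repo
#   result = ov_pipe(
#       "professional portrait photo",      # hypothetical prompt
#       ip_adapter_image=face,
#       num_inference_steps=4,
#       guidance_scale=1,
#       height=512,
#       width=512,
#       output_type="pil",
#   )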
"""
import datasets
DATASET_NAME = "jxie/coco_captions"

dataset = datasets.load_dataset("jxie/coco_captions", split="train", streaming=True).shuffle(seed=42)
def preprocess_fn(example):
    return {"prompt": example["caption"]}

NUM_SAMPLES = 200
dataset = dataset.take(NUM_SAMPLES)
calibration_dataset = dataset.map(lambda x: preprocess_fn(x), remove_columns=dataset.column_names)


int8_pipe = None

import nncf
import datasets
from tqdm import tqdm
from transformers import set_seed
from typing import Any, Dict, List

set_seed(1)

class CompiledModelDecorator(ov.CompiledModel):
    def __init__(self, compiled_model, prob: float, data_cache: List[Any] = None):
        super().__init__(compiled_model)
        self.data_cache = data_cache if data_cache else []
        self.prob = np.clip(prob, 0, 1)

    def __call__(self, *args, **kwargs):
        if np.random.rand() >= self.prob:
            self.data_cache.append(*args)
        return super().__call__(*args, **kwargs)

from diffusers.utils import load_image

def collect_calibration_data(pipeline: OVStableDiffusionPipeline, subset_size: int) -> List[Dict]:
    original_unet = pipeline.unet
    pipeline.unet = CompiledModelDecorator(original_unet, prob=0.3)
    # google-research-datasets/conceptual_captions
    dataset = datasets.load_dataset("jxie/coco_captions", split="train", streaming=True).shuffle(seed=42)
    pipeline.set_progress_bar_config(disable=True)
    # safety_checker = pipeline.safety_checker
    # pipeline.safety_checker = None

    # Run inference for data collection
    pbar = tqdm(total=subset_size)
    diff = 0
    for batch in dataset:
        prompt = batch["caption"]
        image = load_image(batch["image"])
        if len(prompt) > tokenizer.model_max_length:
            continue
        _ = pipeline(
            prompt,
            ip_adapter_image=image,
            num_inference_steps=4,
            guidance_scale=1,
            # guidance_scale=8.0,
            # lcm_origin_steps=50,
            output_type="pil",
            height=512,
            width=512,
        )
        collected_subset_size = len(pipeline.unet.data_cache)
        if collected_subset_size >= subset_size:
            pbar.update(subset_size - pbar.n)
            break
        pbar.update(collected_subset_size - diff)
        diff = collected_subset_size

    calibration_dataset = pipeline.unet.data_cache
    pipeline.set_progress_bar_config(disable=False)
    pipeline.unet = original_unet
    # pipeline.safety_checker = safety_checker
    return calibration_dataset


UNET_INT8_PATH = models_dir / "unet_int8.xml"

if not UNET_INT8_PATH.exists():
    subset_size = 200
    unet_calibration_data = collect_calibration_data(ov_pipe, subset_size=subset_size)


import nncf
from nncf.scopes import IgnoredScope

if UNET_INT8_PATH.exists():
    print("Loading quantized model")
    quantized_unet = core.read_model(UNET_INT8_PATH)
else:
    unet = core.read_model(UNET_PATH)
    quantized_unet = nncf.quantize(
        model=unet,
        subset_size=subset_size,
        calibration_dataset=nncf.Dataset(unet_calibration_data),
        model_type=nncf.ModelType.TRANSFORMER,
        advanced_parameters=nncf.AdvancedQuantizationParameters(
            disable_bias_correction=True
        )
    )
    ov.save_model(quantized_unet, UNET_INT8_PATH)
"""
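# Added note (not in the original script): once unet_int8.xml has been produced by the
# disabled block above, swapping the INT8 UNet into the pipeline only needs that one
# model recompiled, e.g.:
#
#   unet_int8 = core.compile_model(models_dir / "unet_int8.xml", device)
#   ov_pipe.unet = unet_int8   # the calibration helper swaps pipeline.unet the same way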
text_encoder.bin
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:0ecfe63f53bccf534038ee94b2e414b457136b02cdb7033279d35693ef487f5e
size 246145458
text_encoder.xml
ADDED
The diff for this file is too large to render.
See raw diff
tokenizer/merges.txt
ADDED
The diff for this file is too large to render.
See raw diff
tokenizer/special_tokens_map.json
ADDED
@@ -0,0 +1,30 @@
{
  "bos_token": {
    "content": "<|startoftext|>",
    "lstrip": false,
    "normalized": true,
    "rstrip": false,
    "single_word": false
  },
  "eos_token": {
    "content": "<|endoftext|>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "pad_token": {
    "content": "<|endoftext|>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "unk_token": {
    "content": "<|endoftext|>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  }
}
tokenizer/tokenizer_config.json
ADDED
@@ -0,0 +1,30 @@
{
  "add_prefix_space": false,
  "added_tokens_decoder": {
    "49406": {
      "content": "<|startoftext|>",
      "lstrip": false,
      "normalized": true,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "49407": {
      "content": "<|endoftext|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    }
  },
  "bos_token": "<|startoftext|>",
  "clean_up_tokenization_spaces": true,
  "do_lower_case": true,
  "eos_token": "<|endoftext|>",
  "errors": "replace",
  "model_max_length": 77,
  "pad_token": "<|endoftext|>",
  "tokenizer_class": "CLIPTokenizer",
  "unk_token": "<|endoftext|>"
}
tokenizer/vocab.json
ADDED
The diff for this file is too large to render.
See raw diff
unet.bin
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:91705a6ca0fdfeee9c2035c9c3e69f6b32df476e65960f15c1864c6981de9c63
size 1762631378
unet.xml
ADDED
The diff for this file is too large to render.
See raw diff
vae_decoder.bin
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:676c514fd4b6acac962ce74445f486cfaecad455e7299551053fbbc73c1e9a67
size 98980618
vae_decoder.xml
ADDED
The diff for this file is too large to render.
See raw diff
vae_encoder.bin
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:43daf33c1843d9bc771c8534c14cb90b4dc836db309261397041bbba8148c687
size 68327564
vae_encoder.xml
ADDED
The diff for this file is too large to render.
See raw diff