rippertnt committed on
Commit
5080377
1 Parent(s): dac5ffb

Upload 20 files

.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ ai_face.png filter=lfs diff=lfs merge=lfs -text
ai_face.png ADDED

Git LFS Details

  • SHA256: 4c103383aa58133aba7a0371ae9b7c7f4a067e4fff5896d80b17a7c9d704f6db
  • Pointer size: 132 Bytes
  • Size of remote file: 1.78 MB
feature_extractor/preprocessor_config.json ADDED
@@ -0,0 +1,27 @@
+ {
+   "crop_size": {
+     "height": 224,
+     "width": 224
+   },
+   "do_center_crop": true,
+   "do_convert_rgb": true,
+   "do_normalize": true,
+   "do_rescale": true,
+   "do_resize": true,
+   "image_mean": [
+     0.48145466,
+     0.4578275,
+     0.40821073
+   ],
+   "image_processor_type": "CLIPFeatureExtractor",
+   "image_std": [
+     0.26862954,
+     0.26130258,
+     0.27577711
+   ],
+   "resample": 3,
+   "rescale_factor": 0.00392156862745098,
+   "size": {
+     "shortest_edge": 224
+   }
+ }
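For reference, this is the 224x224 center crop and CLIP mean/std normalization that infer_face.py later loads via CLIPImageProcessor.from_pretrained. A minimal standalone sketch using the feature_extractor folder added in this commit (the input image path is only a placeholder):

from PIL import Image
from transformers import CLIPImageProcessor

# "feature_extractor" is this repo's folder; infer_face.py loads the same config
# from models_dir / "feature_extractor"
processor = CLIPImageProcessor.from_pretrained("feature_extractor")

face = Image.open("input.jpg").convert("RGB")  # placeholder image path
pixel_values = processor(face, return_tensors="np").pixel_values
print(pixel_values.shape)  # (1, 3, 224, 224), the IP-Adapter image encoder input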
image_encoder.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:92acd3113efe615c1bae76084914e0c95835f2cef5f7d044c7e217ffe813ddac
+ size 1264153732
image_encoder.xml ADDED
The diff for this file is too large to render. See raw diff
 
infer_face.py ADDED
@@ -0,0 +1,529 @@
1
+ import inspect
2
+ from typing import List, Optional, Union, Dict, Tuple
3
+ import numpy as np
4
+
5
+ from pathlib import Path
6
+ from diffusers import AutoPipelineForText2Image
7
+ from transformers import CLIPVisionModelWithProjection
8
+ from diffusers.utils import load_image
9
+ from diffusers import LCMScheduler
10
+
11
+ import PIL
12
+ import cv2
13
+ import torch
14
+ import openvino as ov
15
+
16
+ from transformers import CLIPTokenizer, CLIPImageProcessor
17
+ from diffusers import DiffusionPipeline
18
+ from diffusers.pipelines.stable_diffusion.pipeline_output import (
19
+ StableDiffusionPipelineOutput,
20
+ )
21
+ from diffusers.schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler
22
+ from resampler import Resampler
23
+
24
+
25
+ def scale_fit_to_window(dst_width: int, dst_height: int, image_width: int, image_height: int):
26
+ """
27
+ Preprocessing helper function for calculating the image size for resize while preserving the original aspect ratio
28
+ and fitting image to specific window size
29
+
30
+ Parameters:
31
+ dst_width (int): destination window width
32
+ dst_height (int): destination window height
33
+ image_width (int): source image width
34
+ image_height (int): source image height
35
+ Returns:
36
+ result_width (int): calculated width for resize
37
+ result_height (int): calculated height for resize
38
+ """
39
+ im_scale = min(dst_height / image_height, dst_width / image_width)
40
+ return int(im_scale * image_width), int(im_scale * image_height)
41
+
42
+
43
+ def randn_tensor(
44
+ shape: Union[Tuple, List],
45
+ generator: Optional[Union[List["torch.Generator"], "torch.Generator"]] = None,
46
+ dtype: Optional["torch.dtype"] = None,
47
+ ):
48
+ """A helper function to create random tensors on the desired `device` with the desired `dtype`. When
49
+ passing a list of generators, you can seed each batch size individually.
50
+
51
+ """
52
+ batch_size = shape[0]
53
+ rand_device = torch.device("cpu")
54
+
55
+ # make sure generator list of length 1 is treated like a non-list
56
+ if isinstance(generator, list) and len(generator) == 1:
57
+ generator = generator[0]
58
+
59
+ if isinstance(generator, list):
60
+ shape = (1,) + shape[1:]
61
+ latents = [torch.randn(shape, generator=generator[i], device=rand_device, dtype=dtype) for i in range(batch_size)]
62
+ latents = torch.cat(latents, dim=0)
63
+ else:
64
+ latents = torch.randn(shape, generator=generator, device=rand_device, dtype=dtype)
65
+
66
+ return latents
67
+
68
+
69
+ def preprocess(image: PIL.Image.Image, height, width):
70
+ """
71
+ Image preprocessing function. Takes an image in PIL.Image format, resizes it to keep the aspect ratio and fit the model input window 512x512,
72
+ then converts it to np.ndarray and adds zero padding on the right or bottom side of the image (depending on the aspect ratio), after that
73
+ converts the data to float32, changes the value range from [0, 255] to [-1, 1] and, finally, converts the data layout from NHWC to NCHW.
74
+ The function returns preprocessed input tensor and padding size, which can be used in postprocessing.
75
+
76
+ Parameters:
77
+ image (PIL.Image.Image): input image
78
+ Returns:
79
+ image (np.ndarray): preprocessed image tensor
80
+ meta (Dict): dictionary with preprocessing metadata info
81
+ """
82
+ src_width, src_height = image.size
83
+ dst_width, dst_height = scale_fit_to_window(height, width, src_width, src_height)
84
+ image = np.array(image.resize((dst_width, dst_height), resample=PIL.Image.Resampling.LANCZOS))[None, :]
85
+ print(image.shape)
86
+ pad_width = width - dst_width
87
+ pad_height = height - dst_height
88
+ pad = ((0, 0), (0, pad_height), (0, pad_width), (0, 0))
89
+ image = np.pad(image, pad, mode="constant")
90
+ image = image.astype(np.float32) / 255.0
91
+ #image = image.astype(np.float16) / 255.0
92
+ image = 2.0 * image - 1.0
93
+ image = image.transpose(0, 3, 1, 2)
94
+ print(image.shape)
95
+ return image, {"padding": pad, "src_width": src_width, "src_height": src_height}
96
+
97
+
98
+ class OVStableDiffusionPipeline(DiffusionPipeline):
99
+ def __init__(
100
+ self,
101
+ vae_decoder: ov.Model,
102
+ text_encoder: ov.Model,
103
+ tokenizer: CLIPTokenizer,
104
+ unet: ov.Model,
105
+ scheduler: Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler],
106
+ image_encoder: ov.Model,
107
+ feature_extractor: CLIPImageProcessor,
108
+ vae_encoder: ov.Model,
109
+ ):
110
+ """
111
+ Pipeline for text-to-image generation using Stable Diffusion and IP-Adapter with OpenVINO
112
+ Parameters:
113
+ vae_decoder (ov.Model):
114
+ Variational Auto-Encoder (VAE) Model to decode images to and from latent representations.
115
+ text_encoder (ov.Model):
116
+ Frozen text-encoder. Stable Diffusion uses the text portion of
117
+ [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
118
+ the clip-vit-large-patch14(https://huggingface.co/openai/clip-vit-large-patch14) variant.
119
+ tokenizer (CLIPTokenizer):
120
+ Tokenizer of class CLIPTokenizer(https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
121
+ unet (ov.Model): Conditional U-Net architecture to denoise the encoded image latents.
122
+ scheduler (SchedulerMixin):
123
+ A scheduler to be used in combination with unet to denoise the encoded image latents
124
+ image_encoder (ov.Model):
125
+ IP-Adapter image encoder for embedding input image as input prompt for generation
126
+ feature_extractor (CLIPImageProcessor): image processor that prepares the IP-Adapter input image for the image encoder
127
+ """
128
+ super().__init__()
129
+ self.scheduler = scheduler
130
+ self.vae_decoder = vae_decoder
131
+ self.image_encoder = image_encoder
132
+ self.text_encoder = text_encoder
133
+ self.unet = unet
134
+ self.height = 512
135
+ self.width = 512
136
+ self.vae_scale_factor = 8
137
+ self.tokenizer = tokenizer
138
+ self.vae_encoder = vae_encoder
139
+ self.feature_extractor = feature_extractor
140
+
141
+ def __call__(
142
+ self,
143
+ prompt: Union[str, List[str]],
144
+ ip_adapter_image: PIL.Image.Image,
145
+ image: PIL.Image.Image = None,
146
+ num_inference_steps: Optional[int] = 4,
147
+ negative_prompt: Union[str, List[str]] = None,
148
+ guidance_scale: Optional[float] = 0.5,
149
+ eta: Optional[float] = 0.0,
150
+ output_type: Optional[str] = "pil",
151
+ height: Optional[int] = None,
152
+ width: Optional[int] = None,
153
+ generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
154
+ latents: Optional[torch.FloatTensor] = None,
155
+ strength: float = 1.0,
156
+ **kwargs,
157
+ ):
158
+ """
159
+ Function invoked when calling the pipeline for generation.
160
+ Parameters:
161
+ prompt (str or List[str]):
162
+ The prompt or prompts to guide the image generation.
163
+ image (PIL.Image.Image, *optional*, None):
164
+ Initial image for generation.
165
+ num_inference_steps (int, *optional*, defaults to 50):
166
+ The number of denoising steps. More denoising steps usually lead to a higher quality image at the
167
+ expense of slower inference.
168
+ negative_prompt (str or List[str]):
169
+ The negative prompt or prompts to guide the image generation.
170
+ guidance_scale (float, *optional*, defaults to 7.5):
171
+ Guidance scale as defined in Classifier-Free Diffusion Guidance(https://arxiv.org/abs/2207.12598).
172
+ guidance_scale is defined as `w` of equation 2.
173
+ Higher guidance scale encourages to generate images that are closely linked to the text prompt,
174
+ usually at the expense of lower image quality.
175
+ eta (float, *optional*, defaults to 0.0):
176
+ Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
177
+ [DDIMScheduler], will be ignored for others.
178
+ output_type (`str`, *optional*, defaults to "pil"):
179
+ The output format of the generate image. Choose between
180
+ [PIL](https://pillow.readthedocs.io/en/stable/): PIL.Image.Image or np.array.
181
+ height (int, *optional*, 512):
182
+ Generated image height
183
+ width (int, *optional*, 512):
184
+ Generated image width
185
+ generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
186
+ A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
187
+ generation deterministic.
188
+ latents (`torch.FloatTensor`, *optional*):
189
+ Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
190
+ generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
191
+ tensor is generated by sampling using the supplied random `generator`.
192
+ Returns:
193
+ Dictionary with keys:
194
+ sample - the last generated image, PIL.Image.Image or np.array
195
+ iterations - *optional* (if gif=True) images for all diffusion steps, List of PIL.Image.Image or np.array.
196
+ """
197
+ do_classifier_free_guidance = guidance_scale > 1.0
198
+ # get prompt text embeddings
199
+ text_embeddings = self._encode_prompt(
200
+ prompt,
201
+ do_classifier_free_guidance=do_classifier_free_guidance,
202
+ negative_prompt=negative_prompt,
203
+ )
204
+ # get ip-adapter image embeddings
205
+ image_embeds, negative_image_embeds = self.encode_image(ip_adapter_image)
206
+ if do_classifier_free_guidance:
207
+ image_embeds = np.concatenate([negative_image_embeds, image_embeds])
208
+
209
+ # set timesteps
210
+ accepts_offset = "offset" in set(inspect.signature(self.scheduler.set_timesteps).parameters.keys())
211
+ extra_set_kwargs = {}
212
+ if accepts_offset:
213
+ extra_set_kwargs["offset"] = 1
214
+
215
+ self.scheduler.set_timesteps(num_inference_steps, **extra_set_kwargs)
216
+
217
+ timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength)
218
+ latent_timestep = timesteps[:1]
219
+
220
+ print(num_inference_steps,timesteps)
221
+
222
+ # get the initial random noise unless the user supplied it
223
+ latents, meta = self.prepare_latents(
224
+ 1,
225
+ 4,
226
+ height or self.height,
227
+ width or self.width,
228
+ generator=generator,
229
+ latents=latents,
230
+ image=image,
231
+ latent_timestep=latent_timestep,
232
+ )
233
+
234
+ # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
235
+ # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
236
+ # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
237
+ # and should be between [0, 1]
238
+ accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
239
+ extra_step_kwargs = {}
240
+ if accepts_eta:
241
+ extra_step_kwargs["eta"] = eta
242
+
243
+ for i, t in enumerate(self.progress_bar(timesteps)):
244
+ # expand the latents if you are doing classifier free guidance
245
+ latent_model_input = np.concatenate([latents] * 2) if do_classifier_free_guidance else latents
246
+ latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
247
+
248
+ # predict the noise residual
249
+
250
+ noise_pred = self.unet([latent_model_input, t, text_embeddings, image_embeds])[0]
251
+ # perform guidance
252
+ if do_classifier_free_guidance:
253
+ noise_pred_uncond, noise_pred_text = noise_pred[0], noise_pred[1]
254
+ noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
255
+
256
+ # compute the previous noisy sample x_t -> x_t-1
257
+ latents = self.scheduler.step(
258
+ torch.from_numpy(noise_pred),
259
+ t,
260
+ torch.from_numpy(latents),
261
+ **extra_step_kwargs,
262
+ )["prev_sample"].numpy()
263
+
264
+ # scale and decode the image latents with vae
265
+ image = self.vae_decoder(latents * (1 / 0.18215))[0]
266
+
267
+ image = self.postprocess_image(image, meta, output_type)
268
+ return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=False)
269
+
270
+ def _encode_prompt(
271
+ self,
272
+ prompt: Union[str, List[str]],
273
+ num_images_per_prompt: int = 1,
274
+ do_classifier_free_guidance: bool = True,
275
+ negative_prompt: Union[str, List[str]] = None,
276
+ ):
277
+ """
278
+ Encodes the prompt into text encoder hidden states.
279
+
280
+ Parameters:
281
+ prompt (str or list(str)): prompt to be encoded
282
+ num_images_per_prompt (int): number of images that should be generated per prompt
283
+ do_classifier_free_guidance (bool): whether to use classifier free guidance or not
284
+ negative_prompt (str or list(str)): negative prompt to be encoded.
285
+ Returns:
286
+ text_embeddings (np.ndarray): text encoder hidden states
287
+ """
288
+ batch_size = len(prompt) if isinstance(prompt, list) else 1
289
+
290
+ # tokenize input prompts
291
+ text_inputs = self.tokenizer(
292
+ prompt,
293
+ padding="max_length",
294
+ max_length=self.tokenizer.model_max_length,
295
+ truncation=True,
296
+ return_tensors="np",
297
+ )
298
+ text_input_ids = text_inputs.input_ids
299
+
300
+ text_embeddings = self.text_encoder(text_input_ids)[0]
301
+
302
+ # duplicate text embeddings for each generation per prompt
303
+ if num_images_per_prompt != 1:
304
+ bs_embed, seq_len, _ = text_embeddings.shape
305
+ text_embeddings = np.tile(text_embeddings, (1, num_images_per_prompt, 1))
306
+ text_embeddings = np.reshape(text_embeddings, (bs_embed * num_images_per_prompt, seq_len, -1))
307
+
308
+ # get unconditional embeddings for classifier free guidance
309
+ if do_classifier_free_guidance:
310
+ uncond_tokens: List[str]
311
+ max_length = text_input_ids.shape[-1]
312
+ if negative_prompt is None:
313
+ uncond_tokens = [""] * batch_size
314
+ elif isinstance(negative_prompt, str):
315
+ uncond_tokens = [negative_prompt]
316
+ else:
317
+ uncond_tokens = negative_prompt
318
+ uncond_input = self.tokenizer(
319
+ uncond_tokens,
320
+ padding="max_length",
321
+ max_length=max_length,
322
+ truncation=True,
323
+ return_tensors="np",
324
+ )
325
+
326
+ uncond_embeddings = self.text_encoder(uncond_input.input_ids)[0]
327
+
328
+ # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
329
+ seq_len = uncond_embeddings.shape[1]
330
+ uncond_embeddings = np.tile(uncond_embeddings, (1, num_images_per_prompt, 1))
331
+ uncond_embeddings = np.reshape(uncond_embeddings, (batch_size * num_images_per_prompt, seq_len, -1))
332
+
333
+ # For classifier-free guidance, we need to do two forward passes.
334
+ # Here we concatenate the unconditional and text embeddings into a single batch
335
+ # to avoid doing two forward passes
336
+ text_embeddings = np.concatenate([uncond_embeddings, text_embeddings])
337
+
338
+ return text_embeddings
339
+
340
+ def prepare_latents(
341
+ self,
342
+ batch_size,
343
+ num_channels_latents,
344
+ height,
345
+ width,
346
+ dtype=torch.float16,
347
+ generator=None,
348
+ latents=None,
349
+ image=None,
350
+ latent_timestep=None,
351
+ ):
352
+ shape = (
353
+ batch_size,
354
+ num_channels_latents,
355
+ height // self.vae_scale_factor,
356
+ width // self.vae_scale_factor,
357
+ )
358
+ if isinstance(generator, list) and len(generator) != batch_size:
359
+ raise ValueError(
360
+ f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
361
+ f" size of {batch_size}. Make sure the batch size matches the length of the generators."
362
+ )
363
+
364
+ if latents is None:
365
+ latents = randn_tensor(shape, generator=generator, dtype=dtype)
366
+
367
+ if image is None:
368
+ # scale the initial noise by the standard deviation required by the scheduler
369
+ latents = latents * self.scheduler.init_noise_sigma
370
+ return latents.numpy(), {}
371
+ input_image, meta = preprocess(image, height, width)
372
+ print(input_image.shape)
373
+ image_latents = self.vae_encoder(input_image)[0]
374
+ image_latents = image_latents * 0.18215
375
+ latents = self.scheduler.add_noise(torch.from_numpy(image_latents), latents, latent_timestep).numpy()
376
+ return latents, meta
377
+
378
+ def postprocess_image(self, image: np.ndarray, meta: Dict, output_type: str = "pil"):
379
+ """
380
+ Postprocessing for decoded image. Takes generated image decoded by VAE decoder, unpad it to initial image size (if required),
381
+ normalize and convert to [0, 255] pixels range. Optionally, converts it from np.ndarray to PIL.Image format
382
+
383
+ Parameters:
384
+ image (np.ndarray):
385
+ Generated image
386
+ meta (Dict):
387
+ Metadata obtained during the latents preparation step, can be empty
388
+ output_type (str, *optional*, pil):
389
+ Output format for result, can be pil or numpy
390
+ Returns:
391
+ image (List of np.ndarray or PIL.Image.Image):
392
+ Post-processed images
393
+ """
394
+ if "padding" in meta:
395
+ pad = meta["padding"]
396
+ (_, end_h), (_, end_w) = pad[1:3]
397
+ h, w = image.shape[2:]
398
+ unpad_h = h - end_h
399
+ unpad_w = w - end_w
400
+ image = image[:, :, :unpad_h, :unpad_w]
401
+ image = np.clip(image / 2 + 0.5, 0, 1)
402
+ image = np.transpose(image, (0, 2, 3, 1))
403
+
404
+
405
+ # 9. Convert to PIL
406
+ if output_type == "pil":
407
+ image = self.numpy_to_pil(image)
408
+ if "src_height" in meta:
409
+ orig_height, orig_width = meta["src_height"], meta["src_width"]
410
+ image = [img.resize((orig_width, orig_height), PIL.Image.Resampling.LANCZOS) for img in image]
411
+ else:
412
+ if "src_height" in meta:
413
+ orig_height, orig_width = meta["src_height"], meta["src_width"]
414
+ image = [cv2.resize(img, (orig_width, orig_height)) for img in image]
415
+
416
+
417
+ return image
418
+
419
+ def encode_image(self, image, num_images_per_prompt=1):
420
+ if not isinstance(image, torch.Tensor):
421
+ image = self.feature_extractor(image, return_tensors="pt").pixel_values
422
+
423
+ image_embeds = self.image_encoder(image)[0]
424
+ """
425
+ print(1,image_embeds)
426
+ image_proj_model = Resampler(
427
+ dim=1024,
428
+ depth=2,
429
+ dim_head=64,
430
+ heads=16,
431
+ num_queries=8,
432
+ embedding_dim=1280,
433
+ output_dim=1280,
434
+ ff_mult=2,
435
+ max_seq_len=257,
436
+ apply_pos_emb=True,
437
+ num_latents_mean_pooled=4,
438
+ )
439
+
440
+ image_embeds = image_proj_model(image_embeds)
441
+ print(2,image_embeds)
442
+ """
443
+
444
+ if num_images_per_prompt > 1:
445
+ image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0)
446
+
447
+ uncond_image_embeds = np.zeros(image_embeds.shape)
448
+ return image_embeds, uncond_image_embeds
449
+
450
+ def get_timesteps(self, num_inference_steps: int, strength: float):
451
+ """
452
+ Helper function for getting scheduler timesteps for generation
453
+ In case of image-to-image generation, it updates number of steps according to strength
454
+
455
+ Parameters:
456
+ num_inference_steps (int):
457
+ number of inference steps for generation
458
+ strength (float):
459
+ value between 0.0 and 1.0, that controls the amount of noise that is added to the input image.
460
+ Values that approach 1.0 allow for lots of variations but will also produce images that are not semantically consistent with the input.
461
+ """
462
+ # get the original timestep using init_timestep
463
+ init_timestep = min(int(num_inference_steps * strength), num_inference_steps)
464
+
465
+ t_start = max(num_inference_steps - init_timestep, 0)
466
+ timesteps = self.scheduler.timesteps[t_start:]
467
+
468
+ return timesteps, num_inference_steps - t_start
469
+
470
+
471
+ core = ov.Core()
472
+ device = "CPU"
473
+
474
+ models_dir = Path('on-canvers-disney-v3.9.1-ov-face')  # alternative: 'on-canvers-real-ov-ref-v3.9.1'
475
+ IMAGE_ENCODER_PATH = models_dir / "image_encoder.xml"
476
+ UNET_PATH = models_dir / "unet.xml"
477
+ VAE_DECODER_PATH = models_dir / "vae_decoder.xml"
478
+ VAE_ENCODER_PATH = models_dir / "vae_encoder.xml"
479
+ TEXT_ENCODER_PATH = models_dir / "text_encoder.xml"
480
+
481
+ from transformers import AutoTokenizer
482
+ from PIL import Image
483
+
484
+ ov_config = {}# {"INFERENCE_PRECISION_HINT": "fp16"}
485
+ vae_decoder = core.compile_model(VAE_DECODER_PATH, device, ov_config)
486
+ vae_encoder = core.compile_model(VAE_ENCODER_PATH, device, ov_config)
487
+ text_encoder = core.compile_model(TEXT_ENCODER_PATH, device)
488
+ image_encoder = core.compile_model(IMAGE_ENCODER_PATH, device)
489
+ unet = core.compile_model(UNET_PATH, device)
490
+
491
+ scheduler = LCMScheduler.from_pretrained(models_dir / "scheduler")
492
+ tokenizer = AutoTokenizer.from_pretrained(models_dir / "tokenizer")
493
+ feature_extractor = CLIPImageProcessor.from_pretrained(models_dir / "feature_extractor")
494
+
495
+ ov_pipe = OVStableDiffusionPipeline(
496
+ vae_decoder,
497
+ text_encoder,
498
+ tokenizer,
499
+ unet,
500
+ scheduler,
501
+ image_encoder,
502
+ feature_extractor,
503
+ vae_encoder,
504
+ )
505
+
506
+ generator = torch.Generator(device="cpu").manual_seed(576)
507
+
508
+ ip_image = load_image("./input.jpg")
509
+ #ip_image.resize((512, 512))
510
+
511
+ image = Image.open("ai_face.png").convert('RGB')
512
+ image = image.resize((512, 512))  # PIL resize returns a new image; keep the result
513
+
514
+ #image = load_image("https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/vermeer.jpg")
515
+ #ip_image = load_image("https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/river.png")
516
+
517
+ result = ov_pipe(
518
+ prompt="best quality, high quality, beautiful korean woman is wearing glasses",
519
+ #image=image,
520
+ ip_adapter_image=image,
521
+ height=512,
522
+ width=512,
523
+ guidance_scale=1,
524
+ generator=generator,
525
+ #strength=0.7,
526
+ num_inference_steps=4,
527
+ ).images[0]
528
+
529
+ result.save("test7.png")
input.jpg ADDED
scheduler/scheduler_config.json ADDED
@@ -0,0 +1,22 @@
+ {
+   "_class_name": "LCMScheduler",
+   "_diffusers_version": "0.30.2",
+   "beta_end": 0.012,
+   "beta_schedule": "scaled_linear",
+   "beta_start": 0.00085,
+   "clip_sample": false,
+   "clip_sample_range": 1.0,
+   "dynamic_thresholding_ratio": 0.995,
+   "num_train_timesteps": 1000,
+   "original_inference_steps": 50,
+   "prediction_type": "epsilon",
+   "rescale_betas_zero_snr": false,
+   "sample_max_value": 1.0,
+   "set_alpha_to_one": false,
+   "skip_prk_steps": true,
+   "steps_offset": 1,
+   "thresholding": false,
+   "timestep_scaling": 10.0,
+   "timestep_spacing": "leading",
+   "trained_betas": null
+ }
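As a quick sanity check, the scheduler folder added here can be loaded on its own to see which four LCM timesteps are drawn from the original_inference_steps=50 grid; infer_face.py performs the same LCMScheduler.from_pretrained call against models_dir / "scheduler":

from diffusers import LCMScheduler

scheduler = LCMScheduler.from_pretrained("scheduler")  # this repo's scheduler folder
scheduler.set_timesteps(4)          # same num_inference_steps as infer_face.py
print(scheduler.timesteps)          # the 4 selected timesteps
print(scheduler.init_noise_sigma)   # noise scale applied to the initial latents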
sd_quant_face.py ADDED
@@ -0,0 +1,790 @@
1
+ from pathlib import Path
2
+ from diffusers import AutoPipelineForText2Image
3
+ from transformers import CLIPVisionModelWithProjection
4
+ from diffusers.utils import load_image
5
+ from diffusers import LCMScheduler
6
+
7
+
8
+ stable_diffusion_id = "circulus/canvers-disney-v3.9.1"
9
+ ip_adapter_id = "h94/IP-Adapter"
10
+ ip_adapter_weight_name = "ip-adapter-full-face_sd15.bin"  # alternative: "ip-adapter_sd15.bin"
11
+ lcm_lora_id = "latent-consistency/lcm-lora-sdv1-5"
12
+ models_dir = Path("on-canvers-disney-v3.9.1-ov-face")
13
+ int8_model_path = Path("on-canvers-disney-v3.9.1-ov-face-int8")
14
+ from optimum.intel import OVConfig, OVQuantizer, OVStableDiffusionPipeline, OVWeightQuantizationConfig
15
+ from optimum.intel.openvino.configuration import OVQuantizationMethod
16
+
17
+ load_original_pipeline = not all(
18
+ [
19
+ (models_dir / model_name).exists()
20
+ for model_name in [
21
+ "text_encoder.xml",
22
+ "image_encoder.xml",
23
+ "unet.xml",
24
+ "vae_decoder.xml",
25
+ "vae_encoder.xml",
26
+ ]
27
+ ]
28
+ )
29
+
30
+
31
+ def get_pipeline_components(
32
+ stable_diffusion_id,
33
+ ip_adapter_id,
34
+ ip_adapter_weight_name,
35
+ lcm_lora_id,
36
+ ip_adapter_scale=0.65,
37
+ ):
38
+ image_encoder = CLIPVisionModelWithProjection.from_pretrained("h94/IP-Adapter", subfolder="models/image_encoder")
39
+ print(image_encoder)
40
+ pipeline = AutoPipelineForText2Image.from_pretrained(stable_diffusion_id, image_encoder=image_encoder)
41
+ pipeline.load_lora_weights(lcm_lora_id)
42
+ pipeline.fuse_lora()
43
+ pipeline.load_ip_adapter(ip_adapter_id, subfolder="models", weight_name=ip_adapter_weight_name)
44
+ pipeline.set_ip_adapter_scale(ip_adapter_scale)
45
+ scheduler = LCMScheduler.from_pretrained(stable_diffusion_id, subfolder="scheduler")
46
+ return (
47
+ pipeline.tokenizer,
48
+ pipeline.feature_extractor,
49
+ scheduler,
50
+ pipeline.text_encoder,
51
+ pipeline.image_encoder,
52
+ pipeline.unet,
53
+ pipeline.vae,
54
+ )
55
+
56
+
57
+ if load_original_pipeline:
58
+ (
59
+ tokenizer,
60
+ feature_extractor,
61
+ scheduler,
62
+ text_encoder,
63
+ image_encoder,
64
+ unet,
65
+ vae,
66
+ ) = get_pipeline_components(stable_diffusion_id, ip_adapter_id, ip_adapter_weight_name, lcm_lora_id)
67
+ scheduler.save_pretrained(models_dir / "scheduler")
68
+ else:
69
+ tokenizer, feature_extractor, scheduler, text_encoder, image_encoder, unet, vae = (
70
+ None,
71
+ None,
72
+ None,
73
+ None,
74
+ None,
75
+ None,
76
+ None,
77
+ )
78
+
79
+ import openvino as ov
80
+ import torch
81
+ import gc
82
+
83
+
84
+ def cleanup_torchscript_cache():
85
+ """
86
+ Helper for removing cached model representation
87
+ """
88
+ torch._C._jit_clear_class_registry()
89
+ torch.jit._recursive.concrete_type_store = torch.jit._recursive.ConcreteTypeStore()
90
+ torch.jit._state._clear_class_state()
91
+
92
+ IMAGE_ENCODER_PATH = models_dir / "image_encoder.xml"
93
+ UNET_PATH = models_dir / "unet.xml"
94
+ VAE_DECODER_PATH = models_dir / "vae_decoder.xml"
95
+ VAE_ENCODER_PATH = models_dir / "vae_encoder.xml"
96
+ TEXT_ENCODER_PATH = models_dir / "text_encoder.xml"
97
+
98
+ if not IMAGE_ENCODER_PATH.exists():
99
+ with torch.no_grad():
100
+ ov_model = ov.convert_model(
101
+ image_encoder,
102
+ example_input=torch.zeros((1, 3, 224, 224)),
103
+ input=[-1, 3, 224, 224],
104
+ )
105
+ ov.save_model(ov_model, IMAGE_ENCODER_PATH)
106
+ feature_extractor.save_pretrained(models_dir / "feature_extractor")
107
+ del ov_model
108
+ cleanup_torchscript_cache()
109
+
110
+
111
+ if not UNET_PATH.exists():
112
+ inputs = {
113
+ "sample": torch.randn((2, 4, 64, 64)),
114
+ "timestep": torch.tensor(1),
115
+ "encoder_hidden_states": torch.randn((2, 77, 768)),
116
+ "added_cond_kwargs": {"image_embeds": torch.ones((2, 1280))}, # 2,1024
117
+ }
118
+
119
+ print(unet)
120
+
121
+ with torch.no_grad():
122
+ ov_model = ov.convert_model(unet, example_input=inputs)
123
+ # dictionary with added_cond_kwargs will be decomposed during conversion
124
+ # in some cases decomposition may lead to losing data type and shape information
125
+ # We need to recover it manually after the conversion
126
+ ov_model.inputs[-1].get_node().set_element_type(ov.Type.f32)
127
+ ov_model.validate_nodes_and_infer_types()
128
+ ov.save_model(ov_model, UNET_PATH)
129
+ del ov_model
130
+ cleanup_torchscript_cache()
131
+
132
+ if not VAE_DECODER_PATH.exists():
133
+
134
+ class VAEDecoderWrapper(torch.nn.Module):
135
+ def __init__(self, vae):
136
+ super().__init__()
137
+ self.vae = vae
138
+
139
+ def forward(self, latents):
140
+ return self.vae.decode(latents)
141
+
142
+ vae_decoder = VAEDecoderWrapper(vae)
143
+ with torch.no_grad():
144
+ ov_model = ov.convert_model(vae_decoder, example_input=torch.ones([1, 4, 64, 64]))
145
+ ov.save_model(ov_model, VAE_DECODER_PATH)
146
+ del ov_model
147
+ cleanup_torchscript_cache()
148
+ del vae_decoder
149
+
150
+ if not VAE_ENCODER_PATH.exists():
151
+
152
+ class VAEEncoderWrapper(torch.nn.Module):
153
+ def __init__(self, vae):
154
+ super().__init__()
155
+ self.vae = vae
156
+
157
+ def forward(self, image):
158
+ return self.vae.encode(x=image)["latent_dist"].sample()
159
+
160
+ vae_encoder = VAEEncoderWrapper(vae)
161
+ vae_encoder.eval()
162
+ image = torch.zeros((1, 3, 512, 512))
163
+ with torch.no_grad():
164
+ ov_model = ov.convert_model(vae_encoder, example_input=image)
165
+ ov.save_model(ov_model, VAE_ENCODER_PATH)
166
+ del ov_model
167
+ cleanup_torchscript_cache()
168
+
169
+
170
+ if not TEXT_ENCODER_PATH.exists():
171
+ with torch.no_grad():
172
+ ov_model = ov.convert_model(
173
+ text_encoder,
174
+ example_input=torch.ones([1, 77], dtype=torch.long),
175
+ input=[
176
+ (1, 77),
177
+ ],
178
+ )
179
+ ov.save_model(ov_model, TEXT_ENCODER_PATH)
180
+ del ov_model
181
+ cleanup_torchscript_cache()
182
+ tokenizer.save_pretrained(models_dir / "tokenizer")
183
+
184
+
185
+ import inspect
186
+ from typing import List, Optional, Union, Dict, Tuple
187
+ import numpy as np
188
+
189
+ from pathlib import Path
190
+ from diffusers import AutoPipelineForText2Image
191
+ from transformers import CLIPVisionModelWithProjection
192
+ from diffusers.utils import load_image
193
+ from diffusers import LCMScheduler
194
+
195
+ import PIL
196
+ import cv2
197
+ import torch
198
+ import openvino as ov
199
+
200
+ from transformers import CLIPTokenizer, CLIPImageProcessor
201
+ from diffusers import DiffusionPipeline
202
+ from diffusers.pipelines.stable_diffusion.pipeline_output import (
203
+ StableDiffusionPipelineOutput,
204
+ )
205
+ from diffusers.schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler
206
+ from resampler import Resampler
207
+
208
+
209
+ def scale_fit_to_window(dst_width: int, dst_height: int, image_width: int, image_height: int):
210
+ """
211
+ Preprocessing helper function for calculating the image size for resize while preserving the original aspect ratio
212
+ and fitting image to specific window size
213
+
214
+ Parameters:
215
+ dst_width (int): destination window width
216
+ dst_height (int): destination window height
217
+ image_width (int): source image width
218
+ image_height (int): source image height
219
+ Returns:
220
+ result_width (int): calculated width for resize
221
+ result_height (int): calculated height for resize
222
+ """
223
+ im_scale = min(dst_height / image_height, dst_width / image_width)
224
+ return int(im_scale * image_width), int(im_scale * image_height)
225
+
226
+
227
+ def randn_tensor(
228
+ shape: Union[Tuple, List],
229
+ generator: Optional[Union[List["torch.Generator"], "torch.Generator"]] = None,
230
+ dtype: Optional["torch.dtype"] = None,
231
+ ):
232
+ """A helper function to create random tensors on the desired `device` with the desired `dtype`. When
233
+ passing a list of generators, you can seed each batch size individually.
234
+
235
+ """
236
+ batch_size = shape[0]
237
+ rand_device = torch.device("cpu")
238
+
239
+ # make sure generator list of length 1 is treated like a non-list
240
+ if isinstance(generator, list) and len(generator) == 1:
241
+ generator = generator[0]
242
+
243
+ if isinstance(generator, list):
244
+ shape = (1,) + shape[1:]
245
+ latents = [torch.randn(shape, generator=generator[i], device=rand_device, dtype=dtype) for i in range(batch_size)]
246
+ latents = torch.cat(latents, dim=0)
247
+ else:
248
+ latents = torch.randn(shape, generator=generator, device=rand_device, dtype=dtype)
249
+
250
+ return latents
251
+
252
+
253
+ def preprocess(image: PIL.Image.Image, height, width):
254
+ """
255
+ Image preprocessing function. Takes an image in PIL.Image format, resizes it to keep the aspect ratio and fit the model input window 512x512,
256
+ then converts it to np.ndarray and adds zero padding on the right or bottom side of the image (depending on the aspect ratio), after that
257
+ converts the data to float32, changes the value range from [0, 255] to [-1, 1] and, finally, converts the data layout from NHWC to NCHW.
258
+ The function returns preprocessed input tensor and padding size, which can be used in postprocessing.
259
+
260
+ Parameters:
261
+ image (PIL.Image.Image): input image
262
+ Returns:
263
+ image (np.ndarray): preprocessed image tensor
264
+ meta (Dict): dictionary with preprocessing metadata info
265
+ """
266
+ src_width, src_height = image.size
267
+ dst_width, dst_height = scale_fit_to_window(height, width, src_width, src_height)
268
+ image = np.array(image.resize((dst_width, dst_height), resample=PIL.Image.Resampling.LANCZOS))[None, :]
269
+ print(image.shape)
270
+ pad_width = width - dst_width
271
+ pad_height = height - dst_height
272
+ pad = ((0, 0), (0, pad_height), (0, pad_width), (0, 0))
273
+ image = np.pad(image, pad, mode="constant")
274
+ image = image.astype(np.float32) / 255.0
275
+ #image = image.astype(np.float16) / 255.0
276
+ image = 2.0 * image - 1.0
277
+ image = image.transpose(0, 3, 1, 2)
278
+ print(image.shape)
279
+ return image, {"padding": pad, "src_width": src_width, "src_height": src_height}
280
+
281
+
282
+ class OVStableDiffusionPipeline(DiffusionPipeline):
283
+ def __init__(
284
+ self,
285
+ vae_decoder: ov.Model,
286
+ text_encoder: ov.Model,
287
+ tokenizer: CLIPTokenizer,
288
+ unet: ov.Model,
289
+ scheduler: Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler],
290
+ image_encoder: ov.Model,
291
+ feature_extractor: CLIPImageProcessor,
292
+ vae_encoder: ov.Model,
293
+ ):
294
+ """
295
+ Pipeline for text-to-image generation using Stable Diffusion and IP-Adapter with OpenVINO
296
+ Parameters:
297
+ vae_decoder (ov.Model):
298
+ Variational Auto-Encoder (VAE) Model to decode images to and from latent representations.
299
+ text_encoder (ov.Model):
300
+ Frozen text-encoder. Stable Diffusion uses the text portion of
301
+ [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
302
+ the clip-vit-large-patch14(https://huggingface.co/openai/clip-vit-large-patch14) variant.
303
+ tokenizer (CLIPTokenizer):
304
+ Tokenizer of class CLIPTokenizer(https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
305
+ unet (ov.Model): Conditional U-Net architecture to denoise the encoded image latents.
306
+ scheduler (SchedulerMixin):
307
+ A scheduler to be used in combination with unet to denoise the encoded image latents
308
+ image_encoder (ov.Model):
309
+ IP-Adapter image encoder for embedding input image as input prompt for generation
310
+ feature_extractor (CLIPImageProcessor): image processor that prepares the IP-Adapter input image for the image encoder
311
+ """
312
+ super().__init__()
313
+ self.scheduler = scheduler
314
+ self.vae_decoder = vae_decoder
315
+ self.image_encoder = image_encoder
316
+ self.text_encoder = text_encoder
317
+ self.unet = unet
318
+ self.height = 512
319
+ self.width = 512
320
+ self.vae_scale_factor = 8
321
+ self.tokenizer = tokenizer
322
+ self.vae_encoder = vae_encoder
323
+ self.feature_extractor = feature_extractor
324
+ self.register_to_config(unet=unet) # config
325
+
326
+ def __call__(
327
+ self,
328
+ prompt: Union[str, List[str]],
329
+ ip_adapter_image: PIL.Image.Image,
330
+ image: PIL.Image.Image = None,
331
+ num_inference_steps: Optional[int] = 4,
332
+ negative_prompt: Union[str, List[str]] = None,
333
+ guidance_scale: Optional[float] = 0.5,
334
+ eta: Optional[float] = 0.0,
335
+ output_type: Optional[str] = "pil",
336
+ height: Optional[int] = None,
337
+ width: Optional[int] = None,
338
+ generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
339
+ latents: Optional[torch.FloatTensor] = None,
340
+ strength: float = 1.0,
341
+ **kwargs,
342
+ ):
343
+ """
344
+ Function invoked when calling the pipeline for generation.
345
+ Parameters:
346
+ prompt (str or List[str]):
347
+ The prompt or prompts to guide the image generation.
348
+ image (PIL.Image.Image, *optional*, None):
349
+ Initial image for generation.
350
+ num_inference_steps (int, *optional*, defaults to 50):
351
+ The number of denoising steps. More denoising steps usually lead to a higher quality image at the
352
+ expense of slower inference.
353
+ negative_prompt (str or List[str]):
354
+ The negative prompt or prompts to guide the image generation.
355
+ guidance_scale (float, *optional*, defaults to 7.5):
356
+ Guidance scale as defined in Classifier-Free Diffusion Guidance(https://arxiv.org/abs/2207.12598).
357
+ guidance_scale is defined as `w` of equation 2.
358
+ Higher guidance scale encourages to generate images that are closely linked to the text prompt,
359
+ usually at the expense of lower image quality.
360
+ eta (float, *optional*, defaults to 0.0):
361
+ Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
362
+ [DDIMScheduler], will be ignored for others.
363
+ output_type (`str`, *optional*, defaults to "pil"):
364
+ The output format of the generate image. Choose between
365
+ [PIL](https://pillow.readthedocs.io/en/stable/): PIL.Image.Image or np.array.
366
+ height (int, *optional*, 512):
367
+ Generated image height
368
+ width (int, *optional*, 512):
369
+ Generated image width
370
+ generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
371
+ A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
372
+ generation deterministic.
373
+ latents (`torch.FloatTensor`, *optional*):
374
+ Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
375
+ generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
376
+ tensor is generated by sampling using the supplied random `generator`.
377
+ Returns:
378
+ Dictionary with keys:
379
+ sample - the last generated image, PIL.Image.Image or np.array
380
+ iterations - *optional* (if gif=True) images for all diffusion steps, List of PIL.Image.Image or np.array.
381
+ """
382
+ do_classifier_free_guidance = guidance_scale > 1.0
383
+ # get prompt text embeddings
384
+ text_embeddings = self._encode_prompt(
385
+ prompt,
386
+ do_classifier_free_guidance=do_classifier_free_guidance,
387
+ negative_prompt=negative_prompt,
388
+ )
389
+ # get ip-adapter image embeddings
390
+ image_embeds, negative_image_embeds = self.encode_image(ip_adapter_image)
391
+ if do_classifier_free_guidance:
392
+ image_embeds = np.concatenate([negative_image_embeds, image_embeds])
393
+
394
+ # set timesteps
395
+ accepts_offset = "offset" in set(inspect.signature(self.scheduler.set_timesteps).parameters.keys())
396
+ extra_set_kwargs = {}
397
+ if accepts_offset:
398
+ extra_set_kwargs["offset"] = 1
399
+
400
+ self.scheduler.set_timesteps(num_inference_steps, **extra_set_kwargs)
401
+
402
+ timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength)
403
+ latent_timestep = timesteps[:1]
404
+
405
+ print(num_inference_steps,timesteps)
406
+
407
+ # get the initial random noise unless the user supplied it
408
+ latents, meta = self.prepare_latents(
409
+ 1,
410
+ 4,
411
+ height or self.height,
412
+ width or self.width,
413
+ generator=generator,
414
+ latents=latents,
415
+ image=image,
416
+ latent_timestep=latent_timestep,
417
+ )
418
+
419
+ # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
420
+ # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
421
+ # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
422
+ # and should be between [0, 1]
423
+ accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
424
+ extra_step_kwargs = {}
425
+ if accepts_eta:
426
+ extra_step_kwargs["eta"] = eta
427
+
428
+ for i, t in enumerate(self.progress_bar(timesteps)):
429
+ # expand the latents if you are doing classifier free guidance
430
+ latent_model_input = np.concatenate([latents] * 2) if do_classifier_free_guidance else latents
431
+ latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
432
+
433
+ # predict the noise residual
434
+
435
+ noise_pred = self.unet([latent_model_input, t, text_embeddings, image_embeds])[0]
436
+ # perform guidance
437
+ if do_classifier_free_guidance:
438
+ noise_pred_uncond, noise_pred_text = noise_pred[0], noise_pred[1]
439
+ noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
440
+
441
+ # compute the previous noisy sample x_t -> x_t-1
442
+ latents = self.scheduler.step(
443
+ torch.from_numpy(noise_pred),
444
+ t,
445
+ torch.from_numpy(latents),
446
+ **extra_step_kwargs,
447
+ )["prev_sample"].numpy()
448
+
449
+ # scale and decode the image latents with vae
450
+ image = self.vae_decoder(latents * (1 / 0.18215))[0]
451
+
452
+ image = self.postprocess_image(image, meta, output_type)
453
+ return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=False)
454
+
455
+ def _encode_prompt(
456
+ self,
457
+ prompt: Union[str, List[str]],
458
+ num_images_per_prompt: int = 1,
459
+ do_classifier_free_guidance: bool = True,
460
+ negative_prompt: Union[str, List[str]] = None,
461
+ ):
462
+ """
463
+ Encodes the prompt into text encoder hidden states.
464
+
465
+ Parameters:
466
+ prompt (str or list(str)): prompt to be encoded
467
+ num_images_per_prompt (int): number of images that should be generated per prompt
468
+ do_classifier_free_guidance (bool): whether to use classifier free guidance or not
469
+ negative_prompt (str or list(str)): negative prompt to be encoded.
470
+ Returns:
471
+ text_embeddings (np.ndarray): text encoder hidden states
472
+ """
473
+ batch_size = len(prompt) if isinstance(prompt, list) else 1
474
+
475
+ # tokenize input prompts
476
+ text_inputs = self.tokenizer(
477
+ prompt,
478
+ padding="max_length",
479
+ max_length=self.tokenizer.model_max_length,
480
+ truncation=True,
481
+ return_tensors="np",
482
+ )
483
+ text_input_ids = text_inputs.input_ids
484
+
485
+ text_embeddings = self.text_encoder(text_input_ids)[0]
486
+
487
+ # duplicate text embeddings for each generation per prompt
488
+ if num_images_per_prompt != 1:
489
+ bs_embed, seq_len, _ = text_embeddings.shape
490
+ text_embeddings = np.tile(text_embeddings, (1, num_images_per_prompt, 1))
491
+ text_embeddings = np.reshape(text_embeddings, (bs_embed * num_images_per_prompt, seq_len, -1))
492
+
493
+ # get unconditional embeddings for classifier free guidance
494
+ if do_classifier_free_guidance:
495
+ uncond_tokens: List[str]
496
+ max_length = text_input_ids.shape[-1]
497
+ if negative_prompt is None:
498
+ uncond_tokens = [""] * batch_size
499
+ elif isinstance(negative_prompt, str):
500
+ uncond_tokens = [negative_prompt]
501
+ else:
502
+ uncond_tokens = negative_prompt
503
+ uncond_input = self.tokenizer(
504
+ uncond_tokens,
505
+ padding="max_length",
506
+ max_length=max_length,
507
+ truncation=True,
508
+ return_tensors="np",
509
+ )
510
+
511
+ uncond_embeddings = self.text_encoder(uncond_input.input_ids)[0]
512
+
513
+ # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
514
+ seq_len = uncond_embeddings.shape[1]
515
+ uncond_embeddings = np.tile(uncond_embeddings, (1, num_images_per_prompt, 1))
516
+ uncond_embeddings = np.reshape(uncond_embeddings, (batch_size * num_images_per_prompt, seq_len, -1))
517
+
518
+ # For classifier-free guidance, we need to do two forward passes.
519
+ # Here we concatenate the unconditional and text embeddings into a single batch
520
+ # to avoid doing two forward passes
521
+ text_embeddings = np.concatenate([uncond_embeddings, text_embeddings])
522
+
523
+ return text_embeddings
524
+
525
+ def prepare_latents(
526
+ self,
527
+ batch_size,
528
+ num_channels_latents,
529
+ height,
530
+ width,
531
+ dtype=torch.float16,
532
+ generator=None,
533
+ latents=None,
534
+ image=None,
535
+ latent_timestep=None,
536
+ ):
537
+ shape = (
538
+ batch_size,
539
+ num_channels_latents,
540
+ height // self.vae_scale_factor,
541
+ width // self.vae_scale_factor,
542
+ )
543
+ if isinstance(generator, list) and len(generator) != batch_size:
544
+ raise ValueError(
545
+ f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
546
+ f" size of {batch_size}. Make sure the batch size matches the length of the generators."
547
+ )
548
+
549
+ if latents is None:
550
+ latents = randn_tensor(shape, generator=generator, dtype=dtype)
551
+
552
+ if image is None:
553
+ # scale the initial noise by the standard deviation required by the scheduler
554
+ latents = latents * self.scheduler.init_noise_sigma
555
+ return latents.numpy(), {}
556
+ input_image, meta = preprocess(image, height, width)
557
+ print(input_image.shape)
558
+ image_latents = self.vae_encoder(input_image)[0]
559
+ image_latents = image_latents * 0.18215
560
+ latents = self.scheduler.add_noise(torch.from_numpy(image_latents), latents, latent_timestep).numpy()
561
+ return latents, meta
562
+
563
+ def postprocess_image(self, image: np.ndarray, meta: Dict, output_type: str = "pil"):
564
+ """
565
+ Postprocessing for decoded image. Takes generated image decoded by VAE decoder, unpad it to initial image size (if required),
566
+ normalize and convert to [0, 255] pixels range. Optionally, converts it from np.ndarray to PIL.Image format
567
+
568
+ Parameters:
569
+ image (np.ndarray):
570
+ Generated image
571
+ meta (Dict):
572
+ Metadata obtained during the latents preparation step, can be empty
573
+ output_type (str, *optional*, pil):
574
+ Output format for result, can be pil or numpy
575
+ Returns:
576
+ image (List of np.ndarray or PIL.Image.Image):
577
+ Post-processed images
578
+ """
579
+ if "padding" in meta:
580
+ pad = meta["padding"]
581
+ (_, end_h), (_, end_w) = pad[1:3]
582
+ h, w = image.shape[2:]
583
+ unpad_h = h - end_h
584
+ unpad_w = w - end_w
585
+ image = image[:, :, :unpad_h, :unpad_w]
586
+ image = np.clip(image / 2 + 0.5, 0, 1)
587
+ image = np.transpose(image, (0, 2, 3, 1))
588
+
589
+
590
+ # 9. Convert to PIL
591
+ if output_type == "pil":
592
+ image = self.numpy_to_pil(image)
593
+ if "src_height" in meta:
594
+ orig_height, orig_width = meta["src_height"], meta["src_width"]
595
+ image = [img.resize((orig_width, orig_height), PIL.Image.Resampling.LANCZOS) for img in image]
596
+ else:
597
+ if "src_height" in meta:
598
+ orig_height, orig_width = meta["src_height"], meta["src_width"]
599
+ image = [cv2.resize(img, (orig_width, orig_height)) for img in image]
600
+
601
+
602
+ return image
603
+
604
+ def encode_image(self, image, num_images_per_prompt=1):
605
+ if not isinstance(image, torch.Tensor):
606
+ image = self.feature_extractor(image, return_tensors="pt").pixel_values
607
+
608
+ image_embeds = self.image_encoder(image)[0]
609
+ """
610
+ print(1,image_embeds)
611
+ image_proj_model = Resampler(
612
+ dim=1024,
613
+ depth=2,
614
+ dim_head=64,
615
+ heads=16,
616
+ num_queries=8,
617
+ embedding_dim=1280,
618
+ output_dim=1280,
619
+ ff_mult=2,
620
+ max_seq_len=257,
621
+ apply_pos_emb=True,
622
+ num_latents_mean_pooled=4,
623
+ )
624
+
625
+ image_embeds = image_proj_model(image_embeds)
626
+ print(2,image_embeds)
627
+ """
628
+
629
+ if num_images_per_prompt > 1:
630
+ image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0)
631
+
632
+ uncond_image_embeds = np.zeros(image_embeds.shape)
633
+ return image_embeds, uncond_image_embeds
634
+
635
+ def get_timesteps(self, num_inference_steps: int, strength: float):
636
+ """
637
+ Helper function for getting scheduler timesteps for generation
638
+ In case of image-to-image generation, it updates number of steps according to strength
639
+
640
+ Parameters:
641
+ num_inference_steps (int):
642
+ number of inference steps for generation
643
+ strength (float):
644
+ value between 0.0 and 1.0, that controls the amount of noise that is added to the input image.
645
+ Values that approach 1.0 allow for lots of variations but will also produce images that are not semantically consistent with the input.
646
+ """
647
+ # get the original timestep using init_timestep
648
+ init_timestep = min(int(num_inference_steps * strength), num_inference_steps)
649
+
650
+ t_start = max(num_inference_steps - init_timestep, 0)
651
+ timesteps = self.scheduler.timesteps[t_start:]
652
+
653
+ return timesteps, num_inference_steps - t_start
654
+
655
+
656
+ core = ov.Core()
657
+ device = "GPU"
658
+
659
+
660
+
661
+ from transformers import AutoTokenizer
662
+ from PIL import Image
663
+
664
+ ov_config = {"INFERENCE_PRECISION_HINT": "f16"}
665
+ vae_decoder = core.compile_model(VAE_DECODER_PATH, device, ov_config)
666
+ vae_encoder = core.compile_model(VAE_ENCODER_PATH, device, ov_config)
667
+ text_encoder = core.compile_model(TEXT_ENCODER_PATH, device )
668
+ image_encoder = core.compile_model(IMAGE_ENCODER_PATH, device)
669
+ unet = core.compile_model(UNET_PATH, device)
670
+
671
+ scheduler = LCMScheduler.from_pretrained(models_dir / "scheduler")
672
+ tokenizer = AutoTokenizer.from_pretrained(models_dir / "tokenizer")
673
+ feature_extractor = CLIPImageProcessor.from_pretrained(models_dir / "feature_extractor")
674
+
675
+ ov_pipe = OVStableDiffusionPipeline(
676
+ vae_decoder,
677
+ text_encoder,
678
+ tokenizer,
679
+ unet,
680
+ scheduler,
681
+ image_encoder,
682
+ feature_extractor,
683
+ vae_encoder,
684
+ #safety_checker = None
685
+ )
686
+
687
+ """
688
+ import datasets
689
+ DATASET_NAME = "jxie/coco_captions"
690
+
691
+ dataset = datasets.load_dataset("jxie/coco_captions", split="train", streaming=True).shuffle(seed=42)
692
+ def preprocess_fn(example):
693
+ return {"prompt": example["caption"]}
694
+
695
+ NUM_SAMPLES = 200
696
+ dataset = dataset.take(NUM_SAMPLES)
697
+ calibration_dataset = dataset.map(lambda x: preprocess_fn(x), remove_columns=dataset.column_names)
698
+
699
+
700
+ int8_pipe = None
701
+
702
+ import nncf
703
+ import datasets
704
+ from tqdm import tqdm
705
+ from transformers import set_seed
706
+ from typing import Any, Dict, List
707
+
708
+ set_seed(1)
709
+
710
+ class CompiledModelDecorator(ov.CompiledModel):
711
+ def __init__(self, compiled_model, prob: float, data_cache: List[Any] = None):
712
+ super().__init__(compiled_model)
713
+ self.data_cache = data_cache if data_cache else []
714
+ self.prob = np.clip(prob, 0, 1)
715
+
716
+ def __call__(self, *args, **kwargs):
717
+ if np.random.rand() >= self.prob:
718
+ self.data_cache.append(*args)
719
+ return super().__call__(*args, **kwargs)
720
+
721
+ from diffusers.utils import load_image
722
+
723
+ def collect_calibration_data(pipeline: OVStableDiffusionPipeline, subset_size: int) -> List[Dict]:
724
+ original_unet = pipeline.unet
725
+ pipeline.unet = CompiledModelDecorator(original_unet, prob=0.3)
726
+ #google-research-datasets/conceptual_captions
727
+ dataset = datasets.load_dataset("jxie/coco_captions", split="train", streaming=True).shuffle(seed=42)
728
+ pipeline.set_progress_bar_config(disable=True)
729
+ #safety_checker = pipeline.safety_checker
730
+ #pipeline.safety_checker = None
731
+
732
+ # Run inference for data collection
733
+ pbar = tqdm(total=subset_size)
734
+ diff = 0
735
+ for batch in dataset:
736
+ prompt = batch["caption"]
737
+ image = load_image(batch["image"])
738
+ if len(prompt) > tokenizer.model_max_length:
739
+ continue
740
+ _ = pipeline(
741
+ prompt,
742
+ ip_adapter_image = image,
743
+ num_inference_steps=4,
744
+ guidance_scale=1,
745
+ #guidance_scale=8.0,
746
+ #lcm_origin_steps=50,
747
+ output_type="pil",
748
+ height=512,
749
+ width=512,
750
+ )
751
+ collected_subset_size = len(pipeline.unet.data_cache)
752
+ if collected_subset_size >= subset_size:
753
+ pbar.update(subset_size - pbar.n)
754
+ break
755
+ pbar.update(collected_subset_size - diff)
756
+ diff = collected_subset_size
757
+
758
+ calibration_dataset = pipeline.unet.data_cache
759
+ pipeline.set_progress_bar_config(disable=False)
760
+ pipeline.unet = original_unet
761
+ #pipeline.safety_checker = safety_checker
762
+ return calibration_dataset
763
+
764
+
765
+ UNET_INT8_PATH = models_dir / "unet_int8.xml"
766
+
767
+ if not UNET_INT8_PATH.exists():
768
+ subset_size = 200
769
+ unet_calibration_data = collect_calibration_data(ov_pipe, subset_size=subset_size)
770
+
771
+
772
+ import nncf
773
+ from nncf.scopes import IgnoredScope
774
+
775
+ if UNET_INT8_PATH.exists():
776
+ print("Loading quantized model")
777
+ quantized_unet = core.read_model(UNET_INT8_PATH)
778
+ else:
779
+ unet = core.read_model(UNET_PATH)
780
+ quantized_unet = nncf.quantize(
781
+ model=unet,
782
+ subset_size=subset_size,
783
+ calibration_dataset=nncf.Dataset(unet_calibration_data),
784
+ model_type=nncf.ModelType.TRANSFORMER,
785
+ advanced_parameters=nncf.AdvancedQuantizationParameters(
786
+ disable_bias_correction=True
787
+ )
788
+ )
789
+ ov.save_model(quantized_unet, UNET_INT8_PATH)
790
+ """
text_encoder.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0ecfe63f53bccf534038ee94b2e414b457136b02cdb7033279d35693ef487f5e
+ size 246145458
text_encoder.xml ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer/special_tokens_map.json ADDED
@@ -0,0 +1,30 @@
+ {
+   "bos_token": {
+     "content": "<|startoftext|>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "eos_token": {
+     "content": "<|endoftext|>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "pad_token": {
+     "content": "<|endoftext|>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "unk_token": {
+     "content": "<|endoftext|>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
tokenizer/tokenizer_config.json ADDED
@@ -0,0 +1,30 @@
+ {
+   "add_prefix_space": false,
+   "added_tokens_decoder": {
+     "49406": {
+       "content": "<|startoftext|>",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "49407": {
+       "content": "<|endoftext|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     }
+   },
+   "bos_token": "<|startoftext|>",
+   "clean_up_tokenization_spaces": true,
+   "do_lower_case": true,
+   "eos_token": "<|endoftext|>",
+   "errors": "replace",
+   "model_max_length": 77,
+   "pad_token": "<|endoftext|>",
+   "tokenizer_class": "CLIPTokenizer",
+   "unk_token": "<|endoftext|>"
+ }
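These tokenizer files are what AutoTokenizer.from_pretrained resolves to in infer_face.py (models_dir / "tokenizer"). A short sketch of the fixed-length encoding that _encode_prompt relies on, with model_max_length = 77 for this CLIP tokenizer:

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("tokenizer")  # this repo's tokenizer folder
ids = tokenizer(
    "best quality, high quality, beautiful korean woman is wearing glasses",
    padding="max_length",
    max_length=tokenizer.model_max_length,  # 77
    truncation=True,
    return_tensors="np",
).input_ids
print(ids.shape)  # (1, 77), padded with the <|endoftext|> pad token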
tokenizer/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
unet.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:91705a6ca0fdfeee9c2035c9c3e69f6b32df476e65960f15c1864c6981de9c63
+ size 1762631378
unet.xml ADDED
The diff for this file is too large to render. See raw diff
 
vae_decoder.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:676c514fd4b6acac962ce74445f486cfaecad455e7299551053fbbc73c1e9a67
+ size 98980618
vae_decoder.xml ADDED
The diff for this file is too large to render. See raw diff
 
vae_encoder.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:43daf33c1843d9bc771c8534c14cb90b4dc836db309261397041bbba8148c687
+ size 68327564
vae_encoder.xml ADDED
The diff for this file is too large to render. See raw diff