StevenZhang and ypyp committed
Commit ba97433 · verified · Parent: 34fe37a

update demo (#6)

- fix load error & update readme (3cb3d96865bbe334ac52f981b3b437333b0eabed)
- update readme (c7c741dcdbce91016f160a1907332bf757a3e89a)

Co-authored-by: sypyp <[email protected]>
README.md CHANGED
@@ -155,19 +155,27 @@ Wan can also be run directly using 🤗 Diffusers!
 
 ```python
 import torch
+import numpy as np
 from diffusers import AutoencoderKLWan, WanImageToVideoPipeline
 from diffusers.utils import export_to_video, load_image
+from transformers import CLIPVisionModel
 
-# Available models: Wan-AI/Wan2.1-I2V-14B-480P-Diffusers, Wan-AI/Wan2.1-I2V-1.3B-720P-Diffusers
+# Available models: Wan-AI/Wan2.1-I2V-14B-480P-Diffusers, Wan-AI/Wan2.1-I2V-14B-720P-Diffusers
 model_id = "Wan-AI/Wan2.1-I2V-14B-480P-Diffusers"
+image_encoder = CLIPVisionModel.from_pretrained(model_id, subfolder="image_encoder", torch_dtype=torch.float32)
 vae = AutoencoderKLWan.from_pretrained(model_id, subfolder="vae", torch_dtype=torch.float32)
-pipe = WanImageToVideoPipeline.from_pretrained(model_id, vae=vae, torch_dtype=torch.bfloat16)
+pipe = WanImageToVideoPipeline.from_pretrained(model_id, vae=vae, image_encoder=image_encoder, torch_dtype=torch.bfloat16)
 pipe.to("cuda")
 
-height, width = 480, 832
 image = load_image(
     "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/astronaut.jpg"
-).resize((width, height))
+)
+max_area = 480 * 832
+aspect_ratio = image.height / image.width
+mod_value = pipe.vae_scale_factor_spatial * pipe.transformer.config.patch_size[1]
+height = round(np.sqrt(max_area * aspect_ratio)) // mod_value * mod_value
+width = round(np.sqrt(max_area / aspect_ratio)) // mod_value * mod_value
+image = image.resize((width, height))
 prompt = (
     "An astronaut hatching from an egg, on the surface of the moon, the darkness and depth of space realised in "
     "the background. High quality, ultrarealistic detail and breath-taking movie-like camera shot."
@@ -175,9 +183,9 @@ prompt = (
 negative_prompt = "Bright tones, overexposed, static, blurred details, subtitles, style, works, paintings, images, static, overall gray, worst quality, low quality, JPEG compression residue, ugly, incomplete, extra fingers, poorly drawn hands, poorly drawn faces, deformed, disfigured, misshapen limbs, fused fingers, still picture, messy background, three legs, many people in the background, walking backwards"
 
 output = pipe(
-    image=image, prompt=prompt, negative_prompt=negative_prompt, num_frames=81, guidance_scale=5.0
+    image=image, prompt=prompt, negative_prompt=negative_prompt, height=height, width=width, num_frames=81, guidance_scale=5.0
 ).frames[0]
-export_to_video(output, "output.mp4", fps=15)
+export_to_video(output, "output.mp4", fps=16)
 ```
 
 ##### (2) Using Prompt Extention
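
Note: the resize change above replaces the hard-coded 480x832 with a fit-to-budget computation: scale the input into a pixel area budget while preserving aspect ratio, then snap both sides down to multiples the VAE downsampling and transformer patching can accept. A minimal sketch of just that arithmetic, assuming a mod_value of 16 for this checkpoint (spatial VAE scale factor 8 times transformer patch size 2) and a hypothetical square input:

```python
# Sketch of the resolution rounding used in the updated README snippet.
# mod_value=16 is an assumption for this checkpoint; the real code reads it
# from pipe.vae_scale_factor_spatial * pipe.transformer.config.patch_size[1].
import math

def fit_resolution(img_height: int, img_width: int,
                   max_area: int = 480 * 832, mod_value: int = 16):
    """Fit the image into the pixel budget at its own aspect ratio,
    then round both sides down to multiples of mod_value."""
    aspect_ratio = img_height / img_width
    height = round(math.sqrt(max_area * aspect_ratio)) // mod_value * mod_value
    width = round(math.sqrt(max_area / aspect_ratio)) // mod_value * mod_value
    return height, width

print(fit_resolution(1024, 1024))  # -> (624, 624) for a square input
```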
image_encoder/config.json CHANGED
@@ -18,6 +18,6 @@
   "num_hidden_layers": 32,
   "patch_size": 14,
   "projection_dim": 1024,
-  "torch_dtype": "bfloat16",
+  "torch_dtype": "float32",
   "transformers_version": "4.48.0.dev0"
 }
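
This pins the image encoder's stored dtype metadata to float32, in line with the README now loading that component explicitly in float32. A quick sketch to confirm the dtype the component ends up in (assumes the checkpoint can be downloaded):

```python
# Sketch: verify the dtype the image encoder loads in after this change.
# The explicit torch_dtype argument mirrors the updated README snippet.
import torch
from transformers import CLIPVisionModel

image_encoder = CLIPVisionModel.from_pretrained(
    "Wan-AI/Wan2.1-I2V-14B-480P-Diffusers",
    subfolder="image_encoder",
    torch_dtype=torch.float32,
)
print(image_encoder.dtype)  # expected: torch.float32
```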
image_processor/merges.txt DELETED
The diff for this file is too large to render. See raw diff
 
image_processor/preprocessor_config.json CHANGED
@@ -19,7 +19,6 @@
     0.26130258,
     0.27577711
   ],
-  "processor_class": "CLIPProcessor",
   "resample": 3,
   "rescale_factor": 0.00392156862745098,
   "size": {
image_processor/special_tokens_map.json DELETED
@@ -1,30 +0,0 @@
-{
-  "bos_token": {
-    "content": "<|startoftext|>",
-    "lstrip": false,
-    "normalized": true,
-    "rstrip": false,
-    "single_word": false
-  },
-  "eos_token": {
-    "content": "<|endoftext|>",
-    "lstrip": false,
-    "normalized": false,
-    "rstrip": false,
-    "single_word": false
-  },
-  "pad_token": {
-    "content": "<|endoftext|>",
-    "lstrip": false,
-    "normalized": false,
-    "rstrip": false,
-    "single_word": false
-  },
-  "unk_token": {
-    "content": "<|endoftext|>",
-    "lstrip": false,
-    "normalized": false,
-    "rstrip": false,
-    "single_word": false
-  }
-}
image_processor/tokenizer.json DELETED
The diff for this file is too large to render. See raw diff
 
image_processor/tokenizer_config.json DELETED
@@ -1,32 +0,0 @@
-{
-  "add_prefix_space": false,
-  "added_tokens_decoder": {
-    "49406": {
-      "content": "<|startoftext|>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "49407": {
-      "content": "<|endoftext|>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    }
-  },
-  "bos_token": "<|startoftext|>",
-  "clean_up_tokenization_spaces": false,
-  "do_lower_case": true,
-  "eos_token": "<|endoftext|>",
-  "errors": "replace",
-  "extra_special_tokens": {},
-  "model_max_length": 77,
-  "pad_token": "<|endoftext|>",
-  "processor_class": "CLIPProcessor",
-  "tokenizer_class": "CLIPTokenizer",
-  "unk_token": "<|endoftext|>"
-}
image_processor/vocab.json DELETED
The diff for this file is too large to render. See raw diff
 
model_index.json CHANGED
@@ -7,7 +7,7 @@
   ],
   "image_processor": [
     "transformers",
-    "CLIPProcessor"
+    "CLIPImageProcessor"
   ],
   "scheduler": [
     "diffusers",