ypyp committed on
Commit
c7c741d
·
1 Parent(s): 3cb3d96

update readme

Browse files
Files changed (2) hide show
  1. README.md +12 -4
  2. image_encoder/config.json +1 -1
README.md CHANGED
@@ -155,19 +155,27 @@ Wan can also be run directly using 🤗 Diffusers!
155
 
156
  ```python
157
  import torch
 
158
  from diffusers import AutoencoderKLWan, WanImageToVideoPipeline
159
  from diffusers.utils import export_to_video, load_image
 
160
 
161
  # Available models: Wan-AI/Wan2.1-I2V-14B-480P-Diffusers, Wan-AI/Wan2.1-I2V-14B-720P-Diffusers
162
  model_id = "Wan-AI/Wan2.1-I2V-14B-480P-Diffusers"
 
163
  vae = AutoencoderKLWan.from_pretrained(model_id, subfolder="vae", torch_dtype=torch.float32)
164
- pipe = WanImageToVideoPipeline.from_pretrained(model_id, vae=vae, torch_dtype=torch.bfloat16)
165
  pipe.to("cuda")
166
 
167
- height, width = 480, 832
168
  image = load_image(
169
  "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/astronaut.jpg"
170
- ).resize((width, height))
 
 
 
 
 
 
171
  prompt = (
172
  "An astronaut hatching from an egg, on the surface of the moon, the darkness and depth of space realised in "
173
  "the background. High quality, ultrarealistic detail and breath-taking movie-like camera shot."
@@ -175,7 +183,7 @@ prompt = (
175
  negative_prompt = "Bright tones, overexposed, static, blurred details, subtitles, style, works, paintings, images, static, overall gray, worst quality, low quality, JPEG compression residue, ugly, incomplete, extra fingers, poorly drawn hands, poorly drawn faces, deformed, disfigured, misshapen limbs, fused fingers, still picture, messy background, three legs, many people in the background, walking backwards"
176
 
177
  output = pipe(
178
- image=image, prompt=prompt, negative_prompt=negative_prompt, num_frames=81, guidance_scale=5.0
179
  ).frames[0]
180
  export_to_video(output, "output.mp4", fps=16)
181
  ```
 
155
 
156
  ```python
157
  import torch
158
+ import numpy as np
159
  from diffusers import AutoencoderKLWan, WanImageToVideoPipeline
160
  from diffusers.utils import export_to_video, load_image
161
+ from transformers import CLIPVisionModel
162
 
163
  # Available models: Wan-AI/Wan2.1-I2V-14B-480P-Diffusers, Wan-AI/Wan2.1-I2V-14B-720P-Diffusers
164
  model_id = "Wan-AI/Wan2.1-I2V-14B-480P-Diffusers"
165
+ image_encoder = CLIPVisionModel.from_pretrained(model_id, subfolder="image_encoder", torch_dtype=torch.float32)
166
  vae = AutoencoderKLWan.from_pretrained(model_id, subfolder="vae", torch_dtype=torch.float32)
167
+ pipe = WanImageToVideoPipeline.from_pretrained(model_id, vae=vae, image_encoder=image_encoder, torch_dtype=torch.bfloat16)
168
  pipe.to("cuda")
169
 
 
170
  image = load_image(
171
  "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/astronaut.jpg"
172
+ )
173
+ max_area = 480 * 832
174
+ aspect_ratio = image.height / image.width
175
+ mod_value = pipe.vae_scale_factor_spatial * pipe.transformer.config.patch_size[1]
176
+ height = round(np.sqrt(max_area * aspect_ratio)) // mod_value * mod_value
177
+ width = round(np.sqrt(max_area / aspect_ratio)) // mod_value * mod_value
178
+ image = image.resize((width, height))
179
  prompt = (
180
  "An astronaut hatching from an egg, on the surface of the moon, the darkness and depth of space realised in "
181
  "the background. High quality, ultrarealistic detail and breath-taking movie-like camera shot."
 
183
  negative_prompt = "Bright tones, overexposed, static, blurred details, subtitles, style, works, paintings, images, static, overall gray, worst quality, low quality, JPEG compression residue, ugly, incomplete, extra fingers, poorly drawn hands, poorly drawn faces, deformed, disfigured, misshapen limbs, fused fingers, still picture, messy background, three legs, many people in the background, walking backwards"
184
 
185
  output = pipe(
186
+ image=image, prompt=prompt, negative_prompt=negative_prompt, height=height, width=width, num_frames=81, guidance_scale=5.0
187
  ).frames[0]
188
  export_to_video(output, "output.mp4", fps=16)
189
  ```
image_encoder/config.json CHANGED
@@ -18,6 +18,6 @@
18
  "num_hidden_layers": 32,
19
  "patch_size": 14,
20
  "projection_dim": 1024,
21
- "torch_dtype": "bfloat16",
22
  "transformers_version": "4.48.0.dev0"
23
  }
 
18
  "num_hidden_layers": 32,
19
  "patch_size": 14,
20
  "projection_dim": 1024,
21
+ "torch_dtype": "float32",
22
  "transformers_version": "4.48.0.dev0"
23
  }