Upload folder using huggingface_hub
This view is limited to 50 files because it contains too many changes.
- epoch100/adapter_config.json +43 -0
- epoch100/adapter_model.safetensors +3 -0
- epoch100/hunyuan_config.toml +94 -0
- epoch104/adapter_config.json +43 -0
- epoch104/adapter_model.safetensors +3 -0
- epoch104/hunyuan_config.toml +94 -0
- epoch108/adapter_config.json +43 -0
- epoch108/adapter_model.safetensors +3 -0
- epoch108/hunyuan_config.toml +94 -0
- epoch112/adapter_config.json +43 -0
- epoch112/adapter_model.safetensors +3 -0
- epoch112/hunyuan_config.toml +94 -0
- epoch116/adapter_config.json +43 -0
- epoch116/adapter_model.safetensors +3 -0
- epoch116/hunyuan_config.toml +94 -0
- epoch12/adapter_config.json +43 -0
- epoch12/adapter_model.safetensors +3 -0
- epoch12/hunyuan_config.toml +94 -0
- epoch120/adapter_config.json +43 -0
- epoch120/adapter_model.safetensors +3 -0
- epoch120/hunyuan_config.toml +94 -0
- epoch124/adapter_config.json +43 -0
- epoch124/adapter_model.safetensors +3 -0
- epoch124/hunyuan_config.toml +94 -0
- epoch128/adapter_config.json +43 -0
- epoch128/adapter_model.safetensors +3 -0
- epoch128/hunyuan_config.toml +94 -0
- epoch132/adapter_config.json +43 -0
- epoch132/adapter_model.safetensors +3 -0
- epoch132/hunyuan_config.toml +94 -0
- epoch136/adapter_config.json +43 -0
- epoch136/adapter_model.safetensors +3 -0
- epoch136/hunyuan_config.toml +94 -0
- epoch140/adapter_config.json +43 -0
- epoch140/adapter_model.safetensors +3 -0
- epoch140/hunyuan_config.toml +94 -0
- epoch144/adapter_config.json +43 -0
- epoch144/adapter_model.safetensors +3 -0
- epoch144/hunyuan_config.toml +94 -0
- epoch148/adapter_config.json +43 -0
- epoch148/adapter_model.safetensors +3 -0
- epoch148/hunyuan_config.toml +94 -0
- epoch152/adapter_config.json +43 -0
- epoch152/adapter_model.safetensors +3 -0
- epoch152/hunyuan_config.toml +94 -0
- epoch156/adapter_config.json +43 -0
- epoch156/adapter_model.safetensors +3 -0
- epoch156/hunyuan_config.toml +94 -0
- epoch16/adapter_config.json +43 -0
- epoch16/adapter_model.safetensors +3 -0
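
The commit message indicates these checkpoint folders were pushed with huggingface_hub. A minimal sketch of how such an upload is typically done with upload_folder; the repository ID below is a placeholder, and the local path is taken from output_dir in the training config further down:

from huggingface_hub import HfApi

# repo_id is hypothetical; folder_path matches output_dir in hunyuan_config.toml.
api = HfApi()
api.upload_folder(
    folder_path="/notebooks/diffusion-pipe/output",
    repo_id="your-username/hunyuan-video-lora",
    repo_type="model",
)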
epoch100/adapter_config.json
ADDED
@@ -0,0 +1,43 @@
{
  "alpha_pattern": {},
  "auto_mapping": null,
  "base_model_name_or_path": null,
  "bias": "none",
  "eva_config": null,
  "exclude_modules": null,
  "fan_in_fan_out": false,
  "inference_mode": false,
  "init_lora_weights": true,
  "layer_replication": null,
  "layers_pattern": null,
  "layers_to_transform": null,
  "loftq_config": {},
  "lora_alpha": 32,
  "lora_bias": false,
  "lora_dropout": 0.0,
  "megatron_config": null,
  "megatron_core": "megatron.core",
  "modules_to_save": null,
  "peft_type": "LORA",
  "r": 32,
  "rank_pattern": {},
  "revision": null,
  "target_modules": [
    "img_attn_qkv",
    "txt_attn_qkv",
    "img_mod.linear",
    "img_mlp.fc1",
    "txt_mlp.fc1",
    "linear1",
    "linear2",
    "img_mlp.fc2",
    "img_attn_proj",
    "txt_mod.linear",
    "txt_mlp.fc2",
    "modulation.linear",
    "txt_attn_proj"
  ],
  "task_type": null,
  "use_dora": false,
  "use_rslora": false
}
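
Each epochNN folder is a standard PEFT LoRA adapter directory, so the config above can be loaded and inspected with the peft library. A minimal sketch, assuming a recent peft release and a local copy of the epoch100 folder:

from peft import PeftConfig

# Reads epoch100/adapter_config.json from a local clone/download of this repo.
config = PeftConfig.from_pretrained("epoch100")
print(config.peft_type)                # PeftType.LORA
print(config.r, config.lora_alpha)     # 32 32
print(config.target_modules)           # the Hunyuan Video attention/MLP projections listed above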
epoch100/adapter_model.safetensors
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:dc63f3ccf550bd7278935a5384d18abf9c75878fd8f26b6484a27a563b62199c
size 322519480
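
The .safetensors entries in this diff are Git LFS pointer files (version, oid, size), not the weights themselves. A minimal sketch of fetching and inspecting the actual adapter weights; the repo_id is a placeholder for whichever Hub repository this commit belongs to:

from huggingface_hub import hf_hub_download
from safetensors import safe_open

# repo_id is hypothetical; substitute the actual repository name.
path = hf_hub_download(
    repo_id="your-username/hunyuan-video-lora",
    filename="epoch100/adapter_model.safetensors",
)
with safe_open(path, framework="pt") as f:
    print(len(f.keys()), "LoRA tensors")  # lora_A / lora_B matrices for the target modules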
epoch100/hunyuan_config.toml
ADDED
@@ -0,0 +1,94 @@
# Output path for training runs. Each training run makes a new directory in here.
output_dir = '/notebooks/diffusion-pipe/output'

# Dataset config file.
dataset = '/notebooks/diffusion-pipe/dataset_files/dataset_config.toml'
# You can have separate eval datasets. Give them a name for Tensorboard metrics.
# eval_datasets = [
# {name = 'something', config = 'path/to/eval_dataset.toml'},
# ]

# training settings

# I usually set this to a really high value because I don't know how long I want to train.
epochs = 1000
# Batch size of a single forward/backward pass for one GPU.
micro_batch_size_per_gpu = 1
# Pipeline parallelism degree. A single instance of the model is divided across this many GPUs.
pipeline_stages = 1
# Number of micro-batches sent through the pipeline for each training step.
# If pipeline_stages > 1, a higher GAS means better GPU utilization due to smaller pipeline bubbles (where GPUs aren't overlapping computation).
gradient_accumulation_steps = 4
# Grad norm clipping.
gradient_clipping = 1.0
# Learning rate warmup.
warmup_steps = 100

# eval settings

eval_every_n_epochs = 1
eval_before_first_step = true
# Might want to set these lower for eval so that less images get dropped (eval dataset size is usually much smaller than training set).
# Each size bucket of images/videos is rounded down to the nearest multiple of the global batch size, so higher global batch size means
# more dropped images. Usually doesn't matter for training but the eval set is much smaller so it can matter.
eval_micro_batch_size_per_gpu = 1
eval_gradient_accumulation_steps = 1

# misc settings

# Probably want to set this a bit higher if you have a smaller dataset so you don't end up with a million saved models.
save_every_n_epochs = 4
# Can checkpoint the training state every n number of epochs or minutes. Set only one of these. You can resume from checkpoints using the --resume_from_checkpoint flag.
#checkpoint_every_n_epochs = 1
checkpoint_every_n_minutes = 120
# Always set to true unless you have a huge amount of VRAM.
activation_checkpointing = true
# Controls how Deepspeed decides how to divide layers across GPUs. Probably don't change this.
partition_method = 'parameters'
# dtype for saving the LoRA or model, if different from training dtype
save_dtype = 'bfloat16'
# Batch size for caching latents and text embeddings. Increasing can lead to higher GPU utilization during caching phase but uses more memory.
caching_batch_size = 1
# How often deepspeed logs to console.
steps_per_print = 1
# How to extract video clips for training from a single input video file.
# The video file is first assigned to one of the configured frame buckets, but then we must extract one or more clips of exactly the right
# number of frames for that bucket.
# single_beginning: one clip starting at the beginning of the video
# single_middle: one clip from the middle of the video (cutting off the start and end equally)
# multiple_overlapping: extract the minimum number of clips to cover the full range of the video. They might overlap some.
# default is single_middle
video_clip_mode = 'single_middle'

[model]
type = 'hunyuan-video'
# Can load Hunyuan Video entirely from the ckpt path set up for the official inference scripts.
#ckpt_path = '/home/anon/HunyuanVideo/ckpts'
# Or you can load it by pointing to all the ComfyUI files.
transformer_path = '/notebooks/diffusion-pipe/hunyuan_model_files/hunyuan_dit.safetensors'
vae_path = '/notebooks/diffusion-pipe/hunyuan_model_files/vae.safetensors'
llm_path = '/notebooks/diffusion-pipe/hunyuan_model_files/llava-llama-3-8b-text-encoder-tokenizer/'
clip_path = '/notebooks/diffusion-pipe/hunyuan_model_files/clip-vit-large-patch14/'
# Base dtype used for all models.
dtype = 'bfloat16'
# Hunyuan Video supports fp8 for the transformer when training LoRA.
transformer_dtype = 'float8'
# How to sample timesteps to train on. Can be logit_normal or uniform.
timestep_sample_method = 'logit_normal'

[adapter]
type = 'lora'
rank = 32
# Dtype for the LoRA weights you are training.
dtype = 'bfloat16'
# You can initialize the lora weights from a previously trained lora.
#init_from_existing = '/data/diffusion_pipe_training_runs/something/epoch50'

[optimizer]
# AdamW from the optimi library is a good default since it automatically uses Kahan summation when training bfloat16 weights.
# Look at train.py for other options. You could also easily edit the file and add your own.
type = 'adamw_optimi'
lr = 2e-5
betas = [0.9, 0.99]
weight_decay = 0.01
eps = 1e-8
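
The full training configuration is saved next to every checkpoint as hunyuan_config.toml, so each epoch records exactly how it was produced. A minimal sketch of reading a few of these values with Python's built-in tomllib (Python 3.11+):

import tomllib

# Parse the config stored alongside the epoch100 checkpoint.
with open("epoch100/hunyuan_config.toml", "rb") as f:
    cfg = tomllib.load(f)

print(cfg["epochs"], cfg["save_every_n_epochs"])        # 1000 4
print(cfg["model"]["type"], cfg["model"]["dtype"])      # hunyuan-video bfloat16
print(cfg["adapter"]["rank"], cfg["optimizer"]["lr"])   # 32 2e-05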
epoch104/adapter_config.json
ADDED
@@ -0,0 +1,43 @@
(identical to epoch100/adapter_config.json above)
epoch104/adapter_model.safetensors
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:7d1fca1401292c3c36b2c366aa833e234b0d435b347bdeb637cd848b27b208d6
size 322519480
epoch104/hunyuan_config.toml
ADDED
@@ -0,0 +1,94 @@
(identical to epoch100/hunyuan_config.toml above)
epoch108/adapter_config.json
ADDED
@@ -0,0 +1,43 @@
(identical to epoch100/adapter_config.json above)
epoch108/adapter_model.safetensors
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:b70c49fa1225af28314e41f4f389993235b562fd2647652d15bce9ab43514505
size 322519480
epoch108/hunyuan_config.toml
ADDED
@@ -0,0 +1,94 @@
(identical to epoch100/hunyuan_config.toml above)
epoch112/adapter_config.json
ADDED
@@ -0,0 +1,43 @@
(identical to epoch100/adapter_config.json above)
epoch112/adapter_model.safetensors
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:63faddadf9540dbf2275fd2e93e6301ff888e5885f040f8dc069027a8619c40d
size 322519480
epoch112/hunyuan_config.toml
ADDED
@@ -0,0 +1,94 @@
(identical to epoch100/hunyuan_config.toml above)
epoch116/adapter_config.json
ADDED
@@ -0,0 +1,43 @@
(identical to epoch100/adapter_config.json above)
epoch116/adapter_model.safetensors
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:5dc983ff8f2245aa4af5d02d450e55b60aa03c7b28270751da68d6c389fb8083
size 322519480
epoch116/hunyuan_config.toml
ADDED
@@ -0,0 +1,94 @@
(identical to epoch100/hunyuan_config.toml above)
epoch12/adapter_config.json
ADDED
@@ -0,0 +1,43 @@
(identical to epoch100/adapter_config.json above)
epoch12/adapter_model.safetensors
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:78414cb47b3f6b71de54d64fc75ce24891411fb626d7ac4a41efac7e38abe562
size 322519480
epoch12/hunyuan_config.toml
ADDED
@@ -0,0 +1,94 @@
(identical to epoch100/hunyuan_config.toml above)
epoch120/adapter_config.json
ADDED
@@ -0,0 +1,43 @@
(identical to epoch100/adapter_config.json above)
epoch120/adapter_model.safetensors
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:3bc6d23cdbdfacbd863d29466011df66521a910715acb1247d0cc54e4d6941d8
size 322519480
epoch120/hunyuan_config.toml
ADDED
@@ -0,0 +1,94 @@
(identical to epoch100/hunyuan_config.toml above)
epoch124/adapter_config.json
ADDED
@@ -0,0 +1,43 @@
(identical to epoch100/adapter_config.json above)
epoch124/adapter_model.safetensors
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:f997957e09710dfad00dc23e8592f5ed1295b40e347b211ddd2dafab38beb181
size 322519480
epoch124/hunyuan_config.toml
ADDED
@@ -0,0 +1,94 @@
(identical to epoch100/hunyuan_config.toml above; the 50-file diff view is cut off partway through this file)
|
71 |
+
clip_path = '/notebooks/diffusion-pipe/hunyuan_model_files/clip-vit-large-patch14/'
|
72 |
+
# Base dtype used for all models.
|
73 |
+
dtype = 'bfloat16'
|
74 |
+
# Hunyuan Video supports fp8 for the transformer when training LoRA.
|
75 |
+
transformer_dtype = 'float8'
|
76 |
+
# How to sample timesteps to train on. Can be logit_normal or uniform.
|
77 |
+
timestep_sample_method = 'logit_normal'
|
78 |
+
|
79 |
+
[adapter]
|
80 |
+
type = 'lora'
|
81 |
+
rank = 32
|
82 |
+
# Dtype for the LoRA weights you are training.
|
83 |
+
dtype = 'bfloat16'
|
84 |
+
# You can initialize the lora weights from a previously trained lora.
|
85 |
+
#init_from_existing = '/data/diffusion_pipe_training_runs/something/epoch50'
|
86 |
+
|
87 |
+
[optimizer]
|
88 |
+
# AdamW from the optimi library is a good default since it automatically uses Kahan summation when training bfloat16 weights.
|
89 |
+
# Look at train.py for other options. You could also easily edit the file and add your own.
|
90 |
+
type = 'adamw_optimi'
|
91 |
+
lr = 2e-5
|
92 |
+
betas = [0.9, 0.99]
|
93 |
+
weight_decay = 0.01
|
94 |
+
eps = 1e-8
|
epoch128/adapter_config.json
ADDED
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"alpha_pattern": {},
|
3 |
+
"auto_mapping": null,
|
4 |
+
"base_model_name_or_path": null,
|
5 |
+
"bias": "none",
|
6 |
+
"eva_config": null,
|
7 |
+
"exclude_modules": null,
|
8 |
+
"fan_in_fan_out": false,
|
9 |
+
"inference_mode": false,
|
10 |
+
"init_lora_weights": true,
|
11 |
+
"layer_replication": null,
|
12 |
+
"layers_pattern": null,
|
13 |
+
"layers_to_transform": null,
|
14 |
+
"loftq_config": {},
|
15 |
+
"lora_alpha": 32,
|
16 |
+
"lora_bias": false,
|
17 |
+
"lora_dropout": 0.0,
|
18 |
+
"megatron_config": null,
|
19 |
+
"megatron_core": "megatron.core",
|
20 |
+
"modules_to_save": null,
|
21 |
+
"peft_type": "LORA",
|
22 |
+
"r": 32,
|
23 |
+
"rank_pattern": {},
|
24 |
+
"revision": null,
|
25 |
+
"target_modules": [
|
26 |
+
"img_attn_qkv",
|
27 |
+
"txt_attn_qkv",
|
28 |
+
"img_mod.linear",
|
29 |
+
"img_mlp.fc1",
|
30 |
+
"txt_mlp.fc1",
|
31 |
+
"linear1",
|
32 |
+
"linear2",
|
33 |
+
"img_mlp.fc2",
|
34 |
+
"img_attn_proj",
|
35 |
+
"txt_mod.linear",
|
36 |
+
"txt_mlp.fc2",
|
37 |
+
"modulation.linear",
|
38 |
+
"txt_attn_proj"
|
39 |
+
],
|
40 |
+
"task_type": null,
|
41 |
+
"use_dora": false,
|
42 |
+
"use_rslora": false
|
43 |
+
}
|
epoch128/adapter_model.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:6234471dcb2faaa15261c88165de238cb9752c0252d0bd000cdb6114f3ffdb60
|
3 |
+
size 322519480
|
epoch128/hunyuan_config.toml
ADDED
@@ -0,0 +1,94 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Output path for training runs. Each training run makes a new directory in here.
|
2 |
+
output_dir = '/notebooks/diffusion-pipe/output'
|
3 |
+
|
4 |
+
# Dataset config file.
|
5 |
+
dataset = '/notebooks/diffusion-pipe/dataset_files/dataset_config.toml'
|
6 |
+
# You can have separate eval datasets. Give them a name for Tensorboard metrics.
|
7 |
+
# eval_datasets = [
|
8 |
+
# {name = 'something', config = 'path/to/eval_dataset.toml'},
|
9 |
+
# ]
|
10 |
+
|
11 |
+
# training settings
|
12 |
+
|
13 |
+
# I usually set this to a really high value because I don't know how long I want to train.
|
14 |
+
epochs = 1000
|
15 |
+
# Batch size of a single forward/backward pass for one GPU.
|
16 |
+
micro_batch_size_per_gpu = 1
|
17 |
+
# Pipeline parallelism degree. A single instance of the model is divided across this many GPUs.
|
18 |
+
pipeline_stages = 1
|
19 |
+
# Number of micro-batches sent through the pipeline for each training step.
|
20 |
+
# If pipeline_stages > 1, a higher GAS means better GPU utilization due to smaller pipeline bubbles (where GPUs aren't overlapping computation).
|
21 |
+
gradient_accumulation_steps = 4
|
22 |
+
# Grad norm clipping.
|
23 |
+
gradient_clipping = 1.0
|
24 |
+
# Learning rate warmup.
|
25 |
+
warmup_steps = 100
|
26 |
+
|
27 |
+
# eval settings
|
28 |
+
|
29 |
+
eval_every_n_epochs = 1
|
30 |
+
eval_before_first_step = true
|
31 |
+
# Might want to set these lower for eval so that less images get dropped (eval dataset size is usually much smaller than training set).
|
32 |
+
# Each size bucket of images/videos is rounded down to the nearest multiple of the global batch size, so higher global batch size means
|
33 |
+
# more dropped images. Usually doesn't matter for training but the eval set is much smaller so it can matter.
|
34 |
+
eval_micro_batch_size_per_gpu = 1
|
35 |
+
eval_gradient_accumulation_steps = 1
|
36 |
+
|
37 |
+
# misc settings
|
38 |
+
|
39 |
+
# Probably want to set this a bit higher if you have a smaller dataset so you don't end up with a million saved models.
|
40 |
+
save_every_n_epochs = 4
|
41 |
+
# Can checkpoint the training state every n number of epochs or minutes. Set only one of these. You can resume from checkpoints using the --resume_from_checkpoint flag.
|
42 |
+
#checkpoint_every_n_epochs = 1
|
43 |
+
checkpoint_every_n_minutes = 120
|
44 |
+
# Always set to true unless you have a huge amount of VRAM.
|
45 |
+
activation_checkpointing = true
|
46 |
+
# Controls how Deepspeed decides how to divide layers across GPUs. Probably don't change this.
|
47 |
+
partition_method = 'parameters'
|
48 |
+
# dtype for saving the LoRA or model, if different from training dtype
|
49 |
+
save_dtype = 'bfloat16'
|
50 |
+
# Batch size for caching latents and text embeddings. Increasing can lead to higher GPU utilization during caching phase but uses more memory.
|
51 |
+
caching_batch_size = 1
|
52 |
+
# How often deepspeed logs to console.
|
53 |
+
steps_per_print = 1
|
54 |
+
# How to extract video clips for training from a single input video file.
|
55 |
+
# The video file is first assigned to one of the configured frame buckets, but then we must extract one or more clips of exactly the right
|
56 |
+
# number of frames for that bucket.
|
57 |
+
# single_beginning: one clip starting at the beginning of the video
|
58 |
+
# single_middle: one clip from the middle of the video (cutting off the start and end equally)
|
59 |
+
# multiple_overlapping: extract the minimum number of clips to cover the full range of the video. They might overlap some.
|
60 |
+
# default is single_middle
|
61 |
+
video_clip_mode = 'single_middle'
|
62 |
+
|
63 |
+
[model]
|
64 |
+
type = 'hunyuan-video'
|
65 |
+
# Can load Hunyuan Video entirely from the ckpt path set up for the official inference scripts.
|
66 |
+
#ckpt_path = '/home/anon/HunyuanVideo/ckpts'
|
67 |
+
# Or you can load it by pointing to all the ComfyUI files.
|
68 |
+
transformer_path = '/notebooks/diffusion-pipe/hunyuan_model_files/hunyuan_dit.safetensors'
|
69 |
+
vae_path = '/notebooks/diffusion-pipe/hunyuan_model_files/vae.safetensors'
|
70 |
+
llm_path = '/notebooks/diffusion-pipe/hunyuan_model_files/llava-llama-3-8b-text-encoder-tokenizer/'
|
71 |
+
clip_path = '/notebooks/diffusion-pipe/hunyuan_model_files/clip-vit-large-patch14/'
|
72 |
+
# Base dtype used for all models.
|
73 |
+
dtype = 'bfloat16'
|
74 |
+
# Hunyuan Video supports fp8 for the transformer when training LoRA.
|
75 |
+
transformer_dtype = 'float8'
|
76 |
+
# How to sample timesteps to train on. Can be logit_normal or uniform.
|
77 |
+
timestep_sample_method = 'logit_normal'
|
78 |
+
|
79 |
+
[adapter]
|
80 |
+
type = 'lora'
|
81 |
+
rank = 32
|
82 |
+
# Dtype for the LoRA weights you are training.
|
83 |
+
dtype = 'bfloat16'
|
84 |
+
# You can initialize the lora weights from a previously trained lora.
|
85 |
+
#init_from_existing = '/data/diffusion_pipe_training_runs/something/epoch50'
|
86 |
+
|
87 |
+
[optimizer]
|
88 |
+
# AdamW from the optimi library is a good default since it automatically uses Kahan summation when training bfloat16 weights.
|
89 |
+
# Look at train.py for other options. You could also easily edit the file and add your own.
|
90 |
+
type = 'adamw_optimi'
|
91 |
+
lr = 2e-5
|
92 |
+
betas = [0.9, 0.99]
|
93 |
+
weight_decay = 0.01
|
94 |
+
eps = 1e-8
|
epoch132/adapter_config.json
ADDED
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"alpha_pattern": {},
|
3 |
+
"auto_mapping": null,
|
4 |
+
"base_model_name_or_path": null,
|
5 |
+
"bias": "none",
|
6 |
+
"eva_config": null,
|
7 |
+
"exclude_modules": null,
|
8 |
+
"fan_in_fan_out": false,
|
9 |
+
"inference_mode": false,
|
10 |
+
"init_lora_weights": true,
|
11 |
+
"layer_replication": null,
|
12 |
+
"layers_pattern": null,
|
13 |
+
"layers_to_transform": null,
|
14 |
+
"loftq_config": {},
|
15 |
+
"lora_alpha": 32,
|
16 |
+
"lora_bias": false,
|
17 |
+
"lora_dropout": 0.0,
|
18 |
+
"megatron_config": null,
|
19 |
+
"megatron_core": "megatron.core",
|
20 |
+
"modules_to_save": null,
|
21 |
+
"peft_type": "LORA",
|
22 |
+
"r": 32,
|
23 |
+
"rank_pattern": {},
|
24 |
+
"revision": null,
|
25 |
+
"target_modules": [
|
26 |
+
"img_attn_qkv",
|
27 |
+
"txt_attn_qkv",
|
28 |
+
"img_mod.linear",
|
29 |
+
"img_mlp.fc1",
|
30 |
+
"txt_mlp.fc1",
|
31 |
+
"linear1",
|
32 |
+
"linear2",
|
33 |
+
"img_mlp.fc2",
|
34 |
+
"img_attn_proj",
|
35 |
+
"txt_mod.linear",
|
36 |
+
"txt_mlp.fc2",
|
37 |
+
"modulation.linear",
|
38 |
+
"txt_attn_proj"
|
39 |
+
],
|
40 |
+
"task_type": null,
|
41 |
+
"use_dora": false,
|
42 |
+
"use_rslora": false
|
43 |
+
}
|
epoch132/adapter_model.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:f4edd799fb2d4b5b838f9172fd2ebe90fd9f7f169da2701311c929cb0d555964
|
3 |
+
size 322519480
|
epoch132/hunyuan_config.toml
ADDED
@@ -0,0 +1,94 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Output path for training runs. Each training run makes a new directory in here.
|
2 |
+
output_dir = '/notebooks/diffusion-pipe/output'
|
3 |
+
|
4 |
+
# Dataset config file.
|
5 |
+
dataset = '/notebooks/diffusion-pipe/dataset_files/dataset_config.toml'
|
6 |
+
# You can have separate eval datasets. Give them a name for Tensorboard metrics.
|
7 |
+
# eval_datasets = [
|
8 |
+
# {name = 'something', config = 'path/to/eval_dataset.toml'},
|
9 |
+
# ]
|
10 |
+
|
11 |
+
# training settings
|
12 |
+
|
13 |
+
# I usually set this to a really high value because I don't know how long I want to train.
|
14 |
+
epochs = 1000
|
15 |
+
# Batch size of a single forward/backward pass for one GPU.
|
16 |
+
micro_batch_size_per_gpu = 1
|
17 |
+
# Pipeline parallelism degree. A single instance of the model is divided across this many GPUs.
|
18 |
+
pipeline_stages = 1
|
19 |
+
# Number of micro-batches sent through the pipeline for each training step.
|
20 |
+
# If pipeline_stages > 1, a higher GAS means better GPU utilization due to smaller pipeline bubbles (where GPUs aren't overlapping computation).
|
21 |
+
gradient_accumulation_steps = 4
|
22 |
+
# Grad norm clipping.
|
23 |
+
gradient_clipping = 1.0
|
24 |
+
# Learning rate warmup.
|
25 |
+
warmup_steps = 100
|
26 |
+
|
27 |
+
# eval settings
|
28 |
+
|
29 |
+
eval_every_n_epochs = 1
|
30 |
+
eval_before_first_step = true
|
31 |
+
# Might want to set these lower for eval so that less images get dropped (eval dataset size is usually much smaller than training set).
|
32 |
+
# Each size bucket of images/videos is rounded down to the nearest multiple of the global batch size, so higher global batch size means
|
33 |
+
# more dropped images. Usually doesn't matter for training but the eval set is much smaller so it can matter.
|
34 |
+
eval_micro_batch_size_per_gpu = 1
|
35 |
+
eval_gradient_accumulation_steps = 1
|
36 |
+
|
37 |
+
# misc settings
|
38 |
+
|
39 |
+
# Probably want to set this a bit higher if you have a smaller dataset so you don't end up with a million saved models.
|
40 |
+
save_every_n_epochs = 4
|
41 |
+
# Can checkpoint the training state every n number of epochs or minutes. Set only one of these. You can resume from checkpoints using the --resume_from_checkpoint flag.
|
42 |
+
#checkpoint_every_n_epochs = 1
|
43 |
+
checkpoint_every_n_minutes = 120
|
44 |
+
# Always set to true unless you have a huge amount of VRAM.
|
45 |
+
activation_checkpointing = true
|
46 |
+
# Controls how Deepspeed decides how to divide layers across GPUs. Probably don't change this.
|
47 |
+
partition_method = 'parameters'
|
48 |
+
# dtype for saving the LoRA or model, if different from training dtype
|
49 |
+
save_dtype = 'bfloat16'
|
50 |
+
# Batch size for caching latents and text embeddings. Increasing can lead to higher GPU utilization during caching phase but uses more memory.
|
51 |
+
caching_batch_size = 1
|
52 |
+
# How often deepspeed logs to console.
|
53 |
+
steps_per_print = 1
|
54 |
+
# How to extract video clips for training from a single input video file.
|
55 |
+
# The video file is first assigned to one of the configured frame buckets, but then we must extract one or more clips of exactly the right
|
56 |
+
# number of frames for that bucket.
|
57 |
+
# single_beginning: one clip starting at the beginning of the video
|
58 |
+
# single_middle: one clip from the middle of the video (cutting off the start and end equally)
|
59 |
+
# multiple_overlapping: extract the minimum number of clips to cover the full range of the video. They might overlap some.
|
60 |
+
# default is single_middle
|
61 |
+
video_clip_mode = 'single_middle'
|
62 |
+
|
63 |
+
[model]
|
64 |
+
type = 'hunyuan-video'
|
65 |
+
# Can load Hunyuan Video entirely from the ckpt path set up for the official inference scripts.
|
66 |
+
#ckpt_path = '/home/anon/HunyuanVideo/ckpts'
|
67 |
+
# Or you can load it by pointing to all the ComfyUI files.
|
68 |
+
transformer_path = '/notebooks/diffusion-pipe/hunyuan_model_files/hunyuan_dit.safetensors'
|
69 |
+
vae_path = '/notebooks/diffusion-pipe/hunyuan_model_files/vae.safetensors'
|
70 |
+
llm_path = '/notebooks/diffusion-pipe/hunyuan_model_files/llava-llama-3-8b-text-encoder-tokenizer/'
|
71 |
+
clip_path = '/notebooks/diffusion-pipe/hunyuan_model_files/clip-vit-large-patch14/'
|
72 |
+
# Base dtype used for all models.
|
73 |
+
dtype = 'bfloat16'
|
74 |
+
# Hunyuan Video supports fp8 for the transformer when training LoRA.
|
75 |
+
transformer_dtype = 'float8'
|
76 |
+
# How to sample timesteps to train on. Can be logit_normal or uniform.
|
77 |
+
timestep_sample_method = 'logit_normal'
|
78 |
+
|
79 |
+
[adapter]
|
80 |
+
type = 'lora'
|
81 |
+
rank = 32
|
82 |
+
# Dtype for the LoRA weights you are training.
|
83 |
+
dtype = 'bfloat16'
|
84 |
+
# You can initialize the lora weights from a previously trained lora.
|
85 |
+
#init_from_existing = '/data/diffusion_pipe_training_runs/something/epoch50'
|
86 |
+
|
87 |
+
[optimizer]
|
88 |
+
# AdamW from the optimi library is a good default since it automatically uses Kahan summation when training bfloat16 weights.
|
89 |
+
# Look at train.py for other options. You could also easily edit the file and add your own.
|
90 |
+
type = 'adamw_optimi'
|
91 |
+
lr = 2e-5
|
92 |
+
betas = [0.9, 0.99]
|
93 |
+
weight_decay = 0.01
|
94 |
+
eps = 1e-8
|
epoch136/adapter_config.json
ADDED
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"alpha_pattern": {},
|
3 |
+
"auto_mapping": null,
|
4 |
+
"base_model_name_or_path": null,
|
5 |
+
"bias": "none",
|
6 |
+
"eva_config": null,
|
7 |
+
"exclude_modules": null,
|
8 |
+
"fan_in_fan_out": false,
|
9 |
+
"inference_mode": false,
|
10 |
+
"init_lora_weights": true,
|
11 |
+
"layer_replication": null,
|
12 |
+
"layers_pattern": null,
|
13 |
+
"layers_to_transform": null,
|
14 |
+
"loftq_config": {},
|
15 |
+
"lora_alpha": 32,
|
16 |
+
"lora_bias": false,
|
17 |
+
"lora_dropout": 0.0,
|
18 |
+
"megatron_config": null,
|
19 |
+
"megatron_core": "megatron.core",
|
20 |
+
"modules_to_save": null,
|
21 |
+
"peft_type": "LORA",
|
22 |
+
"r": 32,
|
23 |
+
"rank_pattern": {},
|
24 |
+
"revision": null,
|
25 |
+
"target_modules": [
|
26 |
+
"img_attn_qkv",
|
27 |
+
"txt_attn_qkv",
|
28 |
+
"img_mod.linear",
|
29 |
+
"img_mlp.fc1",
|
30 |
+
"txt_mlp.fc1",
|
31 |
+
"linear1",
|
32 |
+
"linear2",
|
33 |
+
"img_mlp.fc2",
|
34 |
+
"img_attn_proj",
|
35 |
+
"txt_mod.linear",
|
36 |
+
"txt_mlp.fc2",
|
37 |
+
"modulation.linear",
|
38 |
+
"txt_attn_proj"
|
39 |
+
],
|
40 |
+
"task_type": null,
|
41 |
+
"use_dora": false,
|
42 |
+
"use_rslora": false
|
43 |
+
}
|
epoch136/adapter_model.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:af3a6f999acfa39b46058c8e91a23cfa0c2c904b080637c21309dcce3367e3da
|
3 |
+
size 322519480
|
epoch136/hunyuan_config.toml
ADDED
@@ -0,0 +1,94 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Output path for training runs. Each training run makes a new directory in here.
|
2 |
+
output_dir = '/notebooks/diffusion-pipe/output'
|
3 |
+
|
4 |
+
# Dataset config file.
|
5 |
+
dataset = '/notebooks/diffusion-pipe/dataset_files/dataset_config.toml'
|
6 |
+
# You can have separate eval datasets. Give them a name for Tensorboard metrics.
|
7 |
+
# eval_datasets = [
|
8 |
+
# {name = 'something', config = 'path/to/eval_dataset.toml'},
|
9 |
+
# ]
|
10 |
+
|
11 |
+
# training settings
|
12 |
+
|
13 |
+
# I usually set this to a really high value because I don't know how long I want to train.
|
14 |
+
epochs = 1000
|
15 |
+
# Batch size of a single forward/backward pass for one GPU.
|
16 |
+
micro_batch_size_per_gpu = 1
|
17 |
+
# Pipeline parallelism degree. A single instance of the model is divided across this many GPUs.
|
18 |
+
pipeline_stages = 1
|
19 |
+
# Number of micro-batches sent through the pipeline for each training step.
|
20 |
+
# If pipeline_stages > 1, a higher GAS means better GPU utilization due to smaller pipeline bubbles (where GPUs aren't overlapping computation).
|
21 |
+
gradient_accumulation_steps = 4
|
22 |
+
# Grad norm clipping.
|
23 |
+
gradient_clipping = 1.0
|
24 |
+
# Learning rate warmup.
|
25 |
+
warmup_steps = 100
|
26 |
+
|
27 |
+
# eval settings
|
28 |
+
|
29 |
+
eval_every_n_epochs = 1
|
30 |
+
eval_before_first_step = true
|
31 |
+
# Might want to set these lower for eval so that less images get dropped (eval dataset size is usually much smaller than training set).
|
32 |
+
# Each size bucket of images/videos is rounded down to the nearest multiple of the global batch size, so higher global batch size means
|
33 |
+
# more dropped images. Usually doesn't matter for training but the eval set is much smaller so it can matter.
|
34 |
+
eval_micro_batch_size_per_gpu = 1
|
35 |
+
eval_gradient_accumulation_steps = 1
|
36 |
+
|
37 |
+
# misc settings
|
38 |
+
|
39 |
+
# Probably want to set this a bit higher if you have a smaller dataset so you don't end up with a million saved models.
|
40 |
+
save_every_n_epochs = 4
|
41 |
+
# Can checkpoint the training state every n number of epochs or minutes. Set only one of these. You can resume from checkpoints using the --resume_from_checkpoint flag.
|
42 |
+
#checkpoint_every_n_epochs = 1
|
43 |
+
checkpoint_every_n_minutes = 120
|
44 |
+
# Always set to true unless you have a huge amount of VRAM.
|
45 |
+
activation_checkpointing = true
|
46 |
+
# Controls how Deepspeed decides how to divide layers across GPUs. Probably don't change this.
|
47 |
+
partition_method = 'parameters'
|
48 |
+
# dtype for saving the LoRA or model, if different from training dtype
|
49 |
+
save_dtype = 'bfloat16'
|
50 |
+
# Batch size for caching latents and text embeddings. Increasing can lead to higher GPU utilization during caching phase but uses more memory.
|
51 |
+
caching_batch_size = 1
|
52 |
+
# How often deepspeed logs to console.
|
53 |
+
steps_per_print = 1
|
54 |
+
# How to extract video clips for training from a single input video file.
|
55 |
+
# The video file is first assigned to one of the configured frame buckets, but then we must extract one or more clips of exactly the right
|
56 |
+
# number of frames for that bucket.
|
57 |
+
# single_beginning: one clip starting at the beginning of the video
|
58 |
+
# single_middle: one clip from the middle of the video (cutting off the start and end equally)
|
59 |
+
# multiple_overlapping: extract the minimum number of clips to cover the full range of the video. They might overlap some.
|
60 |
+
# default is single_middle
|
61 |
+
video_clip_mode = 'single_middle'
|
62 |
+
|
63 |
+
[model]
|
64 |
+
type = 'hunyuan-video'
|
65 |
+
# Can load Hunyuan Video entirely from the ckpt path set up for the official inference scripts.
|
66 |
+
#ckpt_path = '/home/anon/HunyuanVideo/ckpts'
|
67 |
+
# Or you can load it by pointing to all the ComfyUI files.
|
68 |
+
transformer_path = '/notebooks/diffusion-pipe/hunyuan_model_files/hunyuan_dit.safetensors'
|
69 |
+
vae_path = '/notebooks/diffusion-pipe/hunyuan_model_files/vae.safetensors'
|
70 |
+
llm_path = '/notebooks/diffusion-pipe/hunyuan_model_files/llava-llama-3-8b-text-encoder-tokenizer/'
|
71 |
+
clip_path = '/notebooks/diffusion-pipe/hunyuan_model_files/clip-vit-large-patch14/'
|
72 |
+
# Base dtype used for all models.
|
73 |
+
dtype = 'bfloat16'
|
74 |
+
# Hunyuan Video supports fp8 for the transformer when training LoRA.
|
75 |
+
transformer_dtype = 'float8'
|
76 |
+
# How to sample timesteps to train on. Can be logit_normal or uniform.
|
77 |
+
timestep_sample_method = 'logit_normal'
|
78 |
+
|
79 |
+
[adapter]
|
80 |
+
type = 'lora'
|
81 |
+
rank = 32
|
82 |
+
# Dtype for the LoRA weights you are training.
|
83 |
+
dtype = 'bfloat16'
|
84 |
+
# You can initialize the lora weights from a previously trained lora.
|
85 |
+
#init_from_existing = '/data/diffusion_pipe_training_runs/something/epoch50'
|
86 |
+
|
87 |
+
[optimizer]
|
88 |
+
# AdamW from the optimi library is a good default since it automatically uses Kahan summation when training bfloat16 weights.
|
89 |
+
# Look at train.py for other options. You could also easily edit the file and add your own.
|
90 |
+
type = 'adamw_optimi'
|
91 |
+
lr = 2e-5
|
92 |
+
betas = [0.9, 0.99]
|
93 |
+
weight_decay = 0.01
|
94 |
+
eps = 1e-8
|
epoch140/adapter_config.json
ADDED
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"alpha_pattern": {},
|
3 |
+
"auto_mapping": null,
|
4 |
+
"base_model_name_or_path": null,
|
5 |
+
"bias": "none",
|
6 |
+
"eva_config": null,
|
7 |
+
"exclude_modules": null,
|
8 |
+
"fan_in_fan_out": false,
|
9 |
+
"inference_mode": false,
|
10 |
+
"init_lora_weights": true,
|
11 |
+
"layer_replication": null,
|
12 |
+
"layers_pattern": null,
|
13 |
+
"layers_to_transform": null,
|
14 |
+
"loftq_config": {},
|
15 |
+
"lora_alpha": 32,
|
16 |
+
"lora_bias": false,
|
17 |
+
"lora_dropout": 0.0,
|
18 |
+
"megatron_config": null,
|
19 |
+
"megatron_core": "megatron.core",
|
20 |
+
"modules_to_save": null,
|
21 |
+
"peft_type": "LORA",
|
22 |
+
"r": 32,
|
23 |
+
"rank_pattern": {},
|
24 |
+
"revision": null,
|
25 |
+
"target_modules": [
|
26 |
+
"img_attn_qkv",
|
27 |
+
"txt_attn_qkv",
|
28 |
+
"img_mod.linear",
|
29 |
+
"img_mlp.fc1",
|
30 |
+
"txt_mlp.fc1",
|
31 |
+
"linear1",
|
32 |
+
"linear2",
|
33 |
+
"img_mlp.fc2",
|
34 |
+
"img_attn_proj",
|
35 |
+
"txt_mod.linear",
|
36 |
+
"txt_mlp.fc2",
|
37 |
+
"modulation.linear",
|
38 |
+
"txt_attn_proj"
|
39 |
+
],
|
40 |
+
"task_type": null,
|
41 |
+
"use_dora": false,
|
42 |
+
"use_rslora": false
|
43 |
+
}
|
epoch140/adapter_model.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:61274788213d842ddb1b9082bb941378a1c66aa3f7fe654427d0eda3ab547913
|
3 |
+
size 322519480
|
epoch140/hunyuan_config.toml
ADDED
@@ -0,0 +1,94 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Output path for training runs. Each training run makes a new directory in here.
|
2 |
+
output_dir = '/notebooks/diffusion-pipe/output'
|
3 |
+
|
4 |
+
# Dataset config file.
|
5 |
+
dataset = '/notebooks/diffusion-pipe/dataset_files/dataset_config.toml'
|
6 |
+
# You can have separate eval datasets. Give them a name for Tensorboard metrics.
|
7 |
+
# eval_datasets = [
|
8 |
+
# {name = 'something', config = 'path/to/eval_dataset.toml'},
|
9 |
+
# ]
|
10 |
+
|
11 |
+
# training settings
|
12 |
+
|
13 |
+
# I usually set this to a really high value because I don't know how long I want to train.
|
14 |
+
epochs = 1000
|
15 |
+
# Batch size of a single forward/backward pass for one GPU.
|
16 |
+
micro_batch_size_per_gpu = 1
|
17 |
+
# Pipeline parallelism degree. A single instance of the model is divided across this many GPUs.
|
18 |
+
pipeline_stages = 1
|
19 |
+
# Number of micro-batches sent through the pipeline for each training step.
|
20 |
+
# If pipeline_stages > 1, a higher GAS means better GPU utilization due to smaller pipeline bubbles (where GPUs aren't overlapping computation).
|
21 |
+
gradient_accumulation_steps = 4
|
22 |
+
# Grad norm clipping.
|
23 |
+
gradient_clipping = 1.0
|
24 |
+
# Learning rate warmup.
|
25 |
+
warmup_steps = 100
|
26 |
+
|
27 |
+
# eval settings
|
28 |
+
|
29 |
+
eval_every_n_epochs = 1
|
30 |
+
eval_before_first_step = true
|
31 |
+
# Might want to set these lower for eval so that less images get dropped (eval dataset size is usually much smaller than training set).
|
32 |
+
# Each size bucket of images/videos is rounded down to the nearest multiple of the global batch size, so higher global batch size means
|
33 |
+
# more dropped images. Usually doesn't matter for training but the eval set is much smaller so it can matter.
|
34 |
+
eval_micro_batch_size_per_gpu = 1
|
35 |
+
eval_gradient_accumulation_steps = 1
|
36 |
+
|
37 |
+
# misc settings
|
38 |
+
|
39 |
+
# Probably want to set this a bit higher if you have a smaller dataset so you don't end up with a million saved models.
|
40 |
+
save_every_n_epochs = 4
|
41 |
+
# Can checkpoint the training state every n number of epochs or minutes. Set only one of these. You can resume from checkpoints using the --resume_from_checkpoint flag.
|
42 |
+
#checkpoint_every_n_epochs = 1
|
43 |
+
checkpoint_every_n_minutes = 120
|
44 |
+
# Always set to true unless you have a huge amount of VRAM.
|
45 |
+
activation_checkpointing = true
|
46 |
+
# Controls how Deepspeed decides how to divide layers across GPUs. Probably don't change this.
|
47 |
+
partition_method = 'parameters'
|
48 |
+
# dtype for saving the LoRA or model, if different from training dtype
|
49 |
+
save_dtype = 'bfloat16'
|
50 |
+
# Batch size for caching latents and text embeddings. Increasing can lead to higher GPU utilization during caching phase but uses more memory.
|
51 |
+
caching_batch_size = 1
|
52 |
+
# How often deepspeed logs to console.
|
53 |
+
steps_per_print = 1
|
54 |
+
# How to extract video clips for training from a single input video file.
|
55 |
+
# The video file is first assigned to one of the configured frame buckets, but then we must extract one or more clips of exactly the right
|
56 |
+
# number of frames for that bucket.
|
57 |
+
# single_beginning: one clip starting at the beginning of the video
|
58 |
+
# single_middle: one clip from the middle of the video (cutting off the start and end equally)
|
59 |
+
# multiple_overlapping: extract the minimum number of clips to cover the full range of the video. They might overlap some.
|
60 |
+
# default is single_middle
|
61 |
+
video_clip_mode = 'single_middle'
|
62 |
+
|
63 |
+
[model]
|
64 |
+
type = 'hunyuan-video'
|
65 |
+
# Can load Hunyuan Video entirely from the ckpt path set up for the official inference scripts.
|
66 |
+
#ckpt_path = '/home/anon/HunyuanVideo/ckpts'
|
67 |
+
# Or you can load it by pointing to all the ComfyUI files.
|
68 |
+
transformer_path = '/notebooks/diffusion-pipe/hunyuan_model_files/hunyuan_dit.safetensors'
|
69 |
+
vae_path = '/notebooks/diffusion-pipe/hunyuan_model_files/vae.safetensors'
|
70 |
+
llm_path = '/notebooks/diffusion-pipe/hunyuan_model_files/llava-llama-3-8b-text-encoder-tokenizer/'
|
71 |
+
clip_path = '/notebooks/diffusion-pipe/hunyuan_model_files/clip-vit-large-patch14/'
|
72 |
+
# Base dtype used for all models.
|
73 |
+
dtype = 'bfloat16'
|
74 |
+
# Hunyuan Video supports fp8 for the transformer when training LoRA.
|
75 |
+
transformer_dtype = 'float8'
|
76 |
+
# How to sample timesteps to train on. Can be logit_normal or uniform.
|
77 |
+
timestep_sample_method = 'logit_normal'
|
78 |
+
|
79 |
+
[adapter]
|
80 |
+
type = 'lora'
|
81 |
+
rank = 32
|
82 |
+
# Dtype for the LoRA weights you are training.
|
83 |
+
dtype = 'bfloat16'
|
84 |
+
# You can initialize the lora weights from a previously trained lora.
|
85 |
+
#init_from_existing = '/data/diffusion_pipe_training_runs/something/epoch50'
|
86 |
+
|
87 |
+
[optimizer]
|
88 |
+
# AdamW from the optimi library is a good default since it automatically uses Kahan summation when training bfloat16 weights.
|
89 |
+
# Look at train.py for other options. You could also easily edit the file and add your own.
|
90 |
+
type = 'adamw_optimi'
|
91 |
+
lr = 2e-5
|
92 |
+
betas = [0.9, 0.99]
|
93 |
+
weight_decay = 0.01
|
94 |
+
eps = 1e-8
|
epoch144/adapter_config.json
ADDED
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"alpha_pattern": {},
|
3 |
+
"auto_mapping": null,
|
4 |
+
"base_model_name_or_path": null,
|
5 |
+
"bias": "none",
|
6 |
+
"eva_config": null,
|
7 |
+
"exclude_modules": null,
|
8 |
+
"fan_in_fan_out": false,
|
9 |
+
"inference_mode": false,
|
10 |
+
"init_lora_weights": true,
|
11 |
+
"layer_replication": null,
|
12 |
+
"layers_pattern": null,
|
13 |
+
"layers_to_transform": null,
|
14 |
+
"loftq_config": {},
|
15 |
+
"lora_alpha": 32,
|
16 |
+
"lora_bias": false,
|
17 |
+
"lora_dropout": 0.0,
|
18 |
+
"megatron_config": null,
|
19 |
+
"megatron_core": "megatron.core",
|
20 |
+
"modules_to_save": null,
|
21 |
+
"peft_type": "LORA",
|
22 |
+
"r": 32,
|
23 |
+
"rank_pattern": {},
|
24 |
+
"revision": null,
|
25 |
+
"target_modules": [
|
26 |
+
"img_attn_qkv",
|
27 |
+
"txt_attn_qkv",
|
28 |
+
"img_mod.linear",
|
29 |
+
"img_mlp.fc1",
|
30 |
+
"txt_mlp.fc1",
|
31 |
+
"linear1",
|
32 |
+
"linear2",
|
33 |
+
"img_mlp.fc2",
|
34 |
+
"img_attn_proj",
|
35 |
+
"txt_mod.linear",
|
36 |
+
"txt_mlp.fc2",
|
37 |
+
"modulation.linear",
|
38 |
+
"txt_attn_proj"
|
39 |
+
],
|
40 |
+
"task_type": null,
|
41 |
+
"use_dora": false,
|
42 |
+
"use_rslora": false
|
43 |
+
}
|
epoch144/adapter_model.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:275674607d3a829c159cbec32e0b887f85c687145618f46dbea90e290328bfcf
|
3 |
+
size 322519480
|
epoch144/hunyuan_config.toml
ADDED
@@ -0,0 +1,94 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Output path for training runs. Each training run makes a new directory in here.
|
2 |
+
output_dir = '/notebooks/diffusion-pipe/output'
|
3 |
+
|
4 |
+
# Dataset config file.
|
5 |
+
dataset = '/notebooks/diffusion-pipe/dataset_files/dataset_config.toml'
|
6 |
+
# You can have separate eval datasets. Give them a name for Tensorboard metrics.
|
7 |
+
# eval_datasets = [
|
8 |
+
# {name = 'something', config = 'path/to/eval_dataset.toml'},
|
9 |
+
# ]
|
10 |
+
|
11 |
+
# training settings
|
12 |
+
|
13 |
+
# I usually set this to a really high value because I don't know how long I want to train.
|
14 |
+
epochs = 1000
|
15 |
+
# Batch size of a single forward/backward pass for one GPU.
|
16 |
+
micro_batch_size_per_gpu = 1
|
17 |
+
# Pipeline parallelism degree. A single instance of the model is divided across this many GPUs.
|
18 |
+
pipeline_stages = 1
|
19 |
+
# Number of micro-batches sent through the pipeline for each training step.
|
20 |
+
# If pipeline_stages > 1, a higher GAS means better GPU utilization due to smaller pipeline bubbles (where GPUs aren't overlapping computation).
|
21 |
+
gradient_accumulation_steps = 4
|
22 |
+
# Grad norm clipping.
|
23 |
+
gradient_clipping = 1.0
|
24 |
+
# Learning rate warmup.
|
25 |
+
warmup_steps = 100
|
26 |
+
|
27 |
+
# eval settings
|
28 |
+
|
29 |
+
eval_every_n_epochs = 1
|
30 |
+
eval_before_first_step = true
|
31 |
+
# Might want to set these lower for eval so that less images get dropped (eval dataset size is usually much smaller than training set).
|
32 |
+
# Each size bucket of images/videos is rounded down to the nearest multiple of the global batch size, so higher global batch size means
|
33 |
+
# more dropped images. Usually doesn't matter for training but the eval set is much smaller so it can matter.
|
34 |
+
eval_micro_batch_size_per_gpu = 1
|
35 |
+
eval_gradient_accumulation_steps = 1
|
36 |
+
|
37 |
+
# misc settings
|
38 |
+
|
39 |
+
# Probably want to set this a bit higher if you have a smaller dataset so you don't end up with a million saved models.
|
40 |
+
save_every_n_epochs = 4
|
41 |
+
# Can checkpoint the training state every n number of epochs or minutes. Set only one of these. You can resume from checkpoints using the --resume_from_checkpoint flag.
|
42 |
+
#checkpoint_every_n_epochs = 1
|
43 |
+
checkpoint_every_n_minutes = 120
|
44 |
+
# Always set to true unless you have a huge amount of VRAM.
|
45 |
+
activation_checkpointing = true
|
46 |
+
# Controls how Deepspeed decides how to divide layers across GPUs. Probably don't change this.
|
47 |
+
partition_method = 'parameters'
|
48 |
+
# dtype for saving the LoRA or model, if different from training dtype
|
49 |
+
save_dtype = 'bfloat16'
|
50 |
+
# Batch size for caching latents and text embeddings. Increasing can lead to higher GPU utilization during caching phase but uses more memory.
|
51 |
+
caching_batch_size = 1
|
52 |
+
# How often deepspeed logs to console.
|
53 |
+
steps_per_print = 1
|
54 |
+
# How to extract video clips for training from a single input video file.
|
55 |
+
# The video file is first assigned to one of the configured frame buckets, but then we must extract one or more clips of exactly the right
|
56 |
+
# number of frames for that bucket.
|
57 |
+
# single_beginning: one clip starting at the beginning of the video
|
58 |
+
# single_middle: one clip from the middle of the video (cutting off the start and end equally)
|
59 |
+
# multiple_overlapping: extract the minimum number of clips to cover the full range of the video. They might overlap some.
|
60 |
+
# default is single_middle
|
61 |
+
video_clip_mode = 'single_middle'
|
62 |
+
|
63 |
+
[model]
|
64 |
+
type = 'hunyuan-video'
|
65 |
+
# Can load Hunyuan Video entirely from the ckpt path set up for the official inference scripts.
|
66 |
+
#ckpt_path = '/home/anon/HunyuanVideo/ckpts'
|
67 |
+
# Or you can load it by pointing to all the ComfyUI files.
|
68 |
+
transformer_path = '/notebooks/diffusion-pipe/hunyuan_model_files/hunyuan_dit.safetensors'
|
69 |
+
vae_path = '/notebooks/diffusion-pipe/hunyuan_model_files/vae.safetensors'
|
70 |
+
llm_path = '/notebooks/diffusion-pipe/hunyuan_model_files/llava-llama-3-8b-text-encoder-tokenizer/'
|
71 |
+
clip_path = '/notebooks/diffusion-pipe/hunyuan_model_files/clip-vit-large-patch14/'
|
72 |
+
# Base dtype used for all models.
|
73 |
+
dtype = 'bfloat16'
|
74 |
+
# Hunyuan Video supports fp8 for the transformer when training LoRA.
|
75 |
+
transformer_dtype = 'float8'
|
76 |
+
# How to sample timesteps to train on. Can be logit_normal or uniform.
|
77 |
+
timestep_sample_method = 'logit_normal'
|
78 |
+
|
79 |
+
[adapter]
|
80 |
+
type = 'lora'
|
81 |
+
rank = 32
|
82 |
+
# Dtype for the LoRA weights you are training.
|
83 |
+
dtype = 'bfloat16'
|
84 |
+
# You can initialize the lora weights from a previously trained lora.
|
85 |
+
#init_from_existing = '/data/diffusion_pipe_training_runs/something/epoch50'
|
86 |
+
|
87 |
+
[optimizer]
|
88 |
+
# AdamW from the optimi library is a good default since it automatically uses Kahan summation when training bfloat16 weights.
|
89 |
+
# Look at train.py for other options. You could also easily edit the file and add your own.
|
90 |
+
type = 'adamw_optimi'
|
91 |
+
lr = 2e-5
|
92 |
+
betas = [0.9, 0.99]
|
93 |
+
weight_decay = 0.01
|
94 |
+
eps = 1e-8
|
epoch148/adapter_config.json
ADDED
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"alpha_pattern": {},
|
3 |
+
"auto_mapping": null,
|
4 |
+
"base_model_name_or_path": null,
|
5 |
+
"bias": "none",
|
6 |
+
"eva_config": null,
|
7 |
+
"exclude_modules": null,
|
8 |
+
"fan_in_fan_out": false,
|
9 |
+
"inference_mode": false,
|
10 |
+
"init_lora_weights": true,
|
11 |
+
"layer_replication": null,
|
12 |
+
"layers_pattern": null,
|
13 |
+
"layers_to_transform": null,
|
14 |
+
"loftq_config": {},
|
15 |
+
"lora_alpha": 32,
|
16 |
+
"lora_bias": false,
|
17 |
+
"lora_dropout": 0.0,
|
18 |
+
"megatron_config": null,
|
19 |
+
"megatron_core": "megatron.core",
|
20 |
+
"modules_to_save": null,
|
21 |
+
"peft_type": "LORA",
|
22 |
+
"r": 32,
|
23 |
+
"rank_pattern": {},
|
24 |
+
"revision": null,
|
25 |
+
"target_modules": [
|
26 |
+
"img_attn_qkv",
|
27 |
+
"txt_attn_qkv",
|
28 |
+
"img_mod.linear",
|
29 |
+
"img_mlp.fc1",
|
30 |
+
"txt_mlp.fc1",
|
31 |
+
"linear1",
|
32 |
+
"linear2",
|
33 |
+
"img_mlp.fc2",
|
34 |
+
"img_attn_proj",
|
35 |
+
"txt_mod.linear",
|
36 |
+
"txt_mlp.fc2",
|
37 |
+
"modulation.linear",
|
38 |
+
"txt_attn_proj"
|
39 |
+
],
|
40 |
+
"task_type": null,
|
41 |
+
"use_dora": false,
|
42 |
+
"use_rslora": false
|
43 |
+
}
|
epoch148/adapter_model.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:5cfc441c42531f2697fea72f0356dea7b451e4b1c7bda12f19b308f3e520a018
|
3 |
+
size 322519480
|
epoch148/hunyuan_config.toml
ADDED
@@ -0,0 +1,94 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Output path for training runs. Each training run makes a new directory in here.
|
2 |
+
output_dir = '/notebooks/diffusion-pipe/output'
|
3 |
+
|
4 |
+
# Dataset config file.
|
5 |
+
dataset = '/notebooks/diffusion-pipe/dataset_files/dataset_config.toml'
|
6 |
+
# You can have separate eval datasets. Give them a name for Tensorboard metrics.
|
7 |
+
# eval_datasets = [
|
8 |
+
# {name = 'something', config = 'path/to/eval_dataset.toml'},
|
9 |
+
# ]
|
10 |
+
|
11 |
+
# training settings
|
12 |
+
|
13 |
+
# I usually set this to a really high value because I don't know how long I want to train.
|
14 |
+
epochs = 1000
|
15 |
+
# Batch size of a single forward/backward pass for one GPU.
|
16 |
+
micro_batch_size_per_gpu = 1
|
17 |
+
# Pipeline parallelism degree. A single instance of the model is divided across this many GPUs.
|
18 |
+
pipeline_stages = 1
|
19 |
+
# Number of micro-batches sent through the pipeline for each training step.
|
20 |
+
# If pipeline_stages > 1, a higher GAS means better GPU utilization due to smaller pipeline bubbles (where GPUs aren't overlapping computation).
|
21 |
+
gradient_accumulation_steps = 4
|
22 |
+
# Grad norm clipping.
|
23 |
+
gradient_clipping = 1.0
|
24 |
+
# Learning rate warmup.
|
25 |
+
warmup_steps = 100
|
26 |
+
|
27 |
+
# eval settings
|
28 |
+
|
29 |
+
eval_every_n_epochs = 1
|
30 |
+
eval_before_first_step = true
|
31 |
+
# Might want to set these lower for eval so that less images get dropped (eval dataset size is usually much smaller than training set).
|
32 |
+
# Each size bucket of images/videos is rounded down to the nearest multiple of the global batch size, so higher global batch size means
|
33 |
+
# more dropped images. Usually doesn't matter for training but the eval set is much smaller so it can matter.
|
34 |
+
eval_micro_batch_size_per_gpu = 1
|
35 |
+
eval_gradient_accumulation_steps = 1
|
36 |
+
|
37 |
+
# misc settings
|
38 |
+
|
39 |
+
# Probably want to set this a bit higher if you have a smaller dataset so you don't end up with a million saved models.
|
40 |
+
save_every_n_epochs = 4
|
41 |
+
# Can checkpoint the training state every n number of epochs or minutes. Set only one of these. You can resume from checkpoints using the --resume_from_checkpoint flag.
|
42 |
+
#checkpoint_every_n_epochs = 1
|
43 |
+
checkpoint_every_n_minutes = 120
|
44 |
+
# Always set to true unless you have a huge amount of VRAM.
|
45 |
+
activation_checkpointing = true
|
46 |
+
# Controls how Deepspeed decides how to divide layers across GPUs. Probably don't change this.
|
47 |
+
partition_method = 'parameters'
|
48 |
+
# dtype for saving the LoRA or model, if different from training dtype
|
49 |
+
save_dtype = 'bfloat16'
|
50 |
+
# Batch size for caching latents and text embeddings. Increasing can lead to higher GPU utilization during caching phase but uses more memory.
|
51 |
+
caching_batch_size = 1
|
52 |
+
# How often deepspeed logs to console.
|
53 |
+
steps_per_print = 1
|
54 |
+
# How to extract video clips for training from a single input video file.
|
55 |
+
# The video file is first assigned to one of the configured frame buckets, but then we must extract one or more clips of exactly the right
|
56 |
+
# number of frames for that bucket.
|
57 |
+
# single_beginning: one clip starting at the beginning of the video
|
58 |
+
# single_middle: one clip from the middle of the video (cutting off the start and end equally)
|
59 |
+
# multiple_overlapping: extract the minimum number of clips to cover the full range of the video. They might overlap some.
|
60 |
+
# default is single_middle
|
61 |
+
video_clip_mode = 'single_middle'
|
62 |
+
|
63 |
+
[model]
|
64 |
+
type = 'hunyuan-video'
|
65 |
+
# Can load Hunyuan Video entirely from the ckpt path set up for the official inference scripts.
|
66 |
+
#ckpt_path = '/home/anon/HunyuanVideo/ckpts'
|
67 |
+
# Or you can load it by pointing to all the ComfyUI files.
|
68 |
+
transformer_path = '/notebooks/diffusion-pipe/hunyuan_model_files/hunyuan_dit.safetensors'
|
69 |
+
vae_path = '/notebooks/diffusion-pipe/hunyuan_model_files/vae.safetensors'
|
70 |
+
llm_path = '/notebooks/diffusion-pipe/hunyuan_model_files/llava-llama-3-8b-text-encoder-tokenizer/'
|
71 |
+
clip_path = '/notebooks/diffusion-pipe/hunyuan_model_files/clip-vit-large-patch14/'
|
72 |
+
# Base dtype used for all models.
|
73 |
+
dtype = 'bfloat16'
|
74 |
+
# Hunyuan Video supports fp8 for the transformer when training LoRA.
|
75 |
+
transformer_dtype = 'float8'
|
76 |
+
# How to sample timesteps to train on. Can be logit_normal or uniform.
|
77 |
+
timestep_sample_method = 'logit_normal'
|
78 |
+
|
79 |
+
[adapter]
|
80 |
+
type = 'lora'
|
81 |
+
rank = 32
|
82 |
+
# Dtype for the LoRA weights you are training.
|
83 |
+
dtype = 'bfloat16'
|
84 |
+
# You can initialize the lora weights from a previously trained lora.
|
85 |
+
#init_from_existing = '/data/diffusion_pipe_training_runs/something/epoch50'
|
86 |
+
|
87 |
+
[optimizer]
|
88 |
+
# AdamW from the optimi library is a good default since it automatically uses Kahan summation when training bfloat16 weights.
|
89 |
+
# Look at train.py for other options. You could also easily edit the file and add your own.
|
90 |
+
type = 'adamw_optimi'
|
91 |
+
lr = 2e-5
|
92 |
+
betas = [0.9, 0.99]
|
93 |
+
weight_decay = 0.01
|
94 |
+
eps = 1e-8
|
epoch152/adapter_config.json
ADDED
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"alpha_pattern": {},
|
3 |
+
"auto_mapping": null,
|
4 |
+
"base_model_name_or_path": null,
|
5 |
+
"bias": "none",
|
6 |
+
"eva_config": null,
|
7 |
+
"exclude_modules": null,
|
8 |
+
"fan_in_fan_out": false,
|
9 |
+
"inference_mode": false,
|
10 |
+
"init_lora_weights": true,
|
11 |
+
"layer_replication": null,
|
12 |
+
"layers_pattern": null,
|
13 |
+
"layers_to_transform": null,
|
14 |
+
"loftq_config": {},
|
15 |
+
"lora_alpha": 32,
|
16 |
+
"lora_bias": false,
|
17 |
+
"lora_dropout": 0.0,
|
18 |
+
"megatron_config": null,
|
19 |
+
"megatron_core": "megatron.core",
|
20 |
+
"modules_to_save": null,
|
21 |
+
"peft_type": "LORA",
|
22 |
+
"r": 32,
|
23 |
+
"rank_pattern": {},
|
24 |
+
"revision": null,
|
25 |
+
"target_modules": [
|
26 |
+
"img_attn_qkv",
|
27 |
+
"txt_attn_qkv",
|
28 |
+
"img_mod.linear",
|
29 |
+
"img_mlp.fc1",
|
30 |
+
"txt_mlp.fc1",
|
31 |
+
"linear1",
|
32 |
+
"linear2",
|
33 |
+
"img_mlp.fc2",
|
34 |
+
"img_attn_proj",
|
35 |
+
"txt_mod.linear",
|
36 |
+
"txt_mlp.fc2",
|
37 |
+
"modulation.linear",
|
38 |
+
"txt_attn_proj"
|
39 |
+
],
|
40 |
+
"task_type": null,
|
41 |
+
"use_dora": false,
|
42 |
+
"use_rslora": false
|
43 |
+
}
|
epoch152/adapter_model.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:67215223c3ea863d6406fb8533902e246950dd78e971d400386e448e6309aee3
|
3 |
+
size 322519480
|
epoch152/hunyuan_config.toml
ADDED
@@ -0,0 +1,94 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Output path for training runs. Each training run makes a new directory in here.
|
2 |
+
output_dir = '/notebooks/diffusion-pipe/output'
|
3 |
+
|
4 |
+
# Dataset config file.
|
5 |
+
dataset = '/notebooks/diffusion-pipe/dataset_files/dataset_config.toml'
|
6 |
+
# You can have separate eval datasets. Give them a name for Tensorboard metrics.
|
7 |
+
# eval_datasets = [
|
8 |
+
# {name = 'something', config = 'path/to/eval_dataset.toml'},
|
9 |
+
# ]
|
10 |
+
|
11 |
+
# training settings
|
12 |
+
|
13 |
+
# I usually set this to a really high value because I don't know how long I want to train.
|
14 |
+
epochs = 1000
|
15 |
+
# Batch size of a single forward/backward pass for one GPU.
|
16 |
+
micro_batch_size_per_gpu = 1
|
17 |
+
# Pipeline parallelism degree. A single instance of the model is divided across this many GPUs.
|
18 |
+
pipeline_stages = 1
|
19 |
+
# Number of micro-batches sent through the pipeline for each training step.
|
20 |
+
# If pipeline_stages > 1, a higher GAS means better GPU utilization due to smaller pipeline bubbles (where GPUs aren't overlapping computation).
|
21 |
+
gradient_accumulation_steps = 4
|
22 |
+
# Grad norm clipping.
|
23 |
+
gradient_clipping = 1.0
|
24 |
+
# Learning rate warmup.
|
25 |
+
warmup_steps = 100
|
26 |
+
|
27 |
+
# eval settings
|
28 |
+
|
29 |
+
eval_every_n_epochs = 1
|
30 |
+
eval_before_first_step = true
|
31 |
+
# Might want to set these lower for eval so that less images get dropped (eval dataset size is usually much smaller than training set).
|
32 |
+
# Each size bucket of images/videos is rounded down to the nearest multiple of the global batch size, so higher global batch size means
|
33 |
+
# more dropped images. Usually doesn't matter for training but the eval set is much smaller so it can matter.
|
34 |
+
eval_micro_batch_size_per_gpu = 1
|
35 |
+
eval_gradient_accumulation_steps = 1
|
36 |
+
|
37 |
+
# misc settings
|
38 |
+
|
39 |
+
# Probably want to set this a bit higher if you have a smaller dataset so you don't end up with a million saved models.
|
40 |
+
save_every_n_epochs = 4
|
41 |
+
# Can checkpoint the training state every n number of epochs or minutes. Set only one of these. You can resume from checkpoints using the --resume_from_checkpoint flag.
|
42 |
+
#checkpoint_every_n_epochs = 1
|
43 |
+
checkpoint_every_n_minutes = 120
|
44 |
+
# Always set to true unless you have a huge amount of VRAM.
|
45 |
+
activation_checkpointing = true
|
46 |
+
# Controls how Deepspeed decides how to divide layers across GPUs. Probably don't change this.
|
47 |
+
partition_method = 'parameters'
|
48 |
+
# dtype for saving the LoRA or model, if different from training dtype
|
49 |
+
save_dtype = 'bfloat16'
|
50 |
+
# Batch size for caching latents and text embeddings. Increasing can lead to higher GPU utilization during caching phase but uses more memory.
|
51 |
+
caching_batch_size = 1
|
52 |
+
# How often deepspeed logs to console.
|
53 |
+
steps_per_print = 1
|
54 |
+
# How to extract video clips for training from a single input video file.
|
55 |
+
# The video file is first assigned to one of the configured frame buckets, but then we must extract one or more clips of exactly the right
|
56 |
+
# number of frames for that bucket.
|
57 |
+
# single_beginning: one clip starting at the beginning of the video
|
58 |
+
# single_middle: one clip from the middle of the video (cutting off the start and end equally)
|
59 |
+
# multiple_overlapping: extract the minimum number of clips to cover the full range of the video. They might overlap some.
|
60 |
+
# default is single_middle
|
61 |
+
video_clip_mode = 'single_middle'
|
62 |
+
|
63 |
+
[model]
|
64 |
+
type = 'hunyuan-video'
|
65 |
+
# Can load Hunyuan Video entirely from the ckpt path set up for the official inference scripts.
|
66 |
+
#ckpt_path = '/home/anon/HunyuanVideo/ckpts'
|
67 |
+
# Or you can load it by pointing to all the ComfyUI files.
|
68 |
+
transformer_path = '/notebooks/diffusion-pipe/hunyuan_model_files/hunyuan_dit.safetensors'
|
69 |
+
vae_path = '/notebooks/diffusion-pipe/hunyuan_model_files/vae.safetensors'
|
70 |
+
llm_path = '/notebooks/diffusion-pipe/hunyuan_model_files/llava-llama-3-8b-text-encoder-tokenizer/'
|
71 |
+
clip_path = '/notebooks/diffusion-pipe/hunyuan_model_files/clip-vit-large-patch14/'
|
72 |
+
# Base dtype used for all models.
|
73 |
+
dtype = 'bfloat16'
|
74 |
+
# Hunyuan Video supports fp8 for the transformer when training LoRA.
|
75 |
+
transformer_dtype = 'float8'
|
76 |
+
# How to sample timesteps to train on. Can be logit_normal or uniform.
|
77 |
+
timestep_sample_method = 'logit_normal'
|
78 |
+
|
79 |
+
[adapter]
|
80 |
+
type = 'lora'
|
81 |
+
rank = 32
|
82 |
+
# Dtype for the LoRA weights you are training.
|
83 |
+
dtype = 'bfloat16'
|
84 |
+
# You can initialize the lora weights from a previously trained lora.
|
85 |
+
#init_from_existing = '/data/diffusion_pipe_training_runs/something/epoch50'
|
86 |
+
|
87 |
+
[optimizer]
|
88 |
+
# AdamW from the optimi library is a good default since it automatically uses Kahan summation when training bfloat16 weights.
|
89 |
+
# Look at train.py for other options. You could also easily edit the file and add your own.
|
90 |
+
type = 'adamw_optimi'
|
91 |
+
lr = 2e-5
|
92 |
+
betas = [0.9, 0.99]
|
93 |
+
weight_decay = 0.01
|
94 |
+
eps = 1e-8
|
epoch156/adapter_config.json
ADDED
@@ -0,0 +1,43 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": null,
+  "bias": "none",
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": false,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 32,
+  "lora_bias": false,
+  "lora_dropout": 0.0,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 32,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "img_attn_qkv",
+    "txt_attn_qkv",
+    "img_mod.linear",
+    "img_mlp.fc1",
+    "txt_mlp.fc1",
+    "linear1",
+    "linear2",
+    "img_mlp.fc2",
+    "img_attn_proj",
+    "txt_mod.linear",
+    "txt_mlp.fc2",
+    "modulation.linear",
+    "txt_attn_proj"
+  ],
+  "task_type": null,
+  "use_dora": false,
+  "use_rslora": false
+}
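
As a quick sanity check of an adapter_config.json like the one above, a few lines of Python are enough to confirm the LoRA rank, alpha, and which Hunyuan Video modules the adapter targets. A minimal sketch; the local file path is an assumption and should point at whichever epoch folder you downloaded.

```python
# Minimal sketch: inspect a downloaded adapter_config.json (the path is an assumption).
import json

with open('epoch156/adapter_config.json') as f:
    cfg = json.load(f)

print('peft_type :', cfg['peft_type'])      # LORA
print('rank (r)  :', cfg['r'])              # 32
print('lora_alpha:', cfg['lora_alpha'])     # 32 -> effective scale alpha/r = 1.0
print('targets   :', ', '.join(sorted(cfg['target_modules'])))
```
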
epoch156/adapter_model.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4a322958a74f9cda35f6032de6d6d1931d810d86adb5246e69f2cb2af26b7981
+size 322519480
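
The .safetensors entries in this commit are Git LFS pointers: the repository stores only the sha256 oid and byte size shown above, while the actual ~322 MB weight file lives in LFS storage. Below is a hedged sketch for verifying a downloaded copy against the pointer; the local file path is an assumption, the oid and size come from the pointer itself.

```python
# Sketch: verify a downloaded LoRA weight file against the Git LFS pointer above.
# The local path is an assumption; expected oid/size are taken from the pointer.
import hashlib
import os

path = 'epoch156/adapter_model.safetensors'
expected_oid = '4a322958a74f9cda35f6032de6d6d1931d810d86adb5246e69f2cb2af26b7981'
expected_size = 322519480

h = hashlib.sha256()
with open(path, 'rb') as f:
    for chunk in iter(lambda: f.read(1 << 20), b''):  # hash in 1 MiB chunks
        h.update(chunk)

assert os.path.getsize(path) == expected_size, 'size mismatch'
assert h.hexdigest() == expected_oid, 'sha256 mismatch'
print('OK: file matches the LFS pointer')
```
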
epoch156/hunyuan_config.toml
ADDED
@@ -0,0 +1,94 @@
+# Output path for training runs. Each training run makes a new directory in here.
+output_dir = '/notebooks/diffusion-pipe/output'
+
+# Dataset config file.
+dataset = '/notebooks/diffusion-pipe/dataset_files/dataset_config.toml'
+# You can have separate eval datasets. Give them a name for Tensorboard metrics.
+# eval_datasets = [
+#     {name = 'something', config = 'path/to/eval_dataset.toml'},
+# ]
+
+# training settings
+
+# I usually set this to a really high value because I don't know how long I want to train.
+epochs = 1000
+# Batch size of a single forward/backward pass for one GPU.
+micro_batch_size_per_gpu = 1
+# Pipeline parallelism degree. A single instance of the model is divided across this many GPUs.
+pipeline_stages = 1
+# Number of micro-batches sent through the pipeline for each training step.
+# If pipeline_stages > 1, a higher GAS means better GPU utilization due to smaller pipeline bubbles (where GPUs aren't overlapping computation).
+gradient_accumulation_steps = 4
+# Grad norm clipping.
+gradient_clipping = 1.0
+# Learning rate warmup.
+warmup_steps = 100
+
+# eval settings
+
+eval_every_n_epochs = 1
+eval_before_first_step = true
+# Might want to set these lower for eval so that less images get dropped (eval dataset size is usually much smaller than training set).
+# Each size bucket of images/videos is rounded down to the nearest multiple of the global batch size, so higher global batch size means
+# more dropped images. Usually doesn't matter for training but the eval set is much smaller so it can matter.
+eval_micro_batch_size_per_gpu = 1
+eval_gradient_accumulation_steps = 1
+
+# misc settings
+
+# Probably want to set this a bit higher if you have a smaller dataset so you don't end up with a million saved models.
+save_every_n_epochs = 4
+# Can checkpoint the training state every n number of epochs or minutes. Set only one of these. You can resume from checkpoints using the --resume_from_checkpoint flag.
+#checkpoint_every_n_epochs = 1
+checkpoint_every_n_minutes = 120
+# Always set to true unless you have a huge amount of VRAM.
+activation_checkpointing = true
+# Controls how Deepspeed decides how to divide layers across GPUs. Probably don't change this.
+partition_method = 'parameters'
+# dtype for saving the LoRA or model, if different from training dtype
+save_dtype = 'bfloat16'
+# Batch size for caching latents and text embeddings. Increasing can lead to higher GPU utilization during caching phase but uses more memory.
+caching_batch_size = 1
+# How often deepspeed logs to console.
+steps_per_print = 1
+# How to extract video clips for training from a single input video file.
+# The video file is first assigned to one of the configured frame buckets, but then we must extract one or more clips of exactly the right
+# number of frames for that bucket.
+# single_beginning: one clip starting at the beginning of the video
+# single_middle: one clip from the middle of the video (cutting off the start and end equally)
+# multiple_overlapping: extract the minimum number of clips to cover the full range of the video. They might overlap some.
+# default is single_middle
+video_clip_mode = 'single_middle'
+
+[model]
+type = 'hunyuan-video'
+# Can load Hunyuan Video entirely from the ckpt path set up for the official inference scripts.
+#ckpt_path = '/home/anon/HunyuanVideo/ckpts'
+# Or you can load it by pointing to all the ComfyUI files.
+transformer_path = '/notebooks/diffusion-pipe/hunyuan_model_files/hunyuan_dit.safetensors'
+vae_path = '/notebooks/diffusion-pipe/hunyuan_model_files/vae.safetensors'
+llm_path = '/notebooks/diffusion-pipe/hunyuan_model_files/llava-llama-3-8b-text-encoder-tokenizer/'
+clip_path = '/notebooks/diffusion-pipe/hunyuan_model_files/clip-vit-large-patch14/'
+# Base dtype used for all models.
+dtype = 'bfloat16'
+# Hunyuan Video supports fp8 for the transformer when training LoRA.
+transformer_dtype = 'float8'
+# How to sample timesteps to train on. Can be logit_normal or uniform.
+timestep_sample_method = 'logit_normal'
+
+[adapter]
+type = 'lora'
+rank = 32
+# Dtype for the LoRA weights you are training.
+dtype = 'bfloat16'
+# You can initialize the lora weights from a previously trained lora.
+#init_from_existing = '/data/diffusion_pipe_training_runs/something/epoch50'
+
+[optimizer]
+# AdamW from the optimi library is a good default since it automatically uses Kahan summation when training bfloat16 weights.
+# Look at train.py for other options. You could also easily edit the file and add your own.
+type = 'adamw_optimi'
+lr = 2e-5
+betas = [0.9, 0.99]
+weight_decay = 0.01
+eps = 1e-8
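
Since every epoch folder carries the same hunyuan_config.toml, it is easy to load the file programmatically and derive a couple of quantities its comments refer to, such as the per-GPU contribution to the global batch size (micro_batch_size_per_gpu × gradient_accumulation_steps). A minimal sketch, assuming Python 3.11+ for the standard-library tomllib and a local copy of the file at a path of your choosing; the global batch size additionally scales with the number of data-parallel GPUs, which is not stored in the file.

```python
# Sketch: load the training config and derive a few quantities mentioned in its comments.
# Assumes Python 3.11+ (tomllib) and a local copy of the file; the path is an assumption.
import tomllib

with open('epoch156/hunyuan_config.toml', 'rb') as f:
    cfg = tomllib.load(f)

micro_bs = cfg['micro_batch_size_per_gpu']
gas = cfg['gradient_accumulation_steps']
print('optimizer     :', cfg['optimizer']['type'], '@ lr', cfg['optimizer']['lr'])
print('adapter       :', cfg['adapter']['type'], 'rank', cfg['adapter']['rank'])
print('per-GPU batch :', micro_bs * gas, '(micro_batch_size_per_gpu * gradient_accumulation_steps)')

# The comments say to set only one of the two checkpoint cadence options.
assert not (cfg.get('checkpoint_every_n_epochs') and cfg.get('checkpoint_every_n_minutes')), \
    'set only one of checkpoint_every_n_epochs / checkpoint_every_n_minutes'
```
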
epoch16/adapter_config.json
ADDED
@@ -0,0 +1,43 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": null,
+  "bias": "none",
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": false,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 32,
+  "lora_bias": false,
+  "lora_dropout": 0.0,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 32,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "img_attn_qkv",
+    "txt_attn_qkv",
+    "img_mod.linear",
+    "img_mlp.fc1",
+    "txt_mlp.fc1",
+    "linear1",
+    "linear2",
+    "img_mlp.fc2",
+    "img_attn_proj",
+    "txt_mod.linear",
+    "txt_mlp.fc2",
+    "modulation.linear",
+    "txt_attn_proj"
+  ],
+  "task_type": null,
+  "use_dora": false,
+  "use_rslora": false
+}
epoch16/adapter_model.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fd025bd0d222b9f859ff18d78197b11e43021ac0ead66bd694d4e620a7adad7c
+size 322519480