k4d3 committed
Commit e8af80a
1 Parent(s): 0f6b578

train functions and templates

training_scripts/t35-template ADDED
@@ -0,0 +1,149 @@
+ #!/usr/bin/env zsh
+ set -e -o pipefail
+
+ source "$HOME/toolkit/zsh/train_functions.zsh"
+
+ local name_default="by_momowaaai-v1s6000"
+ NAME="${NAME:-"$name_default"}"
+
+ # Extract the step count from the name
+ STEPS=$(extract_steps_from_name "$NAME" "4096")
+ # Remove the suffix (e.g. -v1s6000) from NAME
+ NAME=$(echo "$NAME" | awk -F'-' '{print $1}')
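+ # Worked example with name_default above:
+ #   by_momowaaai-v1s6000 → STEPS=6000 (trailing digits), NAME=by_momowaaai
+ # Note: awk keeps only the text before the first '-', so a base name that
+ # itself contains a dash would be truncated here.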
+
+ local training_dir_default="${HOME}/datasets/${NAME}"
+ TRAINING_DIR=${TRAINING_DIR:-"$training_dir_default"}
+
+ OUTPUT_DIR="${HOME}/output_dir"
+
+ SD_SCRIPT="${SD_SCRIPT:-sd3_train_network.py}"
+ SD_REPO="${SD_REPO:-$HOME/source/repos/sd-scripts-sd3}"
+
+ echo "\e[35mNAME\e[0m: $NAME, \e[35mTRAINING_DIR\e[0m: $TRAINING_DIR, \e[35mOUTPUT_DIR\e[0m: $OUTPUT_DIR"
+
+ [[ ! -d "$TRAINING_DIR" ]] && echo "ERROR: Training directory not found" && exit 1
+ if [[ -d "$OUTPUT_DIR/$NAME" ]]; then
+   echo "ERROR: Output directory already exists: $OUTPUT_DIR/$NAME"
+   exit 1
+ fi
+
+ # Initialize conda and activate the environment
+ setup_conda_env "sdscripts"
+
+ # Store the commit hashes of the libraries and copy this script to the output directory
+ LYCORIS_REPO=$(get_lycoris_repo) # Path of the editable lycoris package
+ store_commits_hashes "$SD_REPO" "$LYCORIS_REPO"
+
+ # alpha=1 @ dim=16 gives the same effective LR as alpha=4 @ dim=256
+ # (the invariant is alpha/sqrt(dim): 1/sqrt(16) = 4/sqrt(256) = 0.25)
+ # --min_snr_gamma=1
+ args=(
+   # ⚠️ TODO: Benchmark...
+   #--debiased_estimation_loss
+   # ⚠️ TODO: What does this do? Does it even work?
+   #--max_token_length=225
+   --clip_g_dropout_rate=0.0
+   --t5_dropout_rate=0.0
+   --enable_scaled_pos_embed
+   # Keep Tokens
+   --keep_tokens=1
+   --keep_tokens_separator="|||"
+   # Model
+   --pretrained_model_name_or_path=/home/kade/ComfyUI/models/checkpoints/sd3.5_large.safetensors
+   --clip_l=/home/kade/ComfyUI/models/clip/clip_l.safetensors
+   --clip_g=/home/kade/ComfyUI/models/clip/clip_g.safetensors
+   --t5xxl=/home/kade/ComfyUI/models/clip/t5xxl_fp16.safetensors
+   # Output, logging
+   --output_dir="$OUTPUT_DIR/$NAME"
+   --output_name="$NAME"
+   --log_prefix="$NAME-"
+   --log_with=tensorboard
+   --logging_dir="$OUTPUT_DIR/logs"
+   --seed=1728871242
+   --fp8_base
+   # Dataset
+   --dataset_config="$TRAINING_DIR/config.toml"
+   #--train_data_dir="$TRAINING_DIR"
+   --dataset_repeats=1
+   --resolution="1024,1024"
+   --enable_bucket
+   --bucket_reso_steps=64
+   --min_bucket_reso=128
+   --max_bucket_reso=2048
+   --flip_aug
+   --shuffle_caption
+   --cache_latents
+   --cache_latents_to_disk
+   --max_data_loader_n_workers=8
+   --persistent_data_loader_workers
+   # Network config
+   --network_dim=64
+   # ⚠️ TODO: Plot
+   --network_alpha=0.0625
+   --network_module="lycoris.kohya"
+   --network_args
+   "preset=full"
+   "decompose_both=False"
+   "rank_dropout=0"
+   "module_dropout=0"
+   "use_tucker=True"
+   "use_scalar=False"
+   "rank_dropout_scale=False"
+   "algo=lokr"
+   "bypass_mode=False"
+   "factor=16"
+   "dora_wd=True"
+   "train_norm=False"
+   --network_dropout=0
+   # Optimizer config
+   --optimizer_type=ClybW
+   --train_batch_size=14
+   #--gradient_accumulation_steps=1
+   --max_grad_norm=1
+   --gradient_checkpointing
+   #--scale_weight_norms=1
+   # LR scheduling
+   --max_train_steps=$STEPS
+   #--lr_warmup_steps=100
+   # NOTE: 0.0004 if it's anything like FLUX...
+   --learning_rate=0.0005
+   --unet_lr=0.0002
+   --text_encoder_lr=0.0001
+   --lr_scheduler="cosine"
+   --lr_scheduler_args="num_cycles=0.375"
+   # Noise
+   --multires_noise_iterations=12
+   --multires_noise_discount=0.4
+   #--min_snr_gamma=1
+   # Optimization, details
+   --no_half_vae
+   --sdpa
+   --mixed_precision="bf16"
+   # Saving
+   --save_model_as="safetensors"
+   --save_precision="fp16"
+   --save_every_n_steps=100
+   # Saving states
+   #--save_state
+   # Either resume from a saved state
+   #--resume="$OUTPUT_DIR/wolflink-vfucks400" # Resume from saved state
+   #--skip_until_initial_step
+   # Or from a checkpoint
+   #--network_weights="$OUTPUT_DIR/wolflink-vfucks400/wolflink-vfucks400-step00000120.safetensors" # Resume from checkpoint (not needed with state, I think)
+   #--initial_step=120
+   # Sampling
+   --sample_every_n_steps=25
+   --sample_prompts="$TRAINING_DIR/sample-prompts.txt"
+   --sample_sampler="euler_a"
+   --sample_at_first
+   --caption_extension=".txt"
+ )
+
+ run_training_script "$SD_REPO/$SD_SCRIPT" "${args[@]}" "$@"
+
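+ # Usage sketch (assumed invocation; NAME and TRAINING_DIR are optional overrides):
+ #   NAME=by_someartist-v2s8000 ./t35-template
+ # Set DEBUG=1 for a dry run: run_training_script then only prints the
+ # resolved arguments and exits without launching python.
+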
+ # Delete the output directory if the run produced nothing:
+ # no samples under $OUTPUT_DIR/$NAME/samples/ and no safetensors under $OUTPUT_DIR/$NAME/.
+ # The (N) qualifier makes each glob expand to an empty array instead of erroring when nothing matches.
+ local samples=("$OUTPUT_DIR/$NAME"/samples/*(N))
+ local models=("$OUTPUT_DIR/$NAME"/*.safetensors(N))
+ if (( ${#samples} == 0 && ${#models} == 0 )); then
+   echo "No samples or model files found, deleting empty output directory"
+   rm -rf "$OUTPUT_DIR/$NAME"
+ fi
+
training_scripts/tp-template ADDED
@@ -0,0 +1,132 @@
+ #!/usr/bin/env zsh
+ set -e -o pipefail
+
+ local name_default="by_"
+ NAME="${NAME:-"$name_default"}"
+
+ local training_dir_default="${HOME}/datasets/${NAME}"
+ TRAINING_DIR=${TRAINING_DIR:-"$training_dir_default"}
+
+ OUTPUT_DIR="${HOME}/output_dir"
+
+ SD_SCRIPT="${SD_SCRIPT:-sdxl_train_network.py}"
+ SD_REPO="${SD_REPO:-$HOME/source/repos/sd-scripts-sd3}"
+
+ echo "\e[35mNAME\e[0m: $NAME, \e[35mTRAINING_DIR\e[0m: $TRAINING_DIR, \e[35mOUTPUT_DIR\e[0m: $OUTPUT_DIR"
+
+ [[ ! -d "$TRAINING_DIR" ]] && echo "ERROR: Training directory not found" && exit 1
+ if [[ -d "$OUTPUT_DIR/$NAME" ]]; then
+   echo "ERROR: Output directory already exists: $OUTPUT_DIR/$NAME"
+   exit 1
+ fi
+
+ source "$HOME/toolkit/zsh/train_functions.zsh"
+
+ # Extract the step count from the name
+ STEPS=$(extract_steps_from_name "$NAME" "4096")
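+ # Note: unlike t35-template, this template does not strip the -v1sNNNN style
+ # suffix, so NAME (and the TRAINING_DIR default above) keeps the full name.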
+
+ # Initialize conda and activate the environment
+ setup_conda_env "sdscripts"
+
+ # Store the commit hashes of the libraries and copy this script to the output directory
+ LYCORIS_REPO=$(get_lycoris_repo) # Path of the editable lycoris package
+ store_commits_hashes "$SD_REPO" "$LYCORIS_REPO"
+
+ # alpha=1 @ dim=16 gives the same effective LR as alpha=4 @ dim=256
+ # (the invariant is alpha/sqrt(dim): 1/sqrt(16) = 4/sqrt(256) = 0.25)
+ # --min_snr_gamma=1
+ args=(
+   # ⚠️ TODO: Benchmark...
+   --debiased_estimation_loss
+   # ⚠️ TODO: What does this do? Does it even work?
+   --max_token_length=225
+   # Keep Tokens
+   --keep_tokens=1
+   --keep_tokens_separator="|||"
+   # Model
+   --pretrained_model_name_or_path=/home/kade/ComfyUI/models/checkpoints/ponyDiffusionV6XL_v6StartWithThisOne.safetensors
+   # Output, logging
+   --output_dir="$OUTPUT_DIR/$NAME"
+   --output_name="$NAME"
+   --log_prefix="$NAME-"
+   --log_with=tensorboard
+   --logging_dir="$OUTPUT_DIR/logs"
+   --seed=1728871242
+   # Dataset
+   --train_data_dir="$TRAINING_DIR"
+   --dataset_repeats=1
+   --resolution="1024,1024"
+   --enable_bucket
+   --bucket_reso_steps=64
+   --min_bucket_reso=256
+   --max_bucket_reso=2048
+   --flip_aug
+   --shuffle_caption
+   --cache_latents
+   --cache_latents_to_disk
+   --max_data_loader_n_workers=8
+   --persistent_data_loader_workers
+   # Network config
+   --network_dim=100000
+   # ⚠️ TODO: Plot
+   --network_alpha=64
+   --network_module="lycoris.kohya"
+   --network_args
+   "preset=full"
+   "conv_dim=100000"
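+   # The oversized dim/conv_dim values are assumed to act as "don't cap the
+   # rank" sentinels here: with algo=lokr and factor=16, LyCORIS derives the
+   # effective dimensions from the factorization, so 100000 just exceeds any
+   # real module dimension.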
+   "decompose_both=False"
+   "conv_alpha=64"
+   "rank_dropout=0"
+   "module_dropout=0"
+   "use_tucker=True"
+   "use_scalar=False"
+   "rank_dropout_scale=False"
+   "algo=lokr"
+   "bypass_mode=False"
+   "factor=16"
+   "dora_wd=True"
+   "train_norm=False"
+   --network_dropout=0
+   # Optimizer config
+   --optimizer_type=ClybW
+   --train_batch_size=14
+   #--gradient_accumulation_steps=1
+   --max_grad_norm=1
+   --gradient_checkpointing
+   #--scale_weight_norms=1
+   # LR scheduling
+   --max_train_steps=$STEPS
+   --lr_warmup_steps=0
+   --learning_rate=0.0003
+   --unet_lr=0.0003
+   --text_encoder_lr=0.00015
+   --lr_scheduler="cosine"
+   --lr_scheduler_args="num_cycles=0.375"
+   # Noise
+   --multires_noise_iterations=12
+   --multires_noise_discount=0.4
+   #--min_snr_gamma=1
+   # Optimization, details
+   --no_half_vae
+   --sdpa
+   --mixed_precision="bf16"
+   # Saving
+   --save_model_as="safetensors"
+   --save_precision="fp16"
+   --save_every_n_steps=100
+   # Sampling
+   --sample_every_n_steps=10
+   --sample_prompts="$TRAINING_DIR/sample-prompts.txt"
+   --sample_sampler="euler_a"
+   --sample_at_first
+   --caption_extension=".txt"
+ )
+
+ run_training_script "$SD_REPO/$SD_SCRIPT" "${args[@]}" "$@"
+
+ # Delete the output directory if the run produced nothing:
+ # no samples under $OUTPUT_DIR/$NAME/samples/ and no safetensors under $OUTPUT_DIR/$NAME/.
+ # The (N) qualifier makes each glob expand to an empty array instead of erroring when nothing matches.
+ local samples=("$OUTPUT_DIR/$NAME"/samples/*(N))
+ local models=("$OUTPUT_DIR/$NAME"/*.safetensors(N))
+ if (( ${#samples} == 0 && ${#models} == 0 )); then
+   echo "No samples or model files found, deleting empty output directory"
+   rm -rf "$OUTPUT_DIR/$NAME"
+ fi
+
zsh/train_functions.zsh ADDED
@@ -0,0 +1,205 @@
+ # Functions for sd-scripts training scripts
+
+ # Executes a training script located at the specified path with the provided arguments.
+ #
+ # Parameters:
+ # - script_path: The path to the training script to be executed.
+ # - args_array: An array of arguments to be passed to the training script.
+ #
+ # Behavior:
+ # - Changes the current directory to the directory of the script.
+ # - Prints the working directory and the arguments.
+ # - If the DEBUG environment variable is set, performs a dry run and skips execution.
+ # - Otherwise executes the script with `python` and captures the exit code.
+ # - Returns to the original directory before exiting.
+ #
+ # Returns:
+ # - The exit code of the executed script.
+ run_training_script() {
+   local script_path="$1"
+   local args_array=("${@:2}") # Get all arguments after the first one
+
+   # Store the current directory
+   local current_dir=$(pwd)
+
+   # Change to the script directory
+   local script_dir=$(dirname "$script_path")
+   local script_name=$(basename "$script_path")
+   cd "$script_dir" || return 1
+
+   # Test that the script exists
+   [[ ! -f "$script_name" ]] && echo "\e[31mERROR\e[0m: Script not found: $script_name" && return 1
+
+   echo "Working directory: $(pwd)\nRunning $script_name with arguments:"
+   for arg in "${args_array[@]}"; do
+     echo "  $arg"
+   done
+
+   if [[ -n "$DEBUG" ]]; then
+     echo "This was a dry run, exiting."
+     local exit_code=0
+   else
+     python "$script_name" "${args_array[@]}"
+     local exit_code=$?
+   fi
+
+   # Return to the original directory
+   cd "$current_dir"
+
+   return $exit_code
+ }
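+ # Usage sketch (hypothetical path):
+ #   run_training_script "$HOME/source/repos/sd-scripts-sd3/sd3_train_network.py" --seed=42
+ # Set DEBUG=1 to only print the resolved arguments without launching python.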
+
+ # Sets up and activates a specified Conda environment.
+ #
+ # Parameters:
+ # - env_name: The name of the Conda environment to activate.
+ # - conda_path: (Optional) The path to the Conda installation. Defaults to $HOME/miniconda3.
+ #
+ # Behavior:
+ # - Checks that the environment name is provided and that the Conda installation exists.
+ # - Initializes Conda for the current shell session.
+ # - Activates the specified Conda environment and verifies its activation.
+ #
+ # Returns:
+ # - 0 on success, or 1 if any error occurs (e.g. missing environment name, Conda installation not found, activation failure).
+ setup_conda_env() {
+   local env_name="$1"
+   [[ -z "$env_name" ]] && echo "\e[31mERROR\e[0m: Environment name required" && return 1
+
+   local conda_path="${2:-$HOME/miniconda3}"
+   [[ ! -d "$conda_path" ]] && echo "\e[31mERROR\e[0m: Conda installation not found at $conda_path" && return 1
+
+   # Initialize conda for the shell session
+   if __conda_setup="$("$conda_path/bin/conda" 'shell.zsh' 'hook' 2>/dev/null)" && eval "$__conda_setup"; then
+     unset __conda_setup
+   else
+     echo "\e[31mERROR\e[0m: Failed to initialize conda environment" && return 1
+   fi
+
+   # Activate the conda environment
+   if ! conda activate "$env_name"; then
+     echo "\e[31mERROR\e[0m: Failed to activate conda environment: $env_name"
+     return 1
+   fi
+   echo "Conda environment: $CONDA_PREFIX"
+
+   # Verify the environment exists in conda's registry
+   if ! conda env list | grep -q "^${env_name} "; then
+     echo "\e[31mERROR\e[0m: Environment $env_name not found"
+     return 1
+   fi
+ }
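+ # Usage sketch: setup_conda_env "sdscripts"
+ # or with a custom conda root (hypothetical path): setup_conda_env "sdscripts" "$HOME/mambaforge"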
+
+ # Extracts the number of steps from a given name, or falls back to a default value.
+ #
+ # Parameters:
+ # - name: The name from which to extract the step count.
+ # - default_steps: The default number of steps to use if extraction fails.
+ #
+ # Behavior:
+ # - Uses a regular expression to find a number at the end of the name.
+ # - If no step count is found, uses the default value and prints a warning.
+ # - If the extracted step count is less than 1, also uses the default value and prints a warning.
+ #
+ # Returns:
+ # - The extracted or default step count, printed on stdout.
+ extract_steps_from_name() {
+   local name="$1"
+   local default_steps="$2"
+
+   # Extract the trailing digits, with validation
+   local steps=$(echo "$name" | grep -oE '[0-9]+$')
+   if [[ -z "$steps" ]]; then
+     steps="$default_steps"
+     echo "⚠️ No step count found in NAME. Using default: \e[35m$steps\e[0m" >&2
+   elif ((steps < 1)); then
+     echo "⚠️ Invalid step count. Using default: \e[35m$default_steps\e[0m" >&2
+     steps="$default_steps"
+   fi
+
+   echo "$steps"
+ }
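+ # Examples (using the name from t35-template):
+ #   extract_steps_from_name "by_momowaaai-v1s6000" 4096   # prints 6000
+ #   extract_steps_from_name "by_momowaaai" 4096           # warns, prints 4096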
+
+ # Stores the commit hashes of the given Git repositories and copies the calling
+ # script to the output directory.
+ #
+ # Parameters:
+ # - repo_path...: One or more paths to Git repositories.
+ #
+ # Behavior:
+ # - Writes to $OUTPUT_DIR/$NAME/repos.git; both variables are globals set by the caller.
+ # - Creates the output directory if it does not exist.
+ # - Iterates over each repository path, checking that it is a valid Git repository.
+ # - Records the checked-out branch and current commit SHA of each repository in the output file.
+ # - Copies the calling script to the output directory and appends its SHA-1 hash to the output file.
+ #
+ # Returns:
+ # - 0 on success, or 1 if any error occurs during the process (e.g. Git command failure, not a Git repository).
+ store_commits_hashes() {
+   # Construct the output directory path
+   local output_dir="$OUTPUT_DIR/$NAME"
+   # Define the path of the output file
+   local output_file="$output_dir/repos.git"
+   # If DEBUG is set, print the output directory path
+   [[ -n "$DEBUG" ]] && echo "Output directory: $output_dir"
+   # Create the output directory if it doesn't exist
+   [[ ! -d "$output_dir" ]] && mkdir -p "$output_dir"
+   # Create or truncate the output file
+   : >"$output_file"
+
+   local summary=""
+   local res=0
+   # Declare these up front: `if local x=$(cmd)` would test the exit status of
+   # `local` itself (always 0), not of the command substitution.
+   local repo_path repo_name commit_sha branch_name
+
+   for repo_path in "$@"; do
+     repo_name=$(basename "$repo_path")
+     if [[ -d "$repo_path/.git" ]]; then
+       if commit_sha=$(git -C "$repo_path" rev-parse HEAD 2>/dev/null); then
+         # Get the checked-out branch
+         if branch_name=$(git -C "$repo_path" rev-parse --abbrev-ref HEAD 2>/dev/null); then
+           echo "$repo_path: ($branch_name) $commit_sha" >>"$output_file"
+           summary+="✓ $repo_name: $commit_sha ($branch_name) $repo_path\n"
+         else
+           echo "$repo_path: $commit_sha (Failed to get branch)" >>"$output_file"
+           summary+="⚠️ $repo_name: $commit_sha (Failed to get branch) $repo_path\n"
+           res=1
+         fi
+       else
+         echo "$repo_path: Git command failed" >>"$output_file"
+         summary+="⚠️ $repo_name: Git command failed $repo_path\n"
+         res=1
+       fi
+     else
+       echo "$repo_path: Not a git repository" >>"$output_file"
+       summary+="⚠️ $repo_name: Not a git repository $repo_path\n"
+       res=1
+     fi
+   done
+
+   # Copy the script to the output directory
+   local script_path=$(readlink -f "$ZSH_SCRIPT")
+   cp "$script_path" "$output_dir/$(basename "$script_path")"
+   [[ -n "$DEBUG" ]] && echo "Copied $script_path to $output_dir"
+
+   # Append the script hash, with error handling
+   local script_sha
+   if script_sha=$(sha1sum "$script_path" | cut -f1 -d' '); then
+     echo "$script_path: $script_sha" >>"$output_file"
+     summary+="✓ $ZSH_SCRIPT: $script_sha\n"
+   else
+     echo "$script_path: Failed to generate SHA-1" >>"$output_file"
+     summary+="⚠️ Failed to generate script SHA-1\n"
+     res=1
+   fi
+
+   echo -e "$summary"
+   return $res
+ }
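+ # Example repos.git contents after a run (hypothetical SHAs and paths):
+ #   /home/user/source/repos/sd-scripts-sd3: (sd3) 3f2c9b7e...
+ #   /home/user/source/repos/LyCORIS: (main) a41d0e25...
+ #   /home/user/training_scripts/t35-template: 9c81f3d2...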
+
+ # Prints the repository path of the installed (editable) lycoris package by
+ # resolving the package origin and going two directories up.
+ get_lycoris_repo() {
+   python -c '
+ import importlib.util
+ import pathlib
+ spec = importlib.util.find_spec("lycoris")
+ print(pathlib.Path(spec.origin).parent.parent)
+ '
+ }
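+ # Usage sketch: LYCORIS_REPO=$(get_lycoris_repo)
+ # e.g. prints /home/user/source/repos/LyCORIS for an editable install cloned there (hypothetical path).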