k4d3 committed on
Commit
1ecba12
1 Parent(s): e8af80a

train scripts qol, manual refactor

Browse files
training_scripts/t35-template CHANGED
@@ -1,41 +1,11 @@
1
  #!/usr/bin/env zsh
2
  set -e -o pipefail
3
 
4
- source "$HOME/toolkit/zsh/train_functions.zsh"
5
-
6
- local name_default="by_momowaaai-v1s6000"
7
- NAME="${NAME:-"$name_default"}"
8
-
9
- # Extract steps from name
10
- STEPS=$(extract_steps_from_name "$NAME" "4096")
11
- # remove the suffix (eg. -v1s6000) from NAME
12
- NAME=$(echo "$NAME" | awk -F'-' '{print $1}')
13
-
14
- local training_dir_default="${HOME}/datasets/${NAME}"
15
- TRAINING_DIR=${TRAINING_DIR:-"$training_dir_default"}
16
-
17
- OUTPUT_DIR="${HOME}/output_dir"
18
 
19
  SD_SCRIPT="${SD_SCRIPT:-sd3_train_network.py}"
20
  SD_REPO="${SD_REPO:-$HOME/source/repos/sd-scripts-sd3}"
21
 
22
- echo "\e[35mNAME\e[0m: $NAME, \e[35mTRAINING_DIR\e[0m: $TRAINING_DIR, \e[35mOUTPUT_DIR\e[0m: $OUTPUT_DIR"
23
-
24
- [[ ! -d "$TRAINING_DIR" ]] && echo "ERROR: Training directory not found" && exit 1
25
- if [[ -d "$OUTPUT_DIR/$NAME" ]]; then
26
- echo "ERROR: Output directory already exists: $OUTPUT_DIR/$NAME"
27
- exit 1
28
- fi
29
-
30
- # Initialize conda and activate environment
31
- setup_conda_env "sdscripts"
32
-
33
- # Store the commits hashes for libraries, copy the script to the output directory
34
- LYCORIS_REPO=$(get_lycoris_repo) # Python package editable path
35
- store_commits_hashes "$SD_REPO" "$LYCORIS_REPO"
36
-
37
- # alpha=1 @ dim=16 is the same lr than alpha=4 @ dim=256
38
- # --min_snr_gamma=1
39
  args=(
40
  # ⚠️ TODO: Benchmark...
41
  #--debiased_estimation_loss
@@ -53,16 +23,10 @@ args=(
53
  --clip_g=/home/kade/ComfyUI/models/clip/clip_g.safetensors
54
  --t5xxl=/home/kade/ComfyUI/models/clip/t5xxl_fp16.safetensors
55
  # Output, logging
56
- --output_dir="$OUTPUT_DIR/$NAME"
57
- --output_name="$NAME"
58
- --log_prefix="$NAME-"
59
  --log_with=tensorboard
60
- --logging_dir="$OUTPUT_DIR/logs"
61
  --seed=1728871242
62
  --fp8_base
63
  # Dataset
64
- --dataset_config="$TRAINING_DIR/config.toml"
65
- #--train_data_dir="$TRAINING_DIR"
66
  --dataset_repeats=1
67
  --resolution="1024,1024"
68
  --enable_bucket
@@ -102,7 +66,6 @@ args=(
102
  --gradient_checkpointing
103
  #--scale_weight_norms=1
104
  # LR Scheduling
105
- --max_train_steps=$STEPS
106
  #--lr_warmup_steps=100
107
  # NOTE: 0.0004 if its anything like FLUX..
108
  --learning_rate=0.0005
@@ -132,12 +95,40 @@ args=(
132
  #--initial_step=120
133
  # Sampling
134
  --sample_every_n_steps=25
135
- --sample_prompts="$TRAINING_DIR/sample-prompts.txt"
136
  --sample_sampler="euler_a"
137
  --sample_at_first
138
  --caption_extension=".txt"
139
  )
140
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
141
  run_training_script "$SD_REPO/$SD_SCRIPT" "${args[@]}" "$@"
142
 
143
  # Delete the output directory if it is empty
@@ -145,5 +136,4 @@ run_training_script "$SD_REPO/$SD_SCRIPT" "${args[@]}" "$@"
145
  if [[ ! -e "$OUTPUT_DIR/$NAME/samples/*" ]] && [[ ! -e "$OUTPUT_DIR/$NAME"/*.safetensors ]]; then
146
  echo "No samples or model files found, deleting empty output directory"
147
  rm -rf "$OUTPUT_DIR/$NAME"
148
- fi
149
-
 
1
  #!/usr/bin/env zsh
2
  set -e -o pipefail
3
 
4
+ NAME=test-deleteme-v0s100
 
 
 
 
 
 
 
 
 
 
 
 
 
5
 
6
  SD_SCRIPT="${SD_SCRIPT:-sd3_train_network.py}"
7
  SD_REPO="${SD_REPO:-$HOME/source/repos/sd-scripts-sd3}"
8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
  args=(
10
  # ⚠️ TODO: Benchmark...
11
  #--debiased_estimation_loss
 
23
  --clip_g=/home/kade/ComfyUI/models/clip/clip_g.safetensors
24
  --t5xxl=/home/kade/ComfyUI/models/clip/t5xxl_fp16.safetensors
25
  # Output, logging
 
 
 
26
  --log_with=tensorboard
 
27
  --seed=1728871242
28
  --fp8_base
29
  # Dataset
 
 
30
  --dataset_repeats=1
31
  --resolution="1024,1024"
32
  --enable_bucket
 
66
  --gradient_checkpointing
67
  #--scale_weight_norms=1
68
  # LR Scheduling
 
69
  #--lr_warmup_steps=100
70
  # NOTE: 0.0004 if its anything like FLUX..
71
  --learning_rate=0.0005
 
95
  #--initial_step=120
96
  # Sampling
97
  --sample_every_n_steps=25
 
98
  --sample_sampler="euler_a"
99
  --sample_at_first
100
  --caption_extension=".txt"
101
  )
102
 
103
+ # ===== Default variables =====
104
+ # Remove suffix from NAME (eg. -v1s4096)
105
+ DATASET_NAME="${NAME%-*}"
106
+ TRAINING_DIR="${TRAINING_DIR:-"${HOME}/datasets/${DATASET_NAME}"}"
107
+ # Extract steps from name (depends on NAME)
108
+ STEPS=${STEPS:-"${NAME##*[^0-9]}"}
109
+ OUTPUT_DIR="${HOME}/output_dir"
110
+
111
+ # Everything that depends on the environment or computed defaults goes here
112
+ args+=(
113
+ --output_dir="$OUTPUT_DIR/$NAME"
114
+ --output_name="$NAME"
115
+ --log_prefix="$NAME-"
116
+ --logging_dir="$OUTPUT_DIR/logs"
117
+
118
+ --max_train_steps=$STEPS
119
+ --dataset_config="$TRAINING_DIR/config.toml"
120
+ #--train_data_dir="$TRAINING_DIR"
121
+ --sample_prompts="$TRAINING_DIR/sample-prompts.txt"
122
+ )
123
+
124
+ # ===== Environment Setup =====
125
+ source "$HOME/toolkit/zsh/train_functions.zsh"
126
+ validate_environment
127
+ setup_conda_env "sdscripts"
128
+ LYCORIS_REPO=$(get_lycoris_repo)
129
+ store_commits_hashes "$SD_REPO" "$LYCORIS_REPO"
130
+
131
+ # ===== Run Training Script =====
132
  run_training_script "$SD_REPO/$SD_SCRIPT" "${args[@]}" "$@"
133
 
134
  # Delete the output directory if it is empty
 
136
  if [[ ! -e "$OUTPUT_DIR/$NAME/samples/*" ]] && [[ ! -e "$OUTPUT_DIR/$NAME"/*.safetensors ]]; then
137
  echo "No samples or model files found, deleting empty output directory"
138
  rm -rf "$OUTPUT_DIR/$NAME"
139
+ fi
 
training_scripts/tp-template CHANGED
@@ -1,37 +1,11 @@
1
  #!/usr/bin/env zsh
2
  set -e -o pipefail
3
 
4
- local name_default="by_"
5
- NAME="${NAME:-"$name_default"}"
6
-
7
- local training_dir_default="${HOME}/datasets/${NAME}"
8
- TRAINING_DIR=${TRAINING_DIR:-"$training_dir_default"}
9
-
10
- OUTPUT_DIR="${HOME}/output_dir"
11
 
12
  SD_SCRIPT="${SD_SCRIPT:-sdxl_train_network.py}"
13
  SD_REPO="${SD_REPO:-$HOME/source/repos/sd-scripts-sd3}"
14
 
15
- echo "\e[35mNAME\e[0m: $NAME, \e[35mTRAINING_DIR\e[0m: $TRAINING_DIR, \e[35mOUTPUT_DIR\e[0m: $OUTPUT_DIR"
16
-
17
- [[ ! -d "$TRAINING_DIR" ]] && echo "ERROR: Training directory not found" && exit 1
18
- if [[ -d "$OUTPUT_DIR/$NAME" ]]; then
19
- echo "ERROR: Output directory already exists: $OUTPUT_DIR/$NAME"
20
- exit 1
21
- fi
22
-
23
- source "$HOME/toolkit/zsh/train_functions.zsh"
24
-
25
- # Extract steps from name
26
- STEPS=$(extract_steps_from_name "$NAME" "4096")
27
-
28
- # Initialize conda and activate environment
29
- setup_conda_env "sdscripts"
30
-
31
- # Store the commits hashes for libraries, copy the script to the output directory
32
- LYCORIS_REPO=$(get_lycoris_repo) # Python package editable path
33
- store_commits_hashes "$SD_REPO" "$LYCORIS_REPO"
34
-
35
  # alpha=1 @ dim=16 is the same lr as alpha=4 @ dim=256
36
  # --min_snr_gamma=1
37
  args=(
@@ -45,14 +19,9 @@ args=(
45
  # Model
46
  --pretrained_model_name_or_path=/home/kade/ComfyUI/models/checkpoints/ponyDiffusionV6XL_v6StartWithThisOne.safetensors
47
  # Output, logging
48
- --output_dir="$OUTPUT_DIR/$NAME"
49
- --output_name="$NAME"
50
- --log_prefix="$NAME-"
51
  --log_with=tensorboard
52
- --logging_dir="$OUTPUT_DIR/logs"
53
  --seed=1728871242
54
  # Dataset
55
- --train_data_dir="$TRAINING_DIR"
56
  --dataset_repeats=1
57
  --resolution="1024,1024"
58
  --enable_bucket
@@ -115,18 +84,41 @@ args=(
115
  --save_every_n_steps=100
116
  # Sampling
117
  --sample_every_n_steps=10
118
- --sample_prompts="$TRAINING_DIR/sample-prompts.txt"
119
  --sample_sampler="euler_a"
120
  --sample_at_first
121
  --caption_extension=".txt"
122
  )
123
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
124
  run_training_script "$SD_REPO/$SD_SCRIPT" "${args[@]}" "$@"
125
 
126
- # Delete the output directory if it is empty
127
- # that is if there is no samples under $OUTPUT_DIR/$NAME/samples/ and no safetensors under $OUTPUT_DIR/$NAME/
128
- if [[ ! -e "$OUTPUT_DIR/$NAME/samples/*" ]] && [[ ! -e "$OUTPUT_DIR/$NAME"/*.safetensors ]]; then
129
- echo "No samples or model files found, deleting empty output directory"
130
- rm -rf "$OUTPUT_DIR/$NAME"
131
- fi
132
 
 
1
  #!/usr/bin/env zsh
2
  set -e -o pipefail
3
 
4
+ NAME=test-deleteme-v0s100
 
 
 
 
 
 
5
 
6
  SD_SCRIPT="${SD_SCRIPT:-sdxl_train_network.py}"
7
  SD_REPO="${SD_REPO:-$HOME/source/repos/sd-scripts-sd3}"
8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
  # alpha=1 @ dim=16 is the same lr as alpha=4 @ dim=256
10
  # --min_snr_gamma=1
11
  args=(
 
19
  # Model
20
  --pretrained_model_name_or_path=/home/kade/ComfyUI/models/checkpoints/ponyDiffusionV6XL_v6StartWithThisOne.safetensors
21
  # Output, logging
 
 
 
22
  --log_with=tensorboard
 
23
  --seed=1728871242
24
  # Dataset
 
25
  --dataset_repeats=1
26
  --resolution="1024,1024"
27
  --enable_bucket
 
84
  --save_every_n_steps=100
85
  # Sampling
86
  --sample_every_n_steps=10
 
87
  --sample_sampler="euler_a"
88
  --sample_at_first
89
  --caption_extension=".txt"
90
  )
91
 
92
+ # ===== Default variables =====
93
+ # Remove suffix from NAME (eg. -v1s4096)
94
+ DATASET_NAME="${NAME%-*}"
95
+ TRAINING_DIR="${TRAINING_DIR:-"${HOME}/datasets/${DATASET_NAME}"}"
96
+ # Extract steps from name (depends on NAME)
97
+ STEPS=${STEPS:-"${NAME##*[^0-9]}"}
98
+ OUTPUT_DIR="${HOME}/output_dir"
99
+
100
+ # Everything that depends on the environment or computed defaults goes here
101
+ args+=(
102
+ --output_dir="$OUTPUT_DIR/$NAME"
103
+ --output_name="$NAME"
104
+ --log_prefix="$NAME-"
105
+ --logging_dir="$OUTPUT_DIR/logs"
106
+
107
+ --max_train_steps=$STEPS
108
+ --dataset_config="$TRAINING_DIR/config.toml"
109
+ #--train_data_dir="$TRAINING_DIR"
110
+ --sample_prompts="$TRAINING_DIR/sample-prompts.txt"
111
+ )
112
+
113
+ # ===== Environment Setup =====
114
+ source "$HOME/toolkit/zsh/train_functions.zsh"
115
+ setup_conda_env "sdscripts"
116
+ LYCORIS_REPO=$(get_lycoris_repo)
117
+ validate_environment
118
+ store_commits_hashes "$SD_REPO" "$LYCORIS_REPO"
119
+
120
+ # ===== Run Training Script =====
121
  run_training_script "$SD_REPO/$SD_SCRIPT" "${args[@]}" "$@"
122
 
123
+
 
 
 
 
 
124
 
zsh/train_functions.zsh CHANGED
@@ -35,10 +35,10 @@ run_training_script() {
35
  done
36
 
37
  if [[ -n "$DEBUG" ]]; then
38
- echo "This was a dry run, exiting."
39
  local exit_code=0
40
  else
41
- python "$(basename "$script_path")" "${args_array[@]}"
42
  local exit_code=$?
43
  fi
44
 
@@ -48,6 +48,19 @@ run_training_script() {
48
  return $exit_code
49
  }
50
 
 
 
 
 
 
 
 
 
 
 
 
 
 
51
  # Sets up and activates a specified Conda environment.
52
  #
53
  # Parameters:
@@ -90,37 +103,6 @@ setup_conda_env() {
90
  fi
91
  }
92
 
93
- # Extracts the number of steps from a given name or uses a default value if not found.
94
- #
95
- # Parameters:
96
- # - name: The name from which to extract the step count.
97
- # - default_steps: The default number of steps to use if extraction fails.
98
- #
99
- # Behavior:
100
- # - Uses a regular expression to find a number at the end of the name.
101
- # - If no valid step count is found, it uses the default value and prints a warning.
102
- # - If the extracted step count is less than 1, it also uses the default value and prints a warning.
103
- # - Prints a confirmation message if a valid step count is found.
104
- #
105
- # Returns:
106
- # - The extracted or default step count.
107
- extract_steps_from_name() {
108
- local name="$1"
109
- local default_steps="$2"
110
-
111
- # More robust step extraction with validation
112
- local steps=$(echo "$name" | grep -oE '[0-9]+$')
113
- if [[ -z "$steps" ]]; then
114
- steps="$default_steps"
115
- echo "⚠️ No step count found in NAME. Using default: \e[35m$steps\e[0m" >&2
116
- elif ((steps < 1)); then
117
- echo "⚠️ Invalid step count. Using default: \e[35m$default_steps\e[0m" >&2
118
- steps="$default_steps"
119
- fi
120
-
121
- echo "$steps"
122
- }
123
-
124
  # Stores the commit hashes of specified Git repositories and copies the script to an output directory.
125
  #
126
  # Parameters:
@@ -141,8 +123,6 @@ store_commits_hashes() {
141
  local output_dir="$OUTPUT_DIR/$NAME"
142
  # Define the path for the output file
143
  local output_file="$output_dir/repos.git"
144
- # If DEBUG is set, print the output directory path
145
- [[ -n "$DEBUG" ]] && echo "Output directory: $output_dir"
146
  # Create the output directory if it doesn't exist
147
  [[ ! -d "$output_dir" ]] && mkdir -p "$output_dir"
148
  # Create or truncate the output file
@@ -158,15 +138,15 @@ store_commits_hashes() {
158
  # Get the checked-out branch
159
  if local branch_name=$(git -C "$repo_path" rev-parse --abbrev-ref HEAD 2>/dev/null); then
160
  echo "$repo_path: ($branch_name) $commit_sha" >>"$output_file"
161
- summary+="✓ $repo_name: $commit_sha ($branch_name) $repo_path\n"
162
  else
163
  echo "$repo_path: $commit_sha (Failed to get branch)" >>"$output_file"
164
- summary+="⚠️ $repo_name: $commit_sha (Failed to get branch) $repo_path\n"
165
  res=1
166
  fi
167
  else
168
  echo "$repo_path: Git command failed" >>"$output_file"
169
- summary+="⚠️ $repo_name: Git command failed $repo_path\n"
170
  res=1
171
  fi
172
  else
@@ -182,14 +162,19 @@ store_commits_hashes() {
182
  [[ -n "$DEBUG" ]] && echo "Copied $script_path to $output_dir"
183
 
184
  # Add script hash with error handling
185
- if local script_sha=$(sha1sum "$script_path" | cut -f1 -d' '); then
186
- echo "$script_path: $script_sha" >>"$output_file"
187
- summary+="✓ $ZSH_SCRIPT: $script_sha\n"
188
- else
189
- echo "$script_path: Failed to generate SHA-1" >>"$output_file"
190
- summary+="⚠️ Failed to generate script SHA-1\n"
191
- res=1
192
- fi
 
 
 
 
 
193
 
194
  echo -e "$summary"
195
  return $res
 
35
  done
36
 
37
  if [[ -n "$DEBUG" ]]; then
38
+ echo "This was a dry run, exiting." | tee "$OUTPUT_DIR/$NAME/sdscripts.log"
39
  local exit_code=0
40
  else
41
+ python "$(basename "$script_path")" "${args_array[@]}" | tee "$OUTPUT_DIR/$NAME/sdscripts.log"
42
  local exit_code=$?
43
  fi
44
 
 
48
  return $exit_code
49
  }
50
 
51
# Prints the resolved training setup, then validates it.
#
# Globals (read): NAME, STEPS, TRAINING_DIR, OUTPUT_DIR, CONDA_PREFIX
#
# Behavior:
# - Prints the output name, step count, training directory and output directory,
#   followed by the current value of CONDA_PREFIX.
# - Paths are shown relative to the current directory when realpath can compute
#   that; otherwise they are printed verbatim instead of as an empty string.
# - Exits the calling script with status 1 if TRAINING_DIR is not a directory or
#   if OUTPUT_DIR/NAME already exists. Error messages are written to stderr.
validate_environment() {
    local output_path="$OUTPUT_DIR/$NAME"
    local training_rel output_rel

    # Fall back to the raw path when realpath is missing or cannot resolve it.
    training_rel=$(realpath --relative-to=. -- "$TRAINING_DIR" 2>/dev/null) || training_rel="$TRAINING_DIR"
    output_rel=$(realpath --relative-to=. -- "$output_path" 2>/dev/null) || output_rel="$output_path"

    # Values are passed as printf arguments so backslashes in them stay literal.
    printf '\033[35moutput_name\033[0m: %s, \033[35msteps\033[0m: %s, \033[35mtraining_dir\033[0m: %s, \033[35moutput_dir\033[0m: %s\n' \
        "$NAME" "$STEPS" "$training_rel" "$output_rel"
    printf '\033[35mconda_env\033[0m: %s\n' "${CONDA_PREFIX:-}"

    # ===== Validation =====
    if [[ ! -d "$TRAINING_DIR" ]]; then
        echo "ERROR: Training directory not found: $TRAINING_DIR" >&2
        exit 1
    fi
    # An existing output directory is treated as an error rather than reused.
    if [[ -d "$output_path" ]]; then
        echo "ERROR: Output directory already exists: $output_path" >&2
        exit 1
    fi
}
63
+
64
  # Sets up and activates a specified Conda environment.
65
  #
66
  # Parameters:
 
103
  fi
104
  }
105
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
106
  # Stores the commit hashes of specified Git repositories and copies the script to an output directory.
107
  #
108
  # Parameters:
 
123
  local output_dir="$OUTPUT_DIR/$NAME"
124
  # Define the path for the output file
125
  local output_file="$output_dir/repos.git"
 
 
126
  # Create the output directory if it doesn't exist
127
  [[ ! -d "$output_dir" ]] && mkdir -p "$output_dir"
128
  # Create or truncate the output file
 
138
  # Get the checked-out branch
139
  if local branch_name=$(git -C "$repo_path" rev-parse --abbrev-ref HEAD 2>/dev/null); then
140
  echo "$repo_path: ($branch_name) $commit_sha" >>"$output_file"
141
+ summary+="✓ $repo_name: $repo_path ${commit_sha:0:8} ($branch_name)\n"
142
  else
143
  echo "$repo_path: $commit_sha (Failed to get branch)" >>"$output_file"
144
+ summary+="⚠️ $repo_name: $repo_path ${commit_sha:0:8} (Failed to get branch)\n"
145
  res=1
146
  fi
147
  else
148
  echo "$repo_path: Git command failed" >>"$output_file"
149
+ summary+="⚠️ $repo_name: $repo_path (Git command failed) \n"
150
  res=1
151
  fi
152
  else
 
162
  [[ -n "$DEBUG" ]] && echo "Copied $script_path to $output_dir"
163
 
164
  # Add script hash with error handling
165
+ local script_sha=$(sha1sum "$script_path" | cut -f1 -d' ')
166
+ echo "$script_path: $script_sha" >>"$output_file"
167
+ summary+="✓ Training script: $ZSH_SCRIPT ${script_sha:0:8}\n"
168
+
169
+ # Computes hash for "$TRAINING_DIR/config.toml" and "$TRAINING_DIR/sample-prompts.txt" then copy them to the output directory
170
+ local config_sha=$(sha1sum "$TRAINING_DIR/config.toml" | cut -f1 -d' ')
171
+ local prompts_sha=$(sha1sum "$TRAINING_DIR/sample-prompts.txt" | cut -f1 -d' ')
172
+ cp "$TRAINING_DIR/config.toml" "$output_dir/config.toml"
173
+ cp "$TRAINING_DIR/sample-prompts.txt" "$output_dir/sample-prompts.txt"
174
+ echo "$TRAINING_DIR/config.toml: $config_sha" >>"$output_file"
175
+ echo "$TRAINING_DIR/sample-prompts.txt: $prompts_sha" >>"$output_file"
176
+ summary+="✓ Training config: $TRAINING_DIR/config.toml ${config_sha:0:8}\n"
177
+ summary+="✓ Training prompts: $TRAINING_DIR/sample-prompts.txt ${prompts_sha:0:8}\n"
178
 
179
  echo -e "$summary"
180
  return $res