train scripts qol, manual refactor

Browse files

Files changed (3) hide show

training_scripts/t35-template +31 -41
training_scripts/tp-template +31 -39
zsh/train_functions.zsh +31 -46

training_scripts/t35-template CHANGED Viewed

@@ -1,41 +1,11 @@
 #!/usr/bin/env zsh
 set -e -o pipefail
-source "$HOME/toolkit/zsh/train_functions.zsh"
-local name_default="by_momowaaai-v1s6000"
-NAME="${NAME:-"$name_default"}"
-# Extract steps from name
-STEPS=$(extract_steps_from_name "$NAME" "4096")
-# remove the suffix (eg. -v1s6000) from NAME
-NAME=$(echo "$NAME" | awk -F'-' '{print $1}')
-local training_dir_default="${HOME}/datasets/${NAME}"
-TRAINING_DIR=${TRAINING_DIR:-"$training_dir_default"}
-OUTPUT_DIR="${HOME}/output_dir"
 SD_SCRIPT="${SD_SCRIPT:-sd3_train_network.py}"
 SD_REPO="${SD_REPO:-$HOME/source/repos/sd-scripts-sd3}"
-echo "\e[35mNAME\e[0m: $NAME, \e[35mTRAINING_DIR\e[0m: $TRAINING_DIR, \e[35mOUTPUT_DIR\e[0m: $OUTPUT_DIR"
-[[ ! -d "$TRAINING_DIR" ]] && echo "ERROR: Training directory not found" && exit 1
-if [[ -d "$OUTPUT_DIR/$NAME" ]]; then
-    echo "ERROR: Output directory already exists: $OUTPUT_DIR/$NAME"
-    exit 1
-fi
-# Initialize conda and activate environment
-setup_conda_env "sdscripts"
-# Store the commits hashes for libraries, copy the script to the output directory
-LYCORIS_REPO=$(get_lycoris_repo) # Python package editable path
-store_commits_hashes "$SD_REPO" "$LYCORIS_REPO"
-# alpha=1 @ dim=16 is the same lr than alpha=4 @ dim=256
-# --min_snr_gamma=1
 args=(
     # ⚠️  TODO: Benchmark...
     #--debiased_estimation_loss
@@ -53,16 +23,10 @@ args=(
     --clip_g=/home/kade/ComfyUI/models/clip/clip_g.safetensors
     --t5xxl=/home/kade/ComfyUI/models/clip/t5xxl_fp16.safetensors
     # Output, logging
-    --output_dir="$OUTPUT_DIR/$NAME"
-    --output_name="$NAME"
-    --log_prefix="$NAME-"
     --log_with=tensorboard
-    --logging_dir="$OUTPUT_DIR/logs"
     --seed=1728871242
     --fp8_base
     # Dataset
-    --dataset_config="$TRAINING_DIR/config.toml"
-    #--train_data_dir="$TRAINING_DIR"
     --dataset_repeats=1
     --resolution="1024,1024"
     --enable_bucket
@@ -102,7 +66,6 @@ args=(
     --gradient_checkpointing
     #--scale_weight_norms=1
     # LR Scheduling
-    --max_train_steps=$STEPS
     #--lr_warmup_steps=100
     # NOTE: 0.0004 if its anything like FLUX..
     --learning_rate=0.0005
@@ -132,12 +95,40 @@ args=(
     #--initial_step=120
     # Sampling
     --sample_every_n_steps=25
-    --sample_prompts="$TRAINING_DIR/sample-prompts.txt"
     --sample_sampler="euler_a"
     --sample_at_first
     --caption_extension=".txt"
 )
 run_training_script "$SD_REPO/$SD_SCRIPT" "${args[@]}" "$@"
 # Delete the output directory if it is empty
@@ -145,5 +136,4 @@ run_training_script "$SD_REPO/$SD_SCRIPT" "${args[@]}" "$@"
 if [[ ! -e "$OUTPUT_DIR/$NAME/samples/*" ]] && [[ ! -e "$OUTPUT_DIR/$NAME"/*.safetensors ]]; then
     echo "No samples or model files found, deleting empty output directory"
     rm -rf "$OUTPUT_DIR/$NAME"
-fi

 #!/usr/bin/env zsh
 # Stop at the first failing command, including a failure in any pipeline stage.
 set -e -o pipefail
 # Run name. Further down, the part before the last '-' becomes the dataset
 # directory name and the trailing digits become the default step count.
+NAME=test-deleteme-v0s100
 # Training entry point and the repository that holds it; both can be
 # overridden from the environment.
 SD_SCRIPT="${SD_SCRIPT:-sd3_train_network.py}"
 SD_REPO="${SD_REPO:-$HOME/source/repos/sd-scripts-sd3}"
 # Arguments with fixed values. The ones derived from NAME, TRAINING_DIR and
 # OUTPUT_DIR are appended later with args+=( ... ).
 args=(
     # ⚠️  TODO: Benchmark...
     #--debiased_estimation_loss
     --clip_g=/home/kade/ComfyUI/models/clip/clip_g.safetensors
     --t5xxl=/home/kade/ComfyUI/models/clip/t5xxl_fp16.safetensors
     # Output, logging
     --log_with=tensorboard
     --seed=1728871242
     --fp8_base
     # Dataset
     --dataset_repeats=1
     --resolution="1024,1024"
     --enable_bucket
     --gradient_checkpointing
     #--scale_weight_norms=1
     # LR Scheduling
     #--lr_warmup_steps=100
     # NOTE: 0.0004 if it's anything like FLUX..
     --learning_rate=0.0005
     #--initial_step=120
     # Sampling
     --sample_every_n_steps=25
     --sample_sampler="euler_a"
     --sample_at_first
     --caption_extension=".txt"
 )
+# ===== Default variables =====
+# Dataset name: NAME without its last '-'-separated suffix (eg. -v1s4096)
+DATASET_NAME="${NAME%-*}"
+# TRAINING_DIR can be overridden from the environment
+TRAINING_DIR="${TRAINING_DIR:-"${HOME}/datasets/${DATASET_NAME}"}"
+# Step count: an explicit STEPS wins, otherwise use the trailing digits of NAME
+# (eg. 4096 for "foo-v1s4096").
+STEPS="${STEPS:-"${NAME##*[^0-9]}"}"
+# A NAME that does not end in a digit yields an empty string; fall back to 4096
+# (the default the old extract_steps_from_name helper used) instead of passing
+# an empty --max_train_steps= to the trainer.
+if [[ -z "$STEPS" ]]; then
+    STEPS=4096
+    echo "⚠️  No step count found in NAME. Using default: $STEPS" >&2
+fi
+OUTPUT_DIR="${HOME}/output_dir"
+# Everything that depends on the environment or computed defaults goes here
+args+=(
+    --output_dir="$OUTPUT_DIR/$NAME"
+    --output_name="$NAME"
+    --log_prefix="$NAME-"
+    --logging_dir="$OUTPUT_DIR/logs"
+    --max_train_steps="$STEPS"
+    --dataset_config="$TRAINING_DIR/config.toml"
+    #--train_data_dir="$TRAINING_DIR"
+    --sample_prompts="$TRAINING_DIR/sample-prompts.txt"
+)
+# ===== Environment Setup =====
+# Provides validate_environment, setup_conda_env, store_commits_hashes and
+# run_training_script.
+source "$HOME/toolkit/zsh/train_functions.zsh"
+# Activate conda before validating: validate_environment prints $CONDA_PREFIX,
+# which should be the environment the training actually runs in. This is the
+# same order tp-template uses.
+setup_conda_env "sdscripts"
+LYCORIS_REPO=$(get_lycoris_repo)
+# Must stay ahead of store_commits_hashes: that function creates
+# $OUTPUT_DIR/$NAME, and validate_environment aborts when it already exists.
+validate_environment
+store_commits_hashes "$SD_REPO" "$LYCORIS_REPO"
+# ===== Run Training Script =====
 run_training_script "$SD_REPO/$SD_SCRIPT" "${args[@]}" "$@"
 # Delete the output directory if it is empty
 if [[ ! -e "$OUTPUT_DIR/$NAME/samples/*" ]] && [[ ! -e "$OUTPUT_DIR/$NAME"/*.safetensors ]]; then
     echo "No samples or model files found, deleting empty output directory"
     rm -rf "$OUTPUT_DIR/$NAME"
+fi

training_scripts/tp-template CHANGED Viewed

@@ -1,37 +1,11 @@
 #!/usr/bin/env zsh
 set -e -o pipefail
-local name_default="by_"
-NAME="${NAME:-"$name_default"}"
-local training_dir_default="${HOME}/datasets/${NAME}"
-TRAINING_DIR=${TRAINING_DIR:-"$training_dir_default"}
-OUTPUT_DIR="${HOME}/output_dir"
 SD_SCRIPT="${SD_SCRIPT:-sdxl_train_network.py}"
 SD_REPO="${SD_REPO:-$HOME/source/repos/sd-scripts-sd3}"
-echo "\e[35mNAME\e[0m: $NAME, \e[35mTRAINING_DIR\e[0m: $TRAINING_DIR, \e[35mOUTPUT_DIR\e[0m: $OUTPUT_DIR"
-[[ ! -d "$TRAINING_DIR" ]] && echo "ERROR: Training directory not found" && exit 1
-if [[ -d "$OUTPUT_DIR/$NAME" ]]; then
-    echo "ERROR: Output directory already exists: $OUTPUT_DIR/$NAME"
-    exit 1
-fi
-source "$HOME/toolkit/zsh/train_functions.zsh"
-# Extract steps from name
-STEPS=$(extract_steps_from_name "$NAME" "4096")
-# Initialize conda and activate environment
-setup_conda_env "sdscripts"
-# Store the commits hashes for libraries, copy the script to the output directory
-LYCORIS_REPO=$(get_lycoris_repo) # Python package editable path
-store_commits_hashes "$SD_REPO" "$LYCORIS_REPO"
 # alpha=1 @ dim=16 is the same lr than alpha=4 @ dim=256
 # --min_snr_gamma=1
 args=(
@@ -45,14 +19,9 @@ args=(
     # Model
     --pretrained_model_name_or_path=/home/kade/ComfyUI/models/checkpoints/ponyDiffusionV6XL_v6StartWithThisOne.safetensors
     # Output, logging
-    --output_dir="$OUTPUT_DIR/$NAME"
-    --output_name="$NAME"
-    --log_prefix="$NAME-"
     --log_with=tensorboard
-    --logging_dir="$OUTPUT_DIR/logs"
     --seed=1728871242
     # Dataset
-    --train_data_dir="$TRAINING_DIR"
     --dataset_repeats=1
     --resolution="1024,1024"
     --enable_bucket
@@ -115,18 +84,41 @@ args=(
     --save_every_n_steps=100
     # Sampling
     --sample_every_n_steps=10
-    --sample_prompts="$TRAINING_DIR/sample-prompts.txt"
     --sample_sampler="euler_a"
     --sample_at_first
     --caption_extension=".txt"
 )
 run_training_script "$SD_REPO/$SD_SCRIPT" "${args[@]}" "$@"
-# Delete the output directory if it is empty
-# that is if there is no samples under $OUTPUT_DIR/$NAME/samples/ and no safetensors under $OUTPUT_DIR/$NAME/
-if [[ ! -e "$OUTPUT_DIR/$NAME/samples/*" ]] && [[ ! -e "$OUTPUT_DIR/$NAME"/*.safetensors ]]; then
-    echo "No samples or model files found, deleting empty output directory"
-    rm -rf "$OUTPUT_DIR/$NAME"
-fi

 #!/usr/bin/env zsh
 # Stop at the first failing command, including a failure in any pipeline stage.
 set -e -o pipefail
 # Run name. Further down, the part before the last '-' becomes the dataset
 # directory name and the trailing digits become the default step count.
+NAME=test-deleteme-v0s100
 # Training entry point and the repository that holds it; both can be
 # overridden from the environment.
 SD_SCRIPT="${SD_SCRIPT:-sdxl_train_network.py}"
 SD_REPO="${SD_REPO:-$HOME/source/repos/sd-scripts-sd3}"
 # alpha=1 @ dim=16 is the same lr as alpha=4 @ dim=256
 # --min_snr_gamma=1
 # Arguments with fixed values. The ones derived from NAME, TRAINING_DIR and
 # OUTPUT_DIR are appended later with args+=( ... ).
 args=(
     # Model
     --pretrained_model_name_or_path=/home/kade/ComfyUI/models/checkpoints/ponyDiffusionV6XL_v6StartWithThisOne.safetensors
     # Output, logging
     --log_with=tensorboard
     --seed=1728871242
     # Dataset
     --dataset_repeats=1
     --resolution="1024,1024"
     --enable_bucket
     --save_every_n_steps=100
     # Sampling
     --sample_every_n_steps=10
     --sample_sampler="euler_a"
     --sample_at_first
     --caption_extension=".txt"
 )
+# ===== Default variables =====
+# Dataset name: NAME without its last '-'-separated suffix (eg. -v1s4096)
+DATASET_NAME="${NAME%-*}"
+# TRAINING_DIR can be overridden from the environment
+TRAINING_DIR="${TRAINING_DIR:-"${HOME}/datasets/${DATASET_NAME}"}"
+# Step count: an explicit STEPS wins, otherwise use the trailing digits of NAME
+# (eg. 4096 for "foo-v1s4096").
+STEPS="${STEPS:-"${NAME##*[^0-9]}"}"
+# A NAME that does not end in a digit yields an empty string; fall back to 4096
+# (the default the old extract_steps_from_name helper used) instead of passing
+# an empty --max_train_steps= to the trainer.
+if [[ -z "$STEPS" ]]; then
+    STEPS=4096
+    echo "⚠️  No step count found in NAME. Using default: $STEPS" >&2
+fi
+OUTPUT_DIR="${HOME}/output_dir"
+# Everything that depends on the environment or computed defaults goes here
+args+=(
+    --output_dir="$OUTPUT_DIR/$NAME"
+    --output_name="$NAME"
+    --log_prefix="$NAME-"
+    --logging_dir="$OUTPUT_DIR/logs"
+    --max_train_steps="$STEPS"
+    --dataset_config="$TRAINING_DIR/config.toml"
+    #--train_data_dir="$TRAINING_DIR"
+    --sample_prompts="$TRAINING_DIR/sample-prompts.txt"
+)
+# ===== Environment Setup =====
 # Provides validate_environment, setup_conda_env, store_commits_hashes and
 # run_training_script.
+source "$HOME/toolkit/zsh/train_functions.zsh"
 # Conda is activated before validation; validate_environment prints
 # $CONDA_PREFIX, so presumably this makes it report the activated environment.
+setup_conda_env "sdscripts"
+LYCORIS_REPO=$(get_lycoris_repo)
 # Keep this ahead of store_commits_hashes: that function creates
 # $OUTPUT_DIR/$NAME, and validate_environment exits when it already exists.
+validate_environment
+store_commits_hashes "$SD_REPO" "$LYCORIS_REPO"
+# ===== Run Training Script =====
 # Script-level args first, then any extra arguments given on the command line.
 run_training_script "$SD_REPO/$SD_SCRIPT" "${args[@]}" "$@"

zsh/train_functions.zsh CHANGED Viewed

@@ -35,10 +35,10 @@ run_training_script() {
     done
     if [[ -n "$DEBUG" ]]; then
-        echo "This was a dry run, exiting."
         local exit_code=0
     else
-        python "$(basename "$script_path")" "${args_array[@]}"
         local exit_code=$?
     fi
@@ -48,6 +48,19 @@ run_training_script() {
     return $exit_code
 }
 # Sets up and activates a specified Conda environment.
 #
 # Parameters:
@@ -90,37 +103,6 @@ setup_conda_env() {
     fi
 }
-# Extracts the number of steps from a given name or uses a default value if not found.
-#
-# Parameters:
-# - name: The name from which to extract the step count.
-# - default_steps: The default number of steps to use if extraction fails.
-#
-# Behavior:
-# - Uses a regular expression to find a number at the end of the name.
-# - If no valid step count is found, it uses the default value and prints a warning.
-# - If the extracted step count is less than 1, it also uses the default value and prints a warning.
-# - Prints a confirmation message if a valid step count is found.
-#
-# Returns:
-# - The extracted or default step count.
-extract_steps_from_name() {
-    local name="$1"
-    local default_steps="$2"
-    # More robust step extraction with validation
-    local steps=$(echo "$name" | grep -oE '[0-9]+$')
-    if [[ -z "$steps" ]]; then
-        steps="$default_steps"
-        echo "⚠️  No step count found in NAME. Using default: \e[35m$steps\e[0m" >&2
-    elif ((steps < 1)); then
-        echo "⚠️  Invalid step count. Using default: \e[35m$default_steps\e[0m" >&2
-        steps="$default_steps"
-    fi
-    echo "$steps"
-}
 # Stores the commit hashes of specified Git repositories and copies the script to an output directory.
 #
 # Parameters:
@@ -141,8 +123,6 @@ store_commits_hashes() {
     local output_dir="$OUTPUT_DIR/$NAME"
     # Define the path for the output file
     local output_file="$output_dir/repos.git"
-    # If DEBUG is set, print the output directory path
-    [[ -n "$DEBUG" ]] && echo "Output directory: $output_dir"
     # Create the output directory if it doesn't exist
     [[ ! -d "$output_dir" ]] && mkdir -p "$output_dir"
     # Create or truncate the output file
@@ -158,15 +138,15 @@ store_commits_hashes() {
                 # Get the checked-out branch
                 if local branch_name=$(git -C "$repo_path" rev-parse --abbrev-ref HEAD 2>/dev/null); then
                     echo "$repo_path: ($branch_name) $commit_sha" >>"$output_file"
-                    summary+="✓ $repo_name: $commit_sha ($branch_name) $repo_path\n"
                 else
                     echo "$repo_path: $commit_sha (Failed to get branch)" >>"$output_file"
-                    summary+="⚠️  $repo_name: $commit_sha (Failed to get branch) $repo_path\n"
                     res=1
                 fi
             else
                 echo "$repo_path: Git command failed" >>"$output_file"
-                summary+="⚠️  $repo_name: Git command failed $repo_path\n"
                 res=1
             fi
         else
@@ -182,14 +162,19 @@ store_commits_hashes() {
     [[ -n "$DEBUG" ]] && echo "Copied $script_path to $output_dir"
     # Add script hash with error handling
-    if local script_sha=$(sha1sum "$script_path" | cut -f1 -d' '); then
-        echo "$script_path: $script_sha" >>"$output_file"
-        summary+="✓ $ZSH_SCRIPT: $script_sha\n"
-    else
-        echo "$script_path: Failed to generate SHA-1" >>"$output_file"
-        summary+="⚠️  Failed to generate script SHA-1\n"
-        res=1
-    fi
     echo -e "$summary"
     return $res

     done
     if [[ -n "$DEBUG" ]]; then
+        echo "This was a dry run, exiting." | tee "$OUTPUT_DIR/$NAME/sdscripts.log"
         local exit_code=0
     else
+        python "$(basename "$script_path")" "${args_array[@]}" | tee "$OUTPUT_DIR/$NAME/sdscripts.log"
         local exit_code=$?
     fi
     return $exit_code
 }
+# Print environment setup than validate it
+validate_environment() {
+    echo "\e[35moutput_name\e[0m: $NAME, \e[35msteps\e[0m: $STEPS, \e[35mtraining_dir\e[0m: $(realpath --relative-to=. $TRAINING_DIR), \e[35moutput_dir\e[0m: $(realpath --relative-to=. "$OUTPUT_DIR/$NAME")"
+    echo "\e[35mconda_env\e[0m: $CONDA_PREFIX"
+    # ===== Validation =====
+    [[ ! -d "$TRAINING_DIR" ]] && echo "ERROR: Training directory not found" && exit 1
+    if [[ -d "$OUTPUT_DIR/$NAME" ]]; then
+        echo "ERROR: Output directory already exists: $OUTPUT_DIR/$NAME"
+        exit 1
+    fi
+}
 # Sets up and activates a specified Conda environment.
 #
 # Parameters:
     fi
 }
 # Stores the commit hashes of specified Git repositories and copies the script to an output directory.
 #
 # Parameters:
     local output_dir="$OUTPUT_DIR/$NAME"
     # Define the path for the output file
     local output_file="$output_dir/repos.git"
     # Create the output directory if it doesn't exist
     [[ ! -d "$output_dir" ]] && mkdir -p "$output_dir"
     # Create or truncate the output file
                 # Get the checked-out branch
                 if local branch_name=$(git -C "$repo_path" rev-parse --abbrev-ref HEAD 2>/dev/null); then
                     echo "$repo_path: ($branch_name) $commit_sha" >>"$output_file"
+                    summary+="✓ $repo_name: $repo_path ${commit_sha:0:8} ($branch_name)\n"
                 else
                     echo "$repo_path: $commit_sha (Failed to get branch)" >>"$output_file"
+                    summary+="⚠️  $repo_name: $repo_path ${commit_sha:0:8} (Failed to get branch)\n"
                     res=1
                 fi
             else
                 echo "$repo_path: Git command failed" >>"$output_file"
+                summary+="⚠️  $repo_name: $repo_path (Git command failed) \n"
                 res=1
             fi
         else
     [[ -n "$DEBUG" ]] && echo "Copied $script_path to $output_dir"
     # Add script hash with error handling
+    local script_sha=$(sha1sum "$script_path" | cut -f1 -d' ')
+    echo "$script_path: $script_sha" >>"$output_file"
+    summary+="✓ Training script: $ZSH_SCRIPT ${script_sha:0:8}\n"
+    # Computes hash for "$TRAINING_DIR/config.toml" and "$TRAINING_DIR/sample-prompts.txt" then copy them to the output directory
+    local config_sha=$(sha1sum "$TRAINING_DIR/config.toml" | cut -f1 -d' ')
+    local prompts_sha=$(sha1sum "$TRAINING_DIR/sample-prompts.txt" | cut -f1 -d' ')
+    cp "$TRAINING_DIR/config.toml" "$output_dir/config.toml"
+    cp "$TRAINING_DIR/sample-prompts.txt" "$output_dir/sample-prompts.txt"
+    echo "$TRAINING_DIR/config.toml: $config_sha" >>"$output_file"
+    echo "$TRAINING_DIR/sample-prompts.txt: $prompts_sha" >>"$output_file"
+    summary+="✓ Training config: $TRAINING_DIR/config.toml ${config_sha:0:8}\n"
+    summary+="✓ Training prompts: $TRAINING_DIR/sample-prompts.txt ${prompts_sha:0:8}\n"
     echo -e "$summary"
     return $res