Merge branch 'main' of hf.co:/k4d3/toolkit

Browse files

Files changed (10) hide show

.zshrc +77 -13
audiogen_medium.py → audiogen_medium +2 -0
concat_captions +64 -0
extract_description +107 -0
ocr +33 -0
pick_caption +27 -0
steal_sdscripts_metadata +0 -0
tensorboard_that_errors_on_conda +8 -0
zsh/gallery-dl.zsh +10 -0
zsh/install_members.zsh +27 -0

.zshrc CHANGED Viewed

@@ -42,6 +42,8 @@ ZSH_THEME="kade"
 # COMPLETION_WAITING_DOTS="true"
 # DISABLE_UNTRACKED_FILES_DIRTY="true"
 # Set the system language and locale to Japanese UTF-8
 export LANG=ja_JP.UTF-8
 export LC_ALL=ja_JP.UTF-8
@@ -126,6 +128,12 @@ source $ZSH/oh-my-zsh.sh
 # - CUDA binary directory
 export PATH=$PATH:$HOME/source/repos/dataset-tools/target/x86_64-unknown-linux-gnu/release:$HOME/.cargo/bin:$HOME/miniconda3/bin:$HOME/toolkit:$HOME/db/redis-stable/src:$HOME/db/postgresql/bin:$HOME/.local/bin:/opt/cuda/bin
 # Extend the LD_LIBRARY_PATH to include:
 # - Conda environment's library directory
 # - CUDA library directory for x86_64 Linux
@@ -255,17 +263,17 @@ function re() {
 # It searches for all .txt files in the specified directory and its subdirectories.
 # If a file contains the specified tag, the function removes the tag from its original position
 # and prepends it to the beginning of the file.
-# Usage: rejiggle_captions <tag> <directory>
-rejiggle_captions() {
   local tag="$1"
   local dir="$2"
   if [[ -z "$tag" || -z "$dir" ]]; then
-    echo "Usage: rejiggle_captions <tag> <directory>"
     return 1
   fi
-  find "$dir" -type f -name "*.txt" | while read -r file; do
     if grep -q "$tag" "$file"; then
       sed -i "s/$tag//g" "$file"
       sed -i "1s/^/$tag, /" "$file"
@@ -379,14 +387,14 @@ list_word_freqs() {
 # Function: sample_prompts
 # Description:
-#   This function takes a sample of the tags (or captions) in a target training directory.
 #   It reads and displays the contents of all .txt files in the specified directory,
-#   providing a quick overview of the tags or captions used for training.
 #
 # Usage: sample_prompts <target_directory>
 #
 # Parameters:
-#   - target_directory: The directory containing the .txt files with tags or captions.
 #
 # Output:
 #   Prints the contents of each .txt file in the target directory, separated by newlines.
@@ -400,16 +408,16 @@ sample_prompts() {
 }
 # replace_comma_with_keep_tags
-# Description: This function replaces the specified occurrence of a comma with " |||" in all *.txt files
 #              in all subdirectories of a target directory or the current directory when no path is passed.
 # Usage: replace_comma_with_keep_tags <occurrence_number> [target_directory]
 # Parameters:
 #   - occurrence_number: The occurrence number of the comma to be replaced (e.g., 1 for the first occurrence).
-#   - target_directory (optional): The target directory to search for *.txt files. If not provided, the current directory is used.
 # Example:
 #   replace_comma_with_keep_tags 2 /path/to/directory
 #   replace_comma_with_keep_tags 1
-replace_comma_with_keep_tags() {
   local occurrence_number=$1
   local target_directory=${2:-.}
@@ -434,6 +442,32 @@ replace_comma_with_keep_tags() {
   done
 }
 # Function: display_custom_help
 # Description:
 #   This function displays a custom help menu with various commands, environment information,
@@ -664,14 +698,15 @@ replace_text_in_files() {
 # This script adds a specified prefix to the beginning of each text file in a given directory.
 # If the prefix already exists in the text file, it moves the prefix to the front of the text file without leaving extra commas or spaces.
-# Usage: inject_to_captions <directory> <prefix>
 # Arguments:
 #   <directory> - The directory containing the text files to be modified.
 #   <prefix> - The prefix to be added to the beginning of each text file.
 # The script checks if the specified directory exists and iterates over each text file in the directory.
 # For each text file, it creates a temporary file with the modified content and then replaces the original file with the temporary file.
 # If the directory does not exist, it prints an error message.
-inject_to_captions() {
     local dir="$1"
     local prefix="$2"
     if [[ -d "$dir" ]]; then
@@ -697,6 +732,32 @@ inject_to_captions() {
     fi
 }
 # Function to update git repositories in subdirectories
 update_dir() {
     local target_dir="${1:-.}"
@@ -918,6 +979,8 @@ filePath = '$filePath'
 print(json.loads(safetensors.safe_open(filePath, 'np').metadata().get('ss_seed', 'Not found')))"
 }
 source ~/toolkit/zsh/png2mp4.zsh
 # Function: c
@@ -952,7 +1015,8 @@ source ~/toolkit/zsh/png2mp4.zsh
 c() {
     cd ~/ComfyUI &&
     conda activate comfyui
-    python main.py --listen 0.0.0.0 --preview-method taesd --use-pytorch-cross-attention --disable-xformers --front-end-version Comfy-Org/ComfyUI_frontend@latest --fast
 }
 #
 # Usage:

 # COMPLETION_WAITING_DOTS="true"
 # DISABLE_UNTRACKED_FILES_DIRTY="true"
+export GIN_MODE=release
 # Set the system language and locale to Japanese UTF-8
 export LANG=ja_JP.UTF-8
 export LC_ALL=ja_JP.UTF-8
 # - CUDA binary directory
 export PATH=$PATH:$HOME/source/repos/dataset-tools/target/x86_64-unknown-linux-gnu/release:$HOME/.cargo/bin:$HOME/miniconda3/bin:$HOME/toolkit:$HOME/db/redis-stable/src:$HOME/db/postgresql/bin:$HOME/.local/bin:/opt/cuda/bin
+# Function: rconda
+# Description:
+#   Removes the base Miniconda directories from the current shell's search paths:
+#   $HOME/miniconda3/bin and $HOME/miniconda3/condabin from PATH, and
+#   $HOME/miniconda3/lib from LD_LIBRARY_PATH. Both variables are re-exported.
+#   Entries are compared as whole, literal path components (with or without a
+#   trailing slash), so directories that merely contain one of these strings
+#   (for example $HOME/miniconda3/lib64) are left in place.
+# Usage: rconda
+rconda() {
+    local conda_root="$HOME/miniconda3"
+    local new_path new_ld_path
+    # grep -x -F = whole-line, fixed-string match: the directory is never interpreted as a regex.
+    new_path=$(printf '%s\n' "$PATH" | tr ':' '\n' | grep -vxF -e "$conda_root/bin" -e "$conda_root/bin/" -e "$conda_root/condabin" -e "$conda_root/condabin/" | tr '\n' ':' | sed 's/:$//')
+    new_ld_path=$(printf '%s\n' "${LD_LIBRARY_PATH-}" | tr ':' '\n' | grep -vxF -e "$conda_root/lib" -e "$conda_root/lib/" | tr '\n' ':' | sed 's/:$//')
+    # Assign only after both pipelines ran, so the tools above are still found via the old PATH.
+    export PATH="$new_path"
+    export LD_LIBRARY_PATH="$new_ld_path"
+}
 # Extend the LD_LIBRARY_PATH to include:
 # - Conda environment's library directory
 # - CUDA library directory for x86_64 Linux
 # It searches for all .txt files in the specified directory and its subdirectories.
 # If a file contains the specified tag, the function removes the tag from its original position
 # and prepends it to the beginning of the file.
+# Usage: rejiggle_tags <tag> <directory>
+rejiggle_tags() {
   local tag="$1"
   local dir="$2"
   if [[ -z "$tag" || -z "$dir" ]]; then
+    echo "Usage: rejiggle_tags <tag> <directory>"
     return 1
   fi
+  find "$dir" -type f -name "*.tags" | while read -r file; do
     if grep -q "$tag" "$file"; then
       sed -i "s/$tag//g" "$file"
       sed -i "1s/^/$tag, /" "$file"
 # Function: sample_prompts
 # Description:
+#   This function takes a sample of the tags in a target training directory.
 #   It reads and displays the contents of all .txt files in the specified directory,
+#   providing a quick overview of the tags used for training.
 #
 # Usage: sample_prompts <target_directory>
 #
 # Parameters:
+#   - target_directory: The directory containing the .txt files with tags.
 #
 # Output:
 #   Prints the contents of each .txt file in the target directory, separated by newlines.
 }
 # replace_comma_with_keep_tags
+# Description: This function replaces the specified occurrence of a comma with " |||" in all *.tags files
 #              in all subdirectories of a target directory or the current directory when no path is passed.
 # Usage: replace_comma_with_keep_tags <occurrence_number> [target_directory]
 # Parameters:
 #   - occurrence_number: The occurrence number of the comma to be replaced (e.g., 1 for the first occurrence).
+#   - target_directory (optional): The target directory to search for *.tags files. If not provided, the current directory is used.
 # Example:
 #   replace_comma_with_keep_tags 2 /path/to/directory
 #   replace_comma_with_keep_tags 1
+replace_comma_with_keep_tags_txt() {
   local occurrence_number=$1
   local target_directory=${2:-.}
   done
 }
+replace_comma_with_keep_tags() {
+  local occurrence_number=$1
+  local target_directory=${2:-.}
+  if [[ -z "$occurrence_number" ]]; then
+    echo "Error: occurrence_number is required."
+    return 1
+  fi
+  find "$target_directory" -type f -name "*.tags" | while read -r file; do
+    awk -v occurrence="$occurrence_number" '{
+      count = 0
+      for (i = 1; i <= NF; i++) {
+        if ($i ~ /,/) {
+          count++
+          if (count == occurrence) {
+            gsub(/,/, " |||", $i)
+          }
+        }
+      }
+      print
+    }' "$file" > tmpfile && mv tmpfile "$file"
+  done
+}
 # Function: display_custom_help
 # Description:
 #   This function displays a custom help menu with various commands, environment information,
 # This script adds a specified prefix to the beginning of each text file in a given directory.
 # If the prefix already exists in the text file, it moves the prefix to the front of the text file without leaving extra commas or spaces.
+# Usage: inject_to_txt <directory> <prefix>
 # Arguments:
 #   <directory> - The directory containing the text files to be modified.
 #   <prefix> - The prefix to be added to the beginning of each text file.
 # The script checks if the specified directory exists and iterates over each text file in the directory.
 # For each text file, it creates a temporary file with the modified content and then replaces the original file with the temporary file.
 # If the directory does not exist, it prints an error message.
+inject_to_txt() {
     local dir="$1"
     local prefix="$2"
     if [[ -d "$dir" ]]; then
     fi
 }
+inject_to_tags() {
+    local dir="$1"
+    local prefix="$2"
+    if [[ -d "$dir" ]]; then
+        for file in "$dir"/*.tags; do
+            if [[ -f "$file" ]]; then
+                if grep -q "$prefix" "$file"; then
+                    # Move the existing prefix to the front of the text file without leaving extra commas or spaces
+                    local temp_file=$(mktemp)
+                    sed "s/$prefix//" "$file" | sed "1s/^/${prefix}, /" | sed 's/^, //' | sed 's/,,/,/g' | sed 's/, ,/,/g' | sed 's/ ,/,/g' > "$temp_file"
+                    mv "$temp_file" "$file"
+                    echo "Moved '${prefix}' to the front of $file"
+                else
+                    # Use a temporary file to store the modified content
+                    local temp_file=$(mktemp)
+                    echo "${prefix}, $(cat "$file")" | sed 's/,,/,/g' | sed 's/, ,/,/g' | sed 's/ ,/,/g' > "$temp_file"
+                    mv "$temp_file" "$file"
+                    echo "Added '${prefix}, ' to the front of $file"
+                fi
+            fi
+        done
+    else
+        echo "Directory $dir does not exist."
+    fi
+}
 # Function to update git repositories in subdirectories
 update_dir() {
     local target_dir="${1:-.}"
 print(json.loads(safetensors.safe_open(filePath, 'np').metadata().get('ss_seed', 'Not found')))"
 }
+source ~/toolkit/zsh/install_members.zsh
+source ~/toolkit/zsh/gallery-dl.zsh
 source ~/toolkit/zsh/png2mp4.zsh
 # Function: c
 c() {
+    # Start ComfyUI from ~/ComfyUI inside the "comfyui" conda environment.
+    # Each step bails out on failure so the server is never launched from the
+    # wrong directory or with the wrong interpreter.
+    cd ~/ComfyUI || return
+    conda activate comfyui || return
+    python main.py --listen 0.0.0.0 --preview-method taesd --use-pytorch-cross-attention --disable-xformers --fast
+    # --front-end-version /home/kade/source/repos/ComfyUI_frontend/dist
 }
 #
 # Usage:

audiogen_medium.py → audiogen_medium RENAMED Viewed

@@ -1,3 +1,5 @@
 import sys
 import torchaudio
 from audiocraft.models import AudioGen

+#!/usr/bin/env python
 import sys
 import torchaudio
 from audiocraft.models import AudioGen

concat_captions ADDED Viewed

	@@ -0,0 +1,64 @@

+#!/usr/bin/env python3
+"""
+This script walks through a directory, identifies image files, and checks for the existence of corresponding
+.caption and .tags files. It then concatenates the contents of .caption and .tags files into the .txt files.
+Usage:
+    - Place the script in the directory containing the image files.
+    - Run the script to concatenate .caption and .tags files into .txt files.
+    - Use the dry_run flag to preview the changes without writing to the .txt files.
+Functions:
+    get_files(path): Walks through the directory and yields image files along with their .caption and .tags files.
+    concat(caption_path, tags_path, txt_path, dry_run=False): Concatenates the contents of .caption and .tags files into the .txt file.
+"""
+from pathlib import Path
+import os
+FILE_EXTS = {".png", ".jpg", ".jpeg", ".tiff", ".bmp", ".gif", ".jxl"}
+def get_files(path):
+    path = Path(path)
+    # Walk the directory, looking for image files
+    for root, dirs, files in os.walk(path):
+        root = path / root
+        for file in files:
+            file = root / file
+            if file.suffix not in FILE_EXTS:
+                continue
+            caption = file.with_suffix(".caption")
+            tags = file.with_suffix(".tags")
+            txt = file.with_suffix(".txt")
+            if not caption.exists():
+                print(f"{caption} does not exist")
+            if not tags.exists():
+                print(f"{tags} does not exist")
+            yield file, caption, tags, txt
+def concat(caption_path, tags_path, txt_path, dry_run=False):
+    with open(caption_path, "r") as f:
+        caption = f.read().strip()
+    with open(tags_path, "r") as f:
+        tags = f.read().strip(", \n")
+    txt = f"{tags}, {caption}"
+    if dry_run:
+        print(f"{txt_path}:")
+        print(txt)
+        print()
+    else:
+        with open(txt_path, 'w') as f:
+            f.write(txt)
+        print(f"wrote {txt_path}")
+if __name__ == "__main__":
+    dry_run = False
+    for f in get_files("."):
+        concat(*f[1:], dry_run=dry_run)

extract_description ADDED Viewed

	@@ -0,0 +1,107 @@

+#!/usr/bin/env python
+"""
+JSON Caption Extractor
+This script processes JSON files in the specified directory (or current directory if not specified)
+and its subdirectories, extracting cleaned captions from them. It performs the following tasks:
+1. Recursively searches for JSON files in the specified directory and its subdirectories.
+2. Reads each JSON file found.
+3. Extracts the 'description' field from each JSON file.
+4. Cleans the description by removing HTML links, newline characters, and extra spaces.
+5. Writes the cleaned description to a new file with the same name as the input file
+   but with a '.caption' extension in the same directory as the input file.
+6. Provides logging information about the processing status and any errors encountered.
+Usage:
+    python extract_description [-v] [-d DIRECTORY]
+Arguments:
+    -d, --directory  The directory to process (optional, defaults to current directory)
+    -v, --verbose    Enable verbose logging (optional)
+The script processes all JSON files in the specified directory and its subdirectories.
+"""
+import json
+import os
+import re
+import argparse
+import logging
+from typing import Dict, Any
+def clean_description(description: str) -> str:
+    """
+    Clean the input description by removing HTML links, newline characters, and extra spaces.
+    Args:
+        description (str): The original description string.
+    Returns:
+        str: The cleaned description string.
+    """
+    # Remove HTML links
+    description = re.sub(r'<a href=".*?".*?>(.*?)</a>', r'\1', description)
+    # Remove newline characters
+    description = description.replace('\n', ' ')
+    # Remove extra spaces
+    description = re.sub(r'\s+', ' ', description).strip()
+    return description
+def process_json_file(file_path: str) -> None:
+    """
+    Process a single JSON file by extracting the description, cleaning it,
+    and writing it to a new caption file.
+    Args:
+        file_path (str): The path to the JSON file to process.
+    Raises:
+        Exception: If there's an error reading the JSON file or writing the output file.
+    """
+    try:
+        with open(file_path, 'r') as json_file:
+            data: Dict[str, Any] = json.load(json_file)
+        description = data.get('description', '')
+        cleaned_description = clean_description(description)
+        output_file = os.path.splitext(file_path)[0] + '.caption'
+        with open(output_file, 'w') as caption_file:
+            caption_file.write(cleaned_description)
+        logging.info(f"Processed {file_path} -> {output_file}")
+    except Exception as e:
+        logging.error(f"Error processing {file_path}: {str(e)}")
+def process_directory(directory: str) -> None:
+    """
+    Recursively process all JSON files in the specified directory and its subdirectories.
+    Args:
+        directory (str): The path to the directory to process.
+    """
+    for root, _, files in os.walk(directory):
+        for file in files:
+            if file.lower().endswith('.json'):
+                file_path = os.path.join(root, file)
+                process_json_file(file_path)
+def main():
+    parser = argparse.ArgumentParser(description="Process JSON files and extract captions.")
+    parser.add_argument('-d', '--directory', default='.', help="Directory to process (default: current directory)")
+    parser.add_argument('-v', '--verbose', action='store_true', help="Enable verbose logging")
+    args = parser.parse_args()
+    log_level = logging.DEBUG if args.verbose else logging.INFO
+    logging.basicConfig(level=log_level, format='%(asctime)s - %(levelname)s - %(message)s')
+    directory = os.path.abspath(args.directory)
+    logging.info(f"Processing directory: {directory}")
+    process_directory(directory)
+    logging.info("Processing complete.")
+if __name__ == "__main__":
+    main()

ocr ADDED Viewed

	@@ -0,0 +1,33 @@

+#!/usr/bin/env python
+from transformers import AutoModel, AutoTokenizer
+tokenizer = AutoTokenizer.from_pretrained('ucaslcl/GOT-OCR2_0', trust_remote_code=True)
+model = AutoModel.from_pretrained('ucaslcl/GOT-OCR2_0', trust_remote_code=True, low_cpu_mem_usage=True, device_map='cuda', use_safetensors=True, pad_token_id=tokenizer.eos_token_id)
+model = model.eval().cuda()
+# input your test image
+image_file = 'xxx.jpg'
+# plain texts OCR
+res = model.chat(tokenizer, image_file, ocr_type='ocr')
+# format texts OCR:
+# res = model.chat(tokenizer, image_file, ocr_type='format')
+# fine-grained OCR:
+# res = model.chat(tokenizer, image_file, ocr_type='ocr', ocr_box='')
+# res = model.chat(tokenizer, image_file, ocr_type='format', ocr_box='')
+# res = model.chat(tokenizer, image_file, ocr_type='ocr', ocr_color='')
+# res = model.chat(tokenizer, image_file, ocr_type='format', ocr_color='')
+# multi-crop OCR:
+# res = model.chat_crop(tokenizer, image_file, ocr_type='ocr')
+# res = model.chat_crop(tokenizer, image_file, ocr_type='format')
+# render the formatted OCR results:
+# res = model.chat(tokenizer, image_file, ocr_type='format', render=True, save_render_file = './demo.html')
+print(res)

pick_caption ADDED Viewed

	@@ -0,0 +1,27 @@

+#!/usr/bin/env python
+import os
+import re
+def process_caption_files():
+    for file in os.listdir():
+        if file.endswith(".caption"):
+            # Check if the file already contains processed data
+            with open(file, 'r') as f:
+                lines = f.readlines()
+                if "\n----------\n" not in "".join(lines):
+                    continue
+                for line in lines:
+                    if "----------" in line:
+                        break  # Stop processing after finding the separator
+                content = ''.join(lines[:lines.index(line)])  # Extract text before the separator
+                processed_content = re.sub(r'[\-]+|\n', '', content)  # Remove newlines and separator
+            with open(file, 'w') as f:  # Save the condensed caption back to the same file
+                f.write(processed_content)
+process_caption_files()

steal_sdscripts_metadata CHANGED Viewed

File without changes

tensorboard_that_errors_on_conda ADDED Viewed

	@@ -0,0 +1,8 @@

+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+import re
+import sys
+from tensorboard.main import run_main
+if __name__ == '__main__':
+    sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
+    sys.exit(run_main())

zsh/gallery-dl.zsh ADDED Viewed

	@@ -0,0 +1,10 @@

+# Override the gallery-dl command so it always runs from ~/datasets.
+# The wrapper changes the shell's working directory to ~/datasets (and leaves it
+# there), then runs the real gallery-dl with all arguments passed through unchanged.
+# If ~/datasets cannot be entered, gallery-dl is not started and cd's failure
+# status is returned, so nothing runs in an unintended directory.
+function gallery-dl() {
+    cd ~/datasets || return
+    # `command` bypasses this function and invokes the actual executable.
+    command gallery-dl "$@"
+}

zsh/install_members.zsh ADDED Viewed

	@@ -0,0 +1,27 @@

+#!/bin/zsh
+install_members() {
+  # Read the Cargo.toml file and extract the members
+  # The awk command extracts the lines between 'members = [' and ']'
+  # The sed command removes the first and last lines (which are '[' and ']')
+  # It also removes quotes, commas, and comments, and deletes empty lines
+  members=$(awk '/members = \[/,/\]/' Cargo.toml | sed -e '1d;$d' -e 's/[",]//g' -e 's/#.*//g' -e '/^\s*$/d')
+  # Convert the members string into an array
+  members_array=(${(f)members})
+  # Loop through each member and run the cargo install command
+  for member in $members_array; do
+    # Trim leading and trailing whitespace from each member
+    member=$(echo $member | xargs)
+    echo "Processing $member..."
+    # Change to the member's directory
+    # If the directory change fails, print an error message and exit the function
+    cd $member || { echo "Failed to enter directory $member"; return 1; }
+    # Run the cargo install command with the specified options
+    cargo install -Z build-std --target x86_64-unknown-linux-gnu --path .
+    # Return to the previous directory
+    cd ..
+  done
+}