There were questionable decisions made here...

Browse files

Signed-off-by: Balazs Horvath <[email protected]>

Files changed (3) hide show

.zshrc +55 -0
extract_description +107 -0
pick_caption +27 -0

.zshrc CHANGED Viewed

@@ -42,6 +42,8 @@ ZSH_THEME="kade"
 # COMPLETION_WAITING_DOTS="true"
 # DISABLE_UNTRACKED_FILES_DIRTY="true"
 # Set the system language and locale to Japanese UTF-8
 export LANG=ja_JP.UTF-8
 export LC_ALL=ja_JP.UTF-8
@@ -415,6 +417,32 @@ sample_prompts() {
 # Example:
 #   replace_comma_with_keep_tags 2 /path/to/directory
 #   replace_comma_with_keep_tags 1
 replace_comma_with_keep_tags() {
   local occurrence_number=$1
   local target_directory=${2:-.}
@@ -677,6 +705,33 @@ replace_text_in_files() {
 # The script checks if the specified directory exists and iterates over each text file in the directory.
 # For each text file, it creates a temporary file with the modified content and then replaces the original file with the temporary file.
 # If the directory does not exist, it prints an error message.
 inject_to_tags() {
     local dir="$1"
     local prefix="$2"

 # COMPLETION_WAITING_DOTS="true"
 # DISABLE_UNTRACKED_FILES_DIRTY="true"
+export GIN_MODE=release
 # Set the system language and locale to Japanese UTF-8
 export LANG=ja_JP.UTF-8
 export LC_ALL=ja_JP.UTF-8
 # Example:
 #   replace_comma_with_keep_tags 2 /path/to/directory
 #   replace_comma_with_keep_tags 1
# Replace the Nth comma on every line of each *.txt file found (recursively)
# under a directory with " |||".
#
# Only the Nth comma itself is replaced; the rest of the line, including its
# whitespace, is left untouched. Lines with fewer than N commas are unchanged.
#
# Arguments:
#   $1 - occurrence_number: 1-based index of the comma to replace (required)
#   $2 - target_directory: directory to search (default: current directory)
# Returns:
#   0 on success; 1 on invalid arguments or if any file could not be rewritten.
# Example:
#   replace_comma_with_keep_tags_txt 2 /path/to/directory
#     "a, b, c"  ->  "a, b ||| c"
replace_comma_with_keep_tags_txt() {
  local occurrence_number=$1
  local target_directory=${2:-.}
  local file tmp_file
  local rc=0

  if [[ -z "$occurrence_number" ]]; then
    echo "Error: occurrence_number is required." >&2
    return 1
  fi
  case "$occurrence_number" in
    *[!0-9]* | 0*)
      echo "Error: occurrence_number must be a positive integer." >&2
      return 1
      ;;
  esac
  if [[ ! -d "$target_directory" ]]; then
    echo "Error: directory $target_directory does not exist." >&2
    return 1
  fi

  # NUL-delimited names so paths with spaces or newlines survive intact.
  while IFS= read -r -d '' file; do
    # A private temp file instead of a fixed "tmpfile" in the current
    # directory, which could clobber an unrelated file.
    tmp_file=$(mktemp) || return 1
    if awk -v n="$occurrence_number" '{
      rest = $0
      out = ""
      seen = 0
      # Walk the line comma by comma; swap only the n-th one.
      while ((pos = index(rest, ",")) > 0) {
        seen++
        if (seen == n) {
          out = out substr(rest, 1, pos - 1) " |||"
          rest = substr(rest, pos + 1)
          break
        }
        out = out substr(rest, 1, pos)
        rest = substr(rest, pos + 1)
      }
      print out rest
    }' < "$file" > "$tmp_file"; then
      # Write back through the existing file so its permissions are kept
      # (mv of a mktemp file would reset them to 0600).
      cat -- "$tmp_file" > "$file" || rc=1
    else
      echo "Error: failed to process $file" >&2
      rc=1
    fi
    rm -f -- "$tmp_file"
  done < <(find "$target_directory" -type f -name "*.txt" -print0)

  return "$rc"
}
 replace_comma_with_keep_tags() {
   local occurrence_number=$1
   local target_directory=${2:-.}
 # The script checks if the specified directory exists and iterates over each text file in the directory.
 # For each text file, it creates a temporary file with the modified content and then replaces the original file with the temporary file.
 # If the directory does not exist, it prints an error message.
# Put a prefix at the front of every *.txt file directly inside a directory.
#
# If a file already contains the prefix text, the first occurrence on each
# line is removed and the prefix is re-inserted at the start of the first
# line; otherwise the prefix is simply prepended. Leftover ",,", ", ," and
# " ," sequences are collapsed afterwards.
#
# The prefix is treated as a literal string, never as a regular expression.
# Matching is by substring, so a prefix that is part of a longer word will
# also match that word.
#
# Arguments:
#   $1 - dir: directory whose *.txt files are rewritten (not recursive)
#   $2 - prefix: text to place at the front (required, non-empty)
# Outputs:
#   One status line per rewritten file on stdout; errors on stderr.
# Returns:
#   0 on success; 1 on invalid arguments or if a file could not be rewritten.
inject_to_txt() {
    local dir="$1"
    local prefix="$2"
    local file temp_file move
    local rc=0

    # An empty prefix used to make the sed pipeline fail and truncate files.
    if [[ -z "$prefix" ]]; then
        echo "Error: prefix is required." >&2
        return 1
    fi
    if [[ ! -d "$dir" ]]; then
        echo "Directory $dir does not exist." >&2
        return 1
    fi
    # zsh aborts on a glob with no matches; make it expand to nothing instead
    # (scoped to this function). bash keeps the literal pattern, which the
    # -f test below filters out.
    if [[ -n "${ZSH_VERSION:-}" ]]; then
        setopt local_options null_glob
    fi

    for file in "$dir"/*.txt; do
        [[ -f "$file" ]] || continue

        if grep -qF -- "$prefix" "$file"; then
            move=1
        else
            move=0
        fi

        temp_file=$(mktemp) || return 1
        # The prefix travels through the environment so awk applies no
        # backslash-escape processing to it (as -v would).
        if INJECT_PREFIX="$prefix" awk -v move="$move" '
            BEGIN { p = ENVIRON["INJECT_PREFIX"]; plen = length(p) }
            {
                line = $0
                removed = 0
                if (move) {
                    pos = index(line, p)
                    if (pos > 0) {
                        line = substr(line, 1, pos - 1) substr(line, pos + plen)
                        removed = 1
                    }
                }
                if (NR == 1) line = p ", " line
                if (move) sub(/^, /, "", line)
                gsub(/,,/, ",", line)
                gsub(/, ,/, ",", line)
                gsub(/ ,/, ",", line)
                # Removing a trailing item leaves a dangling ", " behind.
                if (removed) sub(/,[ \t]*$/, "", line)
                print line
            }
            END { if (NR == 0) print p ", " }
        ' < "$file" > "$temp_file" && cat -- "$temp_file" > "$file"; then
            # Writing back with cat keeps the file permissions; mv of a
            # mktemp file would reset them to 0600.
            if [[ "$move" == 1 ]]; then
                printf '%s\n' "Moved '${prefix}' to the front of $file"
            else
                printf '%s\n' "Added '${prefix}, ' to the front of $file"
            fi
        else
            echo "Error: failed to update $file" >&2
            rc=1
        fi
        rm -f -- "$temp_file"
    done

    return "$rc"
}
 inject_to_tags() {
     local dir="$1"
     local prefix="$2"

extract_description ADDED Viewed

	@@ -0,0 +1,107 @@

+#!/usr/bin/env python
+"""
+JSON Caption Extractor
+This script processes JSON files in the specified directory (or current directory if not specified)
+and its subdirectories, extracting cleaned captions from them. It performs the following tasks:
+1. Recursively searches for JSON files in the specified directory and its subdirectories.
+2. Reads each JSON file found.
+3. Extracts the 'description' field from each JSON file.
+4. Cleans the description by removing HTML links, newline characters, and extra spaces.
+5. Writes the cleaned description to a new file with the same name as the input file
+   but with a '.caption' extension in the same directory as the input file.
+6. Provides logging information about the processing status and any errors encountered.
+Usage:
+    python extract_description [-v] [-d DIRECTORY]
+Arguments:
+    -d, --directory  The directory to process (optional, defaults to current directory)
+    -v, --verbose    Enable verbose logging (optional)
+The script processes all JSON files in the specified directory and its subdirectories.
+"""
+import json
+import os
+import re
+import argparse
+import logging
+from typing import Dict, Any
def clean_description(description: str) -> str:
    """
    Clean a description by unwrapping HTML links and normalising whitespace.

    Every ``<a ...>text</a>`` element is replaced by its inner ``text``. The
    match is case-insensitive, accepts any attributes or quoting style, and may
    span several lines. All runs of whitespace (including newlines) are then
    collapsed to a single space and the result is stripped.

    Args:
        description (str): The original description string. ``None`` or an
            empty string yields an empty result.
    Returns:
        str: The cleaned description string.
    """
    if not description:
        return ''
    # DOTALL lets the link text span line breaks; they are only collapsed
    # afterwards, so the anchor must be matched while newlines still exist.
    description = re.sub(
        r'<a\b[^>]*>(.*?)</a>',
        r'\1',
        description,
        flags=re.IGNORECASE | re.DOTALL,
    )
    # \s+ already covers newlines, so a separate replace('\n', ' ') is not needed.
    return re.sub(r'\s+', ' ', description).strip()
def process_json_file(file_path: str) -> None:
    """
    Process a single JSON file by extracting the description, cleaning it,
    and writing it to a new caption file.

    The caption is written next to the input, using the input's name with its
    extension replaced by '.caption'. A missing, null or non-string
    'description' is treated as an empty description. Files are read and
    written as UTF-8.

    Args:
        file_path (str): The path to the JSON file to process.
    Returns:
        None. Failures are logged at ERROR level and never propagate, so one
        bad file does not stop a directory walk.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as json_file:
            data = json.load(json_file)
        if not isinstance(data, dict):
            # A top-level list/scalar has no 'description' field to extract.
            logging.error(f"Error processing {file_path}: top-level JSON value is not an object")
            return
        description = data.get('description', '')
        if not isinstance(description, str):
            if description is not None:
                logging.warning(f"Ignoring non-string description in {file_path}")
            description = ''
        cleaned_description = clean_description(description)
        output_file = os.path.splitext(file_path)[0] + '.caption'
        with open(output_file, 'w', encoding='utf-8') as caption_file:
            caption_file.write(cleaned_description)
        logging.info(f"Processed {file_path} -> {output_file}")
    except Exception as e:
        # Deliberately broad: this is a best-effort batch step.
        logging.error(f"Error processing {file_path}: {str(e)}")
def process_directory(directory: str) -> None:
    """
    Walk a directory tree and hand every JSON file found to process_json_file.

    A file counts as JSON when its name ends in '.json', compared
    case-insensitively.

    Args:
        directory (str): The path to the directory to process.
    """
    for current_dir, _subdirs, filenames in os.walk(directory):
        json_names = [name for name in filenames if name.lower().endswith('.json')]
        for name in json_names:
            process_json_file(os.path.join(current_dir, name))
def main():
    """
    Command-line entry point: parse arguments, configure logging and process
    the requested directory tree.

    Exits with status 2 (via argparse) when the given directory does not
    exist; os.walk would otherwise silently yield nothing and the run would
    be reported as complete.
    """
    parser = argparse.ArgumentParser(description="Process JSON files and extract captions.")
    parser.add_argument('-d', '--directory', default='.', help="Directory to process (default: current directory)")
    parser.add_argument('-v', '--verbose', action='store_true', help="Enable verbose logging")
    args = parser.parse_args()
    log_level = logging.DEBUG if args.verbose else logging.INFO
    logging.basicConfig(level=log_level, format='%(asctime)s - %(levelname)s - %(message)s')
    directory = os.path.abspath(args.directory)
    if not os.path.isdir(directory):
        # parser.error prints usage plus this message to stderr and exits.
        parser.error(f"Directory does not exist: {directory}")
    logging.info(f"Processing directory: {directory}")
    process_directory(directory)
    logging.info("Processing complete.")
if __name__ == "__main__":
    main()

pick_caption ADDED Viewed

	@@ -0,0 +1,27 @@

+#!/usr/bin/env python
+import os
+import re
def process_caption_files(directory='.'):
    """
    Keep only the first caption in every '.caption' file of a directory.

    A file is rewritten only if it contains a separator line made of ten
    dashes ('----------') that is preceded by at least one other line.
    Everything before the first line containing the separator is kept,
    condensed onto a single line with whitespace runs collapsed to one space.
    Hyphens inside the caption text are preserved. Files without a separator
    are left untouched.

    Args:
        directory (str): Directory to scan (not recursive). Defaults to the
            current working directory.
    """
    separator = "----------"
    for name in os.listdir(directory):
        if not name.endswith(".caption"):
            continue
        path = os.path.join(directory, name)
        with open(path, 'r', encoding='utf-8') as f:
            lines = f.readlines()
        # No standalone separator line: nothing to pick from.
        if "\n" + separator + "\n" not in "".join(lines):
            continue
        # Index of the first line that contains the separator.
        cut = next(i for i, line in enumerate(lines) if separator in line)
        # Join lines with spaces rather than deleting newlines (which glued
        # words together) and keep hyphens, which belong to the text.
        condensed = ' '.join(''.join(lines[:cut]).split())
        with open(path, 'w', encoding='utf-8') as f:  # Save the condensed caption back to the same file
            f.write(condensed)
+process_caption_files()