k4d3 committed
Commit 6b5f1f8
1 Parent(s): 3e02680

there were questionable decisions made here..

Signed-off-by: Balazs Horvath <[email protected]>

Files changed (3)
  1. .zshrc +55 -0
  2. extract_description +107 -0
  3. pick_caption +27 -0
.zshrc CHANGED
@@ -42,6 +42,8 @@ ZSH_THEME="kade"
  # COMPLETION_WAITING_DOTS="true"
  # DISABLE_UNTRACKED_FILES_DIRTY="true"
 
+ export GIN_MODE=release
+
  # Set the system language and locale to Japanese UTF-8
  export LANG=ja_JP.UTF-8
  export LC_ALL=ja_JP.UTF-8
@@ -415,6 +417,32 @@ sample_prompts() {
  # Example:
  # replace_comma_with_keep_tags 2 /path/to/directory
  # replace_comma_with_keep_tags 1
+ replace_comma_with_keep_tags_txt() {
+   local occurrence_number=$1
+   local target_directory=${2:-.}
+
+   if [[ -z "$occurrence_number" ]]; then
+     echo "Error: occurrence_number is required."
+     return 1
+   fi
+
+   find "$target_directory" -type f -name "*.txt" | while read -r file; do
+     awk -v occurrence="$occurrence_number" '{
+       count = 0
+       for (i = 1; i <= NF; i++) {
+         if ($i ~ /,/) {
+           count++
+           if (count == occurrence) {
+             gsub(/,/, " |||", $i)
+           }
+         }
+       }
+       print
+     }' "$file" > tmpfile && mv tmpfile "$file"
+   done
+ }
+
+
  replace_comma_with_keep_tags() {
    local occurrence_number=$1
    local target_directory=${2:-.}
@@ -677,6 +705,33 @@ replace_text_in_files() {
  # The script checks if the specified directory exists and iterates over each text file in the directory.
  # For each text file, it creates a temporary file with the modified content and then replaces the original file with the temporary file.
  # If the directory does not exist, it prints an error message.
+
+ inject_to_txt() {
+   local dir="$1"
+   local prefix="$2"
+   if [[ -d "$dir" ]]; then
+     for file in "$dir"/*.txt; do
+       if [[ -f "$file" ]]; then
+         if grep -q "$prefix" "$file"; then
+           # Move the existing prefix to the front of the text file without leaving extra commas or spaces
+           local temp_file=$(mktemp)
+           sed "s/$prefix//" "$file" | sed "1s/^/${prefix}, /" | sed 's/^, //' | sed 's/,,/,/g' | sed 's/, ,/,/g' | sed 's/ ,/,/g' > "$temp_file"
+           mv "$temp_file" "$file"
+           echo "Moved '${prefix}' to the front of $file"
+         else
+           # Use a temporary file to store the modified content
+           local temp_file=$(mktemp)
+           echo "${prefix}, $(cat "$file")" | sed 's/,,/,/g' | sed 's/, ,/,/g' | sed 's/ ,/,/g' > "$temp_file"
+           mv "$temp_file" "$file"
+           echo "Added '${prefix}, ' to the front of $file"
+         fi
+       fi
+     done
+   else
+     echo "Directory $dir does not exist."
+   fi
+ }
+
  inject_to_tags() {
    local dir="$1"
    local prefix="$2"
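
For reference, a minimal usage sketch of the two new helpers once this .zshrc is sourced; the directory path and tag text below are invented for illustration and are not part of the commit:

  # Assumed: a folder of comma-separated .txt tag files (hypothetical path)
  cd ~/datasets/example

  # Replace the 2nd comma in every .txt file with the " |||" keep-marker, e.g.
  #   before: scenery, blue sky, green grass
  #   after:  scenery, blue sky ||| green grass
  replace_comma_with_keep_tags_txt 2 .

  # Prepend a trigger tag to every .txt file (or pull an existing one to the front), e.g.
  #   before: blue sky, green grass
  #   after:  mytag, blue sky, green grass
  inject_to_txt . "mytag"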
extract_description ADDED
@@ -0,0 +1,107 @@
+ #!/usr/bin/env python
+
+ """
+ JSON Caption Extractor
+
+ This script processes JSON files in the specified directory (or current directory if not specified)
+ and its subdirectories, extracting cleaned captions from them. It performs the following tasks:
+
+ 1. Recursively searches for JSON files in the specified directory and its subdirectories.
+ 2. Reads each JSON file found.
+ 3. Extracts the 'description' field from each JSON file.
+ 4. Cleans the description by removing HTML links, newline characters, and extra spaces.
+ 5. Writes the cleaned description to a new file with the same name as the input file
+    but with a '.caption' extension in the same directory as the input file.
+ 6. Provides logging information about the processing status and any errors encountered.
+
+ Usage:
+     python extract_description [-v] [-d DIRECTORY]
+
+ Arguments:
+     -d, --directory   The directory to process (optional, defaults to current directory)
+     -v, --verbose     Enable verbose logging (optional)
+
+ The script processes all JSON files in the specified directory and its subdirectories.
+ """
+
+ import json
+ import os
+ import re
+ import argparse
+ import logging
+ from typing import Dict, Any
+
+ def clean_description(description: str) -> str:
+     """
+     Clean the input description by removing HTML links, newline characters, and extra spaces.
+
+     Args:
+         description (str): The original description string.
+
+     Returns:
+         str: The cleaned description string.
+     """
+     # Remove HTML links
+     description = re.sub(r'<a href=".*?".*?>(.*?)</a>', r'\1', description)
+     # Remove newline characters
+     description = description.replace('\n', ' ')
+     # Remove extra spaces
+     description = re.sub(r'\s+', ' ', description).strip()
+     return description
+
+ def process_json_file(file_path: str) -> None:
+     """
+     Process a single JSON file by extracting the description, cleaning it,
+     and writing it to a new caption file.
+
+     Args:
+         file_path (str): The path to the JSON file to process.
+
+     Raises:
+         Exception: If there's an error reading the JSON file or writing the output file.
+     """
+     try:
+         with open(file_path, 'r') as json_file:
+             data: Dict[str, Any] = json.load(json_file)
+
+         description = data.get('description', '')
+         cleaned_description = clean_description(description)
+
+         output_file = os.path.splitext(file_path)[0] + '.caption'
+
+         with open(output_file, 'w') as caption_file:
+             caption_file.write(cleaned_description)
+
+         logging.info(f"Processed {file_path} -> {output_file}")
+     except Exception as e:
+         logging.error(f"Error processing {file_path}: {str(e)}")
+
+ def process_directory(directory: str) -> None:
+     """
+     Recursively process all JSON files in the specified directory and its subdirectories.
+
+     Args:
+         directory (str): The path to the directory to process.
+     """
+     for root, _, files in os.walk(directory):
+         for file in files:
+             if file.lower().endswith('.json'):
+                 file_path = os.path.join(root, file)
+                 process_json_file(file_path)
+
+ def main():
+     parser = argparse.ArgumentParser(description="Process JSON files and extract captions.")
+     parser.add_argument('-d', '--directory', default='.', help="Directory to process (default: current directory)")
+     parser.add_argument('-v', '--verbose', action='store_true', help="Enable verbose logging")
+     args = parser.parse_args()
+
+     log_level = logging.DEBUG if args.verbose else logging.INFO
+     logging.basicConfig(level=log_level, format='%(asctime)s - %(levelname)s - %(message)s')
+
+     directory = os.path.abspath(args.directory)
+     logging.info(f"Processing directory: {directory}")
+     process_directory(directory)
+     logging.info("Processing complete.")
+
+ if __name__ == "__main__":
+     main()
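
A quick usage sketch for the new script; the dataset path and description text are hypothetical examples, not taken from the commit:

  # Every foo.json with a "description" field gains a sibling foo.caption
  python extract_description -v -d ~/datasets/example

  # A stored description like
  #   Commission for <a href="https://example.com/u/someone">someone</a>\nHigh res attached
  # ends up in the .caption file as
  #   Commission for someone High res attached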
pick_caption ADDED
@@ -0,0 +1,27 @@
+ #!/usr/bin/env python
+
+ import os
+ import re
+
+ def process_caption_files():
+     for file in os.listdir():
+         if file.endswith(".caption"):
+             # Check if the file already contains processed data
+             with open(file, 'r') as f:
+                 lines = f.readlines()
+
+             if "\n----------\n" not in "".join(lines):
+                 continue
+
+             for line in lines:
+                 if "----------" in line:
+                     break  # Stop processing after finding the separator
+
+             content = ''.join(lines[:lines.index(line)])  # Extract text before the separator
+             processed_content = re.sub(r'[\-]+|\n', '', content)  # Remove newlines and separator
+
+             with open(file, 'w') as f:  # Save the condensed caption back to the same file
+                 f.write(processed_content)
+
+ process_caption_files()
+
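
A small sketch of the behaviour on a .caption file that contains the "----------" separator; the file name and caption text are made up for illustration:

  cd ~/datasets/example            # pick_caption only touches .caption files in the current directory
  cat sample.caption
  #   A fluffy wolf resting in the snow.
  #   ----------
  #   discarded alternative caption
  python /path/to/pick_caption     # hypothetical location of the script
  cat sample.caption
  #   A fluffy wolf resting in the snow.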