## e621.net/e6ai.net JSON to `.txt` tags
----

This Python script processes every JSON file found in a specified directory and its subdirectories. Each JSON file is expected to contain the metadata of a single image post as exported from an image board such as e621.net or e6ai.net. The script parses each file, extracts the image URL, the rating, and the tags, and writes a caption file (`.txt`) next to the JSON file based on this data.

Here's a breakdown of what the script does:

1. **Ignoring Tags**: The script defines a list of tags to be ignored during processing, such as "hi res", "shaded", etc.

2. **Processing Files**: The `process_file` function is responsible for processing each JSON file. It reads the JSON data, extracts the URL of the image file, determines its rating, and extracts tags associated with the image.

3. **Generating Caption File**: For each image, a caption file is generated with the same name as the image file but with a `.txt` extension. The rating of the image is written first, followed by the processed tags, all separated by commas.

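4. **Processing Tags**: Underscores in tags are replaced with spaces, and ignored tags are filtered out. Matching is case-insensitive and whole-word rather than whole-tag, so a tag that merely contains an ignored word (for example "detailed background" for the ignored word "detailed") is dropped as well. Tags from every category, including artist tags, are treated the same way; no category receives special handling.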

In [1]:
import re
import os
import json
from rich.console import Console

# Shared rich console used by the functions below for status and error output.
console = Console()

import re

# Tags to drop from the generated captions.
#
# Each entry is a regular expression that is searched, case-insensitively,
# inside a tag whose underscores have already been replaced with spaces.
# The \b anchors make a match whole-word, not whole-tag: "detailed" also
# drops "detailed background", and "canine" also drops any longer tag that
# contains the word "canine".
ignored_tags = [
    r"\bhi res\b",
    r"\bshaded\b",
    r"\btagme\b",
    r"\babsurd res\b",
    r"\bdetailed\b",
    r"\bdota\b",
    r"\bcreative commons\b",
    r"\bcc-by-nc-nd\b",
    r"\bsquare enix\b",
    r"\bby conditional dnp\b",
    r"\bfinal fantasy xiv\b",
    r"\bfinal fantasy\b",
    r"\bmythological canine\b",
    r"\bancient pokemon\b",
    r"\bfelis\b",
    r"\basian mythology\b",
    r"\bmythological scalie\b",
    r"\bwidescreen\b",
    r"\bmythological creature\b",
    r"\b4k\b",
    r"\bfelid\b",
    r"\bsega\b",
    r"\bhasbro\b",
    r"\blegendary pokemon\b",
    r"\bzootopia\b",
    r"\bfive nights at freddy's\b",
    r"\beeveelution\b",
    r"\bdisney\b",
    r"\bcanis\b",
    r"\bcanine\b",
    r"\bdigimon\b",
    r"\bcanid\b",
    r"\bbandai namco\b",
    # No trailing \b here: ")" is not a word character, so a \b placed after
    # it can never match at the end of a tag and the pattern would be dead.
    r"\bpokemon \(species\)",
    r"\bmammal\b",
    r"\bpokemon\b",
    r"\bnintendo\b",
]

def should_ignore_tag(tag):
    """Return True when any pattern in ``ignored_tags`` is found in *tag*.

    Patterns are applied with ``re.search`` and ``re.IGNORECASE``, so a hit
    anywhere inside the tag is enough to ignore it.
    """
    return any(
        re.search(pattern, tag, re.IGNORECASE) is not None
        for pattern in ignored_tags
    )

def process_tags(tags_list):
    """Return the tags from *tags_list* that should be kept in a caption.

    Underscores are replaced with spaces first; the ignore check is then run
    on the spaced form, and that spaced form is what is returned. The input
    order is preserved.
    """
    spaced_tags = (raw_tag.replace("_", " ") for raw_tag in tags_list)
    return [tag for tag in spaced_tags if not should_ignore_tag(tag)]

def _rating_label(rating):
    """Map a post's rating code to the label written first in the caption."""
    if rating == "s":
        return "rating_safe"
    if rating == "e":
        return "rating_explicit"
    # Every other value (including the "q" default) counts as questionable.
    return "rating_questionable"


def process_file(file_path):
    """Create a caption ``.txt`` file from one post-metadata JSON file.

    The JSON document is read for ``post.file.url``, ``post.rating`` and
    ``post.tags`` (a mapping of category name to a list of tags). The caption
    is written in the same directory as *file_path*, named after the image
    file in the URL with a ``.txt`` extension. It contains the rating label
    followed by the filtered tags, separated by ``", "``.

    Files without an image URL are skipped. Any error is reported on the
    console and swallowed, so one bad file does not abort a batch run.

    Args:
        file_path: Path of the JSON file to process.

    Returns:
        None.
    """
    try:
        console.print(f"Processing file: [bold]{file_path}[/bold]")
        # Decode explicitly as UTF-8 (the JSON interchange encoding) so the
        # platform's locale code page is never used for the metadata.
        with open(file_path, "r", encoding="utf-8") as f:
            data = json.load(f)

        # Parse the URL and derive the caption filename from the image name.
        post_data = data.get("post", {})
        url = post_data.get("file", {}).get("url")
        if not url:
            console.print(f"Skipping file (no image URL): [bold]{file_path}[/bold]")
            return

        image_stem, _ext = os.path.splitext(os.path.basename(url))
        caption_path = os.path.join(os.path.dirname(file_path), f"{image_stem}.txt")

        # Collect the kept tags from every category, in the order they appear.
        processed_tags = []
        for tags_list in post_data.get("tags", {}).values():
            processed_tags.extend(process_tags(tags_list))

        # Build the full caption before opening the output file, so a failure
        # above never leaves a truncated caption on disk.
        caption = _rating_label(post_data.get("rating", "q"))
        tags_line = ", ".join(processed_tags).strip()
        if processed_tags:
            # Only add the separator when tags follow; no dangling ", ".
            caption = f"{caption}, {tags_line}"

        console.print(f"Creating caption file: [bold]{caption_path}[/bold]")
        with open(caption_path, "w", encoding="utf-8") as f:
            f.write(caption)

        if processed_tags:
            console.print(f"Writing tags: [italic]{tags_line}[/italic]")

    except Exception as e:
        # Deliberately broad: this is the per-file boundary of a batch job.
        console.print(f"Error processing file: [bold]{file_path}[/bold]")
        console.print(e)

def recursive_process(directory):
    """Run ``process_file`` on every ``.json`` file at or below *directory*.

    The tree is traversed with ``os.walk``; files whose names do not end in
    ``.json`` are left alone.
    """
    for current_dir, _subdirs, filenames in os.walk(directory):
        json_names = (name for name in filenames if name.endswith(".json"))
        for json_name in json_names:
            process_file(os.path.join(current_dir, json_name))

if __name__ == "__main__":
    # Root folder to scan; edit this path to point at the dataset to caption.
    # Alternative location, kept commented out for quick switching:
    #root_directory = r"E:\training_dir"
    root_directory = r"C:\Users\kade\Desktop\training_dir_staging"
    # Walk the folder tree and write a caption file for every .json found.
    recursive_process(root_directory)