#!/usr/bin/env python
"""
JSON Caption Extractor

This script processes JSON files in the specified directory (or the current
directory if none is specified) and its subdirectories, extracting cleaned
captions from them.

It performs the following tasks:
1. Recursively searches for JSON files in the specified directory and its
   subdirectories.
2. Reads each JSON file found.
3. Extracts the 'description' field from each JSON file.
4. Cleans the description by replacing HTML links with their link text and
   collapsing newlines and runs of whitespace into single spaces.
5. Writes the cleaned description to a new file with the same name as the
   input file but with a '.caption' extension, in the same directory as the
   input file.
6. Logs the processing status and any errors encountered.

Usage:
    python extract_description [-v] [-d DIRECTORY]

Arguments:
    -d, --directory   The directory to process (optional, defaults to the
                      current directory)
    -v, --verbose     Enable verbose logging (optional)

The script processes all JSON files in the specified directory and its
subdirectories.
"""

import argparse
import json
import logging
import os
import re
from typing import Any, Dict

logger = logging.getLogger(__name__)

# An HTML anchor element; group 1 is the link text that is kept.
# ``\b`` stops ``<a`` from matching other tags such as ``<abbr>``; DOTALL lets
# the link text span several lines, since links are unwrapped before newlines
# are collapsed.
_HTML_LINK_RE = re.compile(r'<a\b[^>]*>(.*?)</a\s*>', re.IGNORECASE | re.DOTALL)

# Any run of whitespace, newlines included.
_WHITESPACE_RE = re.compile(r'\s+')


def clean_description(description: str) -> str:
    """
    Clean a description for use as a single-line caption.

    HTML links (``<a ...>text</a>``) are replaced by their link text, then
    every run of whitespace (including newlines) is collapsed to one space
    and leading/trailing whitespace is stripped.

    Args:
        description (str): The original description string.

    Returns:
        str: The cleaned description string.
    """
    without_links = _HTML_LINK_RE.sub(r'\1', description)
    return _WHITESPACE_RE.sub(' ', without_links).strip()


def process_json_file(file_path: str) -> None:
    """
    Extract the description from one JSON file and write it as a caption.

    The caption is written next to the input file, using the same base name
    with a '.caption' extension. A file without a 'description' key produces
    an empty caption file.

    Errors (unreadable file, invalid JSON, a top-level value that is not an
    object, a non-string description, unwritable output) are logged and the
    file is skipped; no exception propagates to the caller.

    Args:
        file_path (str): The path to the JSON file to process.
    """
    try:
        logger.debug("Reading %s", file_path)
        # Use UTF-8 explicitly rather than the platform's locale encoding.
        with open(file_path, 'r', encoding='utf-8') as json_file:
            data: Any = json.load(json_file)

        if not isinstance(data, dict):
            raise ValueError(
                f"expected a JSON object at the top level, "
                f"got {type(data).__name__}"
            )
        metadata: Dict[str, Any] = data

        description = metadata.get('description', '')
        if not isinstance(description, str):
            raise ValueError(
                f"'description' must be a string, "
                f"got {type(description).__name__}"
            )

        output_file = os.path.splitext(file_path)[0] + '.caption'
        with open(output_file, 'w', encoding='utf-8') as caption_file:
            caption_file.write(clean_description(description))
    except Exception as e:
        # Deliberately broad: this is the per-file boundary of a batch job,
        # so one bad file must not abort the rest of the run.
        logger.error("Error processing %s: %s", file_path, e)
    else:
        logger.info("Processed %s -> %s", file_path, output_file)


def process_directory(directory: str) -> None:
    """
    Recursively process all JSON files in a directory tree.

    Every file whose name ends in '.json' (case-insensitive) is passed to
    ``process_json_file``.

    Args:
        directory (str): The path to the directory to process.
    """
    for root, _, files in os.walk(directory):
        for file_name in files:
            if file_name.lower().endswith('.json'):
                process_json_file(os.path.join(root, file_name))


def main() -> None:
    """Parse command-line arguments, configure logging and run the extraction."""
    parser = argparse.ArgumentParser(
        description="Process JSON files and extract captions."
    )
    parser.add_argument(
        '-d', '--directory', default='.',
        help="Directory to process (default: current directory)",
    )
    parser.add_argument(
        '-v', '--verbose', action='store_true',
        help="Enable verbose logging",
    )
    args = parser.parse_args()

    log_level = logging.DEBUG if args.verbose else logging.INFO
    logging.basicConfig(
        level=log_level,
        format='%(asctime)s - %(levelname)s - %(message)s',
    )

    directory = os.path.abspath(args.directory)
    # os.walk yields nothing for a missing path, which would otherwise look
    # like a successful run that found no files.
    if not os.path.isdir(directory):
        parser.error(f"not a directory: {directory}")

    logger.info("Processing directory: %s", directory)
    process_directory(directory)
    logger.info("Processing complete.")


if __name__ == "__main__":
    main()