# File size: 3,876 Bytes
# 6b5f1f8
# c2cc76d
# 6b5f1f8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
JSON Caption Extractor

This script processes JSON files in the specified directory (or current directory if not specified)
and its subdirectories, extracting cleaned captions from them. It performs the following tasks:

1. Recursively searches for JSON files in the specified directory and its subdirectories.
2. Reads each JSON file found.
3. Extracts the 'description' field from each JSON file.
4. Cleans the description by replacing HTML links with their link text and collapsing
   newline characters and extra spaces into single spaces.
5. Writes the cleaned description to a new file with the same name as the input file
   but with a '.caption' extension in the same directory as the input file.
6. Provides logging information about the processing status and any errors encountered.

Usage:
    python extract_description [-v] [-d DIRECTORY]

Arguments:
    -d, --directory  The directory to process (optional, defaults to current directory)
    -v, --verbose    Enable verbose logging (optional)

The script processes all JSON files in the specified directory and its subdirectories.
"""

import json
import os
import re
import argparse
import logging
from typing import Dict, Any

def clean_description(description: str) -> str:
    """
    Clean a description by unwrapping HTML links and normalizing whitespace.

    Every ``<a ... href=...>text</a>`` element is replaced by its inner text,
    and each run of whitespace (spaces, tabs, newlines) is collapsed into a
    single space.

    Args:
        description (str): The original description string.

    Returns:
        str: The cleaned description string, with no leading or trailing whitespace.
    """
    # Unwrap anchors, keeping only the link text.
    #   * DOTALL: this runs before newlines are collapsed, so a link whose tag
    #     or text spans several lines must still match.
    #   * IGNORECASE: accept <A HREF=...> as well as <a href=...>.
    #   * (["\']) ... \1: accept single- or double-quoted href values.
    #   * [^>]*? before href: accept other attributes ahead of href.
    description = re.sub(
        r'<a\b[^>]*?\bhref\s*=\s*(["\']).*?\1[^>]*>(.*?)</a\s*>',
        r'\2',
        description,
        flags=re.IGNORECASE | re.DOTALL,
    )
    # \s+ already matches '\n', so one pass collapses newlines and extra spaces.
    return re.sub(r'\s+', ' ', description).strip()

def process_json_file(file_path: str) -> None:
    """
    Extract, clean, and save the description from a single JSON file.

    The value of the 'description' key (an empty string when the key is
    absent) is cleaned with ``clean_description`` and written next to the
    input file, using the same base name with a '.caption' extension.

    This function does not raise on failure: any exception while reading,
    parsing, cleaning, or writing is logged at ERROR level and swallowed.

    Args:
        file_path (str): The path to the JSON file to process.
    """
    try:
        # Read raw bytes and let the json module detect the encoding
        # (UTF-8/16/32, with or without a BOM) instead of depending on the
        # platform's locale encoding.
        with open(file_path, 'rb') as json_file:
            data: Dict[str, Any] = json.load(json_file)

        cleaned_description = clean_description(data.get('description', ''))
        output_file = os.path.splitext(file_path)[0] + '.caption'

        # Write UTF-8 explicitly so non-ASCII captions are stored the same way
        # on every platform and cannot fail on a narrow locale codec.
        with open(output_file, 'w', encoding='utf-8') as caption_file:
            caption_file.write(cleaned_description)

        logging.info(f"Processed {file_path} -> {output_file}")
    except Exception as e:
        # Deliberately broad: this is the per-file boundary, and the failure
        # is reported through the log rather than propagated.
        logging.error(f"Error processing {file_path}: {str(e)}")

def process_directory(directory: str) -> None:
    """
    Walk a directory tree and process every JSON file found in it.

    A file counts as JSON when its name ends in '.json', compared
    case-insensitively. Each match is handed to ``process_json_file``.

    Args:
        directory (str): The path of the top-level directory to walk.
    """
    for current_dir, _subdirs, filenames in os.walk(directory):
        json_names = [name for name in filenames if name.lower().endswith('.json')]
        for name in json_names:
            process_json_file(os.path.join(current_dir, name))

def main() -> None:
    """
    Command-line entry point: parse arguments, set up logging, process the directory.

    Options:
        -d, --directory  Directory to process (defaults to the current directory).
        -v, --verbose    Log at DEBUG level instead of INFO.

    Raises:
        SystemExit: With status 2 (via ``argparse``) when the arguments are
            invalid or the given directory does not exist.
    """
    parser = argparse.ArgumentParser(description="Process JSON files and extract captions.")
    parser.add_argument('-d', '--directory', default='.', help="Directory to process (default: current directory)")
    parser.add_argument('-v', '--verbose', action='store_true', help="Enable verbose logging")
    args = parser.parse_args()

    directory = os.path.abspath(args.directory)
    if not os.path.isdir(directory):
        # os.walk() ignores scan errors by default, so a missing or mistyped
        # path would otherwise be reported as a successful, empty run.
        parser.error(f"Not a directory: {directory}")

    log_level = logging.DEBUG if args.verbose else logging.INFO
    logging.basicConfig(level=log_level, format='%(asctime)s - %(levelname)s - %(message)s')

    logging.info(f"Processing directory: {directory}")
    process_directory(directory)
    logging.info("Processing complete.")

# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()