"""
JSON Caption Extractor

This script processes JSON files in the specified directory (or the current directory if none is
specified) and its subdirectories, extracting cleaned captions from them. It performs the
following tasks:

1. Recursively searches for JSON files in the specified directory and its subdirectories.
2. Reads each JSON file found.
3. Extracts the 'description' field from each JSON file.
4. Cleans the description by removing HTML link markup (the link text is kept), newline
   characters, and extra spaces.
5. Writes the cleaned description to a new file with the same name as the input file
   but with a '.caption' extension in the same directory as the input file.
6. Provides logging information about the processing status and any errors encountered.

Usage:
    python extract_description [-v] [-d DIRECTORY]

Arguments:
    -d, --directory  The directory to process (optional, defaults to current directory)
    -v, --verbose    Enable verbose logging (optional)

The script processes all JSON files in the specified directory and its subdirectories.
"""

import argparse
import json
import logging
import os
import re
from typing import Dict, Any


def clean_description(description: str) -> str:
    """Return *description* as single-line plain text with link markup removed.

    Anchor elements that carry an ``href`` attribute are replaced by their
    visible link text. Every run of whitespace (spaces, tabs, newlines) is
    then collapsed to a single space and the result is stripped.

    Args:
        description: The original description string. ``None`` is tolerated
            and treated as an empty description.

    Returns:
        The cleaned description string.
    """
    if description is None:
        return ''

    # Unwrap <a ... href=...>text</a> -> text.
    #  * IGNORECASE accepts <A HREF=...>.
    #  * DOTALL lets the link text span several lines; newlines are only
    #    collapsed afterwards, so without it such links would be left intact.
    #  * The lookahead requires an href attribute anywhere in the opening tag.
    #  * Quoted attribute values are consumed as units, so either quote style
    #    works and a '>' inside a quoted URL does not end the tag early.
    description = re.sub(
        r'''<a\b(?=[^>]*\bhref\s*=)(?:"[^"]*"|'[^']*'|[^>"'])*>(.*?)</a>''',
        r'\1',
        description,
        flags=re.IGNORECASE | re.DOTALL,
    )

    # \s+ already matches '\n', so a single pass normalizes all whitespace.
    return re.sub(r'\s+', ' ', description).strip()


def process_json_file(file_path: str) -> None:
    """Extract, clean and save the description stored in one JSON file.

    The cleaned text is written next to the input, to a file with the same
    base name and a ``.caption`` extension; an existing caption file is
    overwritten. A missing or ``null`` 'description' yields an empty caption
    file.

    Failures are never propagated to the caller: any error while reading,
    validating or writing is logged at ERROR level and the function returns
    normally, so one bad file does not abort a batch run.

    Args:
        file_path: The path to the JSON file to process.
    """
    try:
        # JSON text is UTF-8; 'utf-8-sig' also accepts a leading BOM, which
        # json.load would otherwise reject. Relying on the locale default
        # makes the result platform-dependent.
        with open(file_path, 'r', encoding='utf-8-sig') as json_file:
            data: Any = json.load(json_file)

        if not isinstance(data, dict):
            raise TypeError(
                f"top-level JSON value is {type(data).__name__}, expected an object"
            )

        description = data.get('description')
        if description is None:
            # Treat an explicit null exactly like a missing key.
            description = ''
        if not isinstance(description, str):
            raise TypeError(
                f"'description' is {type(description).__name__}, expected a string"
            )

        cleaned_description = clean_description(description)

        output_file = os.path.splitext(file_path)[0] + '.caption'

        # Write UTF-8 explicitly so non-ASCII captions cannot fail or be
        # mangled by a narrower locale encoding.
        with open(output_file, 'w', encoding='utf-8') as caption_file:
            caption_file.write(cleaned_description)

        logging.info(f"Processed {file_path} -> {output_file}")
    except Exception as e:
        # Deliberately broad: this is the per-file boundary of a best-effort
        # batch job; the error is reported, not swallowed silently.
        logging.error(f"Error processing {file_path}: {str(e)}")


def process_directory(directory: str) -> None:
    """Process every ``*.json`` file under *directory*, recursively.

    Files are matched by a case-insensitive ``.json`` suffix and handed to
    ``process_json_file`` one at a time. Directories and files are visited
    in sorted order so repeated runs produce the same log sequence.

    A path that is not an existing directory, or a subdirectory that cannot
    be listed, is logged at ERROR level instead of being skipped silently.

    Args:
        directory: The path to the directory to process.
    """
    if not os.path.isdir(directory):
        logging.error(f"Not a directory: {directory}")
        return

    def _report_walk_error(error: OSError) -> None:
        # os.walk ignores listing errors unless an onerror callback is given.
        logging.error(f"Cannot read directory {error.filename}: {error}")

    for root, dirs, files in os.walk(directory, onerror=_report_walk_error):
        # Sorting in place makes os.walk descend in a stable order.
        dirs.sort()
        for file in sorted(files):
            if file.lower().endswith('.json'):
                process_json_file(os.path.join(root, file))


def main() -> None:
    """Command-line entry point: parse options, set up logging, run the job.

    Options:
        -d / --directory: directory to process (default: current directory).
        -v / --verbose:   log at DEBUG level instead of INFO.

    Exits through ``parser.error`` (usage message, status 2) when the given
    path is not an existing directory, instead of reporting a successful run
    that processed nothing.
    """
    parser = argparse.ArgumentParser(description="Process JSON files and extract captions.")
    parser.add_argument('-d', '--directory', default='.', help="Directory to process (default: current directory)")
    parser.add_argument('-v', '--verbose', action='store_true', help="Enable verbose logging")
    args = parser.parse_args()

    directory = os.path.abspath(args.directory)
    if not os.path.isdir(directory):
        parser.error(f"not a directory: {directory}")

    log_level = logging.DEBUG if args.verbose else logging.INFO
    logging.basicConfig(level=log_level, format='%(asctime)s - %(levelname)s - %(message)s')

    logging.info(f"Processing directory: {directory}")
    process_directory(directory)
    logging.info("Processing complete.")


# Run the command-line interface only when executed as a script, not on import.
if __name__ == "__main__":
    main()