File size: 3,853 Bytes
6b5f1f8 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 |
#!/usr/bin/env python
"""
JSON Caption Extractor
This script processes JSON files in the specified directory (or current directory if not specified)
and its subdirectories, extracting cleaned captions from them. It performs the following tasks:
1. Recursively searches for JSON files in the specified directory and its subdirectories.
2. Reads each JSON file found.
3. Extracts the 'description' field from each JSON file.
4. Cleans the description by removing HTML links, newline characters, and extra spaces.
5. Writes the cleaned description to a new file with the same name as the input file
but with a '.caption' extension in the same directory as the input file.
6. Provides logging information about the processing status and any errors encountered.
Usage:
python extract_description [-v] [-d DIRECTORY]
Arguments:
-d, --directory The directory to process (optional, defaults to current directory)
-v, --verbose Enable verbose logging (optional)
The script processes all JSON files in the specified directory and its subdirectories.
"""
import json
import os
import re
import argparse
import logging
from typing import Dict, Any
# Matches a whole anchor element and captures only its inner text. The
# attribute section is `[^>]*`, so attribute order, quoting style and line
# breaks inside the tag do not matter; DOTALL lets the link text span lines.
_ANCHOR_RE = re.compile(r'<a(?:\s[^>]*)?>(.*?)</a\s*>', re.IGNORECASE | re.DOTALL)
# Any run of whitespace (spaces, tabs, newlines, ...).
_WHITESPACE_RE = re.compile(r'\s+')


def clean_description(description: str) -> str:
    """
    Clean a description by unwrapping HTML links and normalising whitespace.

    Every ``<a ...>text</a>`` element is replaced by its inner ``text``,
    regardless of tag case, attribute order, attribute quoting, or line
    breaks inside the tag or the link text. Afterwards every run of
    whitespace (including newlines) is collapsed to a single space and the
    result is stripped of leading and trailing whitespace.

    Args:
        description (str): The original description string.

    Returns:
        str: The cleaned description string.
    """
    without_links = _ANCHOR_RE.sub(r'\1', description)
    # `\s+` already covers newlines, so no separate newline replacement is needed.
    return _WHITESPACE_RE.sub(' ', without_links).strip()
def process_json_file(file_path: str) -> None:
    """
    Extract the 'description' of one JSON file into a sibling '.caption' file.

    The file is read as UTF-8 JSON. Its top-level value must be an object;
    the 'description' entry is cleaned with ``clean_description`` and written
    (as UTF-8) to a file with the same path but a '.caption' extension.
    A missing or ``null`` description produces an empty caption file.

    This function never raises for a bad input file: read, parse, type and
    write problems are logged as errors and the file is skipped, so that one
    bad file does not abort a directory-wide run.

    Args:
        file_path (str): The path to the JSON file to process.
    """
    try:
        # JSON is UTF-8 by specification; do not depend on the locale default.
        with open(file_path, 'r', encoding='utf-8') as json_file:
            data: Dict[str, Any] = json.load(json_file)
    except (OSError, ValueError, RecursionError) as e:
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError;
        # RecursionError can occur on pathologically nested documents.
        logging.error("Error processing %s: %s", file_path, e)
        return

    if not isinstance(data, dict):
        logging.error(
            "Error processing %s: top-level JSON value is not an object", file_path
        )
        return

    description = data.get('description')
    if description is None:
        # Treat an explicit null exactly like a missing key.
        description = ''
    if not isinstance(description, str):
        logging.error(
            "Error processing %s: 'description' is not a string", file_path
        )
        return

    cleaned_description = clean_description(description)
    output_file = os.path.splitext(file_path)[0] + '.caption'
    try:
        with open(output_file, 'w', encoding='utf-8') as caption_file:
            caption_file.write(cleaned_description)
    except (OSError, ValueError) as e:
        # ValueError covers UnicodeEncodeError (e.g. lone surrogates from JSON escapes).
        logging.error("Error processing %s: %s", file_path, e)
        return

    logging.info("Processed %s -> %s", file_path, output_file)
def process_directory(directory: str) -> None:
    """
    Walk a directory tree and hand every JSON file found to process_json_file.

    A file counts as JSON when its name ends with '.json', compared
    case-insensitively. Files are visited in the order os.walk yields them.

    Args:
        directory (str): The path to the directory to process.
    """
    for current_dir, _subdirs, filenames in os.walk(directory):
        # Pick out the JSON files of this directory lazily, then process each one.
        json_names = (name for name in filenames if name.lower().endswith('.json'))
        for name in json_names:
            process_json_file(os.path.join(current_dir, name))
def main() -> None:
    """
    Command-line entry point: parse arguments, set up logging, process the tree.

    Reads ``-d/--directory`` (default: current directory) and ``-v/--verbose``
    from the command line. The directory is resolved to an absolute path and
    must exist; otherwise the program exits through ``parser.error`` (usage
    message on stderr, exit status 2) instead of silently processing nothing.
    """
    parser = argparse.ArgumentParser(description="Process JSON files and extract captions.")
    parser.add_argument('-d', '--directory', default='.', help="Directory to process (default: current directory)")
    parser.add_argument('-v', '--verbose', action='store_true', help="Enable verbose logging")
    args = parser.parse_args()

    directory = os.path.abspath(args.directory)
    # os.walk yields nothing for a missing path, so a typo would otherwise
    # end with "Processing complete." and exit status 0.
    if not os.path.isdir(directory):
        parser.error(f"not a directory: {directory}")

    log_level = logging.DEBUG if args.verbose else logging.INFO
    logging.basicConfig(level=log_level, format='%(asctime)s - %(levelname)s - %(message)s')

    logging.info(f"Processing directory: {directory}")
    process_directory(directory)
    logging.info("Processing complete.")


# Run the CLI only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()
|