k4d3
/

toolkit

Model card Files Files and versions Community

toolkit / extract_description

k4d3's picture

update every stupid script

c2cc76d 1 day ago

3.88 kB

	#!/usr/bin/env python
	# -*- coding: utf-8 -*-
	"""
	JSON Caption Extractor

	This script processes JSON files in the specified directory (or current directory if not specified)
	and its subdirectories, extracting cleaned captions from them. It performs the following tasks:

	1. Recursively searches for JSON files in the specified directory and its subdirectories.
	2. Reads each JSON file found.
	3. Extracts the 'description' field from each JSON file.
	4. Cleans the description by stripping HTML link tags (keeping the link text), newline characters, and extra spaces.
	5. Writes the cleaned description to a new file with the same name as the input file
	but with a '.caption' extension in the same directory as the input file.
	6. Provides logging information about the processing status and any errors encountered.

	Usage:
	python extract_description [-v] [-d DIRECTORY]

	Arguments:
	-d, --directory The directory to process (optional, defaults to current directory)
	-v, --verbose Enable verbose logging (optional)

	The script processes all JSON files in the specified directory and its subdirectories.
	"""

	import json
	import os
	import re
	import argparse
	import logging
	from typing import Dict, Any

	def clean_description(description: str) -> str:
	    """
	    Clean a description by unwrapping HTML links and normalizing whitespace.

	    Anchor tags (``<a ...>text</a>``) are replaced by their inner text, then
	    every run of whitespace (including newlines and tabs) is collapsed into a
	    single space and leading/trailing whitespace is stripped.

	    Args:
	        description (str): The original description string.

	    Returns:
	        str: The cleaned description string.
	    """
	    # Unwrap anchors, keeping only the link text. The pattern accepts any
	    # attributes (not just a leading href), is case-insensitive, and lets the
	    # link text span several lines because newlines are only removed below.
	    description = re.sub(
	        r'<a\s[^>]*>(.*?)</a>',
	        r'\1',
	        description,
	        flags=re.IGNORECASE | re.DOTALL,
	    )
	    # Newlines are whitespace, so one pass both removes them and collapses
	    # repeated spaces.
	    return re.sub(r'\s+', ' ', description).strip()

	def process_json_file(file_path: str) -> None:
	    """
	    Extract the description from one JSON file and write it as a caption.

	    The 'description' field is read from the JSON object, cleaned with
	    clean_description(), and written next to the input file under the same
	    base name with a '.caption' extension. A missing or null description
	    produces an empty caption file.

	    Failures are logged and not raised, so one bad file does not abort a
	    batch run.

	    Args:
	        file_path (str): The path to the JSON file to process.
	    """
	    try:
	        # JSON is UTF-8 by specification; do not depend on the locale default.
	        with open(file_path, 'r', encoding='utf-8') as json_file:
	            data: Dict[str, Any] = json.load(json_file)

	        if not isinstance(data, dict):
	            raise ValueError("top-level JSON value is not an object")

	        description = data.get('description')
	        if description is None:
	            # Treat an explicit null the same as a missing field.
	            description = ''
	        if not isinstance(description, str):
	            raise TypeError(
	                f"'description' must be a string, got {type(description).__name__}"
	            )
	        cleaned_description = clean_description(description)

	        output_file = os.path.splitext(file_path)[0] + '.caption'

	        with open(output_file, 'w', encoding='utf-8') as caption_file:
	            caption_file.write(cleaned_description)

	        logging.info(f"Processed {file_path} -> {output_file}")
	    except Exception as e:
	        # Per-file boundary of a batch job: report and move on to the next file.
	        logging.error(f"Error processing {file_path}: {str(e)}")
	def process_directory(directory: str) -> None:
	    """
	    Walk a directory tree and process every JSON file found in it.

	    Files are selected by a case-insensitive '.json' suffix and handed to
	    process_json_file() one at a time.

	    Args:
	        directory (str): The path to the directory to process.
	    """
	    for current_dir, _subdirs, filenames in os.walk(directory):
	        json_names = [name for name in filenames if name.lower().endswith('.json')]
	        for name in json_names:
	            process_json_file(os.path.join(current_dir, name))

	def main() -> None:
	    """
	    Command-line entry point: parse arguments, set up logging, run the batch.

	    Exits with an argparse usage error when the requested directory does not
	    exist, instead of silently processing nothing.
	    """
	    parser = argparse.ArgumentParser(description="Process JSON files and extract captions.")
	    parser.add_argument('-d', '--directory', default='.', help="Directory to process (default: current directory)")
	    parser.add_argument('-v', '--verbose', action='store_true', help="Enable verbose logging")
	    args = parser.parse_args()

	    log_level = logging.DEBUG if args.verbose else logging.INFO
	    logging.basicConfig(level=log_level, format='%(asctime)s - %(levelname)s - %(message)s')

	    directory = os.path.abspath(args.directory)
	    # os.walk() yields nothing for a missing path, which would otherwise be
	    # reported as a successful run.
	    if not os.path.isdir(directory):
	        parser.error(f"Not a directory: {directory}")

	    logging.info(f"Processing directory: {directory}")
	    process_directory(directory)
	    logging.info("Processing complete.")


	if __name__ == "__main__":
	    main()