File size: 3,423 Bytes
14daa4c |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 |
import json
import multiprocessing
import os
from concurrent.futures import ProcessPoolExecutor
from pathlib import Path
from typing import Dict, List, Optional

import pytesseract
from PIL import Image
def process_image(args) -> tuple:
    """
    Run OCR on one image and write the recognised text next to the other results.

    The function takes a single tuple so it can be handed directly to
    ``executor.map``. Any failure while reading, recognising or writing is
    reported on stdout and returned as an ``"ERROR: ..."`` string instead of
    being raised.

    Args:
        args: Tuple of (filename, input_dir, output_dir)

    Returns:
        Tuple of (filename, extracted_text), or (filename, "ERROR: <message>")
        when processing failed
    """
    filename, input_dir, output_dir = args

    try:
        source_path = os.path.join(input_dir, filename)

        # The image only needs to stay open for the OCR call itself.
        with Image.open(source_path) as image:
            recognised = pytesseract.image_to_string(image)

        # One "<stem>.txt" file per image, written as UTF-8.
        target_path = os.path.join(output_dir, f"{Path(filename).stem}.txt")
        with open(target_path, 'w', encoding='utf-8') as out_file:
            out_file.write(recognised)

        print(f"Processed: {filename}")
        return filename, recognised
    except Exception as exc:
        # Best-effort: one bad file is reported as its own result.
        print(f"Error processing {filename}: {exc}")
        return filename, f"ERROR: {exc}"
def process_directory(
    input_dir: str,
    output_dir: str,
    max_workers: Optional[int] = None,
) -> Dict[str, str]:
    """
    Process all JPEG files in a directory and perform OCR using multiple processes.

    Files are selected by extension (``.jpg`` / ``.jpeg``, case-insensitive);
    sub-directories and other non-file entries are ignored. Files are processed
    in sorted name order so the consolidated JSON is reproducible between runs.

    Args:
        input_dir: Directory containing JPEG files
        output_dir: Directory to save OCR results (created if it does not exist)
        max_workers: Maximum number of worker processes. ``None`` leaves the
            choice to ``ProcessPoolExecutor`` (CPU count, capped on Windows).

    Returns:
        Dictionary mapping filenames to the values returned by
        ``process_image`` (extracted text, or an ``"ERROR: ..."`` string).
        The same mapping is written to ``ocr_results.json`` in ``output_dir``.
    """
    # Create output directory if it doesn't exist
    Path(output_dir).mkdir(parents=True, exist_ok=True)

    # Compared against the lower-cased suffix so ".Jpg", ".JPEG", ... all match.
    valid_extensions = {'.jpg', '.jpeg'}

    # Sorted for a deterministic processing and JSON key order; os.listdir
    # makes no ordering guarantee. Directories named like images are skipped.
    image_files = sorted(
        entry for entry in os.listdir(input_dir)
        if Path(entry).suffix.lower() in valid_extensions
        and os.path.isfile(os.path.join(input_dir, entry))
    )

    results: Dict[str, str] = {}

    # Only start worker processes when there is something to do.
    if image_files:
        work_args = [(name, input_dir, output_dir) for name in image_files]
        # max_workers is passed through unchanged: with None the executor
        # picks its own default, which respects the Windows limit of 61
        # workers instead of raising ValueError on very large machines.
        with ProcessPoolExecutor(max_workers=max_workers) as executor:
            for filename, text in executor.map(process_image, work_args):
                results[filename] = text

    # Save consolidated results to JSON (written even when empty)
    json_path = os.path.join(output_dir, 'ocr_results.json')
    with open(json_path, 'w', encoding='utf-8') as f:
        json.dump(results, f, indent=2, ensure_ascii=False)

    return results
if __name__ == "__main__":
    import argparse

    # Command-line entry point: two positional directories plus an optional
    # worker count that is forwarded unchanged to process_directory.
    cli = argparse.ArgumentParser(
        description='Perform OCR on all JPEG files in a directory',
    )
    cli.add_argument('input_dir', help='Input directory containing JPEG files')
    cli.add_argument('output_dir', help='Output directory for OCR results')
    cli.add_argument(
        '--workers',
        default=None,
        type=int,
        help='Number of worker processes (default: CPU count)',
    )
    opts = cli.parse_args()

    ocr_output = process_directory(opts.input_dir, opts.output_dir, opts.workers)

    # The count covers every dispatched file, including ones that failed.
    file_count = len(ocr_output)
    print(f"\nProcessed {file_count} files. Results saved to {opts.output_dir}")
|