"""Gradio app that renders two PDFs and builds a side-by-side visual diff PDF."""

import os
import tempfile
from itertools import zip_longest

import cv2
import fitz  # PyMuPDF  (not referenced in this file's visible code; kept as-is)
import gradio as gr
import numpy as np
import pytesseract  # not referenced in this file's visible code; kept as-is
from fpdf import FPDF
from pdf2image import convert_from_path

# cv2.findHomography needs at least four point correspondences.
_MIN_HOMOGRAPHY_MATCHES = 4


# Convert PDFs to images
def convert_pdf_to_images(pdf_path, dpi=300, poppler_path="/usr/bin"):
    """Render every page of a PDF to a BGR image array.

    Args:
        pdf_path: Path of the PDF file to render.
        dpi: Rendering resolution passed to ``convert_from_path``.
        poppler_path: Directory holding the poppler binaries.

    Returns:
        A list of BGR ``numpy`` arrays (one per page) on success, or an
        error message string if rendering failed. Callers distinguish the
        two cases with ``isinstance(result, str)``.
    """
    try:
        pages = convert_from_path(pdf_path, dpi=dpi, poppler_path=poppler_path)
        return [cv2.cvtColor(np.array(page), cv2.COLOR_RGB2BGR) for page in pages]
    except Exception as e:  # deliberate: failures are reported as a string
        return f"Error converting PDF to images: {e}"


def _resize_to_match(img, reference):
    """Return a fresh copy of ``img`` sized like ``reference`` (no warping)."""
    target_size = (reference.shape[1], reference.shape[0])
    if (img.shape[1], img.shape[0]) == target_size:
        return img.copy()
    return cv2.resize(img, target_size)


# Align images
def align_images(img1, img2):
    """Warp ``img2`` into the coordinate frame of ``img1``.

    ORB features are matched between the two images and a RANSAC homography
    is estimated from them. When alignment is not possible (no descriptors,
    too few matches, or no homography found) ``img2`` is simply resized to
    ``img1``'s dimensions instead of raising.

    Args:
        img1: Reference BGR image.
        img2: BGR image to align onto ``img1``.

    Returns:
        A BGR image with the same width and height as ``img1``.
    """
    gray1 = cv2.cvtColor(img1, cv2.COLOR_BGR2GRAY)
    gray2 = cv2.cvtColor(img2, cv2.COLOR_BGR2GRAY)

    orb = cv2.ORB_create()
    kp1, des1 = orb.detectAndCompute(gray1, None)
    kp2, des2 = orb.detectAndCompute(gray2, None)

    # Descriptors are None when no keypoints were found (e.g. a blank page).
    if des1 is None or des2 is None:
        return _resize_to_match(img2, img1)

    matcher = cv2.BFMatcher(cv2.NORM_HAMMING, crossCheck=True)
    matches = sorted(matcher.match(des1, des2), key=lambda m: m.distance)
    if len(matches) < _MIN_HOMOGRAPHY_MATCHES:
        return _resize_to_match(img2, img1)

    pts1 = np.float32([kp1[m.queryIdx].pt for m in matches]).reshape(-1, 1, 2)
    pts2 = np.float32([kp2[m.trainIdx].pt for m in matches]).reshape(-1, 1, 2)

    # warpPerspective applies the matrix to its *source* image, so the
    # homography must map img2 coordinates onto img1 coordinates.
    matrix, _ = cv2.findHomography(pts2, pts1, cv2.RANSAC, 5.0)
    if matrix is None:
        return _resize_to_match(img2, img1)

    return cv2.warpPerspective(img2, matrix, (img1.shape[1], img1.shape[0]))


# Compare images
def compare_images(img1, img2, threshold=50):
    """Return a binary mask of pixels that differ between two BGR images.

    Args:
        img1: First BGR image.
        img2: Second BGR image, same shape as ``img1``.
        threshold: Grayscale difference above which a pixel counts as changed.

    Returns:
        A single-channel mask with 255 where the images differ, 0 elsewhere.
    """
    diff = cv2.absdiff(img1, img2)
    gray_diff = cv2.cvtColor(diff, cv2.COLOR_BGR2GRAY)
    _, mask = cv2.threshold(gray_diff, threshold, 255, cv2.THRESH_BINARY)
    return mask


# Highlight changes
def highlight_changes(img, mask):
    """Draw red rectangles on a copy of ``img`` around each changed region.

    Args:
        img: BGR image to annotate (left unmodified).
        mask: Binary mask in the same coordinate frame as ``img``.

    Returns:
        A copy of ``img`` with a bounding box around every external contour
        found in ``mask``.
    """
    overlay = img.copy()
    # [-2] picks the contour list whether findContours returns two values
    # (OpenCV 4.x) or three (OpenCV 3.x).
    contours = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)[-2]
    for contour in contours:
        x, y, w, h = cv2.boundingRect(contour)
        cv2.rectangle(overlay, (x, y), (x + w, y + h), (0, 0, 255), 2)
    return overlay


def _blank_page_like(img):
    """Return an all-white image with the same shape and dtype as ``img``."""
    return np.full_like(img, 255)


# Generate combined comparison PDF
def generate_comparison_pdf(original_pdf, edited_pdf,
                            output_path="outputs/comparison_result.pdf"):
    """Build a PDF showing each original page beside its highlighted edit.

    Each edited page is aligned onto the original page, diffed against it,
    and the changed regions are boxed on the aligned page so the boxes and
    the page share one coordinate frame. If the PDFs have different page
    counts, the missing pages are compared against a blank white page
    rather than being dropped.

    Args:
        original_pdf: Path of the original PDF.
        edited_pdf: Path of the edited PDF.
        output_path: Where to write the comparison PDF.

    Returns:
        ``output_path`` on success, or an error message string if either
        PDF could not be rendered.
    """
    original_images = convert_pdf_to_images(original_pdf)
    if isinstance(original_images, str):
        return original_images
    edited_images = convert_pdf_to_images(edited_pdf)
    if isinstance(edited_images, str):
        return edited_images

    # Create the destination directory here so the function also works when
    # the module is imported rather than run as a script.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    pdf = FPDF()
    # A private temp directory avoids clashes between concurrent requests.
    with tempfile.TemporaryDirectory() as tmp_dir:
        page_pairs = zip_longest(original_images, edited_images)
        for page_no, (orig_img, edit_img) in enumerate(page_pairs):
            if orig_img is None:
                orig_img = _blank_page_like(edit_img)
            elif edit_img is None:
                edit_img = _blank_page_like(orig_img)

            aligned_img = align_images(orig_img, edit_img)
            diff_mask = compare_images(orig_img, aligned_img)
            highlighted_img = highlight_changes(aligned_img, diff_mask)
            combined = np.hstack((orig_img, highlighted_img))

            # One distinct file per page: FPDF appears to cache embedded
            # images by file name, so a reused name could repeat the first
            # page's image (verify against the installed fpdf version).
            page_path = os.path.join(tmp_dir, f"page_{page_no}.png")
            if not cv2.imwrite(page_path, combined):
                raise IOError(f"Could not write temporary image: {page_path}")
            pdf.add_page()
            pdf.image(page_path, x=10, y=10, w=190)

        # Write the PDF while the temporary page images still exist.
        pdf.output(output_path)
    return output_path


def _uploaded_path(upload):
    """Return the filesystem path for a Gradio upload.

    Accepts either a plain path string or an object exposing ``.name``,
    since the value passed by ``gr.File`` depends on the Gradio version.
    """
    return upload if isinstance(upload, str) else upload.name


# Gradio interface function
def pdf_comparison(original_pdf, edited_pdf):
    """Gradio callback: compare two uploaded PDFs and return the report path.

    Args:
        original_pdf: Uploaded original PDF (path string or file wrapper).
        edited_pdf: Uploaded edited PDF (path string or file wrapper).

    Returns:
        Path of the generated comparison PDF.

    Raises:
        gr.Error: If an upload is missing or the report was not produced.
            The output component is a file, so an error string returned
            from here would be treated as a file path.
    """
    if original_pdf is None or edited_pdf is None:
        raise gr.Error("Error: Both PDF files must be provided.")

    result_path = generate_comparison_pdf(
        _uploaded_path(original_pdf), _uploaded_path(edited_pdf)
    )
    # On failure generate_comparison_pdf returns a message, not a file path.
    if not os.path.isfile(result_path):
        raise gr.Error(result_path)
    return result_path


# Gradio app
interface = gr.Interface(
    fn=pdf_comparison,
    inputs=[
        gr.File(label="Upload Original PDF", file_types=[".pdf"]),
        gr.File(label="Upload Edited PDF", file_types=[".pdf"]),
    ],
    outputs=gr.File(label="Download Comparison Report"),
    title="PDF Comparison Tool",
    description="Upload two PDFs for a side-by-side comparison with visual highlights and a summary of changes.",
)

if __name__ == "__main__":
    os.makedirs("outputs", exist_ok=True)
    interface.launch()