from modules.racist_text_query import racist_text_query
from modules.bigotry_dict import bigotry_dict
from modules.OCR import tiff_to_ocr
from modules.racist_chatgpt_analysis import racist_chatgpt_analysis
from modules.locate import locate
from modules.pagenum import crop_image
import os

import pandas as pd


def racism_threshold(file_dir):
    # Create the folder that will hold the cropped page-number images.
    cropped_images_dir = os.path.join(file_dir, 'deed page number')
    os.makedirs(cropped_images_dir, exist_ok=True)

    data = []
    for image_name in os.listdir(file_dir):
        if not image_name.lower().endswith((".tif", ".tiff")):
            continue
        image_path = os.path.join(file_dir, image_name)

        # Run OCR on the full deed image.
        text = tiff_to_ocr(image_path)

        # Two independent checks: a ChatGPT-based analysis and a keyword
        # lookup against the bigotry dictionary.
        result1 = racist_chatgpt_analysis(text)
        result2 = racist_text_query(text, bigotry_dict)

        # locate() is expected to return candidate page numbers, dates, and
        # book numbers extracted from the OCR text.
        page_numbers, dates, book_numbers = locate(text)

        # Crop the page-number region and save it to the new folder.
        cropped_image_name = "cropped_" + image_name
        cropped_image_path = os.path.join(cropped_images_dir, cropped_image_name)
        crop_image(image_path, cropped_image_path)

        # Build a file:// link to the cropped image. Spaces are left
        # unescaped here; uncomment the replace() if your viewer needs %20.
        image_path_formatted = cropped_image_path  # .replace(' ', '%20')
        hyperlink_formula = f'file://{image_path_formatted}'

        # Fail-safe page-number detection: OCR the cropped region and keep
        # any line that is purely digits as a possible page number.
        cropped_text = tiff_to_ocr(cropped_image_path)
        fail_safe_page = [
            line.strip() for line in cropped_text.split("\n")
            if line.strip().isdigit()
        ]

        if result1 or result2:
            print(image_name, page_numbers, dates, book_numbers)
            if fail_safe_page:
                page_numbers.extend(fail_safe_page)
            # Guard the indexing so a deed with no detected date or book
            # number does not raise an IndexError.
            data.append([
                image_name,
                page_numbers,
                dates[0] if dates else None,
                book_numbers[0] if book_numbers else None,
                hyperlink_formula,
            ])
        else:
            print(image_name + " : Not Racist")

    # Export the flagged deeds, with the hyperlink as its own column.
    df = pd.DataFrame(
        data,
        columns=['File Name', 'Probable Page Number', 'Date',
                 'Book Number', 'Page Link'],
    )
    df.index += 1  # Number rows from 1 instead of 0.
    df.to_csv(os.path.join(file_dir, 'Racist Deeds.csv'), index=True)
    # Writing .xlsx output requires the openpyxl package.
    df.to_excel(os.path.join(file_dir, 'Racist Deeds.xlsx'), index=True)


if __name__ == "__main__":
    # Replace 'folderpath' with the directory containing the deed TIFF scans.
    racism_threshold('folderpath')
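
# ---------------------------------------------------------------------------
# Hedged sketch, for readers without access to the local modules/ package:
# the sources of the imported helpers are not shown here. As an illustration
# only, racist_text_query is assumed below to be a plain keyword scan of the
# OCR text against bigotry_dict; the real module's signature and matching
# strategy may differ.
#
# def racist_text_query(text, bigotry_dict):
#     """Return True if any flagged term from bigotry_dict appears in text."""
#     lowered = text.lower()
#     return any(term.lower() in lowered for term in bigotry_dict)
# ---------------------------------------------------------------------------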