import os
import tempfile
import random
import string

from ultralyticsplus import YOLO
import streamlit as st
import numpy as np
import pandas as pd

from process import (
    filter_columns,
    extract_text_of_col,
    prepare_cols,
    process_cols,
    finalize_data,
)
from file_utils import (
    get_img,
    save_excel_file,
    concat_csv,
    convert_pdf_to_image,
    filter_color,
    plot,
    delete_file,
)


def _detect_columns(img, table):
    """Crop one detected table out of ``img`` and detect its columns.

    Args:
        img: Source image array, indexed as ``img[rows, cols]``.
        table: One ``xyxy`` box (x1, y1, x2, y2) from the table model.

    Returns:
        A ``(sub_img, cols_data)`` tuple: the cropped table image and the
        column boxes returned by ``filter_columns``.
    """
    x_min, y_min, x_max, y_max = (int(value.item()) for value in table[:4])

    # * crop the table as an image from the original image
    sub_img = img[y_min:y_max, x_min:x_max]

    columns_detect = PaddleOCR.column_model(sub_img, conf=0.75)
    cols_data = columns_detect[0].boxes.data.cpu().numpy()

    # * Sort columns according to the x coordinate
    # NOTE(review): dtype=np.ndarray builds an object array; kept as-is because
    # filter_columns is not visible here and may rely on it.
    cols_data = np.array(sorted(cols_data, key=lambda x: x[0]), dtype=np.ndarray)

    # * merge the duplicated columns
    return sub_img, filter_columns(cols_data)


def _extract_table(sub_img, cols_data, apply_color_filter, page_enumeration):
    """Extract the text of every column of one table and assemble the rows.

    Args:
        sub_img: Cropped table image.
        cols_data: Column boxes; the first four values of each row are used
            as (x1, y1, x2, y2).
        apply_color_filter: When truthy, each column image is passed through
            ``filter_color`` before text extraction.
        page_enumeration: Page number forwarded to ``finalize_data``.

    Returns:
        The value returned by ``finalize_data`` for this table.

    Raises:
        ZeroDivisionError: If ``cols_data`` contains no columns.
    """
    # * Create list of cropped images for each column
    column_imgs = [
        sub_img[:, int(box[0]): int(box[2])] for box in cols_data[:, 0:4]
    ]

    cols = []
    thr = 0
    for image in column_imgs:
        if apply_color_filter:
            # * keep only black color in the image
            image = filter_color(image)

        # * extract text of each column and get the length threshold
        res, threshold = extract_text_of_col(image)
        thr += threshold

        # * arrange the rows of each column with respect to row length threshold
        cols.append(prepare_cols(res, threshold * 0.6))

    # Average the per-column thresholds to get one threshold for the table.
    thr = thr / len(column_imgs)

    # * append each element in each column to its right place in the dataframe
    data = process_cols(cols, thr * 0.6)

    # * merge the related rows together
    return finalize_data(data, page_enumeration)


def process_img(
    img,
    page_enumeration: int = 0,
    filter=False,
    foldername: str = "",
    filename: str = "",
):
    """Detect tables in ``img``, extract their contents and save each one.

    For every table found by ``PaddleOCR.table_model`` the columns are
    detected, a preview is rendered with ``st.image``, and the text of each
    column is extracted. Every successfully extracted table is passed to
    ``save_excel_file`` together with its index.

    Failures are reported with ``st.warning`` and never abort the remaining
    tables.

    Args:
        img: Image array to process.
        page_enumeration: Page number of ``img`` (0 for a single image).
        filter: When truthy, column images go through ``filter_color`` before
            text extraction.
        foldername: Output folder forwarded to ``save_excel_file``.
        filename: Base file name forwarded to ``save_excel_file``.
    """
    tables = PaddleOCR.table_model(img, conf=0.75)
    tables = tables[0].boxes.xyxy.cpu().numpy()

    results = []
    for table in tables:
        # Reset per-table state so a failed detection can never reuse the
        # crop/columns of the previous table (which would duplicate its data).
        sub_img = cols_data = None
        try:
            sub_img, cols_data = _detect_columns(img, table)
            # The preview is best effort: if only plotting fails, the warning
            # is shown but extraction still runs on the detected columns.
            st.image(plot(sub_img, cols_data), channels="RGB")
        except Exception:
            st.warning("No Detection")

        if cols_data is None:
            continue

        try:
            data: pd.DataFrame = _extract_table(
                sub_img, cols_data, filter, page_enumeration
            )
        except Exception:
            st.warning("Text Extraction Failed")
            continue

        results.append(data)
        print("data : ", data)
        print("results : ", results)

    for index, data in enumerate(results):
        save_excel_file(index, data, foldername, filename, page_enumeration)


class PaddleOCR:
    """Streamlit entry point that turns an uploaded file into a CSV download.

    Despite the class name, table and column detection are done with the two
    YOLO models below, which are loaded once when the class is defined.
    """

    # Load Image Detection model
    table_model = YOLO("table.pt")
    column_model = YOLO("columns.pt")

    def __call__(self, uploaded, filter=False):
        """Process an uploaded PDF or image and offer the result as a CSV.

        Args:
            uploaded: Uploaded file object exposing ``name`` and ``read()``.
            filter: Forwarded to ``process_img``.
        """
        # Base name is the text before the first dot of the uploaded name.
        filename = uploaded.name.split(".")[0]
        # Suffix check works for names with several dots or with none.
        is_pdf = uploaded.name.lower().endswith(".pdf")

        extra = "".join(random.choices(string.ascii_uppercase, k=5))
        csv_name = f"{filename}_{extra}.csv"

        foldername = tempfile.TemporaryDirectory(dir=os.getcwd())
        # The context manager removes the directory even if processing raises.
        with foldername:
            if is_pdf:
                pdf_pages = convert_pdf_to_image(uploaded.read())
                for page_enumeration, page in enumerate(pdf_pages, start=1):
                    process_img(
                        np.asarray(page),
                        page_enumeration,
                        filter=filter,
                        foldername=foldername.name,
                        filename=filename,
                    )
            else:
                img = get_img(uploaded)
                process_img(
                    img,
                    filter=filter,
                    foldername=foldername.name,
                    filename=filename,
                )

            # * concatenate all csv files if many
            try:
                # NOTE(review): the TemporaryDirectory object (not its .name)
                # is passed here, unlike the process_img calls above; kept
                # unchanged because concat_csv is not visible - confirm.
                concat_csv(foldername, csv_name)
            except Exception:
                st.warning("No results found")

        if os.path.exists(csv_name):
            with open(csv_name, "rb") as fp:
                st.download_button(
                    label="Download CSV file",
                    data=fp,
                    file_name=csv_name,
                    mime="text/csv",
                )
            # Delete only after the file handle is closed.
            delete_file(csv_name)
        else:
            st.warning("No results found")