Spaces:
Runtime error
Runtime error
import os | |
import cv2 | |
import math | |
import numpy as np | |
import pandas as pd | |
from pdf2image import convert_from_bytes | |
import streamlit as st | |
def get_img(uploaded_file):
    """Decode the bytes of an uploaded file into an OpenCV image.

    Args:
        uploaded_file: Object exposing ``read()`` that returns the encoded
            image bytes (presumably a Streamlit upload; confirm against the
            caller).

    Returns:
        Whatever ``cv2.imdecode`` produces for the data when asked for a
        colour image.
    """
    raw = uploaded_file.read()
    # imdecode wants the still-encoded file contents as a flat uint8 buffer.
    encoded = np.asarray(bytearray(raw), dtype=np.uint8)
    return cv2.imdecode(encoded, cv2.IMREAD_COLOR)
def convert_pdf_to_image(filename):
    """Render a PDF into a list of page images.

    Args:
        filename: Passed straight to ``convert_from_bytes``, so despite the
            name it is expected to hold the PDF's raw bytes, not a path.

    Returns:
        The list returned by ``convert_from_bytes``, one entry per PDF page.
    """
    # Render at 500 DPI.
    return convert_from_bytes(filename, dpi=500)
def filter_color(img):
    """Return a single-channel mask in which dark pixels are black.

    The image is converted from BGR to HSV, pixels inside the "black" HSV
    range are selected, and the selection is then inverted.

    Args:
        img: Image in BGR channel order (as required by ``COLOR_BGR2HSV``).

    Returns:
        The inverted ``cv2.inRange`` mask: 0 where the pixel fell inside the
        dark range, 255 everywhere else.
    """
    hsv_img = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)

    # "Black" here means any hue with saturation <= 100 and value <= 130.
    dark_lower = np.array([0, 0, 0])
    dark_upper = np.array([179, 100, 130])
    dark_mask = cv2.inRange(hsv_img, dark_lower, dark_upper)

    # Invert the mask so the selected dark pixels end up black on white.
    return cv2.bitwise_not(dark_mask)
def plot(img, boxes):
    """Draw labelled bounding boxes on a copy of an image.

    Line thickness, font size and the label's vertical offset are scaled
    from the image dimensions so annotations stay legible at any resolution.

    Args:
        img: Image array. Both multi-channel ``(H, W, C)`` and single-channel
            ``(H, W)`` arrays are accepted.
        boxes: Iterable of indexable items where elements 0-3 are the
            top-left x, top-left y, bottom-right x and bottom-right y, and
            element 4 is a number printed next to the box (presumably a
            confidence score; confirm against the caller).

    Returns:
        An annotated copy of ``img``; the input array is not modified.
    """
    FONT_SCALE = 1e-3
    THICKNESS_SCALE = 1e-3
    TEXT_Y_OFFSET_SCALE = 2.5e-2

    # Slice instead of unpacking three values so 2-D (grayscale) images,
    # which have no channel axis, do not raise ValueError.
    height, width = img.shape[:2]
    short_side = min(width, height)
    font_scale = short_side * FONT_SCALE
    thickness = math.ceil(short_side * THICKNESS_SCALE)
    # Loop-invariant values, computed once.
    text_y_offset = int(height * TEXT_Y_OFFSET_SCALE)
    color = (0, 0, 255)  # red when the image is in BGR channel order

    annotated = img.copy()
    for box in boxes:
        left, top = int(box[0]), int(box[1])
        right, bottom = int(box[2]), int(box[3])
        annotated = cv2.rectangle(
            annotated, (left, top), (right, bottom), color, thickness
        )
        label = str(round(float(box[4]), 2))
        # The label is anchored just below the box's top-left corner.
        cv2.putText(
            annotated,
            label,
            (left, top + text_y_offset),
            cv2.FONT_HERSHEY_SIMPLEX,
            font_scale,
            color,
            thickness,
        )
    return annotated
def delete_file(filename):
    """Remove ``filename`` if it exists; do nothing when it is already gone.

    The removal is attempted directly rather than checking for existence
    first, so a file that disappears between a check and the removal cannot
    cause a spurious ``FileNotFoundError``.

    Args:
        filename: Path of the file to delete.

    Raises:
        OSError: For failures other than the file being absent (for example
            the path is a directory or permission is denied).
    """
    try:
        os.remove(filename)
    except FileNotFoundError:
        # Already absent: the desired end state is reached.
        pass
def save_excel_file(
    idx, df: pd.DataFrame, foldername, filename, page_enumeration: int = 0
):
    """Write one table to its own CSV file inside ``foldername``.

    Despite the function's name the output is CSV, not an Excel workbook.
    The file is named ``<filename>page<page_enumeration>table<idx>.csv``.

    Args:
        idx: Value placed after ``table`` in the file name.
        df: Table to write; its index is not written.
        foldername: Directory that receives the file.
        filename: Prefix for the generated file name.
        page_enumeration: Value placed after ``page`` in the file name.
    """
    target = f"{foldername}/{filename}page{page_enumeration}table{idx}.csv"
    df.to_csv(target, index=False)
def _page_table_key(name):
    """Return a sort key ordering table CSVs by numeric page, then table.

    Names ending in ``page<P>table<T>.csv`` sort first, by integer page and
    table number; any other name is placed after them in plain name order.
    """
    import re  # function-scope import: ``re`` is not a top-level import here

    match = re.search(r"page(\d+)table(\d+)\.csv$", name)
    if match is None:
        return (1, 0, 0, name)
    return (0, int(match.group(1)), int(match.group(2)), name)


def concat_csv(folder, filename: str):
    """Merge every per-table CSV in a folder into a single CSV file.

    Files are processed in numeric page/table order. The first data row of
    each CSV is treated as that table's header row: the one from the first
    table that has rows becomes the column names of the merged result, and
    the first row of every table is dropped from the data. The merged frame
    is displayed with ``st.dataframe`` and written to ``filename``.

    Args:
        folder: Object whose ``name`` attribute is the directory holding the
            CSV files (presumably a ``tempfile.TemporaryDirectory``; confirm
            against the caller).
        filename: Path of the merged CSV to write.

    Raises:
        ValueError: If the merged data does not have as many columns as the
            header row taken from the first table.
    """
    foldername = folder.name
    # Compare page/table numbers as integers; a plain string comparison
    # would put page 10 ahead of page 2.
    files = sorted(os.listdir(foldername), key=_page_table_key)

    header = None
    bodies = []
    for file in files:
        try:
            table = pd.read_csv(f"{foldername}/{file}")
        except pd.errors.EmptyDataError:
            # A completely empty file contributes neither header nor data.
            continue
        if table.empty:
            continue
        if header is None:
            # Use the first table that actually has rows, so a row-less
            # leading table cannot leave the header unset.
            header = table.iloc[0]
        bodies.append(table[1:])

    # Concatenate once instead of growing the frame inside the loop.
    df = pd.concat(bodies) if bodies else pd.DataFrame()
    if not df.empty:
        df.columns = header
    st.dataframe(df)
    df.to_csv(filename, index=False)