tabel_ocr / file_utils.py
Raj-Master's picture
Create file_utils.py
e1422df
import os
import cv2
import math
import numpy as np
import pandas as pd
from pdf2image import convert_from_bytes
import streamlit as st
def get_img(uploaded_file):
    """Decode the bytes of an uploaded file into an OpenCV colour image.

    Args:
        uploaded_file: Any object whose ``read()`` returns the encoded image
            bytes (presumably a Streamlit upload -- confirm against caller).

    Returns:
        Whatever ``cv2.imdecode`` yields for the colour flag: normally a
        3-channel BGR array.  NOTE(review): OpenCV reports an undecodable
        buffer by returning ``None`` rather than raising; callers should
        check for that.
    """
    # imdecode expects a flat uint8 buffer, so wrap the raw bytes first.
    raw = bytearray(uploaded_file.read())
    encoded = np.asarray(raw, dtype=np.uint8)
    return cv2.imdecode(encoded, cv2.IMREAD_COLOR)
def convert_pdf_to_image(filename, dpi: int = 500):
    """Render every page of a PDF into an image.

    Args:
        filename: The PDF content as ``bytes``.  Despite the parameter name
            this is passed straight to ``convert_from_bytes``, so it must be
            the file's data, not a path.
        dpi: Rendering resolution handed to ``pdf2image``.  Defaults to 500,
            the value that used to be hard-coded; lower it to reduce memory
            use and conversion time on large documents.

    Returns:
        The list returned by ``convert_from_bytes`` -- one image per page,
        in page order.
    """
    return convert_from_bytes(filename, dpi)
def filter_color(img):
    """Return a single-channel image with dark pixels black and the rest white.

    The BGR input is converted to HSV, pixels inside the "black" HSV range
    are selected, and the selection is inverted.

    Args:
        img: Image in BGR channel order (it is converted with
            ``cv2.COLOR_BGR2HSV``).

    Returns:
        The inverted ``cv2.inRange`` mask: 0 where the pixel fell inside the
        dark range, 255 everywhere else.
    """
    # HSV bounds treated as "black": any hue, saturation <= 100, value <= 130.
    dark_lower = np.array([0, 0, 0])
    dark_upper = np.array([179, 100, 130])

    hsv_img = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)
    # inRange marks in-range pixels with 255 and all others with 0.
    dark_mask = cv2.inRange(hsv_img, dark_lower, dark_upper)
    # Invert the mask (the original image is not combined with it).
    return cv2.bitwise_not(dark_mask)
def plot(img, boxes):
    """Draw red rectangles and their scores on a copy of ``img``.

    Line thickness, font size and the vertical text offset are scaled to the
    image dimensions so annotations stay legible at any resolution.

    Args:
        img: 3-dimensional image array (its shape is unpacked as
            height, width, channels).  It is not modified.
        boxes: Iterable of sequences whose first four items are the
            top-left x, top-left y, bottom-right x, bottom-right y of a box
            and whose fifth item is a number printed next to it
            (presumably a detection confidence -- confirm against caller).

    Returns:
        The annotated copy of ``img``.
    """
    FONT_SCALE = 1e-3
    THICKNESS_SCALE = 1e-3
    TEXT_Y_OFFSET_SCALE = 2.5e-2
    red = (0, 0, 255)  # BGR

    height, width, _ = img.shape
    short_side = min(width, height)
    font_scale = short_side * FONT_SCALE
    thickness = math.ceil(short_side * THICKNESS_SCALE)
    # Push the label down from the box's top edge, i.e. inside the box.
    text_offset = int(height * TEXT_Y_OFFSET_SCALE)

    canvas = img.copy()
    for box in boxes:
        x1, y1 = int(box[0]), int(box[1])
        x2, y2 = int(box[2]), int(box[3])
        canvas = cv2.rectangle(canvas, (x1, y1), (x2, y2), red, thickness)

        label = str(round(float(box[4]), 2))
        cv2.putText(
            canvas,
            label,
            (x1, y1 + text_offset),
            cv2.FONT_HERSHEY_SIMPLEX,
            font_scale,
            red,
            thickness,
        )
    return canvas
def delete_file(filename):
    """Remove ``filename`` if it exists; do nothing when it is already gone.

    Args:
        filename: Path of the file to delete.

    Raises:
        OSError: For failures other than the file being absent (for example
            the path is a directory or permission is denied).
    """
    # Attempt the removal directly instead of checking existence first: the
    # file could disappear between the check and the removal.
    try:
        os.remove(filename)
    except FileNotFoundError:
        pass
def save_excel_file(
    idx, df: pd.DataFrame, foldername, filename, page_enumeration: int = 0
):
    """Write one extracted table to a CSV file inside ``foldername``.

    Despite the function's name the output is CSV, not an Excel workbook.
    The file is named ``<filename>page<page_enumeration>table<idx>.csv``.

    Args:
        idx: Identifier of the table, embedded in the file name.
        df: Table contents to write.
        foldername: Directory in which the file is created.
        filename: Prefix for the generated file name.
        page_enumeration: Page identifier embedded in the file name.
    """
    target = f"{foldername}/{filename}page{page_enumeration}table{idx}.csv"
    # The row index is dropped so only the table's own columns are written.
    df.to_csv(target, index=False)
def _page_table_sort_key(name):
    """Return a key ordering CSV names by page number, then table number.

    Names are expected to look like ``<prefix>page<P>table<T>.csv``.
    Numeric parts compare as integers, so page 10 sorts after page 2; parts
    that are not plain decimal numbers sort after every numeric one, by text.
    """
    # Split on the LAST "page" marker so a prefix that itself contains the
    # word "page" does not shift the parse.
    tail = name.rpartition("page")[2]
    page_txt, _, rest = tail.partition("table")
    table_txt = os.path.splitext(rest)[0]

    def as_key(text):
        # Uniform 3-tuples keep mixed numeric / non-numeric names comparable.
        if text.isdecimal():
            return (0, int(text), "")
        return (1, 0, text)

    return (as_key(page_txt), as_key(table_txt))


def concat_csv(folder, filename: str):
    """Merge the per-table CSV files in a folder into a single CSV.

    Files are processed in page order, then table order.  In every file the
    first data row is treated as that table's header line: the one from the
    first non-empty file becomes the column names of the merged table, and
    it is dropped from each file's body.  When the merged table has rows it
    is shown with ``st.dataframe`` and written to ``filename``; otherwise
    nothing is displayed or written.

    NOTE(review): the indentation of this file was lost, so displaying and
    writing only when data exists is the assumed original nesting -- confirm.

    Args:
        folder: Object exposing the directory path as ``.name`` (presumably
            a ``tempfile.TemporaryDirectory`` -- confirm against caller).
        filename: Destination path of the merged CSV.

    Raises:
        ValueError: If the merged table's column count differs from the
            length of the header row (tables of unequal width).
    """
    foldername = folder.name
    files = sorted(os.listdir(foldername), key=_page_table_sort_key)

    header = None
    frames = []
    for name in files:
        try:
            table = pd.read_csv(os.path.join(foldername, name))
        except pd.errors.EmptyDataError:
            # A file with no parseable content contributes nothing.
            continue
        if table.empty:
            continue
        if header is None:
            # Taken from the first table that actually has rows, so an empty
            # leading file no longer leaves the merged table without names.
            header = table.iloc[0]
        frames.append(table.iloc[1:])

    # Concatenate once at the end instead of growing a frame per iteration.
    df = pd.concat(frames) if frames else pd.DataFrame()

    if not df.empty:
        df.columns = header
        st.dataframe(df)
        df.to_csv(filename, index=False)