File size: 2,633 Bytes
e1422df
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
import os
import cv2
import math
import numpy as np
import pandas as pd
from pdf2image import convert_from_bytes

import streamlit as st


def get_img(uploaded_file):
    """Decode an uploaded image file into an OpenCV image array.

    ``uploaded_file`` only needs a ``read()`` method returning the encoded
    image bytes (e.g. PNG/JPEG data). The result is whatever
    ``cv2.imdecode`` returns for that buffer when asked for a colour image.
    """
    raw = bytearray(uploaded_file.read())
    encoded = np.asarray(raw, dtype=np.uint8)
    # IMREAD_COLOR (== 1) requests a 3-channel colour decode.
    return cv2.imdecode(encoded, cv2.IMREAD_COLOR)


def convert_pdf_to_image(filename):
    """Render a PDF into images, one per page.

    NOTE(review): despite its name, ``filename`` is handed straight to
    ``convert_from_bytes``, so it is presumably the raw PDF content rather
    than a path -- confirm against the caller.

    Returns the list of page images produced by ``convert_from_bytes``.
    """
    render_dpi = 500  # second positional argument of convert_from_bytes
    return convert_from_bytes(filename, render_dpi)


def filter_color(img):
    """Build an inverted mask of the dark pixels of a BGR image.

    The image is converted to HSV and thresholded over the full hue range
    with saturation <= 100 and value <= 130. The resulting mask is then
    inverted, so pixels inside that range come out as 0 and every other
    pixel as 255.
    """
    hsv_img = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)

    # HSV bounds of the "black" range: any hue, low saturation, low value.
    darkest = np.array([0, 0, 0])
    brightest = np.array([179, 100, 130])
    dark_mask = cv2.inRange(hsv_img, darkest, brightest)

    # Invert the mask (no AND with the original image takes place).
    return cv2.bitwise_not(dark_mask)


def plot(img, boxes):
    """Return a copy of ``img`` with every box outlined and labelled.

    Each entry of ``boxes`` is read by index: items 0-3 are the corner
    coordinates ``x1, y1, x2, y2`` of the rectangle and item 4 is a number
    that is rounded to two decimals and drawn as the label (presumably a
    confidence score -- confirm with the caller).

    ``img`` must have a three-element shape (height, width, channels); it
    is left untouched because all drawing happens on a copy.
    """
    rows, cols, _ = img.shape
    shorter_side = min(cols, rows)

    # Text size, stroke width and label offset all scale with the image.
    text_scale = shorter_side * 1e-3
    stroke = math.ceil(shorter_side * 1e-3)
    label_drop = int(rows * 2.5e-2)
    colour = (0, 0, 255)  # red when the image is in BGR channel order

    annotated = img.copy()
    for box in boxes:
        left, top = int(box[0]), int(box[1])
        right, bottom = int(box[2]), int(box[3])

        annotated = cv2.rectangle(
            annotated, (left, top), (right, bottom), colour, stroke
        )

        label = str(round(float(box[4]), 2))
        # The label is placed slightly below the top-left corner of the box.
        cv2.putText(
            annotated,
            label,
            (left, top + label_drop),
            cv2.FONT_HERSHEY_SIMPLEX,
            text_scale,
            colour,
            stroke,
        )
    return annotated


def delete_file(filename):
    """Remove ``filename`` from disk; do nothing if it is not there.

    The removal is attempted directly instead of checking for existence
    first, so a file that disappears between a check and the removal can no
    longer raise. Other failures (e.g. permission problems, or the path
    being a directory) still propagate as ``OSError`` subclasses.
    """
    try:
        os.remove(filename)
    except (FileNotFoundError, NotADirectoryError):
        # Nothing to delete: the path (or one of its parents) does not exist.
        pass


def save_excel_file(
    idx, df: pd.DataFrame, foldername, filename, page_enumeration: int = 0
):
    """Write one table to its own CSV file inside ``foldername``.

    Despite the function name, the output is CSV, not an Excel workbook.
    The file is named ``{filename}page{page_enumeration}table{idx}.csv``
    and the DataFrame index is not written.
    """
    target = f"{foldername}/{filename}page{page_enumeration}table{idx}.csv"
    df.to_csv(target, index=False)


def concat_csv(folder, filename: str):
    """Merge the per-table CSV files in ``folder`` into a single CSV.

    ``folder`` must expose the directory path as ``folder.name``. Only
    entries named ``<prefix>page<P>table<T>.csv`` are read; anything else
    in the directory is ignored. Files are processed in numeric
    ``(page, table)`` order.

    For every table the first data row is dropped. The first data row of
    the first non-empty table supplies the column names of the merged
    result. Empty tables and zero-byte files are skipped.

    When the merged result has at least one row it is shown with
    ``st.dataframe`` and written to ``filename`` without the index;
    otherwise nothing is displayed or written.

    Raises:
        ValueError: if the merged frame ends up with a different number of
            columns than the header row (tables of differing widths).
    """
    import re  # local import: not among this module's top-level imports

    foldername = folder.name
    # Anchored at the end so a "page" inside the prefix cannot confuse it.
    name_pattern = re.compile(r"page(-?\d+)table(-?\d+)\.csv$")

    # Sort numerically by (page, table): a plain string sort would place
    # "page10" ahead of "page2" and leave the table order undefined.
    ordered = []
    for entry in os.listdir(foldername):
        match = name_pattern.search(entry)
        if match is None:
            continue
        ordered.append((int(match.group(1)), int(match.group(2)), entry))
    ordered.sort()

    columns = None
    frames = []
    for _page, _table, entry in ordered:
        try:
            table = pd.read_csv(f"{foldername}/{entry}")
        except pd.errors.EmptyDataError:
            # Zero-byte file: there is nothing to merge from it.
            continue
        if table.empty:
            continue
        if columns is None:
            # The first row of the first usable table holds the real header.
            columns = table.iloc[0].tolist()
        frames.append(table[1:])

    if not frames:
        return

    df = pd.concat(frames)
    if not df.empty:
        df.columns = columns
        st.dataframe(df)
        df.to_csv(filename, index=False)