Raj-Master committed on
Commit
e1422df
·
1 Parent(s): 4911ff5

Create file_utils.py

Browse files
Files changed (1) hide show
  1. file_utils.py +109 -0
file_utils.py ADDED
@@ -0,0 +1,109 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import cv2
3
+ import math
4
+ import numpy as np
5
+ import pandas as pd
6
+ from pdf2image import convert_from_bytes
7
+
8
+ import streamlit as st
9
+
10
+
11
def get_img(uploaded_file):
    """Decode the contents of an uploaded file into an OpenCV image.

    The whole stream is read via ``uploaded_file.read()``, wrapped in a
    ``uint8`` NumPy buffer and handed to ``cv2.imdecode`` with flag ``1``
    (``cv2.IMREAD_COLOR``).  The value returned by ``cv2.imdecode`` is passed
    back to the caller unchanged.
    """
    raw_bytes = bytearray(uploaded_file.read())
    encoded = np.asarray(raw_bytes, dtype=np.uint8)
    return cv2.imdecode(encoded, 1)
16
+
17
+
18
def convert_pdf_to_image(filename):
    """Render a PDF into a list of images, one per page.

    ``filename`` is forwarded unchanged as the first argument of
    ``convert_from_bytes``, so despite its name it is expected to hold the
    PDF's raw bytes rather than a path.  ``500`` is forwarded as the second
    positional argument (the rendering DPI in pdf2image's signature).
    """
    return convert_from_bytes(filename, 500)
22
+
23
+
24
def filter_color(img):
    """Return a single-channel mask that is 0 on dark pixels, 255 elsewhere.

    ``img`` is converted from BGR to HSV, thresholded against a fixed
    "black-ish" HSV range, and the resulting mask is inverted.
    """
    hsv_img = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)

    # HSV bounds treated as "black": any hue, saturation <= 100, value <= 130.
    dark_lower = np.array([0, 0, 0])
    dark_upper = np.array([179, 100, 130])

    # inRange marks in-range (dark) pixels with 255; the inversion below
    # flips that so dark pixels end up 0 and everything else 255.
    dark_mask = cv2.inRange(hsv_img, dark_lower, dark_upper)
    return cv2.bitwise_not(dark_mask)
41
+
42
+
43
def plot(img, boxes):
    """Draw bounding boxes and their scores on a copy of ``img``.

    Args:
        img: Image array with at least two dimensions (height, width, and
            optionally channels).  It is never modified; drawing happens on
            a copy.
        boxes: Iterable of indexable items where elements 0-3 are the
            ``x1, y1, x2, y2`` corner coordinates and element 4 is a numeric
            score that is rendered rounded to two decimals.

    Returns:
        The annotated copy of ``img``.
    """
    FONT_SCALE = 1e-3
    THICKNESS_SCALE = 1e-3
    TEXT_Y_OFFSET_SCALE = 2.5e-2

    # Only the first two dimensions are needed; slicing (instead of unpacking
    # exactly three values) also accepts 2-D single-channel images.
    height, width = img.shape[:2]

    # Scale text size and line width with the image so annotations stay
    # legible on both small and very large pages.
    shorter_side = min(width, height)
    font_scale = shorter_side * FONT_SCALE
    thickness = math.ceil(shorter_side * THICKNESS_SCALE)
    text_y_offset = int(height * TEXT_Y_OFFSET_SCALE)
    color = (0, 0, 255)  # red in OpenCV's BGR channel order

    tmp = img.copy()
    for box in boxes:
        top_left = (int(box[0]), int(box[1]))
        bottom_right = (int(box[2]), int(box[3]))

        tmp = cv2.rectangle(tmp, top_left, bottom_right, color, thickness)

        text = str(round(float(box[4]), 2))
        # Anchor the label slightly below the box's top-left corner.
        cv2.putText(
            tmp,
            text,
            (top_left[0], top_left[1] + text_y_offset),
            cv2.FONT_HERSHEY_SIMPLEX,
            font_scale,
            color,
            thickness,
        )
    return tmp
72
+
73
+
74
def delete_file(filename):
    """Remove ``filename``; silently do nothing if it does not exist.

    The removal is attempted directly instead of checking for existence
    first, so a file that disappears between a check and the removal cannot
    cause a spurious ``FileNotFoundError``.  Other ``OSError`` subclasses
    (e.g. permission problems, or ``filename`` being a directory) still
    propagate to the caller.
    """
    try:
        os.remove(filename)
    except FileNotFoundError:
        # Already gone - that is the desired end state.
        pass
77
+
78
+
79
def save_excel_file(
    idx, df: pd.DataFrame, foldername, filename, page_enumeration: int = 0
):
    """Write one table to ``<foldername>/<filename>page<P>table<idx>.csv``.

    Despite the function's name the output is a CSV file, written without
    the DataFrame index.  ``page_enumeration`` supplies ``<P>`` and ``idx``
    supplies the table number in the file name.
    """
    target_path = f"{foldername}/{filename}page{page_enumeration}table{idx}.csv"
    df.to_csv(target_path, index=False)
86
+
87
+
88
+ def concat_csv(folder, filename: str):
89
+ df = pd.DataFrame()
90
+ foldername = folder.name
91
+ files = list(
92
+ sorted(
93
+ os.listdir(foldername), key=lambda x: x.split("page")[1].split("table")[0]
94
+ )
95
+ )
96
+ columns = []
97
+ for idx, file in enumerate(files):
98
+ tmp = pd.read_csv(f"{foldername}/{file}")
99
+ try:
100
+ if idx == 0:
101
+ columns = tmp.iloc[0]
102
+ df = pd.concat([df, tmp[1:]])
103
+ except:
104
+ continue
105
+
106
+ if not df.empty:
107
+ df.columns = columns
108
+ st.dataframe(df)
109
+ df.to_csv(filename, index=False)