Spaces:
Runtime error
Runtime error
Raj-Master
committed on
Commit
·
14e58a7
1
Parent(s):
4635598
Create process.py
Browse files- process.py +226 -0
process.py
ADDED
@@ -0,0 +1,226 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from paddleocr import PaddleOCR
|
2 |
+
import numpy as np
|
3 |
+
import pandas as pd
|
4 |
+
|
5 |
+
# Module-level OCR engine: text-angle classification enabled, Chinese ("ch")
# recognition model.  Instantiated once at import time because model loading
# is expensive; all functions below share this instance.
ocr = PaddleOCR(use_angle_cls=True, lang="ch")
|
6 |
+
|
7 |
+
|
8 |
+
def filter_columns(columns: np.ndarray):
    """Merge horizontally overlapping column boxes.

    ``columns`` is an array of ``[x1, y1, x2, y2]`` boxes ordered left to
    right.  Two adjacent boxes are merged when their horizontal overlap
    (``col.x2 - next.x1``) exceeds half of their average width.  Merging
    extends the left box to the right box's right edge and widens its
    vertical span to cover both.

    Returns the (possibly shortened) array of boxes.

    Note: the original implementation deleted elements while iterating the
    array with ``enumerate``, which left the loop walking a stale copy and
    could skip merges or index out of range; a ``while`` loop over the live
    array fixes that.
    """
    idx = 0
    while idx < len(columns) - 1:
        col = columns[idx]
        nxt = columns[idx + 1]
        # Average width of the two candidate boxes.
        threshold = ((col[2] - col[0]) + (nxt[2] - nxt[0])) / 2
        if (col[2] - nxt[0]) > threshold * 0.5:
            # Overlap is significant: absorb the next box into this one.
            col[1] = min(col[1], nxt[1])
            col[2] = nxt[2]
            col[3] = max(col[3], nxt[3])
            columns = np.delete(columns, idx + 1, 0)
            # Stay on the same index so the grown box can absorb further
            # neighbours.
        else:
            idx += 1
    return columns
|
19 |
+
|
20 |
+
|
21 |
+
def process_text(row):
    """Collapse the OCR lines of one cell into ``[y_span, text]``.

    Each entry of ``row`` is a PaddleOCR line ``[points, (text, score)]``.
    The result's first element is a length-2 numpy array holding the
    minimum top-y and maximum bottom-y over all lines; the second element
    is the line texts joined with single spaces.
    """
    span = np.array([None, None])
    for line in row:
        top = line[0][0][1]
        bottom = line[0][2][1]
        if span[0] is None or top < span[0]:
            span[0] = top
        if span[1] is None or bottom > span[1]:
            span[1] = bottom
    joined = " ".join(f"{line[1][0]}" for line in row).strip()
    return [span, joined]
|
34 |
+
|
35 |
+
|
36 |
+
def extract_text_of_col(col_img):
    """Run OCR on one column image and measure the average line height.

    Returns a tuple ``(result, avg_height, ocr_res)``:
      * ``result``     — raw PaddleOCR output (list of pages, each a list of
                         ``[points, (text, score)]`` lines),
      * ``avg_height`` — mean of the per-page average text-line heights,
                         used downstream as the row-intersection threshold,
      * ``ocr_res``    — ``{'bbox': [x1, y1, x2, y2], 'text': ...}`` dicts
                         for the first page only.

    Fixes over the original: the malformed docstring opener, a leftover
    debug ``print``, a crash when PaddleOCR reports a page as ``None``
    (no detections), and a ZeroDivisionError when ``result`` is empty.
    """
    result = ocr.ocr(col_img, cls=False)
    # Newer PaddleOCR versions return None for a page with no detections;
    # normalize to empty lists so the loops below are safe.
    pages = [page if page is not None else [] for page in result]

    ocr_res = []
    first_page = pages[0] if pages else []
    for ps, (text, score) in first_page:
        xs = [p[0] for p in ps]
        ys = [p[1] for p in ps]
        ocr_res.append({
            'bbox': [min(xs), min(ys), max(xs), max(ys)],
            'text': text,
        })

    # Average line height per page, then average across pages.
    threshold = 0
    for page in pages:
        # Height of a line = bottom-left y minus top-left y.
        heights = [line[0][2][1] - line[0][0][1] for line in page]
        if heights:
            threshold += sum(heights) / len(heights)
    avg_height = threshold / len(pages) if pages else 0
    return result, avg_height, ocr_res
|
65 |
+
|
66 |
+
|
67 |
+
def prepare_cols(result, threshold):
    """Group the OCR lines of each column into rows by vertical overlap.

    ** columns are separated **
    * Each extracted line is appended to the current row when its vertical
      span intersects the row's last line by more than ``threshold``
      (e.g. half the average line height); otherwise the row is flushed
      via ``process_text`` and a new row starts.
    * Returns the flat list of processed ``[y_span, text]`` rows for all
      pages in ``result``.
    """
    col = []
    for idx in range(len(result)):
        row = []
        for i, line in enumerate(result[idx]):
            if i == 0:
                # First line always starts the first row.
                row.append(line)
                if i == len(result[idx]) - 1:
                    # Single-line column: flush immediately.
                    col.append(process_text(row))
                continue
            # The five branches below cover the relative placements of the
            # candidate line's y-span against the row's last line
            # (shifted down, shifted up, containing, contained, identical);
            # each also requires actual overlap deeper than `threshold`.
            # NOTE(review): order matters — conditions overlap at equality.
            if (
                line[0][0][1] >= row[-1][0][0][1] and line[0][2][1] >= row[-1][0][2][1]
            ) and (
                line[0][2][1] > row[-1][0][0][1]
                and line[0][0][1] < row[-1][0][2][1]
                and (abs(line[0][0][1] - row[-1][0][2][1]) > threshold)
            ):
                row.append(line)
            elif (
                line[0][0][1] <= row[-1][0][0][1] and line[0][2][1] <= row[-1][0][2][1]
            ) and (
                line[0][2][1] > row[-1][0][0][1]
                and line[0][0][1] < row[-1][0][2][1]
                and (abs(line[0][2][1] - row[-1][0][0][1]) > threshold)
            ):
                row.append(line)
            elif (
                line[0][0][1] <= row[-1][0][0][1] and line[0][2][1] >= row[-1][0][2][1]
            ) and (
                line[0][2][1] > row[-1][0][0][1]
                and line[0][0][1] < row[-1][0][2][1]
                and (abs(row[-1][0][2][1] - row[-1][0][0][1]) > threshold)
            ):
                row.append(line)
            elif (
                line[0][0][1] >= row[-1][0][0][1] and line[0][2][1] <= row[-1][0][2][1]
            ) and (
                line[0][2][1] > row[-1][0][0][1]
                and line[0][0][1] < row[-1][0][2][1]
                and (abs(line[0][0][1] - line[0][2][1]) > threshold)
            ):
                row.append(line)
            elif (
                line[0][0][1] == row[-1][0][0][1] and line[0][2][1] == row[-1][0][2][1]
            ) and (
                line[0][2][1] > row[-1][0][0][1]
                and line[0][0][1] < row[-1][0][2][1]
                and (abs(line[0][2][1] - row[-1][0][0][1]) > threshold)
            ):
                row.append(line)
            else:
                # No sufficient overlap: close the current row and start a
                # new one with this line.
                col.append(process_text(row))
                row = [line]
            if i == len(result[idx]) - 1:
                # Flush the (possibly just-started) final row of the column.
                col.append(process_text(row))
    return col
|
129 |
+
|
130 |
+
|
131 |
+
def prepare_coordinates(cols):
    """Build the empty row grid keyed by the y-span of each table row.

    * Finds the column with the maximum number of rows (assumed to define
      the full set of table rows).
    * Maps each of that column's row coordinates (as a tuple) to an
      object-dtype numpy array of ``np.nan`` with one slot per column.

    Fix over the original: the original filled the dict with ONE shared
    array object, so every row aliased the same storage; each key now gets
    its own freshly allocated array.
    """
    max_col = max(cols, key=len)
    coor_dict = {}
    for k in max_col:
        # One independent placeholder array per row.
        placeholder = np.empty(len(cols), dtype=object)
        placeholder.fill(np.nan)
        coor_dict[tuple(k[0])] = placeholder
    return coor_dict
|
142 |
+
|
143 |
+
|
144 |
+
def process_cols(cols, threshold):
    # Build the empty row grid keyed by the y-span of the longest column.
    coor_dict = prepare_coordinates(cols)
    """
    * loop over each element inside each column and find the right place for it inside the dataframe by using the coordinates intersection with respect to the average length of the row
    * the intersection is True if the intersected part is bigger than the threshold number (ex: half of the average length of the row)
    """
    for idx, col in enumerate(cols):
        for element in col:
            # element is [y_span, text]; coor is (top, bottom) of a grid row.
            for coor, row in coor_dict.items():
                # The four branches mirror prepare_cols: row shifted down,
                # shifted up, contained in element, containing element —
                # each also requiring overlap deeper than `threshold`.
                # NOTE(review): values are replaced during .items()
                # iteration; legal in Python since no keys are added/removed.
                if (coor[0] >= element[0][0] and coor[1] >= element[0][1]) and (
                    (coor[1] > element[0][0])
                    and (coor[0] < element[0][1])
                    and (abs(coor[0] - element[0][1]) > threshold)
                ):
                    new = row.copy()
                    new[idx] = element[1]
                    coor_dict[coor] = new
                elif (coor[0] <= element[0][0] and coor[1] <= element[0][1]) and (
                    (coor[1] > element[0][0])
                    and (coor[0] < element[0][1])
                    and (abs(coor[1] - element[0][0]) > threshold)
                ):
                    new = row.copy()
                    new[idx] = element[1]
                    coor_dict[coor] = new
                elif (coor[0] >= element[0][0] and coor[1] <= element[0][1]) and (
                    (coor[1] > element[0][0])
                    and (coor[0] < element[0][1])
                    and (abs(coor[1] - coor[0]) > threshold)
                ):
                    new = row.copy()
                    new[idx] = element[1]
                    coor_dict[coor] = new
                elif (coor[0] <= element[0][0] and coor[1] >= element[0][1]) and (
                    (coor[1] > element[0][0])
                    and (coor[0] < element[0][1])
                    and (abs(element[0][1] - element[0][0]) > threshold)
                ):
                    new = row.copy()
                    new[idx] = element[1]
                    coor_dict[coor] = new
    # The grid rows, in insertion (top-to-bottom) order.
    data = [row for row in coor_dict.values()]
    return data
|
187 |
+
|
188 |
+
|
189 |
+
def valid_row(row):
    """Return True when the row carries data in the first or any of the
    last three cells (i.e. it is not a pure continuation fragment)."""
    return any(row[i] is not np.nan for i in (0, -1, -2, -3))
|
196 |
+
|
197 |
+
|
198 |
+
def finalize_data(data: list, page_enumeration: int):
    """Clean the raw row grid and return it as a DataFrame.

    * Fills an empty first cell (the date column) from the previous row.
    * Merges rows that fail ``valid_row`` into the previous row (treating
      them as wrapped description text), dropping them from the list.
    * Prepends a "page" column holding ``page_enumeration``.
    """
    idx = 0
    # Manual while-loop because rows are popped in place and idx must
    # sometimes rewind.
    while idx <= len(data) - 1:
        row = data[idx]
        if valid_row(row) and row[0] is np.nan:
            # * add the date to the valid row if it's empty
            try:
                # NOTE(review): at idx == 0 this reads data[-1] (the last
                # row) rather than raising — presumably unintended; confirm.
                row[0] = data[idx - 1][0]
                data[idx] = row
            except:
                # NOTE(review): bare except silently drops the row; a
                # narrower exception type would be safer.
                data.pop(idx)
                idx = (idx - 1) if idx > 0 else idx
                continue
        if not valid_row(row):
            if idx == 0:
                # Nothing above to merge into; discard the fragment.
                data.pop(idx)
                continue
            for i, col in enumerate(row):
                # * merge description to the previous row if the current row is not valid
                if (col is not None) and (col is not np.nan):
                    data[idx - 1][i] = str(data[idx - 1][i]) + f" {col}"
            data.pop(idx)
            idx -= 1
            continue
        idx += 1
    # First entry is the literal string "page" — presumably the first data
    # row is a header row; TODO confirm against the caller.
    page_idx = ["page"] + [page_enumeration for i in range(len(data) - 1)]
    data: pd.DataFrame = pd.DataFrame(data)
    data.insert(0, "page", page_idx)
    return data
|