mrm8488 committed on
Commit
526964f
1 Parent(s): e216422

Create new file

Browse files
Files changed (1) hide show
  1. preprocess.py +94 -0
preprocess.py ADDED
@@ -0,0 +1,94 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import copy
2
def preprocess(example):
    """Linearize one table example into a single tagged string.

    The output has the shape::

        <s>[<page_title> T </page_title> ][<section_title> S </section_title> ]
        <table> <row> CELL... </row> ... </table>\\n\\n

    (all on one line), where each CELL is::

        <c> VALUE [<col_header> H </col_header> ]...[<row_header> H </row_header> ]...</c>

    and highlighted cells use ``<highlighted_cell>`` instead of ``<c>``.

    Args:
        example: Mapping with the keys read below (presumably a ToTTo-style
            record -- confirm against the caller):
            ``"table"``: list of rows, each a list of cell mappings with
            ``"value"`` (str), ``"is_header"`` (truthy for header cells) and
            ``"column_span"`` (int);
            ``"highlighted_cells"``: iterable of ``[row, col]`` index pairs
            (lists or tuples);
            ``"table_page_title"`` / ``"table_section_title"``: strings; a
            falsy value omits the corresponding tag.

    Returns:
        The linearized table, prefixed with ``<s>`` and terminated by two
        newline characters.

    Raises:
        KeyError: If a required key is missing from ``example`` or a cell.
        TypeError: If a title or cell value is not a string.

    Note:
        Only ``column_span`` is taken into account when aligning cells with
        their column headers; row spans are not considered.
    """
    # Function-scope import keeps this block runnable on its own.
    import copy

    def _add_adjusted_col_offsets(table):
        """Add adjusted column offsets to take into account multi-column cells."""
        adjusted_table = []
        for row in table:
            real_col_index = 0
            adjusted_row = []
            for cell in row:
                # Work on a copy so the caller's table is never mutated.
                adjusted_cell = copy.deepcopy(cell)
                adjusted_cell["adjusted_col_start"] = real_col_index
                real_col_index += adjusted_cell["column_span"]
                # End offset is exclusive: start + column_span.
                adjusted_cell["adjusted_col_end"] = real_col_index
                adjusted_row.append(adjusted_cell)
            adjusted_table.append(adjusted_row)
        return adjusted_table

    def _get_heuristic_row_headers(adjusted_table, row_index, col_index):
        """Heuristic to find row headers: header cells to the left in the same row."""
        cells_to_the_left = adjusted_table[row_index][:col_index]
        return [cell for cell in cells_to_the_left if cell["is_header"]]

    def _get_heuristic_col_headers(adjusted_table, row_index, col_index):
        """Heuristic to find column headers: header cells above that overlap in columns."""
        target = adjusted_table[row_index][col_index]
        target_start = target["adjusted_col_start"]
        target_end = target["adjusted_col_end"]
        col_headers = []
        for row in adjusted_table[:row_index]:
            for cell in row:
                # Half-open interval overlap test on the adjusted offsets.
                overlaps = (cell["adjusted_col_start"] < target_end
                            and cell["adjusted_col_end"] > target_start)
                if overlaps and cell["is_header"]:
                    col_headers.append(cell)
        return col_headers

    table = example["table"]
    # Build the lookup once; tuples make the [row, col] pairs hashable and
    # give O(1) membership tests instead of a list scan per cell.
    highlighted = {tuple(index) for index in example["highlighted_cells"]}

    # Collect fragments and join once at the end instead of repeated `+=`.
    parts = ["<s>"]
    if example["table_page_title"]:
        parts.append(
            "<page_title> " + example["table_page_title"] + " </page_title> ")
    if example["table_section_title"]:
        parts.append(
            "<section_title> " + example["table_section_title"]
            + " </section_title> ")

    parts.append("<table> ")
    adjusted_table = _add_adjusted_col_offsets(table)
    for r_index, row in enumerate(table):
        parts.append("<row> ")
        for c_index, cell in enumerate(row):
            row_headers = _get_heuristic_row_headers(
                adjusted_table, r_index, c_index)
            col_headers = _get_heuristic_col_headers(
                adjusted_table, r_index, c_index)

            # Distinguish between highlighted and non-highlighted cells.
            if (r_index, c_index) in highlighted:
                start_cell_marker = "<highlighted_cell> "
                end_cell_marker = "</highlighted_cell> "
            else:
                start_cell_marker = "<c> "
                end_cell_marker = "</c> "

            # The value of the cell.
            parts.append(start_cell_marker + cell["value"] + " ")

            # All the column headers associated with this cell.
            for col_header in col_headers:
                parts.append(
                    "<col_header> " + col_header["value"] + " </col_header> ")

            # All the row headers associated with this cell.
            for row_header in row_headers:
                parts.append(
                    "<row_header> " + row_header["value"] + " </row_header> ")

            parts.append(end_cell_marker)
        parts.append("</row> ")

    parts.append("</table>")
    parts.append("\n\n")
    return "".join(parts)