Ashwin V. Mohanan committed on
Commit
9714711
·
1 Parent(s): f3e0487

Complete digitizing

Browse files
Files changed (1) hide show
  1. app/tabs/submit.py +71 -77
app/tabs/submit.py CHANGED
@@ -1,17 +1,22 @@
 
1
  import logging
2
  import os
 
3
  import time
4
 
5
- import numpy as np
6
- import pandas as pd
7
- from pathlib import Path
8
  from dawsonia import io
9
  from dawsonia import digitize
10
  from dawsonia.ml import ml
11
- import pooch
12
  import gradio as gr
13
- import yaml
14
  from gradio_modal import Modal
 
 
 
 
 
 
 
15
 
16
  logger = logging.getLogger(__name__)
17
 
@@ -39,10 +44,11 @@ if os.environ.get("GRADIO_CACHE_DIR", GRADIO_CACHE) != GRADIO_CACHE:
39
 
40
 
41
  def run_dawsonia(
42
- table_fmt_config_override, batch_image_gallery, book, progress=gr.Progress()
43
  ):
44
- if None in (batch_image_gallery, book) or len(batch_image_gallery) == 0:
45
  raise ValueError("You need to select / upload the pages to digitize")
 
46
  progress(0, desc="Dawsonia: starting")
47
 
48
  model_path = Path("data/models/dawsonia/2024-07-02")
@@ -51,7 +57,7 @@ def run_dawsonia(
51
  print("Dawsonia: digitizing", book)
52
  table_fmt = book.table_format
53
 
54
- output_path_book = output_path / book.station_name / book._name
55
  output_path_book.mkdir(exist_ok=True, parents=True)
56
  (output_path_book / "probablities").mkdir(exist_ok=True)
57
 
@@ -63,30 +69,63 @@ def run_dawsonia(
63
  for table_idx in table_fmt.preproc.idx_tables_size_verify
64
  ]
65
 
66
- for page_number in range(len(batch_image_gallery)):
 
 
67
  output_path_page = output_path_book / str(page_number)
68
- results = [
69
- digitize.digitize_page_and_write_output(
70
- book,
71
- init_data,
72
- page_number=page_number + 3,
73
- date_str="2022-02-02",
74
- model_path=model_path,
75
- model_predict=ml.model_predict,
76
- prob_thresh=0.5,
77
- output_path_page=output_path_page,
78
- output_text_fmt=True,
79
- debug=True,
80
- )
81
- ]
 
 
 
 
82
 
83
- collection = []
84
- time.sleep(1)
85
  gr.Info("Pages were succesfully digitized ✨")
86
 
87
  yield collection, gr.skip()
88
 
89
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
90
  def all_example_images() -> list[str]:
91
  """
92
  Get paths to all example images.
@@ -106,6 +145,9 @@ def get_selected_example_image(
106
  # for name, details in PIPELINES.items():
107
  name, _ext = event.value["image"]["orig_name"].split(".")
108
 
 
 
 
109
  if name in PIPELINES:
110
  book_path = pooch.retrieve(**PIPELINES[name], path=DATA_CACHE)
111
  first, last, book = io.read_book(book_path)
@@ -114,55 +156,6 @@ def get_selected_example_image(
114
  return [book.read_image(pg) for pg in range(first_page, last_page)], book
115
 
116
 
117
- table_fmt_config_override_placeholder = (
118
- """\
119
- [default]
120
- version = 0
121
-
122
- # Default values, but wrote explicitly here. See PreprocConfig class
123
- [default.preproc]
124
- table_modif = true
125
- corr_rotate = true
126
- row_idx_unit = "HOURS"
127
- idx_tables_size_verify = [0, 1]
128
-
129
- [version.0]
130
- columns = [
131
- [
132
- "term_på_baro",
133
- "barom",
134
- "torra_term",
135
- "våta_term",
136
- "moln_slag_lägre",
137
- "moln_mängd_lägre",
138
- "moln_slag_medel",
139
- "moln_slag_högre"
140
- ],
141
- [
142
- "moln_het_sol_dimma_nederbörd_total",
143
- "vind_riktning",
144
- "vind_beaufort",
145
- "vind_m_sek",
146
- "sikt",
147
- "sjögang",
148
- "maximi_term",
149
- "minimi_term",
150
- "nederbörd_mängd",
151
- "nederbörd_slag"
152
- ]
153
- ]
154
- name_idx = "tid"
155
- rows = [2, 8, 14, 19, 21]
156
- tables = [
157
- [5, 8],
158
- [5, 10],
159
- [3, 1],
160
- [4, 2],
161
- [4, 5]
162
- ]
163
- """,
164
- )
165
-
166
  with gr.Blocks() as submit:
167
  gr.Markdown("# Upload")
168
  gr.Markdown(
@@ -180,7 +173,7 @@ with gr.Blocks() as submit:
180
  label="Image to digitize",
181
  interactive=True,
182
  object_fit="scale-down",
183
- scale=10,
184
  )
185
 
186
  with gr.Column(scale=2):
@@ -225,6 +218,7 @@ with gr.Blocks() as submit:
225
  get_selected_example_image,
226
  (first_page, last_page),
227
  (batch_image_gallery, batch_book_state),
 
228
  )
229
 
230
  @batch_image_gallery.upload(
@@ -239,7 +233,7 @@ with gr.Blocks() as submit:
239
 
240
  run_button.click(
241
  fn=run_dawsonia,
242
- inputs=[table_fmt_config_override, batch_image_gallery, batch_book_state],
243
- outputs=[collection_submit_state, batch_image_gallery],
244
  )
245
  edit_table_fmt_button.click(lambda: Modal(visible=True), None, edit_table_fmt_modal)
 
1
+ import json
2
  import logging
3
  import os
4
+ from pathlib import Path
5
  import time
6
 
7
+ from PIL import Image
 
 
8
  from dawsonia import io
9
  from dawsonia import digitize
10
  from dawsonia.ml import ml
 
11
  import gradio as gr
 
12
  from gradio_modal import Modal
13
+ import numpy as np
14
+ from numpy.typing import NDArray
15
+ import pandas as pd
16
+ import pooch
17
+ import yaml
18
+
19
+ from .visualizer import Page, TableCell
20
 
21
  logger = logging.getLogger(__name__)
22
 
 
44
 
45
 
46
  def run_dawsonia(
47
+ table_fmt_config_override, first_page, last_page, book, progress=gr.Progress()
48
  ):
49
+ if book is None:
50
  raise ValueError("You need to select / upload the pages to digitize")
51
+
52
  progress(0, desc="Dawsonia: starting")
53
 
54
  model_path = Path("data/models/dawsonia/2024-07-02")
 
57
  print("Dawsonia: digitizing", book)
58
  table_fmt = book.table_format
59
 
60
+ output_path_book = output_path / book.station_name
61
  output_path_book.mkdir(exist_ok=True, parents=True)
62
  (output_path_book / "probablities").mkdir(exist_ok=True)
63
 
 
69
  for table_idx in table_fmt.preproc.idx_tables_size_verify
70
  ]
71
 
72
+ collection = []
73
+
74
+ for page_number in range(first_page, last_page):
75
  output_path_page = output_path_book / str(page_number)
76
+ gr.Info(f"Digitizing {page_number = }")
77
+
78
+ *_, stats = digitize.digitize_page_and_write_output(
79
+ book,
80
+ init_data,
81
+ page_number=page_number,
82
+ date_str=f"0000-page-{page_number}",
83
+ model_path=model_path,
84
+ model_predict=ml.model_predict,
85
+ prob_thresh=0.5,
86
+ output_path_page=output_path_page,
87
+ output_text_fmt=False,
88
+ debug=False,
89
+ )
90
+ progress_value = (page_number - first_page) / max(1, last_page - first_page)
91
+ progress(progress_value, desc=f"Dawsonia: {stats!s:.50}")
92
+
93
+ collection.append(read_page(stats, output_path_book, str(page_number)))
94
 
 
 
95
  gr.Info("Pages were succesfully digitized ✨")
96
 
97
  yield collection, gr.skip()
98
 
99
 
100
def read_page(stats: digitize.Statistics, output_path_book: Path, prefix: str):
    """Load the digitized output of one page and wrap it in a ``Page``.

    Three files below *output_path_book* are read, in this order:

    * ``<prefix>.parquet`` -- the digitized cell values,
    * ``table_meta/<prefix>.json`` -- its ``"table_positions"`` entry is
      reshaped into one 4-element row per value,
    * ``pages/<prefix>.webp`` -- opened only to obtain the image size.

    Returns ``Page(width, height, cells)``, or ``None`` when
    ``stats.tables_detected`` is not greater than zero.
    """
    # NOTE(review): the original indentation is not visible here; this assumes
    # the whole body was guarded by the ``tables_detected`` check -- confirm.
    if not (stats.tables_detected > 0):
        return None

    parquet_file = (output_path_book / prefix).with_suffix(".parquet")
    meta_file = (output_path_book / "table_meta" / prefix).with_suffix(".json")
    image_file = (output_path_book / "pages" / prefix).with_suffix(".webp")

    values_df = pd.read_parquet(parquet_file)
    table_meta = json.loads(meta_file.read_text())
    with Image.open(image_file) as im:
        width, height = im.width, im.height

    values_array = values_df.values.flatten()
    # One (x, y, w, h) row per flattened value; reshape raises if the counts
    # in the metadata and the parquet file disagree.
    positions = np.array(table_meta["table_positions"])
    bbox_array = positions.reshape(values_array.size, 4)

    cells = list(map(make_cell, values_array, bbox_array))
    return Page(width, height, cells)
120
+
121
+
122
def make_cell(value: str, bbox: NDArray[np.int64]):
    """Build a ``TableCell`` for *value* from its ``(x, y, w, h)`` box.

    The box is expanded into a closed five-point outline (the starting
    corner is repeated at the end), and the text is anchored at ``(x, y)``.
    """
    x0, y0, box_w, box_h = bbox
    x1 = x0 + box_w
    y1 = y0 + box_h
    outline = (
        (x0, y0),
        (x1, y0),
        (x1, y1),
        (x0, y1),
        (x0, y0),  # repeat the first corner to close the outline
    )
    return TableCell(outline, text_x=x0, text_y=y0, text=value)
127
+
128
+
129
  def all_example_images() -> list[str]:
130
  """
131
  Get paths to all example images.
 
145
  # for name, details in PIPELINES.items():
146
  name, _ext = event.value["image"]["orig_name"].split(".")
147
 
148
+ if (last_page - first_page) > MAX_IMAGES:
149
+ raise ValueError(f"Maximum images you can digitize is set to: {MAX_IMAGES}")
150
+
151
  if name in PIPELINES:
152
  book_path = pooch.retrieve(**PIPELINES[name], path=DATA_CACHE)
153
  first, last, book = io.read_book(book_path)
 
156
  return [book.read_image(pg) for pg in range(first_page, last_page)], book
157
 
158
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
159
  with gr.Blocks() as submit:
160
  gr.Markdown("# Upload")
161
  gr.Markdown(
 
173
  label="Image to digitize",
174
  interactive=True,
175
  object_fit="scale-down",
176
+ scale=1,
177
  )
178
 
179
  with gr.Column(scale=2):
 
218
  get_selected_example_image,
219
  (first_page, last_page),
220
  (batch_image_gallery, batch_book_state),
221
+ trigger_mode="always_last",
222
  )
223
 
224
  @batch_image_gallery.upload(
 
233
 
234
  run_button.click(
235
  fn=run_dawsonia,
236
+ inputs=(table_fmt_config_override, first_page, last_page, batch_book_state),
237
+ outputs=(collection_submit_state, batch_image_gallery),
238
  )
239
  edit_table_fmt_button.click(lambda: Modal(visible=True), None, edit_table_fmt_modal)