02alexander
committed on
Commit
•
e347567
1
Parent(s):
ad31b8f
fix bug in highlighting of entity when clicking on markdown text
Browse files
ocr.py
CHANGED
@@ -119,10 +119,11 @@ Layout Class:
|
|
119 |
|
120 |
|
121 |
class Layout:
|
122 |
-
def __init__(self, show_unknown: bool = False):
|
123 |
self.counts = {layout_type: 0 for layout_type in LayoutType}
|
124 |
self.records: dict[LayoutType, Any] = {layout_type: [] for layout_type in LayoutType}
|
125 |
self.recovery = """"""
|
|
|
126 |
self.show_unknown = show_unknown
|
127 |
|
128 |
def add(
|
@@ -145,7 +146,7 @@ class Layout:
|
|
145 |
"table": table,
|
146 |
})
|
147 |
if layout_type != LayoutType.UNKNOWN or self.show_unknown: # Discards the unknown layout types detections
|
148 |
-
path = f"recording://Image/{layout_type.type.title()}/{name.title()}"
|
149 |
self.recovery += f"\n\n## [{name.title()}]({path})\n\n" # Log Type as Heading
|
150 |
# Enhancement - Logged image for Figure type TODO(#6517)
|
151 |
if layout_type == LayoutType.TABLE:
|
@@ -153,7 +154,7 @@ class Layout:
|
|
153 |
self.recovery += table # Log details (table)
|
154 |
elif detections:
|
155 |
for index, detection in enumerate(detections):
|
156 |
-
path_text = f"recording://Image/{layout_type.type.title()}/{name.title()}/Detections/{index}"
|
157 |
self.recovery += f' [{detection["text"]}]({path_text})' # Log details (text)
|
158 |
else:
|
159 |
logging.warning(f"Invalid layout type detected: {layout_type}")
|
@@ -221,13 +222,14 @@ class Layout:
|
|
221 |
return f"Error processing the table: {str(e)}"
|
222 |
|
223 |
|
224 |
-
def process_layout_records(log_queue: SimpleQueue[Any], layout: Layout
|
225 |
paths, detections_paths = [], []
|
226 |
zoom_paths: list[rrb.Spatial2DView] = []
|
227 |
zoom_paths_figures: list[rrb.Spatial2DView] = []
|
228 |
zoom_paths_tables: list[rrb.Spatial2DView] = []
|
229 |
zoom_paths_texts: list[rrb.Spatial2DView] = []
|
230 |
|
|
|
231 |
for layout_type in LayoutType:
|
232 |
for record in layout.records[layout_type]:
|
233 |
record_name = record["name"].title()
|
@@ -327,11 +329,11 @@ def update_zoom_paths(
|
|
327 |
|
328 |
def generate_blueprint(
|
329 |
layouts: list[Layout],
|
330 |
-
page_paths: list[str],
|
331 |
processed_layouts: list[LayoutStructure],
|
332 |
) -> rrb.Blueprint:
|
333 |
page_tabs = []
|
334 |
-
for layout,
|
|
|
335 |
paths, detections_paths, zoom_paths_figures, zoom_paths_tables, zoom_paths_texts = processed_layout
|
336 |
|
337 |
section_tabs = []
|
@@ -399,28 +401,28 @@ def detect_and_log_layouts(log_queue: SimpleQueue[Any], file_path: str, start_pa
|
|
399 |
|
400 |
# Extracte the layout from each image
|
401 |
layouts: list[Layout] = []
|
402 |
-
|
403 |
processed_layouts: list[LayoutStructure] = []
|
404 |
-
for i, (image,
|
405 |
-
layouts.append(detect_and_log_layout(log_queue, image,
|
406 |
|
407 |
# Generate and send a blueprint based on the detected layouts
|
408 |
processed_layouts.append(
|
409 |
process_layout_records(
|
410 |
log_queue,
|
411 |
layouts[-1],
|
412 |
-
page_path,
|
413 |
)
|
414 |
)
|
415 |
logging.info("Sending blueprint...")
|
416 |
-
blueprint = generate_blueprint(layouts,
|
417 |
log_queue.put(["blueprint", blueprint])
|
418 |
logging.info("Blueprint sent...")
|
419 |
|
420 |
|
421 |
-
def detect_and_log_layout(log_queue: SimpleQueue, coloured_image: npt.NDArray[np.uint8],
|
422 |
# Layout Object - This will contain the detected layouts and their detections
|
423 |
-
layout = Layout()
|
|
|
424 |
|
425 |
# Log Image and add Annotation Context
|
426 |
log_queue.put([
|
|
|
119 |
|
120 |
|
121 |
class Layout:
|
122 |
+
def __init__(self, page_number: int, show_unknown: bool = False):
|
123 |
self.counts = {layout_type: 0 for layout_type in LayoutType}
|
124 |
self.records: dict[LayoutType, Any] = {layout_type: [] for layout_type in LayoutType}
|
125 |
self.recovery = """"""
|
126 |
+
self.page_number = page_number
|
127 |
self.show_unknown = show_unknown
|
128 |
|
129 |
def add(
|
|
|
146 |
"table": table,
|
147 |
})
|
148 |
if layout_type != LayoutType.UNKNOWN or self.show_unknown: # Discards the unknown layout types detections
|
149 |
+
path = f"recording://page_{self.page_number}/Image/{layout_type.type.title()}/{name.title()}"
|
150 |
self.recovery += f"\n\n## [{name.title()}]({path})\n\n" # Log Type as Heading
|
151 |
# Enhancement - Logged image for Figure type TODO(#6517)
|
152 |
if layout_type == LayoutType.TABLE:
|
|
|
154 |
self.recovery += table # Log details (table)
|
155 |
elif detections:
|
156 |
for index, detection in enumerate(detections):
|
157 |
+
path_text = f"recording://page_{self.page_number}/Image/{layout_type.type.title()}/{name.title()}/Detections/{index}"
|
158 |
self.recovery += f' [{detection["text"]}]({path_text})' # Log details (text)
|
159 |
else:
|
160 |
logging.warning(f"Invalid layout type detected: {layout_type}")
|
|
|
222 |
return f"Error processing the table: {str(e)}"
|
223 |
|
224 |
|
225 |
+
def process_layout_records(log_queue: SimpleQueue[Any], layout: Layout) -> LayoutStructure:
|
226 |
paths, detections_paths = [], []
|
227 |
zoom_paths: list[rrb.Spatial2DView] = []
|
228 |
zoom_paths_figures: list[rrb.Spatial2DView] = []
|
229 |
zoom_paths_tables: list[rrb.Spatial2DView] = []
|
230 |
zoom_paths_texts: list[rrb.Spatial2DView] = []
|
231 |
|
232 |
+
page_path = f'page_{layout.page_number}'
|
233 |
for layout_type in LayoutType:
|
234 |
for record in layout.records[layout_type]:
|
235 |
record_name = record["name"].title()
|
|
|
329 |
|
330 |
def generate_blueprint(
|
331 |
layouts: list[Layout],
|
|
|
332 |
processed_layouts: list[LayoutStructure],
|
333 |
) -> rrb.Blueprint:
|
334 |
page_tabs = []
|
335 |
+
for layout, processed_layout in zip(layouts, processed_layouts):
|
336 |
+
page_path = f'page_{layout.page_number}'
|
337 |
paths, detections_paths, zoom_paths_figures, zoom_paths_tables, zoom_paths_texts = processed_layout
|
338 |
|
339 |
section_tabs = []
|
|
|
401 |
|
402 |
# Extracte the layout from each image
|
403 |
layouts: list[Layout] = []
|
404 |
+
page_numbers = [i + start_page for i in range(len(images))]
|
405 |
processed_layouts: list[LayoutStructure] = []
|
406 |
+
for i, (image, page_number) in enumerate(zip(images, page_numbers)):
|
407 |
+
layouts.append(detect_and_log_layout(log_queue, image, page_number))
|
408 |
|
409 |
# Generate and send a blueprint based on the detected layouts
|
410 |
processed_layouts.append(
|
411 |
process_layout_records(
|
412 |
log_queue,
|
413 |
layouts[-1],
|
|
|
414 |
)
|
415 |
)
|
416 |
logging.info("Sending blueprint...")
|
417 |
+
blueprint = generate_blueprint(layouts, processed_layouts)
|
418 |
log_queue.put(["blueprint", blueprint])
|
419 |
logging.info("Blueprint sent...")
|
420 |
|
421 |
|
422 |
+
def detect_and_log_layout(log_queue: SimpleQueue, coloured_image: npt.NDArray[np.uint8], page_number: int) -> Layout:
|
423 |
# Layout Object - This will contain the detected layouts and their detections
|
424 |
+
layout = Layout(page_number)
|
425 |
+
page_path = f'page_{page_number}'
|
426 |
|
427 |
# Log Image and add Annotation Context
|
428 |
log_queue.put([
|