refactored the pipeline
- app.py +4 -4
- helper/gradio_config.py +3 -0
- requirements.txt +2 -0
- src/htr_pipeline/gradio_backend.py +28 -7
- src/htr_pipeline/pipeline.py +10 -6
- src/htr_pipeline/utils/helper.py +15 -0
- src/htr_pipeline/utils/parser_xml.py +0 -60
- src/htr_pipeline/utils/pipeline_inferencer.py +107 -0
- src/htr_pipeline/utils/process_xml.py +0 -167
- src/htr_pipeline/utils/visualize_xml.py +68 -0
- src/htr_pipeline/utils/xml_helper.py +55 -0
- tabs/htr_tool.py +5 -29
- tabs/stepwise_htr_tool.py +44 -58
app.py
CHANGED

@@ -1,6 +1,6 @@
 import gradio as gr
 
-from helper.gradio_config import css, js, theme
+from helper.gradio_config import css, theme
 from helper.text.text_about import TextAbout
 from helper.text.text_app import TextApp
 from helper.text.text_howto import TextHowTo
@@ -21,7 +21,7 @@ with gr.Blocks(title="HTR Riksarkivet", theme=theme, css=css) as demo:
     with gr.Tab("How to use"):
         with gr.Tabs():
             with gr.Tab("HTR Tool"):
-                with gr.Row():
+                with gr.Row(equal_height=False):
                     with gr.Column():
                         gr.Markdown(TextHowTo.htr_tool)
                     with gr.Column():
@@ -33,7 +33,7 @@ with gr.Blocks(title="HTR Riksarkivet", theme=theme, css=css) as demo:
                         gr.Markdown(TextHowTo.reach_out)
 
             with gr.Tab("Stepwise HTR Tool"):
-                with gr.Row():
+                with gr.Row(equal_height=False):
                     gr.Markdown(TextHowTo.stepwise_htr_tool)
                 with gr.Row():
                     gr.Markdown(TextHowTo.stepwise_htr_tool_tab_intro)
@@ -115,7 +115,7 @@ print(job.result())
         with gr.Column():
             gr.Markdown(TextRoadmap.discussion)
 
-    demo.load(None, None, None, _js=js)
+    # demo.load(None, None, None, _js=js)
 
 
 demo.queue(concurrency_count=1, max_size=1)
helper/gradio_config.py
CHANGED

@@ -21,6 +21,9 @@ class GradioConfig:
     #image_upload [data-testid="image"], #image_upload [data-testid="image"] > div{min-height: 450px}
     #gallery {height: 400px}
     .fixed-height.svelte-g4rw9.svelte-g4rw9 {min-height: 400px;}
+
+    #gallery_lines > div.preview.svelte-1b19cri > div.thumbnails.scroll-hide.svelte-1b19cri {display: none;}
+
     """
 
     def generate_tooltip_css(self):
requirements.txt
CHANGED

@@ -14,6 +14,8 @@ pillow==9.5.0
 
 
 
+
+
 # make install_openmmlab (they are excuted in dockerfile)
 # !pip install -U openmim
 # !mim install mmengine
src/htr_pipeline/gradio_backend.py
CHANGED

@@ -5,6 +5,7 @@ import pandas as pd
 
 from src.htr_pipeline.inferencer import Inferencer, InferencerInterface
 from src.htr_pipeline.pipeline import Pipeline, PipelineInterface
+from src.htr_pipeline.utils.helper import gradio_info
 
 
 class SingletonModelLoader:
@@ -28,6 +29,7 @@ class FastTrack:
         self.pipeline: PipelineInterface = model_loader.pipeline
 
     def segment_to_xml(self, image, radio_button_choices):
+        gr.Info("Running HTR-pipeline")
         xml_xml = "page_xml.xml"
         xml_txt = "page_txt.txt"
 
@@ -40,6 +42,11 @@ class FastTrack:
             f.write(rendered_xml)
 
         xml_img = self.visualize_xml_and_return_txt(image, xml_txt)
+        returned_file_extension = self.file_extenstion_to_return(radio_button_choices, xml_xml, xml_txt)
+
+        return xml_img, returned_file_extension, gr.update(visible=True)
+
+    def file_extenstion_to_return(self, radio_button_choices, xml_xml, xml_txt):
         if len(radio_button_choices) < 2:
             if radio_button_choices[0] == "Txt":
                 returned_file_extension = xml_txt
@@ -47,8 +54,7 @@
                 returned_file_extension = xml_xml
         else:
             returned_file_extension = [xml_txt, xml_xml]
-
-        return xml_img, returned_file_extension, gr.update(visible=True)
+        return returned_file_extension
 
     def segment_to_xml_api(self, image):
         rendered_xml = self.pipeline.running_htr_pipeline(image)
@@ -70,12 +76,14 @@ class CustomTrack:
     def __init__(self, model_loader):
         self.inferencer: InferencerInterface = model_loader.inferencer
 
+    @gradio_info("Running Segment Region")
     def region_segment(self, image, pred_score_threshold, containments_treshold):
         predicted_regions, regions_cropped_ordered, _, _ = self.inferencer.predict_regions(
             image, pred_score_threshold, containments_treshold
         )
         return predicted_regions, regions_cropped_ordered, gr.update(visible=False), gr.update(visible=True)
 
+    @gradio_info("Running Segment Line")
     def line_segment(self, image, pred_score_threshold, containments_threshold):
         predicted_lines, lines_cropped_ordered, _ = self.inferencer.predict_lines(
             image, pred_score_threshold, containments_threshold
@@ -93,22 +101,35 @@ class CustomTrack:
         )
 
     def transcribe_text(self, df, images):
+        gr.Info("Running Transcribe Lines")
         transcription_temp_list_with_score = []
         mapping_dict = {}
 
+        total_images = len(images)
+        current_index = 0
+
+        bool_to_show_placeholder = gr.update(visible=True)
+        bool_to_show_control_results_transcribe = gr.update(visible=False)
+
         for image in images:
+            current_index += 1
+
+            if current_index == total_images:
+                bool_to_show_control_results_transcribe = gr.update(visible=True)
+                bool_to_show_placeholder = gr.update(visible=False)
+
             transcribed_text, prediction_score_from_htr = self.inferencer.transcribe(image)
             transcription_temp_list_with_score.append((transcribed_text, prediction_score_from_htr))
 
             df_trans_explore = pd.DataFrame(
-                transcription_temp_list_with_score, columns=["Transcribed text", "
+                transcription_temp_list_with_score, columns=["Transcribed text", "Pred score"]
             )
 
             mapping_dict[transcribed_text] = image
 
-            yield df_trans_explore[
-                ["Transcribed text"]
-            ], df_trans_explore, mapping_dict
+            yield df_trans_explore[
+                ["Transcribed text"]
+            ], df_trans_explore, mapping_dict, bool_to_show_control_results_transcribe, bool_to_show_placeholder
 
     def get_select_index_image(self, images_from_gallery, evt: gr.SelectData):
         return images_from_gallery[evt.index]["name"]
@@ -120,7 +141,7 @@ class CustomTrack:
         new_first = [sorted_image]
         new_list = [img for txt, img in mapping_dict.items() if txt != key_text]
         new_first.extend(new_list)
-        return new_first
+        return new_first, key_text
 
     def download_df_to_txt(self, transcribed_df):
         text_in_list = transcribed_df["Transcribed text"].tolist()
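
Worth noting in the diff above: transcribe_text is a generator, so Gradio streams partial results. Each yield pushes the dataframe built so far to the UI while later lines are still being transcribed, and the two visibility updates only flip to their final state on the last iteration. A minimal, self-contained sketch of the same pattern (the stub transcriber and the line list are illustrative, not code from this repo):

# Sketch of the streaming pattern used by CustomTrack.transcribe_text.
import pandas as pd

def transcribe_stream(lines, transcribe):
    rows = []
    show_results = False
    for i, line in enumerate(lines, start=1):
        if i == len(lines):
            show_results = True  # flips only once the final line is reached
        text, score = transcribe(line)
        rows.append((text, score))
        # each yield hands the UI a partial dataframe to render immediately
        yield pd.DataFrame(rows, columns=["Transcribed text", "Pred score"]), show_results

for df, done in transcribe_stream(["line a", "line b"], lambda x: (x.upper(), 0.9)):
    print(len(df), done)  # prints "1 False", then "2 True"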
src/htr_pipeline/pipeline.py
CHANGED

@@ -6,15 +6,18 @@ import numpy as np
 from src.htr_pipeline.inferencer import Inferencer
 from src.htr_pipeline.utils.helper import timer_func
 from src.htr_pipeline.utils.parser_xml import XmlParser
+from src.htr_pipeline.utils.pipeline_inferencer import PipelineInferencer, XMLHelper
 from src.htr_pipeline.utils.preprocess_img import Preprocess
-from src.htr_pipeline.utils.process_xml import XMLHelper
+from src.htr_pipeline.utils.process_segmask import SegMaskHelper
+from src.htr_pipeline.utils.visualize_xml import XmlViz
+from src.htr_pipeline.utils.xml_helper import XMLHelper
 
 
 class Pipeline:
     def __init__(self, inferencer: Inferencer) -> None:
         self.inferencer = inferencer
-        self.xml = XMLHelper()
         self.preprocess_img = Preprocess()
+        self.pipeline_inferencer = PipelineInferencer(SegMaskHelper(), XMLHelper())
 
     @timer_func
     def running_htr_pipeline(
@@ -27,7 +30,7 @@ class Pipeline:
         input_image = self.preprocess_img.binarize_img(input_image)
         image = mmcv.imread(input_image)
 
-        rendered_xml = self.xml.image_to_page_xml(
+        rendered_xml = self.pipeline_inferencer.image_to_page_xml(
             image, pred_score_threshold_regions, pred_score_threshold_lines, containments_threshold, self.inferencer
         )
 
@@ -35,14 +38,15 @@ class Pipeline:
 
     @timer_func
     def visualize_xml(self, input_image: np.ndarray) -> np.ndarray:
-        xml_visualizer = XmlParser()
+        xml_viz = XmlViz()
         bin_input_image = self.preprocess_img.binarize_img(input_image)
-        xml_image = xml_visualizer.visualize_xml(bin_input_image)
+        xml_image = xml_viz.visualize_xml(bin_input_image)
         return xml_image
 
     @timer_func
     def parse_xml_to_txt(self) -> None:
-        XmlParser().xml_to_txt()
+        xml_visualizer_and_parser = XmlParser()
+        xml_visualizer_and_parser.xml_to_txt()
 
 
 class PipelineInterface(Protocol):
src/htr_pipeline/utils/helper.py
CHANGED

@@ -1,7 +1,9 @@
 import functools
 import threading
 import time
+from functools import wraps
 
+import gradio as gr
 import tqdm
 
 
@@ -75,6 +77,19 @@ def another_long_running_function(*args, **kwargs):
     return "success"
 
 
+# Decorator for logging
+def gradio_info(message):
+    def decorator(func):
+        @wraps(func)
+        def wrapper(*args, **kwargs):
+            gr.Info(message)
+            return func(*args, **kwargs)
+
+        return wrapper
+
+    return decorator
+
+
 if __name__ == "__main__":
     # Basic example
     retval = provide_progress_bar(long_running_function, estimated_time=5)
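
The new gradio_info decorator simply fires a toast via gr.Info(message) before delegating to the wrapped function, which is how region_segment and line_segment in gradio_backend.py announce themselves. A usage sketch, assuming a Gradio version that ships gr.Info (3.44 or later); demo_step is an illustrative stand-in, not a function from this repo:

import gradio as gr

from src.htr_pipeline.utils.helper import gradio_info

@gradio_info("Running Segment Region")
def demo_step(image):
    # gr.Info("Running Segment Region") has already popped by the time this body runs
    return image

with gr.Blocks() as demo:
    inp = gr.Image()
    out = gr.Image()
    inp.change(demo_step, inp, out)  # the toast shows each time the event fires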
src/htr_pipeline/utils/parser_xml.py
CHANGED

@@ -1,10 +1,5 @@
-import math
-import os
-import random
 import xml.etree.ElementTree as ET
 
-from PIL import Image, ImageDraw, ImageFont
-
 
 class XmlParser:
     def __init__(self, page_xml="./page_xml.xml"):
@@ -12,61 +7,6 @@ class XmlParser:
         self.root = self.tree.getroot()
         self.namespace = "{http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15}"
 
-    def visualize_xml(
-        self,
-        background_image,
-        font_size=9,
-        text_offset=10,
-        font_path_tff="./src/htr_pipeline/utils/templates/arial.ttf",
-    ):
-        image = Image.fromarray(background_image).convert("RGBA")
-        image_width = int(self.root.find(f"{self.namespace}Page").attrib["imageWidth"])
-        image_height = int(self.root.find(f"{self.namespace}Page").attrib["imageHeight"])
-
-        text_offset = -text_offset
-        base_font_size = font_size
-        font_path = font_path_tff
-
-        max_bbox_width = 0  # Initialize maximum bounding box width
-
-        for textregion in self.root.findall(f".//{self.namespace}TextRegion"):
-            coords = textregion.find(f"{self.namespace}Coords").attrib["points"].split()
-            points = [tuple(map(int, point.split(","))) for point in coords]
-            x_coords, y_coords = zip(*points)
-            min_x, max_x = min(x_coords), max(x_coords)
-            bbox_width = max_x - min_x  # Width of the current bounding box
-            max_bbox_width = max(max_bbox_width, bbox_width)  # Update maximum bounding box width
-
-        scaling_factor = max_bbox_width / 400.0  # Use maximum bounding box width for scaling
-        font_size_scaled = int(base_font_size * scaling_factor)
-        font = ImageFont.truetype(font_path, font_size_scaled)
-
-        for textregion in self.root.findall(f".//{self.namespace}TextRegion"):
-            fill_color = (random.randint(0, 255), random.randint(0, 255), random.randint(0, 255), 100)
-            for textline in textregion.findall(f".//{self.namespace}TextLine"):
-                coords = textline.find(f"{self.namespace}Coords").attrib["points"].split()
-                points = [tuple(map(int, point.split(","))) for point in coords]
-
-                poly_image = Image.new("RGBA", image.size)
-                poly_draw = ImageDraw.Draw(poly_image)
-                poly_draw.polygon(points, fill=fill_color)
-
-                text = textline.find(f"{self.namespace}TextEquiv").find(f"{self.namespace}Unicode").text
-
-                x_coords, y_coords = zip(*points)
-                min_x, max_x = min(x_coords), max(x_coords)
-                min_y = min(y_coords)
-                text_width, text_height = poly_draw.textsize(text, font=font)  # Get text size
-                text_position = (
-                    (min_x + max_x) // 2 - text_width // 2,
-                    min_y + text_offset,
-                )  # Center text horizontally
-
-                poly_draw.text(text_position, text, fill=(0, 0, 0), font=font)
-                image = Image.alpha_composite(image, poly_image)
-
-        return image
-
     def xml_to_txt(self, output_file="page_txt.txt"):
         with open(output_file, "w", encoding="utf-8") as f:
             for textregion in self.root.findall(f".//{self.namespace}TextRegion"):
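
After this commit XmlParser is reduced to flattening PAGE XML into plain text, built on namespace-qualified ElementTree lookups. A self-contained example of that traversal; the inline document is invented for the demo:

import xml.etree.ElementTree as ET

ns = "{http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15}"
doc = (
    '<PcGts xmlns="http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15">'
    "<Page><TextRegion><TextLine><TextEquiv><Unicode>hello world</Unicode>"
    "</TextEquiv></TextLine></TextRegion></Page></PcGts>"
)

root = ET.fromstring(doc)
for textregion in root.findall(f".//{ns}TextRegion"):
    for textline in textregion.findall(f".//{ns}TextLine"):
        # the same lookup chain XmlParser.xml_to_txt relies on
        print(textline.find(f"{ns}TextEquiv").find(f"{ns}Unicode").text)  # hello world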
src/htr_pipeline/utils/pipeline_inferencer.py
ADDED

@@ -0,0 +1,107 @@
+from tqdm import tqdm
+
+from src.htr_pipeline.utils.process_segmask import SegMaskHelper
+from src.htr_pipeline.utils.xml_helper import XMLHelper
+
+
+class PipelineInferencer:
+    def __init__(self, process_seg_mask: SegMaskHelper, xml_helper: XMLHelper):
+        self.process_seg_mask = process_seg_mask
+        self.xml_helper = xml_helper
+
+    def image_to_page_xml(
+        self, image, pred_score_threshold_regions, pred_score_threshold_lines, containments_threshold, inferencer
+    ):
+        template_data = self.xml_helper.prepare_template_data(self.xml_helper.xml_file_name, image)
+        template_data["textRegions"] = self._process_regions(
+            image, inferencer, pred_score_threshold_regions, pred_score_threshold_lines, containments_threshold
+        )
+
+        print(template_data)
+        return self.xml_helper.render(template_data)
+
+    def _process_regions(
+        self,
+        image,
+        inferencer,
+        pred_score_threshold_regions,
+        pred_score_threshold_lines,
+        containments_threshold,
+        htr_threshold=0.7,
+    ):
+        _, regions_cropped_ordered, reg_polygons_ordered, reg_masks_ordered = inferencer.predict_regions(
+            image,
+            pred_score_threshold=pred_score_threshold_regions,
+            containments_threshold=containments_threshold,
+            visualize=False,
+        )
+
+        region_data_list = []
+        for i, data in tqdm(enumerate(zip(regions_cropped_ordered, reg_polygons_ordered, reg_masks_ordered))):
+            region_data = self._create_region_data(
+                data, i, inferencer, pred_score_threshold_lines, containments_threshold, htr_threshold
+            )
+            if region_data:
+                region_data_list.append(region_data)
+
+        return region_data_list
+
+    def _create_region_data(
+        self, data, index, inferencer, pred_score_threshold_lines, containments_threshold, htr_threshold
+    ):
+        text_region, reg_pol, mask = data
+        region_data = {"id": f"region_{index}", "boundary": reg_pol}
+
+        text_lines, htr_scores = self._process_lines(
+            text_region,
+            inferencer,
+            pred_score_threshold_lines,
+            containments_threshold,
+            mask,
+            region_data["id"],
+            htr_threshold,
+        )
+
+        if not text_lines:
+            return None
+
+        region_data["textLines"] = text_lines
+        mean_htr_score = sum(htr_scores) / len(htr_scores) if htr_scores else 0
+
+        return region_data if mean_htr_score > htr_threshold else None
+
+    def _process_lines(
+        self, text_region, inferencer, pred_score_threshold, containments_threshold, mask, region_id, htr_threshold=0.7
+    ):
+        _, lines_cropped_ordered, line_polygons_ordered = inferencer.predict_lines(
+            text_region, pred_score_threshold, containments_threshold, visualize=False, custom_track=False
+        )
+
+        if not lines_cropped_ordered:
+            return None, []
+
+        line_polygons_ordered_trans = self.process_seg_mask._translate_line_coords(mask, line_polygons_ordered)
+
+        text_lines = []
+        htr_scores = []
+        for index, (line, line_pol) in enumerate(zip(lines_cropped_ordered, line_polygons_ordered_trans)):
+            line_data, htr_score = self._create_line_data(line, line_pol, index, region_id, inferencer, htr_threshold)
+
+            if line_data:
+                text_lines.append(line_data)
+                htr_scores.append(htr_score)
+
+        return text_lines, htr_scores
+
+    def _create_line_data(self, line, line_pol, index, region_id, inferencer, htr_threshold):
+        line_data = {"id": f"line_{region_id}_{index}", "boundary": line_pol}
+
+        transcribed_text, htr_score = inferencer.transcribe(line)
+        line_data["unicode"] = self.xml_helper.escape_xml_chars(transcribed_text)
+        line_data["pred_score"] = round(htr_score, 4)
+
+        return line_data if htr_score > htr_threshold else None, htr_score
+
+
+if __name__ == "__main__":
+    pass
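
Two thresholds interact in this new file: _create_line_data drops any line whose HTR score does not exceed htr_threshold (default 0.7), and _create_region_data then gates the whole region on the mean score. Since _process_lines now appends to htr_scores only for lines that survived (unlike the deleted process_xml.py, which averaged over all lines), the region-level mean can only fail when no line survives at all. A self-contained illustration of the gate, with invented scores:

# Illustration of the htr_threshold gating above; the scores are made up.
def region_survives(line_scores, htr_threshold=0.7):
    # per-line gate (_create_line_data): keep a line only above the threshold
    kept = [s for s in line_scores if s > htr_threshold]
    if not kept:
        return False  # _process_lines yields no text_lines, so the region is dropped
    # region gate (_create_region_data): mean over the *kept* scores only
    return sum(kept) / len(kept) > htr_threshold

print(region_survives([0.9, 0.2, 0.8]))  # True: kept scores (0.9, 0.8) average 0.85
print(region_survives([0.5, 0.6]))       # False: no line clears 0.7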
src/htr_pipeline/utils/process_xml.py
DELETED

@@ -1,167 +0,0 @@
-import os
-import re
-from datetime import datetime
-
-import jinja2
-from tqdm import tqdm
-
-from src.htr_pipeline.inferencer import InferencerInterface
-from src.htr_pipeline.utils.process_segmask import SegMaskHelper
-
-
-class XMLHelper:
-    def __init__(self):
-        self.process_seg_mask = SegMaskHelper()
-
-    def image_to_page_xml(
-        self,
-        image,
-        pred_score_threshold_regions,
-        pred_score_threshold_lines,
-        containments_threshold,
-        inferencer: InferencerInterface,
-        xml_file_name="page_xml.xml",
-    ):
-        img_height = image.shape[0]
-        img_width = image.shape[1]
-        img_file_name = xml_file_name
-
-        template_data = self.prepare_template_data(img_file_name, img_width, img_height)
-
-        template_data["textRegions"] = self._process_regions(
-            image,
-            inferencer,
-            pred_score_threshold_regions,
-            pred_score_threshold_lines,
-            containments_threshold,
-        )
-
-        rendered_xml = self._render_xml(template_data)
-
-        return rendered_xml
-
-    def _transform_coords(self, input_string):
-        pattern = r"\[\s*([^\s,]+)\s*,\s*([^\s\]]+)\s*\]"
-        replacement = r"\1,\2"
-        return re.sub(pattern, replacement, input_string)
-
-    def _render_xml(self, template_data):
-        template_loader = jinja2.FileSystemLoader(searchpath="./src/htr_pipeline/utils/templates")
-        template_env = jinja2.Environment(loader=template_loader, trim_blocks=True)
-        template = template_env.get_template("page_xml_2013.xml")
-        rendered_xml = template.render(template_data)
-        rendered_xml = self._transform_coords(rendered_xml)
-        return rendered_xml
-
-    def prepare_template_data(self, img_file_name, img_width, img_height):
-        now = datetime.now()
-        date_time = now.strftime("%Y-%m-%d, %H:%M:%S")
-        return {
-            "created": date_time,
-            "imageFilename": img_file_name,
-            "imageWidth": img_width,
-            "imageHeight": img_height,
-            "textRegions": list(),
-        }
-
-    def _process_regions(
-        self,
-        image,
-        inferencer: InferencerInterface,
-        pred_score_threshold_regions,
-        pred_score_threshold_lines,
-        containments_threshold,
-        htr_threshold=0.7,
-    ):
-        _, regions_cropped_ordered, reg_polygons_ordered, reg_masks_ordered = inferencer.predict_regions(
-            image,
-            pred_score_threshold=pred_score_threshold_regions,
-            containments_threshold=containments_threshold,
-            visualize=False,
-        )
-
-        region_data_list = []
-        for i, (text_region, reg_pol, mask) in tqdm(
-            enumerate(zip(regions_cropped_ordered, reg_polygons_ordered, reg_masks_ordered))
-        ):
-            region_id = "region_" + str(i)
-            region_data = dict()
-            region_data["id"] = region_id
-            region_data["boundary"] = reg_pol
-
-            text_lines, htr_scores = self._process_lines(
-                text_region,
-                inferencer,
-                pred_score_threshold_lines,
-                containments_threshold,
-                mask,
-                region_id,
-            )
-
-            if text_lines is None:
-                continue
-
-            region_data["textLines"] = text_lines
-            mean_htr_score = sum(htr_scores) / len(htr_scores)
-
-            if mean_htr_score > htr_threshold:
-                region_data_list.append(region_data)
-
-        return region_data_list
-
-    def _process_lines(
-        self,
-        text_region,
-        inferencer: InferencerInterface,
-        pred_score_threshold_lines,
-        containments_threshold,
-        mask,
-        region_id,
-        htr_threshold=0.7,
-    ):
-        _, lines_cropped_ordered, line_polygons_ordered = inferencer.predict_lines(
-            text_region,
-            pred_score_threshold=pred_score_threshold_lines,
-            containments_threshold=containments_threshold,
-            visualize=False,
-            custom_track=False,
-        )
-
-        if lines_cropped_ordered is None:
-            return None, None
-
-        line_polygons_ordered_trans = self.process_seg_mask._translate_line_coords(mask, line_polygons_ordered)
-
-        htr_scores = list()
-        text_lines = list()
-
-        for j, (line, line_pol) in enumerate(zip(lines_cropped_ordered, line_polygons_ordered_trans)):
-            line_id = "line_" + region_id + "_" + str(j)
-            line_data = dict()
-            line_data["id"] = line_id
-            line_data["boundary"] = line_pol
-
-            transcribed_text, htr_score = inferencer.transcribe(line)
-            escaped_text = self._escape_xml_chars(transcribed_text)
-            line_data["unicode"] = escaped_text
-            line_data["pred_score"] = round(htr_score, 4)
-
-            htr_scores.append(htr_score)
-
-            if htr_score > htr_threshold:
-                text_lines.append(line_data)
-
-        return text_lines, htr_scores
-
-    def _escape_xml_chars(self, textline):
-        return (
-            textline.replace("&", "&amp;")
-            .replace("<", "&lt;")
-            .replace(">", "&gt;")
-            .replace("'", "&apos;")
-            .replace('"', "&quot;")
-        )
-
-
-if __name__ == "__main__":
-    pass
src/htr_pipeline/utils/visualize_xml.py
ADDED

@@ -0,0 +1,68 @@
+import random
+import xml.etree.ElementTree as ET
+
+from PIL import Image, ImageDraw, ImageFont
+
+
+class XmlViz:
+    def __init__(self, page_xml="./page_xml.xml"):
+        self.tree = ET.parse(page_xml, parser=ET.XMLParser(encoding="utf-8"))
+        self.root = self.tree.getroot()
+        self.namespace = "{http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15}"
+
+    def visualize_xml(
+        self,
+        background_image,
+        font_size=9,
+        text_offset=10,
+        font_path_tff="./src/htr_pipeline/utils/templates/arial.ttf",
+    ):
+        image = Image.fromarray(background_image).convert("RGBA")
+
+        text_offset = -text_offset
+        base_font_size = font_size
+        font_path = font_path_tff
+
+        max_bbox_width = 0  # Initialize maximum bounding box width
+
+        for textregion in self.root.findall(f".//{self.namespace}TextRegion"):
+            coords = textregion.find(f"{self.namespace}Coords").attrib["points"].split()
+            points = [tuple(map(int, point.split(","))) for point in coords]
+            x_coords, y_coords = zip(*points)
+            min_x, max_x = min(x_coords), max(x_coords)
+            bbox_width = max_x - min_x  # Width of the current bounding box
+            max_bbox_width = max(max_bbox_width, bbox_width)  # Update maximum bounding box width
+
+        scaling_factor = max_bbox_width / 400.0  # Use maximum bounding box width for scaling
+        font_size_scaled = int(base_font_size * scaling_factor)
+        font = ImageFont.truetype(font_path, font_size_scaled)
+
+        for textregion in self.root.findall(f".//{self.namespace}TextRegion"):
+            fill_color = (random.randint(0, 255), random.randint(0, 255), random.randint(0, 255), 100)
+            for textline in textregion.findall(f".//{self.namespace}TextLine"):
+                coords = textline.find(f"{self.namespace}Coords").attrib["points"].split()
+                points = [tuple(map(int, point.split(","))) for point in coords]
+
+                poly_image = Image.new("RGBA", image.size)
+                poly_draw = ImageDraw.Draw(poly_image)
+                poly_draw.polygon(points, fill=fill_color)
+
+                text = textline.find(f"{self.namespace}TextEquiv").find(f"{self.namespace}Unicode").text
+
+                x_coords, y_coords = zip(*points)
+                min_x, max_x = min(x_coords), max(x_coords)
+                min_y = min(y_coords)
+                text_width, text_height = poly_draw.textsize(text, font=font)  # Get text size
+                text_position = (
+                    (min_x + max_x) // 2 - text_width // 2,
+                    min_y + text_offset,
+                )  # Center text horizontally
+
+                poly_draw.text(text_position, text, fill=(0, 0, 0), font=font)
+                image = Image.alpha_composite(image, poly_image)
+
+        return image
+
+
+if __name__ == "__main__":
+    pass
src/htr_pipeline/utils/xml_helper.py
ADDED

@@ -0,0 +1,55 @@
+import re
+from datetime import datetime
+
+import jinja2
+
+
+class XMLHelper:
+    def __init__(self, xml_file_name="page_xml.xml"):
+        self.xml_file_name = xml_file_name
+        self.searchpath = "./src/htr_pipeline/utils/templates"
+        self.template = "page_xml_2013.xml"
+
+    def render(self, template_data):
+        rendered_xml = self._render_xml(template_data)
+        return rendered_xml
+
+    def _transform_coords(self, input_string):
+        pattern = r"\[\s*([^\s,]+)\s*,\s*([^\s\]]+)\s*\]"
+        replacement = r"\1,\2"
+        return re.sub(pattern, replacement, input_string)
+
+    def _render_xml(self, template_data):
+        template_loader = jinja2.FileSystemLoader(searchpath=self.searchpath)
+        template_env = jinja2.Environment(loader=template_loader, trim_blocks=True)
+        template = template_env.get_template(self.template)
+        rendered_xml = template.render(template_data)
+        rendered_xml = self._transform_coords(rendered_xml)
+        return rendered_xml
+
+    def prepare_template_data(self, img_file_name, image):
+        img_height = image.shape[0]
+        img_width = image.shape[1]
+
+        now = datetime.now()
+        date_time = now.strftime("%Y-%m-%d, %H:%M:%S")
+        return {
+            "created": date_time,
+            "imageFilename": img_file_name,
+            "imageWidth": img_width,
+            "imageHeight": img_height,
+            "textRegions": list(),
+        }
+
+    def escape_xml_chars(self, textline):
+        return (
+            textline.replace("&", "&amp;")
+            .replace("<", "&lt;")
+            .replace(">", "&gt;")
+            .replace("'", "&apos;")
+            .replace('"', "&quot;")
+        )
+
+
+if __name__ == "__main__":
+    pass
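
Two helpers in the new XMLHelper deserve a gloss. _transform_coords exists because Jinja renders Python point pairs as "[x, y]" while PAGE XML coordinate strings want "x,y"; escape_xml_chars protects the five XML-reserved characters before transcriptions land in Unicode elements. A standalone illustration of both (the sample strings are invented):

import re

# same pattern and replacement as XMLHelper._transform_coords
pattern = r"\[\s*([^\s,]+)\s*,\s*([^\s\]]+)\s*\]"
print(re.sub(pattern, r"\1,\2", "[12, 34] [56,78]"))  # -> 12,34 56,78

def escape_xml_chars(textline):
    # mirrors XMLHelper.escape_xml_chars
    return (
        textline.replace("&", "&amp;")
        .replace("<", "&lt;")
        .replace(">", "&gt;")
        .replace("'", "&apos;")
        .replace('"', "&quot;")
    )

print(escape_xml_chars("A & B <test>"))  # -> A &amp; B &lt;test&gt;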
tabs/htr_tool.py
CHANGED

@@ -19,32 +19,17 @@ with gr.Blocks() as htr_tool_tab:
             )
 
             with gr.Row():
-                # with gr.Group():
-                #     callback = gr.CSVLogger()
-                #     # hf_writer = gr.HuggingFaceDatasetSaver(HF_API_TOKEN, "htr_pipelin_flags")
-                #     flagging_button = gr.Button(
-                #         "Flag",
-                #         variant="secondary",
-                #         visible=True,
-                #     ).style(full_width=True)
-                #     radio_file_input = gr.Radio(
-                #         value="Text file", choices=["Text file ", "Page XML file "], label="What kind file output?"
-                #     )
-
                 radio_file_input = gr.CheckboxGroup(
                     choices=["Txt", "XML"],
-                    value=["
+                    value=["XML"],
                     label="Output file extension",
                     # info="Only txt and page xml is supported for now!",
+                    scale=1,
                 )
 
                 htr_pipeline_button = gr.Button(
-                    "Run HTR",
-                    variant="primary",
-                    visible=True,
-                    elem_id="run_pipeline_button",
-                ).style(full_width=False)
-
+                    "Run HTR", variant="primary", visible=True, elem_id="run_pipeline_button", scale=1
+                )
             with gr.Group():
                 with gr.Row():
                     fast_file_downlod = gr.File(label="Download output file", visible=False)
@@ -75,11 +60,7 @@
             fast_track_output_image = gr.Image(label="HTR results visualizer", type="numpy", tool="editor", height=650)
 
     with gr.Row(visible=False) as api_placeholder:
-        htr_pipeline_button_api = gr.Button(
-            "Run pipeline",
-            variant="primary",
-            visible=False,
-        ).style(full_width=False)
+        htr_pipeline_button_api = gr.Button("Run pipeline", variant="primary", visible=False, scale=1)
 
     xml_rendered_placeholder_for_api = gr.Textbox(visible=False)
     htr_pipeline_button.click(
@@ -94,8 +75,3 @@
         outputs=[xml_rendered_placeholder_for_api],
         api_name="predict",
     )
-
-    # callback.setup([fast_track_input_region_image], "flagged_data_points")
-    # flagging_button.click(lambda *args: callback.flag(args), [fast_track_input_region_image], None, preprocess=False)
-    # flagging_button.click(lambda: (gr.update(value="Flagged")), outputs=flagging_button)
-    # fast_track_input_region_image.change(lambda: (gr.update(value="Flag")), outputs=flagging_button)
tabs/stepwise_htr_tool.py
CHANGED

@@ -25,7 +25,8 @@ with gr.Blocks() as stepwise_htr_tool_tab:
                     label="Image to Region segment",
                     # type="numpy",
                    tool="editor",
-                ).style(height=350)
+                    height=350,
+                )
 
                 with gr.Accordion("Region segment settings:", open=False):
                     with gr.Row():
@@ -63,7 +64,7 @@ with gr.Blocks() as stepwise_htr_tool_tab:
                         "Segment Region",
                         variant="primary",
                         elem_id="region_segment_button",
-                    ).style(full_width=False)
+                    )
 
                 with gr.Row():
                     with gr.Accordion("Example images to use:", open=False) as example_accord:
@@ -75,7 +76,7 @@ with gr.Blocks() as stepwise_htr_tool_tab:
                         )
 
             with gr.Column(scale=3):
-                output_region_image = gr.Image(label="Segmented regions", type="numpy").style(height=600)
+                output_region_image = gr.Image(label="Segmented regions", type="numpy", height=600)
 
         ##############################################
         with gr.Tab("2. Line Segmentation"):
@@ -84,27 +85,27 @@ with gr.Blocks() as stepwise_htr_tool_tab:
                 # type="numpy",
                 interactive="False",
                 visible=True,
-            ).style(height=600)
+                height=600,
+            )
 
             with gr.Row(visible=False) as control_line_segment:
                 with gr.Column(scale=2):
                     with gr.Box():
                         regions_cropped_gallery = gr.Gallery(
                             label="Segmented regions",
-                            show_label=False,
                             elem_id="gallery",
-                        ).style(
                             columns=[2],
                             rows=[2],
                             # object_fit="contain",
-                            height=
+                            height=450,
                             preview=True,
                             container=False,
                         )
 
                         input_region_from_gallery = gr.Image(
-                            label="Region segmentation to line segment", interactive="False", visible=False
-                        )
+                            label="Region segmentation to line segment", interactive="False", visible=False, height=400
+                        )
+
                 with gr.Row():
                     with gr.Accordion("Line segment settings:", open=False):
                         with gr.Row():
@@ -126,7 +127,7 @@ with gr.Blocks() as stepwise_htr_tool_tab:
                                 info="""The minimum required overlap or similarity
                                 for a detected region or object to be considered valid""",
                             )
-                        with gr.Row():
+                        with gr.Row(equal_height=False):
                            line_segment_model_dropdown = gr.Dropdown(
                                 choices=["Riksarkivet/RmtDet_lines"],
                                 value="Riksarkivet/RmtDet_lines",
@@ -138,22 +139,22 @@ with gr.Blocks() as stepwise_htr_tool_tab:
                                 " ",
                                 variant="Secondary",
                                 # elem_id="center_button",
-                            ).style(full_width=True)
+                                scale=1,
+                            )
 
                             line_segment_button = gr.Button(
                                 "Segment Lines",
                                 variant="primary",
                                 # elem_id="center_button",
-                            ).style(full_width=True)
+                                scale=1,
+                            )
 
                 with gr.Column(scale=3):
                     # gr.Markdown("""lorem ipsum""")
 
                     output_line_from_region = gr.Image(
-                        label="Segmented lines",
-                        type="numpy",
-                        interactive="False",
-                    ).style(height=600)
+                        label="Segmented lines", type="numpy", interactive="False", height=600
+                    )
 
         ###############################################
         with gr.Tab("3. Transcribe Text"):
@@ -162,19 +163,16 @@ with gr.Blocks() as stepwise_htr_tool_tab:
                 # type="numpy",
                 interactive="False",
                 visible=True,
-            ).style(height=600)
+                height=600,
+            )
 
             with gr.Row(visible=False) as control_htr:
                 inputs_lines_to_transcribe = gr.Variable()
 
                 with gr.Column(scale=2):
                     image_inputs_lines_to_transcribe = gr.Image(
-                        label="Transcribed lines",
-                        type="numpy",
-                        interactive="False",
-                        visible=False,
-                    ).style(height=470)
-
+                        label="Transcribed lines", type="numpy", interactive="False", visible=False, height=470
+                    )
                     with gr.Row():
                         with gr.Accordion("Transcribe settings:", open=False):
                             transcriber_model = gr.Dropdown(
@@ -184,30 +182,21 @@ with gr.Blocks() as stepwise_htr_tool_tab:
                                 info="Will add more models later!",
                             )
                             with gr.Row():
-                                clear_transcribe_button = gr.Button(" ", variant="Secondary", visible=True).style(
-                                    full_width=True
-                                )
-                                transcribe_button = gr.Button("Transcribe lines", variant="primary", visible=True).style(
-                                    full_width=True
-                                )
+                                clear_transcribe_button = gr.Button(" ", variant="Secondary", visible=True, scale=1)
 
-                                donwload_txt_button = gr.Button("Download text", variant="secondary", visible=True).style(
-                                    full_width=True
-                                )
-
-                            with gr.Row():
-                                txt_file_downlod = gr.File(label="Download text", visible=False)
+                                transcribe_button = gr.Button("Transcribe Lines", variant="primary", visible=True, scale=1)
 
                 with gr.Column(scale=3):
                     with gr.Row():
                         transcribed_text_df = gr.Dataframe(
                             headers=["Transcribed text"],
-                            max_rows=
+                            max_rows=14,
                             col_count=(1, "fixed"),
                             wrap=True,
                             interactive=False,
                             overflow_row_behaviour="paginate",
-                        ).style(height=600)
+                            height=600,
+                        )
 
         #####################################
         with gr.Tab("4. Explore Results"):
@@ -216,35 +205,43 @@ with gr.Blocks() as stepwise_htr_tool_tab:
                 # type="numpy",
                 interactive="False",
                 visible=True,
-            ).style(height=600)
+                height=600,
+            )
 
-            with gr.Row(visible=False) as control_results_transcribe:
+            with gr.Row(visible=False, equal_height=False) as control_results_transcribe:
                 with gr.Column(scale=1, visible=True):
                     with gr.Box():
                         temp_gallery_input = gr.Variable()
 
                         gallery_inputs_lines_to_transcribe = gr.Gallery(
                             label="Cropped transcribed lines",
-                            show_label=True,
                             elem_id="gallery_lines",
-                        ).style(
                             columns=[3],
                             rows=[3],
                             # object_fit="contain",
-                            height=
+                            height=300,
                             preview=True,
                             container=False,
                         )
+
+                        dataframe_text_index = gr.Textbox(
+                            label="Text from DataFrame selection",
+                            info="Click on a dataframe cell to view the corresponding transcribed text line crop. You can also sort the dataframe to easily locate specific entries.",
+                            lines=2,
+                            interactive=False,
+                        )
+
                 with gr.Column(scale=1, visible=True):
                     mapping_dict = gr.Variable()
                     transcribed_text_df_finish = gr.Dataframe(
-                        headers=["Transcribed text", "
-                        max_rows=
+                        headers=["Transcribed text", "pred score"],
+                        max_rows=14,
                         col_count=(2, "fixed"),
                         wrap=True,
                         interactive=False,
                         overflow_row_behaviour="paginate",
-                    ).style(height=600)
+                        height=600,
+                    )
 
         # custom track
         region_segment_button.click(
@@ -260,7 +257,7 @@ with gr.Blocks() as stepwise_htr_tool_tab:
        transcribed_text_df_finish.select(
             fn=custom_track.get_select_index_df,
             inputs=[transcribed_text_df_finish, mapping_dict],
-            outputs=gallery_inputs_lines_to_transcribe,
+            outputs=[gallery_inputs_lines_to_transcribe, dataframe_text_index],
         )
 
         line_segment_button.click(
@@ -287,23 +284,12 @@ with gr.Blocks() as stepwise_htr_tool_tab:
                 transcribed_text_df,
                 transcribed_text_df_finish,
                 mapping_dict,
-
+                # Hide
                 control_results_transcribe,
                 image_placeholder_explore_results,
             ],
         )
 
-        donwload_txt_button.click(
-            custom_track.download_df_to_txt,
-            inputs=transcribed_text_df,
-            outputs=[txt_file_downlod, txt_file_downlod],
-        )
-
-        # def remove_temp_vis():
-        #     if os.path.exists("./vis_data"):
-        #         os.remove("././vis_data")
-        #     return None
-
         clear_button.click(
             lambda: (
                 (shutil.rmtree("./vis_data") if os.path.exists("./vis_data") else None, None)[1],
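
The new dataframe_text_index textbox ties into the reworked select event: clicking a row in transcribed_text_df_finish now both reorders the gallery (via the extra key_text return value added to get_select_index_df in gradio_backend.py) and echoes the clicked text. A simplified sketch of that wiring; the data and the callback below are stand-ins for the repo's custom_track.get_select_index_df, not the real implementation:

import gradio as gr
import pandas as pd

def on_select(df, evt: gr.SelectData):
    # for a Dataframe, SelectData.index is [row, col]; return the row's text
    return df.iloc[evt.index[0], 0]

with gr.Blocks() as demo:
    df = gr.Dataframe(value=pd.DataFrame({"Transcribed text": ["abc", "def"]}))
    picked = gr.Textbox(label="Text from DataFrame selection", interactive=False)
    df.select(on_select, inputs=df, outputs=picked)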