Spaces:

pdltiet
/

OCR_demo

Running

App Files Files Community

vteam27 committed on Jan 10, 2024

Commit

b8b3256

1 Parent(s): 41be449

base doctr

Browse files

Files changed (9) hide show

.gitattributes +2 -0
Examples/Book.png +3 -0
Examples/Files.jpg +3 -0
Examples/Manuscript.jpg +3 -0
Examples/News.png +3 -0
app.py +58 -0
packages.txt +3 -0
requirements.txt +5 -0
utils.py +163 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+*.jpg filter=lfs diff=lfs merge=lfs -text
+*.png filter=lfs diff=lfs merge=lfs -text

Examples/Book.png ADDED Viewed

Git LFS Details

SHA256: 45bf8d8c824d48de2013e572bffcedadcbdc84cda21fb73f5f83ecb809aec803
Pointer size: 133 Bytes
Size of remote file: 16 MB

Examples/Files.jpg ADDED Viewed

Git LFS Details

SHA256: bc1979e548161bb556a037594b3945749419b2367f93acac00e53c6d621ee009
Pointer size: 132 Bytes
Size of remote file: 4.37 MB

Examples/Manuscript.jpg ADDED Viewed

Git LFS Details

SHA256: 4a717cd9c625b7b59ebb80b52b0b3fba47c69e61f881ecd4e4f8ea1bb8883ddf
Pointer size: 132 Bytes
Size of remote file: 4.54 MB

Examples/News.png ADDED Viewed

Git LFS Details

SHA256: 5384175e709017ad917f56ff758bce9164444992be3bcad8fe52f7f83343744d
Pointer size: 131 Bytes
Size of remote file: 388 kB

app.py ADDED Viewed

	@@ -0,0 +1,58 @@

"""Gradio demo that runs docTR OCR on an uploaded image.

For every request the app returns the recognised text plus two downloadable
files: a plain-text transcript and a PDF made of the page image and a text
layer positioned from the OCR output.
"""
import base64
import io
import os
import tempfile

# Has to be set before doctr is imported below.
os.environ['USE_TORCH'] = '1'

import gradio as gr
from doctr.io import DocumentFile
from doctr.models import ocr_predictor
from PIL import Image

from utils import HocrParser

# The predictor is created once at import time and shared by every request.
predictor = ocr_predictor(
    det_arch='db_mobilenet_v3_large',
    reco_arch='crnn_vgg16_bn',
    pretrained=True,
)
title = "DocTR OCR (PDL Demo)"
description = "Upload an image to get the OCR results !"


def _page_text(page):
    """Return the words of one OCR page as text.

    Every word is preceded by a single space, every block is terminated by a
    newline, and the page itself is terminated by one more newline.
    """
    chunks = []
    for block in page.blocks:
        chunks.extend(
            " " + word.value for line in block.lines for word in line.words
        )
        chunks.append("\n")
    chunks.append("\n")
    return "".join(chunks)


def greet(img):
    """Run OCR on an uploaded image and export the result.

    Args:
        img: PIL image handed over by the ``gr.Image(type="pil")`` input, or
            ``None`` when the form is submitted without an image.

    Returns:
        A ``(text, text_path, pdf_path)`` tuple: the recognised text, the
        path of a UTF-8 text file holding the same text, and the path of the
        generated PDF.

    Raises:
        gr.Error: if no image was provided.
    """
    if img is None:
        raise gr.Error("Please upload an image first.")

    # Hand the image to docTR as in-memory PNG bytes: lossless, and no file
    # with a fixed name that two requests could overwrite for each other.
    buffer = io.BytesIO()
    img.convert("RGB").save(buffer, format="PNG")
    doc = DocumentFile.from_images(buffer.getvalue())
    output = predictor(doc)

    res = "".join(_page_text(page) for page in output.pages)

    # Each request writes into its own directory so results never collide;
    # the file names inside it stay the same as before.
    out_dir = tempfile.mkdtemp(prefix="ocr_demo_")
    text_path = os.path.join(out_dir, "RESULT_OCR.txt")
    pdf_path = os.path.join(out_dir, "RESULT_OCR.pdf")

    # Mode "w" already truncates, so no separate "clear file" step is needed.
    with open(text_path, "w", encoding="utf-8", errors="ignore") as f:
        f.write(res)

    # One uploaded image gives one page. Each export entry is indexed so that
    # item 1 is the tree passed on as ``hocr`` (same indexing as before).
    page_export = output.export_as_xml()[0]
    HocrParser().export_pdfa(pdf_path, hocr=page_export[1], image=doc[0])

    return res, text_path, pdf_path


demo = gr.Interface(
    fn=greet,
    inputs=gr.Image(type="pil"),
    outputs=["text", "file", "file"],
    title=title,
    description=description,
    examples=[
        ["Examples/Book.png"],
        ["Examples/News.png"],
        ["Examples/Manuscript.jpg"],
        ["Examples/Files.jpg"],
    ],
)
demo.launch(debug=True)

packages.txt ADDED Viewed

	@@ -0,0 +1,3 @@

+libcairo2-dev
+pkg-config
+fonts-freefont-ttf -y

requirements.txt ADDED Viewed

	@@ -0,0 +1,5 @@

+pycairo
+python-doctr[torch]@git+https://github.com/mindee/doctr.git
+gradio
+reportlab>=3.6.2
+PyPDF2==1.26.0

utils.py ADDED Viewed

	@@ -0,0 +1,163 @@

+import base64
+import re
+from tempfile import TemporaryDirectory
+from math import atan, cos, sin
+from typing import Dict, Optional, Tuple
+from xml.etree import ElementTree as ET
+from xml.etree.ElementTree import Element
+import numpy as np
+import PyPDF2
+from PyPDF2 import PdfFileMerger
+from doctr.io import DocumentFile
+from doctr.models import ocr_predictor
+from PIL import Image
+from reportlab.lib.colors import black
+from reportlab.lib.units import inch
+from reportlab.lib.utils import ImageReader
+from reportlab.pdfgen.canvas import Canvas
class HocrParser:
    """Turn one hOCR page into a PDF with a positioned text layer.

    The parser reads ``bbox`` and ``baseline`` entries from the ``title``
    attribute of hOCR elements and uses them to place each recognised word
    on a ReportLab canvas, optionally together with the page image.
    """

    # Ligature code points replaced before drawing. The "ffi"/"ffl"
    # replacements deliberately keep zero-width non-joiners (U+200C) between
    # the letters.
    _LIGATURES = str.maketrans({
        '\ufb00': 'ff',
        '\ufb03': 'f\u200cf\u200ci',
        '\ufb04': 'f\u200cf\u200cl',
        '\ufb01': 'fi',
        '\ufb02': 'fl',
    })

    def __init__(self):
        self.box_pattern = re.compile(r'bbox((\s+\d+){4})')
        self.baseline_pattern = re.compile(r'baseline((\s+[\d\.\-]+){2})')

    def _element_coordinates(self, element: Element) -> Dict:
        """
        Returns a dict with the pixel coordinates (``x1``, ``y1``, ``x2``,
        ``y2``) of the bounding box found in the element's ``title``
        attribute, or all zeros when there is no such entry.
        """
        match = self.box_pattern.search(element.attrib.get('title', ''))
        if match is None:
            return {'x1': 0, 'y1': 0, 'x2': 0, 'y2': 0}
        x1, y1, x2, y2 = (int(value) for value in match.group(1).split())
        return {'x1': x1, 'y1': y1, 'x2': x2, 'y2': y2}

    def _get_baseline(self, element: Element) -> Tuple[float, float]:
        """
        Returns a tuple containing the baseline slope and intercept.

        Falls back to ``(0.0, 0.0)`` when the element has no ``title``, the
        title has no ``baseline`` entry, or the entry is not numeric.
        """
        # search() returns None when there is no baseline entry, so it must
        # be checked before .group() is called.
        match = self.baseline_pattern.search(element.attrib.get('title', ''))
        if match is None:
            return (0.0, 0.0)
        slope, intercept = match.group(1).split()
        try:
            return float(slope), float(intercept)
        except ValueError:
            # The pattern also accepts strings such as "1.2.3" or "-".
            return (0.0, 0.0)

    def _pt_from_pixel(self, pxl: Dict, dpi: int) -> Dict:
        """
        Returns the quantity in PDF units (pt) given quantity in pixels
        """
        # Look the values up by key instead of relying on dict ordering.
        return {key: pxl[key] / dpi * inch for key in ('x1', 'y1', 'x2', 'y2')}

    def _get_element_text(self, element: Element) -> str:
        """
        Return the textual content of the element and its children,
        followed by the element's own tail text.
        """
        return ''.join(element.itertext()) + (element.tail or '')

    def _page_size(self, hocr: ET.ElementTree, dpi: int) -> Tuple[float, float]:
        """Return (width, height) in pt of the first ``ocr_page`` div."""
        page = hocr.find(".//div[@class='ocr_page']")
        if page is None:
            raise ValueError("Could not determine page size")
        coords = self._element_coordinates(page)
        # A missing bbox parses as all zeros; refuse to build an empty page.
        if coords['x2'] <= coords['x1'] or coords['y2'] <= coords['y1']:
            raise ValueError("Could not determine page size")
        pt_coords = self._pt_from_pixel(coords, dpi)
        return (pt_coords['x2'] - pt_coords['x1'],
                pt_coords['y2'] - pt_coords['y1'])

    def _draw_line(self, pdf, line: Element, page_height: float,
                   fontname: str, fontsize: int, invisible_text: bool,
                   add_spaces: bool, dpi: int) -> None:
        """Draw the words of one ``ocr_line`` element as a text object."""
        line_box = self._pt_from_pixel(self._element_coordinates(line), dpi)

        # compute baseline; slopes below 0.005 are treated as horizontal
        slope, pxl_intercept = self._get_baseline(line)
        if abs(slope) < 0.005:
            slope = 0.0
        angle = atan(slope)
        cos_a, sin_a = cos(angle), sin(angle)
        intercept = pxl_intercept / dpi * inch
        # hOCR y grows downwards while the canvas y grows upwards.
        baseline_y2 = page_height - (line_box['y2'] + intercept)

        # configure options
        text = pdf.beginText()
        text.setFont(fontname, fontsize)
        pdf.setFillColor(black)
        if invisible_text:
            text.setTextRenderMode(3)  # invisible text
        # rotate the text to follow the baseline and anchor it at the line
        text.setTextTransform(
            cos_a, -sin_a, sin_a, cos_a, line_box['x1'], baseline_y2)

        for elem in line.findall(".//span[@class='ocrx_word']"):
            # replace unsupported characters
            elemtxt = self._get_element_text(elem).strip().translate(
                self._LIGATURES)
            if not elemtxt:
                continue
            box = self._pt_from_pixel(self._element_coordinates(elem), dpi)
            if add_spaces:
                elemtxt += ' '
                # NOTE(review): this adds the width of the whole word plus the
                # space to the box width; kept exactly as it was - confirm
                # whether only the width of the space was intended.
                box_width = (box['x2']
                             + pdf.stringWidth(elemtxt, fontname, fontsize)
                             - box['x1'])
            else:
                box_width = box['x2'] - box['x1']
            font_width = pdf.stringWidth(elemtxt, fontname, fontsize)
            # Adjust relative position of cursor
            cursor = text.getStartOfLine()
            text.moveCursor(box['x1'] - cursor[0], baseline_y2 - cursor[1])
            # suppress text if it is 0 units wide
            if font_width > 0:
                # stretch the word horizontally so it spans box_width
                text.setHorizScale(100 * box_width / font_width)
                text.textOut(elemtxt)
        pdf.drawText(text)

    def export_pdfa(self,
                    out_filename: str,
                    hocr: ET.ElementTree,
                    image: Optional[np.ndarray] = None,
                    fontname: str = "Times-Roman",
                    fontsize: int = 12,
                    invisible_text: bool = True,
                    add_spaces: bool = True,
                    dpi: int = 300):
        """
        Generates a PDF document from a hOCR document.

        Only the page image and the text layer are written; this method adds
        no further PDF/A-specific content.

        Args:
            out_filename: path of the PDF file to write.
            hocr: parsed hOCR tree; its first ``ocr_page`` div gives the
                page size.
            image: optional page image as an array, drawn over the full page.
            fontname: font used for the text layer.
            fontsize: font size used for the text layer.
            invisible_text: when true the text is drawn with render mode 3
                (invisible).
            add_spaces: when true a space is appended to every word.
            dpi: resolution used to convert hOCR pixels into PDF points.

        Raises:
            ValueError: if the tree has no ``ocr_page`` div or its bounding
                box is missing or empty.
        """
        width, height = self._page_size(hocr, dpi)
        pdf = Canvas(out_filename, pagesize=(width, height), pageCompression=1)

        # Draw the image first so the text layer lies on top of it. With
        # invisible text the order makes no visual difference; with visible
        # text the image would otherwise cover it.
        if image is not None:
            pdf.drawImage(ImageReader(Image.fromarray(image)),
                          0, 0, width=width, height=height)

        for line in hocr.iterfind(".//span[@class='ocr_line']"):
            self._draw_line(pdf, line, height, fontname, fontsize,
                            invisible_text, add_spaces, dpi)
        pdf.save()