Spaces:
Running
Running
xiaoyao9184
committed on
Synced repo using 'sync_with_huggingface' Github Action
Browse files- gradio_app.py +48 -1
- requirements.txt +1 -1
gradio_app.py
CHANGED
@@ -22,6 +22,7 @@ from surya.model.recognition.model import load_model as load_rec_model
|
|
22 |
from surya.model.recognition.processor import load_processor as load_rec_processor
|
23 |
from surya.model.table_rec.model import load_model as load_table_model
|
24 |
from surya.model.table_rec.processor import load_processor as load_table_processor
|
|
|
25 |
from surya.postprocessing.heatmap import draw_polys_on_image, draw_bboxes_on_image
|
26 |
from surya.ocr import run_ocr
|
27 |
from surya.postprocessing.text import draw_text_on_image
|
@@ -31,7 +32,9 @@ from surya.input.langs import replace_lang_with_code
|
|
31 |
from surya.schema import OCRResult, TextDetectionResult, LayoutResult, TableResult
|
32 |
from surya.settings import settings
|
33 |
from surya.tables import batch_table_recognition
|
34 |
-
from surya.postprocessing.util import
|
|
|
|
|
35 |
|
36 |
|
37 |
def load_det_cached():
|
@@ -46,6 +49,34 @@ def load_layout_cached():
|
|
46 |
def load_table_cached():
    """Load the table-recognition model together with its processor.

    Returns:
        A ``(model, processor)`` tuple; the model is loaded first, then the
        processor.
    """
    table_rec_model = load_table_model()
    table_rec_processor = load_table_processor()
    return table_rec_model, table_rec_processor
|
48 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
49 |
|
50 |
def text_detection(img) -> (Image.Image, TextDetectionResult):
|
51 |
pred = batch_text_detection([img], det_model, det_processor)[0]
|
@@ -148,6 +179,7 @@ det_model, det_processor = load_det_cached()
|
|
148 |
rec_model, rec_processor = load_rec_cached()
|
149 |
layout_model, layout_processor = load_layout_cached()
|
150 |
table_model, table_processor = load_table_cached()
|
|
|
151 |
|
152 |
with gr.Blocks(title="Surya") as demo:
|
153 |
gr.Markdown("""
|
@@ -179,6 +211,8 @@ with gr.Blocks(title="Surya") as demo:
|
|
179 |
use_pdf_boxes_ckb = gr.Checkbox(label="Use PDF table boxes", value=True, info="Table recognition only: Use the bounding boxes from the PDF file vs text detection model.")
|
180 |
skip_table_detection_ckb = gr.Checkbox(label="Skip table detection", value=False, info="Table recognition only: Skip table detection and treat the whole image/page as a table.")
|
181 |
table_rec_btn = gr.Button("Run Table Rec")
|
|
|
|
|
182 |
with gr.Column():
|
183 |
result_img = gr.Image(label="Result image")
|
184 |
result_json = gr.JSON(label="Result json")
|
@@ -250,5 +284,18 @@ with gr.Blocks(title="Surya") as demo:
|
|
250 |
inputs=[in_img, in_file, in_num, use_pdf_boxes_ckb, skip_table_detection_ckb],
|
251 |
outputs=[result_img, result_json]
|
252 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
253 |
|
254 |
demo.launch()
|
|
|
22 |
from surya.model.recognition.processor import load_processor as load_rec_processor
|
23 |
from surya.model.table_rec.model import load_model as load_table_model
|
24 |
from surya.model.table_rec.processor import load_processor as load_table_processor
|
25 |
+
from surya.model.ocr_error.model import load_model as load_ocr_error_model, load_tokenizer as load_ocr_error_processor
|
26 |
from surya.postprocessing.heatmap import draw_polys_on_image, draw_bboxes_on_image
|
27 |
from surya.ocr import run_ocr
|
28 |
from surya.postprocessing.text import draw_text_on_image
|
|
|
32 |
from surya.schema import OCRResult, TextDetectionResult, LayoutResult, TableResult
|
33 |
from surya.settings import settings
|
34 |
from surya.tables import batch_table_recognition
|
35 |
+
from surya.postprocessing.util import rescale_bbox
|
36 |
+
from pdftext.extraction import plain_text_output
|
37 |
+
from surya.ocr_error import batch_ocr_error_detection
|
38 |
|
39 |
|
40 |
def load_det_cached():
|
|
|
49 |
def load_table_cached():
    """Load the table-recognition model together with its processor.

    Returns:
        A ``(model, processor)`` tuple; the model is loaded first, then the
        processor.
    """
    table_rec_model = load_table_model()
    table_rec_processor = load_table_processor()
    return table_rec_model, table_rec_processor
|
51 |
|
52 |
+
def load_ocr_error_cached():
    """Load the OCR-error detection model together with its tokenizer.

    ``load_ocr_error_processor`` is the import alias this file gives to
    ``surya.model.ocr_error.model.load_tokenizer``.

    Returns:
        A ``(model, tokenizer)`` tuple; the model is loaded first, then the
        tokenizer.
    """
    error_model = load_ocr_error_model()
    error_tokenizer = load_ocr_error_processor()
    return error_model, error_tokenizer
|
54 |
+
|
55 |
+
|
56 |
+
def run_ocr_errors(pdf_file, page_count, sample_len=512, max_samples=10, max_pages=15):
    """Classify the embedded text of a PDF as good or garbled.

    Text is pulled from a window of pages centred on the middle of the
    document, cut into fixed-length samples, and passed to the OCR-error
    model. The PDF is reported as bad when more than 20% of the samples are
    labelled ``"bad"``.

    Args:
        pdf_file: PDF handed straight to ``plain_text_output``.
        page_count: Total number of pages in the PDF.
        sample_len: Length, in characters, of each text sample.
        max_samples: Divisor used to derive the spacing between samples.
        max_pages: Number of pages taken on each side of the middle page.

    Returns:
        A ``(message, labels)`` tuple. ``message`` is a human-readable
        verdict; ``labels`` is ``results.labels`` from the model, or
        ``["no text"]`` when there was too little text to sample.
    """
    # Window of pages around the middle page, clamped to the document bounds.
    mid_page = page_count // 2
    first_page = max(mid_page - max_pages, 0)
    last_page = min(mid_page + max_pages, page_count)
    # NOTE(review): presumably returns the extracted text as one str - confirm
    # against pdftext.
    text = plain_text_output(pdf_file, page_range=range(first_page, last_page))

    text_len = len(text)
    stride = text_len // max_samples
    if text_len == 0 or stride == 0:
        return "This PDF has no text or very little text", ["no text"]

    # Keep sample start points at least one sample length apart so that
    # consecutive samples never overlap.
    stride = max(stride, sample_len)

    # Cut the text into samples for the model.
    samples = [text[start:start + sample_len] for start in range(0, text_len, stride)]

    results = batch_ocr_error_detection(samples, ocr_error_model, ocr_error_processor)
    bad_fraction = results.labels.count("bad") / len(results.labels)
    if bad_fraction > .2:
        return "This PDF may have garbled or bad OCR text.", results.labels
    return "This PDF has good text.", results.labels
|
79 |
+
|
80 |
|
81 |
def text_detection(img) -> (Image.Image, TextDetectionResult):
|
82 |
pred = batch_text_detection([img], det_model, det_processor)[0]
|
|
|
179 |
rec_model, rec_processor = load_rec_cached()
|
180 |
layout_model, layout_processor = load_layout_cached()
|
181 |
table_model, table_processor = load_table_cached()
|
182 |
+
ocr_error_model, ocr_error_processor = load_ocr_error_cached()
|
183 |
|
184 |
with gr.Blocks(title="Surya") as demo:
|
185 |
gr.Markdown("""
|
|
|
211 |
use_pdf_boxes_ckb = gr.Checkbox(label="Use PDF table boxes", value=True, info="Table recognition only: Use the bounding boxes from the PDF file vs text detection model.")
|
212 |
skip_table_detection_ckb = gr.Checkbox(label="Skip table detection", value=False, info="Table recognition only: Skip table detection and treat the whole image/page as a table.")
|
213 |
table_rec_btn = gr.Button("Run Table Rec")
|
214 |
+
|
215 |
+
ocr_errors_btn = gr.Button("Run bad PDF text detection")
|
216 |
with gr.Column():
|
217 |
result_img = gr.Image(label="Result image")
|
218 |
result_json = gr.JSON(label="Result json")
|
|
|
284 |
inputs=[in_img, in_file, in_num, use_pdf_boxes_ckb, skip_table_detection_ckb],
|
285 |
outputs=[result_img, result_json]
|
286 |
)
|
287 |
+
# Run bad PDF text detection
|
288 |
+
def ocr_errors_pdf(file, page_count, sample_len=512, max_samples=10, max_pages=15):
    """Click handler for the "Run bad PDF text detection" button.

    Args:
        file: Path of the uploaded file, or ``None`` when nothing has been
            uploaded yet.
        page_count: Unused; the page count is recomputed from the file with
            ``count_pdf``.
        sample_len: Unused; kept so existing positional click wiring still
            binds.
        max_samples: Unused; kept for the same reason as ``sample_len``.
        max_pages: Unused; kept for the same reason as ``sample_len``.

    Returns:
        A ``gr.update`` for the JSON result component, carrying the verdict
        in its label and the per-sample labels as its value.

    Raises:
        gr.Error: If no file was uploaded or the file is not a ``.pdf``.
    """
    # Without the None guard, clicking the button before uploading a file
    # fails with AttributeError instead of the user-facing error below.
    if file is None or not file.endswith('.pdf'):
        raise gr.Error("This feature only works with PDFs.", duration=5)
    count = count_pdf(file)
    label, results = run_ocr_errors(file, count)
    return gr.update(label="Result json:" + label, value=results)
|
295 |
+
# Only the file and page number are passed. The table-recognition checkboxes
# would otherwise bind positionally to the handler's `sample_len` and
# `max_samples` parameters as booleans.
ocr_errors_btn.click(
    fn=ocr_errors_pdf,
    inputs=[in_file, in_num],
    outputs=[result_json]
)
|
300 |
|
301 |
demo.launch()
|
requirements.txt
CHANGED
@@ -1,4 +1,4 @@
|
|
1 |
torch==2.5.1
|
2 |
-
surya-ocr==0.8.
|
3 |
gradio==5.8.0
|
4 |
huggingface-hub==0.26.3
|
|
|
1 |
torch==2.5.1
|
2 |
+
surya-ocr==0.8.1
|
3 |
gradio==5.8.0
|
4 |
huggingface-hub==0.26.3
|