Spaces:

digitiamosrl
/

document_info_extractor

Build error

MrFeelgoood committed on May 18, 2023

Commit

5568e6f

1 Parent(s): a30c743

Fixed bug in ocr function

Fixed bugs in ocr function

Files changed (1) hide show

app.py CHANGED Viewed

@@ -231,14 +231,14 @@ def pdf_ocr(file, model_t, question):
             # Perform OCR on the PDF if the extracted text is empty
             if not text:
                 # Convert PDF pages to images
-                images = convert_from_path(content)
                 for i, img in enumerate(images):
                     text += pytesseract.image_to_string(img, lang='ita')
                 # Clear the image list to free up memory
                 del images
-    ks = ('mq', 'metri quadri', 'm2')
     quest = "Quanti metri quadri misura la superficie?"
     totalK = ['totale', 'complessivo', 'complessiva']
@@ -296,6 +296,6 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
     extract_button.click(fn = ocr_interface,
                          inputs=[pdf_input, model_input, question_input], outputs=[values_output, total_output, text_output])
-    gr.Examples(['Example1.pdf', 'Example2.pdf'], inputs = pdf_input)
 demo.launch()

             # Perform OCR on the PDF if the extracted text is empty
             if not text:
                 # Convert PDF pages to images
+                images = convert_from_bytes(content)
                 for i, img in enumerate(images):
                     text += pytesseract.image_to_string(img, lang='ita')
                 # Clear the image list to free up memory
                 del images
+    ks = ('mq', 'MQ', 'Mq' 'metri quadri', 'm2')
     quest = "Quanti metri quadri misura la superficie?"
     totalK = ['totale', 'complessivo', 'complessiva']
     extract_button.click(fn = ocr_interface,
                          inputs=[pdf_input, model_input, question_input], outputs=[values_output, total_output, text_output])
+    gr.Examples(['Example1(scanned).pdf', 'Example2.pdf', 'Example3Large.pdf'], inputs = pdf_input)
 demo.launch()