Spaces:

jordyvl
/

viz_bdpc

Runtime error

App Files Files Community

jordyvl committed on Oct 30, 2023

Commit

7568689

1 Parent(s): 96f0e2c

Local test functional

Browse files

Files changed (2) hide show

Arial.ttf +0 -0
app.py +54 -49

Arial.ttf ADDED Viewed

Binary file (276 kB). View file

app.py CHANGED Viewed

@@ -1,12 +1,13 @@
 import os
-from pathlib import Path
-import pandas as pd
 import gradio as gr
 from collections import OrderedDict
 from PIL import Image, ImageDraw, ImageFont
 from io import BytesIO
 import PyPDF2
 import pdf2image
 MAX_PAGES = 50
 MAX_PDF_SIZE = 100000000  # almost 100MB
@@ -81,7 +82,7 @@ def pdf_to_grid(pdf_path):
                 images.append(im)
     except Exception as e:
         print(f"{pdf_path} PyPDF get_images {e}")
-        images = pdf2image.convert_from_path(pdf_path)
     # simpler but slower
     # images = pdf2image.convert_from_path(pdf_path)
@@ -92,37 +93,27 @@ def pdf_to_grid(pdf_path):
     return equal_image_grid(images)
-def main(complexity, evidence, form, operation, type):
-    # need to write a query on diagnostic test and sample from it based on slider values
-    # then return the sample
-    query = " and ".join(
-        [
-            f"{cat}_{val} == {True}"
-            for cat, val in zip(meta_cats.keys(), [complexity, evidence, form, operation, type])
-            if val
-        ]
-    )
-    results = DIAGNOSTIC_TEST.query(query)
-    if len(results) == 0:
-        return f"No results found for query {query}", "", "", "", ""
-    for i, sample in results.sample(frac=1).iterrows():
-        if not sample['nhash']:
-            continue
-        print("Sampled: ", sample["nhash"])
-        # first get PDF file
-        PDF, grid = None, None
-        pdf_path = PDF_PATH / "test" / (sample["nhash"] + ".pdf")
-        if not os.path.exists(pdf_path):
             continue
-        PDF = pdf_path
-        grid = pdf_to_grid(pdf_path)
-        if not grid:
             continue
-        question, answer = sample["label"] #might need to translate
-        return label, grid, PDF
 _CLASSES = [
     "letter",
@@ -141,25 +132,23 @@ _CLASSES = [
     "questionnaire",
     "resume",
     "memo",
 ]
-# test
-# l, im, f = main(*slider_defaults)
-#load both datasets in memory? --> easier retrieval afterwards with seed index based on pressing button
-DATASETS = {
-    'rvl_cdip': load_dataset(
-        "bdpc/rvl_cdip_mp",
-        split="test"),
-    'rvl_cdip_N': load_dataset(
-        "bdpc/rvl_cdip_mp",
-        split="test")
-}
-meta_cats = {'dataset': ['rvl_cdip', 'rvl_cdip_N'],
-             'label': _CLASSES
-            }
 sliders = [gr.Dropdown(choices=choices, value=choices[-1], label=label) for label, choices in meta_cats.items()]
-slider_defaults = [slider.value for slider in sliders]
 outputs = [
     gr.Textbox(label="label"),
@@ -167,5 +156,21 @@ outputs = [
     gr.File(label="PDF"),
 ]
-iface = gr.Interface(fn=main, inputs=sliders, outputs=outputs, description="Visualize PDF samples from multi-page (PDF) document classification datasets", title='Beyond Document Page Classification: Examples')
-iface.launch(share=True)

 import os
 import gradio as gr
 from collections import OrderedDict
 from PIL import Image, ImageDraw, ImageFont
 from io import BytesIO
+import time
+import tempfile
 import PyPDF2
 import pdf2image
+from datasets import load_dataset
 MAX_PAGES = 50
 MAX_PDF_SIZE = 100000000  # almost 100MB
                 images.append(im)
     except Exception as e:
         print(f"{pdf_path} PyPDF get_images {e}")
+        images = pdf2image.convert_from_bytes(pdf_path)
     # simpler but slower
     # images = pdf2image.convert_from_path(pdf_path)
     return equal_image_grid(images)
def main(dataset, label):
    """Pick a random PDF sample from a dataset and prepare it for display.

    Args:
        dataset: Key into the module-level ``DATASETS`` mapping.
        label: Class name to filter on. A falsy value (``''`` or ``None``)
            disables filtering, so any class is accepted.

    Returns:
        A 3-tuple ``(label_name, grid, pdf_file_path)`` matching the Gradio
        outputs. When no sample matches (or none can be rendered), the first
        element is an explanatory message and the other two are ``None``.
    """
    # To get different samples on each call, use a millisecond timestamp as seed.
    seed = int(time.time() * 1000) % 1000000
    shuffled_dataset = DATASETS[dataset].shuffle(buffer_size=10, seed=seed)

    for sample in shuffled_dataset:
        # The label column is named "label" or "labels" depending on the sample.
        label_column = "label" if "label" in sample else "labels"
        filelabel = _CLASSES[sample[label_column]]
        if label and filelabel != label:
            continue

        # NOTE(review): assumes sample["file"] holds the raw PDF bytes, as it is
        # wrapped in BytesIO and written to a binary file below.
        pdf_bytes = sample["file"]
        grid = pdf_to_grid(BytesIO(pdf_bytes))
        if grid is None:
            continue

        # The file output needs a path that still exists after this function
        # returns: delete=False keeps the file on disk, and leaving the `with`
        # block flushes and closes it so the full content is readable.
        with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as pdf_file:
            pdf_file.write(pdf_bytes)
        return filelabel, grid, pdf_file.name

    # Keep the return shape stable instead of implicitly returning None.
    return f"No sample found for label {label!r} in dataset {dataset!r}", None, None
 _CLASSES = [
     "letter",
     "questionnaire",
     "resume",
     "memo",
+    ''
 ]
+# Both datasets are opened in streaming mode (not loaded into memory); main() reshuffles the stream with a time-based seed on every button press.
+DATASETS = OrderedDict(
+    {
+        "rvl_cdip": load_dataset("bdpc/rvl_cdip_mp", split="test", streaming=True),
+        "rvl_cdip_N": load_dataset("bdpc/rvl_cdip_n_mp", split="test", streaming=True),
+    }
+)
+meta_cats = {"dataset": ["rvl_cdip", "rvl_cdip_N"], "label": _CLASSES}
 sliders = [gr.Dropdown(choices=choices, value=choices[-1], label=label) for label, choices in meta_cats.items()]
+slider_defaults = [sliders[0].value, None]
+# test
+# l, im, f = main(*slider_defaults)
 outputs = [
     gr.Textbox(label="label"),
     gr.File(label="PDF"),
 ]
+DESCRIPTION = """
+Visualize PDF samples from multi-page (PDF) document classification datasets @ https://huggingface.co/datasets/bdpc
+- **dataset**: dataset name
+- **label**: label name
+The first time that the app is launched, it will download the datasets, which can take a few minutes.
+For fastest response, choose the rvl_cdip_N dataset, which is considerably smaller to iterate over.
+"""
+iface = gr.Interface(
+    fn=main,
+    inputs=sliders,
+    outputs=outputs,
+    description=DESCRIPTION,
+    title="Beyond Document Page Classification: Examples",
+)
+iface.launch(share=True)