Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
@@ -35,6 +35,114 @@ from magic_pdf.data.data_reader_writer import FileBasedDataReader
|
|
35 |
from magic_pdf.libs.hash_utils import compute_sha256
|
36 |
from magic_pdf.tools.common import do_parse, prepare_env
|
37 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
38 |
def read_fn(path):
|
39 |
disk_rw = FileBasedDataReader(os.path.dirname(path))
|
40 |
return disk_rw.read(os.path.basename(path))
|
@@ -113,6 +221,18 @@ def to_markdown(file_path, end_pages, is_ocr, layout_mode, formula_enable, table
|
|
113 |
new_pdf_path = os.path.join(local_md_dir, file_name + "_layout.pdf")
|
114 |
return md_content, txt_content, archive_zip_path, new_pdf_path
|
115 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
116 |
latex_delimiters = [{"left": "$$", "right": "$$", "display": True},
|
117 |
{"left": '$', "right": '$', "display": False}]
|
118 |
|
@@ -152,36 +272,81 @@ other_lang = ['ch', 'en', 'korean', 'japan', 'chinese_cht', 'ta', 'te', 'ka']
|
|
152 |
all_lang = ['', 'auto']
|
153 |
all_lang.extend([*other_lang, *latin_lang, *arabic_lang, *cyrillic_lang, *devanagari_lang])
|
154 |
|
155 |
-
def to_pdf(file_path):
|
156 |
-
with pymupdf.open(file_path) as f:
|
157 |
-
if f.is_pdf:
|
158 |
-
return file_path
|
159 |
-
else:
|
160 |
-
pdf_bytes = f.convert_to_pdf()
|
161 |
-
unique_filename = f"{uuid.uuid4()}.pdf"
|
162 |
-
tmp_file_path = os.path.join(os.path.dirname(file_path), unique_filename)
|
163 |
-
with open(tmp_file_path, 'wb') as tmp_pdf_file:
|
164 |
-
tmp_pdf_file.write(pdf_bytes)
|
165 |
-
return tmp_file_path
|
166 |
-
|
167 |
if __name__ == "__main__":
|
168 |
-
with gr.Blocks(title="OCR FLEX") as demo:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
169 |
with gr.Row():
|
|
|
170 |
with gr.Column(variant='panel', scale=5):
|
171 |
-
file = gr.File(
|
172 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
173 |
with gr.Row():
|
174 |
-
layout_mode = gr.Dropdown(
|
175 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
176 |
with gr.Row():
|
177 |
-
formula_enable = gr.Checkbox(
|
178 |
-
|
179 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
180 |
with gr.Row():
|
181 |
-
change_bu = gr.Button(
|
182 |
-
|
183 |
-
|
184 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
185 |
example_root = os.path.join(os.path.dirname(__file__), "examples")
|
186 |
gr.Examples(
|
187 |
examples=[os.path.join(example_root, _) for _ in os.listdir(example_root) if
|
@@ -189,18 +354,60 @@ if __name__ == "__main__":
|
|
189 |
inputs=file
|
190 |
)
|
191 |
|
|
|
192 |
with gr.Column(variant='panel', scale=5):
|
193 |
-
output_file = gr.File(
|
194 |
-
|
|
|
|
|
|
|
|
|
|
|
195 |
with gr.Tab("๋งํฌ๋ค์ด ๋ ๋๋ง"):
|
196 |
-
md = gr.Markdown(
|
197 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
198 |
with gr.Tab("๋งํฌ๋ค์ด ํ
์คํธ"):
|
199 |
-
md_text = gr.TextArea(
|
200 |
-
|
201 |
-
|
202 |
-
|
203 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
204 |
clear_bu.add([file, md, pdf_show, md_text, output_file, is_ocr])
|
205 |
|
206 |
-
|
|
|
|
35 |
from magic_pdf.libs.hash_utils import compute_sha256
|
36 |
from magic_pdf.tools.common import do_parse, prepare_env
|
37 |
|
38 |
+
def create_css():
|
39 |
+
return """
|
40 |
+
/* ์ ์ฒด ์คํ์ผ */
|
41 |
+
.gradio-container {
|
42 |
+
background: linear-gradient(135deg, #EFF6FF 0%, #F5F3FF 100%);
|
43 |
+
max-width: 1200px !important;
|
44 |
+
margin: 0 auto !important;
|
45 |
+
padding: 2rem !important;
|
46 |
+
}
|
47 |
+
/* ์ ๋ชฉ ์คํ์ผ */
|
48 |
+
.title-area {
|
49 |
+
text-align: center;
|
50 |
+
margin-bottom: 2rem;
|
51 |
+
padding: 1rem;
|
52 |
+
background: white;
|
53 |
+
border-radius: 1rem;
|
54 |
+
box-shadow: 0 4px 6px -1px rgba(0, 0, 0, 0.1);
|
55 |
+
}
|
56 |
+
.title-area h1 {
|
57 |
+
background: linear-gradient(90deg, #2563EB 0%, #7C3AED 100%);
|
58 |
+
-webkit-background-clip: text;
|
59 |
+
-webkit-text-fill-color: transparent;
|
60 |
+
font-size: 2.5rem;
|
61 |
+
font-weight: bold;
|
62 |
+
margin-bottom: 0.5rem;
|
63 |
+
}
|
64 |
+
.title-area p {
|
65 |
+
color: #6B7280;
|
66 |
+
font-size: 1.1rem;
|
67 |
+
}
|
68 |
+
/* ์ปดํฌ๋ํธ ์คํ์ผ๋ง */
|
69 |
+
.gr-box, .gr-panel {
|
70 |
+
border: 2px solid #E0E7FF !important;
|
71 |
+
border-radius: 12px !important;
|
72 |
+
box-shadow: 0 4px 6px -1px rgba(0, 0, 0, 0.1) !important;
|
73 |
+
background: white !important;
|
74 |
+
}
|
75 |
+
/* ํ์ผ ์
๋ก๋ ์์ญ */
|
76 |
+
.file-upload {
|
77 |
+
border: 2px dashed #93C5FD !important;
|
78 |
+
border-radius: 8px !important;
|
79 |
+
padding: 2rem !important;
|
80 |
+
background: #F0F9FF !important;
|
81 |
+
transition: all 0.3s ease;
|
82 |
+
}
|
83 |
+
.file-upload:hover {
|
84 |
+
background: #E0F2FE !important;
|
85 |
+
border-color: #60A5FA !important;
|
86 |
+
}
|
87 |
+
/* ๋ฒํผ ์คํ์ผ๋ง */
|
88 |
+
.gr-button.primary-button {
|
89 |
+
background: linear-gradient(90deg, #2563EB 0%, #7C3AED 100%) !important;
|
90 |
+
color: white !important;
|
91 |
+
border: none !important;
|
92 |
+
border-radius: 8px !important;
|
93 |
+
padding: 0.75rem 1.5rem !important;
|
94 |
+
font-weight: bold !important;
|
95 |
+
transition: opacity 0.2s !important;
|
96 |
+
}
|
97 |
+
.gr-button.primary-button:hover {
|
98 |
+
opacity: 0.9 !important;
|
99 |
+
}
|
100 |
+
.gr-button.secondary-button {
|
101 |
+
background: white !important;
|
102 |
+
color: #4B5563 !important;
|
103 |
+
border: 1px solid #D1D5DB !important;
|
104 |
+
border-radius: 8px !important;
|
105 |
+
padding: 0.75rem 1.5rem !important;
|
106 |
+
}
|
107 |
+
.gr-button.secondary-button:hover {
|
108 |
+
background: #F9FAFB !important;
|
109 |
+
}
|
110 |
+
/* ์ฌ๋ผ์ด๋ ์คํ์ผ๋ง */
|
111 |
+
.gr-slider {
|
112 |
+
background: #E0E7FF !important;
|
113 |
+
}
|
114 |
+
.gr-slider .gr-slider-handle {
|
115 |
+
background: #4F46E5 !important;
|
116 |
+
}
|
117 |
+
/* ์ฒดํฌ๋ฐ์ค ์คํ์ผ๋ง */
|
118 |
+
.gr-checkbox {
|
119 |
+
border-color: #6366F1 !important;
|
120 |
+
}
|
121 |
+
.gr-checkbox:checked {
|
122 |
+
background-color: #4F46E5 !important;
|
123 |
+
}
|
124 |
+
/* ํญ ์คํ์ผ๋ง */
|
125 |
+
.gr-tabs {
|
126 |
+
border-bottom: 2px solid #E0E7FF !important;
|
127 |
+
}
|
128 |
+
.gr-tab-button {
|
129 |
+
color: #6B7280 !important;
|
130 |
+
padding: 0.75rem 1rem !important;
|
131 |
+
font-weight: 500 !important;
|
132 |
+
}
|
133 |
+
.gr-tab-button.selected {
|
134 |
+
color: #4F46E5 !important;
|
135 |
+
border-bottom: 2px solid #4F46E5 !important;
|
136 |
+
}
|
137 |
+
/* ๋งํฌ๋ค์ด ์ถ๋ ฅ ์์ญ */
|
138 |
+
.markdown-output {
|
139 |
+
background: white !important;
|
140 |
+
border-radius: 8px !important;
|
141 |
+
padding: 1rem !important;
|
142 |
+
box-shadow: inset 0 2px 4px rgba(0, 0, 0, 0.05) !important;
|
143 |
+
}
|
144 |
+
"""
|
145 |
+
|
146 |
def read_fn(path):
    """Read the file at ``path`` through a ``FileBasedDataReader``.

    The reader is created on the parent directory of ``path`` and is then
    asked for the file's base name; whatever ``read`` returns is handed back
    to the caller unchanged.
    """
    parent_dir, file_name = os.path.split(path)
    reader = FileBasedDataReader(parent_dir)
    return reader.read(file_name)
|
|
|
221 |
new_pdf_path = os.path.join(local_md_dir, file_name + "_layout.pdf")
|
222 |
return md_content, txt_content, archive_zip_path, new_pdf_path
|
223 |
|
224 |
+
def to_pdf(file_path):
    """Return a path to a PDF version of ``file_path``.

    The file is opened with ``pymupdf``. If it is already a PDF, its own path
    is returned unchanged. Otherwise it is converted with
    ``convert_to_pdf()``, the bytes are written to a new ``<uuid4>.pdf`` file
    in the same directory as the input, and that new path is returned.

    Args:
        file_path: Path of the uploaded document or image, or ``None`` when
            no file is selected.

    Returns:
        Path of a PDF file, or ``None`` when ``file_path`` is ``None``.
    """
    # This function is registered on ``file.change`` and the same component
    # is reset by the clear button, so a ``None`` path is presumably possible
    # here (NOTE(review): confirm against the Gradio version in use). Return
    # early instead of relying on how ``pymupdf.open`` treats ``None``.
    if file_path is None:
        return None

    with pymupdf.open(file_path) as doc:
        if doc.is_pdf:
            return file_path
        pdf_bytes = doc.convert_to_pdf()

    # The source document is closed by now; only the converted bytes are
    # needed. A random name avoids clashing with other files in the folder.
    converted_path = os.path.join(
        os.path.dirname(file_path), f"{uuid.uuid4()}.pdf"
    )
    with open(converted_path, 'wb') as out:
        out.write(pdf_bytes)
    return converted_path
|
235 |
+
|
236 |
latex_delimiters = [{"left": "$$", "right": "$$", "display": True},
|
237 |
{"left": '$', "right": '$', "display": False}]
|
238 |
|
|
|
272 |
# Seed the language choices with the empty string and 'auto'; the
# script-specific language codes are appended right after.
all_lang = ['', 'auto']
|
273 |
# Append every per-script language list, in this order, to build the full
# set of options offered by the language dropdown.
all_lang.extend([*other_lang, *latin_lang, *arabic_lang, *cyrillic_lang, *devanagari_lang])
|
274 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
275 |
if __name__ == "__main__":
|
276 |
+
with gr.Blocks(title="OCR FLEX", css=create_css()) as demo:
|
277 |
+
# ํ์ดํ ์์ญ
|
278 |
+
with gr.Row(elem_classes="title-area"):
|
279 |
+
gr.HTML("""
|
280 |
+
<h1>OCR FLEX</h1>
|
281 |
+
<p>PDF์ ์ด๋ฏธ์ง์์ ํ
์คํธ๋ฅผ ๋น ๋ฅด๊ณ ์ ํํ๊ฒ ์ถ์ถํ์ธ์</p>
|
282 |
+
""")
|
283 |
+
|
284 |
with gr.Row():
|
285 |
+
# ์ผ์ชฝ ํจ๋
|
286 |
with gr.Column(variant='panel', scale=5):
|
287 |
+
file = gr.File(
|
288 |
+
label="PDF ๋๋ ์ด๋ฏธ์ง ํ์ผ์ ์
๋ก๋ํ์ธ์",
|
289 |
+
file_types=[".pdf", ".png", ".jpeg", ".jpg"],
|
290 |
+
elem_classes="file-upload"
|
291 |
+
)
|
292 |
+
|
293 |
+
max_pages = gr.Slider(
|
294 |
+
1, 20, 10,
|
295 |
+
step=1,
|
296 |
+
label='์ต๋ ๋ณํ ํ์ด์ง ์',
|
297 |
+
elem_classes="custom-slider"
|
298 |
+
)
|
299 |
+
|
300 |
with gr.Row():
|
301 |
+
layout_mode = gr.Dropdown(
|
302 |
+
["layoutlmv3", "doclayout_yolo"],
|
303 |
+
label="๋ ์ด์์ ๋ชจ๋ธ",
|
304 |
+
value="doclayout_yolo",
|
305 |
+
elem_classes="custom-dropdown"
|
306 |
+
)
|
307 |
+
language = gr.Dropdown(
|
308 |
+
all_lang,
|
309 |
+
label="์ธ์ด",
|
310 |
+
value='auto',
|
311 |
+
elem_classes="custom-dropdown"
|
312 |
+
)
|
313 |
+
|
314 |
with gr.Row():
|
315 |
+
formula_enable = gr.Checkbox(
|
316 |
+
label="์์ ์ธ์ ํ์ฑํ",
|
317 |
+
value=True,
|
318 |
+
elem_classes="custom-checkbox"
|
319 |
+
)
|
320 |
+
is_ocr = gr.Checkbox(
|
321 |
+
label="OCR ๊ฐ์ ํ์ฑํ",
|
322 |
+
value=False,
|
323 |
+
elem_classes="custom-checkbox"
|
324 |
+
)
|
325 |
+
table_enable = gr.Checkbox(
|
326 |
+
label="ํ ์ธ์ ํ์ฑํ(ํ
์คํธ)",
|
327 |
+
value=True,
|
328 |
+
elem_classes="custom-checkbox"
|
329 |
+
)
|
330 |
+
|
331 |
with gr.Row():
|
332 |
+
change_bu = gr.Button(
|
333 |
+
"๋ณํ",
|
334 |
+
elem_classes="primary-button"
|
335 |
+
)
|
336 |
+
clear_bu = gr.ClearButton(
|
337 |
+
value="์ด๊ธฐํ",
|
338 |
+
elem_classes="secondary-button"
|
339 |
+
)
|
340 |
+
|
341 |
+
pdf_show = PDF(
|
342 |
+
label='PDF ๋ฏธ๋ฆฌ๋ณด๊ธฐ',
|
343 |
+
interactive=False,
|
344 |
+
visible=True,
|
345 |
+
height=800,
|
346 |
+
elem_classes="pdf-preview"
|
347 |
+
)
|
348 |
+
|
349 |
+
with gr.Accordion("์์ :", open=False):
|
350 |
example_root = os.path.join(os.path.dirname(__file__), "examples")
|
351 |
gr.Examples(
|
352 |
examples=[os.path.join(example_root, _) for _ in os.listdir(example_root) if
|
|
|
354 |
inputs=file
|
355 |
)
|
356 |
|
357 |
+
# ์ค๋ฅธ์ชฝ ํจ๋
|
358 |
with gr.Column(variant='panel', scale=5):
|
359 |
+
output_file = gr.File(
|
360 |
+
label="๋ณํ ๊ฒฐ๊ณผ",
|
361 |
+
interactive=False,
|
362 |
+
elem_classes="output-file"
|
363 |
+
)
|
364 |
+
|
365 |
+
with gr.Tabs() as tabs:
|
366 |
with gr.Tab("๋งํฌ๋ค์ด ๋ ๋๋ง"):
|
367 |
+
md = gr.Markdown(
|
368 |
+
label="๋งํฌ๋ค์ด ๋ ๋๋ง",
|
369 |
+
height=1100,
|
370 |
+
show_copy_button=True,
|
371 |
+
latex_delimiters=latex_delimiters,
|
372 |
+
line_breaks=True,
|
373 |
+
elem_classes="markdown-output"
|
374 |
+
)
|
375 |
+
|
376 |
with gr.Tab("๋งํฌ๋ค์ด ํ
์คํธ"):
|
377 |
+
md_text = gr.TextArea(
|
378 |
+
lines=45,
|
379 |
+
show_copy_button=True,
|
380 |
+
elem_classes="markdown-text"
|
381 |
+
)
|
382 |
+
|
383 |
+
# ์ด๋ฒคํธ ํธ๋ค๋ฌ
|
384 |
+
file.change(
|
385 |
+
fn=to_pdf,
|
386 |
+
inputs=file,
|
387 |
+
outputs=pdf_show
|
388 |
+
)
|
389 |
+
|
390 |
+
change_bu.click(
|
391 |
+
fn=to_markdown,
|
392 |
+
inputs=[
|
393 |
+
file,
|
394 |
+
max_pages,
|
395 |
+
is_ocr,
|
396 |
+
layout_mode,
|
397 |
+
formula_enable,
|
398 |
+
table_enable,
|
399 |
+
language
|
400 |
+
],
|
401 |
+
outputs=[
|
402 |
+
md,
|
403 |
+
md_text,
|
404 |
+
output_file,
|
405 |
+
pdf_show
|
406 |
+
],
|
407 |
+
api_name=False
|
408 |
+
)
|
409 |
+
|
410 |
clear_bu.add([file, md, pdf_show, md_text, output_file, is_ocr])
|
411 |
|
412 |
+
# ์ฑ ์คํ
|
413 |
+
demo.launch(ssr_mode=True)
|