openfree committed on
Commit
b0061a0
·
verified ·
1 Parent(s): 99d5f89

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +241 -34
app.py CHANGED
@@ -35,6 +35,114 @@ from magic_pdf.data.data_reader_writer import FileBasedDataReader
35
  from magic_pdf.libs.hash_utils import compute_sha256
36
  from magic_pdf.tools.common import do_parse, prepare_env
37
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
38
  def read_fn(path):
39
  disk_rw = FileBasedDataReader(os.path.dirname(path))
40
  return disk_rw.read(os.path.basename(path))
@@ -113,6 +221,18 @@ def to_markdown(file_path, end_pages, is_ocr, layout_mode, formula_enable, table
113
  new_pdf_path = os.path.join(local_md_dir, file_name + "_layout.pdf")
114
  return md_content, txt_content, archive_zip_path, new_pdf_path
115
 
 
 
 
 
 
 
 
 
 
 
 
 
116
  latex_delimiters = [{"left": "$$", "right": "$$", "display": True},
117
  {"left": '$', "right": '$', "display": False}]
118
 
@@ -152,36 +272,81 @@ other_lang = ['ch', 'en', 'korean', 'japan', 'chinese_cht', 'ta', 'te', 'ka']
152
  all_lang = ['', 'auto']
153
  all_lang.extend([*other_lang, *latin_lang, *arabic_lang, *cyrillic_lang, *devanagari_lang])
154
 
155
- def to_pdf(file_path):
156
- with pymupdf.open(file_path) as f:
157
- if f.is_pdf:
158
- return file_path
159
- else:
160
- pdf_bytes = f.convert_to_pdf()
161
- unique_filename = f"{uuid.uuid4()}.pdf"
162
- tmp_file_path = os.path.join(os.path.dirname(file_path), unique_filename)
163
- with open(tmp_file_path, 'wb') as tmp_pdf_file:
164
- tmp_pdf_file.write(pdf_bytes)
165
- return tmp_file_path
166
-
167
  if __name__ == "__main__":
168
- with gr.Blocks(title="OCR FLEX") as demo:
 
 
 
 
 
 
 
169
  with gr.Row():
 
170
  with gr.Column(variant='panel', scale=5):
171
- file = gr.File(label="PDF ๋˜๋Š” ์ด๋ฏธ์ง€ ํŒŒ์ผ์„ ์—…๋กœ๋“œํ•˜์„ธ์š”", file_types=[".pdf", ".png", ".jpeg", ".jpg"])
172
- max_pages = gr.Slider(1, 20, 10, step=1, label='์ตœ๋Œ€ ๋ณ€ํ™˜ ํŽ˜์ด์ง€ ์ˆ˜')
 
 
 
 
 
 
 
 
 
 
 
173
  with gr.Row():
174
- layout_mode = gr.Dropdown(["layoutlmv3", "doclayout_yolo"], label="๋ ˆ์ด์•„์›ƒ ๋ชจ๋ธ", value="doclayout_yolo")
175
- language = gr.Dropdown(all_lang, label="์–ธ์–ด", value='auto')
 
 
 
 
 
 
 
 
 
 
 
176
  with gr.Row():
177
- formula_enable = gr.Checkbox(label="์ˆ˜์‹ ์ธ์‹ ํ™œ์„ฑํ™”", value=True)
178
- is_ocr = gr.Checkbox(label="OCR ๊ฐ•์ œ ํ™œ์„ฑํ™”", value=False)
179
- table_enable = gr.Checkbox(label="ํ‘œ ์ธ์‹ ํ™œ์„ฑํ™”(ํ…Œ์ŠคํŠธ)", value=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
180
  with gr.Row():
181
- change_bu = gr.Button("๋ณ€ํ™˜")
182
- clear_bu = gr.ClearButton(value="์ดˆ๊ธฐํ™”")
183
- pdf_show = PDF(label='PDF ๋ฏธ๋ฆฌ๋ณด๊ธฐ', interactive=False, visible=True, height=800)
184
- with gr.Accordion("์˜ˆ์ œ:"):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
185
  example_root = os.path.join(os.path.dirname(__file__), "examples")
186
  gr.Examples(
187
  examples=[os.path.join(example_root, _) for _ in os.listdir(example_root) if
@@ -189,18 +354,60 @@ if __name__ == "__main__":
189
  inputs=file
190
  )
191
 
 
192
  with gr.Column(variant='panel', scale=5):
193
- output_file = gr.File(label="๋ณ€ํ™˜ ๊ฒฐ๊ณผ", interactive=False)
194
- with gr.Tabs():
 
 
 
 
 
195
  with gr.Tab("๋งˆํฌ๋‹ค์šด ๋ Œ๋”๋ง"):
196
- md = gr.Markdown(label="๋งˆํฌ๋‹ค์šด ๋ Œ๋”๋ง", height=1100, show_copy_button=True,
197
- latex_delimiters=latex_delimiters, line_breaks=True)
 
 
 
 
 
 
 
198
  with gr.Tab("๋งˆํฌ๋‹ค์šด ํ…์ŠคํŠธ"):
199
- md_text = gr.TextArea(lines=45, show_copy_button=True)
200
-
201
- file.change(fn=to_pdf, inputs=file, outputs=pdf_show)
202
- change_bu.click(fn=to_markdown, inputs=[file, max_pages, is_ocr, layout_mode, formula_enable, table_enable, language],
203
- outputs=[md, md_text, output_file, pdf_show], api_name=False)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
204
  clear_bu.add([file, md, pdf_show, md_text, output_file, is_ocr])
205
 
206
- demo.launch(ssr_mode=True)
 
 
35
  from magic_pdf.libs.hash_utils import compute_sha256
36
  from magic_pdf.tools.common import do_parse, prepare_env
37
 
38
+ def create_css():
39
+ return """
40
+ /* ์ „์ฒด ์Šคํƒ€์ผ */
41
+ .gradio-container {
42
+ background: linear-gradient(135deg, #EFF6FF 0%, #F5F3FF 100%);
43
+ max-width: 1200px !important;
44
+ margin: 0 auto !important;
45
+ padding: 2rem !important;
46
+ }
47
+ /* ์ œ๋ชฉ ์Šคํƒ€์ผ */
48
+ .title-area {
49
+ text-align: center;
50
+ margin-bottom: 2rem;
51
+ padding: 1rem;
52
+ background: white;
53
+ border-radius: 1rem;
54
+ box-shadow: 0 4px 6px -1px rgba(0, 0, 0, 0.1);
55
+ }
56
+ .title-area h1 {
57
+ background: linear-gradient(90deg, #2563EB 0%, #7C3AED 100%);
58
+ -webkit-background-clip: text;
59
+ -webkit-text-fill-color: transparent;
60
+ font-size: 2.5rem;
61
+ font-weight: bold;
62
+ margin-bottom: 0.5rem;
63
+ }
64
+ .title-area p {
65
+ color: #6B7280;
66
+ font-size: 1.1rem;
67
+ }
68
+ /* ์ปดํฌ๋„ŒํŠธ ์Šคํƒ€์ผ๋ง */
69
+ .gr-box, .gr-panel {
70
+ border: 2px solid #E0E7FF !important;
71
+ border-radius: 12px !important;
72
+ box-shadow: 0 4px 6px -1px rgba(0, 0, 0, 0.1) !important;
73
+ background: white !important;
74
+ }
75
+ /* ํŒŒ์ผ ์—…๋กœ๋“œ ์˜์—ญ */
76
+ .file-upload {
77
+ border: 2px dashed #93C5FD !important;
78
+ border-radius: 8px !important;
79
+ padding: 2rem !important;
80
+ background: #F0F9FF !important;
81
+ transition: all 0.3s ease;
82
+ }
83
+ .file-upload:hover {
84
+ background: #E0F2FE !important;
85
+ border-color: #60A5FA !important;
86
+ }
87
+ /* ๋ฒ„ํŠผ ์Šคํƒ€์ผ๋ง */
88
+ .gr-button.primary-button {
89
+ background: linear-gradient(90deg, #2563EB 0%, #7C3AED 100%) !important;
90
+ color: white !important;
91
+ border: none !important;
92
+ border-radius: 8px !important;
93
+ padding: 0.75rem 1.5rem !important;
94
+ font-weight: bold !important;
95
+ transition: opacity 0.2s !important;
96
+ }
97
+ .gr-button.primary-button:hover {
98
+ opacity: 0.9 !important;
99
+ }
100
+ .gr-button.secondary-button {
101
+ background: white !important;
102
+ color: #4B5563 !important;
103
+ border: 1px solid #D1D5DB !important;
104
+ border-radius: 8px !important;
105
+ padding: 0.75rem 1.5rem !important;
106
+ }
107
+ .gr-button.secondary-button:hover {
108
+ background: #F9FAFB !important;
109
+ }
110
+ /* ์Šฌ๋ผ์ด๋” ์Šคํƒ€์ผ๋ง */
111
+ .gr-slider {
112
+ background: #E0E7FF !important;
113
+ }
114
+ .gr-slider .gr-slider-handle {
115
+ background: #4F46E5 !important;
116
+ }
117
+ /* ์ฒดํฌ๋ฐ•์Šค ์Šคํƒ€์ผ๋ง */
118
+ .gr-checkbox {
119
+ border-color: #6366F1 !important;
120
+ }
121
+ .gr-checkbox:checked {
122
+ background-color: #4F46E5 !important;
123
+ }
124
+ /* ํƒญ ์Šคํƒ€์ผ๋ง */
125
+ .gr-tabs {
126
+ border-bottom: 2px solid #E0E7FF !important;
127
+ }
128
+ .gr-tab-button {
129
+ color: #6B7280 !important;
130
+ padding: 0.75rem 1rem !important;
131
+ font-weight: 500 !important;
132
+ }
133
+ .gr-tab-button.selected {
134
+ color: #4F46E5 !important;
135
+ border-bottom: 2px solid #4F46E5 !important;
136
+ }
137
+ /* ๋งˆํฌ๋‹ค์šด ์ถœ๋ ฅ ์˜์—ญ */
138
+ .markdown-output {
139
+ background: white !important;
140
+ border-radius: 8px !important;
141
+ padding: 1rem !important;
142
+ box-shadow: inset 0 2px 4px rgba(0, 0, 0, 0.05) !important;
143
+ }
144
+ """
145
+
146
  def read_fn(path):
147
  disk_rw = FileBasedDataReader(os.path.dirname(path))
148
  return disk_rw.read(os.path.basename(path))
 
221
  new_pdf_path = os.path.join(local_md_dir, file_name + "_layout.pdf")
222
  return md_content, txt_content, archive_zip_path, new_pdf_path
223
 
224
+ def to_pdf(file_path):
225
+ with pymupdf.open(file_path) as f:
226
+ if f.is_pdf:
227
+ return file_path
228
+ else:
229
+ pdf_bytes = f.convert_to_pdf()
230
+ unique_filename = f"{uuid.uuid4()}.pdf"
231
+ tmp_file_path = os.path.join(os.path.dirname(file_path), unique_filename)
232
+ with open(tmp_file_path, 'wb') as tmp_pdf_file:
233
+ tmp_pdf_file.write(pdf_bytes)
234
+ return tmp_file_path
235
+
236
  latex_delimiters = [{"left": "$$", "right": "$$", "display": True},
237
  {"left": '$', "right": '$', "display": False}]
238
 
 
272
  all_lang = ['', 'auto']
273
  all_lang.extend([*other_lang, *latin_lang, *arabic_lang, *cyrillic_lang, *devanagari_lang])
274
 
 
 
 
 
 
 
 
 
 
 
 
 
275
  if __name__ == "__main__":
276
+ with gr.Blocks(title="OCR FLEX", css=create_css()) as demo:
277
+ # ํƒ€์ดํ‹€ ์˜์—ญ
278
+ with gr.Row(elem_classes="title-area"):
279
+ gr.HTML("""
280
+ <h1>OCR FLEX</h1>
281
+ <p>PDF์™€ ์ด๋ฏธ์ง€์—์„œ ํ…์ŠคํŠธ๋ฅผ ๋น ๋ฅด๊ณ  ์ •ํ™•ํ•˜๊ฒŒ ์ถ”์ถœํ•˜์„ธ์š”</p>
282
+ """)
283
+
284
  with gr.Row():
285
+ # ์™ผ์ชฝ ํŒจ๋„
286
  with gr.Column(variant='panel', scale=5):
287
+ file = gr.File(
288
+ label="PDF ๋˜๋Š” ์ด๋ฏธ์ง€ ํŒŒ์ผ์„ ์—…๋กœ๋“œํ•˜์„ธ์š”",
289
+ file_types=[".pdf", ".png", ".jpeg", ".jpg"],
290
+ elem_classes="file-upload"
291
+ )
292
+
293
+ max_pages = gr.Slider(
294
+ 1, 20, 10,
295
+ step=1,
296
+ label='์ตœ๋Œ€ ๋ณ€ํ™˜ ํŽ˜์ด์ง€ ์ˆ˜',
297
+ elem_classes="custom-slider"
298
+ )
299
+
300
  with gr.Row():
301
+ layout_mode = gr.Dropdown(
302
+ ["layoutlmv3", "doclayout_yolo"],
303
+ label="๋ ˆ์ด์•„์›ƒ ๋ชจ๋ธ",
304
+ value="doclayout_yolo",
305
+ elem_classes="custom-dropdown"
306
+ )
307
+ language = gr.Dropdown(
308
+ all_lang,
309
+ label="์–ธ์–ด",
310
+ value='auto',
311
+ elem_classes="custom-dropdown"
312
+ )
313
+
314
  with gr.Row():
315
+ formula_enable = gr.Checkbox(
316
+ label="์ˆ˜์‹ ์ธ์‹ ํ™œ์„ฑํ™”",
317
+ value=True,
318
+ elem_classes="custom-checkbox"
319
+ )
320
+ is_ocr = gr.Checkbox(
321
+ label="OCR ๊ฐ•์ œ ํ™œ์„ฑํ™”",
322
+ value=False,
323
+ elem_classes="custom-checkbox"
324
+ )
325
+ table_enable = gr.Checkbox(
326
+ label="ํ‘œ ์ธ์‹ ํ™œ์„ฑํ™”(ํ…Œ์ŠคํŠธ)",
327
+ value=True,
328
+ elem_classes="custom-checkbox"
329
+ )
330
+
331
  with gr.Row():
332
+ change_bu = gr.Button(
333
+ "๋ณ€ํ™˜",
334
+ elem_classes="primary-button"
335
+ )
336
+ clear_bu = gr.ClearButton(
337
+ value="์ดˆ๊ธฐํ™”",
338
+ elem_classes="secondary-button"
339
+ )
340
+
341
+ pdf_show = PDF(
342
+ label='PDF ๋ฏธ๋ฆฌ๋ณด๊ธฐ',
343
+ interactive=False,
344
+ visible=True,
345
+ height=800,
346
+ elem_classes="pdf-preview"
347
+ )
348
+
349
+ with gr.Accordion("์˜ˆ์ œ:", open=False):
350
  example_root = os.path.join(os.path.dirname(__file__), "examples")
351
  gr.Examples(
352
  examples=[os.path.join(example_root, _) for _ in os.listdir(example_root) if
 
354
  inputs=file
355
  )
356
 
357
+ # ์˜ค๋ฅธ์ชฝ ํŒจ๋„
358
  with gr.Column(variant='panel', scale=5):
359
+ output_file = gr.File(
360
+ label="๋ณ€ํ™˜ ๊ฒฐ๊ณผ",
361
+ interactive=False,
362
+ elem_classes="output-file"
363
+ )
364
+
365
+ with gr.Tabs() as tabs:
366
  with gr.Tab("๋งˆํฌ๋‹ค์šด ๋ Œ๋”๋ง"):
367
+ md = gr.Markdown(
368
+ label="๋งˆํฌ๋‹ค์šด ๋ Œ๋”๋ง",
369
+ height=1100,
370
+ show_copy_button=True,
371
+ latex_delimiters=latex_delimiters,
372
+ line_breaks=True,
373
+ elem_classes="markdown-output"
374
+ )
375
+
376
  with gr.Tab("๋งˆํฌ๋‹ค์šด ํ…์ŠคํŠธ"):
377
+ md_text = gr.TextArea(
378
+ lines=45,
379
+ show_copy_button=True,
380
+ elem_classes="markdown-text"
381
+ )
382
+
383
+ # ์ด๋ฒคํŠธ ํ•ธ๋“ค๋Ÿฌ
384
+ file.change(
385
+ fn=to_pdf,
386
+ inputs=file,
387
+ outputs=pdf_show
388
+ )
389
+
390
+ change_bu.click(
391
+ fn=to_markdown,
392
+ inputs=[
393
+ file,
394
+ max_pages,
395
+ is_ocr,
396
+ layout_mode,
397
+ formula_enable,
398
+ table_enable,
399
+ language
400
+ ],
401
+ outputs=[
402
+ md,
403
+ md_text,
404
+ output_file,
405
+ pdf_show
406
+ ],
407
+ api_name=False
408
+ )
409
+
410
  clear_bu.add([file, md, pdf_show, md_text, output_file, is_ocr])
411
 
412
+ # ์•ฑ ์‹คํ–‰
413
+ demo.launch(ssr_mode=True)