cstr committed on
Commit
fd11110
·
verified ·
1 Parent(s): 65caed7

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +115 -36
app.py CHANGED
@@ -6,6 +6,9 @@ import gradio as gr
6
  print(f"Gradio version: {gr.__version__}")
7
 
8
  from PyPDF2 import PdfReader
 
 
 
9
  import logging
10
  import webbrowser
11
  from huggingface_hub import InferenceClient
@@ -179,26 +182,114 @@ class ModelRegistry:
179
  self.groq_models = self._fetch_groq_models()
180
  return self.groq_models
181
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
182
  # Initialize model registry
183
  model_registry = ModelRegistry()
184
 
185
- def extract_text_from_pdf(pdf_path: str) -> str:
186
- """Extract text content from PDF file."""
 
 
 
 
 
 
 
 
 
 
 
 
187
  try:
188
- reader = PdfReader(pdf_path)
189
- text = ""
190
- for page_num, page in enumerate(reader.pages, start=1):
191
- page_text = page.extract_text()
192
- if page_text:
193
- text += page_text + "\n"
 
194
  else:
195
- logging.warning(f"No text found on page {page_num}.")
196
- if not text.strip():
197
- return "Error: No extractable text found in the PDF."
198
- return text
199
  except Exception as e:
200
- logging.error(f"Error reading PDF file: {e}")
201
- return f"Error reading PDF file: {e}"
202
 
203
  def format_content(text: str, format_type: str) -> str:
204
  """Format extracted text according to specified format."""
@@ -538,7 +629,7 @@ with gr.Blocks(css="""
538
  )
539
 
540
  format_type = gr.Radio(
541
- choices=["txt", "md", "html"],
542
  value="txt",
543
  label="📝 Output Format"
544
  )
@@ -780,46 +871,34 @@ with gr.Blocks(css="""
780
  ]
781
 
782
  # PDF Processing Handlers
783
- def handle_pdf_process(pdf, fmt, ctx_size):
784
- """Process PDF, format text, and return formatted text and snippets."""
785
  if not pdf:
786
  return "Please upload a PDF file.", "", "", [], gr.update(choices=[], value=None), None
787
 
788
  try:
789
- text = extract_text_from_pdf(pdf.name)
790
  if text.startswith("Error"):
791
  return text, "", "", [], gr.update(choices=[], value=None), None
792
 
793
- # Format the text *before* splitting into snippets:
794
- formatted_text = format_content(text, fmt) # Call format_content here!
795
- snippets_list = split_into_snippets(formatted_text, ctx_size)
796
 
797
- with tempfile.NamedTemporaryFile(delete=False, mode='w', suffix=f'.{fmt}') as f: # Correct suffix
798
- f.write(formatted_text) # Write the *formatted* text
799
  download_file = f.name
800
 
801
- snippet_choices = update_snippet_choices(snippets_list) # Pre-calculate choices
802
-
803
  return (
804
  f"PDF processed successfully! Generated {len(snippets_list)} snippets.",
805
- formatted_text, # Return the *formatted* text
806
- formatted_text, # Update the state with formatted text
807
  snippets_list,
808
  gr.update(choices=snippet_choices, value=snippet_choices[0] if snippet_choices else None),
809
  download_file
810
  )
811
-
812
  except Exception as e:
813
  error_msg = f"Error processing PDF: {str(e)}"
814
  logging.error(error_msg)
815
- return (
816
- error_msg,
817
- "",
818
- "",
819
- [],
820
- gr.update(choices=[], value=None),
821
- None
822
- )
823
 
824
  def handle_snippet_selection(choice, snippets_list): # Add download_snippet output
825
  """Handle snippet selection, update prompt, and provide snippet download."""
 
6
  print(f"Gradio version: {gr.__version__}")
7
 
8
  from PyPDF2 import PdfReader
9
+ import fitz # pymupdf
10
+ from pdf2md.converter import PDF2Markdown
11
+
12
  import logging
13
  import webbrowser
14
  from huggingface_hub import InferenceClient
 
182
  self.groq_models = self._fetch_groq_models()
183
  return self.groq_models
184
 
185
+ class PDFProcessor:
186
+ """Handles PDF conversion to text and markdown using different methods"""
187
+
188
+ @staticmethod
189
+ def txt_convert(pdf_path: str) -> str:
190
+ """Basic text extraction using PyPDF2"""
191
+ try:
192
+ reader = PdfReader(pdf_path)
193
+ text = ""
194
+ for page_num, page in enumerate(reader.pages, start=1):
195
+ page_text = page.extract_text()
196
+ if page_text:
197
+ text += page_text + "\n"
198
+ else:
199
+ logging.warning(f"No text found on page {page_num}.")
200
+ return text
201
+ except Exception as e:
202
+ logging.error(f"Error in txt conversion: {e}")
203
+ return f"Error: {str(e)}"
204
+
205
+ @staticmethod
206
+ def md_convert_with_pdf2md(pdf_path: str) -> str:
207
+ """Convert PDF to Markdown using pdf2md"""
208
+ try:
209
+ converter = PDF2Markdown()
210
+ markdown_text = converter.convert(pdf_path)
211
+ return markdown_text
212
+ except Exception as e:
213
+ logging.error(f"Error in pdf2md conversion: {e}")
214
+ return f"Error: {str(e)}"
215
+
216
+ @staticmethod
217
+ def md_convert_with_pymupdf(pdf_path: str) -> str:
218
+ """Convert PDF to Markdown using pymupdf"""
219
+ try:
220
+ doc = fitz.open(pdf_path)
221
+ markdown_text = []
222
+
223
+ for page in doc:
224
+ blocks = page.get_text("dict")["blocks"]
225
+
226
+ for block in blocks:
227
+ if "lines" in block:
228
+ for line in block["lines"]:
229
+ for span in line["spans"]:
230
+ font_size = span["size"]
231
+ content = span["text"]
232
+ font_flags = span["flags"] # Contains bold, italic info
233
+
234
+ # Handle headers based on font size
235
+ if font_size > 20:
236
+ markdown_text.append(f"# {content}\n")
237
+ elif font_size > 16:
238
+ markdown_text.append(f"## {content}\n")
239
+ elif font_size > 14:
240
+ markdown_text.append(f"### {content}\n")
241
+ else:
242
+ # Handle bold and italic
243
+ if font_flags & 2**4: # Bold
244
+ content = f"**{content}**"
245
+ if font_flags & 2**1: # Italic
246
+ content = f"*{content}*"
247
+ markdown_text.append(content)
248
+
249
+ markdown_text.append(" ") # Space between spans
250
+ markdown_text.append("\n") # Newline between lines
251
+
252
+ # Add extra newline between blocks for paragraphs
253
+ markdown_text.append("\n")
254
+
255
+ doc.close()
256
+ return "".join(markdown_text)
257
+ except Exception as e:
258
+ logging.error(f"Error in pymupdf conversion: {e}")
259
+ return f"Error: {str(e)}"
260
+
261
  # Initialize model registry
262
  model_registry = ModelRegistry()
263
 
264
+ def extract_text_from_pdf(pdf_path: str, format_type: str = "txt", md_engine: str = "pdf2md") -> str:
265
+ """
266
+ Extract and format text from PDF using different processors based on format.
267
+
268
+ Args:
269
+ pdf_path: Path to PDF file
270
+ format_type: Either 'txt' or 'md'
271
+ md_engine: When format_type is 'md', either 'pdf2md' or 'pymupdf'
272
+
273
+ Returns:
274
+ Formatted text content
275
+ """
276
+ processor = PDFProcessor()
277
+
278
  try:
279
+ if format_type == "txt":
280
+ return processor.txt_convert(pdf_path)
281
+ elif format_type == "md":
282
+ if md_engine == "pdf2md":
283
+ return processor.md_convert_with_pdf2md(pdf_path)
284
+ elif md_engine == "pymupdf":
285
+ return processor.md_convert_with_pymupdf(pdf_path)
286
  else:
287
+ return f"Error: Unsupported markdown engine: {md_engine}"
288
+ else:
289
+ return f"Error: Unsupported format type: {format_type}"
 
290
  except Exception as e:
291
+ logging.error(f"Error in PDF conversion: {e}")
292
+ return f"Error: {str(e)}"
293
 
294
  def format_content(text: str, format_type: str) -> str:
295
  """Format extracted text according to specified format."""
 
629
  )
630
 
631
  format_type = gr.Radio(
632
+ choices=["txt", "md (pdf2md)", "md (pymupdf)"],
633
  value="txt",
634
  label="📝 Output Format"
635
  )
 
871
  ]
872
 
873
  # PDF Processing Handlers
874
+ def handle_pdf_process(pdf, fmt, md_eng, ctx_size):
 
875
  if not pdf:
876
  return "Please upload a PDF file.", "", "", [], gr.update(choices=[], value=None), None
877
 
878
  try:
879
+ text = extract_text_from_pdf(pdf.name, format_type=fmt, md_engine=md_eng)
880
  if text.startswith("Error"):
881
  return text, "", "", [], gr.update(choices=[], value=None), None
882
 
883
+ snippets_list = split_into_snippets(text, ctx_size)
884
+ snippet_choices = update_snippet_choices(snippets_list)
 
885
 
886
+ with tempfile.NamedTemporaryFile(delete=False, mode='w', suffix=f'.{fmt}') as f:
887
+ f.write(text)
888
  download_file = f.name
889
 
 
 
890
  return (
891
  f"PDF processed successfully! Generated {len(snippets_list)} snippets.",
892
+ text,
893
+ text,
894
  snippets_list,
895
  gr.update(choices=snippet_choices, value=snippet_choices[0] if snippet_choices else None),
896
  download_file
897
  )
 
898
  except Exception as e:
899
  error_msg = f"Error processing PDF: {str(e)}"
900
  logging.error(error_msg)
901
+ return error_msg, "", "", [], gr.update(choices=[], value=None), None
 
 
 
 
 
 
 
902
 
903
  def handle_snippet_selection(choice, snippets_list): # Add download_snippet output
904
  """Handle snippet selection, update prompt, and provide snippet download."""