Spaces:

cstr
/

PDF-Summarizer

Running

App Files Files Community

cstr committed on Dec 7, 2024

Commit

fd11110

verified ·

1 Parent(s): 65caed7

Update app.py

Browse files

Files changed (1) hide show

app.py +115 -36

app.py CHANGED Viewed

@@ -6,6 +6,9 @@ import gradio as gr
 print(f"Gradio version: {gr.__version__}")
 from PyPDF2 import PdfReader
 import logging
 import webbrowser
 from huggingface_hub import InferenceClient
@@ -179,26 +182,114 @@ class ModelRegistry:
        self.groq_models = self._fetch_groq_models()
        return self.groq_models
 # Initialize model registry
 model_registry = ModelRegistry()
-def extract_text_from_pdf(pdf_path: str) -> str:
-    """Extract text content from PDF file."""
     try:
-        reader = PdfReader(pdf_path)
-        text = ""
-        for page_num, page in enumerate(reader.pages, start=1):
-            page_text = page.extract_text()
-            if page_text:
-                text += page_text + "\n"
             else:
-                logging.warning(f"No text found on page {page_num}.")
-        if not text.strip():
-            return "Error: No extractable text found in the PDF."
-        return text
     except Exception as e:
-        logging.error(f"Error reading PDF file: {e}")
-        return f"Error reading PDF file: {e}"
 def format_content(text: str, format_type: str) -> str:
     """Format extracted text according to specified format."""
@@ -538,7 +629,7 @@ with gr.Blocks(css="""
                     )
                     format_type = gr.Radio(
-                        choices=["txt", "md", "html"],
                         value="txt",
                         label="📝 Output Format"
                     )
@@ -780,46 +871,34 @@ with gr.Blocks(css="""
         ]
     # PDF Processing Handlers
-    def handle_pdf_process(pdf, fmt, ctx_size):
-        """Process PDF, format text, and return formatted text and snippets."""
         if not pdf:
             return "Please upload a PDF file.", "", "", [], gr.update(choices=[], value=None), None
         try:
-            text = extract_text_from_pdf(pdf.name)
             if text.startswith("Error"):
                 return text, "", "", [], gr.update(choices=[], value=None), None
-            # Format the text *before* splitting into snippets:
-            formatted_text = format_content(text, fmt) # Call format_content here!
-            snippets_list = split_into_snippets(formatted_text, ctx_size)
-            with tempfile.NamedTemporaryFile(delete=False, mode='w', suffix=f'.{fmt}') as f: # Correct suffix
-                f.write(formatted_text)  # Write the *formatted* text
                 download_file = f.name
-            snippet_choices = update_snippet_choices(snippets_list) # Pre-calculate choices
             return (
                 f"PDF processed successfully! Generated {len(snippets_list)} snippets.",
-                formatted_text,  # Return the *formatted* text
-                formatted_text, # Update the state with formatted text
                 snippets_list,
                 gr.update(choices=snippet_choices, value=snippet_choices[0] if snippet_choices else None),
                 download_file
             )
         except Exception as e:
             error_msg = f"Error processing PDF: {str(e)}"
             logging.error(error_msg)
-            return (
-                error_msg,
-                "",
-                "",
-                [],
-                gr.update(choices=[], value=None),
-                None
-            )
     def handle_snippet_selection(choice, snippets_list): # Add download_snippet output
         """Handle snippet selection, update prompt, and provide snippet download."""

 print(f"Gradio version: {gr.__version__}")
 from PyPDF2 import PdfReader
+import fitz  # pymupdf
+from pdf2md.converter import PDF2Markdown
 import logging
 import webbrowser
 from huggingface_hub import InferenceClient
        self.groq_models = self._fetch_groq_models()
        return self.groq_models
class PDFProcessor:
    """Convert PDF files to plain text or Markdown using interchangeable backends.

    Every converter is a static method that takes a file path and returns the
    converted text.  Failures are never raised to the caller: they are logged
    and reported as a string that begins with ``"Error"``.
    """

    # Bit masks tested against the ``flags`` field of a PyMuPDF text span.
    _FLAG_ITALIC = 2 ** 1
    _FLAG_BOLD = 2 ** 4

    @staticmethod
    def txt_convert(pdf_path: str) -> str:
        """Extract plain text from every page with PyPDF2.

        Args:
            pdf_path: Path of the PDF file to read.

        Returns:
            The text of all pages, each followed by a newline, or a string
            starting with ``"Error"`` when the file cannot be read or contains
            no extractable text at all.
        """
        try:
            reader = PdfReader(pdf_path)
            pages = []
            for page_num, page in enumerate(reader.pages, start=1):
                page_text = page.extract_text()
                if page_text:
                    pages.append(page_text + "\n")
                else:
                    logging.warning(f"No text found on page {page_num}.")
            text = "".join(pages)
            # An image-only PDF yields nothing; report that instead of
            # returning an empty "successful" result.
            if not text.strip():
                return "Error: No extractable text found in the PDF."
            return text
        except Exception as e:
            logging.error(f"Error in txt conversion: {e}")
            return f"Error: {str(e)}"

    @staticmethod
    def md_convert_with_pdf2md(pdf_path: str) -> str:
        """Convert a PDF to Markdown by delegating to ``PDF2Markdown``.

        Args:
            pdf_path: Path of the PDF file to convert.

        Returns:
            Whatever ``PDF2Markdown().convert`` returns, or a string starting
            with ``"Error"`` if the conversion raises.
        """
        try:
            converter = PDF2Markdown()
            return converter.convert(pdf_path)
        except Exception as e:
            logging.error(f"Error in pdf2md conversion: {e}")
            return f"Error: {str(e)}"

    @staticmethod
    def _emphasize(content: str, font_flags: int) -> str:
        """Wrap the non-blank core of *content* in bold/italic Markdown markers.

        The markers are placed directly against the text and any leading or
        trailing whitespace is kept outside them, because ``**bold **`` is not
        rendered as emphasis.  Whitespace-only content is returned unchanged.
        """
        core = content.strip()
        if not core:
            return content
        lead = content[: len(content) - len(content.lstrip())]
        trail = content[len(content.rstrip()):]
        if font_flags & PDFProcessor._FLAG_BOLD:
            core = f"**{core}**"
        if font_flags & PDFProcessor._FLAG_ITALIC:
            core = f"*{core}*"
        return f"{lead}{core}{trail}"

    @staticmethod
    def md_convert_with_pymupdf(pdf_path: str) -> str:
        """Convert a PDF to Markdown from PyMuPDF's per-span layout data.

        Spans larger than 20 / 16 / 14 points become ``#`` / ``##`` / ``###``
        headings; other spans are emitted inline with bold/italic markers
        derived from the span flags.

        Args:
            pdf_path: Path of the PDF file to convert.

        Returns:
            The generated Markdown, or a string starting with ``"Error"`` if
            the document cannot be opened or parsed.
        """
        try:
            markdown_text = []
            # The context manager closes the document even if parsing fails.
            with fitz.open(pdf_path) as doc:
                for page in doc:
                    for block in page.get_text("dict")["blocks"]:
                        if "lines" in block:
                            for line in block["lines"]:
                                for span in line["spans"]:
                                    font_size = span["size"]
                                    content = span["text"]
                                    # Handle headers based on font size
                                    if font_size > 20:
                                        markdown_text.append(f"# {content}\n")
                                    elif font_size > 16:
                                        markdown_text.append(f"## {content}\n")
                                    elif font_size > 14:
                                        markdown_text.append(f"### {content}\n")
                                    else:
                                        markdown_text.append(
                                            PDFProcessor._emphasize(content, span["flags"])
                                        )
                                # One space after each layout line, so the
                                # lines of a block flow into one paragraph.
                                markdown_text.append(" ")
                            # End the paragraph formed by this text block.
                            markdown_text.append("\n")
                        # Extra newline after every block (text or not), which
                        # leaves a blank line between paragraphs.
                        markdown_text.append("\n")
            return "".join(markdown_text)
        except Exception as e:
            logging.error(f"Error in pymupdf conversion: {e}")
            return f"Error: {str(e)}"
 # Initialize model registry
 model_registry = ModelRegistry()
def extract_text_from_pdf(pdf_path: str, format_type: str = "txt", md_engine: str = "pdf2md") -> str:
    """
    Extract and format text from PDF using different processors based on format.

    Besides the bare format names, the combined labels offered by the format
    selector (``"md (pdf2md)"``, ``"md (pymupdf)"``) are accepted: the part in
    parentheses selects the markdown engine and overrides ``md_engine``.

    Args:
        pdf_path: Path to PDF file
        format_type: 'txt', 'md', or a combined label such as 'md (pymupdf)'
        md_engine: When format_type is 'md', either 'pdf2md' or 'pymupdf'
    Returns:
        Formatted text content, or a string starting with "Error" when the
        format or engine is unsupported or the conversion fails.
    """
    # Split "md (pymupdf)" into base format "md" and engine "pymupdf"; a plain
    # "txt" or "md" leaves the engine part empty.
    base_format, _, label_engine = format_type.partition("(")
    base_format = base_format.strip()
    label_engine = label_engine.rstrip(")").strip()
    if label_engine:
        md_engine = label_engine

    if base_format not in ("txt", "md"):
        return f"Error: Unsupported format type: {format_type}"
    if base_format == "md" and md_engine not in ("pdf2md", "pymupdf"):
        return f"Error: Unsupported markdown engine: {md_engine}"

    try:
        # The converters are static methods, so no instance is needed.
        if base_format == "txt":
            return PDFProcessor.txt_convert(pdf_path)
        if md_engine == "pdf2md":
            return PDFProcessor.md_convert_with_pdf2md(pdf_path)
        return PDFProcessor.md_convert_with_pymupdf(pdf_path)
    except Exception as e:
        logging.error(f"Error in PDF conversion: {e}")
        return f"Error: {str(e)}"
 def format_content(text: str, format_type: str) -> str:
     """Format extracted text according to specified format."""
                     )
                     format_type = gr.Radio(
+                        choices=["txt", "md (pdf2md)", "md (pymupdf)"],
                         value="txt",
                         label="📝 Output Format"
                     )
         ]
     # PDF Processing Handlers
+    def handle_pdf_process(pdf, fmt, md_eng, ctx_size):
         if not pdf:
             return "Please upload a PDF file.", "", "", [], gr.update(choices=[], value=None), None
         try:
+            text = extract_text_from_pdf(pdf.name, format_type=fmt, md_engine=md_eng)
             if text.startswith("Error"):
                 return text, "", "", [], gr.update(choices=[], value=None), None
+            snippets_list = split_into_snippets(text, ctx_size)
+            snippet_choices = update_snippet_choices(snippets_list)
+            with tempfile.NamedTemporaryFile(delete=False, mode='w', suffix=f'.{fmt}') as f:
+                f.write(text)
                 download_file = f.name
             return (
                 f"PDF processed successfully! Generated {len(snippets_list)} snippets.",
+                text,
+                text,
                 snippets_list,
                 gr.update(choices=snippet_choices, value=snippet_choices[0] if snippet_choices else None),
                 download_file
             )
         except Exception as e:
             error_msg = f"Error processing PDF: {str(e)}"
             logging.error(error_msg)
+            return error_msg, "", "", [], gr.update(choices=[], value=None), None
     def handle_snippet_selection(choice, snippets_list): # Add download_snippet output
         """Handle snippet selection, update prompt, and provide snippet download."""