Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -6,6 +6,9 @@ import gradio as gr
|
|
6 |
print(f"Gradio version: {gr.__version__}")
|
7 |
|
8 |
from PyPDF2 import PdfReader
|
|
|
|
|
|
|
9 |
import logging
|
10 |
import webbrowser
|
11 |
from huggingface_hub import InferenceClient
|
@@ -179,26 +182,114 @@ class ModelRegistry:
|
|
179 |
self.groq_models = self._fetch_groq_models()
|
180 |
return self.groq_models
|
181 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
182 |
# Initialize model registry
|
183 |
model_registry = ModelRegistry()
|
184 |
|
185 |
-
def extract_text_from_pdf(pdf_path: str) -> str:
|
186 |
-
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
187 |
try:
|
188 |
-
|
189 |
-
|
190 |
-
|
191 |
-
|
192 |
-
|
193 |
-
|
|
|
194 |
else:
|
195 |
-
|
196 |
-
|
197 |
-
return "Error:
|
198 |
-
return text
|
199 |
except Exception as e:
|
200 |
-
logging.error(f"Error
|
201 |
-
return f"Error
|
202 |
|
203 |
def format_content(text: str, format_type: str) -> str:
|
204 |
"""Format extracted text according to specified format."""
|
@@ -538,7 +629,7 @@ with gr.Blocks(css="""
|
|
538 |
)
|
539 |
|
540 |
format_type = gr.Radio(
|
541 |
-
choices=["txt", "md", "
|
542 |
value="txt",
|
543 |
label="📝 Output Format"
|
544 |
)
|
@@ -780,46 +871,34 @@ with gr.Blocks(css="""
|
|
780 |
]
|
781 |
|
782 |
# PDF Processing Handlers
|
783 |
-
def handle_pdf_process(pdf, fmt, ctx_size):
|
784 |
-
"""Process PDF, format text, and return formatted text and snippets."""
|
785 |
if not pdf:
|
786 |
return "Please upload a PDF file.", "", "", [], gr.update(choices=[], value=None), None
|
787 |
|
788 |
try:
|
789 |
-
text = extract_text_from_pdf(pdf.name)
|
790 |
if text.startswith("Error"):
|
791 |
return text, "", "", [], gr.update(choices=[], value=None), None
|
792 |
|
793 |
-
|
794 |
-
|
795 |
-
snippets_list = split_into_snippets(formatted_text, ctx_size)
|
796 |
|
797 |
-
with tempfile.NamedTemporaryFile(delete=False, mode='w', suffix=f'.{fmt}') as f:
|
798 |
-
f.write(
|
799 |
download_file = f.name
|
800 |
|
801 |
-
snippet_choices = update_snippet_choices(snippets_list) # Pre-calculate choices
|
802 |
-
|
803 |
return (
|
804 |
f"PDF processed successfully! Generated {len(snippets_list)} snippets.",
|
805 |
-
|
806 |
-
|
807 |
snippets_list,
|
808 |
gr.update(choices=snippet_choices, value=snippet_choices[0] if snippet_choices else None),
|
809 |
download_file
|
810 |
)
|
811 |
-
|
812 |
except Exception as e:
|
813 |
error_msg = f"Error processing PDF: {str(e)}"
|
814 |
logging.error(error_msg)
|
815 |
-
return (
|
816 |
-
error_msg,
|
817 |
-
"",
|
818 |
-
"",
|
819 |
-
[],
|
820 |
-
gr.update(choices=[], value=None),
|
821 |
-
None
|
822 |
-
)
|
823 |
|
824 |
def handle_snippet_selection(choice, snippets_list): # Add download_snippet output
|
825 |
"""Handle snippet selection, update prompt, and provide snippet download."""
|
|
|
6 |
print(f"Gradio version: {gr.__version__}")
|
7 |
|
8 |
from PyPDF2 import PdfReader
|
9 |
+
import fitz # pymupdf
|
10 |
+
from pdf2md.converter import PDF2Markdown
|
11 |
+
|
12 |
import logging
|
13 |
import webbrowser
|
14 |
from huggingface_hub import InferenceClient
|
|
|
182 |
self.groq_models = self._fetch_groq_models()
|
183 |
return self.groq_models
|
184 |
|
185 |
+
class PDFProcessor:
    """Convert PDF files to plain text or Markdown.

    Every converter is a static method that takes a filesystem path and
    returns the converted text. Failures are never raised to the caller:
    they are logged and reported in-band as a string starting with
    ``"Error: "``.
    """

    # Bit masks for the "flags" field of a PyMuPDF text span.
    _FLAG_ITALIC = 1 << 1
    _FLAG_BOLD = 1 << 4

    # (font size that must be exceeded, Markdown heading prefix), largest first.
    _HEADING_LEVELS = ((20, "#"), (16, "##"), (14, "###"))

    @staticmethod
    def txt_convert(pdf_path: str) -> str:
        """Extract plain text with PyPDF2, one newline-terminated chunk per page.

        Pages that yield no text are skipped and a warning is logged.

        Args:
            pdf_path: Path to the PDF file.

        Returns:
            The concatenated page texts, or ``"Error: ..."`` on failure.
        """
        try:
            reader = PdfReader(pdf_path)
            pages = []
            for page_num, page in enumerate(reader.pages, start=1):
                page_text = page.extract_text()
                if page_text:
                    pages.append(page_text + "\n")
                else:
                    logging.warning(f"No text found on page {page_num}.")
            return "".join(pages)
        except Exception as e:
            logging.error(f"Error in txt conversion: {e}")
            return f"Error: {str(e)}"

    @staticmethod
    def md_convert_with_pdf2md(pdf_path: str) -> str:
        """Convert a PDF to Markdown using pdf2md.

        Args:
            pdf_path: Path to the PDF file.

        Returns:
            The Markdown produced by ``PDF2Markdown.convert``, or
            ``"Error: ..."`` on failure.
        """
        try:
            return PDF2Markdown().convert(pdf_path)
        except Exception as e:
            logging.error(f"Error in pdf2md conversion: {e}")
            return f"Error: {str(e)}"

    @staticmethod
    def _span_to_markdown(content: str, font_size: float, font_flags: int) -> str:
        """Render one text span as Markdown.

        Spans larger than 20/16/14 points become level 1/2/3 headings;
        smaller spans are wrapped in bold and/or italic markers according
        to ``font_flags``.

        Args:
            content: Raw text of the span.
            font_size: Font size of the span.
            font_flags: Span flag bits (bit 4 = bold, bit 1 = italic).

        Returns:
            The Markdown for the span. Whitespace-only spans are returned
            unchanged.
        """
        core = content.strip()
        if not core:
            # Wrapping blanks would emit literal "** **" or an empty heading.
            return content

        for min_size, prefix in PDFProcessor._HEADING_LEVELS:
            if font_size > min_size:
                return f"{prefix} {core}\n"

        if font_flags & PDFProcessor._FLAG_BOLD:
            core = f"**{core}**"
        if font_flags & PDFProcessor._FLAG_ITALIC:
            core = f"*{core}*"

        # Emphasis markers must hug the text to render, so surrounding
        # whitespace is re-attached outside the markers.
        lead_len = len(content) - len(content.lstrip())
        trail_start = len(content.rstrip())
        return content[:lead_len] + core + content[trail_start:]

    @staticmethod
    def md_convert_with_pymupdf(pdf_path: str) -> str:
        """Convert a PDF to Markdown using PyMuPDF span metadata.

        Each span is followed by a space, each line by a newline, and each
        text block by an extra newline so blocks read as paragraphs.
        Blocks without a ``"lines"`` entry are skipped.

        Args:
            pdf_path: Path to the PDF file.

        Returns:
            The generated Markdown, or ``"Error: ..."`` on failure.
        """
        try:
            doc = fitz.open(pdf_path)
            try:
                markdown_text = []
                for page in doc:
                    for block in page.get_text("dict")["blocks"]:
                        if "lines" not in block:
                            continue
                        for line in block["lines"]:
                            for span in line["spans"]:
                                markdown_text.append(
                                    PDFProcessor._span_to_markdown(
                                        span["text"], span["size"], span["flags"]
                                    )
                                )
                                markdown_text.append(" ")  # Space between spans
                            markdown_text.append("\n")  # Newline between lines
                        # Extra newline between blocks for paragraphs
                        markdown_text.append("\n")
                return "".join(markdown_text)
            finally:
                # Release the document even when extraction fails part-way.
                doc.close()
        except Exception as e:
            logging.error(f"Error in pymupdf conversion: {e}")
            return f"Error: {str(e)}"
|
260 |
+
|
261 |
# Initialize model registry
|
262 |
model_registry = ModelRegistry()
|
263 |
|
264 |
+
def extract_text_from_pdf(pdf_path: str, format_type: str = "txt", md_engine: str = "pdf2md") -> str:
    """
    Extract and format text from PDF using different processors based on format.

    Args:
        pdf_path: Path to PDF file
        format_type: 'txt' or 'md'. The combined UI labels 'md (pdf2md)' and
            'md (pymupdf)' are also accepted; the engine named in the label
            then takes precedence over ``md_engine``.
        md_engine: When format_type is 'md', either 'pdf2md' or 'pymupdf'.
            A falsy value falls back to 'pdf2md'.

    Returns:
        Formatted text content, or a string starting with "Error: " when the
        format or engine is unsupported or the conversion fails.
    """
    try:
        fmt = (format_type or "").strip()
        engine = md_engine or "pdf2md"

        # The format selector offers labels such as "md (pymupdf)"; split
        # them into the base format and the engine they name.
        if fmt.startswith("md") and fmt.endswith(")") and "(" in fmt:
            fmt, _, label_engine = fmt[:-1].partition("(")
            fmt = fmt.strip()
            engine = label_engine.strip()

        if fmt == "txt":
            return PDFProcessor.txt_convert(pdf_path)
        if fmt != "md":
            return f"Error: Unsupported format type: {format_type}"

        if engine == "pdf2md":
            return PDFProcessor.md_convert_with_pdf2md(pdf_path)
        if engine == "pymupdf":
            return PDFProcessor.md_convert_with_pymupdf(pdf_path)
        return f"Error: Unsupported markdown engine: {engine}"
    except Exception as e:
        logging.error(f"Error in PDF conversion: {e}")
        return f"Error: {str(e)}"
|
293 |
|
294 |
def format_content(text: str, format_type: str) -> str:
|
295 |
"""Format extracted text according to specified format."""
|
|
|
629 |
)
|
630 |
|
631 |
format_type = gr.Radio(
|
632 |
+
choices=["txt", "md (pdf2md)", "md (pymupdf)"],
|
633 |
value="txt",
|
634 |
label="📝 Output Format"
|
635 |
)
|
|
|
871 |
]
|
872 |
|
873 |
# PDF Processing Handlers
|
874 |
+
def handle_pdf_process(pdf, fmt, md_eng, ctx_size):
    """Convert an uploaded PDF, split it into snippets and prepare a download.

    Args:
        pdf: Uploaded file object; its ``name`` attribute is used as the path.
        fmt: Selected output format. Either a plain format ("txt", "md") or a
            combined label such as "md (pymupdf)".
        md_eng: Markdown engine to use when ``fmt`` does not name one itself.
        ctx_size: Size argument forwarded to ``split_into_snippets``.

    Returns:
        A 6-tuple: status message, converted text (twice), list of snippets,
        a ``gr.update`` for the snippet selector, and the path of the
        downloadable file. On failure the text fields are empty, the list is
        empty and the path is ``None``.
        NOTE(review): which components the two text outputs feed is not
        visible here - confirm against the event wiring.
    """
    def _failure(message):
        # Same output shape as the success path, with every payload cleared.
        return message, "", "", [], gr.update(choices=[], value=None), None

    if not pdf:
        return _failure("Please upload a PDF file.")

    try:
        # Split labels like "md (pymupdf)" into base format and engine so the
        # extractor gets plain values and the temp-file suffix stays valid.
        label = (fmt or "").strip()
        base_fmt, has_engine, engine_part = label.partition("(")
        base_fmt = base_fmt.strip()
        label_engine = engine_part.rstrip(")").strip() if has_engine else ""
        engine = label_engine or md_eng or "pdf2md"

        text = extract_text_from_pdf(pdf.name, format_type=base_fmt, md_engine=engine)
        # NOTE(review): errors are signalled in-band, so a document whose text
        # itself begins with "Error" is reported as a failure.
        if text.startswith("Error"):
            return _failure(text)

        snippets_list = split_into_snippets(text, ctx_size)
        snippet_choices = update_snippet_choices(snippets_list)

        # delete=False keeps the file on disk so it can be served for download;
        # explicit UTF-8 avoids encode errors on non-ASCII PDF text.
        with tempfile.NamedTemporaryFile(
            delete=False, mode='w', suffix=f'.{base_fmt}', encoding='utf-8'
        ) as f:
            f.write(text)
            download_file = f.name

        return (
            f"PDF processed successfully! Generated {len(snippets_list)} snippets.",
            text,
            text,
            snippets_list,
            gr.update(choices=snippet_choices, value=snippet_choices[0] if snippet_choices else None),
            download_file
        )
    except Exception as e:
        error_msg = f"Error processing PDF: {str(e)}"
        logging.error(error_msg)
        return _failure(error_msg)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
902 |
|
903 |
def handle_snippet_selection(choice, snippets_list): # Add download_snippet output
|
904 |
"""Handle snippet selection, update prompt, and provide snippet download."""
|