Spaces:

AndresIgnacio
/

LiciCariolaSpace

Runtime error

App Files Community

AndresIgnacio committed 17 days ago

Commit

a358cff

verified ·

1 Parent(s): 4bcea11

Update app.py

Browse files

Files changed (1) hide show

app.py +125 -129

app.py CHANGED Viewed

@@ -1,129 +1,125 @@
-import os
-import gradio as gr
-import pdfplumber
-from PIL import Image, ImageEnhance
-import pytesseract
-from docx import Document
-from docx.shared import Pt, RGBColor
-from docx.enum.text import WD_PARAGRAPH_ALIGNMENT
-import pandas as pd
-# Silenciar advertencias de TensorFlow
-os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
-# Configurar Tesseract (para Visual Studio: ajustar según tu ruta local)
-if os.getenv("PYTHON_ENV") == "local":
-    pytesseract.pytesseract.tesseract_cmd = r'C:\Users\andre\Documents\Aplicaciones y Softwear\Tesseeract\tesseract.exe'
-def apply_text_formatting(run, font_name=None, font_size=None, is_bold=None, is_italic=None, alignment=None):
-    """Aplica formato al texto en el documento Word."""
-    if font_name:
-        run.font.name = font_name
-    if font_size:
-        run.font.size = Pt(font_size)
-    if is_bold is not None:
-        run.bold = is_bold
-    if is_italic is not None:
-        run.italic = is_italic
-def process_file(file):
-    try:
-        print(f"Procesando archivo: {file.name}")
-        extracted_text = ""
-        tables = []
-        # Crear documento Word sin formato predefinido
-        doc = Document()
-        # Verifica el tipo de archivo
-        if file.name.endswith(".pdf"):
-            print("Archivo identificado como PDF.")
-            try:
-                with pdfplumber.open(file.name) as pdf:
-                    for page_num, page in enumerate(pdf.pages):
-                        print(f"Procesando página {page_num + 1}")
-                        # Extraer texto y sus características
-                        for char in page.chars:  # Extrae caracteres y sus estilos
-                            text = char.get("text", "")
-                            font_name = char.get("fontname", None)
-                            font_size = char.get("size", None)
-                            is_bold = "Bold" in font_name if font_name else False
-                            alignment = page.bbox  # Aquí podrías refinar el análisis si es necesario
-                            if text.strip():
-                                para = doc.add_paragraph()
-                                run = para.add_run(text)
-                                apply_text_formatting(run, font_name=font_name, font_size=font_size, is_bold=is_bold)
-                        # Extraer tablas
-                        tables_on_page = page.extract_tables()
-                        for table in tables_on_page:
-                            if table:
-                                df = pd.DataFrame(table)
-                                table_in_doc = doc.add_table(rows=0, cols=len(df.columns))
-                                for i, row in df.iterrows():
-                                    cells = table_in_doc.add_row().cells
-                                    for j, cell in enumerate(row):
-                                        cells[j].text = str(cell)
-                                tables.append(df)
-            except Exception as pdf_error:
-                return f"Error procesando el PDF: {str(pdf_error)}"
-        elif file.name.endswith((".png", ".jpg", ".jpeg")):
-            print("Archivo identificado como imagen.")
-            try:
-                # Preprocesar imagen para mejorar OCR
-                img = Image.open(file)
-                img = img.convert("L")  # Convertir a escala de grises
-                img = ImageEnhance.Contrast(img).enhance(2)  # Aumentar contraste
-                img = img.resize((img.width * 2, img.height * 2))  # Aumentar tamaño
-                # OCR en español
-                extracted_text = pytesseract.image_to_string(img, lang="spa")
-                doc.add_paragraph(extracted_text)
-            except Exception as img_error:
-                return f"Error procesando la imagen: {str(img_error)}"
-        else:
-            print("Archivo no soportado.")
-            return "Tipo de archivo no soportado. Sube un PDF o una imagen."
-        # Validar texto extraído
-        if len(extracted_text.strip()) == 0:
-            return "No se encontró texto en el archivo proporcionado."
-        # Guardar archivo Word
-        output_word_path = "output_document.docx"
-        doc.save(output_word_path)
-        # Crear salida con tablas
-        output = f"Texto extraído del PDF:\n\n{extracted_text}\n\nArchivo Word generado: {output_word_path}\n"
-        if tables:
-            output += "Tablas extraídas:\n\n"
-            for idx, table in enumerate(tables):
-                output += f"Tabla {idx + 1}:\n{table.to_string(index=False)}\n\n"
-        return output, output_word_path
-    except Exception as e:
-        print(f"Error general: {str(e)}")
-        return f"Error general procesando el archivo: {str(e)}", None
-# Interfaz Gradio
-def interface_fn(file):
-    output, word_path = process_file(file)
-    if word_path:
-        return output, word_path
-    return output
-demo = gr.Interface(
-    fn=interface_fn,
-    inputs=gr.File(label="Sube tu archivo (PDF o imagen)"),
-    outputs=["text", gr.File(label="Descargar Word generado")],
-    title="Análisis Avanzado de Licitaciones",
-    description="Sube un archivo PDF o una imagen para extraer texto y generar un documento Word con formato original.",
-)
-# Ejecuta la aplicación
-demo.launch(share=True)

+import os
+from pathlib import Path
+from typing import Tuple, Union
+import logging
+from transformers import DonutProcessor, VisionEncoderDecoderModel
+from PIL import Image
+from pdf2image import convert_from_path
+from docx import Document
+from docx.shared import Pt
+from docx.enum.text import WD_PARAGRAPH_ALIGNMENT
+import gradio as gr
+# Configuración avanzada de logging
+logging.basicConfig(
+    level=logging.DEBUG,
+    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
+    handlers=[
+        logging.FileHandler("app.log", mode="a", encoding="utf-8"),
+        logging.StreamHandler()
+    ]
+)
+class HuggingFaceProcessor:
+    """Clase para manejar modelos avanzados de Hugging Face para procesamiento de documentos."""
+    def __init__(self, model_name: str = "naver-clova-ix/donut-base-finetuned-docvqa"):
+        self.logger = logging.getLogger("HuggingFaceProcessor")
+        self.logger.info("Cargando modelo de Hugging Face...")
+        try:
+            self.processor = DonutProcessor.from_pretrained(model_name)
+            self.model = VisionEncoderDecoderModel.from_pretrained(model_name)
+        except Exception as e:
+            self.logger.error(f"Error cargando el modelo: {e}")
+            raise
+    def process_image(self, image: Image.Image) -> str:
+        """Procesa una imagen y extrae texto usando el modelo Donut."""
+        try:
+            pixel_values = self.processor(image, return_tensors="pt").pixel_values
+            outputs = self.model.generate(pixel_values, max_length=512)
+            result = self.processor.batch_decode(outputs, skip_special_tokens=True)[0]
+            return result.strip()
+        except Exception as e:
+            self.logger.error(f"Error procesando la imagen con Donut: {e}")
+            return ""
+class PDFToWordProcessor:
+    """Procesa un PDF escaneado y genera un documento Word."""
+    def __init__(self):
+        self.logger = logging.getLogger("PDFToWordProcessor")
+        self.hf_processor = HuggingFaceProcessor()
+    def process_pdf(self, file_path: Path) -> Document:
+        """Convierte un PDF a un documento Word."""
+        self.logger.info(f"Procesando PDF: {file_path}")
+        doc = Document()
+        try:
+            # Convertir cada página del PDF a imagen
+            images = convert_from_path(file_path)
+            for page_num, image in enumerate(images, start=1):
+                self.logger.debug(f"Procesando página {page_num}")
+                # Extraer texto usando el modelo Donut
+                page_text = self.hf_processor.process_image(image)
+                # Agregar encabezado para cada página
+                doc.add_heading(f"Página {page_num}", level=2)
+                # Agregar texto extraído al documento Word
+                self._add_text_to_doc(doc, page_text)
+        except Exception as e:
+            self.logger.error(f"Error procesando PDF: {e}")
+            raise
+        return doc
+    def _add_text_to_doc(self, doc: Document, text: str):
+        """Agrega texto extraído al documento Word."""
+        for line in text.split('\n'):
+            if line.strip():  # Evitar líneas vacías
+                paragraph = doc.add_paragraph(line.strip(), style="Normal")
+                paragraph.alignment = WD_PARAGRAPH_ALIGNMENT.LEFT
+    def process_file(self, file_path: Union[str, Path]) -> Tuple[str, str]:
+        """Procesa un archivo PDF y guarda el documento Word generado."""
+        file_path = Path(file_path)
+        output_path = file_path.with_suffix(".docx")
+        try:
+            if file_path.suffix.lower() != ".pdf":
+                raise ValueError(f"Formato no soportado: {file_path.suffix}")
+            doc = self.process_pdf(file_path)
+            doc.save(output_path)
+            return "Documento procesado exitosamente", str(output_path)
+        except Exception as e:
+            return f"Error: {e}", ""
+def create_interface():
+    """Crea la interfaz de usuario con Gradio."""
+    processor = PDFToWordProcessor()
+    def process_file(file):
+        if not file:
+            return "Por favor, seleccione un archivo", None
+        return processor.process_file(file.name)
+    with gr.Blocks(title="Procesador de PDF a Word") as demo:
+        gr.Markdown("# Procesador PDF a Word con Hugging Face")
+        gr.Markdown("Convierte documentos PDF escaneados a Word utilizando modelos avanzados de Hugging Face.")
+        file_input = gr.File(label="Seleccionar PDF", file_types=[".pdf"], type="filepath")
+        process_button = gr.Button("Procesar", variant="primary")
+        output_text = gr.Textbox(label="Estado del Proceso")
+        output_file = gr.File(label="Documento Procesado")
+        process_button.click(process_file, inputs=[file_input], outputs=[output_text, output_file])
+    return demo
+if __name__ == "__main__":
+    demo = create_interface()
+    demo.launch(share=True)