Spaces:

tony-42069
/

cre-chatbot-rag

Sleeping

App Files Files Community

tony-42069 committed on Nov 27, 2024

Commit

afb405a

1 Parent(s): 0881f45

Add detailed debug logging for PDF processing

Browse files

Files changed (2) hide show

app.py +10 -0
pdf_processor.py +17 -1

app.py CHANGED Viewed

@@ -64,6 +64,14 @@ if not st.session_state.processed_file:
         try:
             # Process the pre-loaded PDF
             pdf_path = os.path.join("Dataset", "Commercial Lending 101.pdf")
             processor = PDFProcessor()
             chunks = processor.process_pdf(pdf_path)
@@ -72,6 +80,8 @@ if not st.session_state.processed_file:
             st.session_state.processed_file = True
         except Exception as e:
             st.error(f"Error initializing knowledge base: {str(e)}")
             st.stop()
 # Sidebar with information

         try:
             # Process the pre-loaded PDF
             pdf_path = os.path.join("Dataset", "Commercial Lending 101.pdf")
+            st.write(f"Looking for PDF at: {os.path.abspath(pdf_path)}")
+            if not os.path.exists(pdf_path):
+                st.error(f"PDF file not found at {pdf_path}")
+                st.stop()
+            st.write(f"PDF file found, size: {os.path.getsize(pdf_path)} bytes")
             processor = PDFProcessor()
             chunks = processor.process_pdf(pdf_path)
             st.session_state.processed_file = True
         except Exception as e:
             st.error(f"Error initializing knowledge base: {str(e)}")
+            st.write("Current working directory:", os.getcwd())
+            st.write("Directory contents:", os.listdir())
             st.stop()
 # Sidebar with information

pdf_processor.py CHANGED Viewed

@@ -1,4 +1,5 @@
 from typing import List, Dict
 import pypdf
 from langchain.document_loaders import PyPDFLoader
 from langchain.text_splitter import RecursiveCharacterTextSplitter
@@ -22,10 +23,19 @@ class PDFProcessor:
         Returns:
             List[Dict]: List of text chunks with metadata
         """
         try:
             # Try using PyPDFLoader from langchain
             loader = PyPDFLoader(pdf_path)
             pages = loader.load()
             # Split the text into chunks
             chunks = []
@@ -36,6 +46,7 @@ class PDFProcessor:
                         'text': chunk,
                         'metadata': {'page': page.metadata['page']}
                     })
             return chunks
         except Exception as e:
@@ -44,8 +55,10 @@ class PDFProcessor:
             # Fallback to direct pypdf usage
             try:
                 with open(pdf_path, 'rb') as file:
                     pdf = pypdf.PdfReader(file)
                     chunks = []
                     for page_num in range(len(pdf.pages)):
@@ -57,7 +70,10 @@ class PDFProcessor:
                                 'text': chunk,
                                 'metadata': {'page': page_num + 1}
                             })
                     return chunks
             except Exception as e2:
-                raise Exception(f"Failed to process PDF with both methods. Error: {str(e2)}")

 from typing import List, Dict
+import os
 import pypdf
 from langchain.document_loaders import PyPDFLoader
 from langchain.text_splitter import RecursiveCharacterTextSplitter
         Returns:
             List[Dict]: List of text chunks with metadata
         """
+        print(f"Processing PDF at: {os.path.abspath(pdf_path)}")
+        if not os.path.exists(pdf_path):
+            raise FileNotFoundError(f"PDF file not found at {pdf_path}")
+        print(f"PDF file exists, size: {os.path.getsize(pdf_path)} bytes")
         try:
+            print("Attempting to use PyPDFLoader...")
             # Try using PyPDFLoader from langchain
             loader = PyPDFLoader(pdf_path)
             pages = loader.load()
+            print(f"Successfully loaded {len(pages)} pages with PyPDFLoader")
             # Split the text into chunks
             chunks = []
                         'text': chunk,
                         'metadata': {'page': page.metadata['page']}
                     })
+            print(f"Created {len(chunks)} chunks from PyPDFLoader method")
             return chunks
         except Exception as e:
             # Fallback to direct pypdf usage
             try:
+                print("Attempting to use pypdf directly...")
                 with open(pdf_path, 'rb') as file:
                     pdf = pypdf.PdfReader(file)
+                    print(f"Successfully opened PDF with {len(pdf.pages)} pages")
                     chunks = []
                     for page_num in range(len(pdf.pages)):
                                 'text': chunk,
                                 'metadata': {'page': page_num + 1}
                             })
+                    print(f"Created {len(chunks)} chunks from direct pypdf method")
                     return chunks
             except Exception as e2:
+                error_msg = f"Failed to process PDF with both methods.\nPyPDFLoader error: {str(e)}\npypdf error: {str(e2)}"
+                print(error_msg)
+                raise Exception(error_msg)