tony-42069 committed on
Commit
afb405a
·
1 Parent(s): 0881f45

Add detailed debug logging for PDF processing

Browse files
Files changed (2) hide show
  1. app.py +10 -0
  2. pdf_processor.py +17 -1
app.py CHANGED
@@ -64,6 +64,14 @@ if not st.session_state.processed_file:
64
  try:
65
  # Process the pre-loaded PDF
66
  pdf_path = os.path.join("Dataset", "Commercial Lending 101.pdf")
 
 
 
 
 
 
 
 
67
  processor = PDFProcessor()
68
  chunks = processor.process_pdf(pdf_path)
69
 
@@ -72,6 +80,8 @@ if not st.session_state.processed_file:
72
  st.session_state.processed_file = True
73
  except Exception as e:
74
  st.error(f"Error initializing knowledge base: {str(e)}")
 
 
75
  st.stop()
76
 
77
  # Sidebar with information
 
64
  try:
65
  # Process the pre-loaded PDF
66
  pdf_path = os.path.join("Dataset", "Commercial Lending 101.pdf")
67
+ st.write(f"Looking for PDF at: {os.path.abspath(pdf_path)}")
68
+
69
+ if not os.path.exists(pdf_path):
70
+ st.error(f"PDF file not found at {pdf_path}")
71
+ st.stop()
72
+
73
+ st.write(f"PDF file found, size: {os.path.getsize(pdf_path)} bytes")
74
+
75
  processor = PDFProcessor()
76
  chunks = processor.process_pdf(pdf_path)
77
 
 
80
  st.session_state.processed_file = True
81
  except Exception as e:
82
  st.error(f"Error initializing knowledge base: {str(e)}")
83
+ st.write("Current working directory:", os.getcwd())
84
+ st.write("Directory contents:", os.listdir())
85
  st.stop()
86
 
87
  # Sidebar with information
pdf_processor.py CHANGED
@@ -1,4 +1,5 @@
1
  from typing import List, Dict
 
2
  import pypdf
3
  from langchain.document_loaders import PyPDFLoader
4
  from langchain.text_splitter import RecursiveCharacterTextSplitter
@@ -22,10 +23,19 @@ class PDFProcessor:
22
  Returns:
23
  List[Dict]: List of text chunks with metadata
24
  """
 
 
 
 
 
 
 
25
  try:
 
26
  # Try using PyPDFLoader from langchain
27
  loader = PyPDFLoader(pdf_path)
28
  pages = loader.load()
 
29
 
30
  # Split the text into chunks
31
  chunks = []
@@ -36,6 +46,7 @@ class PDFProcessor:
36
  'text': chunk,
37
  'metadata': {'page': page.metadata['page']}
38
  })
 
39
  return chunks
40
 
41
  except Exception as e:
@@ -44,8 +55,10 @@ class PDFProcessor:
44
 
45
  # Fallback to direct pypdf usage
46
  try:
 
47
  with open(pdf_path, 'rb') as file:
48
  pdf = pypdf.PdfReader(file)
 
49
  chunks = []
50
 
51
  for page_num in range(len(pdf.pages)):
@@ -57,7 +70,10 @@ class PDFProcessor:
57
  'text': chunk,
58
  'metadata': {'page': page_num + 1}
59
  })
 
60
  return chunks
61
 
62
  except Exception as e2:
63
- raise Exception(f"Failed to process PDF with both methods. Error: {str(e2)}")
 
 
 
1
  from typing import List, Dict
2
+ import os
3
  import pypdf
4
  from langchain.document_loaders import PyPDFLoader
5
  from langchain.text_splitter import RecursiveCharacterTextSplitter
 
23
  Returns:
24
  List[Dict]: List of text chunks with metadata
25
  """
26
+ print(f"Processing PDF at: {os.path.abspath(pdf_path)}")
27
+
28
+ if not os.path.exists(pdf_path):
29
+ raise FileNotFoundError(f"PDF file not found at {pdf_path}")
30
+
31
+ print(f"PDF file exists, size: {os.path.getsize(pdf_path)} bytes")
32
+
33
  try:
34
+ print("Attempting to use PyPDFLoader...")
35
  # Try using PyPDFLoader from langchain
36
  loader = PyPDFLoader(pdf_path)
37
  pages = loader.load()
38
+ print(f"Successfully loaded {len(pages)} pages with PyPDFLoader")
39
 
40
  # Split the text into chunks
41
  chunks = []
 
46
  'text': chunk,
47
  'metadata': {'page': page.metadata['page']}
48
  })
49
+ print(f"Created {len(chunks)} chunks from PyPDFLoader method")
50
  return chunks
51
 
52
  except Exception as e:
 
55
 
56
  # Fallback to direct pypdf usage
57
  try:
58
+ print("Attempting to use pypdf directly...")
59
  with open(pdf_path, 'rb') as file:
60
  pdf = pypdf.PdfReader(file)
61
+ print(f"Successfully opened PDF with {len(pdf.pages)} pages")
62
  chunks = []
63
 
64
  for page_num in range(len(pdf.pages)):
 
70
  'text': chunk,
71
  'metadata': {'page': page_num + 1}
72
  })
73
+ print(f"Created {len(chunks)} chunks from direct pypdf method")
74
  return chunks
75
 
76
  except Exception as e2:
77
+ error_msg = f"Failed to process PDF with both methods.\nPyPDFLoader error: {str(e)}\npypdf error: {str(e2)}"
78
+ print(error_msg)
79
+ raise Exception(error_msg)