Spaces:
Sleeping
Sleeping
Commit · afb405a
1 Parent(s): 0881f45
Add detailed debug logging for PDF processing
Browse files
- app.py +10 -0
- pdf_processor.py +17 -1
app.py
CHANGED
@@ -64,6 +64,14 @@ if not st.session_state.processed_file:
|
|
64 |
try:
|
65 |
# Process the pre-loaded PDF
|
66 |
pdf_path = os.path.join("Dataset", "Commercial Lending 101.pdf")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
67 |
processor = PDFProcessor()
|
68 |
chunks = processor.process_pdf(pdf_path)
|
69 |
|
@@ -72,6 +80,8 @@ if not st.session_state.processed_file:
|
|
72 |
st.session_state.processed_file = True
|
73 |
except Exception as e:
|
74 |
st.error(f"Error initializing knowledge base: {str(e)}")
|
|
|
|
|
75 |
st.stop()
|
76 |
|
77 |
# Sidebar with information
|
|
|
64 |
try:
|
65 |
# Process the pre-loaded PDF
|
66 |
pdf_path = os.path.join("Dataset", "Commercial Lending 101.pdf")
|
67 |
+
st.write(f"Looking for PDF at: {os.path.abspath(pdf_path)}")
|
68 |
+
|
69 |
+
if not os.path.exists(pdf_path):
|
70 |
+
st.error(f"PDF file not found at {pdf_path}")
|
71 |
+
st.stop()
|
72 |
+
|
73 |
+
st.write(f"PDF file found, size: {os.path.getsize(pdf_path)} bytes")
|
74 |
+
|
75 |
processor = PDFProcessor()
|
76 |
chunks = processor.process_pdf(pdf_path)
|
77 |
|
|
|
80 |
st.session_state.processed_file = True
|
81 |
except Exception as e:
|
82 |
st.error(f"Error initializing knowledge base: {str(e)}")
|
83 |
+
st.write("Current working directory:", os.getcwd())
|
84 |
+
st.write("Directory contents:", os.listdir())
|
85 |
st.stop()
|
86 |
|
87 |
# Sidebar with information
|
pdf_processor.py
CHANGED
@@ -1,4 +1,5 @@
|
|
1 |
from typing import List, Dict
|
|
|
2 |
import pypdf
|
3 |
from langchain.document_loaders import PyPDFLoader
|
4 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
@@ -22,10 +23,19 @@ class PDFProcessor:
|
|
22 |
Returns:
|
23 |
List[Dict]: List of text chunks with metadata
|
24 |
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
25 |
try:
|
|
|
26 |
# Try using PyPDFLoader from langchain
|
27 |
loader = PyPDFLoader(pdf_path)
|
28 |
pages = loader.load()
|
|
|
29 |
|
30 |
# Split the text into chunks
|
31 |
chunks = []
|
@@ -36,6 +46,7 @@ class PDFProcessor:
|
|
36 |
'text': chunk,
|
37 |
'metadata': {'page': page.metadata['page']}
|
38 |
})
|
|
|
39 |
return chunks
|
40 |
|
41 |
except Exception as e:
|
@@ -44,8 +55,10 @@ class PDFProcessor:
|
|
44 |
|
45 |
# Fallback to direct pypdf usage
|
46 |
try:
|
|
|
47 |
with open(pdf_path, 'rb') as file:
|
48 |
pdf = pypdf.PdfReader(file)
|
|
|
49 |
chunks = []
|
50 |
|
51 |
for page_num in range(len(pdf.pages)):
|
@@ -57,7 +70,10 @@ class PDFProcessor:
|
|
57 |
'text': chunk,
|
58 |
'metadata': {'page': page_num + 1}
|
59 |
})
|
|
|
60 |
return chunks
|
61 |
|
62 |
except Exception as e2:
|
63 |
-
|
|
|
|
|
|
1 |
from typing import List, Dict
|
2 |
+
import os
|
3 |
import pypdf
|
4 |
from langchain.document_loaders import PyPDFLoader
|
5 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
|
|
23 |
Returns:
|
24 |
List[Dict]: List of text chunks with metadata
|
25 |
"""
|
26 |
+
print(f"Processing PDF at: {os.path.abspath(pdf_path)}")
|
27 |
+
|
28 |
+
if not os.path.exists(pdf_path):
|
29 |
+
raise FileNotFoundError(f"PDF file not found at {pdf_path}")
|
30 |
+
|
31 |
+
print(f"PDF file exists, size: {os.path.getsize(pdf_path)} bytes")
|
32 |
+
|
33 |
try:
|
34 |
+
print("Attempting to use PyPDFLoader...")
|
35 |
# Try using PyPDFLoader from langchain
|
36 |
loader = PyPDFLoader(pdf_path)
|
37 |
pages = loader.load()
|
38 |
+
print(f"Successfully loaded {len(pages)} pages with PyPDFLoader")
|
39 |
|
40 |
# Split the text into chunks
|
41 |
chunks = []
|
|
|
46 |
'text': chunk,
|
47 |
'metadata': {'page': page.metadata['page']}
|
48 |
})
|
49 |
+
print(f"Created {len(chunks)} chunks from PyPDFLoader method")
|
50 |
return chunks
|
51 |
|
52 |
except Exception as e:
|
|
|
55 |
|
56 |
# Fallback to direct pypdf usage
|
57 |
try:
|
58 |
+
print("Attempting to use pypdf directly...")
|
59 |
with open(pdf_path, 'rb') as file:
|
60 |
pdf = pypdf.PdfReader(file)
|
61 |
+
print(f"Successfully opened PDF with {len(pdf.pages)} pages")
|
62 |
chunks = []
|
63 |
|
64 |
for page_num in range(len(pdf.pages)):
|
|
|
70 |
'text': chunk,
|
71 |
'metadata': {'page': page_num + 1}
|
72 |
})
|
73 |
+
print(f"Created {len(chunks)} chunks from direct pypdf method")
|
74 |
return chunks
|
75 |
|
76 |
except Exception as e2:
|
77 |
+
error_msg = f"Failed to process PDF with both methods.\nPyPDFLoader error: {str(e)}\npypdf error: {str(e2)}"
|
78 |
+
print(error_msg)
|
79 |
+
raise Exception(error_msg)
|