akhil-vaidya committed on
Commit
05f9185
1 Parent(s): d2c818e
Files changed (4) hide show
  1. .github/workflows/main.yml +9 -1
  2. requirements.txt +3 -0
  3. tests.py +233 -0
  4. unit_tests.py +0 -233
.github/workflows/main.yml CHANGED
@@ -14,8 +14,16 @@ jobs:
14
  with:
15
  fetch-depth: 0
16
  lfs: true
 
 
 
 
 
 
 
 
17
  - name: Testing
18
- run: python unit_tests.py
19
  - name: Push to hub
20
  env:
21
  HF_TOKEN: ${{ secrets.HF_TOKEN }}
 
14
  with:
15
  fetch-depth: 0
16
  lfs: true
17
+ - name: Set up Python
18
+ uses: actions/setup-python@v2
19
+ with:
20
+ python-version: '3.8'
21
+ - name: Install dependencies
22
+ run: |
23
+ python -m pip install --upgrade pip
24
+ pip install -r requirements.txt
25
  - name: Testing
26
+ run: python tests.py
27
  - name: Push to hub
28
  env:
29
  HF_TOKEN: ${{ secrets.HF_TOKEN }}
requirements.txt CHANGED
@@ -2,3 +2,6 @@ streamlit
2
  PyPDF2
3
  openai
4
  llama-index
 
 
 
 
2
  PyPDF2
3
  openai
4
  llama-index
5
# NOTE: unittest, shutil and pathlib are part of the Python standard library
# and must not be listed here (the PyPI "pathlib" distribution is an obsolete
# backport for Python < 3.4).
# NOTE(review): tests.py only uses the stdlib shutil module; confirm whether
# pytest-shutil is actually needed before keeping it.
pytest-shutil
tests.py ADDED
@@ -0,0 +1,233 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import unittest
2
+ import os
3
+ import shutil
4
+ from pathlib import Path
5
+ import sqlite3
6
+ from unittest.mock import MagicMock, patch
7
+ from io import BytesIO
8
+ from app import Document # Assuming the main code is in document_processor.py
9
+ import warnings
10
+ warnings.filterwarnings("ignore", category=UserWarning, message="Thread 'MainThread': missing ScriptRunContext!")
11
+
12
+
13
class TestDocument(unittest.TestCase):
    """Tests for the ``Document`` class imported from ``app``.

    The tests work against the current working directory: they expect
    ``Document()`` to create ``uploads/`` and ``embeddings/`` directories and a
    ``documents.db`` SQLite file there, and ``tearDown`` deletes all of them
    after every test.
    """

    def setUp(self):
        """Set up test environment before each test"""
        self.test_dir = Path("test_temp")
        self.test_dir.mkdir(exist_ok=True)
        self.doc_processor = Document()
        self.test_user_id = "test_user"
        # BytesIO stands in for an uploaded file object; upload() is expected
        # to read its ``name`` attribute (see test_upload_stores_file_and_metadata).
        self.mock_pdf_content = BytesIO(b"Mock PDF content")
        self.mock_pdf_content.name = "test.pdf"

    def tearDown(self):
        """Clean up after each test"""
        try:
            # Remove test database
            db_path = Path('documents.db')
            if db_path.exists():
                os.remove(db_path)

            # Clean up test directories
            for dir_path in ['uploads', 'embeddings', 'test_temp']:
                if Path(dir_path).exists():
                    shutil.rmtree(dir_path, ignore_errors=True)

        except Exception as e:
            # Cleanup is best-effort: a failure here must not fail the test.
            print(f"Warning: Cleanup failed: {str(e)}")

    def test_init_creates_directories(self):
        """Test if initialization creates necessary directories"""
        # Create fresh instance with actual directories
        Document()

        # Verify uploads directory exists
        uploads_dir = Path('uploads')
        self.assertTrue(uploads_dir.exists())
        self.assertTrue(uploads_dir.is_dir())

        # Verify embeddings directory exists
        embeddings_dir = Path('embeddings')
        self.assertTrue(embeddings_dir.exists())
        self.assertTrue(embeddings_dir.is_dir())

    def test_init_database_creates_table(self):
        """Test if database initialization creates the required table"""
        # Create fresh instance
        Document()

        # Connect to the database
        conn = sqlite3.connect('documents.db')
        cursor = conn.cursor()

        try:
            # Query to check if table exists
            cursor.execute("""
                SELECT name FROM sqlite_master
                WHERE type='table' AND name='users_documents'
            """)

            # Verify table exists
            self.assertIsNotNone(cursor.fetchone())

            # Verify table structure
            cursor.execute("PRAGMA table_info(users_documents)")
            columns = cursor.fetchall()

            # Check if all required columns exist (index 1 of each
            # table_info row is the column name)
            column_names = [col[1] for col in columns]
            self.assertIn('id', column_names)
            self.assertIn('user_id', column_names)
            self.assertIn('filename', column_names)
            self.assertIn('upload_date', column_names)
        finally:
            conn.close()

    def test_upload_stores_file_and_metadata(self):
        """Test if upload function stores file and updates database"""
        uploads_dir = Path('uploads')
        uploads_dir.mkdir(exist_ok=True)
        # Bound before the try block so the cleanup in ``finally`` never hits
        # an unbound name (and masks the real error) when upload() raises.
        uploaded_file = uploads_dir / self.mock_pdf_content.name

        try:
            # Upload mock file
            result = self.doc_processor.upload(self.mock_pdf_content, self.test_user_id)

            # Verify upload success
            self.assertTrue(result)

            # Verify file exists in uploads directory
            self.assertTrue(uploaded_file.exists())

            # Verify database entry; the connection is closed even if the
            # query fails.
            conn = sqlite3.connect('documents.db')
            try:
                cursor = conn.cursor()
                cursor.execute(
                    "SELECT filename FROM users_documents WHERE user_id = ?",
                    (self.test_user_id,)
                )
                row = cursor.fetchone()
            finally:
                conn.close()

            # A missing row should be reported as a test failure, not as a
            # TypeError from indexing None.
            self.assertIsNotNone(row)
            self.assertEqual(row[0], self.mock_pdf_content.name)
        finally:
            # Clean up uploaded file
            if uploaded_file.exists():
                os.remove(uploaded_file)

    @patch('llama_index.core.VectorStoreIndex.from_documents')
    def test_store_embeddings_creates_index(self, mock_index):
        """Test if storeEmbeddings creates and stores vector index"""
        # Mock index storage
        mock_storage_context = MagicMock()
        mock_index.return_value.storage_context = mock_storage_context

        # Create embeddings directory
        embeddings_dir = Path('embeddings')
        embeddings_dir.mkdir(exist_ok=True)

        try:
            # Test storing embeddings
            result = self.doc_processor.storeEmbeddings(
                "Test document content",
                "test.pdf"
            )

            # Verify success
            self.assertTrue(result)

            # Verify storage_context.persist was called
            mock_storage_context.persist.assert_called_once()
        finally:
            # Clean up embeddings directory
            if embeddings_dir.exists():
                shutil.rmtree(embeddings_dir, ignore_errors=True)

    def test_validate_document_with_valid_pdf(self):
        """Test validateDocument with a valid PDF file"""
        # Create a mock valid PDF file
        valid_pdf = BytesIO(b"%PDF-1.4\n%...")
        valid_pdf.name = "valid.pdf"
        valid_pdf.type = "application/pdf"
        valid_pdf.size = 1024  # size less than 1MB

        # Call validateDocument
        is_valid, error_message = self.doc_processor.validateDocument(valid_pdf)

        # Assert that the document is valid
        self.assertTrue(is_valid)
        self.assertEqual(error_message, "")

    def test_validate_document_with_invalid_type(self):
        """Test validateDocument with an invalid file type"""
        # Create a mock invalid file (e.g., .txt file)
        invalid_file = BytesIO(b"Sample text content")
        invalid_file.name = "invalid.txt"
        invalid_file.type = "text/plain"
        invalid_file.size = 1024

        # Call validateDocument
        is_valid, error_message = self.doc_processor.validateDocument(invalid_file)

        # Assert that the document is invalid due to type
        self.assertFalse(is_valid)
        self.assertEqual(error_message, "Invalid Document Type")

    def test_validate_document_with_large_size(self):
        """Test validateDocument with a file larger than 1MB"""
        # Create a mock large PDF file
        large_pdf = BytesIO(b"%PDF-1.4\n%..." + b"a" * (1048577))  # size slightly over 1MB
        large_pdf.name = "large.pdf"
        large_pdf.type = "application/pdf"
        large_pdf.size = 1048577

        # Call validateDocument
        is_valid, error_message = self.doc_processor.validateDocument(large_pdf)

        # Assert that the document is invalid due to size
        self.assertFalse(is_valid)
        self.assertEqual(error_message, "Invalid Document Size")

    def test_process_document_success(self):
        """Test processDocument successfully extracts text from a valid PDF"""
        # Create a placeholder PDF file in the uploads directory; its bytes
        # are never parsed for real because PdfReader is mocked below.
        pdf_content = b"%PDF-1.4\n%..."
        pdf_filename = "test_process.pdf"
        pdf_path = self.doc_processor.uploads_dir / pdf_filename
        with open(pdf_path, "wb") as f:
            f.write(pdf_content)

        try:
            # Mock the PdfReader to return pages with text.
            # NOTE(review): this only takes effect if app looks the class up as
            # ``PyPDF2.PdfReader`` at call time; if app does
            # ``from PyPDF2 import PdfReader`` the target must be
            # ``app.PdfReader`` instead -- confirm against app.py.
            with patch('PyPDF2.PdfReader') as MockPdfReader:
                mock_reader_instance = MockPdfReader.return_value
                mock_page = MagicMock()
                mock_page.extract_text.return_value = "Sample extracted text"
                mock_reader_instance.pages = [mock_page]

                # Call processDocument
                text = self.doc_processor.processDocument(pdf_filename)

            # Assert on the value actually returned by processDocument (the
            # previous check compared a literal with itself and could never
            # fail). assertIn tolerates separators the implementation may add
            # around page text.
            self.assertIsNotNone(text)
            self.assertIn("Sample extracted text", text)
        finally:
            # Clean up
            if pdf_path.exists():
                os.remove(pdf_path)

    def test_process_document_file_not_found(self):
        """Test processDocument when the file does not exist"""
        # Call processDocument with a filename that doesn't exist
        text = self.doc_processor.processDocument("non_existent_file.pdf")

        # Assert that text is None due to error
        self.assertIsNone(text)

    def test_store_embeddings_with_empty_text(self):
        """Test storeEmbeddings with empty text"""
        # Attempt to store embeddings with empty text
        result = self.doc_processor.storeEmbeddings("", "empty_text.pdf")

        # Assert that the result is False due to empty text
        self.assertFalse(result)
230
+
231
if __name__ == '__main__':
    # The CI workflow runs this file as a script (`python tests.py`), so the
    # suite has to be started here; merely printing "OK" made the Testing step
    # pass without executing a single test.
    unittest.main()
unit_tests.py DELETED
@@ -1,233 +0,0 @@
1
- # import unittest
2
- # import os
3
- # import shutil
4
- # from pathlib import Path
5
- # import sqlite3
6
- # from unittest.mock import MagicMock, patch
7
- # from io import BytesIO
8
- # from app import Document # Assuming the main code is in document_processor.py
9
- # import warnings
10
- # warnings.filterwarnings("ignore", category=UserWarning, message="Thread 'MainThread': missing ScriptRunContext!")
11
-
12
-
13
- # class TestDocument(unittest.TestCase):
14
- # def setUp(self):
15
- # """Set up test environment before each test"""
16
- # self.test_dir = Path("test_temp")
17
- # self.test_dir.mkdir(exist_ok=True)
18
- # self.doc_processor = Document()
19
- # self.test_user_id = "test_user"
20
- # self.mock_pdf_content = BytesIO(b"Mock PDF content")
21
- # self.mock_pdf_content.name = "test.pdf"
22
-
23
- # def tearDown(self):
24
- # """Clean up after each test"""
25
- # try:
26
- # # Remove test database
27
- # db_path = Path('documents.db')
28
- # if db_path.exists():
29
- # os.remove(db_path)
30
-
31
- # # Clean up test directories
32
- # for dir_path in ['uploads', 'embeddings', 'test_temp']:
33
- # if Path(dir_path).exists():
34
- # shutil.rmtree(dir_path, ignore_errors=True)
35
-
36
- # except Exception as e:
37
- # print(f"Warning: Cleanup failed: {str(e)}")
38
-
39
- # def test_init_creates_directories(self):
40
- # """Test if initialization creates necessary directories"""
41
- # # Create fresh instance with actual directories
42
- # doc = Document()
43
-
44
- # # Verify uploads directory exists
45
- # uploads_dir = Path('uploads')
46
- # self.assertTrue(uploads_dir.exists())
47
- # self.assertTrue(uploads_dir.is_dir())
48
-
49
- # # Verify embeddings directory exists
50
- # embeddings_dir = Path('embeddings')
51
- # self.assertTrue(embeddings_dir.exists())
52
- # self.assertTrue(embeddings_dir.is_dir())
53
-
54
- # def test_init_database_creates_table(self):
55
- # """Test if database initialization creates the required table"""
56
- # # Create fresh instance
57
- # doc = Document()
58
-
59
- # # Connect to the database
60
- # conn = sqlite3.connect('documents.db')
61
- # cursor = conn.cursor()
62
-
63
- # try:
64
- # # Query to check if table exists
65
- # cursor.execute("""
66
- # SELECT name FROM sqlite_master
67
- # WHERE type='table' AND name='users_documents'
68
- # """)
69
-
70
- # # Verify table exists
71
- # self.assertIsNotNone(cursor.fetchone())
72
-
73
- # # Verify table structure
74
- # cursor.execute("PRAGMA table_info(users_documents)")
75
- # columns = cursor.fetchall()
76
-
77
- # # Check if all required columns exist
78
- # column_names = [col[1] for col in columns]
79
- # self.assertIn('id', column_names)
80
- # self.assertIn('user_id', column_names)
81
- # self.assertIn('filename', column_names)
82
- # self.assertIn('upload_date', column_names)
83
- # finally:
84
- # conn.close()
85
-
86
- # def test_upload_stores_file_and_metadata(self):
87
- # """Test if upload function stores file and updates database"""
88
- # uploads_dir = Path('uploads')
89
- # uploads_dir.mkdir(exist_ok=True)
90
-
91
- # try:
92
- # # Upload mock file
93
- # result = self.doc_processor.upload(self.mock_pdf_content, self.test_user_id)
94
-
95
- # # Verify upload success
96
- # self.assertTrue(result)
97
-
98
- # # Verify file exists in uploads directory
99
- # uploaded_file = Path('uploads') / self.mock_pdf_content.name
100
- # self.assertTrue(uploaded_file.exists())
101
-
102
- # # Verify database entry
103
- # conn = sqlite3.connect('documents.db')
104
- # cursor = conn.cursor()
105
- # cursor.execute(
106
- # "SELECT filename FROM users_documents WHERE user_id = ?",
107
- # (self.test_user_id,)
108
- # )
109
- # db_filename = cursor.fetchone()[0]
110
- # self.assertEqual(db_filename, self.mock_pdf_content.name)
111
- # conn.close()
112
- # finally:
113
- # # Clean up uploaded file
114
- # if uploaded_file.exists():
115
- # os.remove(uploaded_file)
116
-
117
- # @patch('llama_index.core.VectorStoreIndex.from_documents')
118
- # def test_store_embeddings_creates_index(self, mock_index):
119
- # """Test if storeEmbeddings creates and stores vector index"""
120
- # # Mock index storage
121
- # mock_storage_context = MagicMock()
122
- # mock_index.return_value.storage_context = mock_storage_context
123
-
124
- # # Create embeddings directory
125
- # embeddings_dir = Path('embeddings')
126
- # embeddings_dir.mkdir(exist_ok=True)
127
-
128
- # try:
129
- # # Test storing embeddings
130
- # result = self.doc_processor.storeEmbeddings(
131
- # "Test document content",
132
- # "test.pdf"
133
- # )
134
-
135
- # # Verify success
136
- # self.assertTrue(result)
137
-
138
- # # Verify storage_context.persist was called
139
- # mock_storage_context.persist.assert_called_once()
140
- # finally:
141
- # # Clean up embeddings directory
142
- # if embeddings_dir.exists():
143
- # shutil.rmtree(embeddings_dir, ignore_errors=True)
144
-
145
- # def test_validate_document_with_valid_pdf(self):
146
- # """Test validateDocument with a valid PDF file"""
147
- # # Create a mock valid PDF file
148
- # valid_pdf = BytesIO(b"%PDF-1.4\n%...")
149
- # valid_pdf.name = "valid.pdf"
150
- # valid_pdf.type = "application/pdf"
151
- # valid_pdf.size = 1024 # size less than 1MB
152
-
153
- # # Call validateDocument
154
- # is_valid, error_message = self.doc_processor.validateDocument(valid_pdf)
155
-
156
- # # Assert that the document is valid
157
- # self.assertTrue(is_valid)
158
- # self.assertEqual(error_message, "")
159
-
160
- # def test_validate_document_with_invalid_type(self):
161
- # """Test validateDocument with an invalid file type"""
162
- # # Create a mock invalid file (e.g., .txt file)
163
- # invalid_file = BytesIO(b"Sample text content")
164
- # invalid_file.name = "invalid.txt"
165
- # invalid_file.type = "text/plain"
166
- # invalid_file.size = 1024
167
-
168
- # # Call validateDocument
169
- # is_valid, error_message = self.doc_processor.validateDocument(invalid_file)
170
-
171
- # # Assert that the document is invalid due to type
172
- # self.assertFalse(is_valid)
173
- # self.assertEqual(error_message, "Invalid Document Type")
174
-
175
- # def test_validate_document_with_large_size(self):
176
- # """Test validateDocument with a file larger than 1MB"""
177
- # # Create a mock large PDF file
178
- # large_pdf = BytesIO(b"%PDF-1.4\n%..." + b"a" * (1048577)) # size slightly over 1MB
179
- # large_pdf.name = "large.pdf"
180
- # large_pdf.type = "application/pdf"
181
- # large_pdf.size = 1048577
182
-
183
- # # Call validateDocument
184
- # is_valid, error_message = self.doc_processor.validateDocument(large_pdf)
185
-
186
- # # Assert that the document is invalid due to size
187
- # self.assertFalse(is_valid)
188
- # self.assertEqual(error_message, "Invalid Document Size")
189
-
190
- # def test_process_document_success(self):
191
- # """Test processDocument successfully extracts text from a valid PDF"""
192
- # # Create a mock PDF file and save it to uploads directory
193
- # pdf_content = b"%PDF-1.4\n%..." # Minimal valid PDF content
194
- # pdf_filename = "test_process.pdf"
195
- # pdf_path = self.doc_processor.uploads_dir / pdf_filename
196
- # with open(pdf_path, "wb") as f:
197
- # f.write(pdf_content)
198
-
199
- # # Mock the PdfReader to return pages with text
200
- # with patch('PyPDF2.PdfReader') as MockPdfReader:
201
- # mock_reader_instance = MockPdfReader.return_value
202
- # mock_page = MagicMock()
203
- # mock_page.extract_text.return_value = "Sample extracted text"
204
- # mock_reader_instance.pages = [mock_page]
205
-
206
- # # Call processDocument
207
- # text = self.doc_processor.processDocument(pdf_filename)
208
-
209
- # # Assert that the extracted text is as expected
210
- # self.assertEqual("Sample extracted text", "Sample extracted text")
211
-
212
- # # Clean up
213
- # if pdf_path.exists():
214
- # os.remove(pdf_path)
215
-
216
- # def test_process_document_file_not_found(self):
217
- # """Test processDocument when the file does not exist"""
218
- # # Call processDocument with a filename that doesn't exist
219
- # text = self.doc_processor.processDocument("non_existent_file.pdf")
220
-
221
- # # Assert that text is None due to error
222
- # self.assertIsNone(text)
223
- # def test_store_embeddings_with_empty_text(self):
224
- # """Test storeEmbeddings with empty text"""
225
- # # Attempt to store embeddings with empty text
226
- # result = self.doc_processor.storeEmbeddings("", "empty_text.pdf")
227
-
228
- # # Assert that the result is False due to empty text
229
- # self.assertFalse(result)
230
-
231
- if __name__ == '__main__':
232
- print("OK")
233
- # unittest.main()