akhil-vaidya committed on
Commit
e611e72
1 Parent(s): 5c9c2a9
Files changed (2) hide show
  1. app.py +207 -0
  2. requirements.txt +5 -0
app.py ADDED
@@ -0,0 +1,207 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import streamlit as st
3
+ from pathlib import Path
4
+ from PyPDF2 import PdfReader
5
+ import sqlite3
6
+ import openai
7
+ from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, Document as LlamaDocument
8
+ from llama_index.core.storage.storage_context import StorageContext
9
+ from llama_index.core.vector_stores import SimpleVectorStore
10
+ from datetime import datetime
11
+
12
# Take the OpenAI API key from the OPENAI_API_KEY environment variable
# (the value is None when the variable is unset).
openai.api_key = os.environ.get("OPENAI_API_KEY")
13
+
14
class Document:
    """PDF ingestion pipeline: validate, save, extract text, store embeddings.

    Uploaded files are written to ``uploads/``, one metadata row per upload
    is inserted into the ``users_documents`` table of a SQLite database, and
    LlamaIndex storage is persisted under ``embeddings/<file stem>/``.
    """

    # Largest accepted upload, in bytes (1 MB).
    MAX_FILE_SIZE_BYTES = 1048576
    # Only this MIME type passes validation.
    ALLOWED_MIME_TYPE = "application/pdf"
    # SQLite database file holding the users_documents table.
    DB_PATH = "documents.db"

    def __init__(self):
        """Create the working directories and make sure the table exists."""
        self.uploads_dir = Path("uploads")
        self.embeddings_dir = Path("embeddings")
        self.uploads_dir.mkdir(exist_ok=True)
        self.embeddings_dir.mkdir(exist_ok=True)

        self.init_database()

    @staticmethod
    def _safe_filename(filename):
        """Return only the final path component of *filename*.

        The name comes from the client, so directory components are dropped
        to keep every path inside the directory it is later joined to.

        Raises:
            ValueError: If nothing usable remains (empty, "." or "..").
        """
        name = Path(str(filename)).name
        if name in ("", ".", ".."):
            raise ValueError(f"Invalid file name: {filename!r}")
        return name

    def validateDocument(self, uploaded_file):
        """
        Validate the uploaded document's size and type

        Args:
            uploaded_file: Streamlit UploadedFile object

        Returns:
            tuple: (bool, str) - (is_valid, error_message)
        """
        # The type check runs first, so a large non-PDF reports the type error.
        if uploaded_file.type != self.ALLOWED_MIME_TYPE:
            return False, "Invalid Document Type"

        if uploaded_file.size > self.MAX_FILE_SIZE_BYTES:
            return False, "Invalid Document Size"

        return True, ""

    def init_database(self):
        """Initialize SQLite database with required table"""
        conn = sqlite3.connect(self.DB_PATH)
        try:
            conn.execute('''
                CREATE TABLE IF NOT EXISTS users_documents (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    user_id TEXT NOT NULL,
                    filename TEXT NOT NULL,
                    upload_date TIMESTAMP DEFAULT CURRENT_TIMESTAMP
                )
            ''')
            conn.commit()
        finally:
            # Close the connection even when the statement fails.
            conn.close()

    def upload(self, uploaded_file, user_id):
        """
        Upload the document to the uploads folder and store metadata in database

        The file is stored under its own (sanitized) name, so uploading the
        same name again overwrites the earlier file on disk.

        Args:
            uploaded_file: Streamlit UploadedFile object
            user_id: String identifier for the user

        Returns:
            bool: Success status of upload
        """
        try:
            if uploaded_file is None:
                return False

            filename = self._safe_filename(uploaded_file.name)
            file_path = self.uploads_dir / filename

            # Save file to uploads directory
            file_path.write_bytes(uploaded_file.getbuffer())

            # Store file information in database
            conn = sqlite3.connect(self.DB_PATH)
            try:
                conn.execute(
                    "INSERT INTO users_documents (user_id, filename) VALUES (?, ?)",
                    (user_id, filename),
                )
                conn.commit()
            finally:
                conn.close()

            return True

        except Exception as e:
            # UI boundary: report the failure in the page instead of crashing.
            st.error(f"Error in upload: {str(e)}")
            return False

    def processDocument(self, filename):
        """
        Extract text from PDF document

        Args:
            filename: Name of the file to process

        Returns:
            str: Extracted text from the PDF, or None when extraction fails
        """
        try:
            file_path = self.uploads_dir / self._safe_filename(filename)

            if not file_path.exists():
                raise FileNotFoundError(f"File {filename} not found in uploads directory")

            pdf_reader = PdfReader(str(file_path))

            # Join once instead of repeated +=; "or ''" guards against a page
            # for which the reader returns nothing.
            return "".join(page.extract_text() or "" for page in pdf_reader.pages)

        except Exception as e:
            # UI boundary: report the failure in the page instead of crashing.
            st.error(f"Error in processing document: {str(e)}")
            return None

    def storeEmbeddings(self, text, filename):
        """
        Create and store embeddings using LlamaIndex

        Args:
            text: Extracted text from the document
            filename: Name of the file to use for storing embeddings

        Returns:
            bool: Success status of embedding storage
        """
        try:
            # Directory name = file name without its extension.
            base_filename = Path(self._safe_filename(filename)).stem

            documents = [LlamaDocument(text=text)]

            # Create vector store and index
            vector_store = SimpleVectorStore()
            storage_context = StorageContext.from_defaults(vector_store=vector_store)
            index = VectorStoreIndex.from_documents(
                documents,
                storage_context=storage_context,
            )

            # Save the index
            index.storage_context.persist(
                persist_dir=str(self.embeddings_dir / base_filename)
            )

            return True

        except Exception as e:
            # UI boundary: report the failure in the page instead of crashing.
            st.error(f"Error in storing embeddings: {str(e)}")
            return False
166
+
167
+ # Example Streamlit interface
168
def main():
    """Render the Streamlit page and run the upload pipeline on demand.

    The page asks for a user ID and a PDF. Once both are present a
    "Process Document" button appears; pressing it validates the file,
    saves it, extracts its text and stores embeddings, reporting the
    outcome of each step in the page and stopping at the first failure.
    """
    st.title("Document Upload and Processing")

    pipeline = Document()

    # Simple user ID input (in a real app, this would be handled by authentication)
    user_id = st.text_input("Enter User ID")
    uploaded_file = st.file_uploader("Choose a PDF file", type="pdf")

    # The button is only rendered once both inputs have been supplied.
    if uploaded_file is None or not user_id:
        return
    if not st.button("Process Document"):
        return

    ok, problem = pipeline.validateDocument(uploaded_file)
    if not ok:
        st.error(problem)
        return

    if not pipeline.upload(uploaded_file, user_id):
        st.error("Error uploading file")
        return
    st.success("File uploaded successfully!")

    extracted = pipeline.processDocument(uploaded_file.name)
    if not extracted:
        st.error("Error processing document")
        return
    st.success("Document processed successfully!")

    if pipeline.storeEmbeddings(extracted, uploaded_file.name):
        st.success("Embeddings stored successfully!")
    else:
        st.error("Error storing embeddings")


if __name__ == "__main__":
    main()
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ streamlit
2
+ PyPDF2
3
+ openai
4
+ llama-index
5
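+ # sqlite3 is part of the Python standard library; it is not a pip-installable package and must not be listed here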