Spaces:
Sleeping
Sleeping
File size: 6,944 Bytes
e611e72 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 |
import os
import streamlit as st
from pathlib import Path
from PyPDF2 import PdfReader
import sqlite3
import openai
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, Document as LlamaDocument
from llama_index.core.storage.storage_context import StorageContext
from llama_index.core.vector_stores import SimpleVectorStore
from datetime import datetime
# Configure the OpenAI client from the environment; the value is None when
# OPENAI_API_KEY is not set.
openai.api_key = os.environ.get("OPENAI_API_KEY")
class Document:
    """Validate, store, parse, and index uploaded PDF documents.

    Uploaded files are written to the ``uploads`` directory, a row per upload
    is recorded in the ``users_documents`` table of a SQLite database, and
    LlamaIndex storage is persisted under the ``embeddings`` directory. Both
    directories are relative to the current working directory.
    """

    # Largest accepted upload, in bytes (1 MB = 1048576 bytes).
    MAX_FILE_SIZE = 1048576
    # The only MIME type validateDocument accepts.
    ALLOWED_MIME_TYPE = "application/pdf"
    # SQLite database file that holds the users_documents table.
    DB_PATH = "documents.db"

    def __init__(self):
        """Create the working directories and the database table if missing."""
        self.uploads_dir = Path("uploads")
        self.embeddings_dir = Path("embeddings")
        self.uploads_dir.mkdir(exist_ok=True)
        self.embeddings_dir.mkdir(exist_ok=True)
        # Initialize database
        self.init_database()

    @staticmethod
    def _safe_name(filename):
        """Return the final path component of *filename*.

        File names come from the client, so directory parts are dropped to
        keep every read and write inside the configured directories.

        Raises:
            ValueError: If nothing usable remains (empty, ``.`` or ``..``).
        """
        name = Path(str(filename)).name
        if name in ("", ".", ".."):
            raise ValueError(f"Invalid file name: {filename!r}")
        return name

    def validateDocument(self, uploaded_file):
        """
        Validate the uploaded document's size and type
        Args:
            uploaded_file: Streamlit UploadedFile object (its ``type`` and
                ``size`` attributes are read)
        Returns:
            tuple: (bool, str) - (is_valid, error_message); the message is
            empty when the document is valid
        """
        # Check file type
        if uploaded_file.type != self.ALLOWED_MIME_TYPE:
            return False, "Invalid Document Type"
        # Check file size
        if uploaded_file.size > self.MAX_FILE_SIZE:
            return False, "Invalid Document Size"
        return True, ""

    def init_database(self):
        """Initialize SQLite database with required table"""
        conn = sqlite3.connect(self.DB_PATH)
        try:
            # Create users_documents table if it doesn't exist
            conn.execute('''
                CREATE TABLE IF NOT EXISTS users_documents (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    user_id TEXT NOT NULL,
                    filename TEXT NOT NULL,
                    upload_date TIMESTAMP DEFAULT CURRENT_TIMESTAMP
                )
            ''')
            conn.commit()
        finally:
            # Close the connection even when the statement fails.
            conn.close()

    def upload(self, uploaded_file, user_id):
        """
        Upload the document to the uploads folder and store metadata in database

        The file is stored under its own base name, so a later upload with
        the same name overwrites the earlier file.

        Args:
            uploaded_file: Streamlit UploadedFile object
            user_id: String identifier for the user
        Returns:
            bool: Success status of upload; False when ``uploaded_file`` is
            None or when any step fails (the error is shown via ``st.error``)
        """
        if uploaded_file is None:
            return False
        try:
            # Strip directory parts so the write cannot escape uploads_dir.
            filename = self._safe_name(uploaded_file.name)
            file_path = self.uploads_dir / filename
            # Save file to uploads directory
            with open(file_path, "wb") as f:
                f.write(uploaded_file.getbuffer())
            # Store file information in database
            conn = sqlite3.connect(self.DB_PATH)
            try:
                conn.execute(
                    "INSERT INTO users_documents (user_id, filename) VALUES (?, ?)",
                    (user_id, filename)
                )
                conn.commit()
            finally:
                conn.close()
            return True
        except Exception as e:
            st.error(f"Error in upload: {str(e)}")
            return False

    def processDocument(self, filename):
        """
        Extract text from PDF document
        Args:
            filename: Name of the file to process; only its base name is
                used, and it is looked up in the uploads directory
        Returns:
            str: Extracted text from the PDF, or None when the file is
            missing or cannot be read (the error is shown via ``st.error``)
        """
        try:
            file_path = self.uploads_dir / self._safe_name(filename)
            if not file_path.exists():
                raise FileNotFoundError(f"File {filename} not found in uploads directory")
            # Extract text from PDF
            pdf_reader = PdfReader(str(file_path))
            # Join once instead of repeated +=; "or ''" tolerates a page
            # whose extraction yields nothing.
            return "".join(page.extract_text() or "" for page in pdf_reader.pages)
        except Exception as e:
            st.error(f"Error in processing document: {str(e)}")
            return None

    def storeEmbeddings(self, text, filename):
        """
        Create and store embeddings using LlamaIndex
        Args:
            text: Extracted text from the document
            filename: Name of the file to use for storing embeddings; the
                index is persisted in ``embeddings/<base name without extension>``
        Returns:
            bool: Success status of embedding storage (errors are shown via
            ``st.error``)
        """
        try:
            # Remove directory parts and file extension from filename
            base_filename = Path(self._safe_name(filename)).stem
            # Create a LlamaIndex document
            documents = [LlamaDocument(text=text)]
            # Create vector store and index
            vector_store = SimpleVectorStore()
            storage_context = StorageContext.from_defaults(vector_store=vector_store)
            index = VectorStoreIndex.from_documents(
                documents,
                storage_context=storage_context
            )
            # Save the index
            index.storage_context.persist(persist_dir=str(self.embeddings_dir / base_filename))
            return True
        except Exception as e:
            st.error(f"Error in storing embeddings: {str(e)}")
            return False
# Example Streamlit interface
def main():
    """Render the upload page and run validate -> upload -> extract -> embed.

    Nothing is processed until a file is chosen, a user ID is entered, and
    the "Process Document" button is pressed. Each stage reports its outcome
    through ``st.success`` / ``st.error`` and the pipeline stops at the first
    failing stage.
    """
    st.title("Document Upload and Processing")
    processor = Document()
    # Simple user ID input (in a real app, this would be handled by authentication)
    user_id = st.text_input("Enter User ID")
    pdf_file = st.file_uploader("Choose a PDF file", type="pdf")

    # The button is only rendered once both inputs are present.
    if pdf_file is None or not user_id:
        return
    if not st.button("Process Document"):
        return

    # Stage 1: validation
    valid, reason = processor.validateDocument(pdf_file)
    if not valid:
        st.error(reason)
        return

    # Stage 2: save the file and record it
    if not processor.upload(pdf_file, user_id):
        st.error("Error uploading file")
        return
    st.success("File uploaded successfully!")

    # Stage 3: text extraction
    extracted = processor.processDocument(pdf_file.name)
    if not extracted:
        st.error("Error processing document")
        return
    st.success("Document processed successfully!")

    # Stage 4: embeddings
    if processor.storeEmbeddings(extracted, pdf_file.name):
        st.success("Embeddings stored successfully!")
    else:
        st.error("Error storing embeddings")
# Run the Streamlit interface only when this file is executed as a script.
if __name__ == "__main__":
    main()