# Spaces: akhil-vaidya / DocProcess (Sleeping)
# File size: 6,944 Bytes - e611e72

import os
import streamlit as st
from pathlib import Path
from PyPDF2 import PdfReader
import sqlite3
import openai
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, Document as LlamaDocument
from llama_index.core.storage.storage_context import StorageContext
from llama_index.core.vector_stores import SimpleVectorStore
from datetime import datetime

# Read the OpenAI API key from the OPENAI_API_KEY environment variable at
# import time; os.getenv returns None when the variable is not set.
openai.api_key = os.getenv("OPENAI_API_KEY")

class Document:
    """Validate, store, parse and embed user-uploaded PDF documents.

    Uploaded files are written to ``uploads_dir``, one row per upload is
    recorded in the ``users_documents`` table of a SQLite database, and the
    LlamaIndex storage for each document is persisted under
    ``embeddings_dir/<file stem>``.
    """

    # Upload constraints enforced by ``validateDocument``.
    ALLOWED_MIME_TYPE = "application/pdf"
    MAX_FILE_SIZE_BYTES = 1048576  # 1 MB

    def __init__(self, uploads_dir="uploads", embeddings_dir="embeddings",
                 db_path="documents.db"):
        """
        Create the storage directories and the database table if missing

        Args:
            uploads_dir: Directory that receives uploaded files
            embeddings_dir: Directory that receives persisted indexes
            db_path: Path of the SQLite database file
        """
        self.uploads_dir = Path(uploads_dir)
        self.embeddings_dir = Path(embeddings_dir)
        self.db_path = Path(db_path)

        # Create necessary directories if they don't exist
        self.uploads_dir.mkdir(parents=True, exist_ok=True)
        self.embeddings_dir.mkdir(parents=True, exist_ok=True)

        # Initialize database
        self.init_database()

    @staticmethod
    def _safe_filename(filename):
        """
        Reduce a client-supplied filename to its final path component

        Directory parts are dropped so that a name such as ``../x.pdf``
        cannot point outside the storage directories.

        Args:
            filename: File name as supplied by the caller

        Returns:
            str: The bare file name

        Raises:
            ValueError: If nothing usable remains (empty, ``.`` or ``..``)
        """
        name = Path(str(filename)).name
        if name in ("", ".", ".."):
            raise ValueError(f"Invalid filename: {filename!r}")
        return name

    def validateDocument(self, uploaded_file):
        """
        Validate the uploaded document's size and type

        Args:
            uploaded_file: Streamlit UploadedFile object

        Returns:
            tuple: (bool, str) - (is_valid, error_message)
        """
        # Check file type
        if uploaded_file.type != self.ALLOWED_MIME_TYPE:
            return False, "Invalid Document Type"

        # Check file size
        if uploaded_file.size > self.MAX_FILE_SIZE_BYTES:
            return False, "Invalid Document Size"

        return True, ""

    def init_database(self):
        """Initialize SQLite database with required table"""
        conn = sqlite3.connect(str(self.db_path))
        try:
            # ``with conn`` commits on success and rolls back on error; it
            # does not close the connection, hence the ``finally`` below.
            with conn:
                # Create users_documents table if it doesn't exist
                conn.execute('''
                    CREATE TABLE IF NOT EXISTS users_documents (
                        id INTEGER PRIMARY KEY AUTOINCREMENT,
                        user_id TEXT NOT NULL,
                        filename TEXT NOT NULL,
                        upload_date TIMESTAMP DEFAULT CURRENT_TIMESTAMP
                    )
                ''')
        finally:
            conn.close()

    def upload(self, uploaded_file, user_id):
        """
        Upload the document to the uploads folder and store metadata in database

        The file keeps its original (sanitized) name, so uploading the same
        name again overwrites the earlier file.

        Args:
            uploaded_file: Streamlit UploadedFile object
            user_id: String identifier for the user

        Returns:
            bool: Success status of upload
        """
        if uploaded_file is None:
            return False

        try:
            # Strip any directory components from the client-supplied name
            filename = self._safe_filename(uploaded_file.name)
            file_path = self.uploads_dir / filename

            # Save file to uploads directory
            with open(file_path, "wb") as f:
                f.write(uploaded_file.getbuffer())

            # Store file information in database
            conn = sqlite3.connect(str(self.db_path))
            try:
                with conn:
                    conn.execute(
                        "INSERT INTO users_documents (user_id, filename) VALUES (?, ?)",
                        (user_id, filename)
                    )
            finally:
                conn.close()

            return True

        except Exception as e:
            st.error(f"Error in upload: {str(e)}")
            return False

    def processDocument(self, filename):
        """
        Extract text from PDF document

        Args:
            filename: Name of the file to process

        Returns:
            str: Extracted text from the PDF, or None if the file is missing
            or cannot be read (the error is reported through st.error)
        """
        try:
            file_path = self.uploads_dir / self._safe_filename(filename)

            if not file_path.exists():
                raise FileNotFoundError(f"File {filename} not found in uploads directory")

            # Extract text from PDF; a page that yields no text contributes
            # an empty string instead of breaking the concatenation.
            pdf_reader = PdfReader(str(file_path))
            return "".join(page.extract_text() or "" for page in pdf_reader.pages)

        except Exception as e:
            st.error(f"Error in processing document: {str(e)}")
            return None

    def storeEmbeddings(self, text, filename):
        """
        Create and store embeddings using LlamaIndex

        Args:
            text: Extracted text from the document
            filename: Name of the file to use for storing embeddings

        Returns:
            bool: Success status of embedding storage
        """
        try:
            # Remove directory parts and file extension from filename
            base_filename = Path(self._safe_filename(filename)).stem

            # Create a LlamaIndex document
            documents = [LlamaDocument(text=text)]

            # Create vector store and index
            vector_store = SimpleVectorStore()
            storage_context = StorageContext.from_defaults(vector_store=vector_store)
            index = VectorStoreIndex.from_documents(
                documents,
                storage_context=storage_context
            )

            # Save the index
            index.storage_context.persist(persist_dir=str(self.embeddings_dir / base_filename))

            return True

        except Exception as e:
            st.error(f"Error in storing embeddings: {str(e)}")
            return False

# Example Streamlit interface
def main():
    """Render the upload page and run validate -> upload -> extract -> embed.

    Nothing happens until a user ID has been entered, a file has been
    chosen and the "Process Document" button is pressed. Each stage reports
    its outcome through st.success / st.error, and the first failing stage
    ends the run.
    """
    st.title("Document Upload and Processing")

    # Initialize Document class
    processor = Document()

    # Simple user ID input (in a real app, this would be handled by authentication)
    user_id = st.text_input("Enter User ID")

    # File upload widget
    uploaded_file = st.file_uploader("Choose a PDF file", type="pdf")

    # Both inputs are required before the button is even rendered
    if uploaded_file is None or not user_id:
        return
    if not st.button("Process Document"):
        return

    # Validate type and size
    ok, reason = processor.validateDocument(uploaded_file)
    if not ok:
        st.error(reason)
        return

    # Upload file
    if not processor.upload(uploaded_file, user_id):
        st.error("Error uploading file")
        return
    st.success("File uploaded successfully!")

    # Process document
    extracted_text = processor.processDocument(uploaded_file.name)
    if not extracted_text:
        st.error("Error processing document")
        return
    st.success("Document processed successfully!")

    # Store embeddings
    if processor.storeEmbeddings(extracted_text, uploaded_file.name):
        st.success("Embeddings stored successfully!")
    else:
        st.error("Error storing embeddings")

# Run the Streamlit interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()