Spaces:

tony-42069
/

cre-chatbot-rag

Sleeping

App Files Files Community

tony-42069 committed on Nov 27, 2024

Commit

d16e9aa

1 Parent(s): 836ede6

Add source code and test files

Browse files

Files changed (9) hide show

api/__init__.py +8 -0
api/function_app.py +71 -0
api/requirements.txt +7 -0
src/__init__.py +1 -0
src/pdf_processor.py +112 -0
src/rag_engine.py +131 -0
tests/__init__.py +1 -0
tests/test_pdf_processor.py +73 -0
tests/test_rag_engine.py +112 -0

api/__init__.py ADDED Viewed

	@@ -0,0 +1,8 @@

+import azure.functions as func
+import streamlit as st
+def main(req: func.HttpRequest) -> func.HttpResponse:
+    return func.HttpResponse(
+        "This is the API endpoint for the CRE Knowledge Assistant",
+        status_code=200
+    )

api/function_app.py ADDED Viewed

	@@ -0,0 +1,71 @@

+import azure.functions as func
+import logging
+import json
+from io import BytesIO
+# Add the project root to Python path
+import sys
+import os
+sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+from app.config import validate_config
+from app.logging import setup_logging
+from src.pdf_processor import PDFProcessor
+from src.rag_engine import RAGEngine
+# Initialize components
+setup_logging()
+logger = logging.getLogger('app')
+pdf_processor = PDFProcessor()
+rag_engine = RAGEngine()
+def process_pdf(req: func.HttpRequest) -> func.HttpResponse:
+    try:
+        # Get the PDF file from the request
+        pdf_file = req.files['file']
+        pdf_bytes = pdf_file.read()
+        # Process the PDF
+        pdf_processor.process(BytesIO(pdf_bytes))
+        return func.HttpResponse(
+            json.dumps({"message": "PDF processed successfully"}),
+            mimetype="application/json",
+            status_code=200
+        )
+    except Exception as e:
+        logger.error(f"Error processing PDF: {str(e)}")
+        return func.HttpResponse(
+            json.dumps({"error": str(e)}),
+            mimetype="application/json",
+            status_code=500
+        )
+def query(req: func.HttpRequest) -> func.HttpResponse:
+    try:
+        # Get the query from request body
+        req_body = req.get_json()
+        user_query = req_body.get('query')
+        if not user_query:
+            return func.HttpResponse(
+                json.dumps({"error": "No query provided"}),
+                mimetype="application/json",
+                status_code=400
+            )
+        # Process query through RAG engine
+        answer = rag_engine.process_query(user_query)
+        return func.HttpResponse(
+            json.dumps({"answer": answer}),
+            mimetype="application/json",
+            status_code=200
+        )
+    except Exception as e:
+        logger.error(f"Error processing query: {str(e)}")
+        return func.HttpResponse(
+            json.dumps({"error": str(e)}),
+            mimetype="application/json",
+            status_code=500
+        )

api/requirements.txt ADDED Viewed

	@@ -0,0 +1,7 @@

+azure-functions==1.15.0
+openai==1.6.1
+python-dotenv==1.0.0
+azure-cognitiveservices-language-textanalytics==0.2.0
+PyPDF2==3.0.1
+langchain==0.0.352
+azure-storage-blob==12.19.0

src/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


1	+

src/pdf_processor.py ADDED Viewed

	@@ -0,0 +1,112 @@

+"""
+PDF processing module for extracting and chunking text from PDF documents.
+"""
+import logging
+from typing import List, Tuple
+import PyPDF2
+from io import BytesIO
+from app.config import MAX_CHUNK_SIZE, OVERLAP_SIZE
+logger = logging.getLogger('pdf')
+class PDFProcessor:
+    """Handles PDF document processing and text chunking."""
+    @staticmethod
+    def extract_text(pdf_file: BytesIO) -> str:
+        """Extract text content from a PDF file."""
+        try:
+            pdf_reader = PyPDF2.PdfReader(pdf_file)
+            text = ""
+            for page in pdf_reader.pages:
+                text += page.extract_text() + "\n"
+            logger.info(f"Successfully extracted text from PDF ({len(text)} characters)")
+            return text
+        except Exception as e:
+            logger.error(f"Error extracting text from PDF: {str(e)}")
+            raise
+    @staticmethod
+    def create_chunks(text: str, chunk_size: int = MAX_CHUNK_SIZE,
+                     overlap: int = OVERLAP_SIZE) -> List[Tuple[str, dict]]:
+        """Split text into overlapping chunks with metadata."""
+        try:
+            chunks = []
+            start = 0
+            while start < len(text):
+                # Find the end of the chunk
+                end = start + chunk_size
+                # If we're not at the end of the text, try to find a good break point
+                if end < len(text):
+                    # Try to find the last period or newline in the chunk
+                    last_period = text.rfind('.', start, end)
+                    last_newline = text.rfind('\n', start, end)
+                    break_point = max(last_period, last_newline)
+                    if break_point > start:
+                        end = break_point + 1
+                # Create chunk with metadata
+                chunk_text = text[start:end].strip()
+                if chunk_text:  # Only add non-empty chunks
+                    metadata = {
+                        "start_char": start,
+                        "end_char": end,
+                        "chunk_size": len(chunk_text)
+                    }
+                    chunks.append((chunk_text, metadata))
+                # Move the start position, accounting for overlap
+                start = end - overlap if end < len(text) else len(text)
+            logger.info(f"Created {len(chunks)} chunks from text")
+            return chunks
+        except Exception as e:
+            logger.error(f"Error creating chunks: {str(e)}")
+            raise
+    @staticmethod
+    def clean_text(text: str) -> str:
+        """Clean and normalize extracted text."""
+        try:
+            # Remove extra whitespace
+            text = ' '.join(text.split())
+            # Remove special characters that might cause issues
+            text = text.replace('\x00', '')
+            # Normalize newlines
+            text = text.replace('\r\n', '\n')
+            logger.info("Text cleaned successfully")
+            return text
+        except Exception as e:
+            logger.error(f"Error cleaning text: {str(e)}")
+            raise
+    def process_pdf(self, pdf_file: BytesIO) -> List[Tuple[str, dict]]:
+        """Process PDF file and return chunks with metadata."""
+        try:
+            # Extract text from PDF
+            raw_text = self.extract_text(pdf_file)
+            # Clean the extracted text
+            cleaned_text = self.clean_text(raw_text)
+            # Create chunks
+            chunks = self.create_chunks(cleaned_text)
+            logger.info(f"PDF processed successfully: {len(chunks)} chunks created")
+            return chunks
+        except Exception as e:
+            logger.error(f"Error processing PDF: {str(e)}")
+            raise

src/rag_engine.py ADDED Viewed

	@@ -0,0 +1,131 @@

+"""
+RAG (Retrieval Augmented Generation) engine for the CRE Chatbot.
+"""
+import logging
+import os
+from typing import List, Dict, Any, Optional
+import chromadb
+from chromadb.config import Settings
+from openai import AzureOpenAI
+from app.config import (
+    AZURE_OPENAI_ENDPOINT,
+    AZURE_OPENAI_API_KEY,  # Added this line
+    TEMPERATURE,
+    MAX_TOKENS,
+    AZURE_OPENAI_EMBEDDING_DEPLOYMENT_NAME
+)
+logger = logging.getLogger('rag')
+class RAGEngine:
+    """Handles document retrieval and question answering using Azure OpenAI."""
+    def __init__(self, deployment_name: str):
+        """Initialize the RAG engine with Azure OpenAI client."""
+        self.client = AzureOpenAI(
+            api_key=AZURE_OPENAI_API_KEY,
+            api_version="2023-12-01-preview",
+            azure_endpoint=AZURE_OPENAI_ENDPOINT
+        )
+        self.deployment_name = deployment_name
+        self.embedding_deployment_name = AZURE_OPENAI_EMBEDDING_DEPLOYMENT_NAME
+        # Initialize ChromaDB with simple in-memory settings
+        self.chroma_client = chromadb.Client(Settings(anonymized_telemetry=False))
+        self.collection = None
+        self.initialize_vector_store("cre_docs")
+        logger.info("RAG Engine initialized with Azure OpenAI")
+    def create_embeddings(self, texts: List[str]) -> List[List[float]]:
+        """Create embeddings for the given texts using Azure OpenAI."""
+        try:
+            response = self.client.embeddings.create(
+                input=texts,
+                model=self.embedding_deployment_name
+            )
+            return [item.embedding for item in response.data]
+        except Exception as e:
+            logger.error(f"Error creating embeddings: {str(e)}")
+            raise
+    def initialize_vector_store(self, collection_name: str):
+        """Initialize or get the vector store collection."""
+        try:
+            self.collection = self.chroma_client.get_or_create_collection(
+                name=collection_name,
+                metadata={"hnsw:space": "cosine"}
+            )
+            logger.info(f"Vector store initialized with collection: {collection_name}")
+        except Exception as e:
+            logger.error(f"Error initializing vector store: {str(e)}")
+            raise
+    def add_documents(self, texts: List[str], metadata: Optional[List[Dict[str, Any]]] = None):
+        """Add documents to the vector store."""
+        try:
+            if not self.collection:
+                raise ValueError("Vector store collection not initialized")
+            embeddings = self.create_embeddings(texts)
+            # Use timestamp + index as ID to ensure uniqueness
+            import time
+            timestamp = int(time.time())
+            ids = [f"{timestamp}_{i}" for i in range(len(texts))]
+            self.collection.add(
+                embeddings=embeddings,
+                documents=texts,
+                ids=ids,
+                metadatas=metadata if metadata else [{}] * len(texts)
+            )
+            logger.info(f"Added {len(texts)} documents to vector store")
+        except Exception as e:
+            logger.error(f"Error adding documents: {str(e)}")
+            raise
+    def query(self, question: str, k: int = 3) -> Dict[str, Any]:
+        """Query the vector store and generate an answer."""
+        try:
+            # Create embedding for the question
+            question_embedding = self.create_embeddings([question])[0]
+            # Query vector store
+            results = self.collection.query(
+                query_embeddings=[question_embedding],
+                n_results=k
+            )
+            # Prepare context from retrieved documents
+            context = "\n".join(results['documents'][0])
+            # Generate answer using Azure OpenAI
+            messages = [
+                {"role": "system", "content": "You are a helpful assistant that answers questions about commercial real estate concepts. Use the provided context to answer questions accurately and concisely."},
+                {"role": "user", "content": f"Context: {context}\n\nQuestion: {question}"}
+            ]
+            response = self.client.chat.completions.create(
+                model=self.deployment_name,
+                messages=messages,
+                temperature=TEMPERATURE,
+                max_tokens=MAX_TOKENS
+            )
+            answer = response.choices[0].message.content
+            return {
+                "answer": answer,
+                "context": context,
+                "source_documents": results['documents'][0]
+            }
+        except Exception as e:
+            logger.error(f"Error querying RAG engine: {str(e)}")
+            raise
+    def clear(self):
+        """Clear the vector store collection."""
+        if self.collection:
+            self.collection.delete()
+            logger.info("Vector store collection cleared")

tests/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


1	+

tests/test_pdf_processor.py ADDED Viewed

	@@ -0,0 +1,73 @@

+"""
+Tests for the PDF processor module.
+"""
+import pytest
+from io import BytesIO
+from src.pdf_processor import PDFProcessor
+def test_clean_text():
+    """Test text cleaning functionality."""
+    processor = PDFProcessor()
+    # Test removing extra whitespace
+    text = "This   has    extra   spaces"
+    assert processor.clean_text(text) == "This has extra spaces"
+    # Test normalizing newlines
+    text = "Line1\r\nLine2\r\nLine3"
+    assert processor.clean_text(text) == "Line1 Line2 Line3"
+    # Test removing null characters
+    text = "Text with\x00null\x00chars"
+    assert processor.clean_text(text) == "Text with null chars"
+def test_create_chunks():
+    """Test text chunking functionality."""
+    processor = PDFProcessor()
+    # Test basic chunking
+    text = "This is a test. This is another test. And a final test."
+    chunks = processor.create_chunks(text, chunk_size=20, overlap=5)
+    assert len(chunks) > 0
+    assert all(isinstance(chunk, tuple) for chunk in chunks)
+    assert all(len(chunk) == 2 for chunk in chunks)  # (text, metadata)
+    assert all(isinstance(chunk[1], dict) for chunk in chunks)  # metadata is dict
+def test_chunk_metadata():
+    """Test chunk metadata creation."""
+    processor = PDFProcessor()
+    text = "Short test text."
+    chunks = processor.create_chunks(text, chunk_size=20, overlap=5)
+    assert len(chunks) == 1
+    chunk_text, metadata = chunks[0]
+    assert "start_char" in metadata
+    assert "end_char" in metadata
+    assert "chunk_size" in metadata
+    assert metadata["chunk_size"] == len(chunk_text)
+def test_empty_text():
+    """Test handling of empty text."""
+    processor = PDFProcessor()
+    chunks = processor.create_chunks("")
+    assert len(chunks) == 0
+def test_chunk_overlap():
+    """Test chunk overlap functionality."""
+    processor = PDFProcessor()
+    text = "This is a long text that should be split into multiple chunks with overlap."
+    chunks = processor.create_chunks(text, chunk_size=20, overlap=5)
+    # Check that chunks overlap
+    if len(chunks) > 1:
+        for i in range(len(chunks) - 1):
+            current_chunk = chunks[i][0]
+            next_chunk = chunks[i + 1][0]
+            # There should be some overlap between consecutive chunks
+            assert any(word in next_chunk for word in current_chunk.split()[-3:])

tests/test_rag_engine.py ADDED Viewed

	@@ -0,0 +1,112 @@

+"""
+Tests for the RAG engine module.
+"""
+import pytest
+from unittest.mock import Mock, patch
+from src.rag_engine import RAGEngine
+@pytest.fixture
+def mock_azure_client():
+    """Create a mock Azure OpenAI client."""
+    with patch('openai.AzureOpenAI') as mock_client:
+        yield mock_client
+@pytest.fixture
+def mock_chroma_client():
+    """Create a mock Chroma client."""
+    with patch('chromadb.Client') as mock_client:
+        yield mock_client
+@pytest.fixture
+def rag_engine(mock_azure_client, mock_chroma_client):
+    """Create a RAG engine instance with mocked dependencies."""
+    return RAGEngine("test-deployment")
+def test_create_embeddings(rag_engine, mock_azure_client):
+    """Test embedding creation."""
+    # Setup mock response
+    mock_response = Mock()
+    mock_response.data = [
+        Mock(embedding=[0.1, 0.2, 0.3]),
+        Mock(embedding=[0.4, 0.5, 0.6])
+    ]
+    rag_engine.client.embeddings.create.return_value = mock_response
+    # Test
+    texts = ["Text 1", "Text 2"]
+    embeddings = rag_engine.create_embeddings(texts)
+    # Verify
+    assert len(embeddings) == 2
+    assert all(isinstance(emb, list) for emb in embeddings)
+    assert len(embeddings[0]) == 3  # Embedding dimension
+def test_initialize_vector_store(rag_engine):
+    """Test vector store initialization."""
+    rag_engine.initialize_vector_store("test_collection")
+    # Verify the collection was created
+    assert rag_engine.collection is not None
+def test_add_documents(rag_engine):
+    """Test adding documents to vector store."""
+    # Setup
+    rag_engine.initialize_vector_store("test_collection")
+    texts = ["Document 1", "Document 2"]
+    metadata = [{"source": "test1"}, {"source": "test2"}]
+    # Create mock embeddings
+    with patch.object(rag_engine, 'create_embeddings') as mock_create_embeddings:
+        mock_create_embeddings.return_value = [[0.1, 0.2], [0.3, 0.4]]
+        # Test
+        rag_engine.add_documents(texts, metadata)
+        # Verify
+        mock_create_embeddings.assert_called_once_with(texts)
+        assert rag_engine.collection.add.called
+def test_query(rag_engine):
+    """Test querying the RAG engine."""
+    # Setup
+    rag_engine.initialize_vector_store("test_collection")
+    # Mock embeddings creation
+    with patch.object(rag_engine, 'create_embeddings') as mock_create_embeddings:
+        mock_create_embeddings.return_value = [[0.1, 0.2]]
+        # Mock vector store query
+        mock_results = {
+            'documents': [["Relevant document 1", "Relevant document 2"]],
+            'distances': [[0.1, 0.2]]
+        }
+        rag_engine.collection.query.return_value = mock_results
+        # Mock chat completion
+        mock_response = Mock()
+        mock_response.choices = [Mock(message=Mock(content="Test answer"))]
+        rag_engine.client.chat.completions.create.return_value = mock_response
+        # Test
+        result = rag_engine.query("Test question")
+        # Verify
+        assert isinstance(result, dict)
+        assert "answer" in result
+        assert "context" in result
+        assert "source_documents" in result
+        assert result["answer"] == "Test answer"
+def test_error_handling(rag_engine):
+    """Test error handling in RAG engine."""
+    # Test error in embeddings creation
+    rag_engine.client.embeddings.create.side_effect = Exception("API Error")
+    with pytest.raises(Exception):
+        rag_engine.create_embeddings(["Test"])
+    # Test error in vector store initialization
+    rag_engine.chroma_client.get_or_create_collection.side_effect = Exception("DB Error")
+    with pytest.raises(Exception):
+        rag_engine.initialize_vector_store("test")