File size: 3,008 Bytes
4aff460
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f0422e2
 
 
 
 
4aff460
 
 
 
 
 
 
 
 
 
 
 
 
 
14e4af7
 
 
 
 
4aff460
14e4af7
 
 
 
 
4aff460
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2ff3a9a
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
import streamlit as st
import fitz  # PyMuPDF
import faiss
from sentence_transformers import SentenceTransformer
import numpy as np
from phi.agent import Agent
from phi.model.groq import Groq

# Load embedding model once at import time. all-MiniLM-L6-v2 produces
# 384-dimension sentence embeddings; the same model embeds both the PDF
# page chunks (PDFChatbot.process_pdf) and user queries (PDFChatbot.chat),
# which is required for the FAISS L2 search to be meaningful.
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

def agent_response(question, retrieved_text):
    """Answer *question* with a Groq-backed agent grounded on *retrieved_text*.

    Builds a one-shot phi Agent whose instructions embed both the question
    and the retrieved document text, runs it, and returns the answer string.
    """
    grounding_instructions = [
        f"First read the question carefully. The question is: **{question}**",
        f"Then read the document provided to you as a text. The document is: \n**{retrieved_text}**\n",
        "Finally answer the question based on the provided document only. Don't try to give random responses.",
    ]
    agent = Agent(
        model=Groq(id="llama-3.3-70b-versatile"),
        markdown=True,
        description="You are an AI assistant that provides the answer based on the provided document.",
        instructions=grounding_instructions,
    )
    # The run prompt repeats question + context so the model sees them even
    # if the instructions are truncated by the provider.
    prompt = question + '\n' + retrieved_text
    return agent.run(prompt).content

class PDFChatbot:
    """Per-page retrieval over an uploaded PDF using a FAISS L2 index.

    ``process_pdf`` extracts one text chunk per page and indexes their
    embeddings; ``chat`` retrieves the nearest chunks for a query and asks
    the LLM agent to answer from them.
    """

    def __init__(self):
        # One text chunk per PDF page, row-aligned with vectors in self.index.
        self.text_chunks = []
        # FAISS IndexFlatL2 over chunk embeddings; None until a PDF is processed.
        self.index = None

    def process_pdf(self, pdf_file):
        """Extract text from PDF and create FAISS index.

        Args:
            pdf_file: file-like object (e.g. a Streamlit UploadedFile) whose
                bytes form a PDF document.

        Returns:
            A human-readable status message string.
        """
        self.text_chunks = []
        self.index = None  # invalidate any previous index before rebuilding

        # Read the uploaded file into memory; "pdf" tells PyMuPDF the
        # stream's format since there is no filename to infer it from.
        with fitz.open("pdf", pdf_file.read()) as doc:
            for page in doc:
                self.text_chunks.append(page.get_text("text"))

        # Guard: an empty or scanned/image-only PDF yields no usable text.
        # Without this, encode([]) produces a degenerate array and
        # embeddings.shape[1] raises IndexError.
        if not any(chunk.strip() for chunk in self.text_chunks):
            self.text_chunks = []
            return "No extractable text found in the PDF."

        # Embed the chunks and build a flat L2 index over them.
        embeddings = embedding_model.encode(self.text_chunks, convert_to_numpy=True)
        self.index = faiss.IndexFlatL2(embeddings.shape[1])
        self.index.add(embeddings)
        return "PDF processed successfully!"

    def chat(self, query):
        """Retrieve the most relevant chunks for a query and answer from them.

        Returns the agent's answer string, or an instruction to upload a PDF
        if no index has been built yet.
        """
        if self.index is None:
            return "Please upload a PDF first."

        query_embedding = embedding_model.encode([query], convert_to_numpy=True)
        # Never request more neighbours than there are indexed vectors:
        # FAISS pads missing results with -1, and text_chunks[-1] would then
        # silently duplicate the last page in the retrieved context.
        k = min(5, len(self.text_chunks))
        _, indices = self.index.search(query_embedding, k)  # Get top-k matches
        retrieved_texts = [self.text_chunks[idx] for idx in indices[0] if idx >= 0]
        retrieved_text_combined = "\n\n".join(retrieved_texts)
        return agent_response(query, retrieved_text_combined)

# --- Streamlit UI ------------------------------------------------------------
# Keep the chatbot in session_state so the FAISS index survives Streamlit's
# top-to-bottom script reruns; otherwise a fresh PDFChatbot is created — and
# the whole PDF re-embedded — on every widget interaction (e.g. each "Ask").
if "pdf_chatbot" not in st.session_state:
    st.session_state.pdf_chatbot = PDFChatbot()
chatbot = st.session_state.pdf_chatbot

st.title("Chat with your PDF")

uploaded_file = st.file_uploader("Upload your PDF", type=["pdf"])
if uploaded_file:
    # Re-process only when a new/different file is uploaded, not on every rerun.
    file_key = (uploaded_file.name, uploaded_file.size)
    if st.session_state.get("pdf_file_key") != file_key:
        message = chatbot.process_pdf(uploaded_file)
        st.session_state.pdf_file_key = file_key
        st.success(message)

query = st.text_input("Ask a question")
if st.button("Ask"):
    if query:
        response = chatbot.chat(query)
        st.markdown(f"**Answer:**\n\n{response}")