import os
from PyPDF2 import PdfReader
import streamlit as st
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores.faiss import FAISS
from langchain.chains.question_answering import load_qa_chain
from langchain.prompts import PromptTemplate
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain_google_genai import ChatGoogleGenerativeAI
import google.generativeai as genai
from dotenv import load_dotenv
load_dotenv()
genai.configure(api_key=os.getenv("GOOGLE_API_KEY"))
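# Requires GOOGLE_API_KEY to be set in a .env file or in the environment.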
# Function to extract text from PDFs
def extract_pdf_text(pdfs):
    all_text = ""
    for pdf in pdfs:
        pdf_reader = PdfReader(pdf)
        for page in pdf_reader.pages:
            # extract_text() can return None for pages without extractable text
            all_text += page.extract_text() or ""
    return all_text
# Function to split text into chunks
def split_text_into_chunks(text):
    splitter = RecursiveCharacterTextSplitter(chunk_size=12000, chunk_overlap=1200)
    text_chunks = splitter.split_text(text)
    return text_chunks
# Function to create vector store
def create_vector_store(chunks):
    embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
    vector_store = FAISS.from_texts(chunks, embedding=embeddings)
    vector_store.save_local("faiss_index")
# Function to setup conversation chain for QA
def setup_conversation_chain(template):
    model = ChatGoogleGenerativeAI(model="gemini-pro", temperature=0.3)
    prompt = PromptTemplate(template=template, input_variables=["context", "question"])
    chain = load_qa_chain(model, chain_type="stuff", prompt=prompt)
    return chain
# Function to handle user input based on selected mode
def handle_user_input(mode, user_question=None):
    embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
    indexed_data = FAISS.load_local("faiss_index", embeddings, allow_dangerous_deserialization=True)
    docs = indexed_data.similarity_search(user_question)
    chain = setup_conversation_chain(prompt_template[mode])
    response = chain({"input_documents": docs, "question": user_question}, return_only_outputs=True)
    return response["output_text"]
# Prompt templates for each mode
prompt_template = {
    "chat": """
    Your alias is Neural-PDF. Your task is to provide a thorough response based on the given context, ensuring all relevant details are included.
    If the requested information isn't available, simply state, "answer not available in context," then answer based on your understanding, connecting with the context.
    Don't provide incorrect information.\n\n
    Context: \n {context}?\n
    Question: \n {question}\n
    Answer:
    """,
    "quiz": """
    Your alias is Neural-PDF. Your task is to generate multiple-choice quiz questions based on the given context and the requested number of questions, ensuring all relevant details are included.
    If the requested information isn't available, simply state, "answer not available in context," then answer based on your understanding, connecting with the context.
    Don't provide incorrect information.\n\n
    Context: \n {context}?\n
    Question: \n {question}\n
    Answer:
    """,
    "long": """
    Your alias is Neural-PDF. Your task is to generate long-answer questions based on the given context and the requested number of questions, ensuring all relevant details are included.
    If the requested information isn't available, simply state, "answer not available in context," then answer based on your understanding, connecting with the context.
    Don't provide incorrect information.\n\n
    Context: \n {context}?\n
    Question: \n {question}\n
    Answer:
    """,
}
# Streamlit app
def main():
    # Page config must be the first Streamlit command in the script
    st.set_page_config(page_title="NeuralPDF", page_icon=":page_with_curl:", initial_sidebar_state="expanded", layout="wide")
    if "conversation" not in st.session_state:
        st.session_state.conversation = []
    if "mode" not in st.session_state:
        st.session_state.mode = ""
    if "file_upload" not in st.session_state:
        st.session_state.file_upload = False
    st.title("NeuralPDF: Interactive PDF Chat using AI 🤖")
    # sidebar: upload and index PDFs
    files = st.sidebar.file_uploader("Upload one or more PDF files", type="pdf", accept_multiple_files=True)
    if st.sidebar.button("Submit"):
        if files:
            with st.spinner("Processing..."):
                raw_text = extract_pdf_text(files)
                text_chunks = split_text_into_chunks(raw_text)
                create_vector_store(text_chunks)
            st.sidebar.success("Processing done!")
            st.session_state.file_upload = True
    # mode of chat
    with st.sidebar:
        if st.session_state.file_upload:
            modes = {"Chat Conversation": "chat", "Quiz & MCQs": "quiz", "Long-Answer Questions": "long"}
            choose_mode = st.radio("Select a mode", list(modes.keys()), index=0)
            st.session_state.mode = modes[choose_mode]
    if st.session_state.file_upload:
        # keep history of chat
        for dialogue in st.session_state.conversation:
            with st.chat_message(dialogue["role"]):
                if st.session_state.mode != "chat" and dialogue["role"] == "assistant":
                    st.markdown(dialogue["content"])
                    with st.expander("Answer"):
                        st.markdown(dialogue["answer"])
                else:
                    st.markdown(dialogue["content"])
        # handle conversation
        if prompt := st.chat_input("Type your question here"):
            # handle user side
            with st.chat_message("user"):
                st.markdown(prompt)
            st.session_state.conversation.append({"role": "user", "content": prompt, "answer": ""})
            # handle assistant side
            with st.chat_message("assistant"):
                response = handle_user_input(st.session_state.mode, prompt)
                answer = ""
                if st.session_state.mode != "chat":
                    answer = handle_user_input("chat", response)
                    st.markdown(response)
                    with st.expander("Answer"):
                        st.markdown(answer)
                else:
                    st.markdown(response)
            st.session_state.conversation.append({"role": "assistant", "content": response, "answer": answer})
# Launch the app
if __name__ == "__main__":
    main()
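# Run with: streamlit run <this_script>.py  (replace <this_script> with the actual filename)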