Spaces:

techasad
/

document_chatbot

Sleeping

App Files Files Community

techasad commited on Feb 5, 2024

Commit

cae3b1e

verified ·

1 Parent(s): 7a6e3fa

Upload 3 files

Browse files

Files changed (3) hide show

Readme.md +3 -0
pdfassistant.py +348 -0
requirements.txt +99 -0

Readme.md ADDED Viewed

	@@ -0,0 +1,3 @@

+Install all the dependencies with `pip install -r requirements.txt`.
+Add a `secrets.toml` file to the `.streamlit` folder containing your API key (`GOOGLE_API_KEY`).
+Run the app with `streamlit run pdfassistant.py`.

pdfassistant.py ADDED Viewed

	@@ -0,0 +1,348 @@

+import streamlit as st
+from PyPDF2 import PdfReader
+import langchain
+from textwrap import dedent
+import pandas as pd
+from langchain_google_genai import ChatGoogleGenerativeAI
+from langchain_community.callbacks import StreamlitCallbackHandler
+from langchain_openai import ChatOpenAI
+from langchain_community.chat_models import ChatGooglePalm
+from langchain_community.embeddings import HuggingFaceEmbeddings
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain.vectorstores.faiss import FAISS
+from langchain.prompts import PromptTemplate
+from langchain.memory import ConversationBufferMemory
+import tempfile
+from langchain.document_loaders.csv_loader import CSVLoader
+from langchain.document_loaders.pdf import PyPDFLoader
+from langchain.document_loaders.word_document import UnstructuredWordDocumentLoader
+from langchain.chains.question_answering import load_qa_chain
+from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
+from langchain.agents import load_tools
+import os
+from io import BytesIO
+from langdetect import detect
+from gtts import gTTS
+from langchain.prompts import (
+    ChatPromptTemplate
+)
+# Read the Google API key from Streamlit secrets and mirror it into the process
+# environment (presumably so the Google chat model client can find it -- confirm).
+google_api_key = st.secrets["GOOGLE_API_KEY"]
+#api_key2 = st.secrets["OPENAI_API_KEY"]
+os.environ["GOOGLE_API_KEY"] = google_api_key
+# Page chrome: browser tab title/icon, main header and the intro line.
+st.set_page_config(page_title='Personal Chatbot', page_icon='books')
+st.header('Knowledge Query Assistant')
+st.write("Upload your file to begin a chat, or ask any general questions you have")
+# Sidebar controls used by choose_llm() and main(): two engine checkboxes
+# (the Google one is ticked by default) and a masked OpenAI key field.
+st.sidebar.title('Options')
+st.sidebar.subheader("Please Choose the AI Engine")
+use_google = st.sidebar.checkbox("Use Free AI", value =True)
+use_openai = st.sidebar.checkbox("Use OpenAI with your API Key")
+openai_api_key = st.sidebar.text_input("Enter your OpenAI API Key:", type="password")
+def choose_llm():
+    try:
+        if use_google and use_openai:
+            st.sidebar.warning("Please choose only one AI engine.")
+            st.warning("Please choose only one AI engine.")
+        elif use_google:
+            llm = ChatGooglePalm(temperature=0.1)
+        elif use_openai:
+            if not openai_api_key:
+                st.sidebar.warning("Please provide your OpenAI API Key.")
+                st.warning("Please provide your OpenAI API Key.")
+            llm = ChatOpenAI(api_key=openai_api_key, temperature=0.1)
+        return llm
+    except Exception as e:
+        " "
+# Resolve the chat model for this script run and report the outcome in the
+# sidebar; a falsy result (choose_llm() returned nothing usable) shows a warning.
+llm = choose_llm()
+if llm:
+    st.sidebar.success("AI Engine selected")
+else:
+    st.sidebar.warning("Please choose an AI engine.")
+@st.cache_resource(show_spinner=False)
+def processing_csv_pdf_docx(uploaded_file):
+    with st.spinner(text="Embedding Your Files"):
+        # Read text from the uploaded PDF file
+        data = []
+        for file in uploaded_file:
+            split_tup = os.path.splitext(file.name)
+            file_extension = split_tup[1]
+            if file_extension == ".pdf":
+                with tempfile.NamedTemporaryFile(delete=False) as tmp_file1:
+                    tmp_file1.write(file.getvalue())
+                    tmp_file_path1 = tmp_file1.name
+                    loader = PyPDFLoader(file_path=tmp_file_path1)
+                    documents = loader.load()
+                    text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=50)
+                    data += text_splitter.split_documents(documents)
+            if file_extension == ".csv":
+                with tempfile.NamedTemporaryFile(delete=False) as tmp_file:
+                    tmp_file.write(file.getvalue())
+                    tmp_file_path = tmp_file.name
+                    loader = CSVLoader(file_path=tmp_file_path, encoding="utf-8", csv_args={
+                                'delimiter': ','})
+                    documents = loader.load()
+                    text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=50)
+                    data += text_splitter.split_documents(documents)
+                    st.sidebar.header(f"Data-{file.name}")
+                    data1 = pd.read_csv(tmp_file_path)
+                    st.sidebar.dataframe(data1)
+            if file_extension == ".docx":
+                with tempfile.NamedTemporaryFile(delete=False) as tmp_file:
+                    tmp_file.write(file.getvalue())
+                    tmp_file_path = tmp_file.name
+                    loader = UnstructuredWordDocumentLoader(file_path=tmp_file_path)
+                    documents = loader.load()
+                    text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=50)
+                    data += text_splitter.split_documents(documents)
+        # Download embeddings from GooglePalm
+        embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
+        #embeddings = GooglePalmEmbeddings()
+        #embeddings = OpenAIEmbeddings()
+        # Create a FAISS index from texts and embeddings
+        vectorstore = FAISS.from_documents(data, embeddings)
+        #vectorstore.save_local("./faiss")
+        return vectorstore
+# Sidebar uploader; accepts several pdf/docx/csv files at once.
+with st.sidebar:
+    uploaded_file =  st.file_uploader("Upload your files",
+    help="Multiple Files are Supported",
+    type=['pdf', 'docx', 'csv'], accept_multiple_files= True)
+if not uploaded_file:
+    st.warning("Upload your file(s) to start chatting!")
+# Initialise per-session state on first run.
+if 'history' not in st.session_state:
+        st.session_state['history'] = []
+# Reset the chat when it does not exist yet or when the button is pressed.
+# Because `or` short-circuits, the button is only created once "messages"
+# already exists in the session state.
+if "messages" not in st.session_state or st.sidebar.button("Clear conversation history"):
+    st.session_state["messages"]= []
+st.sidebar.subheader('Created by Engr. Muhammad Asadullah')
+# Adding links to social accounts
+st.sidebar.markdown("[LinkedIn](https://www.linkedin.com/in/asad18/)")
+st.sidebar.markdown("[GitHub](https://github.com/TechAsad)")
+st.sidebar.markdown("[Fiverr](https://www.fiverr.com/promptengr?source=gig_page&gigs=slug%3Acreate-streamlit-and-gradio-web-apps-for-ai-and-data-analysis%2Cpckg_id%3A1&is_choice=true)")
+st.sidebar.markdown("[Website](https://tenlancer.com/)")
+########--Chat interface--########
+def main():
+    try:
+        if (use_openai and openai_api_key) or use_google:
+            if uploaded_file:
+                db = processing_csv_pdf_docx(uploaded_file)
+                for file in uploaded_file:
+                    st.success(f'Your File: {file.name} is Embedded', icon="✅")
+            for msg in st.session_state.messages:
+                st.chat_message(msg["role"]).write(msg["content"])
+                if msg["role"] == "Assistant":
+                    st.chat_message(msg["role"]).audio(msg["audio_content"], format='audio/wav')
+                    #st.audio(audio_msg, format='audio/mp3').audio(audio_msg)
+            if prompt := st.chat_input(placeholder="Type your question!"):
+                st.session_state.messages.append({"role": "user", "content": prompt})
+                st.chat_message("user").write(prompt)
+                memory = ConversationBufferMemory(memory_key="chat_history", input_key="question", human_prefix= "", ai_prefix= "")
+                user_message = {"role": "user", "content": prompt}
+                for i in range(0, len(st.session_state.messages), 2):
+                    if i + 1 < len(st.session_state.messages):
+                        user_prompt = st.session_state.messages[i]
+                        ai_res = st.session_state.messages[i + 1]
+                        current_role = user_prompt["role"]
+                        current_content = user_prompt["content"]
+                        next_role = ai_res["role"]
+                        next_content = ai_res["content"]
+                        # Concatenate role and content for context and output
+                        user = f"{current_role}: {current_content}"
+                        ai = f"{next_role}: {next_content}"
+                        memory.save_context({"question": user}, {"output": ai})
+                # Get user input -> Generate the answer
+                greetings = ['Hey', 'Hello', 'hi', 'hello', 'hey', 'helloo', 'hellooo', 'g morning', 'gmorning', 'good morning', 'morning',
+                            'good day', 'good afternoon', 'good evening', 'greetings', 'greeting', 'good to see you',
+                            'its good seeing you', 'how are you', "how're you", 'how are you doing', "how ya doin'", 'how ya doin',
+                            'how is everything', 'how is everything going', "how's everything going", 'how is you', "how's you",
+                            'how are things', "how're things", 'how is it going', "how's it going", "how's it goin'", "how's it goin",
+                            'how is life been treating you', "how's life been treating you", 'how have you been', "how've you been",
+                            'what is up', "what's up", 'what is cracking', "what's cracking", 'what is good', "what's good",
+                            'what is happening', "what's happening", 'what is new', "what's new", 'what is neww', "g’day", 'howdy']
+                compliment = ['thank you', 'thanks', 'thanks a lot', 'thanks a bunch', 'great', 'ok', 'ok thanks', 'okay', 'great', 'awesome', 'nice']
+                prompt_template =dedent(r"""
+                You are a helpful assistant to help user find information from his documents.
+                talk humbly. Answer the question from the provided context. Do not answer from your own training data.
+                Use the following pieces of context to answer the question at the end.
+                If you don't know the answer, just say that you don't know. Do not makeup any answer.
+                Do not answer hypothetically. Do not answer in more than 100 words.
+                Please Do Not say: "Based on the provided context"
+                Always use the context to find the answer.
+                this is the context from study material:
+                ---------
+                {context}
+                ---------
+                Current Conversation:
+                ---------
+                {chat_history}
+                ---------
+                Question:
+                {question}
+                Helpful Answer:
+                """)
+                PROMPT = PromptTemplate(
+                    template=prompt_template, input_variables=["context", "question", "chat_history"]
+                )
+                # Run the question-answering chain
+                    # Load question-answering chain
+                chain = load_qa_chain(llm=llm, verbose= True, prompt = PROMPT,memory=memory, chain_type="stuff")
+                #chain = load_qa_chain(ChatOpenAI(temperature=0.9, model="gpt-3.5-turbo-0613", streaming=True) , verbose= True, prompt = PROMPT, memory=memory,chain_type="stuff")
+                with st.chat_message("Assistant"):
+                    st_cb = StreamlitCallbackHandler(st.container())
+                    if prompt.lower() in greetings:
+                        response = 'Hi, how are you? I am here to help you get information from your file. How can I assist you?'
+                        audio_buffer = BytesIO()
+                        audio_file = gTTS(text=response, lang='en', slow=False)
+                        audio_file.write_to_fp(audio_buffer)
+                        audio_buffer.seek(0)
+                        #st.audio(audio_buffer, format='audio/mp3')
+                        st.session_state.messages.append({"role": "Assistant", "content": response, "audio_content": audio_buffer})
+                    elif prompt.lower() in compliment:
+                        response = 'My pleasure! If you have any more questions, feel free to ask.'
+                        audio_buffer = BytesIO()
+                        audio_file = gTTS(text=response, lang='en', slow=False)
+                        audio_file.write_to_fp(audio_buffer)
+                        audio_buffer.seek(0)
+                        #st.audio(audio_buffer, format='audio/mp3')
+                        st.session_state.messages.append({"role": "Assistant", "content": response, "audio_content": audio_buffer})
+                    elif uploaded_file:
+                        with st.spinner('Bot is typing ...'):
+                            docs = db.similarity_search(prompt, k=5, fetch_k=10)
+                            response = chain.run(input_documents=docs, question=prompt)
+                            lang = detect(response)
+                            audio_buffer = BytesIO()
+                            audio_file = gTTS(text=response, lang=lang, slow=False)
+                            audio_file.write_to_fp(audio_buffer)
+                            audio_buffer.seek(0)
+                           # st.audio(audio_buffer, format='audio/mp3')
+                            #st.session_state.audio.append({"role": "Assistant", "audio": audio_buffer})
+                            st.session_state.messages.append({"role": "Assistant", "content": response, "audio_content": audio_buffer})
+                            assistant_message = {"role": "assistant", "content": response}
+                    else:
+                        with st.spinner('Bot is typing ...'):
+                            prompt_chat = ChatPromptTemplate.from_template("you are a helpful assistant, Answer the question with your knowledge.\n\n current conversation: {chat_history} \n\n Question: {question} \n\n Answer:")
+                            chain = prompt_chat | llm
+                            response = chain.invoke({"chat_history": memory, "question": prompt}).content
+                            lang = detect(response)
+                            audio_buffer = BytesIO()
+                            audio_file = gTTS(text=response, lang=lang, slow=False)
+                            audio_file.write_to_fp(audio_buffer)
+                            audio_buffer.seek(0)
+                            #st.audio(audio_buffer, format='audio/mp3')
+                            #st.session_state.audio.append({"role": "Assistant", "audio": audio_buffer})
+                            st.session_state.messages.append({"role": "Assistant", "content": response, "audio_content": audio_buffer})
+                            assistant_message = {"role": "assistant", "content": response}
+                    st.write(response)
+                    st.audio(audio_buffer, format='audio/wav')
+    except Exception as e:
+        "Sorry, there was a problem. A corrupted file or;"
+        if use_google:
+            "Google PaLM AI only take English Data and Questions. Or the AI could not find the answer in your provided document."
+        elif use_openai:
+            "Please check your OpenAI API key"
+# CSS injected into the page to hide the elements matched by #MainMenu and footer.
+hide_streamlit_style = """
+            <style>
+            #MainMenu {visibility: hidden;}
+            footer {visibility: hidden;}
+            </style>
+            """
+st.markdown(hide_streamlit_style, unsafe_allow_html=True)
+# Run the chat UI when the file is executed as a script.
+if __name__ == '__main__':
+    main()

requirements.txt ADDED Viewed

	@@ -0,0 +1,99 @@

+aiohttp==3.9.1
+aiosignal==1.3.1
+altair==5.2.0
+annotated-types==0.6.0
+anyio==4.2.0
+attrs==23.2.0
+blinker==1.7.0
+cachetools==5.3.2
+certifi==2023.11.17
+charset-normalizer==3.3.2
+click==8.1.7
+dataclasses-json==0.6.3
+distro==1.9.0
+frozenlist==1.4.1
+gitdb==4.0.11
+GitPython==3.1.41
+google-ai-generativelanguage==0.4.0
+google-api-core==2.15.0
+google-auth==2.27.0
+google-generativeai==0.3.2
+googleapis-common-protos==1.62.0
+greenlet==3.0.3
+grpcio==1.60.0
+grpcio-status==1.60.0
+gtts
+h11==0.14.0
+httpcore==1.0.2
+httpx==0.26.0
+idna==3.6
+importlib-metadata==7.0.1
+Jinja2==3.1.3
+jsonpatch==1.33
+jsonpointer==2.4
+jsonschema==4.21.1
+jsonschema-specifications==2023.12.1
+langchain==0.1.4
+langchain-community==0.0.16
+langchain-core==0.1.16
+langchain-openai==0.0.5
+langdetect
+langsmith==0.0.83
+lxml==5.1.0
+markdown-it-py==3.0.0
+MarkupSafe==2.1.4
+marshmallow==3.20.2
+mdurl==0.1.2
+multidict==6.0.4
+mypy-extensions==1.0.0
+numpy==1.26.3
+openai==1.10.0
+packaging==23.2
+pandas==2.2.0
+pillow==10.2.0
+proto-plus==1.23.0
+protobuf==4.25.2
+pyarrow==15.0.0
+pyasn1==0.5.1
+pyasn1-modules==0.3.0
+pydantic==2.5.3
+pydantic_core==2.14.6
+pydeck==0.8.1b0
+Pygments==2.17.2
+pypdf==4.0.0
+PyPDF2==3.0.1
+python-dateutil==2.8.2
+python-docx==1.1.0
+pytz==2023.3.post1
+PyYAML==6.0.1
+referencing==0.32.1
+regex==2023.12.25
+requests==2.31.0
+rich==13.7.0
+rpds-py==0.17.1
+rsa==4.9
+six==1.16.0
+smmap==5.0.1
+sniffio==1.3.0
+SQLAlchemy==2.0.25
+streamlit==1.30.0
+tenacity==8.2.3
+tiktoken==0.5.2
+toml==0.10.2
+toolz==0.12.1
+tornado==6.4
+tqdm==4.66.1
+typing-inspect==0.9.0
+typing_extensions==4.9.0
+tzdata==2023.4
+tzlocal==5.2
+urllib3==2.1.0
+validators==0.22.0
+yarl==1.9.4
+zipp==3.17.0
+sentence-transformers
+unstructured
+faiss-cpu
+pycryptodome==3.15.0
+unstructured[pdf]
+cryptography>=3.1