Spaces:

hubsnippetai
/

chatdocs

Sleeping

App Files Files Community

hubsnippetai committed on Jul 8, 2024

Commit

425fac0

verified ·

1 Parent(s): b6612d4

app files for chatdocs app

Browse files

Files changed (2) hide show

app.py +141 -0
appui.py +52 -0

app.py ADDED Viewed

	@@ -0,0 +1,141 @@

+from langchain_community.document_loaders import UnstructuredPDFLoader, TextLoader # type: ignore
+from langchain_community.vectorstores import FAISS
+from langchain_text_splitters import RecursiveCharacterTextSplitter
+from langchain_community.document_loaders import PyPDFLoader
+from langchain_cohere import ChatCohere
+from langchain_core.messages import HumanMessage
+import dotenv
+from langchain_core.output_parsers import StrOutputParser
+# from langchain_community.vectorstores import Chroma
+from langchain.schema.runnable import RunnablePassthrough
+from langchain_cohere import CohereEmbeddings
+from langchain_core.prompts import PromptTemplate, ChatPromptTemplate
+from langchain.memory.summary_buffer import ConversationSummaryBufferMemory
+from langchain.chains import ConversationChain
+from langchain_core.prompts.chat import MessagesPlaceholder
+from langchain.agents import AgentExecutor, create_tool_calling_agent
+import os
+#from langchain_community.utilities import GoogleSearchAPIWrapper
+from langchain_core.tools import Tool
+from langchain_google_community import GoogleSearchAPIWrapper
+# Load variables from a local .env file into the process environment so the
+# os.getenv() lookups further down in this module can see them.
+dotenv.load_dotenv()
+# Disabled local experiment: load a single PDF from disk with PyPDFLoader.
+#file_path = (    "/home/hubsnippet/Downloads/papers/2205.11916v4.pdf")
+#load the file to memory
+#loader = PyPDFLoader(file_path)
+#load the file content to data variable
+#data = loader.load_and_split()
+# embed the file data in a vector store
+#print(data[0])
+def parse_document(docs : str, question : str):
+    # initialise an embedding for the vector store
+    embeddings = CohereEmbeddings(model="embed-english-light-v3.0")
+    # initialise the llm
+    llm = ChatCohere(model='command-r-plus')
+    # split the file into chunks
+    text_splitter = RecursiveCharacterTextSplitter(
+        chunk_size = 1000,
+        chunk_overlap = 100
+    )
+    docs = text_splitter.split_text(docs)
+    # initialize vectorstore
+    faiss_vs = FAISS.from_texts(docs, embeddings)
+    # res = faiss_vs.similarity_search(input, k=2)
+    llm_retriever = faiss_vs.as_retriever(llm = llm, search_kwargs={'k':1})
+    res = llm_retriever.invoke(question)[0].page_content
+    return res
+#os.environ["GOOGLE_API_KEY"] = os.getenv("GOOGLE_API_KEY")
+#os.environ["GOOGLE_CSE_ID"] = os.getenv("GOOGLE_CSE_ID")
+#COHERE_API_KEY = os.getenv("COHERE_API_KEY")
+GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
+GOOGLE_CSE_ID = os.getenv("GOOGLE_CSE_ID")
+COHERE_API_KEY = os.getenv("COHERE_API_KEY")
+# integrating an agent to perform the search with the URL
+llm = ChatCohere(model="command-r-plus", cohere_api_key=COHERE_API_KEY)
+    # history = MessagesPlaceholder(variable_name="history")
+#question = ""
+#url = ""
+prompt_template = [
+    ("system",  "You are a searh engine for a corpse of documentation. you will be provided with a url {url} \
+    the url is your only source of information, so you should search the url pages by key words \
+    you should only ground your responses with the url. \
+    If {question} has no related content from the url, simple response 'no related content to your question'"),
+    ("human", "{question}"),
+    ("placeholder", "{agent_scratchpad}"),
+    ("ai", "")]
+# prompt_template = prompt_template.format(url=url, question=question)
+prompt = ChatPromptTemplate.from_messages([
+    #SystemMessage(content="You are a helpful assistant. You should use the google_search_name agent tool for information."),
+    #HumanMessage(content="{input}"),
+    #AIMessage(content="{output}"),
+    ("system","You are a helpful virtual assistant." \
+    "You should only use the google_search_name agent tool to search for information when necessary."),
+    #MessagesPlaceholder(variable_name="history"),
+    ("human","{question}"),
+    ("placeholder", "{agent_scratchpad}")
+])
+# prompt template
+prompt_text = ChatPromptTemplate.from_messages(prompt_template)
+# print(prompt_text)
+# prompt template input variables
+# prompt_text.input_variables = ["question", "url"], input_variables = ["question", "url"]
+search = GoogleSearchAPIWrapper(google_api_key = GOOGLE_API_KEY, google_cse_id = GOOGLE_CSE_ID)
+tool = Tool(
+    name="google_search_name",
+    description="The model should use this tool when it needs more information from the internet.",
+    func=search.run,
+)
+agent = create_tool_calling_agent(
+    tools=[tool],
+    llm=llm,
+    #prompt = prompt_text
+    prompt = prompt
+)
+agent_executor = AgentExecutor(
+    agent=agent,
+    tools=[tool],
+    verbose=False
+)
+def parse_url(question : str) -> str:
+    # initialise the llm
+    response = agent_executor.invoke(input = {"question": question})
+    # add memmory to your conversation
+    # chain your llm to prompt
+    # chain = prompt_text | llm | StrOutputParser()
+    # chain = conversation_llm | StrOutputParser()
+    #response = chain.invoke(input = {"question" : question, "url":url})
+    return response
+# message = HumanMessage(content="inurl: https://learn.microsoft.com 'what are cloud security best practices'")
+# print(parse_url(message))

appui.py ADDED Viewed

	@@ -0,0 +1,52 @@

+import streamlit as st
+from io import StringIO
+from pypdf import PdfReader
+#from PyPDF2 import PdfReader
+from app import parse_document, parse_url
+files= st.file_uploader(label="upload a file", accept_multiple_files=True, type="pdf")
+st.write("if you prefer to interact with a particular URL such as a docs e.g https://docs.python.org,\n")
+url = st.text_input("provide the URL in the field below, then press the enter key")
+st.write(url)
+# st.write(file_upload[0]._file_urls.upload_url)
+all_docs = []
+docs = ""
+input_text = st.text_input(label="Ask your documents/url a question:")
+# st.write(input_text)
+pressed = st.button(label="Get Response", type="primary")
+user_query = "inurl: " + url + " " + input_text
+if len(url) > 0 and pressed is True:
+    #st.write(url)
+    #input_text
+    response = parse_url(user_query)
+    if 'response' not in st.session_state:
+        st.session_state['response'] = response
+    st.write(st.session_state.response)
+else:
+    try:
+        for file in files:
+            if file is not None:
+                file_data = PdfReader(file)
+                # extract text from the pdf file
+                for page in file_data.pages:
+                    docs += page.extract_text()
+            #all_docs.append(docs)
+            if len(input_text) > 0:
+                response = parse_document(docs=docs, question= input_text)
+                if 'file_response' not in st.session_state:
+                    st.session_state['file_response'] = response
+                    st.write(st.session_state.file_response)
+            else:
+                st.write("Ask a question")
+    except:
+        st.write("No answer")