hubsnippetai committed
Commit 425fac0 · verified · 1 Parent(s): b6612d4

app files for chatdocs app

Files changed (2)
  1. app.py +141 -0
  2. appui.py +52 -0
app.py ADDED
@@ -0,0 +1,141 @@
from langchain_community.document_loaders import UnstructuredPDFLoader, TextLoader  # type: ignore
from langchain_community.vectorstores import FAISS
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
from langchain_cohere import ChatCohere
from langchain_core.messages import HumanMessage
import dotenv
from langchain_core.output_parsers import StrOutputParser
# from langchain_community.vectorstores import Chroma
from langchain.schema.runnable import RunnablePassthrough
from langchain_cohere import CohereEmbeddings
from langchain_core.prompts import PromptTemplate, ChatPromptTemplate

from langchain.memory.summary_buffer import ConversationSummaryBufferMemory
from langchain.chains import ConversationChain
from langchain_core.prompts.chat import MessagesPlaceholder

from langchain.agents import AgentExecutor, create_tool_calling_agent

import os

# from langchain_community.utilities import GoogleSearchAPIWrapper
from langchain_core.tools import Tool
from langchain_google_community import GoogleSearchAPIWrapper

dotenv.load_dotenv()
# file_path = ("/home/hubsnippet/Downloads/papers/2205.11916v4.pdf")

# load the file into memory
# loader = PyPDFLoader(file_path)

# load the file content into the data variable
# data = loader.load_and_split()

# embed the file data in a vector store
# print(data[0])


def parse_document(docs: str, question: str) -> str:
    """Embed the document text in a FAISS vector store and return the chunk most relevant to the question."""
    # initialise an embedding model for the vector store
    embeddings = CohereEmbeddings(model="embed-english-light-v3.0")

    # split the document text into overlapping chunks
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=100
    )
    chunks = text_splitter.split_text(docs)

    # index the chunks and retrieve the single closest one
    faiss_vs = FAISS.from_texts(chunks, embeddings)
    # res = faiss_vs.similarity_search(input, k=2)
    retriever = faiss_vs.as_retriever(search_kwargs={"k": 1})
    res = retriever.invoke(question)[0].page_content

    return res


# os.environ["GOOGLE_API_KEY"] = os.getenv("GOOGLE_API_KEY")
# os.environ["GOOGLE_CSE_ID"] = os.getenv("GOOGLE_CSE_ID")

GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
GOOGLE_CSE_ID = os.getenv("GOOGLE_CSE_ID")
COHERE_API_KEY = os.getenv("COHERE_API_KEY")


# integrate an agent that performs the search scoped to the URL
llm = ChatCohere(model="command-r-plus", cohere_api_key=COHERE_API_KEY)
# history = MessagesPlaceholder(variable_name="history")

# question = ""
# url = ""

prompt_template = [
    ("system", "You are a search engine for a corpus of documentation. You will be provided with a URL {url}. "
               "The URL is your only source of information, so you should search its pages by keywords "
               "and ground your responses in the URL only. "
               "If {question} has no related content from the URL, simply respond 'no related content to your question'."),
    ("human", "{question}"),
    ("placeholder", "{agent_scratchpad}"),
    ("ai", "")]
# prompt_template = prompt_template.format(url=url, question=question)

prompt = ChatPromptTemplate.from_messages([
    # SystemMessage(content="You are a helpful assistant. You should use the google_search_name agent tool for information."),
    # HumanMessage(content="{input}"),
    # AIMessage(content="{output}"),
    ("system", "You are a helpful virtual assistant. "
               "You should only use the google_search_name agent tool to search for information when necessary."),
    # MessagesPlaceholder(variable_name="history"),
    ("human", "{question}"),
    ("placeholder", "{agent_scratchpad}")
])

# alternative prompt built from prompt_template above (currently unused by the agent)
prompt_text = ChatPromptTemplate.from_messages(prompt_template)

# print(prompt_text)
# prompt template input variables
# prompt_text.input_variables = ["question", "url"], input_variables = ["question", "url"]

# Google Programmable Search wrapper, exposed to the agent as a tool
search = GoogleSearchAPIWrapper(google_api_key=GOOGLE_API_KEY, google_cse_id=GOOGLE_CSE_ID)

tool = Tool(
    name="google_search_name",
    description="The model should use this tool when it needs more information from the internet.",
    func=search.run,
)

agent = create_tool_calling_agent(
    llm=llm,
    tools=[tool],
    # prompt=prompt_text
    prompt=prompt
)

agent_executor = AgentExecutor(
    agent=agent,
    tools=[tool],
    verbose=False
)


def parse_url(question: str) -> str:
    """Answer the question with the tool-calling agent, which can search the web via Google."""
    response = agent_executor.invoke(input={"question": question})
    # TODO: add memory to the conversation

    # chain the llm to the prompt
    # chain = prompt_text | llm | StrOutputParser()
    # chain = conversation_llm | StrOutputParser()
    # response = chain.invoke(input={"question": question, "url": url})

    # AgentExecutor.invoke returns a dict; the answer text is under the "output" key
    return response["output"]


# message = "inurl: https://learn.microsoft.com 'what are cloud security best practices'"
# print(parse_url(message))
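
A minimal usage sketch for the two helpers above, assuming a .env file supplies COHERE_API_KEY, GOOGLE_API_KEY and GOOGLE_CSE_ID; the sample text and questions are invented for illustration:

    from app import parse_document, parse_url

    # document mode: embed raw text in FAISS and return the chunk closest to the question
    sample_text = "FAISS is a library for efficient similarity search over dense vectors."
    print(parse_document(docs=sample_text, question="What is FAISS used for?"))

    # URL mode: the tool-calling agent may call Google search, scoped to one site via inurl:
    print(parse_url("inurl: https://docs.python.org 'how do list comprehensions work'"))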
appui.py ADDED
@@ -0,0 +1,52 @@
import streamlit as st
from io import StringIO
from pypdf import PdfReader
# from PyPDF2 import PdfReader
from app import parse_document, parse_url

files = st.file_uploader(label="Upload one or more PDF files", accept_multiple_files=True, type="pdf")
st.write("If you prefer to interact with a particular URL, e.g. a documentation site such as https://docs.python.org,")
url = st.text_input("provide the URL in the field below, then press the Enter key")
st.write(url)

# st.write(file_upload[0]._file_urls.upload_url)

all_docs = []
docs = ""
input_text = st.text_input(label="Ask your documents/URL a question:")
# st.write(input_text)

pressed = st.button(label="Get Response", type="primary")

# scope the web search to the given site using Google's inurl: operator
user_query = "inurl: " + url + " " + input_text

if len(url) > 0 and pressed:
    # URL mode: let the agent in app.py search the site
    response = parse_url(user_query)

    # keep the latest response in session state so it survives Streamlit reruns
    st.session_state["response"] = response
    st.write(st.session_state.response)

elif pressed:
    # document mode: extract text from the uploaded PDFs and query it
    try:
        for file in files:
            if file is not None:
                file_data = PdfReader(file)
                # extract text from the pdf file
                for page in file_data.pages:
                    docs += page.extract_text() or ""
                # all_docs.append(docs)

        if len(input_text) > 0:
            response = parse_document(docs=docs, question=input_text)
            st.session_state["file_response"] = response
            st.write(st.session_state.file_response)
        else:
            st.write("Ask a question")
    except Exception:
        st.write("No answer")
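
Both files read their API keys through dotenv.load_dotenv(), so a .env file next to the app is assumed. A small, hypothetical pre-flight sketch along these lines could confirm the keys named in app.py are present before launching the UI with streamlit run appui.py:

    import os
    import sys

    import dotenv

    dotenv.load_dotenv()
    # key names taken from the os.getenv calls in app.py
    required = ("COHERE_API_KEY", "GOOGLE_API_KEY", "GOOGLE_CSE_ID")
    missing = [key for key in required if not os.getenv(key)]
    if missing:
        sys.exit("missing environment variables: " + ", ".join(missing))
    print("all keys present; launch the UI with: streamlit run appui.py")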