# NOTE(review): removed non-Python page residue that preceded the source
# (Hugging Face Spaces header lines, a file-size line, a commit hash, and a
# pasted line-number ruler) — it was render artifact, not code, and would
# break import of this module.
from langchain_community.document_loaders import UnstructuredPDFLoader, TextLoader # type: ignore
from langchain_community.vectorstores import FAISS
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
from langchain_cohere import ChatCohere
from langchain_core.messages import HumanMessage
import dotenv
from langchain_core.output_parsers import StrOutputParser
# from langchain_community.vectorstores import Chroma
from langchain.schema.runnable import RunnablePassthrough
from langchain_cohere import CohereEmbeddings
from langchain_core.prompts import PromptTemplate, ChatPromptTemplate
from langchain.memory.summary_buffer import ConversationSummaryBufferMemory
from langchain.chains import ConversationChain
from langchain_core.prompts.chat import MessagesPlaceholder
from langchain.agents import AgentExecutor, create_tool_calling_agent
import os
#from langchain_community.utilities import GoogleSearchAPIWrapper
from langchain_core.tools import Tool
from langchain_google_community import GoogleSearchAPIWrapper
# Load environment variables (API keys read below via os.getenv) from .env.
dotenv.load_dotenv()
# NOTE(review): the PDF-loading experiment below is dead code kept for
# reference; parse_document() now receives raw text directly.
#file_path = ( "/home/hubsnippet/Downloads/papers/2205.11916v4.pdf")
#load the file to memory
#loader = PyPDFLoader(file_path)
#load the file content to data variable
#data = loader.load_and_split()
# embed the file data in a vector store
#print(data[0])
def parse_document(docs: str, question: str) -> str:
    """Answer *question* from the raw text *docs* via similarity retrieval.

    Splits the text into overlapping chunks, embeds them with Cohere
    embeddings into an in-memory FAISS index, and returns the content of
    the single most similar chunk.

    Args:
        docs: Raw document text to index.
        question: Query used for the similarity search.

    Returns:
        The text of the best-matching chunk, or an empty string when the
        retriever finds nothing.
    """
    # Embedding model used to vectorize the chunks.
    embeddings = CohereEmbeddings(model="embed-english-light-v3.0")
    # Chunk the document so each piece fits an embedding call; the
    # 100-char overlap preserves context across chunk boundaries.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=100,
    )
    chunks = text_splitter.split_text(docs)
    # Build the vector index over the chunks.
    faiss_vs = FAISS.from_texts(chunks, embeddings)
    # FIX: as_retriever() takes no `llm` kwarg — passing one raises a
    # validation error. k=1 keeps only the top match, as before.
    retriever = faiss_vs.as_retriever(search_kwargs={"k": 1})
    results = retriever.invoke(question)
    # FIX: guard against an empty result set instead of indexing blindly,
    # which raised IndexError when nothing matched.
    return results[0].page_content if results else ""
# ---------------------------------------------------------------------------
# Module-level agent setup: credentials, prompts, search tool, and executor.
# ---------------------------------------------------------------------------
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
GOOGLE_CSE_ID = os.getenv("GOOGLE_CSE_ID")
COHERE_API_KEY = os.getenv("COHERE_API_KEY")
# LLM that drives the tool-calling agent below.
llm = ChatCohere(model="command-r-plus", cohere_api_key=COHERE_API_KEY)
# URL-grounded prompt; compiled into `prompt_text` below but not currently
# wired into the agent (the agent uses `prompt`).
# FIX: corrected typos that degraded the model guidance: "searh" -> "search",
# "corpse" -> "corpus", "simple response" -> "simply respond".
prompt_template = [
    ("system", "You are a search engine for a corpus of documentation. you will be provided with a url {url} \
the url is your only source of information, so you should search the url pages by key words \
you should only ground your responses with the url. \
If {question} has no related content from the url, simply respond 'no related content to your question'"),
    ("human", "{question}"),
    ("placeholder", "{agent_scratchpad}"),
    ("ai", "")]
# Prompt actually used by the agent: general assistant with optional search.
# FIX: added the missing space between the concatenated system-string parts
# (previously rendered as "assistant.You should").
prompt = ChatPromptTemplate.from_messages([
    ("system", "You are a helpful virtual assistant. "
               "You should only use the google_search_name agent tool to search for information when necessary."),
    ("human", "{question}"),
    ("placeholder", "{agent_scratchpad}"),
])
# Compiled (currently unused) URL-grounded prompt — kept for future wiring.
prompt_text = ChatPromptTemplate.from_messages(prompt_template)
# Google Programmable Search wrapper exposed to the agent as a tool.
search = GoogleSearchAPIWrapper(google_api_key=GOOGLE_API_KEY, google_cse_id=GOOGLE_CSE_ID)
tool = Tool(
    name="google_search_name",
    description="The model should use this tool when it needs more information from the internet.",
    func=search.run,
)
# Tool-calling agent plus its executor; parse_url() invokes the executor.
agent = create_tool_calling_agent(
    tools=[tool],
    llm=llm,
    prompt=prompt,
)
agent_executor = AgentExecutor(
    agent=agent,
    tools=[tool],
    verbose=False,
)
def parse_url(question: str) -> str:
    """Answer *question* using the module-level tool-calling agent.

    Delegates to ``agent_executor``, which may invoke the Google search
    tool when the LLM decides it needs external information.

    Args:
        question: Natural-language question to answer.

    Returns:
        The agent's final answer text.
    """
    response = agent_executor.invoke({"question": question})
    # FIX: AgentExecutor.invoke returns a dict; extract the "output" text so
    # the function actually honors its declared `-> str` return type
    # (previously the raw dict was returned).
    return response["output"]
# Example usage (requires GOOGLE_API_KEY / GOOGLE_CSE_ID / COHERE_API_KEY):
# message = HumanMessage(content="inurl: https://learn.microsoft.com 'what are cloud security best practices'")
# print(parse_url(message))