File size: 4,956 Bytes
425fac0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
from langchain_community.document_loaders import UnstructuredPDFLoader, TextLoader # type: ignore
from langchain_community.vectorstores import FAISS
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
from langchain_cohere import ChatCohere
from langchain_core.messages import HumanMessage
import dotenv
from langchain_core.output_parsers import StrOutputParser
# from langchain_community.vectorstores import Chroma
from langchain.schema.runnable import RunnablePassthrough
from langchain_cohere import CohereEmbeddings
from langchain_core.prompts import PromptTemplate, ChatPromptTemplate

from langchain.memory.summary_buffer import ConversationSummaryBufferMemory
from langchain.chains import ConversationChain
from langchain_core.prompts.chat import MessagesPlaceholder

from langchain.agents import AgentExecutor, create_tool_calling_agent

import os

#from langchain_community.utilities import GoogleSearchAPIWrapper
from langchain_core.tools import Tool
from langchain_google_community import GoogleSearchAPIWrapper

dotenv.load_dotenv()
#file_path = (    "/home/hubsnippet/Downloads/papers/2205.11916v4.pdf")

#load the file to memory
#loader = PyPDFLoader(file_path)

#load the file content to data variable
#data = loader.load_and_split()

# embed the file data in a vector store

#print(data[0])

def parse_document(docs: str, question: str) -> str:
    """Answer *question* from raw document text via vector retrieval.

    Splits the text into overlapping chunks, embeds them into a FAISS
    index with Cohere embeddings, and returns the content of the single
    most relevant chunk.

    Args:
        docs: Raw document text to index.
        question: Query used for the similarity search.

    Returns:
        The page content of the best-matching chunk, or an empty string
        when the input produced no chunks or nothing was retrieved.
    """
    # initialise an embedding for the vector store
    embeddings = CohereEmbeddings(model="embed-english-light-v3.0")

    # split the file into chunks small enough for the embedding model
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=100,
    )
    chunks = text_splitter.split_text(docs)
    if not chunks:
        # Guard: FAISS.from_texts cannot build an index from an empty corpus.
        return ""

    # initialize vectorstore
    faiss_vs = FAISS.from_texts(chunks, embeddings)
    # BUG FIX: as_retriever() takes no `llm` kwarg — retrieval here is purely
    # vector-based. The previously-created ChatCohere instance was unused.
    retriever = faiss_vs.as_retriever(search_kwargs={'k': 1})
    matches = retriever.invoke(question)
    if not matches:
        # Guard against IndexError when the retriever returns nothing.
        return ""
    return matches[0].page_content

#os.environ["GOOGLE_API_KEY"] = os.getenv("GOOGLE_API_KEY")
#os.environ["GOOGLE_CSE_ID"] = os.getenv("GOOGLE_CSE_ID")
#COHERE_API_KEY = os.getenv("COHERE_API_KEY")

# Credentials come from the environment (populated by dotenv.load_dotenv()
# at import time). Any of these may be None when the variable is unset —
# TODO confirm the downstream clients fail loudly in that case.
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
GOOGLE_CSE_ID = os.getenv("GOOGLE_CSE_ID")
COHERE_API_KEY = os.getenv("COHERE_API_KEY")


# integrating an agent to perform the search with the URL
# Module-level chat model shared by the tool-calling agent defined below.
llm = ChatCohere(model="command-r-plus", cohere_api_key=COHERE_API_KEY)
    # history = MessagesPlaceholder(variable_name="history")

#question = ""
#url = ""

# Message list for a URL-grounded search prompt; later compiled with
# ChatPromptTemplate.from_messages. Placeholders: {url}, {question}, and
# {agent_scratchpad} (filled in by the agent framework at run time).
# FIX: corrected typos in the system instruction ("searh" -> "search",
# "corpse" -> "corpus", "simple response" -> "simply respond") so the LLM
# receives a coherent instruction.
prompt_template = [
    ("system",  "You are a search engine for a corpus of documentation. you will be provided with a url {url} \
    the url is your only source of information, so you should search the url pages by key words \
    you should only ground your responses with the url. \
    If {question} has no related content from the url, simply respond 'no related content to your question'"),
    ("human", "{question}"),
    ("placeholder", "{agent_scratchpad}"),
    ("ai", "")]
# prompt_template = prompt_template.format(url=url, question=question)

# Prompt actually used by the agent below: a system instruction pointing the
# model at the google_search_name tool, the user's {question}, and the
# scratchpad slot the tool-calling agent writes intermediate steps into.
prompt = ChatPromptTemplate.from_messages([
    #SystemMessage(content="You are a helpful assistant. You should use the google_search_name agent tool for information."),
    #HumanMessage(content="{input}"),
    #AIMessage(content="{output}"),
    ("system","You are a helpful virtual assistant." \
    "You should only use the google_search_name agent tool to search for information when necessary."),
    #MessagesPlaceholder(variable_name="history"),
    ("human","{question}"),
    ("placeholder", "{agent_scratchpad}")
])

# prompt template    
# Compiled chat prompt built from prompt_template. NOTE(review): currently
# unused — the agent below is wired to `prompt` instead; see create_tool_calling_agent.
prompt_text = ChatPromptTemplate.from_messages(prompt_template)

# print(prompt_text)
# prompt template input variables
# prompt_text.input_variables = ["question", "url"], input_variables = ["question", "url"]
# Google Programmable Search client; both credentials are read from the
# environment variables loaded above.
search = GoogleSearchAPIWrapper(google_api_key = GOOGLE_API_KEY, google_cse_id = GOOGLE_CSE_ID)

# Expose Google search as a LangChain tool the agent can invoke by name.
tool = Tool(
    name="google_search_name",
    description="The model should use this tool when it needs more information from the internet.",
    func=search.run,
)

# Tool-calling agent: the LLM decides, guided by `prompt`, whether to call
# the Google search tool before answering.
agent = create_tool_calling_agent(
    tools=[tool],
    llm=llm,
    #prompt = prompt_text
    prompt = prompt
)

# Executor that drives the agent loop (LLM call -> optional tool call ->
# final answer), used by parse_url below.
agent_executor = AgentExecutor(
    agent=agent,
    tools=[tool],
    verbose=False
)

def parse_url(question: str) -> str:
    """Answer *question* using the module-level tool-calling search agent.

    Runs ``agent_executor`` (which may invoke the Google search tool) and
    returns the agent's final answer text.

    Args:
        question: The user's question.

    Returns:
        The agent's final answer string.
    """
    response = agent_executor.invoke(input={"question": question})
    # BUG FIX: AgentExecutor.invoke returns a dict; the declared return type
    # is `str`, so extract the final answer under the "output" key.
    return response["output"]

# message = HumanMessage(content="inurl: https://learn.microsoft.com 'what are cloud security best practices'")

# print(parse_url(message))