Spaces:
Sleeping
Sleeping
Commit
·
618bce1
1
Parent(s):
cbe00ab
Optimized pinecone indexing and added OOP
Browse files- app.py +2 -118
- chatbot.py +112 -0
- dependencies.py +13 -0
app.py
CHANGED
@@ -1,124 +1,8 @@
|
|
1 |
-
from
|
2 |
-
from
|
3 |
-
from langchain_community.document_loaders import UnstructuredURLLoader
|
4 |
-
from langchain_huggingface import HuggingFaceEmbeddings
|
5 |
-
from langchain_groq import ChatGroq
|
6 |
-
import langchain_community.vectorstores
|
7 |
-
from pinecone import Pinecone, ServerlessSpec
|
8 |
-
from dotenv import load_dotenv
|
9 |
-
import os
|
10 |
-
from langchain_core.prompts import PromptTemplate
|
11 |
-
from langchain.schema.runnable import RunnablePassthrough
|
12 |
-
from langchain.schema.output_parser import StrOutputParser
|
13 |
-
import gradio as gr
|
14 |
-
|
15 |
-
class ChatBot():
|
16 |
-
load_dotenv()
|
17 |
-
# loader = DirectoryLoader('data', glob="*.md")
|
18 |
-
urls = [
|
19 |
-
'https://noqs.in/faqs/',
|
20 |
-
'https://noqs.in/',
|
21 |
-
'https://noqs.in/internships/'
|
22 |
-
]
|
23 |
-
|
24 |
-
url_loader = UnstructuredURLLoader(urls=urls)
|
25 |
-
url_data = url_loader.load()
|
26 |
-
|
27 |
-
text_loader = TextLoader('data.txt', encoding = 'UTF-8')
|
28 |
-
text_data = text_loader.load()
|
29 |
-
|
30 |
-
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=600)
|
31 |
-
|
32 |
-
url_docs = text_splitter.split_documents(url_data)
|
33 |
-
text_docs = text_splitter.split_documents(text_data)
|
34 |
-
docs = url_docs + text_docs
|
35 |
-
|
36 |
-
embeddings = HuggingFaceEmbeddings()
|
37 |
-
|
38 |
-
load_dotenv()
|
39 |
-
# Initialize Pinecone client
|
40 |
-
pc = Pinecone(
|
41 |
-
api_key=os.environ.get("PINECONE_API_KEY")
|
42 |
-
)
|
43 |
-
|
44 |
-
# Define Index Name
|
45 |
-
index_name = "noqs-chatbot-with-web-content-dynamic"
|
46 |
-
|
47 |
-
# Checking Index
|
48 |
-
if index_name not in pc.list_indexes().names():
|
49 |
-
# Create new Index
|
50 |
-
pc.create_index(name=index_name,
|
51 |
-
metric="cosine",
|
52 |
-
dimension=768,
|
53 |
-
spec=ServerlessSpec(
|
54 |
-
cloud="aws",
|
55 |
-
region="us-east-1"
|
56 |
-
))
|
57 |
-
|
58 |
-
docsearch = langchain_community.vectorstores.Pinecone.from_documents(docs, embeddings, index_name=index_name)
|
59 |
-
else:
|
60 |
-
# Embed the documents
|
61 |
-
pc.delete_index(index_name)
|
62 |
-
pc.create_index(name=index_name,
|
63 |
-
metric="cosine",
|
64 |
-
dimension=768,
|
65 |
-
spec=ServerlessSpec(
|
66 |
-
cloud="aws",
|
67 |
-
region="us-east-1"
|
68 |
-
))
|
69 |
-
|
70 |
-
docsearch = langchain_community.vectorstores.Pinecone.from_documents(docs, embeddings, index_name=index_name)
|
71 |
-
|
72 |
-
# Define the repo ID and connect to a model on Groq API
|
73 |
-
model_id = "llama3-8b-8192"
|
74 |
-
llm = ChatGroq(
|
75 |
-
model=model_id,
|
76 |
-
temperature=0.5,
|
77 |
-
max_tokens=1200
|
78 |
-
)
|
79 |
-
|
80 |
-
# Creating prompt response template
|
81 |
-
template = """
|
82 |
-
You are a knowledgeable assistant for NoQs Digital.
|
83 |
-
The Manager of our company is Mr. Adit Agarwal.
|
84 |
-
Users will ask you questions about our company, and you must use the given context to answer their questions accurately.
|
85 |
-
Follow these guidelines:
|
86 |
-
Always base your answers on the provided context. Do not make up information.
|
87 |
-
If the context does not contain the answer, simply say, "I don't know based on the provided information."
|
88 |
-
Offer detailed and thorough responses, but stay relevant to the user's question.
|
89 |
-
Maintain a professional tone in your responses.
|
90 |
-
You have to answer like a Customer Care Specialist.
|
91 |
-
If someone answer about program A just answer politelly that due to high demand we are currently not offering program A.
|
92 |
-
Don't tell user that you are fetching the information from the context
|
93 |
-
|
94 |
-
Context: {context}
|
95 |
-
|
96 |
-
User Question: {question}
|
97 |
-
|
98 |
-
Answer:
|
99 |
-
"""
|
100 |
-
|
101 |
-
|
102 |
-
prompt = PromptTemplate(
|
103 |
-
template=template,
|
104 |
-
input_variables=["context", "question"]
|
105 |
-
)
|
106 |
-
|
107 |
-
# Chaining llm and prompt
|
108 |
-
rag_chain = (
|
109 |
-
{"context": docsearch.as_retriever(), "question": RunnablePassthrough()}
|
110 |
-
| prompt
|
111 |
-
| llm
|
112 |
-
| StrOutputParser()
|
113 |
-
)
|
114 |
|
115 |
bot = ChatBot()
|
116 |
|
117 |
-
# def chat_function(prompt):
|
118 |
-
# user_input = prompt
|
119 |
-
# result = bot.rag_chain.invoke(user_input)
|
120 |
-
# return result
|
121 |
-
|
122 |
def chat_function(prompts,history):
|
123 |
user_input = prompts
|
124 |
result = bot.rag_chain.invoke(user_input)
|
|
|
1 |
+
from dependencies import *
|
2 |
+
from chatbot import ChatBot
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
3 |
|
4 |
bot = ChatBot()
|
5 |
|
|
|
|
|
|
|
|
|
|
|
6 |
def chat_function(prompts,history):
|
7 |
user_input = prompts
|
8 |
result = bot.rag_chain.invoke(user_input)
|
chatbot.py
ADDED
@@ -0,0 +1,112 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from dependencies import *
|
2 |
+
|
3 |
+
class ChatBot():
    """RAG chatbot for NoQs Digital.

    Pipeline: scrape the NoQs website + local data.txt, chunk the text,
    embed and index it in Pinecone, then answer user questions through a
    Groq-hosted Llama 3 model constrained by a customer-care prompt.

    Attributes set during __init__:
        self.docs      -- list of split Document chunks (start_loader)
        self.docsearch -- Pinecone vector store handle   (start_embeddings)
        self.rag_chain -- runnable retrieval chain       (init_model)
    """

    def __init__(self, data_change = False):
        # data_change=True forces a full rebuild of the Pinecone index from
        # freshly scraped documents; False reuses the existing index vectors.
        self.execute = data_change
        self.start_loader()
        self.start_embeddings()
        self.init_model()

    def start_loader(self):
        """Load web pages and data.txt, split into chunks, store on self.docs."""
        load_dotenv()
        # loader = DirectoryLoader('data', glob="*.md")
        urls = [
            'https://noqs.in/faqs/',
            'https://noqs.in/',
            'https://noqs.in/internships/'
        ]

        url_loader = UnstructuredURLLoader(urls=urls)
        url_data = url_loader.load()

        text_loader = TextLoader('data.txt', encoding = 'UTF-8')
        text_data = text_loader.load()

        # Large overlap (600 of 1000 chars) keeps answer context continuous
        # across chunk boundaries at the cost of extra index size.
        text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=600)

        url_docs = text_splitter.split_documents(url_data)
        text_docs = text_splitter.split_documents(text_data)
        self.docs = url_docs + text_docs

    def start_embeddings(self):
        """Create or reuse the Pinecone index; set self.docsearch.

        Three paths:
          * index missing            -> create it and embed self.docs
          * index exists, data_change -> delete, recreate, re-embed
          * index exists, no change   -> attach to the existing vectors
        """
        embeddings = HuggingFaceEmbeddings()
        load_dotenv()
        # Initialize Pinecone client
        pc = Pinecone(
            api_key=os.environ.get("PINECONE_API_KEY")
        )

        # Define Index Name
        index_name = "noqs-chatbot-with-web-content-dynamic"

        # Checking Index
        if index_name not in pc.list_indexes().names():
            # Create new Index (768 dims matches the default
            # HuggingFaceEmbeddings sentence-transformer output).
            pc.create_index(name=index_name,
                            metric="cosine",
                            dimension=768,
                            spec=ServerlessSpec(
                                cloud="aws",
                                region="us-east-1"
                            ))

            # BUG FIX: this was assigned to a local `docsearch`, leaving
            # self.docsearch unset on a fresh index and crashing init_model
            # with an AttributeError at self.docsearch.as_retriever().
            self.docsearch = langchain_community.vectorstores.Pinecone.from_documents(self.docs, embeddings, index_name=index_name)
        else:
            if self.execute:
                # Rebuild requested: drop the stale index, recreate it, and
                # re-embed the freshly scraped documents.
                pc.delete_index(index_name)
                pc.create_index(name=index_name,
                                metric="cosine",
                                dimension=768,
                                spec=ServerlessSpec(
                                    cloud="aws",
                                    region="us-east-1"
                                ))

                self.docsearch = langchain_community.vectorstores.Pinecone.from_documents(self.docs, embeddings, index_name=index_name)
            else:
                # Reuse vectors already stored in Pinecone (no re-embedding).
                self.docsearch = langchain_community.vectorstores.Pinecone.from_existing_index(embedding=embeddings, index_name=index_name)

    def init_model(self):
        """Connect to the Groq LLM and assemble the RAG chain on self.rag_chain."""
        # Define the repo ID and connect to a model on Groq API
        model_id = "llama3-8b-8192"
        llm = ChatGroq(
            model=model_id,
            temperature=0.5,
            max_tokens=1200
        )

        # Creating prompt response template.
        # (Grammar fixed: "If someone answer ... politelly" -> proper English,
        # so the instruction reads unambiguously to the model.)
        template = """
        You are a knowledgeable assistant for NoQs Digital.
        The Manager of our company is Mr. Adit Agarwal.
        Users will ask you questions about our company, and you must use the given context to answer their questions accurately.
        Follow these guidelines:
        Always base your answers on the provided context. Do not make up information.
        If the context does not contain the answer, simply say, "I don't know based on the provided information."
        Offer detailed and thorough responses, but stay relevant to the user's question.
        Maintain a professional tone in your responses.
        You have to answer like a Customer Care Specialist.
        If someone asks about program A, politely answer that due to high demand we are currently not offering program A.
        Don't tell the user that you are fetching the information from the context.

        Context: {context}

        User Question: {question}

        Answer:
        """

        prompt = PromptTemplate(
            template=template,
            input_variables=["context", "question"]
        )

        # Chaining llm and prompt: retriever fills {context}, the raw user
        # string passes through as {question}, output parsed to plain text.
        self.rag_chain = (
            {"context": self.docsearch.as_retriever(), "question": RunnablePassthrough()}
            | prompt
            | llm
            | StrOutputParser()
        )
|
dependencies.py
ADDED
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from langchain.text_splitter import CharacterTextSplitter
|
2 |
+
from langchain_community.document_loaders import TextLoader
|
3 |
+
from langchain_community.document_loaders import UnstructuredURLLoader
|
4 |
+
from langchain_huggingface import HuggingFaceEmbeddings
|
5 |
+
from langchain_groq import ChatGroq
|
6 |
+
import langchain_community.vectorstores
|
7 |
+
from pinecone import Pinecone, ServerlessSpec
|
8 |
+
from dotenv import load_dotenv
|
9 |
+
import os
|
10 |
+
from langchain_core.prompts import PromptTemplate
|
11 |
+
from langchain.schema.runnable import RunnablePassthrough
|
12 |
+
from langchain.schema.output_parser import StrOutputParser
|
13 |
+
import gradio as gr
|