Spaces:
Sleeping
Sleeping
Commit
·
618bce1
1
Parent(s):
cbe00ab
Optimized pinecone indexing and added OOP
Browse files- app.py +2 -118
- chatbot.py +112 -0
- dependencies.py +13 -0
app.py
CHANGED
@@ -1,124 +1,8 @@
|
|
1 |
-
from
|
2 |
-
from
|
3 |
-
from langchain_community.document_loaders import UnstructuredURLLoader
|
4 |
-
from langchain_huggingface import HuggingFaceEmbeddings
|
5 |
-
from langchain_groq import ChatGroq
|
6 |
-
import langchain_community.vectorstores
|
7 |
-
from pinecone import Pinecone, ServerlessSpec
|
8 |
-
from dotenv import load_dotenv
|
9 |
-
import os
|
10 |
-
from langchain_core.prompts import PromptTemplate
|
11 |
-
from langchain.schema.runnable import RunnablePassthrough
|
12 |
-
from langchain.schema.output_parser import StrOutputParser
|
13 |
-
import gradio as gr
|
14 |
-
|
15 |
-
class ChatBot():
|
16 |
-
load_dotenv()
|
17 |
-
# loader = DirectoryLoader('data', glob="*.md")
|
18 |
-
urls = [
|
19 |
-
'https://noqs.in/faqs/',
|
20 |
-
'https://noqs.in/',
|
21 |
-
'https://noqs.in/internships/'
|
22 |
-
]
|
23 |
-
|
24 |
-
url_loader = UnstructuredURLLoader(urls=urls)
|
25 |
-
url_data = url_loader.load()
|
26 |
-
|
27 |
-
text_loader = TextLoader('data.txt', encoding = 'UTF-8')
|
28 |
-
text_data = text_loader.load()
|
29 |
-
|
30 |
-
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=600)
|
31 |
-
|
32 |
-
url_docs = text_splitter.split_documents(url_data)
|
33 |
-
text_docs = text_splitter.split_documents(text_data)
|
34 |
-
docs = url_docs + text_docs
|
35 |
-
|
36 |
-
embeddings = HuggingFaceEmbeddings()
|
37 |
-
|
38 |
-
load_dotenv()
|
39 |
-
# Initialize Pinecone client
|
40 |
-
pc = Pinecone(
|
41 |
-
api_key=os.environ.get("PINECONE_API_KEY")
|
42 |
-
)
|
43 |
-
|
44 |
-
# Define Index Name
|
45 |
-
index_name = "noqs-chatbot-with-web-content-dynamic"
|
46 |
-
|
47 |
-
# Checking Index
|
48 |
-
if index_name not in pc.list_indexes().names():
|
49 |
-
# Create new Index
|
50 |
-
pc.create_index(name=index_name,
|
51 |
-
metric="cosine",
|
52 |
-
dimension=768,
|
53 |
-
spec=ServerlessSpec(
|
54 |
-
cloud="aws",
|
55 |
-
region="us-east-1"
|
56 |
-
))
|
57 |
-
|
58 |
-
docsearch = langchain_community.vectorstores.Pinecone.from_documents(docs, embeddings, index_name=index_name)
|
59 |
-
else:
|
60 |
-
# Embed the documents
|
61 |
-
pc.delete_index(index_name)
|
62 |
-
pc.create_index(name=index_name,
|
63 |
-
metric="cosine",
|
64 |
-
dimension=768,
|
65 |
-
spec=ServerlessSpec(
|
66 |
-
cloud="aws",
|
67 |
-
region="us-east-1"
|
68 |
-
))
|
69 |
-
|
70 |
-
docsearch = langchain_community.vectorstores.Pinecone.from_documents(docs, embeddings, index_name=index_name)
|
71 |
-
|
72 |
-
# Define the repo ID and connect to a model on Groq API
|
73 |
-
model_id = "llama3-8b-8192"
|
74 |
-
llm = ChatGroq(
|
75 |
-
model=model_id,
|
76 |
-
temperature=0.5,
|
77 |
-
max_tokens=1200
|
78 |
-
)
|
79 |
-
|
80 |
-
# Creating prompt response template
|
81 |
-
template = """
|
82 |
-
You are a knowledgeable assistant for NoQs Digital.
|
83 |
-
The Manager of our company is Mr. Adit Agarwal.
|
84 |
-
Users will ask you questions about our company, and you must use the given context to answer their questions accurately.
|
85 |
-
Follow these guidelines:
|
86 |
-
Always base your answers on the provided context. Do not make up information.
|
87 |
-
If the context does not contain the answer, simply say, "I don't know based on the provided information."
|
88 |
-
Offer detailed and thorough responses, but stay relevant to the user's question.
|
89 |
-
Maintain a professional tone in your responses.
|
90 |
-
You have to answer like a Customer Care Specialist.
|
91 |
-
If someone answer about program A just answer politelly that due to high demand we are currently not offering program A.
|
92 |
-
Don't tell user that you are fetching the information from the context
|
93 |
-
|
94 |
-
Context: {context}
|
95 |
-
|
96 |
-
User Question: {question}
|
97 |
-
|
98 |
-
Answer:
|
99 |
-
"""
|
100 |
-
|
101 |
-
|
102 |
-
prompt = PromptTemplate(
|
103 |
-
template=template,
|
104 |
-
input_variables=["context", "question"]
|
105 |
-
)
|
106 |
-
|
107 |
-
# Chaining llm and prompt
|
108 |
-
rag_chain = (
|
109 |
-
{"context": docsearch.as_retriever(), "question": RunnablePassthrough()}
|
110 |
-
| prompt
|
111 |
-
| llm
|
112 |
-
| StrOutputParser()
|
113 |
-
)
|
114 |
|
115 |
bot = ChatBot()
|
116 |
|
117 |
-
# def chat_function(prompt):
|
118 |
-
# user_input = prompt
|
119 |
-
# result = bot.rag_chain.invoke(user_input)
|
120 |
-
# return result
|
121 |
-
|
122 |
def chat_function(prompts,history):
|
123 |
user_input = prompts
|
124 |
result = bot.rag_chain.invoke(user_input)
|
|
|
1 |
+
from dependencies import *
|
2 |
+
from chatbot import ChatBot
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
3 |
|
4 |
bot = ChatBot()
|
5 |
|
|
|
|
|
|
|
|
|
|
|
6 |
def chat_function(prompts,history):
|
7 |
user_input = prompts
|
8 |
result = bot.rag_chain.invoke(user_input)
|
chatbot.py
ADDED
@@ -0,0 +1,112 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from dependencies import *
|
2 |
+
|
3 |
+
class ChatBot():
    """RAG chatbot for NoQs Digital.

    Pipeline: scrape the NoQs website + local data.txt, chunk the text,
    embed and index it in Pinecone, then answer user questions through a
    Groq-hosted Llama 3 model constrained by a customer-care prompt.

    Attributes set during __init__:
        self.docs      -- list of split Document chunks (start_loader)
        self.docsearch -- Pinecone vector store handle   (start_embeddings)
        self.rag_chain -- runnable retrieval chain       (init_model)
    """

    def __init__(self, data_change = False):
        # data_change=True forces a full rebuild of the Pinecone index from
        # freshly scraped documents; False reuses the existing index vectors.
        self.execute = data_change
        self.start_loader()
        self.start_embeddings()
        self.init_model()

    def start_loader(self):
        """Load web pages and data.txt, split into chunks, store on self.docs."""
        load_dotenv()
        # loader = DirectoryLoader('data', glob="*.md")
        urls = [
            'https://noqs.in/faqs/',
            'https://noqs.in/',
            'https://noqs.in/internships/'
        ]

        url_loader = UnstructuredURLLoader(urls=urls)
        url_data = url_loader.load()

        text_loader = TextLoader('data.txt', encoding = 'UTF-8')
        text_data = text_loader.load()

        # Large overlap (600 of 1000 chars) keeps answer context continuous
        # across chunk boundaries at the cost of extra index size.
        text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=600)

        url_docs = text_splitter.split_documents(url_data)
        text_docs = text_splitter.split_documents(text_data)
        self.docs = url_docs + text_docs

    def start_embeddings(self):
        """Create or reuse the Pinecone index; set self.docsearch.

        Three paths:
          * index missing            -> create it and embed self.docs
          * index exists, data_change -> delete, recreate, re-embed
          * index exists, no change   -> attach to the existing vectors
        """
        embeddings = HuggingFaceEmbeddings()
        load_dotenv()
        # Initialize Pinecone client
        pc = Pinecone(
            api_key=os.environ.get("PINECONE_API_KEY")
        )

        # Define Index Name
        index_name = "noqs-chatbot-with-web-content-dynamic"

        # Checking Index
        if index_name not in pc.list_indexes().names():
            # Create new Index (768 dims matches the default
            # HuggingFaceEmbeddings sentence-transformer output).
            pc.create_index(name=index_name,
                            metric="cosine",
                            dimension=768,
                            spec=ServerlessSpec(
                                cloud="aws",
                                region="us-east-1"
                            ))

            # BUG FIX: this was assigned to a local `docsearch`, leaving
            # self.docsearch unset on a fresh index and crashing init_model
            # with an AttributeError at self.docsearch.as_retriever().
            self.docsearch = langchain_community.vectorstores.Pinecone.from_documents(self.docs, embeddings, index_name=index_name)
        else:
            if self.execute:
                # Rebuild requested: drop the stale index, recreate it, and
                # re-embed the freshly scraped documents.
                pc.delete_index(index_name)
                pc.create_index(name=index_name,
                                metric="cosine",
                                dimension=768,
                                spec=ServerlessSpec(
                                    cloud="aws",
                                    region="us-east-1"
                                ))

                self.docsearch = langchain_community.vectorstores.Pinecone.from_documents(self.docs, embeddings, index_name=index_name)
            else:
                # Reuse vectors already stored in Pinecone (no re-embedding).
                self.docsearch = langchain_community.vectorstores.Pinecone.from_existing_index(embedding=embeddings, index_name=index_name)

    def init_model(self):
        """Connect to the Groq LLM and assemble the RAG chain on self.rag_chain."""
        # Define the repo ID and connect to a model on Groq API
        model_id = "llama3-8b-8192"
        llm = ChatGroq(
            model=model_id,
            temperature=0.5,
            max_tokens=1200
        )

        # Creating prompt response template.
        # (Grammar fixed: "If someone answer ... politelly" -> proper English,
        # so the instruction reads unambiguously to the model.)
        template = """
        You are a knowledgeable assistant for NoQs Digital.
        The Manager of our company is Mr. Adit Agarwal.
        Users will ask you questions about our company, and you must use the given context to answer their questions accurately.
        Follow these guidelines:
        Always base your answers on the provided context. Do not make up information.
        If the context does not contain the answer, simply say, "I don't know based on the provided information."
        Offer detailed and thorough responses, but stay relevant to the user's question.
        Maintain a professional tone in your responses.
        You have to answer like a Customer Care Specialist.
        If someone asks about program A, politely answer that due to high demand we are currently not offering program A.
        Don't tell the user that you are fetching the information from the context.

        Context: {context}

        User Question: {question}

        Answer:
        """

        prompt = PromptTemplate(
            template=template,
            input_variables=["context", "question"]
        )

        # Chaining llm and prompt: retriever fills {context}, the raw user
        # string passes through as {question}, output parsed to plain text.
        self.rag_chain = (
            {"context": self.docsearch.as_retriever(), "question": RunnablePassthrough()}
            | prompt
            | llm
            | StrOutputParser()
        )
|
dependencies.py
ADDED
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from langchain.text_splitter import CharacterTextSplitter
|
2 |
+
from langchain_community.document_loaders import TextLoader
|
3 |
+
from langchain_community.document_loaders import UnstructuredURLLoader
|
4 |
+
from langchain_huggingface import HuggingFaceEmbeddings
|
5 |
+
from langchain_groq import ChatGroq
|
6 |
+
import langchain_community.vectorstores
|
7 |
+
from pinecone import Pinecone, ServerlessSpec
|
8 |
+
from dotenv import load_dotenv
|
9 |
+
import os
|
10 |
+
from langchain_core.prompts import PromptTemplate
|
11 |
+
from langchain.schema.runnable import RunnablePassthrough
|
12 |
+
from langchain.schema.output_parser import StrOutputParser
|
13 |
+
import gradio as gr
|