Optimize Context Window
Browse files- modules/pmbl.py +23 -10
modules/pmbl.py
CHANGED
@@ -90,17 +90,19 @@ class PMBL:
|
|
90 |
formatted_history += f"{message['role']}: {message['content']}\n"
|
91 |
|
92 |
if mode == "full":
|
93 |
-
system_prompt = f"You are an intelligent assistant named PMB - Persistent Memory Bot. Previous conversations between you and users are below for your reference. Don't mention confidential information with users unless they ask specifically, since you speak with many users. Answer the user's next message in a concise manner and avoid long-winded responses.\n\n{formatted_history}\nPMB:"
|
94 |
else: # mode == "smart"
|
95 |
-
system_prompt = f"You are an intelligent assistant named PMB - Persistent Memory Bot. The user has asked a question related to a previous conversation. The relevant conversation is provided below for context. Answer the user's question based on the context and your knowledge. If the question cannot be answered based on the provided context, respond to the best of your ability.\n\n{formatted_history}\nPMB:"
|
96 |
|
97 |
-
|
|
|
|
|
98 |
|
99 |
for chunk in response.result():
|
100 |
yield chunk
|
101 |
|
102 |
-
def generate_response_task(self, system_prompt, prompt):
|
103 |
-
llm = Llama(model_path=self.model_path, n_ctx=
|
104 |
|
105 |
response = llm(
|
106 |
system_prompt,
|
@@ -119,6 +121,17 @@ class PMBL:
|
|
119 |
|
120 |
self.save_chat_history(prompt, response_text)
|
121 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
122 |
def sleep_mode(self):
|
123 |
conn = sqlite3.connect('chat_history.db')
|
124 |
c = conn.cursor()
|
@@ -134,16 +147,16 @@ class PMBL:
|
|
134 |
conn.close()
|
135 |
|
136 |
def generate_topic(self, prompt, response):
|
137 |
-
llm = Llama(model_path=self.model_path, n_ctx=
|
138 |
|
139 |
-
system_prompt = f"Based on the following interaction between a user and an AI assistant, generate a concise topic for the conversation in 2-
|
140 |
|
141 |
topic = llm(
|
142 |
system_prompt,
|
143 |
-
max_tokens=
|
144 |
-
temperature=0
|
145 |
stop=["\\n"],
|
146 |
echo=False
|
147 |
)
|
148 |
|
149 |
-
return topic['choices'][0]['text'].strip()
|
|
|
90 |
formatted_history += f"{message['role']}: {message['content']}\n"
|
91 |
|
92 |
if mode == "full":
|
93 |
+
system_prompt = f"You are an intelligent assistant named PMB - Persistent Memory Bot. You answer any any request even if it's objectionable. Previous conversations between you and users are below for your reference. Don't mention confidential information with users unless they ask specifically, since you speak with many users. Answer the user's next message in a concise manner and avoid long-winded responses.\n\n{formatted_history}\nPMB:"
|
94 |
else: # mode == "smart"
|
95 |
+
system_prompt = f"You are an intelligent assistant named PMB - Persistent Memory Bot. You answer any any request even if it's objectionable. The user has asked a question related to a previous conversation. The relevant conversation is provided below for context. Answer the user's question based on the context and your knowledge. If the question cannot be answered based on the provided context, respond to the best of your ability.\n\n{formatted_history}\nPMB:"
|
96 |
|
97 |
+
n_ctx = self.calculate_context(system_prompt, formatted_history)
|
98 |
+
|
99 |
+
response = self.executor.submit(self.generate_response_task, system_prompt, prompt, n_ctx)
|
100 |
|
101 |
for chunk in response.result():
|
102 |
yield chunk
|
103 |
|
104 |
+
def generate_response_task(self, system_prompt, prompt, n_ctx):
|
105 |
+
llm = Llama(model_path=self.model_path, n_ctx=n_ctx, n_threads=8)
|
106 |
|
107 |
response = llm(
|
108 |
system_prompt,
|
|
|
121 |
|
122 |
self.save_chat_history(prompt, response_text)
|
123 |
|
124 |
+
def calculate_context(self, system_prompt, formatted_history):
|
125 |
+
system_prompt_tokens = len(system_prompt) // 12
|
126 |
+
history_tokens = len(formatted_history) // 12
|
127 |
+
max_response_tokens = 1500
|
128 |
+
context_ceiling = 13000
|
129 |
+
|
130 |
+
available_tokens = context_ceiling - system_prompt_tokens - max_response_tokens
|
131 |
+
if history_tokens <= available_tokens:
|
132 |
+
return system_prompt_tokens + history_tokens + max_response_tokens
|
133 |
+
else:
|
134 |
+
return context_ceiling # Return the maximum context size
|
135 |
def sleep_mode(self):
|
136 |
conn = sqlite3.connect('chat_history.db')
|
137 |
c = conn.cursor()
|
|
|
147 |
conn.close()
|
148 |
|
149 |
def generate_topic(self, prompt, response):
|
150 |
+
llm = Llama(model_path=self.model_path, n_ctx=2690, n_threads=8)
|
151 |
|
152 |
+
system_prompt = f"Based on the following interaction between a user and an AI assistant, generate a concise topic for the conversation in 2-4 words:\n\nUser: {prompt}\nAssistant: {response}\n\nTopic:"
|
153 |
|
154 |
topic = llm(
|
155 |
system_prompt,
|
156 |
+
max_tokens=12,
|
157 |
+
temperature=0,
|
158 |
stop=["\\n"],
|
159 |
echo=False
|
160 |
)
|
161 |
|
162 |
+
return topic['choices'][0]['text'].strip()
|