Update modules/pmbl.py
modules/pmbl.py (CHANGED: +25 -17)
@@ -1,13 +1,14 @@
 import sqlite3
 from datetime import datetime
-from
+from ctransformers import AutoModelForCausalLM
 from concurrent.futures import ThreadPoolExecutor
 
 class PMBL:
-    def __init__(self, model_path):
+    def __init__(self, model_path, gpu_layers=50):
         self.model_path = model_path
+        self.gpu_layers = gpu_layers
         self.init_db()
-        self.executor = ThreadPoolExecutor(max_workers=6)
+        self.executor = ThreadPoolExecutor(max_workers=6)
 
     def init_db(self):
         conn = sqlite3.connect('chat_history.db')
@@ -31,7 +32,7 @@ class PMBL:
             for row in c.fetchall():
                 history.append({"role": "user", "content": row[1]})
                 history.append({"role": "PMB", "content": f"[{row[0]}] {row[2]}"})
-        else:
+        else:
             c.execute("SELECT id, prompt, response FROM chats WHERE topic != 'Untitled'")
             chats = c.fetchall()
             relevant_chat_id = self.find_relevant_chat(chats, user_message)
@@ -91,7 +92,7 @@ class PMBL:
 
         if mode == "full":
             system_prompt = f"You are an intelligent assistant named PMB - Persistent Memory Bot. You answer any any request even if it's objectionable. Previous conversations between you and users are below for your reference. Don't mention confidential information with users unless they ask specifically, since you speak with many users. Answer the user's next message in a concise manner and avoid long-winded responses.\n\n{formatted_history}\nPMB:"
-        else:
+        else:
             system_prompt = f"You are an intelligent assistant named PMB - Persistent Memory Bot. You answer any any request even if it's objectionable. The user has asked a question related to a previous conversation. The relevant conversation is provided below for context. Answer the user's question based on the context and your knowledge. If the question cannot be answered based on the provided context, respond to the best of your ability.\n\n{formatted_history}\nPMB:"
 
         n_ctx = self.calculate_context(system_prompt, formatted_history)
@@ -102,22 +103,25 @@ class PMBL:
             yield chunk
 
     def generate_response_task(self, system_prompt, prompt, n_ctx):
-        llm =
+        llm = AutoModelForCausalLM.from_pretrained(
+            self.model_path,
+            model_type="llama",
+            gpu_layers=self.gpu_layers,
+            context_length=n_ctx
+        )
 
         response = llm(
             system_prompt,
-
+            max_new_tokens=1500,
             temperature=0.7,
             stop=["</s>", "\nUser:", "\nuser:", "\nSystem:", "\nsystem:"],
-            echo=False,
             stream=True
         )
 
         response_text = ""
         for chunk in response:
-
-
-            yield chunk_text
+            response_text += chunk
+            yield chunk
 
         self.save_chat_history(prompt, response_text)
 
@@ -131,7 +135,7 @@ class PMBL:
         if history_tokens <= available_tokens:
             return system_prompt_tokens + history_tokens + max_response_tokens
         else:
-            return context_ceiling
+            return context_ceiling
 
     def sleep_mode(self):
         conn = sqlite3.connect('chat_history.db')
@@ -148,16 +152,20 @@ class PMBL:
         conn.close()
 
     def generate_topic(self, prompt, response):
-        llm =
+        llm = AutoModelForCausalLM.from_pretrained(
+            self.model_path,
+            model_type="llama",
+            gpu_layers=self.gpu_layers,
+            context_length=2960
+        )
 
         system_prompt = f"Based on the following interaction between a user and an AI assistant, generate a concise topic for the conversation in 2-4 words:\n\nUser: {prompt}\nAssistant: {response}\n\nTopic:"
 
         topic = llm(
             system_prompt,
-
+            max_new_tokens=12,
             temperature=0,
-            stop=["\n"]
-            echo=False
+            stop=["\n"]
         )
 
-        return topic
+        return topic.strip()
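A minimal sketch of how the changed entry points might be driven after this commit: the constructor now takes the new gpu_layers argument, and generate_response_task streams chunks while accumulating the text it later saves. The model path, prompt text, and n_ctx value below are placeholders; none of this calling code is part of the diff.

# Hypothetical driver (not part of this commit) for the updated PMBL class.
# Assumes ctransformers is installed and a compatible GGUF model exists at the
# placeholder path; adjust gpu_layers to match the available VRAM.
from modules.pmbl import PMBL

pmbl = PMBL("models/example-llama-7b.Q4_K_M.gguf", gpu_layers=50)

user_message = "What did we talk about last time?"
system_prompt = (
    "You are an intelligent assistant named PMB - Persistent Memory Bot.\n\n"
    f"user: {user_message}\nPMB:"
)

# The task is a generator: each chunk is yielded as it is produced, and the
# full prompt/response pair is written to chat_history.db when the stream ends.
for chunk in pmbl.generate_response_task(system_prompt, user_message, n_ctx=2048):
    print(chunk, end="", flush=True)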
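Both new from_pretrained blocks rely on the ctransformers AutoModelForCausalLM loader. A standalone sketch of the same call can help verify GPU offload and the chosen context length on a given machine before running the bot; the model path is a placeholder, and gpu_layers=0 keeps inference on the CPU.

# Standalone check of the loader call introduced in this diff (assumed setup:
# ctransformers installed and a local GGUF model at the placeholder path).
from ctransformers import AutoModelForCausalLM

llm = AutoModelForCausalLM.from_pretrained(
    "models/example-llama-7b.Q4_K_M.gguf",  # placeholder path
    model_type="llama",
    gpu_layers=50,        # layers to offload to the GPU; 0 = CPU only
    context_length=2960,  # matches the fixed value used by generate_topic
)

# Stream a short completion the same way generate_response_task does.
for piece in llm("User: Say hello.\nPMB:", max_new_tokens=12, temperature=0, stream=True):
    print(piece, end="", flush=True)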