405b now supports 128k ctx
llm.py CHANGED
@@ -2,7 +2,7 @@
 import utils; from utils import *
 import os, sys, lzma, json, pprint, time, subprocess
 
-thinker = os.getenv("thinker", "
+thinker = os.getenv("thinker", "405b")
 TEMPERATURE = float(os.getenv("temperature", 0.1)) # 0.0 conservative (good for coding and correct syntax)
 
 LLM_HOST = "gemini"
@@ -78,7 +78,7 @@ elif thinker in "70b|405b":
 
 # https://docs.together.ai/docs/chat-models#hosted-models
 model = {
-"405b": "meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo
+"405b": "meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo 128k 4k 1.2", # $4.00 / 1m tokens(*)
 "70b": "meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo 128k 4k 1.2", # $0.88 / 1m tokens(*)
 }[thinker]
 
@@ -89,12 +89,12 @@ elif thinker in "70b|405b":
 TKNZ_RATIO = float(TKNZ_RATIO)
 
 CTXLEN = int(CTXLEN[:-1])
-if CTXLEN >
+if CTXLEN > 64: CTXLEN = 64 # max 64k ctxlen
 CTXLEN = CTXLEN*1024 - MAX_TOKENS
 # print(model, CTXLEN, MAX_TOKENS, TKNZ_RATIO); input(); # DEBUG
 
 from together import Together
-together_client = Together(api_key=
+together_client = Together(api_key='adc0db56b77fe6508bdeadb4d8253771750a50639f8e87313153e49d4599f6ea')
 ###
 stops = ["<|eot_id|>","<|eom_id|>","</answer>","</output>"]
 def thinker_chat(prompt, history=[], stream=False, use_cache=True, testing=False):
@@ -163,51 +163,6 @@ elif thinker in "70b|405b":
 return messages
 
 
-elif thinker in "gemma2:27b|commandr:35b|llama3.1:70b":
-#################
-## Ollama connect
-import subprocess, ollama # pip install ollama
-try: ollama.list()
-except: subprocess.run('nohup ssh -N -L 11434:localhost:11434 -p 22021 [email protected] &', shell=True)
-subprocess.run('nohup ssh -N -L 9999:localhost:11434 -p 17340 [email protected] &', shell=True)
-#################
-OLLAMA_CLIENT = ollama.Client(host='http://localhost:11434')
-machine = "RTX-4090-24G"
-
-## ~30b models
-if thinker in "gemma2:27b": OLLAMA_MODEL = "gemma2:27b-instruct-q5_K_M" ; CTXLEN = 512*14 # fit 24G
-elif thinker in "commandr:35b": OLLAMA_MODEL = "command-r:35b-08-2024-q4_K_M" ; CTXLEN = 512*18 # fit 24G
-else: OLLAMA_MODEL = "not found"
-
-try: connect_to_4090 = OLLAMA_MODEL in str(ollama.list())
-except: connect_to_4090 = False
-
-if not connect_to_4090: # switch to A100
-OLLAMA_CLIENT = ollama.Client(host='http://localhost:9999')
-machine = "A100-PCIE-40GB"
-## ~30b to ~70b models
-if thinker in "gemma2:27b": OLLAMA_MODEL = "gemma2:27b-instruct-q8_0" ; CTXLEN = 1024*24
-elif thinker in "commandr:35b": OLLAMA_MODEL = "command-r:35b-08-2024-q8_0" ; CTXLEN = 1024*32
-elif thinker in "llama3.1:70b": OLLAMA_MODEL = "llama3.1:70b-instruct-q3_K_M" ; CTXLEN = 1024*12 # fit 40G
-LLM_HOST = f"{machine}__{OLLAMA_MODEL}"
-
-def thinker_chat(prompt, history=[], stream=False, use_cache=False):
-if stream:
-with open(llm_log_filename,"at") as f: f.write(f"\n- - - [ {LLM_HOST} ] - - -\n\nPROMPT:\n{prompt}\n")
-return OLLAMA_CLIENT.chat(model=OLLAMA_MODEL, messages=[{"role": "user", "content": prompt}], \
-stream=True, options={'num_ctx': CTXLEN, 'temperature': TEMPERATURE})
-
-messages = history + [{"role": "user", "content": prompt}]
-with open(llm_log_filename,"at") as f: f.write(f"\n- - - [ {LLM_HOST} ] - - -\n\nPROMPT:\n{prompt}\n")
-res = OLLAMA_CLIENT.chat(model=OLLAMA_MODEL, messages=messages, options={'temperature': TEMPERATURE})
-content = res["message"]["content"]
-with open(llm_log_filename,"at") as f: f.write(f"\nCONTENT:\n{content}\n")
-messages += [{"role": "assistant", "content": content}]
-return messages
-
-## To make it's 100% local llm, normal chat can also use thinker
-# chat = thinker_chat
-
 LLM_HOST += f"__{round(CTXLEN/1024)}k_ctxlen"
 who_are_you()
 
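A quick way to see what the new cap means for the actual request budget when `thinker=405b`: the sketch below replays the arithmetic visible in the diff. The parsing of the `128k 4k 1.2` fields into `CTXLEN`, `MAX_TOKENS` and `TKNZ_RATIO` happens in lines not shown in this diff, so splitting the spec string and reading `4k` as `4*1024` are assumptions.

```python
# Standalone sketch (not part of llm.py): replay the new CTXLEN budget for thinker=405b.
# Assumption: the "128k 4k 1.2" fields map to CTXLEN, MAX_TOKENS, TKNZ_RATIO,
# and "4k" is parsed as 4*1024 tokens reserved for the reply.
spec = "meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo 128k 4k 1.2"
_model, ctx, max_tok, _tknz = spec.split()

MAX_TOKENS = int(max_tok[:-1]) * 1024      # assumed: "4k" -> 4096

CTXLEN = int(ctx[:-1])                     # "128k" -> 128
if CTXLEN > 64: CTXLEN = 64                # the cap added in this commit
CTXLEN = CTXLEN * 1024 - MAX_TOKENS        # 65536 - 4096 = 61440 prompt tokens

print(CTXLEN)                              # 61440
print(f"__{round(CTXLEN / 1024)}k_ctxlen") # "__60k_ctxlen" suffix appended to LLM_HOST
```

So even with the model's advertised 128k window, this commit budgets at most 64k, of which 4k is kept back for the completion.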
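With the local Ollama branch removed, the `thinker` path in this file now goes through Together only. The snippet below is a hypothetical smoke test of that path, not code from the repo: it assumes it runs inside this repo (llm.py imports utils at the top), that the `together` package is installed with a working API key, and it infers the return shape from the `return messages` lines in the diff.

```python
# Hypothetical smoke test of the Together-backed thinker path (not in the repo).
# Assumes: running inside this repo so `utils` resolves, `together` installed,
# and a valid API key configured in llm.py.
import os
os.environ["thinker"] = "405b"   # select the 405B route before llm.py reads the env var

import llm                        # builds the Together client at import time

history = llm.thinker_chat("In one sentence, what is a context window?")
print(history[-1]["content"])     # assumption: the last message holds the assistant reply
```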