tiendung committed 47ab333 · 1 Parent(s): d3255e3

405b now supports 128k ctx

Files changed (1):
  1. llm.py  +4 -49
llm.py CHANGED
@@ -2,7 +2,7 @@
 import utils; from utils import *
 import os, sys, lzma, json, pprint, time, subprocess
 
-thinker = os.getenv("thinker", "gemini")
+thinker = os.getenv("thinker", "405b")
 TEMPERATURE = float(os.getenv("temperature", 0.1)) # 0.0 conservative (good for coding and correct syntax)
 
 LLM_HOST = "gemini"
@@ -78,7 +78,7 @@ elif thinker in "70b|405b":
 
     # https://docs.together.ai/docs/chat-models#hosted-models
     model = {
-        "405b": "meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo 8k 3k 1.2", # $5.00 / 1m tokens(*)
+        "405b": "meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo 128k 4k 1.2", # $4.00 / 1m tokens(*)
         "70b": "meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo 128k 4k 1.2", # $0.88 / 1m tokens(*)
     }[thinker]
 
@@ -89,12 +89,12 @@ elif thinker in "70b|405b":
     TKNZ_RATIO = float(TKNZ_RATIO)
 
     CTXLEN = int(CTXLEN[:-1])
-    if CTXLEN > 32: CTXLEN = 32 # max 32k ctxlen
+    if CTXLEN > 64: CTXLEN = 64 # max 32k ctxlen
     CTXLEN = CTXLEN*1024 - MAX_TOKENS
     # print(model, CTXLEN, MAX_TOKENS, TKNZ_RATIO); input(); # DEBUG
 
     from together import Together
-    together_client = Together(api_key=os.environ.get('TOGETHER_API_KEY'))
+    together_client = Together(api_key='adc0db56b77fe6508bdeadb4d8253771750a50639f8e87313153e49d4599f6ea')
     ###
     stops = ["<|eot_id|>","<|eom_id|>","</answer>","</output>"]
     def thinker_chat(prompt, history=[], stream=False, use_cache=True, testing=False):
@@ -163,51 +163,6 @@ elif thinker in "70b|405b":
         return messages
 
 
-elif thinker in "gemma2:27b|commandr:35b|llama3.1:70b":
-    #################
-    ## Ollama connect
-    import subprocess, ollama # pip install ollama
-    try: ollama.list()
-    except: subprocess.run('nohup ssh -N -L 11434:localhost:11434 -p 22021 [email protected] &', shell=True)
-    subprocess.run('nohup ssh -N -L 9999:localhost:11434 -p 17340 [email protected] &', shell=True)
-    #################
-    OLLAMA_CLIENT = ollama.Client(host='http://localhost:11434')
-    machine = "RTX-4090-24G"
-
-    ## ~30b models
-    if thinker in "gemma2:27b": OLLAMA_MODEL = "gemma2:27b-instruct-q5_K_M" ; CTXLEN = 512*14 # fit 24G
-    elif thinker in "commandr:35b": OLLAMA_MODEL = "command-r:35b-08-2024-q4_K_M" ; CTXLEN = 512*18 # fit 24G
-    else: OLLAMA_MODEL = "not found"
-
-    try: connect_to_4090 = OLLAMA_MODEL in str(ollama.list())
-    except: connect_to_4090 = False
-
-    if not connect_to_4090: # switch to A100
-        OLLAMA_CLIENT = ollama.Client(host='http://localhost:9999')
-        machine = "A100-PCIE-40GB"
-        ## ~30b to ~70b models
-        if thinker in "gemma2:27b": OLLAMA_MODEL = "gemma2:27b-instruct-q8_0" ; CTXLEN = 1024*24
-        elif thinker in "commandr:35b": OLLAMA_MODEL = "command-r:35b-08-2024-q8_0" ; CTXLEN = 1024*32
-        elif thinker in "llama3.1:70b": OLLAMA_MODEL = "llama3.1:70b-instruct-q3_K_M" ; CTXLEN = 1024*12 # fit 40G
-    LLM_HOST = f"{machine}__{OLLAMA_MODEL}"
-
-    def thinker_chat(prompt, history=[], stream=False, use_cache=False):
-        if stream:
-            with open(llm_log_filename,"at") as f: f.write(f"\n- - - [ {LLM_HOST} ] - - -\n\nPROMPT:\n{prompt}\n")
-            return OLLAMA_CLIENT.chat(model=OLLAMA_MODEL, messages=[{"role": "user", "content": prompt}], \
-                stream=True, options={'num_ctx': CTXLEN, 'temperature': TEMPERATURE})
-
-        messages = history + [{"role": "user", "content": prompt}]
-        with open(llm_log_filename,"at") as f: f.write(f"\n- - - [ {LLM_HOST} ] - - -\n\nPROMPT:\n{prompt}\n")
-        res = OLLAMA_CLIENT.chat(model=OLLAMA_MODEL, messages=messages, options={'temperature': TEMPERATURE})
-        content = res["message"]["content"]
-        with open(llm_log_filename,"at") as f: f.write(f"\nCONTENT:\n{content}\n")
-        messages += [{"role": "assistant", "content": content}]
-        return messages
-
-    ## To make it's 100% local llm, normal chat can also use thinker
-    # chat = thinker_chat
-
 LLM_HOST += f"__{round(CTXLEN/1024)}k_ctxlen"
 who_are_you()
 
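Worked out from the hunks above, the effective prompt budget for 405b after this commit is 61,440 tokens, not the full 128k. The sketch below is a reconstruction under assumptions, not the repo's code: the splitting of the per-model config string and the "4k" to 4096 conversion for MAX_TOKENS are not visible in the changed lines and are guessed from how CTXLEN is handled; only the cap and the final subtraction mirror the committed lines.

# Hypothetical sketch: parse a "<model> <ctx> <max_out> <tknz_ratio>" entry the way
# llm.py appears to, then compute the usable prompt budget for 405b.
entry = "meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo 128k 4k 1.2"

model_id, CTXLEN, MAX_TOKENS, TKNZ_RATIO = entry.split()  # assumed field order
TKNZ_RATIO = float(TKNZ_RATIO)                            # same as the unchanged context line
MAX_TOKENS = int(MAX_TOKENS[:-1]) * 1024                  # "4k" -> 4096 (assumed scaling)

CTXLEN = int(CTXLEN[:-1])                                 # "128k" -> 128
if CTXLEN > 64: CTXLEN = 64                               # cap raised by this commit (was 32)
CTXLEN = CTXLEN * 1024 - MAX_TOKENS                       # reserve room for the model's reply

print(model_id, CTXLEN)                                   # prints ... 61440

Even after the change, the prompt side is still clamped to 64k; the 128k advertised in the config string only takes effect if the cap is raised again.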