lukestanley committed · Commit e01e28e · 1 Parent(s): 0945e5b

Add env vars to set GPU layer count and context size, make verbose
utils.py
CHANGED
@@ -19,9 +19,12 @@ from llama_cpp import Llama, LlamaGrammar, json_schema_to_gbnf
 URL = "http://localhost:5834/v1/chat/completions"
 in_memory_llm = None
 
-
+N_GPU_LAYERS = env.get("N_GPU_LAYERS", 10)
+CONTEXT_SIZE = int(env.get("CONTEXT_SIZE", 4096))
 LLM_MODEL_PATH = env.get("LLM_MODEL_PATH", None)
 USE_HTTP_SERVER = env.get("USE_HTTP_SERVER", "false").lower() == "true"
+MAX_TOKENS = int(env.get("MAX_TOKENS", 1000))
+TEMPERATURE = float(env.get("TEMPERATURE", 0.7))
 
 if LLM_MODEL_PATH and len(LLM_MODEL_PATH) > 0:
     print(f"Using local model from {LLM_MODEL_PATH}")
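These constants come straight from the process environment, which the module appears to alias as env (assumed here to be os.environ). Environment values are always strings, which is why CONTEXT_SIZE, MAX_TOKENS and TEMPERATURE are cast, while N_GPU_LAYERS only stays an int when the default applies. A small sketch of that behaviour with illustrative values:

# Minimal sketch of the env-var lookups above; the values set here are
# illustrative, not from the commit. os.environ always yields strings,
# so the explicit int()/float() casts are what turn them into numbers.
from os import environ as env

env["CONTEXT_SIZE"] = "8192"
env["N_GPU_LAYERS"] = "20"

CONTEXT_SIZE = int(env.get("CONTEXT_SIZE", 4096))   # 8192 as an int
N_GPU_LAYERS = env.get("N_GPU_LAYERS", 10)          # "20" as a str; 10 (int) only when unset
MAX_TOKENS = int(env.get("MAX_TOKENS", 1000))       # falls back to 1000
TEMPERATURE = float(env.get("TEMPERATURE", 0.7))    # falls back to 0.7

print(type(CONTEXT_SIZE), type(N_GPU_LAYERS))       # <class 'int'> <class 'str'>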
@@ -35,7 +38,7 @@ else:
 
 if in_memory_llm is None and USE_HTTP_SERVER is False:
     print("Loading model into memory. If you didn't want this, set the USE_HTTP_SERVER environment variable to 'true'.")
-    in_memory_llm = Llama(model_path=LLM_MODEL_PATH, n_ctx=
+    in_memory_llm = Llama(model_path=LLM_MODEL_PATH, n_ctx=CONTEXT_SIZE, n_gpu_layers=N_GPU_LAYERS, verbose=True)
 
 def llm_streaming(
     prompt: str, pydantic_model_class, return_pydantic_object=False
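This constructor call is where the new settings land: n_ctx is the context window, n_gpu_layers is how many transformer layers llama.cpp offloads to the GPU, and verbose=True enables llama.cpp's load and timing output. A sketch of the call with the defaults and a placeholder model path:

# Sketch of the updated constructor (placeholder path; default values shown).
from llama_cpp import Llama

in_memory_llm = Llama(
    model_path="/path/to/model.gguf",  # placeholder for LLM_MODEL_PATH
    n_ctx=4096,        # CONTEXT_SIZE: token budget for prompt + completion
    n_gpu_layers=10,   # N_GPU_LAYERS: layers offloaded to the GPU (-1 offloads all)
    verbose=True,      # print llama.cpp's model-load and timing details
)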
@@ -51,9 +54,9 @@ def llm_streaming(
 
     payload = {
         "stream": True,
-        "max_tokens":
+        "max_tokens": MAX_TOKENS,
         "grammar": grammar,
-        "temperature":
+        "temperature": TEMPERATURE,
         "messages": [{"role": "user", "content": prompt}],
     }
     headers = {
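The HTTP path POSTs this payload to the llama.cpp server's OpenAI-compatible endpoint at URL. A hedged sketch of sending and consuming such a streaming request: the grammar field is left out for brevity (utils.py builds it from the pydantic model's JSON schema via json_schema_to_gbnf), the prompt is illustrative, and the SSE framing is assumed to follow the usual OpenAI "data:" chunk format.

# Hedged sketch: stream a chat completion from the local llama.cpp server.
# Assumes a server is already listening on port 5834, as URL expects.
import json
import requests

URL = "http://localhost:5834/v1/chat/completions"
payload = {
    "stream": True,
    "max_tokens": 1000,    # MAX_TOKENS default
    "temperature": 0.7,    # TEMPERATURE default
    "messages": [{"role": "user", "content": "Summarise what this commit changes."}],
}
headers = {"Content-Type": "application/json"}

with requests.post(URL, json=payload, headers=headers, stream=True) as resp:
    resp.raise_for_status()
    for raw in resp.iter_lines():
        # skip keep-alives and the terminating "[DONE]" marker, if sent
        if not raw.startswith(b"data: ") or b"[DONE]" in raw:
            continue
        chunk = json.loads(raw[len(b"data: "):])
        # each SSE chunk carries an incremental delta, OpenAI-style
        print(chunk["choices"][0]["delta"].get("content", ""), end="", flush=True)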
@@ -117,8 +120,8 @@ def llm_stream_sans_network(
 
     stream = in_memory_llm(
         prompt,
-        max_tokens=
-        temperature=
+        max_tokens=MAX_TOKENS,
+        temperature=TEMPERATURE,
         grammar=grammar,
         stream=True
     )
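llm_stream_sans_network drives the loaded model directly, and with stream=True the call yields incremental completion chunks rather than one response. A sketch of consuming that stream, with an illustrative prompt, a placeholder model path, and the grammar argument omitted for brevity:

# Hedged sketch of the in-memory streaming path with the new settings.
from llama_cpp import Llama

in_memory_llm = Llama(model_path="/path/to/model.gguf", n_ctx=4096, n_gpu_layers=10, verbose=True)

stream = in_memory_llm(
    "Q: What does n_gpu_layers control? A:",
    max_tokens=1000,   # MAX_TOKENS default
    temperature=0.7,   # TEMPERATURE default
    stream=True,       # yield completion fragments as they are generated
)
for chunk in stream:
    # each chunk is an OpenAI-style completion fragment with a "text" piece
    print(chunk["choices"][0]["text"], end="", flush=True)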