Spaces:
Sleeping
Sleeping
Luke Stanley
committed on
Commit
·
74d6e52
1
Parent(s):
a0f49a0
Auto-downloads model if env var is not set
Browse files
utils.py
CHANGED
@@ -1,9 +1,16 @@
|
|
1 |
import json
|
|
|
2 |
from typing import Any, Dict, Union
|
3 |
import requests
|
4 |
|
|
|
5 |
from llama_cpp import Llama, LlamaGrammar, json_schema_to_gbnf
|
6 |
|
|
|
|
|
|
|
|
|
|
|
7 |
# The llama_cpp Python HTTP server communicates with the AI model, similar
|
8 |
# to the OpenAI API but adds a unique "grammar" parameter.
|
9 |
# The real OpenAI API has other ways to set the output format.
|
@@ -11,8 +18,24 @@ from llama_cpp import Llama, LlamaGrammar, json_schema_to_gbnf
|
|
11 |
|
12 |
URL = "http://localhost:5834/v1/chat/completions"
|
13 |
in_memory_llm = None
|
14 |
-
|
15 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
16 |
|
17 |
def llm_streaming(
|
18 |
prompt: str, pydantic_model_class, return_pydantic_object=False
|
@@ -83,9 +106,6 @@ def calculate_overall_score(faithfulness, spiciness):
|
|
83 |
def llm_stream_sans_network(
|
84 |
prompt: str, pydantic_model_class, return_pydantic_object=False
|
85 |
) -> Union[str, Dict[str, Any]]:
|
86 |
-
global in_memory_llm
|
87 |
-
if in_memory_llm is None:
|
88 |
-
in_memory_llm = Llama(model_path=IN_MEMORY_LLM_PATH)
|
89 |
schema = pydantic_model_class.model_json_schema()
|
90 |
|
91 |
# Optional example field from schema, is not needed for the grammar generation
|
@@ -97,6 +117,7 @@ def llm_stream_sans_network(
|
|
97 |
|
98 |
stream = in_memory_llm(
|
99 |
prompt,
|
|
|
100 |
max_tokens=1000,
|
101 |
temperature=0.7,
|
102 |
grammar=grammar,
|
|
|
1 |
import json
|
2 |
+
from os import environ as env
|
3 |
from typing import Any, Dict, Union
|
4 |
import requests
|
5 |
|
6 |
+
from huggingface_hub import hf_hub_download
|
7 |
from llama_cpp import Llama, LlamaGrammar, json_schema_to_gbnf
|
8 |
|
9 |
+
# There are two ways to use the LLM model currently used:
|
10 |
+
# 1. Use the HTTP server (USE_HTTP_SERVER=True), this is good for development
|
11 |
+
# when you want to change the logic of the translator without restarting the server.
|
12 |
+
# 2. Load the model into memory
|
13 |
+
# When using the HTTP server, it must be ran separately. See the README for instructions.
|
14 |
# The llama_cpp Python HTTP server communicates with the AI model, similar
|
15 |
# to the OpenAI API but adds a unique "grammar" parameter.
|
16 |
# The real OpenAI API has other ways to set the output format.
|
|
|
18 |
|
19 |
URL = "http://localhost:5834/v1/chat/completions"
|
20 |
in_memory_llm = None
|
21 |
+
|
22 |
+
|
23 |
+
LLM_MODEL_PATH = env.get("LLM_MODEL_PATH", None)
|
24 |
+
USE_HTTP_SERVER = env.get("USE_HTTP_SERVER", "false").lower() == "true"
|
25 |
+
|
26 |
+
if len(LLM_MODEL_PATH) > 0:
|
27 |
+
print(f"Using local model from {LLM_MODEL_PATH}")
|
28 |
+
else:
|
29 |
+
print("No local LLM_MODEL_PATH environment variable set. We need a model, downloading model from HuggingFace Hub")
|
30 |
+
LLM_MODEL_PATH =hf_hub_download(
|
31 |
+
repo_id=env.get("REPO_ID", "TheBloke/Mixtral-8x7B-Instruct-v0.1-GGUF"),
|
32 |
+
filename=env.get("MODEL_FILE", "mixtral-8x7b-instruct-v0.1.Q4_K_M.gguf"),
|
33 |
+
)
|
34 |
+
print(f"Model downloaded to {LLM_MODEL_PATH}")
|
35 |
+
|
36 |
+
if in_memory_llm is None and USE_HTTP_SERVER is False:
|
37 |
+
print("Loading model into memory. If you didn't want this, set the USE_HTTP_SERVER environment variable to 'true'.")
|
38 |
+
in_memory_llm = Llama(model_path=LLM_MODEL_PATH)
|
39 |
|
40 |
def llm_streaming(
|
41 |
prompt: str, pydantic_model_class, return_pydantic_object=False
|
|
|
106 |
def llm_stream_sans_network(
|
107 |
prompt: str, pydantic_model_class, return_pydantic_object=False
|
108 |
) -> Union[str, Dict[str, Any]]:
|
|
|
|
|
|
|
109 |
schema = pydantic_model_class.model_json_schema()
|
110 |
|
111 |
# Optional example field from schema, is not needed for the grammar generation
|
|
|
117 |
|
118 |
stream = in_memory_llm(
|
119 |
prompt,
|
120 |
+
n_ctx=4096,
|
121 |
max_tokens=1000,
|
122 |
temperature=0.7,
|
123 |
grammar=grammar,
|