Leetmonkey In Action via Inference
Browse files
app.py
CHANGED
@@ -33,10 +33,12 @@ def download_model(model_name):
 33      model_path = download_model(MODEL_NAME)
 34      llm = Llama(
 35          model_path=model_path,
 36  -        n_ctx=                  [old value truncated in extraction — not recoverable]
 37          n_threads=4,
 38          n_gpu_layers=-1,  # Use all available GPU layers
 39  -        verbose=False
 40      )
 41      logger.info("8-bit model loaded successfully")
 42
|
@@ -46,12 +48,12 @@ train_dataset = dataset["train"]
 46
 47  # Generation parameters
 48  generation_kwargs = {
 49  -    "max_tokens":              [old value truncated in extraction — not recoverable]
 50       "stop": ["```", "### Instruction:", "### Response:"],
 51       "echo": False,
 52  -    "temperature": 0.          [old value truncated in extraction]
 53  -    "top_k":                   [old value truncated in extraction]
 54  -    "top_p": 0.                [old value truncated in extraction]
 55       "repeat_penalty": 1.1
 56  }
 57
|
 33      model_path = download_model(MODEL_NAME)
 34      llm = Llama(
 35          model_path=model_path,
 36  +        n_ctx=1024,
 37          n_threads=4,
 38          n_gpu_layers=-1,  # Use all available GPU layers
 39  +        verbose=False,
 40  +        n_batch=512,
 41  +        mlock=True
 42      )
 43      logger.info("8-bit model loaded successfully")
 44
|
|
 48
 49  # Generation parameters
 50  generation_kwargs = {
 51  +    "max_tokens": 1024,
 52       "stop": ["```", "### Instruction:", "### Response:"],
 53       "echo": False,
 54  +    "temperature": 0.1,
 55  +    "top_k": 20,
 56  +    "top_p": 0.9,
 57       "repeat_penalty": 1.1
 58  }
 59