Leetmonkey In Action via Inference
Browse files
app.py
CHANGED
@@ -33,11 +33,11 @@ def download_model(model_name):
|
|
33 |
model_path = download_model(MODEL_NAME)
|
34 |
llm = Llama(
|
35 |
model_path=model_path,
|
36 |
-
n_ctx=
|
37 |
n_threads=8,
|
38 |
-
n_gpu_layers
|
39 |
verbose=False,
|
40 |
-
n_batch=
|
41 |
mlock=True
|
42 |
)
|
43 |
logger.info("8-bit model loaded successfully")
|
@@ -48,11 +48,11 @@ train_dataset = dataset["train"]
|
|
48 |
|
49 |
# Generation parameters
|
50 |
generation_kwargs = {
|
51 |
-
"max_tokens":
|
52 |
"stop": ["```", "### Instruction:", "### Response:"],
|
53 |
"echo": False,
|
54 |
-
"temperature": 0.
|
55 |
-
"top_k":
|
56 |
"top_p": 0.9,
|
57 |
"repeat_penalty": 1.1
|
58 |
}
|
|
|
33 |
model_path = download_model(MODEL_NAME)
|
34 |
llm = Llama(
|
35 |
model_path=model_path,
|
36 |
+
n_ctx=512,
|
37 |
n_threads=8,
|
38 |
+
n_gpu_layers=1, # Offloads only 1 layer to the GPU; llama-cpp-python uses -1 to offload all layers
|
39 |
verbose=False,
|
40 |
+
n_batch=1024,
|
41 |
mlock=True
|
42 |
)
|
43 |
logger.info("8-bit model loaded successfully")
|
|
|
48 |
|
49 |
# Generation parameters
|
50 |
generation_kwargs = {
|
51 |
+
"max_tokens": 256,
|
52 |
"stop": ["```", "### Instruction:", "### Response:"],
|
53 |
"echo": False,
|
54 |
+
"temperature": 0.01,
|
55 |
+
"top_k": 5,
|
56 |
"top_p": 0.9,
|
57 |
"repeat_penalty": 1.1
|
58 |
}
|