Update README.md
Browse files
README.md
CHANGED
@@ -77,3 +77,16 @@ outputs = model.generate(**inputs.to(model.device), max_new_tokens=1000, cache_i
|
|
77 |
if(backend == 'gemlite'):
|
78 |
gemlite.core.GemLiteLinear.cache_config('/tmp/gemlite_config.json')
|
79 |
```
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
77 |
if(backend == 'gemlite'):
|
78 |
gemlite.core.GemLiteLinear.cache_config('/tmp/gemlite_config.json')
|
79 |
```
|
80 |
+
|
81 |
+
Usage with <a href="https://github.com/vllm-project/vllm/">vLLM</a>:
|
82 |
+
```Python
|
83 |
+
from vllm import LLM
|
84 |
+
from vllm.sampling_params import SamplingParams
|
85 |
+
|
86 |
+
model_id = "mobiuslabsgmbh/Qwen2.5-14B-Instruct-1M_4bitgs64_hqq_hf"
|
87 |
+
|
88 |
+
llm = LLM(model=model_id, max_model_len=4096, enable_chunked_prefill=False)
|
89 |
+
sampling_params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=1024)
|
90 |
+
outputs = llm.generate(["What is the capital of Germany?"], sampling_params)
|
91 |
+
print(outputs[0].outputs[0].text)
|
92 |
+
```
|