nm-testing/Yi-6B-Llama-50-quant

Usage

from deepsparse import TextGeneration

# Build the text-generation pipeline from the "hf:"-prefixed model stub.
# NOTE(review): the card heading names the nm-testing org while this stub
# uses neuralmagic -- confirm which repository is current.
model = TextGeneration(model_path="hf:neuralmagic/Yi-6B-Llama-50-quant")

# Wrap the raw question in the <s>[INST] ... [/INST] template before
# handing it to the pipeline.
prompt = "How to get in a good university?"
formatted_prompt = f"<s>[INST]{prompt}[/INST]"

# Generate up to 200 new tokens and print the text of the first generation.
result = model(formatted_prompt, max_new_tokens=200)
first_generation = result.generations[0]
print(first_generation.text)

"""
Getting into a good university is a complex process that involves factors such as academic performance, financial aid, and personal qualifications. Here are some steps you can follow to get in a good university:

1. Academic performance:

- Look for a university that has a strong academic program, including a well-rounded curriculum that covers a wide range of subjects.
- Check if the university offers a clear curriculum that includes a clear sequence of courses.
- Check if the university offers a clear pathway to graduation, including clear dates and deadlines.

2. Financial aid:

- Look for a university that offers financial aid, such as scholarships, grants, or loans.
- Check if the university offers financial aid that fits your budget.
- Consider the university's financial aid package, including the cost of tuition, room and board, and other expenses.
"""

One-shot and Export

# Fetch the SparseML sources and install them in editable mode with the
# "transformers" extra; torch is pinned below 2.0.
git clone https://github.com/neuralmagic/sparseml
pip install -e "sparseml[transformers]" "torch<2"
# Run the OBCQ one-shot script from the directory that contains the clone.
# Positional arguments are presumably <model> <calibration dataset> -- confirm
# against obcq.py. The recipe passed here is the repository's
# example_llama.yaml, in float16 on CUDA, with saving enabled.
python sparseml/src/sparseml/transformers/sparsification/obcq/obcq.py chargoddard/Yi-6B-Llama open_platypus --recipe sparseml/src/sparseml/transformers/sparsification/obcq/example_llama.yaml --precision float16  --save True --device cuda
# The export step runs from inside the clone, on the
# update/onnx_export/duplicate branch.
cd sparseml 
git checkout update/onnx_export/duplicate
# NOTE(review): /root/obcq_deployment is an absolute path, presumably where the
# one-shot step wrote its output -- adjust for your environment.
python src/sparseml/transformers/sparsification/obcq/export.py --task text-generation --model_path /root/obcq_deployment
# Keep a copy of the exported graph, then rewrite model.onnx from that copy.
# NOTE(review): the location of onnx_kv_inject.py is not shown here; from its
# name it presumably adds KV-cache inputs/outputs -- confirm.
cp deployment/model.onnx deployment/model-orig.onnx
python onnx_kv_inject.py --input-file deployment/model-orig.onnx --output-file deployment/model.onnx

recipe.yaml

# Recipe with a single stage; its modifiers are listed in the order
# SmoothQuant -> Quantization -> SparseGPT.
test_stage:
  obcq_modifiers:
    # Each mapping pairs a group of projection-layer name patterns with a
    # layernorm name pattern ("re:" marks a regular expression).
    SmoothQuantModifier:
      smoothing_strength: 0.5
      mappings:
        - [["re:.*q_proj", "re:.*k_proj", "re:.*v_proj"], "re:.*input_layernorm"]
        - [["re:.*gate_proj", "re:.*up_proj"], "re:.*post_attention_layernorm"]
    QuantizationModifier:
      # Ignore list: three entries given by class name, followed by the
      # mlp.down_proj module of every decoder layer from 0 through 31.
      ignore:
        - LlamaRotaryEmbedding
        - LlamaRMSNorm
        - SiLUActivation
        - model.layers.0.mlp.down_proj
        - model.layers.1.mlp.down_proj
        - model.layers.2.mlp.down_proj
        - model.layers.3.mlp.down_proj
        - model.layers.4.mlp.down_proj
        - model.layers.5.mlp.down_proj
        - model.layers.6.mlp.down_proj
        - model.layers.7.mlp.down_proj
        - model.layers.8.mlp.down_proj
        - model.layers.9.mlp.down_proj
        - model.layers.10.mlp.down_proj
        - model.layers.11.mlp.down_proj
        - model.layers.12.mlp.down_proj
        - model.layers.13.mlp.down_proj
        - model.layers.14.mlp.down_proj
        - model.layers.15.mlp.down_proj
        - model.layers.16.mlp.down_proj
        - model.layers.17.mlp.down_proj
        - model.layers.18.mlp.down_proj
        - model.layers.19.mlp.down_proj
        - model.layers.20.mlp.down_proj
        - model.layers.21.mlp.down_proj
        - model.layers.22.mlp.down_proj
        - model.layers.23.mlp.down_proj
        - model.layers.24.mlp.down_proj
        - model.layers.25.mlp.down_proj
        - model.layers.26.mlp.down_proj
        - model.layers.27.mlp.down_proj
        - model.layers.28.mlp.down_proj
        - model.layers.29.mlp.down_proj
        - model.layers.30.mlp.down_proj
        - model.layers.31.mlp.down_proj
      post_oneshot_calibration: True
      # Embedding override: no input-activation scheme, 8-bit asymmetric
      # weights.
      scheme_overrides:
        Embedding:
          input_activations: null
          weights:
            num_bits: 8
            symmetric: False
    # 50% sparsity applied to the 32 decoder layers listed under targets.
    SparseGPTModifier:
      sparsity: 0.5
      block_size: 128
      sequential_update: False
      quantize: True
      percdamp: 0.01
      mask_structure: "0:0"
      targets:
        - model.layers.0
        - model.layers.1
        - model.layers.2
        - model.layers.3
        - model.layers.4
        - model.layers.5
        - model.layers.6
        - model.layers.7
        - model.layers.8
        - model.layers.9
        - model.layers.10
        - model.layers.11
        - model.layers.12
        - model.layers.13
        - model.layers.14
        - model.layers.15
        - model.layers.16
        - model.layers.17
        - model.layers.18
        - model.layers.19
        - model.layers.20
        - model.layers.21
        - model.layers.22
        - model.layers.23
        - model.layers.24
        - model.layers.25
        - model.layers.26
        - model.layers.27
        - model.layers.28
        - model.layers.29
        - model.layers.30
        - model.layers.31