Fix generation with latest transformers

#88

Purpose

  • Fix model generation with the latest transformers release

Related Issues

Changes

  • The latest transformers release removed past_key_values.get_max_length() in favor of past_key_values.get_max_cache_shape(); generation code now needs to call the new method, e.g. max_cache_length = past_key_values.get_max_cache_shape(), as sketched below.
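
A minimal sketch of the migration, for generation code that needs the cache's maximum length. The helper below is illustrative only; its name and the version fallback are assumptions, not code from this PR:

from transformers import DynamicCache

def max_cache_length(past_key_values):
    # Hypothetical helper for illustration; not part of this PR or of transformers.
    # New API in recent transformers releases:
    if hasattr(past_key_values, "get_max_cache_shape"):
        return past_key_values.get_max_cache_shape()
    # Old API, removed in the latest release; kept here only as a fallback
    # for older transformers versions.
    return past_key_values.get_max_length()

cache = DynamicCache()
print(max_cache_length(cache))  # a dynamic cache has no fixed capacity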

Testing

from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig

# Select model and load it.
MODEL_ID = "deepseek-ai/DeepSeek-V3"

config = AutoConfig.from_pretrained(MODEL_ID, trust_remote_code=True)
# Remove the checkpoint's built-in quantization config before loading.
del config.quantization_config

model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    config=config,
    device_map="auto",
    torch_dtype="auto",
    trust_remote_code=True,
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)

# Confirm generations of the quantized model look sane.
print("\n\n")
print("========== SAMPLE GENERATION ==============")
input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda")
output = model.generate(input_ids, max_new_tokens=100)
print(tokenizer.decode(output[0]))
print("==========================================\n\n")