---
base_model: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
library_name: peft
license: mit
language:
- en
pipeline_tag: text-generation
---

# Model Card for Vijayendra/DeepSeek-Llama3.1-8B-DeepThinker-v1

A PEFT fine-tune of `deepseek-ai/DeepSeek-R1-Distill-Llama-8B` for English text generation.

## How to Use
```python
# Install dependencies first (in a notebook): !pip install transformers accelerate bitsandbytes
# accelerate is required for device_map="auto"; bitsandbytes only for optional quantized loading
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# **Model Name on Hugging Face**
MODEL_NAME = "Vijayendra/DeepSeek-Llama3.1-8B-DeepThinker-v1"

# 🛠 **Load Model & Tokenizer from Hugging Face**
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token  # Llama tokenizers ship without a pad token

model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    device_map="auto",         # Automatically assigns model layers to available GPUs/CPUs
    torch_dtype=torch.float16  # Use 16-bit precision for memory efficiency
)  # device_map="auto" already places the weights, so no extra .to(...) call is needed

# 🛠 **Define Inference Function**
def generate_response(model, tokenizer, prompt, max_new_tokens=2048, temperature=0.7):
    """Generate a sampled completion for a single prompt."""
    # Tokenize input and move it to the model's device
    inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True).to(model.device)
    # Generate response
    with torch.no_grad():
        generated_tokens = model.generate(
            inputs.input_ids,
            attention_mask=inputs.attention_mask,  # Ensure attention mask is passed
            max_new_tokens=max_new_tokens,
            temperature=temperature,
            do_sample=True,
            top_k=40,
            top_p=0.9,
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=tokenizer.pad_token_id
        )

    # Decode response
    return tokenizer.decode(generated_tokens[0], skip_special_tokens=True)

# **Test Questions**
questions = [
    "What happened yesterday?",
    "If an unstoppable force hits an immovable object, what happens?",
    "The sun orbits the Earth once every 365 days. Is this true?"
]

# **Generate and Print Responses**
for i, question in enumerate(questions, 1):
    response = generate_response(model, tokenizer, question)
    print(f"\n🟢 Question {i}: {question}")
    print(f"🔵 Response: {response}")


```
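The install step above pulls in `bitsandbytes`, which the snippet itself never uses. If GPU memory is tight, the same checkpoint can be loaded in 4-bit instead of float16; the sketch below is illustrative, and the quantization settings are assumptions rather than settings published with this model.

```python
# Optional: 4-bit quantized loading with bitsandbytes (settings are illustrative)
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

MODEL_NAME = "Vijayendra/DeepSeek-Llama3.1-8B-DeepThinker-v1"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,                     # store weights in 4-bit NF4
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,  # run matmuls in fp16
)

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=bnb_config,
    device_map="auto",
)
```

The `generate_response` function above works unchanged with the quantized model.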
### Framework versions

- PEFT 0.14.0
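
Since the card lists `library_name: peft`, the repository may host LoRA adapter weights rather than a merged checkpoint. Under that assumption, a minimal sketch of attaching the adapter to the base model explicitly:

```python
# Load the base model and attach this repo as a PEFT/LoRA adapter
# (assumes the repo contains adapter weights, not merged full weights)
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

BASE_MODEL = "deepseek-ai/DeepSeek-R1-Distill-Llama-8B"
ADAPTER_ID = "Vijayendra/DeepSeek-Llama3.1-8B-DeepThinker-v1"

tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
base = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    device_map="auto",
    torch_dtype=torch.float16,
)
model = PeftModel.from_pretrained(base, ADAPTER_ID)  # wraps the base model with the adapter
# model = model.merge_and_unload()  # optionally merge the adapter for faster inference
```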