INFERENCE

import time
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Load the fine-tuned causal LM and its tokenizer from the Hugging Face Hub.
finetuned_model = AutoModelForCausalLM.from_pretrained("Mr-Vicky-01/GPT-QnA")
tokenizer = AutoTokenizer.from_pretrained("Mr-Vicky-01/GPT-QnA")

# Token id passed to generate() as both the padding and the end-of-sequence id.
# NOTE(review): presumably the id of the '<eos>' token in this tokenizer — confirm.
EOS_TOKEN_ID = 50259

# Alpaca-style prompt; the model's answer is generated after "### Response:".
alpaca_prompt = """Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
what is depresssion how to overcome

### Response:
"""

# perf_counter() is monotonic, so the measured interval cannot be skewed by
# system clock adjustments.
s = time.perf_counter()
prompt = alpaca_prompt
encoding = tokenizer(prompt, return_tensors="pt", truncation=True)
encodeds = encoding.input_ids

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
finetuned_model.to(device)
inputs = encodeds.to(device)
# Pass the mask explicitly: generate() is given the same id for padding and
# end-of-sequence, so the mask should not be left to be inferred.
attention_mask = encoding.attention_mask.to(device)

# Increase max_new_tokens if needed.
# Greedy decoding (do_sample=False): sampling-only options such as
# temperature/top_p have no effect in this mode, so they are not passed.
generated_ids = finetuned_model.generate(
    inputs,
    attention_mask=attention_mask,
    max_new_tokens=256,
    do_sample=False,
    pad_token_id=EOS_TOKEN_ID,
    eos_token_id=EOS_TOKEN_ID,
    num_return_sequences=1,
)

# The returned sequence starts with the prompt tokens; decode only what was
# generated after them so the answer does not depend on re-finding the
# "### Response:" marker in the decoded text.
new_token_ids = generated_ids[0][inputs.shape[-1]:]
print(tokenizer.decode(new_token_ids).split('<eos>')[0].strip())
e = time.perf_counter()
print(f'time taken:{e-s}')
Downloads last month
17
Safetensors
Model size
774M params
Tensor type
F32
·
Inference Examples
This model does not have enough activity to be deployed to Inference API (serverless) yet. Increase its social visibility and check back later, or deploy to Inference Endpoints (dedicated) instead.