ModernBertModel works on the CPU but fails on the GPU
#43
by
rudigung
- opened
Hello everyone,
My problem is that ModernBertModel fails to return a valid output when I use the GPU instead of the CPU. The following code returns a valid output:
import torch
from transformers import AutoTokenizer, ModernBertModel

# Reproduction, CPU variant: the model is explicitly placed on the CPU.
model_id = "answerdotai/ModernBERT-base"
device = torch.device("cpu")

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = ModernBertModel.from_pretrained(model_id)
model.to(device)

sentences = [
    "The capital of France is Paris.",
    "The capital of Germany is Berlin.",
]

# Tokenize both sentences with padding="max_length" and max_length=768,
# requesting PyTorch tensors and an explicit attention mask.
encoded = tokenizer(
    text=sentences,
    add_special_tokens=True,
    padding="max_length",
    truncation=True,
    max_length=768,
    return_attention_mask=True,
    return_tensors="pt",
)

# Move the model inputs onto the same device as the model.
input_ids = encoded["input_ids"].to(device)
attention_mask = encoded["attention_mask"].to(device)

hidden_states = model(
    input_ids=input_ids,
    attention_mask=attention_mask,
).last_hidden_state
print(hidden_states)
Output:
tensor([[[ 0.2510, -0.8900, -0.7447, ..., -0.6569, 0.2809, -0.5663],
[ 0.1292, 0.0536, 0.2478, ..., 0.1400, -0.1059, 0.0981],
[-0.0945, -1.2089, -0.5087, ..., -0.0810, 1.4614, -0.1214],
...,
[ 1.5802, -0.2266, 0.8008, ..., -0.8563, -0.0378, -0.6842],
[ 1.6365, -0.2077, 0.7667, ..., -0.8660, -0.0537, -0.6460],
[ 1.6404, -0.1780, 0.7846, ..., -0.8497, -0.0268, -0.6155]],
[[ 0.3872, -0.9977, -0.8920, ..., -0.7293, 0.5094, -0.5080],
[-0.1917, -0.8092, -0.3774, ..., -1.0475, -0.4196, 0.1802],
[-0.0937, -1.1293, -0.8068, ..., 0.4551, 1.5275, -0.0922],
...,
[ 1.7813, 0.2581, 0.6624, ..., -1.0199, -0.1711, -1.1627],
[ 1.8317, 0.3041, 0.6434, ..., -1.0328, -0.1824, -1.1392],
[ 1.8517, 0.3272, 0.6883, ..., -0.9966, -0.1606, -1.1124]]],
grad_fn=<NativeLayerNormBackward0>)
But when I switch to the GPU I get a tensor with NaNs:
import torch
from transformers import AutoTokenizer, ModernBertModel

# Reproduction, GPU variant: identical to the CPU script except that CUDA is
# selected when available (falling back to the CPU otherwise).
model_id = "answerdotai/ModernBERT-base"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = ModernBertModel.from_pretrained(model_id)
model.to(device)

sentences = [
    "The capital of France is Paris.",
    "The capital of Germany is Berlin.",
]

# Tokenize both sentences with padding="max_length" and max_length=768,
# requesting PyTorch tensors and an explicit attention mask.
encoded = tokenizer(
    text=sentences,
    add_special_tokens=True,
    padding="max_length",
    truncation=True,
    max_length=768,
    return_attention_mask=True,
    return_tensors="pt",
)

# Move the model inputs onto the same device as the model.
input_ids = encoded["input_ids"].to(device)
attention_mask = encoded["attention_mask"].to(device)

hidden_states = model(
    input_ids=input_ids,
    attention_mask=attention_mask,
).last_hidden_state
print(hidden_states)
Output:
tensor([[[nan, nan, nan, ..., nan, nan, nan],
[nan, nan, nan, ..., nan, nan, nan],
[nan, nan, nan, ..., nan, nan, nan],
...,
[nan, nan, nan, ..., nan, nan, nan],
[nan, nan, nan, ..., nan, nan, nan],
[nan, nan, nan, ..., nan, nan, nan]],
[[nan, nan, nan, ..., nan, nan, nan],
[nan, nan, nan, ..., nan, nan, nan],
[nan, nan, nan, ..., nan, nan, nan],
...,
[nan, nan, nan, ..., nan, nan, nan],
[nan, nan, nan, ..., nan, nan, nan],
[nan, nan, nan, ..., nan, nan, nan]]], device='cuda:0',
grad_fn=<NativeLayerNormBackward0>)
Do you have an idea what the problem might be?
Do you have flash-attn installed? The second snippet worked for me after installing it.