def test_eos_pad():
    """Smoke-test GPT-2 tokenization with fixed-length padding.

    GPT-2's tokenizer ships without a pad token, so ``padding="max_length"``
    fails unless one is assigned; the standard workaround exercised here is
    to reuse the EOS token as the pad token, then tokenize a tiny batch and
    print the resulting token/ids for manual inspection.

    Side effects: downloads the ``gpt2`` tokenizer and model from the
    HuggingFace hub and prints diagnostics to stdout. Returns None.
    """
    import torch
    from transformers import GPT2Tokenizer, GPT2LMHeadModel

    raw_text_batch = 'a'
    tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
    # GPT-2 defines no pad token by default; without this, padding below raises.
    if tokenizer.pad_token_id is None:
        tokenizer.pad_token = tokenizer.eos_token

    probe_network = GPT2LMHeadModel.from_pretrained("gpt2")
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    probe_network = probe_network.to(device)

    print(f'{tokenizer.eos_token=}')
    print(f'{tokenizer.eos_token_id=}')
    print(f'{tokenizer.pad_token=}')
    print(f'{tokenizer.pad_token_id=}')
    print(f'{raw_text_batch=}')
    tokenize_batch = tokenizer(raw_text_batch, padding="max_length", max_length=5, truncation=True, return_tensors="pt")
    print(f'{tokenize_batch=}')
    print('Done')