"""Fine-tune ``google/t5-efficient-tiny`` on the lines of a plain-text file.

Each non-empty line of ``data.txt`` is used as both the input and the target
sequence. The fine-tuned model and tokenizer are written to
``./fine-tuned-t5-efficient-tiny``.
"""

import torch
from datasets import Dataset
from transformers import (
    T5ForConditionalGeneration,
    T5Tokenizer,
    Trainer,
    TrainingArguments,
)


# Load and preprocess data
def load_and_preprocess_data(file_path):
    """Read a text file and return a ``Dataset`` with one ``text`` row per line.

    Args:
        file_path: Path of the text file to read (decoded as UTF-8).

    Returns:
        A ``datasets.Dataset`` with a single ``text`` column holding every
        non-blank line, stripped of surrounding whitespace.
    """
    # Explicit encoding so the result does not depend on the platform locale.
    with open(file_path, 'r', encoding='utf-8') as f:
        stripped_lines = (line.strip() for line in f)
        texts = [line for line in stripped_lines if line]
    return Dataset.from_dict({'text': texts})


# Define preprocessing function for tokenization
def tokenize_function(examples, tokenizer, max_length=256):
    """Tokenize a batch of examples and attach labels for seq2seq training.

    Args:
        examples: Batch mapping with a ``text`` key holding a list of strings.
        tokenizer: Tokenizer used to encode the text.
        max_length: Length every sequence is truncated/padded to.

    Returns:
        The tokenizer output with an added ``labels`` entry: a copy of
        ``input_ids`` in which padding positions are replaced by ``-100``.
    """
    inputs = tokenizer(
        examples['text'],
        truncation=True,
        padding='max_length',
        max_length=max_length,
    )
    # Inputs and targets are the same text, so one tokenizer pass suffices.
    # Padding positions are set to -100 so the loss ignores them; otherwise
    # the model is trained to predict pad tokens for most of each sequence.
    pad_id = tokenizer.pad_token_id
    inputs['labels'] = [
        [-100 if token_id == pad_id else token_id for token_id in ids]
        for ids in inputs['input_ids']
    ]
    return inputs


# Load dataset and tokenizer
data_file = 'data.txt'
dataset = load_and_preprocess_data(data_file)
tokenizer = T5Tokenizer.from_pretrained('google/t5-efficient-tiny')

# Tokenize dataset
tokenized_datasets = dataset.map(
    lambda batch: tokenize_function(batch, tokenizer),
    batched=True,
)

# Split dataset into training and evaluation datasets
split_datasets = tokenized_datasets.train_test_split(test_size=0.1)
train_dataset = split_datasets['train']
eval_dataset = split_datasets['test']

# Load model
model = T5ForConditionalGeneration.from_pretrained('google/t5-efficient-tiny')


# Ensure all tensors in the model are contiguous
def make_contiguous(model):
    """Replace every non-contiguous parameter tensor with a contiguous copy.

    Args:
        model: Module whose parameters are rewritten in place.
    """
    # NOTE(review): presumably needed so that saving the weights does not
    # fail on non-contiguous tensors -- confirm against the save path used.
    for param in model.parameters():
        if not param.is_contiguous():
            param.data = param.data.contiguous()


# Apply the conversion to contiguous tensors
make_contiguous(model)

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=6,
    logging_dir='./logs',
    logging_steps=10,
    save_steps=10,
    evaluation_strategy='steps',
    save_total_limit=1,
    learning_rate=8e-5,
    weight_decay=0.01,
    report_to='tensorboard',
    # Mixed precision is only usable with a CUDA device; enabling it
    # unconditionally makes the script fail on CPU-only machines.
    # NOTE(review): T5 checkpoints are reported to be numerically fragile in
    # fp16 -- if the loss turns into NaN, try bf16 or full precision.
    fp16=torch.cuda.is_available(),
    gradient_accumulation_steps=2,
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
)

# Train and fine-tune the model
trainer.train()

# Save the fine-tuned model
output_dir = './fine-tuned-t5-efficient-tiny'
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

# Also save the model state dict to avoid any issues with saving
torch.save(model.state_dict(), f'{output_dir}/pytorch_model.bin')