import json

from torch.utils.data import Dataset
from transformers import (
    DataCollatorForLanguageModeling,
    GPT2LMHeadModel,
    GPT2Tokenizer,
    Trainer,
    TrainingArguments,
)


# Step 1: Load and Preprocess Data
class SpiderDataset(Dataset):
    """Spider text-to-SQL pairs formatted for causal language modeling."""

    def __init__(self, file_paths, tokenizer, max_length=128):
        self.data = []
        self.tokenizer = tokenizer
        self.max_length = max_length
        for file_path in file_paths:
            with open(file_path, "r") as f:
                self.data.extend(json.load(f))

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]
        question = item["question"]
        sql_query = item["query"]

        # GPT-2 is decoder-only, so labels must align token-for-token with
        # input_ids. Tokenizing the question and the SQL query separately
        # (question as input_ids, query as labels) would train the model on a
        # misaligned pairing. Instead, concatenate both into one sequence with
        # a simple prompt template and let the data collator derive labels.
        text = f"Question: {question}\nSQL: {sql_query}{self.tokenizer.eos_token}"

        encoding = self.tokenizer(
            text,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )
        return {
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
        }


# Step 2: Initialize Tokenizer and Model
tokenizer = GPT2Tokenizer.from_pretrained("distilgpt2")
tokenizer.pad_token = tokenizer.eos_token  # GPT-2 ships without a pad token

# Load the model with a language-modeling head
model = GPT2LMHeadModel.from_pretrained("distilgpt2")

# Step 3: Load Datasets
# Assuming the Spider JSON files live in a directory called `space/dataset`.
# Train only on the training splits; hold out dev.json for evaluation and keep
# test.json out of training entirely, so evaluation numbers stay meaningful.
train_files = [
    "space/dataset/train_spider.json",
    "space/dataset/train_others.json",
]
eval_files = ["space/dataset/dev.json"]

train_dataset = SpiderDataset(train_files, tokenizer)
eval_dataset = SpiderDataset(eval_files, tokenizer)

# Step 4: Define Training Arguments
training_args = TrainingArguments(
    output_dir="./distilgpt2-sql-converter",
    evaluation_strategy="epoch",  # renamed to `eval_strategy` in newer transformers releases
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    save_total_limit=2,
)

# Step 5: Initialize Trainer with Data Collator
# With mlm=False the collator copies input_ids into labels and masks pad
# positions with -100, which is what causal LM fine-tuning needs. One known
# trade-off: because pad_token == eos_token, the trailing EOS is masked too,
# so the model is not explicitly trained to stop generating.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,  # required when evaluation_strategy="epoch"
    data_collator=data_collator,
)

# Step 6: Train the Model
trainer.train()

# Step 7: Save the Model and Tokenizer
model.save_pretrained("./distilgpt2-sql-converter")
tokenizer.save_pretrained("./distilgpt2-sql-converter")
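
# Step 8 (optional): Quick Inference Check
# A minimal sketch of querying the fine-tuned checkpoint to verify the save
# round-trip. The prompt mirrors the "Question: ... SQL:" template used during
# training above; the sample question and the generation settings
# (greedy decoding, max_new_tokens=64) are illustrative assumptions, not
# tuned values.
def generate_sql(question, model_dir="./distilgpt2-sql-converter"):
    tok = GPT2Tokenizer.from_pretrained(model_dir)
    lm = GPT2LMHeadModel.from_pretrained(model_dir)
    lm.eval()

    prompt = f"Question: {question}\nSQL:"
    inputs = tok(prompt, return_tensors="pt")
    outputs = lm.generate(
        **inputs,
        max_new_tokens=64,
        pad_token_id=tok.eos_token_id,  # silences the missing-pad-token warning
    )
    # Strip the prompt tokens and return only the generated continuation.
    generated = outputs[0][inputs["input_ids"].shape[1]:]
    return tok.decode(generated, skip_special_tokens=True).strip()


print(generate_sql("How many singers do we have?"))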