import json

from torch.utils.data import Dataset
from transformers import (
    DataCollatorForLanguageModeling,
    GPT2LMHeadModel,
    GPT2Tokenizer,
    Trainer,
    TrainingArguments,
)

# Step 1: Load and Preprocess Data
class SpiderDataset(Dataset):
    def __init__(self, file_paths, tokenizer, max_length=128):
        self.data = []
        self.tokenizer = tokenizer
        self.max_length = max_length

        for file_path in file_paths:
            with open(file_path, 'r') as f:
                self.data.extend(json.load(f))
        
    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]
        question = item['question']
        sql_query = item['query']

        # For a decoder-only (causal) model, train on a single
        # "question -> SQL" sequence. The data collator below builds the
        # labels from these input_ids, so a separate label encoding is
        # not needed (it would be overwritten by the collator anyway).
        text = f"Question: {question}\nSQL: {sql_query}"

        encoding = self.tokenizer(
            text,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )

        return {
            "input_ids": encoding['input_ids'].squeeze(0),
            "attention_mask": encoding['attention_mask'].squeeze(0)
        }

# Step 2: Initialize Tokenizer and Model
tokenizer = GPT2Tokenizer.from_pretrained("distilgpt2")
tokenizer.pad_token = tokenizer.eos_token  # GPT-2 has no pad token; reuse EOS for padding

# Load model with language model head
model = GPT2LMHeadModel.from_pretrained("distilgpt2")

# Step 3: Load Datasets
# Spider-format JSON files; adjust the paths to wherever your copy lives.
# The dev split is held out for evaluation rather than folded into training,
# so that evaluation_strategy="epoch" below has something to evaluate on.
train_files = [
    "text2sql_pepe/train_spider.json",
    "text2sql_pepe/train_others.json",
    "text2sql_pepe/test.json",
]
eval_files = ["text2sql_pepe/dev.json"]

train_dataset = SpiderDataset(train_files, tokenizer)
eval_dataset = SpiderDataset(eval_files, tokenizer)

# Step 4: Define Training Arguments
training_args = TrainingArguments(
    output_dir="./distilgpt2-sql-converter",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    save_total_limit=2,
)

# Step 5: Initialize Trainer with Data Collator
# mlm=False selects causal language modeling: the collator copies input_ids
# into labels and masks padding positions with -100.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator,
)

# Step 6: Train the Model
trainer.train()

# Step 7: Save the Model and Tokenizer
model.save_pretrained("./distilgpt2-sql-converter")
tokenizer.save_pretrained("./distilgpt2-sql-converter")
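
# ----------------------------------------------------------------------
# Optional inference check (a minimal sketch, not part of the original
# script). It reuses the in-memory fine-tuned model and the same
# "Question: ...\nSQL:" prompt format used for training; the example
# question below is made up for illustration only.
# ----------------------------------------------------------------------
model.eval()
prompt = "Question: How many singers do we have?\nSQL:"
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
output_ids = model.generate(
    **inputs,
    max_new_tokens=64,
    do_sample=False,                      # greedy decoding for a repeatable check
    pad_token_id=tokenizer.eos_token_id,  # avoid the missing-pad-token warning
)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))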