nileshhanotia committed on
Commit
c896cf3
1 Parent(s): 245af2f

Create app.py

Files changed (1)
  1. app.py +96 -0
app.py ADDED
@@ -0,0 +1,96 @@
import json

from torch.utils.data import Dataset
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments, DataCollatorForLanguageModeling

# Step 1: Load and Preprocess Data
class SpiderDataset(Dataset):
    def __init__(self, file_paths, tokenizer, max_length=128):
        self.data = []
        self.tokenizer = tokenizer
        self.max_length = max_length

        # Each file is a JSON array of examples; merge them into a single list.
        for file_path in file_paths:
            with open(file_path, 'r') as f:
                self.data.extend(json.load(f))

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]
        question = item['question']
        sql_query = item['query']

        # distilgpt2 is a causal (decoder-only) LM, so the question and the
        # target SQL are concatenated into one sequence; the data collator
        # below derives the labels from input_ids, and the model learns to
        # continue the question with its SQL query.
        text = f"Question: {question}\nSQL: {sql_query}"

        encoding = self.tokenizer(
            text,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )

        return {
            "input_ids": encoding['input_ids'].squeeze(0),
            "attention_mask": encoding['attention_mask'].squeeze(0)
        }

# Step 2: Initialize Tokenizer and Model
tokenizer = GPT2Tokenizer.from_pretrained("distilgpt2")
tokenizer.pad_token = tokenizer.eos_token  # GPT-2 has no pad token; reuse EOS

# Load model with language model head
model = GPT2LMHeadModel.from_pretrained("distilgpt2")
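
# Each record in the Spider JSON files is a dict; the fields used in
# __getitem__ above are assumed to look roughly like this (abridged):
#   {"db_id": "concert_singer",
#    "question": "How many singers do we have?",
#    "query": "SELECT count(*) FROM singer"}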

# Step 3: Load Datasets
# Assuming the files are in a directory called `space/dataset`
file_paths = [
    "space/dataset/train_others.json",
    "space/dataset/dev.json",
    "space/dataset/train_spider.json",
    "space/dataset/test.json"
]
train_dataset = SpiderDataset(file_paths, tokenizer)

# Step 4: Define Training Arguments
# Per-epoch evaluation is left disabled: no eval_dataset is passed to the
# Trainer below, and setting evaluation_strategy="epoch" without one raises
# a ValueError when the Trainer is constructed.
training_args = TrainingArguments(
    output_dir="./distilgpt2-sql-converter",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    save_total_limit=2,
)

# Step 5: Initialize Trainer with Data Collator
# With mlm=False the collator does causal-LM batching: it copies input_ids
# into labels and replaces padding positions with -100 so they are ignored
# by the loss.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    data_collator=data_collator,
)

# Step 6: Train the Model
trainer.train()

# Step 7: Save the Model and Tokenizer
model.save_pretrained("./distilgpt2-sql-converter")
tokenizer.save_pretrained("./distilgpt2-sql-converter")
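
For reference (not part of the committed app.py): a minimal sketch of how the saved checkpoint could be queried, assuming the "Question: ... / SQL:" prompt format used during training above. The example question, checkpoint path, and generation settings are illustrative only.

from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Load the fine-tuned checkpoint from the output_dir used above.
model = GPT2LMHeadModel.from_pretrained("./distilgpt2-sql-converter")
tokenizer = GPT2Tokenizer.from_pretrained("./distilgpt2-sql-converter")
model.eval()

# Prompt in the same format the training examples were built with.
prompt = "Question: How many singers do we have?\nSQL:"
inputs = tokenizer(prompt, return_tensors="pt")

# Greedy decoding with a fixed budget of new tokens.
output_ids = model.generate(
    **inputs,
    max_new_tokens=64,
    pad_token_id=tokenizer.eos_token_id,
)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))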