ysn-rfd committed on
Commit
d994c11
1 Parent(s): ecb5917

Upload 3 files

tiny-gpt2-fine-tuning/chat.py ADDED
@@ -0,0 +1,41 @@
+ from transformers import GPT2Tokenizer, GPT2LMHeadModel
+ import torch
+
+ # Load the fine-tuned model and tokenizer
+ model = GPT2LMHeadModel.from_pretrained("./fine-tuned-gpt2")
+ tokenizer = GPT2Tokenizer.from_pretrained("./fine-tuned-gpt2")
+
+ # Function to generate a response
+ def generate_response(prompt, max_length=100, temperature=0.7, top_k=50, top_p=0.95):
+     # Encode the input prompt
+     input_ids = tokenizer.encode(prompt, return_tensors='pt')
+
+     # Create an attention mask
+     attention_mask = (input_ids != tokenizer.pad_token_id).long()
+
+     with torch.no_grad():
+         output = model.generate(
+             input_ids,
+             attention_mask=attention_mask,       # Add attention mask
+             max_length=max_length,               # Adjust as needed
+             temperature=temperature,             # Sampling temperature
+             top_k=top_k,                         # Top-k sampling
+             top_p=top_p,                         # Top-p (nucleus) sampling
+             do_sample=True,                      # Enable sampling
+             pad_token_id=tokenizer.eos_token_id  # Handle padding token
+         )
+
+     # Decode the output
+     response = tokenizer.decode(output[0], skip_special_tokens=True)
+     return response
+
+ # Chat with the model
+ if __name__ == "__main__":
+     print("Chat with the model (type 'exit' to quit)")
+     while True:
+         prompt = input("You: ")
+         if prompt.lower() in ["exit", "quit", "bye"]:
+             print("Goodbye!")
+             break
+         response = generate_response(prompt)
+         print("Bot:", response)
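A quick way to smoke-test the checkpoint without the interactive loop is to call generate_response directly. This is only a sketch: it assumes you run it from inside the tiny-gpt2-fine-tuning directory, that chat.py is importable from there, and that the ./fine-tuned-gpt2 checkpoint produced by one of the fine-tuning scripts already exists; the prompt text is arbitrary.

import chat  # loads the model and tokenizer at import time; the __main__ guard skips the chat loop

# One-off generation; parameters mirror the defaults defined in chat.py
reply = chat.generate_response("Hello, how are you?", max_length=50, temperature=0.7)
print(reply)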
tiny-gpt2-fine-tuning/fine-tuning.py ADDED
@@ -0,0 +1,71 @@
+ from transformers import GPT2Tokenizer, GPT2LMHeadModel, TrainingArguments, Trainer
+ from datasets import Dataset
+ import torch
+
+ # Load the pre-trained tokenizer and model
+ model_name = "sshleifer/tiny-gpt2"
+ tokenizer = GPT2Tokenizer.from_pretrained(model_name)
+ model = GPT2LMHeadModel.from_pretrained(model_name)
+
+ # Set the pad token to be the same as the eos token
+ tokenizer.pad_token = tokenizer.eos_token
+
+ # Function to read the text file and return a list of texts
+ def read_text_file(file_path):
+     with open(file_path, "r") as file:
+         text = file.read()
+     return text.splitlines()
+
+ # Load and process your data
+ file_path = "data.txt"  # Replace with the path to your data.txt
+ texts = read_text_file(file_path)
+
+ # Create a Dataset object
+ dataset = Dataset.from_dict({"text": texts})
+
+ # Split the dataset into training and evaluation sets
+ train_testdatasets = dataset.train_test_split(test_size=0.1)
+ train_dataset = train_testdatasets["train"]
+ eval_dataset = train_testdatasets["test"]
+
+ # Tokenize the dataset and prepare labels
+ def tokenize_function(examples):
+     encoding = tokenizer(examples["text"], padding="max_length", truncation=True, max_length=128)
+     encoding["labels"] = encoding["input_ids"].copy()  # Labels are the same as input_ids
+     return encoding
+
+ tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True, remove_columns=["text"])
+ tokenized_eval_dataset = eval_dataset.map(tokenize_function, batched=True, remove_columns=["text"])
+
+ # Set up training arguments
+ training_args = TrainingArguments(
+     output_dir="./results",         # Output directory for model predictions and checkpoints
+     overwrite_output_dir=True,      # Overwrite the content of the output directory
+     num_train_epochs=5,             # Number of training epochs
+     per_device_train_batch_size=2,  # Batch size per GPU/TPU core/CPU for training
+     save_steps=10_000,              # Save a checkpoint every 10,000 steps
+     save_total_limit=2,             # Keep only the two most recent checkpoints
+     logging_dir='./logs',           # Directory for storing logs
+     logging_steps=500,              # Log every 500 steps
+     eval_strategy="epoch",          # Evaluate at the end of each epoch
+     learning_rate=0.01,             # Learning rate
+ )
+
+ # Initialize the Trainer
+ trainer = Trainer(
+     model=model,                            # The pre-trained model to be fine-tuned
+     args=training_args,                     # Training arguments
+     train_dataset=tokenized_train_dataset,  # Tokenized training dataset
+     eval_dataset=tokenized_eval_dataset,    # Tokenized evaluation dataset
+ )
+
+ # Train the model
+ trainer.train()
+
+ # Save the fine-tuned model and tokenizer
+ model.save_pretrained("./fine-tuned-gpt2")
+ tokenizer.save_pretrained("./fine-tuned-gpt2")
+
+ # Evaluate the model
+ results = trainer.evaluate()
+ print(results)
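The script expects a data.txt next to it, and read_text_file() treats every line of that file as one training example. A minimal sketch of how such a file could be produced; the sample sentences below are purely illustrative, not part of the repository:

# Hypothetical helper: write a toy data.txt with one training example per line,
# matching what read_text_file() in fine-tuning.py expects.
samples = [
    "Hello, how can I help you today?",
    "The quick brown fox jumps over the lazy dog.",
    "Fine-tuning tiny-gpt2 is mainly useful for testing the training pipeline.",
]
with open("data.txt", "w") as f:
    f.write("\n".join(samples))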
tiny-gpt2-fine-tuning/fine-tuning2.py ADDED
@@ -0,0 +1,111 @@
+ from transformers import GPT2Tokenizer, GPT2LMHeadModel, TrainingArguments, Trainer, DataCollatorForLanguageModeling
+ from datasets import Dataset
+ import torch
+ import numpy as np
+ from sklearn.model_selection import train_test_split
+ from transformers import EarlyStoppingCallback
+ import os
+
+ # Load the tokenizer and model dynamically
+ model_name = "sshleifer/tiny-gpt2"  # Replace with any GPT-2 model (e.g., "gpt2" or "gpt2-medium")
+ tokenizer = GPT2Tokenizer.from_pretrained(model_name)
+ model = GPT2LMHeadModel.from_pretrained(model_name)
+
+ # Ensure the tokenizer has a pad token
+ if tokenizer.pad_token is None:
+     tokenizer.pad_token = tokenizer.eos_token
+
+ # Function to read and process the text file into a list of texts
+ def read_text_file(file_path):
+     with open(file_path, "r") as file:
+         text = file.read()
+     return text.splitlines()
+
+ # Load and process your data
+ file_path = "data.txt"  # Replace with the path to your data.txt
+ texts = read_text_file(file_path)
+
+ # Shuffle and split the dataset more robustly
+ train_texts, eval_texts = train_test_split(texts, test_size=0.1, random_state=42)
+ train_dataset = Dataset.from_dict({"text": train_texts})
+ eval_dataset = Dataset.from_dict({"text": eval_texts})
+
+ # Tokenize the dataset and prepare labels
+ def tokenize_function(examples):
+     encoding = tokenizer(examples["text"], padding="max_length", truncation=True, max_length=332)
+     encoding["labels"] = encoding["input_ids"].copy()  # Labels are the same as input_ids
+     return encoding
+
+ # Tokenize datasets
+ tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True, remove_columns=["text"])
+ tokenized_eval_dataset = eval_dataset.map(tokenize_function, batched=True, remove_columns=["text"])
+
+ # Data collator for language modeling to handle dynamic padding
+ data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
+
+ # Set up advanced training arguments
+ training_args = TrainingArguments(
+     output_dir="./results",          # Output directory for model predictions and checkpoints
+     overwrite_output_dir=True,       # Overwrite the content of the output directory
+     num_train_epochs=10,             # Number of training epochs
+     per_device_train_batch_size=2,   # Batch size per GPU/TPU core/CPU for training
+     per_device_eval_batch_size=2,    # Batch size per GPU/TPU core/CPU for evaluation
+     save_steps=2_000,                # Save a checkpoint every 2,000 steps
+     save_total_limit=2,              # Keep only the two most recent checkpoints
+     logging_dir='./logs',            # Directory for storing logs
+     logging_steps=10,                # Log every 10 steps
+     eval_strategy="steps",           # Evaluate every eval_steps steps (defaults to logging_steps)
+     learning_rate=0.01,              # Learning rate
+     weight_decay=0.01,               # Weight decay to prevent overfitting
+     warmup_steps=64,                 # Number of learning-rate warmup steps
+     fp16=True,                       # Use 16-bit mixed precision (requires a GPU)
+     load_best_model_at_end=True,     # Load the best model when training is finished
+     metric_for_best_model="loss",    # Metric used to compare checkpoints
+     greater_is_better=False,         # False because we are minimizing loss
+     disable_tqdm=False,              # Show progress bar
+ )
+
+ # Early stopping to prevent overfitting
+ early_stopping = EarlyStoppingCallback(early_stopping_patience=3, early_stopping_threshold=0.01)
+
+ # Custom evaluation metrics (e.g., perplexity)
+ def compute_metrics(eval_pred):
+     logits, labels = eval_pred
+     # Convert numpy arrays to PyTorch tensors
+     logits = torch.tensor(logits)
+     labels = torch.tensor(labels)
+
+     # Shift the logits and labels for loss calculation
+     shift_logits = logits[..., :-1, :].contiguous()
+     shift_labels = labels[..., 1:].contiguous()
+
+     # Calculate perplexity
+     loss_fct = torch.nn.CrossEntropyLoss(ignore_index=-100)
+     loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
+     perplexity = torch.exp(loss)
+
+     return {"perplexity": perplexity.item()}
+
+ # Initialize the Trainer with advanced settings
+ trainer = Trainer(
+     model=model,                            # The pre-trained model to be fine-tuned
+     args=training_args,                     # Training arguments
+     train_dataset=tokenized_train_dataset,  # Tokenized training dataset
+     eval_dataset=tokenized_eval_dataset,    # Tokenized evaluation dataset
+     data_collator=data_collator,            # Data collator for dynamic padding
+     compute_metrics=compute_metrics,        # Custom evaluation metrics
+     callbacks=[early_stopping],             # Early stopping callback
+ )
+
+ # Train the model
+ trainer.train()
+
+ # Save the fine-tuned model and tokenizer
+ model_output_dir = "./fine-tuned-gpt2"
+ os.makedirs(model_output_dir, exist_ok=True)
+ model.save_pretrained(model_output_dir)
+ tokenizer.save_pretrained(model_output_dir)
+
+ # Evaluate the model
+ results = trainer.evaluate()
+ print(f"Evaluation Results: {results}")
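Note that fp16=True makes the script GPU-only; the Trainer rejects fp16 mixed precision when no accelerator is available. A minimal sketch of guarding this so the same script also runs on CPU (only the two arguments shown change; the rest of the TrainingArguments would stay as in fine-tuning2.py):

import torch
from transformers import TrainingArguments

# Enable fp16 only when a CUDA device is present; otherwise fall back to full precision.
use_fp16 = torch.cuda.is_available()
training_args = TrainingArguments(
    output_dir="./results",
    fp16=use_fp16,
    # ... remaining arguments as in fine-tuning2.py ...
)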