Upload 3 files
Browse files
@@ -0,0 +1,41 @@
1 |
from transformers import GPT2Tokenizer, GPT2LMHeadModel
2 |
import torch
3 |
4 |
# Restore the fine-tuned tokenizer and model from the local checkpoint directory
tokenizer = GPT2Tokenizer.from_pretrained("./fine-tuned-gpt2")
model = GPT2LMHeadModel.from_pretrained("./fine-tuned-gpt2")
7 |
8 |
# Function to generate a response
9 |
def generate_response(prompt, max_length=100, temperature=0.7, top_k=50, top_p=0.95):
    """Sample a continuation of ``prompt`` from the fine-tuned model.

    Args:
        prompt: Text to condition the generation on.
        max_length: Value forwarded to ``model.generate`` as ``max_length``
            (an upper bound on the total sequence length, prompt included).
        temperature: Sampling temperature forwarded to ``model.generate``.
        top_k: Top-k sampling cutoff forwarded to ``model.generate``.
        top_p: Nucleus (top-p) sampling cutoff forwarded to ``model.generate``.

    Returns:
        The decoded text of the first generated sequence with special tokens
        removed, or an empty string when ``prompt`` is empty.
    """
    # An empty prompt tokenizes to zero tokens, which generate() cannot extend.
    if not prompt:
        return ""

    # Let the tokenizer build the attention mask. Deriving it from
    # `input_ids != tokenizer.pad_token_id` breaks when no pad token is
    # configured (the comparison against None does not yield a tensor).
    encoded = tokenizer(prompt, return_tensors="pt")
    input_ids = encoded["input_ids"]
    attention_mask = encoded["attention_mask"]

    with torch.no_grad():
        output = model.generate(
            input_ids,                            # Prompt tokens to continue
            attention_mask=attention_mask,        # Marks every prompt token as real input
            max_length=max_length,                # Adjust as needed
            temperature=temperature,              # Sampling temperature
            top_k=top_k,                          # Top-k sampling
            top_p=top_p,                          # Top-p (nucleus) sampling
            do_sample=True,                       # Enable sampling
            pad_token_id=tokenizer.eos_token_id,  # Use EOS as the padding token during generation
        )

    # Decode the first (and only) returned sequence
    response = tokenizer.decode(output[0], skip_special_tokens=True)
    return response
31 |
32 |
# Chat with the model: read prompts from stdin until the user types an exit word
if __name__ == "__main__":
    print("Chat with the model (type 'exit' to quit)")
    while True:
        prompt = input("You: ")
        # Leave the loop on any of the exit keywords (case-insensitive);
        # without this the conversation could never be ended.
        if prompt.lower() in ["exit", "quit", "bye"]:
            break

        response = generate_response(prompt)
        print("Bot:", response)
@@ -0,0 +1,71 @@
1 |
from transformers import GPT2Tokenizer, GPT2LMHeadModel, TrainingArguments, Trainer
2 |
from datasets import Dataset
3 |
import torch
4 |
5 |
# Base checkpoint to fine-tune; the tokenizer is loaded first so padding can be configured
model_name = "sshleifer/tiny-gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

# Reuse the end-of-sequence token as the padding token
tokenizer.pad_token = tokenizer.eos_token

# Load the language-model weights for the same checkpoint
model = GPT2LMHeadModel.from_pretrained(model_name)
12 |
13 |
# Function to read the text file and return a list of texts
14 |
def read_text_file(file_path):
    """Read a text file and return its non-blank lines.

    Args:
        file_path: Path of the text file to read.

    Returns:
        A list with one string per line, in file order and without line
        terminators. Lines that are empty or contain only whitespace are
        skipped so they do not become empty training examples.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    with open(file_path, "r", encoding="utf-8") as file:
        text = file.read()
    return [line for line in text.splitlines() if line.strip()]
18 |
19 |
# Load and process your data
file_path = "data.txt"  # Replace with the path to your data.txt
texts = read_text_file(file_path)

# Create a Dataset object with a single "text" column
dataset = Dataset.from_dict({"text": texts})

# Split the dataset into training and evaluation sets (90% / 10%).
# A fixed seed keeps the split identical between runs so that evaluation
# results are comparable.
train_testdatasets = dataset.train_test_split(test_size=0.1, seed=42)
train_dataset = train_testdatasets["train"]
eval_dataset = train_testdatasets["test"]
30 |
31 |
# Tokenize the dataset and prepare labels
32 |
def tokenize_function(examples):
    """Tokenize a batch of texts and attach causal-LM labels.

    Args:
        examples: A batch from ``Dataset.map(..., batched=True)``; its
            ``"text"`` entry is passed to the tokenizer.

    Returns:
        The tokenizer output (padded/truncated to 128 tokens) with an added
        ``"labels"`` entry. Labels equal ``input_ids`` at real-token positions
        and ``-100`` at padded positions, so padding is ignored by the loss.
    """
    encoding = tokenizer(examples["text"], padding="max_length", truncation=True, max_length=128)
    # Copying input_ids verbatim would train the model to predict padding
    # tokens; use the attention mask to blank those positions out instead.
    encoding["labels"] = [
        [token if keep else -100 for token, keep in zip(ids, mask)]
        for ids, mask in zip(encoding["input_ids"], encoding["attention_mask"])
    ]
    return encoding
36 |
37 |
# Tokenize both splits, dropping the raw "text" column afterwards
tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True, remove_columns=["text"])
tokenized_eval_dataset = eval_dataset.map(tokenize_function, batched=True, remove_columns=["text"])

# Set up training arguments
training_args = TrainingArguments(
    output_dir="./results",          # Output directory for model predictions and checkpoints
    overwrite_output_dir=True,       # Overwrite the content of the output directory
    num_train_epochs=5,              # Number of training epochs
    per_device_train_batch_size=2,   # Batch size per GPU/TPU core/CPU for training
    save_steps=10_000,               # Save checkpoint every 10,000 steps
    save_total_limit=2,              # Limit the total amount of checkpoints, delete the older ones
    logging_dir='./logs',            # Directory for storing logs
    logging_steps=500,               # Log every 500 steps
    eval_strategy="epoch",           # Evaluate at the end of each epoch
    learning_rate=5e-5,              # Small step size for fine-tuning; 0.01 is far too large for AdamW here
)

# Initialize the Trainer
trainer = Trainer(
    model=model,                           # The pre-trained model to be fine-tuned
    args=training_args,                    # Training arguments
    train_dataset=tokenized_train_dataset, # Tokenized training dataset
    eval_dataset=tokenized_eval_dataset,   # Required by eval_strategy="epoch" and trainer.evaluate()
)

# Train the model
trainer.train()

# Save the fine-tuned model and tokenizer where the chat script loads them from
model.save_pretrained("./fine-tuned-gpt2")
tokenizer.save_pretrained("./fine-tuned-gpt2")

# Evaluate the model
results = trainer.evaluate()
print(f"Evaluation Results: {results}")
71 |
@@ -0,0 +1,111 @@
1 |
from transformers import GPT2Tokenizer, GPT2LMHeadModel, TrainingArguments, Trainer, DataCollatorForLanguageModeling
2 |
from datasets import Dataset
3 |
import torch
4 |
import numpy as np
5 |
from sklearn.model_selection import train_test_split
6 |
from transformers import EarlyStoppingCallback
7 |
import os
8 |
9 |
# Base checkpoint to fine-tune; any GPT-2 model name can be used (e.g., "gpt2" or "gpt2-medium")
model_name = "sshleifer/tiny-gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

# Fall back to the end-of-sequence token when the tokenizer defines no padding token
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Load the language-model weights for the same checkpoint
model = GPT2LMHeadModel.from_pretrained(model_name)
17 |
18 |
# Function to read and process the text file into a list of texts
19 |
def read_text_file(file_path):
    """Read a text file and return its non-blank lines.

    Args:
        file_path: Path of the text file to read.

    Returns:
        A list with one string per line, in file order and without line
        terminators. Lines that are empty or contain only whitespace are
        skipped so they do not become empty training examples.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    with open(file_path, "r", encoding="utf-8") as file:
        text = file.read()
    return [line for line in text.splitlines() if line.strip()]
23 |
24 |
# Read the raw training lines from disk
file_path = "data.txt"  # Replace with the path to your data.txt
texts = read_text_file(file_path)

# Shuffle and hold out 10% of the lines for evaluation (fixed seed for a repeatable split)
train_texts, eval_texts = train_test_split(texts, random_state=42, test_size=0.1)

# Wrap each split in a Dataset with a single "text" column
train_dataset, eval_dataset = (
    Dataset.from_dict({"text": subset}) for subset in (train_texts, eval_texts)
)
32 |
33 |
# Tokenize the dataset and prepare labels
34 |
def tokenize_function(examples):
    """Tokenize a batch of texts and add a ``labels`` entry mirroring ``input_ids``.

    Args:
        examples: A batch from ``Dataset.map(..., batched=True)``; its
            ``"text"`` entry is passed to the tokenizer.

    Returns:
        The tokenizer output, padded/truncated to 332 tokens, with
        ``"labels"`` set to a copy of ``"input_ids"``.
    """
    batch = tokenizer(
        examples["text"],
        truncation=True,
        max_length=332,
        padding="max_length",
    )
    # For causal language modeling the targets are the inputs themselves
    batch["labels"] = batch["input_ids"].copy()
    return batch
38 |
39 |
# Tokenize datasets, dropping the raw "text" column afterwards
tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True, remove_columns=["text"])
tokenized_eval_dataset = eval_dataset.map(tokenize_function, batched=True, remove_columns=["text"])

# Data collator for causal language modeling (mlm=False). The examples are
# already padded to a fixed length by tokenize_function, so the collator's job
# here is batching and building the labels (pad-token positions are set to
# -100 per the Transformers documentation).
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Set up advanced training arguments
training_args = TrainingArguments(
    output_dir="./results",            # Output directory for model predictions and checkpoints
    overwrite_output_dir=True,         # Overwrite the content of the output directory
    num_train_epochs=10,               # Number of training epochs
    per_device_train_batch_size=2,     # Batch size per GPU/TPU core/CPU for training
    per_device_eval_batch_size=2,      # Batch size per GPU/TPU core/CPU for evaluation
    save_steps=2_000,                  # Save checkpoint every 2,000 steps
    save_total_limit=2,                # Limit the total amount of checkpoints, delete the older ones
    logging_dir='./logs',              # Directory for storing logs
    logging_steps=10,                  # Log every 10 steps
    eval_strategy="steps",             # Evaluate every `eval_steps` steps (defaults to logging_steps when unset)
    learning_rate=5e-5,                # Low learning rate for fine-tuning (0.01 is far too large for AdamW here)
    weight_decay=0.01,                 # Weight decay to prevent overfitting
    warmup_steps=64,                   # Number of steps to perform learning rate warmup
    fp16=torch.cuda.is_available(),    # Use 16-bit precision only when a CUDA GPU is present; fp16=True fails on CPU
    load_best_model_at_end=True,       # Load the best model when training is finished
    metric_for_best_model="loss",      # Metric to use to compare the best model
    greater_is_better=False,           # Set to False as we're minimizing loss
    disable_tqdm=False,                # Show progress bar
)

# Early stopping to prevent overfitting: stop after 3 evaluations without an
# improvement of at least 0.01 in the monitored metric
early_stopping = EarlyStoppingCallback(early_stopping_patience=3, early_stopping_threshold=0.01)
70 |
71 |
# Custom evaluation metrics (e.g., perplexity)
72 |
def compute_metrics(eval_pred):
    """Compute perplexity from evaluation logits and labels.

    Args:
        eval_pred: A ``(logits, labels)`` pair. The code indexes ``logits`` as
            ``[..., sequence, vocab]`` and ``labels`` as ``[..., sequence]``;
            label value ``-100`` marks positions excluded from the loss.

    Returns:
        A dict ``{"perplexity": float}`` holding the exponential of the mean
        next-token cross-entropy over the non-ignored positions.
    """
    logits, labels = eval_pred

    # Convert without an extra copy, and compute in float32/int64: logits may
    # arrive in half precision, which is too coarse for a softmax over a
    # large vocabulary and not supported by every CPU kernel.
    logits = torch.as_tensor(logits).float()
    labels = torch.as_tensor(labels).long()

    # Shift so that the logits at position t are scored against token t + 1
    shift_logits = logits[..., :-1, :].contiguous()
    shift_labels = labels[..., 1:].contiguous()

    # Mean cross-entropy over all positions whose label is not -100
    loss = torch.nn.functional.cross_entropy(
        shift_logits.view(-1, shift_logits.size(-1)),
        shift_labels.view(-1),
        ignore_index=-100,
    )
    perplexity = torch.exp(loss)

    return {"perplexity": perplexity.item()}
88 |
89 |
# Initialize the Trainer with advanced settings
trainer = Trainer(
    model=model,                           # The pre-trained model to be fine-tuned
    args=training_args,                    # Training arguments
    train_dataset=tokenized_train_dataset, # Tokenized training dataset
    eval_dataset=tokenized_eval_dataset,   # Tokenized evaluation dataset
    data_collator=data_collator,           # Collator that batches examples and builds labels
    compute_metrics=compute_metrics,       # Custom evaluation metrics
    callbacks=[early_stopping],            # Early stopping callback
)

# Train the model
trainer.train()

# Save the fine-tuned model and tokenizer
model_output_dir = "./fine-tuned-gpt2"
os.makedirs(model_output_dir, exist_ok=True)
model.save_pretrained(model_output_dir)
tokenizer.save_pretrained(model_output_dir)

# Evaluate the model
results = trainer.evaluate()
print(f"Evaluation Results: {results}")