import torch
import torch.optim as optim
from torch.nn import CrossEntropyLoss

import tiktoken
from torch_lr_finder import LRFinder
from transformers import GPT2Tokenizer

from transformer import Config, DecoderOnlyTransformer


class DataLoaderLite:
    def __init__(self, B, T):
        self.B = B
        self.T = T

        # at init load tokens from disk and store them in memory
        with open('input.txt', 'r') as f:
            text = f.read()
        enc = tiktoken.get_encoding('gpt2')
        tokens = enc.encode(text)
        self.tokens = torch.tensor(tokens)
        print(f'loaded {len(self.tokens)} tokens')
        print(f'1 epoch = {len(self.tokens) // (B * T)} batches')

        # state
        self.current_position = 0

    def next_batch(self):
        B, T = self.B, self.T
        buf = self.tokens[self.current_position: self.current_position + B * T + 1]
        x = (buf[:-1]).view(B, T)  # inputs
        y = (buf[1:]).view(B, T)   # targets
        # advance the position in the tensor
        self.current_position += B * T
        # if loading the next batch would be out of bounds, reset
        if self.current_position + (B * T + 1) > len(self.tokens):
            self.current_position = 0
        return x, y


batches, no_of_tokens = 16, 128
train_loader = DataLoaderLite(B=batches, T=no_of_tokens)
steps_per_epoch = len(train_loader.tokens) // (batches * no_of_tokens)

device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Model configuration
config = Config()

# Tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")  # Use GPT-2 tokenizer for compatibility

# Load trained model
model = DecoderOnlyTransformer(config)
model.load_state_dict(torch.load("decoder_only_transformer.pth", map_location=torch.device('cpu')))
model.eval()
model.to(device)

# Mixed-precision settings (assumes a CUDA device is available)
amp_config = {
    'device_type': 'cuda',
    'dtype': torch.float16,
}
criterion = CrossEntropyLoss()
grad_scaler = torch.cuda.amp.GradScaler()
optimizer = optim.Adam(model.parameters(), lr=1e-3)


# Define a custom batch-fetching wrapper so LRFinder can iterate over DataLoaderLite
class CustomDataLoader:
    def __init__(self, next_batch_func, num_batches):
        self.next_batch_func = next_batch_func  # callable returning an (inputs, targets) pair
        self.num_batches = num_batches
        self.current_batch = 0

    def __iter__(self):
        self.current_batch = 0
        return self

    def __next__(self):
        if self.current_batch < self.num_batches:
            self.current_batch += 1
            return self.next_batch_func()
        raise StopIteration


# Create a custom data loader using next_batch (pass the method itself, not the result of calling it)
custom_train_loader = CustomDataLoader(train_loader.next_batch, num_batches=steps_per_epoch)

# Use the custom data loader with LRFinder
lr_finder = LRFinder(
    model,
    optimizer,
    criterion,
    device=device,
    amp_backend='torch',
    amp_config=amp_config,
    grad_scaler=grad_scaler
)
lr_finder.range_test(custom_train_loader, end_lr=5, num_iter=1000, step_mode='exp')
lr_finder.plot()
lr_finder.reset()
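
# Optional: a minimal sketch of picking a starting learning rate from the recorded
# sweep instead of reading it off the plot. torch-lr-finder keeps the results in
# lr_finder.history as {'lr': [...], 'loss': [...]} (still available after reset()).
# The "steepest descent" heuristic below is an assumption added here, not part of
# the original script.
import numpy as np

lrs = np.array(lr_finder.history['lr'])
losses = np.array(lr_finder.history['loss'])
# index where the loss falls fastest with respect to log-lr
steepest = np.gradient(losses, np.log10(lrs)).argmin()
suggested_lr = lrs[steepest]
print(f'suggested starting lr: {suggested_lr:.2e}')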