<a href="https://colab.research.google.com/github/Druvith/NN_from_scratch2023/blob/main/MoE2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import os

# Directory where the dataset will be stored
dataset_dir = "TinyStories4"

# Create the TinyStories directory if it doesn't exist
os.makedirs(dataset_dir, exist_ok=True)

# URL of the dataset archive
dataset_url = "https://huggingface.co/datasets/roneneldan/TinyStories/resolve/main/TinyStories_all_data.tar.gz"
# Name of the file to save the downloaded dataset archive
dataset_archive_path = os.path.join(dataset_dir, "TinyStories_all_data.tar.gz")

# Download the archive
if not os.path.exists(dataset_archive_path):
    os.system(f"wget {dataset_url} -O {dataset_archive_path}")

# Extract the archive into the TinyStories directory
os.system(f"tar -xzf {dataset_archive_path} -C {dataset_dir}")


0

In [3]:
import os
import glob
import json

shard_filenames = sorted(glob.glob(os.path.join('TinyStories4', "*.json")))
with open(shard_filenames[0], "r") as f:
        data = json.load(f)

stories = [x['story'] for x in data]
text = "\n".join(stories)

In [4]:
len(text)

77586884

In [5]:
import string

# Define the set of characters to remove
remove_chars = "\$%&*+-/;`|~ éñ–—…"

# Create a translation table that maps the characters to be removed to None
trans_table = str.maketrans(remove_chars, ' ' * len(remove_chars))

# Remove the characters from the string using the translation table
text = text.translate(trans_table)

In [6]:
### vocab size and characters used

chars = sorted(list(set(text)))
vocab_size = len(chars)
print(''.join(chars))
print(vocab_size)

	
 0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz 
66


In [7]:
import torch
import torch.nn as nn
from torch.nn import functional as F
torch.manual_seed(1137)

# hyperparameters
batch_size = 64 # how many independent sequences will we process in parallel?
block_size = 256 # what is the maximum context length for predictions?
max_iters = 6000
eval_interval = 100
learning_rate = 1e-3
device = 'cuda' if torch.cuda.is_available() else 'cpu'
eval_iters = 200
n_embed = 384
n_head = 6
n_layer = 6
dropout = 0.0
# ------------

In [8]:
### character encoding and decoding function, stoi : string to integer, itos: integer to string

import torch

stoi = { ch:i for i, ch in enumerate(chars) }
itos = {i:ch for i,ch in enumerate(chars)}
encode = lambda s: [stoi[c] for c in s]
decode = lambda l: "".join([itos[x] for x in l])

data = torch.tensor(encode(text), dtype = torch.long)

In [9]:
### creating the train and text splits

n = int(0.9*len(data))
train_data = data[:n]
val_data = data[n:]

In [10]:
# data loading

def get_batch(split):
    # generate a small batch of data of inputs x and targets y
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    return x, y

In [11]:
class RMSNorm(nn.Module):    #### x * (sqrt(mean(x**2))
  def __init__(self, n_embed, eps = 1e-6):
    super().__init__()
    self.eps = eps
    self.weight = nn.Parameter(torch.ones(n_embed))

  def _norm(self, x):
    return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)

  def forward(self, x):
    output = self._norm(x.float()).type_as(x)
    return output * self.weight

In [12]:
class MoeLayer(nn.Module):
    def __init__(self, experts, gate, k=2):
        super().__init__()
        assert len(experts) > 0
        self.experts = nn.ModuleList(experts)
        self.gate = gate
        self.k = k

    def forward(self, inputs: torch.Tensor):
        inputs_squashed = inputs.view(-1, inputs.shape[-1])
        gate_logits = self.gate(inputs_squashed)
        weights, selected_experts = torch.topk(
            gate_logits, self.k
        )
        weights = nn.functional.softmax(
            weights,
            dim=1,
            dtype=torch.float,
        ).type_as(inputs)
        results = torch.zeros_like(inputs_squashed)
        for i, expert in enumerate(self.experts):
            batch_idx, nth_expert = torch.where(selected_experts == i)
            results[batch_idx] += weights[batch_idx, nth_expert, None] * expert(
                inputs_squashed[batch_idx]
            )
        return results.view_as(inputs)

In [13]:
class Head(nn.Module):
    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embed, head_size, bias = False)
        self.query = nn.Linear(n_embed, head_size, bias = False)
        self.value = nn.Linear(n_embed, head_size, bias = False)
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        B, T, C = x.shape
        k = self.key(x)
        q = self.query(x)
        wei = q @ k.transpose(-2, -1) * C**-0.5
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf'))
        wei = F.softmax(wei, dim=-1)
        wei = self.dropout(wei)
        v = self.value(x)
        out = wei @ v
        return out

class MulitHeadAttention(nn.Module):
    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        self.proj = nn.Linear(n_embed, n_embed)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        x =  torch.cat([head(x) for head in self.heads], dim=-1)
        out = self.dropout(self.proj(x))
        return out


class Expert(nn.Module):
    def __init__(self, n_embed):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embed, 4* n_embed),
            nn.SiLU(),
            nn.Linear(4 * n_embed, n_embed),
         nn.Dropout(dropout))

    def forward(self, x):
        return self.net(x)

class Block(nn.Module):
    def __init__(self, n_embed, n_head, num_experts=4):
        super().__init__()
        self.sa_head= MulitHeadAttention(n_head, n_embed//n_head)
        self.ffw = MoeLayer(
            experts=[Expert(n_embed) for _ in range(num_experts)],
            gate=nn.Linear(n_embed, num_experts, bias=False),
        )

#         self.ffw=  FeedForward(n_embed)
        self.ln1 = RMSNorm(n_embed)
        self.ln2 = RMSNorm(n_embed)

    def forward(self, x):
        x = x + self.sa_head(self.ln1(x))
        x = x+self.ffw(self.ln2(x))
        return x


class Transformer(nn.Module):
    def __init__(self):
        super().__init__()

        self.token_embedding_table = nn.Embedding(vocab_size, n_embed, device=device)
        self.position_embedding_table = nn.Embedding(block_size, n_embed, device=device)
        self.blocks = nn.Sequential(*[Block(n_embed, n_head=n_head) for _ in range(n_layer)])
        self.lm_head = nn.Linear(n_embed, vocab_size)


    def forward(self, idx, targets=None):
        B, T = idx.shape

        token_emb = self.token_embedding_table(idx)
        pos_emb = self.position_embedding_table(torch.arange(T).to(device))
        x = token_emb + pos_emb
        x = self.blocks(x)
        logits = self.lm_head(x)
        if targets == None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)
        return logits, loss

    def generate(self, idx, max_new_tokes):
        for _ in range(max_new_tokes):
            idx_cond = idx[:, -block_size:]
            logits, loss = self(idx_cond)
            logits = logits[:, -1, :]
            probs = F.softmax(logits, dim = -1)
            idx_next = torch.multinomial(probs, num_samples = 1)
            idx = torch.cat((idx, idx_next), dim = 1)
        return idx

In [14]:
model = Transformer()
optimizer = torch.optim.AdamW(model.parameters(),lr=learning_rate)
print(sum(p.numel() for p in model.parameters()), 'total parameters')

32061762 total parameters


In [15]:
model.eval()

Transformer(
  (token_embedding_table): Embedding(66, 384)
  (position_embedding_table): Embedding(256, 384)
  (blocks): Sequential(
    (0): Block(
      (sa_head): MulitHeadAttention(
        (heads): ModuleList(
          (0-5): 6 x Head(
            (key): Linear(in_features=384, out_features=64, bias=False)
            (query): Linear(in_features=384, out_features=64, bias=False)
            (value): Linear(in_features=384, out_features=64, bias=False)
            (dropout): Dropout(p=0.0, inplace=False)
          )
        )
        (proj): Linear(in_features=384, out_features=384, bias=True)
        (dropout): Dropout(p=0.0, inplace=False)
      )
      (ffw): MoeLayer(
        (experts): ModuleList(
          (0-3): 4 x Expert(
            (net): Sequential(
              (0): Linear(in_features=384, out_features=1536, bias=True)
              (1): SiLU()
              (2): Linear(in_features=1536, out_features=384, bias=True)
              (3): Dropout(p=0.0, inplace=False)
  

In [16]:
!pip install wandb

Collecting wandb
  Downloading wandb-0.16.4-py3-none-any.whl (2.2 MB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/2.2 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.3/2.2 MB[0m [31m7.4 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m35.6 MB/s[0m eta [36m0:00:00[0m
Collecting GitPython!=3.1.29,>=1.0.0 (from wandb)
  Downloading GitPython-3.1.42-py3-none-any.whl (195 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m195.4/195.4 kB[0m [31m25.7 MB/s[0m eta [36m0:00:00[0m
Collecting sentry-sdk>=1.0.0 (from wandb)
  Downloading sentry_sdk-1.43.0-py2.py3-none-any.whl (264 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m264.6/264.6 kB[0m [31m29.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting docker-pycreds>=0.4.0 (from wandb)
  Downloading docker_pycreds-0.4.0-py2.py3-none-any.w

In [17]:
import wandb
!wandb login

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit: 
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [18]:
@torch.no_grad()
def estimate_loss():
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split)
            X = X.to(device)
            Y = Y.to(device)
            logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out

In [19]:
model = model.to(device)
optimizer = torch.optim.AdamW(model.parameters(),lr=1e-4)

wandb.init(
    # set the wandb project where this run will be logged
    project="mixture of experts",

    # track hyperparameters and run metadata
    config={
    'batch_size': batch_size,
    'block_size': block_size,
    'max_iters': max_iters,
    'eval_interval': eval_interval,
    'learning_rate': learning_rate,
    'device': device,
    'eval_iters': eval_iters,
    'n_embed': n_embed,
    'n_head': n_head,
    'n_layer': n_layer,
    'dropout': dropout
    }
)

wandb.watch(model)

for iter in range(max_iters):

    # every once in a while evaluate the loss on train and val sets
    if iter % 100 == 0 or iter == max_iters - 1:
        losses = estimate_loss()
        print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")
        wandb.log({'train_loss': losses['train'], 'val_loss': losses['val']}, step=iter)

    # sample a batch of data
    xb, yb = get_batch('train')
    xb = xb.to(device)
    yb = yb.to(device)

    # evaluate the loss
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

model_dir = 'Mixture of experts models'
os.makedirs(model_dir, exist_ok=True)
final_model_path = os.path.join(model_dir, 'moe2_model.pth')
torch.save(model.state_dict(), final_model_path)
print('Final trained model saved!')

wandb.finish()

[34m[1mwandb[0m: Currently logged in as: [33mdruvithlgowda00[0m. Use [1m`wandb login --relogin`[0m to force relogin


step 0: train loss 4.5065, val loss 4.5070
step 100: train loss 2.2867, val loss 2.2864
step 200: train loss 2.2622, val loss 2.2609
step 300: train loss 2.2438, val loss 2.2433
step 400: train loss 2.2149, val loss 2.2150
step 500: train loss 2.1553, val loss 2.1571
step 600: train loss 1.9616, val loss 1.9618
step 700: train loss 1.7527, val loss 1.7581
step 800: train loss 1.6343, val loss 1.6381
step 900: train loss 1.5419, val loss 1.5456
step 1000: train loss 1.4781, val loss 1.4763
step 1100: train loss 1.4095, val loss 1.4159
step 1200: train loss 1.3634, val loss 1.3671
step 1300: train loss 1.3279, val loss 1.3312
step 1400: train loss 1.2929, val loss 1.2962
step 1500: train loss 1.2640, val loss 1.2644
step 1600: train loss 1.2410, val loss 1.2439
step 1700: train loss 1.2074, val loss 1.2121
step 1800: train loss 1.1888, val loss 1.1913
step 1900: train loss 1.1664, val loss 1.1709
step 2000: train loss 1.1527, val loss 1.1543
step 2100: train loss 1.1329, val loss 1.1347


VBox(children=(Label(value='0.002 MB of 0.002 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
train_loss,█▄▄▄▃▃▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
val_loss,█▄▄▄▃▃▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
train_loss,0.84435
val_loss,0.84581


In [26]:
# generation
d = 'once'
context = torch.tensor(encode(d), dtype=torch.long, device=device).unsqueeze(0)
print(decode(model.generate(context, max_new_tokes=500)[0].tolist()))


once to go home 
Once upon a time  there was a little girl named Lily  She loved to play with her together in the tracks  One day  her mom made a little here with a big sea  She had answed 
 Let s play   as play white   The tree was crying and continued   but then we down the sea tost inside   the spick replied   That dog   for it is thought about   
Then  it s house  The boy and the man fell down about the pole and excited away  The wind was go away to and Tom again  They excited to put the circle 


In [27]:
#push the weights to hugging_face.hub
!pip install transformers huggingface_hub



In [28]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [32]:
from huggingface_hub import create_repo, Repository

repo_name = "Druvith/Tiny_StoriesMoE"
create_repo(repo_name, private=False, exist_ok=True)  # Set private=True for a private repository


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


RepoUrl('https://huggingface.co/Druvith/Tiny_StoriesMoE', endpoint='https://huggingface.co', repo_type='model', repo_id='Druvith/Tiny_StoriesMoE')

In [34]:
repo = Repository("Tiny_StoriesMoE", clone_from=f"Druvith/Tiny_StoriesMoE")


For more details, please read https://huggingface.co/docs/huggingface_hub/concepts/git_vs_http.
Cloning https://huggingface.co/Druvith/Tiny_StoriesMoE into local empty directory.


In [40]:
import shutil
shutil.move("wandb", "Tiny_StoriesMoE")

Error: Destination path 'Tiny_StoriesMoE/wandb' already exists

In [39]:
repo.push_to_hub(commit_message="wandb logs")

IsADirectoryError: [Errno 21] Is a directory: '/content/Tiny_StoriesMoE/wandb/latest-run'

In [None]:
shutil.move("Mixture of experts models/moe2_model.pth", "Tiny_StoriesMoE")