broskicodes committed on
Commit
d0ff5ee
·
1 Parent(s): 91791ec

add app.py

Files changed (1)
  1. app.py +157 -0
app.py ADDED
@@ -0,0 +1,157 @@
+ import streamlit as st
+ import torch # we use PyTorch: https://pytorch.org
+ import torch.nn as nn
+ import torch.nn.functional as F
+
+
+ # model hyperparameters
+ batch_size = 32
+ block_size = 128
+ max_iters = 5000
+ eval_interval = 500
+ learning_rate = 3e-4
+ device = 'cuda' if torch.cuda.is_available() else 'cpu'
+ eval_iters = 200
+ n_embed = 256
+ n_heads = 8
+ n_layers = 6
+ dropout = 0.2
+ # -------------------------------------------------
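+ # note: the checkpoint loaded further down appears to be a fully pickled model,
+ # so the class definitions below must exist under these exact names for
+ # torch.load() to unpickle it.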
+
+ # model architecture
+ class AttentionHead(nn.Module):
+     """a single head of self-attention"""
+
+     def __init__(self, head_size):
+         super().__init__()
+         self.key = nn.Linear(n_embed, head_size, bias=False)
+         self.query = nn.Linear(n_embed, head_size, bias=False)
+         self.value = nn.Linear(n_embed, head_size, bias=False)
+         self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))
+
+         self.dropout = nn.Dropout(dropout)
+
+     def forward(self, x):
+         B, T, C = x.shape
+         K = self.key(x) # (B, T, head_size)
+         Q = self.query(x) # (B, T, head_size)
+
+         wei = Q @ K.transpose(-2,-1) * C**-0.5 # (B, T, head_size) @ (B, head_size, T) -> (B, T, T)
+         wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf'))
+         wei = F.softmax(wei, dim=-1)
+         wei = self.dropout(wei)
+
+         V = self.value(x) # (B, T, head_size)
+         out = wei @ V # (B, T, T) @ (B, T, head_size) -> (B, T, head_size)
+         return out
+
+ class MultiHeadAttention(nn.Module):
+     """a multi-head self-attention layer"""
+
+     def __init__(self, n_heads, head_size):
+         super().__init__()
+         self.heads = nn.ModuleList([AttentionHead(head_size) for _ in range(n_heads)])
+         self.fc = nn.Linear(head_size * n_heads, n_embed)
+         self.dropout = nn.Dropout(dropout)
+
+     def forward(self, x):
+         out = torch.cat([h(x) for h in self.heads], dim=-1) # (B, T, n_heads*head_size)
+         out = self.fc(out) # (B, T, n_embed)
+         out = self.dropout(out)
+         return out
+
+ class FeedForward(nn.Module):
+     """a simple two-layer MLP applied position-wise"""
+
+     def __init__(self, n_hidden):
+         super().__init__()
+         self.net = nn.Sequential(
+             nn.Linear(n_embed, n_hidden),
+             nn.ReLU(),
+             nn.Linear(n_hidden, n_embed),
+             nn.Dropout(dropout)
+         )
+
+     def forward(self, x):
+         return self.net(x)
+
+ class Block(nn.Module):
+     """a transformer block: pre-norm self-attention followed by a feed-forward layer, with residual connections"""
+
+     def __init__(self, n_embed, n_heads):
+         super().__init__()
+         self.sa_heads = MultiHeadAttention(n_heads, n_embed // n_heads)
+         self.ffwd = FeedForward(n_embed*4)
+         self.ln1 = nn.LayerNorm(n_embed)
+         self.ln2 = nn.LayerNorm(n_embed)
+
+     def forward(self, x):
+         x = x + self.sa_heads(self.ln1(x)) # [batch_size, block_size, n_embed]
+         x = x + self.ffwd(self.ln2(x)) # [batch_size, block_size, n_embed]
+         return x
+
+ class BigramModel(nn.Module):
+     """a small GPT-style character-level language model"""
+
+     def __init__(self):
+         super().__init__()
+         self.token_embedding_table = nn.Embedding(vocab_size, n_embed)
+         self.position_embedding_table = nn.Embedding(block_size, n_embed)
+         self.blocks = nn.Sequential(*[Block(n_embed, n_heads) for _ in range(n_layers)])
+         self.ln_f = nn.LayerNorm(n_embed)
+         self.lm_head = nn.Linear(n_embed, vocab_size)
+
+     def forward(self, idx, targets=None):
+         # idx and targets are both [batch_size, block_size]
+         B, T = idx.shape
+
+         tok_emb = self.token_embedding_table(idx) # [batch_size, block_size, n_embed]
+         pos_emb = self.position_embedding_table(torch.arange(T, device=device)) # [block_size, n_embed]
+         x = tok_emb + pos_emb # [batch_size, block_size, n_embed]
+         x = self.blocks(x)
+         x = self.ln_f(x)
+         logits = self.lm_head(x) # [batch_size, block_size, vocab_size]
+
+         if targets is None:
+             loss = None
+         else:
+             B, T, C = logits.shape
+             logits = logits.view(B*T, C)
+             targets = targets.view(B*T)
+             loss = F.cross_entropy(logits, targets)
+
+         return logits, loss
+
+     @torch.no_grad() # no gradients are needed during generation
+     def generate(self, idx, max_new_tokens=100):
+         # idx is (B, T)
+         for _ in range(max_new_tokens):
+             # get the last block_size tokens
+             idx_cond = idx[:, -block_size:] # (B, T)
+             # get the predictions
+             logits, _ = self(idx_cond)
+             # focus only on the last time step
+             logits = logits[:, -1, :] # becomes (B, C)
+             # apply softmax to get probabilities
+             probs = F.softmax(logits, dim=-1) # (B, C)
+             # sample from the distribution
+             idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
+             # append sampled index to the running sequence
+             idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
+
+         return idx
+ # ----------------------------------------------------------------
+
+ # helpers
+ chars = list("!$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz")
+ vocab_size = len(chars) # character-level vocabulary size (referenced by BigramModel.__init__)
+ stoi = { ch:i for i,ch in enumerate(chars) }
+ itos = { i:ch for i,ch in enumerate(chars) }
+ encode = lambda s: [stoi[c] for c in s] # encoder: take a string, output a list of integers
+ decode = lambda l: ''.join([itos[i] for i in l]) # decoder: take a list of integers, output a string
+ # ----------------------------------------------------------------
+
+ # load model
+ model = torch.load('model/complete-model.pt', map_location=device) # load the full pickled model onto the selected device
+ model.eval() # disable dropout for inference
+
+ # inference
+ slider_value = st.slider('Amount of text to generate', min_value=100, max_value=2000, value=500, step=5)
+ if st.button('Generate text'):
+     context = torch.zeros((1, 1), dtype=torch.long, device=device) # start from a single context token (index 0)
+     text = model.generate(context, max_new_tokens=slider_value)[0].tolist()
+     st.text(decode(text)) # show the generated text as plain text (it is not valid JSON)
+ #
+