mouadenna committed on
Commit
fff6c5d
·
verified ·
1 Parent(s): 3f566f9

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +9 -0
app.py CHANGED
@@ -9,11 +9,13 @@ from sklearn.decomposition import PCA
9
  from sklearn.manifold import TSNE
10
  import plotly.graph_objects as go
11
 
 
12
  st.set_page_config(
13
  page_title="Token & Embedding Visualizer",
14
  layout="wide"
15
  )
16
 
 
17
  COLORS = {
18
  'Special': '#FFB6C1',
19
  'Subword': '#98FB98',
@@ -46,6 +48,7 @@ def load_models_and_tokenizers() -> Tuple[Dict, Dict]:
46
  return tokenizers, models
47
 
48
  def classify_token(token: str) -> str:
 
49
  if token.startswith(('##', '▁', 'Ġ', '_', '.')):
50
  return 'Subword'
51
  elif token in ['[CLS]', '[SEP]', '<s>', '</s>', '<pad>', '[PAD]', '[MASK]', '<mask>']:
@@ -57,6 +60,7 @@ def classify_token(token: str) -> str:
57
 
58
  @torch.no_grad()
59
  def get_embeddings(text: str, model, tokenizer) -> Tuple[torch.Tensor, List[str]]:
 
60
  inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
61
  outputs = model(**inputs)
62
  embeddings = outputs.last_hidden_state[0] # Get first batch
@@ -64,6 +68,7 @@ def get_embeddings(text: str, model, tokenizer) -> Tuple[torch.Tensor, List[str]
64
  return embeddings, tokens
65
 
66
  def visualize_embeddings(embeddings: torch.Tensor, tokens: List[str], method: str = 'PCA') -> go.Figure:
 
67
  embed_array = embeddings.numpy()
68
 
69
  if method == 'PCA':
@@ -117,23 +122,27 @@ def visualize_embeddings(embeddings: torch.Tensor, tokens: List[str], method: st
117
  return fig
118
 
119
  def compute_token_similarities(embeddings: torch.Tensor, tokens: List[str]) -> pd.DataFrame:
 
120
  normalized_embeddings = embeddings / embeddings.norm(dim=1, keepdim=True)
121
  similarities = torch.mm(normalized_embeddings, normalized_embeddings.t())
122
 
123
  sim_df = pd.DataFrame(similarities.numpy(), columns=tokens, index=tokens)
124
  return sim_df
125
 
 
126
  st.title("🔤 Token & Embedding Visualizer")
127
 
128
  # Load models and tokenizers
129
  tokenizers, models = load_models_and_tokenizers()
130
 
 
131
  token_tab, embedding_tab, similarity_tab = st.tabs([
132
  "Token Visualization",
133
  "Embedding Visualization",
134
  "Token Similarities"
135
  ])
136
 
 
137
  default_text = "Hello world! Let's analyze how neural networks process language. The transformer architecture revolutionized NLP."
138
  text_input = st.text_area("Enter text to analyze:", value=default_text, height=100)
139
 
 
9
  from sklearn.manifold import TSNE
10
  import plotly.graph_objects as go
11
 
12
+ # Set Streamlit page configuration
13
  st.set_page_config(
14
  page_title="Token & Embedding Visualizer",
15
  layout="wide"
16
  )
17
 
18
+ # Define colors for different token types
19
  COLORS = {
20
  'Special': '#FFB6C1',
21
  'Subword': '#98FB98',
 
48
  return tokenizers, models
49
 
50
  def classify_token(token: str) -> str:
51
+ """Classify token type based on its characteristics"""
52
  if token.startswith(('##', '▁', 'Ġ', '_', '.')):
53
  return 'Subword'
54
  elif token in ['[CLS]', '[SEP]', '<s>', '</s>', '<pad>', '[PAD]', '[MASK]', '<mask>']:
 
60
 
61
  @torch.no_grad()
62
  def get_embeddings(text: str, model, tokenizer) -> Tuple[torch.Tensor, List[str]]:
63
+ """Get embeddings and tokens from the model and tokenizer"""
64
  inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
65
  outputs = model(**inputs)
66
  embeddings = outputs.last_hidden_state[0] # Get first batch
 
68
  return embeddings, tokens
69
 
70
  def visualize_embeddings(embeddings: torch.Tensor, tokens: List[str], method: str = 'PCA') -> go.Figure:
71
+ """Visualize embeddings using PCA or t-SNE"""
72
  embed_array = embeddings.numpy()
73
 
74
  if method == 'PCA':
 
122
  return fig
123
 
124
  def compute_token_similarities(embeddings: torch.Tensor, tokens: List[str]) -> pd.DataFrame:
125
+ """Compute cosine similarities between token embeddings"""
126
  normalized_embeddings = embeddings / embeddings.norm(dim=1, keepdim=True)
127
  similarities = torch.mm(normalized_embeddings, normalized_embeddings.t())
128
 
129
  sim_df = pd.DataFrame(similarities.numpy(), columns=tokens, index=tokens)
130
  return sim_df
131
 
132
+ # Streamlit app title
133
  st.title("🔤 Token & Embedding Visualizer")
134
 
135
  # Load models and tokenizers
136
  tokenizers, models = load_models_and_tokenizers()
137
 
138
+ # Create tabs for different visualizations
139
  token_tab, embedding_tab, similarity_tab = st.tabs([
140
  "Token Visualization",
141
  "Embedding Visualization",
142
  "Token Similarities"
143
  ])
144
 
145
+ # Default text for analysis
146
  default_text = "Hello world! Let's analyze how neural networks process language. The transformer architecture revolutionized NLP."
147
  text_input = st.text_area("Enter text to analyze:", value=default_text, height=100)
148