Spaces:
Sleeping
Sleeping
import html
from typing import Dict, List, Tuple

import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import streamlit as st
import torch
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from transformers import AutoTokenizer, AutoModel
# Configure the browser tab title and switch the page to the wide layout.
st.set_page_config(page_title="Token & Embedding Visualizer", layout="wide")
# Background colour (CSS hex) for each token category; the keys are the
# category names returned by classify_token().
COLORS: Dict[str, str] = dict(
    Special='#FFB6C1',
    Subword='#98FB98',
    Word='#87CEFA',
    Punctuation='#DDA0DD',
)
def load_models_and_tokenizers() -> Tuple[Dict, Dict]:
    """Fetch a tokenizer and a model for every supported checkpoint.

    Each checkpoint is attempted independently: a failure is reported with
    ``st.warning`` and the loop moves on to the next checkpoint, while a
    success is reported with ``st.success``.

    The tokenizer is stored before the model is requested, so a checkpoint
    whose model fails to load can still be present in the tokenizer mapping
    while being absent from the model mapping.

    Returns:
        A ``(tokenizers, models)`` pair of dicts, both keyed by the display
        name of the checkpoint (e.g. ``"BERT"``).
    """
    # (display name, Hugging Face checkpoint id)
    checkpoints = (
        ("BERT", "bert-base-uncased"),
        ("RoBERTa", "roberta-base"),
        ("DistilBERT", "distilbert-base-uncased"),
        ("MPNet", "microsoft/mpnet-base"),
        ("DeBERTa", "microsoft/deberta-base"),
    )

    loaded_tokenizers: Dict = {}
    loaded_models: Dict = {}

    for label, checkpoint in checkpoints:
        try:
            loaded_tokenizers[label] = AutoTokenizer.from_pretrained(checkpoint)
            loaded_models[label] = AutoModel.from_pretrained(checkpoint)
            st.success(f"β Loaded {label}")
        except Exception as err:
            # Best effort: one broken checkpoint must not take the app down.
            st.warning(f"Γ Failed to load {label}: {str(err)}")

    return loaded_tokenizers, loaded_models
# Markers that tokenizers put at the start of a token piece:
#   '##'      WordPiece continuation marker
#   '\u2581'  SentencePiece marker (LOWER ONE EIGHTH BLOCK)
#   '\u0120'  byte-level BPE marker (LATIN CAPITAL LETTER G WITH DOT ABOVE)
# '_' and '.' are kept from the original prefix list.
# The two non-ASCII markers are written as escapes so that they survive
# re-encoding of this file.
_SUBWORD_PREFIXES = ('##', '\u2581', '\u0120', '_', '.')

_SPECIAL_TOKENS = frozenset({
    '[CLS]', '[SEP]', '<s>', '</s>', '<pad>', '[PAD]', '[MASK]', '<mask>',
})

_PUNCTUATION_TOKENS = frozenset({
    ',', '.', '!', '?', ';', ':', '"', "'", '(', ')', '[', ']', '{', '}',
})


def classify_token(token: str) -> str:
    """Classify a token string into one of four display categories.

    Exact matches are tested before the prefix test. Otherwise a
    single-character punctuation token such as ``'.'`` would be caught by
    the ``'.'`` prefix and reported as a subword.

    Args:
        token: A token string as produced by a tokenizer.

    Returns:
        ``'Special'`` for known special tokens, ``'Punctuation'`` for a
        single punctuation character, ``'Subword'`` for tokens that start
        with one of the subword prefixes, and ``'Word'`` for everything else.
    """
    if token in _SPECIAL_TOKENS:
        return 'Special'
    if token in _PUNCTUATION_TOKENS:
        return 'Punctuation'
    if token.startswith(_SUBWORD_PREFIXES):
        return 'Subword'
    return 'Word'
def get_embeddings(text: str, model, tokenizer) -> Tuple[torch.Tensor, List[str]]:
    """Run ``text`` through ``model`` and return per-token embeddings.

    The forward pass runs under ``torch.no_grad()``: this is inference only,
    and a tensor that still tracks gradients cannot be converted with
    ``.numpy()`` by the callers.

    Args:
        text: The text to encode.
        model: Callable invoked as ``model(**inputs)``; its result must
            expose ``last_hidden_state``.
        tokenizer: Callable invoked with ``text``; its result must contain
            ``'input_ids'``. It must also provide ``convert_ids_to_tokens``.

    Returns:
        ``(embeddings, tokens)`` where ``embeddings`` is the first batch
        entry of ``last_hidden_state`` and ``tokens`` are the token strings
        for the first row of ``input_ids``.
    """
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    with torch.no_grad():
        outputs = model(**inputs)
    embeddings = outputs.last_hidden_state[0]  # only one text, so take batch entry 0
    tokens = tokenizer.convert_ids_to_tokens(inputs['input_ids'][0])
    return embeddings, tokens
def visualize_embeddings(embeddings: torch.Tensor, tokens: List[str], method: str = 'PCA') -> go.Figure:
    """Project token embeddings to three dimensions and plot them.

    Args:
        embeddings: Tensor with one row per entry of ``tokens``.
        tokens: Token strings used as point labels.
        method: ``'PCA'`` selects PCA; any other value selects t-SNE.

    Returns:
        A 3-D scatter figure with one trace per token type, coloured via
        ``COLORS``.

    Note:
        Both reducers are asked for 3 components, so very short inputs may
        be rejected by scikit-learn (NOTE(review): confirm the minimum token
        count for the installed version).
    """
    # detach() first: Tensor.numpy() refuses tensors that track gradients.
    embed_array = embeddings.detach().cpu().numpy()

    if method == 'PCA':
        reducer = PCA(n_components=3)
        reduced_embeddings = reducer.fit_transform(embed_array)
        variance_explained = reducer.explained_variance_ratio_
        method_info = f"Total variance explained: {sum(variance_explained):.2%}"
    else:  # t-SNE
        # Perplexity is capped at one less than the number of tokens.
        reducer = TSNE(n_components=3, random_state=42, perplexity=min(30, len(tokens) - 1))
        reduced_embeddings = reducer.fit_transform(embed_array)
        method_info = "t-SNE embedding (perplexity: {})".format(reducer.perplexity)

    df = pd.DataFrame({
        'x': reduced_embeddings[:, 0],
        'y': reduced_embeddings[:, 1],
        'z': reduced_embeddings[:, 2],
        'token': tokens,
        'type': [classify_token(t) for t in tokens]
    })

    fig = go.Figure()
    # One trace per token type so each type gets its own colour and legend entry.
    for token_type in df['type'].unique():
        subset = df[df['type'] == token_type]
        fig.add_trace(go.Scatter3d(
            x=subset['x'],
            y=subset['y'],
            z=subset['z'],
            mode='markers+text',
            name=token_type,
            text=subset['token'],
            hovertemplate="Token: %{text}<br>Type: " + token_type + "<extra></extra>",
            marker=dict(
                size=8,
                color=COLORS[token_type],
                opacity=0.8
            )
        ))

    fig.update_layout(
        title=f"{method} Visualization of Token Embeddings<br><sup>{method_info}</sup>",
        scene=dict(
            xaxis_title=f"{method}_1",
            yaxis_title=f"{method}_2",
            zaxis_title=f"{method}_3"
        ),
        width=800,
        height=800
    )
    return fig
def compute_token_similarities(embeddings: torch.Tensor, tokens: List[str]) -> pd.DataFrame:
    """Compute pairwise cosine similarities between token embeddings.

    Args:
        embeddings: 2-D tensor with one row per entry of ``tokens``.
        tokens: Token strings, used as both row and column labels.

    Returns:
        A square DataFrame whose ``[i, j]`` entry is the cosine similarity
        between embedding ``i`` and embedding ``j``. An all-zero embedding
        yields a similarity of 0 to every token instead of NaN.
    """
    # detach(): Tensor.numpy() refuses tensors that track gradients.
    vectors = embeddings.detach()
    # Clamp the norms so that a zero vector does not cause a 0/0 division.
    norms = vectors.norm(dim=1, keepdim=True).clamp_min(1e-12)
    normalized_embeddings = vectors / norms
    similarities = torch.mm(normalized_embeddings, normalized_embeddings.t())
    return pd.DataFrame(similarities.cpu().numpy(), columns=tokens, index=tokens)
# Streamlit app title
st.title("π€ Token & Embedding Visualizer")

# Load models and tokenizers. Streamlit re-executes this script on every
# widget interaction, so the loader is wrapped in st.cache_resource to avoid
# re-instantiating every model on each rerun.
tokenizers, models = st.cache_resource(load_models_and_tokenizers)()

# Create tabs for different visualizations
token_tab, embedding_tab, similarity_tab = st.tabs([
    "Token Visualization",
    "Embedding Visualization",
    "Token Similarities"
])

# Default text for analysis
default_text = "Hello world! Let's analyze how neural networks process language. The transformer architecture revolutionized NLP."
text_input = st.text_area("Enter text to analyze:", value=default_text, height=100)

with token_tab:
    st.markdown("""
    Token colors represent:
    - π¦ Blue: Complete words
    - π© Green: Subwords
    - π¨ Pink: Special tokens
    - πͺ Purple: Punctuation
    """)
    # Only pre-select models whose tokenizer actually loaded; a default that
    # is missing from `options` makes st.multiselect raise.
    preferred_defaults = ["BERT", "RoBERTa"]
    selected_models = st.multiselect(
        "Select models to compare tokens",
        options=list(tokenizers.keys()),
        default=[name for name in preferred_defaults if name in tokenizers],
        max_selections=4
    )
    if text_input and selected_models:
        cols = st.columns(len(selected_models))
        for col, model_name in zip(cols, selected_models):
            with col:
                st.subheader(model_name)
                tokenizer = tokenizers[model_name]
                display_tokens = tokenizer.tokenize(text_input)
                token_ids = tokenizer.encode(text_input)
                # When the two lengths differ, show the tokens for the
                # encoded ids instead of the tokenize() output.
                if len(display_tokens) != len(token_ids):
                    display_tokens = tokenizer.convert_ids_to_tokens(token_ids)
                st.metric("Tokens", len(display_tokens))
                html_tokens = []
                for token in display_tokens:
                    color = COLORS[classify_token(token)]
                    # Tokens such as <s> or </s> would otherwise be parsed as
                    # HTML tags because of unsafe_allow_html=True below.
                    token_text = html.escape(token, quote=False)
                    html_tokens.append(
                        f'<span style="background-color: {color}; padding: 2px 4px; '
                        f'margin: 2px; border-radius: 3px; font-family: monospace;">'
                        f'{token_text}</span>'
                    )
                st.markdown(
                    '<div style="background-color: white; padding: 10px; '
                    'border-radius: 5px; border: 1px solid #ddd;">'
                    f'{"".join(html_tokens)}</div>',
                    unsafe_allow_html=True
                )

with embedding_tab:
    st.markdown("""
    This tab shows how tokens are embedded in the model's vector space.
    - Compare different dimensionality reduction techniques
    - Observe clustering of similar tokens
    - Explore the relationship between different token types
    """)
    col1, col2 = st.columns([2, 1])
    with col1:
        selected_model = st.selectbox(
            "Select model for embedding visualization",
            options=list(models.keys())
        )
    with col2:
        viz_method = st.radio(
            "Select visualization method",
            options=['PCA', 't-SNE'],
            horizontal=True
        )
    if text_input and selected_model:
        with st.spinner(f"Generating embeddings with {selected_model}..."):
            embeddings, tokens = get_embeddings(
                text_input,
                models[selected_model],
                tokenizers[selected_model]
            )
            # Everything below is read-only analysis; drop autograd tracking
            # so the tensor can be converted to NumPy.
            embeddings = embeddings.detach()
            fig = visualize_embeddings(embeddings, tokens, viz_method)
        st.plotly_chart(fig, use_container_width=True)
        with st.expander("Embedding Statistics"):
            embed_stats = pd.DataFrame({
                'Token': tokens,
                'Type': [classify_token(t) for t in tokens],
                'Mean': embeddings.mean(dim=1).numpy(),
                'Std': embeddings.std(dim=1).numpy(),
                'Norm': torch.norm(embeddings, dim=1).numpy()
            })
            st.dataframe(embed_stats, use_container_width=True)

with similarity_tab:
    st.markdown("""
    Explore token similarities based on their embedding representations.
    - Darker colors indicate higher similarity
    - Hover over cells to see exact similarity scores
    """)
    # `embeddings` and `tokens` come from the embedding tab, which computes
    # them under this same condition.
    if text_input and selected_model:
        with st.spinner("Computing token similarities..."):
            sim_df = compute_token_similarities(embeddings, tokens)
            fig = px.imshow(
                sim_df,
                labels=dict(color="Cosine Similarity"),
                color_continuous_scale="RdYlBu",
                aspect="auto"
            )
            fig.update_layout(
                title="Token Similarity Matrix",
                width=800,
                height=800
            )
        st.plotly_chart(fig, use_container_width=True)
        st.subheader("Most Similar Token Pairs")
        sim_matrix = sim_df.to_numpy()
        top_k = min(10, len(tokens))
        # Only the strict upper triangle (j > i) is visited, which skips
        # self-similarities and mirrored duplicates without modifying the
        # matrix.
        n_tokens = len(tokens)
        pairs = [
            (tokens[i], tokens[j], sim_matrix[i, j])
            for i in range(n_tokens)
            for j in range(i + 1, n_tokens)
        ]
        top_pairs = sorted(pairs, key=lambda pair: pair[2], reverse=True)[:top_k]
        for token1, token2, sim in top_pairs:
            st.write(f"'{token1}' β '{token2}': {sim:.3f}")

st.markdown("---")
st.markdown("""
π‘ **Tips:**
- Try comparing how different models tokenize and embed the same text
- Use PCA for global structure and t-SNE for local relationships
- Check the similarity matrix for interesting token relationships
- Experiment with different text types (technical, casual, mixed)
""")