import itertools
import math
import warnings

import networkx as nx
import numpy as np
import pandas as pd
import plotly.figure_factory as ff
import plotly.graph_objects as go
import streamlit as st
from community import community_louvain
from PIL import Image
from scipy.cluster.hierarchy import dendrogram, linkage
from scipy.spatial.distance import squareform
from sklearn.cluster import AgglomerativeClustering
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_distances
from sklearn.metrics.pairwise import cosine_similarity
from wordcloud import WordCloud


def create_dendrogram(X, labels):
    """Build a left-oriented, single-linkage dendrogram figure.

    Args:
        X: Sparse feature matrix with one row per observation.  It is only
            accessed through ``X.toarray()``.
        labels: One label per row of ``X``; used to name the leaves.

    Returns:
        The Plotly figure returned by ``ff.create_dendrogram``.
    """
    # ff.create_dendrogram expects the raw observations and runs the
    # distance and linkage steps itself.  Handing it an already-computed
    # linkage matrix (n - 1 rows) makes it cluster the linkage rows instead
    # of the data, and those rows no longer line up with the n labels.  The
    # single-linkage choice is therefore passed through ``linkagefun``.
    return ff.create_dendrogram(
        X.toarray(),
        orientation='left',
        labels=labels,
        linkagefun=lambda dists: linkage(dists, "single"),
    )


def load_data():
    """Read the model table from its CSV file.

    Returns:
        The ``pandas.DataFrame`` parsed from
        ``HuggingFaceLLMsWithParamsAndReadmeLinks.csv``.  The relative path
        is resolved against the current working directory.
    """
    return pd.read_csv("HuggingFaceLLMsWithParamsAndReadmeLinks.csv")


# Load the dataset through the shared helper so the CSV path is spelled out
# in exactly one place.
df = load_data()

# Page header and citation.
st.title("Constellation: An Atlas of 15,000 Large Language Models")
st.write("15,821 to be precise. Scraped from Hugging Face on July 18, 2023.")
st.write("Please cite: Gao, S., & Gao, A. K. (2023, July 19). On the Origin of LLMs: An Evolutionary Tree and Graph for 15,821 Large Language Models.; ArXiv.")

# User controls for the clustering run.
threshold = st.number_input("Enter the minimum number of downloads an LLM must have to be considered.", value=10000)
numClusters = st.number_input("Number of clusters to group into.", value=20, min_value=2, max_value=50)
wordClouds = st.checkbox("Show word clouds?")


def create_downloads_vs_likes_scatter(dataframe):
    """Build a scatter plot of downloads (x) against likes (y).

    Args:
        dataframe: Table with ``downloads``, ``likes`` and ``model_name``
            columns.  It is left unmodified.

    Returns:
        A ``plotly.graph_objects.Figure`` holding one marker trace.
    """
    # Coerce into a private Series instead of assigning back into the
    # caller's frame: the argument is a filtered slice of the full table, so
    # writing to it mutates shared data and triggers pandas'
    # SettingWithCopyWarning.  Unparseable values become NaN.
    likes = pd.to_numeric(dataframe['likes'], errors='coerce')

    # Filter out the outlier point at 14M likes
    keep = likes != 14000000
    plotted = dataframe[keep]

    fig = go.Figure()
    fig.add_trace(go.Scatter(
        x=plotted['downloads'],
        y=likes[keep],
        mode='markers',
        marker=dict(color='blue', size=7, opacity=0.7),
        # NOTE(review): the hover template refers to %{text} but no text
        # argument survived in the source; the model name is assumed because
        # the template labels it "Model Name".
        text=plotted['model_name'],
        hovertemplate="Model Name: %{text}<br>Downloads: %{x}<br>Likes: %{y}<extra></extra>",
    ))
    # NOTE(review): the original update_layout call was unterminated; the
    # axis titles below are reconstructed from the hover labels.
    fig.update_layout(
        title='Downloads vs Likes',
        xaxis_title='Downloads',
        yaxis_title='Likes',
    )
    return fig


# "Run Clustering" handler.  In order, it renders: a dendrogram of model
# names, a bar chart of agglomerative-cluster sizes, a similarity graph
# coloured by Louvain community, the 20 best-connected models, one
# representative per community, optional word clouds, and the
# downloads-vs-likes scatter plot.
if st.button("Run Clustering"):
    # Keep sufficiently downloaded models, one row per model name.  The
    # explicit copy lets the 'cluster' column be added further down without
    # writing into a slice of ``df``.
    df_extra_filtered = (
        df[df['downloads'] > threshold]
        .drop_duplicates(subset='model_name', keep='first')
        .copy()
    )
    model_names = df_extra_filtered['model_name'].tolist()

    # Linkage and agglomerative clustering both need at least two samples.
    if len(model_names) < 2:
        st.warning("Fewer than two models meet the download threshold; lower it and run again.")
        st.stop()

    # ---- Dendrogram --------------------------------------------------------
    # Character n-gram TF-IDF features of the model names.  The same features
    # feed the similarity graph below, so they are computed once.
    tfidf = TfidfVectorizer(analyzer='char', ngram_range=(2, 8)).fit_transform(model_names)

    def distfun(features):
        """Return the condensed pairwise cosine distances of ``features``."""
        # linkage() interprets a 2-D array as raw observations, so the square
        # distance matrix must be condensed before it is handed over.
        # checks=False tolerates floating-point noise on the diagonal.
        return squareform(cosine_distances(features), checks=False)

    def linkagefun(dist_array):
        """Return the single-linkage matrix for condensed distances."""
        return linkage(dist_array, "single")

    fig = ff.create_dendrogram(
        tfidf.toarray(),
        orientation='bottom',
        labels=model_names,
        distfun=distfun,
        linkagefun=linkagefun,
    )
    st.plotly_chart(fig, use_container_width=True)

    # ---- Agglomerative clustering on character n-gram counts ---------------
    counts = CountVectorizer(analyzer='char', ngram_range=(3, 6)).fit_transform(model_names)
    # Use the "Number of clusters" input (it was ignored in favour of a
    # hard-coded 20) and never request more clusters than there are models.
    n_clusters = min(int(numClusters), len(model_names))
    clustering = AgglomerativeClustering(n_clusters=n_clusters).fit(counts.toarray())
    df_extra_filtered['cluster'] = clustering.labels_

    # Bar chart of the number of models in each cluster.
    cluster_counts = df_extra_filtered['cluster'].value_counts()
    fig = go.Figure([go.Bar(x=cluster_counts.index, y=cluster_counts.values)])
    fig.update_layout(title='Number of Models per Cluster', xaxis_title='Cluster', yaxis_title='Number of Models')
    # NOTE(review): no display call for this chart was present in the source
    # (the figure was simply overwritten later); rendering it is assumed to
    # be the intent.
    st.plotly_chart(fig, use_container_width=True)

    # ---- Similarity graph --------------------------------------------------
    sim_matrix = cosine_similarity(tfidf)

    G = nx.Graph()
    for node, model_name in enumerate(model_names):
        G.add_node(node, label=model_name)

    # Connect every pair whose similarity exceeds 0.2.  Only the strict upper
    # triangle is scanned so each undirected edge is added exactly once, in
    # the same (i, j) order as a nested i < j loop.
    sources, targets = np.nonzero(np.triu(sim_matrix > 0.2, k=1))
    G.add_weighted_edges_from(
        (int(i), int(j), float(sim_matrix[i, j])) for i, j in zip(sources, targets)
    )

    # Layout and community detection.  Markers, centroids and edges all read
    # their coordinates from ``pos``; the per-community grid layout that was
    # computed here was never used afterwards and has been dropped.
    pos = nx.spring_layout(G)
    partition = community_louvain.best_partition(G)

    # Group node ids by community once instead of rescanning the partition
    # for every community.
    members = {}
    for node, community in partition.items():
        members.setdefault(community, []).append(node)

    fig = go.Figure()

    # Per-node positions and hover attributes.
    x, y, labels, ranks, downloads, likes, params = [], [], [], [], [], [], []
    node_communities = []
    for node, community in partition.items():
        model_info = df_extra_filtered.iloc[node]

        # NOTE(review): the statements filling these lists were missing from
        # the source.  Positions are taken from ``pos`` to agree with the
        # edge and centroid coordinates below.
        x.append(pos[node][0])
        y.append(pos[node][1])

        labels.append(model_info['model_name'])
        # NOTE(review): the column behind "Rank" is not visible in the
        # source; 'rank' is assumed and falls back to 'N/A' when absent.
        ranks.append(model_info.get('rank', 'N/A'))
        downloads.append(model_info['downloads'])
        likes.append(model_info['likes'])
        params.append(model_info['params_millions'] if pd.notnull(model_info['params_millions']) else 'N/A')
        node_communities.append(community)

    # Centroid and size of every community with more than one node, used for
    # the background markers.
    centroids = {}
    community_sizes = {}
    for community, nodes in members.items():
        if len(nodes) > 1:
            centroid_x = np.mean([pos[node][0] for node in nodes])
            centroid_y = np.mean([pos[node][1] for node in nodes])
            centroids[community] = (centroid_x, centroid_y)
            community_sizes[community] = len(nodes)

    # Background marker per community, sized by membership.
    for community, centroid in centroids.items():
        # NOTE(review): only the x/y and size arguments survived in the
        # source; mode, opacity and hoverinfo are reconstructed so the
        # marker reads as a translucent, non-interactive backdrop.
        fig.add_trace(go.Scatter(
            x=[centroid[0]], y=[centroid[1]],
            mode='markers',
            marker=dict(
                size=community_sizes[community]*5,  # Adjust size by multiplying the community size by a factor
                opacity=0.2,
            ),
            hoverinfo='skip',
        ))

    # Node markers.  The colour is the per-node community id; the source used
    # the leftover loop variable ``community``, i.e. one id for every node.
    fig.add_trace(go.Scatter(
        x=x, y=y,
        mode='markers',
        marker=dict(size=3, color=node_communities),
        text=labels,
        customdata=np.stack((ranks, downloads, likes, params), axis=-1),
        hovertemplate=(
            "Model Name: %{text}<br>"
            "Rank: %{customdata[0]}<br>"
            "Downloads: %{customdata[1]}<br>"
            "Likes: %{customdata[2]}<br>"
            "Params (millions): %{customdata[3]}"
        ),
    ))

    # Edges, with line width normalised by the heaviest edge.  The maximum is
    # computed once rather than once per edge.
    edge_weights = nx.get_edge_attributes(G, 'weight')
    max_weight = max(edge_weights.values()) if edge_weights else 1.0
    for (source, target), weight in edge_weights.items():
        fig.add_trace(go.Scatter(
            x=[pos[source][0], pos[target][0]],
            y=[pos[source][1], pos[target][1]],
            mode='lines',
            line=dict(width=weight / max_weight),
        ))

    fig.update_layout(showlegend=False, hovermode='closest')
    # NOTE(review): the display call for the graph was missing from the
    # source; rendering it here is assumed to be the intent.
    st.plotly_chart(fig, use_container_width=True)

    # ---- Top 20 models by degree -------------------------------------------
    degrees = dict(G.degree())
    top_20_models = sorted(degrees.items(), key=lambda item: item[1], reverse=True)[:20]

    st.subheader("Top 20 Models by Number of Connections")
    for node, degree in top_20_models:
        st.write(f"{model_names[node]}: {degree} connections")

    # ---- Representative model per community --------------------------------
    # The representative is the highest-degree node inside the community.
    communities = list(members)
    df_reps = pd.DataFrame({
        'Community ID': communities,
        # Single-node communities are absent from community_sizes; they get
        # a size of 1.
        'Size': [community_sizes.get(comm, 1) for comm in communities],
        'Representative Model': [
            model_names[max(members[comm], key=lambda node: degrees[node])]
            for comm in communities
        ],
    })
    df_reps.sort_values(by='Size', ascending=False, inplace=True)

    st.subheader("Representative for each community, sorted by community size.")
    # NOTE(review): the call that displayed the table was missing from the
    # source; st.dataframe is assumed.
    st.dataframe(df_reps)

    # ---- Optional word clouds, one per agglomerative cluster ---------------
    if wordClouds:
        for name, group in df_extra_filtered.groupby('cluster'):
            # All model names of the cluster joined into one text.
            text = ' '.join(group['model_name'])
            image = WordCloud().generate(text).to_image()
            st.image(image, use_column_width=True)
            st.write(f'Word Cloud for Cluster {name}')

    # ---- Downloads vs likes ------------------------------------------------
    scatter_plot = create_downloads_vs_likes_scatter(df_extra_filtered)
    st.plotly_chart(scatter_plot, use_container_width=True)