Create app.py
app.py
ADDED
@@ -0,0 +1,291 @@
import streamlit as st
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
import math
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.feature_extraction.text import TfidfVectorizer
import itertools
import plotly.figure_factory as ff
from community import community_louvain
import networkx as nx
from sklearn.metrics.pairwise import cosine_distances
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cluster import AgglomerativeClustering
from PIL import Image
from wordcloud import WordCloud
import plotly.graph_objects as go


# Currently unused helper. ff.create_dendrogram computes the linkage itself,
# so it takes the raw feature matrix plus a linkagefun, not a precomputed
# linkage matrix.
def create_dendrogram(X, labels):
    fig = ff.create_dendrogram(X.toarray(), orientation='left', labels=labels,
                               linkagefun=lambda d: linkage(d, "single"))
    return fig

@st.cache_data
def load_data():
    data = pd.read_csv("HuggingFaceLLMsWithParamsAndReadmeLinks.csv")
    return data

df = load_data()  # use the cached loader instead of re-reading the CSV
st.title("Constellation: An Atlas of 15,000 Large Language Models")
st.write("15,821 to be precise. Scraped from Hugging Face on July 18, 2023.")
st.write("Please cite: Gao, S., & Gao, A. K. (2023, July 19). On the Origin of LLMs: An Evolutionary Tree and Graph for 15,821 Large Language Models. ArXiv.org; ArXiv. https://doi.org/10.48550/arXiv.2307.09793")
threshold = st.number_input("Enter the minimum number of downloads an LLM must have to be considered.", value=10000)
numClusters = st.number_input("Number of clusters to group into.", value=20, min_value=2, max_value=50)
wordClouds = st.checkbox("Show word clouds?")

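# Lightweight sanity check (an addition, inferred from the columns accessed
# below; the CSV's schema is not documented upstream).
expected_cols = {'model_name', 'downloads', 'likes', 'rank', 'params_millions'}
missing_cols = expected_cols - set(df.columns)
if missing_cols:
    st.warning(f"CSV is missing expected columns: {sorted(missing_cols)}")
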
def create_downloads_vs_likes_scatter(dataframe):
    # Work on a copy so the caller's DataFrame is not mutated
    dataframe = dataframe.copy()

    # Convert 'likes' column to numeric values
    dataframe['likes'] = pd.to_numeric(dataframe['likes'], errors='coerce')

    # Filter out the outlier point at 14M likes
    dataframe_filtered = dataframe[dataframe['likes'] != 14000000]

    fig = go.Figure()
    fig.add_trace(go.Scatter(x=dataframe_filtered['downloads'], y=dataframe_filtered['likes'], mode='markers',
                             marker=dict(color='blue', size=7, opacity=0.7),
                             text=dataframe_filtered['model_name'],
                             hovertemplate="Model Name: %{text}<br>Downloads: %{x}<br>Likes: %{y}<extra></extra>"))
    fig.update_layout(title='Downloads vs Likes',
                      xaxis_title='Downloads',
                      #xaxis_range=[0,300000],
                      yaxis_title='Likes')
                      #yaxis_range=[0, 800]) # Set custom y-axis range
    return fig


if st.button("Run Clustering"):
    df_filtered = df[df['downloads'] > threshold]
    # .copy() avoids pandas SettingWithCopy issues when adding columns later
    df_extra_filtered = df_filtered.drop_duplicates(subset='model_name', keep='first').copy()

    # Convert the model names into a matrix of TF-IDF features over character n-grams
    vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(2, 8))
    X = vectorizer.fit_transform(df_extra_filtered['model_name'].tolist()).toarray()

    # Function to compute the pairwise cosine distances
    def distfun(X):
        return cosine_distances(X)

    # Function to compute the linkage matrix
    def linkagefun(dist_array):
        return linkage(dist_array, "single")

    # Create dendrogram
    fig = ff.create_dendrogram(X, orientation='bottom', labels=df_extra_filtered['model_name'].tolist(), distfun=distfun, linkagefun=linkagefun)
    #fig.update_layout(width=800, height=500)
    st.plotly_chart(fig, use_container_width=True)

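    # Illustrative sketch (defined but never called): how the char n-gram
    # TF-IDF / cosine-distance pipeline above scores a pair of model names.
    # The two names are hypothetical examples; a near-zero distance means the
    # dendrogram would merge them early.
    def _name_similarity_demo():
        names = ["llama-7b-hf", "llama-13b-hf"]  # hypothetical names
        demo_vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(2, 8))
        demo_tfidf = demo_vectorizer.fit_transform(names)
        return cosine_distances(demo_tfidf)[0, 1]
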
    # Group by cluster
    # Convert the model names into a matrix of token counts
    vectorizer = CountVectorizer(analyzer='char', ngram_range=(3, 6))
    X = vectorizer.fit_transform(df_extra_filtered['model_name'])
    # Use clustering to group model names; honor the user-selected cluster
    # count instead of a hardcoded 20
    clustering = AgglomerativeClustering(n_clusters=numClusters).fit(X.toarray())

    # Add cluster labels to the filtered DataFrame
    df_extra_filtered['cluster'] = clustering.labels_

    # Count the number of models in each cluster
    cluster_counts = df_extra_filtered['cluster'].value_counts()

    # Create a bar chart
    fig = go.Figure([go.Bar(x=cluster_counts.index, y=cluster_counts.values)])
    fig.update_layout(title='Number of Models per Cluster', xaxis_title='Cluster', yaxis_title='Number of Models')
    st.plotly_chart(fig)

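    # Sketch (unused): AgglomerativeClustering defaults to Ward linkage on
    # Euclidean distances, which is why the dense .toarray() above is needed.
    # A minimal reproduction of the grouping step on a list of names:
    def _cluster_demo(names, k=2):
        demo_counts = CountVectorizer(analyzer='char', ngram_range=(3, 6)).fit_transform(names)
        return AgglomerativeClustering(n_clusters=k).fit_predict(demo_counts.toarray())
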
    # graphing!

    # Convert the model names into a matrix of TF-IDF features
    vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(2, 8))
    X = vectorizer.fit_transform(df_extra_filtered['model_name'])

    # Compute the pairwise cosine similarities
    sim_matrix = cosine_similarity(X)

    # Create a graph
    G = nx.Graph()

    # Add nodes to the graph
    for i in range(len(df_extra_filtered)):
        G.add_node(i, label=df_extra_filtered['model_name'].iloc[i])

    # Add edges to the graph
    for i in range(len(df_extra_filtered)):
        for j in range(i+1, len(df_extra_filtered)):
            # Connect two models if their name similarity is above a threshold
            if sim_matrix[i, j] > 0.2:
                G.add_edge(i, j, weight=sim_matrix[i, j])

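    # Sketch of a vectorized alternative to the double loop above (not used
    # here): np.triu_indices_from visits each unordered pair once, and the
    # 0.2 threshold matches the loop's.
    def _edges_vectorized(sim, threshold=0.2):
        rows, cols = np.triu_indices_from(sim, k=1)
        mask = sim[rows, cols] > threshold
        return list(zip(rows[mask], cols[mask], sim[rows, cols][mask]))
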
    # Detect communities
    partition = community_louvain.best_partition(G)

    # Compute a spring layout for each community separately
    layouts = {}
    for community in set(partition.values()):
        nodes_in_community = [node for node, comm in partition.items() if comm == community]
        subgraph = G.subgraph(nodes_in_community)
        layouts[community] = nx.spring_layout(subgraph)

    # Combine the layouts, spreading them out on a grid
    grid_size = math.ceil(math.sqrt(len(layouts)))  # Size of the grid
    grid = np.array(list(itertools.product(range(grid_size), repeat=2)))  # Coordinates for the grid
    scale = 2  # Scale factor for spreading out the communities
    offsets = dict(zip(layouts, grid * scale))  # Map communities to grid coordinates

    combined_layout = {}
    for community, layout in layouts.items():
        for node, position in layout.items():
            combined_layout[node] = position + offsets[community]

    # Use the combined per-community layout for all node, centroid, and edge
    # positions below, so the communities stay spread out on the grid
    pos = combined_layout

    # Create a figure
    fig = go.Figure()

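    # Sketch (unused): best_partition returns a {node: community_id} dict by
    # maximizing modularity. On a graph of two cliques joined by a single
    # edge, Louvain typically recovers the two cliques as communities.
    def _louvain_demo():
        H = nx.barbell_graph(5, 0)  # two K5 cliques joined by one edge
        return community_louvain.best_partition(H)
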
    # Prepare lists for node positions, labels, ranks, downloads, likes, params, and colors
    x, y, labels, ranks, downloads, likes, params, node_colors = [], [], [], [], [], [], [], []

    # Fixed qualitative palette (plotly's default colorway), cycled over
    # community ids; a bare int is not a valid scalar plotly color
    palette = ['#636efa', '#EF553B', '#00cc96', '#ab63fa', '#FFA15A',
               '#19d3f3', '#FF6692', '#B6E880', '#FF97FF', '#FECB52']

    # Prepare the node attributes
    for node, community in partition.items():
        # Get model info
        model_info = df_extra_filtered.iloc[node]

        # Node position
        x.append(pos[node][0])
        y.append(pos[node][1])

        # Node attributes
        labels.append(model_info['model_name'])
        ranks.append(model_info['rank'])
        downloads.append(model_info['downloads'])
        likes.append(model_info['likes'])
        params.append(model_info['params_millions'] if pd.notnull(model_info['params_millions']) else 'N/A')
        node_colors.append(palette[community % len(palette)])

    # Compute the centroid of each cluster for background coloring
    centroids = dict()
    community_sizes = dict()  # Create a dict to store the sizes of each community
    for community in set(partition.values()):
        nodes_in_community = [node for node, comm in partition.items() if comm == community]
        if len(nodes_in_community) > 1:  # Only consider communities with more than one node
            centroid_x = np.mean([pos[node][0] for node in nodes_in_community])
            centroid_y = np.mean([pos[node][1] for node in nodes_in_community])
            centroids[community] = (centroid_x, centroid_y)
            community_sizes[community] = len(nodes_in_community)

    # Add background coloring for each cluster
    for community, centroid in centroids.items():
        fig.add_trace(go.Scatter(
            x=[centroid[0]], y=[centroid[1]],
            mode='markers',
            marker=dict(
                size=community_sizes[community]*5,  # Scale marker size by community size
                color=palette[community % len(palette)],
                opacity=0.1
            ),
            hoverinfo='none',
            showlegend=False
        ))

    # Add nodes to the figure. Note: with 'N/A' in params, np.stack coerces
    # every customdata column to strings, which is fine for hover text.
    fig.add_trace(go.Scatter(
        x=x, y=y,
        mode='markers',
        marker=dict(size=3, color=node_colors),
        text=labels,
        customdata=np.stack((ranks, downloads, likes, params), axis=-1),
        hovertemplate=(
            "Model Name: %{text}<br>"
            "Rank: %{customdata[0]}<br>"
            "Downloads: %{customdata[1]}<br>"
            "Likes: %{customdata[2]}<br>"
            "Params (millions): %{customdata[3]}"
            "<extra></extra>"
        )
    ))

    # Add edges to the figure, normalizing line widths by the maximum edge
    # weight (computed once, not per edge)
    max_weight = max(nx.get_edge_attributes(G, 'weight').values(), default=1)
    for edge in G.edges():
        line_width = G.edges[edge]['weight'] / max_weight

        fig.add_trace(go.Scatter(
            x=[pos[edge[0]][0], pos[edge[1]][0]],
            y=[pos[edge[0]][1], pos[edge[1]][1]],
            mode='lines',
            line=dict(width=line_width),
            hoverinfo='none'
        ))

    # Set the figure layout
    fig.update_layout(showlegend=False, hovermode='closest')

    st.plotly_chart(fig)
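
    # Rendering note: one Scatter trace per edge gets slow for dense graphs.
    # A common alternative (a sketch, not what the app does) is a single
    # trace with None gaps between segments:
    def _single_edge_trace(graph, layout):
        xs, ys = [], []
        for u, v in graph.edges():
            xs += [layout[u][0], layout[v][0], None]
            ys += [layout[u][1], layout[v][1], None]
        return go.Scatter(x=xs, y=ys, mode='lines', hoverinfo='none')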

    # Calculate degree of each node
    degrees = dict(G.degree())

    # Sort nodes by degree in descending order and get top 20
    top_20_models = sorted(degrees.items(), key=lambda x: x[1], reverse=True)[:20]

    # Prepare data for display
    models = [df_extra_filtered.iloc[node]['model_name'] for node, degree in top_20_models]
    connections = [degree for node, degree in top_20_models]

    st.subheader("Top 20 Models by Number of Connections")
    for model, n_connections in zip(models, connections):
        st.write(f"{model}: {n_connections} connections")

    # Find the representative model for each community
    representatives = dict()
    for community in set(partition.values()):
        nodes_in_community = [node for node, comm in partition.items() if comm == community]
        # Select the node with the highest degree within the community as representative
        representative = max(nodes_in_community, key=lambda node: degrees[node])
        representatives[community] = df_extra_filtered.iloc[representative]['model_name']

    # Prepare data for display
    communities = list(representatives.keys())
    sizes = [community_sizes.get(comm, 1) for comm in communities]  # Default to 1 for single-node communities
    rep_names = list(representatives.values())

    # Create a DataFrame to hold the data
    df_reps = pd.DataFrame({
        'Community ID': communities,
        'Size': sizes,
        'Representative Model': rep_names
    })

    # Sort the DataFrame by community size in descending order
    df_reps.sort_values(by='Size', ascending=False, inplace=True)

    # Display in Streamlit
    st.subheader("Representative for each community, sorted by community size.")
    st.dataframe(df_reps)
    if wordClouds:
        groups = df_extra_filtered.groupby('cluster')

        for name, group in groups:
            # Join all model names in the cluster into a single string
            text = ' '.join(group['model_name'])

            # Generate a word cloud
            wordcloud = WordCloud().generate(text)

            # Convert WordCloud to Image
            image = wordcloud.to_image()

            # Display the word cloud with its caption
            st.image(image, use_column_width=True)
            st.write(f'Word Cloud for Cluster {name}')

    scatter_plot = create_downloads_vs_likes_scatter(df_extra_filtered)
    st.plotly_chart(scatter_plot, use_container_width=True)
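
# The imports above imply roughly this requirements.txt (package names only;
# version pins are assumptions, and `community` is provided by the
# python-louvain package):
#   streamlit
#   numpy
#   pandas
#   scipy
#   scikit-learn
#   plotly
#   networkx
#   python-louvain
#   wordcloud
#   Pillow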