Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
eduagarcia
committed on
Commit
·
6f8ad2f
1
Parent(s):
9066f73
Caches the models' metadata cards to a temporary file to speed up initialization
Browse files
- .gitignore +2 -1
- app.py +21 -10
.gitignore
CHANGED
@@ -1 +1,2 @@
|
|
1 |
-
*.pyc
|
|
|
|
1 |
+
*.pyc
|
2 |
+
model_infos.json
|
app.py
CHANGED
@@ -151,10 +151,14 @@ def add_rank(df):
|
|
151 |
df.fillna("", inplace=True)
|
152 |
return df
|
153 |
|
154 |
-
|
155 |
-
|
|
|
|
|
|
|
|
|
156 |
def get_mteb_data(tasks=["Clustering"], langs=[], datasets=[], fillna=True, add_emb_dim=True, task_to_metric=TASK_TO_METRIC, rank=True, refresh=True):
|
157 |
-
global
|
158 |
api = API
|
159 |
models = api.list_models(filter="mteb")
|
160 |
# Initialize list to models that we cannot fetch metadata from
|
@@ -181,11 +185,13 @@ def get_mteb_data(tasks=["Clustering"], langs=[], datasets=[], fillna=True, add_
|
|
181 |
for model in models:
|
182 |
if model.modelId in MODELS_TO_SKIP: continue
|
183 |
print("MODEL", model.modelId)
|
184 |
-
if model.modelId not in
|
185 |
readme_path = hf_hub_download(model.modelId, filename="README.md")
|
186 |
meta = metadata_load(readme_path)
|
187 |
-
|
188 |
-
|
|
|
|
|
189 |
if "model-index" not in meta:
|
190 |
continue
|
191 |
# meta['model-index'][0]["results"] is list of elements like:
|
@@ -217,14 +223,19 @@ def get_mteb_data(tasks=["Clustering"], langs=[], datasets=[], fillna=True, add_
|
|
217 |
if add_emb_dim:
|
218 |
try:
|
219 |
# Fails on gated repos, so we only include scores for them
|
220 |
-
if
|
221 |
-
|
222 |
-
out["Embedding Dimensions"], out["Max Tokens"], out["Model Size (Million Parameters)"], out["Memory Usage (GB, fp32)"] =
|
223 |
except:
|
224 |
-
|
225 |
df_list.append(out)
|
226 |
if model.library_name == "sentence-transformers" or "sentence-transformers" in model.tags or "modules.json" in {file.rfilename for file in model.siblings}:
|
227 |
SENTENCE_TRANSFORMERS_COMPATIBLE_MODELS.add(out["Model"])
|
|
|
|
|
|
|
|
|
|
|
228 |
df = pd.DataFrame(df_list)
|
229 |
# If there are any models that are the same, merge them
|
230 |
# E.g. if out["Model"] has the same value in two places, merge & take whichever one is not NaN else just take the first one
|
|
|
151 |
df.fillna("", inplace=True)
|
152 |
return df
|
153 |
|
154 |
+
model_infos_path = "model_infos.json"
|
155 |
+
MODEL_INFOS = {}
|
156 |
+
if os.path.exists(model_infos_path):
|
157 |
+
with open(model_infos_path) as f:
|
158 |
+
MODEL_INFOS = json.load(f)
|
159 |
+
|
160 |
def get_mteb_data(tasks=["Clustering"], langs=[], datasets=[], fillna=True, add_emb_dim=True, task_to_metric=TASK_TO_METRIC, rank=True, refresh=True):
|
161 |
+
global MODEL_INFOS
|
162 |
api = API
|
163 |
models = api.list_models(filter="mteb")
|
164 |
# Initialize list to models that we cannot fetch metadata from
|
|
|
185 |
for model in models:
|
186 |
if model.modelId in MODELS_TO_SKIP: continue
|
187 |
print("MODEL", model.modelId)
|
188 |
+
if model.modelId not in MODEL_INFOS or refresh:
|
189 |
readme_path = hf_hub_download(model.modelId, filename="README.md")
|
190 |
meta = metadata_load(readme_path)
|
191 |
+
MODEL_INFOS[model.modelId] = {
|
192 |
+
"metadata": meta
|
193 |
+
}
|
194 |
+
meta = MODEL_INFOS[model.modelId]["metadata"]
|
195 |
if "model-index" not in meta:
|
196 |
continue
|
197 |
# meta['model-index'][0]["results"] is list of elements like:
|
|
|
223 |
if add_emb_dim:
|
224 |
try:
|
225 |
# Fails on gated repos, so we only include scores for them
|
226 |
+
if "dim_seq_size" not in MODEL_INFOS[model.modelId] or refresh:
|
227 |
+
MODEL_INFOS[model.modelId]["dim_seq_size"] = list(get_dim_seq_size(model))
|
228 |
+
out["Embedding Dimensions"], out["Max Tokens"], out["Model Size (Million Parameters)"], out["Memory Usage (GB, fp32)"] = tuple(MODEL_INFOS[model.modelId]["dim_seq_size"])
|
229 |
except:
|
230 |
+
MODEL_INFOS[model.modelId]["dim_seq_size"] = "", "", "", ""
|
231 |
df_list.append(out)
|
232 |
if model.library_name == "sentence-transformers" or "sentence-transformers" in model.tags or "modules.json" in {file.rfilename for file in model.siblings}:
|
233 |
SENTENCE_TRANSFORMERS_COMPATIBLE_MODELS.add(out["Model"])
|
234 |
+
|
235 |
+
# Save & cache MODEL_INFOS
|
236 |
+
with open("model_infos.json", "w") as f:
|
237 |
+
json.dump(MODEL_INFOS, f)
|
238 |
+
|
239 |
df = pd.DataFrame(df_list)
|
240 |
# If there are any models that are the same, merge them
|
241 |
# E.g. if out["Model"] has the same value in two places, merge & take whichever one is not NaN else just take the first one
|