eduagarcia committed on
Commit
6f8ad2f
·
1 Parent(s): 9066f73

Caches model card metadata to a temporary file to speed up initialization

Browse files
Files changed (2) hide show
  1. .gitignore +2 -1
  2. app.py +21 -10
.gitignore CHANGED
@@ -1 +1,2 @@
1
- *.pyc
 
 
1
+ *.pyc
2
+ model_infos.json
app.py CHANGED
@@ -151,10 +151,14 @@ def add_rank(df):
151
  df.fillna("", inplace=True)
152
  return df
153
 
154
- MODEL_CARD_METADATA = {}
155
- MODEL_EMB_DIM = {}
 
 
 
 
156
  def get_mteb_data(tasks=["Clustering"], langs=[], datasets=[], fillna=True, add_emb_dim=True, task_to_metric=TASK_TO_METRIC, rank=True, refresh=True):
157
- global MODEL_CARD_METADATA, MODEL_EMB_DIM
158
  api = API
159
  models = api.list_models(filter="mteb")
160
  # Initialize list to models that we cannot fetch metadata from
@@ -181,11 +185,13 @@ def get_mteb_data(tasks=["Clustering"], langs=[], datasets=[], fillna=True, add_
181
  for model in models:
182
  if model.modelId in MODELS_TO_SKIP: continue
183
  print("MODEL", model.modelId)
184
- if model.modelId not in MODEL_CARD_METADATA or refresh:
185
  readme_path = hf_hub_download(model.modelId, filename="README.md")
186
  meta = metadata_load(readme_path)
187
- MODEL_CARD_METADATA[model.modelId] = meta
188
- meta = MODEL_CARD_METADATA[model.modelId]
 
 
189
  if "model-index" not in meta:
190
  continue
191
  # meta['model-index'][0]["results"] is list of elements like:
@@ -217,14 +223,19 @@ def get_mteb_data(tasks=["Clustering"], langs=[], datasets=[], fillna=True, add_
217
  if add_emb_dim:
218
  try:
219
  # Fails on gated repos, so we only include scores for them
220
- if model.modelId not in MODEL_EMB_DIM or refresh:
221
- MODEL_EMB_DIM[model.modelId] = get_dim_seq_size(model)
222
- out["Embedding Dimensions"], out["Max Tokens"], out["Model Size (Million Parameters)"], out["Memory Usage (GB, fp32)"] = MODEL_EMB_DIM[model.modelId]
223
  except:
224
- MODEL_EMB_DIM[model.modelId] = "", "", "", ""
225
  df_list.append(out)
226
  if model.library_name == "sentence-transformers" or "sentence-transformers" in model.tags or "modules.json" in {file.rfilename for file in model.siblings}:
227
  SENTENCE_TRANSFORMERS_COMPATIBLE_MODELS.add(out["Model"])
 
 
 
 
 
228
  df = pd.DataFrame(df_list)
229
  # If there are any models that are the same, merge them
230
  # E.g. if out["Model"] has the same value in two places, merge & take whichever one is not NaN else just take the first one
 
151
  df.fillna("", inplace=True)
152
  return df
153
 
154
+ model_infos_path = "model_infos.json"
155
+ MODEL_INFOS = {}
156
+ if os.path.exists(model_infos_path):
157
+ with open(model_infos_path) as f:
158
+ MODEL_INFOS = json.load(f)
159
+
160
  def get_mteb_data(tasks=["Clustering"], langs=[], datasets=[], fillna=True, add_emb_dim=True, task_to_metric=TASK_TO_METRIC, rank=True, refresh=True):
161
+ global MODEL_INFOS
162
  api = API
163
  models = api.list_models(filter="mteb")
164
  # Initialize list to models that we cannot fetch metadata from
 
185
  for model in models:
186
  if model.modelId in MODELS_TO_SKIP: continue
187
  print("MODEL", model.modelId)
188
+ if model.modelId not in MODEL_INFOS or refresh:
189
  readme_path = hf_hub_download(model.modelId, filename="README.md")
190
  meta = metadata_load(readme_path)
191
+ MODEL_INFOS[model.modelId] = {
192
+ "metadata": meta
193
+ }
194
+ meta = MODEL_INFOS[model.modelId]["metadata"]
195
  if "model-index" not in meta:
196
  continue
197
  # meta['model-index'][0]["results"] is list of elements like:
 
223
  if add_emb_dim:
224
  try:
225
  # Fails on gated repos, so we only include scores for them
226
+ if "dim_seq_size" not in MODEL_INFOS[model.modelId] or refresh:
227
+ MODEL_INFOS[model.modelId]["dim_seq_size"] = list(get_dim_seq_size(model))
228
+ out["Embedding Dimensions"], out["Max Tokens"], out["Model Size (Million Parameters)"], out["Memory Usage (GB, fp32)"] = tuple(MODEL_INFOS[model.modelId]["dim_seq_size"])
229
  except:
230
+ MODEL_INFOS[model.modelId]["dim_seq_size"] = "", "", "", ""
231
  df_list.append(out)
232
  if model.library_name == "sentence-transformers" or "sentence-transformers" in model.tags or "modules.json" in {file.rfilename for file in model.siblings}:
233
  SENTENCE_TRANSFORMERS_COMPATIBLE_MODELS.add(out["Model"])
234
+
235
+ # Save & cache MODEL_INFOS
236
+ with open("model_infos.json", "w") as f:
237
+ json.dump(MODEL_INFOS, f)
238
+
239
  df = pd.DataFrame(df_list)
240
  # If there are any models that are the same, merge them
241
  # E.g. if out["Model"] has the same value in two places, merge & take whichever one is not NaN else just take the first one