Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
Tom Aarsen
committed on
Commit
·
6c6aac5
1
Parent(s):
cfacdee
Add Sentence Transformers model type option
Browse files
app.py
CHANGED
@@ -1003,6 +1003,104 @@ MODELS_TO_SKIP = {
|
|
1003 |
"Koat/gte-tiny",
|
1004 |
}
|
1005 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1006 |
def add_lang(examples):
|
1007 |
if not(examples["eval_language"]):
|
1008 |
examples["mteb_dataset_name_with_lang"] = examples["mteb_dataset_name"]
|
@@ -1170,6 +1268,8 @@ def get_mteb_data(tasks=["Clustering"], langs=[], datasets=[], fillna=True, add_
|
|
1170 |
except:
|
1171 |
pass
|
1172 |
df_list.append(out)
|
|
|
|
|
1173 |
df = pd.DataFrame(df_list)
|
1174 |
# If there are any models that are the same, merge them
|
1175 |
# E.g. if out["Model"] has the same value in two places, merge & take whichever one is not NaN else just take the first one
|
@@ -1863,22 +1963,21 @@ def update_url_language(event: gr.SelectData, current_task_language: dict, langu
|
|
1863 |
|
1864 |
NUMERIC_INTERVALS = {
|
1865 |
"<100M": pd.Interval(0, 100, closed="right"),
|
1866 |
-
"
|
1867 |
-
"
|
1868 |
-
"
|
1869 |
">1B": pd.Interval(1000, 1_000_000, closed="right"),
|
1870 |
}
|
1871 |
|
1872 |
MODEL_TYPES = [
|
1873 |
"Open",
|
1874 |
"Proprietary",
|
|
|
1875 |
]
|
1876 |
|
1877 |
def filter_data(search_query, model_types, model_sizes, *full_dataframes):
|
1878 |
output_dataframes = []
|
1879 |
for df in full_dataframes:
|
1880 |
-
# df = pd.DataFrame(data=dataframe.value["data"], columns=dataframe.value["headers"])
|
1881 |
-
|
1882 |
# Apply the search query
|
1883 |
if search_query:
|
1884 |
names = df["Model"].map(lambda x: re.match("<a .+?>(.+)</a>", x).group(1))
|
@@ -1895,7 +1994,12 @@ def filter_data(search_query, model_types, model_sizes, *full_dataframes):
|
|
1895 |
masks.append(df["Model Size (Million Parameters)"] != "")
|
1896 |
elif model_type == "Proprietary":
|
1897 |
masks.append(df["Model Size (Million Parameters)"] == "")
|
1898 |
-
|
|
|
|
|
|
|
|
|
|
|
1899 |
|
1900 |
# Apply the model size filtering
|
1901 |
if model_sizes != list(NUMERIC_INTERVALS.keys()):
|
@@ -1920,8 +2024,8 @@ with gr.Blocks(css=css) as block:
|
|
1920 |
|
1921 |
with gr.Row():
|
1922 |
search_bar = gr.Textbox(
|
1923 |
-
label="Search Bar",
|
1924 |
-
placeholder=" 🔍 Search for
|
1925 |
)
|
1926 |
filter_model_type = gr.CheckboxGroup(
|
1927 |
label="Model types",
|
@@ -1935,7 +2039,8 @@ with gr.Blocks(css=css) as block:
|
|
1935 |
choices=list(NUMERIC_INTERVALS.keys()),
|
1936 |
value=list(NUMERIC_INTERVALS.keys()),
|
1937 |
interactive=True,
|
1938 |
-
elem_classes=["filter-checkbox-group"]
|
|
|
1939 |
)
|
1940 |
|
1941 |
with gr.Tabs() as outer_tabs:
|
|
|
1003 |
"Koat/gte-tiny",
|
1004 |
}
|
1005 |
|
1006 |
+
SENTENCE_TRANSFORMERS_COMPATIBLE_MODELS = {
|
1007 |
+
"allenai-specter",
|
1008 |
+
"allenai-specter",
|
1009 |
+
"all-MiniLM-L12-v2",
|
1010 |
+
"all-MiniLM-L6-v2",
|
1011 |
+
"all-mpnet-base-v2",
|
1012 |
+
"bert-base-10lang-cased",
|
1013 |
+
"bert-base-15lang-cased",
|
1014 |
+
"bert-base-25lang-cased",
|
1015 |
+
"bert-base-multilingual-cased",
|
1016 |
+
"bert-base-multilingual-uncased",
|
1017 |
+
"bert-base-swedish-cased",
|
1018 |
+
"bert-base-uncased",
|
1019 |
+
"bge-base-zh-v1.5",
|
1020 |
+
"bge-large-zh-v1.5",
|
1021 |
+
"bge-large-zh-noinstruct",
|
1022 |
+
"bge-small-zh-v1.5",
|
1023 |
+
"camembert-base",
|
1024 |
+
"camembert-large",
|
1025 |
+
"contriever-base-msmarco",
|
1026 |
+
"cross-en-de-roberta-sentence-transformer",
|
1027 |
+
"DanskBERT",
|
1028 |
+
"distilbert-base-25lang-cased",
|
1029 |
+
"distilbert-base-en-fr-cased",
|
1030 |
+
"distilbert-base-en-fr-es-pt-it-cased",
|
1031 |
+
"distilbert-base-fr-cased",
|
1032 |
+
"distilbert-base-uncased",
|
1033 |
+
"distiluse-base-multilingual-cased-v2",
|
1034 |
+
"dfm-encoder-large-v1",
|
1035 |
+
"dfm-sentence-encoder-large-1",
|
1036 |
+
"e5-base",
|
1037 |
+
"e5-large",
|
1038 |
+
"e5-mistral-7b-instruct",
|
1039 |
+
"e5-small",
|
1040 |
+
"electra-small-nordic",
|
1041 |
+
"electra-small-swedish-cased-discriminator",
|
1042 |
+
"flaubert_base_cased",
|
1043 |
+
"flaubert_base_uncased",
|
1044 |
+
"flaubert_large_cased",
|
1045 |
+
"gbert-base",
|
1046 |
+
"gbert-large",
|
1047 |
+
"gelectra-base",
|
1048 |
+
"gelectra-large",
|
1049 |
+
"glove.6B.300d",
|
1050 |
+
"gottbert-base",
|
1051 |
+
"gtr-t5-base",
|
1052 |
+
"gtr-t5-large",
|
1053 |
+
"gtr-t5-xl",
|
1054 |
+
"gtr-t5-xxl",
|
1055 |
+
"herbert-base-retrieval-v2",
|
1056 |
+
"komninos",
|
1057 |
+
"luotuo-bert-medium",
|
1058 |
+
"LaBSE",
|
1059 |
+
"m3e-base",
|
1060 |
+
"m3e-large",
|
1061 |
+
"msmarco-bert-co-condensor",
|
1062 |
+
"multi-qa-MiniLM-L6-cos-v1",
|
1063 |
+
"multilingual-e5-base",
|
1064 |
+
"multilingual-e5-large",
|
1065 |
+
"multilingual-e5-small",
|
1066 |
+
"nb-bert-base",
|
1067 |
+
"nb-bert-large",
|
1068 |
+
"nomic-embed-text-v1.5-64",
|
1069 |
+
"nomic-embed-text-v1.5-128",
|
1070 |
+
"nomic-embed-text-v1.5-256",
|
1071 |
+
"nomic-embed-text-v1.5-512",
|
1072 |
+
"norbert3-base",
|
1073 |
+
"norbert3-large",
|
1074 |
+
"paraphrase-multilingual-mpnet-base-v2",
|
1075 |
+
"paraphrase-multilingual-MiniLM-L12-v2",
|
1076 |
+
"sentence-camembert-base",
|
1077 |
+
"sentence-camembert-large",
|
1078 |
+
"sentence-croissant-llm-base",
|
1079 |
+
"sentence-bert-swedish-cased",
|
1080 |
+
"sentence-t5-base",
|
1081 |
+
"sentence-t5-large",
|
1082 |
+
"sentence-t5-xl",
|
1083 |
+
"sentence-t5-xxl",
|
1084 |
+
"silver-retriever-base-v1",
|
1085 |
+
"sup-simcse-bert-base-uncased",
|
1086 |
+
"st-polish-paraphrase-from-distilroberta",
|
1087 |
+
"st-polish-paraphrase-from-mpnet",
|
1088 |
+
"text2vec-base-chinese",
|
1089 |
+
"text2vec-large-chinese",
|
1090 |
+
"udever-bloom-1b1",
|
1091 |
+
"udever-bloom-560m",
|
1092 |
+
"universal-sentence-encoder-multilingual-3",
|
1093 |
+
"universal-sentence-encoder-multilingual-large-3",
|
1094 |
+
"unsup-simcse-bert-base-uncased",
|
1095 |
+
"use-cmlm-multilingual",
|
1096 |
+
"xlm-roberta-base",
|
1097 |
+
"xlm-roberta-large",
|
1098 |
+
}
|
1099 |
+
SENTENCE_TRANSFORMERS_COMPATIBLE_MODELS = {
|
1100 |
+
make_clickable_model(model, link=EXTERNAL_MODEL_TO_LINK.get(model, "https://huggingface.co/spaces/mteb/leaderboard"))
|
1101 |
+
for model in SENTENCE_TRANSFORMERS_COMPATIBLE_MODELS
|
1102 |
+
}
|
1103 |
+
|
1104 |
def add_lang(examples):
|
1105 |
if not(examples["eval_language"]):
|
1106 |
examples["mteb_dataset_name_with_lang"] = examples["mteb_dataset_name"]
|
|
|
1268 |
except:
|
1269 |
pass
|
1270 |
df_list.append(out)
|
1271 |
+
if model.library_name == "sentence-transformers" or "sentence-transformers" in model.tags or "modules.json" in {file.rfilename for file in model.siblings}:
|
1272 |
+
SENTENCE_TRANSFORMERS_COMPATIBLE_MODELS.add(out["Model"])
|
1273 |
df = pd.DataFrame(df_list)
|
1274 |
# If there are any models that are the same, merge them
|
1275 |
# E.g. if out["Model"] has the same value in two places, merge & take whichever one is not NaN else just take the first one
|
|
|
1963 |
|
1964 |
NUMERIC_INTERVALS = {
|
1965 |
"<100M": pd.Interval(0, 100, closed="right"),
|
1966 |
+
"100M to 250M": pd.Interval(100, 250, closed="right"),
|
1967 |
+
"250M to 500M": pd.Interval(250, 500, closed="right"),
|
1968 |
+
"500M to 1B": pd.Interval(500, 1000, closed="right"),
|
1969 |
">1B": pd.Interval(1000, 1_000_000, closed="right"),
|
1970 |
}
|
1971 |
|
1972 |
MODEL_TYPES = [
|
1973 |
"Open",
|
1974 |
"Proprietary",
|
1975 |
+
"Sentence Transformers",
|
1976 |
]
|
1977 |
|
1978 |
def filter_data(search_query, model_types, model_sizes, *full_dataframes):
|
1979 |
output_dataframes = []
|
1980 |
for df in full_dataframes:
|
|
|
|
|
1981 |
# Apply the search query
|
1982 |
if search_query:
|
1983 |
names = df["Model"].map(lambda x: re.match("<a .+?>(.+)</a>", x).group(1))
|
|
|
1994 |
masks.append(df["Model Size (Million Parameters)"] != "")
|
1995 |
elif model_type == "Proprietary":
|
1996 |
masks.append(df["Model Size (Million Parameters)"] == "")
|
1997 |
+
elif model_type == "Sentence Transformers":
|
1998 |
+
masks.append(df["Model"].isin(SENTENCE_TRANSFORMERS_COMPATIBLE_MODELS))
|
1999 |
+
if masks:
|
2000 |
+
df = df[reduce(lambda a, b: a | b, masks)]
|
2001 |
+
else:
|
2002 |
+
df = pd.DataFrame(columns=df.columns)
|
2003 |
|
2004 |
# Apply the model size filtering
|
2005 |
if model_sizes != list(NUMERIC_INTERVALS.keys()):
|
|
|
2024 |
|
2025 |
with gr.Row():
|
2026 |
search_bar = gr.Textbox(
|
2027 |
+
label="Search Bar (separate multiple queries with `;`)",
|
2028 |
+
placeholder=" 🔍 Search for a model and press enter...",
|
2029 |
)
|
2030 |
filter_model_type = gr.CheckboxGroup(
|
2031 |
label="Model types",
|
|
|
2039 |
choices=list(NUMERIC_INTERVALS.keys()),
|
2040 |
value=list(NUMERIC_INTERVALS.keys()),
|
2041 |
interactive=True,
|
2042 |
+
elem_classes=["filter-checkbox-group"],
|
2043 |
+
scale=2,
|
2044 |
)
|
2045 |
|
2046 |
with gr.Tabs() as outer_tabs:
|