File size: 8,327 Bytes
bda7c4e
 
6b2b26c
 
 
bda7c4e
 
00b7e99
bda7c4e
 
 
 
 
 
 
 
 
f5b2e86
 
bda7c4e
 
 
 
a323ffa
138fae9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ec20a0c
 
 
 
 
bda7c4e
 
00b7e99
 
 
 
 
 
 
 
 
 
a323ffa
 
 
 
00b7e99
 
ae12324
ec20a0c
 
 
 
00b7e99
138fae9
00b7e99
138fae9
07c4ca7
138fae9
f5b2e86
00b7e99
6b2b26c
 
07c4ca7
 
bda7c4e
 
 
 
2715fce
 
4a04f21
bda7c4e
0f060f8
bda7c4e
6b2b26c
1c32a9e
 
 
bda7c4e
ae12324
bda7c4e
 
 
1c32a9e
bda7c4e
0f060f8
 
bda7c4e
 
 
 
 
f5b2e86
 
 
 
 
 
 
 
 
 
62e33ae
 
 
f5b2e86
62e33ae
f5b2e86
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4a04f21
f5b2e86
 
 
 
 
 
 
 
 
 
6b2b26c
 
bda7c4e
 
 
6b2b26c
bda7c4e
 
 
 
 
 
 
 
 
 
 
 
 
6b2b26c
 
00b7e99
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
import streamlit as st
import pandas as pd
import io
import re

# Constants
# URL of the STEL GitHub repository (not referenced in the visible part of this module).
GITHUB_URL = "https://github.com/Sartify/STEL"
# Candidate names of descriptive (non-benchmark) columns. display_leaderboard keeps
# whichever of these exist in the table and offers every other column as a
# selectable benchmark.
POSSIBLE_NON_BENCHMARK_COLS = ["Model Name", "Publisher", "Open?", "Basemodel", "Matryoshka", "Dimension", "Average"]

def extract_table_from_markdown(markdown_text, table_start):
    """Return the block of lines that begins at the first line starting with ``table_start``.

    Lines are collected from that first matching line onwards. Collection ends at
    the first later line that is blank or starts with ``#``, unless that line
    itself starts with ``table_start``. The collected lines are joined with
    newlines; an empty string is returned when no line matches.
    """
    collected = []
    for current in markdown_text.split('\n'):
        opens_table = current.startswith(table_start)
        # Nothing collected yet and this is not the opening line: keep scanning.
        if not collected and not opens_table:
            continue
        # Inside the table: a heading or a blank line marks its end.
        if not opens_table and (current.startswith('#') or not current.strip()):
            break
        collected.append(current)
    return '\n'.join(collected)


# def markdown_table_to_df(table_content):
#     """Convert markdown table to pandas DataFrame."""
#     # Split the table content into lines
#     lines = table_content.split('\n')
    
#     # Extract headers
#     headers = [h.strip() for h in lines[0].split('|') if h.strip()]
    
#     # Extract data
#     data = []
#     for line in lines[2:]:  # Skip the header separator line
#         row = [cell.strip() for cell in line.split('|') if cell.strip()]
#         if row:  # Include any non-empty row
#             # Pad the row with empty strings if it's shorter than the headers
#             padded_row = row + [''] * (len(headers) - len(row))
#             data.append(padded_row[:len(headers)])  # Trim if longer than headers
    
#     # Create DataFrame
#     df = pd.DataFrame(data, columns=headers)
    
#     # Convert numeric columns to float
#     for col in df.columns:
#         if col not in ["Model Name", "Publisher", "Open?", "Basemodel", "Matryoshka"]:
#             df[col] = pd.to_numeric(df[col], errors='coerce')
    
#     return df

def extract_model_name(link):
    """Return the link text of a leading markdown link ``[text](url)``.

    If ``link`` does not begin with a markdown link, it is returned unchanged.
    """
    found = re.match(r'\[(.*?)\]\(.*?\)', link)
    if found is None:
        return link
    return found.group(1)

def _split_markdown_row(line):
    """Split one markdown table row into stripped cells, keeping empty cells."""
    text = line.strip()
    # Remove only the outer border pipes so interior empty cells keep their slot.
    if text.startswith('|'):
        text = text[1:]
    if text.endswith('|'):
        text = text[:-1]
    return [cell.strip() for cell in text.split('|')]


def markdown_table_to_df(table_content):
    """Convert a markdown table to a pandas DataFrame.

    The first line is the header row, the second line (the ``---`` separator)
    is skipped, and every following line with at least one non-empty cell
    becomes a data row.

    Cells are matched to headers by position, so an empty cell stays in its own
    column instead of shifting the rest of the row to the left. Rows shorter
    than the header are padded with empty strings; extra cells are dropped.
    Columns whose header cell is empty are ignored.

    Post-processing:
      * ``Model Name``: markdown links are reduced to their link text.
      * ``Dimension``: decimal strings become ``int``; anything else becomes ``""``.
      * ``Publisher``, ``Open?``, ``Basemodel``, ``Matryoshka``: left as text.
      * every other column: converted with ``pd.to_numeric``; unparsable or
        empty values become NaN.

    Args:
        table_content: The markdown table as a single newline-separated string.

    Returns:
        A DataFrame with one column per non-empty header cell.
    """
    lines = table_content.split('\n')

    # Remember the position of every named header so data cells can be picked
    # by index rather than by "n-th non-empty cell".
    header_cells = _split_markdown_row(lines[0])
    kept_positions = [idx for idx, name in enumerate(header_cells) if name]
    headers = [header_cells[idx] for idx in kept_positions]

    data = []
    for line in lines[2:]:  # Skip the header separator line
        cells = _split_markdown_row(line)
        if not any(cells):
            continue  # Blank line or a row made only of empty cells
        # Pad short rows with '' and ignore cells beyond the header width.
        data.append([cells[idx] if idx < len(cells) else '' for idx in kept_positions])

    df = pd.DataFrame(data, columns=headers)

    # Process 'Model Name' column to extract plain text from markdown link
    if 'Model Name' in df.columns:
        df['Model Name'] = df['Model Name'].apply(extract_model_name)

    text_columns = {"Model Name", "Publisher", "Open?", "Basemodel", "Matryoshka"}
    for col in df.columns:
        if col == "Dimension":
            # isdecimal() (unlike isdigit()) only accepts characters int() can parse,
            # so values such as a superscript digit fall back to "" instead of raising.
            df[col] = df[col].apply(lambda x: int(x) if x.isdecimal() else "")
        elif col not in text_columns:
            df[col] = pd.to_numeric(df[col], errors='coerce')

    return df



def setup_page():
    """Configure the Streamlit page and render the title and logo."""
    page_settings = {
        "page_title": "Swahili Text Embeddings Leaderboard",
        "page_icon": "⚡",
        "layout": "wide",
    }
    st.set_page_config(**page_settings)

    st.title("⚡ Swahili Text Embeddings Leaderboard (STEL)")

    # The logo is loaded from the Hugging Face Space URL below.
    logo_url = "https://huggingface.co/spaces/sartifyllc/Swahili-Text-Embeddings-Leaderboard/resolve/main/STEL.jpg"
    st.image(logo_url, width=300)

def display_leaderboard(df):
    """Render the leaderboard table with a benchmark selector and a CSV download.

    Columns listed in POSSIBLE_NON_BENCHMARK_COLS that exist in ``df`` are always
    shown, in that list's order. Every other column is offered in a multiselect
    with all of them selected by default. float64 columns are displayed with
    four decimal places, and the displayed table can be downloaded as CSV.
    """
    st.header("📊 Leaderboard")

    # Split the columns into always-visible metadata and selectable benchmarks.
    metadata_cols = [name for name in POSSIBLE_NON_BENCHMARK_COLS if name in df.columns]
    benchmark_cols = [name for name in df.columns if name not in metadata_cols]

    chosen_benchmarks = st.multiselect(
        "Select benchmarks to display:",
        benchmark_cols,
        default=benchmark_cols,
    )

    visible = df[metadata_cols + chosen_benchmarks]

    # Only float64 columns get the fixed four-decimal formatting.
    float_cols = [name for name in visible.columns if visible[name].dtype == 'float64']
    styled = visible.style.format("{:.4f}", subset=float_cols)
    st.dataframe(styled)

    st.download_button(
        label="Download as CSV",
        data=visible.to_csv(index=False),
        file_name="leaderboard.csv",
        mime="text/csv",
    )

def display_evaluation():
    """Display the evaluation section.

    Renders a header followed by markdown with installation commands and an
    example Python script that runs MTEB tasks for Swahili (``swa``).

    The example is meant to be copied by readers, so the shell commands are kept
    in their own ``bash`` fence, ``torch`` is imported before it is used, and the
    device is ``cuda`` rather than a fixed GPU index.
    """
    st.header("🧪 Evaluation")
    st.markdown("""
    To evaluate a model on the Swahili Embeddings Text Benchmark, first install the dependencies:
    ```bash
    pip install mteb
    pip install sentence-transformers
    ```
    Then you can use the following Python script:
    ```python
    import mteb
    import torch
    from sentence_transformers import SentenceTransformer
    
    model_name = "MultiLinguSwahili-serengeti-E250-nli-matryoshka"
    publisher = "sartifyllc"

    models = ["sartifyllc/MultiLinguSwahili-bert-base-sw-cased-nli-matryoshka", f"{publisher}/{model_name}"]

    for model_name in models:
        truncate_dim = 768
        language = "swa"
        
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        model = SentenceTransformer(model_name, device=device, trust_remote_code=True)
        
        tasks = [
            mteb.get_task("AfriSentiClassification", languages=["swa"]),
            mteb.get_task("AfriSentiLangClassification", languages=["swa"]),
            mteb.get_task("MasakhaNEWSClassification", languages=["swa"]),
            mteb.get_task("MassiveIntentClassification", languages=["swa"]),
            mteb.get_task("MassiveScenarioClassification", languages=["swa"]),
            mteb.get_task("SwahiliNewsClassification", languages=["swa"]),
        ]
        
        evaluation = mteb.MTEB(tasks=tasks)
        results = evaluation.run(model, output_folder=f"{model_name}")
        
        tasks = mteb.get_tasks(task_types=["PairClassification", "Reranking", "BitextMining", "Clustering", "Retrieval"], languages=["swa"])
        
        evaluation = mteb.MTEB(tasks=tasks)
        results = evaluation.run(model, output_folder=f"{model_name}")
    ```
    """)

def display_contribution():
    """Render the "How to Contribute" header and its contribution guidelines."""
    section_title = "🤝 How to Contribute"
    guidelines = """
    We welcome and appreciate all contributions! You can help by:

    ### Table Work

    - Filling in missing entries.
    - New models are added as new rows to the leaderboard (maintaining descending order).
    - Add new benchmarks as new columns in the leaderboard and include them in the benchmarks table (maintaining descending order).

    ### Code Work

    - Improving the existing code.
    - Requesting and implementing new features.
    """

    st.header(section_title)
    st.markdown(guidelines)

def display_sponsorship():
    """Render the "Sponsorship" header and the call for sponsors."""
    section_title = "🤝 Sponsorship"
    appeal = """
    This benchmark is Swahili-based, and we need support translating and curating more tasks into Swahili. 
    Sponsorships are welcome to help advance this endeavour. Your sponsorship will facilitate essential 
    translation efforts, bridge language barriers, and make the benchmark accessible to a broader audience. 
    We are grateful for the dedication shown by our collaborators and aim to extend this impact further 
    with the support of sponsors committed to advancing language technologies.
    """

    st.header(section_title)
    st.markdown(appeal)

def main():
    """Build the page: leaderboard parsed from README.md, then the static sections.

    Reads ``README.md`` from the current working directory, extracts the table
    whose header row starts with ``| Model Name``, converts it to a DataFrame,
    and renders the leaderboard followed by the evaluation, contribution and
    sponsorship sections and a closing note.

    Raises:
        FileNotFoundError: If ``README.md`` is not in the working directory.
    """
    setup_page()

    # Read README content. The encoding is explicit so that any non-ASCII
    # characters decode correctly regardless of the platform's default encoding.
    with open("README.md", "r", encoding="utf-8") as f:
        readme_content = f.read()

    # Extract and process leaderboard table
    leaderboard_table = extract_table_from_markdown(readme_content, "| Model Name")
    df_leaderboard = markdown_table_to_df(leaderboard_table)

    display_leaderboard(df_leaderboard)
    display_evaluation()
    display_contribution()
    display_sponsorship()

    st.markdown("---")
    st.markdown("Thank you for being part of this effort to advance Swahili language technologies!")

# Build the page only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()