# LongVideoBench Leaderboard

import gradio as gr
import pandas as pd

# Top-level Gradio container that the leaderboard page is assembled into.
block = gr.Blocks(
    theme='gradio/soft',
    title="LongVideoBench Leaderboard",
)

# Function to sort data and filter columns based on checkboxes
def sort_data(key, show_duration, show_category):
    """Load the leaderboard CSV, rank it, and select the columns to display.

    Args:
        key: Name of the column to rank by. Because this value is typed
            free-form by users, it is resolved leniently, in order: exact
            column name, name with surrounding whitespace removed, then a
            case-insensitive comparison. Anything that still does not match
            (including ``None`` or an empty string) falls back to
            ``'Test Total'``.
        show_duration: When truthy, include the per-duration accuracy columns.
        show_category: When truthy, include the per-question-category columns.

    Returns:
        A DataFrame sorted by the resolved column in descending order and
        restricted to the selected display columns.

    Raises:
        KeyError: If ``result.csv`` lacks one of the columns selected for
            display, or lacks ``'Test Total'`` when falling back to it.
    """
    # Re-read on every call so edits to the CSV show up without a restart.
    data = pd.read_csv("result.csv")

    duration_columns = ['8s-15s', '15s-60s', '180s-600s', '900s-3600s']
    category_columns = ['S2E', 'S2O', 'S2A', 'E2O', 'SSS', 'SOS', 'SAA', 'T3E', 'T3O', 'TOS', 'TAA']

    columns_to_show = ['Model', 'Test Total']
    if show_duration:
        columns_to_show += duration_columns
    if show_category:
        columns_to_show += category_columns
    columns_to_show += ['Val Total', 'LMM Type', 'Interleaved?', "#Max Frames"]

    sort_column = 'Test Total'
    if key in data.columns:
        # Exact match always wins, preserving the original behaviour.
        sort_column = key
    elif isinstance(key, str):
        wanted = key.strip()
        if wanted in data.columns:
            sort_column = wanted
        else:
            # Last resort: ignore letter case ("val total" -> "Val Total").
            folded = wanted.casefold()
            sort_column = next(
                (col for col in data.columns if str(col).casefold() == folded),
                sort_column,
            )

    df_sorted = data.sort_values(by=sort_column, ascending=False)
    return df_sorted[columns_to_show]

# Assemble the leaderboard page inside the Blocks container, then serve it.
with block:
    # Emit a <link> tag that references style.css.
    gr.HTML("<link rel='stylesheet' type='text/css' href='style.css'>")

    # Page header: title and a link to the project website.
    with gr.Row():
        gr.Markdown("""
            <div style='text-align: center;'>
                <h1>LongVideoBench Leaderboard</h1>
                Website: <a href="https://longvideobench.github.io" target="_blank">longvideobench.github.io</a>
            </div>
        """)

    with gr.Tab("Existing Results"):
        # Toggles controlling which optional column groups are displayed.
        with gr.Row():
            show_duration = gr.Checkbox(
                label="Show Test Set Accuracy by Duration Groups",
                value=False,
            )
            show_category = gr.Checkbox(
                label="Show Test Set Accuracy by Question Categories",
                value=False,
            )

        # Free-text name of the column to rank by.
        key_input = gr.Textbox(
            label="Rank LMMs by column:",
            placeholder="Test Total (default)",
        )

        # Initial table: ranked by 'Test Total' with both optional groups hidden.
        initial_table = sort_data('Test Total', show_duration=False, show_category=False)
        data_frame = gr.DataFrame(initial_table)

        def update_data_frame(key, show_duration, show_category):
            """Recompute the table for the given sort key and checkbox values."""
            return sort_data(key, show_duration=show_duration, show_category=show_category)

        # A change to the sort key or to either checkbox refreshes the table.
        for control in (key_input, show_duration, show_category):
            control.change(
                update_data_frame,
                inputs=[key_input, show_duration, show_category],
                outputs=data_frame,
            )

        gr.Markdown("Models are evaluated using their optimal #max frames, capped at 256 frames.")

    # Static instructions describing how to submit test-set predictions.
    with gr.Tab("Submit!"):
        gr.Markdown(
'''The answer of validation set of LongVideoBench is public now. Please see our [released dataset](https://huggingface.co/datasets/longvideobench/LongVideoBench) for more information.

For test set, please prepare your output as follows:
```python
    {VIDEO_ID_0: "A", VIDEO_ID_1: "D", ...} # Please make sure your submission only contains the letter of model's choice, or starts with the letter of model's choice.
``` 
and submit to us as a JSON file.

Please prepare an email to `haoning001@e.ntu.edu.sg` titled [LongVideoBench-Submission-YOURNAME] to submit and obtain your results.

_We will launch an automatic submission server soon._'''
        )

# Start the Gradio server for the assembled page.
block.launch()