future-xy committed on
Commit
88d1c0e
·
1 Parent(s): 1ae96c8

fix display

Browse files
backend-cli.py CHANGED
@@ -12,7 +12,6 @@ from src.backend.run_eval_suite import run_evaluation
12
  from src.backend.manage_requests import check_completed_evals, get_eval_requests, set_eval_request
13
  from src.backend.sort_queue import sort_models_by_priority
14
  from src.backend.envs import Tasks, EVAL_REQUESTS_PATH_BACKEND, EVAL_RESULTS_PATH_BACKEND, DEVICE, LIMIT, Task
15
- LIMIT=2
16
  from src.backend.manage_requests import EvalRequest
17
  from src.leaderboard.read_evals import EvalResult
18
 
@@ -150,10 +149,10 @@ def process_evaluation(task: Task, eval_request: EvalRequest) -> dict:
150
  else:
151
  raise
152
 
153
- print("RESULTS", results)
154
 
155
  dumped = json.dumps(results, indent=2, default=lambda o: "<not serializable>")
156
- print(dumped)
157
 
158
  output_path = os.path.join(
159
  EVAL_RESULTS_PATH_BACKEND, *eval_request.model.split("/"), f"results_{datetime.now()}.json"
 
12
  from src.backend.manage_requests import check_completed_evals, get_eval_requests, set_eval_request
13
  from src.backend.sort_queue import sort_models_by_priority
14
  from src.backend.envs import Tasks, EVAL_REQUESTS_PATH_BACKEND, EVAL_RESULTS_PATH_BACKEND, DEVICE, LIMIT, Task
 
15
  from src.backend.manage_requests import EvalRequest
16
  from src.leaderboard.read_evals import EvalResult
17
 
 
149
  else:
150
  raise
151
 
152
+ # print("RESULTS", results)
153
 
154
  dumped = json.dumps(results, indent=2, default=lambda o: "<not serializable>")
155
+ # print(dumped)
156
 
157
  output_path = os.path.join(
158
  EVAL_RESULTS_PATH_BACKEND, *eval_request.model.split("/"), f"results_{datetime.now()}.json"
src/backend/tasks/measurement_task_utils.py CHANGED
@@ -8,21 +8,17 @@ def process_results_decorator(func):
8
  def wrapper(self, doc, results, *args, **kwargs):
9
  # We process the results here
10
  processed_results = [r[0] for r in results]
11
-
12
- # end_to_end_time = end_to_end_time / batch_size
13
- # prefilling_time = prefilling_time / batch_size
14
- # token_per_sec = output_length / (decoding_time / batch_size)
15
 
16
  end_to_end_time = sum([r[1] for r in results]) / len(results)
17
  prefilling_time = sum([r[2] for r in results]) / len(results)
18
- token_per_sec = sum([r[3] for r in results]) / len(results)
19
- print(f"end_to_end_time: {end_to_end_time}, prefilling_time: {prefilling_time}, token_per_sec: {token_per_sec}")
20
 
21
  # Now call the original process_results with the processed results
22
  result_dict = func(self, doc, processed_results, *args, **kwargs)
23
  result_dict["end_to_end_time"] = end_to_end_time
24
  result_dict["prefilling_time"] = prefilling_time
25
- result_dict["token_per_sec"] = token_per_sec
26
  return result_dict
27
  return wrapper
28
 
@@ -33,7 +29,7 @@ def aggregation_decorator(func):
33
  aggregation_list = func(self, *args, **kwargs)
34
  aggregation_list["end_to_end_time"] = mean
35
  aggregation_list["prefilling_time"] = mean
36
- aggregation_list["token_per_sec"] = mean
37
  return aggregation_list
38
  return wrapper
39
 
@@ -44,7 +40,7 @@ def higher_is_better_decorator(func):
44
  higher_is_better_dict = func(self, *args, **kwargs)
45
  higher_is_better_dict["end_to_end_time"] = False
46
  higher_is_better_dict["prefilling_time"] = False
47
- higher_is_better_dict["token_per_sec"] = True
48
  return higher_is_better_dict
49
  return wrapper
50
 
 
8
  def wrapper(self, doc, results, *args, **kwargs):
9
  # We process the results here
10
  processed_results = [r[0] for r in results]
 
 
 
 
11
 
12
  end_to_end_time = sum([r[1] for r in results]) / len(results)
13
  prefilling_time = sum([r[2] for r in results]) / len(results)
14
+ decoding_throughput = sum([r[3] for r in results]) / len(results)
15
+ print(f"end_to_end_time: {end_to_end_time}, prefilling_time: {prefilling_time}, decoding_throughput: {decoding_throughput}")
16
 
17
  # Now call the original process_results with the processed results
18
  result_dict = func(self, doc, processed_results, *args, **kwargs)
19
  result_dict["end_to_end_time"] = end_to_end_time
20
  result_dict["prefilling_time"] = prefilling_time
21
+ result_dict["decoding_throughput"] = decoding_throughput
22
  return result_dict
23
  return wrapper
24
 
 
29
  aggregation_list = func(self, *args, **kwargs)
30
  aggregation_list["end_to_end_time"] = mean
31
  aggregation_list["prefilling_time"] = mean
32
+ aggregation_list["decoding_throughput"] = mean
33
  return aggregation_list
34
  return wrapper
35
 
 
40
  higher_is_better_dict = func(self, *args, **kwargs)
41
  higher_is_better_dict["end_to_end_time"] = False
42
  higher_is_better_dict["prefilling_time"] = False
43
+ higher_is_better_dict["decoding_throughput"] = True
44
  return higher_is_better_dict
45
  return wrapper
46
 
src/display/utils.py CHANGED
@@ -73,12 +73,12 @@ auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "ma
73
  # Inference framework
74
  auto_eval_column_dict.append(["inference_framework", ColumnContent, ColumnContent("Inference framework", "str", True)])
75
 
76
- # System performance metrics
77
- auto_eval_column_dict.append(["prefilling_time", ColumnContent, ColumnContent("Prefilling time (s)", "number", True)])
78
- auto_eval_column_dict.append(["token_per_second", ColumnContent, ColumnContent("Tokens/s", "number", True)])
79
-
80
  for task in Tasks:
81
  auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
 
 
 
 
82
 
83
  # Model information
84
  auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
 
73
  # Inference framework
74
  auto_eval_column_dict.append(["inference_framework", ColumnContent, ColumnContent("Inference framework", "str", True)])
75
 
 
 
 
 
76
  for task in Tasks:
77
  auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
78
+ # System performance metrics
79
+ auto_eval_column_dict.append([f"{task.name}_end_to_end_time", ColumnContent, ColumnContent(f"{task.value.col_name} End-to-end time (s)", "number", True)])
80
+ auto_eval_column_dict.append([f"{task.name}_prefilling_time", ColumnContent, ColumnContent(f"{task.value.col_name} Prefilling time (s)", "number", True)])
81
+ auto_eval_column_dict.append([f"{task.name}_decoding_throughput", ColumnContent, ColumnContent(f"{task.value.col_name} Decoding throughput (tok/s)", "number", True)])
82
 
83
  # Model information
84
  auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
src/leaderboard/read_evals.py CHANGED
@@ -107,7 +107,10 @@ class EvalResult:
107
  multiplier = 1.0
108
  if "squad" in benchmark:
109
  multiplier = 1.0
110
-
 
 
 
111
  # print('RESULTS', data['results'])
112
  # print('XXX', benchmark, metric, value, multiplier)
113
  results[benchmark][metric] = value * multiplier
 
107
  multiplier = 1.0
108
  if "squad" in benchmark:
109
  multiplier = 1.0
110
+ if "time" in metric:
111
+ multiplier = 1.0
112
+ if "throughput" in metric:
113
+ multiplier = 1.0
114
  # print('RESULTS', data['results'])
115
  # print('XXX', benchmark, metric, value, multiplier)
116
  results[benchmark][metric] = value * multiplier
src/populate.py CHANGED
@@ -30,7 +30,8 @@ def get_leaderboard_df(
30
  raw_data[result_idx], requests_path_open_llm
31
  )
32
 
33
- all_data_json_ = [v.to_dict() for v in raw_data if v.is_complete()]
 
34
 
35
  name_to_bm_map = {}
36
 
@@ -45,15 +46,22 @@ def get_leaderboard_df(
45
  name_to_bm_map[name] = bm
46
 
47
  # bm_to_name_map = {bm: name for name, bm in name_to_bm_map.items()}
 
 
 
 
 
48
 
49
  all_data_json = []
50
  for entry in all_data_json_:
51
  new_entry = copy.deepcopy(entry)
52
-
53
  for k, v in entry.items():
54
  if k in name_to_bm_map:
55
  benchmark, metric = name_to_bm_map[k]
56
  new_entry[k] = entry[k][metric]
 
 
 
57
 
58
  all_data_json += [new_entry]
59
 
@@ -69,10 +77,10 @@ def get_leaderboard_df(
69
  df[col] = np.nan
70
 
71
  if not df.empty:
72
- df = df[cols].round(decimals=2)
73
 
74
  # filter out if any of the benchmarks have not been produced
75
- df = df[has_no_nan_values(df, benchmark_cols)]
76
 
77
  return raw_data, df
78
 
 
30
  raw_data[result_idx], requests_path_open_llm
31
  )
32
 
33
+ # all_data_json_ = [v.to_dict() for v in raw_data if v.is_complete()]
34
+ all_data_json_ = [v.to_dict() for v in raw_data] # include incomplete evals
35
 
36
  name_to_bm_map = {}
37
 
 
46
  name_to_bm_map[name] = bm
47
 
48
  # bm_to_name_map = {bm: name for name, bm in name_to_bm_map.items()}
49
+ system_metrics_to_name_map = {
50
+ "end_to_end_time": "End-to-end time (s)",
51
+ "prefilling_time": "Prefilling time (s)",
52
+ "decoding_throughput": "Decoding throughput (tok/s)",
53
+ }
54
 
55
  all_data_json = []
56
  for entry in all_data_json_:
57
  new_entry = copy.deepcopy(entry)
 
58
  for k, v in entry.items():
59
  if k in name_to_bm_map:
60
  benchmark, metric = name_to_bm_map[k]
61
  new_entry[k] = entry[k][metric]
62
+ for sys_metric, metric_namne in system_metrics_to_name_map.items():
63
+ if sys_metric in entry[k]:
64
+ new_entry[f"{k} {metric_namne}"] = entry[k][sys_metric]
65
 
66
  all_data_json += [new_entry]
67
 
 
77
  df[col] = np.nan
78
 
79
  if not df.empty:
80
+ df = df.round(decimals=2)
81
 
82
  # filter out if any of the benchmarks have not been produced
83
+ # df = df[has_no_nan_values(df, benchmark_cols)]
84
 
85
  return raw_data, df
86