chivier committed on
Commit
572836d
·
1 Parent(s): 609b099

sync from github

Browse files
src/backend/hflm_with_measurement.py CHANGED
@@ -37,7 +37,7 @@ from lm_eval.models.utils import (
37
  stop_sequences_criteria,
38
  )
39
  from lm_eval.models.huggingface import HFLM
40
- from src.utils import get_gpu_number, get_gpu_details, get_peak_bw, transfer_precision2bytes, get_peak_flops
41
  from src.submission.check_validity import get_model_size
42
  from src.envs import API
43
 
@@ -73,6 +73,18 @@ class HFLMWithMeasurement(HFLM):
73
  self.pretrained = kwargs.get("pretrained", None)
74
  self.revision = kwargs.get("revision", None)
75
  self.precision = kwargs.get("dtype", None)
 
 
 
 
 
 
 
 
 
 
 
 
76
 
77
  def _loglikelihood_tokens(
78
  self,
@@ -352,7 +364,8 @@ class HFLMWithMeasurement(HFLM):
352
  else:
353
  continue
354
  print(f"linear_count: {linear_count}")
355
- print(f"element_wise_mul: {element_wise_mul}")
 
356
 
357
  stopping_criteria = stop_sequences_criteria(
358
  self.tokenizer, stop, context.shape[1], context.shape[0]
@@ -423,7 +436,7 @@ class HFLMWithMeasurement(HFLM):
423
  per_token_kv_size = 2 * n_layers * d_model * precision_bytes
424
 
425
  peak_bw_single = get_peak_bw(get_gpu_details())
426
- peak_bw = peak_bw_single * get_gpu_number()
427
 
428
  context_prefill_size = context_length
429
  kv_size = context_prefill_size * per_token_kv_size + (output_length - 1) * per_token_kv_size / 2
@@ -441,7 +454,7 @@ class HFLMWithMeasurement(HFLM):
441
  avg_context_length = context_length + (output_length - 1) / 2
442
  flops_per_token = 2 * model_size + ((linear_count + element_wise_mul) * n_layers * avg_context_length * d_model) + 4 * d_model + 2 * d_model * n_vocab
443
  peak_flops_single = get_peak_flops(get_gpu_details(), self.precision)
444
- peak_flops = peak_flops_single * get_gpu_number()
445
 
446
  ## TODO: only supports llama-type decoder-only models and MoE models of Switch Transformer and Mixtral
447
  mfu = token_per_sec * flops_per_token / peak_flops
 
37
  stop_sequences_criteria,
38
  )
39
  from lm_eval.models.huggingface import HFLM
40
+ from src.utils import get_gpu_details, get_peak_bw, transfer_precision2bytes, get_peak_flops
41
  from src.submission.check_validity import get_model_size
42
  from src.envs import API
43
 
 
73
  self.pretrained = kwargs.get("pretrained", None)
74
  self.revision = kwargs.get("revision", None)
75
  self.precision = kwargs.get("dtype", None)
76
+ self.num_gpus = None
77
+
78
def _detect_num_gpus_used(self):
    """Return the number of distinct CUDA devices holding model parameters.

    The count is computed once by scanning ``self.model.parameters()`` and
    is then cached on ``self.num_gpus``; later calls return the cached
    value without touching the model again.

    Returns:
        int: The number of distinct CUDA device indices seen across the
        parameters, or 1 when no parameter is on a CUDA device.
    """
    if self.num_gpus is not None:
        return self.num_gpus

    cuda_indices = {
        param.device.index
        for param in self.model.parameters()
        if param.device.type == "cuda"
    }

    # Callers multiply per-GPU peak bandwidth/FLOPs by this count and then
    # divide by the product, so a count of zero would end in a
    # ZeroDivisionError. Floor the result at one device.
    self.num_gpus = max(len(cuda_indices), 1)
    return self.num_gpus
88
 
89
  def _loglikelihood_tokens(
90
  self,
 
364
  else:
365
  continue
366
  print(f"linear_count: {linear_count}")
367
+ print(f"element_wise_mul: {element_wise_mul}")
368
+ print(f"GPU usage: {self._detect_num_gpus_used()}")
369
 
370
  stopping_criteria = stop_sequences_criteria(
371
  self.tokenizer, stop, context.shape[1], context.shape[0]
 
436
  per_token_kv_size = 2 * n_layers * d_model * precision_bytes
437
 
438
  peak_bw_single = get_peak_bw(get_gpu_details())
439
+ peak_bw = peak_bw_single * self._detect_num_gpus_used()
440
 
441
  context_prefill_size = context_length
442
  kv_size = context_prefill_size * per_token_kv_size + (output_length - 1) * per_token_kv_size / 2
 
454
  avg_context_length = context_length + (output_length - 1) / 2
455
  flops_per_token = 2 * model_size + ((linear_count + element_wise_mul) * n_layers * avg_context_length * d_model) + 4 * d_model + 2 * d_model * n_vocab
456
  peak_flops_single = get_peak_flops(get_gpu_details(), self.precision)
457
+ peak_flops = peak_flops_single * self._detect_num_gpus_used()
458
 
459
  ## TODO: only supports llama-type decoder-only models and MoE models of Switch Transformer and Mixtral
460
  mfu = token_per_sec * flops_per_token / peak_flops
src/utils.py CHANGED
@@ -174,43 +174,6 @@ def analyze_gpu_stats(stats_list):
174
 
175
  return avg_stats
176
 
177
def get_gpu_number():
    """Count the GPUs for which ``nvidia-smi`` prints a status row.

    The GPU indices come from the ``CUDA_VISIBLE_DEVICES`` environment
    variable when it is set; otherwise they are queried with
    ``nvidia-smi --query-gpu=index``. Each index is then inspected with
    ``nvidia-smi -i <index>``, and every output line matching the status-row
    pattern (temperature, power draw, memory use, utilisation) counts as
    one GPU.

    Returns:
        int: The number of matching status rows. Returns 0 when
        ``nvidia-smi`` cannot be launched or the index query fails, so the
        result is always an integer that callers can multiply with.
    """
    visible_devices = os.getenv('CUDA_VISIBLE_DEVICES', None)
    try:
        if visible_devices is not None:
            raw_indices = visible_devices.split(',')
        else:
            # Query all GPU indices if CUDA_VISIBLE_DEVICES is not set
            result = subprocess.run(
                ['nvidia-smi', '--query-gpu=index', '--format=csv,noheader'],
                capture_output=True,
                text=True,
            )
            if result.returncode != 0:
                print("Failed to query GPU indices.")
                return 0
            raw_indices = result.stdout.strip().split('\n')

        # Drop blank entries (e.g. an empty CUDA_VISIBLE_DEVICES) and stray
        # whitespace so they are never passed to ``nvidia-smi -i``.
        gpu_indices = [index.strip() for index in raw_indices if index.strip()]

        gpu_info_pattern = re.compile(r'(\d+)C\s+P\d+\s+(\d+)W / \d+W\s+\|\s+(\d+)MiB / \d+MiB\s+\|\s+(\d+)%')

        gpu_count = 0
        for index in gpu_indices:
            result = subprocess.run(['nvidia-smi', '-i', index], capture_output=True, text=True)
            # Only the number of matching rows is needed, so the parsed
            # values themselves are not kept.
            gpu_count += sum(
                1 for line in result.stdout.splitlines() if gpu_info_pattern.search(line)
            )
    except OSError:
        # nvidia-smi is missing or not executable on this machine.
        print("Failed to query GPU indices.")
        return 0

    return gpu_count
213
-
214
  def get_gpu_details():
215
  gpus = GPUtil.getGPUs()
216
  gpu = gpus[0]
 
174
 
175
  return avg_stats
176
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
177
  def get_gpu_details():
178
  gpus = GPUtil.getGPUs()
179
  gpu = gpus[0]