File size: 3,483 Bytes
3f50570 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 |
import time
import torch
import pandas as pd
from fvcore.nn import FlopCountAnalysis, ActivationCountAnalysis
def profile_model(model, input_tensor, display=False):
    """Collect FLOPs, activations, params, memory, and speed for *model*.

    Per-sample metrics (FLOPs, activations, speed) are computed on the
    first element of ``input_tensor``; memory uses the full batch.
    Returns a one-row DataFrame whose columns are the metric names.
    Optionally prints the table as a markdown grid when ``display`` is True.
    """
    sample = input_tensor[0:1, ...]  # single example: (1, n_mels, n_frames)
    # NOTE: keep this call order — speed puts the model in eval mode and
    # memory puts it in train mode, so reordering changes the final state.
    flops = calculate_flops(model, sample)
    acts = calculate_activations(model, sample)
    params = calculate_params(model)
    speed = calculate_speed(model, sample)
    memory = calculate_memory(model, input_tensor)  # full (B, n_mels, n_frames) batch
    rows = {
        "FLOPs (G)": flops,
        "Activations (M)": acts,
        "Params (M)": params,
        "Memory (GB)": memory,
        "Speed (A/S)": speed,
    }
    profile_df = (
        pd.DataFrame({"Metric": list(rows), "Value": list(rows.values())})
        .set_index("Metric")
        .T
    )
    if display:
        print(profile_df.to_markdown(index=False, tablefmt="grid"))
    return profile_df
def calculate_speed(model, input_tensor, num_runs=100, warmup_runs=5):
    """Measure throughput as forward passes per second (1 / mean latency).

    Runs ``warmup_runs`` untimed passes, then times ``num_runs`` passes.
    CUDA events are used for timing when a GPU is available; otherwise
    wall-clock time is used.
    """
    model.eval()
    # Untimed warm-up passes (kernel autotuning, caches, lazy init).
    with torch.no_grad():
        for _ in range(warmup_runs):
            _ = model(input_tensor)
    if torch.cuda.is_available():
        start_evt = torch.cuda.Event(enable_timing=True)
        end_evt = torch.cuda.Event(enable_timing=True)
        start_evt.record()
        with torch.no_grad():
            for _ in range(num_runs):
                _ = model(input_tensor)
        end_evt.record()
        # Block until both events have actually been recorded on the stream.
        torch.cuda.synchronize()
        latency = start_evt.elapsed_time(end_evt) / num_runs / 1000.0  # ms -> s
    else:
        t0 = time.time()
        with torch.no_grad():
            for _ in range(num_runs):
                _ = model(input_tensor)
        latency = (time.time() - t0) / num_runs
    return 1.0 / latency
def calculate_flops(model, input_tensor):
    """Return the FLOP count of one forward pass, in GigaFLOPs.

    NOTE: many tools (e.g. ConvNeXt, timm) report MACs under the name
    "FLOPs"; fvcore follows the same convention.
    Reference:
        1. https://github.com/huggingface/pytorch-image-models/blob/main/benchmark.py#L206
        2. https://github.com/facebookresearch/fvcore/issues/69
    """
    analysis = FlopCountAnalysis(model, input_tensor)
    return analysis.total() / 1e9  # raw count -> GigaFLOPs
def calculate_activations(model, input_tensor):
    """Return the activation count of one forward pass, in millions."""
    analysis = ActivationCountAnalysis(model, input_tensor)
    return analysis.total() / 1e6  # raw count -> millions
def calculate_params(model):
    """Return the total number of model parameters, in millions."""
    total = 0
    for param in model.parameters():
        total += param.numel()
    return total / 1e6  # raw count -> millions
def calculate_memory(model, input_tensor):
    """Measure peak GPU memory of one training-mode forward pass, in GiB.

    The forward pass runs in train mode with autograd enabled so that
    stored activations are counted, approximating the training-time
    footprint for the full batch ``input_tensor``. Returns 0 when CUDA
    is unavailable.

    Fix vs. original: the model's previous train/eval state is restored
    afterwards (the original leaked ``model.train()`` as a side effect),
    and the restore happens even if the forward pass raises.
    """
    if not torch.cuda.is_available():
        return 0
    was_training = model.training  # remember state so we can restore it
    torch.cuda.empty_cache()
    torch.cuda.reset_peak_memory_stats(device=None)
    start_memory = torch.cuda.max_memory_allocated(device=None)
    model.train()  # train mode + grad so activation storage is included
    try:
        _ = model(input_tensor)
    finally:
        model.train(was_training)
    end_memory = torch.cuda.max_memory_allocated(device=None)
    # Leave the allocator stats clean for subsequent measurements.
    torch.cuda.empty_cache()
    torch.cuda.reset_peak_memory_stats(device=None)
    return (end_memory - start_memory) / (1024**3)  # bytes -> GiB
|