Source code for hfai.utils.profile

from contextlib import contextmanager, ExitStack
from tabulate import tabulate
from packaging import version
import torch
from collections import defaultdict
import time


def profile_memory(model, input=(), input_kwargs={}, include_children=True, sort_by="name",
                   show_shapes=False, show_peakmem=False, show_forward_time=False,
                   forward_funcs=["forward"]):
    """
    Profile the GPU memory usage of a model.

    The printed report contains the following fields:

    1. ``parameter size``: total size of the parameters
    #. ``activation size``: size of the tensors saved via ``save_for_backward`` during
       the forward pass (parameters excluded)
    #. ``#calls``: number of times the module was called
    #. ``input shape``: shapes of the input tensors
    #. ``output shape``: shapes of the output tensors
    #. ``peak mem``: peak memory, i.e. the peak memory during the forward pass minus
       the memory already allocated before the forward pass
    #. ``forward time``: per-module time, i.e. the time measured around each module's
       forward call, accumulated over multiple calls

    NOTE: different operators may save overlapping intermediate tensors, so the total
    ``activation size`` can be larger than the actual memory usage.

    NOTE: ``show_peakmem = True`` and ``include_children = False`` are mutually exclusive.

    NOTE: ``show_forward_time = True`` and ``include_children = False`` are mutually exclusive.

    NOTE: only PyTorch >= 1.10 is supported.

    Args:
        model (torch.nn.Module): the model to profile
        input (tuple): positional inputs; the model is invoked as ``model(*input, **input_kwargs)``
        input_kwargs (dict): keyword inputs; the model is invoked as ``model(*input, **input_kwargs)``
        include_children (bool): whether each module's memory usage includes that of its
            submodules (of type ``nn.Module``); defaults to ``True``
        sort_by (str): field to sort the output by: ``name``, ``activation``, ``parameter``,
            ``peakmem`` or ``forward_time``; defaults to ``name``
        show_shapes (bool): whether to print input and output shapes; defaults to ``False``
        show_peakmem (bool): whether to print peak memory; the model must be on a GPU;
            defaults to ``False``
        show_forward_time (bool): whether to print per-module forward time

    Examples:

    >>> import torch, hfai
    >>> from torchvision import models
    >>> model = models.alexnet().cuda()
    >>> x = torch.randn(64, 3, 224, 224, device="cuda")
    >>> hfai.utils.profile_memory(model, input=(x,))
    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    HFAI Memory Profiler (include_children = True, sort_by = name)
    ====================  =================  ================  =================  ========
    module name           type                 parameter size    activation size    #calls
    ====================  =================  ================  =================  ========
    AlexNet               AlexNet                 233.081 MiB        345.312 MiB         1
    AlexNet.features      Sequential                9.421 MiB        336.000 MiB         1
    AlexNet.features.0    Conv2d                    0.089 MiB         36.750 MiB         1
    ......
    AlexNet.classifier.5  ReLU                      0.000 MiB          1.000 MiB         1
    AlexNet.classifier.6  Linear                   15.629 MiB          1.000 MiB         1
    ====================  =================  ================  =================  ========
    total unique activations: 225.906 MiB
    ======================================================================================
    """
    if version.parse(torch.__version__) < version.parse("1.10.0"):
        raise RuntimeError("hfai.utils.profile_memory only supports PyTorch >= 1.10")

    assert isinstance(input, tuple)
    assert isinstance(input_kwargs, dict)
    assert sort_by in ["parameter", "activation", "peakmem", "name", "forward_time"]
    assert include_children or (not show_peakmem), \
        "show_peakmem = True is not supported when include_children = False"
    assert include_children or (not show_forward_time), \
        "show_forward_time = True is not supported when include_children = False"

    device = next(model.parameters()).device
    assert not show_peakmem or device != torch.device("cpu"), \
        "show_peakmem = True is not supported for models on the CPU"
    assert sort_by != "peakmem" or show_peakmem, \
        "sort_by = peakmem is not supported when show_peakmem = False"
    assert sort_by != "forward_time" or show_forward_time, \
        "sort_by = forward_time is not supported when show_forward_time = False"

    stats = []
    model_name = model.__class__.__name__
    time_details = defaultdict(float)
    iters = 100

    if show_forward_time:
        # warmup
        for i in range(10):
            model(*input, **input_kwargs)
        torch.cuda.synchronize()

        time_profiler = TimeProfiler()

        # back up each forward and replace it with a timing wrapper
        for name, module in model.named_modules():
            for func_name in forward_funcs:
                if hasattr(module, func_name):
                    func = getattr(module, func_name)
                    setattr(module, "_hfai_orig_" + func_name, func)
                    setattr(module, func_name, time_profiler.wrap_func(name, module, func))

        start_time = time.time()
        for i in range(iters):
            model(*input, **input_kwargs)
        torch.cuda.synchronize()
        end_time = time.time()

        for name in time_profiler.event_dict:
            full_name = (model_name + '.' + name) if name else model_name
            for start, end in time_profiler.event_dict[name]:
                time_details[full_name] += start.elapsed_time(end)

        # restore the original forward functions
        for name, module in model.named_modules():
            for func_name in forward_funcs:
                if hasattr(module, func_name):
                    func = getattr(module, "_hfai_orig_" + func_name)
                    setattr(module, func_name, func)

    profiler = MemoryProfiler(model, include_children)

    # back up each forward and replace it with a memory-tracking wrapper
    for name, module in model.named_modules():
        for func_name in forward_funcs:
            if hasattr(module, func_name):
                func = getattr(module, func_name)
                setattr(module, "_hfai_orig_" + func_name, func)
                setattr(module, func_name, profiler.wrap_func(name, module, func))

    with profiler.profile():
        model(*input, **input_kwargs)

    for name, (module, asize, ncalls, shapes, out_shapes, peak_mem) in profiler.module_stats.items():
        full_name = (model_name + '.' + name) if name else model_name
        psize = sum(p.numel() * p.element_size() for p in module.parameters(recurse=include_children))
        typename = type(module).__name__
        stats.append((full_name, typename, psize, asize, ncalls, peak_mem,
                      shapes, out_shapes, time_details.get(full_name, 0) / iters))

    # sort
    if sort_by == "parameter":
        stats.sort(key=lambda x: (x[2], x[3], x[0]), reverse=True)
    elif sort_by == "activation":
        stats.sort(key=lambda x: (x[3], x[2], x[0]), reverse=True)
    elif sort_by == "peakmem":
        stats.sort(key=lambda x: (x[5], x[0]), reverse=True)
    elif sort_by == "forward_time":
        stats.sort(key=lambda x: (x[8], x[0]), reverse=True)

    table = []
    for n, typename, psize, asize, ncalls, peak_mem, shapes, out_shapes, forward_time in stats:
        row = [n, typename, format_size(psize), format_size(asize), ncalls,
               format_size(peak_mem), shapes, out_shapes, f'{forward_time:.3f} ms']
        table.append(row)

    headers = ["module name", "type", "parameter size", "activation size", "#calls",
               "peak mem", "input shape", "output shape", "forward time"]
    colalign = ["left", "left", "right", "right", "right", "right", "left", "left", "right"]

    def pop(index):
        for row in table:
            row.pop(index)
        headers.pop(index)
        colalign.pop(index)

    if not show_forward_time:
        pop(8)
    if not show_shapes:
        pop(7)
        pop(6)
    if not show_peakmem:
        pop(5)

    table = tabulate(table, headers=headers, colalign=colalign, tablefmt="rst")
    total_unique_activation_size = format_size(profiler.unique_activation_size)

    msg = f"HFAI Memory Profiler (include_children = {include_children}, sort_by = {sort_by})\n"
    msg += str(table) + "\n"
    line_width = len(str(table).split("\n")[0])
    msg = line_width * "^" + "\n" + msg

    n_parameters = sum(p.numel() * p.element_size() for p in model.parameters())
    msg += f"total params size: {format_size(n_parameters)}\n"
    msg += f"total unique activations: {total_unique_activation_size}\n"
    if show_forward_time:
        msg += f"total forward time per iter: {(end_time - start_time) / iters * 1000:.3f} ms\n"
    msg += line_width * "=" + "\n"
    print(msg, flush=True)

    # restore the original forward functions
    for name, module in model.named_modules():
        for func_name in forward_funcs:
            if hasattr(module, func_name):
                func = getattr(module, "_hfai_orig_" + func_name)
                setattr(module, func_name, func)
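profile_memory works by temporarily swapping each submodule's ``forward`` for a wrapper and
restoring the backed-up original afterwards (the ``_hfai_orig_`` attributes above). Below is a
minimal standalone sketch of that monkey-patching pattern; the ``count_calls`` helper and the
toy model are illustrative only, not part of hfai:

import torch
import torch.nn as nn


def count_calls(model):
    """Wrap every submodule's forward to count how often it is invoked."""
    counts = {}
    for name, module in model.named_modules():
        orig = module.forward
        module._orig_forward = orig  # back up the bound method so it can be restored

        # bind name/orig through default arguments to dodge the late-binding closure pitfall
        def wrapped(*args, _name=name, _orig=orig, **kwargs):
            counts[_name] = counts.get(_name, 0) + 1
            return _orig(*args, **kwargs)

        module.forward = wrapped
    return counts


model = nn.Sequential(nn.Linear(8, 8), nn.ReLU(), nn.Linear(8, 2))
counts = count_calls(model)
model(torch.randn(4, 8))

# restore the original forwards, mirroring the cleanup loop in profile_memory
for _, module in model.named_modules():
    module.forward = module._orig_forward

print(counts)  # {'': 1, '0': 1, '1': 1, '2': 1}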
def format_size(size):
    return f"{size / (1 << 20):.3f} MiB"


class CudaMemoryStats():
    # Tracks the running peak of CUDA memory across nested profiling regions,
    # since torch.cuda.reset_peak_memory_stats resets a single global counter.

    def __init__(self) -> None:
        self.max_mem = 0

    @contextmanager
    def reset_peak_memory_stats(self):
        self.max_mem = max(torch.cuda.max_memory_allocated(), self.max_mem)
        prev_max_mem = self.max_mem
        try:
            torch.cuda.reset_peak_memory_stats()
            self.max_mem = torch.cuda.max_memory_allocated()
            yield
        finally:
            self.max_mem = max(prev_max_mem, self.max_mem, torch.cuda.max_memory_allocated())

    def max_memory_allocated(self):
        self.max_mem = max(torch.cuda.max_memory_allocated(), self.max_mem)
        return self.max_mem


class TimeProfiler():

    def __init__(self) -> None:
        self.event_dict = defaultdict(list)

    def wrap_func(self, name, module, func):
        def wrapped_func(*args, **kwargs):
            start_event = torch.cuda.Event(enable_timing=True)
            end_event = torch.cuda.Event(enable_timing=True)

            # record forward time using CUDA events
            start_event.record()
            outputs = func(*args, **kwargs)
            end_event.record()

            self.event_dict[name].append((start_event, end_event))
            return outputs

        return wrapped_func


class MemoryProfiler():

    def __init__(self, model, include_children=True) -> None:
        self.module_stats = {}
        self.hooks = SavedTensorsHooks(include_children)
        self.include_children = include_children
        self.seen_params = set()
        self.seen_acts = set()
        self.cuda_mem_stats = CudaMemoryStats()

        # parameter storages are recorded up front so they are excluded
        # from the activation accounting in pack_hook
        for p in model.parameters():
            storage = p.storage()
            if storage.data_ptr() not in self.seen_params:
                self.seen_params.add(storage.data_ptr())
                self.seen_acts.add(storage.data_ptr())

        self.unique_activation_size = 0

    def wrap_func(self, name, module, func):
        def pack_hook(tensor):
            storage = tensor.storage()
            if storage.data_ptr() not in self.seen_params:
                nbytes = tensor.numel() * tensor.element_size()
                self.module_stats[name][1] += nbytes
                if storage.data_ptr() not in self.seen_acts:
                    self.seen_acts.add(storage.data_ptr())
                    self.unique_activation_size += storage.size() * storage.element_size()
            return tensor

        def unpack_hook(tensor):
            return tensor

        if name not in self.module_stats:
            # [module, activation size, #calls, input shape, output shape, peak memory]
            self.module_stats[name] = [module, 0, 0, '', '', 0]

        def wrapped_func(*args, **kwargs):
            mem = torch.cuda.memory_allocated()
            with self.cuda_mem_stats.reset_peak_memory_stats():
                with self.hooks.enable_hook(pack_hook, unpack_hook):
                    outputs = func(*args, **kwargs)

            peak_mem = self.cuda_mem_stats.max_memory_allocated() - mem
            self.module_stats[name][2] += 1
            self.module_stats[name][3] = format_input_shape(args, kwargs)
            self.module_stats[name][4] = format_output_shape(outputs)
            self.module_stats[name][5] = peak_mem
            return outputs

        return wrapped_func

    @contextmanager
    def profile(self):
        with ExitStack() as stack:
            stack.enter_context(self.hooks.saved_tensors_hooks())
            yield


def format_input_shape(args, kwargs):
    shapes = []
    for obj in args:
        shapes.append(format_tensor(obj))
    for k, v in kwargs.items():
        s = format_tensor(v)
        shapes.append(f"{k}={s}")
    msg = ", ".join([str(s) for s in shapes])
    return msg


def format_output_shape(outputs):
    shapes = []
    if isinstance(outputs, torch.Tensor):
        shapes.append(format_tensor(outputs))
    elif isinstance(outputs, (tuple, list)):
        for out in outputs:
            shapes.append(format_tensor(out))
    else:
        return str(format_tensor(outputs))
    msg = ", ".join([str(s) for s in shapes])
    return msg


def format_tensor(obj):
    if isinstance(obj, torch.Tensor):
        return tuple(obj.shape)
    if isinstance(obj, (tuple, list)):
        return type(obj)(format_tensor(x) for x in obj)
    if isinstance(obj, dict):
        return {k: format_tensor(v) for k, v in obj.items()}
    if isinstance(obj, (int, float, str, bool, type(None))):
        return obj
    return "[UNKNOWN]"
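TimeProfiler above times each forward with CUDA events rather than ``time.time()``, because
kernel launches are asynchronous and a host-side clock would mostly measure launch overhead.
A minimal sketch of the event pattern, assuming a CUDA device is available:

import torch

x = torch.randn(1024, 1024, device="cuda")
w = torch.randn(1024, 1024, device="cuda")

start = torch.cuda.Event(enable_timing=True)
end = torch.cuda.Event(enable_timing=True)

start.record()             # enqueued on the current CUDA stream
y = x @ w                  # asynchronous kernel launch
end.record()

torch.cuda.synchronize()   # elapsed_time is only valid once both events have completed
print(f"matmul: {start.elapsed_time(end):.3f} ms")  # elapsed_time returns milliseconds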
class SavedTensorsHooks():
    # Multiplexes several pack/unpack hook pairs onto the single pair accepted
    # by torch.autograd.graph.saved_tensors_hooks.

    def __init__(self, include_children=True) -> None:
        self.hooks = []
        self.current_hook = None
        self.include_children = include_children

    @contextmanager
    def enable_hook(self, pack_hook, unpack_hook):
        # When include_children is False, the parent module's hook is temporarily
        # removed so saved tensors are attributed only to the innermost module.
        parent_hook = None
        try:
            if not self.include_children and len(self.hooks) > 0:
                parent_hook = self.hooks.pop()
            self.hooks.append((pack_hook, unpack_hook))
            yield
        finally:
            self.hooks.pop()
            if parent_hook:
                self.hooks.append(parent_hook)

    def pack_hook(self, tensor):
        for hook in reversed(self.hooks):
            tensor = hook[0](tensor)
        return tensor

    def unpack_hook(self, tensor):
        for hook in self.hooks:
            tensor = hook[1](tensor)
        return tensor

    @contextmanager
    def saved_tensors_hooks(self):
        with ExitStack() as stack:
            context = torch.autograd.graph.saved_tensors_hooks(self.pack_hook, self.unpack_hook)
            stack.enter_context(context)
            yield
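SavedTensorsHooks is a thin layer over ``torch.autograd.graph.saved_tensors_hooks``
(available since PyTorch 1.10), which lets ``pack_hook`` observe every tensor that autograd
saves for the backward pass. A minimal standalone sketch that tallies the bytes saved during
one forward pass:

import torch

saved_bytes = 0

def pack_hook(tensor):
    global saved_bytes
    saved_bytes += tensor.numel() * tensor.element_size()
    return tensor  # whatever is returned here is handed to unpack_hook during backward

def unpack_hook(obj):
    return obj

x = torch.randn(64, 128, requires_grad=True)
with torch.autograd.graph.saved_tensors_hooks(pack_hook, unpack_hook):
    y = (x * x).sum()  # the multiply saves its inputs for backward, firing pack_hook

y.backward()
print(f"saved {saved_bytes} bytes for backward")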