
# Source code for hfai.nn.sync_function.syncfunc

import torch
import torch.distributed as dist
from torch.distributed.distributed_c10d import _get_global_rank, _get_default_group
from collections import defaultdict
from .time import timeout as timeout_warp
from .time import TimeoutError

class CudaTimer():
    """Accumulate CUDA-event timings and traffic counters for one timing tag.

    One pair of CUDA events brackets the forward pass and another pair brackets
    the backward pass.  Recording an event is asynchronous, so the elapsed time
    is only read later ("finalized"): either right before the same events are
    reused for the next measurement, or when ``finalize`` is called explicitly.

    Attributes:
        fwd_time / bwd_time: accumulated elapsed time as reported by
            ``torch.cuda.Event.elapsed_time`` (milliseconds).
        iters: number of forward passes that were timed.
        comm_size: sum of the ``size`` values passed to ``record_fwd_end``.
        tot_iters: free counter maintained by the caller (not touched by the
            ``record_*`` / ``finalize*`` methods).
    """

    def __init__(self):
        self.fwd_start = torch.cuda.Event(enable_timing=True)
        self.fwd_end = torch.cuda.Event(enable_timing=True)
        self.bwd_start = torch.cuda.Event(enable_timing=True)
        self.bwd_end = torch.cuda.Event(enable_timing=True)

        # Make sure every counter/flag exists as soon as the timer is created.
        self.reset()

    def reset(self):
        """Clear all flags and accumulated statistics."""
        self.fwd_recorded = False
        self.bwd_recorded = False

        self.fwd_time = 0.
        self.bwd_time = 0.
        self.iters = 0
        self.comm_size = 0

        self.tot_iters = 0

    def record_fwd_start(self):
        """Mark the start of a forward pass on the current CUDA stream."""
        # Collect the previous measurement before its events are overwritten.
        self.finalize_fwd()
        self.fwd_start.record()

    def record_fwd_end(self, size):
        """Mark the end of a forward pass and account for ``size`` units of output."""
        self.fwd_end.record()
        self.comm_size += size
        self.fwd_recorded = True
        self.iters += 1

    def record_bwd_start(self):
        """Mark the start of a backward pass on the current CUDA stream."""
        # Collect the previous measurement before its events are overwritten.
        self.finalize_bwd()
        self.bwd_start.record()

    def record_bwd_end(self):
        """Mark the end of a backward pass."""
        self.bwd_end.record()
        self.bwd_recorded = True

    def finalize_fwd(self):
        """Add the pending forward measurement (if any) to ``fwd_time``."""
        if self.fwd_recorded:
            # elapsed_time() requires both events to have completed.
            self.fwd_end.synchronize()
            t = self.fwd_start.elapsed_time(self.fwd_end)
            self.fwd_time += t
            self.fwd_recorded = False

    def finalize_bwd(self):
        """Add the pending backward measurement (if any) to ``bwd_time``."""
        if self.bwd_recorded:
            # elapsed_time() requires both events to have completed.
            self.bwd_end.synchronize()
            t = self.bwd_start.elapsed_time(self.bwd_end)
            self.bwd_time += t
            self.bwd_recorded = False

    def finalize(self):
        """Fold any pending forward and backward measurements into the totals."""
        self.finalize_fwd()
        self.finalize_bwd()

# Module-level registry of timers, keyed by tag; a CudaTimer is created
# automatically the first time a key is looked up.
timers = defaultdict(CudaTimer)

def sync(x, dist_group=False, dim=0, equal_size=False, tag=None, enable_timer=True, log_every_steps=1, timeout=60, reduce_grad=True):
    """
    All-gather the input tensor and concatenate the pieces along ``dim``.

    The operation supports autograd: during backward the gradient is routed
    back to the rank that contributed each slice.

    ``F.sync.get_metrics`` returns a dictionary of the following form:

    .. code-block:: python

        {
            "tag1": {"iters": 100, "fwd": 25, "bwd": 40, "size": 16},
            "tag2": {"iters": 100, "fwd": 25, "bwd": 40, "size": 16},
        }

    ``iters`` is the number of timed calls for that tag, ``fwd`` / ``bwd`` are
    the average forward / backward time per call (ms), and ``size`` is the
    average size of the forward result per call (bytes).

    Args:
        x (Tensor): input tensor
        dist_group (ProcessGroup): process group to gather over; if ``False``
            no all-gather is performed and ``x`` is returned unchanged; any
            other falsy value (e.g. ``None``) selects the default group
        dim (int): dimension along which the gathered tensors are concatenated
        equal_size (bool): whether the tensor has the same size on every rank
        tag (str): timing label; each tag may be used only once per forward;
            no timing is done when ``tag`` is ``None``
        enable_timer (bool): whether to time the call
        log_every_steps (int): time one call out of every this many calls
        timeout (int): timeout in seconds for this function; an exception is
            raised when it is exceeded; ``0`` means no limit; default ``60``
        reduce_grad (bool): whether to reduce the incoming gradient across
            ranks during backward; default ``True``

    Returns:
        out (Tensor): the concatenated result

    Raises:
        RuntimeError: if the call does not finish within ``timeout`` seconds.

    Examples:

    .. code-block:: python

        import torch.distributed as dist
        import hfai.nn.functional as F

        # init process group
        ...

        rank = dist.get_rank()
        x = torch.ones(1, requires_grad=True, device='cuda') * rank
        out = F.sync(x, dist_group, dim=0, tag='tag1')
        out.sum().backward()

        # print timings, traffic, etc.
        F.sync.print_metrics()

        # get the metrics
        print(F.sync.get_metrics())

        # reset the metrics
        F.sync.reset()

    """
    enable_timer = enable_timer and (log_every_steps >= 1) and (tag is not None)
    if enable_timer:
        assert isinstance(tag, str), "tag 必须是一个字符串"
        # Count every call, but only time one out of ``log_every_steps``.
        timers[tag].tot_iters += 1
        if timers[tag].tot_iters % log_every_steps != 0:
            enable_timer = False

    f = timeout_warp(timeout)(SyncFunction.apply)
    try:
        result = f(x, dist_group, dim, equal_size, tag, enable_timer, reduce_grad)
    except TimeoutError as e:
        group = dist_group or _get_default_group()
        rank = dist.get_rank(group=group)
        world_size = dist.get_world_size(group=group)
        msg = (
            f"F.sync is timeout for {e.sec} seconds! \nRANK {rank} / {world_size}, "
            f"x.shape {x.shape}, dim {dim}, equal_size {equal_size}, tag {tag}"
        )
        # Keep the original timeout as the explicit cause of the error.
        raise RuntimeError(msg) from e

    return result
def reset():
    """Discard all collected timings by replacing the global timer registry."""
    global timers
    timers = defaultdict(CudaTimer)


def print_metrics():
    """Print the dictionary returned by ``get_metrics`` and flush stdout."""
    metrics = get_metrics()
    print(metrics, flush=True)


def get_metrics():
    """Return per-tag averages for every tag that has at least one timed call.

    Returns:
        dict: ``{tag: {'iters', 'fwd', 'bwd', 'size'}}`` where ``fwd``, ``bwd``
        and ``size`` are the accumulated totals divided by ``iters``.
    """
    metrics = {}
    for tag, timer in timers.items():
        # Make sure pending measurements are included in the totals.
        timer.finalize()
        it = timer.iters
        if it > 0:
            metrics[tag] = {
                'iters': it,
                'fwd': timer.fwd_time / it,
                'bwd': timer.bwd_time / it,
                'size': timer.comm_size / it
            }

    return metrics


# Expose the metric helpers as attributes of ``sync``.
sync.reset = reset
sync.print_metrics = print_metrics
sync.get_metrics = get_metrics


class SyncFunction(torch.autograd.Function):
    """Autograd function wrapping ``sync_forward`` / ``sync_backward`` with optional timing."""

    @staticmethod
    def forward(ctx, x, dist_group=False, dim=0, equal_size=False, tag=None, enable_timer=True, reduce_grad=True):
        if enable_timer:
            timers[tag].record_fwd_start()

        # Stored for use in backward.
        ctx.tag = tag
        ctx.enable_timer = enable_timer
        ctx.reduce_grad = reduce_grad
        out = sync_forward(ctx, x, dist_group, dim, equal_size)

        if enable_timer:
            size = out.numel() * out.element_size()  # result size in bytes
            timers[tag].record_fwd_end(size)

        return out

    @staticmethod
    def backward(ctx, grad_output):
        # Backward is only timed when the gradient is actually reduced.
        timed = ctx.enable_timer and ctx.reduce_grad
        if timed:
            timers[ctx.tag].record_bwd_start()

        out = sync_backward(ctx, grad_output)

        if timed:
            timers[ctx.tag].record_bwd_end()

        return out


def _numel_per_slice(x, dim):
    """Return the product of all dimensions of ``x`` except ``dim``."""
    count = 1
    for axis, extent in enumerate(x.shape):
        # ``dim`` may be negative, so normalise it before comparing.
        if axis != dim % x.dim():
            count *= extent
    return count


def sync_forward(ctx, x, dist_group, dim, equal_size):
    """Gather ``x`` from every rank of ``dist_group`` and concatenate along ``dim``.

    Stores ``dist_group`` on ``ctx`` and, unless ``dist_group`` is ``False``,
    also ``batch_sizes`` (size along ``dim`` for each rank) and ``dim``.
    Returns ``x`` unchanged when ``dist_group`` is ``False``.
    """
    ctx.dist_group = dist_group
    if dist_group is False:
        return x

    dist_group = dist_group or _get_default_group()
    rank = dist.get_rank(group=dist_group)
    world_size = dist.get_world_size(group=dist_group)

    if equal_size:
        batch_sizes = [x.size(dim)] * world_size
    else:
        # Each rank writes its own size into its slot; the sum-reduce then
        # yields the full list of sizes on every rank.
        sizes = torch.zeros(world_size, dtype=torch.int32, device=x.device)
        sizes[rank] = x.size(dim)
        dist.all_reduce(sizes, group=dist_group)
        batch_sizes = sizes.tolist()

    ctx.batch_sizes = batch_sizes
    ctx.dim = dim

    # Total element count of the gathered result.  Computed from the other
    # dimensions so that an empty local slice does not divide by zero.
    total_numel = _numel_per_slice(x, dim) * sum(batch_sizes)
    if total_numel < 1024 * world_size:
        return fwd_allreduce_impl(dim, batch_sizes, dist_group, x)

    return fwd_allgather_impl(dim, batch_sizes, dist_group, x)


def sync_backward(ctx, grad_output):
    """Compute the gradient for this rank's input from the gathered gradient.

    Returns a 7-tuple matching the inputs of ``SyncFunction.forward``; only the
    first entry (gradient w.r.t. ``x``) is not ``None``.
    """
    dist_group = ctx.dist_group
    if dist_group is False:
        # Forward returned early without storing ``dim`` / ``batch_sizes``,
        # so this check must come before they are read.
        return grad_output, None, None, None, None, None, None

    dim, batch_sizes = ctx.dim, ctx.batch_sizes
    dist_group = dist_group or _get_default_group()
    rank = dist.get_rank(group=dist_group)
    world_size = dist.get_world_size(group=dist_group)

    if not ctx.reduce_grad:
        # No communication: just take this rank's slice of the gradient.
        start = sum(batch_sizes[:rank])
        grad = grad_output.narrow(dim, start, batch_sizes[rank])
        return grad, None, None, None, None, None, None

    if grad_output.numel() < 1024 * world_size:
        return bwd_allreduce_impl(dim, batch_sizes, dist_group, grad_output)

    if len(set(batch_sizes)) == 1:
        return bwd_reducescatter_impl(dim, batch_sizes, dist_group, grad_output)

    # Choose between per-rank reduce and all-reduce from the gradient size
    # (in units of 2**20 elements) and ``world_size // 8``.
    size = grad_output.numel() / (1 << 20)
    nodes = world_size // 8
    node2size = [(1, 4), (2, 16), (4, 64), (8, 512)]
    for n, min_size in node2size:
        if nodes <= n:
            if size >= min_size:
                return bwd_reduce_impl(dim, batch_sizes, dist_group, grad_output)
            else:
                return bwd_allreduce_impl(dim, batch_sizes, dist_group, grad_output)

    return bwd_allreduce_impl(dim, batch_sizes, dist_group, grad_output)


def fwd_allgather_impl(dim, batch_sizes, dist_group, x):
    """Forward implementation based on a single flat all-gather.

    Every rank's slice is padded to ``max(batch_sizes)`` rows inside a shared
    buffer, gathered, and then compacted so the slices are adjacent.
    """
    if dim != 0:
        x = x.transpose(0, dim)

    rank = dist.get_rank(group=dist_group)
    world_size = dist.get_world_size(group=dist_group)
    max_batch_size = max(batch_sizes)

    shape = list(x.size())
    shape[0] = max_batch_size * world_size
    result_tensor = torch.empty(shape, dtype=x.dtype, device=x.device)
    tensors = result_tensor.chunk(world_size)
    tensors[rank][:x.size(0)].copy_(x)

    dist_group = dist_group or _get_default_group()

    # gather all tensors
    all_gather_base(result_tensor, tensors[rank], group=dist_group)

    # unroll: remove the padding between the per-rank slices
    tot = 0
    for i in range(world_size):
        src = i * max_batch_size
        if tot < src:
            # left shift; clone() because source and destination may overlap
            result_tensor[tot:tot + batch_sizes[i]] = \
                result_tensor[src:src + batch_sizes[i]].clone()
        tot += batch_sizes[i]

    output = result_tensor[:tot]

    if dim != 0:
        output = output.transpose(0, dim).contiguous()

    return output


def fwd_allreduce_impl(dim, batch_sizes, dist_group, x):
    """Forward implementation based on all-reduce.

    Each rank writes its slice into a zero tensor of the full output shape;
    the all-reduce then combines the slices from all ranks.
    """
    rank = dist.get_rank(group=dist_group)

    shape = list(x.size())
    shape[dim] = sum(batch_sizes)
    result_tensor = torch.zeros(shape, dtype=x.dtype, device=x.device)

    start = sum(batch_sizes[:rank])
    size = batch_sizes[rank]
    result_tensor.narrow(dim, start, size).data.copy_(x)
    dist.all_reduce(result_tensor, group=dist_group)

    return result_tensor


def bwd_reduce_impl(dim, batch_sizes, dist_group, grad_output):
    """Backward implementation issuing one ``dist.reduce`` per rank.

    The slice of the gradient belonging to rank ``i`` is reduced onto rank ``i``.
    """
    rank = dist.get_rank(group=dist_group)
    world_size = dist.get_world_size(group=dist_group)

    grads = []
    offset = 0
    for i in range(world_size):
        g = grad_output.narrow(dim, offset, batch_sizes[i])
        grads.append(g.contiguous())
        offset += batch_sizes[i]

    # dist.reduce is called with the destination's global rank.
    if dist_group is None or dist_group is _get_default_group():
        global_ranks = list(range(world_size))
    else:
        global_ranks = [_get_global_rank(dist_group, i) for i in range(world_size)]

    for i, global_rank in enumerate(global_ranks):
        dist.reduce(grads[i], global_rank, group=dist_group)

    return grads[rank], None, None, None, None, None, None


def bwd_allreduce_impl(dim, batch_sizes, dist_group, grad_output):
    """Backward implementation: all-reduce the full gradient, then take this rank's slice."""
    grad_output = grad_output.contiguous()
    rank = dist.get_rank(group=dist_group)
    dist.all_reduce(grad_output, group=dist_group)
    start = sum(batch_sizes[:rank])
    grad = grad_output.narrow(dim, start, batch_sizes[rank])

    return grad, None, None, None, None, None, None


def bwd_reducescatter_impl(dim, batch_sizes, dist_group, grad_output):
    """Backward implementation based on reduce-scatter.

    The output is sized from ``batch_sizes[0]``; the caller in this file only
    uses this path when all entries of ``batch_sizes`` are equal.
    """
    if dim != 0:
        grad_output = grad_output.transpose(0, dim)

    grad_output = grad_output.contiguous()
    shape = list(grad_output.shape)
    shape[0] = batch_sizes[0]
    grad = grad_output.new_empty(shape)

    reduce_scatter_base(grad, grad_output, group=dist_group)

    if dim != 0:
        grad = grad.transpose(0, dim)

    return grad, None, None, None, None, None, None


def all_gather_base(output, input, group):
    """All-gather ``input`` from every rank into the flat tensor ``output``.

    Uses the public ``all_gather_into_tensor`` when this PyTorch provides it,
    then the private ``_all_gather_base``, and finally a list-based
    ``all_gather`` on chunks of ``output``.
    """
    if hasattr(dist, "all_gather_into_tensor"):
        return dist.all_gather_into_tensor(output, input, group=group)

    if hasattr(dist, "_all_gather_base"):
        return dist._all_gather_base(output, input, group=group)

    chunks = list(output.chunk(group.size()))
    dist.all_gather(chunks, input, group=group)


def reduce_scatter_base(output, input, group):
    """Reduce the flat tensor ``input`` across ranks and scatter the result into ``output``.

    Uses the public ``reduce_scatter_tensor`` when this PyTorch provides it,
    then the private ``_reduce_scatter_base``, and finally a list-based
    ``reduce_scatter`` on chunks of ``input``.
    """
    if hasattr(dist, "reduce_scatter_tensor"):
        return dist.reduce_scatter_tensor(output, input, group=group)

    if hasattr(dist, "_reduce_scatter_base"):
        return dist._reduce_scatter_base(output, input, group=group)

    chunks = list(input.chunk(group.size()))
    dist.reduce_scatter(output, chunks, group=group)