
Source code for hfai.nn.modules.hf_norm

import torch
import torch.nn as nn
from torch import Tensor, Size
from typing import Union, List, Tuple
import numbers

no_layernorm = False
no_rmsnorm = False

# try to load the custom CUDA kernels; fall back to the flags above when they
# are unavailable
try:
    import hfai.hfcuda.layernorm as layernorm
except Exception:
    no_layernorm = True

try:
    import hfai.hfcuda.rmsnorm as rmsnorm
except Exception:
    no_rmsnorm = True


class LayerNormFunc(torch.autograd.Function):
    @staticmethod
    @torch.cuda.amp.custom_fwd(cast_inputs=torch.float32)
    def forward(ctx, x, normalized_shape, gamma, beta, eps, elementwise_affine, training):

        assert x.dtype == torch.float32, \
            'hfai.nn.LayerNorm currently only supports float32'

        shapex = x.shape
        index = x.device.index
        x = x.contiguous()

        if elementwise_affine is True:
            gamma = gamma.contiguous()
            beta = beta.contiguous()
        # hidden_size is the total number of elements over the normalized dimensions
        hidden_size = 1

        for size in normalized_shape:
            hidden_size = hidden_size * size

        # dispatch to the affine / plain CUDA kernels
        if elementwise_affine is True:
            y, x_mean, x_var = layernorm.forward(x.view((-1, hidden_size)), gamma, beta, eps, index)
        else:
            y, x_mean, x_var = layernorm.forward_without_gammabeta(x.view((-1, hidden_size)), eps, index)

        if training:
            # save what the backward pass needs only when gradients will be computed
            ctx.hidden_size = hidden_size
            ctx.elementwise_affine = elementwise_affine
            ctx.normalized_shape = normalized_shape
            ctx.save_for_backward(x, x_mean, x_var, gamma)

        return y.view(shapex)

    @staticmethod
    @torch.cuda.amp.custom_bwd
    @torch.autograd.function.once_differentiable
    def backward(ctx, dy):
        hidden_size = ctx.hidden_size
        elementwise_affine = ctx.elementwise_affine
        normalized_shape = ctx.normalized_shape
        x, x_mean, x_var, gamma = ctx.saved_tensors

        index = dy.device.index
        dy = dy.contiguous()
        if elementwise_affine is True:
            dxmat, dgamma, dbeta = layernorm.backward(dy.view((-1, hidden_size)), x.view((-1, hidden_size)), x_mean,
                                                      x_var, gamma, index)
            dgamma = dgamma.view(normalized_shape)
            dbeta = dbeta.view(normalized_shape)
        else:
            dxmat = \
                layernorm.backward_without_gammabeta(dy.view((-1, hidden_size)), x.view((-1, hidden_size)), x_mean,
                                                     x_var,
                                                     index)[0]

        dx = dxmat.view(dy.shape)

        if elementwise_affine is True:
            return dx, None, dgamma, dbeta, None, None, None
        else:
            return dx, None, None, None, None, None, None
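

# --- Example (not part of the original source): a minimal numerical check of
# LayerNormFunc against torch.nn.functional.layer_norm. This is a sketch only;
# it assumes a CUDA device and the hfai.hfcuda.layernorm extension are available.
def _layernorm_equivalence_sketch():
    import torch.nn.functional as F

    x = torch.randn(8, 256, device="cuda", requires_grad=True)
    gamma = torch.ones(256, device="cuda", requires_grad=True)
    beta = torch.zeros(256, device="cuda", requires_grad=True)

    # apply(input, normalized_shape, weight, bias, eps, elementwise_affine, training)
    y = LayerNormFunc.apply(x, (256,), gamma, beta, 1e-5, True, True)
    y_ref = F.layer_norm(x, (256,), gamma, beta, 1e-5)
    return torch.allclose(y, y_ref, atol=1e-5)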


_shape_t = Union[int, List[int], Size]


class LayerNorm(nn.LayerNorm):
    """
    A more efficient LayerNorm operator.

    The interface is identical to `PyTorch's LayerNorm
    <https://pytorch.org/docs/stable/generated/torch.nn.LayerNorm.html?highlight=layernorm#torch.nn.LayerNorm>`_.
    """

    def forward(self, input):
        # fall back to the native implementation for CPU tensors
        if not input.is_cuda:
            return super().forward(input)

        return LayerNormFunc.apply(input, self.normalized_shape, self.weight, self.bias, self.eps,
                                   self.elementwise_affine, torch.is_grad_enabled() and self.training)
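

# --- Example (not part of the original source): a minimal usage sketch of the
# LayerNorm module above. It assumes a CUDA device and the hfai.hfcuda extension
# are available; CPU inputs simply fall back to torch.nn.LayerNorm.
def _layernorm_usage_sketch():
    ln = LayerNorm(512).cuda()                    # same constructor as torch.nn.LayerNorm
    x = torch.randn(4, 128, 512, device="cuda")   # float32, as required by LayerNormFunc
    out = ln(x)                                   # shape preserved: (4, 128, 512)
    return out
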
# Reference implementation from Huggingface
def manual_rms_norm(input, normalized_shape, weight, eps):
    # layer norm should always be calculated in float32
    dims = tuple(i for i in range(-1, -len(normalized_shape) - 1, -1))
    variance = input.to(torch.float32).pow(2).mean(dims, keepdim=True)
    input = input * torch.rsqrt(variance + eps)

    if weight is None:
        return input

    # convert into half-precision if necessary
    if weight.dtype in [torch.float16, torch.bfloat16]:
        input = input.to(weight.dtype)

    return weight * input


class RMSNormFunc(torch.autograd.Function):
    @staticmethod
    def forward(ctx, input, normalized_shape, weight, eps, elementwise_affine, is_training):
        hidden_size = 1
        for size in normalized_shape:
            hidden_size *= size

        if elementwise_affine:
            weight = weight.contiguous()
        else:
            weight = torch.tensor([])

        input = input.contiguous()
        output, invvar = rmsnorm.forward(input.view(-1, hidden_size), weight, eps, hidden_size)

        if is_training:
            ctx.save_for_backward(input, weight, invvar)
            ctx.hidden_size = hidden_size
            ctx.eps = eps

        return output.view(input.shape)

    @staticmethod
    @torch.autograd.function.once_differentiable
    def backward(ctx, grad_output):
        hidden_size = ctx.hidden_size
        eps = ctx.eps
        input, weight, invvar = ctx.saved_tensors

        grad_output = grad_output.contiguous()
        grad_input, grad_weight = rmsnorm.backward(grad_output.view(-1, hidden_size),
                                                   input.view(-1, hidden_size),
                                                   weight, invvar, eps, hidden_size)
        grad_input = grad_input.view(grad_output.shape)

        return grad_input, None, grad_weight, None, None, None


class RMSNorm(nn.Module):
    def __init__(self, normalized_shape, eps=1e-5, elementwise_affine=True):
        super().__init__()
        if isinstance(normalized_shape, numbers.Integral):
            normalized_shape = (normalized_shape,)
        self.normalized_shape = torch.Size(normalized_shape)
        self.eps = eps
        self.elementwise_affine = elementwise_affine
        if self.elementwise_affine:
            self.weight = nn.Parameter(torch.ones(*normalized_shape))
        else:
            self.register_parameter("weight", None)

    def forward(self, input):
        # fall back to the reference implementation for CPU tensors
        if not input.is_cuda:
            return manual_rms_norm(input, self.normalized_shape, self.weight, self.eps)

        return RMSNormFunc.apply(input, self.normalized_shape, self.weight, self.eps,
                                 self.elementwise_affine, torch.is_grad_enabled() and self.training)
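

# --- Example (not part of the original source): a minimal usage sketch of RMSNorm.
# RMSNorm rescales activations by their root mean square without subtracting the
# mean; on CPU tensors the module falls back to the manual_rms_norm reference above,
# so this sketch does not require the hfai.hfcuda extension.
def _rmsnorm_usage_sketch():
    norm = RMSNorm(512)                  # elementwise_affine=True by default
    x = torch.randn(2, 16, 512)          # CPU tensor -> manual_rms_norm path
    y = norm(x)                          # shape preserved: (2, 16, 512)
    return y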