from typing import Union, Tuple, Optional
import numbers
import torch
from torch import nn
from torch.autograd import Function
from torch.nn.utils.rnn import PackedSequence
from .context import context
from .dropout import Dropout
try:
    import hfai.hfcuda.hf_a100_lstm_cuda_onchip_fp as hf_a100_lstm_cuda_onchip_fp
except ImportError:
    pass

try:
    import hfai.hfcuda.hf_a100_lstm_cuda_onchip_tf as hf_a100_lstm_cuda_onchip_tf
except ImportError:
    pass

try:
    import hfai.hfcuda.hf_a100_lstm_cuda_onchip_tf_small_h as hf_a100_lstm_cuda_onchip_tf_small_h
except ImportError:
    pass

try:
    import hfai.hfcuda.hf_a100_lstm_cuda_onchip_bf16 as hf_a100_lstm_cuda_onchip_bf16
except ImportError:
    pass

try:
    import hfai.hfcuda.hf_a100_lstm_cuda_offchip as hf_a100_lstm_cuda_offchip
except ImportError:
    pass
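
# The CUDA LSTM kernels above are optional; each import is wrapped in try/except so
# this module still loads on machines without the corresponding extension.
# get_LSTM() picks the best available kernel for the current GPU, batch size and
# hidden size, falling back to the generic off-chip implementation.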
def get_LSTM(bs, hidden_size, device):
if torch.cuda.get_device_capability() != (
8, 0
) or torch.cuda.get_device_properties(device).multi_processor_count < 108:
return hf_a100_lstm_cuda_offchip
elif not torch.backends.cuda.matmul.allow_tf32:
if bs <= 16 and hidden_size <= 1728:
            try:
                return hf_a100_lstm_cuda_onchip_fp
            except NameError:
                return hf_a100_lstm_cuda_offchip
        else:
            return hf_a100_lstm_cuda_offchip
    elif context.GetRnnAllowConversion():
        if bs <= 72 and hidden_size <= 1728:
            try:
                return hf_a100_lstm_cuda_onchip_bf16
            except NameError:
                return hf_a100_lstm_cuda_offchip
        else:
            return hf_a100_lstm_cuda_offchip
    else:
        if bs <= 64 and hidden_size <= 1728:
            try:
                return hf_a100_lstm_cuda_onchip_tf
            except NameError:
                return hf_a100_lstm_cuda_offchip
        elif bs <= 512 and hidden_size <= 512:
            try:
                return hf_a100_lstm_cuda_onchip_tf_small_h
            except NameError:
                return hf_a100_lstm_cuda_offchip
        else:
            return hf_a100_lstm_cuda_offchip
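
# Autograd Function wrapping the CUDA LSTM kernels.  In training mode the forward
# pass saves the activations (outputs, cells and linear gates) needed by the fused
# backward kernel; in inference mode (eval or no_grad) the lighter forward_infer
# kernel is used instead.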
class LSTMFunction(Function):
@staticmethod
def forward(ctx, x, weight_ih, weight_hh, bias_ih, bias_hh, h_0, c_0, training):
ctx.LSTM = get_LSTM(x.size()[1], weight_hh.size()[1], x.device)
if training:
h_1, c_1, y, cells, linear_gates = ctx.LSTM.forward(
x, weight_ih, weight_hh, bias_ih, bias_hh, h_0, c_0)
ctx.save_for_backward(x, weight_ih, weight_hh, h_0, c_0, y, cells, linear_gates)
return h_1, c_1, y.narrow(0, 1, x.size(0))
else:
h_1, c_1, y = ctx.LSTM.forward_infer(
x, weight_ih, weight_hh, bias_ih, bias_hh, h_0, c_0)
return h_1, c_1, y
@staticmethod
@torch.autograd.function.once_differentiable
def backward(ctx, dh_1, dc_1, dh_layer):
dh_1 = dh_1.contiguous()
dc_1 = dc_1.contiguous()
dh_layer = dh_layer.contiguous()
variables = ctx.saved_tensors
dx, dh_0, dc_0, dweight_ih, dweight_hh, dbias_ih, dbias_hh = ctx.LSTM.backward(
*variables, dh_layer, dh_1, dc_1, None)
ret = dx, dweight_ih, dweight_hh, dbias_ih, dbias_hh, dh_0, dc_0, None
return ret
class LSTM(nn.LSTM):
    """
    Efficient LSTM operator.

    Used in the same way as `PyTorch's LSTM operator <https://pytorch.org/docs/stable/generated/torch.nn.LSTM.html?highlight=lstm#torch.nn.LSTM>`_.

    The ``proj_size`` argument is not supported.

    .. note::
        An extra ``drop_connect`` argument is supported. If ``0 < drop_connect <= 1``,
        a ``Dropout(p=drop_connect)`` layer is applied to every ``weight_hh``.

    .. note::
        Three precision modes are supported:

        1) TF32 mode (default): matrix multiplications inside the LSTM are accelerated with TF32.
           When ``batch_size <= 64 && hidden_size <= 1728`` or ``batch_size <= 512 && hidden_size <= 512``,
           the LSTM is further accelerated with the persistent method.

        2) Float32 mode: matrix multiplications inside the LSTM use full FP32 precision.
           Requires ``torch.backends.cuda.matmul.allow_tf32 = False``.
           When ``batch_size <= 16 && hidden_size <= 1728``, the LSTM is further accelerated with the persistent method.

        3) BFloat16 mode: matrix multiplications inside the LSTM are accelerated with BFloat16.
           Requires ``hfai.nn.context.SetRnnAllowConversion(True)`` and ``batch_size <= 72 && hidden_size <= 1728``.

    .. note::
        Performance is best when ``hidden_size`` is a multiple of 64.

    Examples:

    .. code-block:: python

        lstm = hfai.nn.LSTM(input_size=10, hidden_size=20).cuda()

        input0 = torch.randn(5, 100, 10).cuda()
        output, (hn, cn) = lstm(input0, None)  # TF32 mode, persistent method not used

        hfai.nn.context.SetRnnAllowConversion(True)
        input1 = torch.randn(5, 64, 10).cuda()
        output, (hn, cn) = lstm(input1, None)  # BFloat16 mode, persistent method used
        hfai.nn.context.SetRnnAllowConversion(False)

        input2 = torch.randn(5, 8, 10).cuda()
        output, (hn, cn) = lstm(input2, None)  # TF32 mode, persistent method used
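
        # Minimal drop_connect sketch: every weight_hh passes through
        # Dropout(p=drop_connect) during training.
        lstm_dc = hfai.nn.LSTM(input_size=10, hidden_size=20, drop_connect=0.1).cuda()
        output, (hn, cn) = lstm_dc(input2, None)

        # Minimal Float32-mode sketch: disable TF32 matmul first; with
        # batch_size <= 16 the persistent method still applies (see note above).
        torch.backends.cuda.matmul.allow_tf32 = False
        output, (hn, cn) = lstm(input2, None)  # Float32 mode, persistent method used
        torch.backends.cuda.matmul.allow_tf32 = True  # switch back to TF32 mode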
"""
def __init__(self, *args, **kwargs):
drop_connect = kwargs.pop('drop_connect', 0)
        if not isinstance(drop_connect, numbers.Number) or not 0 <= drop_connect <= 1 or isinstance(drop_connect, bool):
            raise ValueError("drop_connect should be a number in range [0, 1] "
                             "representing the probability of an element being "
                             "zeroed")
super().__init__(*args, **kwargs)
if self.dropout != 0:
self.drop = Dropout(self.dropout)
else:
self.drop = None
if drop_connect != 0:
self.drop_connect = Dropout(drop_connect)
else:
self.drop_connect = lambda x: x
def forward(self, input: Union[torch.Tensor, PackedSequence],
hx: Optional[Tuple[torch.Tensor, torch.Tensor]] = None):
if not input.is_cuda:
return super().forward(input, hx)
if isinstance(input, PackedSequence):
return super().forward(input, hx)
if input.dtype != torch.float:
return super().forward(input, hx)
bs = input.size()[0] if self.batch_first else input.size()[1]
D = 2 if self.bidirectional else 1
if hx is None:
h = torch.zeros(self.num_layers * D, bs, self.hidden_size, device=input.device)
c = torch.zeros(self.num_layers * D, bs, self.hidden_size, device=input.device)
else:
h, c = hx
h = h.contiguous()
c = c.contiguous()
self.check_forward_args(input, (h, c), None)
if self.batch_first:
input = input.transpose(0, 1)
input = input.contiguous()
seq = input.size()[0]
if not self.bias:
bias = torch.zeros(4 * self.hidden_size, device=input.device)
h_out = []
c_out = []
y = input
for i in range(self.num_layers):
h1, c1, y1 = LSTMFunction.apply(
y, getattr(self, f'weight_ih_l{i}'),
self.drop_connect(getattr(self, f'weight_hh_l{i}')),
getattr(self, f'bias_ih_l{i}') if self.bias else bias,
getattr(self, f'bias_hh_l{i}') if self.bias else bias,
h[i * D], c[i * D], self.training and torch.is_grad_enabled())
h_out.append(h1)
c_out.append(c1)
if self.bidirectional:
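                # Reverse direction: flip the input along time, run the *_reverse
                # weights, then flip the output back before concatenating
                # (flipping is skipped when seq == 1).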
h1_reverse, c1_reverse, y1_reverse = LSTMFunction.apply(
y if seq == 1 else y.flip(0),
getattr(self, f'weight_ih_l{i}_reverse'),
self.drop_connect(getattr(self, f'weight_hh_l{i}_reverse')),
getattr(self, f'bias_ih_l{i}_reverse') if self.bias else bias,
getattr(self, f'bias_hh_l{i}_reverse') if self.bias else bias,
h[i * 2 + 1], c[i * 2 + 1], self.training and torch.is_grad_enabled())
h_out.append(h1_reverse)
c_out.append(c1_reverse)
y1 = torch.cat((y1, y1_reverse if seq == 1 else y1_reverse.flip(0)), 2)
y = y1
if self.drop is not None and i != self.num_layers - 1:
y = self.drop(y)
return y.transpose(0, 1) if self.batch_first else y, \
(torch.stack(h_out, 0), torch.stack(c_out, 0))
class LSTM_fullcFunction(Function):
@staticmethod
def forward(ctx, x, weight_ih, weight_hh, bias_ih, bias_hh, h_0, c_0, training):
ctx.LSTM = get_LSTM(x.size()[1], weight_hh.size()[1], x.device)
if training:
h_1, c_1, y, cells, linear_gates = ctx.LSTM.forward(
x, weight_ih, weight_hh, bias_ih, bias_hh, h_0, c_0)
ctx.save_for_backward(x, weight_ih, weight_hh, h_0, c_0, y, cells, linear_gates)
return h_1, c_1, y.narrow(0, 1, x.size(0)), cells
else:
h_1, c_1, y, cells, linear_gates = ctx.LSTM.forward(
x, weight_ih, weight_hh, bias_ih, bias_hh, h_0, c_0)
return h_1, c_1, y.narrow(0, 1, x.size(0)), cells
@staticmethod
@torch.autograd.function.once_differentiable
def backward(ctx, dh_1, dc_1, dh_layer, dcells):
dh_1 = dh_1.contiguous()
dc_1 = dc_1.contiguous()
dh_layer = dh_layer.contiguous()
dcells = dcells.contiguous()
variables = ctx.saved_tensors
dx, dh_0, dc_0, dweight_ih, dweight_hh, dbias_ih, dbias_hh = ctx.LSTM.backward(
*variables, dh_layer, dh_1, dc_1, dcells)
ret = dx, dweight_ih, dweight_hh, dbias_ih, dbias_hh, dh_0, dc_0, None
return ret
class LSTM_fullc(nn.LSTM):
    """
    Efficient LSTM operator that additionally returns the full cell states.

    The model parameters and Inputs are the same as `PyTorch's LSTM operator <https://pytorch.org/docs/stable/generated/torch.nn.LSTM.html?highlight=lstm#torch.nn.LSTM>`_.

    The ``proj_size`` argument is not supported.

    Outputs: output, (h_n, c_n), full_c
        * **output**: same as PyTorch
        * **h_n**: same as PyTorch
        * **c_n**: same as PyTorch
        * **full_c**: ``(seq_len, D * num_layers, batch_size, hidden_size)``, the complete cell states for all seq_len steps

    .. note::
        An extra ``drop_connect`` argument is supported. If ``0 < drop_connect <= 1``,
        a ``Dropout(p=drop_connect)`` layer is applied to every ``weight_hh``.

    .. note::
        Three precision modes are supported:

        1) TF32 mode (default): matrix multiplications inside the LSTM are accelerated with TF32.
           When ``batch_size <= 64 && hidden_size <= 1728``, the LSTM is further accelerated with the persistent method.

        2) Float32 mode: matrix multiplications inside the LSTM use full FP32 precision.
           Requires ``torch.backends.cuda.matmul.allow_tf32 = False``.
           When ``batch_size <= 16 && hidden_size <= 1728``, the LSTM is further accelerated with the persistent method.

        3) BFloat16 mode: matrix multiplications inside the LSTM are accelerated with BFloat16.
           Requires ``hfai.nn.context.SetRnnAllowConversion(True)`` and ``batch_size <= 72 && hidden_size <= 1728``.

    .. note::
        Performance is best when ``hidden_size`` is a multiple of 64.

    Examples:

    .. code-block:: python

        lstm_fullc = hfai.nn.LSTM_fullc(input_size=10, hidden_size=20).cuda()

        input0 = torch.randn(5, 100, 10).cuda()
        output, (hn, cn), full_c = lstm_fullc(input0, None)  # TF32 mode, persistent method not used

        hfai.nn.context.SetRnnAllowConversion(True)
        input1 = torch.randn(5, 64, 10).cuda()
        output, (hn, cn), full_c = lstm_fullc(input1, None)  # BFloat16 mode, persistent method used
        hfai.nn.context.SetRnnAllowConversion(False)

        input2 = torch.randn(5, 8, 10).cuda()
        output, (hn, cn), full_c = lstm_fullc(input2, None)  # TF32 mode, persistent method used
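
        # Minimal full_c sketch: in addition to the final (hn, cn), full_c gathers
        # the cell state at every step.
        print(full_c.shape)  # (seq_len, D * num_layers, batch_size, hidden_size)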
"""
def __init__(self, *args, **kwargs):
drop_connect = kwargs.pop('drop_connect', 0)
        if not isinstance(drop_connect, numbers.Number) or not 0 <= drop_connect <= 1 or isinstance(drop_connect, bool):
            raise ValueError("drop_connect should be a number in range [0, 1] "
                             "representing the probability of an element being "
                             "zeroed")
super().__init__(*args, **kwargs)
if self.dropout != 0:
self.drop = Dropout(self.dropout)
else:
self.drop = None
if drop_connect != 0:
self.drop_connect = Dropout(drop_connect)
else:
self.drop_connect = lambda x: x
def forward(self, input: Union[torch.Tensor, PackedSequence],
hx: Optional[Tuple[torch.Tensor, torch.Tensor]] = None):
if not input.is_cuda:
return super().forward(input, hx)
if isinstance(input, PackedSequence):
return super().forward(input, hx)
if input.dtype != torch.float:
return super().forward(input, hx)
bs = input.size()[0] if self.batch_first else input.size()[1]
D = 2 if self.bidirectional else 1
if hx is None:
h = torch.zeros(self.num_layers * D, bs, self.hidden_size, device=input.device)
c = torch.zeros(self.num_layers * D, bs, self.hidden_size, device=input.device)
else:
h, c = hx
h = h.contiguous()
c = c.contiguous()
self.check_forward_args(input, (h, c), None)
if self.batch_first:
input = input.transpose(0, 1)
input = input.contiguous()
seq = input.size()[0]
if not self.bias:
bias = torch.zeros(4 * self.hidden_size, device=input.device)
h_out = []
c_out = []
cells_out = []
y = input
for i in range(self.num_layers):
h1, c1, y1, cells = LSTM_fullcFunction.apply(
y, getattr(self, f'weight_ih_l{i}'),
self.drop_connect(getattr(self, f'weight_hh_l{i}')),
getattr(self, f'bias_ih_l{i}') if self.bias else bias,
getattr(self, f'bias_hh_l{i}') if self.bias else bias,
h[i * D], c[i * D], self.training and torch.is_grad_enabled())
h_out.append(h1)
c_out.append(c1)
cells_out.append(cells)
if self.bidirectional:
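                # Reverse direction: flip the input along time, run the *_reverse
                # weights, then flip the output back before concatenating
                # (flipping is skipped when seq == 1).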
h1_reverse, c1_reverse, y1_reverse, cells_reverse = LSTM_fullcFunction.apply(
y if seq == 1 else y.flip(0),
getattr(self, f'weight_ih_l{i}_reverse'),
self.drop_connect(getattr(self, f'weight_hh_l{i}_reverse')),
getattr(self, f'bias_ih_l{i}_reverse') if self.bias else bias,
getattr(self, f'bias_hh_l{i}_reverse') if self.bias else bias,
h[i * 2 + 1], c[i * 2 + 1], self.training and torch.is_grad_enabled())
h_out.append(h1_reverse)
c_out.append(c1_reverse)
cells_out.append(cells_reverse)
y1 = torch.cat((y1, y1_reverse if seq == 1 else y1_reverse.flip(0)), 2)
y = y1
if self.drop is not None and i != self.num_layers - 1:
y = self.drop(y)
return y.transpose(0, 1) if self.batch_first else y, \
(torch.stack(h_out, 0), torch.stack(c_out, 0)), torch.stack(cells_out, 1)
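
# Optional CUDA GRU kernels, mirroring the LSTM kernel imports above.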
try:
    import hfai.hfcuda.hf_a100_gru_cuda_onchip_tf as hf_a100_gru_cuda_onchip_tf
except ImportError:
    pass

try:
    import hfai.hfcuda.hf_a100_gru_cuda_offchip as hf_a100_gru_cuda_offchip
except ImportError:
    pass
def get_GRU(bs, hidden_size, device):
if torch.cuda.get_device_capability() != (
8, 0
) or torch.cuda.get_device_properties(device).multi_processor_count < 108:
return hf_a100_gru_cuda_offchip
elif not torch.backends.cuda.matmul.allow_tf32:
return hf_a100_gru_cuda_offchip
else:
if bs <= 64 and hidden_size <= 1728:
            try:
                return hf_a100_gru_cuda_onchip_tf
            except NameError:
                return hf_a100_gru_cuda_offchip
else:
return hf_a100_gru_cuda_offchip
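
# Autograd Function wrapping the CUDA GRU kernels; forward_infer is used when run
# in eval mode or with gradients disabled.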
class GRUFunction(Function):
@staticmethod
def forward(ctx, x, weight_ih, weight_hh, bias_ih, bias_hh, h_0, training):
ctx.GRU = get_GRU(x.size()[1], weight_hh.size()[1], x.device)
if training:
h_1, y, linear_gates = ctx.GRU.forward(
x, weight_ih, weight_hh, bias_ih, bias_hh, h_0)
ctx.save_for_backward(x, weight_ih, weight_hh, h_0, y, linear_gates)
return h_1, y.narrow(0, 1, x.size(0))
else:
h_1, y = ctx.GRU.forward_infer(
x, weight_ih, weight_hh, bias_ih, bias_hh, h_0)
return h_1, y
@staticmethod
@torch.autograd.function.once_differentiable
def backward(ctx, dh_1, dh_layer):
dh_1 = dh_1.contiguous()
dh_layer = dh_layer.contiguous()
variables = ctx.saved_tensors
dx, dh_0, dweight_ih, dweight_hh, dbias_ih, dbias_hh = ctx.GRU.backward(
*variables, dh_layer, dh_1)
ret = dx, dweight_ih, dweight_hh, dbias_ih, dbias_hh, dh_0, None
return ret
class GRU(nn.GRU):
    """
    Efficient GRU operator.

    Used in the same way as `PyTorch's GRU operator <https://pytorch.org/docs/stable/generated/torch.nn.GRU.html?highlight=lstm#torch.nn.GRU>`_.

    .. note::
        An extra ``drop_connect`` argument is supported. If ``0 < drop_connect <= 1``,
        a ``Dropout(p=drop_connect)`` layer is applied to every ``weight_hh``.

    .. note::
        Two precision modes are supported:

        1) TF32 mode (default): matrix multiplications inside the GRU are accelerated with TF32.
           When ``batch_size <= 64 && hidden_size <= 1728``, the GRU is further accelerated with the persistent method.

        2) Float32 mode: matrix multiplications inside the GRU use full FP32 precision.
           Requires ``torch.backends.cuda.matmul.allow_tf32 = False``.

    .. note::
        Performance is best when ``hidden_size`` is a multiple of 64.

    Examples:

    .. code-block:: python

        gru = hfai.nn.GRU(input_size=10, hidden_size=20).cuda()

        input0 = torch.randn(5, 100, 10).cuda()
        output, hn = gru(input0, None)  # TF32 mode, persistent method not used

        input2 = torch.randn(5, 8, 10).cuda()
        output, hn = gru(input2, None)  # TF32 mode, persistent method used
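
        # Minimal drop_connect sketch: every weight_hh passes through
        # Dropout(p=drop_connect) during training.
        gru_dc = hfai.nn.GRU(input_size=10, hidden_size=20, drop_connect=0.1).cuda()
        output, hn = gru_dc(input2, None)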
"""
def __init__(self, *args, **kwargs):
drop_connect = kwargs.pop('drop_connect', 0)
        if not isinstance(drop_connect, numbers.Number) or not 0 <= drop_connect <= 1 or isinstance(drop_connect, bool):
            raise ValueError("drop_connect should be a number in range [0, 1] "
                             "representing the probability of an element being "
                             "zeroed")
super().__init__(*args, **kwargs)
if self.dropout != 0:
self.drop = Dropout(self.dropout)
else:
self.drop = None
if drop_connect != 0:
self.drop_connect = Dropout(drop_connect)
else:
self.drop_connect = lambda x: x
def forward(self, input: Union[torch.Tensor, PackedSequence],
hx: Optional[torch.Tensor] = None):
if not input.is_cuda:
return super().forward(input, hx)
if isinstance(input, PackedSequence):
return super().forward(input, hx)
if input.dtype != torch.float:
return super().forward(input, hx)
bs = input.size()[0] if self.batch_first else input.size()[1]
D = 2 if self.bidirectional else 1
if hx is None:
h = torch.zeros(self.num_layers * D, bs, self.hidden_size, device=input.device)
else:
h = hx
h = h.contiguous()
self.check_forward_args(input, h, None)
if self.batch_first:
input = input.transpose(0, 1)
input = input.contiguous()
seq = input.size()[0]
if not self.bias:
bias = torch.zeros(3 * self.hidden_size, device=input.device)
h_out = []
y = input
for i in range(self.num_layers):
h1, y1 = GRUFunction.apply(
y, getattr(self, f'weight_ih_l{i}'),
self.drop_connect(getattr(self, f'weight_hh_l{i}')),
getattr(self, f'bias_ih_l{i}') if self.bias else bias,
getattr(self, f'bias_hh_l{i}') if self.bias else bias,
h[i * D], self.training and torch.is_grad_enabled())
h_out.append(h1)
if self.bidirectional:
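                # Reverse direction: flip the input along time, run the *_reverse
                # weights, then flip the output back before concatenating
                # (flipping is skipped when seq == 1).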
h1_reverse, y1_reverse = GRUFunction.apply(
y if seq == 1 else y.flip(0),
getattr(self, f'weight_ih_l{i}_reverse'),
self.drop_connect(getattr(self, f'weight_hh_l{i}_reverse')),
getattr(self, f'bias_ih_l{i}_reverse') if self.bias else bias,
getattr(self, f'bias_hh_l{i}_reverse') if self.bias else bias,
h[i * 2 + 1], self.training and torch.is_grad_enabled())
h_out.append(h1_reverse)
y1 = torch.cat((y1, y1_reverse if seq == 1 else y1_reverse.flip(0)), 2)
y = y1
if self.drop is not None and i != self.num_layers - 1:
y = self.drop(y)
return y.transpose(0, 1) if self.batch_first else y, \
torch.stack(h_out, 0)