Fundamental Components of Deep Learning

Fundamental components to build deep learning systems.

6 papers

Written by Junkun Yuan.

Click here to go back to main contents.


Table of contents:

Papers are displayed in reverse chronological order. High-impact or inspiring works are highlighted in red.

Normalization

Root Mean Square Layer Normalization

Biao Zhang, Rico Sennrich

University of Edinburgh, University of Zurich

Advances in Neural Information Processing Systems (NeurIPS), 2019

Oct 16, 2019   |   RMS Norm   |   code


It proposes an efficient layer normalization method that maintains the re-scaling invariance property of LayerNorm while eliminating re-centering.

  • Why is RMS Norm more efficient than Layer Norm? Layer Norm needs to calculate mean and variance, requiring two passes over the data and extra subtraction operations. RMS Norm only needs one pass, reducing operations and memory accesses and making it GPU-friendly.

import torch
import torch.nn as nn

## --------------------------------------------------------------------------------
## Build customized RMS Normalization
## --------------------------------------------------------------------------------
class RMSNorm(nn.Module):
    """Root-mean-square normalization over the last dimension.

    Every vector along the last axis of the input is divided by
    ``sqrt(mean(x ** 2) + eps)`` and then multiplied element-wise by a
    learnable ``weight`` of size ``hidden_size`` (initialised to ones).
    No mean is subtracted and there is no bias term.

    Args:
        hidden_size: size of the last input dimension / of ``weight``.
        eps: constant added to the mean of squares before the square root.
    """

    def __init__(self, hidden_size, eps=1e-6):
        super().__init__()
        self.eps = eps
        self.weight = nn.Parameter(torch.ones(hidden_size))

    def forward(self, x):
        """Return ``weight * x / rms(x)`` with the RMS taken over dim -1."""
        # keepdim=True lets the per-vector statistic broadcast against ``x``.
        mean_square = x.pow(2).mean(dim=-1, keepdim=True)
        denom = (mean_square + self.eps).sqrt()
        return self.weight * (x / denom)

## --------------------------------------------------------------------------------
## Test the customized RMS Normalization
## --------------------------------------------------------------------------------
## Random input of shape (batch, token_num, hidden_size).
batch, token_num, hidden_size = 2, 16, 128
x = torch.randn(batch, token_num, hidden_size)

custom_rmsn = RMSNorm(hidden_size)
## nn.RMSNorm defaults to eps=None (it then falls back to the machine epsilon
## of the input dtype), which is not the eps used by the custom module. Pass
## the same eps so that both modules evaluate exactly the same formula.
torch_rmsn = nn.RMSNorm(hidden_size, eps=custom_rmsn.eps)

## Start both modules from identical scale parameters.
custom_rmsn.weight.data = torch_rmsn.weight.data.clone()

## True when the two implementations agree within allclose's default tolerances.
print(torch.allclose(custom_rmsn(x), torch_rmsn(x)))
## --------------------------------------------------------------------------------

Group Normalization

Yuxin Wu, Kaiming He

Facebook AI Research (FAIR)

European Conference on Computer Vision (ECCV), 2018

Mar 22, 2018   |   Group Norm   |   code


It normalizes features along channel groups, achieving stable accuracy even for very small batches.


## --------------------------------------------------------------------------------
## Build customized Group Normalization
## --------------------------------------------------------------------------------
import torch
from torch import nn 

class CustomGroupNorm(nn.Module):
    """Group Normalization for 4-D inputs of shape (N, C, H, W).

    The C channels are split into ``num_groups`` contiguous groups. For each
    sample and each group, the mean and (biased) variance are computed over
    all channels of the group and all spatial positions, and the group is
    normalized with them. A learnable per-channel scale (``weight``) and
    shift (``bias``) are applied afterwards.

    Args:
        num_features: number of channels C.
        num_groups: number of groups; must divide ``num_features``.
        eps: constant added to the variance before the square root.

    Raises:
        ValueError: if ``num_features`` is not divisible by ``num_groups``.
    """

    def __init__(self, num_features, num_groups, eps=1e-5):
        super().__init__()
        if num_features % num_groups != 0:
            raise ValueError(
                "num_features must be divisible by num_groups, got "
                f"num_features={num_features} and num_groups={num_groups}"
            )
        self.num_groups = num_groups
        self.num_features = num_features
        self.eps = eps
        self.weight = nn.Parameter(torch.ones(num_features))
        self.bias = nn.Parameter(torch.zeros(num_features))

    def forward(self, x):
        """Normalize ``x`` of shape (N, C, H, W) group-wise; shape is kept."""
        N, C, H, W = x.shape
        # (N, C, H, W) -> (N, G, C // G, H, W)
        grouped = x.view(N, self.num_groups, -1, H, W)
        # Statistics span the channels inside a group AND the spatial axes.
        # Reducing over (3, 4) only would normalize every channel on its own,
        # which is Instance Norm rather than Group Norm.
        group_dims = (2, 3, 4)
        mean = grouped.mean(dim=group_dims, keepdim=True)
        var = grouped.var(dim=group_dims, unbiased=False, keepdim=True)
        x_norm = (grouped - mean) / torch.sqrt(var + self.eps)
        x_norm = x_norm.view(N, C, H, W)
        # Per-channel affine transform, broadcast over batch and space.
        return self.weight.view(1, C, 1, 1) * x_norm + self.bias.view(1, C, 1, 1)
## --------------------------------------------------------------------------------

## --------------------------------------------------------------------------------
## Test the customized Group Normalization
## --------------------------------------------------------------------------------
if __name__ == "__main__":
    ## Random input in [0, 10) laid out as (N, C, H, W).
    N, C, H, W = 4, 6, 224, 224
    num_groups = 1
    x = torch.rand(N, C, H, W) * 10

    ## Reference layer from torch and the custom layer under test.
    torch_gn = torch.nn.GroupNorm(num_groups=num_groups, num_channels=C)
    custom_gn = CustomGroupNorm(num_features=C, num_groups=num_groups)

    ## Give both layers the same affine parameters so that only the
    ## normalization itself can make the outputs differ.
    custom_gn.weight.data = torch_gn.weight.data.clone()
    custom_gn.bias.data = torch_gn.bias.data.clone()

    ## One line per tolerance: True only when the two outputs agree within
    ## that absolute tolerance, False otherwise.
    for atol in (0.01, 0.1):
        print(torch.allclose(torch_gn(x), custom_gn(x), atol=atol))
## --------------------------------------------------------------------------------

Instance Normalization: The Missing Ingredient for Fast Stylization

Dmitry Ulyanov, Andrea Vedaldi, Victor Lempitsky

Skoltech & Yandex, University of Oxford

arXiv, 2016

Jul 27, 2016   |   Instance Norm   |   code


It normalizes each channel of each sample independently over its spatial dimensions (no statistics are shared across the batch or across channels) to improve visual generation quality.


## --------------------------------------------------------------------------------
## Build customized Instance Normalization
## --------------------------------------------------------------------------------
import torch
from torch import nn 

class CustomInstanceNorm(nn.Module):
    """Instance Normalization with a learnable per-channel scale and shift.

    Mean and biased variance are computed over dims 2 and 3 of the input
    separately for every (sample, channel) pair. The normalized result is
    scaled by ``gamma`` and shifted by ``beta``, both of size
    ``num_features`` and reshaped to (1, C, 1, 1) for broadcasting.

    Args:
        num_features: number of channels (size of dim 1 of the input).
        eps: constant added to the variance before the square root.
    """

    def __init__(self, num_features, eps=1e-5):
        super().__init__()
        self.gamma = nn.Parameter(torch.ones(num_features))
        self.beta = nn.Parameter(torch.zeros(num_features))
        self.eps = eps

    def forward(self, x):
        """Normalize ``x`` over its two trailing (spatial) axes."""
        spatial_axes = (2, 3)
        mu = x.mean(dim=spatial_axes, keepdim=True)
        sigma_sq = x.var(dim=spatial_axes, unbiased=False, keepdim=True)
        std = (sigma_sq + self.eps).sqrt()

        # Reshape the per-channel parameters so they broadcast over
        # batch, height and width.
        scale = self.gamma.view(1, -1, 1, 1)
        shift = self.beta.view(1, -1, 1, 1)
        return scale * ((x - mu) / std) + shift
## --------------------------------------------------------------------------------

## --------------------------------------------------------------------------------
## Test the customized Instance Normalization
## --------------------------------------------------------------------------------
## Random input laid out as (batch, channels, height, width).
C = 3
x = torch.randn(4, C, 224, 224)
## Reference layer from torch and the custom layer, both left at their
## freshly-constructed parameters.
torch_in = torch.nn.InstanceNorm2d(num_features=C)
custom_in = CustomInstanceNorm(num_features=C)
## True when both outputs agree within an absolute tolerance of 1e-5.
outputs_match = torch.allclose(torch_in(x), custom_in(x), atol=1e-5)
print(outputs_match)
## --------------------------------------------------------------------------------

Layer Normalization

Jimmy Lei Ba, Jamie Ryan Kiros, Geoffrey E. Hinton

University of Toronto, Google

arXiv, 2016

Jul 21, 2016   |   Layer Norm

It normalizes across features of each sample, making it suitable for RNNs and cases with small / variable batch sizes. It has over 16,000 citations (as of Aug 2025).


It normalizes each sample across its feature dimensions, independently of the other samples in the batch, so it adapts to cases with small / variable batch sizes.


## --------------------------------------------------------------------------------
## Build customized Layer Normalization
## --------------------------------------------------------------------------------
import torch
from torch import nn 

class CustomLayerNorm(nn.Module):
    """Layer Normalization over the trailing dimensions given by ``shape``.

    The mean and biased variance are computed per sample over the last
    ``len(shape)`` axes of the input (one axis when ``shape`` is an int),
    so the module works for inputs of any rank, e.g. (batch, C, H, W) with
    ``shape=(C, H, W)`` or (batch, tokens, hidden) with ``shape=hidden``.
    The normalized values are scaled by ``gamma`` and shifted by ``beta``,
    both of shape ``shape``.

    Args:
        shape: int or sequence of ints; sizes of the trailing input
            dimensions to normalize over.
        eps: constant added to the variance before the square root.
    """

    def __init__(self, shape, eps=1e-5):
        super().__init__()
        self.gamma = nn.Parameter(torch.ones(shape))
        self.beta = nn.Parameter(torch.zeros(shape))
        self.eps = eps
        # Reduce over exactly as many trailing axes as the parameters have,
        # instead of hard-coding axes (1, 2, 3), which only fits 4-D inputs.
        self._norm_dims = tuple(range(-self.gamma.dim(), 0))

    def forward(self, x):
        """Normalize ``x`` over its trailing ``shape`` dimensions."""
        mean = x.mean(dim=self._norm_dims, keepdim=True)
        var = x.var(dim=self._norm_dims, unbiased=False, keepdim=True)
        x_norm = (x - mean) / torch.sqrt(var + self.eps)
        # gamma / beta broadcast against the trailing axes of x_norm.
        return self.gamma * x_norm + self.beta
## --------------------------------------------------------------------------------

## --------------------------------------------------------------------------------
## Test the customized Layer Normalization
## --------------------------------------------------------------------------------
## Random (batch, C, H, W) input; the normalized shape covers every
## non-batch dimension.
C, H, W = 3, 224, 224
x = torch.randn(4, C, H, W)
normalized_shape = (C, H, W)
## Note that LayerNorm often applies to the final dim (the feature dim)
torch_ln = torch.nn.LayerNorm(normalized_shape)
custom_ln = CustomLayerNorm(normalized_shape)
## True when both outputs agree within an absolute tolerance of 1e-5.
print(torch.allclose(custom_ln(x), torch_ln(x), atol=1e-5))
## --------------------------------------------------------------------------------

Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift

Sergey Ioffe, Christian Szegedy

Google

International Conference on Machine Learning (ICML), 2015

Feb 11, 2015   |   Batch Norm

It normalizes the activations of each layer within a batch, improving training speed, stability, and generalization. It has over 60,000 citations (as of Aug 2025).


It normalizes layer inputs along channels such that higher lr and saturating nonlinearities can be applied, and careful parameter initialization is not needed.

  • Reason for instability. The inputs to each layer are affected by the parameters of all preceding layers, so small changes to the network are amplified as the network becomes deeper. Besides, changes in the input and output distributions of each layer hinder its training.
  • Previous solutions to instability. non-saturating nonlinearities like ReLU, careful parameter initialization, small learning rate, dropout.
  • Batch normalization. It normalizes each channel by the mean and standard deviation computed over the mini-batch to stabilize the input distribution of each layer.
  • Performance. It applies to the best performing ImageNet classification network and matches its performance using only 7% of the training steps.
Figure 1. (left) Batch Normalization. The learnable parameters \(\gamma\) and \(\beta\) allow the layer to represent the identity transformation. (right) Training and inference.

import torch
from torch import nn

## --------------------------------------------------------------------------------
## Build customized Batch Normalization (2D)
## --------------------------------------------------------------------------------
class MyBatchNorm2d(nn.Module):
    """Batch Normalization for 4-D inputs of shape (B, C, H, W).

    In training mode each channel is normalized with the mean and biased
    variance of the current batch (taken over batch and spatial axes), and
    exponential moving averages of the batch statistics are stored in the
    ``running_mean`` / ``running_var`` buffers. In eval mode the stored
    running statistics are used instead. A learnable per-channel scale
    (``weight``) and shift (``bias``) are applied in both modes.

    Args:
        num_features: number of channels C.
        eps: constant added to the variance before the square root.
        momentum: weight of the new batch statistic in the running average:
            ``running = (1 - momentum) * running + momentum * batch_stat``.
    """

    def __init__(self, num_features, eps=1e-5, momentum=0.1):
        super().__init__()
        self.eps = eps
        self.momentum = momentum
        self.weight = torch.nn.Parameter(torch.ones(num_features))
        self.bias = torch.nn.Parameter(torch.zeros(num_features))
        self.register_buffer('running_mean', torch.zeros(num_features))
        self.register_buffer('running_var', torch.ones(num_features))

    def forward(self, x):
        """Normalize ``x``; in training mode also update the running stats."""
        if self.training:
            mean = x.mean(dim=(0, 2, 3), keepdim=True)
            var = x.var(dim=(0, 2, 3), keepdim=True, unbiased=False)

            # Update running stats. Done under no_grad and in place so the
            # buffers never join the autograd graph; otherwise every step
            # would chain the previous step's graph onto the buffers.
            with torch.no_grad():
                # Number of values that contributed to each channel statistic.
                n = x.numel() // x.size(1)
                # The running variance tracks the unbiased estimate (as
                # torch.nn.BatchNorm2d does), while the normalization above
                # uses the biased one. max() avoids a division by zero.
                unbiased_var = var * (n / max(n - 1, 1))
                self.running_mean.mul_(1 - self.momentum).add_(
                    mean.view(-1), alpha=self.momentum)
                self.running_var.mul_(1 - self.momentum).add_(
                    unbiased_var.view(-1), alpha=self.momentum)
        else:
            # Reshape the (C,) buffers to broadcast over batch and space.
            mean = self.running_mean.view(1, -1, 1, 1)
            var = self.running_var.view(1, -1, 1, 1)

        x_hat = (x - mean) / torch.sqrt(var + self.eps)
        return self.weight.view(1, -1, 1, 1) * x_hat + self.bias.view(1, -1, 1, 1)
## --------------------------------------------------------------------------------

## --------------------------------------------------------------------------------
## Test the customized Batch Normalization
## --------------------------------------------------------------------------------
## Input
## Input
x = torch.randn(8, 3, 32, 32)  # BCHW

## Reference layer from torch and the custom layer under test
torch_bn = torch.nn.BatchNorm2d(3)
custom_bn = MyBatchNorm2d(3)

## Sync initial parameters and running statistics (values only, no autograd)
with torch.no_grad():
    custom_bn.weight.copy_(torch_bn.weight)
    custom_bn.bias.copy_(torch_bn.bias)
    custom_bn.running_mean.copy_(torch_bn.running_mean)
    custom_bn.running_var.copy_(torch_bn.running_var)

## Compare first in training mode (batch statistics), then in inference mode
## (running statistics). Each line prints True when the two outputs agree
## within allclose's default tolerances.
for training in (True, False):
    torch_bn.train(training)
    custom_bn.train(training)
    print(torch.allclose(torch_bn(x), custom_bn(x)))
## --------------------------------------------------------------------------------

Framework

Ray: A Distributed Framework for Emerging AI Applications

Philipp Moritz, Robert Nishihara, Stephanie Wang, Alexey Tumanov, Richard Liaw, Eric Liang, Melih Elibol, Zongheng Yang, William Paul, Michael I. Jordan, Ion Stoica

UC Berkeley

OSDI, 2018

Dec 16, 2017   |   Ray   |   code


Ray is a distributed computing framework that scales machine learning and data processing workflows across multiple machines and GPUs.

Last updated on May 18, 2026 at 10:47 (UTC-7).