<?xml version="1.0" encoding="utf-8" standalone="yes"?>
<rss version="2.0" xmlns:atom="http://www.w3.org/2005/Atom">
	<channel>
		<title>Normalization on ChengAo Shen</title>
		<link>https://chengaoshen.com/en/tags/normalization/</link>
		<description>Recent content in Normalization on ChengAo Shen</description>
		<generator>Hugo</generator>
		<language>en</language>
		
		
		
		
			<lastBuildDate>Mon, 21 Jul 2025 00:00:00 +0000</lastBuildDate>
		
			<atom:link href="https://chengaoshen.com/en/tags/normalization/index.xml" rel="self" type="application/rss+xml" />
			<item>
				<title>📃 Different Normalization</title>
				<link>https://chengaoshen.com/en/posts/normalization/</link>
				<pubDate>Mon, 21 Jul 2025 00:00:00 +0000</pubDate>
				<guid>https://chengaoshen.com/en/posts/normalization/</guid>
				<description>&lt;h2 id=&#34;introduction&#34;&gt;&#xA;  Introduction&#xA;  &lt;a class=&#34;heading-link&#34; href=&#34;#introduction&#34;&gt;&#xA;    &lt;i class=&#34;fa-solid fa-link&#34; aria-hidden=&#34;true&#34; title=&#34;Link to heading&#34;&gt;&lt;/i&gt;&#xA;    &lt;span class=&#34;sr-only&#34;&gt;Link to heading&lt;/span&gt;&#xA;  &lt;/a&gt;&#xA;&lt;/h2&gt;&#xA;&lt;p&gt;Normalization techniques are fundamental to training deep learning models effectively. They help &lt;strong&gt;stabilize and accelerate training&lt;/strong&gt;, &lt;strong&gt;improve generalization&lt;/strong&gt;, and &lt;strong&gt;prevent internal covariate shift&lt;/strong&gt;. Below is a summary of the &lt;strong&gt;most common normalization techniques&lt;/strong&gt;, their &lt;strong&gt;mechanisms&lt;/strong&gt;, &lt;strong&gt;key papers&lt;/strong&gt;, and &lt;strong&gt;differences&lt;/strong&gt;.&lt;/p&gt;&#xA;&lt;p&gt;&lt;img src=&#34;https://raw.githubusercontent.com/ChengAoShen/Image-Hosting/main/images/Normalization.png&#34; alt=&#34;image_normalization&#34;&gt;&lt;/p&gt;&#xA;&lt;h2 id=&#34;-summary-of-different-type-of-normalization&#34;&gt;&#xA;  &lt;strong&gt;🔑 Summary of different type of Normalization&lt;/strong&gt;&#xA;  &lt;a class=&#34;heading-link&#34; href=&#34;#-summary-of-different-type-of-normalization&#34;&gt;&#xA;    &lt;i class=&#34;fa-solid fa-link&#34; aria-hidden=&#34;true&#34; title=&#34;Link to heading&#34;&gt;&lt;/i&gt;&#xA;    &lt;span class=&#34;sr-only&#34;&gt;Link to heading&lt;/span&gt;&#xA;  &lt;/a&gt;&#xA;&lt;/h2&gt;&#xA;&lt;table&gt;&#xA;&#x9;&lt;thead&gt;&#xA;&#x9;&#x9;&#x9;&lt;tr&gt;&#xA;&#x9;&#x9;&#x9;&#x9;&#x9;&lt;th&gt;&lt;strong&gt;Name&lt;/strong&gt;&lt;/th&gt;&#xA;&#x9;&#x9;&#x9;&#x9;&#x9;&lt;th&gt;&lt;strong&gt;Normalized Over&lt;/strong&gt;&lt;/th&gt;&#xA;&#x9;&#x9;&#x9;&#x9;&#x9;&lt;th&gt;&lt;strong&gt;Key Paper&lt;/strong&gt;&lt;/th&gt;&#xA;&#x9;&#x9;&#x9;&#x9;&#x9;&lt;th&gt;&lt;strong&gt;Common Use 
Cases&lt;/strong&gt;&lt;/th&gt;&#xA;&#x9;&#x9;&#x9;&#x9;&#x9;&lt;th&gt;Strength&lt;/th&gt;&#xA;&#x9;&#x9;&#x9;&#x9;&#x9;&lt;th&gt;Weakness&lt;/th&gt;&#xA;&#x9;&#x9;&#x9;&lt;/tr&gt;&#xA;&#x9;&lt;/thead&gt;&#xA;&#x9;&lt;tbody&gt;&#xA;&#x9;&#x9;&#x9;&lt;tr&gt;&#xA;&#x9;&#x9;&#x9;&#x9;&#x9;&lt;td&gt;&lt;strong&gt;Batch Normalization (BN)&lt;/strong&gt;&lt;/td&gt;&#xA;&#x9;&#x9;&#x9;&#x9;&#x9;&lt;td&gt;For Conv: per channel across &lt;strong&gt;B×H×W&lt;/strong&gt;; For MLP: per feature across &lt;strong&gt;B.&lt;/strong&gt;&lt;/td&gt;&#xA;&#x9;&#x9;&#x9;&#x9;&#x9;&lt;td&gt;&lt;a href=&#34;https://arxiv.org/abs/1502.03167&#34;  class=&#34;external-link&#34; target=&#34;_blank&#34; rel=&#34;noopener&#34;&gt;&lt;em&gt;Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift&lt;/em&gt;&lt;/a&gt; (ICML 2015)&lt;/td&gt;&#xA;&#x9;&#x9;&#x9;&#x9;&#x9;&lt;td&gt;Computer Vision Field like Image Classification, Detection, Segmentation&lt;/td&gt;&#xA;&#x9;&#x9;&#x9;&#x9;&#x9;&lt;td&gt;Stabilizes activation scale; Enables larger learning rates, Speeds convergence;  Adds implicit regularization&lt;/td&gt;&#xA;&#x9;&#x9;&#x9;&#x9;&#x9;&lt;td&gt;Less suited to online / streaming / RNN small-batch settings, can cause issues in domain shift or micro-batch training&lt;/td&gt;&#xA;&#x9;&#x9;&#x9;&lt;/tr&gt;&#xA;&#x9;&#x9;&#x9;&lt;tr&gt;&#xA;&#x9;&#x9;&#x9;&#x9;&#x9;&lt;td&gt;&lt;strong&gt;Layer Normalization (LN)&lt;/strong&gt;&lt;/td&gt;&#xA;&#x9;&#x9;&#x9;&#x9;&#x9;&lt;td&gt;Per &lt;em&gt;sample&lt;/em&gt; (token) across its &lt;strong&gt;feature (hidden) dimensions&lt;/strong&gt; (e.g. 
for shape B×L×D or B×D: normalize over D; for Conv rarely used, would be over C×H×W of that sample)&lt;/td&gt;&#xA;&#x9;&#x9;&#x9;&#x9;&#x9;&lt;td&gt;&lt;a href=&#34;https://arxiv.org/abs/1607.06450&#34;  class=&#34;external-link&#34; target=&#34;_blank&#34; rel=&#34;noopener&#34;&gt;&lt;em&gt;Layer Normalization&lt;/em&gt;&lt;/a&gt; (arXiv 2016)&lt;/td&gt;&#xA;&#x9;&#x9;&#x9;&#x9;&#x9;&lt;td&gt;Transformers (NLP &amp;amp; Vision), RNNs, small-batch or batch=1 training&lt;/td&gt;&#xA;&#x9;&#x9;&#x9;&#x9;&#x9;&lt;td&gt;Independent of batch size, identical behavior in training &amp;amp; inference, stable for variable-length sequences, improves gradient flow (esp. Pre-LN Transformers)&lt;/td&gt;&#xA;&#x9;&#x9;&#x9;&#x9;&#x9;&lt;td&gt;Provides less implicit regularization, does not leverage cross-sample statistics&lt;/td&gt;&#xA;&#x9;&#x9;&#x9;&lt;/tr&gt;&#xA;&#x9;&#x9;&#x9;&lt;tr&gt;&#xA;&#x9;&#x9;&#x9;&#x9;&#x9;&lt;td&gt;&lt;strong&gt;Instance Normalization (IN)&lt;/strong&gt;&lt;/td&gt;&#xA;&#x9;&#x9;&#x9;&#x9;&#x9;&lt;td&gt;For conv input BxCxHxW: &lt;strong&gt;each sample &amp;amp; channel independently over its spatial pixels&lt;/strong&gt; HxW (no cross-batch, no cross-channel).&lt;/td&gt;&#xA;&#x9;&#x9;&#x9;&#x9;&#x9;&lt;td&gt;&lt;a href=&#34;https://arxiv.org/abs/1607.08022&#34;  class=&#34;external-link&#34; target=&#34;_blank&#34; rel=&#34;noopener&#34;&gt;&lt;em&gt;Instance Normalization: The Missing Ingredient for Fast Stylization&lt;/em&gt;&lt;/a&gt; (arXiv 2016)&lt;/td&gt;&#xA;&#x9;&#x9;&#x9;&#x9;&#x9;&lt;td&gt;Image generation (GAN generators), image-to-image translation (e.g., style/appearance adaptation)&lt;/td&gt;&#xA;&#x9;&#x9;&#x9;&#x9;&#x9;&lt;td&gt;Batch size–independent, effectively strips instance-specific style (contrast, color cast), aiding fast stylization&lt;/td&gt;&#xA;&#x9;&#x9;&#x9;&#x9;&#x9;&lt;td&gt;Discards global intensity/contrast cues useful for recognition → poorer performance on classification/detection; lacks batch-level 
regularization&lt;/td&gt;&#xA;&#x9;&#x9;&#x9;&lt;/tr&gt;&#xA;&#x9;&#x9;&#x9;&lt;tr&gt;&#xA;&#x9;&#x9;&#x9;&#x9;&#x9;&lt;td&gt;&lt;strong&gt;Group Normalization (GN)&lt;/strong&gt;&lt;/td&gt;&#xA;&#x9;&#x9;&#x9;&#x9;&#x9;&lt;td&gt;For input BxCxHxW: &lt;strong&gt;per sample&lt;/strong&gt;, split channels into G groups (size C/G); compute mean &amp;amp; var over (C/G)xHxW inside each group.&lt;/td&gt;&#xA;&#x9;&#x9;&#x9;&#x9;&#x9;&lt;td&gt;&lt;a href=&#34;https://arxiv.org/abs/1803.08494&#34;  class=&#34;external-link&#34; target=&#34;_blank&#34; rel=&#34;noopener&#34;&gt;&lt;em&gt;Group Normalization&lt;/em&gt;&lt;/a&gt; (ECCV 2018)&lt;/td&gt;&#xA;&#x9;&#x9;&#x9;&#x9;&#x9;&lt;td&gt;Small-/micro-batch CNN training, cases where BN fails with batch sizes 1–4.&lt;/td&gt;&#xA;&#x9;&#x9;&#x9;&#x9;&#x9;&lt;td&gt;Batch-size independent; stable for tiny or variable batches; often better than BN when batch is very small.&lt;/td&gt;&#xA;&#x9;&#x9;&#x9;&#x9;&#x9;&lt;td&gt;Extra hyperparameter (G) to tune; less implicit regularization than BN, grouping may not align with the semantic channel structure&lt;/td&gt;&#xA;&#x9;&#x9;&#x9;&lt;/tr&gt;&#xA;&#x9;&#x9;&#x9;&lt;tr&gt;&#xA;&#x9;&#x9;&#x9;&#x9;&#x9;&lt;td&gt;&lt;strong&gt;Weight Normalization (WN)&lt;/strong&gt;&lt;/td&gt;&#xA;&#x9;&#x9;&#x9;&#x9;&#x9;&lt;td&gt;Each weight vector of a neuron/output channel.&lt;/td&gt;&#xA;&#x9;&#x9;&#x9;&#x9;&#x9;&lt;td&gt;&lt;a href=&#34;https://arxiv.org/abs/1602.07868&#34;  class=&#34;external-link&#34; target=&#34;_blank&#34; rel=&#34;noopener&#34;&gt;&lt;em&gt;Weight Normalization: A Simple Reparameterization to Accelerate Training of Deep Neural Networks&lt;/em&gt;&lt;/a&gt; (NIPS 2016)&lt;/td&gt;&#xA;&#x9;&#x9;&#x9;&#x9;&#x9;&lt;td&gt;RNN / seq models where BN is hard, small-batch or online / RL training (policy &amp;amp; value nets)&lt;/td&gt;&#xA;&#x9;&#x9;&#x9;&#x9;&#x9;&lt;td&gt;Negligible inference cost (can fold into static weights); works with streaming / RL; complements other 
norms (can combine with LayerNorm)&lt;/td&gt;&#xA;&#x9;&#x9;&#x9;&#x9;&#x9;&lt;td&gt;Scale may drift (need LR tuning); benefit can vanish with strong adaptive optimizers; less helpful for very deep Transformers (other norms preferred)&lt;/td&gt;&#xA;&#x9;&#x9;&#x9;&lt;/tr&gt;&#xA;&#x9;&#x9;&#x9;&lt;tr&gt;&#xA;&#x9;&#x9;&#x9;&#x9;&#x9;&lt;td&gt;&lt;strong&gt;Spectral Normalization (SN)&lt;/strong&gt;&lt;/td&gt;&#xA;&#x9;&#x9;&#x9;&#x9;&#x9;&lt;td&gt;Each weight tensor (e.g. matrix / conv kernel reshaped to 2D)&lt;/td&gt;&#xA;&#x9;&#x9;&#x9;&#x9;&#x9;&lt;td&gt;&lt;a href=&#34;https://arxiv.org/abs/1802.05957&#34;  class=&#34;external-link&#34; target=&#34;_blank&#34; rel=&#34;noopener&#34;&gt;&lt;em&gt;Spectral Normalization for Generative Adversarial Networks&lt;/em&gt;&lt;/a&gt; (ICLR 2018)&lt;/td&gt;&#xA;&#x9;&#x9;&#x9;&#x9;&#x9;&lt;td&gt;GAN discriminators, robustness / Lipschitz-constrained models, etc.&lt;/td&gt;&#xA;&#x9;&#x9;&#x9;&#x9;&#x9;&lt;td&gt;Enforces (approx.) 1-Lipschitz per layer (controls gradient explosion)&lt;/td&gt;&#xA;&#x9;&#x9;&#x9;&#x9;&#x9;&lt;td&gt;Extra cost (power iteration each step); only constrains the largest singular value (other singular values can still drift)&lt;/td&gt;&#xA;&#x9;&#x9;&#x9;&lt;/tr&gt;&#xA;&#x9;&#x9;&#x9;&lt;tr&gt;&#xA;&#x9;&#x9;&#x9;&#x9;&#x9;&lt;td&gt;&lt;strong&gt;RMS Normalization&lt;/strong&gt;&lt;/td&gt;&#xA;&#x9;&#x9;&#x9;&#x9;&#x9;&lt;td&gt;Per sample (token) feature vector&lt;/td&gt;&#xA;&#x9;&#x9;&#x9;&#x9;&#x9;&lt;td&gt;&lt;a href=&#34;https://arxiv.org/abs/1910.07467&#34;  class=&#34;external-link&#34; target=&#34;_blank&#34; rel=&#34;noopener&#34;&gt;&lt;em&gt;Root Mean Square Layer Normalization&lt;/em&gt;&lt;/a&gt; (NeurIPS 2019)&lt;/td&gt;&#xA;&#x9;&#x9;&#x9;&#x9;&#x9;&lt;td&gt;Modern Transformer / LLM blocks; very deep pre-norm architectures, low-precision (FP16/BF16) training&lt;/td&gt;&#xA;&#x9;&#x9;&#x9;&#x9;&#x9;&lt;td&gt;Simpler &amp;amp; slightly cheaper than LayerNorm, numerically stable in mixed 
precision, good for very deep stacks (retains strong gradient path)&lt;/td&gt;&#xA;&#x9;&#x9;&#x9;&#x9;&#x9;&lt;td&gt;Does not re-center the mean (possible drift), needs careful init/residual scaling, and isn’t fully interchangeable with zero-mean LN methods.&lt;/td&gt;&#xA;&#x9;&#x9;&#x9;&lt;/tr&gt;&#xA;&#x9;&lt;/tbody&gt;&#xA;&lt;/table&gt;&#xA;&lt;h2 id=&#34;-explanation-of-how-they-work&#34;&gt;&#xA;  &lt;strong&gt;📘 Explanation of How They Work&lt;/strong&gt;&#xA;  &lt;a class=&#34;heading-link&#34; href=&#34;#-explanation-of-how-they-work&#34;&gt;&#xA;    &lt;i class=&#34;fa-solid fa-link&#34; aria-hidden=&#34;true&#34; title=&#34;Link to heading&#34;&gt;&lt;/i&gt;&#xA;    &lt;span class=&#34;sr-only&#34;&gt;Link to heading&lt;/span&gt;&#xA;  &lt;/a&gt;&#xA;&lt;/h2&gt;&#xA;&lt;h3 id=&#34;batch-normalization-bn&#34;&gt;&#xA;  &lt;strong&gt;Batch Normalization (BN)&lt;/strong&gt;&#xA;  &lt;a class=&#34;heading-link&#34; href=&#34;#batch-normalization-bn&#34;&gt;&#xA;    &lt;i class=&#34;fa-solid fa-link&#34; aria-hidden=&#34;true&#34; title=&#34;Link to heading&#34;&gt;&lt;/i&gt;&#xA;    &lt;span class=&#34;sr-only&#34;&gt;Link to heading&lt;/span&gt;&#xA;  &lt;/a&gt;&#xA;&lt;/h3&gt;&#xA;&lt;p&gt;Batch Normalization is normally used in the computer vision field, typically in CNNs. Generally, the input shape of BN is $\text{Batch}(B)\times \text{Channel}(C) \times \text{Height}(H)\times\text{Width}(W)$.&lt;/p&gt;</description>
			</item>
	</channel>
</rss>
