From b3386f8b04759636a10cb50134cdf41a1dce8a3f Mon Sep 17 00:00:00 2001
From: Phil Wang
Date: Thu, 19 Dec 2024 07:40:52 -0800
Subject: [PATCH] add learned residual scaling for main transformer path

---
 e2_tts_pytorch/e2_tts.py | 31 ++++++++++++++++++++++++++++---
 pyproject.toml           |  2 +-
 2 files changed, 29 insertions(+), 4 deletions(-)

diff --git a/e2_tts_pytorch/e2_tts.py b/e2_tts_pytorch/e2_tts.py
index ed0489f..d1b1062 100644
--- a/e2_tts_pytorch/e2_tts.py
+++ b/e2_tts_pytorch/e2_tts.py
@@ -500,6 +500,7 @@ def __init__(
         kernel_size = 31,
         dropout = 0.1,
         num_registers = 32,
+        scale_residual = False,
         attn_laser = False,
         attn_laser_softclamp_value = 15.,
         attn_kwargs: dict = dict(
@@ -540,6 +541,10 @@ def __init__(
         self.text_registers = nn.Parameter(torch.zeros(num_registers, dim_text))
         nn.init.normal_(self.text_registers, std = 0.02)
 
+        # maybe residual scales
+
+        residual_scales = []
+
         # rotary embedding
 
         self.rotary_emb = RotaryEmbedding(dim_head)
@@ -575,10 +580,16 @@ def __init__(
 
             attn = Attention(dim = dim, heads = heads, dim_head = dim_head, dropout = dropout, learned_value_residual_mix = not is_first_block, laser = attn_laser, laser_softclamp_value = attn_laser_softclamp_value, **attn_kwargs)
             attn_adaln_zero = postbranch_klass()
+
             ff_norm = rmsnorm_klass(dim)
             ff = FeedForward(dim = dim, glu = True, mult = ff_mult, dropout = dropout, **ff_kwargs)
             ff_adaln_zero = postbranch_klass()
 
+            residual_scales.append(nn.ParameterList([
+                nn.Parameter(torch.ones(dim)),
+                nn.Parameter(torch.ones(dim))
+            ]))
+
             skip_proj = Linear(dim * 2, dim, bias = False) if is_later_half else None
 
             speech_modules = ModuleList([
@@ -625,6 +636,8 @@ def __init__(
                 text_modules
             ]))
 
+        self.residual_scales = nn.ParameterList(residual_scales) if scale_residual else None
+
         self.final_norm = RMSNorm(dim)
 
     def forward(
@@ -685,9 +698,14 @@ def forward(
         text_attn_first_values = None
         attn_first_values = None
 
+        # prepare residual scales
+
+        residual_scales = default(self.residual_scales, (None,) * len(self.layers))
+
         # go through the layers
 
-        for ind, (speech_modules, text_modules) in enumerate(self.layers):
+        for ind, ((speech_modules, text_modules), maybe_residual_scales) in enumerate(zip(self.layers, residual_scales)):
+
             layer = ind + 1
 
             (
@@ -742,17 +760,24 @@ def forward(
 
             x = speech_conv(x, mask = mask) + x
 
+            # maybe residual scaling
+
+            attn_res_scale, ff_res_scale = 1., 1.
+
+            if exists(maybe_residual_scales):
+                attn_res_scale, ff_res_scale = maybe_residual_scales
+
             # attention and feedforward blocks
 
             attn_out, attn_inter = attn(attn_norm(x, **norm_kwargs), rotary_pos_emb = rotary_pos_emb, mask = mask, return_intermediates = True, value_residual = attn_first_values)
 
             attn_first_values = default(attn_first_values, attn_inter.values)
 
-            x = x + maybe_attn_adaln_zero(attn_out, **norm_kwargs)
+            x = x * attn_res_scale + maybe_attn_adaln_zero(attn_out, **norm_kwargs)
 
             ff_out = ff(ff_norm(x, **norm_kwargs))
 
-            x = x + maybe_ff_adaln_zero(ff_out, **norm_kwargs)
+            x = x * ff_res_scale + maybe_ff_adaln_zero(ff_out, **norm_kwargs)
 
             assert len(skips) == 0
 
diff --git a/pyproject.toml b/pyproject.toml
index 44f3c7b..b58f41b 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "e2-tts-pytorch"
-version = "1.6.2"
+version = "1.6.3"
 description = "E2-TTS in Pytorch"
 authors = [
     { name = "Phil Wang", email = "lucidrains@gmail.com" }
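
The patch above multiplies the residual stream by a learned per-dimension gain before each branch output is added back, with the gains initialized to ones so training starts from a plain residual connection (`x = x * attn_res_scale + attn_out`, and likewise for the feedforward branch). A minimal sketch of the idea in isolation follows; `ScaledResidualBlock` and its names are illustrative only and not part of e2-tts-pytorch:

    import torch
    from torch import nn

    class ScaledResidualBlock(nn.Module):
        # illustrative sketch of learned residual scaling, not the e2-tts-pytorch implementation
        def __init__(self, dim, branch: nn.Module):
            super().__init__()
            self.branch = branch
            # per-dimension gain on the residual path, initialized to ones so the block
            # behaves like a standard residual connection at the start of training
            self.res_scale = nn.Parameter(torch.ones(dim))

        def forward(self, x):
            # scale the carried-over residual stream, then add the branch output,
            # mirroring x = x * attn_res_scale + attn_out in the patch
            return x * self.res_scale + self.branch(x)

    # usage sketch
    block = ScaledResidualBlock(512, nn.Sequential(nn.LayerNorm(512), nn.Linear(512, 512)))
    out = block(torch.randn(2, 100, 512))  # (batch, seq, dim)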