use FiLM-like conditioning for text on audio embed, initialized to identity
lucidrains · Jul 26, 2024 · cb56ac6
1 parent 92435be
commit cb56ac6
Show file tree

Hide file tree

Showing 2 changed files with 10 additions and 4 deletions.
diff --git a/e2_tts_pytorch/e2_tts.py b/e2_tts_pytorch/e2_tts.py
@@ -186,10 +186,14 @@ def __init__(
     ):
         super().__init__()
         self.dim = dim
-        self.embed = nn.Embedding(num_embeds + 1, dim) # will just use 0 as the 'filler token'
-        self.combine = nn.Linear(dim * 2, dim)
         self.cond_drop_prob = cond_drop_prob
 
+        self.embed = nn.Embedding(num_embeds + 1, dim) # will just use 0 as the 'filler token'
+        self.to_cond_gamma_beta = nn.Linear(dim * 2, dim * 2)
+
+        nn.init.zeros_(self.to_cond_gamma_beta.weight)
+        nn.init.zeros_(self.to_cond_gamma_beta.bias)
+
     def forward(
         self,
         x: Float['b n d'],
@@ -211,7 +215,9 @@ def forward(
 
         concatted = torch.cat((x, text_embed), dim = -1)
         assert x.shape[-1] == text_embed.shape[-1] == self.dim, f'expected {self.dim} but received ({x.shape[-1]}, {text_embed.shape[-1]})'
-        return self.combine(concatted)
+
+        gamma, beta = self.to_cond_gamma_beta(concatted).chunk(2, dim = -1)
+        return x * (gamma + 1.) + beta
 
 # attention and transformer backbone
 # for use in both e2tts as well as duration module

diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "e2-tts-pytorch"
-version = "0.1.8"
+version = "0.1.9"
 description = "E2-TTS in Pytorch"
 authors = [
     { name = "Phil Wang", email = "[email protected]" }