diff --git a/setup.py b/setup.py
index 92a4bb7..0093338 100644
--- a/setup.py
+++ b/setup.py
@@ -3,7 +3,7 @@
 setup(
   name = 'soundstorm-pytorch',
   packages = find_packages(exclude=[]),
-  version = '0.4.10',
+  version = '0.4.11',
   license='MIT',
   description = 'SoundStorm - Efficient Parallel Audio Generation from Google Deepmind, in Pytorch',
   author = 'Phil Wang',
diff --git a/soundstorm_pytorch/soundstorm.py b/soundstorm_pytorch/soundstorm.py
index b7aaed3..68064b2 100644
--- a/soundstorm_pytorch/soundstorm.py
+++ b/soundstorm_pytorch/soundstorm.py
@@ -1,3 +1,5 @@
+from __future__ import annotations
+
 import math
 from random import random, randrange
 from functools import wraps
@@ -6,8 +8,9 @@
 from pathlib import Path
 
 import torch
-from torch.cuda.amp import autocast
+from torch.amp import autocast
 from torch import Tensor, nn, einsum
+from torch.nn import Module, ModuleList
 import torch.nn.functional as F
 
 from einops import rearrange, reduce, repeat, unpack, pack
@@ -15,7 +18,7 @@
 from beartype import beartype
 from beartype.door import is_bearable
 
-from beartype.typing import Union, Dict, Optional, List, Optional, Any
+from beartype.typing import Any
 
 from soundstorm_pytorch.attend import Attend
@@ -85,7 +88,7 @@ def coin_flip():
 @beartype
 def get_mask_subset_prob(
     mask: Tensor,
-    prob: Union[float, Tensor],
+    prob: float | Tensor,
     min_mask: int = 0,
     min_keep_mask: int = 0
 ):
@@ -124,7 +127,7 @@ def cosine_schedule(t):
 
 # rotary embedding
 
-class RotaryEmbedding(nn.Module):
+class RotaryEmbedding(Module):
     def __init__(self, dim, theta = 10000):
         super().__init__()
         inv_freq = 1.0 / (theta ** (torch.arange(0, dim, 2).float() / dim))
@@ -134,7 +137,7 @@ def __init__(self, dim, theta = 10000):
     def device(self):
         return next(self.buffers()).device
 
-    @autocast(enabled = False)
+    @autocast('cuda', enabled = False)
     def forward(self, seq_len):
         t = torch.arange(seq_len, device = self.device).type_as(self.inv_freq)
         freqs = torch.einsum('i , j -> i j', t, self.inv_freq)
@@ -145,13 +148,13 @@ def rotate_half(x):
     x1, x2 = x.chunk(2, dim=-1)
     return torch.cat((-x2, x1), dim=-1)
 
-@autocast(enabled = False)
+@autocast('cuda', enabled = False)
 def apply_rotary_pos_emb(pos, t):
     return (t * pos.cos()) + (rotate_half(t) * pos.sin())
 
 # t5 relative positional bias
 
-class T5RelativePositionBias(nn.Module):
+class T5RelativePositionBias(Module):
     def __init__(
         self,
         scale = 1.,
@@ -209,11 +212,11 @@ def forward(self, n):
 
 # conformer
 
-class Swish(nn.Module):
+class Swish(Module):
     def forward(self, x):
         return x * x.sigmoid()
 
-class GLU(nn.Module):
+class GLU(Module):
     def __init__(self, dim):
         super().__init__()
         self.dim = dim
@@ -222,7 +225,7 @@ def forward(self, x):
         out, gate = x.chunk(2, dim=self.dim)
         return out * gate.sigmoid()
 
-class DepthWiseConv1d(nn.Module):
+class DepthWiseConv1d(Module):
     def __init__(self, chan_in, chan_out, kernel_size, padding):
         super().__init__()
         self.padding = padding
@@ -243,7 +246,7 @@ def forward(self, x, mask = None):
 
 # attention, feedforward, and conv module
 
-class Scale(nn.Module):
+class Scale(Module):
     def __init__(self, scale, fn):
         super().__init__()
         self.fn = fn
@@ -252,7 +255,7 @@ def __init__(self, scale, fn):
     def forward(self, x, **kwargs):
         return self.fn(x, **kwargs) * self.scale
 
-class ChanLayerNorm(nn.Module):
+class ChanLayerNorm(Module):
     def __init__(self, dim):
         super().__init__()
         self.gamma = nn.Parameter(torch.ones(1, dim, 1))
@@ -263,7 +266,7 @@ def forward(self, x):
         mean = torch.mean(x, dim = 1, keepdim = True)
         return (x - mean) * var.clamp(min = eps).rsqrt() * self.gamma
 
-class PreNorm(nn.Module):
+class PreNorm(Module):
     def __init__(self, dim, fn):
         super().__init__()
         self.fn = fn
@@ -273,7 +276,7 @@ def forward(self, x, **kwargs):
         x = self.norm(x)
         return self.fn(x, **kwargs)
 
-class Attention(nn.Module):
+class Attention(Module):
     def __init__(
         self,
         dim,
@@ -321,7 +324,7 @@ def forward(
         out = rearrange(out, 'b h n d -> b n (h d)')
         return self.to_out(out)
 
-class FeedForward(nn.Module):
+class FeedForward(Module):
     def __init__(
         self,
         dim,
@@ -340,7 +343,7 @@ def __init__(
     def forward(self, x):
         return self.net(x)
 
-class ConformerConvModule(nn.Module):
+class ConformerConvModule(Module):
     def __init__(
         self,
         dim,
@@ -378,7 +381,7 @@ def forward(self, x, mask = None):
 
 # Conformer Block
 
-class ConformerBlock(nn.Module):
+class ConformerBlock(Module):
     def __init__(
         self,
         *,
@@ -430,7 +433,7 @@ def forward(
 
 # Conformer
 
-class Conformer(nn.Module):
+class Conformer(Module):
     def __init__(
         self,
         dim,
@@ -454,7 +457,7 @@ def __init__(
         assert not (t5_rel_pos_bias and attn_flash), 'flash attention is not compatible with learned bias'
 
         self.dim = dim
-        self.layers = nn.ModuleList([])
+        self.layers = ModuleList([])
 
         self.rotary_emb = RotaryEmbedding(dim_head) if not t5_rel_pos_bias else None
         self.rel_pos_bias = T5RelativePositionBias(dim_head ** 0.5, heads = heads) if t5_rel_pos_bias else None
@@ -493,7 +496,7 @@ def forward(self, x, mask = None):
 
 # conformer with sum reduction across quantized tokens at the beginning, along with heads
 
-class ConformerWrapper(nn.Module):
+class ConformerWrapper(Module):
 
     @beartype
     def __init__(
@@ -501,7 +504,7 @@ def __init__(
         *,
         codebook_size,
         num_quantizers,
-        conformer: Union[Conformer, Dict[str, Any]],
+        conformer: Conformer | dict[str, Any],
         grouped_quantizers = 1
     ):
         super().__init__()
@@ -614,7 +617,7 @@ def forward(
 
 # for main logits as well as self token critic
 
-class LogitHead(nn.Module):
+class LogitHead(Module):
     def __init__(
         self,
         net: ConformerWrapper,
@@ -633,16 +636,16 @@ def forward(self, x):
 
 LossBreakdown = namedtuple('LossBreakdown', ['generator_loss', 'critic_loss'])
 
-class SoundStorm(nn.Module):
+class SoundStorm(Module):
 
     @beartype
     def __init__(
         self,
         net: ConformerWrapper,
         *,
-        soundstream: Optional[SoundStream] = None,
-        spear_tts_text_to_semantic: Optional[TextToSemantic] = None,
-        wav2vec: Optional[Union[HubertWithKmeans, FairseqVQWav2Vec]] = None,
+        soundstream: SoundStream | None = None,
+        spear_tts_text_to_semantic: TextToSemantic | None = None,
+        wav2vec: HubertWithKmeans | FairseqVQWav2Vec | None = None,
         steps = 18,
         self_cond = False,
         self_cond_train_prob = 0.75,
@@ -794,7 +797,7 @@ def generate(
         num_latents = None,
         *,
         mask = None,
-        texts: Optional[Union[List[str], Tensor]] = None,
+        texts: list[str] | Tensor | None = None,
         cond_semantic_token_ids = None,
         prompt_acoustic_token_ids = None,
         seconds = None,
diff --git a/soundstorm_pytorch/trainer.py b/soundstorm_pytorch/trainer.py
index 7809fc2..63bd955 100644
--- a/soundstorm_pytorch/trainer.py
+++ b/soundstorm_pytorch/trainer.py
@@ -1,12 +1,14 @@
+from __future__ import annotations
+
 from pathlib import Path
 import re
 from shutil import rmtree
 
 from beartype import beartype
-from beartype.typing import Optional
 
 import torch
 from torch import nn
+from torch.nn import Module
 from torch.optim.lr_scheduler import CosineAnnealingLR
 from torch.utils.data import Dataset, random_split
@@ -58,7 +60,7 @@ def checkpoint_num_steps(checkpoint_path):
 
     return int(results[-1])
 
-class SoundStormTrainer(nn.Module):
+class SoundStormTrainer(Module):
     @beartype
     def __init__(
         self,
@@ -67,7 +69,7 @@ def __init__(
         num_train_steps,
         num_warmup_steps,
         batch_size,
-        dataset: Optional[Dataset] = None,
+        dataset: Dataset | None = None,
         only_train_generator = False,
         only_train_critic = False,
         lr = 3e-4,
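
A minimal sketch (illustrative only, not part of the patch) of the two API migrations this diff applies throughout: the deprecated torch.cuda.amp.autocast becomes torch.amp.autocast with an explicit device type, and beartype.typing unions such as Union[float, Tensor] become PEP 604 float | Tensor annotations, which from __future__ import annotations keeps parseable on Pythons older than 3.10. The scale function below is hypothetical, not code from the repository.

# Illustrative sketch, assuming PyTorch >= 2.4, where torch.cuda.amp.autocast is deprecated.
from __future__ import annotations  # annotations become strings, so `float | Tensor` parses pre-3.10

import torch
from torch import Tensor
from torch.amp import autocast      # replaces the old torch.cuda.amp.autocast

@autocast('cuda', enabled = False)  # device type is now an explicit first argument
def scale(t: Tensor, factor: float | Tensor) -> Tensor:
    # stays in full precision even when called inside an enabled autocast region
    return t * factor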