diff --git a/setup.py b/setup.py index 2ab2880..9ac1bbd 100644 --- a/setup.py +++ b/setup.py @@ -3,7 +3,7 @@ setup( name = 'soundstorm-pytorch', packages = find_packages(exclude=[]), - version = '0.1.4', + version = '0.2.0', license='MIT', description = 'SoundStorm - Efficient Parallel Audio Generation from Google Deepmind, in Pytorch', author = 'Phil Wang', @@ -23,10 +23,10 @@ 'beartype', 'classifier-free-guidance-pytorch>=0.1.5', 'einops>=0.6.1', - 'spear-tts-pytorch>=0.0.15', + 'spear-tts-pytorch>=0.4.0', 'torch>=1.6', ], - classifiers=[ + classifiers = [ 'Development Status :: 4 - Beta', 'Intended Audience :: Developers', 'Topic :: Scientific/Engineering :: Artificial Intelligence', diff --git a/soundstorm_pytorch/soundstorm.py b/soundstorm_pytorch/soundstorm.py index a493183..67c7928 100644 --- a/soundstorm_pytorch/soundstorm.py +++ b/soundstorm_pytorch/soundstorm.py @@ -762,6 +762,8 @@ def generate( noise_level_scale = 1., num_full_sampling_levels = 1, text_to_semantic_generate_kwargs: dict = {}, + spec_decode = False, + spec_decode_gamma = 5, **kwargs ): if self.should_condition and not exists(cond_semantic_token_ids): @@ -776,6 +778,8 @@ def generate( texts, source_type = 'text', target_type = 'speech', + spec_decode = spec_decode, + spec_decode_gamma = spec_decode_gamma, **text_to_semantic_generate_kwargs )