Remove unused code #172

Merged — 25 commits, merged on Dec 2, 2024

Commits (25)
1b6d3eb
refactor(xtts): remove duplicate hifigan generator
eginhard Nov 20, 2024
1f27f99
refactor(utils): remove duplicate set_partial_state_dict
eginhard Nov 20, 2024
66701e1
refactor(xtts): reuse functions/classes from tortoise
eginhard Nov 21, 2024
4ba83f4
chore(tortoise): remove unused AudioMiniEncoder
eginhard Nov 21, 2024
705551c
refactor(tortoise): remove unused do_checkpoint arguments
eginhard Nov 21, 2024
5ffc054
refactor(bark): remove custom layer norm
eginhard Nov 21, 2024
490c973
refactor(xtts): use position embedding from tortoise
eginhard Nov 21, 2024
33ac0d6
refactor(xtts): use build_hf_gpt_transformer from tortoise
eginhard Nov 21, 2024
7cdfde2
refactor: move amp_to_db/db_to_amp into torch_transforms
eginhard Nov 22, 2024
6f25c2b
refactor(delightful_tts): remove unused classes
eginhard Nov 21, 2024
e63962c
refactor(losses): move shared losses into losses.py
eginhard Nov 21, 2024
2e5f68d
refactor(wavernn): remove duplicate Stretch2d
eginhard Nov 22, 2024
69a599d
refactor(freevc): remove duplicate code
eginhard Nov 22, 2024
6ecf473
refactor(xtts): use tortoise conditioning encoder
eginhard Nov 22, 2024
0f69d31
refactor(vocoder): remove duplicate function
eginhard Nov 22, 2024
fa844e0
refactor(tacotron): remove duplicate function
eginhard Nov 22, 2024
b45a7a4
refactor: move exists() and default() into generic_utils
eginhard Nov 22, 2024
54f4228
refactor(xtts): use existing cleaners
eginhard Nov 22, 2024
b1ac884
refactor: move shared function into dataset.py
eginhard Nov 22, 2024
2c82477
ci: merge integration tests back into unit tests
eginhard Nov 22, 2024
76df642
refactor: move more audio processing into torch_transforms
eginhard Nov 23, 2024
8bf288e
test: move test_helpers.py to fast unit tests
eginhard Nov 24, 2024
7330ad8
refactor: move duplicate alignment functions into helpers
eginhard Nov 24, 2024
170d3da
refactor: remove duplicate to_camel
eginhard Nov 24, 2024
63625e7
refactor: import get_last_checkpoint from trainer.io
eginhard Nov 27, 2024
4 changes: 2 additions & 2 deletions .github/actions/setup-uv/action.yml
@@ -4,8 +4,8 @@ runs:
   using: 'composite'
   steps:
     - name: Install uv
-      uses: astral-sh/setup-uv@v3
+      uses: astral-sh/setup-uv@v4
       with:
-        version: "0.5.1"
+        version: "0.5.4"
         enable-cache: true
         cache-dependency-glob: "**/pyproject.toml"
82 changes: 0 additions & 82 deletions .github/workflows/integration-tests.yml

This file was deleted.

51 changes: 48 additions & 3 deletions .github/workflows/tests.yml
@@ -1,4 +1,4 @@
-name: unit
+name: test

 on:
   push:
@@ -17,7 +17,7 @@ on:
         required: false
         default: "main"
 jobs:
-  test:
+  unit:
     runs-on: ubuntu-latest
     strategy:
       fail-fast: false
@@ -62,9 +62,54 @@ jobs:
           name: coverage-data-${{ matrix.subset }}-${{ matrix.python-version }}
           path: .coverage.*
           if-no-files-found: ignore
+  integration:
+    runs-on: ubuntu-latest
+    strategy:
+      fail-fast: false
+      matrix:
+        python-version: ["3.9", "3.12"]
+        subset: ["test_tts", "test_tts2", "test_vocoder", "test_xtts", "test_zoo0", "test_zoo1", "test_zoo2"]
+    steps:
+      - uses: actions/checkout@v4
+      - name: Setup uv
+        uses: ./.github/actions/setup-uv
+      - name: Set up Python ${{ matrix.python-version }}
+        run: uv python install ${{ matrix.python-version }}
+      - name: Install Espeak
+        if: contains(fromJSON('["test_tts", "test_tts2", "test_xtts", "test_zoo0", "test_zoo1", "test_zoo2"]'), matrix.subset)
+        run: |
+          sudo apt-get update
+          sudo apt-get install espeak espeak-ng
+      - name: Install dependencies
+        run: |
+          sudo apt-get update
+          sudo apt-get install -y --no-install-recommends git make gcc
+          make system-deps
+      - name: Install custom Trainer and/or Coqpit if requested
+        run: |
+          if [[ -n "${{ github.event.inputs.trainer_branch }}" ]]; then
+            uv add git+https://github.com/idiap/coqui-ai-Trainer --branch ${{ github.event.inputs.trainer_branch }}
+          fi
+          if [[ -n "${{ github.event.inputs.coqpit_branch }}" ]]; then
+            uv add git+https://github.com/idiap/coqui-ai-coqpit --branch ${{ github.event.inputs.coqpit_branch }}
+          fi
+      - name: Integration tests
+        run: |
+          resolution=highest
+          if [ "${{ matrix.python-version }}" == "3.9" ]; then
+            resolution=lowest-direct
+          fi
+          uv run --resolution=$resolution --extra server --extra languages make ${{ matrix.subset }}
+      - name: Upload coverage data
+        uses: actions/upload-artifact@v4
+        with:
+          include-hidden-files: true
+          name: coverage-data-${{ matrix.subset }}-${{ matrix.python-version }}
+          path: .coverage.*
+          if-no-files-found: ignore
   coverage:
     if: always()
-    needs: test
+    needs: [unit, integration]
     runs-on: ubuntu-latest
     steps:
       - uses: actions/checkout@v4
2 changes: 1 addition & 1 deletion TTS/bin/compute_attention_masks.py
@@ -80,7 +80,7 @@
 num_chars = len(phonemes) if C.use_phonemes else len(symbols)
 # TODO: handle multi-speaker
 model = setup_model(C)
-model, _ = load_checkpoint(model, args.model_path, args.use_cuda, True)
+model, _ = load_checkpoint(model, args.model_path, use_cuda=args.use_cuda, eval=True)

 # data loader
 preprocessor = importlib.import_module("TTS.tts.datasets.formatters")
4 changes: 2 additions & 2 deletions TTS/encoder/models/base_encoder.py
@@ -5,10 +5,10 @@
 import torchaudio
 from coqpit import Coqpit
 from torch import nn
+from trainer.generic_utils import set_partial_state_dict
 from trainer.io import load_fsspec

 from TTS.encoder.losses import AngleProtoLoss, GE2ELoss, SoftmaxAngleProtoLoss
-from TTS.utils.generic_utils import set_init_dict

 logger = logging.getLogger(__name__)

@@ -130,7 +130,7 @@ def load_checkpoint(

         logger.info("Partial model initialization.")
         model_dict = self.state_dict()
-        model_dict = set_init_dict(model_dict, state["model"], c)
+        model_dict = set_partial_state_dict(model_dict, state["model"], config)
         self.load_state_dict(model_dict)
         del model_dict
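Note: `set_init_dict` was a local helper duplicating functionality that now comes from the trainer package. As an illustration of the pattern only (not the trainer package's actual implementation), a partial state-dict loader typically looks like this:

def set_partial_state_dict_sketch(model_dict, checkpoint_state, config):
    # Keep only checkpoint tensors whose name and shape match the model,
    # so architecture changes do not break checkpoint loading.
    for k, v in checkpoint_state.items():
        if k in model_dict and model_dict[k].shape == v.shape:
            model_dict[k] = v
    return model_dict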
4 changes: 2 additions & 2 deletions TTS/encoder/utils/training.py
@@ -2,9 +2,9 @@
 from dataclasses import dataclass, field

 from coqpit import Coqpit
-from trainer import TrainerArgs, get_last_checkpoint
+from trainer import TrainerArgs
 from trainer.generic_utils import get_experiment_folder_path, get_git_branch
-from trainer.io import copy_model_files
+from trainer.io import copy_model_files, get_last_checkpoint
 from trainer.logging import logger_factory
 from trainer.logging.console_logger import ConsoleLogger
25 changes: 25 additions & 0 deletions TTS/tts/datasets/dataset.py
@@ -63,6 +63,31 @@ def get_audio_size(audiopath: Union[str, os.PathLike[Any]]) -> int:
         raise RuntimeError(msg) from e


+def get_attribute_balancer_weights(items: list, attr_name: str, multi_dict: Optional[dict] = None):
+    """Create inverse frequency weights for balancing the dataset.
+
+    Use `multi_dict` to scale relative weights."""
+    attr_names_samples = np.array([item[attr_name] for item in items])
+    unique_attr_names = np.unique(attr_names_samples).tolist()
+    attr_idx = [unique_attr_names.index(l) for l in attr_names_samples]
+    attr_count = np.array([len(np.where(attr_names_samples == l)[0]) for l in unique_attr_names])
+    weight_attr = 1.0 / attr_count
+    dataset_samples_weight = np.array([weight_attr[l] for l in attr_idx])
+    dataset_samples_weight = dataset_samples_weight / np.linalg.norm(dataset_samples_weight)
+    if multi_dict is not None:
+        # check if all keys are in the multi_dict
+        for k in multi_dict:
+            assert k in unique_attr_names, f"{k} not in {unique_attr_names}"
+        # scale weights
+        multiplier_samples = np.array([multi_dict.get(item[attr_name], 1.0) for item in items])
+        dataset_samples_weight *= multiplier_samples
+    return (
+        torch.from_numpy(dataset_samples_weight).float(),
+        unique_attr_names,
+        np.unique(dataset_samples_weight).tolist(),
+    )
+
+
 class TTSDataset(Dataset):
     def __init__(
         self,
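`get_attribute_balancer_weights` is the shared function moved here by commit b1ac884. A minimal usage sketch (the metadata items and sampler wiring below are illustrative, assuming sample dicts with a `speaker_name` key; real items come from the dataset formatters):

from torch.utils.data import WeightedRandomSampler

from TTS.tts.datasets.dataset import get_attribute_balancer_weights

# Hypothetical metadata items for illustration.
items = [
    {"speaker_name": "spk_a", "audio_file": "a1.wav"},
    {"speaker_name": "spk_a", "audio_file": "a2.wav"},
    {"speaker_name": "spk_b", "audio_file": "b1.wav"},
]

# Inverse-frequency weights: the rarer "spk_b" is sampled more often.
weights, names, unique_weights = get_attribute_balancer_weights(items, "speaker_name")
sampler = WeightedRandomSampler(weights, num_samples=len(weights), replacement=True)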
10 changes: 2 additions & 8 deletions TTS/tts/layers/bark/hubert/kmeans_hubert.py
@@ -14,6 +14,8 @@
 from torchaudio.functional import resample
 from transformers import HubertModel

+from TTS.utils.generic_utils import exists
+

 def round_down_nearest_multiple(num, divisor):
     return num // divisor * divisor
@@ -26,14 +28,6 @@ def curtail_to_multiple(t, mult, from_left=False):
         return t[..., seq_slice]


-def exists(val):
-    return val is not None
-
-
-def default(val, d):
-    return val if exists(val) else d
-
-
 class CustomHubert(nn.Module):
     """
     checkpoint and kmeans can be downloaded at https://github.com/facebookresearch/fairseq/tree/main/examples/hubert
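`exists()` and `default()` are one-line helpers that were defined in several files; commit b45a7a4 moves them into `TTS.utils.generic_utils`. Based on the deleted definitions above, the consolidated versions are presumably just:

def exists(val):
    return val is not None


def default(val, d):
    return val if exists(val) else d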
18 changes: 3 additions & 15 deletions TTS/tts/layers/bark/model.py
@@ -12,18 +12,6 @@
 from torch.nn import functional as F


-class LayerNorm(nn.Module):
-    """LayerNorm but with an optional bias. PyTorch doesn't support simply bias=False"""
-
-    def __init__(self, ndim, bias):
-        super().__init__()
-        self.weight = nn.Parameter(torch.ones(ndim))
-        self.bias = nn.Parameter(torch.zeros(ndim)) if bias else None
-
-    def forward(self, x):
-        return F.layer_norm(x, self.weight.shape, self.weight, self.bias, 1e-5)
-
-
 class CausalSelfAttention(nn.Module):
     def __init__(self, config):
         super().__init__()
@@ -119,9 +107,9 @@ def forward(self, x):
 class Block(nn.Module):
     def __init__(self, config, layer_idx):
         super().__init__()
-        self.ln_1 = LayerNorm(config.n_embd, bias=config.bias)
+        self.ln_1 = nn.LayerNorm(config.n_embd, bias=config.bias)
         self.attn = CausalSelfAttention(config)
-        self.ln_2 = LayerNorm(config.n_embd, bias=config.bias)
+        self.ln_2 = nn.LayerNorm(config.n_embd, bias=config.bias)
         self.mlp = MLP(config)
         self.layer_idx = layer_idx

@@ -158,7 +146,7 @@ def __init__(self, config):
                 wpe=nn.Embedding(config.block_size, config.n_embd),
                 drop=nn.Dropout(config.dropout),
                 h=nn.ModuleList([Block(config, idx) for idx in range(config.n_layer)]),
-                ln_f=LayerNorm(config.n_embd, bias=config.bias),
+                ln_f=nn.LayerNorm(config.n_embd, bias=config.bias),
             )
         )
         self.lm_head = nn.Linear(config.n_embd, config.output_vocab_size, bias=False)
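The deleted custom `LayerNorm` predates the `bias` argument that `torch.nn.LayerNorm` gained in PyTorch 2.1, so its docstring no longer holds. A quick sketch of the equivalence the swap relies on (assumes torch >= 2.1):

import torch
from torch import nn
from torch.nn import functional as F

ndim = 8
x = torch.randn(2, 4, ndim)

# Built-in layer norm with an affine weight but no bias term.
ln = nn.LayerNorm(ndim, eps=1e-5, bias=False)

# The same computation the deleted custom class performed.
ref = F.layer_norm(x, ln.weight.shape, ln.weight, None, 1e-5)
assert torch.allclose(ln(x), ref)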