Initial commit

bshall · Oct 17, 2021 · 5abc3b8 · 5abc3b8
commit 5abc3b8
Show file tree

Hide file tree

Showing 12 changed files with 1,433 additions and 0 deletions.
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,132 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+pip-wheel-metadata/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+.python-version
+
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# VSCode project settings
+.vscode
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
diff --git a/LICENSE b/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2021 Benjamin van Niekerk
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/README.md b/README.md
@@ -0,0 +1,92 @@
+# HiFi-GAN
+
+A 16kHz implementation of HiFi-GAN for [soft-vc](https://github.com/bshall/soft-vc).
+
+Relevant links:
+- [Official HiFi-GAN repo](https://github.com/jik876/hifi-gan)
+- [HiFi-GAN paper](https://arxiv.org/abs/2010.05646)
+- [Soft-VC repo](https://github.com/bshall/soft-vc)
+- [Soft-VC paper]()
+
+## Example Usage
+
+```python
+import torch
+import numpy as np
+
+# Load checkpoint
+hifigan = torch.hub.load("bshall/hifigan:main", "hifigan-hubert-soft").cuda()
+# Load mel-spectrogram
+mel = torch.from_numpy(np.load("path/to/mel")).unsqueeze(0).cuda()
+# Generate
+wav, sr = hifigan.generate(mel)
+```
+
+## Train
+
+**Step 1**: Download and extract the [LJ-Speech dataset](https://keithito.com/LJ-Speech-Dataset/)
+
+**Step 2**: Resample the audio to 16kHz:
+```
+usage: resample.py [-h] [--sample-rate SAMPLE_RATE] in-dir out-dir
+
+Resample an audio dataset.
+
+positional arguments:
+  in-dir                path to the dataset directory
+  out-dir               path to the output directory
+
+optional arguments:
+  -h, --help            show this help message and exit
+  --sample-rate SAMPLE_RATE
+                        target sample rate (default 16kHz)
+```
+
+**Step 3**: Download the dataset splits and move them into the root of the dataset directory.
+After steps 2 and 3 your dataset directory should look like this:
+```
+LJSpeech-1.1
+│   test.txt
+│   train.txt
+│   validation.txt
+├───mels
+└───wavs
+```
+Note: the mels directory is optional. If you want to fine-tune HiFi-GAN, the mels directory should contain ground-truth aligned spectrograms from an acoustic model.
+
+**Step 4**: Train HiFi-GAN:
+```
+usage: train.py [-h] [--resume RESUME] [--finetune] dataset-dir checkpoint-dir
+
+Train or finetune HiFi-GAN.
+
+positional arguments:
+  dataset-dir      path to the preprocessed data directory
+  checkpoint-dir   path to the checkpoint directory
+
+optional arguments:
+  -h, --help       show this help message and exit
+  --resume RESUME  path to the checkpoint to resume from
+  --finetune       whether to finetune (note that a resume path must be given)
+```
+
+## Generate
+To generate using the trained HiFi-GAN models, see [Example Usage](#example-usage) or use the `generate.py` script:
+
+```
+usage: generate.py [-h] [--model-name {hifigan,hifigan-hubert-soft,hifigan-hubert-discrete}] in-dir out-dir
+
+Generate audio for a directory of mel-spectrogams using HiFi-GAN.
+
+positional arguments:
+  in-dir                path to directory containing the mel-spectrograms
+  out-dir               path to output directory
+
+optional arguments:
+  -h, --help            show this help message and exit
+  --model-name {hifigan,hifigan-hubert-soft,hifigan-hubert-discrete}
+                        available models
+```
+
+## Acknowledgements
+This repo is based heavily on [https://github.com/jik876/hifi-gan](https://github.com/jik876/hifi-gan).
diff --git a/generate.py b/generate.py
@@ -0,0 +1,50 @@
+from pathlib import Path
+import numpy as np
+import argparse
+import torch
+import torchaudio
+from tqdm import tqdm
+
+
+def generate(args):
+    args.out_dir.mkdir(exist_ok=True, parents=True)
+
+    print("Loading checkpoint")
+    hifigan = torch.hub.load("bshall/hifigan:main", args.model_name).cuda()
+
+    print(f"Generating audio from {args.in_dir}")
+    for path in tqdm(list(args.in_dir.rglob("*.npy"))):
+        mel = torch.from_numpy(np.load(path))
+        mel = mel.unsqueeze(0).cuda()
+
+        wav, sr = hifigan.generate(mel)
+        wav = wav.squeeze(0).cpu()
+
+        out_path = args.out_dir / path.relative_to(args.in_dir)
+        out_path.parent.mkdir(exist_ok=True, parents=True)
+        torchaudio.save(out_path.with_suffix(".wav"), wav, sr)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        description="Generate audio for a directory of mel-spectrogams using HiFi-GAN."
+    )
+    parser.add_argument(
+        "in-dir",
+        help="path to directory containing the mel-spectrograms",
+        type=Path,
+    )
+    parser.add_argument(
+        "out-dir",
+        help="path to output directory",
+        type=Path,
+    )
+    parser.add_argument(
+        "--model-name",
+        help="available models",
+        choices=["hifigan", "hifigan-hubert-soft", "hifigan-hubert-discrete"],
+        default="hifigan-hubert-soft",
+    )
+    args = parser.parse_args()
+
+    generate(args)