-
Notifications
You must be signed in to change notification settings - Fork 23
/
preprocess.py
85 lines (67 loc) · 3.06 KB
/
preprocess.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
import hydra
from hydra import utils
from pathlib import Path
import librosa
import scipy
import json
import numpy as np
from multiprocessing import cpu_count
from concurrent.futures import ProcessPoolExecutor
from functools import partial
from tqdm import tqdm
def preemphasis(x, preemph):
    """Apply a first-order pre-emphasis filter: y[n] = x[n] - preemph * x[n-1]."""
    numerator = [1, -preemph]
    denominator = [1]
    return scipy.signal.lfilter(numerator, denominator, x)
def mulaw_encode(x, mu):
mu = mu - 1
fx = np.sign(x) * np.log1p(mu * np.abs(x)) / np.log1p(mu)
return np.floor((fx + 1) / 2 * mu + 0.5)
def mulaw_decode(y, mu):
    """Invert mu-law companding: map a quantized signal in [-1, 1] back to audio."""
    levels = mu - 1
    # Sign-preserving exponential expansion (inverse of the log compression).
    scale = np.sign(y) / levels
    return scale * ((1 + levels) ** np.abs(y) - 1)
def process_wav(wav_path, out_path, sr=16000, preemph=0.97, n_fft=2048, n_mels=80, hop_length=160,
                win_length=400, fmin=50, top_db=80, bits=8, offset=0.0, duration=None):
    """Extract features for a single utterance.

    Loads ``wav_path`` (a ``pathlib.Path``; ``.wav`` suffix is appended),
    peak-normalizes it, and writes two arrays next to ``out_path``:

    * ``<out_path>.wav.npy`` -- mu-law encoded waveform (``2**bits`` levels).
    * ``<out_path>.mel.npy`` -- log-mel spectrogram rescaled to roughly [0, 1].

    Returns ``(out_path, n_frames)`` where ``n_frames`` is the number of mel frames.

    NOTE(review): the original default was ``sr=160000`` -- almost certainly a
    typo for 16000, since ``hop_length=160`` (10 ms) and ``win_length=400``
    (25 ms) only make sense at 16 kHz. In practice the caller overrides it via
    ``cfg.preprocessing`` anyway.
    """
    wav, _ = librosa.load(wav_path.with_suffix(".wav"), sr=sr,
                          offset=offset, duration=duration)
    # Peak-normalize to just below full scale. Guard against all-zero audio,
    # which would otherwise produce NaNs from a 0/0 division.
    peak = np.abs(wav).max()
    if peak > 0:
        wav = wav / peak * 0.999

    # Magnitude (power=1) mel spectrogram of the pre-emphasized signal.
    # Keyword ``y=`` keeps this compatible with librosa >= 0.10, where the
    # positional audio argument was removed.
    mel = librosa.feature.melspectrogram(y=preemphasis(wav, preemph),
                                         sr=sr,
                                         n_fft=n_fft,
                                         n_mels=n_mels,
                                         hop_length=hop_length,
                                         win_length=win_length,
                                         fmin=fmin,
                                         power=1)
    logmel = librosa.amplitude_to_db(mel, top_db=top_db)
    logmel = logmel / top_db + 1  # map [-top_db, 0] dB onto [0, 1]

    wav = mulaw_encode(wav, mu=2**bits)

    np.save(out_path.with_suffix(".wav.npy"), wav)
    np.save(out_path.with_suffix(".mel.npy"), logmel)
    return out_path, logmel.shape[-1]
@hydra.main(config_path="config/preprocessing.yaml")
def preprocess_dataset(cfg):
    """Preprocess the train and test splits of a dataset.

    For each split, reads ``<out_dir>/<language>/<split>.json`` -- a list of
    ``[in_path, start, duration, out_path]`` records -- extracts features for
    every utterance in parallel with :func:`process_wav`, and prints a summary
    of how much audio was processed.
    """
    in_dir = Path(utils.to_absolute_path(cfg.in_dir))
    out_dir = Path(utils.to_absolute_path("datasets")) / str(cfg.dataset.dataset)
    out_dir.mkdir(parents=True, exist_ok=True)
    # Context manager guarantees the worker processes are shut down even if a
    # future raises; the original leaked the pool on error.
    with ProcessPoolExecutor(max_workers=cpu_count()) as executor:
        for split in ["train", "test"]:
            print("Extracting features for {} set".format(split))
            futures = []
            split_path = out_dir / cfg.dataset.language / split
            with open(split_path.with_suffix(".json")) as file:
                metadata = json.load(file)
            for in_path, start, duration, out_path in metadata:
                wav_path = in_dir / in_path
                out_path = out_dir / out_path
                out_path.parent.mkdir(parents=True, exist_ok=True)
                futures.append(executor.submit(
                    partial(process_wav, wav_path, out_path, **cfg.preprocessing,
                            offset=start, duration=duration)))
            results = [future.result() for future in tqdm(futures)]
            lengths = [x[-1] for x in results]
            frames = sum(lengths)
            # hop_length / sr is the frame shift in SECONDS (the original name
            # ``frame_shift_ms`` was misleading).
            seconds_per_frame = cfg.preprocessing.hop_length / cfg.preprocessing.sr
            hours = frames * seconds_per_frame / 3600
            print("Wrote {} utterances, {} frames ({:.2f} hours)".format(len(lengths), frames, hours))
# Script entry point: hydra parses the CLI overrides / config file before
# invoking the decorated preprocess_dataset.
if __name__ == "__main__":
    preprocess_dataset()