import os
import librosa
import numpy as np
import matplotlib.pyplot as plt
from util import keyword_spot
from ge2e_hparams import hparams
from util import split_audio, get_split_mels
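
# Fields read from ge2e_hparams.hparams in this file (inferred from usage below):
#   sr, nfft, window, hop, tdsv_frame, tisv_frame, N, M, mode,
#   noise_path, train_path, test_path
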
# downloaded dataset paths
audio_path = '/home/zeng/Downloads/LibriSpeech/train-clean-100'  # utterance dataset
clean_path = r''  # clean dataset (paired with noisy_path; fill in before running extract_noise)
noisy_path = r''  # noisy dataset


def extract_noise():
    """Extract noise and save its spectrogram (as a numpy array in hparams.noise_path).
    Needs: a paired clean and noisy data set (same file names in clean_path and noisy_path).
    """
print("start noise extraction!")
    os.makedirs(hparams.noise_path, exist_ok=True)  # make a folder to save the noise files
    total = len(os.listdir(clean_path))  # total number of audio files
    batch_frames = hparams.N * hparams.M * hparams.tdsv_frame  # TD-SV frame count for one batch
stacked_noise = []
stacked_len = 0
k = 0
for i, path in enumerate(os.listdir(clean_path)):
clean, sr = librosa.core.load(os.path.join(clean_path, path), sr=8000) # load clean audio
noisy, _ = librosa.core.load(os.path.join(noisy_path, path), sr=sr) # load noisy audio
        noise = noisy - clean  # recover the noise by subtracting the clean speech from the noisy audio
S = librosa.core.stft(y=noise, n_fft=hparams.nfft,
win_length=int(hparams.window * sr), hop_length=int(hparams.hop * sr)) # perform STFT
stacked_noise.append(S)
stacked_len += S.shape[1]
if i % 100 == 0:
print("%d processing..." % i)
        if stacked_len < batch_frames:  # if the accumulated noise frames are still fewer than one batch, keep stacking
continue
stacked_noise = np.concatenate(stacked_noise, axis=1)[:, :batch_frames] # concat noise and slice
np.save(os.path.join(hparams.noise_path, "noise_%d.npy" % k), stacked_noise) # save spectrogram as numpy file
print(" %dth file saved" % k, stacked_noise.shape)
stacked_noise = [] # reset list
stacked_len = 0
k += 1
print("noise extraction is end! %d noise files" % k)


def save_spectrogram_tdsv():
    """Select the text-specific utterance for each speaker and perform STFT on it.
    The spectrograms are split into a train set and a test set and saved as numpy files.
    Needs: utterance data set (VCTK)
    """
print("start text dependent utterance selection")
    os.makedirs(hparams.train_path, exist_ok=True)  # make a folder to save the train file
    os.makedirs(hparams.test_path, exist_ok=True)  # make a folder to save the test file
utterances_spec = []
for folder in os.listdir(audio_path):
utter_path = os.path.join(audio_path, folder, os.listdir(os.path.join(audio_path, folder))[0])
        if os.path.splitext(os.path.basename(utter_path))[0][-3:] != '001':  # skip speakers whose fixed-text ("001") utterance is missing
            print(os.path.basename(utter_path)[:4], "001 file doesn't exist")
continue
        utter, sr = librosa.core.load(utter_path, sr=hparams.sr)  # load the utterance audio
        utter_trim, index = librosa.effects.trim(utter, top_db=14)  # trim leading and trailing silence
        if utter_trim.shape[0] / sr <= hparams.hop * (
                hparams.tdsv_frame + 2):  # if the trimmed utterance is too short, skip it
            print(os.path.basename(utter_path), "voice trim fail")
            continue
S = librosa.core.stft(y=utter_trim, n_fft=hparams.nfft,
win_length=int(hparams.window * sr), hop_length=int(hparams.hop * sr)) # perform STFT
        S = keyword_spot(S)  # keyword spotting (for now, just slice the last 80 frames, which contain "Call Stella")
utterances_spec.append(S) # make spectrograms list
utterances_spec = np.array(utterances_spec) # list to numpy array
np.random.shuffle(utterances_spec) # shuffle spectrogram (by person)
total_num = utterances_spec.shape[0]
train_num = (total_num // 10) * 9 # split total data 90% train and 10% test
print("selection is end")
print("total utterances number : %d" % total_num, ", shape : ", utterances_spec.shape)
print("train : %d, test : %d" % (train_num, total_num - train_num))
np.save(os.path.join(hparams.train_path, "train.npy"),
utterances_spec[:train_num]) # save spectrogram as numpy file
np.save(os.path.join(hparams.test_path, "test.npy"), utterances_spec[train_num:])
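
# Illustrative note: given that keyword_spot keeps the last 80 frames, train.npy / test.npy
# should hold complex STFTs of shape (num_utterances, 1 + hparams.nfft // 2, 80).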


def save_spectrogram_tisv():
    """Full preprocessing of text-independent utterances. The log-mel-spectrograms are saved as numpy files.
    Each utterance is split into partial utterances by dB-based voice activity detection,
    and the first and the last 180 frames of each partial utterance are saved.
    Needs: utterance data set (VCTK)
    """
print("start text independent utterance feature extraction")
    os.makedirs(hparams.train_path, exist_ok=True)  # make a folder to save the train files
    os.makedirs(hparams.test_path, exist_ok=True)  # make a folder to save the test files
    utter_min_len = (hparams.tisv_frame * hparams.hop + hparams.window) * hparams.sr  # lower bound of utterance length, in samples
total_speaker_num = len(os.listdir(audio_path))
train_speaker_num = (total_speaker_num // 10) * 9 # split total data 90% train and 10% test
print("total speaker number : %d" % total_speaker_num)
print("train : %d, test : %d" % (train_speaker_num, total_speaker_num - train_speaker_num))
for i, folder in enumerate(os.listdir(audio_path)):
speaker_path = os.path.join(audio_path, folder) # path of each speaker
print("%dth speaker processing..." % i)
utterances_spec = []
k = 0
for utter_name in os.listdir(speaker_path):
utter_path = os.path.join(speaker_path, utter_name) # path of each utterance
            utter, sr = librosa.core.load(utter_path, sr=hparams.sr)  # load utterance audio
intervals = librosa.effects.split(utter, top_db=20) # voice activity detection
for interval in intervals:
                if (interval[1] - interval[0]) > utter_min_len:  # if the partial utterance is sufficiently long,
                    utter_part = utter[interval[0]:interval[1]]  # keep the first and last tisv_frame frames of its spectrogram
S = librosa.core.stft(y=utter_part, n_fft=hparams.nfft,
win_length=int(hparams.window * sr), hop_length=int(hparams.hop * sr))
S = np.abs(S) ** 2
mel_basis = librosa.filters.mel(sr=hparams.sr, n_fft=hparams.nfft, n_mels=40)
S = np.log10(np.dot(mel_basis, S) + 1e-6) # log mel spectrogram of utterances
utterances_spec.append(S[:, :hparams.tisv_frame]) # first 180 frames of partial utterance
utterances_spec.append(S[:, -hparams.tisv_frame:]) # last 180 frames of partial utterance
utterances_spec = np.array(utterances_spec)
print(utterances_spec.shape)
if i < train_speaker_num: # save spectrogram as numpy file
np.save(os.path.join(hparams.train_path, "speaker%d.npy" % i), utterances_spec)
else:
np.save(os.path.join(hparams.test_path, "speaker%d.npy" % (i - train_speaker_num)), utterances_spec)
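
# Illustrative note: each speaker%d.npy above stacks two segments per valid interval, so
# loading one back should give an array of shape (2 * num_valid_intervals, 40, hparams.tisv_frame).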


def save_spectrogram_tisv_libri():
    """Full preprocessing of text-independent utterances from LibriSpeech.
    Each utterance is trimmed, split into segments, and converted to log-mel features
    (via get_feature_sequence below); the results are saved as numpy files per speaker.
    Needs: utterance data set (LibriSpeech, e.g. train-clean-100)
    """
print("start text independent utterance feature extraction")
    os.makedirs(hparams.train_path, exist_ok=True)  # make a folder to save the train files (np.save below writes here)
    os.makedirs(hparams.test_path, exist_ok=True)  # make a folder to save the test files
# utter_min_len = (hparams.tisv_frame * hparams.hop + hparams.window) * hparams.sr # lower bound of utterance length
total_speaker_num = len(os.listdir(audio_path))
train_speaker_num = (total_speaker_num // 10) * 9 # split total data 90% train and 10% test
print("total speaker number : %d" % total_speaker_num)
print("train : %d, test : %d" % (train_speaker_num, total_speaker_num - train_speaker_num))
for i, folder in enumerate(os.listdir(audio_path)):
speaker_path = os.path.join(audio_path, folder) # path of each speaker
print("%dth speaker processing..." % i)
utterances_spec = []
if i <= 55: continue
for utter_name in os.listdir(speaker_path):
utter_path = os.path.join(speaker_path, utter_name) # path of each utterance
for audio in os.listdir(utter_path):
utter_audio_path = os.path.join(utter_path, audio)
                if utter_audio_path.endswith('.flac'):
mels = get_feature_sequence(utter_audio_path)
utterances_spec += mels
utterances_spec = np.array(utterances_spec)
print(utterances_spec.shape)
if i < train_speaker_num: # save spectrogram as numpy file
np.save(os.path.join(hparams.train_path, "speaker%d.npy" % i), utterances_spec)
else:
np.save(os.path.join(hparams.test_path, "speaker%d.npy" % (i - train_speaker_num)), utterances_spec)
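
# Directory layout assumed by the nested loops above (standard LibriSpeech):
#   train-clean-100/<speaker_id>/<chapter_id>/<utterance_id>.flac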


def get_feature_split(audio_file, n_fft=512, window=0.025, hop=0.01, tisv_frame=100, sr=8000, mel=40):
    """Standalone TI-SV feature extraction for a single audio file (mirrors save_spectrogram_tisv)."""
    utter_min_len = (tisv_frame * hop + window) * sr  # lower bound of a partial utterance, in samples
    utter, sr = librosa.core.load(audio_file, sr=sr)
intervals = librosa.effects.split(utter, top_db=20)
utterances_spec = []
for interval in intervals:
        if (interval[1] - interval[0]) > utter_min_len:  # if the partial utterance is sufficiently long,
            utter_part = utter[interval[0]:interval[1]]  # keep the first and last tisv_frame frames of its spectrogram
S = librosa.core.stft(y=utter_part, n_fft=n_fft, win_length=int(window * sr),
hop_length=int(hop * sr))
S = np.abs(S) ** 2
mel_basis = librosa.filters.mel(sr=sr, n_fft=n_fft, n_mels=mel)
S = np.log10(np.dot(mel_basis, S) + 1e-6) # log mel spectrogram of utterances
            utterances_spec.append(S[:, :tisv_frame])  # first tisv_frame frames of the partial utterance
            utterances_spec.append(S[:, -tisv_frame:])  # last tisv_frame frames of the partial utterance
return utterances_spec
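
# Usage sketch (hypothetical path): with the defaults above, this returns a list of
# (40, 100) log-mel arrays, two per sufficiently long voiced interval:
#   specs = get_feature_split('path/to/utterance.flac')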


def get_feature_sequence(audio_file, n_fft=512, window=0.025, hop=0.01, tisv_frame=100, sr=8000, mel=40):
    """Trim an audio file, split it, and return the log-mel features of each segment."""
    x, sr = librosa.load(audio_file, sr=sr)
    x, index = librosa.effects.trim(x, top_db=5)  # aggressively trim leading and trailing silence
    audios = split_audio(x, sr)  # split into segments (see util.split_audio)
    mels = get_split_mels(audios, sr=sr, n_fft=n_fft, win_length=window, hop_length=hop, mel=mel)
    return mels


if __name__ == "__main__":
    # extract_noise()
    # if hparams.mode == 'TD-SV':
    #     save_spectrogram_tdsv()
    # else:
    #     save_spectrogram_tisv()
    save_spectrogram_tisv_libri()