Commit fc8195f by Suwon Shon, committed Oct 24, 2019 (1 parent: ee52935).
Showing 6 changed files with 17,915 additions and 17,652 deletions.
@@ -0,0 +1,194 @@
import tensorflow as tf
import numpy as np


class nn:

    # Create model
    def __init__(self, x1, y_, y_string, shapes_batch, softmax_num, is_training, input_dim, is_batchnorm):
        self.ea, self.eb, self.o1, self.res1, self.conv, self.ac1, self.ac2 = self.net(x1, shapes_batch, softmax_num, is_training, input_dim, is_batchnorm)

        # Create loss
        self.loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y_, logits=self.o1))
        self.label = y_
        self.shape = shapes_batch
        self.true_length = x1
        self.label_string = y_string

    def net(self, x, shapes_batch, softmax_num, is_training, input_dim, is_batchnorm):
        shape_list = shapes_batch[:, 0]
        is_exclude_short = False
        if is_exclude_short:
            # randomly select start of sequences
            sequence_limit = tf.reduce_min(shape_list) // 2
            # sequence_limit = tf.cond(sequence_limit<=200, lambda: sequence_limit, lambda: tf.subtract(sequence_limit,200))
            random_start_pt = tf.random_uniform([1], minval=0, maxval=sequence_limit, dtype=tf.int32)
            end_pt = tf.reduce_max(shape_list)
            x = tf.gather(x, tf.range(tf.squeeze(random_start_pt), end_pt), axis=1)
            shape_list = shape_list - random_start_pt

        # randomly chunk sequences to one of the lengths in aug_list
        batch_quantity = tf.size(shape_list)
        aug_list = tf.constant([200, 300, 400], dtype=tf.float32)
        aug_quantity = tf.size(aug_list)
        # note: maxval is exclusive in tf.random_uniform
        rand_index = tf.random_uniform([batch_quantity], minval=0, maxval=aug_quantity - 1, dtype=tf.int32)
        rand_aug_list = tf.gather(aug_list, rand_index)

        shape_list_f = tf.cast(shape_list, tf.float32)
        temp = tf.multiply(shape_list_f, rand_aug_list / shape_list_f)
        aug_shape_list = tf.cast(temp, tf.int32)
        # clip each sequence length to the sampled chunk length
        shape_list = tf.minimum(shape_list, aug_shape_list)

        featdim = input_dim  # channel
        weights = []
        kernel_size = 5
        stride = 1
        depth = 500

        shape_list = shape_list // stride  # floor division keeps frame counts integer after striding
        conv1 = self.conv_layer(x, kernel_size, featdim, stride, depth, 'conv1', shape_list)
        conv1_bn = self.batch_norm_wrapper_1dcnn(conv1, is_training, 'bn1', shape_list, is_batchnorm)
        conv1r = tf.nn.relu(conv1_bn)

        featdim = depth  # channel
        weights = []
        kernel_size = 7
        stride = 2
        depth = 500

        shape_list = shape_list // stride
        conv2 = self.conv_layer(conv1r, kernel_size, featdim, stride, depth, 'conv2', shape_list)
        conv2_bn = self.batch_norm_wrapper_1dcnn(conv2, is_training, 'bn2', shape_list, is_batchnorm)
        conv2r = tf.nn.relu(conv2_bn)

        featdim = depth  # channel
        weights = []
        kernel_size = 1
        stride = 1
        depth = 500

        shape_list = shape_list // stride
        conv3 = self.conv_layer(conv2r, kernel_size, featdim, stride, depth, 'conv3', shape_list)
        conv3_bn = self.batch_norm_wrapper_1dcnn(conv3, is_training, 'bn3', shape_list, is_batchnorm)
        conv3r = tf.nn.relu(conv3_bn)

        featdim = depth  # channel
        weights = []
        kernel_size = 1
        stride = 1
        depth = 3000

        shape_list = shape_list // stride
        conv4 = self.conv_layer(conv3r, kernel_size, featdim, stride, depth, 'conv4', shape_list)
        conv4_bn = self.batch_norm_wrapper_1dcnn(conv4, is_training, 'bn4', shape_list, is_batchnorm)
        conv4r = tf.nn.relu(conv4_bn)

        # print(conv1)

        # shape_list = tf.cast(shape_list, tf.float32)
        # shape_list = tf.reshape(shape_list,[-1,1,1])
        # mean = tf.reduce_sum(conv4r,1,keep_dims=True)/shape_list
        # res1 = tf.squeeze(mean,axis=1)
        res1 = conv4r[0]

        fc1 = self.fc_layer(res1, 1500, "fc1")
        fc1_bn = self.batch_norm_wrapper_fc(fc1, is_training, 'bn5', is_batchnorm)
        ac1 = tf.nn.relu(fc1_bn)
        fc2 = self.fc_layer(ac1, 600, "fc2")
        fc2_bn = self.batch_norm_wrapper_fc(fc2, is_training, 'bn6', is_batchnorm)
        ac2 = tf.nn.relu(fc2_bn)

        fc3 = self.fc_layer(ac2, softmax_num, "fc3")
        return fc1, fc2, fc3, res1, conv1r, ac1, ac2

    def xavier_init(self, n_inputs, n_outputs, uniform=True):
        if uniform:
            init_range = np.sqrt(6.0 / (n_inputs + n_outputs))
            return tf.random_uniform_initializer(-init_range, init_range)
        else:
            stddev = np.sqrt(3.0 / (n_inputs + n_outputs))
            return tf.truncated_normal_initializer(stddev=stddev)

    def fc_layer(self, bottom, n_weight, name):
        print(bottom.get_shape())
        assert len(bottom.get_shape()) == 2
        n_prev_weight = bottom.get_shape()[1]

        initer = self.xavier_init(int(n_prev_weight), n_weight)
        W = tf.get_variable(name + 'W', dtype=tf.float32, shape=[n_prev_weight, n_weight], initializer=initer)
        b = tf.get_variable(name + 'b', dtype=tf.float32, initializer=tf.random_uniform([n_weight], -0.001, 0.001, dtype=tf.float32))
        fc = tf.nn.bias_add(tf.matmul(bottom, W), b)
        return fc

    def conv_layer(self, bottom, kernel_size, num_channels, stride, depth, name, shape_list):
        # n_prev_weight = int(bottom.get_shape()[1])
        n_prev_weight = tf.shape(bottom)[1]

        inputlayer = bottom
        initer = tf.truncated_normal_initializer(stddev=0.1)

        W = tf.get_variable(name + 'W', dtype=tf.float32, shape=[kernel_size, num_channels, depth], initializer=tf.contrib.layers.xavier_initializer())
        b = tf.get_variable(name + 'b', dtype=tf.float32, initializer=tf.constant(0.001, shape=[depth], dtype=tf.float32))

        conv = tf.nn.bias_add(tf.nn.conv1d(inputlayer, W, stride, padding='SAME'), b)
        mask = tf.sequence_mask(shape_list, tf.shape(conv)[1])  # boolean mask of shape batch x frames
        mask = tf.where(mask, tf.ones_like(mask, dtype=tf.float32), tf.zeros_like(mask, dtype=tf.float32))
        mask = tf.tile(mask, tf.stack([tf.shape(conv)[2], 1]))  # replicate mask along the depth dimension
        mask = tf.reshape(mask, [tf.shape(conv)[2], tf.shape(conv)[0], -1])
        mask = tf.transpose(mask, [1, 2, 0])  # -> batch x frames x depth
        print(mask)
        conv = tf.multiply(conv, mask)  # zero out frames beyond each sequence length
        return conv

    def batch_norm_wrapper_1dcnn(self, inputs, is_training, name, shape_list, is_batchnorm, decay=0.999):
        if is_batchnorm:
            shape_list = tf.cast(shape_list, tf.float32)
            epsilon = 1e-3
            scale = tf.get_variable(name + 'scale', dtype=tf.float32, initializer=tf.ones([inputs.get_shape()[-1]]))
            beta = tf.get_variable(name + 'beta', dtype=tf.float32, initializer=tf.zeros([inputs.get_shape()[-1]]))
            pop_mean = tf.get_variable(name + 'pop_mean', dtype=tf.float32, initializer=tf.zeros([inputs.get_shape()[-1]]), trainable=False)
            pop_var = tf.get_variable(name + 'pop_var', dtype=tf.float32, initializer=tf.ones([inputs.get_shape()[-1]]), trainable=False)
            if is_training:
                # batch_mean, batch_var = tf.nn.moments(inputs,[0,1])
                # masked statistics: padded frames are already zeroed by conv_layer, so divide by
                # the total number of valid frames instead of the padded tensor size
                batch_mean = tf.reduce_sum(inputs, [0, 1]) / tf.reduce_sum(shape_list)  # for variable length input
                batch_var = tf.reduce_sum(tf.square(inputs - batch_mean), [0, 1]) / tf.reduce_sum(shape_list)  # for variable length input
                train_mean = tf.assign(pop_mean, pop_mean * decay + batch_mean * (1 - decay))
                train_var = tf.assign(pop_var, pop_var * decay + batch_var * (1 - decay))
                with tf.control_dependencies([train_mean, train_var]):
                    return tf.nn.batch_normalization(inputs, batch_mean, batch_var, beta, scale, epsilon)
            else:
                return tf.nn.batch_normalization(inputs, pop_mean, pop_var, beta, scale, epsilon)
        else:
            return inputs

    def batch_norm_wrapper_fc(self, inputs, is_training, name, is_batchnorm, decay=0.999):
        if is_batchnorm:
            epsilon = 1e-3
            scale = tf.get_variable(name + 'scale', dtype=tf.float32, initializer=tf.ones([inputs.get_shape()[-1]]))
            beta = tf.get_variable(name + 'beta', dtype=tf.float32, initializer=tf.zeros([inputs.get_shape()[-1]]))
            pop_mean = tf.get_variable(name + 'pop_mean', dtype=tf.float32, initializer=tf.zeros([inputs.get_shape()[-1]]), trainable=False)
            pop_var = tf.get_variable(name + 'pop_var', dtype=tf.float32, initializer=tf.ones([inputs.get_shape()[-1]]), trainable=False)
            if is_training:
                batch_mean, batch_var = tf.nn.moments(inputs, [0])
                train_mean = tf.assign(pop_mean, pop_mean * decay + batch_mean * (1 - decay))
                train_var = tf.assign(pop_var, pop_var * decay + batch_var * (1 - decay))
                with tf.control_dependencies([train_mean, train_var]):
                    return tf.nn.batch_normalization(inputs, batch_mean, batch_var, beta, scale, epsilon)
            else:
                return tf.nn.batch_normalization(inputs, pop_mean, pop_var, beta, scale, epsilon)
        else:
            return inputs
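
A minimal usage sketch (not part of the commit): it assumes the class above lives in nn_model.py, a 40-dimensional MFCC input, 10 target classes, and TF 1.x placeholders; the module name, dimensions, and optimizer are illustrative only.

import tensorflow as tf
from nn_model import nn  # assumed module name for the class above

input_dim = 40      # e.g. 40-dim MFCC features (assumption)
softmax_num = 10    # number of target classes (assumption)

x1 = tf.placeholder(tf.float32, [None, None, input_dim])  # batch x frames x features
y_ = tf.placeholder(tf.int64, [None])                     # integer class labels
y_string = tf.placeholder(tf.string, [None])              # label strings (kept for bookkeeping)
shapes_batch = tf.placeholder(tf.int32, [None, 2])        # per-utterance [num_frames, feat_dim]

model = nn(x1, y_, y_string, shapes_batch, softmax_num, True, input_dim, is_batchnorm=True)
train_op = tf.train.AdamOptimizer(1e-3).minimize(model.loss)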
@@ -0,0 +1,67 @@
#!/bin/bash

# Extract MFCC features for NN input and save them in TFRecords format
#./scripts/prepare_data.py mfcc 400 160 True m 0
#./scripts/prepare_data.py melspec 400 160 True m 0
#./scripts/prepare_data.py spec 400 160 True m 0
#./scripts/prepare_data.py melspec 400 160 True m 200
#./scripts/prepare_data.py mfcc 400 160 True m 200
#./scripts/prepare_data.py spec 400 160 True m 200

# Train the NN with the nn_model.py definition on the original dataset
srun --partition=titanx --gres=gpu:1 --cpus-per-task=2 --mem=16GB --output=./log/new_nn_model_mfcc_fft400_hop160_vad_cmn.out ./scripts/train_e2e.py new_nn_model 0.001 40 False mfcc_fft400_hop160_vad_cmn &
srun --partition=titanx --gres=gpu:1 --cpus-per-task=2 --mem=16GB --output=./log/new_nn_model_melspec_fft400_hop160_vad_cmn.out ./scripts/train_e2e.py new_nn_model 0.001 40 False melspec_fft400_hop160_vad_cmn &
srun --partition=titanx --gres=gpu:1 --cpus-per-task=2 --mem=16GB --output=./log/new_nn_model_spec_fft400_hop160_vad_cmn.out ./scripts/train_e2e.py new_nn_model 0.001 40 False spec_fft400_hop160_vad_cmn &
srun --partition=titanx --gres=gpu:1 --cpus-per-task=2 --mem=16GB --output=./log/new_nn_model_exshort200_long_mfcc_fft400_hop160_vad_cmn.out ./scripts/train_e2e.py new_nn_model_exshort200_long 0.001 40 False mfcc_fft400_hop160_vad_cmn_exshort200 &
srun --partition=titanx --gres=gpu:1 --cpus-per-task=2 --mem=16GB --output=./log/new_nn_model_exshort200_long_melspec_fft400_hop160_vad_cmn.out ./scripts/train_e2e.py new_nn_model_exshort200_long 0.001 40 False melspec_fft400_hop160_vad_cmn_exshort200 &
srun --partition=titanx --gres=gpu:1 --cpus-per-task=2 --mem=16GB --output=./log/new_nn_model_exshort200_long_spec_fft400_hop160_vad_cmn.out ./scripts/train_e2e.py new_nn_model_exshort200_long 0.001 40 False spec_fft400_hop160_vad_cmn_exshort200 &

# Dataset augmentation using SoX, with corresponding speed (0.9, 1.0, 1.1) and volume (0.125, 1.0, 2.0 in amplitude) factors
#./scripts/augmentation_by_speed_vol.py 0.9 0.125 ./data/wav/
#./scripts/augmentation_by_speed_vol.py 1.0 0.125 ./data/wav/
#./scripts/augmentation_by_speed_vol.py 1.1 0.125 ./data/wav/
#./scripts/augmentation_by_speed_vol.py 0.9 1.0 ./data/wav/
#./scripts/augmentation_by_speed_vol.py 1.0 1.0 ./data/wav/
#./scripts/augmentation_by_speed_vol.py 1.1 1.0 ./data/wav/
#./scripts/augmentation_by_speed_vol.py 0.9 2.0 ./data/wav/
#./scripts/augmentation_by_speed_vol.py 1.0 2.0 ./data/wav/
#./scripts/augmentation_by_speed_vol.py 1.1 2.0 ./data/wav/
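# For reference, each augmentation run presumably reduces to a plain SoX call per file
# (an assumption; the internals of augmentation_by_speed_vol.py are not shown in this diff),
# using the standard "speed" and "vol" effects, e.g.:
#   sox input.wav output_s0.9_v0.125.wav speed 0.9 vol 0.125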

# Listing augmented dataset files
#find $(pwd)/data/wav/train/ -name '*.wav' >./data/train_speed_vol.txt
#find $(pwd)/data/wav/dev/ -name '*.wav' >./data/dev_speed_vol.txt
#find $(pwd)/data/wav/test/ -name '*.wav' >./data/test_speed_vol.txt
#find $(pwd)/data/wav/train/ -name '*v1.0*.wav' >./data/train_speed.txt &
#find $(pwd)/data/wav/dev/ -name '*v1.0*.wav' >./data/dev_speed.txt &
#find $(pwd)/data/wav/train/ -name 's1.0*.wav' >./data/train_vol.txt &
#find $(pwd)/data/wav/dev/ -name 's1.0*.wav' >./data/dev_vol.txt &

# Extracting MFCC features for the augmented dataset
srun --partition=630 --cpus-per-task=2 --mem=20GB --out=./log/aug_mfcc_fft400_hop160_vad_cmn ./scripts/prepare_augmented_data.py mfcc 400 160 True m 0 &
srun --partition=630 --cpus-per-task=2 --mem=20GB --out=./log/aug_mfcc_fft400_hop160_vad_cmn ./scripts/prepare_augmented_data.py mfcc 400 160 True m 200 &
srun --partition=630 --cpus-per-task=2 --mem=20GB --out=./log/aug_mfcc_fft400_hop160_vad_cmn ./scripts/prepare_augmented_data_vol.py mfcc 400 160 True m 0 &
srun --partition=630 --cpus-per-task=2 --mem=20GB --out=./log/aug_mfcc_fft400_hop160_vad_cmn ./scripts/prepare_augmented_data_vol.py mfcc 400 160 True m 200 &
srun --partition=630 --cpus-per-task=2 --mem=20GB --out=./log/aug_mfcc_fft400_hop160_vad_cmn ./scripts/prepare_augmented_data_speed.py mfcc 400 160 True m 0 &
srun --partition=630 --cpus-per-task=2 --mem=20GB --out=./log/aug_mfcc_fft400_hop160_vad_cmn ./scripts/prepare_augmented_data_speed.py mfcc 400 160 True m 200 &

# Train the NN with the nn_model.py definition on the augmented dataset
srun --partition=titanx --gres=gpu:1 --cpus-per-task=2 --mem=16GB --output=./log/new_nn_model_aug_mfcc_fft400_hop160_vad_cmn.out ./scripts/train_e2e.py new_nn_model 0.001 40 False aug_mfcc_fft400_hop160_vad_cmn &
srun --partition=titanx --gres=gpu:1 --cpus-per-task=2 --mem=16GB --output=./log/new_nn_model_aug_mfcc_fft400_hop160_vad_cmn_exshort.out ./scripts/train_e2e.py new_nn_model_exshort200_long 0.001 40 False aug_mfcc_fft400_hop160_vad_cmn_exshort200 &

srun --partition=titanx --gres=gpu:1 --cpus-per-task=2 --mem=16GB --output=./log/new_nn_model_aug_vol_mfcc_fft400_hop160_vad_cmn.out ./scripts/train_e2e.py new_nn_model 0.001 40 False aug_vol_mfcc_fft400_hop160_vad_cmn &
srun --partition=titanx --gres=gpu:1 --cpus-per-task=2 --mem=16GB --output=./log/new_nn_model_aug_vol_mfcc_fft400_hop160_vad_cmn_exshort.out ./scripts/train_e2e.py new_nn_model_exshort200_long 0.001 40 False aug_vol_mfcc_fft400_hop160_vad_cmn_exshort200 &

srun --partition=titanx --gres=gpu:1 --cpus-per-task=2 --mem=16GB --output=./log/new_nn_model_aug_speed_mfcc_fft400_hop160_vad_cmn.out ./scripts/train_e2e.py new_nn_model 0.001 40 False aug_speed_mfcc_fft400_hop160_vad_cmn &
srun --partition=titanx --gres=gpu:1 --cpus-per-task=2 --mem=16GB --output=./log/new_nn_model_aug_speed_mfcc_fft400_hop160_vad_cmn_exshort.out ./scripts/train_e2e.py new_nn_model_exshort200_long 0.001 40 False aug_speed_mfcc_fft400_hop160_vad_cmn_exshort200 &

## Extract frame-level embeddings
srun -p sm --gres=gpu:1 --output=log/test.log python ./scripts/extract_framelevel_embeddings.py --wavlist data/test.txt --outputlayer &
srun -p sm --gres=gpu:1 --output=log/dev.log python ./scripts/extract_framelevel_embeddings.py --wavlist data/dev.txt --outputlayer &
srun -p sm --gres=gpu:1 --output=log/train.log python ./scripts/extract_framelevel_embeddings.py --wavlist data/train.txt --outputlayer &