Commit

for framelevel
Suwon Shon committed Oct 24, 2019
1 parent ee52935 commit fc8195f
Showing 6 changed files with 17,915 additions and 17,652 deletions.
3,132 changes: 1,566 additions & 1,566 deletions data/dev.txt

Large diffs are not rendered by default.

2,984 changes: 1,492 additions & 1,492 deletions data/test.txt

Large diffs are not rendered by default.

29,184 changes: 14,592 additions & 14,592 deletions data/train.txt

Large diffs are not rendered by default.

194 changes: 194 additions & 0 deletions models/e2e_model_frame.py
@@ -0,0 +1,194 @@
import tensorflow as tf
import numpy as np


class nn:

    # Create model
    def __init__(self, x1, y_, y_string, shapes_batch, softmax_num, is_training, input_dim, is_batchnorm):
        self.ea, self.eb, self.o1, self.res1, self.conv, self.ac1, self.ac2 = self.net(x1, shapes_batch, softmax_num, is_training, input_dim, is_batchnorm)

        # Create loss
        self.loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y_, logits=self.o1))
        self.label = y_
        self.shape = shapes_batch
        self.true_length = x1
        self.label_string = y_string



    def net(self, x, shapes_batch, softmax_num, is_training, input_dim, is_batchnorm):
        shape_list = shapes_batch[:, 0]
        is_exclude_short = False
        if is_exclude_short:
            # randomly select start of sequences
            sequence_limit = tf.reduce_min(shape_list)/2
            # sequence_limit = tf.cond(sequence_limit<=200, lambda: sequence_limit, lambda: tf.subtract(sequence_limit,200))
            random_start_pt = tf.random_uniform([1], minval=0, maxval=sequence_limit, dtype=tf.int32)
            end_pt = tf.reduce_max(shape_list)
            x = tf.gather(x, tf.range(tf.squeeze(random_start_pt), end_pt), axis=1)
            shape_list = shape_list - random_start_pt

            # randomly chunk sequences
            batch_quantity = tf.size(shape_list)
            aug_list = tf.constant([200, 300, 400], dtype=tf.float32)
            aug_quantity = tf.size(aug_list)
            rand_index = tf.random_uniform([batch_quantity], minval=0, maxval=aug_quantity-1, dtype=tf.int32)
            rand_aug_list = tf.gather(aug_list, rand_index)

            shape_list_f = tf.cast(shape_list, tf.float32)
            temp = tf.multiply(shape_list_f, rand_aug_list/shape_list_f)
            aug_shape_list = tf.cast(temp, tf.int32)
            shape_list = tf.minimum(shape_list, aug_shape_list)

        # conv1: kernel 5, stride 1, 500 filters
        featdim = input_dim  # channel
        weights = []
        kernel_size = 5
        stride = 1
        depth = 500

        shape_list = shape_list/stride
        conv1 = self.conv_layer(x, kernel_size, featdim, stride, depth, 'conv1', shape_list)
        conv1_bn = self.batch_norm_wrapper_1dcnn(conv1, is_training, 'bn1', shape_list, is_batchnorm)
        conv1r = tf.nn.relu(conv1_bn)

        # conv2: kernel 7, stride 2, 500 filters
        featdim = depth  # channel
        weights = []
        kernel_size = 7
        stride = 2
        depth = 500

        shape_list = shape_list/stride
        conv2 = self.conv_layer(conv1r, kernel_size, featdim, stride, depth, 'conv2', shape_list)
        conv2_bn = self.batch_norm_wrapper_1dcnn(conv2, is_training, 'bn2', shape_list, is_batchnorm)
        conv2r = tf.nn.relu(conv2_bn)

        # conv3: kernel 1, stride 1, 500 filters
        featdim = depth  # channel
        weights = []
        kernel_size = 1
        stride = 1
        depth = 500

        shape_list = shape_list/stride
        conv3 = self.conv_layer(conv2r, kernel_size, featdim, stride, depth, 'conv3', shape_list)
        conv3_bn = self.batch_norm_wrapper_1dcnn(conv3, is_training, 'bn3', shape_list, is_batchnorm)
        conv3r = tf.nn.relu(conv3_bn)

        # conv4: kernel 1, stride 1, 3000 filters
        featdim = depth  # channel
        weights = []
        kernel_size = 1
        stride = 1
        depth = 3000

        shape_list = shape_list/stride
        conv4 = self.conv_layer(conv3r, kernel_size, featdim, stride, depth, 'conv4', shape_list)
        conv4_bn = self.batch_norm_wrapper_1dcnn(conv4, is_training, 'bn4', shape_list, is_batchnorm)
        conv4r = tf.nn.relu(conv4_bn)

        # keep frame-level conv4 activations (first utterance in the batch) instead of average pooling
        # shape_list = tf.cast(shape_list, tf.float32)
        # shape_list = tf.reshape(shape_list,[-1,1,1])
        # mean = tf.reduce_sum(conv4r,1,keep_dims=True)/shape_list
        # res1=tf.squeeze(mean,axis=1)
        res1 = conv4r[0]

        fc1 = self.fc_layer(res1, 1500, "fc1")
        fc1_bn = self.batch_norm_wrapper_fc(fc1, is_training, 'bn5', is_batchnorm)
        ac1 = tf.nn.relu(fc1_bn)
        fc2 = self.fc_layer(ac1, 600, "fc2")
        fc2_bn = self.batch_norm_wrapper_fc(fc2, is_training, 'bn6', is_batchnorm)
        ac2 = tf.nn.relu(fc2_bn)

        fc3 = self.fc_layer(ac2, softmax_num, "fc3")
        return fc1, fc2, fc3, res1, conv1r, ac1, ac2

    def xavier_init(self, n_inputs, n_outputs, uniform=True):
        if uniform:
            init_range = np.sqrt(6.0 / (n_inputs + n_outputs))
            return tf.random_uniform_initializer(-init_range, init_range)
        else:
            stddev = np.sqrt(3.0 / (n_inputs + n_outputs))
            return tf.truncated_normal_initializer(stddev=stddev)

    def fc_layer(self, bottom, n_weight, name):
        print(bottom.get_shape())
        assert len(bottom.get_shape()) == 2
        n_prev_weight = bottom.get_shape()[1]

        initer = self.xavier_init(int(n_prev_weight), n_weight)
        W = tf.get_variable(name+'W', dtype=tf.float32, shape=[n_prev_weight, n_weight], initializer=initer)
        b = tf.get_variable(name+'b', dtype=tf.float32, initializer=tf.random_uniform([n_weight], -0.001, 0.001, dtype=tf.float32))
        fc = tf.nn.bias_add(tf.matmul(bottom, W), b)
        return fc


    def conv_layer(self, bottom, kernel_size, num_channels, stride, depth, name, shape_list):
        # n_prev_weight = int(bottom.get_shape()[1])
        n_prev_weight = tf.shape(bottom)[1]

        inputlayer = bottom
        initer = tf.truncated_normal_initializer(stddev=0.1)

        W = tf.get_variable(name+'W', dtype=tf.float32, shape=[kernel_size, num_channels, depth], initializer=tf.contrib.layers.xavier_initializer())
        b = tf.get_variable(name+'b', dtype=tf.float32, initializer=tf.constant(0.001, shape=[depth], dtype=tf.float32))

        conv = tf.nn.bias_add(tf.nn.conv1d(inputlayer, W, stride, padding='SAME'), b)
        mask = tf.sequence_mask(shape_list, tf.shape(conv)[1])  # make mask with batch x frame size
        mask = tf.where(mask, tf.ones_like(mask, dtype=tf.float32), tf.zeros_like(mask, dtype=tf.float32))
        mask = tf.tile(mask, tf.stack([tf.shape(conv)[2], 1]))  # replicate mask with depth size
        mask = tf.reshape(mask, [tf.shape(conv)[2], tf.shape(conv)[0], -1])
        mask = tf.transpose(mask, [1, 2, 0])
        print(mask)
        conv = tf.multiply(conv, mask)  # zero out frames beyond each sequence's true length
        return conv






    def batch_norm_wrapper_1dcnn(self, inputs, is_training, name, shape_list, is_batchnorm, decay=0.999):
        if is_batchnorm:
            shape_list = tf.cast(shape_list, tf.float32)
            epsilon = 1e-3
            scale = tf.get_variable(name+'scale', dtype=tf.float32, initializer=tf.ones([inputs.get_shape()[-1]]))
            beta = tf.get_variable(name+'beta', dtype=tf.float32, initializer=tf.zeros([inputs.get_shape()[-1]]))
            pop_mean = tf.get_variable(name+'pop_mean', dtype=tf.float32, initializer=tf.zeros([inputs.get_shape()[-1]]), trainable=False)
            pop_var = tf.get_variable(name+'pop_var', dtype=tf.float32, initializer=tf.ones([inputs.get_shape()[-1]]), trainable=False)
            if is_training:
                # batch_mean, batch_var = tf.nn.moments(inputs,[0,1])
                batch_mean = tf.reduce_sum(inputs, [0, 1])/tf.reduce_sum(shape_list)  # for variable length input
                batch_var = tf.reduce_sum(tf.square(inputs-batch_mean), [0, 1])/tf.reduce_sum(shape_list)  # for variable length input
                train_mean = tf.assign(pop_mean, pop_mean * decay + batch_mean * (1 - decay))
                train_var = tf.assign(pop_var, pop_var * decay + batch_var * (1 - decay))
                with tf.control_dependencies([train_mean, train_var]):
                    return tf.nn.batch_normalization(inputs, batch_mean, batch_var, beta, scale, epsilon)
            else:
                return tf.nn.batch_normalization(inputs, pop_mean, pop_var, beta, scale, epsilon)
        else:
            return inputs




    def batch_norm_wrapper_fc(self, inputs, is_training, name, is_batchnorm, decay=0.999):
        if is_batchnorm:
            epsilon = 1e-3
            scale = tf.get_variable(name+'scale', dtype=tf.float32, initializer=tf.ones([inputs.get_shape()[-1]]))
            beta = tf.get_variable(name+'beta', dtype=tf.float32, initializer=tf.zeros([inputs.get_shape()[-1]]))
            pop_mean = tf.get_variable(name+'pop_mean', dtype=tf.float32, initializer=tf.zeros([inputs.get_shape()[-1]]), trainable=False)
            pop_var = tf.get_variable(name+'pop_var', dtype=tf.float32, initializer=tf.ones([inputs.get_shape()[-1]]), trainable=False)
            if is_training:
                batch_mean, batch_var = tf.nn.moments(inputs, [0])
                train_mean = tf.assign(pop_mean, pop_mean * decay + batch_mean * (1 - decay))
                train_var = tf.assign(pop_var, pop_var * decay + batch_var * (1 - decay))
                with tf.control_dependencies([train_mean, train_var]):
                    return tf.nn.batch_normalization(inputs, batch_mean, batch_var, beta, scale, epsilon)
            else:
                return tf.nn.batch_normalization(inputs, pop_mean, pop_var, beta, scale, epsilon)
        else:
            return inputs
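
For reference, a minimal sketch of how this new nn class might be instantiated in a TF 1.x graph for frame-level embedding extraction; the placeholder names, feature dimension, and class count below are illustrative assumptions, not values taken from the repository scripts:

    # Illustrative usage sketch (assumptions: 40-dim input features, 4 target classes)
    import tensorflow as tf
    from models.e2e_model_frame import nn

    input_dim = 40
    x1 = tf.placeholder(tf.float32, [None, None, input_dim])  # batch x frames x features
    y_ = tf.placeholder(tf.int64, [None])                     # integer class labels
    y_string = tf.placeholder(tf.string, [None])              # label strings
    shapes = tf.placeholder(tf.int32, [None, 1])              # true frame count per utterance
    model = nn(x1, y_, y_string, shapes, softmax_num=4,
               is_training=False, input_dim=input_dim, is_batchnorm=True)
    # model.res1: frame-level conv4 activations of the first utterance (the embedding)
    # model.o1: pre-softmax logits; model.loss: softmax cross-entropy loss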


6 changes: 4 additions & 2 deletions run.sh
@@ -39,8 +39,10 @@ python ./scripts/train_e2e_custom.py e2e_model 0.001 40 False aug_logmel_fft400_
python ./scripts/train_e2e_custom.py e2e_model_exshort200_long 0.001 40 False aug_logmel_fft400_hop160_vad_cmvn_exshort200 4000000 4




## Extract framelevel embedding
python ./scripts/extract_framelevel_embeddings.py --wavlist data/test.txt --outputlayer
python ./scripts/extract_framelevel_embeddings.py --wavlist data/dev.txt --outputlayer
python ./scripts/extract_framelevel_embeddings.py --wavlist data/train.txt --outputlayer



67 changes: 67 additions & 0 deletions srun.sh
@@ -0,0 +1,67 @@
#!/bin/bash

# Extract MFCC features for NN input and save in tfrecords format
#./scripts/prepare_data.py mfcc 400 160 True m 0
#./scripts/prepare_data.py melspec 400 160 True m 0
#./scripts/prepare_data.py spec 400 160 True m 0
#./scripts/prepare_data.py melspec 400 160 True m 200
#./scripts/prepare_data.py mfcc 400 160 True m 200
#./scripts/prepare_data.py spec 400 160 True m 200

# Training NN with nn_model.py definition with original dataset
srun --partition=titanx --gres=gpu:1 --cpus-per-task=2 --mem=16GB --output=./log/new_nn_model_mfcc_fft400_hop160_vad_cmn.out ./scripts/train_e2e.py new_nn_model 0.001 40 False mfcc_fft400_hop160_vad_cmn &
srun --partition=titanx --gres=gpu:1 --cpus-per-task=2 --mem=16GB --output=./log/new_nn_model_melspec_fft400_hop160_vad_cmn.out ./scripts/train_e2e.py new_nn_model 0.001 40 False melspec_fft400_hop160_vad_cmn &
srun --partition=titanx --gres=gpu:1 --cpus-per-task=2 --mem=16GB --output=./log/new_nn_model_spec_fft400_hop160_vad_cmn.out ./scripts/train_e2e.py new_nn_model 0.001 40 False spec_fft400_hop160_vad_cmn &
srun --partition=titanx --gres=gpu:1 --cpus-per-task=2 --mem=16GB --output=./log/new_nn_model_exshort200_long_mfcc_fft400_hop160_vad_cmn.out ./scripts/train_e2e.py new_nn_model_exshort200_long 0.001 40 False mfcc_fft400_hop160_vad_cmn_exshort200 &
srun --partition=titanx --gres=gpu:1 --cpus-per-task=2 --mem=16GB --output=./log/new_nn_model_exshort200_long_melspec_fft400_hop160_vad_cmn.out ./scripts/train_e2e.py new_nn_model_exshort200_long 0.001 40 False melspec_fft400_hop160_vad_cmn_exshort200 &
srun --partition=titanx --gres=gpu:1 --cpus-per-task=2 --mem=16GB --output=./log/new_nn_model_exshort200_long_spec_fft400_hop160_vad_cmn.out ./scripts/train_e2e.py new_nn_model_exshort200_long 0.001 40 False spec_fft400_hop160_vad_cmn_exshort200 &


# Dataset augmentation using the SoX command with the corresponding speed (0.9, 1.0, 1.1) and volume (0.125, 1.0, 2.0 in amplitude) factors; an illustrative raw SoX call is shown after this block
#./scripts/augmentation_by_speed_vol.py 0.9 0.125 ./data/wav/
#./scripts/augmentation_by_speed_vol.py 1.0 0.125 ./data/wav/
#./scripts/augmentation_by_speed_vol.py 1.1 0.125 ./data/wav/
#./scripts/augmentation_by_speed_vol.py 0.9 1.0 ./data/wav/
#./scripts/augmentation_by_speed_vol.py 1.0 1.0 ./data/wav/
#./scripts/augmentation_by_speed_vol.py 1.1 1.0 ./data/wav/
#./scripts/augmentation_by_speed_vol.py 0.9 2.0 ./data/wav/
#./scripts/augmentation_by_speed_vol.py 1.0 2.0 ./data/wav/
#./scripts/augmentation_by_speed_vol.py 1.1 2.0 ./data/wav/
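# For reference, the raw SoX invocation these scripts are assumed to wrap looks roughly like
# (illustrative only; the actual paths and effect ordering live inside augmentation_by_speed_vol.py):
#   sox input.wav output.wav speed 0.9 vol 0.125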

# Listing augmented dataset files
#find $(pwd)/data/wav/train/ -name '*.wav' >./data/train_speed_vol.txt
#find $(pwd)/data/wav/dev/ -name '*.wav' >./data/dev_speed_vol.txt
#find $(pwd)/data/wav/test/ -name '*.wav' >./data/test_speed_vol.txt
#find $(pwd)/data/wav/train/ -name '*v1.0*.wav' >./data/train_speed.txt &
#find $(pwd)/data/wav/dev/ -name '*v1.0*.wav' >./data/dev_speed.txt &
#find $(pwd)/data/wav/train/ -name 's1.0*.wav' >./data/train_vol.txt &
#find $(pwd)/data/wav/dev/ -name 's1.0*.wav' >./data/dev_vol.txt &

# Extracting MFCC feature of augmented dataset
srun --partition=630 --cpus-per-task=2 --mem=20GB --out=./log/aug_mfcc_fft400_hop160_vad_cmn ./scripts/prepare_augmented_data.py mfcc 400 160 True m 0 &
srun --partition=630 --cpus-per-task=2 --mem=20GB --out=./log/aug_mfcc_fft400_hop160_vad_cmn ./scripts/prepare_augmented_data.py mfcc 400 160 True m 200 &
srun --partition=630 --cpus-per-task=2 --mem=20GB --out=./log/aug_mfcc_fft400_hop160_vad_cmn ./scripts/prepare_augmented_data_vol.py mfcc 400 160 True m 0 &
srun --partition=630 --cpus-per-task=2 --mem=20GB --out=./log/aug_mfcc_fft400_hop160_vad_cmn ./scripts/prepare_augmented_data_vol.py mfcc 400 160 True m 200 &
srun --partition=630 --cpus-per-task=2 --mem=20GB --out=./log/aug_mfcc_fft400_hop160_vad_cmn ./scripts/prepare_augmented_data_speed.py mfcc 400 160 True m 0 &
srun --partition=630 --cpus-per-task=2 --mem=20GB --out=./log/aug_mfcc_fft400_hop160_vad_cmn ./scripts/prepare_augmented_data_speed.py mfcc 400 160 True m 200 &


# Training NN with nn_model.py definition with augmented dataset
srun --partition=titanx --gres=gpu:1 --cpus-per-task=2 --mem=16GB --output=./log/new_nn_model_aug_mfcc_fft400_hop160_vad_cmn.out ./scripts/train_e2e.py new_nn_model 0.001 40 False aug_mfcc_fft400_hop160_vad_cmn &
srun --partition=titanx --gres=gpu:1 --cpus-per-task=2 --mem=16GB --output=./log/new_nn_model_aug_mfcc_fft400_hop160_vad_cmn_exshort.out ./scripts/train_e2e.py new_nn_model_exshort200_long 0.001 40 False aug_mfcc_fft400_hop160_vad_cmn_exshort200 &

srun --partition=titanx --gres=gpu:1 --cpus-per-task=2 --mem=16GB --output=./log/new_nn_model_aug_vol_mfcc_fft400_hop160_vad_cmn.out ./scripts/train_e2e.py new_nn_model 0.001 40 False aug_vol_mfcc_fft400_hop160_vad_cmn &
srun --partition=titanx --gres=gpu:1 --cpus-per-task=2 --mem=16GB --output=./log/new_nn_model_aug_vol_mfcc_fft400_hop160_vad_cmn_exshort.out ./scripts/train_e2e.py new_nn_model_exshort200_long 0.001 40 False aug_vol_mfcc_fft400_hop160_vad_cmn_exshort200 &

srun --partition=titanx --gres=gpu:1 --cpus-per-task=2 --mem=16GB --output=./log/new_nn_model_aug_speed_mfcc_fft400_hop160_vad_cmn.out ./scripts/train_e2e.py new_nn_model 0.001 40 False aug_speed_mfcc_fft400_hop160_vad_cmn &
srun --partition=titanx --gres=gpu:1 --cpus-per-task=2 --mem=16GB --output=./log/new_nn_model_aug_speed_mfcc_fft400_hop160_vad_cmn_exshort.out ./scripts/train_e2e.py new_nn_model_exshort200_long 0.001 40 False aug_speed_mfcc_fft400_hop160_vad_cmn_exshort200 &





## Extract framelevel embedding
srun -p sm --gres=gpu:1 --output=log/test.log python ./scripts/extract_framelevel_embeddings.py --wavlist data/test.txt --outputlayer &
srun -p sm --gres=gpu:1 --output=log/dev.log python ./scripts/extract_framelevel_embeddings.py --wavlist data/dev.txt --outputlayer &
srun -p sm --gres=gpu:1 --output=log/train.log python ./scripts/extract_framelevel_embeddings.py --wavlist data/train.txt --outputlayer &
