model.py

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import tensorflow as tf
import keras.backend as K
from keras.models import Model
from keras.layers import Input
from keras.layers import TimeDistributed
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Bidirectional
from keras.layers import Lambda
from keras.layers import Dropout
from keras.regularizers import l2
from keras.initializers import random_normal
from keras.utils.conv_utils import conv_output_length
from keras.layers import GaussianNoise

'''
This file builds the models


'''

import numpy as np

from keras import backend as K
from keras.models import Model, Sequential
from keras.layers.recurrent import SimpleRNN
from keras.layers import Dense, Activation, Bidirectional, Reshape,Flatten, Lambda, Input,\
    Masking, Convolution1D, BatchNormalization, GRU, Conv1D, RepeatVector, Conv2D
from keras.optimizers import SGD, adam
from keras.layers import ZeroPadding1D, Convolution1D, ZeroPadding2D, Convolution2D, MaxPooling2D, GlobalMaxPooling2D
from keras.layers import TimeDistributed, Dropout
from keras.layers.merge import add  # , # concatenate BAD FOR COREML
from keras.utils.conv_utils import conv_output_length
from keras.activations import relu

import tensorflow as tf

def selu(x):
    # from Keras 2.0.6 - does not exist in 2.0.4
    """Scaled Exponential Linear Unit. (Klambauer et al., 2017)
    # Arguments
       x: A tensor or variable to compute the activation function for.
    # References
       - [Self-Normalizing Neural Networks](https://arxiv.org/abs/1706.02515)
    """
    alpha = 1.6732632423543772848170429916717
    scale = 1.0507009873554804934193349852946
    return scale * K.elu(x, alpha)

def clipped_relu(x):
    return relu(x, max_value=20)

# Define CTC loss
def ctc_lambda_func(args):
    y_pred, labels, input_length, label_length = args

    # hack for load_model
    import tensorflow as tf

    ''' from TF: Input requirements
    1. sequence_length(b) <= time for all b
    2. max(labels.indices(labels.indices[:, 1] == b, 2)) <= sequence_length(b) for all b.
    '''

    # print("CTC lambda inputs / shape")
    # print("y_pred:",y_pred.shape)  # (?, 778, 30)
    # print("labels:",labels.shape)  # (?, 80)
    # print("input_length:",input_length.shape)  # (?, 1)
    # print("label_length:",label_length.shape)  # (?, 1)


    return K.ctc_batch_cost(labels, y_pred, input_length, label_length)


def ctc(y_true, y_pred):
    return y_pred

######################################

######################################

def ds1_dropout(input_dim=26, fc_size=2048, rnn_size=512, dropout=[0.1, 0.1, 0.1], output_dim=29):
    """ DeepSpeech 1 Implementation with Dropout

    Architecture:
        Input MFCC TIMEx26
        3 Fully Connected using Clipped Relu activation function
        3 Dropout layers between each FC
        1 BiDirectional LSTM
        1 Dropout applied to BLSTM
        1 Dropout applied to FC dense
        1 Fully connected Softmax

    Details:
        - Uses MFCC's rather paper's 80 linear spaced log filterbanks
        - Uses LSTM's rather than SimpleRNN
        - No translation of raw audio by 5ms
        - No stride the RNN

    Reference:
        https://arxiv.org/abs/1412.5567
    """
    from keras.utils.generic_utils import get_custom_objects
    get_custom_objects().update({"clipped_relu": clipped_relu})
    K.set_learning_phase(1)

    # Creates a tensor there are usually 26 MFCC
    input_data = Input(name='the_input', shape=(None, input_dim))  # >>(?, max_batch_seq, 26)

    # First 3 FC layers
    init = random_normal(stddev=0.046875)
    x = TimeDistributed(Dense(fc_size, name='fc1', kernel_initializer=init, bias_initializer=init, activation=clipped_relu))(input_data)  # >>(?, 778, 2048)
    x = TimeDistributed(Dropout(dropout[0]))(x)
    x = TimeDistributed(Dense(fc_size, name='fc2', kernel_initializer=init, bias_initializer=init, activation=clipped_relu))(x)  # >>(?, 778, 2048)
    x = TimeDistributed(Dropout(dropout[0]))(x)
    x = TimeDistributed(Dense(fc_size, name='fc3', kernel_initializer=init, bias_initializer=init, activation=clipped_relu))(x)  # >>(?, 778, 2048)
    x = TimeDistributed(Dropout(dropout[0]))(x)

    # Layer 4 BiDirectional RNN
    x = Bidirectional(LSTM(rnn_size, return_sequences=True, activation=clipped_relu, dropout=dropout[1],
                                kernel_initializer='he_normal', name='birnn'), merge_mode='sum')(x)


    # Layer 5+6 Time Dist Dense Layer & Softmax
    # x = TimeDistributed(Dense(fc_size, activation=clipped_relu, kernel_initializer=init, bias_initializer=init))(x)
    x = TimeDistributed(Dropout(dropout[2]))(x)
    y_pred = TimeDistributed(Dense(output_dim, name="y_pred", kernel_initializer=init, bias_initializer=init, activation="softmax"), name="out")(x)

    # Change shape
    labels = Input(name='the_labels', shape=[None,], dtype='int32')
    input_length = Input(name='input_length', shape=[1], dtype='int32')
    label_length = Input(name='label_length', shape=[1], dtype='int32')

    # Keras doesn't currently support loss funcs with extra parameters
    # so CTC loss is implemented in a lambda layer
    loss_out = Lambda(ctc_lambda_func, output_shape=(1,), name='ctc')([y_pred,
                                                                       labels,
                                                                       input_length,
                                                                       label_length])

    model = Model(inputs=[input_data, labels, input_length, label_length], outputs=loss_out)

    return model


def ds1(input_dim=26, fc_size=1024, rnn_size=1024, output_dim=29):
    """ DeepSpeech 1 Implementation without dropout

    Architecture:
        Input MFCC TIMEx26
        3 Fully Connected using Clipped Relu activation function
        1 BiDirectional LSTM
        1 Fully connected Softmax

    Details:
        - Removed Dropout on this implementation
        - Uses MFCC's rather paper's 80 linear spaced log filterbanks
        - Uses LSTM's rather than SimpleRNN
        - No translation of raw audio by 5ms
        - No stride the RNN

    References:
        https://arxiv.org/abs/1412.5567
    """
    # hack to get clipped_relu to work on bidir layer
    from keras.utils.generic_utils import get_custom_objects
    get_custom_objects().update({"clipped_relu": clipped_relu})


    input_data = Input(name='the_input', shape=(None, input_dim))  # >>(?, 778, 26)

    init = random_normal(stddev=0.046875)

    # First 3 FC layers
    x = TimeDistributed(Dense(fc_size, name='fc1', kernel_initializer=init, bias_initializer=init, activation=clipped_relu))(input_data)  # >>(?, 778, 2048)
    x = TimeDistributed(Dense(fc_size, name='fc2', kernel_initializer=init, bias_initializer=init, activation=clipped_relu))(x)  # >>(?, 778, 2048)
    x = TimeDistributed(Dense(fc_size, name='fc3', kernel_initializer=init, bias_initializer=init, activation=clipped_relu))(x)  # >>(?, 778, 2048)


    # # Layer 4 BiDirectional RNN - note coreml only supports LSTM BIDIR
    x = Bidirectional(LSTM(rnn_size, return_sequences=True, activation=clipped_relu,
                                kernel_initializer='glorot_uniform', name='birnn'), merge_mode='sum')(x)  #

    # Layer 5+6 Time Dist Layer & Softmax

    # x = TimeDistributed(Dense(fc_size, activation=clipped_relu))(x)
    y_pred = TimeDistributed(Dense(output_dim, name="y_pred", kernel_initializer=init, bias_initializer=init, activation="softmax"), name="out")(x)
    #y_pred = Dense(output_dim, name="y_pred", kernel_initializer=init, bias_initializer=init, activation="softmax")(x)

    # Input of labels and other CTC requirements
    labels = Input(name='the_labels', shape=[None,], dtype='int32')
    input_length = Input(name='input_length', shape=[1], dtype='int32')
    label_length = Input(name='label_length', shape=[1], dtype='int32')

    # Keras doesn't currently support loss funcs with extra parameters
    # so CTC loss is implemented in a lambda layer
    loss_out = Lambda(ctc_lambda_func, output_shape=(1,), name='ctc')([y_pred,
                                                                       labels,
                                                                       input_length,
                                                                       label_length])


    model = Model(inputs=[input_data, labels, input_length, label_length], outputs=[loss_out])

    return model


def ds2_gru_model(input_dim=161, fc_size=1024, rnn_size=512, output_dim=29, initialization='glorot_uniform',
                  conv_layers=1, gru_layers=1, use_conv=True):
    """ DeepSpeech 2 implementation

    Architecture:
        Input Spectrogram TIMEx161
        1 Batch Normalisation layer on input
        1-3 Convolutional Layers
        1 Batch Normalisation layer
        1-7 BiDirectional GRU Layers
        1 Batch Normalisation layer
        1 Fully connected Dense
        1 Softmax output

    Details:
       - Uses Spectrogram as input rather than MFCC
       - Did not use BN on the first input
       - Network does not dynamically adapt to maximum audio size in the first convolutional layer. Max conv
          length padded at 2048 chars, otherwise use_conv=False

    Reference:
        https://arxiv.org/abs/1512.02595
    """

    K.set_learning_phase(1)

    input_data = Input(shape=(None, input_dim), name='the_input')
    x = BatchNormalization(axis=-1, momentum=0.99, epsilon=1e-3, center=True, scale=True)(input_data)

    if use_conv:
        conv = ZeroPadding1D(padding=(0, 2048))(x)
        for l in range(conv_layers):
            x = Conv1D(filters=fc_size, name='conv_{}'.format(l+1), kernel_size=11, padding='valid', activation='relu', strides=2)(conv)
    else:
        for l in range(conv_layers):
            x = TimeDistributed(Dense(fc_size, name='fc_{}'.format(l + 1), activation='relu'))(x)  # >>(?, time, fc_size)

    x = BatchNormalization(axis=-1, momentum=0.99, epsilon=1e-3, center=True, scale=True)(x)

    for l in range(gru_layers):
        x = Bidirectional(GRU(rnn_size, name='fc_{}'.format(l + 1), return_sequences=True, activation='relu', kernel_initializer=initialization),
                      merge_mode='sum')(x)

    x = BatchNormalization(axis=-1, momentum=0.99, epsilon=1e-3, center=True, scale=True)(x)

    # Last Layer 5+6 Time Dist Dense Layer & Softmax
    x = TimeDistributed(Dense(fc_size, activation=clipped_relu))(x)
    y_pred = TimeDistributed(Dense(output_dim, name="y_pred", activation="softmax"))(x)

    # labels = K.placeholder(name='the_labels', ndim=1, dtype='int32')
    labels = Input(name='the_labels', shape=[None,], dtype='int32')
    input_length = Input(name='input_length', shape=[1], dtype='int32')
    label_length = Input(name='label_length', shape=[1], dtype='int32')

    # Keras doesn't currently support loss funcs with extra parameters
    # so CTC loss is implemented in a lambda layer
    loss_out = Lambda(ctc_lambda_func, output_shape=(1,), name='ctc')([y_pred,
                                                                       labels,
                                                                       input_length,
                                                                       label_length])

    model = Model(inputs=[input_data, labels, input_length, label_length], outputs=loss_out)

    return model


def ownModel(input_dim=26, fc_size=512, rnn_size=512, dropout=[0.1, 0.1, 0.1], output_dim=29):
    """ Own model BN+SELU-FC+GRU+BN+DR

    Architecture:
        Batch Normalisation layer on the input data
        1 Fully connected layer of fc_size with SELU
        2 Fully connected layer of fc_size with Clipped Relu
        3 Dropout layers applied between the FC layers
        Batch Normalisation layer on the final FC output
        1 BiDirectional GRU layer with Clipped Relu
        1 Fully connected layer of fc_size with SELU
        1 Dropout layer
        1 Softmax out


    """
    from keras.utils.generic_utils import get_custom_objects
    get_custom_objects().update({"clipped_relu": clipped_relu})
    get_custom_objects().update({"selu": selu})
    K.set_learning_phase(1)

    # Creates a tensor there are usually 26 MFCC
    input_data = Input(name='the_input', shape=(None, input_dim))  # >>(?, max_batch_seq, 26)

    x = BatchNormalization(axis=-1, momentum=0.99,epsilon=1e-3,center=True,scale=True)(input_data)

    # First 3 FC layers
    init = random_normal(stddev=0.046875)
    x = TimeDistributed(Dense(fc_size, name='fc1', kernel_initializer=init, bias_initializer=init, activation=selu))(x)  # >>(?, 778, 2048)
    x = TimeDistributed(Dropout(dropout[0]))(x)
    x = TimeDistributed(Dense(fc_size, name='fc2', kernel_initializer=init, bias_initializer=init, activation=clipped_relu))(x)  # >>(?, 778, 2048)
    x = TimeDistributed(Dropout(dropout[0]))(x)
    x = TimeDistributed(Dense(fc_size, name='fc3', kernel_initializer=init, bias_initializer=init, activation=clipped_relu))(x)  # >>(?, 778, 2048)
    x = TimeDistributed(Dropout(dropout[0]))(x)
    x = BatchNormalization(axis=-1, momentum=0.99, epsilon=1e-3, center=True, scale=True)(x)

    # Layer 4 BiDirectional RNN
    x = Bidirectional(GRU(rnn_size, return_sequences=True, activation=clipped_relu, dropout=dropout[1],
                                kernel_initializer='he_normal', name='birnn'), merge_mode='sum')(x)

    # Layer 5+6 Time Dist Dense Layer & Softmax
    x = TimeDistributed(Dense(fc_size, activation=selu, kernel_initializer=init, bias_initializer=init))(x)
    x = TimeDistributed(Dropout(dropout[2]))(x)
    y_pred = TimeDistributed(Dense(output_dim, name="y_pred", kernel_initializer=init, bias_initializer=init, activation="softmax"), name="out")(x)

    # Change shape
    labels = Input(name='the_labels', shape=[None,], dtype='int32')
    input_length = Input(name='input_length', shape=[1], dtype='int32')
    label_length = Input(name='label_length', shape=[1], dtype='int32')

    # Keras doesn't currently support loss funcs with extra parameters
    # so CTC loss is implemented in a lambda layer
    loss_out = Lambda(ctc_lambda_func, output_shape=(1,), name='ctc')([y_pred,
                                                                       labels,
                                                                       input_length,
                                                                       label_length])

    model = Model(inputs=[input_data, labels, input_length, label_length], outputs=loss_out)

    return model


def graves(input_dim=26, rnn_size=512, output_dim=29, std=0.6):
    """ Implementation of Graves 2006 model

    Architecture:
        Gaussian Noise on input
        BiDirectional LSTM

    Reference:
        ftp://ftp.idsia.ch/pub/juergen/icml2006.pdf
    """

    K.set_learning_phase(1)
    input_data = Input(name='the_input', shape=(None, input_dim))
    # x = BatchNormalization(axis=-1)(input_data)

    x = GaussianNoise(std)(input_data)
    x = Bidirectional(LSTM(rnn_size,
                      return_sequences=True,
                      implementation=0))(x)
    y_pred = TimeDistributed(Dense(output_dim, activation='softmax'))(x)

    # Input of labels and other CTC requirements
    labels = Input(name='the_labels', shape=[None,], dtype='int32')
    input_length = Input(name='input_length', shape=[1], dtype='int32')
    label_length = Input(name='label_length', shape=[1], dtype='int32')

    # Keras doesn't currently support loss funcs with extra parameters
    # so CTC loss is implemented in a lambda layer
    loss_out = Lambda(ctc_lambda_func, output_shape=(1,), name='ctc')([y_pred,
                                                                       labels,
                                                                       input_length,
                                                                       label_length])


    model = Model(inputs=[input_data, labels, input_length, label_length], outputs=[loss_out])

    return model

def cnn_city(input_dim=161, fc_size=1024, rnn_size=512, output_dim=29, initialization='glorot_uniform',
                  conv_layers=4):
    """ Pure CNN implementation

    Architecture:

        1 Convolutional Layers

        1 Fully connected Dense
        1 Softmax output

    Details:s
       - Network does not dynamically adapt to maximum audio size in the first convolutional layer. Max conv
          length padded at 2048 chars, otherwise use_conv=False

    Reference:

    """

    #filters = outputsize
    #kernal_size = heigth and width of conv window
    #strides = stepsize on conv window

    kernel_size = 11  #
    conv_depth_1 = 64  #
    conv_depth_2 = 256  #

    input_data = Input(shape=(None, input_dim), name='the_input') #batch x time x spectro size
    conv = ZeroPadding1D(padding=(0, 2048))(input_data) #pad on time dimension

    x = Conv1D(filters=128, name='conv_1', kernel_size=kernel_size, padding='valid', activation='relu', strides=2)(conv)
    # x = Conv1D(filters=1024, name='conv_2', kernel_size=kernel_size, padding='valid', activation='relu', strides=2)(x)


    # Last Layer 5+6 Time Dist Dense Layer & Softmax
    x = TimeDistributed(Dense(fc_size, activation='relu'))(x)
    y_pred = TimeDistributed(Dense(output_dim, name="y_pred", activation="softmax"))(x)

    # labels = K.placeholder(name='the_labels', ndim=1, dtype='int32')
    labels = Input(name='the_labels', shape=[None,], dtype='int32')
    input_length = Input(name='input_length', shape=[1], dtype='int32')
    label_length = Input(name='label_length', shape=[1], dtype='int32')

    # Keras doesn't currently support loss funcs with extra parameters
    # so CTC loss is implemented in a lambda layer
    loss_out = Lambda(ctc_lambda_func, output_shape=(1,), name='ctc')([y_pred,
                                                                       labels,
                                                                       input_length,
                                                                       label_length])

    model = Model(inputs=[input_data, labels, input_length, label_length], outputs=loss_out)

    return model


def const(input_dim=26, fc_size=1024, rnn_size=1024, output_dim=29):
    """ Implementation of constrained model for CoreML

    Architecture:
        N number of Fully connected layer of variable FC units
        *optional* GRU RNN of rnn_size

    Details:
        The RNN has been removed in order to allow the network to run in coreml

    """

    #loop FC
    input_data = Input(name='the_input', shape=(None, input_dim))  # >>(?, time, input_dim)
    x = input_data
    init = random_normal(stddev=0.046875)

    layercount = 3
    for l in range(layercount):
        x = TimeDistributed(Dense(fc_size, name='fc_{}'.format(l+1), kernel_initializer=init,
                                  bias_initializer=init, activation='relu'))(x)  # >>(?, time, fc_size)

    # x = GRU(rnn_size, return_sequences=True, activation='relu', name='rnn1')(x)  # >> (?, time, rnn_size)

    y_pred = TimeDistributed(Dense(output_dim, name="y_pred", activation="softmax"))(x)  # >> (?,time,output_dim)


    # Input of labels and other CTC requirements
    labels = Input(name='the_labels', shape=[None,], dtype='int32')
    input_length = Input(name='input_length', shape=[1], dtype='int32')
    label_length = Input(name='label_length', shape=[1], dtype='int32')

    # Keras doesn't currently support loss funcs with extra parameters
    # so CTC loss is implemented in a lambda layer
    loss_out = Lambda(ctc_lambda_func, output_shape=(1,), name='ctc')([y_pred,
                                                                       labels,
                                                                       input_length,
                                                                       label_length])


    model = Model(inputs=[input_data, labels, input_length, label_length], outputs=[loss_out])

    return model


###########################
# TRANSFER MODEL WEIGHTS

def build_const_no_ctc_and_xfer_weights(loaded_model, input_dim=26, fc_size=1024, rnn_size=512,
                         output_dim=29):
    '''
    CONST model but convert into CoreML
    '''


    K.set_learning_phase(0)

    for ind, i in enumerate(loaded_model.layers):
        print(ind, i)

    input_data = Input(name='the_input', shape=(None, input_dim))  # >>(?, 778, 26)

    x = input_data

    layercount = 3
    for l in range(layercount):
        x = TimeDistributed(Dense(fc_size, name='fc_{}'.format(l+1), weights=loaded_model.layers[l+1].get_weights(),
                                  activation='relu'))(x)  # >>(?, time, fc_size)
    # x = GRU(rnn_size, return_sequences=True, activation='relu', name='rnn1',
    #         weights=loaded_model.layers[4].get_weights())(x)  # >> (?, time, rnn_size)

    y_pred = TimeDistributed(Dense(output_dim, name="y_pred", activation="softmax",
                                   weights=loaded_model.layers[5].get_weights()))(x)  # >> (?,time,output_dim)


    # First 3 FC layers
    # x = TimeDistributed(Dense(fc_size, name='fc1', activation='relu',
    #                           weights=loaded_model.layers[1].get_weights()))(input_data)
    #
    # x = TimeDistributed(Dense(fc_size, name='fc2', activation='relu',
    #                           weights=loaded_model.layers[2].get_weights()))(x)  # >>(?, 778, 2048)
    #
    # x = TimeDistributed(Dense(fc_size, name='fc3', activation='relu',
    #                           weights=loaded_model.layers[3].get_weights()))(x)  # >>(?, 778, 2048)
    # conv = ZeroPadding1D(padding=(1, 1000))(input_data)
    # conv = Conv1D(filters=2, kernel_size=10, padding='valid', activation='relu',
    #                                weights=loaded_model.layers[2].get_weights(), strides=2)(conv)


    # Layer 4 RNN
    # rnn_1 = GRU(rnn_size, return_sequences=True, activation='relu', name='rnn1',
    #                 weights=loaded_model.layers[3].get_weights())(conv)
    # # rnn_2 = GRU(rnn_size, return_sequences=True, activation='relu', name='rnn2',
    # #             weights=loaded_model.layers[5].get_weights())(rnn_1)
    #
    # x = Dense(fc_size, activation='relu',
    #           weights=loaded_model.layers[5].get_weights())(rnn_1)
    # x = Bidirectional(LSTM(rnn_size, return_sequences=True, activation='relu'),
    #                   weights=loaded_model.layers[3].get_weights(),
    #                   merge_mode='sum')(conv)

    # conv = ZeroPadding1D(padding=(1, 500))(rnn_1)
    # conv = Convolution1D(1, 2, padding='valid',
    #                      weights=loaded_model.layers[6].get_weights())(conv)

    # y_pred = GRU(output_dim, return_sequences=True, activation='softmax', kernel_initializer='glorot_uniform', name='y_pred',
    #             weights=loaded_model.layers[4].get_weights())(x)

    # x = TimeDistributed(Dense(fc_size, activation='relu',
    #                 weights=loaded_model.layers[5].get_weights()))(rnn_1)
    # y_pred = TimeDistributed(Dense(output_dim, name="y_pred", activation="softmax",
    #                 weights=loaded_model.layers[6].get_weights()), name="out")(x)
    # y_pred = Dense(output_dim, name="y_pred", activation="softmax",
    #                weights=loaded_model.layers[6].get_weights())(x)


    # Layer 5+6 Time Dist Layer & Softmax
    # y_pred = Dense(output_dim, name="y_pred", activation="softmax", weights=loaded_model.layers[5].get_weights())(rnn_1)

    # y_pred = TimeDistributed(Dense(output_dim, name="y_pred", activation='softmax',
    #                                weights=loaded_model.layers[4].get_weights()))(x)

    model = Model(inputs=input_data, outputs=y_pred)
    return model


def build_ds0_no_ctc_and_xfer_weights(loaded_model, input_dim=26, fc_size=1024, rnn_size=512,
                         dropout=[0, 0, 0],
                         output_dim=29):
    '''
    DS1 model but convert into CoreML
    '''

    from keras.utils.generic_utils import get_custom_objects
    get_custom_objects().update({"clipped_relu": clipped_relu})


    K.set_learning_phase(0)

    for ind, i in enumerate(loaded_model.layers):
        print(ind, i)

    input_data = Input(name='the_input', shape=(None, input_dim))  # >>(?, 778, 26)

    # First 3 FC layers
    x = TimeDistributed(Dense(fc_size, name='fc1', activation='relu',
                              weights=loaded_model.layers[1].get_weights()))(input_data)
    # x = TimeDistributed(Dropout(dropout[0]))(x) #2

    x = TimeDistributed(Dense(fc_size, name='fc2', activation='relu',
                              weights=loaded_model.layers[3].get_weights()))(x)  # >>(?, 778, 2048)
    # x = TimeDistributed(Dropout(dropout[0]))(x) #4

    x = TimeDistributed(Dense(fc_size, name='fc3', activation='relu',
                              weights=loaded_model.layers[5].get_weights()))(x)  # >>(?, 778, 2048)
    # x = TimeDistributed(Dropout(dropout[0]))(x) #6

    # x = Dense(fc_size, name='fc1', activation='relu',
    #                           weights=loaded_model.layers[1].get_weights())(input_data)  # >>(?, 778, 2048)
    # x = Dense(fc_size, name='fc2', activation='relu',
    #                           weights=loaded_model.layers[2].get_weights())(x)  # >>(?, 778, 2048)
    # x = Dense(fc_size, name='fc3', activation='relu',
    #                           weights=loaded_model.layers[3].get_weights())(x)  # >>(?, 778, 2048)


    # Layer 4 BiDirectional RNN - note coreml only supports LSTM BIDIR
    x = Bidirectional(LSTM(rnn_size, return_sequences=True, activation='relu',
                                kernel_initializer='he_normal'),
                      weights=loaded_model.layers[7].get_weights(),
                                merge_mode='sum')(x)


    x = TimeDistributed(Dense(fc_size, activation='relu',
                    weights=loaded_model.layers[8].get_weights()))(x)
    y_pred = TimeDistributed(Dense(output_dim, name="y_pred", activation="softmax", weights=loaded_model.layers[10].get_weights()), name="out")(x)

    # Layer 5+6 Time Dist Layer & Softmax
    #y_pred = Dense(num_classes, name="y_pred", activation="softmax", weights=loaded_model.layers[4].get_weights())(x)

    model = Model(inputs=input_data, outputs=y_pred)
    return model


def build_ds5_no_ctc_and_xfer_weights(loaded_model, input_dim=161, fc_size=1024, rnn_size=512, output_dim=29, initialization='glorot_uniform',
                  conv_layers=4):
    """ Pure CNN implementation"""


    K.set_learning_phase(0)
    for ind, i in enumerate(loaded_model.layers):
        print(ind, i)

    kernel_size = 11  #
    conv_depth_1 = 64  #
    conv_depth_2 = 256  #

    input_data = Input(shape=(None, input_dim), name='the_input') #batch x time x spectro size
    conv = ZeroPadding1D(padding=(0, 2048))(input_data) #pad on time dimension

    x = Conv1D(filters=128, name='conv_1', kernel_size=kernel_size, padding='valid', activation='relu', strides=2,
            weights = loaded_model.layers[2].get_weights())(conv)
    # x = Conv1D(filters=1024, name='conv_2', kernel_size=kernel_size, padding='valid', activation='relu', strides=2,
    #            weights=loaded_model.layers[3].get_weights())(x)


    # Last Layer 5+6 Time Dist Dense Layer & Softmax
    x = TimeDistributed(Dense(fc_size, activation='relu',
                              weights=loaded_model.layers[3].get_weights()))(x)
    y_pred = TimeDistributed(Dense(output_dim, name="y_pred", activation="softmax"))(x)

    model = Model(inputs=input_data, outputs=y_pred)

    return model