diff --git a/dist/whiteboxlayer-0.2.1-py3-none-any.whl b/dist/whiteboxlayer-0.2.1-py3-none-any.whl
new file mode 100644
index 0000000..45e9bb6
Binary files /dev/null and b/dist/whiteboxlayer-0.2.1-py3-none-any.whl differ
diff --git a/setup.py b/setup.py
index 8c0c205..fa14cd9 100644
--- a/setup.py
+++ b/setup.py
@@ -6,7 +6,7 @@ setup(
     name = 'whiteboxlayer',
-    version = '0.2.0',
+    version = '0.2.1',
     description = 'TensorFlow based custom layers',
     author = 'YeongHyeon Park',
     author_email = 'young200405@gmail.com',
diff --git a/whiteboxlayer/extensions/attention.py b/whiteboxlayer/extensions/attention.py
new file mode 100644
index 0000000..cf6c7a8
--- /dev/null
+++ b/whiteboxlayer/extensions/attention.py
@@ -0,0 +1,90 @@
+import numpy as np
+import tensorflow as tf
+
+def embedding(layer, x, dim_model, name='emb', verbose=True):
+
+    emb = layer.fully_connected(x=x, c_out=dim_model, \
+        batch_norm=False, activation=None, name="%s" %(name), verbose=verbose)
+
+    return emb
+
+def feed_forward_network(layer, x, dim_ff, dim_model, name='ffn', verbose=True):
+
+    ff1 = layer.fully_connected(x=x, c_out=dim_ff, \
+        batch_norm=False, activation='relu', name="%s_0" %(name), verbose=verbose)
+    ff2 = layer.fully_connected(x=ff1, c_out=dim_model, \
+        batch_norm=False, activation=None, name="%s_1" %(name), verbose=verbose)
+
+    return ff2
+
+def get_angles(pos, i, dim_model):
+    # https://www.tensorflow.org/text/tutorials/transformer?hl=en
+
+    angle_rates = 1 / np.power(10000, (2 * (i//2)) / np.float32(dim_model))
+
+    return pos * angle_rates
+
+def positional_encoding(position, dim_model):
+    angle_rads = get_angles(np.arange(position)[:, np.newaxis],
+                            np.arange(dim_model)[np.newaxis, :],
+                            dim_model)
+
+    # apply sin to even indices in the array; 2i
+    angle_rads[:, 0::2] = np.sin(angle_rads[:, 0::2])
+
+    # apply cos to odd indices in the array; 2i+1
+    angle_rads[:, 1::2] = np.cos(angle_rads[:, 1::2])
+
+    pos_encoding = angle_rads[np.newaxis, ...]
+
+    return tf.cast(pos_encoding, dtype=tf.float32)
+
+def concat_heads(x, verbose=True):
+    # https://www.tensorflow.org/text/tutorials/transformer?hl=en
+
+    [d_n, d_s, d_h, d_fh] = x.shape
+    xc = tf.reshape(x, (d_n, d_s, -1))
+
+    if(verbose): print("Concat Head", x.shape, "->", xc.shape)
+    return xc
+
+def self_attention(layer, x_query, x_key, x_value, num_head=1, mask_idx=-1, udmask=False, name='enc', verbose=True):
+
+    [_, d_s, d_f] = x_query.shape
+
+    enc_query = layer.fully_connected(x=x_query, c_out=d_f, \
+        batch_norm=False, activation=None, name="%s-query" %(name), verbose=verbose)
+    enc_key = layer.fully_connected(x=x_key, c_out=d_f, \
+        batch_norm=False, activation=None, name="%s-key" %(name), verbose=verbose)
+    enc_value = layer.fully_connected(x=x_value, c_out=d_f, \
+        batch_norm=False, activation=None, name="%s-value" %(name), verbose=verbose)
+
+    sq_dk = tf.math.sqrt(float(d_f))
+    enc_qk = []
+    if(num_head != 1):
+        list_query = tf.split(enc_query, num_or_size_splits=num_head, axis=2)
+        list_key = tf.split(enc_key, num_or_size_splits=num_head, axis=2)
+        list_value = tf.split(enc_value, num_or_size_splits=num_head, axis=2)
+
+        for idx_query, _ in enumerate(list_query):
+            enc_qk.append(tf.matmul(a=list_query[idx_query], b=list_key[idx_query], transpose_a=False, transpose_b=True) / sq_dk)
+
+        enc_qk = tf.stack(enc_qk)
+    else:
+        enc_qk = tf.matmul(a=enc_query, b=enc_key, transpose_a=False, transpose_b=True) / sq_dk
+
+    if(udmask): # upper diagonal masking
+        enc_qk = tf.where(tf.linalg.band_part(enc_qk, -1, 0)==0, -1e+9, enc_qk)
+    enc_smax_qk = tf.nn.softmax(enc_qk, axis=-1)
+
+    if(num_head != 1):
+        enc_qkv = []
+        for idx_value, _ in enumerate(list_value):
+            enc_qkv.append(tf.matmul(enc_smax_qk[idx_value], list_value[idx_value]))
+        enc_qkv = tf.transpose(tf.stack(enc_qkv), [1, 2, 0, 3])
+        enc_qkv = concat_heads(x=enc_qkv, verbose=verbose)
+    else:
+        enc_qkv = tf.matmul(enc_smax_qk, enc_value)
+
+    if(verbose): print("Self-Attn (Head: %d)" %(num_head), x_query.shape, "->", enc_qkv.shape)
+    return {'query':enc_query, 'key':enc_key, 'value':enc_value, 'attention':enc_smax_qk, 'output':enc_qkv}
diff --git a/whiteboxlayer/layers.py b/whiteboxlayer/layers.py
index 250c42a..6466245 100644
--- a/whiteboxlayer/layers.py
+++ b/whiteboxlayer/layers.py
@@ -55,6 +55,12 @@ def activation(self, x, activation=None, name=''):
             return tf.nn.swish(x, name='%s' %(name))
         else: return x
 
+    def dropout(self, x, rate=0.5, name=''):
+
+        y = tf.nn.dropout(x=x, rate=rate, name=name)
+
+        return y
+
     def batch_normalization(self, x, trainable=True, name='', verbose=True):
         # https://arxiv.org/pdf/1502.03167.pdf
 
@@ -79,6 +85,40 @@ def batch_normalization(self, x, trainable=True, name='', verbose=True):
         if(verbose): print("BN (%s)" %(name), x.shape, "->", y.shape)
         return y
 
+    def layer_normalization(self, x, trainable=True, name='', verbose=True):
+
+        len_xdim = len(x.shape)
+        if(len_xdim == 2): x = tf.transpose(x, [1, 0])
+        elif(len_xdim == 3): x = tf.transpose(x, [2, 1, 0])
+        elif(len_xdim == 4): x = tf.transpose(x, [3, 1, 2, 0])
+        elif(len_xdim == 5): x = tf.transpose(x, [4, 1, 2, 3, 0])
+
+        mean, variance = tf.nn.moments(x=x, axes=[0], keepdims=True, name="%s_mmt" %(name))
+
+        c_in = x.get_shape().as_list()[-1]
+        offset = self.get_variable(shape=[c_in], constant=0, \
+            trainable=trainable, name="%s_ofs" %(name))
+        scale = self.get_variable(shape=[c_in], constant=1, \
+            trainable=trainable, name="%s_sce" %(name))
+
+        y = tf.nn.batch_normalization(
+            x=x,
+            mean=mean,
+            variance=variance,
+            offset=offset,
+            scale=scale,
+            variance_epsilon=1e-12,
+            name=name
+        )
+
+        if(len_xdim == 2): y = tf.transpose(y, [1, 0])
+        elif(len_xdim == 3): y = tf.transpose(y, [2, 1, 0])
+        elif(len_xdim == 4): y = tf.transpose(y, [3, 1, 2, 0])
+        elif(len_xdim == 5): y = tf.transpose(y, [4, 1, 2, 3, 0])
+
+        if(verbose): print("LN (%s)" %(name), x.shape, "->", y.shape)
+        return y
+
     def maxpool(self, x, ksize=2, strides=1, \
         padding='SAME', name='', verbose=True):
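
For review convenience, a minimal usage sketch of the new attention extension follows. It is an illustration only, not part of the change set: it assumes whiteboxlayer.layers exposes a layer-container class named Layers with a no-argument constructor (adjust to the actual export), while the fully_connected keyword arguments simply mirror the calls already made inside attention.py.

# usage_sketch.py -- illustrative only; the class name Layers and its constructor are assumptions
import tensorflow as tf
import whiteboxlayer.layers as wbl
import whiteboxlayer.extensions.attention as att

layer = wbl.Layers()                           # assumed no-argument constructor
x = tf.random.normal((2, 16, 64))              # (batch, sequence, feature)

# token embedding plus sinusoidal positional encoding
emb = att.embedding(layer=layer, x=x, dim_model=64, name='emb')
emb = emb + att.positional_encoding(position=16, dim_model=64)

# multi-head self-attention with upper-diagonal masking, then the feed-forward block
attn = att.self_attention(layer=layer, x_query=emb, x_key=emb, x_value=emb, \
    num_head=4, udmask=True, name='enc0')
out = att.feed_forward_network(layer=layer, x=attn['output'], dim_ff=128, dim_model=64, name='ffn0')

print(out.shape)                               # expected: (2, 16, 64)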