Add RMSNorm
zhongkaifu committed Sep 16, 2023
1 parent e972fa7 commit caff84b
Showing 38 changed files with 887 additions and 83 deletions.
5 changes: 4 additions & 1 deletion README.md
@@ -17,6 +17,7 @@ Mixture of Experts network that could easily train huge model with less computin
 Support Automatic Mixed Precesion (FP16)
 Built-in SentencePiece supported
 Rotary Positional Embeddings
+Layer Norm and RMS Norm
 Python package supported
 Tags embeddings mechanism
 Prompted Decoders
@@ -193,7 +194,9 @@ You can also keep all parameters into a json file and run Seq2SeqConsole.exe -Co
 "ShuffleType": "NoPadding",
 "Task": "Train",
 "TooLongSequence": "Ignore",
-"ActivateFunc": "ReLU",
+"ActivateFunc": "LeakyReLU",
+"PEType": "RoPE",
+"NormType": "LayerNorm",
 "LogVerbose": "Normal",
 "TgtLang": "TGT",
 "TrainCorpusPath": ".\\data\\train",
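For context on the new NormType option: RMSNorm keeps LayerNorm's learned gain but drops the mean-centering and the bias term, rescaling by the root mean square alone. The standard definitions from the literature (this excerpt does not show the repository's exact formulation or epsilon placement):

$$\mathrm{LayerNorm}(x)_i = \gamma_i\,\frac{x_i - \mu}{\sqrt{\sigma^2 + \varepsilon}} + \beta_i,\qquad \mu = \frac{1}{d}\sum_{j=1}^{d} x_j,\quad \sigma^2 = \frac{1}{d}\sum_{j=1}^{d}(x_j - \mu)^2$$

$$\mathrm{RMSNorm}(x)_i = \gamma_i\,\frac{x_i}{\mathrm{RMS}(x)},\qquad \mathrm{RMS}(x) = \sqrt{\frac{1}{d}\sum_{j=1}^{d} x_j^2 + \varepsilon}$$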
4 changes: 2 additions & 2 deletions Seq2SeqSharp/Applications/Decoder.cs
@@ -37,14 +37,14 @@ public static MultiProcessorNetworkWrapper<IDecoder> CreateDecoders(IModel model
 decoder = new MultiProcessorNetworkWrapper<IDecoder>(
 new GPTDecoder("GPTDecoder", model.MultiHeadNum, model.HiddenDim, model.IntermediateDim, model.DecoderEmbeddingDim, model.DecoderLayerDepth, options.DropoutRatio, raDeviceIds.GetNextItem(),
 isTrainable: options.IsDecoderTrainable && (options.Task == ModeEnums.Train), learningRateFactor: options.DecoderStartLearningRateFactor, activateFunc: model.ActivateFunc, expertNum: model.ExpertNum,
-expertsPerTokenFactor: model.ExpertsPerTokenFactor, elementType: elementType, peType:model.PEType), raDeviceIds.ToArray());
+expertsPerTokenFactor: model.ExpertsPerTokenFactor, elementType: elementType, peType:model.PEType, normType: model.NormType), raDeviceIds.ToArray());
 }
 else
 {
 decoder = new MultiProcessorNetworkWrapper<IDecoder>(
 new TransformerDecoder("TransformerDecoder", model.MultiHeadNum, model.HiddenDim, model.IntermediateDim, model.DecoderEmbeddingDim, model.DecoderLayerDepth, options.DropoutRatio, raDeviceIds.GetNextItem(),
 isTrainable: options.IsDecoderTrainable && (options.Task == ModeEnums.Train), learningRateFactor: options.DecoderStartLearningRateFactor, activateFunc: model.ActivateFunc, expertNum: model.ExpertNum,
-expertsPerTokenFactor: model.ExpertsPerTokenFactor, elementType: elementType, peType:model.PEType), raDeviceIds.ToArray());
+expertsPerTokenFactor: model.ExpertsPerTokenFactor, elementType: elementType, peType:model.PEType, normType: model.NormType), raDeviceIds.ToArray());
 }

 return decoder;
2 changes: 1 addition & 1 deletion Seq2SeqSharp/Applications/Encoder.cs
@@ -52,7 +52,7 @@ public static MultiProcessorNetworkWrapper<IEncoder> CreateEncoders(IModel model
 encoder = new MultiProcessorNetworkWrapper<IEncoder>(
 new TransformerEncoder("TransformerEncoder", model.MultiHeadNum, model.HiddenDim, model.IntermediateDim, model.EncoderEmbeddingDim, model.EncoderLayerDepth, options.DropoutRatio, raDeviceIds.GetNextItem(),
 isTrainable: options.IsEncoderTrainable, learningRateFactor: options.EncoderStartLearningRateFactor, activateFunc: model.ActivateFunc, expertNum: model.ExpertNum, expertsPerTokenFactor: model.ExpertsPerTokenFactor,
-elementType, peType: model.PEType), raDeviceIds.ToArray());
+elementType, peType: model.PEType, normType: model.NormType), raDeviceIds.ToArray());

 return encoder;
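Both CreateDecoders and CreateEncoders now thread normType: model.NormType down into the network constructors. Inside a layer, the choice presumably reduces to a small branch over the enum; a minimal sketch, assuming an RMSNormalization class that mirrors LayerNormalization's constructor (neither that class nor this helper appears in this excerpt):

// Hypothetical helper, illustrating how a layer could pick its normalization unit.
// RMSNormalization is an assumed counterpart to LayerNormalization.
static INormalization CreateNorm(NormEnums normType, string name, int dim, int deviceId, bool isTrainable)
{
    if (normType == NormEnums.RMSNorm)
    {
        return new RMSNormalization(name, dim, deviceId, isTrainable);
    }
    return new LayerNormalization(name, dim, deviceId, isTrainable);
}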
3 changes: 3 additions & 0 deletions Seq2SeqSharp/Applications/Options.cs
@@ -246,6 +246,9 @@ public class Options
 [Arg("The Positional Embeddings Type. It supports APE, NoPE and RoPE", nameof(PEType))]
 public PositionEmbeddingEnums PEType = PositionEmbeddingEnums.APE;

+[Arg("The type of normalization. It supports LayerNorm and RMSNorm", nameof(NormType))]
+public NormEnums NormType = NormEnums.LayerNorm;
+
 public void ValidateOptions()
 {
 if (AMP == true && ProcessorType != ProcessorTypeEnums.GPU)
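NormEnums itself is not part of this excerpt; from the option description ("It supports LayerNorm and RMSNorm") it is presumably a two-member enum, roughly:

// Assumed shape of NormEnums; inferred from usage, not shown in this diff.
public enum NormEnums
{
    LayerNorm,
    RMSNorm
}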
2 changes: 1 addition & 1 deletion Seq2SeqSharp/Applications/SeqClassification.cs
@@ -83,7 +83,7 @@ private bool CreateTrainableParameters(IModel model)
 (m_positionalEmbeddings, m_segmentEmbedding) = Misc.CreateAuxEmbeddings(raDeviceIds, model.HiddenDim, m_options.MaxSentLength, model, createAPE: (model.PEType == PositionEmbeddingEnums.APE));

 Logger.WriteLine($"Creating embeddings. Shape = '({model.SrcVocab.Count} ,{model.EncoderEmbeddingDim})'");
-m_srcEmbedding = new MultiProcessorNetworkWrapper<IWeightTensor>(new WeightTensor(new long[2] { model.SrcVocab.Count, model.EncoderEmbeddingDim }, raDeviceIds.GetNextItem(), normType: NormType.Uniform, fanOut: true, name: "SrcEmbeddings",
+m_srcEmbedding = new MultiProcessorNetworkWrapper<IWeightTensor>(new WeightTensor(new long[2] { model.SrcVocab.Count, model.EncoderEmbeddingDim }, raDeviceIds.GetNextItem(), initType: RandomInitType.Uniform, fanOut: true, name: "SrcEmbeddings",
 isTrainable: m_options.IsEmbeddingTrainable), DeviceIds);

 return true;
2 changes: 1 addition & 1 deletion Seq2SeqSharp/Applications/SeqLabel.cs
@@ -93,7 +93,7 @@ private bool CreateTrainableParameters(IModel model)
 m_encoder = Encoder.CreateEncoders(model, m_options, raDeviceIds);
 m_ffLayer = new MultiProcessorNetworkWrapper<FeedForwardLayer>(new FeedForwardLayer("FeedForward", model.HiddenDim, model.ClsVocab.Count, dropoutRatio: 0.0f, deviceId: raDeviceIds.GetNextItem(), isTrainable: true), DeviceIds);

-m_srcEmbedding = new MultiProcessorNetworkWrapper<IWeightTensor>(new WeightTensor(new long[2] { model.SrcVocab.Count, model.EncoderEmbeddingDim }, raDeviceIds.GetNextItem(), normType: NormType.Uniform, name: "SrcEmbeddings",
+m_srcEmbedding = new MultiProcessorNetworkWrapper<IWeightTensor>(new WeightTensor(new long[2] { model.SrcVocab.Count, model.EncoderEmbeddingDim }, raDeviceIds.GetNextItem(), initType: RandomInitType.Uniform, name: "SrcEmbeddings",
 isTrainable: true), DeviceIds);
 (m_posEmbedding, m_segmentEmbedding) = Misc.CreateAuxEmbeddings(raDeviceIds, model.HiddenDim, m_options.MaxSentLength, model, createAPE: (model.PEType == PositionEmbeddingEnums.APE));
2 changes: 1 addition & 1 deletion Seq2SeqSharp/Applications/SeqSimilarity.cs
@@ -86,7 +86,7 @@ private bool CreateTrainableParameters(IModel model)
 (m_posEmbedding, m_segmentEmbedding) = Misc.CreateAuxEmbeddings(raDeviceIds, model.HiddenDim, Math.Max(m_options.MaxTrainSentLength, m_options.MaxTestSentLength), model, createAPE: (model.PEType == PositionEmbeddingEnums.APE));

 Logger.WriteLine($"Creating embeddings. Shape = '({model.SrcVocab.Count} ,{model.EncoderEmbeddingDim})'");
-m_srcEmbedding = new MultiProcessorNetworkWrapper<IWeightTensor>(new WeightTensor(new long[2] { model.SrcVocab.Count, model.EncoderEmbeddingDim }, raDeviceIds.GetNextItem(), normType: NormType.Uniform, fanOut: true, name: "SrcEmbeddings",
+m_srcEmbedding = new MultiProcessorNetworkWrapper<IWeightTensor>(new WeightTensor(new long[2] { model.SrcVocab.Count, model.EncoderEmbeddingDim }, raDeviceIds.GetNextItem(), initType: RandomInitType.Uniform, fanOut: true, name: "SrcEmbeddings",
 isTrainable: m_options.IsEmbeddingTrainable), DeviceIds);

 return true;
4 changes: 2 additions & 2 deletions Seq2SeqSharp/Corpus/Seq2SeqCorpus.cs
@@ -18,8 +18,8 @@ namespace Seq2SeqSharp.Corpus
 public class Seq2SeqCorpus : ParallelCorpus<Seq2SeqCorpusBatch>
 {

-public Seq2SeqCorpus(string corpusFilePath, string srcLangName, string tgtLangName, int maxTokenSizePerBatch, int maxSrcSentLength = 32, int maxTgtSentLength = 32, ShuffleEnums shuffleEnums = ShuffleEnums.Random, TooLongSequence tooLongSequence = TooLongSequence.Ignore)
-:base (corpusFilePath, srcLangName, tgtLangName, maxTokenSizePerBatch, maxSrcSentLength, maxTgtSentLength, shuffleEnums: shuffleEnums, tooLongSequence: tooLongSequence)
+public Seq2SeqCorpus(string corpusFilePath, string srcLangName, string tgtLangName, int maxTokenSizePerBatch, int maxSrcSentLength = 32, int maxTgtSentLength = 32, ShuffleEnums shuffleEnums = ShuffleEnums.Random, TooLongSequence tooLongSequence = TooLongSequence.Ignore, string indexedFilePath = null)
+:base (corpusFilePath, srcLangName, tgtLangName, maxTokenSizePerBatch, maxSrcSentLength, maxTgtSentLength, shuffleEnums: shuffleEnums, tooLongSequence: tooLongSequence, indexedFilePath: indexedFilePath)
 {

 }
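Unrelated to normalization, this constructor gains an optional indexedFilePath that is simply forwarded to the ParallelCorpus base class. A hypothetical call site (the paths and the meaning of the index file are assumptions, not shown in this excerpt):

// Illustrative only; paths and the index-file convention are assumed.
var corpus = new Seq2SeqCorpus(
    corpusFilePath: @".\data\train",
    srcLangName: "SRC",
    tgtLangName: "TGT",
    maxTokenSizePerBatch: 5120,
    indexedFilePath: @".\data\train.idx");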
8 changes: 4 additions & 4 deletions Seq2SeqSharp/Layers/AttentionUnit.cs
@@ -56,15 +56,15 @@ public AttentionUnit(string name, int hiddenDim, int contextDim, int deviceId, b

 Logger.WriteLine($"Creating attention unit '{name}' HiddenDim = '{hiddenDim}', ContextDim = '{contextDim}', DeviceId = '{deviceId}', EnableCoverageModel = '{enableCoverageModel}'");

-m_Ua = new WeightTensor(new long[2] { contextDim, hiddenDim }, deviceId, normType: NormType.Uniform, name: $"{name}.{nameof(m_Ua)}", isTrainable: isTrainable, dtype: elementType);
-m_Wa = new WeightTensor(new long[2] { hiddenDim, hiddenDim }, deviceId, normType: NormType.Uniform, name: $"{name}.{nameof(m_Wa)}", isTrainable: isTrainable, dtype: elementType);
+m_Ua = new WeightTensor(new long[2] { contextDim, hiddenDim }, deviceId, initType: RandomInitType.Uniform, name: $"{name}.{nameof(m_Ua)}", isTrainable: isTrainable, dtype: elementType);
+m_Wa = new WeightTensor(new long[2] { hiddenDim, hiddenDim }, deviceId, initType: RandomInitType.Uniform, name: $"{name}.{nameof(m_Wa)}", isTrainable: isTrainable, dtype: elementType);
 m_bUa = new WeightTensor(new long[2] { 1, hiddenDim }, 0, deviceId, name: $"{name}.{nameof(m_bUa)}", isTrainable: isTrainable, dtype: elementType);
 m_bWa = new WeightTensor(new long[2] { 1, hiddenDim }, 0, deviceId, name: $"{name}.{nameof(m_bWa)}", isTrainable: isTrainable, dtype: elementType);
-m_V = new WeightTensor(new long[2] { hiddenDim, 1 }, deviceId, normType: NormType.Uniform, name: $"{name}.{nameof(m_V)}", isTrainable: isTrainable, dtype: elementType);
+m_V = new WeightTensor(new long[2] { hiddenDim, 1 }, deviceId, initType: RandomInitType.Uniform, name: $"{name}.{nameof(m_V)}", isTrainable: isTrainable, dtype: elementType);

 if (m_enableCoverageModel)
 {
-m_Wc = new WeightTensor(new long[2] { k_coverageModelDim, hiddenDim }, deviceId, normType: NormType.Uniform, name: $"{name}.{nameof(m_Wc)}", isTrainable: isTrainable, dtype: elementType);
+m_Wc = new WeightTensor(new long[2] { k_coverageModelDim, hiddenDim }, deviceId, initType: RandomInitType.Uniform, name: $"{name}.{nameof(m_Wc)}", isTrainable: isTrainable, dtype: elementType);
 m_bWc = new WeightTensor(new long[2] { 1, hiddenDim }, 0, deviceId, name: $"{name}.{nameof(m_bWc)}", isTrainable: isTrainable, dtype: elementType);
 m_coverage = new LSTMCell(name: $"{name}.{nameof(m_coverage)}", hdim: k_coverageModelDim, inputDim: 1 + contextDim + hiddenDim, deviceId: deviceId, isTrainable: isTrainable, elementType: elementType);
 }
2 changes: 1 addition & 1 deletion Seq2SeqSharp/Layers/FeedForwardLayer.cs
@@ -40,7 +40,7 @@ public FeedForwardLayer(string name, int inputDim, int outputDim, float dropoutR
 m_isTrainable = isTrainable;
 m_elementType = elementType;

-m_Whd = new WeightTensor(new long[2] { inputDim, outputDim }, deviceId, name: $"{name}.{nameof(m_Whd)}", normType: NormType.Uniform, isTrainable: isTrainable, learningRateFactor: learningRateFactor, dtype: elementType);
+m_Whd = new WeightTensor(new long[2] { inputDim, outputDim }, deviceId, name: $"{name}.{nameof(m_Whd)}", initType: RandomInitType.Uniform, isTrainable: isTrainable, learningRateFactor: learningRateFactor, dtype: elementType);
 m_Bd = new WeightTensor(new long[2] { 1, outputDim }, 0, deviceId, name: $"{name}.{nameof(m_Bd)}", isTrainable: isTrainable, learningRateFactor: learningRateFactor, dtype: elementType);
 }
9 changes: 9 additions & 0 deletions Seq2SeqSharp/Layers/INormalization.cs
@@ -0,0 +1,9 @@
+using Seq2SeqSharp.Tools;
+
+namespace Seq2SeqSharp.Layers
+{
+internal interface INormalization : INeuralUnit
+{
+IWeightTensor Norm(IWeightTensor input, IComputeGraph g);
+}
+}
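This new interface gives LayerNormalization (below) and, presumably, the commit's RMSNorm implementation a common contract. For readers unfamiliar with the operation, here is a self-contained sketch of the RMSNorm computation on a plain float array, deliberately independent of Seq2SeqSharp's tensor API, whose op names are not visible in this excerpt:

using System;

static class RmsNormSketch
{
    // y_i = gain_i * x_i / sqrt(mean(x^2) + eps); no mean-centering, no bias.
    static float[] RmsNorm(float[] x, float[] gain, float eps = 1e-8f)
    {
        float sumSq = 0f;
        foreach (float v in x) sumSq += v * v;
        float rms = MathF.Sqrt(sumSq / x.Length + eps);

        var y = new float[x.Length];
        for (int i = 0; i < x.Length; i++)
            y[i] = gain[i] * (x[i] / rms);
        return y;
    }

    static void Main()
    {
        float[] x = { 1f, 2f, 3f, 4f };
        float[] gain = { 1f, 1f, 1f, 1f }; // trainable per-dimension gain in a real layer
        Console.WriteLine(string.Join(", ", RmsNorm(x, gain))); // RMS(x) is about 2.739
    }
}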
2 changes: 1 addition & 1 deletion Seq2SeqSharp/Layers/LSTMAttentionDecoderCell.cs
@@ -40,7 +40,7 @@ public LSTMAttentionDecoderCell(string name, int hiddenDim, int inputDim, int co

 Logger.WriteLine($"Create LSTM attention decoder cell '{name}' HiddemDim = '{hiddenDim}', InputDim = '{inputDim}', ContextDim = '{contextDim}', DeviceId = '{deviceId}'");

-m_Wxhc = new WeightTensor(new long[2] { inputDim + hiddenDim + contextDim, hiddenDim * 4 }, deviceId, normType: NormType.Uniform, name: $"{name}.{nameof(m_Wxhc)}", isTrainable: isTrainable, dtype: elementType);
+m_Wxhc = new WeightTensor(new long[2] { inputDim + hiddenDim + contextDim, hiddenDim * 4 }, deviceId, initType: RandomInitType.Uniform, name: $"{name}.{nameof(m_Wxhc)}", isTrainable: isTrainable, dtype: elementType);
 m_b = new WeightTensor(new long[2] { 1, hiddenDim * 4 }, 0, deviceId, name: $"{name}.{nameof(m_b)}", isTrainable: isTrainable, dtype: elementType);

 m_layerNorm1 = new LayerNormalization($"{name}.{nameof(m_layerNorm1)}", hiddenDim * 4, deviceId, isTrainable, elementType: elementType);
2 changes: 1 addition & 1 deletion Seq2SeqSharp/Layers/LSTMCell.cs
@@ -34,7 +34,7 @@ public LSTMCell(string name, int hdim, int inputDim, int deviceId, bool isTraina
 {
 m_name = name;

-m_Wxh = new WeightTensor(new long[2] { inputDim + hdim, hdim * 4 }, deviceId, normType: NormType.Uniform, name: $"{name}.{nameof(m_Wxh)}", isTrainable: isTrainable, dtype: elementType);
+m_Wxh = new WeightTensor(new long[2] { inputDim + hdim, hdim * 4 }, deviceId, initType: RandomInitType.Uniform, name: $"{name}.{nameof(m_Wxh)}", isTrainable: isTrainable, dtype: elementType);
 m_b = new WeightTensor(new long[2] { 1, hdim * 4 }, 0, deviceId, name: $"{name}.{nameof(m_b)}", isTrainable: isTrainable, dtype: elementType);

 m_hdim = hdim;
13 changes: 12 additions & 1 deletion Seq2SeqSharp/Layers/LayerNormalization.cs
@@ -8,6 +8,7 @@
 // Seq2SeqSharp is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the BSD-3-Clause License for more details.

+using Seq2SeqSharp.Layers;
 using Seq2SeqSharp.Tools;
 using System;
 using System.Collections.Generic;
@@ -16,7 +17,7 @@
 namespace Seq2SeqSharp
 {
 [Serializable]
-internal class LayerNormalization
+internal class LayerNormalization : INormalization
 {
 private readonly IWeightTensor m_alpha;
 private readonly IWeightTensor m_beta;
@@ -70,5 +71,15 @@ public void Load(IModel stream)
 m_alpha.Load(stream);
 m_beta.Load(stream);
 }
+
+public INeuralUnit CloneToDeviceAt(int deviceId)
+{
+throw new NotImplementedException();
+}
+
+public int GetDeviceId()
+{
+throw new NotImplementedException();
+}
 }
 }
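With LayerNormalization now behind INormalization, a call site can hold either normalization variant through the interface. A minimal sketch using the constructor shape seen elsewhere in this diff (input, g, hiddenDim, and deviceId are placeholder variables assumed in scope):

// Arguments: name, dim, deviceId, isTrainable; input is an IWeightTensor, g an IComputeGraph.
INormalization norm = new LayerNormalization("enc.norm", hiddenDim, deviceId, true);
IWeightTensor normalized = norm.Norm(input, g);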
6 changes: 3 additions & 3 deletions Seq2SeqSharp/Layers/MoEFeedForward.cs
@@ -47,10 +47,10 @@ public MoEFeedForward(string name, int expertNum, int hiddenDim, float dropoutRa

 layerNorm = new LayerNormalization($"{name}.{nameof(layerNorm)}", hiddenDim, deviceId, isTrainable, learningRateFactor: learningRateFactor, elementType: elementType);

-m_Whd1 = new WeightTensor(new long[3] { expertNum, hiddenDim, hiddenDim * 4 }, deviceId, name: $"{name}.{nameof(m_Whd1)}", normType: NormType.Uniform, isTrainable: isTrainable, learningRateFactor: learningRateFactor, dtype: elementType);
-m_Whd2 = new WeightTensor(new long[3] { expertNum, hiddenDim * 4, hiddenDim }, deviceId, name: $"{name}.{nameof(m_Whd2)}", normType: NormType.Uniform, isTrainable: isTrainable, learningRateFactor: learningRateFactor, dtype: elementType);
+m_Whd1 = new WeightTensor(new long[3] { expertNum, hiddenDim, hiddenDim * 4 }, deviceId, name: $"{name}.{nameof(m_Whd1)}", initType: RandomInitType.Uniform, isTrainable: isTrainable, learningRateFactor: learningRateFactor, dtype: elementType);
+m_Whd2 = new WeightTensor(new long[3] { expertNum, hiddenDim * 4, hiddenDim }, deviceId, name: $"{name}.{nameof(m_Whd2)}", initType: RandomInitType.Uniform, isTrainable: isTrainable, learningRateFactor: learningRateFactor, dtype: elementType);

-m_Router = new WeightTensor(new long[2] { hiddenDim, expertNum }, deviceId, name: $"{name}.{nameof(m_Router)}", normType: NormType.Uniform, isTrainable: isTrainable, learningRateFactor: learningRateFactor, dtype: elementType);
+m_Router = new WeightTensor(new long[2] { hiddenDim, expertNum }, deviceId, name: $"{name}.{nameof(m_Router)}", initType: RandomInitType.Uniform, isTrainable: isTrainable, learningRateFactor: learningRateFactor, dtype: elementType);
 m_RouterBias = new WeightTensor(new long[2] { 1, expertNum }, 0, deviceId, name: $"{name}.{nameof(m_RouterBias)}", isTrainable: isTrainable, learningRateFactor: learningRateFactor, dtype: elementType);

 }