NVIDIA
diff --git a/‎cpp/tensorrt_llm/kernels/moeLoadBalance/moeLoadBalanceCommon.h
Lines changed: 91 additions & 0 deletions b/‎cpp/tensorrt_llm/kernels/moeLoadBalance/moeLoadBalanceCommon.h
Lines changed: 91 additions & 0 deletions
@@ -0,0 +1,91 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+namespace tensorrt_llm
+{
+namespace kernels
+{
+
+struct MoeLoadBalanceSingleLayerSignal
+{
+    static constexpr unsigned long long kGPU = 0ULL;
+    static constexpr unsigned long long kCPU = 1ULL;
+    static constexpr unsigned long long kDevice = 1ULL;
+    static constexpr unsigned long long kSkipStep = 1ULL << 1U;
+    static constexpr unsigned long long kDisabled = 1ULL << 63U;
+    // Bit 0 means the current owner of this layer, 0: gpu, 1: cpu, updated by cpu and gpu alternately
+    // Bit 1 means whether skip statistic for current step, cpu set that at one iteration start,
+    //  maybe with or without ownership, but since forward is not started, so no conflict.
+    // Bits 2-62 means the current step, updated by cpu after one iteration with cpu ownership
+    // Bit 63 means if step update is disabled, 0: not disabled, 1: disabled, updated by cpu
+    unsigned long long int volatile stepAndOwner;
+};
+
+struct MoeLoadBalanceMetaInfo
+{
+    // Model Layer Info
+    int expertCount;
+    int topK;
+
+    // Parallelism Info
+    int epRank;
+    int epSize;
+
+    // Slot Info
+    int slotCountPerRank;
+};
+
+struct MoeLoadBalanceStatisticInfo
+{
+    // Statistic Info
+    // expertLoadFactor[i] means the load factor of expert i
+    // The length of expertLoadFactor should be expertCount
+    float* expertLoadFactor = nullptr;
+
+    // expertTokenCount[i] means the number of tokens of expert i
+    // The length of expertTokenCount should be rawDataWindowSize * expertCount
+    int* expertTokenCount = nullptr;
+
+    // rawDataWindowSize means the size of the raw data window.
+    // e.g. how many steps of raw data are kept in the memory.
+    int rawDataWindowSize = 1;
+
+    // decayFactor means the decay factor of the raw data per step.
+    // e.g. if decayFactor is 0.9, then the raw data of expert i will be decayed by 0.9 for each step.
+    float decayFactor = 0.9f;
+};
+
+// The placement information for GPU
+struct MoePlacementInfo
+{
+    // Placement Info
+    // expertReplicaCount[i] means the number of replicas of expert i
+    int* expertReplicaCount = nullptr;
+
+    // expertReplicaStartOffset[i] means the start offset of expert i's replicas in globalSlotIds
+    // and the values of globalSlotIds[expertReplicaStartOffset[i]] ~ globalSlotIds[expertReplicaStartOffset[i] +
+    // expertReplicaCount[i] - 1] are possible globalSlotId for expert i, and can be dispatched to any one.
+    int* expertReplicaStartOffset = nullptr;
+
+    // globalSlotIds[i] means the global slot id for expert i
+    // The length of globalSlotIds should be epSize * slotCountPerRank
+    int* globalSlotIds = nullptr;
+};
+
+} // namespace kernels
+} // namespace tensorrt_llm