Skip to content

Commit 21aff2e

Browse files
authored
feat: large-scale EP(part 2: MoE Load Balancer - core utilities) (#4384)
* first commit of cpp moe loadbalance code Signed-off-by: Dongxu Yang <[email protected]> * add python bindings for moe load balance Signed-off-by: Dongxu Yang <[email protected]> * add python wrapper, ut and bug fixes Signed-off-by: Dongxu Yang <[email protected]> * add binding for layerId and update binding test Signed-off-by: Dongxu Yang <[email protected]> * add host tensor sharing and ut Signed-off-by: Dongxu Yang <[email protected]> --------- Signed-off-by: Dongxu Yang <[email protected]>
1 parent ec4190f commit 21aff2e

20 files changed

+4970
-0
lines changed
Lines changed: 91 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,91 @@
1+
/*
2+
* SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3+
* SPDX-License-Identifier: Apache-2.0
4+
*
5+
* Licensed under the Apache License, Version 2.0 (the "License");
6+
* you may not use this file except in compliance with the License.
7+
* You may obtain a copy of the License at
8+
*
9+
* http://www.apache.org/licenses/LICENSE-2.0
10+
*
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS,
13+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* See the License for the specific language governing permissions and
15+
* limitations under the License.
16+
*/
17+
#pragma once
18+
19+
namespace tensorrt_llm
20+
{
21+
namespace kernels
22+
{
23+
24+
// Per-layer signal word through which the CPU and the GPU take turns owning one MoE layer.
//
// All state is packed into the single 64-bit word `stepAndOwner`:
//   bit 0     : current owner of this layer (0: gpu, 1: cpu), updated by cpu and gpu alternately.
//   bit 1     : whether statistics are skipped for the current step. The cpu sets it at the start of
//               an iteration, with or without ownership; forward has not started at that point, so
//               there is no conflict.
//   bits 2-62 : the current step, updated by the cpu after one iteration while it holds ownership.
//   bit 63    : whether step update is disabled (0: not disabled, 1: disabled), updated by the cpu.
struct MoeLoadBalanceSingleLayerSignal
{
    // Values of the owner bit (bit 0).
    static constexpr unsigned long long kGPU = 0ULL;
    static constexpr unsigned long long kCPU = 1ULL;
    // NOTE(review): same value as kCPU; presumably the mask that selects the owner bit -- confirm
    // against the code that uses it.
    static constexpr unsigned long long kDevice = 1ULL;
    // Flag for bit 1: skip statistics for the current step.
    static constexpr unsigned long long kSkipStep = 1ULL << 1U;
    // Flag for bit 63: step update is disabled.
    static constexpr unsigned long long kDisabled = 1ULL << 63U;

    // The packed signal word described above. It is volatile because, per the bit descriptions, it
    // is written by both the cpu and the gpu, so the compiler must not cache or elide accesses.
    unsigned long long int volatile stepAndOwner;
};
38+
39+
// Static description of one MoE layer for the load balancer: model shape, expert-parallel
// position, and slot capacity.
//
// The members deliberately have no default initializers, so the type stays a trivial aggregate.
// They are therefore uninitialized unless the object is value-initialized or brace-initialized
// in declaration order: {expertCount, topK, epRank, epSize, slotCountPerRank}.
struct MoeLoadBalanceMetaInfo
{
    // Model Layer Info
    int expertCount; // number of experts in the layer
    int topK;        // NOTE(review): presumably the number of experts selected per token -- confirm

    // Parallelism Info
    int epRank; // NOTE(review): presumably this rank's index in the expert-parallel group -- confirm
    int epSize; // NOTE(review): presumably the number of ranks in the expert-parallel group -- confirm

    // Slot Info
    int slotCountPerRank; // number of slots on each rank
};
52+
53+
// Per-expert load statistics and the parameters that control how they are kept.
//
// The pointers are plain non-owning views from this struct's point of view: nothing here
// allocates or frees them.
// NOTE(review): where the buffers live and who owns them is not visible in this header.
struct MoeLoadBalanceStatisticInfo
{
    // Statistic Info
    // expertLoadFactor[i] is the load factor of expert i.
    // The length of expertLoadFactor should be expertCount.
    float* expertLoadFactor = nullptr;

    // expertTokenCount holds the number of tokens of each expert for every step kept in the
    // raw data window.
    // The length of expertTokenCount should be rawDataWindowSize * expertCount.
    // NOTE(review): the ordering of the window and expert dimensions is not stated here -- confirm.
    int* expertTokenCount = nullptr;

    // rawDataWindowSize is the size of the raw data window,
    // i.e. how many steps of raw data are kept in memory.
    int rawDataWindowSize = 1;

    // decayFactor is the decay factor applied to the raw data per step,
    // e.g. if decayFactor is 0.9, the raw data of expert i is decayed by 0.9 at each step.
    float decayFactor = 0.9f;
};
72+
73+
// The placement information for GPU.
//
// Describes, for every expert, which global slots hold a replica of it. The three arrays form
// an offset/count table into globalSlotIds. All pointers default to nullptr, and nothing in
// this struct allocates or frees them.
struct MoePlacementInfo
{
    // Placement Info
    // expertReplicaCount[i] is the number of replicas of expert i.
    int* expertReplicaCount = nullptr;

    // expertReplicaStartOffset[i] is the start offset of expert i's replicas in globalSlotIds.
    // The values globalSlotIds[expertReplicaStartOffset[i]] through
    // globalSlotIds[expertReplicaStartOffset[i] + expertReplicaCount[i] - 1] are the possible
    // global slot ids for expert i; a token for expert i can be dispatched to any one of them.
    int* expertReplicaStartOffset = nullptr;

    // globalSlotIds stores the global slot ids of all expert replicas.
    // The length of globalSlotIds should be epSize * slotCountPerRank.
    // NOTE(review): the original comment said "globalSlotIds[i] means the global slot id for
    // expert i", but the offset description above implies it is indexed by replica position
    // (grouped per expert), not by expert id -- confirm.
    int* globalSlotIds = nullptr;
};
89+
90+
} // namespace kernels
91+
} // namespace tensorrt_llm

0 commit comments

Comments
 (0)