Skip to content

Commit 741643c

Browse files
lyuCompute-Runtime-Automation
authored and committed
feature: report multi-hop fabric connections
In addition to physical connections, we should also report multi-hop logical connections (MDFI + XeLink) as having positive bandwidth. Use a modified BFS algorithm to try to find a path between fabric vertices that are not directly connected; the BFS is modified because the KMD always tries to use the MDFI link first and then goes to XeLink. Multi-hop connections are bi-directional but might not be symmetric, so for every pair of vertices A & B that are not directly connected, we need to try to find both `A -> B` and `B -> A`. Related-To: GSD-7126 Signed-off-by: Wenbin Lu <[email protected]> Source: a0faad6
1 parent 175310d commit 741643c

File tree

15 files changed

+1623
-118
lines changed

15 files changed

+1623
-118
lines changed

level_zero/core/source/device/device_imp.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -551,7 +551,7 @@ void DeviceImp::getP2PPropertiesDirectFabricConnection(DeviceImp *peerDeviceImp,
551551
ze_fabric_edge_exp_properties_t edgeProperties{};
552552
fabricEdge->getProperties(&edgeProperties);
553553

554-
if (strcmp(edgeProperties.model, "XeLink") == 0) {
554+
if (strstr(edgeProperties.model, "XeLink") != nullptr) {
555555
bandwidthPropertiesDesc->logicalBandwidth = edgeProperties.bandwidth;
556556
bandwidthPropertiesDesc->physicalBandwidth = edgeProperties.bandwidth;
557557
bandwidthPropertiesDesc->bandwidthUnit = edgeProperties.bandwidthUnit;

level_zero/core/source/driver/driver_handle_imp.cpp

Lines changed: 21 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -193,6 +193,11 @@ DriverHandleImp::~DriverHandleImp() {
193193
}
194194
this->fabricEdges.clear();
195195

196+
for (auto &edge : this->fabricIndirectEdges) {
197+
delete edge;
198+
}
199+
this->fabricIndirectEdges.clear();
200+
196201
if (this->svmAllocsManager) {
197202
this->svmAllocsManager->trimUSMDeviceAllocCache();
198203
delete this->svmAllocsManager;
@@ -903,7 +908,7 @@ void DriverHandleImp::initializeVertexes() {
903908
this->fabricVertices.push_back(fabricVertex);
904909
}
905910

906-
FabricEdge::createEdgesFromVertices(this->fabricVertices, this->fabricEdges);
911+
FabricEdge::createEdgesFromVertices(this->fabricVertices, this->fabricEdges, this->fabricIndirectEdges);
907912
}
908913

909914
ze_result_t DriverHandleImp::fabricVertexGetExp(uint32_t *pCount, ze_fabric_vertex_handle_t *phVertices) {
@@ -957,17 +962,20 @@ ze_result_t DriverHandleImp::fabricEdgeGetExp(ze_fabric_vertex_handle_t hVertexA
957962
bool updateEdges = false;
958963

959964
if (*pCount == 0) {
960-
maxEdges = static_cast<uint32_t>(fabricEdges.size());
965+
maxEdges = static_cast<uint32_t>(fabricEdges.size() + fabricIndirectEdges.size());
961966
} else {
962-
maxEdges = std::min<uint32_t>(*pCount, static_cast<uint32_t>(fabricEdges.size()));
967+
maxEdges = std::min<uint32_t>(*pCount, static_cast<uint32_t>(fabricEdges.size() + fabricIndirectEdges.size()));
963968
}
964969

965970
if (phEdges != nullptr) {
966971
updateEdges = true;
967972
}
968973

969974
for (const auto &edge : fabricEdges) {
970-
// Fabric Connections are bi-directional
975+
if (edgeUpdateIndex >= maxEdges) {
976+
break;
977+
}
978+
// Direct physical fabric connections are bi-directional
971979
if ((edge->vertexA == queryVertexA && edge->vertexB == queryVertexB) ||
972980
(edge->vertexA == queryVertexB && edge->vertexB == queryVertexA)) {
973981

@@ -976,11 +984,19 @@ ze_result_t DriverHandleImp::fabricEdgeGetExp(ze_fabric_vertex_handle_t hVertexA
976984
}
977985
++edgeUpdateIndex;
978986
}
987+
}
979988

980-
// Stop if the edges overflow the count
989+
for (const auto &edge : fabricIndirectEdges) {
981990
if (edgeUpdateIndex >= maxEdges) {
982991
break;
983992
}
993+
// Logical multi-hop edges might not be symmetric
994+
if (edge->vertexA == queryVertexA && edge->vertexB == queryVertexB) {
995+
if (updateEdges == true) {
996+
phEdges[edgeUpdateIndex] = edge->toHandle();
997+
}
998+
++edgeUpdateIndex;
999+
}
9841000
}
9851001

9861002
*pCount = edgeUpdateIndex;

level_zero/core/source/driver/driver_handle_imp.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -133,6 +133,7 @@ struct DriverHandleImp : public DriverHandle {
133133
std::vector<Device *> devices;
134134
std::vector<FabricVertex *> fabricVertices;
135135
std::vector<FabricEdge *> fabricEdges;
136+
std::vector<FabricEdge *> fabricIndirectEdges;
136137

137138
std::mutex rtasLock;
138139

level_zero/core/source/fabric/fabric.cpp

Lines changed: 1 addition & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (C) 2022-2023 Intel Corporation
2+
* Copyright (C) 2022-2024 Intel Corporation
33
*
44
* SPDX-License-Identifier: MIT
55
*
@@ -121,31 +121,4 @@ FabricEdge *FabricEdge::create(FabricVertex *vertexA, FabricVertex *vertexB, ze_
121121
return edge;
122122
}
123123

124-
void FabricEdge::createEdgesFromVertices(const std::vector<FabricVertex *> &vertices, std::vector<FabricEdge *> &edges) {
125-
126-
// Get all vertices and sub-vertices
127-
std::vector<FabricVertex *> allVertices = {};
128-
for (auto &fabricVertex : vertices) {
129-
allVertices.push_back(fabricVertex);
130-
for (auto &fabricSubVertex : fabricVertex->subVertices) {
131-
allVertices.push_back(fabricSubVertex);
132-
}
133-
}
134-
135-
// Get edges between all vertices
136-
for (uint32_t vertexAIndex = 0; vertexAIndex < allVertices.size(); vertexAIndex++) {
137-
for (uint32_t vertexBIndex = vertexAIndex + 1; vertexBIndex < allVertices.size(); vertexBIndex++) {
138-
ze_fabric_edge_exp_properties_t edgeProperty = {};
139-
140-
for (auto const &fabricDeviceInterface : allVertices[vertexAIndex]->pFabricDeviceInterfaces) {
141-
bool isConnected =
142-
fabricDeviceInterface.second->getEdgeProperty(allVertices[vertexBIndex], edgeProperty);
143-
if (isConnected) {
144-
edges.push_back(create(allVertices[vertexAIndex], allVertices[vertexBIndex], edgeProperty));
145-
}
146-
}
147-
}
148-
}
149-
}
150-
151124
} // namespace L0

level_zero/core/source/fabric/fabric.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (C) 2022 Intel Corporation
2+
* Copyright (C) 2022-2024 Intel Corporation
33
*
44
* SPDX-License-Identifier: MIT
55
*
@@ -46,7 +46,7 @@ struct FabricEdge : _ze_fabric_edge_handle_t {
4646
public:
4747
virtual ~FabricEdge() = default;
4848

49-
static void createEdgesFromVertices(const std::vector<FabricVertex *> &vertices, std::vector<FabricEdge *> &edges);
49+
static void createEdgesFromVertices(const std::vector<FabricVertex *> &vertices, std::vector<FabricEdge *> &edges, std::vector<FabricEdge *> &indirectEdges);
5050
static FabricEdge *create(FabricVertex *vertexA, FabricVertex *vertexB, ze_fabric_edge_exp_properties_t &properties);
5151
ze_result_t getProperties(ze_fabric_edge_exp_properties_t *pEdgeProperties) const {
5252
*pEdgeProperties = properties;

level_zero/core/source/fabric/linux/CMakeLists.txt

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
#
2-
# Copyright (C) 2022-2023 Intel Corporation
2+
# Copyright (C) 2022-2024 Intel Corporation
33
#
44
# SPDX-License-Identifier: MIT
55
#
@@ -10,13 +10,15 @@ if(UNIX)
1010
target_sources(${L0_STATIC_LIB_NAME}
1111
PRIVATE
1212
${CMAKE_CURRENT_SOURCE_DIR}/CMakeLists.txt
13+
${CMAKE_CURRENT_SOURCE_DIR}/fabric.cpp
1314
${CMAKE_CURRENT_SOURCE_DIR}/fabric_device_iaf.h
1415
${CMAKE_CURRENT_SOURCE_DIR}/fabric_device_iaf.cpp
1516
)
1617
else()
1718
target_sources(${L0_STATIC_LIB_NAME}
1819
PRIVATE
1920
${CMAKE_CURRENT_SOURCE_DIR}/CMakeLists.txt
21+
${CMAKE_CURRENT_SOURCE_DIR}/fabric.cpp
2022
${CMAKE_CURRENT_SOURCE_DIR}/fabric_device_iaf_stub.h
2123
${CMAKE_CURRENT_SOURCE_DIR}/fabric_device_iaf_stub.cpp
2224
)
Lines changed: 149 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,149 @@
1+
/*
2+
* Copyright (C) 2024 Intel Corporation
3+
*
4+
* SPDX-License-Identifier: MIT
5+
*
6+
*/
7+
8+
#include "level_zero/core/source/fabric/fabric.h"
9+
10+
#include "shared/source/helpers/debug_helpers.h"
11+
12+
#include <algorithm>
13+
#include <cstring>
14+
#include <deque>
15+
#include <limits>
16+
#include <map>
17+
#include <string>
18+
#include <vector>
19+
20+
namespace L0 {
21+
22+
// Populates the direct and the logical multi-hop fabric edge lists.
//
// vertices:      top-level fabric vertices; each one and all of its sub-vertices
//                are considered.
// edges:         receives one edge per direct connection reported by
//                getEdgeProperty() for an unordered vertex pair.
// indirectEdges: receives one edge per ordered pair (A, B) that has no direct
//                connection but for which a path over direct edges was found.
//                A vertex paired with one of its own sub-vertices is skipped.
void FabricEdge::createEdgesFromVertices(const std::vector<FabricVertex *> &vertices, std::vector<FabricEdge *> &edges, std::vector<FabricEdge *> &indirectEdges) {
    using NeighborList = std::vector<std::pair<uint32_t, ze_fabric_edge_exp_properties_t *>>;

    // A hop counts as XeLink only when its model string is exactly "XeLink"
    // (the compare length of 7 includes the terminating NUL).
    const auto isXeLink = [](const ze_fabric_edge_exp_properties_t *edgeProps) {
        return strncmp(edgeProps->model, "XeLink", 7) == 0;
    };

    // Flatten the hierarchy: every vertex is followed by its sub-vertices
    std::vector<FabricVertex *> flatVertices = {};
    for (auto &topVertex : vertices) {
        flatVertices.push_back(topVertex);
        for (auto &childVertex : topVertex->subVertices) {
            flatVertices.push_back(childVertex);
        }
    }

    // Pass 1: create the direct edges and remember, per vertex index, which
    // vertices are reachable in one hop and which are not
    std::map<uint32_t, NeighborList> neighborsOf;
    std::map<uint32_t, std::vector<uint32_t>> unconnectedPeersOf;
    for (uint32_t idxA = 0; idxA < flatVertices.size(); idxA++) {
        auto nodeA = flatVertices[idxA];
        for (uint32_t idxB = idxA + 1; idxB < flatVertices.size(); idxB++) {
            auto nodeB = flatVertices[idxB];
            ze_fabric_edge_exp_properties_t linkProps = {};
            bool directlyLinked = false;

            for (auto const &fabricInterface : nodeA->pFabricDeviceInterfaces) {
                if (!fabricInterface.second->getEdgeProperty(nodeB, linkProps)) {
                    continue;
                }
                FabricEdge *directEdge = create(nodeA, nodeB, linkProps);
                edges.push_back(directEdge);
                // The adjacency lists point at the properties owned by the edge object
                neighborsOf[idxA].emplace_back(idxB, &directEdge->properties);
                neighborsOf[idxB].emplace_back(idxA, &directEdge->properties);
                directlyLinked = true;
            }

            if (directlyLinked) {
                continue;
            }

            // A vertex and one of its own sub-vertices are never searched for a multi-hop path
            auto &childrenOfA = nodeA->subVertices;
            const bool isChildOfA = std::find(childrenOfA.begin(), childrenOfA.end(), nodeB) != childrenOfA.end();
            if (!isChildOfA) {
                unconnectedPeersOf[idxA].push_back(idxB);
                unconnectedPeersOf[idxB].push_back(idxA);
            }
        }
    }

    // Pass 2: level-by-level search from each source to each unconnected target
    for (const auto &unconnectedEntry : unconnectedPeersOf) {
        const uint32_t sourceIdx = unconnectedEntry.first;
        for (const uint32_t targetIdx : unconnectedEntry.second) {
            // Maps a discovered vertex to the vertex it was discovered from
            std::map<uint32_t, uint32_t> parentOf;
            parentOf[sourceIdx] = sourceIdx;

            std::deque<uint32_t> frontier;
            frontier.push_back(sourceIdx);

            uint32_t cursor = sourceIdx;
            bool targetReached = false;

            while (!targetReached) {
                std::deque<uint32_t> nextViaXeLink, nextViaMdfi;
                while (!frontier.empty()) {
                    cursor = frontier.front();
                    frontier.pop_front();
                    if (cursor == targetIdx) {
                        targetReached = true;
                        break;
                    }

                    for (const auto &neighbor : neighborsOf[cursor]) {
                        if (parentOf.find(neighbor.first) != parentOf.end()) {
                            continue;
                        }
                        if (isXeLink(neighbor.second)) {
                            nextViaXeLink.push_back(neighbor.first);
                        } else {
                            DEBUG_BREAK_IF(strncmp(neighbor.second->model, "MDFI", 5) != 0);
                            nextViaMdfi.push_back(neighbor.first);
                        }
                        parentOf[neighbor.first] = cursor;
                    }
                }

                if (targetReached) {
                    break;
                }
                if (nextViaXeLink.empty() && nextViaMdfi.empty()) {
                    // Nothing new was discovered: the target is unreachable
                    break;
                }
                // In the next level, vertices reached over MDFI are visited before those reached over XeLink
                frontier = nextViaMdfi;
                frontier.insert(frontier.end(), nextViaXeLink.begin(), nextViaXeLink.end());
            }

            if (!targetReached) {
                continue;
            }

            ze_fabric_edge_exp_properties_t hopProps = {};
            hopProps.stype = ZE_STRUCTURE_TYPE_FABRIC_EDGE_EXP_PROPERTIES;
            hopProps.pNext = nullptr;
            memset(hopProps.uuid.id, 0, ZE_MAX_UUID_SIZE);
            memset(hopProps.model, 0, ZE_MAX_FABRIC_EDGE_MODEL_EXP_SIZE);
            hopProps.bandwidth = std::numeric_limits<uint32_t>::max();
            hopProps.bandwidthUnit = ZE_BANDWIDTH_UNIT_BYTES_PER_NANOSEC;
            hopProps.latency = std::numeric_limits<uint32_t>::max();
            hopProps.latencyUnit = ZE_LATENCY_UNIT_UNKNOWN;
            hopProps.duplexity = ZE_FABRIC_EDGE_EXP_DUPLEXITY_FULL_DUPLEX;

            // Walk back from the target to the source, prepending each hop's model
            // (joined by '-') and keeping the smallest bandwidth seen on a XeLink hop
            std::string route = "";
            uint32_t childIdx = cursor;
            do {
                const auto parentIdx = parentOf[childIdx];
                auto &candidates = neighborsOf[parentIdx];
                const auto hop = std::find_if(candidates.begin(), candidates.end(),
                                              [childIdx](const NeighborList::value_type &candidate) {
                                                  return candidate.first == childIdx;
                                              });
                ze_fabric_edge_exp_properties_t *hopEdge = nullptr;
                if (hop != candidates.end()) {
                    hopEdge = hop->second;
                }
                UNRECOVERABLE_IF(hopEdge == nullptr);

                route = hopEdge->model + route;
                if (isXeLink(hopEdge) && hopEdge->bandwidth < hopProps.bandwidth) {
                    hopProps.bandwidth = hopEdge->bandwidth;
                }

                childIdx = parentIdx;
                if (childIdx != sourceIdx) {
                    route.insert(route.begin(), '-');
                }
            } while (childIdx != sourceIdx);

            // Pad or truncate to one character less than the model buffer so the copy stays NUL-terminated
            route.resize(ZE_MAX_FABRIC_EDGE_MODEL_EXP_SIZE - 1, '\0');
            route.copy(hopProps.model, route.size());

            indirectEdges.push_back(create(flatVertices[sourceIdx], flatVertices[targetIdx], hopProps));
        }
    }
}
148+
149+
} // namespace L0

level_zero/core/source/fabric/windows/CMakeLists.txt

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
#
2-
# Copyright (C) 2022-2023 Intel Corporation
2+
# Copyright (C) 2022-2024 Intel Corporation
33
#
44
# SPDX-License-Identifier: MIT
55
#
@@ -8,6 +8,7 @@ if(WIN32)
88
target_sources(${L0_STATIC_LIB_NAME}
99
PRIVATE
1010
${CMAKE_CURRENT_SOURCE_DIR}/CMakeLists.txt
11+
${CMAKE_CURRENT_SOURCE_DIR}/fabric.cpp
1112
${CMAKE_CURRENT_SOURCE_DIR}/fabric_device_iaf.h
1213
${CMAKE_CURRENT_SOURCE_DIR}/fabric_device_iaf.cpp
1314
)
Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
/*
2+
* Copyright (C) 2024 Intel Corporation
3+
*
4+
* SPDX-License-Identifier: MIT
5+
*
6+
*/
7+
8+
#include "level_zero/core/source/fabric/fabric.h"
9+
10+
#include <vector>
11+
12+
namespace L0 {
13+
14+
// Populates `edges` with one edge per direct connection reported by
// getEdgeProperty() between any two of the given vertices or their
// sub-vertices. The third (indirect edges) argument is left untouched.
void FabricEdge::createEdgesFromVertices(const std::vector<FabricVertex *> &vertices, std::vector<FabricEdge *> &edges, std::vector<FabricEdge *> &) {

    // Flatten the hierarchy: every vertex is followed by its sub-vertices
    std::vector<FabricVertex *> flatVertices = {};
    for (auto &topVertex : vertices) {
        flatVertices.push_back(topVertex);
        for (auto &childVertex : topVertex->subVertices) {
            flatVertices.push_back(childVertex);
        }
    }

    // Probe every unordered pair once through the interfaces of the lower-indexed vertex
    for (uint32_t idxA = 0; idxA < flatVertices.size(); idxA++) {
        auto nodeA = flatVertices[idxA];
        for (uint32_t idxB = idxA + 1; idxB < flatVertices.size(); idxB++) {
            auto nodeB = flatVertices[idxB];
            ze_fabric_edge_exp_properties_t linkProps = {};

            for (auto const &fabricInterface : nodeA->pFabricDeviceInterfaces) {
                if (fabricInterface.second->getEdgeProperty(nodeB, linkProps)) {
                    edges.push_back(create(nodeA, nodeB, linkProps));
                }
            }
        }
    }
}
42+
43+
} // namespace L0

0 commit comments

Comments
 (0)