[mlir][gpu][transforms] Add support for mapping to lanes #146912
Conversation
@llvm/pr-subscribers-mlir
Author: Nicolas Vasilache (nicolasvasilache)
Changes
Full diff: https://github.com/llvm/llvm-project/pull/146912.diff
6 Files Affected:
diff --git a/mlir/include/mlir/Dialect/GPU/IR/GPUDeviceMappingAttr.td b/mlir/include/mlir/Dialect/GPU/IR/GPUDeviceMappingAttr.td
index 6e0f6f1d78eda..63f228ca3157f 100644
--- a/mlir/include/mlir/Dialect/GPU/IR/GPUDeviceMappingAttr.td
+++ b/mlir/include/mlir/Dialect/GPU/IR/GPUDeviceMappingAttr.td
@@ -228,6 +228,30 @@ def GPUThreadMappingAttr
}];
}
+def GPULaneMappingAttr
+ : GPU_Attr<"GPULaneMapping", "lane", [
+ DeclareAttrInterfaceMethods<DeviceMappingAttrInterface> ]> {
+ let parameters = (ins
+ EnumParameter<MappingIdEnum>:$lane
+ );
+ let assemblyFormat = "`<` params `>`";
+ let description = [{
+ An attribute that allows defining lane parallelism for GPU devices.
+
+ It can be consumed by lowering to generate GPU code.
+
+ #### 3D mapping mode
+
+ Unsupported
+
+ #### Linear mapping mode
+
+ The linear lane id is obtained by linearizing the index of the lane.
+ If required, predication occurs on the linear id. This allows specifying
+ predication on a 1D subset of the (linearized) lanes.
+ }];
+}
+
def GPUMemorySpaceMappingAttr : GPU_Attr<"GPUMemorySpaceMapping", "memory_space", [
DeclareAttrInterfaceMethods<DeviceMappingAttrInterface> ] > {
let parameters = (ins
diff --git a/mlir/include/mlir/Dialect/GPU/TransformOps/Utils.h b/mlir/include/mlir/Dialect/GPU/TransformOps/Utils.h
index 52fc6f4d5c71b..111c67638efc8 100644
--- a/mlir/include/mlir/Dialect/GPU/TransformOps/Utils.h
+++ b/mlir/include/mlir/Dialect/GPU/TransformOps/Utils.h
@@ -117,6 +117,15 @@ struct GpuThreadIdBuilder : public GpuIdBuilder {
GpuThreadIdBuilder(MLIRContext *ctx, bool useLinearMapping = false);
};
+/// Builder for lane id.
+/// The `idBuilder` method returns nD values used for indexing rewrites as well
+/// as 1D sizes for predicate generation.
+/// Only the `useLinearMapping` case is supported.
+struct GpuLaneIdBuilder : public GpuIdBuilder {
+ GpuLaneIdBuilder(MLIRContext *ctx, int64_t warpSize, bool unused);
+ int64_t warpSize = 32;
+};
+
/// Determine if the size of the kernel configuration is supported by the
/// GPU architecture being used.
/// TODO this is currently hardwired to CUDA, parameterize and generalize.
diff --git a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
index a5eb62ce66e0b..56631f1aac084 100644
--- a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
+++ b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
@@ -106,6 +106,20 @@ int64_t GPUThreadMappingAttr::getRelativeIndex() const {
: getMappingId();
}
+int64_t GPULaneMappingAttr::getMappingId() const {
+ return static_cast<int64_t>(getLane());
+}
+
+bool GPULaneMappingAttr::isLinearMapping() const {
+ return getMappingId() >= static_cast<int64_t>(MappingId::LinearDim0);
+}
+
+int64_t GPULaneMappingAttr::getRelativeIndex() const {
+ return isLinearMapping()
+ ? getMappingId() - static_cast<int64_t>(MappingId::LinearDim0)
+ : getMappingId();
+}
+
int64_t GPUMemorySpaceMappingAttr::getMappingId() const {
return static_cast<int64_t>(getAddressSpace());
}
diff --git a/mlir/lib/Dialect/GPU/TransformOps/GPUTransformOps.cpp b/mlir/lib/Dialect/GPU/TransformOps/GPUTransformOps.cpp
index 6446235c06fb2..20d1c94409238 100644
--- a/mlir/lib/Dialect/GPU/TransformOps/GPUTransformOps.cpp
+++ b/mlir/lib/Dialect/GPU/TransformOps/GPUTransformOps.cpp
@@ -313,11 +313,14 @@ checkMappingAttributeTypes(std::optional<TransformOpInterface> transformOp,
llvm::IsaPred<GPUWarpMappingAttr>);
bool hasThreadMapping = llvm::any_of(forallOp.getMapping().value(),
llvm::IsaPred<GPUThreadMappingAttr>);
+ bool hasLaneMapping = llvm::any_of(forallOp.getMapping().value(),
+ llvm::IsaPred<GPULaneMappingAttr>);
int64_t countMappingTypes = 0;
countMappingTypes += hasBlockMapping ? 1 : 0;
countMappingTypes += hasWarpgroupMapping ? 1 : 0;
countMappingTypes += hasWarpMapping ? 1 : 0;
countMappingTypes += hasThreadMapping ? 1 : 0;
+ countMappingTypes += hasLaneMapping ? 1 : 0;
if (countMappingTypes > 1) {
return definiteFailureHelper(
transformOp, forallOp,
@@ -330,7 +333,8 @@ checkMappingAttributeTypes(std::optional<TransformOpInterface> transformOp,
"scf.forall op requires a mapping attribute of kind 'block'");
}
if (std::is_same<MappingKindType, ThreadMappingKind>::value &&
- !hasThreadMapping && !hasWarpMapping && !hasWarpgroupMapping) {
+ !hasLaneMapping && !hasThreadMapping && !hasWarpMapping &&
+ !hasWarpgroupMapping) {
return definiteFailureHelper(transformOp, forallOp,
"scf.forall op requires a mapping attribute "
"of kind 'thread' or 'warp'");
@@ -473,10 +477,17 @@ static DiagnosedSilenceableFailure rewriteOneForallCommonImpl(
SmallVector<int64_t> originalBasis(availableMappingSizes);
bool originalBasisWasProvided = !originalBasis.empty();
if (!originalBasisWasProvided) {
+ LDBG("----originalBasis was not provided, deriving it and there will be no "
+ "predication");
originalBasis = forallMappingSizes;
while (originalBasis.size() < 3)
originalBasis.push_back(1);
+ } else {
+ LDBG("----originalBasis was provided, using it, there will be predication");
}
+ LLVM_DEBUG(
+ llvm::interleaveComma(originalBasis, DBGS() << "------originalBasis: ");
+ llvm::dbgs() << "\n");
IdBuilderResult builderResult =
gpuIdBuilder.idBuilder(rewriter, loc, forallMappingSizes, originalBasis);
@@ -490,6 +501,7 @@ static DiagnosedSilenceableFailure rewriteOneForallCommonImpl(
forallMappingAttrs.getArrayRef().take_front(forallOp.getRank()))) {
auto mappingAttr = cast<DeviceMappingAttrInterface>(dim);
Value peIdOp = mappingIdOps[mappingAttr.getRelativeIndex()];
+ LDBG("----map: " << iv << " to" << peIdOp);
bvm.map(iv, peIdOp);
}
@@ -790,6 +802,9 @@ getThreadIdBuilder(std::optional<TransformOpInterface> transformOp,
.Case([&](GPUThreadMappingAttr) {
return GpuThreadIdBuilder(ctx, useLinearMapping);
})
+ .Case([&](GPULaneMappingAttr) {
+ return GpuLaneIdBuilder(ctx, warpSize, useLinearMapping);
+ })
.Default([&](DeviceMappingAttrInterface) -> GpuIdBuilder {
llvm_unreachable("unknown mapping attribute");
});
diff --git a/mlir/lib/Dialect/GPU/TransformOps/Utils.cpp b/mlir/lib/Dialect/GPU/TransformOps/Utils.cpp
index 9853e80828390..c693a2fa01e89 100644
--- a/mlir/lib/Dialect/GPU/TransformOps/Utils.cpp
+++ b/mlir/lib/Dialect/GPU/TransformOps/Utils.cpp
@@ -156,6 +156,63 @@ static GpuIdBuilderFnType common3DIdBuilderFn(int64_t multiplicity = 1) {
return res;
}
+/// Create a lane id builder that takes the `originalBasis` and decomposes
+/// it into the basis of `forallMappingSizes`. The linear id builder returns an
+/// n-D vector of ids for indexing and 1-D size + id for predicate generation.
+static GpuIdBuilderFnType laneIdBuilderFn(int64_t periodicity) {
+ auto res = [periodicity](RewriterBase &rewriter, Location loc,
+ ArrayRef<int64_t> forallMappingSizes,
+ ArrayRef<int64_t> originalBasis) {
+ SmallVector<OpFoldResult> originalBasisOfr =
+ getAsIndexOpFoldResult(rewriter.getContext(), originalBasis);
+ OpFoldResult linearId =
+ buildLinearId<ThreadIdOp>(rewriter, loc, originalBasisOfr);
+ AffineExpr d0 = getAffineDimExpr(0, rewriter.getContext());
+ linearId = affine::makeComposedFoldedAffineApply(
+ rewriter, loc, d0 % periodicity, {linearId});
+
+ // Sizes in [0 .. n] -> [n .. 0] order to properly compute strides in
+ // "row-major" order.
+ SmallVector<int64_t> reverseBasisSizes(llvm::reverse(forallMappingSizes));
+ SmallVector<int64_t> strides = computeStrides(reverseBasisSizes);
+ SmallVector<AffineExpr> delinearizingExprs = delinearize(d0, strides);
+ SmallVector<Value> ids;
+ // Reverse back to be in [0 .. n] order.
+ for (AffineExpr e : llvm::reverse(delinearizingExprs)) {
+ ids.push_back(
+ affine::makeComposedAffineApply(rewriter, loc, e, {linearId}));
+ }
+
+ // clang-format off
+ LLVM_DEBUG(llvm::interleaveComma(reverseBasisSizes,
+ DBGS() << "--delinearization basis: ");
+ llvm::dbgs() << "\n";
+ llvm::interleaveComma(strides,
+ DBGS() << "--delinearization strides: ");
+ llvm::dbgs() << "\n";
+ llvm::interleaveComma(delinearizingExprs,
+ DBGS() << "--delinearization exprs: ");
+ llvm::dbgs() << "\n";
+ llvm::interleaveComma(ids, DBGS() << "--ids: ");
+ llvm::dbgs() << "\n";);
+ // clang-format on
+
+ // Return n-D ids for indexing and 1-D size + id for predicate generation.
+ return IdBuilderResult{
+ /*mappingIdOps=*/ids,
+ /*availableMappingSizes=*/
+ SmallVector<int64_t>{computeProduct(originalBasis)},
+ // `forallMappingSizes` iterate in the scaled basis, they need to be
+ // scaled back into the original basis to provide tight
+ // activeMappingSizes quantities for predication.
+ /*activeMappingSizes=*/
+ SmallVector<int64_t>{computeProduct(forallMappingSizes)},
+ /*activeIdOps=*/SmallVector<Value>{linearId.get<Value>()}};
+ };
+
+ return res;
+}
+
namespace mlir {
namespace transform {
namespace gpu {
@@ -221,6 +278,16 @@ GpuThreadIdBuilder::GpuThreadIdBuilder(MLIRContext *ctx, bool useLinearMapping)
: common3DIdBuilderFn<ThreadIdOp>(/*multiplicity=*/1);
}
+GpuLaneIdBuilder::GpuLaneIdBuilder(MLIRContext *ctx, int64_t warpSize,
+ bool unused)
+ : GpuIdBuilder(ctx, /*useLinearMapping=*/true,
+ [](MLIRContext *ctx, MappingId id) {
+ return GPULaneMappingAttr::get(ctx, id);
+ }),
+ warpSize(warpSize) {
+ idBuilder = laneIdBuilderFn(/*periodicity=*/warpSize);
+}
+
DiagnosedSilenceableFailure checkGpuLimits(TransformOpInterface transformOp,
std::optional<int64_t> gridDimX,
std::optional<int64_t> gridDimY,
diff --git a/mlir/test/Dialect/GPU/transform-gpu.mlir b/mlir/test/Dialect/GPU/transform-gpu.mlir
index 09ae0f4af686f..d1f84ef5eea0f 100644
--- a/mlir/test/Dialect/GPU/transform-gpu.mlir
+++ b/mlir/test/Dialect/GPU/transform-gpu.mlir
@@ -691,3 +691,66 @@ module attributes {transform.with_named_sequence} {
transform.yield
}
}
+
+// -----
+
+#map = affine_map<(d0) -> (d0 * 128)>
+#map1 = affine_map<(d0) -> (d0 * 32)>
+
+// CHECK-DAG: #[[$MAPB:.*]] = affine_map<(d0) -> (d0 * 128)>
+// CHECK-DAG: #[[$MAPLANE:.*]] = affine_map<(d0, d1) -> ((d0 + d1 * 73) mod 32)>
+// CHECK-DAG: #[[$MAPI:.*]] = affine_map<(d0, d1) -> (d0 * 32 + d1 * 2336 - ((d0 + d1 * 73) floordiv 2) * 64)>
+// CHECK-DAG: #[[$MAPJ:.*]] = affine_map<(d0, d1) -> ((((d0 + d1 * 73) mod 32) floordiv 2) * 32)>
+
+// CHECK-LABEL: func.func @simple_fill(
+func.func @simple_fill(%arg0: memref<128xf32>) -> memref<128xf32> {
+ %c0 = arith.constant 0 : index
+ %cst = arith.constant dense<0.000000e+00> : vector<32xf32>
+// CHECK: %[[C6:.*]] = arith.constant 6 : index
+// CHECK: gpu.launch
+ scf.forall (%arg1) in (1) {
+// CHECK: %[[BIDX:.*]] = gpu.block_id x
+// CHECK: %[[BLX:.*]] = affine.apply #[[$MAPB]](%[[BIDX]])
+ %0 = affine.apply #map(%arg1)
+ %subview = memref.subview %arg0[%0] [128] [1] : memref<128xf32> to memref<128xf32, strided<[1], offset: ?>>
+
+ // %arg2 and %arg3 map to lanes [0, 6) and are turned into expressions
+ // involving threadIdx.x/y by the map_nested_forall_to_threads
+ // transformation. This results in an if (linear_thread_id < 6) conditional.
+ scf.forall (%arg2, %arg3) in (2, 3) {
+ // CHECK: %[[TIDX:.*]] = gpu.thread_id x
+ // CHECK: %[[TIDY:.*]] = gpu.thread_id y
+ // CHECK: %[[LID:.*]] = affine.apply #[[$MAPLANE]](%[[TIDX]], %[[TIDY]])
+ // CHECK: %[[COND:.*]] = arith.cmpi ult, %[[LID]], %[[C6]]
+ // CHECK: scf.if %[[COND]]
+ // CHECK: %[[I:.*]] = affine.apply #[[$MAPI]](%[[TIDX]], %[[TIDY]])
+ // CHECK: %[[J:.*]] = affine.apply #[[$MAPJ]](%[[TIDX]], %[[TIDY]])
+ // CHECK: memref.subview %{{.*}}[%[[I]]] [%[[J]]]
+ %1 = affine.apply #map1(%arg2)
+ %2 = affine.apply #map1(%arg3)
+ %subview_0 = memref.subview %subview[%1] [%2] [1] : memref<128xf32, strided<[1], offset: ?>> to memref<?xf32, strided<[1], offset: ?>>
+ vector.transfer_write %cst, %subview_0[%c0] {in_bounds = [true]} : vector<32xf32>, memref<?xf32, strided<[1], offset: ?>>
+
+ // This could be obtained e.g. if a previous transformation mapped this loop
+ // to lanes. This can also be written by hand as valid IR.
+ } {mapping = [#gpu.lane<linear_dim_0>, #gpu.lane<linear_dim_1>]}
+
+ memref.copy %subview, %subview : memref<128xf32, strided<[1], offset: ?>> to memref<128xf32, strided<[1], offset: ?>>
+ } {mapping = [#gpu.block<x>]}
+ return %arg0 : memref<128xf32>
+}
+
+module attributes {transform.with_named_sequence} {
+ transform.named_sequence @__transform_main(%module_op: !transform.any_op {transform.readonly}) {
+ %func = transform.structured.match ops{["func.func"]} in %module_op
+ : (!transform.any_op) -> !transform.any_op
+ %gpu_launch = transform.gpu.map_forall_to_blocks %func generate_gpu_launch
+ : (!transform.any_op) -> !transform.any_op
+
+ // This transformation maps scf.forall ivs to a particular mapping of thread
+ // ids (laneid, threadid, warpid or warpgroupid).
+ transform.gpu.map_nested_forall_to_threads %gpu_launch block_dims = [73, 5, 1]
+ : (!transform.any_op) -> !transform.any_op
+ transform.yield
+ }
+}
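
For readers of the new test above: with `block_dims = [73, 5, 1]` and a warp size of 32, the lane-mapped nested `scf.forall` is rewritten into roughly the structure sketched below. This is a hand reconstruction from the CHECK lines, not verbatim compiler output; the SSA names are illustrative.

```mlir
// Linear thread id = tid.x + tid.y * 73; the lane id is that value modulo the
// warp size (32), which is exactly the #MAPLANE affine map checked above.
%tidx = gpu.thread_id x
%tidy = gpu.thread_id y
%lane = affine.apply affine_map<(d0, d1) -> ((d0 + d1 * 73) mod 32)>(%tidx, %tidy)
// Only lanes [0, 6) are active, 6 being the product of the forall sizes (2, 3);
// predication happens on the linearized lane id.
%c6 = arith.constant 6 : index
%active = arith.cmpi ult, %lane, %c6 : index
scf.if %active {
  // The forall ivs are replaced by the delinearized lane id and composed with
  // the body's #map1 (d0 * 32), yielding the #MAPI / #MAPJ maps checked above.
  %i = affine.apply affine_map<(d0, d1) -> (d0 * 32 + d1 * 2336 - ((d0 + d1 * 73) floordiv 2) * 64)>(%tidx, %tidy)
  %j = affine.apply affine_map<(d0, d1) -> ((((d0 + d1 * 73) mod 32) floordiv 2) * 32)>(%tidx, %tidy)
  // ... original forall body, with %i and %j in place of %1 and %2 ...
}
```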
Co-authored-by: Oleksandr "Alex" Zinenko <[email protected]>
Force-pushed from 2430d7b to a9cf089.
LGTM, but it looks like there's a declaration/header missing though the CI passing implies otherwise...
int64_t GPULaneMappingAttr::getMappingId() const {
  return static_cast<int64_t>(getLane());
}

bool GPULaneMappingAttr::isLinearMapping() const {
  return getMappingId() >= static_cast<int64_t>(MappingId::LinearDim0);
}

int64_t GPULaneMappingAttr::getRelativeIndex() const {
I'm mildly confused. Where are the declarations of these methods?
// CHECK: memref.subview %{{.*}}[%[[I]]] [%[[J]]]
%1 = affine.apply #map1(%arg2)
%2 = affine.apply #map1(%arg3)
%subview_0 = memref.subview %subview[%1] [%2] [1] : memref<128xf32, strided<[1], offset: ?>> to memref<?xf32, strided<[1], offset: ?>>
Is it intentional that %2 is used as the size of a 1d memref instead of as a second index into a 2d memref? It makes the test case hard to understand.
This revision adds a new attribute for mapping `scf.forall` to linear lane ids. Example:
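A minimal sketch of such an input, adapted from the test case added in this revision (the function name, memref shape, and loop body are illustrative; in practice the lane-mapped forall sits inside a `gpu.launch` or a block-mapped `scf.forall`, as in the test):

```mlir
func.func @lane_mapped_fill(%buf: memref<2x3xf32>, %v: f32) {
  // Each (i, j) iteration is executed by one lane; the mapping attribute
  // requests the new linear lane mapping introduced by this PR.
  scf.forall (%i, %j) in (2, 3) {
    memref.store %v, %buf[%i, %j] : memref<2x3xf32>
  } {mapping = [#gpu.lane<linear_dim_0>, #gpu.lane<linear_dim_1>]}
  return
}
```

The same `transform.gpu.map_nested_forall_to_threads` op that handles thread, warp, and warpgroup mappings then rewrites such a forall in terms of lane ids, as exercised by the transform script in the test.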