-
Notifications
You must be signed in to change notification settings - Fork 14.8k
[LV] Ignore user-specified interleave count when unsafe. #153009
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -4645,9 +4645,9 @@ void LoopVectorizationCostModel::collectElementTypesForWidening() { | |
} | ||
} | ||
|
||
unsigned | ||
LoopVectorizationPlanner::selectInterleaveCount(VPlan &Plan, ElementCount VF, | ||
InstructionCost LoopCost) { | ||
unsigned LoopVectorizationPlanner::selectInterleaveCount( | ||
VPlan &Plan, ElementCount VF, unsigned UserIC, InstructionCost LoopCost, | ||
bool &IntBeneficial) { | ||
// -- The interleave heuristics -- | ||
// We interleave the loop in order to expose ILP and reduce the loop overhead. | ||
// There are many micro-architectural considerations that we can't predict | ||
|
@@ -4662,25 +4662,26 @@ LoopVectorizationPlanner::selectInterleaveCount(VPlan &Plan, ElementCount VF, | |
// 3. We don't interleave if we think that we will spill registers to memory | ||
// due to the increased register pressure. | ||
|
||
if (!CM.isScalarEpilogueAllowed()) | ||
// We used the distance for the interleave count. This should not be overriden | ||
// by a user-specified IC. | ||
if (!Legal->isSafeForAnyVectorWidth()) | ||
return 1; | ||
|
||
if (!CM.isScalarEpilogueAllowed()) | ||
return std::max(1U, UserIC); | ||
|
||
if (any_of(Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis(), | ||
IsaPred<VPEVLBasedIVPHIRecipe>)) { | ||
LLVM_DEBUG(dbgs() << "LV: Preference for VP intrinsics indicated. " | ||
"Unroll factor forced to be 1.\n"); | ||
return 1; | ||
return std::max(1U, UserIC); | ||
} | ||
|
||
// We used the distance for the interleave count. | ||
if (!Legal->isSafeForAnyVectorWidth()) | ||
return 1; | ||
|
||
// We don't attempt to perform interleaving for loops with uncountable early | ||
// exits because the VPInstruction::AnyOf code cannot currently handle | ||
// multiple parts. | ||
if (Plan.hasEarlyExit()) | ||
return 1; | ||
return std::max(1U, UserIC); | ||
|
||
const bool HasReductions = | ||
any_of(Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis(), | ||
|
@@ -4697,7 +4698,7 @@ LoopVectorizationPlanner::selectInterleaveCount(VPlan &Plan, ElementCount VF, | |
|
||
// Loop body is free and there is no need for interleaving. | ||
if (LoopCost == 0) | ||
return 1; | ||
return std::max(1U, UserIC); | ||
} | ||
|
||
VPRegisterUsage R = | ||
|
@@ -4834,7 +4835,8 @@ LoopVectorizationPlanner::selectInterleaveCount(VPlan &Plan, ElementCount VF, | |
// benefit from interleaving. | ||
if (VF.isVector() && HasReductions) { | ||
LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n"); | ||
return IC; | ||
IntBeneficial = IC > 1; | ||
return UserIC > 0 ? UserIC : IC; | ||
} | ||
|
||
// For any scalar loop that either requires runtime checks or predication we | ||
|
@@ -4917,7 +4919,7 @@ LoopVectorizationPlanner::selectInterleaveCount(VPlan &Plan, ElementCount VF, | |
}); | ||
if (HasSelectCmpReductions) { | ||
LLVM_DEBUG(dbgs() << "LV: Not interleaving select-cmp reductions.\n"); | ||
return 1; | ||
return std::max(1U, UserIC); | ||
} | ||
|
||
// If we have a scalar reduction (vector reductions are already dealt with | ||
|
@@ -4936,7 +4938,7 @@ LoopVectorizationPlanner::selectInterleaveCount(VPlan &Plan, ElementCount VF, | |
if (HasOrderedReductions) { | ||
LLVM_DEBUG( | ||
dbgs() << "LV: Not interleaving scalar ordered reductions.\n"); | ||
return 1; | ||
return std::max(1U, UserIC); | ||
} | ||
|
||
unsigned F = MaxNestedScalarReductionIC; | ||
|
@@ -4949,7 +4951,9 @@ LoopVectorizationPlanner::selectInterleaveCount(VPlan &Plan, ElementCount VF, | |
std::max(StoresIC, LoadsIC) > SmallIC) { | ||
LLVM_DEBUG( | ||
dbgs() << "LV: Interleaving to saturate store or load ports.\n"); | ||
return std::max(StoresIC, LoadsIC); | ||
IC = std::max(StoresIC, LoadsIC); | ||
IntBeneficial = IC > 1; | ||
return UserIC > 0 ? UserIC : IC; | ||
} | ||
|
||
// If there are scalar reductions and TTI has enabled aggressive | ||
|
@@ -4958,22 +4962,27 @@ LoopVectorizationPlanner::selectInterleaveCount(VPlan &Plan, ElementCount VF, | |
LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n"); | ||
// Interleave no less than SmallIC but not as aggressive as the normal IC | ||
// to satisfy the rare situation when resources are too limited. | ||
return std::max(IC / 2, SmallIC); | ||
IC = std::max(IC / 2, SmallIC); | ||
IntBeneficial = IC > 1; | ||
return UserIC > 0 ? UserIC : IC; | ||
} | ||
|
||
LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n"); | ||
return SmallIC; | ||
IC = std::max(SmallIC, UserIC); | ||
IntBeneficial = IC > 1; | ||
return UserIC > 0 ? UserIC : IC; | ||
} | ||
|
||
// Interleave if this is a large loop (small loops are already dealt with by | ||
// this point) that could benefit from interleaving. | ||
if (AggressivelyInterleaveReductions) { | ||
LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n"); | ||
return IC; | ||
IntBeneficial = IC > 1; | ||
return UserIC > 0 ? UserIC : IC; | ||
} | ||
|
||
LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n"); | ||
return 1; | ||
return std::max(1U, UserIC); | ||
} | ||
|
||
bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I, | ||
|
@@ -10147,15 +10156,16 @@ bool LoopVectorizePass::processLoop(Loop *L) { | |
LVP.emitInvalidCostRemarks(ORE); | ||
|
||
GeneratedRTChecks Checks(PSE, DT, LI, TTI, F->getDataLayout(), CM.CostKind); | ||
bool IntBeneficial = false; | ||
if (LVP.hasPlanWithVF(VF.Width)) { | ||
// Select the interleave count. | ||
IC = LVP.selectInterleaveCount(LVP.getPlanFor(VF.Width), VF.Width, VF.Cost); | ||
IC = LVP.selectInterleaveCount(LVP.getPlanFor(VF.Width), VF.Width, UserIC, | ||
VF.Cost, IntBeneficial); | ||
|
||
unsigned SelectedIC = std::max(IC, UserIC); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @fhahn I think this code looks wrong and is making life difficult for @kmclaughlin-arm here. If the user has asked for an interleave count of 2, but we create checks based on the selected interleave count of 4, then either it's just functionally incorrect, or it's leading to a pessimistic cost model where we may never enter the vector loop. |
||
// Optimistically generate runtime checks if they are needed. Drop them if | ||
// they turn out to not be profitable. | ||
if (VF.Width.isVector() || SelectedIC > 1) { | ||
Checks.create(L, *LVL.getLAI(), PSE.getPredicate(), VF.Width, SelectedIC); | ||
if (VF.Width.isVector() || IC > 1) { | ||
Checks.create(L, *LVL.getLAI(), PSE.getPredicate(), VF.Width, IC); | ||
|
||
// Bail out early if either the SCEV or memory runtime checks are known to | ||
// fail. In that case, the vector loop would never execute. | ||
|
@@ -10201,15 +10211,21 @@ bool LoopVectorizePass::processLoop(Loop *L) { | |
VectorizeLoop = false; | ||
} | ||
|
||
if (!LVP.hasPlanWithVF(VF.Width) && UserIC > 1) { | ||
if (IC == 1 && UserIC > 1) { | ||
LLVM_DEBUG(dbgs() << "LV: Ignoring user-specified interleave count.\n"); | ||
IntDiagMsg = {"InterleavingUnsafe", | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't think this is strictly true because on the line above you've set SafeUserIC to 0, and then we have It's possible that |
||
"Ignoring user-specified interleave count due to possibly " | ||
"unsafe dependencies in the loop."}; | ||
InterleaveLoop = false; | ||
} else if (!LVP.hasPlanWithVF(VF.Width) && UserIC > 1) { | ||
// Tell the user interleaving was avoided up-front, despite being explicitly | ||
// requested. | ||
LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and " | ||
"interleaving should be avoided up front\n"); | ||
IntDiagMsg = {"InterleavingAvoided", | ||
"Ignoring UserIC, because interleaving was avoided up front"}; | ||
InterleaveLoop = false; | ||
} else if (IC == 1 && UserIC <= 1) { | ||
} else if (!IntBeneficial && UserIC <= 1) { | ||
// Tell the user interleaving is not beneficial. | ||
LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n"); | ||
IntDiagMsg = { | ||
|
@@ -10221,7 +10237,7 @@ bool LoopVectorizePass::processLoop(Loop *L) { | |
IntDiagMsg.second += | ||
" and is explicitly disabled or interleave count is set to 1"; | ||
} | ||
} else if (IC > 1 && UserIC == 1) { | ||
} else if (IntBeneficial && UserIC == 1) { | ||
// Tell the user interleaving is beneficial, but it explicitly disabled. | ||
LLVM_DEBUG( | ||
dbgs() << "LV: Interleaving is beneficial but is explicitly disabled."); | ||
|
@@ -10244,9 +10260,6 @@ bool LoopVectorizePass::processLoop(Loop *L) { | |
InterleaveLoop = false; | ||
} | ||
|
||
// Override IC if user provided an interleave count. | ||
IC = UserIC > 0 ? UserIC : IC; | ||
|
||
// Emit diagnostic messages, if any. | ||
const char *VAPassName = Hints.vectorizeAnalysisPassName(); | ||
if (!VectorizeLoop && !InterleaveLoop) { | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -417,21 +417,17 @@ for.end: ; preds = %for.body, %entry | |
|
||
; Note: This test was added to ensure we always check the legality of reductions (end emit a warning if necessary) before checking for memory dependencies | ||
; CHECK-REMARK: Scalable vectorization not supported for the reduction operations found in this loop. | ||
; CHECK-REMARK: vectorized loop (vectorization width: 4, interleaved count: 2) | ||
; CHECK-REMARK: Ignoring user-specified interleave count due to possibly unsafe dependencies in the loop. | ||
; CHECK-REMARK: vectorized loop (vectorization width: 4, interleaved count: 1) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is it worth adding a check for the remark we emit about ignoring the UserIC? |
||
define i32 @memory_dependence(ptr noalias nocapture %a, ptr noalias nocapture readonly %b, i64 %n) { | ||
; CHECK-LABEL: @memory_dependence | ||
; CHECK: vector.body: | ||
; CHECK: %[[LOAD1:.*]] = load <4 x i32> | ||
; CHECK: %[[LOAD2:.*]] = load <4 x i32> | ||
; CHECK: %[[LOAD3:.*]] = load <4 x i32> | ||
; CHECK: %[[LOAD4:.*]] = load <4 x i32> | ||
; CHECK: %[[ADD1:.*]] = add nsw <4 x i32> %[[LOAD3]], %[[LOAD1]] | ||
; CHECK: %[[ADD2:.*]] = add nsw <4 x i32> %[[LOAD4]], %[[LOAD2]] | ||
; CHECK: %[[MUL1:.*]] = mul <4 x i32> %[[LOAD3]] | ||
; CHECK: %[[MUL2:.*]] = mul <4 x i32> %[[LOAD4]] | ||
; CHECK: %[[ADD1:.*]] = add nsw <4 x i32> %[[LOAD2]], %[[LOAD1]] | ||
; CHECK: %[[MUL1:.*]] = mul <4 x i32> %[[LOAD2]] | ||
; CHECK: middle.block: | ||
; CHECK: %[[RDX:.*]] = mul <4 x i32> %[[MUL2]], %[[MUL1]] | ||
; CHECK: call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> %[[RDX]]) | ||
; CHECK: call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> %[[MUL1]]) | ||
entry: | ||
br label %for.body | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,31 @@ | ||
; RUN: opt -passes=loop-vectorize -pass-remarks-analysis=loop-vectorize -S < %s 2>&1 | FileCheck %s | ||
|
||
; Make sure the unsafe user specified interleave count is ignored. | ||
|
||
; CHECK: remark: <unknown>:0:0: Ignoring user-specified interleave count due to possibly unsafe dependencies in the loop. | ||
; CHECK-LABEL: @loop_distance_4 | ||
define void @loop_distance_4(i64 %N, ptr %a, ptr %b) { | ||
entry: | ||
%cmp10 = icmp sgt i64 %N, 4 | ||
br i1 %cmp10, label %for.body, label %for.end | ||
|
||
for.body: | ||
%indvars.iv = phi i64 [ 4, %entry ], [ %indvars.iv.next, %for.body ] | ||
%0 = getelementptr i32, ptr %b, i64 %indvars.iv | ||
%arrayidx = getelementptr i8, ptr %0, i64 -16 | ||
%1 = load i32, ptr %arrayidx, align 4 | ||
%arrayidx2 = getelementptr inbounds nuw i32, ptr %a, i64 %indvars.iv | ||
%2 = load i32, ptr %arrayidx2, align 4 | ||
%add = add nsw i32 %2, %1 | ||
store i32 %add, ptr %0, align 4 | ||
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 | ||
%exitcond.not = icmp eq i64 %indvars.iv.next, %N | ||
br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !1 | ||
|
||
for.end: | ||
ret void | ||
} | ||
|
||
!1 = !{!1, !2, !3} | ||
!2 = !{!"llvm.loop.interleave.count", i32 4} | ||
!3 = !{!"llvm.loop.vectorize.width", i32 4} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@fhahn by fixing the UserIC in selectInterleaveCount we have reintroduced a bug versus the original version of the patch. If you look above we pass in the original UserIC to LVP.plan, which is then used by computeMaxVF to determine if there is no scalar epilogue and to decide the tail-folding style. This was based on the assumption that UserIC will not change. I think we either: