Skip to content

[LV] Ignore user-specified interleave count when unsafe. #153009

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 3 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
Original file line number Diff line number Diff line change
Expand Up @@ -497,8 +497,8 @@ class LoopVectorizationPlanner {
/// If interleave count has been specified by metadata it will be returned.
/// Otherwise, the interleave count is computed and returned. VF and LoopCost
/// are the selected vectorization factor and the cost of the selected VF.
unsigned selectInterleaveCount(VPlan &Plan, ElementCount VF,
InstructionCost LoopCost);
unsigned selectInterleaveCount(VPlan &Plan, ElementCount VF, unsigned UserIC,
InstructionCost LoopCost, bool &IntBeneficial);

/// Generate the IR code for the vectorized loop captured in VPlan \p BestPlan
/// according to the best selected \p VF and \p UF.
Expand Down
71 changes: 42 additions & 29 deletions llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4645,9 +4645,9 @@ void LoopVectorizationCostModel::collectElementTypesForWidening() {
}
}

unsigned
LoopVectorizationPlanner::selectInterleaveCount(VPlan &Plan, ElementCount VF,
InstructionCost LoopCost) {
unsigned LoopVectorizationPlanner::selectInterleaveCount(
VPlan &Plan, ElementCount VF, unsigned UserIC, InstructionCost LoopCost,
bool &IntBeneficial) {
// -- The interleave heuristics --
// We interleave the loop in order to expose ILP and reduce the loop overhead.
// There are many micro-architectural considerations that we can't predict
Expand All @@ -4662,25 +4662,26 @@ LoopVectorizationPlanner::selectInterleaveCount(VPlan &Plan, ElementCount VF,
// 3. We don't interleave if we think that we will spill registers to memory
// due to the increased register pressure.

if (!CM.isScalarEpilogueAllowed())
// We used the distance for the interleave count. This should not be overriden
// by a user-specified IC.
if (!Legal->isSafeForAnyVectorWidth())
return 1;

if (!CM.isScalarEpilogueAllowed())
return std::max(1U, UserIC);

if (any_of(Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis(),
IsaPred<VPEVLBasedIVPHIRecipe>)) {
LLVM_DEBUG(dbgs() << "LV: Preference for VP intrinsics indicated. "
"Unroll factor forced to be 1.\n");
return 1;
return std::max(1U, UserIC);
}

// We used the distance for the interleave count.
if (!Legal->isSafeForAnyVectorWidth())
return 1;

// We don't attempt to perform interleaving for loops with uncountable early
// exits because the VPInstruction::AnyOf code cannot currently handle
// multiple parts.
if (Plan.hasEarlyExit())
return 1;
return std::max(1U, UserIC);

const bool HasReductions =
any_of(Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis(),
Expand All @@ -4697,7 +4698,7 @@ LoopVectorizationPlanner::selectInterleaveCount(VPlan &Plan, ElementCount VF,

// Loop body is free and there is no need for interleaving.
if (LoopCost == 0)
return 1;
return std::max(1U, UserIC);
}

VPRegisterUsage R =
Expand Down Expand Up @@ -4834,7 +4835,8 @@ LoopVectorizationPlanner::selectInterleaveCount(VPlan &Plan, ElementCount VF,
// benefit from interleaving.
if (VF.isVector() && HasReductions) {
LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
return IC;
IntBeneficial = IC > 1;
return UserIC > 0 ? UserIC : IC;
}

// For any scalar loop that either requires runtime checks or predication we
Expand Down Expand Up @@ -4917,7 +4919,7 @@ LoopVectorizationPlanner::selectInterleaveCount(VPlan &Plan, ElementCount VF,
});
if (HasSelectCmpReductions) {
LLVM_DEBUG(dbgs() << "LV: Not interleaving select-cmp reductions.\n");
return 1;
return std::max(1U, UserIC);
}

// If we have a scalar reduction (vector reductions are already dealt with
Expand All @@ -4936,7 +4938,7 @@ LoopVectorizationPlanner::selectInterleaveCount(VPlan &Plan, ElementCount VF,
if (HasOrderedReductions) {
LLVM_DEBUG(
dbgs() << "LV: Not interleaving scalar ordered reductions.\n");
return 1;
return std::max(1U, UserIC);
}

unsigned F = MaxNestedScalarReductionIC;
Expand All @@ -4949,7 +4951,9 @@ LoopVectorizationPlanner::selectInterleaveCount(VPlan &Plan, ElementCount VF,
std::max(StoresIC, LoadsIC) > SmallIC) {
LLVM_DEBUG(
dbgs() << "LV: Interleaving to saturate store or load ports.\n");
return std::max(StoresIC, LoadsIC);
IC = std::max(StoresIC, LoadsIC);
IntBeneficial = IC > 1;
return UserIC > 0 ? UserIC : IC;
}

// If there are scalar reductions and TTI has enabled aggressive
Expand All @@ -4958,22 +4962,27 @@ LoopVectorizationPlanner::selectInterleaveCount(VPlan &Plan, ElementCount VF,
LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
// Interleave no less than SmallIC but not as aggressive as the normal IC
// to satisfy the rare situation when resources are too limited.
return std::max(IC / 2, SmallIC);
IC = std::max(IC / 2, SmallIC);
IntBeneficial = IC > 1;
return UserIC > 0 ? UserIC : IC;
}

LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
return SmallIC;
IC = std::max(SmallIC, UserIC);
IntBeneficial = IC > 1;
return UserIC > 0 ? UserIC : IC;
}

// Interleave if this is a large loop (small loops are already dealt with by
// this point) that could benefit from interleaving.
if (AggressivelyInterleaveReductions) {
LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
return IC;
IntBeneficial = IC > 1;
return UserIC > 0 ? UserIC : IC;
}

LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
return 1;
return std::max(1U, UserIC);
}

bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I,
Expand Down Expand Up @@ -10147,15 +10156,16 @@ bool LoopVectorizePass::processLoop(Loop *L) {
LVP.emitInvalidCostRemarks(ORE);

GeneratedRTChecks Checks(PSE, DT, LI, TTI, F->getDataLayout(), CM.CostKind);
bool IntBeneficial = false;
if (LVP.hasPlanWithVF(VF.Width)) {
// Select the interleave count.
IC = LVP.selectInterleaveCount(LVP.getPlanFor(VF.Width), VF.Width, VF.Cost);
IC = LVP.selectInterleaveCount(LVP.getPlanFor(VF.Width), VF.Width, UserIC,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@fhahn by fixing the UserIC in selectInterleaveCount we have reintroduced a bug versus the original version of the patch. If you look above we pass in the original UserIC to LVP.plan, which is then used by computeMaxVF to determine if there is no scalar epilogue and to decide the tail-folding style. This was based on the assumption that UserIC will not change. I think we either:

  1. Have to go back to the original version of this PR, or
  2. The legality aspects of selectInterleaveCount need splitting out into a new function that is called before LVP.plan and the UserIC modified early on.

VF.Cost, IntBeneficial);

unsigned SelectedIC = std::max(IC, UserIC);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@fhahn I think this code looks wrong and is making life difficult for @kmclaughlin-arm here. If the user has asked for an interleave count of 2, but we create checks based on the selected interleave count of 4, then either it's just functionally incorrect, or it's leading to a pessimistic cost model where we may never enter the vector loop.

// Optimistically generate runtime checks if they are needed. Drop them if
// they turn out to not be profitable.
if (VF.Width.isVector() || SelectedIC > 1) {
Checks.create(L, *LVL.getLAI(), PSE.getPredicate(), VF.Width, SelectedIC);
if (VF.Width.isVector() || IC > 1) {
Checks.create(L, *LVL.getLAI(), PSE.getPredicate(), VF.Width, IC);

// Bail out early if either the SCEV or memory runtime checks are known to
// fail. In that case, the vector loop would never execute.
Expand Down Expand Up @@ -10201,15 +10211,21 @@ bool LoopVectorizePass::processLoop(Loop *L) {
VectorizeLoop = false;
}

if (!LVP.hasPlanWithVF(VF.Width) && UserIC > 1) {
if (IC == 1 && UserIC > 1) {
LLVM_DEBUG(dbgs() << "LV: Ignoring user-specified interleave count.\n");
IntDiagMsg = {"InterleavingUnsafe",
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think this is strictly true because on the line above you've set SafeUserIC to 0, and then we have unsigned SelectedIC = std::max(IC, SafeUserIC);

It's possible that selectInterleaveCount may set IC to > 1 in future based on the maximum safe distance. It might be better to say something more vague like "Ignoring UserIC due to possibly unsafe dependencies in the loop".

"Ignoring user-specified interleave count due to possibly "
"unsafe dependencies in the loop."};
InterleaveLoop = false;
} else if (!LVP.hasPlanWithVF(VF.Width) && UserIC > 1) {
// Tell the user interleaving was avoided up-front, despite being explicitly
// requested.
LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
"interleaving should be avoided up front\n");
IntDiagMsg = {"InterleavingAvoided",
"Ignoring UserIC, because interleaving was avoided up front"};
InterleaveLoop = false;
} else if (IC == 1 && UserIC <= 1) {
} else if (!IntBeneficial && UserIC <= 1) {
// Tell the user interleaving is not beneficial.
LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
IntDiagMsg = {
Expand All @@ -10221,7 +10237,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
IntDiagMsg.second +=
" and is explicitly disabled or interleave count is set to 1";
}
} else if (IC > 1 && UserIC == 1) {
} else if (IntBeneficial && UserIC == 1) {
// Tell the user interleaving is beneficial, but it explicitly disabled.
LLVM_DEBUG(
dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
Expand All @@ -10244,9 +10260,6 @@ bool LoopVectorizePass::processLoop(Loop *L) {
InterleaveLoop = false;
}

// Override IC if user provided an interleave count.
IC = UserIC > 0 ? UserIC : IC;

// Emit diagnostic messages, if any.
const char *VAPassName = Hints.vectorizeAnalysisPassName();
if (!VectorizeLoop && !InterleaveLoop) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -417,21 +417,17 @@ for.end: ; preds = %for.body, %entry

; Note: This test was added to ensure we always check the legality of reductions (end emit a warning if necessary) before checking for memory dependencies
; CHECK-REMARK: Scalable vectorization not supported for the reduction operations found in this loop.
; CHECK-REMARK: vectorized loop (vectorization width: 4, interleaved count: 2)
; CHECK-REMARK: Ignoring user-specified interleave count due to possibly unsafe dependencies in the loop.
; CHECK-REMARK: vectorized loop (vectorization width: 4, interleaved count: 1)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is it worth adding a check for the remark we emit about ignoring the UserIC?

define i32 @memory_dependence(ptr noalias nocapture %a, ptr noalias nocapture readonly %b, i64 %n) {
; CHECK-LABEL: @memory_dependence
; CHECK: vector.body:
; CHECK: %[[LOAD1:.*]] = load <4 x i32>
; CHECK: %[[LOAD2:.*]] = load <4 x i32>
; CHECK: %[[LOAD3:.*]] = load <4 x i32>
; CHECK: %[[LOAD4:.*]] = load <4 x i32>
; CHECK: %[[ADD1:.*]] = add nsw <4 x i32> %[[LOAD3]], %[[LOAD1]]
; CHECK: %[[ADD2:.*]] = add nsw <4 x i32> %[[LOAD4]], %[[LOAD2]]
; CHECK: %[[MUL1:.*]] = mul <4 x i32> %[[LOAD3]]
; CHECK: %[[MUL2:.*]] = mul <4 x i32> %[[LOAD4]]
; CHECK: %[[ADD1:.*]] = add nsw <4 x i32> %[[LOAD2]], %[[LOAD1]]
; CHECK: %[[MUL1:.*]] = mul <4 x i32> %[[LOAD2]]
; CHECK: middle.block:
; CHECK: %[[RDX:.*]] = mul <4 x i32> %[[MUL2]], %[[MUL1]]
; CHECK: call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> %[[RDX]])
; CHECK: call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> %[[MUL1]])
entry:
br label %for.body

Expand Down
31 changes: 31 additions & 0 deletions llvm/test/Transforms/LoopVectorize/unsafe-ic-hint-remark.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
; RUN: opt -passes=loop-vectorize -pass-remarks-analysis=loop-vectorize -S < %s 2>&1 | FileCheck %s

; Make sure the unsafe user specified interleave count is ignored.

; CHECK: remark: <unknown>:0:0: Ignoring user-specified interleave count due to possibly unsafe dependencies in the loop.
; CHECK-LABEL: @loop_distance_4
define void @loop_distance_4(i64 %N, ptr %a, ptr %b) {
entry:
%cmp10 = icmp sgt i64 %N, 4
br i1 %cmp10, label %for.body, label %for.end

for.body:
%indvars.iv = phi i64 [ 4, %entry ], [ %indvars.iv.next, %for.body ]
%0 = getelementptr i32, ptr %b, i64 %indvars.iv
%arrayidx = getelementptr i8, ptr %0, i64 -16
%1 = load i32, ptr %arrayidx, align 4
%arrayidx2 = getelementptr inbounds nuw i32, ptr %a, i64 %indvars.iv
%2 = load i32, ptr %arrayidx2, align 4
%add = add nsw i32 %2, %1
store i32 %add, ptr %0, align 4
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%exitcond.not = icmp eq i64 %indvars.iv.next, %N
br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !1

for.end:
ret void
}

!1 = !{!1, !2, !3}
!2 = !{!"llvm.loop.interleave.count", i32 4}
!3 = !{!"llvm.loop.vectorize.width", i32 4}