[LoopStrengthReduce] Encourage the creation of IVs whose increment can later be combined with memory instructions #152995
Conversation
@llvm/pr-subscribers-llvm-transforms @llvm/pr-subscribers-backend-aarch64

Author: Sergey Shcherbinin (SergeyShch01)

Changes

Encourage (via heuristics) the creation of IVs whose increment can later be combined with memory instructions as pre/post increments. Regression tests are updated accordingly.

Patch is 216.70 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/152995.diff

38 Files Affected:
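As background for the changes below, a minimal sketch (hypothetical example, not taken from the patch) of the loop shape this heuristic favors: when the address is a single base register with no scale and zero offset, and the pointer advances by a constant step, AArch64 can fold the step into the access as a post-indexed load/store such as "ldr w8, [x0], #4".

    // Hypothetical example (not from the patch): the pointer IV is bumped by a
    // constant step each iteration, so the backend can merge "++p" into the
    // load as a post-indexed access instead of keeping a separate index
    // register plus its own add instruction.
    int sum(const int *p, int n) {
      int s = 0;
      for (int i = 0; i < n; ++i) {
        s += *p; // address = base register only: no scale, no offset
        ++p;     // constant +4 step; foldable into "ldr ..., [x0], #4"
      }
      return s;
    }

With the step folded into the access, the separate index register and its add disappear, which is the pattern visible in the updated AArch64 tests below.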
diff --git a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
index e3ef9d8680b53..27d9190688ffa 100644
--- a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -523,6 +523,8 @@ struct Formula {
bool countsDownToZero() const;
+ bool isBaseRegOnly() const;
+
size_t getNumRegs() const;
Type *getType() const;
@@ -717,6 +719,11 @@ bool Formula::countsDownToZero() const {
return StepInt->isNegative();
}
+bool Formula::isBaseRegOnly() const {
+ return BaseGV == nullptr && Scale == 0 && ScaledReg == nullptr &&
+ BaseOffset.isZero() && UnfoldedOffset.isZero() && BaseRegs.size() == 1;
+}
+
/// Return the total number of register operands used by this formula. This does
/// not include register uses implied by non-constant addrec strides.
size_t Formula::getNumRegs() const {
@@ -1425,12 +1432,17 @@ void Cost::RateRegister(const Formula &F, const SCEV *Reg,
const SCEV *Start;
const SCEVConstant *Step;
if (match(AR, m_scev_AffineAddRec(m_SCEV(Start), m_SCEVConstant(Step))))
- // If the step size matches the base offset, we could use pre-indexed
- // addressing.
- if ((AMK == TTI::AMK_PreIndexed && F.BaseOffset.isFixed() &&
+ if ( // If the step size matches the base offset, we could use
+ // pre-indexed addressing.
+ (AMK == TTI::AMK_PreIndexed && F.BaseOffset.isFixed() &&
Step->getAPInt() == F.BaseOffset.getFixedValue()) ||
(AMK == TTI::AMK_PostIndexed && !isa<SCEVConstant>(Start) &&
- SE->isLoopInvariant(Start, L)))
+ SE->isLoopInvariant(Start, L)) ||
+ // general check for post-indexed addressing with specific step
+ (LU.Kind == LSRUse::Address && F.isBaseRegOnly() &&
+ TTI->isLegalAddressingMode(LU.AccessTy.MemTy, nullptr,
+ Step->getAPInt().getSExtValue(), true,
+ 0, LU.AccessTy.AddrSpace)))
LoopCost = 0;
}
// If the loop counts down to zero and we'll be using a hardware loop then
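In plain terms, the new third disjunct sets LoopCost to 0 for an address use whose formula is exactly one base register (no GlobalValue, no scale, no fixed or unfolded offset) when the target reports that base-plus-step is itself a legal addressing mode. A condensed paraphrase follows; the helper name stepFoldsIntoAccess is hypothetical, and Formula/LSRUse are file-local types in LoopStrengthReduce.cpp, so this sketch is illustrative rather than independently compilable:

    #include "llvm/Analysis/ScalarEvolutionExpressions.h"
    #include "llvm/Analysis/TargetTransformInfo.h"

    // Paraphrase of the added condition: an Address use whose formula is a
    // single base register costs nothing for its IV increment when the target
    // can fold "base + Step" into the memory access itself -- i.e. the
    // post-indexed case this patch is after.
    static bool stepFoldsIntoAccess(const llvm::TargetTransformInfo &TTI,
                                    const LSRUse &LU, const Formula &F,
                                    const llvm::SCEVConstant *Step) {
      return LU.Kind == LSRUse::Address && F.isBaseRegOnly() &&
             TTI.isLegalAddressingMode(
                 LU.AccessTy.MemTy,
                 /*BaseGV=*/nullptr,
                 /*BaseOffset=*/Step->getAPInt().getSExtValue(),
                 /*HasBaseReg=*/true,
                 /*Scale=*/0, LU.AccessTy.AddrSpace);
    }

Unlike the existing AMK_PreIndexed/AMK_PostIndexed checks above it, this condition does not depend on the preferred addressing-mode kind; it queries the target directly through isLegalAddressingMode, so it fires whenever the access type and step are foldable.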
diff --git a/llvm/test/CodeGen/AArch64/arm64-scaled_iv.ll b/llvm/test/CodeGen/AArch64/arm64-scaled_iv.ll
index c4da564434ee9..966ff15dff098 100644
--- a/llvm/test/CodeGen/AArch64/arm64-scaled_iv.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-scaled_iv.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
; RUN: opt -S -loop-reduce < %s | FileCheck %s
; Scaling factors in addressing modes are costly.
; Make loop-reduce prefer unscaled accesses.
@@ -7,20 +8,38 @@ target triple = "arm64-apple-ios7.0.0"
; Function Attrs: nounwind ssp
define void @mulDouble(ptr nocapture %a, ptr nocapture %b, ptr nocapture %c) {
-; CHECK: @mulDouble
+; CHECK-LABEL: define void @mulDouble(
+; CHECK-SAME: ptr captures(none) [[A:%.*]], ptr captures(none) [[B:%.*]], ptr captures(none) [[C:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr nuw i8, ptr [[A]], i64 8
+; CHECK-NEXT: [[SCEVGEP3:%.*]] = getelementptr nuw i8, ptr [[C]], i64 16
+; CHECK-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK: for.body:
+; CHECK-NEXT: [[LSR_IV6:%.*]] = phi ptr [ [[SCEVGEP7:%.*]], [[FOR_BODY]] ], [ [[B]], [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[LSR_IV4:%.*]] = phi ptr [ [[SCEVGEP5:%.*]], [[FOR_BODY]] ], [ [[SCEVGEP3]], [[ENTRY]] ]
+; CHECK-NEXT: [[LSR_IV2:%.*]] = phi i32 [ [[LSR_IV_NEXT:%.*]], [[FOR_BODY]] ], [ 19, [[ENTRY]] ]
+; CHECK-NEXT: [[LSR_IV:%.*]] = phi ptr [ [[SCEVGEP1:%.*]], [[FOR_BODY]] ], [ [[SCEVGEP]], [[ENTRY]] ]
+; CHECK-NEXT: [[TMP1:%.*]] = load double, ptr [[LSR_IV6]], align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load double, ptr [[LSR_IV4]], align 8
+; CHECK-NEXT: [[MUL:%.*]] = fmul double [[TMP1]], [[TMP2]]
+; CHECK-NEXT: store double [[MUL]], ptr [[LSR_IV]], align 8
+; CHECK-NEXT: [[SCEVGEP1]] = getelementptr i8, ptr [[LSR_IV]], i64 8
+; CHECK-NEXT: [[LSR_IV_NEXT]] = add nsw i32 [[LSR_IV2]], -1
+; CHECK-NEXT: [[SCEVGEP5]] = getelementptr i8, ptr [[LSR_IV4]], i64 8
+; CHECK-NEXT: [[SCEVGEP7]] = getelementptr i8, ptr [[LSR_IV6]], i64 8
+; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[LSR_IV_NEXT]], 0
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]]
+; CHECK: for.end:
+; CHECK-NEXT: ret void
+;
entry:
br label %for.body
for.body: ; preds = %for.body, %entry
-; CHECK: [[IV:%[^ ]+]] = phi i64 [ [[IVNEXT:%[^,]+]], %for.body ], [ 0, %entry ]
-; Only one induction variable should have been generated.
-; CHECK-NOT: phi
%indvars.iv = phi i64 [ 1, %entry ], [ %indvars.iv.next, %for.body ]
%tmp = add nsw i64 %indvars.iv, -1
%arrayidx = getelementptr inbounds double, ptr %b, i64 %tmp
%tmp1 = load double, ptr %arrayidx, align 8
-; The induction variable should carry the scaling factor: 1 * 8 = 8.
-; CHECK: [[IVNEXT]] = add nuw nsw i64 [[IV]], 8
%indvars.iv.next = add i64 %indvars.iv, 1
%arrayidx2 = getelementptr inbounds double, ptr %c, i64 %indvars.iv.next
%tmp2 = load double, ptr %arrayidx2, align 8
@@ -28,8 +47,6 @@ for.body: ; preds = %for.body, %entry
%arrayidx4 = getelementptr inbounds double, ptr %a, i64 %indvars.iv
store double %mul, ptr %arrayidx4, align 8
%lftr.wideiv = trunc i64 %indvars.iv.next to i32
-; Comparison should be 19 * 8 = 152.
-; CHECK: icmp eq i32 {{%[^,]+}}, 152
%exitcond = icmp eq i32 %lftr.wideiv, 20
br i1 %exitcond, label %for.end, label %for.body
diff --git a/llvm/test/CodeGen/AArch64/cheap-as-a-move.ll b/llvm/test/CodeGen/AArch64/cheap-as-a-move.ll
index 50c70c5676c4a..673caa2a7e63c 100644
--- a/llvm/test/CodeGen/AArch64/cheap-as-a-move.ll
+++ b/llvm/test/CodeGen/AArch64/cheap-as-a-move.ll
@@ -26,11 +26,11 @@ define void @f0(ptr %a, i64 %n) {
; CHECK-NEXT: b.ge .LBB0_2
; CHECK-NEXT: .LBB0_1: // %loop.body
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: ldr w0, [x20, x22, lsl #2]
+; CHECK-NEXT: ldr w0, [x20]
; CHECK-NEXT: mov x1, x21
; CHECK-NEXT: bl g
-; CHECK-NEXT: str w0, [x20, x22, lsl #2]
; CHECK-NEXT: add x22, x22, #1
+; CHECK-NEXT: str w0, [x20], #4
; CHECK-NEXT: cmp x22, x19
; CHECK-NEXT: b.lt .LBB0_1
; CHECK-NEXT: .LBB0_2: // %exit
@@ -76,12 +76,12 @@ define void @f1(ptr %a, i64 %n) {
; CHECK-NEXT: b.ge .LBB1_2
; CHECK-NEXT: .LBB1_1: // %loop.body
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: ldr w0, [x20, x21, lsl #2]
+; CHECK-NEXT: ldr w0, [x20]
; CHECK-NEXT: mov x1, #1450704896 // =0x56780000
; CHECK-NEXT: movk x1, #4660, lsl #48
; CHECK-NEXT: bl g
-; CHECK-NEXT: str w0, [x20, x21, lsl #2]
; CHECK-NEXT: add x21, x21, #1
+; CHECK-NEXT: str w0, [x20], #4
; CHECK-NEXT: cmp x21, x19
; CHECK-NEXT: b.lt .LBB1_1
; CHECK-NEXT: .LBB1_2: // %exit
diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-crash.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-crash.ll
index 7542e9c4b8f5b..279b5e0a6dd81 100644
--- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-crash.ll
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-crash.ll
@@ -36,10 +36,9 @@ define i32 @check_deinterleaving_has_deinterleave(ptr %a) {
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: movi v0.2d, #0000000000000000
; CHECK-NEXT: movi v1.4s, #1
-; CHECK-NEXT: add x8, x0, #16
+; CHECK-NEXT: mov w8, #32 // =0x20
; CHECK-NEXT: movi v3.2d, #0000000000000000
; CHECK-NEXT: movi v2.2d, #0000000000000000
-; CHECK-NEXT: mov w9, #32 // =0x20
; CHECK-NEXT: movi v4.2d, #0000000000000000
; CHECK-NEXT: movi v5.2d, #0000000000000000
; CHECK-NEXT: movi v7.2d, #0000000000000000
@@ -47,9 +46,8 @@ define i32 @check_deinterleaving_has_deinterleave(ptr %a) {
; CHECK-NEXT: movi v16.2d, #0000000000000000
; CHECK-NEXT: .LBB1_1: // %vector.body
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: ldp q17, q18, [x8, #-16]
-; CHECK-NEXT: subs x9, x9, #32
-; CHECK-NEXT: add x8, x8, #32
+; CHECK-NEXT: ldp q17, q18, [x0], #32
+; CHECK-NEXT: subs x8, x8, #32
; CHECK-NEXT: cmeq v17.16b, v17.16b, #0
; CHECK-NEXT: cmeq v18.16b, v18.16b, #0
; CHECK-NEXT: ushll2 v19.8h, v17.16b, #0
diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions.ll
index aed3072bb4af3..78ad7ad81f84d 100644
--- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions.ll
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions.ll
@@ -16,15 +16,12 @@ define dso_local %"struct.std::complex" @complex_mul_v2f64(ptr %a, ptr %b) {
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: movi v0.2d, #0000000000000000
; CHECK-NEXT: movi v1.2d, #0000000000000000
-; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: mov w8, #1600 // =0x640
; CHECK-NEXT: .LBB0_1: // %vector.body
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: add x9, x0, x8
-; CHECK-NEXT: add x10, x1, x8
-; CHECK-NEXT: add x8, x8, #32
-; CHECK-NEXT: ldp q3, q2, [x9]
-; CHECK-NEXT: cmp x8, #1600
-; CHECK-NEXT: ldp q5, q4, [x10]
+; CHECK-NEXT: ldp q3, q2, [x0], #32
+; CHECK-NEXT: ldp q5, q4, [x1], #32
+; CHECK-NEXT: subs x8, x8, #32
; CHECK-NEXT: fcmla v0.2d, v5.2d, v3.2d, #0
; CHECK-NEXT: fcmla v1.2d, v4.2d, v2.2d, #0
; CHECK-NEXT: fcmla v0.2d, v5.2d, v3.2d, #90
@@ -83,15 +80,12 @@ define %"struct.std::complex" @complex_mul_nonzero_init_v2f64(ptr %a, ptr %b) {
; CHECK-NEXT: movi v0.2d, #0000000000000000
; CHECK-NEXT: adrp x8, .LCPI1_0
; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI1_0]
-; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: mov w8, #1600 // =0x640
; CHECK-NEXT: .LBB1_1: // %vector.body
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: add x9, x0, x8
-; CHECK-NEXT: add x10, x1, x8
-; CHECK-NEXT: add x8, x8, #32
-; CHECK-NEXT: ldp q3, q2, [x9]
-; CHECK-NEXT: cmp x8, #1600
-; CHECK-NEXT: ldp q5, q4, [x10]
+; CHECK-NEXT: ldp q3, q2, [x0], #32
+; CHECK-NEXT: ldp q5, q4, [x1], #32
+; CHECK-NEXT: subs x8, x8, #32
; CHECK-NEXT: fcmla v1.2d, v5.2d, v3.2d, #0
; CHECK-NEXT: fcmla v0.2d, v4.2d, v2.2d, #0
; CHECK-NEXT: fcmla v1.2d, v5.2d, v3.2d, #90
diff --git a/llvm/test/CodeGen/AArch64/fp-conversion-to-tbl.ll b/llvm/test/CodeGen/AArch64/fp-conversion-to-tbl.ll
index 1fbca7ca2c27c..be20483b75f7e 100644
--- a/llvm/test/CodeGen/AArch64/fp-conversion-to-tbl.ll
+++ b/llvm/test/CodeGen/AArch64/fp-conversion-to-tbl.ll
@@ -32,13 +32,11 @@ define void @fptoui_v8f32_to_v8i8_in_loop(ptr %A, ptr %dst) {
; CHECK-NEXT: adrp x8, lCPI0_0@PAGE
; CHECK-NEXT: Lloh1:
; CHECK-NEXT: ldr q0, [x8, lCPI0_0@PAGEOFF]
-; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: mov w8, #1000 ; =0x3e8
; CHECK-NEXT: LBB0_1: ; %loop
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: add x9, x0, x8, lsl #5
-; CHECK-NEXT: add x8, x8, #1
-; CHECK-NEXT: cmp x8, #1000
-; CHECK-NEXT: ldp q2, q1, [x9]
+; CHECK-NEXT: ldp q2, q1, [x0], #32
+; CHECK-NEXT: subs x8, x8, #1
; CHECK-NEXT: fcvtzu.4s v4, v1
; CHECK-NEXT: fcvtzu.4s v3, v2
; CHECK-NEXT: tbl.16b v1, { v3, v4 }, v0
@@ -111,22 +109,18 @@ define void @fptoui_2x_v8f32_to_v8i8_in_loop(ptr %A, ptr %B, ptr %dst) {
; CHECK-NEXT: adrp x8, lCPI2_0@PAGE
; CHECK-NEXT: Lloh3:
; CHECK-NEXT: ldr q0, [x8, lCPI2_0@PAGEOFF]
-; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: mov w8, #1000 ; =0x3e8
; CHECK-NEXT: LBB2_1: ; %loop
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: lsl x9, x8, #5
-; CHECK-NEXT: add x10, x0, x9
-; CHECK-NEXT: add x9, x1, x9
-; CHECK-NEXT: ldp q2, q1, [x10]
+; CHECK-NEXT: ldp q2, q1, [x0], #32
+; CHECK-NEXT: subs x8, x8, #1
; CHECK-NEXT: fcvtzu.4s v4, v1
-; CHECK-NEXT: ldp q7, q1, [x9]
+; CHECK-NEXT: ldp q7, q1, [x1], #32
; CHECK-NEXT: fcvtzu.4s v3, v2
; CHECK-NEXT: fcvtzu.4s v6, v1
; CHECK-NEXT: fcvtzu.4s v5, v7
; CHECK-NEXT: tbl.16b v1, { v3, v4, v5, v6 }, v0
-; CHECK-NEXT: str q1, [x2, x8, lsl #4]
-; CHECK-NEXT: add x8, x8, #1
-; CHECK-NEXT: cmp x8, #1000
+; CHECK-NEXT: str q1, [x2], #16
; CHECK-NEXT: b.eq LBB2_1
; CHECK-NEXT: ; %bb.2: ; %exit
; CHECK-NEXT: ret
@@ -178,22 +172,18 @@ define void @fptoui_2x_v8f32_to_v8i8_in_loop_no_concat_shuffle(ptr %A, ptr %B, p
; CHECK-NEXT: adrp x8, lCPI3_0@PAGE
; CHECK-NEXT: Lloh5:
; CHECK-NEXT: ldr q0, [x8, lCPI3_0@PAGEOFF]
-; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: mov w8, #1000 ; =0x3e8
; CHECK-NEXT: LBB3_1: ; %loop
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: lsl x9, x8, #5
-; CHECK-NEXT: add x10, x0, x9
-; CHECK-NEXT: add x9, x1, x9
-; CHECK-NEXT: ldp q2, q1, [x10]
+; CHECK-NEXT: ldp q2, q1, [x0], #32
+; CHECK-NEXT: subs x8, x8, #1
; CHECK-NEXT: fcvtzu.4s v4, v1
-; CHECK-NEXT: ldp q7, q1, [x9]
+; CHECK-NEXT: ldp q7, q1, [x1], #32
; CHECK-NEXT: fcvtzu.4s v3, v2
; CHECK-NEXT: fcvtzu.4s v6, v1
; CHECK-NEXT: fcvtzu.4s v5, v7
; CHECK-NEXT: tbl.16b v1, { v3, v4, v5, v6 }, v0
-; CHECK-NEXT: str q1, [x2, x8, lsl #4]
-; CHECK-NEXT: add x8, x8, #1
-; CHECK-NEXT: cmp x8, #1000
+; CHECK-NEXT: str q1, [x2], #16
; CHECK-NEXT: b.eq LBB3_1
; CHECK-NEXT: ; %bb.2: ; %exit
; CHECK-NEXT: ret
@@ -245,15 +235,13 @@ define void @fptoui_v16f32_to_v16i8_in_loop(ptr %A, ptr %dst) {
; CHECK-NEXT: adrp x8, lCPI4_0@PAGE
; CHECK-NEXT: Lloh7:
; CHECK-NEXT: ldr q0, [x8, lCPI4_0@PAGEOFF]
-; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: mov w8, #1000 ; =0x3e8
; CHECK-NEXT: LBB4_1: ; %loop
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: add x9, x0, x8, lsl #6
-; CHECK-NEXT: add x8, x8, #1
-; CHECK-NEXT: cmp x8, #1000
-; CHECK-NEXT: ldp q2, q1, [x9, #32]
+; CHECK-NEXT: ldp q2, q1, [x0, #32]
+; CHECK-NEXT: subs x8, x8, #1
; CHECK-NEXT: fcvtzu.4s v6, v1
-; CHECK-NEXT: ldp q7, q1, [x9]
+; CHECK-NEXT: ldp q7, q1, [x0], #64
; CHECK-NEXT: fcvtzu.4s v5, v2
; CHECK-NEXT: fcvtzu.4s v4, v1
; CHECK-NEXT: fcvtzu.4s v3, v7
@@ -306,30 +294,25 @@ define void @fptoui_2x_v16f32_to_v16i8_in_loop(ptr %A, ptr %B, ptr %dst) {
; CHECK-NEXT: adrp x8, lCPI5_0@PAGE
; CHECK-NEXT: Lloh9:
; CHECK-NEXT: ldr q0, [x8, lCPI5_0@PAGEOFF]
-; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: mov w8, #1000 ; =0x3e8
; CHECK-NEXT: LBB5_1: ; %loop
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: lsl x9, x8, #6
-; CHECK-NEXT: add x10, x1, x9
-; CHECK-NEXT: add x9, x0, x9
-; CHECK-NEXT: ldp q2, q1, [x10, #32]
-; CHECK-NEXT: ldp q3, q4, [x9, #32]
-; CHECK-NEXT: ldp q5, q6, [x10]
+; CHECK-NEXT: ldp q2, q1, [x1, #32]
+; CHECK-NEXT: subs x8, x8, #1
+; CHECK-NEXT: ldp q3, q4, [x0, #32]
+; CHECK-NEXT: ldp q5, q6, [x1], #64
; CHECK-NEXT: fcvtzu.4s v19, v1
; CHECK-NEXT: fcvtzu.4s v18, v2
-; CHECK-NEXT: ldp q2, q1, [x9]
; CHECK-NEXT: fcvtzu.4s v23, v4
-; CHECK-NEXT: fcvtzu.4s v17, v6
-; CHECK-NEXT: add x9, x2, x8, lsl #5
+; CHECK-NEXT: ldp q2, q1, [x0], #64
; CHECK-NEXT: fcvtzu.4s v22, v3
+; CHECK-NEXT: fcvtzu.4s v17, v6
; CHECK-NEXT: fcvtzu.4s v16, v5
-; CHECK-NEXT: add x8, x8, #1
; CHECK-NEXT: fcvtzu.4s v21, v1
-; CHECK-NEXT: cmp x8, #1000
; CHECK-NEXT: fcvtzu.4s v20, v2
; CHECK-NEXT: tbl.16b v1, { v16, v17, v18, v19 }, v0
; CHECK-NEXT: tbl.16b v2, { v20, v21, v22, v23 }, v0
-; CHECK-NEXT: stp q2, q1, [x9]
+; CHECK-NEXT: stp q2, q1, [x2], #32
; CHECK-NEXT: b.eq LBB5_1
; CHECK-NEXT: ; %bb.2: ; %exit
; CHECK-NEXT: ret
@@ -359,17 +342,15 @@ exit:
define void @fptoui_v8f32_to_v8i16_in_loop(ptr %A, ptr %dst) {
; CHECK-LABEL: fptoui_v8f32_to_v8i16_in_loop:
; CHECK: ; %bb.0: ; %entry
-; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: mov w8, #1000 ; =0x3e8
; CHECK-NEXT: LBB6_1: ; %loop
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: add x9, x0, x8, lsl #5
-; CHECK-NEXT: ldp q0, q1, [x9]
+; CHECK-NEXT: ldp q0, q1, [x0], #32
+; CHECK-NEXT: subs x8, x8, #1
; CHECK-NEXT: fcvtzu.4s v1, v1
; CHECK-NEXT: fcvtzu.4s v0, v0
; CHECK-NEXT: uzp1.8h v0, v0, v1
-; CHECK-NEXT: str q0, [x1, x8, lsl #4]
-; CHECK-NEXT: add x8, x8, #1
-; CHECK-NEXT: cmp x8, #1000
+; CHECK-NEXT: str q0, [x1], #16
; CHECK-NEXT: b.eq LBB6_1
; CHECK-NEXT: ; %bb.2: ; %exit
; CHECK-NEXT: ret
@@ -394,24 +375,19 @@ exit:
define void @fptoui_2x_v8f32_to_v8i16_in_loop(ptr %A, ptr %B, ptr %dst) {
; CHECK-LABEL: fptoui_2x_v8f32_to_v8i16_in_loop:
; CHECK: ; %bb.0: ; %entry
-; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: mov w8, #1000 ; =0x3e8
; CHECK-NEXT: LBB7_1: ; %loop
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: lsl x9, x8, #5
-; CHECK-NEXT: add x8, x8, #1
-; CHECK-NEXT: cmp x8, #1000
-; CHECK-NEXT: add x10, x0, x9
-; CHECK-NEXT: add x11, x1, x9
-; CHECK-NEXT: add x9, x2, x9
-; CHECK-NEXT: ldp q0, q1, [x10]
-; CHECK-NEXT: ldp q2, q3, [x11]
+; CHECK-NEXT: ldp q0, q1, [x0], #32
+; CHECK-NEXT: ldp q2, q3, [x1], #32
+; CHECK-NEXT: subs x8, x8, #1
; CHECK-NEXT: fcvtzu.4s v1, v1
; CHECK-NEXT: fcvtzu.4s v0, v0
; CHECK-NEXT: fcvtzu.4s v3, v3
; CHECK-NEXT: fcvtzu.4s v2, v2
; CHECK-NEXT: uzp1.8h v0, v0, v1
; CHECK-NEXT: uzp1.8h v1, v2, v3
-; CHECK-NEXT: stp q0, q1, [x9]
+; CHECK-NEXT: stp q0, q1, [x2], #32
; CHECK-NEXT: b.eq LBB7_1
; CHECK-NEXT: ; %bb.2: ; %exit
; CHECK-NEXT: ret
@@ -483,18 +459,16 @@ define void @uitofp_v8i8_to_v8f32(ptr %src, ptr %dst) {
; CHECK-NEXT: ldr q0, [x8, lCPI8_0@PAGEOFF]
; CHECK-NEXT: Lloh13:
; CHECK-NEXT: ldr q1, [x9, lCPI8_1@PAGEOFF]
-; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: mov w8, #1000 ; =0x3e8
; CHECK-NEXT: LBB8_1: ; %loop
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: ldr d2, [x0, x8, lsl #3]
-; CHECK-NEXT: add x9, x1, x8, lsl #5
-; CHECK-NEXT: add x8, x8, #1
-; CHECK-NEXT: cmp x8, #1000
+; CHECK-NEXT: ldr d2, [x0], #8
+; CHECK-NEXT: subs x8, x8, #1
; CHECK-NEXT: tbl.16b v3, { v2 }, v0
; CHECK-NEXT: tbl.16b v2, { v2 }, v1
; CHECK-NEXT: ucvtf.4s v3, v3
; CHECK-NEXT: ucvtf.4s v2, v2
-; CHECK-NEXT: stp q2, q3, [x9]
+; CHECK-NEXT: stp q2, q3, [x1], #32
; CHECK-NEXT: b.eq LBB8_1
; CHECK-NEXT: ; %bb.2: ; %exit
; CHECK-NEXT: ret
@@ -606,13 +580,11 @@ define void @uitofp_v16i8_to_v16f32(ptr %src, ptr %dst) {
; CHECK-NEXT: ldr q2, [x10, lCPI9_2@PAGEOFF]
; CHECK-NEXT: Lloh21:
; CHECK-NEXT: ldr q3, [x8, lCPI9_3@PAGEOFF]
-; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: mov w8, #1000 ; =0x3e8
; CHECK-NEXT: LBB9_1: ; %loop
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: ldr q4, [x0, x8, lsl #4]
-; CHECK-NEXT: add x9, x1, x8, lsl #6
-; CHECK-NEXT: add x8, x8, #1
-; CHECK-NEXT: cmp x8, #1000
+; CHECK-NEXT: ldr q4, [x0], #16
+; CHECK-NEXT: subs x8, x8, #1
; CHECK-NEXT: tbl.16b v5, { v4 }, v0
; CHECK-NEXT: tbl.16b v6, { v4 }, v1
; CHECK-NEXT: tbl.16b v7, { v4 }, v2
@@ -621,8 +593,8 @@ define void @uitofp_v16i8_to_v16f32(ptr %src, ptr %dst) {
; CHECK-NEXT: ucvtf.4s v6, v6
; CHECK-NEXT: ucvtf.4s v7, v7
; CHECK-NEXT: ucvtf.4s v4, v4
-; CHECK-NEXT: stp q6, q5, [x9, #32]
-; CHECK-NEXT: stp q4, q7, [x9]
+; CHECK-NEXT: stp q6, q5, [x1, #32]
+; CHECK-NEXT: stp q4, q7, [x1], #64
; CHECK-NEXT: b.eq LBB9_1
; CHECK-NEXT: ; %bb.2: ; %exit
; CHECK-NEXT: ret
@@ -668,13 +640,11 @@ define void @uitofp_v8i16_to_v8f64(ptr nocapture noundef readonly %x, ptr nocapt
; CHECK-NEXT: ldr q2, [x10, lCPI10_2@PAGEOFF]
; CHECK-NEXT: Lloh29:
; CHECK-NEXT: ldr q3, [x8, lCPI10_3@PAGEOFF]
-; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: mov w8, #1024 ; =0x400
; CHECK-NEXT: LBB10_1: ; %vector.body
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: ldr q4, [x0, x8]
-; CHECK-NEXT: add x9, x1, x8
-; CHECK-NEXT: add x8, x8, #64
-; CHECK-NEXT: cmp x8, #2, lsl #12 ; =8192
+; CHECK-NEXT: ldr q4, [x0], #64
+; CHECK-NEXT: subs x8, x8, #8
; CHECK-NEXT: tbl.16b v5, { v4 }, v0
; CHECK-NEXT: tbl.1...
[truncated]
@llvm/pr-subscribers-backend-arm
Encourage (via heuristics) the creation of IVs whose increment can later be combined with memory instructions as pre/post increments. Regression tests are updated accordingly.