pulley: Add a multiply-and-add macro instruction (bytecodealliance#10081)

This is present in riscv64 and aarch64 native ISAs and was found in a benchmark I was looking at, so let's add a macro-op as well to help cases where this crops up in the wild.
cfallin · Jan 22, 2025 · 2f27a10 · 2f27a10
1 parent ca95576
commit 2f27a10
Show file tree

Hide file tree

Showing 4 changed files with 61 additions and 8 deletions.
diff --git a/cranelift/codegen/src/isa/pulley_shared/lower.isle b/cranelift/codegen/src/isa/pulley_shared/lower.isle
@@ -195,27 +195,27 @@
 (rule 1 (lower (has_type $I64 (iadd a b))) (pulley_xadd64 a b))
 
 ;; Fold constants into the instruction if possible
-(rule 2 (lower (has_type (ty_int (fits_in_32 _)) (iadd a (u32_from_iconst b))))
+(rule 10 (lower (has_type (ty_int (fits_in_32 _)) (iadd a (u32_from_iconst b))))
   (pulley_xadd32_u32 a b))
-(rule 3 (lower (has_type (ty_int (fits_in_32 _)) (iadd a (u8_from_iconst b))))
+(rule 11 (lower (has_type (ty_int (fits_in_32 _)) (iadd a (u8_from_iconst b))))
   (pulley_xadd32_u8 a b))
-(rule 4 (lower (has_type $I64 (iadd a (u32_from_iconst b))))
+(rule 12 (lower (has_type $I64 (iadd a (u32_from_iconst b))))
   (pulley_xadd64_u32 a b))
-(rule 5 (lower (has_type $I64 (iadd a (u8_from_iconst b))))
+(rule 13 (lower (has_type $I64 (iadd a (u8_from_iconst b))))
   (pulley_xadd64_u8 a b))
 
 ;; If the rhs is a constant and the negated version can fit within a smaller
 ;; constant then switch this to a subtraction with the negated constant.
-(rule 6 (lower (has_type (ty_int (fits_in_32 _)) (iadd a b)))
+(rule 14 (lower (has_type (ty_int (fits_in_32 _)) (iadd a b)))
   (if-let c (u32_from_negated_iconst b))
   (pulley_xsub32_u32 a c))
-(rule 7 (lower (has_type $I64 (iadd a b)))
+(rule 15 (lower (has_type $I64 (iadd a b)))
   (if-let c (u32_from_negated_iconst b))
   (pulley_xsub64_u32 a c))
-(rule 8 (lower (has_type (ty_int (fits_in_32 _)) (iadd a b)))
+(rule 16 (lower (has_type (ty_int (fits_in_32 _)) (iadd a b)))
   (if-let c (u8_from_negated_iconst b))
   (pulley_xsub32_u8 a c))
-(rule 9 (lower (has_type $I64 (iadd a b)))
+(rule 17 (lower (has_type $I64 (iadd a b)))
   (if-let c (u8_from_negated_iconst b))
   (pulley_xsub64_u8 a c))
 
@@ -258,6 +258,13 @@
 (rule 1 (lower (has_type $I16X8 (sadd_sat a b))) (pulley_vaddi16x8_sat a b))
 (rule 1 (lower (has_type $I16X8 (uadd_sat a b))) (pulley_vaddu16x8_sat a b))
 
+;; Specialized lowerings for multiply-and-add; rules 2 and 3 match the `imul` as either `iadd` operand
+
+(rule 2 (lower (has_type $I32 (iadd (imul a b) c))) (pulley_xmadd32 a b c))
+(rule 3 (lower (has_type $I32 (iadd c (imul a b)))) (pulley_xmadd32 a b c))
+(rule 2 (lower (has_type $I64 (iadd (imul a b) c))) (pulley_xmadd64 a b c))
+(rule 3 (lower (has_type $I64 (iadd c (imul a b)))) (pulley_xmadd64 a b c))
+
 ;;;; Rules for `iadd_pairwise` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
 (rule (lower (has_type $I16X8 (iadd_pairwise a b))) (pulley_vaddpairwisei16x8_s a b))

diff --git a/pulley/src/interp.rs b/pulley/src/interp.rs
@@ -1520,6 +1520,22 @@ impl OpVisitor for Interpreter<'_> {
         ControlFlow::Continue(())
     }
 
+    fn xmadd32(&mut self, dst: XReg, src1: XReg, src2: XReg, src3: XReg) -> ControlFlow<Done> {
+        let a = self.state[src1].get_u32(); // first factor: `src1` read as u32
+        let b = self.state[src2].get_u32(); // second factor: `src2` read as u32
+        let c = self.state[src3].get_u32(); // addend: `src3` read as u32
+        self.state[dst].set_u32(a.wrapping_mul(b).wrapping_add(c)); // dst = a * b + c, wrapping
+        ControlFlow::Continue(()) // unconditionally reports `Continue`
+    }
+
+    fn xmadd64(&mut self, dst: XReg, src1: XReg, src2: XReg, src3: XReg) -> ControlFlow<Done> {
+        let a = self.state[src1].get_u64(); // first factor: `src1` read as u64
+        let b = self.state[src2].get_u64(); // second factor: `src2` read as u64
+        let c = self.state[src3].get_u64(); // addend: `src3` read as u64
+        self.state[dst].set_u64(a.wrapping_mul(b).wrapping_add(c)); // dst = a * b + c, wrapping
+        ControlFlow::Continue(()) // unconditionally reports `Continue`
+    }
+
     fn xsub32(&mut self, operands: BinaryOperands<XReg>) -> ControlFlow<Done> {
         let a = self.state[operands.src1].get_u32();
         let b = self.state[operands.src2].get_u32();

diff --git a/pulley/src/lib.rs b/pulley/src/lib.rs
@@ -276,6 +276,11 @@ macro_rules! for_each_op {
             /// Same as `xadd64` but `src2` is a zero-extended 32-bit immediate.
             xadd64_u32 = Xadd64U32 { dst: XReg, src1: XReg, src2: u32 };
 
+            /// 32-bit wrapping multiply-add: `low32(dst) = low32(src1) * low32(src2) + low32(src3)`.
+            xmadd32 = Xmadd32 { dst: XReg, src1: XReg, src2: XReg, src3: XReg };
+            /// 64-bit wrapping multiply-add: `dst = src1 * src2 + src3`.
+            xmadd64 = Xmadd64 { dst: XReg, src1: XReg, src2: XReg, src3: XReg };
+
             /// 32-bit wrapping subtraction: `low32(dst) = low32(src1) - low32(src2)`.
             ///
             /// The upper 32-bits of `dst` are unmodified.

diff --git a/tests/disas/pulley/madd.wat b/tests/disas/pulley/madd.wat
@@ -0,0 +1,25 @@
+;;! target = "pulley32"
+;;! test = "compile"
+
+(module
+  (func $madd32 (param i32 i32 i32) (result i32)
+    (i32.add
+      (i32.mul (local.get 0) (local.get 1))
+      (local.get 2)))
+
+  (func $madd64 (param i64 i64 i64) (result i64)
+    (i64.add
+      (i64.mul (local.get 0) (local.get 1))
+      (local.get 2)))
+)
+;; wasm[0]::function[0]::madd32:
+;;       push_frame
+;;       xmadd32 x0, x2, x3, x4
+;;       pop_frame
+;;       ret
+;;
+;; wasm[0]::function[1]::madd64:
+;;       push_frame
+;;       xmadd64 x0, x2, x3, x4
+;;       pop_frame
+;;       ret