Description
Another one from the swift team. I'm surprised no one has found this yet.
@llvm.umul.with.overflow.i16(i16 , i16 ) seems to be lowering to invalid assembly, specifically it ends up moving the two 16 bit values into the top two bytes of 32 bit numbers then using __mulsi3 to multiply them together and put the result into a 32 bit number, where it takes the top two bytes. This will always produce zero.
In my opinion if it's using __mulsi3, it should be putting two 16 bit numbers into the bottom two bytes of each input and taking the bottom two bytes of the output, then any non zero value in the top two bytes after multiplication should be interpreted as an overflow and the flag set accordingly.
Here's the llvm ir in a test case as a patch to llvm...
diff --git a/test/CodeGen/AVR/umul.with.overflow.i16-bug.ll b/test/CodeGen/AVR/umul.with.overflow.i16-bug.ll
new file mode 100644
index 00000000000..12c4030f943
--- /dev/null
+++ b/test/CodeGen/AVR/umul.with.overflow.i16-bug.ll
@@ -0,0 +1,39 @@
+; RUN: llc -O1 < %s -march=avr | FileCheck %s
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.9"
+
+%Vs6UInt16 = type <{ i16 }>
+%Sb = type <{ i1 }>
+
+define hidden void @_TF4main13setServoAngleFT5angleVs6UInt16_T_(i16) #0 {
+entry:
+ %adjustedAngle = alloca %Vs6UInt16, align 2
+ %1 = bitcast %Vs6UInt16* %adjustedAngle to i8*
+ %adjustedAngle._value = getelementptr inbounds %Vs6UInt16, %Vs6UInt16* %adjustedAngle, i32 0, i32 0
+ store i16 %0, i16* %adjustedAngle._value, align 2
+
+;print(unsignedInt: adjustedAngle &* UInt16(11))
+; breaks here
+ %adjustedAngle._value2 = getelementptr inbounds %Vs6UInt16, %Vs6UInt16* %adjustedAngle, i32 0, i32 0
+ %2 = load i16, i16* %adjustedAngle._value2, align 2
+
+ %3 = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 %2, i16 11)
+ %4 = extractvalue { i16, i1 } %3, 0
+ %5 = extractvalue { i16, i1 } %3, 1
+
+ ; above code looks fine, how is it lowered?
+ %6 = call i1 @_TIF3AVR5printFT11unsignedIntVs6UInt1610addNewlineSb_T_A0_()
+ call void @_TF3AVR5printFT11unsignedIntVs6UInt1610addNewlineSb_T_(i16 %4, i1 %6)
+
+ ret void
+}
+
+declare void @_TF3AVR5printFT11unsignedIntVs6UInt1610addNewlineSb_T_(i16, i1) #0
+declare i1 @_TIF3AVR5printFT11unsignedIntVs6UInt1610addNewlineSb_T_A0_() #0
+
+; Function Attrs: nounwind readnone speculatable
+declare { i16, i1 } @llvm.umul.with.overflow.i16(i16, i16) #2
+
+attributes #0 = { "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "target-cpu"="core2" "target-features"="+ssse3,+cx16,+fxsr,+mmx,+x87,+sse,+sse2,+sse3" }
+attributes #2 = { nounwind readnone speculatable }
(Note that I haven't yet put in the FileCheck directives because I haven't decided/worked out what the compiler should be doing here yet!)
FYI the original source, with debug statements in was something like:
func setServoAngle(angle: UInt16) {
var adjustedAngle: UInt16 = angle
print(unsignedInt: adjustedAngle &* UInt16(11))
}
This produces the following assembly...
.text
.file "<stdin>"
.hidden _TF4main13setServoAngleFT5angleVs6UInt16_T_ ; -- Begin function _TF4main13setServoAngleFT5angleVs6UInt16_T_
.globl _TF4main13setServoAngleFT5angleVs6UInt16_T_
.p2align 1
.type _TF4main13setServoAngleFT5angleVs6UInt16_T_,@function
_TF4main13setServoAngleFT5angleVs6UInt16_T_: ; @_TF4main13setServoAngleFT5angleVs6UInt16_T_
; %bb.0: ; %entry
push r28
push r29
push r16
push r17
in r28, 61
in r29, 62
sbiw r28, 2
in r0, 63
cli
out 62, r29
out 63, r0
out 61, r28
std Y+1, r24
std Y+2, r25
ldi r20, 11
ldi r21, 0
ldi r18, 0
ldi r19, 0
mov r22, r18
mov r23, r19
call __mulsi3
mov r16, r24
mov r17, r25
call _TIF3AVR5printFT11unsignedIntVs6UInt1610addNewlineSb_T_A0_
mov r22, r24
mov r24, r16
mov r25, r17
call _TF3AVR5printFT11unsignedIntVs6UInt1610addNewlineSb_T_
adiw r28, 2
in r0, 63
cli
out 62, r29
out 63, r0
out 61, r28
pop r17
pop r16
pop r29
pop r28
ret
.Lfunc_end0:
.size _TF4main13setServoAngleFT5angleVs6UInt16_T_, .Lfunc_end0-_TF4main13setServoAngleFT5angleVs6UInt16_T_
; -- End function
From what I know, __mulsi3
takes the 32 bit value in r18-r21, multiplies it by the 32 bit value in r22-r25 and stores the result in the 32 bit value r22-r25. (I'm not sure how it detects overflow.)
It should be multiplying the input value, e.g. 90 by 11 then printing out the result. Instead it always prints 0.
As I read this assembly, it's moving both input values into the top two bytes of 32 bit numbers, which looks broken. Either it should move them to the bottom two bytes or use a different function.
I'd love to investigate this but I'll need to get some pointers from people where this is all happening. I couldn't even find mulsi3 by grepping through llvm source code. No idea how this is made!