[InstCombine] Known-bits optimization for ARM MVE VADC.

The MVE VADC instruction reads and writes the carry bit at bit 29 of
the FPSCR register. The corresponding ACLE intrinsic is specified to
work with an integer in which the carry bit is stored at bit 0. So if
a user writes a code sequence in C that passes the carry from one VADC
to the next, like this,

    s0 = vadcq_u32(a0, b0, &carry);
    s1 = vadcq_u32(a1, b1, &carry);

then clang will generate IR for each of those operations that shifts
the carry bit up into bit 29 before the VADC, and after it, shifts it
back down and masks off all but the low bit. But in this situation
what you really wanted was two consecutive VADC instructions, so that
the second one directly reads the value left in FPSCR by the first,
without wasting several instructions on pointlessly clearing the other
flag bits in between.
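
Concretely, each call in that sequence expands to IR of roughly this
shape (a sketch based on the test added below; the value names are
illustrative):

    %fpscr.in = shl i32 %carry, 29
    %outpair = call { <4 x i32>, i32 } @llvm.arm.mve.vadc.v4i32(<4 x i32> %a, <4 x i32> %b, i32 %fpscr.in)
    %fpscr.out = extractvalue { <4 x i32>, i32 } %outpair, 1
    %shifted.out = lshr i32 %fpscr.out, 29
    %carry.out = and i32 1, %shifted.out

with the %carry.out of the first call feeding the shl at the start of
the second.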

This commit explains to InstCombine that the other bits of the flags
operand don't matter, and adds a test that demonstrates that all the
code between the two VADC instructions can be optimized away as a
result.
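
The mechanism is SimplifyDemandedBits: the intrinsic reports that it
demands only bit 29 of its carry operand, and that demand propagates
backwards through the shl/and/lshr chain between the two calls,
collapsing it to the raw FPSCR value. The optimized IR should look
roughly like this (a sketch matching the test's IR check lines):

    %outpair.0 = call { <4 x i32>, i32 } @llvm.arm.mve.vadc.v4i32(<4 x i32> %a.0, <4 x i32> %b.0, i32 %fpscr.in.0)
    %fpscr.out.0 = extractvalue { <4 x i32>, i32 } %outpair.0, 1
    %outpair.1 = call { <4 x i32>, i32 } @llvm.arm.mve.vadc.v4i32(<4 x i32> %a.1, <4 x i32> %b.1, i32 %fpscr.out.0)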

Reviewers: dmgreen, miyuki, ostannard

Subscribers: kristof.beyls, hiraditya, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D67162
commit e5f485c3bd (parent 08074cc965)
Simon Tatham, 2019-09-11 10:29:56 +01:00
2 changed files with 101 additions and 0 deletions


@@ -3308,6 +3308,20 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
    }
    break;
  }
  case Intrinsic::arm_mve_vadc:
  case Intrinsic::arm_mve_vadc_predicated: {
    unsigned CarryOp =
        (II->getIntrinsicID() == Intrinsic::arm_mve_vadc_predicated) ? 3 : 2;
    Value *CarryArg = II->getArgOperand(CarryOp);
    assert(CarryArg->getType()->getScalarSizeInBits() == 32 &&
           "Bad type for intrinsic!");
    KnownBits CarryKnown(32);
    if (SimplifyDemandedBits(II, CarryOp, APInt::getOneBitSet(32, 29),
                             CarryKnown))
      return II;
    break;
  }
  case Intrinsic::amdgcn_rcp: {
    Value *Src = II->getArgOperand(0);


@@ -0,0 +1,87 @@
; RUN: opt -instcombine -S %s | FileCheck --check-prefix=IR %s
; RUN: opt -instcombine %s | llc -mtriple=thumbv8.1m.main -mattr=+mve.fp -verify-machineinstrs -O3 -o - | FileCheck --check-prefix=ASM %s

%struct.foo = type { [2 x <4 x i32>] }

define arm_aapcs_vfpcc i32 @test_vadciq_multiple(%struct.foo %a, %struct.foo %b, i32 %carry) {
entry:
  %a.0 = extractvalue %struct.foo %a, 0, 0
  %a.1 = extractvalue %struct.foo %a, 0, 1
  %b.0 = extractvalue %struct.foo %b, 0, 0
  %b.1 = extractvalue %struct.foo %b, 0, 1
  %fpscr.in.0 = shl i32 %carry, 29
  %outpair.0 = call { <4 x i32>, i32 } @llvm.arm.mve.vadc.v4i32(<4 x i32> %a.0, <4 x i32> %b.0, i32 %fpscr.in.0)
  %fpscr.out.0 = extractvalue { <4 x i32>, i32 } %outpair.0, 1
  %shifted.out.0 = lshr i32 %fpscr.out.0, 29
  %carry.out.0 = and i32 1, %shifted.out.0
  %fpscr.in.1 = shl i32 %carry.out.0, 29
  %outpair.1 = call { <4 x i32>, i32 } @llvm.arm.mve.vadc.v4i32(<4 x i32> %a.1, <4 x i32> %b.1, i32 %fpscr.in.1)
  %fpscr.out.1 = extractvalue { <4 x i32>, i32 } %outpair.1, 1
  %shifted.out.1 = lshr i32 %fpscr.out.1, 29
  %carry.out.1 = and i32 1, %shifted.out.1
  ret i32 %carry.out.1
}

define arm_aapcs_vfpcc i32 @test_vadciq_pred_multiple(%struct.foo %a, %struct.foo %b, i32 %ipred, i32 %carry) {
entry:
  %a.0 = extractvalue %struct.foo %a, 0, 0
  %a.1 = extractvalue %struct.foo %a, 0, 1
  %b.0 = extractvalue %struct.foo %b, 0, 0
  %b.1 = extractvalue %struct.foo %b, 0, 1
  %vpred = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %ipred)
  %fpscr.in.0 = shl i32 %carry, 29
  %outpair.0 = call { <4 x i32>, i32 } @llvm.arm.mve.vadc.predicated.v4i32.v4i1(<4 x i32> undef, <4 x i32> %a.0, <4 x i32> %b.0, i32 %fpscr.in.0, <4 x i1> %vpred)
  %fpscr.out.0 = extractvalue { <4 x i32>, i32 } %outpair.0, 1
  %shifted.out.0 = lshr i32 %fpscr.out.0, 29
  %carry.out.0 = and i32 1, %shifted.out.0
  %fpscr.in.1 = shl i32 %carry.out.0, 29
  %outpair.1 = call { <4 x i32>, i32 } @llvm.arm.mve.vadc.predicated.v4i32.v4i1(<4 x i32> undef, <4 x i32> %a.1, <4 x i32> %b.1, i32 %fpscr.in.1, <4 x i1> %vpred)
  %fpscr.out.1 = extractvalue { <4 x i32>, i32 } %outpair.1, 1
  %shifted.out.1 = lshr i32 %fpscr.out.1, 29
  %carry.out.1 = and i32 1, %shifted.out.1
  ret i32 %carry.out.1
}

declare { <4 x i32>, i32 } @llvm.arm.mve.vadc.v4i32(<4 x i32>, <4 x i32>, i32)
declare { <4 x i32>, i32 } @llvm.arm.mve.vadc.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, <4 x i32>, i32, <4 x i1>)
declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32)

; Expect the code in between the two intrinsic calls, which converts
; the FPSCR-formatted output value back down to just the carry bit at
; bit 0 and then back up to bit 29 for the next call, to be optimized
; away completely by InstCombine, so that the FPSCR output from one
; intrinsic is passed straight on to the next:
; IR: %outpair.0 = call { <4 x i32>, i32 } @llvm.arm.mve.vadc.v4i32(<4 x i32> %a.0, <4 x i32> %b.0, i32 %fpscr.in.0)
; IR: %fpscr.out.0 = extractvalue { <4 x i32>, i32 } %outpair.0, 1
; IR: %outpair.1 = call { <4 x i32>, i32 } @llvm.arm.mve.vadc.v4i32(<4 x i32> %a.1, <4 x i32> %b.1, i32 %fpscr.out.0)

; IR: %outpair.0 = call { <4 x i32>, i32 } @llvm.arm.mve.vadc.predicated.v4i32.v4i1(<4 x i32> undef, <4 x i32> %a.0, <4 x i32> %b.0, i32 %fpscr.in.0, <4 x i1> %vpred)
; IR: %fpscr.out.0 = extractvalue { <4 x i32>, i32 } %outpair.0, 1
; IR: %outpair.1 = call { <4 x i32>, i32 } @llvm.arm.mve.vadc.predicated.v4i32.v4i1(<4 x i32> undef, <4 x i32> %a.1, <4 x i32> %b.1, i32 %fpscr.out.0, <4 x i1> %vpred)

; And this is the assembly language we expect at the end of it, with
; the two vadc.i32 instructions right next to each other, and the
; second one implicitly reusing the FPSCR written by the first.
; ASM: test_vadciq_multiple:
; ASM: lsls r0, r0, #29
; ASM-NEXT: vmsr fpscr_nzcvqc, r0
; ASM-NEXT: vadc.i32 q0, q0, q2
; ASM-NEXT: vadc.i32 q0, q1, q3
; ASM-NEXT: vmrs r0, fpscr_nzcvqc
; ASM-NEXT: ubfx r0, r0, #29, #1
; ASM-NEXT: bx lr

; ASM: test_vadciq_pred_multiple:
; ASM: lsls r1, r1, #29
; ASM-NEXT: vmsr p0, r0
; ASM-NEXT: vmsr fpscr_nzcvqc, r1
; ASM-NEXT: vpstt
; ASM-NEXT: vadct.i32 q0, q0, q2
; ASM-NEXT: vadct.i32 q0, q1, q3
; ASM-NEXT: vmrs r0, fpscr_nzcvqc
; ASM-NEXT: ubfx r0, r0, #29, #1
; ASM-NEXT: bx lr