From 7c6b229204c0dfcbe5758f0fd1bee370536a5658 Mon Sep 17 00:00:00 2001 From: Sam Tebbs Date: Thu, 22 Aug 2019 08:12:06 +0000 Subject: [PATCH] [ARM] Fix lsrl with a 128/256 bit shift amount or a shift of 32 This patch fixes shifts by a 128/256 bit shift amount. It also fixes codegen for shifts of 32 by delegating to LLVM's default optimisation instead of emitting a long shift. Tests that used to generate long shifts of 32 are updated to check for the more optimised codegen. Differential revision: https://reviews.llvm.org/D66519 llvm-svn: 369626 --- llvm/lib/Target/ARM/ARMISelLowering.cpp | 13 +++-- llvm/test/CodeGen/ARM/shift_parts.ll | 65 +++++++++++++++++++++- llvm/test/CodeGen/Thumb2/mve-abs.ll | 64 ++++++++++----------- llvm/test/CodeGen/Thumb2/mve-div-expand.ll | 12 ++-- llvm/test/CodeGen/Thumb2/mve-vcvt.ll | 12 ++-- 5 files changed, 108 insertions(+), 58 deletions(-) diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp index 04ac7777f5e0..275859a6b912 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -5938,14 +5938,15 @@ static SDValue Expand64BitShift(SDNode *N, SelectionDAG &DAG, unsigned ShPartsOpc = ARMISD::LSLL; ConstantSDNode *Con = dyn_cast<ConstantSDNode>(ShAmt); - // If the shift amount is greater than 32 then do the default optimisation - if (Con && Con->getZExtValue() > 32) + // If the shift amount is 32 or greater, or has a bitwidth greater than 64, + // then do the default optimisation + if (ShAmt->getValueType(0).getSizeInBits() > 64 || + (Con && Con->getZExtValue() >= 32)) return SDValue(); - // Extract the lower 32 bits of the shift amount if it's an i64 - if (ShAmt->getValueType(0) == MVT::i64) - ShAmt = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, ShAmt, - DAG.getConstant(0, dl, MVT::i32)); + // Extract the lower 32 bits of the shift amount if it's not an i32 + if (ShAmt->getValueType(0) != MVT::i32) + ShAmt = DAG.getZExtOrTrunc(ShAmt, dl, MVT::i32); if (ShOpc == 
ISD::SRL) { if (!Con) diff --git a/llvm/test/CodeGen/ARM/shift_parts.ll b/llvm/test/CodeGen/ARM/shift_parts.ll index a3a98e6d2520..9bc77d585bf9 100644 --- a/llvm/test/CodeGen/ARM/shift_parts.ll +++ b/llvm/test/CodeGen/ARM/shift_parts.ll @@ -52,7 +52,8 @@ entry: define i64 @shift_left_imm_big2(i64 %x) { ; CHECK-MVE-LABEL: shift_left_imm_big2: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: lsll r0, r1, #32 +; CHECK-MVE-NEXT: mov r1, r0 +; CHECK-MVE-NEXT: movs r0, #0 ; CHECK-MVE-NEXT: bx lr ; ; CHECK-NON-MVE-LABEL: shift_left_imm_big2: @@ -128,7 +129,8 @@ entry: define i64 @shift_right_imm_big2(i64 %x) { ; CHECK-MVE-LABEL: shift_right_imm_big2: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: lsrl r0, r1, #32 +; CHECK-MVE-NEXT: mov r0, r1 +; CHECK-MVE-NEXT: movs r1, #0 ; CHECK-MVE-NEXT: bx lr ; ; CHECK-NON-MVE-LABEL: shift_right_imm_big2: @@ -219,3 +221,62 @@ entry: store i40 %bf.clear, i40* %0, align 1 ret void } + +%struct.a = type { i96 } + +define void @lsll_128bit_shift(%struct.a* nocapture %x) local_unnamed_addr #0 { +; CHECK-MVE-LABEL: lsll_128bit_shift: +; CHECK-MVE: @ %bb.0: @ %entry +; CHECK-MVE-NEXT: movs r1, #0 +; CHECK-MVE-NEXT: strd r1, r1, [r0] +; CHECK-MVE-NEXT: str r1, [r0, #8] +; CHECK-MVE-NEXT: bx lr +; +; CHECK-NON-MVE-LABEL: lsll_128bit_shift: +; CHECK-NON-MVE: @ %bb.0: @ %entry +; CHECK-NON-MVE-NEXT: movs r1, #0 +; CHECK-NON-MVE-NEXT: str r1, [r0] +; CHECK-NON-MVE-NEXT: str r1, [r0, #4] +; CHECK-NON-MVE-NEXT: str r1, [r0, #8] +; CHECK-NON-MVE-NEXT: bx lr +entry: + %0 = bitcast %struct.a* %x to i128* + %bf.load = load i128, i128* %0, align 8 + %bf.clear4 = and i128 %bf.load, -79228162514264337593543950336 + store i128 %bf.clear4, i128* %0, align 8 + ret void +} + +%struct.b = type { i184 } + +define void @lsll_256bit_shift(%struct.b* nocapture %x) local_unnamed_addr #0 { +; CHECK-MVE-LABEL: lsll_256bit_shift: +; CHECK-MVE: @ %bb.0: @ %entry +; CHECK-MVE-NEXT: movs r1, #0 +; CHECK-MVE-NEXT: str r1, [r0, #16] +; CHECK-MVE-NEXT: strd r1, r1, [r0, 
#8] +; CHECK-MVE-NEXT: strd r1, r1, [r0] +; CHECK-MVE-NEXT: ldrb r1, [r0, #23] +; CHECK-MVE-NEXT: lsls r1, r1, #24 +; CHECK-MVE-NEXT: str r1, [r0, #20] +; CHECK-MVE-NEXT: bx lr +; +; CHECK-NON-MVE-LABEL: lsll_256bit_shift: +; CHECK-NON-MVE: @ %bb.0: @ %entry +; CHECK-NON-MVE-NEXT: movs r1, #0 +; CHECK-NON-MVE-NEXT: str r1, [r0, #16] +; CHECK-NON-MVE-NEXT: str r1, [r0, #8] +; CHECK-NON-MVE-NEXT: str r1, [r0, #12] +; CHECK-NON-MVE-NEXT: str r1, [r0] +; CHECK-NON-MVE-NEXT: str r1, [r0, #4] +; CHECK-NON-MVE-NEXT: ldrb r1, [r0, #23] +; CHECK-NON-MVE-NEXT: lsls r1, r1, #24 +; CHECK-NON-MVE-NEXT: str r1, [r0, #20] +; CHECK-NON-MVE-NEXT: bx lr +entry: + %0 = bitcast %struct.b* %x to i192* + %bf.load = load i192, i192* %0, align 8 + %bf.clear4 = and i192 %bf.load, -24519928653854221733733552434404946937899825954937634816 + store i192 %bf.clear4, i192* %0, align 8 + ret void +} diff --git a/llvm/test/CodeGen/Thumb2/mve-abs.ll b/llvm/test/CodeGen/Thumb2/mve-abs.ll index 6e2100e2f463..081157b07042 100644 --- a/llvm/test/CodeGen/Thumb2/mve-abs.ll +++ b/llvm/test/CodeGen/Thumb2/mve-abs.ll @@ -40,43 +40,39 @@ entry: define arm_aapcs_vfpcc <2 x i64> @abs_v2i64(<2 x i64> %s1) { ; CHECK-LABEL: abs_v2i64: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r6, r7, lr} -; CHECK-NEXT: push {r4, r5, r6, r7, lr} +; CHECK-NEXT: .save {r4, r5, r6, lr} +; CHECK-NEXT: push {r4, r5, r6, lr} ; CHECK-NEXT: vmov r12, s2 -; CHECK-NEXT: movs r0, #0 -; CHECK-NEXT: vmov r3, s3 -; CHECK-NEXT: vmov r1, s0 -; CHECK-NEXT: rsbs.w lr, r12, #0 -; CHECK-NEXT: sbc.w r5, r0, r3 -; CHECK-NEXT: cmp r3, #0 -; CHECK-NEXT: mov r2, lr -; CHECK-NEXT: lsrl r2, r5, #32 -; CHECK-NEXT: mov.w r5, #0 -; CHECK-NEXT: it mi -; CHECK-NEXT: movmi r5, #1 -; CHECK-NEXT: cmp r5, #0 -; CHECK-NEXT: it eq -; CHECK-NEXT: moveq r2, r3 -; CHECK-NEXT: vmov r3, s1 -; CHECK-NEXT: rsbs r4, r1, #0 -; CHECK-NEXT: mov r6, r4 -; CHECK-NEXT: sbc.w r7, r0, r3 -; CHECK-NEXT: cmp r3, #0 -; CHECK-NEXT: lsrl r6, r7, #32 -; CHECK-NEXT: it mi -; 
CHECK-NEXT: movmi r0, #1 +; CHECK-NEXT: movs r2, #0 +; CHECK-NEXT: vmov r0, s3 +; CHECK-NEXT: movs r1, #0 +; CHECK-NEXT: vmov r4, s0 +; CHECK-NEXT: rsbs.w r3, r12, #0 +; CHECK-NEXT: sbc.w lr, r2, r0 ; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: ite eq -; CHECK-NEXT: moveq r6, r3 -; CHECK-NEXT: movne r1, r4 -; CHECK-NEXT: vmov.32 q0[0], r1 -; CHECK-NEXT: cmp r5, #0 -; CHECK-NEXT: vmov.32 q0[1], r6 +; CHECK-NEXT: it mi +; CHECK-NEXT: movmi r1, #1 +; CHECK-NEXT: cmp r1, #0 ; CHECK-NEXT: it eq -; CHECK-NEXT: moveq lr, r12 -; CHECK-NEXT: vmov.32 q0[2], lr -; CHECK-NEXT: vmov.32 q0[3], r2 -; CHECK-NEXT: pop {r4, r5, r6, r7, pc} +; CHECK-NEXT: moveq lr, r0 +; CHECK-NEXT: vmov r0, s1 +; CHECK-NEXT: rsbs r5, r4, #0 +; CHECK-NEXT: sbc.w r6, r2, r0 +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: it mi +; CHECK-NEXT: movmi r2, #1 +; CHECK-NEXT: cmp r2, #0 +; CHECK-NEXT: itt eq +; CHECK-NEXT: moveq r6, r0 +; CHECK-NEXT: moveq r5, r4 +; CHECK-NEXT: vmov.32 q0[0], r5 +; CHECK-NEXT: vmov.32 q0[1], r6 +; CHECK-NEXT: cmp r1, #0 +; CHECK-NEXT: it eq +; CHECK-NEXT: moveq r3, r12 +; CHECK-NEXT: vmov.32 q0[2], r3 +; CHECK-NEXT: vmov.32 q0[3], lr +; CHECK-NEXT: pop {r4, r5, r6, pc} entry: %0 = icmp slt <2 x i64> %s1, zeroinitializer %1 = sub nsw <2 x i64> zeroinitializer, %s1 diff --git a/llvm/test/CodeGen/Thumb2/mve-div-expand.ll b/llvm/test/CodeGen/Thumb2/mve-div-expand.ll index e0dddcd273c2..794b340ad723 100644 --- a/llvm/test/CodeGen/Thumb2/mve-div-expand.ll +++ b/llvm/test/CodeGen/Thumb2/mve-div-expand.ll @@ -755,14 +755,12 @@ define arm_aapcs_vfpcc <2 x i64> @udiv_i64(<2 x i64> %in1, <2 x i64> %in2) { ; CHECK-NEXT: vmov r2, s18 ; CHECK-NEXT: vmov r3, s19 ; CHECK-NEXT: vmov.32 q4[0], r0 -; CHECK-NEXT: lsrl r0, r1, #32 -; CHECK-NEXT: vmov.32 q4[1], r0 +; CHECK-NEXT: vmov.32 q4[1], r1 ; CHECK-NEXT: mov r0, r12 ; CHECK-NEXT: mov r1, lr ; CHECK-NEXT: bl __aeabi_uldivmod ; CHECK-NEXT: vmov.32 q4[2], r0 -; CHECK-NEXT: lsrl r0, r1, #32 -; CHECK-NEXT: vmov.32 q4[3], r0 +; CHECK-NEXT: vmov.32 q4[3], r1 ; 
CHECK-NEXT: vmov q0, q4 ; CHECK-NEXT: vpop {d8, d9, d10, d11} ; CHECK-NEXT: pop {r7, pc} @@ -790,14 +788,12 @@ define arm_aapcs_vfpcc <2 x i64> @sdiv_i64(<2 x i64> %in1, <2 x i64> %in2) { ; CHECK-NEXT: vmov r2, s18 ; CHECK-NEXT: vmov r3, s19 ; CHECK-NEXT: vmov.32 q4[0], r0 -; CHECK-NEXT: lsrl r0, r1, #32 -; CHECK-NEXT: vmov.32 q4[1], r0 +; CHECK-NEXT: vmov.32 q4[1], r1 ; CHECK-NEXT: mov r0, r12 ; CHECK-NEXT: mov r1, lr ; CHECK-NEXT: bl __aeabi_ldivmod ; CHECK-NEXT: vmov.32 q4[2], r0 -; CHECK-NEXT: lsrl r0, r1, #32 -; CHECK-NEXT: vmov.32 q4[3], r0 +; CHECK-NEXT: vmov.32 q4[3], r1 ; CHECK-NEXT: vmov q0, q4 ; CHECK-NEXT: vpop {d8, d9, d10, d11} ; CHECK-NEXT: pop {r7, pc} diff --git a/llvm/test/CodeGen/Thumb2/mve-vcvt.ll b/llvm/test/CodeGen/Thumb2/mve-vcvt.ll index 524ec692c8c1..ff2c7927b099 100644 --- a/llvm/test/CodeGen/Thumb2/mve-vcvt.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vcvt.ll @@ -382,14 +382,12 @@ define arm_aapcs_vfpcc <2 x i64> @foo_int64_float(<2 x double> %src) { ; CHECK-NEXT: bl __aeabi_d2lz ; CHECK-NEXT: vmov r2, r3, d9 ; CHECK-NEXT: vmov.32 q4[0], r0 -; CHECK-NEXT: lsrl r0, r1, #32 -; CHECK-NEXT: vmov.32 q4[1], r0 +; CHECK-NEXT: vmov.32 q4[1], r1 ; CHECK-NEXT: mov r0, r2 ; CHECK-NEXT: mov r1, r3 ; CHECK-NEXT: bl __aeabi_d2lz ; CHECK-NEXT: vmov.32 q4[2], r0 -; CHECK-NEXT: lsrl r0, r1, #32 -; CHECK-NEXT: vmov.32 q4[3], r0 +; CHECK-NEXT: vmov.32 q4[3], r1 ; CHECK-NEXT: vmov q0, q4 ; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: pop {r7, pc} @@ -410,14 +408,12 @@ define arm_aapcs_vfpcc <2 x i64> @foo_uint64_float(<2 x double> %src) { ; CHECK-NEXT: bl __aeabi_d2ulz ; CHECK-NEXT: vmov r2, r3, d9 ; CHECK-NEXT: vmov.32 q4[0], r0 -; CHECK-NEXT: lsrl r0, r1, #32 -; CHECK-NEXT: vmov.32 q4[1], r0 +; CHECK-NEXT: vmov.32 q4[1], r1 ; CHECK-NEXT: mov r0, r2 ; CHECK-NEXT: mov r1, r3 ; CHECK-NEXT: bl __aeabi_d2ulz ; CHECK-NEXT: vmov.32 q4[2], r0 -; CHECK-NEXT: lsrl r0, r1, #32 -; CHECK-NEXT: vmov.32 q4[3], r0 +; CHECK-NEXT: vmov.32 q4[3], r1 ; CHECK-NEXT: vmov q0, q4 ; 
CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: pop {r7, pc}