From 8491d01cc385d08b8b4f5dd097239ea0009ddc63 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo@fhahn.com>
Date: Fri, 16 Sep 2022 12:42:49 +0100
Subject: [PATCH] [AArch64] Lower vector trunc using tbl.

Similar to using tbl to lower vector ZExts, tbl4 can be used to lower
vector truncates. The initial version supports i32->i8 conversions.

Depends on D120571

Reviewed By: t.p.northover

Differential Revision: https://reviews.llvm.org/D133495
---
 llvm/lib/CodeGen/CodeGenPrepare.cpp           |   5 +-
 .../Target/AArch64/AArch64ISelLowering.cpp    |  50 +++++++
 llvm/test/CodeGen/AArch64/trunc-to-tbl.ll     | 134 ++++++++++++++----
 3 files changed, 162 insertions(+), 27 deletions(-)
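To illustrate the transform, a hand-written IR sketch (not output of this
patch; the function name is invented, and the mask shown is the little-endian
one, while createTblForTrunc below selects byte 3 of each lane on big-endian):
'trunc <16 x i32> %x to <16 x i8>' becomes roughly

define <16 x i8> @trunc_v16i32_sketch(<16 x i32> %x) {
  ; Split the source into four <4 x i32> quarters and reinterpret them as
  ; bytes; together they form the 64-byte tbl4 table.
  %p0 = shufflevector <16 x i32> %x, <16 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %p1 = shufflevector <16 x i32> %x, <16 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %p2 = shufflevector <16 x i32> %x, <16 x i32> poison, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
  %p3 = shufflevector <16 x i32> %x, <16 x i32> poison, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
  %b0 = bitcast <4 x i32> %p0 to <16 x i8>
  %b1 = bitcast <4 x i32> %p1 to <16 x i8>
  %b2 = bitcast <4 x i32> %p2 to <16 x i8>
  %b3 = bitcast <4 x i32> %p3 to <16 x i8>
  ; The mask gathers the low byte of each 32-bit lane: bytes 0, 4, ..., 60.
  %r = call <16 x i8> @llvm.aarch64.neon.tbl4.v16i8(<16 x i8> %b0, <16 x i8> %b1, <16 x i8> %b2, <16 x i8> %b3, <16 x i8> <i8 0, i8 4, i8 8, i8 12, i8 16, i8 20, i8 24, i8 28, i8 32, i8 36, i8 40, i8 44, i8 48, i8 52, i8 56, i8 60>)
  ret <16 x i8> %r
}

declare <16 x i8> @llvm.aarch64.neon.tbl4.v16i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>)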
diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp
index 45416edb3011..17dac33840a6 100644
--- a/llvm/lib/CodeGen/CodeGenPrepare.cpp
+++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp
@@ -8047,8 +8047,9 @@ bool CodeGenPrepare::optimizeInst(Instruction *I, ModifyDT &ModifiedDT) {
     if (OptimizeNoopCopyExpression(CI, *TLI, *DL))
       return true;
 
-    if (isa<ZExtInst>(I) && TLI->optimizeExtendOrTruncateConversion(
-                                I, LI->getLoopFor(I->getParent())))
+    if ((isa<ZExtInst>(I) || isa<TruncInst>(I)) &&
+        TLI->optimizeExtendOrTruncateConversion(I,
+                                                LI->getLoopFor(I->getParent())))
       return true;
 
     if (isa<ZExtInst>(I) || isa<SExtInst>(I)) {
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 26fbcc71a555..16926e8f5688 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -13209,6 +13209,44 @@ static void createTblShuffleForZExt(ZExtInst *ZExt, bool IsLittleEndian) {
   ZExt->eraseFromParent();
 }
 
+static void createTblForTrunc(TruncInst *TI, bool IsLittleEndian) {
+  IRBuilder<> Builder(TI);
+  SmallVector<Value *> Parts;
+  Type *VecTy = FixedVectorType::get(Builder.getInt8Ty(), 16);
+  Parts.push_back(Builder.CreateBitCast(
+      Builder.CreateShuffleVector(TI->getOperand(0), {0, 1, 2, 3}), VecTy));
+  Parts.push_back(Builder.CreateBitCast(
+      Builder.CreateShuffleVector(TI->getOperand(0), {4, 5, 6, 7}), VecTy));
+
+  Intrinsic::ID TblID = Intrinsic::aarch64_neon_tbl2;
+  unsigned NumElements = cast<FixedVectorType>(TI->getType())->getNumElements();
+  if (NumElements == 16) {
+    Parts.push_back(Builder.CreateBitCast(
+        Builder.CreateShuffleVector(TI->getOperand(0), {8, 9, 10, 11}), VecTy));
+    Parts.push_back(Builder.CreateBitCast(
+        Builder.CreateShuffleVector(TI->getOperand(0), {12, 13, 14, 15}),
+        VecTy));
+    TblID = Intrinsic::aarch64_neon_tbl4;
+  }
+  SmallVector<Constant *, 16> MaskConst;
+  for (unsigned Idx = 0; Idx < NumElements * 4; Idx += 4)
+    MaskConst.push_back(
+        ConstantInt::get(Builder.getInt8Ty(), IsLittleEndian ? Idx : Idx + 3));
+
+  for (unsigned Idx = NumElements * 4; Idx < 64; Idx += 4)
+    MaskConst.push_back(ConstantInt::get(Builder.getInt8Ty(), 255));
+
+  Parts.push_back(ConstantVector::get(MaskConst));
+  auto *F =
+      Intrinsic::getDeclaration(TI->getModule(), TblID, Parts[0]->getType());
+  Value *Res = Builder.CreateCall(F, Parts);
+
+  if (NumElements == 8)
+    Res = Builder.CreateShuffleVector(Res, {0, 1, 2, 3, 4, 5, 6, 7});
+  TI->replaceAllUsesWith(Res);
+  TI->eraseFromParent();
+}
+
 bool AArch64TargetLowering::optimizeExtendOrTruncateConversion(Instruction *I,
                                                                Loop *L) const {
   // Try to optimize conversions using tbl. This requires materializing constant
@@ -13250,6 +13288,18 @@ bool AArch64TargetLowering::optimizeExtendOrTruncateConversion(Instruction *I,
     createTblShuffleForZExt(ZExt, Subtarget->isLittleEndian());
     return true;
   }
+
+  // Convert 'trunc <(8|16) x i32> %x to <(8|16) x i8>' to a single tbl.4
+  // instruction selecting the lowest 8 bits per lane of the input interpreted
+  // as 2 or 4 <4 x i32> vectors.
+  auto *TI = dyn_cast<TruncInst>(I);
+  if (TI && (SrcTy->getNumElements() == 8 || SrcTy->getNumElements() == 16) &&
+      SrcTy->getElementType()->isIntegerTy(32) &&
+      DstTy->getElementType()->isIntegerTy(8)) {
+    createTblForTrunc(TI, Subtarget->isLittleEndian());
+    return true;
+  }
+
   return false;
 }
 
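For the 8-element case, createTblForTrunc still builds a full 16-byte mask:
the first eight indices select the low byte of each lane, the rest are 255,
which is out of range for tbl and therefore produces zero, and a final
shuffle keeps only the low half. A hand-written sketch under the same
assumptions as above (little-endian, invented function name):

define <8 x i8> @trunc_v8i32_sketch(<8 x i32> %x) {
  ; Two <4 x i32> halves, viewed as bytes, form the 32-byte tbl2 table.
  %p0 = shufflevector <8 x i32> %x, <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %p1 = shufflevector <8 x i32> %x, <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %b0 = bitcast <4 x i32> %p0 to <16 x i8>
  %b1 = bitcast <4 x i32> %p1 to <16 x i8>
  ; Indices 0, 4, ..., 28 pick the low byte per lane; 255 zeroes the tail.
  %r = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> %b0, <16 x i8> %b1, <16 x i8> <i8 0, i8 4, i8 8, i8 12, i8 16, i8 20, i8 24, i8 28, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255>)
  ; Only the low 8 bytes carry truncated values.
  %t = shufflevector <16 x i8> %r, <16 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x i8> %t
}

declare <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8>, <16 x i8>, <16 x i8>)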
diff --git a/llvm/test/CodeGen/AArch64/trunc-to-tbl.ll b/llvm/test/CodeGen/AArch64/trunc-to-tbl.ll
index 80aff21dc86e..76488f588177 100644
--- a/llvm/test/CodeGen/AArch64/trunc-to-tbl.ll
+++ b/llvm/test/CodeGen/AArch64/trunc-to-tbl.ll
@@ -2,49 +2,90 @@
 ; RUN: llc -mtriple=arm64-apple-ios -o - %s | FileCheck %s
 ; RUN: llc -mtriple=aarch64_be-unknown-linux -o - %s | FileCheck --check-prefix=CHECK-BE %s
 
+; CHECK-LABEL: lCPI0_0:
+; CHECK-NEXT: .byte 0 ; 0x0
+; CHECK-NEXT: .byte 4 ; 0x4
+; CHECK-NEXT: .byte 8 ; 0x8
+; CHECK-NEXT: .byte 12 ; 0xc
+; CHECK-NEXT: .byte 16 ; 0x10
+; CHECK-NEXT: .byte 20 ; 0x14
+; CHECK-NEXT: .byte 24 ; 0x18
+; CHECK-NEXT: .byte 28 ; 0x1c
+; CHECK-NEXT: .byte 32 ; 0x20
+; CHECK-NEXT: .byte 36 ; 0x24
+; CHECK-NEXT: .byte 40 ; 0x28
+; CHECK-NEXT: .byte 44 ; 0x2c
+; CHECK-NEXT: .byte 48 ; 0x30
+; CHECK-NEXT: .byte 52 ; 0x34
+; CHECK-NEXT: .byte 56 ; 0x38
+; CHECK-NEXT: .byte 60 ; 0x3c
+
+; CHECK-BE-LABEL: .LCPI0_0:
+; CHECK-BE-NEXT: .byte 3 // 0x3
+; CHECK-BE-NEXT: .byte 7 // 0x7
+; CHECK-BE-NEXT: .byte 11 // 0xb
+; CHECK-BE-NEXT: .byte 15 // 0xf
+; CHECK-BE-NEXT: .byte 19 // 0x13
+; CHECK-BE-NEXT: .byte 23 // 0x17
+; CHECK-BE-NEXT: .byte 27 // 0x1b
+; CHECK-BE-NEXT: .byte 31 // 0x1f
+; CHECK-BE-NEXT: .byte 35 // 0x23
+; CHECK-BE-NEXT: .byte 39 // 0x27
+; CHECK-BE-NEXT: .byte 43 // 0x2b
+; CHECK-BE-NEXT: .byte 47 // 0x2f
+; CHECK-BE-NEXT: .byte 51 // 0x33
+; CHECK-BE-NEXT: .byte 55 // 0x37
+; CHECK-BE-NEXT: .byte 59 // 0x3b
+; CHECK-BE-NEXT: .byte 63 // 0x3f
+
 ; It's profitable to use a single tbl.4 instruction to lower the truncate.
 define void @trunc_v16i32_to_v16i8_in_loop(ptr %A, ptr %dst) {
 ; CHECK-LABEL: trunc_v16i32_to_v16i8_in_loop:
 ; CHECK:       ; %bb.0: ; %entry
+; CHECK-NEXT:  Lloh0:
+; CHECK-NEXT:    adrp x9, lCPI0_0@PAGE
 ; CHECK-NEXT:    mov x8, xzr
+; CHECK-NEXT:  Lloh1:
+; CHECK-NEXT:    ldr q0, [x9, lCPI0_0@PAGEOFF]
 ; CHECK-NEXT:  LBB0_1: ; %loop
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    add x9, x0, x8, lsl #6
-; CHECK-NEXT:    ldp q1, q0, [x9, #32]
-; CHECK-NEXT:    ldp q3, q2, [x9]
-; CHECK-NEXT:    uzp1.8h v0, v1, v0
-; CHECK-NEXT:    uzp1.8h v1, v3, v2
-; CHECK-NEXT:    uzp1.16b v0, v1, v0
-; CHECK-NEXT:    str q0, [x1, x8, lsl #4]
+; CHECK-NEXT:    ldp q1, q2, [x9]
+; CHECK-NEXT:    ldp q3, q4, [x9, #32]
+; CHECK-NEXT:    tbl.16b v1, { v1, v2, v3, v4 }, v0
+; CHECK-NEXT:    str q1, [x1, x8, lsl #4]
 ; CHECK-NEXT:    add x8, x8, #1
 ; CHECK-NEXT:    cmp x8, #1000
 ; CHECK-NEXT:    b.eq LBB0_1
 ; CHECK-NEXT:  ; %bb.2: ; %exit
 ; CHECK-NEXT:    ret
+; CHECK-NEXT:    .loh AdrpLdr Lloh0, Lloh1
 ;
 ; CHECK-BE-LABEL: trunc_v16i32_to_v16i8_in_loop:
 ; CHECK-BE:       // %bb.0: // %entry
+; CHECK-BE-NEXT:    adrp x8, .LCPI0_0
+; CHECK-BE-NEXT:    add x8, x8, :lo12:.LCPI0_0
+; CHECK-BE-NEXT:    ld1 { v0.16b }, [x8]
 ; CHECK-BE-NEXT:    mov x8, xzr
 ; CHECK-BE-NEXT:  .LBB0_1: // %loop
 ; CHECK-BE-NEXT:    // =>This Inner Loop Header: Depth=1
 ; CHECK-BE-NEXT:    add x9, x0, x8, lsl #6
-; CHECK-BE-NEXT:    add x10, x9, #48
+; CHECK-BE-NEXT:    add x10, x9, #16
 ; CHECK-BE-NEXT:    add x11, x9, #32
-; CHECK-BE-NEXT:    ld1 { v0.4s }, [x9]
-; CHECK-BE-NEXT:    add x9, x9, #16
-; CHECK-BE-NEXT:    ld1 { v1.4s }, [x10]
-; CHECK-BE-NEXT:    ld1 { v2.4s }, [x11]
-; CHECK-BE-NEXT:    ld1 { v3.4s }, [x9]
+; CHECK-BE-NEXT:    ld1 { v1.16b }, [x9]
+; CHECK-BE-NEXT:    add x9, x9, #48
+; CHECK-BE-NEXT:    ld1 { v2.16b }, [x10]
+; CHECK-BE-NEXT:    ld1 { v3.16b }, [x11]
+; CHECK-BE-NEXT:    ld1 { v4.16b }, [x9]
 ; CHECK-BE-NEXT:    add x9, x1, x8, lsl #4
 ; CHECK-BE-NEXT:    add x8, x8, #1
 ; CHECK-BE-NEXT:    cmp x8, #1000
-; CHECK-BE-NEXT:    uzp1 v1.8h, v2.8h, v1.8h
-; CHECK-BE-NEXT:    uzp1 v0.8h, v0.8h, v3.8h
-; CHECK-BE-NEXT:    uzp1 v0.16b, v0.16b, v1.16b
-; CHECK-BE-NEXT:    st1 { v0.16b }, [x9]
+; CHECK-BE-NEXT:    tbl v1.16b, { v1.16b, v2.16b, v3.16b, v4.16b }, v0.16b
+; CHECK-BE-NEXT:    st1 { v1.16b }, [x9]
 ; CHECK-BE-NEXT:    b.eq .LBB0_1
 ; CHECK-BE-NEXT:    // %bb.2: // %exit
 ; CHECK-BE-NEXT:    ret
+
 entry:
   br label %loop
 
@@ -97,42 +138,85 @@ entry:
   ret void
 }
 
+
+; CHECK-LABEL: lCPI2_0:
+; CHECK-NEXT: .byte 0 ; 0x0
+; CHECK-NEXT: .byte 4 ; 0x4
+; CHECK-NEXT: .byte 8 ; 0x8
+; CHECK-NEXT: .byte 12 ; 0xc
+; CHECK-NEXT: .byte 16 ; 0x10
+; CHECK-NEXT: .byte 20 ; 0x14
+; CHECK-NEXT: .byte 24 ; 0x18
+; CHECK-NEXT: .byte 28 ; 0x1c
+; CHECK-NEXT: .byte 255 ; 0xff
+; CHECK-NEXT: .byte 255 ; 0xff
+; CHECK-NEXT: .byte 255 ; 0xff
+; CHECK-NEXT: .byte 255 ; 0xff
+; CHECK-NEXT: .byte 255 ; 0xff
+; CHECK-NEXT: .byte 255 ; 0xff
+; CHECK-NEXT: .byte 255 ; 0xff
+; CHECK-NEXT: .byte 255 ; 0xff
+
+; CHECK-BE-LABEL: .LCPI2_0:
+; CHECK-BE-NEXT: .byte 3 // 0x3
+; CHECK-BE-NEXT: .byte 7 // 0x7
+; CHECK-BE-NEXT: .byte 11 // 0xb
+; CHECK-BE-NEXT: .byte 15 // 0xf
+; CHECK-BE-NEXT: .byte 19 // 0x13
+; CHECK-BE-NEXT: .byte 23 // 0x17
+; CHECK-BE-NEXT: .byte 27 // 0x1b
+; CHECK-BE-NEXT: .byte 31 // 0x1f
+; CHECK-BE-NEXT: .byte 255 // 0xff
+; CHECK-BE-NEXT: .byte 255 // 0xff
+; CHECK-BE-NEXT: .byte 255 // 0xff
+; CHECK-BE-NEXT: .byte 255 // 0xff
+; CHECK-BE-NEXT: .byte 255 // 0xff
+; CHECK-BE-NEXT: .byte 255 // 0xff
+; CHECK-BE-NEXT: .byte 255 // 0xff
+; CHECK-BE-NEXT: .byte 255 // 0xff
 ; It's profitable to use a single tbl.2 instruction to lower the truncate.
 define void @trunc_v8i32_to_v8i8_in_loop(ptr %A, ptr %dst) {
 ; CHECK-LABEL: trunc_v8i32_to_v8i8_in_loop:
 ; CHECK:       ; %bb.0: ; %entry
+; CHECK-NEXT:  Lloh2:
+; CHECK-NEXT:    adrp x9, lCPI2_0@PAGE
 ; CHECK-NEXT:    mov x8, xzr
+; CHECK-NEXT:  Lloh3:
+; CHECK-NEXT:    ldr q0, [x9, lCPI2_0@PAGEOFF]
 ; CHECK-NEXT:  LBB2_1: ; %loop
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    add x9, x0, x8, lsl #5
-; CHECK-NEXT:    ldp q1, q0, [x9]
-; CHECK-NEXT:    uzp1.8h v0, v1, v0
-; CHECK-NEXT:    xtn.8b v0, v0
-; CHECK-NEXT:    str d0, [x1, x8, lsl #3]
+; CHECK-NEXT:    ldp q1, q2, [x9]
+; CHECK-NEXT:    tbl.16b v1, { v1, v2 }, v0
+; CHECK-NEXT:    str d1, [x1, x8, lsl #3]
 ; CHECK-NEXT:    add x8, x8, #1
 ; CHECK-NEXT:    cmp x8, #1000
 ; CHECK-NEXT:    b.eq LBB2_1
 ; CHECK-NEXT:  ; %bb.2: ; %exit
 ; CHECK-NEXT:    ret
+; CHECK-NEXT:    .loh AdrpLdr Lloh2, Lloh3
 ;
 ; CHECK-BE-LABEL: trunc_v8i32_to_v8i8_in_loop:
 ; CHECK-BE:       // %bb.0: // %entry
+; CHECK-BE-NEXT:    adrp x8, .LCPI2_0
+; CHECK-BE-NEXT:    add x8, x8, :lo12:.LCPI2_0
+; CHECK-BE-NEXT:    ld1 { v0.16b }, [x8]
 ; CHECK-BE-NEXT:    mov x8, xzr
 ; CHECK-BE-NEXT:  .LBB2_1: // %loop
 ; CHECK-BE-NEXT:    // =>This Inner Loop Header: Depth=1
 ; CHECK-BE-NEXT:    add x9, x0, x8, lsl #5
 ; CHECK-BE-NEXT:    add x10, x9, #16
-; CHECK-BE-NEXT:    ld1 { v0.4s }, [x9]
+; CHECK-BE-NEXT:    ld1 { v1.16b }, [x9]
 ; CHECK-BE-NEXT:    add x9, x1, x8, lsl #3
 ; CHECK-BE-NEXT:    add x8, x8, #1
-; CHECK-BE-NEXT:    ld1 { v1.4s }, [x10]
+; CHECK-BE-NEXT:    ld1 { v2.16b }, [x10]
 ; CHECK-BE-NEXT:    cmp x8, #1000
-; CHECK-BE-NEXT:    uzp1 v0.8h, v0.8h, v1.8h
-; CHECK-BE-NEXT:    xtn v0.8b, v0.8h
-; CHECK-BE-NEXT:    st1 { v0.8b }, [x9]
+; CHECK-BE-NEXT:    tbl v1.16b, { v1.16b, v2.16b }, v0.16b
+; CHECK-BE-NEXT:    st1 { v1.8b }, [x9]
 ; CHECK-BE-NEXT:    b.eq .LBB2_1
 ; CHECK-BE-NEXT:    // %bb.2: // %exit
 ; CHECK-BE-NEXT:    ret
+
 entry:
   br label %loop