From aec2c0c9b699f6380d0e6608be11a1927857c6bd Mon Sep 17 00:00:00 2001 From: Krzysztof Parzyszek Date: Fri, 1 Jun 2018 14:52:58 +0000 Subject: [PATCH] [Hexagon] Select HVX code for vector CTPOP, CTLZ, and CTTZ llvm-svn: 333760 --- llvm/lib/Target/Hexagon/HexagonISelLowering.h | 1 + .../Target/Hexagon/HexagonISelLoweringHVX.cpp | 67 +++++++--- llvm/lib/Target/Hexagon/HexagonPatternsHVX.td | 27 ++++ .../CodeGen/Hexagon/autohvx/bitcount-128b.ll | 124 +++++++++++++++++ .../CodeGen/Hexagon/autohvx/bitcount-64b.ll | 125 ++++++++++++++++++ 5 files changed, 329 insertions(+), 15 deletions(-) create mode 100644 llvm/test/CodeGen/Hexagon/autohvx/bitcount-128b.ll create mode 100644 llvm/test/CodeGen/Hexagon/autohvx/bitcount-64b.ll diff --git a/llvm/lib/Target/Hexagon/HexagonISelLowering.h b/llvm/lib/Target/Hexagon/HexagonISelLowering.h index cb570b4dc7ff..6dd193231e5f 100644 --- a/llvm/lib/Target/Hexagon/HexagonISelLowering.h +++ b/llvm/lib/Target/Hexagon/HexagonISelLowering.h @@ -425,6 +425,7 @@ namespace HexagonISD { SDValue LowerHvxAnyExt(SDValue Op, SelectionDAG &DAG) const; SDValue LowerHvxSignExt(SDValue Op, SelectionDAG &DAG) const; SDValue LowerHvxZeroExt(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerHvxCttz(SDValue Op, SelectionDAG &DAG) const; SDValue LowerHvxMul(SDValue Op, SelectionDAG &DAG) const; SDValue LowerHvxMulh(SDValue Op, SelectionDAG &DAG) const; SDValue LowerHvxSetCC(SDValue Op, SelectionDAG &DAG) const; diff --git a/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp b/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp index b75a7f2bde9d..9d53a58d913e 100644 --- a/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp +++ b/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp @@ -69,21 +69,25 @@ HexagonTargetLowering::initializeHVXLowering() { setOperationAction(ISD::VECTOR_SHUFFLE, ByteV, Legal); setOperationAction(ISD::VECTOR_SHUFFLE, ByteW, Legal); - setOperationAction(ISD::AND, ByteV, Legal); - setOperationAction(ISD::OR, ByteV, Legal); - setOperationAction(ISD::XOR, ByteV, Legal); for (MVT T : LegalV) { setIndexedLoadAction(ISD::POST_INC, T, Legal); setIndexedStoreAction(ISD::POST_INC, T, Legal); - setOperationAction(ISD::ADD, T, Legal); - setOperationAction(ISD::SUB, T, Legal); + setOperationAction(ISD::AND, T, Legal); + setOperationAction(ISD::OR, T, Legal); + setOperationAction(ISD::XOR, T, Legal); + setOperationAction(ISD::ADD, T, Legal); + setOperationAction(ISD::SUB, T, Legal); + setOperationAction(ISD::CTPOP, T, Legal); + setOperationAction(ISD::CTLZ, T, Legal); if (T != ByteV) { setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, T, Legal); setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, T, Legal); + setOperationAction(ISD::BSWAP, T, Legal); } + setOperationAction(ISD::CTTZ, T, Custom); setOperationAction(ISD::LOAD, T, Custom); setOperationAction(ISD::MUL, T, Custom); setOperationAction(ISD::MULHS, T, Custom); @@ -104,6 +108,9 @@ HexagonTargetLowering::initializeHVXLowering() { setOperationAction(ISD::SRA, T, Custom); setOperationAction(ISD::SHL, T, Custom); setOperationAction(ISD::SRL, T, Custom); + + // Promote all shuffles to operate on vectors of bytes. + setPromoteTo(ISD::VECTOR_SHUFFLE, T, ByteV); } setCondCodeAction(ISD::SETNE, T, Expand); @@ -115,16 +122,6 @@ HexagonTargetLowering::initializeHVXLowering() { setCondCodeAction(ISD::SETULT, T, Expand); } - for (MVT T : LegalV) { - if (T == ByteV) - continue; - // Promote all shuffles to operate on vectors of bytes. - setPromoteTo(ISD::VECTOR_SHUFFLE, T, ByteV); - setPromoteTo(ISD::AND, T, ByteV); - setPromoteTo(ISD::OR, T, ByteV); - setPromoteTo(ISD::XOR, T, ByteV); - } - for (MVT T : LegalW) { // Custom-lower BUILD_VECTOR for vector pairs. The standard (target- // independent) handling of it would convert it to a load, which is @@ -145,6 +142,9 @@ HexagonTargetLowering::initializeHVXLowering() { setOperationAction(ISD::LOAD, T, Custom); setOperationAction(ISD::STORE, T, Custom); + setOperationAction(ISD::CTLZ, T, Custom); + setOperationAction(ISD::CTTZ, T, Custom); + setOperationAction(ISD::CTPOP, T, Custom); setOperationAction(ISD::ADD, T, Legal); setOperationAction(ISD::SUB, T, Legal); @@ -1157,6 +1157,40 @@ HexagonTargetLowering::LowerHvxZeroExt(SDValue Op, SelectionDAG &DAG) const { return Op; } +SDValue +HexagonTargetLowering::LowerHvxCttz(SDValue Op, SelectionDAG &DAG) const { + // Lower vector CTTZ into a computation using CTLZ (Hacker's Delight): + // cttz(x) = bitwidth(x) - ctlz(~x & (x-1)) + const SDLoc &dl(Op); + MVT ResTy = ty(Op); + SDValue InpV = Op.getOperand(0); + assert(ResTy == ty(InpV)); + + // Calculate the vectors of 1 and bitwidth(x). + MVT ElemTy = ty(InpV).getVectorElementType(); + unsigned ElemWidth = ElemTy.getSizeInBits(); + uint32_t Splat1 = 0, SplatW = 0; + assert(isPowerOf2_32(ElemWidth) && ElemWidth <= 32); + for (unsigned i = 0; i != 32/ElemWidth; ++i) { + Splat1 = (Splat1 << ElemWidth) | 1; + SplatW = (SplatW << ElemWidth) | ElemWidth; + } + SDValue Vec1 = DAG.getNode(HexagonISD::VSPLATW, dl, ResTy, + DAG.getConstant(Splat1, dl, MVT::i32)); + SDValue VecW = DAG.getNode(HexagonISD::VSPLATW, dl, ResTy, + DAG.getConstant(SplatW, dl, MVT::i32)); + SDValue VecN1 = DAG.getNode(HexagonISD::VSPLATW, dl, ResTy, + DAG.getConstant(-1, dl, MVT::i32)); + // Do not use DAG.getNOT, because that would create BUILD_VECTOR with + // a BITCAST. Here we can skip the BITCAST (so we don't have to handle + // it separately in custom combine or selection). + SDValue A = DAG.getNode(ISD::AND, dl, ResTy, + {DAG.getNode(ISD::XOR, dl, ResTy, {InpV, VecN1}), + DAG.getNode(ISD::SUB, dl, ResTy, {InpV, Vec1})}); + return DAG.getNode(ISD::SUB, dl, ResTy, + {VecW, DAG.getNode(ISD::CTLZ, dl, ResTy, A)}); +} + SDValue HexagonTargetLowering::LowerHvxMul(SDValue Op, SelectionDAG &DAG) const { MVT ResTy = ty(Op); @@ -1422,6 +1456,8 @@ HexagonTargetLowering::LowerHvxOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::LOAD: case ISD::STORE: return SplitHvxMemOp(Op, DAG); + case ISD::CTLZ: + case ISD::CTTZ: case ISD::MUL: case ISD::MULHS: case ISD::MULHU: @@ -1451,6 +1487,7 @@ HexagonTargetLowering::LowerHvxOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::ANY_EXTEND: return LowerHvxAnyExt(Op, DAG); case ISD::SIGN_EXTEND: return LowerHvxSignExt(Op, DAG); case ISD::ZERO_EXTEND: return LowerHvxZeroExt(Op, DAG); + case ISD::CTTZ: return LowerHvxCttz(Op, DAG); case ISD::SRA: case ISD::SHL: case ISD::SRL: return LowerHvxShift(Op, DAG); diff --git a/llvm/lib/Target/Hexagon/HexagonPatternsHVX.td b/llvm/lib/Target/Hexagon/HexagonPatternsHVX.td index 9ba88048b4b5..3b075cacf5e6 100644 --- a/llvm/lib/Target/Hexagon/HexagonPatternsHVX.td +++ b/llvm/lib/Target/Hexagon/HexagonPatternsHVX.td @@ -250,7 +250,19 @@ let Predicates = [UseHVX] in { def: Pat<(VecPI32 (HexagonVSPLATW I32:$Rs)), (Rep (Vsplatrw $Rs))>; } +class Vneg1 + : PatFrag<(ops), (VecTy (HexagonVSPLATW (i32 -1)))>; + +class Vnot + : PatFrag<(ops node:$Vs), (xor $Vs, Vneg1)>; + let Predicates = [UseHVX] in { + let AddedComplexity = 200 in { + def: Pat<(Vnot HVI8:$Vs), (V6_vnot HvxVR:$Vs)>; + def: Pat<(Vnot HVI16:$Vs), (V6_vnot HvxVR:$Vs)>; + def: Pat<(Vnot HVI32:$Vs), (V6_vnot HvxVR:$Vs)>; + } + def: OpR_RR_pat; def: OpR_RR_pat; def: OpR_RR_pat; @@ -378,6 +390,21 @@ let Predicates = [UseHVX] in { (V6_vdelta HvxVR:$Vs, (V6_lvsplatw (A2_tfrsi 0x01010101)))>; def: Pat<(VecI32 (bswap HVI32:$Vs)), (V6_vdelta HvxVR:$Vs, (V6_lvsplatw (A2_tfrsi 0x03030303)))>; + + def: Pat<(VecI8 (ctpop HVI8:$Vs)), + (V6_vpackeb (V6_vpopcounth (HiVec (V6_vunpackub HvxVR:$Vs))), + (V6_vpopcounth (LoVec (V6_vunpackub HvxVR:$Vs))))>; + def: Pat<(VecI16 (ctpop HVI16:$Vs)), (V6_vpopcounth HvxVR:$Vs)>; + def: Pat<(VecI32 (ctpop HVI32:$Vs)), + (V6_vaddw (LoVec (V6_vzh (V6_vpopcounth HvxVR:$Vs))), + (HiVec (V6_vzh (V6_vpopcounth HvxVR:$Vs))))>; + + def: Pat<(VecI8 (ctlz HVI8:$Vs)), + (V6_vsubb (V6_vpackeb (V6_vcl0h (HiVec (V6_vunpackub HvxVR:$Vs))), + (V6_vcl0h (LoVec (V6_vunpackub HvxVR:$Vs)))), + (V6_lvsplatw (A2_tfrsi 0x08080808)))>; + def: Pat<(VecI16 (ctlz HVI16:$Vs)), (V6_vcl0h HvxVR:$Vs)>; + def: Pat<(VecI32 (ctlz HVI32:$Vs)), (V6_vcl0w HvxVR:$Vs)>; } class HvxSel_pat diff --git a/llvm/test/CodeGen/Hexagon/autohvx/bitcount-128b.ll b/llvm/test/CodeGen/Hexagon/autohvx/bitcount-128b.ll new file mode 100644 index 000000000000..0d735f94eaef --- /dev/null +++ b/llvm/test/CodeGen/Hexagon/autohvx/bitcount-128b.ll @@ -0,0 +1,124 @@ +; RUN: llc -march=hexagon < %s | FileCheck %s + +; CHECK-LABEL: f0 +; CHECK: v[[V00:[0-9]+]]:[[V01:[0-9]+]].uh = vunpack(v0.ub) +; CHECK-DAG: v[[V02:[0-9]+]].h = vpopcount(v[[V00]].h) +; CHECK-DAG: v[[V03:[0-9]+]].h = vpopcount(v[[V01]].h) +; CHECK: v0.b = vpacke(v[[V02]].h,v[[V03]].h) +define <128 x i8> @f0(<128 x i8> %a0) #0 { + %t0 = call <128 x i8> @llvm.ctpop.v128i8(<128 x i8> %a0) + ret <128 x i8> %t0 +} + +; CHECK-LABEL: f1 +; CHECK: v0.h = vpopcount(v0.h) +define <64 x i16> @f1(<64 x i16> %a0) #0 { + %t0 = call <64 x i16> @llvm.ctpop.v64i16(<64 x i16> %a0) + ret <64 x i16> %t0 +} + +; CHECK-LABEL: f2 +; CHECK: v[[V20:[0-9]+]].h = vpopcount(v0.h) +; CHECK: v[[V21:[0-9]+]]:[[V22:[0-9]+]].uw = vzxt(v[[V20]].uh) +; CHECK: v0.w = vadd(v[[V22]].w,v[[V21]].w) +define <32 x i32> @f2(<32 x i32> %a0) #0 { + %t0 = call <32 x i32> @llvm.ctpop.v32i32(<32 x i32> %a0) + ret <32 x i32> %t0 +} + +; CHECK-LABEL: f3 +; CHECK-DAG: r[[R30:[0-9]+]] = ##134744072 +; CHECK-DAG: v[[V31:[0-9]+]]:[[V32:[0-9]+]].uh = vunpack(v0.ub) +; CHECK: v[[V33:[0-9]+]] = vsplat(r[[R30]]) +; CHECK-DAG: v[[V34:[0-9]+]].uh = vcl0(v[[V31]].uh) +; CHECK-DAG: v[[V35:[0-9]+]].uh = vcl0(v[[V32]].uh) +; CHECK: v[[V36:[0-9]+]].b = vpacke(v[[V34]].h,v[[V35]].h) +; CHECK: v0.b = vsub(v[[V36]].b,v[[V33]].b) +define <128 x i8> @f3(<128 x i8> %a0) #0 { + %t0 = call <128 x i8> @llvm.ctlz.v128i8(<128 x i8> %a0) + ret <128 x i8> %t0 +} + +; CHECK-LABEL: f4 +; CHECK: v0.uh = vcl0(v0.uh) +define <64 x i16> @f4(<64 x i16> %a0) #0 { + %t0 = call <64 x i16> @llvm.ctlz.v64i16(<64 x i16> %a0) + ret <64 x i16> %t0 +} + +; CHECK-LABEL: f5 +; CHECK: v0.uw = vcl0(v0.uw) +define <32 x i32> @f5(<32 x i32> %a0) #0 { + %t0 = call <32 x i32> @llvm.ctlz.v32i32(<32 x i32> %a0) + ret <32 x i32> %t0 +} + +; CHECK-LABEL: f6 +; r = 0x01010101 +; CHECK-DAG: r[[R60:[0-9]+]] = ##16843009 +; CHECK-DAG: v[[V61:[0-9]+]] = vnot(v0) +; r = 0x08080808 +; CHECK-DAG: r[[R62:[0-9]+]] = ##134744072 +; CHECK: v[[V63:[0-9]+]] = vsplat(r[[R60]]) +; CHECK-DAG: v[[V64:[0-9]+]] = vsplat(r[[R62]]) +; CHECK: v[[V65:[0-9]+]].b = vsub(v0.b,v[[V63]].b) +; CHECK: v[[V66:[0-9]+]] = vand(v[[V61]],v[[V65]]) +; Ctlz: +; CHECK: v[[V67:[0-9]+]]:[[V68:[0-9]+]].uh = vunpack(v[[V66]].ub) +; CHECK: v[[V69:[0-9]+]].uh = vcl0(v[[V68]].uh) +; CHECK: v[[V6A:[0-9]+]].uh = vcl0(v[[V67]].uh) +; CHECK: v[[V6B:[0-9]+]].b = vpacke(v[[V6A]].h,v[[V69]].h) +; CHECK: v[[V6C:[0-9]+]].b = vsub(v[[V6B]].b,v[[V64]].b) +; CHECK: v0.b = vsub(v[[V64]].b,v[[V6C]].b) +define <128 x i8> @f6(<128 x i8> %a0) #0 { + %t0 = call <128 x i8> @llvm.cttz.v128i8(<128 x i8> %a0) + ret <128 x i8> %t0 +} + +; CHECK-LABEL: f7 +; r = 0x00010001 +; CHECK-DAG: r[[R70:[0-9]+]] = ##65537 +; CHECK-DAG: v[[V71:[0-9]+]] = vnot(v0) +; r = 0x00100010 // halfword bitwidths +; CHECK-DAG: r[[R72:[0-9]+]] = ##1048592 +; CHECK: v[[V73:[0-9]+]] = vsplat(r[[R70]]) +; CHECK: v[[V74:[0-9]+]] = vsplat(r[[R72]]) +; CHECK: v[[V75:[0-9]+]].h = vsub(v0.h,v[[V73]].h) +; CHECK: v[[V76:[0-9]+]] = vand(v[[V71]],v[[V75]]) +; Ctlz: +; CHECK: v[[V77:[0-9]+]].uh = vcl0(v[[V76]].uh) +; CHECK: v0.h = vsub(v[[V74]].h,v[[V77]].h) +define <64 x i16> @f7(<64 x i16> %a0) #0 { + %t0 = call <64 x i16> @llvm.cttz.v64i16(<64 x i16> %a0) + ret <64 x i16> %t0 +} + +; CHECK-LABEL: f8 +; CHECK-DAG: r[[R80:[0-9]+]] = #1 +; CHECK-DAG: v[[V81:[0-9]+]] = vnot(v0) +; CHECK-DAG: r[[R82:[0-9]+]] = #32 +; CHECK: v[[V83:[0-9]+]] = vsplat(r[[R80]]) +; CHECK: v[[V84:[0-9]+]] = vsplat(r[[R82]]) +; CHECK: v[[V85:[0-9]+]].w = vsub(v0.w,v[[V83]].w) +; CHECK: v[[V86:[0-9]+]] = vand(v[[V81]],v[[V85]]) +; Ctlz: +; CHECK: v[[V87:[0-9]+]].uw = vcl0(v[[V86]].uw) +; CHECK: v0.w = vsub(v[[V84]].w,v[[V87]].w) +define <32 x i32> @f8(<32 x i32> %a0) #0 { + %t0 = call <32 x i32> @llvm.cttz.v32i32(<32 x i32> %a0) + ret <32 x i32> %t0 +} + +declare <128 x i8> @llvm.ctpop.v128i8(<128 x i8>) #0 +declare <64 x i16> @llvm.ctpop.v64i16(<64 x i16>) #0 +declare <32 x i32> @llvm.ctpop.v32i32(<32 x i32>) #0 + +declare <128 x i8> @llvm.ctlz.v128i8(<128 x i8>) #0 +declare <64 x i16> @llvm.ctlz.v64i16(<64 x i16>) #0 +declare <32 x i32> @llvm.ctlz.v32i32(<32 x i32>) #0 + +declare <128 x i8> @llvm.cttz.v128i8(<128 x i8>) #0 +declare <64 x i16> @llvm.cttz.v64i16(<64 x i16>) #0 +declare <32 x i32> @llvm.cttz.v32i32(<32 x i32>) #0 + +attributes #0 = { readnone nounwind "target-cpu"="hexagonv60" "target-features"="+hvx,+hvx-length128b,-packets" } diff --git a/llvm/test/CodeGen/Hexagon/autohvx/bitcount-64b.ll b/llvm/test/CodeGen/Hexagon/autohvx/bitcount-64b.ll new file mode 100644 index 000000000000..95cedad0ff60 --- /dev/null +++ b/llvm/test/CodeGen/Hexagon/autohvx/bitcount-64b.ll @@ -0,0 +1,125 @@ +; RUN: llc -march=hexagon < %s | FileCheck %s + +; CHECK-LABEL: f0 +; CHECK: v[[V00:[0-9]+]]:[[V01:[0-9]+]].uh = vunpack(v0.ub) +; CHECK-DAG: v[[V02:[0-9]+]].h = vpopcount(v[[V00]].h) +; CHECK-DAG: v[[V03:[0-9]+]].h = vpopcount(v[[V01]].h) +; CHECK: v0.b = vpacke(v[[V02]].h,v[[V03]].h) +define <64 x i8> @f0(<64 x i8> %a0) #0 { + %t0 = call <64 x i8> @llvm.ctpop.v64i8(<64 x i8> %a0) + ret <64 x i8> %t0 +} + +; CHECK-LABEL: f1 +; CHECK: v0.h = vpopcount(v0.h) +define <32 x i16> @f1(<32 x i16> %a0) #0 { + %t0 = call <32 x i16> @llvm.ctpop.v32i16(<32 x i16> %a0) + ret <32 x i16> %t0 +} + +; CHECK-LABEL: f2 +; CHECK: v[[V20:[0-9]+]].h = vpopcount(v0.h) +; CHECK: v[[V21:[0-9]+]]:[[V22:[0-9]+]].uw = vzxt(v[[V20]].uh) +; CHECK: v0.w = vadd(v[[V22]].w,v[[V21]].w) +define <16 x i32> @f2(<16 x i32> %a0) #0 { + %t0 = call <16 x i32> @llvm.ctpop.v16i32(<16 x i32> %a0) + ret <16 x i32> %t0 +} + +; CHECK-LABEL: f3 +; CHECK-DAG: r[[R30:[0-9]+]] = ##134744072 +; CHECK-DAG: v[[V31:[0-9]+]]:[[V32:[0-9]+]].uh = vunpack(v0.ub) +; CHECK: v[[V33:[0-9]+]] = vsplat(r[[R30]]) +; CHECK-DAG: v[[V34:[0-9]+]].uh = vcl0(v[[V31]].uh) +; CHECK-DAG: v[[V35:[0-9]+]].uh = vcl0(v[[V32]].uh) +; CHECK: v[[V36:[0-9]+]].b = vpacke(v[[V34]].h,v[[V35]].h) +; CHECK: v0.b = vsub(v[[V36]].b,v[[V33]].b) +define <64 x i8> @f3(<64 x i8> %a0) #0 { + %t0 = call <64 x i8> @llvm.ctlz.v64i8(<64 x i8> %a0) + ret <64 x i8> %t0 +} + +; CHECK-LABEL: f4 +; CHECK: v0.uh = vcl0(v0.uh) +define <32 x i16> @f4(<32 x i16> %a0) #0 { + %t0 = call <32 x i16> @llvm.ctlz.v32i16(<32 x i16> %a0) + ret <32 x i16> %t0 +} + +; CHECK-LABEL: f5 +; CHECK: v0.uw = vcl0(v0.uw) +define <16 x i32> @f5(<16 x i32> %a0) #0 { + %t0 = call <16 x i32> @llvm.ctlz.v16i32(<16 x i32> %a0) + ret <16 x i32> %t0 +} + +; CHECK-LABEL: f6 +; r = 0x01010101 +; CHECK-DAG: r[[R60:[0-9]+]] = ##16843009 +; CHECK-DAG: v[[V61:[0-9]+]] = vnot(v0) +; r = 0x08080808 +; CHECK-DAG: r[[R62:[0-9]+]] = ##134744072 +; CHECK: v[[V63:[0-9]+]] = vsplat(r[[R60]]) +; CHECK-DAG: v[[V64:[0-9]+]] = vsplat(r[[R62]]) +; CHECK: v[[V65:[0-9]+]].b = vsub(v0.b,v[[V63]].b) +; CHECK: v[[V66:[0-9]+]] = vand(v[[V61]],v[[V65]]) +; Ctlz: +; CHECK: v[[V67:[0-9]+]]:[[V68:[0-9]+]].uh = vunpack(v[[V66]].ub) +; CHECK: v[[V69:[0-9]+]].uh = vcl0(v[[V68]].uh) +; CHECK: v[[V6A:[0-9]+]].uh = vcl0(v[[V67]].uh) +; CHECK: v[[V6B:[0-9]+]].b = vpacke(v[[V6A]].h,v[[V69]].h) +; CHECK: v[[V6C:[0-9]+]].b = vsub(v[[V6B]].b,v[[V64]].b) +; CHECK: v0.b = vsub(v[[V64]].b,v[[V6C]].b) +define <64 x i8> @f6(<64 x i8> %a0) #0 { + %t0 = call <64 x i8> @llvm.cttz.v64i8(<64 x i8> %a0) + ret <64 x i8> %t0 +} + +; CHECK-LABEL: f7 +; r = 0x00010001 +; CHECK-DAG: r[[R70:[0-9]+]] = ##65537 +; CHECK-DAG: v[[V71:[0-9]+]] = vnot(v0) +; r = 0x00100010 // halfword bitwidths +; CHECK-DAG: r[[R72:[0-9]+]] = ##1048592 +; CHECK: v[[V73:[0-9]+]] = vsplat(r[[R70]]) +; CHECK: v[[V74:[0-9]+]] = vsplat(r[[R72]]) +; CHECK: v[[V75:[0-9]+]].h = vsub(v0.h,v[[V73]].h) +; CHECK: v[[V76:[0-9]+]] = vand(v[[V71]],v[[V75]]) +; Ctlz: +; CHECK: v[[V77:[0-9]+]].uh = vcl0(v[[V76]].uh) +; CHECK: v0.h = vsub(v[[V74]].h,v[[V77]].h) +define <32 x i16> @f7(<32 x i16> %a0) #0 { + %t0 = call <32 x i16> @llvm.cttz.v32i16(<32 x i16> %a0) + ret <32 x i16> %t0 +} + +; CHECK-LABEL: f8 +; CHECK-DAG: r[[R80:[0-9]+]] = #1 +; CHECK-DAG: v[[V81:[0-9]+]] = vnot(v0) +; CHECK-DAG: r[[R82:[0-9]+]] = #32 +; CHECK: v[[V83:[0-9]+]] = vsplat(r[[R80]]) +; CHECK: v[[V84:[0-9]+]] = vsplat(r[[R82]]) +; CHECK: v[[V85:[0-9]+]].w = vsub(v0.w,v[[V83]].w) +; CHECK: v[[V86:[0-9]+]] = vand(v[[V81]],v[[V85]]) +; Ctlz: +; CHECK: v[[V87:[0-9]+]].uw = vcl0(v[[V86]].uw) +; CHECK: v0.w = vsub(v[[V84]].w,v[[V87]].w) +define <16 x i32> @f8(<16 x i32> %a0) #0 { + %t0 = call <16 x i32> @llvm.cttz.v16i32(<16 x i32> %a0) + ret <16 x i32> %t0 +} + + +declare <64 x i8> @llvm.ctpop.v64i8(<64 x i8>) #0 +declare <32 x i16> @llvm.ctpop.v32i16(<32 x i16>) #0 +declare <16 x i32> @llvm.ctpop.v16i32(<16 x i32>) #0 + +declare <64 x i8> @llvm.ctlz.v64i8(<64 x i8>) #0 +declare <32 x i16> @llvm.ctlz.v32i16(<32 x i16>) #0 +declare <16 x i32> @llvm.ctlz.v16i32(<16 x i32>) #0 + +declare <64 x i8> @llvm.cttz.v64i8(<64 x i8>) #0 +declare <32 x i16> @llvm.cttz.v32i16(<32 x i16>) #0 +declare <16 x i32> @llvm.cttz.v16i32(<16 x i32>) #0 + +attributes #0 = { readnone nounwind "target-cpu"="hexagonv60" "target-features"="+hvx,+hvx-length64b,-packets" }