[TargetLowering] Only demand a rotation's modulo amount bits
ISD::ROTL/ROTR rotation values are guaranteed to act as a modulo amount, so for power-of-2 bitwidths we only need the lowest bits.

Differential Revision: https://reviews.llvm.org/D76201
commit 68224c1952
parent c45eaeabb7
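
To make the effect concrete, here is a minimal LLVM IR sketch (illustrative only, not taken from this commit; the function name is made up): because an ISD::ROTL/ROTR amount is interpreted modulo the bit width, an explicit mask of the amount only keeps bits the rotate already demands, so for power-of-2 bit widths SimplifyDemandedBits can now fold the mask away.

; Hypothetical test: the 'and' keeps only the low 5 bits of %amt, which are
; exactly the bits a 32-bit rotate demands, so the combine can drop it.
declare i32 @llvm.fshl.i32(i32, i32, i32)

define i32 @rotl_masked_amt(i32 %x, i32 %amt) {
  %mod = and i32 %amt, 31                                   ; redundant for a 32-bit rotate
  %rot = call i32 @llvm.fshl.i32(i32 %x, i32 %x, i32 %mod)  ; fshl(x, x, amt) == rotl(x, amt)
  ret i32 %rot
}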

@@ -1647,10 +1647,19 @@ bool TargetLowering::SimplifyDemandedBits(
  case ISD::ROTL:
  case ISD::ROTR: {
    SDValue Op0 = Op.getOperand(0);
    SDValue Op1 = Op.getOperand(1);

    // If we're rotating an 0/-1 value, then it stays an 0/-1 value.
    if (BitWidth == TLO.DAG.ComputeNumSignBits(Op0, DemandedElts, Depth + 1))
      return TLO.CombineTo(Op, Op0);

    // For pow-2 bitwidths we only demand the bottom modulo amt bits.
    if (isPowerOf2_32(BitWidth)) {
      APInt DemandedAmtBits(Op1.getScalarValueSizeInBits(), BitWidth - 1);
      if (SimplifyDemandedBits(Op1, DemandedAmtBits, DemandedElts, Known2, TLO,
                               Depth + 1))
        return true;
    }
    break;
  }
  case ISD::BITREVERSE: {
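
A second hedged sketch of the same idea (again illustrative, not part of the commit): a rotate whose amount is computed by negation only demands the low log2(BitWidth) bits of that negation, which is what lets the AArch64 test further down narrow its 64-bit neg to a 32-bit one.

; Hypothetical test: only bits 5:0 of %neg are demanded by the 64-bit rotate,
; so the subtraction feeding the amount can be simplified or narrowed.
declare i64 @llvm.fshr.i64(i64, i64, i64)

define i64 @rotl_via_negated_rotr(i64 %x, i64 %z) {
  %neg = sub i64 0, %z                                      ; amount is used modulo 64
  %rot = call i64 @llvm.fshr.i64(i64 %x, i64 %x, i64 %neg)  ; rotr(x, -z) == rotl(x, z)
  ret i64 %rot
}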

@@ -284,6 +284,8 @@ SDValue AVRTargetLowering::LowerShifts(SDValue Op, SelectionDAG &DAG) const {
  const SDNode *N = Op.getNode();
  EVT VT = Op.getValueType();
  SDLoc dl(N);
  assert(isPowerOf2_32(VT.getSizeInBits()) &&
         "Expected power-of-2 shift amount");

  // Expand non-constant shifts to loops.
  if (!isa<ConstantSDNode>(N->getOperand(1))) {

@@ -296,12 +298,20 @@ SDValue AVRTargetLowering::LowerShifts(SDValue Op, SelectionDAG &DAG) const {
  case ISD::SRL:
    return DAG.getNode(AVRISD::LSRLOOP, dl, VT, N->getOperand(0),
                       N->getOperand(1));
  case ISD::ROTL:
    return DAG.getNode(AVRISD::ROLLOOP, dl, VT, N->getOperand(0),
                       N->getOperand(1));
  case ISD::ROTR:
    return DAG.getNode(AVRISD::RORLOOP, dl, VT, N->getOperand(0),
                       N->getOperand(1));
  case ISD::ROTL: {
    SDValue Amt = N->getOperand(1);
    EVT AmtVT = Amt.getValueType();
    Amt = DAG.getNode(ISD::AND, dl, AmtVT, Amt,
                      DAG.getConstant(VT.getSizeInBits() - 1, dl, AmtVT));
    return DAG.getNode(AVRISD::ROLLOOP, dl, VT, N->getOperand(0), Amt);
  }
  case ISD::ROTR: {
    SDValue Amt = N->getOperand(1);
    EVT AmtVT = Amt.getValueType();
    Amt = DAG.getNode(ISD::AND, dl, AmtVT, Amt,
                      DAG.getConstant(VT.getSizeInBits() - 1, dl, AmtVT));
    return DAG.getNode(AVRISD::RORLOOP, dl, VT, N->getOperand(0), Amt);
  }
  case ISD::SRA:
    return DAG.getNode(AVRISD::ASRLOOP, dl, VT, N->getOperand(0),
                       N->getOperand(1));

@@ -317,9 +327,11 @@ SDValue AVRTargetLowering::LowerShifts(SDValue Op, SelectionDAG &DAG) const {
    break;
  case ISD::ROTL:
    Opc8 = AVRISD::ROL;
    ShiftAmount = ShiftAmount % VT.getSizeInBits();
    break;
  case ISD::ROTR:
    Opc8 = AVRISD::ROR;
    ShiftAmount = ShiftAmount % VT.getSizeInBits();
    break;
  case ISD::SRL:
    Opc8 = AVRISD::LSR;

@@ -65,7 +65,7 @@ define i32 @rotl_i32(i32 %x, i32 %z) {
define i64 @rotl_i64(i64 %x, i64 %z) {
; CHECK-LABEL: rotl_i64:
; CHECK: // %bb.0:
; CHECK-NEXT: neg x8, x1
; CHECK-NEXT: neg w8, w1
; CHECK-NEXT: ror x0, x0, x8
; CHECK-NEXT: ret
  %f = call i64 @llvm.fshl.i64(i64 %x, i64 %x, i64 %z)

@@ -4,7 +4,6 @@
define i32 @rotl32(i32 %A, i8 %Amt) nounwind {
; CHECK-LABEL: rotl32:
; CHECK: # %bb.0:
; CHECK-NEXT: clrlwi 4, 4, 24
; CHECK-NEXT: rotlw 3, 3, 4
; CHECK-NEXT: blr
  %shift.upgrd.1 = zext i8 %Amt to i32 ; <i32> [#uses=1]

@@ -20,7 +19,6 @@ define i32 @rotr32(i32 %A, i8 %Amt) nounwind {
; CHECK-LABEL: rotr32:
; CHECK: # %bb.0:
; CHECK-NEXT: subfic 4, 4, 32
; CHECK-NEXT: clrlwi 4, 4, 24
; CHECK-NEXT: rotlw 3, 3, 4
; CHECK-NEXT: blr
  %shift.upgrd.3 = zext i8 %Amt to i32 ; <i32> [#uses=1]

@@ -7,10 +7,10 @@
define i32 @f1(i32 %val, i32 %amt) {
; CHECK-LABEL: f1:
; CHECK: # %bb.0:
; CHECK-NEXT: nill %r3, 31
; CHECK-NEXT: nill %r3, 15
; CHECK-NEXT: rll %r2, %r2, 0(%r3)
; CHECK-NEXT: br %r14
  %mod = urem i32 %amt, 32
  %mod = urem i32 %amt, 16

  %inv = sub i32 32, %mod
  %parta = shl i32 %val, %mod

@@ -4,14 +4,14 @@
;
; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s

; Test that AND is not removed when some lower 6 bits are not set.
; Test that AND is not removed when some lower 5 bits are not set.
define i32 @f1(i32 %val, i32 %amt) {
; CHECK-LABEL: f1:
; CHECK: # %bb.0:
; CHECK-NEXT: nill %r3, 31
; CHECK-NEXT: nill %r3, 15
; CHECK-NEXT: rll %r2, %r2, 0(%r3)
; CHECK-NEXT: br %r14
  %and = and i32 %amt, 31
  %and = and i32 %amt, 15

  %inv = sub i32 32, %and
  %parta = shl i32 %val, %and

@@ -97,12 +97,12 @@ define i32 @f7(i32 %a, i64 %amt) {
  ret i32 %or
}

; Check shift amounts that have the largest in-range constant term. We could
; mask the amount instead.
; Check shift amounts that have the largest in-range constant term, and then
; mask the amount.
define i32 @f8(i32 %a, i32 %amt) {
; CHECK-LABEL: f8:
; CHECK: # %bb.0:
; CHECK-NEXT: rll %r2, %r2, 524287(%r3)
; CHECK-NEXT: rll %r2, %r2, -1(%r3)
; CHECK-NEXT: br %r14
  %add = add i32 %amt, 524287
  %sub = sub i32 32, %add

@@ -157,13 +157,11 @@ define i32 @f11(i32 %a, i32 %amt) {
  ret i32 %or
}

; Check the next value down, which without masking must use a separate
; addition.
; Check the next value down, masking the amount removes the addition.
define i32 @f12(i32 %a, i32 %amt) {
; CHECK-LABEL: f12:
; CHECK: # %bb.0:
; CHECK-NEXT: afi %r3, -524289
; CHECK-NEXT: rll %r2, %r2, 0(%r3)
; CHECK-NEXT: rll %r2, %r2, -1(%r3)
; CHECK-NEXT: br %r14
  %suba = sub i32 %amt, 524289
  %subb = sub i32 32, %suba

@@ -98,12 +98,12 @@ define i64 @f7(i64 %a, i32 %amt) {
  ret i64 %or
}

; Check shift amounts that have the largest in-range constant term. We could
; mask the amount instead.
; Check shift amounts that have the largest in-range constant term, and then
; mask the amount.
define i64 @f8(i64 %a, i64 %amt) {
; CHECK-LABEL: f8:
; CHECK: # %bb.0:
; CHECK-NEXT: rllg %r2, %r2, 524287(%r3)
; CHECK-NEXT: rllg %r2, %r2, -1(%r3)
; CHECK-NEXT: br %r14
  %add = add i64 %amt, 524287
  %sub = sub i64 64, %add

@@ -158,13 +158,11 @@ define i64 @f11(i64 %a, i64 %amt) {
  ret i64 %or
}

; Check the next value down, which without masking must use a separate
; addition.
; Check the next value down, masking the amount removes the addition.
define i64 @f12(i64 %a, i64 %amt) {
; CHECK-LABEL: f12:
; CHECK: # %bb.0:
; CHECK-NEXT: afi %r3, -524289
; CHECK-NEXT: rllg %r2, %r2, 0(%r3)
; CHECK-NEXT: rllg %r2, %r2, -1(%r3)
; CHECK-NEXT: br %r14
  %suba = sub i64 %amt, 524289
  %subb = sub i64 64, %suba

@@ -27,9 +27,7 @@ define i32 @f2(i32 %v, i32 %nbits) {
;
; THUMB1-LABEL: f2:
; THUMB1: @ %bb.0: @ %entry
; THUMB1-NEXT: movs r2, #31
; THUMB1-NEXT: ands r2, r1
; THUMB1-NEXT: rors r0, r2
; THUMB1-NEXT: rors r0, r1
; THUMB1-NEXT: bx lr
entry:
  %and = and i32 %nbits, 31

@@ -118,57 +118,55 @@ define i32 @combine_rot_select_zero(i32, i32) {
define <4 x i32> @combine_vec_rot_select_zero(<4 x i32>, <4 x i32>) {
; SSE2-LABEL: combine_vec_rot_select_zero:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [31,31,31,31]
; SSE2-NEXT: pand %xmm1, %xmm2
; SSE2-NEXT: pxor %xmm3, %xmm3
; SSE2-NEXT: pslld $23, %xmm2
; SSE2-NEXT: paddd {{.*}}(%rip), %xmm2
; SSE2-NEXT: cvttps2dq %xmm2, %xmm2
; SSE2-NEXT: pxor %xmm2, %xmm2
; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [31,31,31,31]
; SSE2-NEXT: pand %xmm1, %xmm3
; SSE2-NEXT: pslld $23, %xmm3
; SSE2-NEXT: paddd {{.*}}(%rip), %xmm3
; SSE2-NEXT: cvttps2dq %xmm3, %xmm3
; SSE2-NEXT: movdqa %xmm0, %xmm4
; SSE2-NEXT: pmuludq %xmm2, %xmm4
; SSE2-NEXT: pmuludq %xmm3, %xmm4
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,3,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm6, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm2[1,3,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm6, %xmm3
; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm3[1,3,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
; SSE2-NEXT: por %xmm5, %xmm4
; SSE2-NEXT: pcmpeqd %xmm1, %xmm3
; SSE2-NEXT: pand %xmm3, %xmm0
; SSE2-NEXT: pandn %xmm4, %xmm3
; SSE2-NEXT: por %xmm3, %xmm0
; SSE2-NEXT: pcmpeqd %xmm1, %xmm2
; SSE2-NEXT: pand %xmm2, %xmm0
; SSE2-NEXT: pandn %xmm4, %xmm2
; SSE2-NEXT: por %xmm2, %xmm0
; SSE2-NEXT: retq
;
; XOP-LABEL: combine_vec_rot_select_zero:
; XOP: # %bb.0:
; XOP-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm2
; XOP-NEXT: vpxor %xmm3, %xmm3, %xmm3
; XOP-NEXT: vprotd %xmm2, %xmm0, %xmm2
; XOP-NEXT: vpcomeqd %xmm3, %xmm1, %xmm1
; XOP-NEXT: vblendvps %xmm1, %xmm0, %xmm2, %xmm0
; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2
; XOP-NEXT: vprotd %xmm1, %xmm0, %xmm3
; XOP-NEXT: vpcomeqd %xmm2, %xmm1, %xmm1
; XOP-NEXT: vblendvps %xmm1, %xmm0, %xmm3, %xmm0
; XOP-NEXT: retq
;
; AVX2-LABEL: combine_vec_rot_select_zero:
; AVX2: # %bb.0:
; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [31,31,31,31]
; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm2
; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX2-NEXT: vpsllvd %xmm2, %xmm0, %xmm4
; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
; AVX2-NEXT: vpand %xmm3, %xmm1, %xmm3
; AVX2-NEXT: vpsllvd %xmm3, %xmm0, %xmm4
; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm5 = [32,32,32,32]
; AVX2-NEXT: vpsubd %xmm2, %xmm5, %xmm2
; AVX2-NEXT: vpsrlvd %xmm2, %xmm0, %xmm2
; AVX2-NEXT: vpor %xmm2, %xmm4, %xmm2
; AVX2-NEXT: vpcmpeqd %xmm3, %xmm1, %xmm1
; AVX2-NEXT: vblendvps %xmm1, %xmm0, %xmm2, %xmm0
; AVX2-NEXT: vpsubd %xmm3, %xmm5, %xmm3
; AVX2-NEXT: vpsrlvd %xmm3, %xmm0, %xmm3
; AVX2-NEXT: vpor %xmm3, %xmm4, %xmm3
; AVX2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vblendvps %xmm1, %xmm0, %xmm3, %xmm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: combine_vec_rot_select_zero:
; AVX512: # %bb.0:
; AVX512-NEXT: vpandd {{.*}}(%rip){1to4}, %xmm1, %xmm2
; AVX512-NEXT: vprolvd %xmm2, %xmm0, %xmm2
; AVX512-NEXT: vprolvd %xmm1, %xmm0, %xmm2
; AVX512-NEXT: vptestnmd %xmm1, %xmm1, %k1
; AVX512-NEXT: vmovdqa32 %xmm0, %xmm2 {%k1}
; AVX512-NEXT: vmovdqa %xmm2, %xmm0

@@ -310,14 +308,13 @@ define <4 x i32> @rotate_demanded_bits_3(<4 x i32>, <4 x i32>) {
; XOP-LABEL: rotate_demanded_bits_3:
; XOP: # %bb.0:
; XOP-NEXT: vpaddd %xmm1, %xmm1, %xmm1
; XOP-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
; XOP-NEXT: vprotd %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
;
; AVX2-LABEL: rotate_demanded_bits_3:
; AVX2: # %bb.0:
; AVX2-NEXT: vpaddd %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [30,30,30,30]
; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [31,31,31,31]
; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpsllvd %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [32,32,32,32]

@@ -329,7 +326,6 @@ define <4 x i32> @rotate_demanded_bits_3(<4 x i32>, <4 x i32>) {
; AVX512-LABEL: rotate_demanded_bits_3:
; AVX512: # %bb.0:
; AVX512-NEXT: vpaddd %xmm1, %xmm1, %xmm1
; AVX512-NEXT: vpandd {{.*}}(%rip){1to4}, %xmm1, %xmm1
; AVX512-NEXT: vprolvd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
  %3 = shl <4 x i32> %1, <i32 1, i32 1, i32 1, i32 1>

@@ -124,23 +124,21 @@ define <64 x i8> @var_funnnel_v64i8(<64 x i8> %x, <64 x i8> %amt) nounwind {
; AVX512F-NEXT: vpsllw $4, %ymm3, %ymm6
; AVX512F-NEXT: vpand %ymm5, %ymm6, %ymm6
; AVX512F-NEXT: vpor %ymm4, %ymm6, %ymm4
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
; AVX512F-NEXT: vpand %ymm6, %ymm2, %ymm2
; AVX512F-NEXT: vpsllw $5, %ymm2, %ymm2
; AVX512F-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
; AVX512F-NEXT: vpsrlw $6, %ymm3, %ymm4
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
; AVX512F-NEXT: vpandn %ymm4, %ymm7, %ymm4
; AVX512F-NEXT: vpsllw $2, %ymm3, %ymm8
; AVX512F-NEXT: vpand %ymm7, %ymm8, %ymm8
; AVX512F-NEXT: vpor %ymm4, %ymm8, %ymm4
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
; AVX512F-NEXT: vpandn %ymm4, %ymm6, %ymm4
; AVX512F-NEXT: vpsllw $2, %ymm3, %ymm7
; AVX512F-NEXT: vpand %ymm6, %ymm7, %ymm7
; AVX512F-NEXT: vpor %ymm4, %ymm7, %ymm4
; AVX512F-NEXT: vpaddb %ymm2, %ymm2, %ymm2
; AVX512F-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
; AVX512F-NEXT: vpsrlw $7, %ymm3, %ymm4
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm8 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX512F-NEXT: vpand %ymm4, %ymm8, %ymm4
; AVX512F-NEXT: vpaddb %ymm3, %ymm3, %ymm9
; AVX512F-NEXT: vpor %ymm4, %ymm9, %ymm4
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX512F-NEXT: vpand %ymm7, %ymm4, %ymm4
; AVX512F-NEXT: vpaddb %ymm3, %ymm3, %ymm8
; AVX512F-NEXT: vpor %ymm4, %ymm8, %ymm4
; AVX512F-NEXT: vpaddb %ymm2, %ymm2, %ymm2
; AVX512F-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm2
; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm3

@@ -148,18 +146,17 @@ define <64 x i8> @var_funnnel_v64i8(<64 x i8> %x, <64 x i8> %amt) nounwind {
; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm4
; AVX512F-NEXT: vpand %ymm5, %ymm4, %ymm4
; AVX512F-NEXT: vpor %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpand %ymm6, %ymm1, %ymm1
; AVX512F-NEXT: vpsllw $5, %ymm1, %ymm1
; AVX512F-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
; AVX512F-NEXT: vpsrlw $6, %ymm0, %ymm3
; AVX512F-NEXT: vpandn %ymm3, %ymm7, %ymm3
; AVX512F-NEXT: vpandn %ymm3, %ymm6, %ymm3
; AVX512F-NEXT: vpsllw $2, %ymm0, %ymm4
; AVX512F-NEXT: vpand %ymm7, %ymm4, %ymm4
; AVX512F-NEXT: vpand %ymm6, %ymm4, %ymm4
; AVX512F-NEXT: vpor %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpaddb %ymm1, %ymm1, %ymm1
; AVX512F-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
; AVX512F-NEXT: vpsrlw $7, %ymm0, %ymm3
; AVX512F-NEXT: vpand %ymm3, %ymm8, %ymm3
; AVX512F-NEXT: vpand %ymm7, %ymm3, %ymm3
; AVX512F-NEXT: vpaddb %ymm0, %ymm0, %ymm4
; AVX512F-NEXT: vpor %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpaddb %ymm1, %ymm1, %ymm1

@@ -175,36 +172,33 @@ define <64 x i8> @var_funnnel_v64i8(<64 x i8> %x, <64 x i8> %amt) nounwind {
; AVX512VL-NEXT: vpsllw $4, %ymm3, %ymm5
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm6 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; AVX512VL-NEXT: vpternlogq $226, %ymm4, %ymm6, %ymm5
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm4 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
; AVX512VL-NEXT: vpand %ymm4, %ymm2, %ymm2
; AVX512VL-NEXT: vpsllw $5, %ymm2, %ymm2
; AVX512VL-NEXT: vpblendvb %ymm2, %ymm5, %ymm3, %ymm3
; AVX512VL-NEXT: vpsrlw $6, %ymm3, %ymm5
; AVX512VL-NEXT: vpsllw $2, %ymm3, %ymm7
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm8 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
; AVX512VL-NEXT: vpternlogq $226, %ymm5, %ymm8, %ymm7
; AVX512VL-NEXT: vpsrlw $6, %ymm3, %ymm4
; AVX512VL-NEXT: vpsllw $2, %ymm3, %ymm5
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm7 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
; AVX512VL-NEXT: vpternlogq $226, %ymm4, %ymm7, %ymm5
; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm2
; AVX512VL-NEXT: vpblendvb %ymm2, %ymm7, %ymm3, %ymm3
; AVX512VL-NEXT: vpsrlw $7, %ymm3, %ymm5
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX512VL-NEXT: vpand %ymm7, %ymm5, %ymm5
; AVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm9
; AVX512VL-NEXT: vpor %ymm5, %ymm9, %ymm5
; AVX512VL-NEXT: vpblendvb %ymm2, %ymm5, %ymm3, %ymm3
; AVX512VL-NEXT: vpsrlw $7, %ymm3, %ymm4
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm5 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX512VL-NEXT: vpand %ymm5, %ymm4, %ymm4
; AVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm8
; AVX512VL-NEXT: vpor %ymm4, %ymm8, %ymm4
; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm2
; AVX512VL-NEXT: vpblendvb %ymm2, %ymm5, %ymm3, %ymm2
; AVX512VL-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm2
; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm3
; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm5
; AVX512VL-NEXT: vpternlogq $226, %ymm3, %ymm6, %ymm5
; AVX512VL-NEXT: vpand %ymm4, %ymm1, %ymm1
; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm4
; AVX512VL-NEXT: vpternlogq $226, %ymm3, %ymm6, %ymm4
; AVX512VL-NEXT: vpsllw $5, %ymm1, %ymm1
; AVX512VL-NEXT: vpblendvb %ymm1, %ymm5, %ymm0, %ymm0
; AVX512VL-NEXT: vpblendvb %ymm1, %ymm4, %ymm0, %ymm0
; AVX512VL-NEXT: vpsrlw $6, %ymm0, %ymm3
; AVX512VL-NEXT: vpsllw $2, %ymm0, %ymm4
; AVX512VL-NEXT: vpternlogq $226, %ymm3, %ymm8, %ymm4
; AVX512VL-NEXT: vpternlogq $226, %ymm3, %ymm7, %ymm4
; AVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm1
; AVX512VL-NEXT: vpblendvb %ymm1, %ymm4, %ymm0, %ymm0
; AVX512VL-NEXT: vpsrlw $7, %ymm0, %ymm3
; AVX512VL-NEXT: vpand %ymm7, %ymm3, %ymm3
; AVX512VL-NEXT: vpand %ymm5, %ymm3, %ymm3
; AVX512VL-NEXT: vpaddb %ymm0, %ymm0, %ymm4
; AVX512VL-NEXT: vpor %ymm3, %ymm4, %ymm3
; AVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm1

@@ -132,23 +132,21 @@ define <64 x i8> @var_funnnel_v64i8(<64 x i8> %x, <64 x i8> %amt) nounwind {
; AVX512F-NEXT: vpor %ymm4, %ymm6, %ymm4
; AVX512F-NEXT: vpxor %xmm6, %xmm6, %xmm6
; AVX512F-NEXT: vpsubb %ymm2, %ymm6, %ymm2
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
; AVX512F-NEXT: vpand %ymm7, %ymm2, %ymm2
; AVX512F-NEXT: vpsllw $5, %ymm2, %ymm2
; AVX512F-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
; AVX512F-NEXT: vpsrlw $6, %ymm3, %ymm4
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm8 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
; AVX512F-NEXT: vpandn %ymm4, %ymm8, %ymm4
; AVX512F-NEXT: vpsllw $2, %ymm3, %ymm9
; AVX512F-NEXT: vpand %ymm8, %ymm9, %ymm9
; AVX512F-NEXT: vpor %ymm4, %ymm9, %ymm4
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
; AVX512F-NEXT: vpandn %ymm4, %ymm7, %ymm4
; AVX512F-NEXT: vpsllw $2, %ymm3, %ymm8
; AVX512F-NEXT: vpand %ymm7, %ymm8, %ymm8
; AVX512F-NEXT: vpor %ymm4, %ymm8, %ymm4
; AVX512F-NEXT: vpaddb %ymm2, %ymm2, %ymm2
; AVX512F-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
; AVX512F-NEXT: vpsrlw $7, %ymm3, %ymm4
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm9 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX512F-NEXT: vpand %ymm4, %ymm9, %ymm4
; AVX512F-NEXT: vpaddb %ymm3, %ymm3, %ymm10
; AVX512F-NEXT: vpor %ymm4, %ymm10, %ymm4
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm8 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX512F-NEXT: vpand %ymm4, %ymm8, %ymm4
; AVX512F-NEXT: vpaddb %ymm3, %ymm3, %ymm9
; AVX512F-NEXT: vpor %ymm4, %ymm9, %ymm4
; AVX512F-NEXT: vpaddb %ymm2, %ymm2, %ymm2
; AVX512F-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm2
; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm3

@@ -157,18 +155,17 @@ define <64 x i8> @var_funnnel_v64i8(<64 x i8> %x, <64 x i8> %amt) nounwind {
; AVX512F-NEXT: vpand %ymm5, %ymm4, %ymm4
; AVX512F-NEXT: vpor %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsubb %ymm1, %ymm6, %ymm1
; AVX512F-NEXT: vpand %ymm7, %ymm1, %ymm1
; AVX512F-NEXT: vpsllw $5, %ymm1, %ymm1
; AVX512F-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
; AVX512F-NEXT: vpsrlw $6, %ymm0, %ymm3
; AVX512F-NEXT: vpandn %ymm3, %ymm8, %ymm3
; AVX512F-NEXT: vpandn %ymm3, %ymm7, %ymm3
; AVX512F-NEXT: vpsllw $2, %ymm0, %ymm4
; AVX512F-NEXT: vpand %ymm4, %ymm8, %ymm4
; AVX512F-NEXT: vpand %ymm7, %ymm4, %ymm4
; AVX512F-NEXT: vpor %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpaddb %ymm1, %ymm1, %ymm1
; AVX512F-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
; AVX512F-NEXT: vpsrlw $7, %ymm0, %ymm3
; AVX512F-NEXT: vpand %ymm3, %ymm9, %ymm3
; AVX512F-NEXT: vpand %ymm3, %ymm8, %ymm3
; AVX512F-NEXT: vpaddb %ymm0, %ymm0, %ymm4
; AVX512F-NEXT: vpor %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpaddb %ymm1, %ymm1, %ymm1

@@ -186,37 +183,34 @@ define <64 x i8> @var_funnnel_v64i8(<64 x i8> %x, <64 x i8> %amt) nounwind {
; AVX512VL-NEXT: vpternlogq $226, %ymm4, %ymm6, %ymm5
; AVX512VL-NEXT: vpxor %xmm4, %xmm4, %xmm4
; AVX512VL-NEXT: vpsubb %ymm2, %ymm4, %ymm2
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm7 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
; AVX512VL-NEXT: vpand %ymm7, %ymm2, %ymm2
; AVX512VL-NEXT: vpsllw $5, %ymm2, %ymm2
; AVX512VL-NEXT: vpblendvb %ymm2, %ymm5, %ymm3, %ymm3
; AVX512VL-NEXT: vpsrlw $6, %ymm3, %ymm5
; AVX512VL-NEXT: vpsllw $2, %ymm3, %ymm8
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm9 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
; AVX512VL-NEXT: vpternlogq $226, %ymm5, %ymm9, %ymm8
; AVX512VL-NEXT: vpsllw $2, %ymm3, %ymm7
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm8 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
; AVX512VL-NEXT: vpternlogq $226, %ymm5, %ymm8, %ymm7
; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm2
; AVX512VL-NEXT: vpblendvb %ymm2, %ymm8, %ymm3, %ymm3
; AVX512VL-NEXT: vpblendvb %ymm2, %ymm7, %ymm3, %ymm3
; AVX512VL-NEXT: vpsrlw $7, %ymm3, %ymm5
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm8 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX512VL-NEXT: vpand %ymm5, %ymm8, %ymm5
; AVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm10
; AVX512VL-NEXT: vpor %ymm5, %ymm10, %ymm5
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX512VL-NEXT: vpand %ymm7, %ymm5, %ymm5
; AVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm9
; AVX512VL-NEXT: vpor %ymm5, %ymm9, %ymm5
; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm2
; AVX512VL-NEXT: vpblendvb %ymm2, %ymm5, %ymm3, %ymm2
; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm3
; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm5
; AVX512VL-NEXT: vpternlogq $226, %ymm3, %ymm6, %ymm5
; AVX512VL-NEXT: vpsubb %ymm1, %ymm4, %ymm1
; AVX512VL-NEXT: vpand %ymm7, %ymm1, %ymm1
; AVX512VL-NEXT: vpsllw $5, %ymm1, %ymm1
; AVX512VL-NEXT: vpblendvb %ymm1, %ymm5, %ymm0, %ymm0
; AVX512VL-NEXT: vpsrlw $6, %ymm0, %ymm3
; AVX512VL-NEXT: vpsllw $2, %ymm0, %ymm4
; AVX512VL-NEXT: vpternlogq $226, %ymm3, %ymm9, %ymm4
; AVX512VL-NEXT: vpternlogq $226, %ymm3, %ymm8, %ymm4
; AVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm1
; AVX512VL-NEXT: vpblendvb %ymm1, %ymm4, %ymm0, %ymm0
; AVX512VL-NEXT: vpsrlw $7, %ymm0, %ymm3
; AVX512VL-NEXT: vpand %ymm3, %ymm8, %ymm3
; AVX512VL-NEXT: vpand %ymm7, %ymm3, %ymm3
; AVX512VL-NEXT: vpaddb %ymm0, %ymm0, %ymm4
; AVX512VL-NEXT: vpor %ymm3, %ymm4, %ymm3
; AVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm1