[X86] Remove PALIGNR/VALIGN handling from combineBitcastForMaskedOp and move to isel patterns instead. Prefer 128-bit VALIGND/VALIGNQ over PALIGNR during lowering when possible.

llvm-svn: 317299
This commit is contained in:
Craig Topper 2017-11-03 06:48:02 +00:00
parent 2fda36a18e
commit 333897ec31
3 changed files with 133 additions and 24 deletions

View File

@ -10716,10 +10716,16 @@ static SDValue lowerV2I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
// Try to use byte rotation instructions.
// It's more profitable for pre-SSSE3 to use shuffles/unpacks.
if (Subtarget.hasSSSE3())
if (Subtarget.hasSSSE3()) {
if (Subtarget.hasVLX())
if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v2i64, V1, V2,
Mask, Subtarget, DAG))
return Rotate;
if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG))
return Rotate;
}
// If we have direct support for blends, we should lower by decomposing into
// a permute. That will be faster than the domain cross.
@ -11016,10 +11022,16 @@ static SDValue lowerV4I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
// Try to use byte rotation instructions.
// It's more profitable for pre-SSSE3 to use shuffles/unpacks.
if (Subtarget.hasSSSE3())
if (Subtarget.hasSSSE3()) {
if (Subtarget.hasVLX())
if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v4i32, V1, V2,
Mask, Subtarget, DAG))
return Rotate;
if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG))
return Rotate;
}
// Assume that a single SHUFPS is faster than an alternative sequence of
// multiple instructions (even if the CPU has a domain penalty).
@ -30674,26 +30686,6 @@ static bool combineBitcastForMaskedOp(SDValue OrigOp, SelectionDAG &DAG,
unsigned Opcode = Op.getOpcode();
switch (Opcode) {
case X86ISD::PALIGNR:
// PALIGNR can be converted to VALIGND/Q for 128-bit vectors.
if (!VT.is128BitVector())
return false;
Opcode = X86ISD::VALIGN;
LLVM_FALLTHROUGH;
case X86ISD::VALIGN: {
if (EltVT != MVT::i32 && EltVT != MVT::i64)
return false;
uint64_t Imm = Op.getConstantOperandVal(2);
MVT OpEltVT = Op.getSimpleValueType().getVectorElementType();
unsigned ShiftAmt = Imm * OpEltVT.getSizeInBits();
unsigned EltSize = EltVT.getSizeInBits();
// Make sure we can represent the same shift with the new VT.
if ((ShiftAmt % EltSize) != 0)
return false;
Imm = ShiftAmt / EltSize;
return BitcastAndCombineShuffle(Opcode, Op.getOperand(0), Op.getOperand(1),
DAG.getConstant(Imm, DL, MVT::i8));
}
case X86ISD::SHUF128: {
if (EltVT.getSizeInBits() != 32 && EltVT.getSizeInBits() != 64)
return false;

View File

@ -8911,6 +8911,123 @@ defm VPALIGNR: avx512_common_3Op_rm_imm8<0x0F, X86PAlignr, "vpalignr" ,
avx512vl_i8_info, avx512vl_i8_info>,
EVEX_CD8<8, CD8VF>;
// Fragments to help convert valignq into masked valignd. Or valignq/valignd
// into vpalignr.
//
// Each transform rescales the shift immediate of the matched node so that it
// counts elements of the narrower type used by the selected instruction.

// Rescales a valignq immediate (counted in 64-bit elements) to a valignd
// immediate (counted in 32-bit elements) by multiplying it by 2. The result is
// emitted as an i8 immediate.
def ValignqImm32XForm : SDNodeXForm<imm, [{
return getI8Imm(N->getZExtValue() * 2, SDLoc(N));
}]>;
// Rescales a valignq immediate (counted in 64-bit elements) to a byte count
// by multiplying it by 8, for use as a vpalignr immediate. The result is
// emitted as an i8 immediate.
def ValignqImm8XForm : SDNodeXForm<imm, [{
return getI8Imm(N->getZExtValue() * 8, SDLoc(N));
}]>;
// Rescales a valignd immediate (counted in 32-bit elements) to a byte count
// by multiplying it by 4, for use as a vpalignr immediate. The result is
// emitted as an i8 immediate.
def ValigndImm8XForm : SDNodeXForm<imm, [{
return getI8Imm(N->getZExtValue() * 4, SDLoc(N));
}]>;
// Selection patterns for a masked select whose value operand is an align node
// computed in one vector type (From) and then bitcast to another (To).
//
// The mask applies per element of To, so the align is re-expressed as the
// To-typed instruction named by OpcodeStr, with its immediate rescaled by
// ImmXForm.
//
//   OpcodeStr - Prefix of the instruction name; the "rrik", "rrikz", "rmik"
//               and "rmikz" suffixes are appended to pick the variant.
//   OpNode    - The align node being matched in the From type.
//   From      - Vector type info of the matched align node.
//   To        - Vector type info of the vselect (and of the emitted
//               instruction's mask/register classes).
//   ImmXForm  - Transform applied to the matched immediate.
multiclass avx512_vpalign_mask_lowering<string OpcodeStr, SDNode OpNode,
X86VectorVTInfo From, X86VectorVTInfo To,
SDNodeXForm ImmXForm> {
// Register/register sources; masked-off elements take their value from $src0.
def : Pat<(To.VT (vselect To.KRCWM:$mask,
(bitconvert
(From.VT (OpNode From.RC:$src1, From.RC:$src2,
imm:$src3))),
To.RC:$src0)),
(!cast<Instruction>(OpcodeStr#"rrik") To.RC:$src0, To.KRCWM:$mask,
To.RC:$src1, To.RC:$src2,
(ImmXForm imm:$src3))>;
// Register/register sources; masked-off elements are zero.
def : Pat<(To.VT (vselect To.KRCWM:$mask,
(bitconvert
(From.VT (OpNode From.RC:$src1, From.RC:$src2,
imm:$src3))),
To.ImmAllZerosV)),
(!cast<Instruction>(OpcodeStr#"rrikz") To.KRCWM:$mask,
To.RC:$src1, To.RC:$src2,
(ImmXForm imm:$src3))>;
// Second source is a full-vector load (matched through To.LdFrag and a
// bitcast); masked-off elements take their value from $src0.
def : Pat<(To.VT (vselect To.KRCWM:$mask,
(bitconvert
(From.VT (OpNode From.RC:$src1,
(bitconvert (To.LdFrag addr:$src2)),
imm:$src3))),
To.RC:$src0)),
(!cast<Instruction>(OpcodeStr#"rmik") To.RC:$src0, To.KRCWM:$mask,
To.RC:$src1, addr:$src2,
(ImmXForm imm:$src3))>;
// Second source is a full-vector load; masked-off elements are zero.
def : Pat<(To.VT (vselect To.KRCWM:$mask,
(bitconvert
(From.VT (OpNode From.RC:$src1,
(bitconvert (To.LdFrag addr:$src2)),
imm:$src3))),
To.ImmAllZerosV)),
(!cast<Instruction>(OpcodeStr#"rmikz") To.KRCWM:$mask,
To.RC:$src1, addr:$src2,
(ImmXForm imm:$src3))>;
}
// Extends avx512_vpalign_mask_lowering (all of its patterns are inherited)
// with patterns where the second source is a broadcast of a scalar load of
// To's element type, bitcast to the From type.
//
// These select the "rmbi", "rmbik" and "rmbikz" variants of the instruction
// named by OpcodeStr. The parameters have the same meaning as in
// avx512_vpalign_mask_lowering.
multiclass avx512_vpalign_mask_lowering_mb<string OpcodeStr, SDNode OpNode,
X86VectorVTInfo From,
X86VectorVTInfo To,
SDNodeXForm ImmXForm> :
avx512_vpalign_mask_lowering<OpcodeStr, OpNode, From, To, ImmXForm> {
// Unmasked: a From-typed align whose second source is a To-element broadcast
// is emitted as the To-typed broadcast-from-memory form.
def : Pat<(From.VT (OpNode From.RC:$src1,
(bitconvert (To.VT (X86VBroadcast
(To.ScalarLdFrag addr:$src2)))),
imm:$src3)),
(!cast<Instruction>(OpcodeStr#"rmbi") To.RC:$src1, addr:$src2,
(ImmXForm imm:$src3))>;
// Broadcast second source; masked-off elements take their value from $src0.
def : Pat<(To.VT (vselect To.KRCWM:$mask,
(bitconvert
(From.VT (OpNode From.RC:$src1,
(bitconvert
(To.VT (X86VBroadcast
(To.ScalarLdFrag addr:$src2)))),
imm:$src3))),
To.RC:$src0)),
(!cast<Instruction>(OpcodeStr#"rmbik") To.RC:$src0, To.KRCWM:$mask,
To.RC:$src1, addr:$src2,
(ImmXForm imm:$src3))>;
// Broadcast second source; masked-off elements are zero.
def : Pat<(To.VT (vselect To.KRCWM:$mask,
(bitconvert
(From.VT (OpNode From.RC:$src1,
(bitconvert
(To.VT (X86VBroadcast
(To.ScalarLdFrag addr:$src2)))),
imm:$src3))),
To.ImmAllZerosV)),
(!cast<Instruction>(OpcodeStr#"rmbikz") To.KRCWM:$mask,
To.RC:$src1, addr:$src2,
(ImmXForm imm:$src3))>;
}
// 512-bit patterns, guarded by the HasAVX512 predicate.
let Predicates = [HasAVX512] in {
// For 512-bit we lower to the widest element type we can. So we only need
// to handle converting valignq to valignd.
// The v8i64 align is re-expressed as a v16i32 VALIGNDZ with the immediate
// doubled by ValignqImm32XForm.
defm : avx512_vpalign_mask_lowering_mb<"VALIGNDZ", X86VAlign, v8i64_info,
v16i32_info, ValignqImm32XForm>;
}
// 128-bit and 256-bit patterns, guarded by the HasVLX predicate.
let Predicates = [HasVLX] in {
// For 128-bit we lower to the widest element type we can. So we only need
// to handle converting valignq to valignd.
// The v2i64 align is re-expressed as a v4i32 VALIGNDZ128 with the immediate
// doubled by ValignqImm32XForm.
defm : avx512_vpalign_mask_lowering_mb<"VALIGNDZ128", X86VAlign, v2i64x_info,
v4i32x_info, ValignqImm32XForm>;
// For 256-bit we lower to the widest element type we can. So we only need
// to handle converting valignq to valignd.
// The v4i64 align is re-expressed as a v8i32 VALIGNDZ256 with the immediate
// doubled by ValignqImm32XForm.
defm : avx512_vpalign_mask_lowering_mb<"VALIGNDZ256", X86VAlign, v4i64x_info,
v8i32x_info, ValignqImm32XForm>;
}
// Byte-masked patterns, guarded by both the HasVLX and HasBWI predicates.
let Predicates = [HasVLX, HasBWI] in {
// We can turn 128-bit VALIGND/VALIGNQ into VPALIGNR when the result is
// selected under a byte-element (v16i8) mask. The immediate is converted to
// a byte count (x8 for valignq, x4 for valignd).
// Only the non-broadcast multiclass is used, so no broadcast patterns are
// generated here.
// NOTE(review): only 128-bit instantiations are present; presumably 256-bit is
// omitted because VPALIGNR works within each 128-bit lane while VALIGN shifts
// across the whole vector - confirm against the ISA reference.
defm : avx512_vpalign_mask_lowering<"VPALIGNRZ128", X86VAlign, v2i64x_info,
v16i8x_info, ValignqImm8XForm>;
defm : avx512_vpalign_mask_lowering<"VPALIGNRZ128", X86VAlign, v4i32x_info,
v16i8x_info, ValigndImm8XForm>;
}
defm VDBPSADBW: avx512_common_3Op_rm_imm8<0x42, X86dbpsadbw, "vdbpsadbw" ,
avx512vl_i16_info, avx512vl_i8_info>, EVEX_CD8<8, CD8VF>;

View File

@ -4712,8 +4712,8 @@ declare <8 x i32> @llvm.x86.avx512.mask.valign.d.256(<8 x i32>, <8 x i32>, i32,
define <8 x i32>@test_int_x86_avx512_mask_valign_d_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x3, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_valign_d_256:
; CHECK: ## BB#0:
; CHECK-NEXT: valignd $6, %ymm1, %ymm0, %ymm3 ## encoding: [0x62,0xf3,0x7d,0x28,0x03,0xd9,0x06]
; CHECK-NEXT: ## ymm3 = ymm1[6,7],ymm0[0,1,2,3,4,5]
; CHECK-NEXT: valignq $3, %ymm1, %ymm0, %ymm3 ## encoding: [0x62,0xf3,0xfd,0x28,0x03,0xd9,0x03]
; CHECK-NEXT: ## ymm3 = ymm1[3],ymm0[0,1,2]
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: valignd $6, %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x03,0xd1,0x06]
; CHECK-NEXT: ## ymm2 {%k1} = ymm1[6,7],ymm0[0,1,2,3,4,5]