forked from OSchip/llvm-project
[X86] Stop forcing the X86VPermi2X node's index operand to match the destination type (which was done to make masking pattern matching easier). Add extra patterns with bitcasts instead.
This basically reverts r280696 in favor of using extra patterns, which that commit's message mentioned as an alternative. For now I've only added the cases we have test cases for, but it should be easy to add more in the future. This will help to convert the VPERMI2PS/VPERMT2PS intrinsics to use a single ISD node opcode, and will hopefully allow some intrinsics to be removed. llvm-svn: 333365
This commit is contained in:
parent
c791417a6d
commit
26bc84860a
|
@ -20534,21 +20534,19 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
|
|||
}
|
||||
case VPERM_3OP_MASKZ:
|
||||
case VPERM_3OP_MASK:{
|
||||
MVT VT = Op.getSimpleValueType();
|
||||
// Src2 is the PassThru
|
||||
SDValue Src1 = Op.getOperand(1);
|
||||
// PassThru needs to be the same type as the destination in order
|
||||
// to pattern match correctly.
|
||||
SDValue Src2 = DAG.getBitcast(VT, Op.getOperand(2));
|
||||
SDValue Src2 = Op.getOperand(2);
|
||||
SDValue Src3 = Op.getOperand(3);
|
||||
SDValue Mask = Op.getOperand(4);
|
||||
MVT VT = Op.getSimpleValueType();
|
||||
SDValue PassThru = SDValue();
|
||||
|
||||
// set PassThru element
|
||||
if (IntrData->Type == VPERM_3OP_MASKZ)
|
||||
PassThru = getZeroVector(VT, Subtarget, DAG, dl);
|
||||
else
|
||||
PassThru = Src2;
|
||||
PassThru = DAG.getBitcast(VT, Src2);
|
||||
|
||||
// Swap Src1 and Src2 in the node creation
|
||||
return getVectorMaskingNode(DAG.getNode(IntrData->Opc0,
|
||||
|
|
|
@ -331,6 +331,23 @@ multiclass AVX512_maskable_3src<bits<8> O, Format F, X86VectorVTInfo _,
|
|||
(Select _.KRCWM:$mask, RHS, _.RC:$src1),
|
||||
Select, "", IsCommutable, IsKCommutable>;
|
||||
|
||||
// Similar to AVX512_maskable_3src but in this case the input VT for the tied
|
||||
// operand differs from the output VT. This requires a bitconvert on
|
||||
// the preserved vector going into the vselect.
|
||||
multiclass AVX512_maskable_3src_cast<bits<8> O, Format F, X86VectorVTInfo OutVT,
|
||||
X86VectorVTInfo InVT,
|
||||
dag Outs, dag NonTiedIns, string OpcodeStr,
|
||||
string AttSrcAsm, string IntelSrcAsm,
|
||||
dag RHS, bit IsCommutable = 0> :
|
||||
AVX512_maskable_common<O, F, OutVT, Outs,
|
||||
!con((ins InVT.RC:$src1), NonTiedIns),
|
||||
!con((ins InVT.RC:$src1, InVT.KRCWM:$mask), NonTiedIns),
|
||||
!con((ins InVT.RC:$src1, InVT.KRCWM:$mask), NonTiedIns),
|
||||
OpcodeStr, AttSrcAsm, IntelSrcAsm, RHS,
|
||||
(vselect InVT.KRCWM:$mask, RHS,
|
||||
(bitconvert InVT.RC:$src1)),
|
||||
vselect, "", IsCommutable>;
|
||||
|
||||
multiclass AVX512_maskable_3src_scalar<bits<8> O, Format F, X86VectorVTInfo _,
|
||||
dag Outs, dag NonTiedIns, string OpcodeStr,
|
||||
string AttSrcAsm, string IntelSrcAsm,
|
||||
|
@ -1699,38 +1716,34 @@ defm VPBROADCASTMB2Q : avx512_mask_broadcast<0x2A, "vpbroadcastmb2q",
|
|||
|
||||
//===----------------------------------------------------------------------===//
|
||||
// -- VPERMI2 - 3 source operands form --
|
||||
|
||||
multiclass avx512_perm_i<bits<8> opc, string OpcodeStr,
|
||||
X86FoldableSchedWrite sched, X86VectorVTInfo _> {
|
||||
X86FoldableSchedWrite sched,
|
||||
X86VectorVTInfo _, X86VectorVTInfo IdxVT> {
|
||||
let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain in {
|
||||
// The index operand in the pattern should really be an integer type. However,
|
||||
// if we do that and it happens to come from a bitcast, then it becomes
|
||||
// difficult to find the bitcast needed to convert the index to the
|
||||
// destination type for the passthru since it will be folded with the bitcast
|
||||
// of the index operand.
|
||||
defm rr: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst),
|
||||
defm rr: AVX512_maskable_3src_cast<opc, MRMSrcReg, _, IdxVT, (outs _.RC:$dst),
|
||||
(ins _.RC:$src2, _.RC:$src3),
|
||||
OpcodeStr, "$src3, $src2", "$src2, $src3",
|
||||
(_.VT (X86VPermi2X _.RC:$src1, _.RC:$src2, _.RC:$src3)), 1>,
|
||||
(_.VT (X86VPermi2X IdxVT.RC:$src1, _.RC:$src2, _.RC:$src3)), 1>,
|
||||
EVEX_4V, AVX5128IBase, Sched<[sched]>;
|
||||
|
||||
defm rm: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
|
||||
defm rm: AVX512_maskable_3src_cast<opc, MRMSrcMem, _, IdxVT, (outs _.RC:$dst),
|
||||
(ins _.RC:$src2, _.MemOp:$src3),
|
||||
OpcodeStr, "$src3, $src2", "$src2, $src3",
|
||||
(_.VT (X86VPermi2X _.RC:$src1, _.RC:$src2,
|
||||
(_.VT (X86VPermi2X IdxVT.RC:$src1, _.RC:$src2,
|
||||
(_.VT (bitconvert (_.LdFrag addr:$src3))))), 1>,
|
||||
EVEX_4V, AVX5128IBase, Sched<[sched.Folded, ReadAfterLd]>;
|
||||
}
|
||||
}
|
||||
|
||||
multiclass avx512_perm_i_mb<bits<8> opc, string OpcodeStr,
|
||||
X86FoldableSchedWrite sched, X86VectorVTInfo _> {
|
||||
X86FoldableSchedWrite sched,
|
||||
X86VectorVTInfo _, X86VectorVTInfo IdxVT> {
|
||||
let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain in
|
||||
defm rmb: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
|
||||
defm rmb: AVX512_maskable_3src_cast<opc, MRMSrcMem, _, IdxVT, (outs _.RC:$dst),
|
||||
(ins _.RC:$src2, _.ScalarMemOp:$src3),
|
||||
OpcodeStr, !strconcat("${src3}", _.BroadcastStr,", $src2"),
|
||||
!strconcat("$src2, ${src3}", _.BroadcastStr ),
|
||||
(_.VT (X86VPermi2X _.RC:$src1,
|
||||
(_.VT (X86VPermi2X IdxVT.RC:$src1,
|
||||
_.RC:$src2,(_.VT (X86VBroadcast (_.ScalarLdFrag addr:$src3))))), 1>,
|
||||
AVX5128IBase, EVEX_4V, EVEX_B,
|
||||
Sched<[sched.Folded, ReadAfterLd]>;
|
||||
|
@ -1738,41 +1751,85 @@ multiclass avx512_perm_i_mb<bits<8> opc, string OpcodeStr,
|
|||
|
||||
multiclass avx512_perm_i_sizes<bits<8> opc, string OpcodeStr,
|
||||
X86FoldableSchedWrite sched,
|
||||
AVX512VLVectorVTInfo VTInfo> {
|
||||
defm NAME: avx512_perm_i<opc, OpcodeStr, sched, VTInfo.info512>,
|
||||
avx512_perm_i_mb<opc, OpcodeStr, sched, VTInfo.info512>, EVEX_V512;
|
||||
AVX512VLVectorVTInfo VTInfo,
|
||||
AVX512VLVectorVTInfo ShuffleMask> {
|
||||
defm NAME: avx512_perm_i<opc, OpcodeStr, sched, VTInfo.info512,
|
||||
ShuffleMask.info512>,
|
||||
avx512_perm_i_mb<opc, OpcodeStr, sched, VTInfo.info512,
|
||||
ShuffleMask.info512>, EVEX_V512;
|
||||
let Predicates = [HasVLX] in {
|
||||
defm NAME#128: avx512_perm_i<opc, OpcodeStr, sched, VTInfo.info128>,
|
||||
avx512_perm_i_mb<opc, OpcodeStr, sched, VTInfo.info128>, EVEX_V128;
|
||||
defm NAME#256: avx512_perm_i<opc, OpcodeStr, sched, VTInfo.info256>,
|
||||
avx512_perm_i_mb<opc, OpcodeStr, sched, VTInfo.info256>, EVEX_V256;
|
||||
defm NAME#128: avx512_perm_i<opc, OpcodeStr, sched, VTInfo.info128,
|
||||
ShuffleMask.info128>,
|
||||
avx512_perm_i_mb<opc, OpcodeStr, sched, VTInfo.info128,
|
||||
ShuffleMask.info128>, EVEX_V128;
|
||||
defm NAME#256: avx512_perm_i<opc, OpcodeStr, sched, VTInfo.info256,
|
||||
ShuffleMask.info256>,
|
||||
avx512_perm_i_mb<opc, OpcodeStr, sched, VTInfo.info256,
|
||||
ShuffleMask.info256>, EVEX_V256;
|
||||
}
|
||||
}
|
||||
|
||||
multiclass avx512_perm_i_sizes_bw<bits<8> opc, string OpcodeStr,
|
||||
X86FoldableSchedWrite sched,
|
||||
AVX512VLVectorVTInfo VTInfo,
|
||||
AVX512VLVectorVTInfo Idx,
|
||||
Predicate Prd> {
|
||||
let Predicates = [Prd] in
|
||||
defm NAME: avx512_perm_i<opc, OpcodeStr, sched, VTInfo.info512>, EVEX_V512;
|
||||
defm NAME: avx512_perm_i<opc, OpcodeStr, sched, VTInfo.info512,
|
||||
Idx.info512>, EVEX_V512;
|
||||
let Predicates = [Prd, HasVLX] in {
|
||||
defm NAME#128: avx512_perm_i<opc, OpcodeStr, sched, VTInfo.info128>, EVEX_V128;
|
||||
defm NAME#256: avx512_perm_i<opc, OpcodeStr, sched, VTInfo.info256>, EVEX_V256;
|
||||
defm NAME#128: avx512_perm_i<opc, OpcodeStr, sched, VTInfo.info128,
|
||||
Idx.info128>, EVEX_V128;
|
||||
defm NAME#256: avx512_perm_i<opc, OpcodeStr, sched, VTInfo.info256,
|
||||
Idx.info256>, EVEX_V256;
|
||||
}
|
||||
}
|
||||
|
||||
defm VPERMI2D : avx512_perm_i_sizes<0x76, "vpermi2d", WriteVarShuffle256,
|
||||
avx512vl_i32_info>, EVEX_CD8<32, CD8VF>;
|
||||
avx512vl_i32_info, avx512vl_i32_info>, EVEX_CD8<32, CD8VF>;
|
||||
defm VPERMI2Q : avx512_perm_i_sizes<0x76, "vpermi2q", WriteVarShuffle256,
|
||||
avx512vl_i64_info>, VEX_W, EVEX_CD8<64, CD8VF>;
|
||||
avx512vl_i64_info, avx512vl_i64_info>, VEX_W, EVEX_CD8<64, CD8VF>;
|
||||
defm VPERMI2W : avx512_perm_i_sizes_bw<0x75, "vpermi2w", WriteVarShuffle256,
|
||||
avx512vl_i16_info, HasBWI>, VEX_W, EVEX_CD8<16, CD8VF>;
|
||||
avx512vl_i16_info, avx512vl_i16_info, HasBWI>,
|
||||
VEX_W, EVEX_CD8<16, CD8VF>;
|
||||
defm VPERMI2B : avx512_perm_i_sizes_bw<0x75, "vpermi2b", WriteVarShuffle256,
|
||||
avx512vl_i8_info, HasVBMI>, EVEX_CD8<8, CD8VF>;
|
||||
avx512vl_i8_info, avx512vl_i8_info, HasVBMI>,
|
||||
EVEX_CD8<8, CD8VF>;
|
||||
defm VPERMI2PS : avx512_perm_i_sizes<0x77, "vpermi2ps", WriteFVarShuffle256,
|
||||
avx512vl_f32_info>, EVEX_CD8<32, CD8VF>;
|
||||
avx512vl_f32_info, avx512vl_i32_info>, EVEX_CD8<32, CD8VF>;
|
||||
defm VPERMI2PD : avx512_perm_i_sizes<0x77, "vpermi2pd", WriteFVarShuffle256,
|
||||
avx512vl_f64_info>, VEX_W, EVEX_CD8<64, CD8VF>;
|
||||
avx512vl_f64_info, avx512vl_i64_info>, VEX_W, EVEX_CD8<64, CD8VF>;
|
||||
|
||||
// Extra patterns to deal with extra bitcasts due to passthru and index being
|
||||
// different types on the fp versions.
|
||||
multiclass avx512_perm_i_lowering<string InstrStr, X86VectorVTInfo _,
|
||||
X86VectorVTInfo IdxVT,
|
||||
X86VectorVTInfo CastVT> {
|
||||
def : Pat<(_.VT (vselect _.KRCWM:$mask,
|
||||
(X86VPermi2X (IdxVT.VT (bitconvert (CastVT.VT _.RC:$src1))),
|
||||
(_.VT _.RC:$src2), _.RC:$src3),
|
||||
(_.VT (bitconvert (CastVT.VT _.RC:$src1))))),
|
||||
(!cast<Instruction>(InstrStr#"rrk") _.RC:$src1, _.KRCWM:$mask,
|
||||
_.RC:$src2, _.RC:$src3)>;
|
||||
def : Pat<(_.VT (vselect _.KRCWM:$mask,
|
||||
(X86VPermi2X (IdxVT.VT (bitconvert (CastVT.VT _.RC:$src1))),
|
||||
_.RC:$src2, (_.LdFrag addr:$src3)),
|
||||
(_.VT (bitconvert (CastVT.VT _.RC:$src1))))),
|
||||
(!cast<Instruction>(InstrStr#"rmk") _.RC:$src1, _.KRCWM:$mask,
|
||||
_.RC:$src2, addr:$src3)>;
|
||||
def : Pat<(_.VT (vselect _.KRCWM:$mask,
|
||||
(X86VPermi2X (IdxVT.VT (bitconvert (CastVT.VT _.RC:$src1))),
|
||||
_.RC:$src2,
|
||||
(X86VBroadcast (_.ScalarLdFrag addr:$src3))),
|
||||
(_.VT (bitconvert (CastVT.VT _.RC:$src1))))),
|
||||
(!cast<Instruction>(InstrStr#"rmbk") _.RC:$src1, _.KRCWM:$mask,
|
||||
_.RC:$src2, addr:$src3)>;
|
||||
}
|
||||
|
||||
// TODO: Should we add more casts? The vXi64 case is common due to ABI.
|
||||
defm : avx512_perm_i_lowering<"VPERMI2PS", v16f32_info, v16i32_info, v8i64_info>;
|
||||
defm : avx512_perm_i_lowering<"VPERMI2PS256", v8f32x_info, v8i32x_info, v4i64x_info>;
|
||||
defm : avx512_perm_i_lowering<"VPERMI2PS128", v4f32x_info, v4i32x_info, v2i64x_info>;
|
||||
|
||||
// VPERMT2
|
||||
multiclass avx512_perm_t<bits<8> opc, string OpcodeStr,
|
||||
|
|
|
@ -417,12 +417,10 @@ def X86VPermt2 : SDNode<"X86ISD::VPERMV3",
|
|||
SDTCisSameSizeAs<0,2>,
|
||||
SDTCisSameAs<0,3>]>, []>;
|
||||
|
||||
// Even though the index operand should be integer, we need to make it match the
|
||||
// destination type so that we can pattern match the masked version where the
|
||||
// index is also the passthru operand.
|
||||
def X86VPermi2X : SDNode<"X86ISD::VPERMIV3",
|
||||
SDTypeProfile<1, 3, [SDTCisVec<0>,
|
||||
SDTCisSameAs<0,1>,
|
||||
SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisInt<1>,
|
||||
SDTCisVec<1>, SDTCisSameNumEltsAs<0, 1>,
|
||||
SDTCisSameSizeAs<0,1>,
|
||||
SDTCisSameAs<0,2>,
|
||||
SDTCisSameAs<0,3>]>, []>;
|
||||
|
||||
|
|
Loading…
Reference in New Issue