forked from OSchip/llvm-project
[X86][XOP] Add support for combining target shuffles to VPPERM
llvm-svn: 278114
This commit is contained in:
parent
3a25d84a51
commit
aae7d4a1b6
|
@ -3844,6 +3844,7 @@ static bool isTargetShuffleVariableMask(unsigned Opcode) {
|
|||
default: return false;
|
||||
case X86ISD::PSHUFB:
|
||||
case X86ISD::VPERMILPV:
|
||||
case X86ISD::VPPERM:
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
@ -25325,6 +25326,44 @@ static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
|
|||
return true;
|
||||
}
|
||||
|
||||
// With XOP, if we have a 128-bit binary input shuffle we can always combine
|
||||
// to VPPERM. We match the depth requirement of PSHUFB - VPPERM is never
|
||||
// slower than PSHUFB on targets that support both.
|
||||
if ((Depth >= 3 || HasVariableMask) && RootVT.is128BitVector() &&
|
||||
Subtarget.hasXOP()) {
|
||||
// VPPERM Mask Operation
|
||||
// Bits[4:0] - Byte Index (0 - 31)
|
||||
// Bits[7:5] - Permute Operation (0 - Source byte, 4 - ZERO)
|
||||
SmallVector<SDValue, 16> VPPERMMask;
|
||||
int NumBytes = 16;
|
||||
int Ratio = NumBytes / NumMaskElts;
|
||||
for (int i = 0; i < NumBytes; ++i) {
|
||||
int M = Mask[i / Ratio];
|
||||
if (M == SM_SentinelUndef) {
|
||||
VPPERMMask.push_back(DAG.getUNDEF(MVT::i8));
|
||||
continue;
|
||||
}
|
||||
if (M == SM_SentinelZero) {
|
||||
VPPERMMask.push_back(DAG.getConstant(128, DL, MVT::i8));
|
||||
continue;
|
||||
}
|
||||
M = Ratio * M + i % Ratio;
|
||||
VPPERMMask.push_back(DAG.getConstant(M, DL, MVT::i8));
|
||||
}
|
||||
MVT ByteVT = MVT::v16i8;
|
||||
V1 = DAG.getBitcast(ByteVT, V1);
|
||||
DCI.AddToWorklist(V1.getNode());
|
||||
V2 = DAG.getBitcast(ByteVT, V2);
|
||||
DCI.AddToWorklist(V2.getNode());
|
||||
SDValue VPPERMMaskOp = DAG.getBuildVector(ByteVT, DL, VPPERMMask);
|
||||
DCI.AddToWorklist(VPPERMMaskOp.getNode());
|
||||
Res = DAG.getNode(X86ISD::VPPERM, DL, ByteVT, V1, V2, VPPERMMaskOp);
|
||||
DCI.AddToWorklist(Res.getNode());
|
||||
DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
|
||||
/*AddTo*/ true);
|
||||
return true;
|
||||
}
|
||||
|
||||
// Failed to find any combines.
|
||||
return false;
|
||||
}
|
||||
|
|
|
@ -131,3 +131,15 @@ define <16 x i8> @combine_vpperm_as_unpckhwd(<16 x i8> %a0, <16 x i8> %a1) {
|
|||
%res0 = call <16 x i8> @llvm.x86.xop.vpperm(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> <i8 8, i8 24, i8 9, i8 25, i8 10, i8 26, i8 11, i8 27, i8 12, i8 28, i8 13, i8 29, i8 14, i8 30, i8 15, i8 31>)
|
||||
ret <16 x i8> %res0
|
||||
}
|
||||
|
||||
; Verifies that a two-input <4 x i32> shufflevector feeding the XOP vpperm
; intrinsic is combined into a single vpperm instruction.
;
; The shufflevector interleaves the low two dwords of %a0 and %a1 (<0,4,1,5>).
; The vpperm byte mask then swaps the 16-bit halves of dwords 0, 2 and 3 and
; zeroes dword 1: mask byte 128 has bits[7:5] == 4, the ZERO permute operation.
; Tracing the bytes back through the shufflevector, the result reads
; %a0 bytes [2,3,0,1], four zero bytes, %a0 bytes [6,7,4,5] and
; %a1 bytes [6,7,4,5], which is what the expected vpperm operand list below
; encodes. In 16-bit word terms that is 1,0,z,z,3,2,B,A, hence the test name.
define <4 x i32> @combine_vpperm_10zz32BA(<4 x i32> %a0, <4 x i32> %a1) {
|
||||
; CHECK-LABEL: combine_vpperm_10zz32BA:
|
||||
; CHECK: # BB#0:
|
||||
; CHECK-NEXT: vpperm {{.*#+}} xmm0 = xmm0[2,3,0,1],zero,zero,zero,zero,xmm0[6,7,4,5],xmm1[6,7,4,5]
|
||||
; CHECK-NEXT: retq
|
||||
%res0 = shufflevector <4 x i32> %a0, <4 x i32> %a1, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
|
||||
%res1 = bitcast <4 x i32> %res0 to <16 x i8>
|
||||
%res2 = call <16 x i8> @llvm.x86.xop.vpperm(<16 x i8> %res1, <16 x i8> undef, <16 x i8> <i8 2, i8 3, i8 0, i8 1, i8 128, i8 128, i8 128, i8 128, i8 10, i8 11, i8 8, i8 9, i8 14, i8 15, i8 12, i8 13>)
|
||||
%res3 = bitcast <16 x i8> %res2 to <4 x i32>
|
||||
ret <4 x i32> %res3
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue