[X86] Remove isel patterns for MOVSS/MOVSD ISD opcodes with integer types.
Ideally our ISD node types going into the isel table would have types consistent with their instruction domain. This prevents us having to duplicate patterns with different types for the same instruction.

Unfortunately, it seems our shuffle combining is currently relying on this a little to remove some bitcasts. This seems to enable some switching between shufps and shufd. Hopefully there's some way we can address this in the combining.

Differential Revision: https://reviews.llvm.org/D49280

llvm-svn: 337590
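For orientation, here is a minimal IR sketch (hypothetical, not a test from this patch) of the kind of shuffle these nodes represent: the mask takes the low element from %b and the high element from %a, which is MOVSD's blend semantics. Integer-typed operands previously matched dedicated integer isel patterns; after this patch they reach the FP-typed node through bitcasts.

; Hypothetical example: a <2 x i64> shuffle with MOVSD semantics.
; After this patch the X86ISD::MOVSD node is always created as v2f64,
; so integer inputs are wrapped in bitcasts rather than matched by
; integer-typed isel patterns.
define <2 x i64> @movsd_like_blend(<2 x i64> %a, <2 x i64> %b) {
  %r = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 2, i32 1>
  ret <2 x i64> %r
}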
parent 6194ccf8c7
commit 28ac623f6f
@@ -29400,13 +29400,13 @@ static bool matchBinaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
         (AllowFloatDomain || !Subtarget.hasSSE41())) {
       std::swap(V1, V2);
       Shuffle = X86ISD::MOVSD;
-      SrcVT = DstVT = MaskVT;
+      SrcVT = DstVT = MVT::v2f64;
       return true;
     }
     if (isTargetShuffleEquivalent(Mask, {4, 1, 2, 3}) &&
         (AllowFloatDomain || !Subtarget.hasSSE41())) {
       Shuffle = X86ISD::MOVSS;
-      SrcVT = DstVT = MaskVT;
+      SrcVT = DstVT = MVT::v4f32;
       return true;
     }
   }
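A float-domain sketch of the mask the {4, 1, 2, 3} check above recognizes (an assumed illustration, not taken from this patch's tests): element 0 comes from %b and elements 1-3 from %a, i.e. MOVSS semantics. The matcher now always reports v4f32 for this match instead of echoing the incoming mask type.

; Hypothetical example: the {4,1,2,3} shuffle mask matched as X86ISD::MOVSS.
define <4 x float> @movss_like_blend(<4 x float> %a, <4 x float> %b) {
  %r = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
  ret <4 x float> %r
}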
@@ -30715,28 +30715,6 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
       }
     }
 
-    SDValue V0 = peekThroughBitcasts(N0);
-    SDValue V1 = peekThroughBitcasts(N1);
-    bool isZero0 = ISD::isBuildVectorAllZeros(V0.getNode());
-    bool isZero1 = ISD::isBuildVectorAllZeros(V1.getNode());
-    if (isZero0 && isZero1)
-      return SDValue();
-
-    // We often lower to MOVSD/MOVSS from integer as well as native float
-    // types; remove unnecessary domain-crossing bitcasts if we can to make it
-    // easier to combine shuffles later on. We've already accounted for the
-    // domain switching cost when we decided to lower with it.
-    bool isFloat = VT.isFloatingPoint();
-    bool isFloat0 = V0.getSimpleValueType().isFloatingPoint();
-    bool isFloat1 = V1.getSimpleValueType().isFloatingPoint();
-    if ((isFloat != isFloat0 || isZero0) && (isFloat != isFloat1 || isZero1)) {
-      MVT NewVT = isFloat ? (X86ISD::MOVSD == Opcode ? MVT::v2i64 : MVT::v4i32)
-                          : (X86ISD::MOVSD == Opcode ? MVT::v2f64 : MVT::v4f32);
-      V0 = DAG.getBitcast(NewVT, V0);
-      V1 = DAG.getBitcast(NewVT, V1);
-      return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, NewVT, V0, V1));
-    }
-
     return SDValue();
   }
   case X86ISD::INSERTPS: {
@@ -4468,16 +4468,6 @@ let Predicates = [HasAVX512] in {
             (VMOVSSZmr addr:$dst, (COPY_TO_REGCLASS (v4f32 VR128X:$src), FR32X))>;
 }
 
-let Predicates = [HasAVX512, OptForSize] in {
-  // Shuffle with VMOVSS
-  def : Pat<(v4i32 (X86Movss VR128X:$src1, VR128X:$src2)),
-            (VMOVSSZrr (v4i32 VR128X:$src1), VR128X:$src2)>;
-
-  // Shuffle with VMOVSD
-  def : Pat<(v2i64 (X86Movsd VR128X:$src1, VR128X:$src2)),
-            (VMOVSDZrr VR128X:$src1, VR128X:$src2)>;
-}
-
 let ExeDomain = SSEPackedInt, SchedRW = [SchedWriteVecLogic.XMM] in {
 def VMOVZPQILo2PQIZrr : AVX512XSI<0x7E, MRMSrcReg, (outs VR128X:$dst),
                                   (ins VR128X:$src),
@@ -281,6 +281,8 @@ def X86insertqi : SDNode<"X86ISD::INSERTQI",
 def SDTShuff1Op : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisSameAs<0,1>]>;
 def SDTShuff2Op : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>,
                                        SDTCisSameAs<0,2>]>;
+def SDTShuff2OpFP : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisFP<0>,
+                                         SDTCisSameAs<0,1>, SDTCisSameAs<0,2>]>;
 
 def SDTShuff2OpM : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>,
                                         SDTCisFP<0>, SDTCisInt<2>,
@@ -368,11 +370,11 @@ def X86Movddup : SDNode<"X86ISD::MOVDDUP", SDTShuff1Op>;
 def X86Movshdup : SDNode<"X86ISD::MOVSHDUP", SDTShuff1Op>;
 def X86Movsldup : SDNode<"X86ISD::MOVSLDUP", SDTShuff1Op>;
 
-def X86Movsd : SDNode<"X86ISD::MOVSD", SDTShuff2Op>;
-def X86Movss : SDNode<"X86ISD::MOVSS", SDTShuff2Op>;
+def X86Movsd : SDNode<"X86ISD::MOVSD", SDTShuff2OpFP>;
+def X86Movss : SDNode<"X86ISD::MOVSS", SDTShuff2OpFP>;
 
-def X86Movlhps : SDNode<"X86ISD::MOVLHPS", SDTShuff2Op>;
-def X86Movhlps : SDNode<"X86ISD::MOVHLPS", SDTShuff2Op>;
+def X86Movlhps : SDNode<"X86ISD::MOVLHPS", SDTShuff2OpFP>;
+def X86Movhlps : SDNode<"X86ISD::MOVHLPS", SDTShuff2OpFP>;
 
 def SDTPack : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisInt<0>,
                                    SDTCisVec<1>, SDTCisInt<1>,
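With SDTShuff2OpFP carrying SDTCisFP<0>, a v4i32 X86Movss node can no longer be formed at all, which is what forces the v4i32/v2i64 pattern removals below. A hedged sketch of the integer-domain case that now has to round-trip through the float types:

; Hypothetical example: an integer single-element blend. With the FP-only
; type profile, this can presumably only reach MOVSS selection in the shape
; (v4i32 (bitcast (X86Movss (bitcast %a), (bitcast %b)))).
define <4 x i32> @movss_int_blend(<4 x i32> %a, <4 x i32> %b) {
  %r = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
  ret <4 x i32> %r
}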
@@ -317,14 +317,6 @@ let Predicates = [UseAVX, OptForSize] in {
                         (v2i64 (VMOVSDrr (v2i64 (V_SET0)),
                                          (v2i64 (EXTRACT_SUBREG (v4i64 VR256:$src), sub_xmm)))),
                         sub_xmm)>;
-
-  // Shuffle with VMOVSS
-  def : Pat<(v4i32 (X86Movss VR128:$src1, VR128:$src2)),
-            (VMOVSSrr VR128:$src1, VR128:$src2)>;
-
-  // Shuffle with VMOVSD
-  def : Pat<(v2i64 (X86Movsd VR128:$src1, VR128:$src2)),
-            (VMOVSDrr VR128:$src1, VR128:$src2)>;
 }
 
 let Predicates = [UseSSE1] in {
@@ -335,9 +327,6 @@ let Predicates = [UseSSE1] in {
             (MOVSSrr (v4f32 (V_SET0)), VR128:$src)>;
   def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
             (MOVSSrr (v4i32 (V_SET0)), VR128:$src)>;
-  // Shuffle with MOVSS
-  def : Pat<(v4i32 (X86Movss VR128:$src1, VR128:$src2)),
-            (MOVSSrr VR128:$src1, VR128:$src2)>;
 }
 
 // MOVSSrm already zeros the high parts of the register.
@@ -364,12 +353,6 @@ let Predicates = [UseSSE2] in {
             (COPY_TO_REGCLASS (MOVSDrm addr:$src), VR128)>;
   def : Pat<(v2f64 (X86vzload addr:$src)),
             (COPY_TO_REGCLASS (MOVSDrm addr:$src), VR128)>;
-
-  let Predicates = [UseSSE2, NoSSE41_Or_OptForSize] in {
-  // Shuffle with MOVSD
-  def : Pat<(v2i64 (X86Movsd VR128:$src1, VR128:$src2)),
-            (MOVSDrr VR128:$src1, VR128:$src2)>;
-  }
 }
 
 // Aliases to help the assembler pick two byte VEX encodings by swapping the
@@ -6427,12 +6410,6 @@ let Predicates = [HasAVX, OptForSpeed] in {
             (VBLENDPSrmi VR128:$src1, addr:$src2, (i8 1))>;
   def : Pat<(v4f32 (X86Movss (loadv4f32 addr:$src2), VR128:$src1)),
             (VBLENDPSrmi VR128:$src1, addr:$src2, (i8 0xe))>;
-  def : Pat<(v4i32 (X86Movss VR128:$src1, VR128:$src2)),
-            (VPBLENDWrri VR128:$src1, VR128:$src2, (i8 3))>;
-  def : Pat<(v4i32 (X86Movss VR128:$src1, (bc_v4i32 (loadv2i64 addr:$src2)))),
-            (VPBLENDWrmi VR128:$src1, addr:$src2, (i8 3))>;
-  def : Pat<(v4i32 (X86Movss (bc_v4i32 (loadv2i64 addr:$src2)), VR128:$src1)),
-            (VPBLENDWrmi VR128:$src1, addr:$src2, (i8 0xfc))>;
 
   def : Pat<(v2f64 (X86Movsd VR128:$src1, VR128:$src2)),
             (VBLENDPDrri VR128:$src1, VR128:$src2, (i8 1))>;
@@ -6440,12 +6417,6 @@ let Predicates = [HasAVX, OptForSpeed] in {
             (VBLENDPDrmi VR128:$src1, addr:$src2, (i8 1))>;
   def : Pat<(v2f64 (X86Movsd (loadv2f64 addr:$src2), VR128:$src1)),
             (VBLENDPDrmi VR128:$src1, addr:$src2, (i8 2))>;
-  def : Pat<(v2i64 (X86Movsd VR128:$src1, VR128:$src2)),
-            (VPBLENDWrri VR128:$src1, VR128:$src2, (i8 0xf))>;
-  def : Pat<(v2i64 (X86Movsd VR128:$src1, (loadv2i64 addr:$src2))),
-            (VPBLENDWrmi VR128:$src1, addr:$src2, (i8 0xf))>;
-  def : Pat<(v2i64 (X86Movsd (loadv2i64 addr:$src2), VR128:$src1)),
-            (VPBLENDWrmi VR128:$src1, addr:$src2, (i8 0xf0))>;
 
   // Move low f32 and clear high bits.
   def : Pat<(v8f32 (X86vzmovl (v8f32 VR256:$src))),
@@ -6487,12 +6458,6 @@ let Predicates = [UseSSE41, OptForSpeed] in {
             (BLENDPSrmi VR128:$src1, addr:$src2, (i8 1))>;
   def : Pat<(v4f32 (X86Movss (memopv4f32 addr:$src2), VR128:$src1)),
             (BLENDPSrmi VR128:$src1, addr:$src2, (i8 0xe))>;
-  def : Pat<(v4i32 (X86Movss VR128:$src1, VR128:$src2)),
-            (PBLENDWrri VR128:$src1, VR128:$src2, (i8 3))>;
-  def : Pat<(v4i32 (X86Movss VR128:$src1, (bc_v4i32 (memopv2i64 addr:$src2)))),
-            (PBLENDWrmi VR128:$src1, addr:$src2, (i8 3))>;
-  def : Pat<(v4i32 (X86Movss (bc_v4i32 (memopv2i64 addr:$src2)), VR128:$src1)),
-            (PBLENDWrmi VR128:$src1, addr:$src2, (i8 0xfc))>;
 
   def : Pat<(v2f64 (X86Movsd VR128:$src1, VR128:$src2)),
             (BLENDPDrri VR128:$src1, VR128:$src2, (i8 1))>;
@@ -6500,12 +6465,6 @@ let Predicates = [UseSSE41, OptForSpeed] in {
             (BLENDPDrmi VR128:$src1, addr:$src2, (i8 1))>;
   def : Pat<(v2f64 (X86Movsd (memopv2f64 addr:$src2), VR128:$src1)),
             (BLENDPDrmi VR128:$src1, addr:$src2, (i8 2))>;
-  def : Pat<(v2i64 (X86Movsd VR128:$src1, VR128:$src2)),
-            (PBLENDWrri VR128:$src1, VR128:$src2, (i8 0xf))>;
-  def : Pat<(v2i64 (X86Movsd VR128:$src1, (memopv2i64 addr:$src2))),
-            (PBLENDWrmi VR128:$src1, addr:$src2, (i8 0xf))>;
-  def : Pat<(v2i64 (X86Movsd (memopv2i64 addr:$src2), VR128:$src1)),
-            (PBLENDWrmi VR128:$src1, addr:$src2, (i8 0xf0))>;
 }
 
 
@@ -1277,46 +1277,44 @@ define void @interleave_24i16_in(<24 x i16>* %p, <8 x i16>* %q1, <8 x i16>* %q2,
 define void @interleave_24i32_out(<24 x i32>* %p, <8 x i32>* %q1, <8 x i32>* %q2, <8 x i32>* %q3) nounwind {
 ; SSE2-LABEL: interleave_24i32_out:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    movups 80(%rdi), %xmm5
-; SSE2-NEXT:    movups 64(%rdi), %xmm8
+; SSE2-NEXT:    movups 80(%rdi), %xmm9
+; SSE2-NEXT:    movups 64(%rdi), %xmm10
 ; SSE2-NEXT:    movups (%rdi), %xmm0
-; SSE2-NEXT:    movups 16(%rdi), %xmm6
-; SSE2-NEXT:    movups 32(%rdi), %xmm2
-; SSE2-NEXT:    movups 48(%rdi), %xmm1
-; SSE2-NEXT:    movaps %xmm1, %xmm3
-; SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,3],xmm8[2,3]
-; SSE2-NEXT:    movaps %xmm5, %xmm4
-; SSE2-NEXT:    shufps {{.*#+}} xmm4 = xmm4[1,0],xmm3[2,0]
-; SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,0]
-; SSE2-NEXT:    movaps %xmm0, %xmm4
-; SSE2-NEXT:    shufps {{.*#+}} xmm4 = xmm4[0,3],xmm6[2,3]
-; SSE2-NEXT:    movaps %xmm2, %xmm7
-; SSE2-NEXT:    shufps {{.*#+}} xmm7 = xmm7[1,0],xmm4[2,0]
-; SSE2-NEXT:    shufps {{.*#+}} xmm4 = xmm4[0,1],xmm7[2,0]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm10 = xmm0[2,3,0,1]
-; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm6[0,0]
-; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm6[3,3]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm9 = xmm2[0,1,0,3]
-; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[2,0],xmm0[2,0]
-; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,0]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
-; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,0],xmm8[0,0]
-; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm8[3,3]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm5[0,1,0,3]
-; SSE2-NEXT:    shufps {{.*#+}} xmm5 = xmm5[2,0],xmm1[2,0]
-; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm5[2,0]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm6[1,1,2,3]
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm10 = xmm10[0],xmm5[0],xmm10[1],xmm5[1]
-; SSE2-NEXT:    movsd {{.*#+}} xmm9 = xmm10[0],xmm9[1]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm8[1,1,2,3]
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1]
-; SSE2-NEXT:    movsd {{.*#+}} xmm7 = xmm2[0],xmm7[1]
+; SSE2-NEXT:    movups 16(%rdi), %xmm11
+; SSE2-NEXT:    movups 32(%rdi), %xmm8
+; SSE2-NEXT:    movups 48(%rdi), %xmm2
+; SSE2-NEXT:    movaps %xmm2, %xmm3
+; SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,3],xmm10[2,3]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm2[2,3,0,1]
+; SSE2-NEXT:    movaps %xmm9, %xmm6
+; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm10[1,1,2,3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1]
+; SSE2-NEXT:    shufps {{.*#+}} xmm7 = xmm7[0,1],xmm9[0,3]
+; SSE2-NEXT:    shufps {{.*#+}} xmm9 = xmm9[1,0],xmm3[2,0]
+; SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,1],xmm9[2,0]
+; SSE2-NEXT:    movaps %xmm0, %xmm5
+; SSE2-NEXT:    shufps {{.*#+}} xmm5 = xmm5[0,3],xmm11[2,3]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm11[1,1,2,3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
+; SSE2-NEXT:    movaps %xmm8, %xmm4
+; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm8[0,3]
+; SSE2-NEXT:    shufps {{.*#+}} xmm8 = xmm8[1,0],xmm5[2,0]
+; SSE2-NEXT:    shufps {{.*#+}} xmm5 = xmm5[0,1],xmm8[2,0]
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm11[0,0]
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm11[3,3]
+; SSE2-NEXT:    shufps {{.*#+}} xmm4 = xmm4[2,0],xmm0[2,0]
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,0]
+; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,0],xmm10[0,0]
+; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2],xmm10[3,3]
+; SSE2-NEXT:    shufps {{.*#+}} xmm6 = xmm6[2,0],xmm2[2,0]
+; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,1],xmm6[2,0]
 ; SSE2-NEXT:    movups %xmm3, 16(%rsi)
-; SSE2-NEXT:    movups %xmm4, (%rsi)
-; SSE2-NEXT:    movups %xmm1, 16(%rdx)
+; SSE2-NEXT:    movups %xmm5, (%rsi)
+; SSE2-NEXT:    movups %xmm2, 16(%rdx)
 ; SSE2-NEXT:    movups %xmm0, (%rdx)
-; SSE2-NEXT:    movupd %xmm7, 16(%rcx)
-; SSE2-NEXT:    movupd %xmm9, (%rcx)
+; SSE2-NEXT:    movups %xmm7, 16(%rcx)
+; SSE2-NEXT:    movups %xmm1, (%rcx)
 ; SSE2-NEXT:    retq
 ;
 ; SSE42-LABEL: interleave_24i32_out:
@@ -1233,16 +1233,17 @@ define <8 x i16> @constant_shift_v8i16(<8 x i16> %a) nounwind {
 ; SSE2-NEXT:    movdqa %xmm0, %xmm1
 ; SSE2-NEXT:    psraw $4, %xmm1
 ; SSE2-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[0,2,2,3]
+; SSE2-NEXT:    movapd %xmm1, %xmm2
+; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[2,3]
 ; SSE2-NEXT:    psraw $2, %xmm1
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,3,2,3]
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
-; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [65535,0,65535,0,65535,0,65535,0]
-; SSE2-NEXT:    movdqa %xmm2, %xmm1
-; SSE2-NEXT:    pand %xmm0, %xmm1
+; SSE2-NEXT:    unpcklps {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
+; SSE2-NEXT:    movaps {{.*#+}} xmm1 = [65535,0,65535,0,65535,0,65535,0]
+; SSE2-NEXT:    movaps %xmm2, %xmm0
+; SSE2-NEXT:    andps %xmm1, %xmm0
 ; SSE2-NEXT:    psraw $1, %xmm2
-; SSE2-NEXT:    pandn %xmm2, %xmm0
-; SSE2-NEXT:    por %xmm1, %xmm0
+; SSE2-NEXT:    andnps %xmm2, %xmm1
+; SSE2-NEXT:    orps %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE41-LABEL: constant_shift_v8i16:
@@ -1318,16 +1319,17 @@ define <8 x i16> @constant_shift_v8i16(<8 x i16> %a) nounwind {
 ; X32-SSE-NEXT:    movdqa %xmm0, %xmm1
 ; X32-SSE-NEXT:    psraw $4, %xmm1
 ; X32-SSE-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
-; X32-SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[0,2,2,3]
+; X32-SSE-NEXT:    movapd %xmm1, %xmm2
+; X32-SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[2,3]
 ; X32-SSE-NEXT:    psraw $2, %xmm1
 ; X32-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,3,2,3]
-; X32-SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
-; X32-SSE-NEXT:    movdqa {{.*#+}} xmm0 = [65535,0,65535,0,65535,0,65535,0]
-; X32-SSE-NEXT:    movdqa %xmm2, %xmm1
-; X32-SSE-NEXT:    pand %xmm0, %xmm1
+; X32-SSE-NEXT:    unpcklps {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
+; X32-SSE-NEXT:    movaps {{.*#+}} xmm1 = [65535,0,65535,0,65535,0,65535,0]
+; X32-SSE-NEXT:    movaps %xmm2, %xmm0
+; X32-SSE-NEXT:    andps %xmm1, %xmm0
 ; X32-SSE-NEXT:    psraw $1, %xmm2
-; X32-SSE-NEXT:    pandn %xmm2, %xmm0
-; X32-SSE-NEXT:    por %xmm1, %xmm0
+; X32-SSE-NEXT:    andnps %xmm2, %xmm1
+; X32-SSE-NEXT:    orps %xmm1, %xmm0
 ; X32-SSE-NEXT:    retl
   %shift = ashr <8 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>
   ret <8 x i16> %shift
@@ -993,16 +993,17 @@ define <8 x i16> @constant_shift_v8i16(<8 x i16> %a) nounwind {
 ; SSE2-NEXT:    movdqa %xmm0, %xmm1
 ; SSE2-NEXT:    psrlw $4, %xmm1
 ; SSE2-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[0,2,2,3]
+; SSE2-NEXT:    movapd %xmm1, %xmm2
+; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[2,3]
 ; SSE2-NEXT:    psrlw $2, %xmm1
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,3,2,3]
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
-; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [65535,0,65535,0,65535,0,65535,0]
-; SSE2-NEXT:    movdqa %xmm2, %xmm1
-; SSE2-NEXT:    pand %xmm0, %xmm1
+; SSE2-NEXT:    unpcklps {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
+; SSE2-NEXT:    movaps {{.*#+}} xmm1 = [65535,0,65535,0,65535,0,65535,0]
+; SSE2-NEXT:    movaps %xmm2, %xmm0
+; SSE2-NEXT:    andps %xmm1, %xmm0
 ; SSE2-NEXT:    psrlw $1, %xmm2
-; SSE2-NEXT:    pandn %xmm2, %xmm0
-; SSE2-NEXT:    por %xmm1, %xmm0
+; SSE2-NEXT:    andnps %xmm2, %xmm1
+; SSE2-NEXT:    orps %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE41-LABEL: constant_shift_v8i16:
@@ -1079,16 +1080,17 @@ define <8 x i16> @constant_shift_v8i16(<8 x i16> %a) nounwind {
 ; X32-SSE-NEXT:    movdqa %xmm0, %xmm1
 ; X32-SSE-NEXT:    psrlw $4, %xmm1
 ; X32-SSE-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
-; X32-SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[0,2,2,3]
+; X32-SSE-NEXT:    movapd %xmm1, %xmm2
+; X32-SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[2,3]
 ; X32-SSE-NEXT:    psrlw $2, %xmm1
 ; X32-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,3,2,3]
-; X32-SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
-; X32-SSE-NEXT:    movdqa {{.*#+}} xmm0 = [65535,0,65535,0,65535,0,65535,0]
-; X32-SSE-NEXT:    movdqa %xmm2, %xmm1
-; X32-SSE-NEXT:    pand %xmm0, %xmm1
+; X32-SSE-NEXT:    unpcklps {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
+; X32-SSE-NEXT:    movaps {{.*#+}} xmm1 = [65535,0,65535,0,65535,0,65535,0]
+; X32-SSE-NEXT:    movaps %xmm2, %xmm0
+; X32-SSE-NEXT:    andps %xmm1, %xmm0
 ; X32-SSE-NEXT:    psrlw $1, %xmm2
-; X32-SSE-NEXT:    pandn %xmm2, %xmm0
-; X32-SSE-NEXT:    por %xmm1, %xmm0
+; X32-SSE-NEXT:    andnps %xmm2, %xmm1
+; X32-SSE-NEXT:    orps %xmm1, %xmm0
 ; X32-SSE-NEXT:    retl
   %shift = lshr <8 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>
   ret <8 x i16> %shift
@@ -1248,8 +1248,8 @@ define <8 x i16> @shuffle_v8i16_032dXXXX(<8 x i16> %a, <8 x i16> %b) {
 ; SSE2-LABEL: shuffle_v8i16_032dXXXX:
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[3,1,2,0]
-; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7]
+; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,1,2,0]
+; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm1[0,1,2,3,6,5,6,7]
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
 ; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,1,4,5,6,7]
 ; SSE2-NEXT:    retq
@@ -1403,8 +1403,8 @@ define <8 x i16> @shuffle_v8i16_012dcde3(<8 x i16> %a, <8 x i16> %b) {
 ; SSE2-LABEL: shuffle_v8i16_012dcde3:
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,3,2,1]
-; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,0,4,5,6,7]
+; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,3,2,1]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm1[3,1,2,0,4,5,6,7]
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0]
 ; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,1,4,5,6,7]
 ; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,5,7]
@@ -1542,11 +1542,10 @@ define <8 x i16> @shuffle_v8i16_XXX1X579(<8 x i16> %a, <8 x i16> %b) {
 define <8 x i16> @shuffle_v8i16_XX4X8acX(<8 x i16> %a, <8 x i16> %b) {
 ; SSE2-LABEL: shuffle_v8i16_XX4X8acX:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,2,3,3]
-; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm1[0,2,2,3,4,5,6,7]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,2,0]
-; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,7,4,7]
-; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0]
+; SSE2-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,7,4,7]
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,2],xmm1[2,3]
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: shuffle_v8i16_XX4X8acX: