forked from OSchip/llvm-project
[X86][AVX] canonicalizeLaneShuffleWithRepeatedOps - merge VPERMILPD ops with different low/high masks.
Now that PR48908 has been dealt with, we can handle v4f64 permute cases by extracting the low/high lane VPERMILPD masks and creating a new mask based on which lanes are referenced by the VPERM2F128 mask.
This commit is contained in:
parent
518af8df44
commit
e117295922
|
@ -36946,11 +36946,27 @@ static SDValue canonicalizeLaneShuffleWithRepeatedOps(SDValue V,
|
|||
return DAG.getBitcast(VT, Res);
|
||||
}
|
||||
case X86ISD::VPERMILPI:
|
||||
// TODO: Handle v4f64 permutes with different low/high lane masks.
|
||||
// Handle v4f64 permutes with different low/high lane masks by permuting
|
||||
// the permute mask on a lane-by-lane basis.
|
||||
if (SrcVT0 == MVT::v4f64) {
|
||||
uint64_t Mask = Src0.getConstantOperandVal(1);
|
||||
if ((Mask & 0x3) != ((Mask >> 2) & 0x3))
|
||||
break;
|
||||
if (Src1.isUndef() || Src0.getOperand(1) == Src1.getOperand(1)) {
|
||||
uint64_t LaneMask = V.getConstantOperandVal(2);
|
||||
uint64_t Mask = Src0.getConstantOperandVal(1);
|
||||
uint64_t LoMask = Mask & 0x3;
|
||||
uint64_t HiMask = (Mask >> 2) & 0x3;
|
||||
uint64_t NewMask = 0;
|
||||
NewMask |= ((LaneMask & 0x02) ? HiMask : LoMask);
|
||||
NewMask |= ((LaneMask & 0x02) ? HiMask : LoMask) << 2;
|
||||
SDValue LHS = Src0.getOperand(0);
|
||||
SDValue RHS =
|
||||
Src1.isUndef() ? DAG.getUNDEF(SrcVT0) : Src1.getOperand(0);
|
||||
SDValue Res = DAG.getNode(X86ISD::VPERM2X128, DL, SrcVT0, LHS, RHS,
|
||||
V.getOperand(2));
|
||||
Res = DAG.getNode(SrcOpc0, DL, SrcVT0, Res,
|
||||
DAG.getTargetConstant(NewMask, DL, MVT::i8));
|
||||
return DAG.getBitcast(VT, Res);
|
||||
}
|
||||
break;
|
||||
}
|
||||
LLVM_FALLTHROUGH;
|
||||
case X86ISD::VSHLI:
|
||||
|
|
|
@ -442,18 +442,16 @@ define void @PR48908(<4 x double> %v0, <4 x double> %v1, <4 x double> %v2, <4 x
|
|||
; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax
|
||||
; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ecx
|
||||
; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %edx
|
||||
; X86-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm3
|
||||
; X86-AVX1-NEXT: vpermilpd {{.*#+}} ymm3 = ymm3[0,1,2,2]
|
||||
; X86-AVX1-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm1[2,3],ymm2[0,1]
|
||||
; X86-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm5
|
||||
; X86-AVX1-NEXT: vshufpd {{.*#+}} ymm4 = ymm5[1],ymm4[0],ymm5[2],ymm4[3]
|
||||
; X86-AVX1-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm1[2,3],ymm2[0,1]
|
||||
; X86-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm4
|
||||
; X86-AVX1-NEXT: vshufpd {{.*#+}} ymm3 = ymm4[1],ymm3[0],ymm4[2],ymm3[3]
|
||||
; X86-AVX1-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm1[0,1],ymm0[0,1]
|
||||
; X86-AVX1-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm0[0,1],ymm2[0,1]
|
||||
; X86-AVX1-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[2,3,0,1]
|
||||
; X86-AVX1-NEXT: vblendpd {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2],ymm3[3]
|
||||
; X86-AVX1-NEXT: vmovapd %ymm3, (%edx)
|
||||
; X86-AVX1-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm4[2,3,0,1]
|
||||
; X86-AVX1-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0,1],ymm0[2],ymm4[3]
|
||||
; X86-AVX1-NEXT: vblendpd {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2],ymm3[3]
|
||||
; X86-AVX1-NEXT: vblendpd {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2],ymm4[3]
|
||||
; X86-AVX1-NEXT: vmovapd %ymm4, (%edx)
|
||||
; X86-AVX1-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm3[2,3,0,1]
|
||||
; X86-AVX1-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0,1],ymm0[2],ymm3[3]
|
||||
; X86-AVX1-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2],ymm4[3]
|
||||
; X86-AVX1-NEXT: vmovapd %ymm3, (%ecx)
|
||||
; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
|
||||
; X86-AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3]
|
||||
|
@ -515,18 +513,16 @@ define void @PR48908(<4 x double> %v0, <4 x double> %v1, <4 x double> %v2, <4 x
|
|||
;
|
||||
; X64-AVX1-LABEL: PR48908:
|
||||
; X64-AVX1: # %bb.0:
|
||||
; X64-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm3
|
||||
; X64-AVX1-NEXT: vpermilpd {{.*#+}} ymm3 = ymm3[0,1,2,2]
|
||||
; X64-AVX1-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm1[2,3],ymm2[0,1]
|
||||
; X64-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm5
|
||||
; X64-AVX1-NEXT: vshufpd {{.*#+}} ymm4 = ymm5[1],ymm4[0],ymm5[2],ymm4[3]
|
||||
; X64-AVX1-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm1[2,3],ymm2[0,1]
|
||||
; X64-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm4
|
||||
; X64-AVX1-NEXT: vshufpd {{.*#+}} ymm3 = ymm4[1],ymm3[0],ymm4[2],ymm3[3]
|
||||
; X64-AVX1-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm1[0,1],ymm0[0,1]
|
||||
; X64-AVX1-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm0[0,1],ymm2[0,1]
|
||||
; X64-AVX1-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[2,3,0,1]
|
||||
; X64-AVX1-NEXT: vblendpd {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2],ymm3[3]
|
||||
; X64-AVX1-NEXT: vmovapd %ymm3, (%rdi)
|
||||
; X64-AVX1-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm4[2,3,0,1]
|
||||
; X64-AVX1-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0,1],ymm0[2],ymm4[3]
|
||||
; X64-AVX1-NEXT: vblendpd {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2],ymm3[3]
|
||||
; X64-AVX1-NEXT: vblendpd {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2],ymm4[3]
|
||||
; X64-AVX1-NEXT: vmovapd %ymm4, (%rdi)
|
||||
; X64-AVX1-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm3[2,3,0,1]
|
||||
; X64-AVX1-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0,1],ymm0[2],ymm3[3]
|
||||
; X64-AVX1-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2],ymm4[3]
|
||||
; X64-AVX1-NEXT: vmovapd %ymm3, (%rsi)
|
||||
; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
|
||||
; X64-AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3]
|
||||
|
|
Loading…
Reference in New Issue