diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86InstComments.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86InstComments.cpp
index b1af31067ae0..fd2b4e280039 100644
--- a/llvm/lib/Target/X86/MCTargetDesc/X86InstComments.cpp
+++ b/llvm/lib/Target/X86/MCTargetDesc/X86InstComments.cpp
@@ -1202,7 +1202,8 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
     Src1Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
     LLVM_FALLTHROUGH;
   CASE_PMOVZX(PMOVZXBW, m)
-    DecodeZeroExtendMask(8, 16, getRegOperandNumElts(MI, 16, 0), ShuffleMask);
+    DecodeZeroExtendMask(8, 16, getRegOperandNumElts(MI, 16, 0), false,
+                         ShuffleMask);
     DestName = getRegName(MI->getOperand(0).getReg());
     break;

@@ -1210,7 +1211,8 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
     Src1Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
     LLVM_FALLTHROUGH;
   CASE_PMOVZX(PMOVZXBD, m)
-    DecodeZeroExtendMask(8, 32, getRegOperandNumElts(MI, 32, 0), ShuffleMask);
+    DecodeZeroExtendMask(8, 32, getRegOperandNumElts(MI, 32, 0), false,
+                         ShuffleMask);
     DestName = getRegName(MI->getOperand(0).getReg());
     break;

@@ -1218,7 +1220,8 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
     Src1Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
     LLVM_FALLTHROUGH;
   CASE_PMOVZX(PMOVZXBQ, m)
-    DecodeZeroExtendMask(8, 64, getRegOperandNumElts(MI, 64, 0), ShuffleMask);
+    DecodeZeroExtendMask(8, 64, getRegOperandNumElts(MI, 64, 0), false,
+                         ShuffleMask);
     DestName = getRegName(MI->getOperand(0).getReg());
     break;

@@ -1226,7 +1229,8 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
     Src1Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
     LLVM_FALLTHROUGH;
   CASE_PMOVZX(PMOVZXWD, m)
-    DecodeZeroExtendMask(16, 32, getRegOperandNumElts(MI, 32, 0), ShuffleMask);
+    DecodeZeroExtendMask(16, 32, getRegOperandNumElts(MI, 32, 0), false,
+                         ShuffleMask);
     DestName = getRegName(MI->getOperand(0).getReg());
     break;

@@ -1234,7 +1238,8 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
     Src1Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
     LLVM_FALLTHROUGH;
   CASE_PMOVZX(PMOVZXWQ, m)
-    DecodeZeroExtendMask(16, 64, getRegOperandNumElts(MI, 64, 0), ShuffleMask);
+    DecodeZeroExtendMask(16, 64, getRegOperandNumElts(MI, 64, 0), false,
+                         ShuffleMask);
     DestName = getRegName(MI->getOperand(0).getReg());
     break;

@@ -1242,7 +1247,8 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
     Src1Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
     LLVM_FALLTHROUGH;
   CASE_PMOVZX(PMOVZXDQ, m)
-    DecodeZeroExtendMask(32, 64, getRegOperandNumElts(MI, 64, 0), ShuffleMask);
+    DecodeZeroExtendMask(32, 64, getRegOperandNumElts(MI, 64, 0), false,
+                         ShuffleMask);
     DestName = getRegName(MI->getOperand(0).getReg());
     break;
   }
diff --git a/llvm/lib/Target/X86/Utils/X86ShuffleDecode.cpp b/llvm/lib/Target/X86/Utils/X86ShuffleDecode.cpp
index 524e72318910..48fd3e0b7ab9 100644
--- a/llvm/lib/Target/X86/Utils/X86ShuffleDecode.cpp
+++ b/llvm/lib/Target/X86/Utils/X86ShuffleDecode.cpp
@@ -383,7 +383,8 @@ void DecodeVPERMMask(unsigned NumElts, unsigned Imm,
 }

 void DecodeZeroExtendMask(unsigned SrcScalarBits, unsigned DstScalarBits,
-                          unsigned NumDstElts, SmallVectorImpl<int> &Mask) {
+                          unsigned NumDstElts, bool IsAnyExtend,
+                          SmallVectorImpl<int> &Mask) {
   unsigned Scale = DstScalarBits / SrcScalarBits;
   assert(SrcScalarBits < DstScalarBits &&
          "Expected zero extension mask to increase scalar size");
@@ -391,7 +392,7 @@ void DecodeZeroExtendMask(unsigned SrcScalarBits, unsigned DstScalarBits,
   for (unsigned i = 0; i != NumDstElts; i++) {
     Mask.push_back(i);
     for (unsigned j = 1; j != Scale; j++)
-      Mask.push_back(SM_SentinelZero);
+      Mask.push_back(IsAnyExtend ? SM_SentinelUndef : SM_SentinelZero);
   }
 }

diff --git a/llvm/lib/Target/X86/Utils/X86ShuffleDecode.h b/llvm/lib/Target/X86/Utils/X86ShuffleDecode.h
index d1678d57a8f9..f52785063071 100644
--- a/llvm/lib/Target/X86/Utils/X86ShuffleDecode.h
+++ b/llvm/lib/Target/X86/Utils/X86ShuffleDecode.h
@@ -136,7 +136,7 @@ void DecodeVPPERMMask(ArrayRef<uint64_t> RawMask, const APInt &UndefElts,

 /// Decode a zero extension instruction as a shuffle mask.
 void DecodeZeroExtendMask(unsigned SrcScalarBits, unsigned DstScalarBits,
-                          unsigned NumDstElts,
+                          unsigned NumDstElts, bool IsAnyExtend,
                           SmallVectorImpl<int> &ShuffleMask);

 /// Decode a move lower and zero upper instruction as a shuffle mask.
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 8b6edaa50ba6..47b8e9eac2d2 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -6849,17 +6849,20 @@ static bool getFauxShuffleMask(SDValue N, SmallVectorImpl<int> &Mask,
     return true;
   }
   case ISD::ZERO_EXTEND:
-  case ISD::ZERO_EXTEND_VECTOR_INREG: {
+  case ISD::ZERO_EXTEND_VECTOR_INREG:
+  case ISD::ANY_EXTEND_VECTOR_INREG: {
     SDValue Src = N.getOperand(0);
     EVT SrcVT = Src.getValueType();

-    // Zero-extended source must be a simple vector.
+    // Extended source must be a simple vector.
     if (!SrcVT.isSimple() || (SrcVT.getSizeInBits() % 128) != 0 ||
         (SrcVT.getScalarSizeInBits() % 8) != 0)
       return false;

     unsigned NumSrcBitsPerElt = SrcVT.getScalarSizeInBits();
-    DecodeZeroExtendMask(NumSrcBitsPerElt, NumBitsPerElt, NumElts, Mask);
+    bool IsAnyExtend = (ISD::ANY_EXTEND_VECTOR_INREG == Opcode);
+    DecodeZeroExtendMask(NumSrcBitsPerElt, NumBitsPerElt, NumElts, IsAnyExtend,
+                         Mask);

     if (NumSizeInBits != SrcVT.getSizeInBits()) {
       assert((NumSizeInBits % SrcVT.getSizeInBits()) == 0 &&
@@ -43259,7 +43262,7 @@ static SDValue combineExtInVec(SDNode *N, SelectionDAG &DAG,

   // Attempt to combine as a shuffle.
   // TODO: SSE41 support
-  if (Subtarget.hasAVX() && N->getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG) {
+  if (Subtarget.hasAVX()) {
     SDValue Op(N, 0);
     if (TLI.isTypeLegal(VT) && TLI.isTypeLegal(In.getValueType()))
       if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
diff --git a/llvm/test/CodeGen/X86/shrink_vmul.ll b/llvm/test/CodeGen/X86/shrink_vmul.ll
index f693a57c732f..0c8949f24617 100644
--- a/llvm/test/CodeGen/X86/shrink_vmul.ll
+++ b/llvm/test/CodeGen/X86/shrink_vmul.ll
@@ -479,11 +479,10 @@ define void @mul_2xi16(i8* nocapture readonly %a, i8* nocapture readonly %b, i64
 ; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-AVX-NEXT:    movl c, %esi
-; X86-AVX-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-AVX-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; X86-AVX-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X86-AVX-NEXT:    vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
-; X86-AVX-NEXT:    vpmulld %xmm0, %xmm1, %xmm0
+; X86-AVX-NEXT:    vpmovzxwq {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero
+; X86-AVX-NEXT:    vpmovzxwq {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero
+; X86-AVX-NEXT:    vpmuludq %xmm0, %xmm1, %xmm0
+; X86-AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; X86-AVX-NEXT:    vmovq %xmm0, (%esi,%ecx,4)
 ; X86-AVX-NEXT:    popl %esi
 ; X86-AVX-NEXT:    retl
@@ -503,11 +502,10 @@ define void @mul_2xi16(i8* nocapture readonly %a, i8* nocapture readonly %b, i64
 ; X64-AVX-LABEL: mul_2xi16:
 ; X64-AVX:       # %bb.0: # %entry
 ; X64-AVX-NEXT:    movq {{.*}}(%rip), %rax
-; X64-AVX-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X64-AVX-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; X64-AVX-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X64-AVX-NEXT:    vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
-; X64-AVX-NEXT:    vpmulld %xmm0, %xmm1, %xmm0
+; X64-AVX-NEXT:    vpmovzxwq {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero
+; X64-AVX-NEXT:    vpmovzxwq {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero
+; X64-AVX-NEXT:    vpmuludq %xmm0, %xmm1, %xmm0
+; X64-AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; X64-AVX-NEXT:    vmovq %xmm0, (%rax,%rdx,4)
 ; X64-AVX-NEXT:    retq
 entry:
@@ -1167,10 +1165,9 @@ define void @mul_2xi16_sext_zext(i8* nocapture readonly %a, i8* nocapture readon
 ; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-AVX-NEXT:    movl c, %esi
 ; X86-AVX-NEXT:    vpmovsxwq (%edx,%ecx), %xmm0
-; X86-AVX-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X86-AVX-NEXT:    vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; X86-AVX-NEXT:    vpmovzxwq {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero
+; X86-AVX-NEXT:    vpmuludq %xmm0, %xmm1, %xmm0
 ; X86-AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; X86-AVX-NEXT:    vpmulld %xmm0, %xmm1, %xmm0
 ; X86-AVX-NEXT:    vmovq %xmm0, (%esi,%ecx,4)
 ; X86-AVX-NEXT:    popl %esi
 ; X86-AVX-NEXT:    retl
@@ -1195,10 +1192,9 @@ define void @mul_2xi16_sext_zext(i8* nocapture readonly %a, i8* nocapture readon
 ; X64-AVX:       # %bb.0: # %entry
 ; X64-AVX-NEXT:    movq {{.*}}(%rip), %rax
 ; X64-AVX-NEXT:    vpmovsxwq (%rdi,%rdx), %xmm0
-; X64-AVX-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X64-AVX-NEXT:    vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; X64-AVX-NEXT:    vpmovzxwq {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero
+; X64-AVX-NEXT:    vpmuludq %xmm0, %xmm1, %xmm0
 ; X64-AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; X64-AVX-NEXT:    vpmulld %xmm0, %xmm1, %xmm0
 ; X64-AVX-NEXT:    vmovq %xmm0, (%rax,%rdx,4)
 ; X64-AVX-NEXT:    retq
 entry:
@@ -1813,9 +1809,9 @@ define void @mul_2xi16_varconst1(i8* nocapture readonly %a, i64 %index) {
 ; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-AVX-NEXT:    movl c, %edx
-; X86-AVX-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-AVX-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; X86-AVX-NEXT:    vpmulld {{\.LCPI.*}}, %xmm0, %xmm0
+; X86-AVX-NEXT:    vpmovzxwq {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero
+; X86-AVX-NEXT:    vpmuludq {{\.LCPI.*}}, %xmm0, %xmm0
+; X86-AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; X86-AVX-NEXT:    vmovq %xmm0, (%edx,%eax,4)
 ; X86-AVX-NEXT:    retl
 ;
@@ -1834,9 +1830,9 @@ define void @mul_2xi16_varconst1(i8* nocapture readonly %a, i64 %index) {
 ; X64-AVX-LABEL: mul_2xi16_varconst1:
 ; X64-AVX:       # %bb.0: # %entry
 ; X64-AVX-NEXT:    movq {{.*}}(%rip), %rax
-; X64-AVX-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X64-AVX-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; X64-AVX-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; X64-AVX-NEXT:    vpmovzxwq {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero
+; X64-AVX-NEXT:    vpmuludq {{.*}}(%rip), %xmm0, %xmm0
+; X64-AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; X64-AVX-NEXT:    vmovq %xmm0, (%rax,%rsi,4)
 ; X64-AVX-NEXT:    retq
 entry:
@@ -1941,9 +1937,9 @@ define void @mul_2xi16_varconst3(i8* nocapture readonly %a, i64 %index) {
 ; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-AVX-NEXT:    movl c, %edx
-; X86-AVX-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-AVX-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; X86-AVX-NEXT:    vpmulld {{\.LCPI.*}}, %xmm0, %xmm0
+; X86-AVX-NEXT:    vpmovzxwq {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero
+; X86-AVX-NEXT:    vpmuludq {{\.LCPI.*}}, %xmm0, %xmm0
+; X86-AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; X86-AVX-NEXT:    vmovq %xmm0, (%edx,%eax,4)
 ; X86-AVX-NEXT:    retl
 ;
@@ -1962,9 +1958,9 @@ define void @mul_2xi16_varconst3(i8* nocapture readonly %a, i64 %index) {
 ; X64-AVX-LABEL: mul_2xi16_varconst3:
 ; X64-AVX:       # %bb.0: # %entry
 ; X64-AVX-NEXT:    movq {{.*}}(%rip), %rax
-; X64-AVX-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X64-AVX-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; X64-AVX-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; X64-AVX-NEXT:    vpmovzxwq {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero
+; X64-AVX-NEXT:    vpmuludq {{.*}}(%rip), %xmm0, %xmm0
+; X64-AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; X64-AVX-NEXT:    vmovq %xmm0, (%rax,%rsi,4)
 ; X64-AVX-NEXT:    retq
 entry:
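
Note (not part of the patch): a minimal standalone C++ sketch of the mask pattern the updated DecodeZeroExtendMask emits. The decodeExtendMask name and the sentinel constants are stand-ins for illustration; they follow LLVM's SM_SentinelUndef/SM_SentinelZero convention but the snippet depends only on the standard library.

#include <cassert>
#include <iostream>
#include <vector>

// Stand-ins for llvm::SM_SentinelUndef / llvm::SM_SentinelZero.
constexpr int SentinelUndef = -1;
constexpr int SentinelZero = -2;

// Mirrors the patched DecodeZeroExtendMask: every source element i is kept,
// and the (Scale - 1) high parts of each widened element are either zeroed
// (zero-extend semantics) or marked undef (any-extend semantics).
std::vector<int> decodeExtendMask(unsigned SrcBits, unsigned DstBits,
                                  unsigned NumDstElts, bool IsAnyExtend) {
  assert(SrcBits < DstBits && "extension must widen the scalar type");
  unsigned Scale = DstBits / SrcBits;
  std::vector<int> Mask;
  for (unsigned i = 0; i != NumDstElts; ++i) {
    Mask.push_back(i);
    for (unsigned j = 1; j != Scale; ++j)
      Mask.push_back(IsAnyExtend ? SentinelUndef : SentinelZero);
  }
  return Mask;
}

int main() {
  // v4i16 -> v4i32 zero extend: 0, Z, 1, Z, 2, Z, 3, Z
  for (int M : decodeExtendMask(16, 32, 4, /*IsAnyExtend=*/false))
    std::cout << M << ' ';
  std::cout << '\n';
  // v4i16 -> v4i32 any extend: 0, U, 1, U, 2, U, 3, U
  for (int M : decodeExtendMask(16, 32, 4, /*IsAnyExtend=*/true))
    std::cout << M << ' ';
  std::cout << '\n';
}

With IsAnyExtend set, the high parts become undef rather than zero, which is what lets getFauxShuffleMask treat ANY_EXTEND_VECTOR_INREG as a shuffle with don't-care lanes and enables the vpmovzxwq/vpmuludq/vpshufd sequences seen in the shrink_vmul.ll test updates above.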