[X86] Teach shouldSinkOperands to recognize pmuldq/pmuludq patterns.

The IR for pmuldq/pmuludq intrinsics uses a sext_inreg/zext_inreg
pattern on the inputs. Ideally we pattern match these away during
isel. It is possible, however, for LICM or other middle-end
optimizations to separate the extend from the mul. This prevents
SelectionDAG from removing it, or, depending on how the extend is
lowered, we may not be able to generate an AssertSExt/AssertZExt in
the basic block containing the mul. Either way, pmuldq/pmuludq cannot
be formed at all.
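
For reference, a minimal sketch of the two idioms (function and value
names are illustrative, not taken from the testcase):

  define <2 x i64> @zext_inreg_mul(<2 x i64> %a, <2 x i64> %b) {
    ; zext_inreg of the low 32 bits of each lane; isel folds the
    ; ands and the mul into PMULUDQ (SSE2).
    %za = and <2 x i64> %a, <i64 4294967295, i64 4294967295>
    %zb = and <2 x i64> %b, <i64 4294967295, i64 4294967295>
    %m = mul <2 x i64> %za, %zb
    ret <2 x i64> %m
  }

  define <2 x i64> @sext_inreg_mul(<2 x i64> %a, <2 x i64> %b) {
    ; sext_inreg as shl+ashr by 32; isel folds the shifts and the
    ; mul into PMULDQ (SSE4.1).
    %sa0 = shl <2 x i64> %a, <i64 32, i64 32>
    %sa = ashr <2 x i64> %sa0, <i64 32, i64 32>
    %sb0 = shl <2 x i64> %b, <i64 32, i64 32>
    %sb = ashr <2 x i64> %sb0, <i64 32, i64 32>
    %m = mul <2 x i64> %sa, %sb
    ret <2 x i64> %m
  }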

This patch teaches shouldSinkOperands to recognize this so
that CodeGenPrepare will clone the extend into the same basic
block as the mul.
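
As a sketch of the effect (a hypothetical reduction of the testcase's
shape), LICM leaves the extend of the loop-invariant operand outside
the loop:

  entry:
    %inv = and <2 x i64> %splat, <i64 4294967295, i64 4294967295>
    br label %loop
  loop:
    %ld = load <2 x i64>, <2 x i64>* %p
    %x = and <2 x i64> %ld, <i64 4294967295, i64 4294967295>
    %m = mul <2 x i64> %x, %inv
    ...

With this patch shouldSinkOperands reports the hoisted and as a
sinkable use, so CodeGenPrepare clones it into %loop next to the mul
and isel can fold both zext_inregs into pmuludq. For the signed idiom
both the shl and the ashr of an operand are reported and sunk.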

Fixes PR51371.

Differential Revision: https://reviews.llvm.org/D107689
Craig Topper 2021-08-06 22:19:38 -07:00
parent 8a2d1b183d
commit 24dfba8d50
2 changed files with 37 additions and 24 deletions

llvm/lib/Target/X86/X86ISelLowering.cpp

@@ -48,9 +48,10 @@
 #include "llvm/IR/Function.h"
 #include "llvm/IR/GlobalAlias.h"
 #include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/Intrinsics.h"
-#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/PatternMatch.h"
 #include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCExpr.h"
@@ -32091,6 +32092,36 @@ bool X86TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
 bool X86TargetLowering::shouldSinkOperands(Instruction *I,
                                            SmallVectorImpl<Use *> &Ops) const {
+  using namespace llvm::PatternMatch;
+
+  FixedVectorType *VTy = dyn_cast<FixedVectorType>(I->getType());
+  if (!VTy)
+    return false;
+
+  if (I->getOpcode() == Instruction::Mul &&
+      VTy->getElementType()->isIntegerTy(64)) {
+    for (auto &Op : I->operands()) {
+      // Make sure we are not already sinking this operand
+      if (any_of(Ops, [&](Use *U) { return U->get() == Op; }))
+        continue;
+
+      // Look for PMULDQ pattern where the input is a sext_inreg from vXi32 or
+      // the PMULUDQ pattern where the input is a zext_inreg from vXi32.
+      if (Subtarget.hasSSE41() &&
+          match(Op.get(), m_AShr(m_Shl(m_Value(), m_SpecificInt(32)),
+                                 m_SpecificInt(32)))) {
+        Ops.push_back(&cast<Instruction>(Op)->getOperandUse(0));
+        Ops.push_back(&Op);
+      } else if (Subtarget.hasSSE2() &&
+                 match(Op.get(),
+                       m_And(m_Value(), m_SpecificInt(UINT64_C(0xffffffff))))) {
+        Ops.push_back(&Op);
+      }
+    }
+
+    return !Ops.empty();
+  }
+
   // A uniform shift amount in a vector shift or funnel shift may be much
   // cheaper than a generic variable vector shift, so make that pattern visible
   // to SDAG by sinking the shuffle instruction next to the shift.

(second changed file: the X86 codegen test for PR51371)

@@ -8,28 +8,12 @@ define void @pmuldq(<2 x i64>* nocapture %0, i32 %1, i64 %2) {
 ; CHECK-NEXT: je .LBB0_3
 ; CHECK-NEXT: # %bb.1:
 ; CHECK-NEXT: movd %esi, %xmm0
-; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,0,0]
-; CHECK-NEXT: movdqa %xmm1, %xmm0
-; CHECK-NEXT: psllq $32, %xmm0
-; CHECK-NEXT: psrad $31, %xmm0
-; CHECK-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7]
-; CHECK-NEXT: movdqa %xmm0, %xmm1
-; CHECK-NEXT: psrlq $32, %xmm1
+; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
 ; CHECK-NEXT: .p2align 4, 0x90
 ; CHECK-NEXT: .LBB0_2: # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: movdqa (%rdi), %xmm2
-; CHECK-NEXT: movdqa %xmm2, %xmm3
-; CHECK-NEXT: psllq $32, %xmm3
-; CHECK-NEXT: psrad $31, %xmm3
-; CHECK-NEXT: movdqa %xmm2, %xmm4
-; CHECK-NEXT: pmuludq %xmm1, %xmm4
-; CHECK-NEXT: psrlq $32, %xmm3
-; CHECK-NEXT: pmuludq %xmm0, %xmm3
-; CHECK-NEXT: paddq %xmm4, %xmm3
-; CHECK-NEXT: psllq $32, %xmm3
-; CHECK-NEXT: pmuludq %xmm0, %xmm2
-; CHECK-NEXT: paddq %xmm3, %xmm2
-; CHECK-NEXT: movdqa %xmm2, (%rdi)
+; CHECK-NEXT: movdqa (%rdi), %xmm1
+; CHECK-NEXT: pmuldq %xmm0, %xmm1
+; CHECK-NEXT: movdqa %xmm1, (%rdi)
 ; CHECK-NEXT: addq $16, %rdi
 ; CHECK-NEXT: decq %rdx
 ; CHECK-NEXT: jne .LBB0_2
@@ -66,9 +50,7 @@ define void @pmuludq(<2 x i64>* nocapture %0, i32 %1, i64 %2) {
 ; CHECK-NEXT: je .LBB1_3
 ; CHECK-NEXT: # %bb.1:
 ; CHECK-NEXT: movd %esi, %xmm0
-; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,0,0]
-; CHECK-NEXT: pxor %xmm0, %xmm0
-; CHECK-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7]
+; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
 ; CHECK-NEXT: .p2align 4, 0x90
 ; CHECK-NEXT: .LBB1_2: # =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT: movdqa (%rdi), %xmm1