[x86] use PMOVMSK to replace memcmp libcalls for 16-byte equality
This is the payoff for D31156 - if a target has efficient comparison
instructions for vector-sized equality, we can replace memcmp calls with
inline code that is both smaller and faster.

Differential Revision: https://reviews.llvm.org/D31290

llvm-svn: 298775
commit 9ebb68843e (parent c3e5c3c5bc)
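As a source-level illustration (not part of the commit): the transform fires for a memcmp whose result feeds only an equality test against zero and whose size is a known constant, such as:

#include <cstring>

// The pattern this commit inlines on x86: a 16-byte memcmp used only in a
// zero-equality comparison now lowers to movdqu + pcmpeqb + pmovmskb
// instead of a memcmp libcall (see the test diffs below).
bool equal16(const char *x, const char *y) {
  return std::memcmp(x, y, 16) == 0;
}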
@@ -437,6 +437,15 @@ public:
     return false;
   }
 
+  /// Return the preferred operand type if the target has a quick way to compare
+  /// integer values of the given size. Assume that any legal integer type can
+  /// be compared efficiently. Targets may override this to allow illegal wide
+  /// types to return a vector type if there is support to compare that type.
+  virtual MVT hasFastEqualityCompare(unsigned NumBits) const {
+    MVT VT = MVT::getIntegerVT(NumBits);
+    return isTypeLegal(VT) ? VT : MVT::INVALID_SIMPLE_VALUE_TYPE;
+  }
+
   /// Return true if the target should transform:
   ///   (X & Y) == Y ---> (~X & Y) == 0
   ///   (X & Y) != Y ---> (~X & Y) != 0
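To make the hook's contract concrete, here is a standalone mimic in plain C++ (an illustrative sketch with invented enum names, not LLVM code): the default accepts any legal integer width, and a target override may map an otherwise-illegal wide width to a vector type it can compare cheaply.

#include <cassert>

// Invented stand-ins for the MVT values used above; illustration only.
enum class CmpTy { Invalid, I16, I32, I64, V16I8 };

// Default contract: legal integer widths compare fast; all else is invalid.
CmpTy defaultFastEqualityCompare(unsigned NumBits) {
  switch (NumBits) {
  case 16: return CmpTy::I16;
  case 32: return CmpTy::I32;
  case 64: return CmpTy::I64;
  default: return CmpTy::Invalid;
  }
}

// An x86-like override: 128 bits becomes a byte vector (see the X86 hunk below).
CmpTy x86FastEqualityCompare(unsigned NumBits) {
  CmpTy T = defaultFastEqualityCompare(NumBits);
  if (T != CmpTy::Invalid)
    return T;
  return NumBits == 128 ? CmpTy::V16I8 : CmpTy::Invalid;
}

int main() {
  assert(defaultFastEqualityCompare(128) == CmpTy::Invalid);
  assert(x86FastEqualityCompare(128) == CmpTy::V16I8);
  return 0;
}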
@@ -5955,13 +5955,17 @@ static bool IsOnlyUsedInZeroEqualityComparison(const Value *V) {
 }
 
 static SDValue getMemCmpLoad(const Value *PtrVal, MVT LoadVT,
-                             Type *LoadTy,
                              SelectionDAGBuilder &Builder) {
 
   // Check to see if this load can be trivially constant folded, e.g. if the
   // input is from a string literal.
   if (const Constant *LoadInput = dyn_cast<Constant>(PtrVal)) {
     // Cast pointer to the type we really want to load.
+    Type *LoadTy =
+        Type::getIntNTy(PtrVal->getContext(), LoadVT.getScalarSizeInBits());
+    if (LoadVT.isVector())
+      LoadTy = VectorType::get(LoadTy, LoadVT.getVectorNumElements());
+
     LoadInput = ConstantExpr::getBitCast(const_cast<Constant *>(LoadInput),
                                          PointerType::getUnqual(LoadTy));
 
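The constant-folding path above is what fires when one side of the comparison is constant data, e.g. a string literal. As a source-level sketch (names invented for illustration):

#include <cstring>

// When one memcmp operand is a constant, getMemCmpLoad folds that side's
// load, so the lowered compare reads the constant data directly (compare
// the length16_const test below, where pcmpeqb takes a RIP-relative operand).
bool hasMagicHeader(const char *p) {
  return std::memcmp(p, "0123456789abcdef", 16) == 0;
}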
@@ -6039,57 +6043,64 @@ bool SelectionDAGBuilder::visitMemCmpCall(const CallInst &I) {
   if (!CSize || !IsOnlyUsedInZeroEqualityComparison(&I))
     return false;
 
+  // If the target has a fast compare for the given size, it will return a
+  // preferred load type for that size. Require that the load VT is legal and
+  // that the target supports unaligned loads of that type. Otherwise, return
+  // INVALID.
+  auto hasFastLoadsAndCompare = [&](unsigned NumBits) {
+    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+    MVT LVT = TLI.hasFastEqualityCompare(NumBits);
+    if (LVT != MVT::INVALID_SIMPLE_VALUE_TYPE) {
+      // TODO: Handle 5 byte compare as 4-byte + 1 byte.
+      // TODO: Handle 8 byte compare on x86-32 as two 32-bit loads.
+      // TODO: Check alignment of src and dest ptrs.
+      unsigned DstAS = LHS->getType()->getPointerAddressSpace();
+      unsigned SrcAS = RHS->getType()->getPointerAddressSpace();
+      if (!TLI.isTypeLegal(LVT) ||
+          !TLI.allowsMisalignedMemoryAccesses(LVT, SrcAS) ||
+          !TLI.allowsMisalignedMemoryAccesses(LVT, DstAS))
+        LVT = MVT::INVALID_SIMPLE_VALUE_TYPE;
+    }
+
+    return LVT;
+  };
+
   // This turns into unaligned loads. We only do this if the target natively
   // supports the MVT we'll be loading or if it is small enough (<= 4) that
   // we'll only produce a small number of byte loads.
   MVT LoadVT;
-  Type *LoadTy;
   switch (CSize->getZExtValue()) {
   default:
     return false;
   case 2:
     LoadVT = MVT::i16;
-    LoadTy = Type::getInt16Ty(CSize->getContext());
     break;
   case 4:
     LoadVT = MVT::i32;
-    LoadTy = Type::getInt32Ty(CSize->getContext());
     break;
   case 8:
-    LoadVT = MVT::i64;
-    LoadTy = Type::getInt64Ty(CSize->getContext());
+    LoadVT = hasFastLoadsAndCompare(64);
     break;
-  /*
   case 16:
-    LoadVT = MVT::v4i32;
-    LoadTy = Type::getInt32Ty(CSize->getContext());
-    LoadTy = VectorType::get(LoadTy, 4);
+    LoadVT = hasFastLoadsAndCompare(128);
     break;
-  */
   }
 
-  // Require that we can find a legal MVT, and only do this if the target
-  // supports unaligned loads of that type. Expanding into byte loads would
-  // bloat the code.
-  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
-  if (CSize->getZExtValue() > 4) {
-    unsigned DstAS = LHS->getType()->getPointerAddressSpace();
-    unsigned SrcAS = RHS->getType()->getPointerAddressSpace();
-    // TODO: Handle 5 byte compare as 4-byte + 1 byte.
-    // TODO: Handle 8 byte compare on x86-32 as two 32-bit loads.
-    // TODO: Check alignment of src and dest ptrs.
-    if (!TLI.isTypeLegal(LoadVT) ||
-        !TLI.allowsMisalignedMemoryAccesses(LoadVT, SrcAS) ||
-        !TLI.allowsMisalignedMemoryAccesses(LoadVT, DstAS))
-      return false;
-  }
+  if (LoadVT == MVT::INVALID_SIMPLE_VALUE_TYPE)
+    return false;
 
-  SDValue LHSVal = getMemCmpLoad(LHS, LoadVT, LoadTy, *this);
-  SDValue RHSVal = getMemCmpLoad(RHS, LoadVT, LoadTy, *this);
-  SDValue SetCC =
-      DAG.getSetCC(getCurSDLoc(), MVT::i1, LHSVal, RHSVal, ISD::SETNE);
-  processIntegerCallValue(I, SetCC, false);
+  SDValue LoadL = getMemCmpLoad(LHS, LoadVT, *this);
+  SDValue LoadR = getMemCmpLoad(RHS, LoadVT, *this);
+
+  // Bitcast to a wide integer type if the loads are vectors.
+  if (LoadVT.isVector()) {
+    EVT CmpVT = EVT::getIntegerVT(LHS->getContext(), LoadVT.getSizeInBits());
+    LoadL = DAG.getBitcast(CmpVT, LoadL);
+    LoadR = DAG.getBitcast(CmpVT, LoadR);
+  }
+
+  SDValue Cmp = DAG.getSetCC(getCurSDLoc(), MVT::i1, LoadL, LoadR, ISD::SETNE);
+  processIntegerCallValue(I, Cmp, false);
   return true;
 }
 
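For intuition about what the expansion computes, here is a portable C++ mimic of the 8-byte case (a sketch, assuming memcpy is used to express a possibly-unaligned load): one wide load per side and a single integer compare replace the libcall.

#include <cstdint>
#include <cstring>

// Mimic of the inlined form of memcmp(x, y, 8) == 0: two possibly-unaligned
// 64-bit loads and one compare. std::memcpy is the portable spelling of an
// unaligned load; compilers lower it to a single load instruction here.
bool equal8(const void *x, const void *y) {
  std::uint64_t a, b;
  std::memcpy(&a, x, sizeof(a));
  std::memcpy(&b, y, sizeof(b));
  return a == b;
}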
@@ -4637,6 +4637,22 @@ bool X86TargetLowering::hasAndNotCompare(SDValue Y) const {
   return true;
 }
 
+MVT X86TargetLowering::hasFastEqualityCompare(unsigned NumBits) const {
+  MVT VT = MVT::getIntegerVT(NumBits);
+  if (isTypeLegal(VT))
+    return VT;
+
+  // PMOVMSKB can handle this.
+  if (NumBits == 128 && isTypeLegal(MVT::v16i8))
+    return MVT::v16i8;
+
+  // TODO: Allow 64-bit type for 32-bit target.
+  // TODO: 256- and 512-bit types should be allowed, but make sure that those
+  // cases are handled in combineVectorSizedSetCCEquality().
+
+  return MVT::INVALID_SIMPLE_VALUE_TYPE;
+}
+
 /// Val is the undef sentinel value or equal to the specified value.
 static bool isUndefOrEqual(int Val, int CmpVal) {
   return ((Val == SM_SentinelUndef) || (Val == CmpVal));
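The 128-bit case enabled above is the classic SSE2 equality idiom. As a sketch in intrinsics (this mirrors the movdqu/pcmpeqb/pmovmskb sequence checked in the tests below; function name invented):

#include <emmintrin.h> // SSE2

// 16-byte equality via PCMPEQB + PMOVMSKB: compare all bytes in one shot,
// then require every mask bit to be set (0xFFFF) for full equality.
bool equal16_sse2(const void *x, const void *y) {
  __m128i a = _mm_loadu_si128(static_cast<const __m128i *>(x)); // movdqu
  __m128i b = _mm_loadu_si128(static_cast<const __m128i *>(y)); // movdqu
  __m128i eq = _mm_cmpeq_epi8(a, b);                            // pcmpeqb
  return _mm_movemask_epi8(eq) == 0xFFFF;                       // pmovmskb + cmpl
}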
@@ -815,6 +815,9 @@ namespace llvm {
 
     bool hasAndNotCompare(SDValue Y) const override;
 
+    /// Vector-sized comparisons are fast using PCMPEQ + PMOVMSK or PTEST.
+    MVT hasFastEqualityCompare(unsigned NumBits) const override;
+
     /// Return the value type to use for ISD::SETCC.
     EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context,
                            EVT VT) const override;
@@ -179,12 +179,12 @@ define i1 @length16(i8* %x, i8* %y) nounwind {
 ;
 ; X64-LABEL: length16:
 ; X64:       # BB#0:
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    movl $16, %edx
-; X64-NEXT:    callq memcmp
-; X64-NEXT:    testl %eax, %eax
+; X64-NEXT:    movdqu (%rsi), %xmm0
+; X64-NEXT:    movdqu (%rdi), %xmm1
+; X64-NEXT:    pcmpeqb %xmm0, %xmm1
+; X64-NEXT:    pmovmskb %xmm1, %eax
+; X64-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
 ; X64-NEXT:    setne %al
-; X64-NEXT:    popq %rcx
 ; X64-NEXT:    retq
   %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 16) nounwind
   %cmp = icmp ne i32 %call, 0
@@ -206,13 +206,11 @@ define i1 @length16_const(i8* %X, i32* nocapture %P) nounwind {
 ;
 ; X64-LABEL: length16_const:
 ; X64:       # BB#0:
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    movl $.L.str, %esi
-; X64-NEXT:    movl $16, %edx
-; X64-NEXT:    callq memcmp
-; X64-NEXT:    testl %eax, %eax
+; X64-NEXT:    movdqu (%rdi), %xmm0
+; X64-NEXT:    pcmpeqb {{.*}}(%rip), %xmm0
+; X64-NEXT:    pmovmskb %xmm0, %eax
+; X64-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
 ; X64-NEXT:    sete %al
-; X64-NEXT:    popq %rcx
 ; X64-NEXT:    retq
   %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 0), i64 16) nounwind
   %c = icmp eq i32 %m, 0