[instcombine][x86] Converted pdep/pext with shifted mask to simple arithmetic

If the mask of a pdep or pext instruction is a shifted mask (i.e. one contiguous block of ones), we need at most one 'and' and one shift to represent the operation without the intrinsic. On all platforms I know of, this is faster than the pdep/pext.
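
As a quick sanity check of that claim, here is a small standalone C++ sketch (my illustration, not part of the patch) of the identity being exploited: for a mask m that is one contiguous block of ones with tz trailing zeros, pext(x, m) == (x & m) >> tz and pdep(x, m) == (x << tz) & m. The pext_ref/pdep_ref helpers are hypothetical portable references, not LLVM or BMI2 code.

#include <cassert>
#include <cstdint>

// Portable reference pext: gather the bits of x selected by mask into the
// low bits of the result.
static uint32_t pext_ref(uint32_t x, uint32_t mask) {
  uint32_t result = 0, out = 1;
  for (uint32_t bit = 1; bit; bit <<= 1) {
    if (mask & bit) {
      if (x & bit)
        result |= out;
      out <<= 1;
    }
  }
  return result;
}

// Portable reference pdep: scatter the low bits of x into the bit positions
// selected by mask.
static uint32_t pdep_ref(uint32_t x, uint32_t mask) {
  uint32_t result = 0, in = 1;
  for (uint32_t bit = 1; bit; bit <<= 1) {
    if (mask & bit) {
      if (x & in)
        result |= bit;
      in <<= 1;
    }
  }
  return result;
}

int main() {
  const uint32_t Mask = 0x0ff0; // one contiguous block of ones, 4 trailing zeros
  for (uint32_t x = 0; x < (1u << 16); ++x) {
    assert(pext_ref(x, Mask) == ((x & Mask) >> 4)); // pext -> and + lshr
    assert(pdep_ref(x, Mask) == ((x << 4) & Mask)); // pdep -> shl + and
  }
  return 0;
}

On BMI2 hardware the same loop could be cross-checked against _pext_u32/_pdep_u32 from <immintrin.h>; the portable references just keep the sketch self-contained.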

The cost modelling for multiple contiguous blocks might be worth exploring in a follow-up, but it's not relevant for my current use case. It would almost certainly be a win on AMD, though, where these instructions are really slow.

Differential Revision: https://reviews.llvm.org/D87861
Philip Reames 2020-09-18 14:53:29 -07:00
parent 7c10129f5a
commit 06f136f61e
2 changed files with 68 additions and 0 deletions


@@ -999,6 +999,20 @@ X86TTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
return IC.replaceInstUsesWith(II, II.getArgOperand(0));
}
if (MaskC->getValue().isShiftedMask()) {
// Any single contiguous sequence of 1s anywhere in the mask simply
// describes a subset of the input bits shifted to the appropriate
// position. Replace with the straightforward IR.
unsigned ShiftAmount = MaskC->getValue().countTrailingZeros();
Value *Input = II.getArgOperand(0);
Value *Masked = IC.Builder.CreateAnd(Input, II.getArgOperand(1));
Value *Shifted = IC.Builder.CreateLShr(Masked, ConstantInt::get(II.getType(), ShiftAmount));
return IC.replaceInstUsesWith(II, Shifted);
}
if (auto *SrcC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
uint64_t Src = SrcC->getZExtValue();
uint64_t Mask = MaskC->getZExtValue();
@@ -1030,6 +1044,18 @@ X86TTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
if (MaskC->isAllOnesValue()) {
return IC.replaceInstUsesWith(II, II.getArgOperand(0));
}
if (MaskC->getValue().isShiftedMask()) {
// Any single contiguous sequence of 1s anywhere in the mask simply
// describes a subset of the input bits shifted to the appropriate
// position. Replace with the straightforward IR.
unsigned ShiftAmount = MaskC->getValue().countTrailingZeros();
Value *Input = II.getArgOperand(0);
Value *Shifted = IC.Builder.CreateShl(Input, ConstantInt::get(II.getType(), ShiftAmount));
Value *Masked = IC.Builder.CreateAnd(Shifted, II.getArgOperand(1));
return IC.replaceInstUsesWith(II, Masked);
}
if (auto *SrcC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
uint64_t Src = SrcC->getZExtValue();


@@ -306,6 +306,27 @@ define i64 @test_x86_pext_64_allones_mask(i64 %x) nounwind readnone {
ret i64 %1
}
define i32 @test_x86_pext_32_shifted_mask(i32 %x) nounwind readnone {
; CHECK-LABEL: @test_x86_pext_32_shifted_mask(
; CHECK-NEXT: %1 = lshr i32 %x, 1
; CHECK-NEXT: %2 = and i32 %1, 3
; CHECK-NEXT: ret i32 %2
;
%1 = tail call i32 @llvm.x86.bmi.pext.32(i32 %x, i32 6)
ret i32 %1
}
define i64 @test_x86_pext_64_shifted_mask(i64 %x) nounwind readnone {
; CHECK-LABEL: @test_x86_pext_64_shifted_mask(
; CHECK-NEXT: %1 = lshr i64 %x, 1
; CHECK-NEXT: %2 = and i64 %1, 3
; CHECK-NEXT: ret i64 %2
;
%1 = tail call i64 @llvm.x86.bmi.pext.64(i64 %x, i64 6)
ret i64 %1
}
define i32 @test_x86_pext_32_constant_fold() nounwind readnone {
; CHECK-LABEL: @test_x86_pext_32_constant_fold(
; CHECK-NEXT: ret i32 30001
@@ -370,6 +391,27 @@ define i64 @test_x86_pdep_64_allones_mask(i64 %x) nounwind readnone {
ret i64 %1
}
define i32 @test_x86_pdep_32_shifted_mask(i32 %x) nounwind readnone {
; CHECK-LABEL: @test_x86_pdep_32_shifted_mask(
; CHECK-NEXT: %1 = shl i32 %x, 2
; CHECK-NEXT: %2 = and i32 %1, 12
; CHECK-NEXT: ret i32 %2
;
%1 = tail call i32 @llvm.x86.bmi.pdep.32(i32 %x, i32 12)
ret i32 %1
}
define i64 @test_x86_pdep_64_shifted_mask(i64 %x) nounwind readnone {
; CHECK-LABEL: @test_x86_pdep_64_shifted_mask(
; CHECK-NEXT: %1 = shl i64 %x, 2
; CHECK-NEXT: %2 = and i64 %1, 12
; CHECK-NEXT: ret i64 %2
;
%1 = tail call i64 @llvm.x86.bmi.pdep.64(i64 %x, i64 12)
ret i64 %1
}
define i32 @test_x86_pdep_32_constant_fold() nounwind readnone {
; CHECK-LABEL: @test_x86_pdep_32_constant_fold(
; CHECK-NEXT: ret i32 807407616