[instcombine][x86] Converted pdep/pext with shifted mask to simple arithmetic
If the mask of a pdep or pext instruction is a shifted mask (i.e. one contiguous block of ones), we need at most one AND and one shift to represent the operation without the intrinsic. On all platforms I know of, this is faster than the pdep/pext. Cost modelling for multiple contiguous blocks might be worth exploring in a follow-up, but it's not relevant for my current use case. It would almost certainly be a win on AMD, where these instructions are really slow, though. Differential Revision: https://reviews.llvm.org/D87861
parent 7c10129f5a
commit 06f136f61e
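To make the identity behind the fold concrete, here is a small standalone C++ sketch (an editorial illustration, not part of the patch; naive_pext/naive_pdep are hypothetical bit-by-bit reference implementations). For a mask that is one contiguous run of ones starting at bit tz, pext(x, m) equals (x & m) >> tz and pdep(x, m) equals (x << tz) & m:

// Editorial illustration: check the shifted-mask identities the fold relies on.
#include <cassert>
#include <cstdint>

// Bit-by-bit PEXT: gather the bits of x selected by mask into the low bits.
static uint32_t naive_pext(uint32_t x, uint32_t mask) {
  uint32_t result = 0;
  for (uint32_t bit = 1, out = 1; bit != 0; bit <<= 1) {
    if (mask & bit) {
      if (x & bit)
        result |= out;
      out <<= 1;
    }
  }
  return result;
}

// Bit-by-bit PDEP: scatter the low bits of x into the positions set in mask.
static uint32_t naive_pdep(uint32_t x, uint32_t mask) {
  uint32_t result = 0;
  uint32_t src = 1;
  for (uint32_t bit = 1; bit != 0; bit <<= 1) {
    if (mask & bit) {
      if (x & src)
        result |= bit;
      src <<= 1;
    }
  }
  return result;
}

int main() {
  const uint32_t mask = 0b0111000; // one contiguous block of ones
  const unsigned tz = 3;           // countTrailingZeros(mask)
  for (uint32_t x = 0; x < 1024; ++x) {
    assert(naive_pext(x, mask) == ((x & mask) >> tz)); // pext -> and + lshr
    assert(naive_pdep(x, mask) == ((x << tz) & mask)); // pdep -> shl + and
  }
  return 0;
}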
@@ -999,6 +999,20 @@ X86TTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
        return IC.replaceInstUsesWith(II, II.getArgOperand(0));
      }

      if (MaskC->getValue().isShiftedMask()) {
        // Any single contiguous sequence of 1s anywhere in the mask simply
        // describes a subset of the input bits shifted to the appropriate
        // position.  Replace with the straightforward IR.
        unsigned ShiftAmount = MaskC->getValue().countTrailingZeros();
        Value *Input = II.getArgOperand(0);
        Value *Masked = IC.Builder.CreateAnd(Input, II.getArgOperand(1));
        Value *Shifted = IC.Builder.CreateLShr(Masked,
                                               ConstantInt::get(II.getType(),
                                                                ShiftAmount));
        return IC.replaceInstUsesWith(II, Shifted);
      }

      if (auto *SrcC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
        uint64_t Src = SrcC->getZExtValue();
        uint64_t Mask = MaskC->getZExtValue();
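A concrete, made-up instance of the pext lowering above, traced as a minimal C++ sketch: for mask 0b0011100, countTrailingZeros is 2, so the intrinsic reduces to the and/lshr pair the code emits.

// Editorial trace of the pext fold for a sample shifted mask; values are illustrative.
#include <cstdint>

constexpr uint32_t Mask    = 0b0011100;    // contiguous ones, countTrailingZeros = 2
constexpr uint32_t Input   = 0b1101101;
constexpr uint32_t Masked  = Input & Mask; // 0b0001100  (CreateAnd)
constexpr uint32_t Shifted = Masked >> 2;  // 0b0000011  (CreateLShr by ShiftAmount)
static_assert(Shifted == 0b011, "bits 2..4 of Input gathered into the low bits");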
@@ -1030,6 +1044,18 @@ X86TTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
      if (MaskC->isAllOnesValue()) {
        return IC.replaceInstUsesWith(II, II.getArgOperand(0));
      }
      if (MaskC->getValue().isShiftedMask()) {
        // Any single contiguous sequence of 1s anywhere in the mask simply
        // describes a subset of the input bits shifted to the appropriate
        // position.  Replace with the straightforward IR.
        unsigned ShiftAmount = MaskC->getValue().countTrailingZeros();
        Value *Input = II.getArgOperand(0);
        Value *Shifted = IC.Builder.CreateShl(Input,
                                              ConstantInt::get(II.getType(),
                                                               ShiftAmount));
        Value *Masked = IC.Builder.CreateAnd(Shifted, II.getArgOperand(1));
        return IC.replaceInstUsesWith(II, Masked);
      }

      if (auto *SrcC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
        uint64_t Src = SrcC->getZExtValue();
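The pdep side is the mirror image; a matching sketch (again with illustrative values only) shows the shl/and pair the code emits for the same mask:

// Editorial trace of the pdep fold for a sample shifted mask; values are illustrative.
#include <cstdint>

constexpr uint32_t Mask    = 0b0011100;      // contiguous ones, countTrailingZeros = 2
constexpr uint32_t Input   = 0b0000110;      // low bits to scatter
constexpr uint32_t Shifted = Input << 2;     // 0b0011000  (CreateShl by ShiftAmount)
constexpr uint32_t Masked  = Shifted & Mask; // 0b0011000  (CreateAnd)
static_assert(Masked == 0b0011000, "low bits of Input deposited at the mask position");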
@@ -306,6 +306,27 @@ define i64 @test_x86_pext_64_allones_mask(i64 %x) nounwind readnone {
  ret i64 %1
}

define i32 @test_x86_pext_32_shifted_mask(i32 %x) nounwind readnone {
; CHECK-LABEL: @test_x86_pext_32_shifted_mask(
; CHECK-NEXT: %1 = lshr i32 %x, 1
; CHECK-NEXT: %2 = and i32 %1, 3
; CHECK-NEXT: ret i32 %2
;
  %1 = tail call i32 @llvm.x86.bmi.pext.32(i32 %x, i32 6)
  ret i32 %1
}

define i64 @test_x86_pext_64_shifted_mask(i64 %x) nounwind readnone {
; CHECK-LABEL: @test_x86_pext_64_shifted_mask(
; CHECK-NEXT: %1 = lshr i64 %x, 1
; CHECK-NEXT: %2 = and i64 %1, 3
; CHECK-NEXT: ret i64 %2
;
  %1 = tail call i64 @llvm.x86.bmi.pext.64(i64 %x, i64 6)
  ret i64 %1
}

define i32 @test_x86_pext_32_constant_fold() nounwind readnone {
; CHECK-LABEL: @test_x86_pext_32_constant_fold(
; CHECK-NEXT: ret i32 30001
@@ -370,6 +391,27 @@ define i64 @test_x86_pdep_64_allones_mask(i64 %x) nounwind readnone {
  ret i64 %1
}

define i32 @test_x86_pdep_32_shifted_mask(i32 %x) nounwind readnone {
; CHECK-LABEL: @test_x86_pdep_32_shifted_mask(
; CHECK-NEXT: %1 = shl i32 %x, 2
; CHECK-NEXT: %2 = and i32 %1, 12
; CHECK-NEXT: ret i32 %2
;
  %1 = tail call i32 @llvm.x86.bmi.pdep.32(i32 %x, i32 12)
  ret i32 %1
}

define i64 @test_x86_pdep_64_shifted_mask(i64 %x) nounwind readnone {
; CHECK-LABEL: @test_x86_pdep_64_shifted_mask(
; CHECK-NEXT: %1 = shl i64 %x, 2
; CHECK-NEXT: %2 = and i64 %1, 12
; CHECK-NEXT: ret i64 %2
;
  %1 = tail call i64 @llvm.x86.bmi.pdep.64(i64 %x, i64 12)
  ret i64 %1
}

define i32 @test_x86_pdep_32_constant_fold() nounwind readnone {
; CHECK-LABEL: @test_x86_pdep_32_constant_fold(
; CHECK-NEXT: ret i32 807407616