[AMDGPU] Constant fold Intrinsic::amdgcn_perm

Differential Revision: https://reviews.llvm.org/D102203
This commit is contained in:
Stanislav Mekhanoshin 2021-05-10 15:42:47 -07:00
parent 0077dce361
commit 22d295f695
2 changed files with 146 additions and 0 deletions

View File

@ -1470,6 +1470,7 @@ bool llvm::canConstantFoldCallTo(const CallBase *Call, const Function *F) {
case Intrinsic::vector_reduce_umin:
case Intrinsic::vector_reduce_umax:
// Target intrinsics
case Intrinsic::amdgcn_perm:
case Intrinsic::arm_mve_vctp8:
case Intrinsic::arm_mve_vctp16:
case Intrinsic::arm_mve_vctp32:
@ -2702,6 +2703,46 @@ static APFloat ConstantFoldAMDGCNCubeIntrinsic(Intrinsic::ID IntrinsicID,
}
}
/// Try to constant fold a call to llvm.amdgcn.perm.
///
/// Operands[0] and Operands[1] are the two data sources and Operands[2] is the
/// selector. The result is assembled as a 32-bit value, one byte per selector
/// byte (selector byte N controls result byte N):
///   - 0..3   : byte (Sel & 3) of Operands[1]
///   - 4..7   : byte (Sel & 3) of Operands[0]
///   - 8, 9   : bit 15 / bit 31 of Operands[1] replicated across the byte
///   - 10, 11 : bit 15 / bit 31 of Operands[0] replicated across the byte
///   - 12     : 0x00
///   - >= 13  : 0xff
///
/// \returns nullptr when getConstIntOrUndef rejects any operand, an undef of
/// type \p Ty when the selector is undef or when all four result bytes would
/// be read from an undef source, and otherwise the folded integer constant.
/// When only some bytes come from an undef source, those bytes are folded
/// to 0.
///
/// NOTE(review): getConstIntOrUndef is defined outside this view; a null
/// APInt pointer after a successful call is treated here as "operand is
/// undef" - confirm against its definition.
static Constant *ConstantFoldAMDGCNPermIntrinsic(ArrayRef<Constant *> Operands,
                                                 Type *Ty) {
  const APInt *Src0;
  const APInt *Src1;
  const APInt *Selector;
  // Operands are examined in order and the check stops at the first one that
  // is rejected.
  const bool AllAccepted = getConstIntOrUndef(Operands[0], Src0) &&
                           getConstIntOrUndef(Operands[1], Src1) &&
                           getConstIntOrUndef(Operands[2], Selector);
  if (!AllAccepted)
    return nullptr;

  // With no usable selector nothing is known about any result byte.
  if (!Selector)
    return UndefValue::get(Ty);

  APInt Folded(32, 0);
  unsigned UndefByteCount = 0;
  for (unsigned ByteIdx = 0; ByteIdx != 4; ++ByteIdx) {
    const unsigned BitPos = ByteIdx * 8;
    const unsigned Sel = Selector->extractBitsAsZExtValue(8, BitPos);
    unsigned Byte = 0;

    if (Sel == 12) {
      // Constant zero byte; Byte is already 0.
    } else if (Sel > 12) {
      // Constant all-ones byte.
      Byte = 0xff;
    } else {
      // Selectors 4-7 and 10-11 read from the first operand, the remaining
      // ones (0-3, 8-9) from the second.
      const bool ReadsFirst = (Sel & 10) == 10 || (Sel & 12) == 4;
      const APInt *Chosen = ReadsFirst ? Src0 : Src1;
      if (!Chosen) {
        // Source is undef: leave the byte as 0 but remember it, so that an
        // entirely-undef result can be reported as undef below.
        ++UndefByteCount;
      } else if (Sel >= 8) {
        // Replicate the sign bit of the low (bit 15) or high (bit 31) half.
        const unsigned SignBitPos = (Sel & 1) ? 31 : 15;
        Byte = Chosen->extractBitsAsZExtValue(1, SignBitPos) * 0xff;
      } else {
        // Plain byte extraction.
        Byte = Chosen->extractBitsAsZExtValue(8, (Sel & 3) * 8);
      }
    }

    Folded.insertBits(Byte, BitPos, 8);
  }

  if (UndefByteCount == 4)
    return UndefValue::get(Ty);

  return ConstantInt::get(Ty, Folded);
}
static Constant *ConstantFoldScalarCall3(StringRef Name,
Intrinsic::ID IntrinsicID,
Type *Ty,
@ -2817,6 +2858,9 @@ static Constant *ConstantFoldScalarCall3(StringRef Name,
return ConstantInt::get(Ty, C0->shl(ShlAmt) | C1->lshr(LshrAmt));
}
if (IntrinsicID == Intrinsic::amdgcn_perm)
return ConstantFoldAMDGCNPermIntrinsic(Operands, Ty);
return nullptr;
}

View File

@ -0,0 +1,102 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -instsimplify -S | FileCheck %s
declare i32 @llvm.amdgcn.perm(i32, i32, i32)
; src1 = 0x19203a4b (421542475), src2 = 0x5c6d7e8f (1550679695)
; Exercises constant folding of llvm.amdgcn.perm: every call below has only
; constant or undef operands, and the autogenerated assertions expect each
; volatile store to receive the folded constant (or undef) directly.
;
; Value names encode the operands: s1/s2 mean the first/second source is the
; constant 421542475 / 1550679695, u1/u2 mean that source is undef, and the
; hex suffix is the selector (third operand). The last three calls spell out
; both sources and the selector in hex.
define void @test(i32* %p) {
; CHECK-LABEL: @test(
; CHECK-NEXT: store volatile i32 undef, i32* [[P:%.*]], align 4
; CHECK-NEXT: store volatile i32 -1887539876, i32* [[P]], align 4
; CHECK-NEXT: store volatile i32 2121096267, i32* [[P]], align 4
; CHECK-NEXT: store volatile i32 1262100505, i32* [[P]], align 4
; CHECK-NEXT: store volatile i32 1550679695, i32* [[P]], align 4
; CHECK-NEXT: store volatile i32 421542475, i32* [[P]], align 4
; CHECK-NEXT: store volatile i32 545143439, i32* [[P]], align 4
; CHECK-NEXT: store volatile i32 16711935, i32* [[P]], align 4
; CHECK-NEXT: store volatile i32 16711935, i32* [[P]], align 4
; CHECK-NEXT: store volatile i32 436174336, i32* [[P]], align 4
; CHECK-NEXT: store volatile i32 16711680, i32* [[P]], align 4
; CHECK-NEXT: store volatile i32 -1, i32* [[P]], align 4
; CHECK-NEXT: store volatile i32 -1, i32* [[P]], align 4
; CHECK-NEXT: store volatile i32 -1, i32* [[P]], align 4
; CHECK-NEXT: store volatile i32 undef, i32* [[P]], align 4
; CHECK-NEXT: store volatile i32 421542475, i32* [[P]], align 4
; CHECK-NEXT: store volatile i32 1550679695, i32* [[P]], align 4
; CHECK-NEXT: store volatile i32 undef, i32* [[P]], align 4
; CHECK-NEXT: store volatile i32 143, i32* [[P]], align 4
; CHECK-NEXT: store volatile i32 0, i32* [[P]], align 4
; CHECK-NEXT: store volatile i32 255, i32* [[P]], align 4
; CHECK-NEXT: store volatile i32 1550679552, i32* [[P]], align 4
; CHECK-NEXT: store volatile i32 75, i32* [[P]], align 4
; CHECK-NEXT: store volatile i32 0, i32* [[P]], align 4
; CHECK-NEXT: store volatile i32 255, i32* [[P]], align 4
; CHECK-NEXT: store volatile i32 65535, i32* [[P]], align 4
; CHECK-NEXT: store volatile i32 421542400, i32* [[P]], align 4
; CHECK-NEXT: store volatile i32 -16776961, i32* [[P]], align 4
; CHECK-NEXT: store volatile i32 255, i32* [[P]], align 4
; CHECK-NEXT: store volatile i32 -16777216, i32* [[P]], align 4
; CHECK-NEXT: ret void
;
; Undef selector: the whole result is expected to be undef.
%s1s2_u = call i32 @llvm.amdgcn.perm(i32 421542475, i32 1550679695, i32 undef)
store volatile i32 %s1s2_u, i32* %p
; Both sources constant, every selector byte in 0x00-0x07 (plain byte picks).
%s1s2_0x00010203 = call i32 @llvm.amdgcn.perm(i32 421542475, i32 1550679695, i32 66051)
store volatile i32 %s1s2_0x00010203, i32* %p
%s1s2_0x01020304 = call i32 @llvm.amdgcn.perm(i32 421542475, i32 1550679695, i32 16909060)
store volatile i32 %s1s2_0x01020304, i32* %p
%s1s2_0x04050607 = call i32 @llvm.amdgcn.perm(i32 421542475, i32 1550679695, i32 67438087)
store volatile i32 %s1s2_0x04050607, i32* %p
%s1s2_0x03020100 = call i32 @llvm.amdgcn.perm(i32 421542475, i32 1550679695, i32 50462976)
store volatile i32 %s1s2_0x03020100, i32* %p
%s1s2_0x07060504 = call i32 @llvm.amdgcn.perm(i32 421542475, i32 1550679695, i32 117835012)
store volatile i32 %s1s2_0x07060504, i32* %p
%s1s2_0x06010500 = call i32 @llvm.amdgcn.perm(i32 421542475, i32 1550679695, i32 100730112)
store volatile i32 %s1s2_0x06010500, i32* %p
; Selector bytes 0x0c and 0x0f only: the expected result (0x00ff00ff) is the
; same whether the sources are constant or undef.
%s1s2_0x0c0f0c0f = call i32 @llvm.amdgcn.perm(i32 421542475, i32 1550679695, i32 202312719)
store volatile i32 %s1s2_0x0c0f0c0f, i32* %p
%u1u2_0x0c0f0c0f = call i32 @llvm.amdgcn.perm(i32 undef, i32 undef, i32 202312719)
store volatile i32 %u1u2_0x0c0f0c0f, i32* %p
; Mix of source-byte selectors (0x07, 0x01) and constant selectors (0x0d, 0x0c),
; first with constant sources and then with undef sources.
%s1s2_0x070d010c = call i32 @llvm.amdgcn.perm(i32 421542475, i32 1550679695, i32 118292748)
store volatile i32 %s1s2_0x070d010c, i32* %p
%u1u2_0x070d010c = call i32 @llvm.amdgcn.perm(i32 undef, i32 undef, i32 118292748)
store volatile i32 %u1u2_0x070d010c, i32* %p
; Selector bytes of 0x0e and above: the expected result is all ones (-1),
; with constant or undef sources.
%s1s2_0x80818283 = call i32 @llvm.amdgcn.perm(i32 421542475, i32 1550679695, i32 2155971203)
store volatile i32 %s1s2_0x80818283, i32* %p
%u1u2_0x80818283 = call i32 @llvm.amdgcn.perm(i32 undef, i32 undef, i32 2155971203)
store volatile i32 %u1u2_0x80818283, i32* %p
%u1u2_0x0e0e0e0e = call i32 @llvm.amdgcn.perm(i32 undef, i32 undef, i32 235802126)
store volatile i32 %u1u2_0x0e0e0e0e, i32* %p
; Exactly one source undef. Per the expected values above, undef is produced
; only when all four result bytes are taken from the undef source; otherwise
; the bytes taken from the undef source appear as zero in the folded constant.
%u1s2_0x07060504 = call i32 @llvm.amdgcn.perm(i32 undef, i32 1550679695, i32 117835012)
store volatile i32 %u1s2_0x07060504, i32* %p
%s1u2_0x07060504 = call i32 @llvm.amdgcn.perm(i32 421542475, i32 undef, i32 117835012)
store volatile i32 %s1u2_0x07060504, i32* %p
%u1s2_0x03020100 = call i32 @llvm.amdgcn.perm(i32 undef, i32 1550679695, i32 50462976)
store volatile i32 %u1s2_0x03020100, i32* %p
%s1u2_0x03020100 = call i32 @llvm.amdgcn.perm(i32 421542475, i32 undef, i32 50462976)
store volatile i32 %s1u2_0x03020100, i32* %p
%u1s2_0x07060500 = call i32 @llvm.amdgcn.perm(i32 undef, i32 1550679695, i32 117835008)
store volatile i32 %u1s2_0x07060500, i32* %p
%u1s2_0x0706050c = call i32 @llvm.amdgcn.perm(i32 undef, i32 1550679695, i32 117835020)
store volatile i32 %u1s2_0x0706050c, i32* %p
%u1s2_0x0706050d = call i32 @llvm.amdgcn.perm(i32 undef, i32 1550679695, i32 117835021)
store volatile i32 %u1s2_0x0706050d, i32* %p
%u1s2_0x03020104 = call i32 @llvm.amdgcn.perm(i32 undef, i32 1550679695, i32 50462980)
store volatile i32 %u1s2_0x03020104, i32* %p
%s1u2_0x03020104 = call i32 @llvm.amdgcn.perm(i32 421542475, i32 undef, i32 50462980)
store volatile i32 %s1u2_0x03020104, i32* %p
%s1u2_0x0302010c = call i32 @llvm.amdgcn.perm(i32 421542475, i32 undef, i32 50462988)
store volatile i32 %s1u2_0x0302010c, i32* %p
%s1u2_0x0302010e = call i32 @llvm.amdgcn.perm(i32 421542475, i32 undef, i32 50462990)
store volatile i32 %s1u2_0x0302010e, i32* %p
%s1u2_0x03020f0e = call i32 @llvm.amdgcn.perm(i32 421542475, i32 undef, i32 50466574)
store volatile i32 %s1u2_0x03020f0e, i32* %p
%s1u2_0x07060500 = call i32 @llvm.amdgcn.perm(i32 421542475, i32 undef, i32 117835008)
store volatile i32 %s1u2_0x07060500, i32* %p
; Selector bytes 0x08-0x0b with sources 0x81000100 / 0x01008100: first both
; sources constant, then each source replaced by undef in turn.
%_0x81000100_0x01008100_0x0b0a0908 = call i32 @llvm.amdgcn.perm(i32 2164261120, i32 16810240, i32 185207048)
store volatile i32 %_0x81000100_0x01008100_0x0b0a0908, i32* %p
%_u1_0x01008100_0x0b0a0908 = call i32 @llvm.amdgcn.perm(i32 undef, i32 16810240, i32 185207048)
store volatile i32 %_u1_0x01008100_0x0b0a0908, i32* %p
%_0x81000100_u2_0x0b0a0908 = call i32 @llvm.amdgcn.perm(i32 2164261120, i32 undef, i32 185207048)
store volatile i32 %_0x81000100_u2_0x0b0a0908, i32* %p
ret void
}