GlobalISel: Constant fold G_PTR_ADD

Some globals lower to literal addresses on AMDGPU.

This may be wrong for non-integral address spaces. I'm wondering if we
should just allow regular G_ADD to use pointer types, and reserve
G_PTR_ADD for non-integral address spaces.
This commit is contained in:
Matt Arsenault 2020-08-15 12:07:29 -04:00
parent bb8be26a7e
commit 5af0f097ba
8 changed files with 261 additions and 212 deletions

View File

@ -174,6 +174,7 @@ MachineInstrBuilder CSEMIRBuilder::buildInstr(unsigned Opc,
default:
break;
case TargetOpcode::G_ADD:
case TargetOpcode::G_PTR_ADD:
case TargetOpcode::G_AND:
case TargetOpcode::G_ASHR:
case TargetOpcode::G_LSHR:
@ -193,7 +194,13 @@ MachineInstrBuilder CSEMIRBuilder::buildInstr(unsigned Opc,
// Try to constant fold these.
assert(SrcOps.size() == 2 && "Invalid sources");
assert(DstOps.size() == 1 && "Invalid dsts");
if (SrcOps[0].getLLTTy(*getMRI()).isVector()) {
LLT SrcTy = SrcOps[0].getLLTTy(*getMRI());
if (Opc == TargetOpcode::G_PTR_ADD &&
getDataLayout().isNonIntegralAddressSpace(SrcTy.getAddressSpace()))
break;
if (SrcTy.isVector()) {
// Try to constant fold vector constants.
Register VecCst = ConstantFoldVectorBinop(
Opc, SrcOps[0].getReg(), SrcOps[1].getReg(), *getMRI(), *this);
@ -201,6 +208,7 @@ MachineInstrBuilder CSEMIRBuilder::buildInstr(unsigned Opc,
return buildCopy(DstOps[0], VecCst);
break;
}
if (Optional<APInt> Cst = ConstantFoldBinOp(Opc, SrcOps[0].getReg(),
SrcOps[1].getReg(), *getMRI()))
return buildConstant(DstOps[0], *Cst);

View File

@ -500,6 +500,7 @@ Optional<APInt> llvm::ConstantFoldBinOp(unsigned Opcode, const Register Op1,
default:
break;
case TargetOpcode::G_ADD:
case TargetOpcode::G_PTR_ADD:
return C1 + C2;
case TargetOpcode::G_AND:
return C1 & C2;

View File

@ -342,12 +342,9 @@ body: |
; CHECK-NEXT: %reg:_(s32) = COPY $vgpr0
; CHECK-NEXT: %zero:_(s32) = G_CONSTANT i32 0
; CHECK-NEXT: %cond:_(s1) = G_ICMP intpred(eq), %reg(s32), %zero
; CHECK-NEXT: %ten:_(p3) = G_CONSTANT i32 10
; CHECK-NEXT: %twenty:_(p3) = G_CONSTANT i32 20
; CHECK-NEXT: %thirty:_(s32) = G_CONSTANT i32 30
; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD %ten, %thirty(s32)
; CHECK-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p3) = G_PTR_ADD %twenty, %thirty(s32)
; CHECK-NEXT: %ptr_add:_(p3) = G_SELECT %cond(s1), [[PTR_ADD]], [[PTR_ADD1]]
; CHECK-NEXT: [[C:%[0-9]+]]:_(p3) = G_CONSTANT i32 40
; CHECK-NEXT: [[C1:%[0-9]+]]:_(p3) = G_CONSTANT i32 50
; CHECK-NEXT: %ptr_add:_(p3) = G_SELECT %cond(s1), [[C]], [[C1]]
; CHECK-NEXT: S_ENDPGM 0, implicit %ptr_add(p3)
%reg:_(s32) = COPY $vgpr0
%zero:_(s32) = G_CONSTANT i32 0
@ -372,12 +369,9 @@ body: |
; CHECK-NEXT: %reg:_(s32) = COPY $vgpr0
; CHECK-NEXT: %zero:_(s32) = G_CONSTANT i32 0
; CHECK-NEXT: %cond:_(s1) = G_ICMP intpred(eq), %reg(s32), %zero
; CHECK-NEXT: %ten:_(s32) = G_CONSTANT i32 10
; CHECK-NEXT: %twenty:_(s32) = G_CONSTANT i32 20
; CHECK-NEXT: %thirty:_(p3) = G_CONSTANT i32 30
; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD %thirty, %ten(s32)
; CHECK-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p3) = G_PTR_ADD %thirty, %twenty(s32)
; CHECK-NEXT: %ptr_add:_(p3) = G_SELECT %cond(s1), [[PTR_ADD]], [[PTR_ADD1]]
; CHECK-NEXT: [[C:%[0-9]+]]:_(p3) = G_CONSTANT i32 40
; CHECK-NEXT: [[C1:%[0-9]+]]:_(p3) = G_CONSTANT i32 50
; CHECK-NEXT: %ptr_add:_(p3) = G_SELECT %cond(s1), [[C]], [[C1]]
; CHECK-NEXT: S_ENDPGM 0, implicit %ptr_add(p3)
%reg:_(s32) = COPY $vgpr0
%zero:_(s32) = G_CONSTANT i32 0

View File

@ -2550,13 +2550,13 @@ define amdgpu_ps void @insertelement_s_v16i16_s_s(<16 x i16> addrspace(4)* inreg
; GFX9-NEXT: v_mov_b32_e32 v3, s3
; GFX9-NEXT: s_cselect_b32 s7, s16, s15
; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off
; GFX9-NEXT: s_mov_b64 s[0:1], 16
; GFX9-NEXT: v_mov_b32_e32 v4, 16
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: v_mov_b32_e32 v5, 0
; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: v_mov_b32_e32 v2, s6
; GFX9-NEXT: v_mov_b32_e32 v3, s7
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off
; GFX9-NEXT: s_endpgm
;
; GFX8-LABEL: insertelement_s_v16i16_s_s:
@ -2685,7 +2685,8 @@ define amdgpu_ps void @insertelement_s_v16i16_s_s(<16 x i16> addrspace(4)* inreg
; GFX10-NEXT: s_cmp_eq_u32 s7, 1
; GFX10-NEXT: v_mov_b32_e32 v8, 0
; GFX10-NEXT: v_mov_b32_e32 v9, 0
; GFX10-NEXT: v_mov_b32_e32 v10, 0
; GFX10-NEXT: v_mov_b32_e32 v10, 16
; GFX10-NEXT: v_mov_b32_e32 v11, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_cselect_b32 s0, s9, s8
; GFX10-NEXT: s_cmp_eq_u32 s7, 2
@ -2731,9 +2732,8 @@ define amdgpu_ps void @insertelement_s_v16i16_s_s(<16 x i16> addrspace(4)* inreg
; GFX10-NEXT: v_mov_b32_e32 v5, s5
; GFX10-NEXT: v_mov_b32_e32 v6, s6
; GFX10-NEXT: v_mov_b32_e32 v7, s7
; GFX10-NEXT: s_mov_b64 s[0:1], 16
; GFX10-NEXT: global_store_dwordx4 v[8:9], v[0:3], off
; GFX10-NEXT: global_store_dwordx4 v10, v[4:7], s[0:1]
; GFX10-NEXT: global_store_dwordx4 v[10:11], v[4:7], off
; GFX10-NEXT: s_endpgm
%vec = load <16 x i16>, <16 x i16> addrspace(4)* %ptr
%insert = insertelement <16 x i16> %vec, i16 %val, i32 %idx
@ -2783,10 +2783,10 @@ define amdgpu_ps void @insertelement_v_v16i16_s_s(<16 x i16> addrspace(1)* %ptr,
; GFX9-NEXT: v_cndmask_b32_e64 v7, v9, v10, s[10:11]
; GFX9-NEXT: v_mov_b32_e32 v8, 0
; GFX9-NEXT: v_mov_b32_e32 v9, 0
; GFX9-NEXT: s_mov_b64 s[0:1], 16
; GFX9-NEXT: v_mov_b32_e32 v10, 0
; GFX9-NEXT: v_mov_b32_e32 v10, 16
; GFX9-NEXT: v_mov_b32_e32 v11, 0
; GFX9-NEXT: global_store_dwordx4 v[8:9], v[0:3], off
; GFX9-NEXT: global_store_dwordx4 v10, v[4:7], s[0:1]
; GFX9-NEXT: global_store_dwordx4 v[10:11], v[4:7], off
; GFX9-NEXT: s_endpgm
;
; GFX8-LABEL: insertelement_v_v16i16_s_s:
@ -2908,6 +2908,7 @@ define amdgpu_ps void @insertelement_v_v16i16_s_s(<16 x i16> addrspace(1)* %ptr,
; GFX10-NEXT: s_lshl_b32 s8, s8, s3
; GFX10-NEXT: s_lshl_b32 s3, s9, s3
; GFX10-NEXT: s_not_b32 s8, s8
; GFX10-NEXT: v_mov_b32_e32 v12, 16
; GFX10-NEXT: v_mov_b32_e32 v13, 0
; GFX10-NEXT: s_waitcnt vmcnt(1)
; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo
@ -2918,19 +2919,18 @@ define amdgpu_ps void @insertelement_v_v16i16_s_s(<16 x i16> addrspace(1)* %ptr,
; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v7, s5
; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v8, s6
; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v9, s2
; GFX10-NEXT: v_and_or_b32 v12, v0, s8, s3
; GFX10-NEXT: v_and_or_b32 v14, v0, s8, s3
; GFX10-NEXT: v_cmp_eq_u32_e64 s3, s7, 0
; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v12, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v0, v2, v12, s3
; GFX10-NEXT: v_cndmask_b32_e64 v2, v4, v12, s0
; GFX10-NEXT: v_cndmask_b32_e64 v3, v5, v12, s1
; GFX10-NEXT: v_cndmask_b32_e64 v4, v6, v12, s4
; GFX10-NEXT: v_cndmask_b32_e64 v5, v7, v12, s5
; GFX10-NEXT: v_cndmask_b32_e64 v6, v8, v12, s6
; GFX10-NEXT: v_cndmask_b32_e64 v7, v9, v12, s2
; GFX10-NEXT: s_mov_b64 s[0:1], 16
; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v14, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v0, v2, v14, s3
; GFX10-NEXT: v_cndmask_b32_e64 v2, v4, v14, s0
; GFX10-NEXT: v_cndmask_b32_e64 v3, v5, v14, s1
; GFX10-NEXT: v_cndmask_b32_e64 v4, v6, v14, s4
; GFX10-NEXT: v_cndmask_b32_e64 v5, v7, v14, s5
; GFX10-NEXT: v_cndmask_b32_e64 v6, v8, v14, s6
; GFX10-NEXT: v_cndmask_b32_e64 v7, v9, v14, s2
; GFX10-NEXT: global_store_dwordx4 v[10:11], v[0:3], off
; GFX10-NEXT: global_store_dwordx4 v13, v[4:7], s[0:1]
; GFX10-NEXT: global_store_dwordx4 v[12:13], v[4:7], off
; GFX10-NEXT: s_endpgm
%vec = load <16 x i16>, <16 x i16> addrspace(1 )* %ptr
%insert = insertelement <16 x i16> %vec, i16 %val, i32 %idx
@ -2992,10 +2992,10 @@ define amdgpu_ps void @insertelement_s_v16i16_v_s(<16 x i16> addrspace(4)* inreg
; GFX9-NEXT: v_cndmask_b32_e32 v7, v7, v8, vcc
; GFX9-NEXT: v_mov_b32_e32 v8, 0
; GFX9-NEXT: v_mov_b32_e32 v9, 0
; GFX9-NEXT: v_mov_b32_e32 v10, 16
; GFX9-NEXT: v_mov_b32_e32 v11, 0
; GFX9-NEXT: global_store_dwordx4 v[8:9], v[0:3], off
; GFX9-NEXT: s_mov_b64 s[0:1], 16
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: global_store_dwordx4 v0, v[4:7], s[0:1]
; GFX9-NEXT: global_store_dwordx4 v[10:11], v[4:7], off
; GFX9-NEXT: s_endpgm
;
; GFX8-LABEL: insertelement_s_v16i16_v_s:
@ -3124,6 +3124,7 @@ define amdgpu_ps void @insertelement_s_v16i16_v_s(<16 x i16> addrspace(4)* inreg
; GFX10-NEXT: s_cmp_eq_u32 s0, 1
; GFX10-NEXT: v_and_b32_e32 v8, s3, v0
; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s0, 0
; GFX10-NEXT: v_mov_b32_e32 v10, 16
; GFX10-NEXT: v_mov_b32_e32 v11, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_cselect_b32 s1, s9, s8
@ -3151,27 +3152,26 @@ define amdgpu_ps void @insertelement_s_v16i16_v_s(<16 x i16> addrspace(4)* inreg
; GFX10-NEXT: v_mov_b32_e32 v7, s15
; GFX10-NEXT: s_lshl_b32 s3, s3, s2
; GFX10-NEXT: s_andn2_b32 s1, s1, s3
; GFX10-NEXT: v_lshl_or_b32 v10, v8, s2, s1
; GFX10-NEXT: v_lshl_or_b32 v12, v8, s2, s1
; GFX10-NEXT: v_mov_b32_e32 v8, 0
; GFX10-NEXT: v_mov_b32_e32 v9, 0
; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v10, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v12, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s0, 1
; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v10, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v12, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s0, 2
; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v10, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v12, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s0, 3
; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v10, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v12, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s0, 4
; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v12, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s0, 5
; GFX10-NEXT: v_cndmask_b32_e32 v5, v5, v10, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v5, v5, v12, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s0, 6
; GFX10-NEXT: v_cndmask_b32_e32 v6, v6, v10, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v6, v6, v12, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s0, 7
; GFX10-NEXT: s_mov_b64 s[0:1], 16
; GFX10-NEXT: v_cndmask_b32_e32 v7, v7, v10, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v7, v7, v12, vcc_lo
; GFX10-NEXT: global_store_dwordx4 v[8:9], v[0:3], off
; GFX10-NEXT: global_store_dwordx4 v11, v[4:7], s[0:1]
; GFX10-NEXT: global_store_dwordx4 v[10:11], v[4:7], off
; GFX10-NEXT: s_endpgm
%vec = load <16 x i16>, <16 x i16> addrspace(4)* %ptr
%insert = insertelement <16 x i16> %vec, i16 %val, i32 %idx
@ -3234,10 +3234,10 @@ define amdgpu_ps void @insertelement_s_v16i16_s_v(<16 x i16> addrspace(4)* inreg
; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v9, s[10:11]
; GFX9-NEXT: v_mov_b32_e32 v8, 0
; GFX9-NEXT: v_mov_b32_e32 v9, 0
; GFX9-NEXT: v_mov_b32_e32 v10, 16
; GFX9-NEXT: v_mov_b32_e32 v11, 0
; GFX9-NEXT: global_store_dwordx4 v[8:9], v[0:3], off
; GFX9-NEXT: s_mov_b64 s[0:1], 16
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: global_store_dwordx4 v0, v[4:7], s[0:1]
; GFX9-NEXT: global_store_dwordx4 v[10:11], v[4:7], off
; GFX9-NEXT: s_endpgm
;
; GFX8-LABEL: insertelement_s_v16i16_s_v:
@ -3365,22 +3365,21 @@ define amdgpu_ps void @insertelement_s_v16i16_s_v(<16 x i16> addrspace(4)* inreg
; GFX10-LABEL: insertelement_s_v16i16_s_v:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx8 s[8:15], s[2:3], 0x0
; GFX10-NEXT: v_lshrrev_b32_e32 v10, 1, v0
; GFX10-NEXT: v_lshrrev_b32_e32 v12, 1, v0
; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
; GFX10-NEXT: s_mov_b32 s5, 0xffff
; GFX10-NEXT: v_mov_b32_e32 v12, 0
; GFX10-NEXT: s_and_b32 s6, s4, s5
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v10
; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 2, v10
; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 3, v10
; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 4, v10
; GFX10-NEXT: v_cmp_eq_u32_e64 s3, 5, v10
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v12
; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 2, v12
; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 3, v12
; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 4, v12
; GFX10-NEXT: v_cmp_eq_u32_e64 s3, 5, v12
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 4, v0
; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 6, v10
; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 6, v12
; GFX10-NEXT: v_lshlrev_b32_e64 v2, v0, s5
; GFX10-NEXT: v_cmp_eq_u32_e64 s5, 7, v10
; GFX10-NEXT: v_cmp_eq_u32_e64 s5, 7, v12
; GFX10-NEXT: v_lshlrev_b32_e64 v8, v0, s6
; GFX10-NEXT: v_cmp_eq_u32_e64 s6, 0, v10
; GFX10-NEXT: v_cmp_eq_u32_e64 s6, 0, v12
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v1, s9
; GFX10-NEXT: v_xor_b32_e32 v9, -1, v2
@ -3390,29 +3389,30 @@ define amdgpu_ps void @insertelement_s_v16i16_s_v(<16 x i16> addrspace(4)* inreg
; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s12, s2
; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s13, s3
; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s14, s4
; GFX10-NEXT: v_cndmask_b32_e64 v11, v1, s15, s5
; GFX10-NEXT: v_cndmask_b32_e64 v10, v1, s15, s5
; GFX10-NEXT: v_mov_b32_e32 v0, s8
; GFX10-NEXT: v_mov_b32_e32 v1, s9
; GFX10-NEXT: v_mov_b32_e32 v2, s10
; GFX10-NEXT: v_mov_b32_e32 v3, s11
; GFX10-NEXT: v_and_or_b32 v11, v11, v9, v8
; GFX10-NEXT: v_and_or_b32 v13, v10, v9, v8
; GFX10-NEXT: v_mov_b32_e32 v4, s12
; GFX10-NEXT: v_mov_b32_e32 v5, s13
; GFX10-NEXT: v_mov_b32_e32 v6, s14
; GFX10-NEXT: v_mov_b32_e32 v7, s15
; GFX10-NEXT: v_mov_b32_e32 v8, 0
; GFX10-NEXT: v_mov_b32_e32 v9, 0
; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v11, s6
; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v11, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v11, s0
; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v11, s1
; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, v11, s2
; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v11, s3
; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, v11, s4
; GFX10-NEXT: v_cndmask_b32_e64 v7, v7, v11, s5
; GFX10-NEXT: s_mov_b64 s[0:1], 16
; GFX10-NEXT: v_mov_b32_e32 v10, 16
; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v13, s6
; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v13, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v13, s0
; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v13, s1
; GFX10-NEXT: v_mov_b32_e32 v11, 0
; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, v13, s2
; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v13, s3
; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, v13, s4
; GFX10-NEXT: v_cndmask_b32_e64 v7, v7, v13, s5
; GFX10-NEXT: global_store_dwordx4 v[8:9], v[0:3], off
; GFX10-NEXT: global_store_dwordx4 v12, v[4:7], s[0:1]
; GFX10-NEXT: global_store_dwordx4 v[10:11], v[4:7], off
; GFX10-NEXT: s_endpgm
%vec = load <16 x i16>, <16 x i16> addrspace(4)* %ptr
%insert = insertelement <16 x i16> %vec, i16 %val, i32 %idx
@ -3474,10 +3474,10 @@ define amdgpu_ps void @insertelement_s_v16i16_v_v(<16 x i16> addrspace(4)* inreg
; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v9, s[10:11]
; GFX9-NEXT: v_mov_b32_e32 v8, 0
; GFX9-NEXT: v_mov_b32_e32 v9, 0
; GFX9-NEXT: v_mov_b32_e32 v10, 16
; GFX9-NEXT: v_mov_b32_e32 v11, 0
; GFX9-NEXT: global_store_dwordx4 v[8:9], v[0:3], off
; GFX9-NEXT: s_mov_b64 s[0:1], 16
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: global_store_dwordx4 v0, v[4:7], s[0:1]
; GFX9-NEXT: global_store_dwordx4 v[10:11], v[4:7], off
; GFX9-NEXT: s_endpgm
;
; GFX8-LABEL: insertelement_s_v16i16_v_v:
@ -3604,20 +3604,19 @@ define amdgpu_ps void @insertelement_s_v16i16_v_v(<16 x i16> addrspace(4)* inreg
; GFX10-LABEL: insertelement_s_v16i16_v_v:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx8 s[8:15], s[2:3], 0x0
; GFX10-NEXT: v_lshrrev_b32_e32 v10, 1, v1
; GFX10-NEXT: v_lshrrev_b32_e32 v12, 1, v1
; GFX10-NEXT: v_and_b32_e32 v1, 1, v1
; GFX10-NEXT: s_mov_b32 s4, 0xffff
; GFX10-NEXT: v_mov_b32_e32 v12, 0
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v10
; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 2, v10
; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 3, v10
; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 4, v10
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v12
; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 2, v12
; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 3, v12
; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 4, v12
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 4, v1
; GFX10-NEXT: v_cmp_eq_u32_e64 s3, 5, v10
; GFX10-NEXT: v_cmp_eq_u32_e64 s5, 7, v10
; GFX10-NEXT: v_cmp_eq_u32_e64 s6, 0, v10
; GFX10-NEXT: v_cmp_eq_u32_e64 s3, 5, v12
; GFX10-NEXT: v_cmp_eq_u32_e64 s5, 7, v12
; GFX10-NEXT: v_cmp_eq_u32_e64 s6, 0, v12
; GFX10-NEXT: v_lshlrev_b32_e64 v3, v1, s4
; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 6, v10
; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 6, v12
; GFX10-NEXT: v_lshlrev_b32_sdwa v8, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; GFX10-NEXT: v_xor_b32_e32 v9, -1, v3
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
@ -3628,29 +3627,30 @@ define amdgpu_ps void @insertelement_s_v16i16_v_v(<16 x i16> addrspace(4)* inreg
; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s12, s2
; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s13, s3
; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s14, s4
; GFX10-NEXT: v_cndmask_b32_e64 v11, v2, s15, s5
; GFX10-NEXT: v_cndmask_b32_e64 v10, v2, s15, s5
; GFX10-NEXT: v_mov_b32_e32 v0, s8
; GFX10-NEXT: v_mov_b32_e32 v1, s9
; GFX10-NEXT: v_mov_b32_e32 v2, s10
; GFX10-NEXT: v_mov_b32_e32 v3, s11
; GFX10-NEXT: v_and_or_b32 v11, v11, v9, v8
; GFX10-NEXT: v_and_or_b32 v13, v10, v9, v8
; GFX10-NEXT: v_mov_b32_e32 v4, s12
; GFX10-NEXT: v_mov_b32_e32 v5, s13
; GFX10-NEXT: v_mov_b32_e32 v6, s14
; GFX10-NEXT: v_mov_b32_e32 v7, s15
; GFX10-NEXT: v_mov_b32_e32 v8, 0
; GFX10-NEXT: v_mov_b32_e32 v9, 0
; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v11, s6
; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v11, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v11, s0
; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v11, s1
; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, v11, s2
; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v11, s3
; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, v11, s4
; GFX10-NEXT: v_cndmask_b32_e64 v7, v7, v11, s5
; GFX10-NEXT: s_mov_b64 s[0:1], 16
; GFX10-NEXT: v_mov_b32_e32 v10, 16
; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v13, s6
; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v13, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v13, s0
; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v13, s1
; GFX10-NEXT: v_mov_b32_e32 v11, 0
; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, v13, s2
; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v13, s3
; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, v13, s4
; GFX10-NEXT: v_cndmask_b32_e64 v7, v7, v13, s5
; GFX10-NEXT: global_store_dwordx4 v[8:9], v[0:3], off
; GFX10-NEXT: global_store_dwordx4 v12, v[4:7], s[0:1]
; GFX10-NEXT: global_store_dwordx4 v[10:11], v[4:7], off
; GFX10-NEXT: s_endpgm
%vec = load <16 x i16>, <16 x i16> addrspace(4)* %ptr
%insert = insertelement <16 x i16> %vec, i16 %val, i32 %idx
@ -3696,13 +3696,13 @@ define amdgpu_ps void @insertelement_v_v16i16_s_v(<16 x i16> addrspace(1)* %ptr,
; GFX9-NEXT: v_cndmask_b32_e64 v6, v9, v11, s[8:9]
; GFX9-NEXT: v_mov_b32_e32 v8, 0
; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v11, vcc
; GFX9-NEXT: v_mov_b32_e32 v9, 0
; GFX9-NEXT: v_cndmask_b32_e64 v4, v7, v11, s[4:5]
; GFX9-NEXT: v_cndmask_b32_e64 v7, v10, v11, s[10:11]
; GFX9-NEXT: s_mov_b64 s[0:1], 16
; GFX9-NEXT: v_mov_b32_e32 v10, 0
; GFX9-NEXT: v_mov_b32_e32 v9, 0
; GFX9-NEXT: v_mov_b32_e32 v10, 16
; GFX9-NEXT: v_mov_b32_e32 v11, 0
; GFX9-NEXT: global_store_dwordx4 v[8:9], v[0:3], off
; GFX9-NEXT: global_store_dwordx4 v10, v[4:7], s[0:1]
; GFX9-NEXT: global_store_dwordx4 v[10:11], v[4:7], off
; GFX9-NEXT: s_endpgm
;
; GFX8-LABEL: insertelement_v_v16i16_s_v:
@ -3810,7 +3810,7 @@ define amdgpu_ps void @insertelement_v_v16i16_s_v(<16 x i16> addrspace(1)* %ptr,
; GFX10-NEXT: v_lshrrev_b32_e32 v0, 1, v2
; GFX10-NEXT: v_and_b32_e32 v2, 1, v2
; GFX10-NEXT: s_mov_b32 s5, 0xffff
; GFX10-NEXT: v_mov_b32_e32 v14, 0
; GFX10-NEXT: v_mov_b32_e32 v13, 16
; GFX10-NEXT: s_and_b32 s6, s2, s5
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 2, v0
@ -3819,6 +3819,7 @@ define amdgpu_ps void @insertelement_v_v16i16_s_v(<16 x i16> addrspace(1)* %ptr,
; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 5, v0
; GFX10-NEXT: v_lshlrev_b32_e32 v2, 4, v2
; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 6, v0
; GFX10-NEXT: v_mov_b32_e32 v14, 0
; GFX10-NEXT: v_lshlrev_b32_e64 v11, v2, s5
; GFX10-NEXT: v_cmp_eq_u32_e64 s5, 7, v0
; GFX10-NEXT: v_lshlrev_b32_e64 v2, v2, s6
@ -3833,20 +3834,19 @@ define amdgpu_ps void @insertelement_v_v16i16_s_v(<16 x i16> addrspace(1)* %ptr,
; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v8, s4
; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v9, s2
; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v10, s5
; GFX10-NEXT: v_and_or_b32 v13, v1, v11, v2
; GFX10-NEXT: v_and_or_b32 v15, v1, v11, v2
; GFX10-NEXT: v_mov_b32_e32 v11, 0
; GFX10-NEXT: v_mov_b32_e32 v12, 0
; GFX10-NEXT: v_cndmask_b32_e64 v0, v3, v13, s6
; GFX10-NEXT: v_cndmask_b32_e32 v1, v4, v13, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v2, v5, v13, s0
; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v13, s1
; GFX10-NEXT: v_cndmask_b32_e64 v4, v7, v13, s3
; GFX10-NEXT: v_cndmask_b32_e64 v5, v8, v13, s4
; GFX10-NEXT: v_cndmask_b32_e64 v6, v9, v13, s2
; GFX10-NEXT: v_cndmask_b32_e64 v7, v10, v13, s5
; GFX10-NEXT: s_mov_b64 s[0:1], 16
; GFX10-NEXT: v_cndmask_b32_e64 v0, v3, v15, s6
; GFX10-NEXT: v_cndmask_b32_e32 v1, v4, v15, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v2, v5, v15, s0
; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v15, s1
; GFX10-NEXT: v_cndmask_b32_e64 v4, v7, v15, s3
; GFX10-NEXT: v_cndmask_b32_e64 v5, v8, v15, s4
; GFX10-NEXT: v_cndmask_b32_e64 v6, v9, v15, s2
; GFX10-NEXT: v_cndmask_b32_e64 v7, v10, v15, s5
; GFX10-NEXT: global_store_dwordx4 v[11:12], v[0:3], off
; GFX10-NEXT: global_store_dwordx4 v14, v[4:7], s[0:1]
; GFX10-NEXT: global_store_dwordx4 v[13:14], v[4:7], off
; GFX10-NEXT: s_endpgm
%vec = load <16 x i16>, <16 x i16> addrspace(1)* %ptr
%insert = insertelement <16 x i16> %vec, i16 %val, i32 %idx
@ -3891,13 +3891,13 @@ define amdgpu_ps void @insertelement_v_v16i16_v_s(<16 x i16> addrspace(1)* %ptr,
; GFX9-NEXT: v_cndmask_b32_e64 v6, v9, v11, s[8:9]
; GFX9-NEXT: v_mov_b32_e32 v8, 0
; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v11, vcc
; GFX9-NEXT: v_mov_b32_e32 v9, 0
; GFX9-NEXT: v_cndmask_b32_e64 v4, v7, v11, s[4:5]
; GFX9-NEXT: v_cndmask_b32_e64 v7, v10, v11, s[10:11]
; GFX9-NEXT: s_mov_b64 s[0:1], 16
; GFX9-NEXT: v_mov_b32_e32 v10, 0
; GFX9-NEXT: v_mov_b32_e32 v9, 0
; GFX9-NEXT: v_mov_b32_e32 v10, 16
; GFX9-NEXT: v_mov_b32_e32 v11, 0
; GFX9-NEXT: global_store_dwordx4 v[8:9], v[0:3], off
; GFX9-NEXT: global_store_dwordx4 v10, v[4:7], s[0:1]
; GFX9-NEXT: global_store_dwordx4 v[10:11], v[4:7], off
; GFX9-NEXT: s_endpgm
;
; GFX8-LABEL: insertelement_v_v16i16_v_s:
@ -4019,6 +4019,7 @@ define amdgpu_ps void @insertelement_v_v16i16_v_s(<16 x i16> addrspace(1)* %ptr,
; GFX10-NEXT: s_not_b32 s7, s7
; GFX10-NEXT: v_mov_b32_e32 v11, 0
; GFX10-NEXT: v_mov_b32_e32 v12, 0
; GFX10-NEXT: v_mov_b32_e32 v13, 16
; GFX10-NEXT: v_mov_b32_e32 v14, 0
; GFX10-NEXT: s_waitcnt vmcnt(1)
; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc_lo
@ -4029,18 +4030,17 @@ define amdgpu_ps void @insertelement_v_v16i16_v_s(<16 x i16> addrspace(1)* %ptr,
; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v8, s4
; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v9, s2
; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v10, s5
; GFX10-NEXT: v_and_or_b32 v13, v0, s7, v1
; GFX10-NEXT: v_cndmask_b32_e64 v0, v3, v13, s6
; GFX10-NEXT: v_cndmask_b32_e32 v1, v4, v13, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v2, v5, v13, s0
; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v13, s1
; GFX10-NEXT: v_cndmask_b32_e64 v4, v7, v13, s3
; GFX10-NEXT: v_cndmask_b32_e64 v5, v8, v13, s4
; GFX10-NEXT: v_cndmask_b32_e64 v6, v9, v13, s2
; GFX10-NEXT: v_cndmask_b32_e64 v7, v10, v13, s5
; GFX10-NEXT: s_mov_b64 s[0:1], 16
; GFX10-NEXT: v_and_or_b32 v15, v0, s7, v1
; GFX10-NEXT: v_cndmask_b32_e64 v0, v3, v15, s6
; GFX10-NEXT: v_cndmask_b32_e32 v1, v4, v15, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v2, v5, v15, s0
; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v15, s1
; GFX10-NEXT: v_cndmask_b32_e64 v4, v7, v15, s3
; GFX10-NEXT: v_cndmask_b32_e64 v5, v8, v15, s4
; GFX10-NEXT: v_cndmask_b32_e64 v6, v9, v15, s2
; GFX10-NEXT: v_cndmask_b32_e64 v7, v10, v15, s5
; GFX10-NEXT: global_store_dwordx4 v[11:12], v[0:3], off
; GFX10-NEXT: global_store_dwordx4 v14, v[4:7], s[0:1]
; GFX10-NEXT: global_store_dwordx4 v[13:14], v[4:7], off
; GFX10-NEXT: s_endpgm
%vec = load <16 x i16>, <16 x i16> addrspace(1)* %ptr
%insert = insertelement <16 x i16> %vec, i16 %val, i32 %idx
@ -4085,13 +4085,13 @@ define amdgpu_ps void @insertelement_v_v16i16_v_v(<16 x i16> addrspace(1)* %ptr,
; GFX9-NEXT: v_mov_b32_e32 v8, 0
; GFX9-NEXT: v_cndmask_b32_e64 v2, v6, v12, s[0:1]
; GFX9-NEXT: v_cndmask_b32_e64 v3, v7, v12, s[2:3]
; GFX9-NEXT: v_mov_b32_e32 v9, 0
; GFX9-NEXT: v_cndmask_b32_e64 v6, v10, v12, s[8:9]
; GFX9-NEXT: v_cndmask_b32_e64 v7, v11, v12, s[10:11]
; GFX9-NEXT: s_mov_b64 s[0:1], 16
; GFX9-NEXT: v_mov_b32_e32 v10, 0
; GFX9-NEXT: v_mov_b32_e32 v9, 0
; GFX9-NEXT: v_mov_b32_e32 v10, 16
; GFX9-NEXT: v_mov_b32_e32 v11, 0
; GFX9-NEXT: global_store_dwordx4 v[8:9], v[0:3], off
; GFX9-NEXT: global_store_dwordx4 v10, v[4:7], s[0:1]
; GFX9-NEXT: global_store_dwordx4 v[10:11], v[4:7], off
; GFX9-NEXT: s_endpgm
;
; GFX8-LABEL: insertelement_v_v16i16_v_v:
@ -4198,6 +4198,7 @@ define amdgpu_ps void @insertelement_v_v16i16_v_v(<16 x i16> addrspace(1)* %ptr,
; GFX10-NEXT: v_lshrrev_b32_e32 v0, 1, v3
; GFX10-NEXT: v_and_b32_e32 v3, 1, v3
; GFX10-NEXT: s_mov_b32 s4, 0xffff
; GFX10-NEXT: v_mov_b32_e32 v14, 16
; GFX10-NEXT: v_mov_b32_e32 v15, 0
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 2, v0
@ -4222,18 +4223,17 @@ define amdgpu_ps void @insertelement_v_v16i16_v_v(<16 x i16> addrspace(1)* %ptr,
; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v9, s3
; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v10, s4
; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v11, s5
; GFX10-NEXT: v_and_or_b32 v14, v1, v3, v2
; GFX10-NEXT: v_cndmask_b32_e64 v0, v4, v14, s6
; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v14, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v2, v6, v14, s0
; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v14, s1
; GFX10-NEXT: v_cndmask_b32_e64 v4, v8, v14, s2
; GFX10-NEXT: v_cndmask_b32_e64 v5, v9, v14, s3
; GFX10-NEXT: v_cndmask_b32_e64 v6, v10, v14, s4
; GFX10-NEXT: v_cndmask_b32_e64 v7, v11, v14, s5
; GFX10-NEXT: s_mov_b64 s[0:1], 16
; GFX10-NEXT: v_and_or_b32 v16, v1, v3, v2
; GFX10-NEXT: v_cndmask_b32_e64 v0, v4, v16, s6
; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v16, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v2, v6, v16, s0
; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v16, s1
; GFX10-NEXT: v_cndmask_b32_e64 v4, v8, v16, s2
; GFX10-NEXT: v_cndmask_b32_e64 v5, v9, v16, s3
; GFX10-NEXT: v_cndmask_b32_e64 v6, v10, v16, s4
; GFX10-NEXT: v_cndmask_b32_e64 v7, v11, v16, s5
; GFX10-NEXT: global_store_dwordx4 v[12:13], v[0:3], off
; GFX10-NEXT: global_store_dwordx4 v15, v[4:7], s[0:1]
; GFX10-NEXT: global_store_dwordx4 v[14:15], v[4:7], off
; GFX10-NEXT: s_endpgm
%vec = load <16 x i16>, <16 x i16> addrspace(1)* %ptr
%insert = insertelement <16 x i16> %vec, i16 %val, i32 %idx

View File

@ -5184,22 +5184,22 @@ define amdgpu_ps void @amdgpu_ps_call_default_cc() {
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY [[DEF]](p4)
; CHECK-NEXT: [[C1:%[0-9]+]]:_(p4) = G_CONSTANT i64 0
; CHECK-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[C1]], [[C2]](s64)
; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(p4) = COPY [[C1]](p4)
; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(s64) = G_IMPLICIT_DEF
; CHECK-NEXT: [[DEF2:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[DEF2]](s32)
; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[DEF2]](s32)
; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY [[DEF2]](s32)
; CHECK-NEXT: [[COPY4:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg
; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY4]](<4 x s32>)
; CHECK-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY [[DEF2]](s32)
; CHECK-NEXT: [[COPY5:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg
; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY5]](<4 x s32>)
; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[DEF]](p4)
; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[COPY]](p4)
; CHECK-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4)
; CHECK-NEXT: $sgpr8_sgpr9 = COPY [[COPY1]](p4)
; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[DEF1]](s64)
; CHECK-NEXT: $sgpr12 = COPY [[DEF2]](s32)
; CHECK-NEXT: $sgpr13 = COPY [[COPY1]](s32)
; CHECK-NEXT: $sgpr14 = COPY [[COPY2]](s32)
; CHECK-NEXT: $vgpr31 = COPY [[COPY3]](s32)
; CHECK-NEXT: $sgpr13 = COPY [[COPY2]](s32)
; CHECK-NEXT: $sgpr14 = COPY [[COPY3]](s32)
; CHECK-NEXT: $vgpr31 = COPY [[COPY4]](s32)
; CHECK-NEXT: $sgpr30_sgpr31 = G_SI_CALL [[C]](p0), 0, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $vgpr31
; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc
; CHECK-NEXT: S_ENDPGM 0

View File

@ -1947,11 +1947,11 @@ define void @byval_a3i32_align128_byval_i16_align64([3 x i32] addrspace(5)* byva
; CHECK-NEXT: [[LOAD3:%[0-9]+]]:_(s16) = G_LOAD [[COPY1]](p5) :: (dereferenceable load (s16) from %ir.arg1, addrspace 5)
; CHECK-NEXT: G_STORE [[LOAD]](s32), [[C]](p1) :: (store (s32) into `[3 x i32] addrspace(1)* null`, addrspace 1)
; CHECK-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
; CHECK-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p1) = G_PTR_ADD [[C]], [[C3]](s64)
; CHECK-NEXT: G_STORE [[LOAD1]](s32), [[PTR_ADD2]](p1) :: (store (s32) into `[3 x i32] addrspace(1)* null` + 4, addrspace 1)
; CHECK-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
; CHECK-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p1) = G_PTR_ADD [[C]], [[C4]](s64)
; CHECK-NEXT: G_STORE [[LOAD2]](s32), [[PTR_ADD3]](p1) :: (store (s32) into `[3 x i32] addrspace(1)* null` + 8, addrspace 1)
; CHECK-NEXT: [[C4:%[0-9]+]]:_(p1) = G_CONSTANT i64 4
; CHECK-NEXT: G_STORE [[LOAD1]](s32), [[C4]](p1) :: (store (s32) into `[3 x i32] addrspace(1)* null` + 4, addrspace 1)
; CHECK-NEXT: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
; CHECK-NEXT: [[C6:%[0-9]+]]:_(p1) = G_CONSTANT i64 8
; CHECK-NEXT: G_STORE [[LOAD2]](s32), [[C6]](p1) :: (store (s32) into `[3 x i32] addrspace(1)* null` + 8, addrspace 1)
; CHECK-NEXT: G_STORE [[LOAD3]](s16), [[COPY3]](p1) :: (store (s16) into `i16 addrspace(1)* null`, addrspace 1)
; CHECK-NEXT: [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY2]]
; CHECK-NEXT: S_SETPC_B64_return [[COPY4]]

View File

@ -0,0 +1,45 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -o - -stop-after=irtranslator %s | FileCheck %s
; Check that the CSEMIRBuilder doesn't fold away the getelementptr during IRTranslator
; Both operands of the getelementptr are constants (a null base pointer and the
; offset 123), so the resulting G_PTR_ADD is a candidate for constant folding.
; The expected output keeps the G_PTR_ADD on the p7 pointer as an explicit
; instruction instead of replacing it with a single p7 constant.
; NOTE(review): presumably addrspace(7) is a non-integral address space in the
; AMDGPU data layout, which is why folding must be skipped here -- confirm.
define i8 addrspace(7)* @no_auto_constfold_gep() {
; CHECK-LABEL: name: no_auto_constfold_gep
; CHECK: bb.1 (%ir-block.0):
; CHECK-NEXT: liveins: $sgpr30_sgpr31
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
; CHECK-NEXT: [[C:%[0-9]+]]:_(p7) = G_CONSTANT i64 0
; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 123
; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p7) = G_PTR_ADD [[C]], [[C1]](s64)
; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[PTR_ADD]](p7)
; CHECK-NEXT: $vgpr0 = COPY [[UV]](s32)
; CHECK-NEXT: $vgpr1 = COPY [[UV1]](s32)
; CHECK-NEXT: [[COPY1:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]]
; CHECK-NEXT: S_SETPC_B64_return [[COPY1]], implicit $vgpr0, implicit $vgpr1
; Constant base (null) plus constant offset (123) in addrspace(7).
%gep = getelementptr i8, i8 addrspace(7)* null, i64 123
ret i8 addrspace(7)* %gep
}
; Vector variant of no_auto_constfold_gep: check that the CSEMIRBuilder doesn't
; fold away a getelementptr on a vector of pointers during IRTranslator.
; Both operands are constant vectors (a zeroinitializer <2 x p7> base and a
; <2 x i64> offset with 123 in every lane), each built with G_BUILD_VECTOR.
; The expected output keeps the <2 x p7> G_PTR_ADD as an explicit instruction
; instead of replacing it with a folded constant vector.
; NOTE(review): presumably addrspace(7) is a non-integral address space in the
; AMDGPU data layout, which is why folding must be skipped here -- confirm.
define <2 x i8 addrspace(7)*> @no_auto_constfold_gep_vector() {
; CHECK-LABEL: name: no_auto_constfold_gep_vector
; CHECK: bb.1 (%ir-block.0):
; CHECK-NEXT: liveins: $sgpr30_sgpr31
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
; CHECK-NEXT: [[C:%[0-9]+]]:_(p7) = G_CONSTANT i64 0
; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x p7>) = G_BUILD_VECTOR [[C]](p7), [[C]](p7)
; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 123
; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[C1]](s64), [[C1]](s64)
; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(<2 x p7>) = G_PTR_ADD [[BUILD_VECTOR]], [[BUILD_VECTOR1]](<2 x s64>)
; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<2 x p7>) = COPY [[PTR_ADD]](<2 x p7>)
; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](<2 x p7>)
; CHECK-NEXT: $vgpr0 = COPY [[UV]](s32)
; CHECK-NEXT: $vgpr1 = COPY [[UV1]](s32)
; CHECK-NEXT: $vgpr2 = COPY [[UV2]](s32)
; CHECK-NEXT: $vgpr3 = COPY [[UV3]](s32)
; CHECK-NEXT: [[COPY2:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]]
; CHECK-NEXT: S_SETPC_B64_return [[COPY2]], implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
; Constant vector base (all-null pointers) plus constant vector offset (123 per lane).
%gep = getelementptr i8, <2 x i8 addrspace(7)*> zeroinitializer, <2 x i64> <i64 123, i64 123>
ret <2 x i8 addrspace(7)*> %gep
}

View File

@ -8,16 +8,16 @@ define amdgpu_kernel void @test_long_add4(<4 x i64> %arg) #0 {
; CHECK-NEXT: s_mov_b64 s[4:5], 0
; CHECK-NEXT: v_mov_b32_e32 v0, s4
; CHECK-NEXT: v_mov_b32_e32 v1, s5
; CHECK-NEXT: global_load_dwordx4 v[7:10], v[0:1], off
; CHECK-NEXT: global_load_dwordx4 v[6:9], v[0:1], off
; CHECK-NEXT: v_mov_b32_e32 v0, s4
; CHECK-NEXT: v_mov_b32_e32 v1, s5
; CHECK-NEXT: global_load_dwordx4 v[0:3], v[0:1], off offset:16
; CHECK-NEXT: ; kill: def $vgpr7_vgpr8_vgpr9_vgpr10 killed $vgpr7_vgpr8_vgpr9_vgpr10 def $vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14 killed $exec
; CHECK-NEXT: ; kill: def $vgpr6_vgpr7_vgpr8_vgpr9 killed $vgpr6_vgpr7_vgpr8_vgpr9 def $vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13 killed $exec
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v14, v3
; CHECK-NEXT: v_mov_b32_e32 v13, v2
; CHECK-NEXT: v_mov_b32_e32 v12, v1
; CHECK-NEXT: v_mov_b32_e32 v11, v0
; CHECK-NEXT: v_mov_b32_e32 v13, v3
; CHECK-NEXT: v_mov_b32_e32 v12, v2
; CHECK-NEXT: v_mov_b32_e32 v11, v1
; CHECK-NEXT: v_mov_b32_e32 v10, v0
; CHECK-NEXT: v_mov_b32_e32 v0, s4
; CHECK-NEXT: v_mov_b32_e32 v1, s5
; CHECK-NEXT: global_load_dwordx4 v[18:21], v[0:1], off
@ -30,66 +30,67 @@ define amdgpu_kernel void @test_long_add4(<4 x i64> %arg) #0 {
; CHECK-NEXT: v_mov_b32_e32 v24, v2
; CHECK-NEXT: v_mov_b32_e32 v23, v1
; CHECK-NEXT: v_mov_b32_e32 v22, v0
; CHECK-NEXT: v_mov_b32_e32 v2, v7
; CHECK-NEXT: v_mov_b32_e32 v3, v8
; CHECK-NEXT: v_mov_b32_e32 v0, v9
; CHECK-NEXT: v_mov_b32_e32 v1, v10
; CHECK-NEXT: v_mov_b32_e32 v5, v11
; CHECK-NEXT: v_mov_b32_e32 v6, v12
; CHECK-NEXT: v_mov_b32_e32 v12, v13
; CHECK-NEXT: v_mov_b32_e32 v13, v14
; CHECK-NEXT: v_mov_b32_e32 v8, v18
; CHECK-NEXT: v_mov_b32_e32 v9, v19
; CHECK-NEXT: v_mov_b32_e32 v16, v20
; CHECK-NEXT: v_mov_b32_e32 v17, v21
; CHECK-NEXT: v_mov_b32_e32 v14, v22
; CHECK-NEXT: v_mov_b32_e32 v15, v23
; CHECK-NEXT: v_mov_b32_e32 v4, v6
; CHECK-NEXT: v_mov_b32_e32 v5, v7
; CHECK-NEXT: v_mov_b32_e32 v2, v8
; CHECK-NEXT: v_mov_b32_e32 v3, v9
; CHECK-NEXT: v_mov_b32_e32 v0, v10
; CHECK-NEXT: v_mov_b32_e32 v1, v11
; CHECK-NEXT: v_mov_b32_e32 v8, v12
; CHECK-NEXT: v_mov_b32_e32 v9, v13
; CHECK-NEXT: v_mov_b32_e32 v16, v18
; CHECK-NEXT: v_mov_b32_e32 v17, v19
; CHECK-NEXT: v_mov_b32_e32 v14, v20
; CHECK-NEXT: v_mov_b32_e32 v15, v21
; CHECK-NEXT: v_mov_b32_e32 v12, v22
; CHECK-NEXT: v_mov_b32_e32 v13, v23
; CHECK-NEXT: v_mov_b32_e32 v10, v24
; CHECK-NEXT: v_mov_b32_e32 v11, v25
; CHECK-NEXT: v_mov_b32_e32 v6, v4
; CHECK-NEXT: v_mov_b32_e32 v4, v5
; CHECK-NEXT: v_mov_b32_e32 v7, v16
; CHECK-NEXT: v_mov_b32_e32 v5, v17
; CHECK-NEXT: v_add_co_u32 v6, s6, v6, v7
; CHECK-NEXT: v_add_co_ci_u32_e64 v4, s6, v4, v5, s6
; CHECK-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec
; CHECK-NEXT: v_mov_b32_e32 v7, v4
; CHECK-NEXT: v_mov_b32_e32 v4, v2
; CHECK-NEXT: v_mov_b32_e32 v2, v3
; CHECK-NEXT: v_mov_b32_e32 v7, v8
; CHECK-NEXT: v_mov_b32_e32 v3, v9
; CHECK-NEXT: v_add_co_u32 v7, s6, v4, v7
; CHECK-NEXT: v_mov_b32_e32 v5, v14
; CHECK-NEXT: v_mov_b32_e32 v3, v15
; CHECK-NEXT: v_add_co_u32 v4, s6, v4, v5
; CHECK-NEXT: v_add_co_ci_u32_e64 v2, s6, v2, v3, s6
; CHECK-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec
; CHECK-NEXT: v_mov_b32_e32 v8, v2
; CHECK-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
; CHECK-NEXT: v_mov_b32_e32 v5, v2
; CHECK-NEXT: v_mov_b32_e32 v2, v0
; CHECK-NEXT: v_mov_b32_e32 v0, v1
; CHECK-NEXT: v_mov_b32_e32 v3, v16
; CHECK-NEXT: v_mov_b32_e32 v1, v17
; CHECK-NEXT: v_add_co_u32 v3, s6, v2, v3
; CHECK-NEXT: v_mov_b32_e32 v3, v12
; CHECK-NEXT: v_mov_b32_e32 v1, v13
; CHECK-NEXT: v_add_co_u32 v2, s6, v2, v3
; CHECK-NEXT: v_add_co_ci_u32_e64 v0, s6, v0, v1, s6
; CHECK-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec
; CHECK-NEXT: v_mov_b32_e32 v4, v0
; CHECK-NEXT: v_mov_b32_e32 v1, v5
; CHECK-NEXT: v_mov_b32_e32 v0, v6
; CHECK-NEXT: v_mov_b32_e32 v5, v14
; CHECK-NEXT: v_mov_b32_e32 v2, v15
; CHECK-NEXT: v_add_co_u32 v1, s6, v1, v5
; CHECK-NEXT: v_add_co_ci_u32_e64 v0, s6, v0, v2, s6
; CHECK-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; CHECK-NEXT: v_mov_b32_e32 v2, v0
; CHECK-NEXT: v_mov_b32_e32 v5, v12
; CHECK-NEXT: v_mov_b32_e32 v0, v13
; CHECK-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; CHECK-NEXT: v_mov_b32_e32 v3, v0
; CHECK-NEXT: v_mov_b32_e32 v0, v8
; CHECK-NEXT: v_mov_b32_e32 v1, v9
; CHECK-NEXT: v_mov_b32_e32 v9, v10
; CHECK-NEXT: v_mov_b32_e32 v6, v11
; CHECK-NEXT: v_add_co_u32 v5, s6, v5, v9
; CHECK-NEXT: v_add_co_ci_u32_e64 v0, s6, v0, v6, s6
; CHECK-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec
; CHECK-NEXT: v_mov_b32_e32 v6, v0
; CHECK-NEXT: ; kill: def $vgpr7_vgpr8 killed $vgpr7_vgpr8 def $vgpr7_vgpr8_vgpr9_vgpr10 killed $exec
; CHECK-NEXT: v_mov_b32_e32 v10, v4
; CHECK-NEXT: v_mov_b32_e32 v9, v3
; CHECK-NEXT: ; kill: def $vgpr1_vgpr2 killed $vgpr1_vgpr2 def $vgpr1_vgpr2_vgpr3_vgpr4 killed $exec
; CHECK-NEXT: v_mov_b32_e32 v3, v5
; CHECK-NEXT: v_mov_b32_e32 v4, v6
; CHECK-NEXT: v_mov_b32_e32 v6, s5
; CHECK-NEXT: v_mov_b32_e32 v5, s4
; CHECK-NEXT: global_store_dwordx4 v[5:6], v[7:10], off
; CHECK-NEXT: v_mov_b32_e32 v8, v11
; CHECK-NEXT: v_add_co_u32 v0, s6, v0, v9
; CHECK-NEXT: v_add_co_ci_u32_e64 v8, s6, v1, v8, s6
; CHECK-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; CHECK-NEXT: v_mov_b32_e32 v1, v8
; CHECK-NEXT: ; kill: def $vgpr6_vgpr7 killed $vgpr6_vgpr7 def $vgpr6_vgpr7_vgpr8_vgpr9 killed $exec
; CHECK-NEXT: v_mov_b32_e32 v9, v5
; CHECK-NEXT: v_mov_b32_e32 v8, v4
; CHECK-NEXT: ; kill: def $vgpr2_vgpr3 killed $vgpr2_vgpr3 def $vgpr2_vgpr3_vgpr4_vgpr5 killed $exec
; CHECK-NEXT: v_mov_b32_e32 v5, v1
; CHECK-NEXT: v_mov_b32_e32 v4, v0
; CHECK-NEXT: v_mov_b32_e32 v0, s4
; CHECK-NEXT: v_mov_b32_e32 v1, s5
; CHECK-NEXT: global_store_dwordx4 v[0:1], v[6:9], off
; CHECK-NEXT: s_mov_b64 s[4:5], 16
; CHECK-NEXT: v_mov_b32_e32 v0, 0
; CHECK-NEXT: global_store_dwordx4 v0, v[1:4], s[4:5]
; CHECK-NEXT: v_mov_b32_e32 v0, s4
; CHECK-NEXT: v_mov_b32_e32 v1, s5
; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off
; CHECK-NEXT: s_endpgm
entry:
%load0 = load <4 x i64>, <4 x i64> addrspace(1)* null, align 32