forked from OSchip/llvm-project
AMDGPU: Use GlobalPriority for largest register tuples
Only do this for 16- and 32-register tuples, although we might want to extend it to 8-register tuples. These tuples are incredibly expensive to spill, and spilling them severely interferes with the ability to allocate anything else in the function. The lit tests show mostly sizeable improvements, with a handful of tiny regressions involving large vectors.
This commit is contained in:
parent
3afd351b5f
commit
69153d6c0a
|
@ -792,8 +792,11 @@ defm "" : SRegClass<5, [v5i32, v5f32], SGPR_160Regs, TTMP_160Regs>;
|
|||
defm "" : SRegClass<6, [v6i32, v6f32, v3i64, v3f64], SGPR_192Regs, TTMP_192Regs>;
|
||||
defm "" : SRegClass<7, [v7i32, v7f32], SGPR_224Regs, TTMP_224Regs>;
|
||||
defm "" : SRegClass<8, [v8i32, v8f32, v4i64, v4f64, v16i16, v16f16], SGPR_256Regs, TTMP_256Regs>;
|
||||
|
||||
let GlobalPriority = true in {
|
||||
defm "" : SRegClass<16, [v16i32, v16f32, v8i64, v8f64], SGPR_512Regs, TTMP_512Regs>;
|
||||
defm "" : SRegClass<32, [v32i32, v32f32, v16i64, v16f64], SGPR_1024Regs>;
|
||||
}
|
||||
|
||||
def VRegOrLds_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
|
||||
(add VGPR_32, LDS_DIRECT_CLASS)> {
|
||||
|
@ -833,8 +836,11 @@ defm VReg_160 : VRegClass<5, [v5i32, v5f32], (add VGPR_160)>;
|
|||
defm VReg_192 : VRegClass<6, [v6i32, v6f32, v3i64, v3f64], (add VGPR_192)>;
|
||||
defm VReg_224 : VRegClass<7, [v7i32, v7f32], (add VGPR_224)>;
|
||||
defm VReg_256 : VRegClass<8, [v8i32, v8f32, v4i64, v4f64, v16i16, v16f16], (add VGPR_256)>;
|
||||
|
||||
let GlobalPriority = true in {
|
||||
defm VReg_512 : VRegClass<16, [v16i32, v16f32, v8i64, v8f64], (add VGPR_512)>;
|
||||
defm VReg_1024 : VRegClass<32, [v32i32, v32f32, v16i64, v16f64], (add VGPR_1024)>;
|
||||
}
|
||||
|
||||
multiclass ARegClass<int numRegs, list<ValueType> regTypes, dag regList> {
|
||||
let CopyCost = !add(numRegs, numRegs, 1), HasAGPR = 1 in {
|
||||
|
@ -854,8 +860,11 @@ defm AReg_160 : ARegClass<5, [v5i32, v5f32], (add AGPR_160)>;
|
|||
defm AReg_192 : ARegClass<6, [v6i32, v6f32, v3i64, v3f64], (add AGPR_192)>;
|
||||
defm AReg_224 : ARegClass<7, [v7i32, v7f32], (add AGPR_224)>;
|
||||
defm AReg_256 : ARegClass<8, [v8i32, v8f32, v4i64, v4f64], (add AGPR_256)>;
|
||||
|
||||
let GlobalPriority = true in {
|
||||
defm AReg_512 : ARegClass<16, [v16i32, v16f32, v8i64, v8f64], (add AGPR_512)>;
|
||||
defm AReg_1024 : ARegClass<32, [v32i32, v32f32, v16i64, v16f64], (add AGPR_1024)>;
|
||||
}
|
||||
|
||||
} // End GeneratePressureSet = 0
|
||||
|
||||
|
@ -910,8 +919,11 @@ defm AV_160 : AVRegClass<5, VReg_160.RegTypes, (add VGPR_160), (add AGPR_160)>;
|
|||
defm AV_192 : AVRegClass<6, VReg_192.RegTypes, (add VGPR_192), (add AGPR_192)>;
|
||||
defm AV_224 : AVRegClass<7, VReg_224.RegTypes, (add VGPR_224), (add AGPR_224)>;
|
||||
defm AV_256 : AVRegClass<8, VReg_256.RegTypes, (add VGPR_256), (add AGPR_256)>;
|
||||
|
||||
let GlobalPriority = true in {
|
||||
defm AV_512 : AVRegClass<16, VReg_512.RegTypes, (add VGPR_512), (add AGPR_512)>;
|
||||
defm AV_1024 : AVRegClass<32, VReg_1024.RegTypes, (add VGPR_1024), (add AGPR_1024)>;
|
||||
}
|
||||
|
||||
//===----------------------------------------------------------------------===//
|
||||
// Register operands
|
||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -339,118 +339,115 @@ define i128 @extractelement_vgpr_v4i128_vgpr_idx(<4 x i128> addrspace(1)* %ptr,
|
|||
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-NEXT: s_clause 0x1
|
||||
; GFX10-NEXT: global_load_dwordx4 v[3:6], v[0:1], off
|
||||
; GFX10-NEXT: global_load_dwordx4 v[7:10], v[0:1], off offset:16
|
||||
; GFX10-NEXT: global_load_dwordx4 v[8:11], v[0:1], off
|
||||
; GFX10-NEXT: global_load_dwordx4 v[4:7], v[0:1], off offset:16
|
||||
; GFX10-NEXT: v_lshlrev_b32_e32 v2, 1, v2
|
||||
; GFX10-NEXT: global_load_dwordx4 v[11:14], v[0:1], off offset:32
|
||||
; GFX10-NEXT: v_add_nc_u32_e32 v19, 1, v2
|
||||
; GFX10-NEXT: v_add_nc_u32_e32 v3, 1, v2
|
||||
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2
|
||||
; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 1, v19
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(2)
|
||||
; GFX10-NEXT: v_cndmask_b32_e32 v15, v3, v5, vcc_lo
|
||||
; GFX10-NEXT: v_cndmask_b32_e32 v16, v4, v6, vcc_lo
|
||||
; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 1, v3
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(1)
|
||||
; GFX10-NEXT: v_cndmask_b32_e32 v12, v8, v10, vcc_lo
|
||||
; GFX10-NEXT: v_cndmask_b32_e32 v13, v9, v11, vcc_lo
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v14, v8, v10, s4
|
||||
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v2
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v5, s4
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, v6, s4
|
||||
; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 2, v19
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v15, v9, v11, s4
|
||||
; GFX10-NEXT: global_load_dwordx4 v[8:11], v[0:1], off offset:32
|
||||
; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 2, v3
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(1)
|
||||
; GFX10-NEXT: v_cndmask_b32_e32 v5, v15, v7, vcc_lo
|
||||
; GFX10-NEXT: v_cndmask_b32_e32 v6, v16, v8, vcc_lo
|
||||
; GFX10-NEXT: global_load_dwordx4 v[15:18], v[0:1], off offset:48
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v7, s4
|
||||
; GFX10-NEXT: v_cndmask_b32_e32 v12, v12, v4, vcc_lo
|
||||
; GFX10-NEXT: v_cndmask_b32_e32 v13, v13, v5, vcc_lo
|
||||
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v2
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, v8, s4
|
||||
; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 3, v19
|
||||
; GFX10-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc_lo
|
||||
; GFX10-NEXT: v_cndmask_b32_e32 v6, v6, v10, vcc_lo
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v9, s4
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, v10, s4
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v4, v14, v4, s4
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v5, v15, v5, s4
|
||||
; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 3, v3
|
||||
; GFX10-NEXT: v_cndmask_b32_e32 v16, v12, v6, vcc_lo
|
||||
; GFX10-NEXT: v_cndmask_b32_e32 v17, v13, v7, vcc_lo
|
||||
; GFX10-NEXT: global_load_dwordx4 v[12:15], v[0:1], off offset:48
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, v6, s4
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v7, s4
|
||||
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v2
|
||||
; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 4, v19
|
||||
; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 4, v3
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(1)
|
||||
; GFX10-NEXT: v_cndmask_b32_e32 v0, v5, v11, vcc_lo
|
||||
; GFX10-NEXT: v_cndmask_b32_e32 v1, v6, v12, vcc_lo
|
||||
; GFX10-NEXT: v_cndmask_b32_e32 v0, v16, v8, vcc_lo
|
||||
; GFX10-NEXT: v_cndmask_b32_e32 v1, v17, v9, vcc_lo
|
||||
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v2
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v11, s4
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, v12, s4
|
||||
; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 5, v19
|
||||
; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v13, vcc_lo
|
||||
; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v14, vcc_lo
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, v8, s4
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v9, s4
|
||||
; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 5, v3
|
||||
; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v10, vcc_lo
|
||||
; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v11, vcc_lo
|
||||
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v2
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v13, s4
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, v14, s4
|
||||
; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 6, v19
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, v10, s4
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v11, s4
|
||||
; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 6, v3
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v15, vcc_lo
|
||||
; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v16, vcc_lo
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v15, s4
|
||||
; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v12, vcc_lo
|
||||
; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v13, vcc_lo
|
||||
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v2
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, v16, s4
|
||||
; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 7, v19
|
||||
; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v17, vcc_lo
|
||||
; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v18, vcc_lo
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v2, v3, v17, s4
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v3, v4, v18, s4
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, v12, s4
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v13, s4
|
||||
; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 7, v3
|
||||
; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v14, vcc_lo
|
||||
; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v15, vcc_lo
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v2, v4, v14, s4
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v3, v5, v15, s4
|
||||
; GFX10-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX11-LABEL: extractelement_vgpr_v4i128_vgpr_idx:
|
||||
; GFX11: ; %bb.0:
|
||||
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX11-NEXT: s_clause 0x3
|
||||
; GFX11-NEXT: global_load_b128 v[3:6], v[0:1], off
|
||||
; GFX11-NEXT: global_load_b128 v[7:10], v[0:1], off offset:16
|
||||
; GFX11-NEXT: global_load_b128 v[11:14], v[0:1], off offset:32
|
||||
; GFX11-NEXT: global_load_b128 v[15:18], v[0:1], off offset:48
|
||||
; GFX11-NEXT: s_clause 0x1
|
||||
; GFX11-NEXT: global_load_b128 v[12:15], v[0:1], off
|
||||
; GFX11-NEXT: global_load_b128 v[4:7], v[0:1], off offset:16
|
||||
; GFX11-NEXT: v_lshlrev_b32_e32 v2, 1, v2
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2)
|
||||
; GFX11-NEXT: global_load_b128 v[8:11], v[0:1], off offset:32
|
||||
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2
|
||||
; GFX11-NEXT: s_waitcnt vmcnt(3)
|
||||
; GFX11-NEXT: v_dual_cndmask_b32 v20, v3, v5 :: v_dual_cndmask_b32 v21, v4, v6
|
||||
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v2
|
||||
; GFX11-NEXT: s_waitcnt vmcnt(2)
|
||||
; GFX11-NEXT: v_cndmask_b32_e32 v1, v21, v8, vcc_lo
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
|
||||
; GFX11-NEXT: v_dual_cndmask_b32 v0, v20, v7 :: v_dual_add_nc_u32 v19, 1, v2
|
||||
; GFX11-NEXT: v_dual_cndmask_b32 v17, v13, v15 :: v_dual_cndmask_b32 v16, v12, v14
|
||||
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v2
|
||||
; GFX11-NEXT: v_add_nc_u32_e32 v3, 1, v2
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 1, v3
|
||||
; GFX11-NEXT: v_cndmask_b32_e64 v18, v12, v14, s0
|
||||
; GFX11-NEXT: v_cndmask_b32_e64 v19, v13, v15, s0
|
||||
; GFX11-NEXT: global_load_b128 v[12:15], v[0:1], off offset:48
|
||||
; GFX11-NEXT: s_waitcnt vmcnt(2)
|
||||
; GFX11-NEXT: v_dual_cndmask_b32 v1, v17, v5 :: v_dual_cndmask_b32 v0, v16, v4
|
||||
; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 2, v3
|
||||
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v2
|
||||
; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 1, v19
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
|
||||
; GFX11-NEXT: v_dual_cndmask_b32 v0, v0, v9 :: v_dual_cndmask_b32 v1, v1, v10
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_3)
|
||||
; GFX11-NEXT: v_cndmask_b32_e64 v4, v18, v4, s0
|
||||
; GFX11-NEXT: v_cndmask_b32_e64 v5, v19, v5, s0
|
||||
; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 3, v3
|
||||
; GFX11-NEXT: v_dual_cndmask_b32 v0, v0, v6 :: v_dual_cndmask_b32 v1, v1, v7
|
||||
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v2
|
||||
; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, v5, s0
|
||||
; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, v6, s0
|
||||
; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 2, v19
|
||||
; GFX11-NEXT: v_cndmask_b32_e64 v5, v5, v7, s0
|
||||
; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 4, v3
|
||||
; GFX11-NEXT: s_waitcnt vmcnt(1)
|
||||
; GFX11-NEXT: v_dual_cndmask_b32 v0, v0, v11 :: v_dual_cndmask_b32 v1, v1, v12
|
||||
; GFX11-NEXT: v_dual_cndmask_b32 v0, v0, v8 :: v_dual_cndmask_b32 v1, v1, v9
|
||||
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v2
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_3)
|
||||
; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, v7, s0
|
||||
; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, v8, s0
|
||||
; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 3, v19
|
||||
; GFX11-NEXT: v_dual_cndmask_b32 v0, v0, v13 :: v_dual_cndmask_b32 v1, v1, v14
|
||||
; GFX11-NEXT: v_cndmask_b32_e64 v5, v5, v9, s0
|
||||
; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 5, v3
|
||||
; GFX11-NEXT: v_dual_cndmask_b32 v0, v0, v10 :: v_dual_cndmask_b32 v1, v1, v11
|
||||
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v2
|
||||
; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, v9, s0
|
||||
; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, v10, s0
|
||||
; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 4, v19
|
||||
; GFX11-NEXT: v_cndmask_b32_e64 v5, v5, v11, s0
|
||||
; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 6, v3
|
||||
; GFX11-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX11-NEXT: v_dual_cndmask_b32 v0, v0, v15 :: v_dual_cndmask_b32 v1, v1, v16
|
||||
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v2
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
|
||||
; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, v11, s0
|
||||
; GFX11-NEXT: v_dual_cndmask_b32 v0, v0, v12 :: v_dual_cndmask_b32 v1, v1, v13
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2)
|
||||
; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, v12, s0
|
||||
; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 5, v19
|
||||
; GFX11-NEXT: v_dual_cndmask_b32 v0, v0, v17 :: v_dual_cndmask_b32 v1, v1, v18
|
||||
; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, v13, s0
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_1)
|
||||
; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, v14, s0
|
||||
; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 6, v19
|
||||
; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, v15, s0
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
|
||||
; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, v16, s0
|
||||
; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 7, v19
|
||||
; GFX11-NEXT: v_cndmask_b32_e64 v2, v3, v17, s0
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
|
||||
; GFX11-NEXT: v_cndmask_b32_e64 v3, v4, v18, s0
|
||||
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v2
|
||||
; GFX11-NEXT: v_cndmask_b32_e64 v5, v5, v13, s0
|
||||
; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 7, v3
|
||||
; GFX11-NEXT: v_dual_cndmask_b32 v0, v0, v14 :: v_dual_cndmask_b32 v1, v1, v15
|
||||
; GFX11-NEXT: v_cndmask_b32_e64 v2, v4, v14, s0
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
|
||||
; GFX11-NEXT: v_cndmask_b32_e64 v3, v5, v15, s0
|
||||
; GFX11-NEXT: s_setpc_b64 s[30:31]
|
||||
%vector = load <4 x i128>, <4 x i128> addrspace(1)* %ptr
|
||||
%element = extractelement <4 x i128> %vector, i32 %idx
|
||||
|
|
|
@ -21,10 +21,10 @@ define amdgpu_kernel void @v_insert_v64i32_37(<64 x i32> addrspace(1)* %ptr.in,
|
|||
; GCN-NEXT: global_load_dwordx4 v[4:7], v64, s[0:1] offset:144
|
||||
; GCN-NEXT: global_load_dwordx4 v[8:11], v64, s[0:1] offset:160
|
||||
; GCN-NEXT: global_load_dwordx4 v[12:15], v64, s[0:1] offset:176
|
||||
; GCN-NEXT: global_load_dwordx4 v[16:19], v64, s[0:1] offset:192
|
||||
; GCN-NEXT: global_load_dwordx4 v[20:23], v64, s[0:1] offset:208
|
||||
; GCN-NEXT: global_load_dwordx4 v[24:27], v64, s[0:1] offset:224
|
||||
; GCN-NEXT: global_load_dwordx4 v[28:31], v64, s[0:1] offset:240
|
||||
; GCN-NEXT: global_load_dwordx4 v[28:31], v64, s[0:1] offset:192
|
||||
; GCN-NEXT: global_load_dwordx4 v[24:27], v64, s[0:1] offset:208
|
||||
; GCN-NEXT: global_load_dwordx4 v[20:23], v64, s[0:1] offset:224
|
||||
; GCN-NEXT: global_load_dwordx4 v[16:19], v64, s[0:1] offset:240
|
||||
; GCN-NEXT: s_waitcnt vmcnt(6)
|
||||
; GCN-NEXT: v_mov_b32_e32 v5, 0x3e7
|
||||
; GCN-NEXT: global_store_dwordx4 v64, v[0:3], s[2:3] offset:128
|
||||
|
@ -34,11 +34,11 @@ define amdgpu_kernel void @v_insert_v64i32_37(<64 x i32> addrspace(1)* %ptr.in,
|
|||
; GCN-NEXT: s_waitcnt vmcnt(7)
|
||||
; GCN-NEXT: global_store_dwordx4 v64, v[12:15], s[2:3] offset:176
|
||||
; GCN-NEXT: s_waitcnt vmcnt(7)
|
||||
; GCN-NEXT: global_store_dwordx4 v64, v[16:19], s[2:3] offset:192
|
||||
; GCN-NEXT: global_store_dwordx4 v64, v[28:31], s[2:3] offset:192
|
||||
; GCN-NEXT: s_waitcnt vmcnt(7)
|
||||
; GCN-NEXT: global_store_dwordx4 v64, v[20:23], s[2:3] offset:208
|
||||
; GCN-NEXT: global_store_dwordx4 v64, v[24:27], s[2:3] offset:208
|
||||
; GCN-NEXT: s_waitcnt vmcnt(7)
|
||||
; GCN-NEXT: global_store_dwordx4 v64, v[24:27], s[2:3] offset:224
|
||||
; GCN-NEXT: global_store_dwordx4 v64, v[20:23], s[2:3] offset:224
|
||||
; GCN-NEXT: global_store_dwordx4 v64, v[32:35], s[2:3]
|
||||
; GCN-NEXT: global_store_dwordx4 v64, v[36:39], s[2:3] offset:16
|
||||
; GCN-NEXT: global_store_dwordx4 v64, v[40:43], s[2:3] offset:32
|
||||
|
@ -48,7 +48,7 @@ define amdgpu_kernel void @v_insert_v64i32_37(<64 x i32> addrspace(1)* %ptr.in,
|
|||
; GCN-NEXT: global_store_dwordx4 v64, v[56:59], s[2:3] offset:96
|
||||
; GCN-NEXT: global_store_dwordx4 v64, v[60:63], s[2:3] offset:112
|
||||
; GCN-NEXT: s_waitcnt vmcnt(15)
|
||||
; GCN-NEXT: global_store_dwordx4 v64, v[28:31], s[2:3] offset:240
|
||||
; GCN-NEXT: global_store_dwordx4 v64, v[16:19], s[2:3] offset:240
|
||||
; GCN-NEXT: s_endpgm
|
||||
;
|
||||
; GFX10-LABEL: v_insert_v64i32_37:
|
||||
|
@ -67,24 +67,24 @@ define amdgpu_kernel void @v_insert_v64i32_37(<64 x i32> addrspace(1)* %ptr.in,
|
|||
; GFX10-NEXT: global_load_dwordx4 v[56:59], v64, s[0:1] offset:96
|
||||
; GFX10-NEXT: global_load_dwordx4 v[60:63], v64, s[0:1] offset:112
|
||||
; GFX10-NEXT: global_load_dwordx4 v[4:7], v64, s[0:1] offset:144
|
||||
; GFX10-NEXT: global_load_dwordx4 v[8:11], v64, s[0:1] offset:160
|
||||
; GFX10-NEXT: global_load_dwordx4 v[20:23], v64, s[0:1] offset:160
|
||||
; GFX10-NEXT: global_load_dwordx4 v[12:15], v64, s[0:1] offset:176
|
||||
; GFX10-NEXT: global_load_dwordx4 v[16:19], v64, s[0:1] offset:192
|
||||
; GFX10-NEXT: global_load_dwordx4 v[20:23], v64, s[0:1] offset:208
|
||||
; GFX10-NEXT: global_load_dwordx4 v[24:27], v64, s[0:1] offset:224
|
||||
; GFX10-NEXT: global_load_dwordx4 v[28:31], v64, s[0:1] offset:240
|
||||
; GFX10-NEXT: global_load_dwordx4 v[28:31], v64, s[0:1] offset:192
|
||||
; GFX10-NEXT: global_load_dwordx4 v[24:27], v64, s[0:1] offset:208
|
||||
; GFX10-NEXT: global_load_dwordx4 v[8:11], v64, s[0:1] offset:224
|
||||
; GFX10-NEXT: global_load_dwordx4 v[16:19], v64, s[0:1] offset:240
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(6)
|
||||
; GFX10-NEXT: v_mov_b32_e32 v5, 0x3e7
|
||||
; GFX10-NEXT: global_store_dwordx4 v64, v[0:3], s[2:3] offset:128
|
||||
; GFX10-NEXT: global_store_dwordx4 v64, v[4:7], s[2:3] offset:144
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(5)
|
||||
; GFX10-NEXT: global_store_dwordx4 v64, v[8:11], s[2:3] offset:160
|
||||
; GFX10-NEXT: global_store_dwordx4 v64, v[20:23], s[2:3] offset:160
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(4)
|
||||
; GFX10-NEXT: global_store_dwordx4 v64, v[12:15], s[2:3] offset:176
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(3)
|
||||
; GFX10-NEXT: global_store_dwordx4 v64, v[16:19], s[2:3] offset:192
|
||||
; GFX10-NEXT: global_store_dwordx4 v64, v[28:31], s[2:3] offset:192
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(2)
|
||||
; GFX10-NEXT: global_store_dwordx4 v64, v[20:23], s[2:3] offset:208
|
||||
; GFX10-NEXT: global_store_dwordx4 v64, v[24:27], s[2:3] offset:208
|
||||
; GFX10-NEXT: global_store_dwordx4 v64, v[32:35], s[2:3]
|
||||
; GFX10-NEXT: global_store_dwordx4 v64, v[36:39], s[2:3] offset:16
|
||||
; GFX10-NEXT: global_store_dwordx4 v64, v[40:43], s[2:3] offset:32
|
||||
|
@ -94,9 +94,9 @@ define amdgpu_kernel void @v_insert_v64i32_37(<64 x i32> addrspace(1)* %ptr.in,
|
|||
; GFX10-NEXT: global_store_dwordx4 v64, v[56:59], s[2:3] offset:96
|
||||
; GFX10-NEXT: global_store_dwordx4 v64, v[60:63], s[2:3] offset:112
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(1)
|
||||
; GFX10-NEXT: global_store_dwordx4 v64, v[24:27], s[2:3] offset:224
|
||||
; GFX10-NEXT: global_store_dwordx4 v64, v[8:11], s[2:3] offset:224
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: global_store_dwordx4 v64, v[28:31], s[2:3] offset:240
|
||||
; GFX10-NEXT: global_store_dwordx4 v64, v[16:19], s[2:3] offset:240
|
||||
; GFX10-NEXT: s_endpgm
|
||||
;
|
||||
; GFX11-LABEL: v_insert_v64i32_37:
|
||||
|
@ -117,24 +117,24 @@ define amdgpu_kernel void @v_insert_v64i32_37(<64 x i32> addrspace(1)* %ptr.in,
|
|||
; GFX11-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX11-NEXT: v_mov_b32_e32 v5, 0x3e7
|
||||
; GFX11-NEXT: s_clause 0x6
|
||||
; GFX11-NEXT: global_load_b128 v[0:3], v64, s[0:1] offset:128
|
||||
; GFX11-NEXT: global_load_b128 v[8:11], v64, s[0:1] offset:160
|
||||
; GFX11-NEXT: global_load_b128 v[12:15], v64, s[0:1] offset:176
|
||||
; GFX11-NEXT: global_load_b128 v[16:19], v64, s[0:1] offset:192
|
||||
; GFX11-NEXT: global_load_b128 v[20:23], v64, s[0:1] offset:208
|
||||
; GFX11-NEXT: global_load_b128 v[24:27], v64, s[0:1] offset:224
|
||||
; GFX11-NEXT: global_load_b128 v[28:31], v64, s[0:1] offset:240
|
||||
; GFX11-NEXT: global_load_b128 v[28:31], v64, s[0:1] offset:128
|
||||
; GFX11-NEXT: global_load_b128 v[24:27], v64, s[0:1] offset:160
|
||||
; GFX11-NEXT: global_load_b128 v[20:23], v64, s[0:1] offset:176
|
||||
; GFX11-NEXT: global_load_b128 v[0:3], v64, s[0:1] offset:192
|
||||
; GFX11-NEXT: global_load_b128 v[16:19], v64, s[0:1] offset:208
|
||||
; GFX11-NEXT: global_load_b128 v[8:11], v64, s[0:1] offset:224
|
||||
; GFX11-NEXT: global_load_b128 v[12:15], v64, s[0:1] offset:240
|
||||
; GFX11-NEXT: s_waitcnt vmcnt(6)
|
||||
; GFX11-NEXT: s_clause 0x1
|
||||
; GFX11-NEXT: global_store_b128 v64, v[0:3], s[2:3] offset:128
|
||||
; GFX11-NEXT: global_store_b128 v64, v[28:31], s[2:3] offset:128
|
||||
; GFX11-NEXT: global_store_b128 v64, v[4:7], s[2:3] offset:144
|
||||
; GFX11-NEXT: s_waitcnt vmcnt(5)
|
||||
; GFX11-NEXT: global_store_b128 v64, v[8:11], s[2:3] offset:160
|
||||
; GFX11-NEXT: global_store_b128 v64, v[24:27], s[2:3] offset:160
|
||||
; GFX11-NEXT: s_waitcnt vmcnt(4)
|
||||
; GFX11-NEXT: global_store_b128 v64, v[12:15], s[2:3] offset:176
|
||||
; GFX11-NEXT: global_store_b128 v64, v[20:23], s[2:3] offset:176
|
||||
; GFX11-NEXT: s_waitcnt vmcnt(3)
|
||||
; GFX11-NEXT: s_clause 0x8
|
||||
; GFX11-NEXT: global_store_b128 v64, v[16:19], s[2:3] offset:192
|
||||
; GFX11-NEXT: global_store_b128 v64, v[0:3], s[2:3] offset:192
|
||||
; GFX11-NEXT: global_store_b128 v64, v[32:35], s[2:3]
|
||||
; GFX11-NEXT: global_store_b128 v64, v[36:39], s[2:3] offset:16
|
||||
; GFX11-NEXT: global_store_b128 v64, v[40:43], s[2:3] offset:32
|
||||
|
@ -144,11 +144,11 @@ define amdgpu_kernel void @v_insert_v64i32_37(<64 x i32> addrspace(1)* %ptr.in,
|
|||
; GFX11-NEXT: global_store_b128 v64, v[56:59], s[2:3] offset:96
|
||||
; GFX11-NEXT: global_store_b128 v64, v[60:63], s[2:3] offset:112
|
||||
; GFX11-NEXT: s_waitcnt vmcnt(2)
|
||||
; GFX11-NEXT: global_store_b128 v64, v[20:23], s[2:3] offset:208
|
||||
; GFX11-NEXT: global_store_b128 v64, v[16:19], s[2:3] offset:208
|
||||
; GFX11-NEXT: s_waitcnt vmcnt(1)
|
||||
; GFX11-NEXT: global_store_b128 v64, v[24:27], s[2:3] offset:224
|
||||
; GFX11-NEXT: global_store_b128 v64, v[8:11], s[2:3] offset:224
|
||||
; GFX11-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX11-NEXT: global_store_b128 v64, v[28:31], s[2:3] offset:240
|
||||
; GFX11-NEXT: global_store_b128 v64, v[12:15], s[2:3] offset:240
|
||||
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
; GFX11-NEXT: s_endpgm
|
||||
%id = call i32 @llvm.amdgcn.workitem.id.x()
|
||||
|
|
|
@ -819,22 +819,22 @@ define void @dyn_insertelement_v8f64_const_s_v_v(double %val, i32 %idx) {
|
|||
; GPRIDX-NEXT: s_mov_b32 s9, 0x40080000
|
||||
; GPRIDX-NEXT: s_mov_b32 s8, s18
|
||||
; GPRIDX-NEXT: s_mov_b64 s[6:7], 2.0
|
||||
; GPRIDX-NEXT: v_mov_b32_e32 v3, s4
|
||||
; GPRIDX-NEXT: v_mov_b32_e32 v4, s5
|
||||
; GPRIDX-NEXT: v_mov_b32_e32 v5, s6
|
||||
; GPRIDX-NEXT: v_mov_b32_e32 v6, s7
|
||||
; GPRIDX-NEXT: v_mov_b32_e32 v7, s8
|
||||
; GPRIDX-NEXT: v_mov_b32_e32 v8, s9
|
||||
; GPRIDX-NEXT: v_mov_b32_e32 v9, s10
|
||||
; GPRIDX-NEXT: v_mov_b32_e32 v10, s11
|
||||
; GPRIDX-NEXT: v_mov_b32_e32 v11, s12
|
||||
; GPRIDX-NEXT: v_mov_b32_e32 v12, s13
|
||||
; GPRIDX-NEXT: v_mov_b32_e32 v13, s14
|
||||
; GPRIDX-NEXT: v_mov_b32_e32 v14, s15
|
||||
; GPRIDX-NEXT: v_mov_b32_e32 v15, s16
|
||||
; GPRIDX-NEXT: v_mov_b32_e32 v16, s17
|
||||
; GPRIDX-NEXT: v_mov_b32_e32 v17, s18
|
||||
; GPRIDX-NEXT: v_mov_b32_e32 v18, s19
|
||||
; GPRIDX-NEXT: v_mov_b32_e32 v4, s4
|
||||
; GPRIDX-NEXT: v_mov_b32_e32 v5, s5
|
||||
; GPRIDX-NEXT: v_mov_b32_e32 v6, s6
|
||||
; GPRIDX-NEXT: v_mov_b32_e32 v7, s7
|
||||
; GPRIDX-NEXT: v_mov_b32_e32 v8, s8
|
||||
; GPRIDX-NEXT: v_mov_b32_e32 v9, s9
|
||||
; GPRIDX-NEXT: v_mov_b32_e32 v10, s10
|
||||
; GPRIDX-NEXT: v_mov_b32_e32 v11, s11
|
||||
; GPRIDX-NEXT: v_mov_b32_e32 v12, s12
|
||||
; GPRIDX-NEXT: v_mov_b32_e32 v13, s13
|
||||
; GPRIDX-NEXT: v_mov_b32_e32 v14, s14
|
||||
; GPRIDX-NEXT: v_mov_b32_e32 v15, s15
|
||||
; GPRIDX-NEXT: v_mov_b32_e32 v16, s16
|
||||
; GPRIDX-NEXT: v_mov_b32_e32 v17, s17
|
||||
; GPRIDX-NEXT: v_mov_b32_e32 v18, s18
|
||||
; GPRIDX-NEXT: v_mov_b32_e32 v19, s19
|
||||
; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
|
||||
; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[16:17], 0, v2
|
||||
; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[4:5], 2, v2
|
||||
|
@ -843,29 +843,29 @@ define void @dyn_insertelement_v8f64_const_s_v_v(double %val, i32 %idx) {
|
|||
; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[10:11], 5, v2
|
||||
; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[12:13], 6, v2
|
||||
; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[14:15], 7, v2
|
||||
; GPRIDX-NEXT: v_cndmask_b32_e64 v3, v3, v0, s[16:17]
|
||||
; GPRIDX-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc
|
||||
; GPRIDX-NEXT: v_cndmask_b32_e64 v4, v4, v1, s[16:17]
|
||||
; GPRIDX-NEXT: v_cndmask_b32_e32 v6, v6, v1, vcc
|
||||
; GPRIDX-NEXT: v_cndmask_b32_e64 v7, v7, v0, s[4:5]
|
||||
; GPRIDX-NEXT: v_cndmask_b32_e64 v9, v9, v0, s[6:7]
|
||||
; GPRIDX-NEXT: v_cndmask_b32_e64 v11, v11, v0, s[8:9]
|
||||
; GPRIDX-NEXT: v_cndmask_b32_e64 v13, v13, v0, s[10:11]
|
||||
; GPRIDX-NEXT: v_cndmask_b32_e64 v15, v15, v0, s[12:13]
|
||||
; GPRIDX-NEXT: v_cndmask_b32_e64 v17, v17, v0, s[14:15]
|
||||
; GPRIDX-NEXT: v_cndmask_b32_e64 v8, v8, v1, s[4:5]
|
||||
; GPRIDX-NEXT: v_cndmask_b32_e64 v10, v10, v1, s[6:7]
|
||||
; GPRIDX-NEXT: v_cndmask_b32_e64 v12, v12, v1, s[8:9]
|
||||
; GPRIDX-NEXT: v_cndmask_b32_e64 v14, v14, v1, s[10:11]
|
||||
; GPRIDX-NEXT: v_cndmask_b32_e64 v16, v16, v1, s[12:13]
|
||||
; GPRIDX-NEXT: v_cndmask_b32_e64 v18, v18, v1, s[14:15]
|
||||
; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[3:6], off
|
||||
; GPRIDX-NEXT: v_cndmask_b32_e64 v2, v4, v0, s[16:17]
|
||||
; GPRIDX-NEXT: v_cndmask_b32_e32 v4, v6, v0, vcc
|
||||
; GPRIDX-NEXT: v_cndmask_b32_e64 v3, v5, v1, s[16:17]
|
||||
; GPRIDX-NEXT: v_cndmask_b32_e32 v5, v7, v1, vcc
|
||||
; GPRIDX-NEXT: v_cndmask_b32_e64 v6, v8, v0, s[4:5]
|
||||
; GPRIDX-NEXT: v_cndmask_b32_e64 v8, v10, v0, s[6:7]
|
||||
; GPRIDX-NEXT: v_cndmask_b32_e64 v10, v12, v0, s[8:9]
|
||||
; GPRIDX-NEXT: v_cndmask_b32_e64 v12, v14, v0, s[10:11]
|
||||
; GPRIDX-NEXT: v_cndmask_b32_e64 v14, v16, v0, s[12:13]
|
||||
; GPRIDX-NEXT: v_cndmask_b32_e64 v16, v18, v0, s[14:15]
|
||||
; GPRIDX-NEXT: v_cndmask_b32_e64 v7, v9, v1, s[4:5]
|
||||
; GPRIDX-NEXT: v_cndmask_b32_e64 v9, v11, v1, s[6:7]
|
||||
; GPRIDX-NEXT: v_cndmask_b32_e64 v11, v13, v1, s[8:9]
|
||||
; GPRIDX-NEXT: v_cndmask_b32_e64 v13, v15, v1, s[10:11]
|
||||
; GPRIDX-NEXT: v_cndmask_b32_e64 v15, v17, v1, s[12:13]
|
||||
; GPRIDX-NEXT: v_cndmask_b32_e64 v17, v19, v1, s[14:15]
|
||||
; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[2:5], off
|
||||
; GPRIDX-NEXT: s_waitcnt vmcnt(0)
|
||||
; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[7:10], off
|
||||
; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[6:9], off
|
||||
; GPRIDX-NEXT: s_waitcnt vmcnt(0)
|
||||
; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[11:14], off
|
||||
; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[10:13], off
|
||||
; GPRIDX-NEXT: s_waitcnt vmcnt(0)
|
||||
; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[15:18], off
|
||||
; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[14:17], off
|
||||
; GPRIDX-NEXT: s_waitcnt vmcnt(0)
|
||||
; GPRIDX-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
|
@ -1025,23 +1025,23 @@ define amdgpu_ps void @dyn_insertelement_v8f64_s_s_v(<8 x double> inreg %vec, do
|
|||
; GPRIDX-NEXT: s_mov_b32 s10, s12
|
||||
; GPRIDX-NEXT: s_mov_b32 s12, s14
|
||||
; GPRIDX-NEXT: s_mov_b32 s14, s16
|
||||
; GPRIDX-NEXT: v_mov_b32_e32 v16, s15
|
||||
; GPRIDX-NEXT: v_mov_b32_e32 v15, s14
|
||||
; GPRIDX-NEXT: v_mov_b32_e32 v14, s13
|
||||
; GPRIDX-NEXT: v_mov_b32_e32 v13, s12
|
||||
; GPRIDX-NEXT: v_mov_b32_e32 v12, s11
|
||||
; GPRIDX-NEXT: v_mov_b32_e32 v11, s10
|
||||
; GPRIDX-NEXT: v_mov_b32_e32 v10, s9
|
||||
; GPRIDX-NEXT: v_mov_b32_e32 v9, s8
|
||||
; GPRIDX-NEXT: v_mov_b32_e32 v8, s7
|
||||
; GPRIDX-NEXT: v_mov_b32_e32 v7, s6
|
||||
; GPRIDX-NEXT: v_mov_b32_e32 v6, s5
|
||||
; GPRIDX-NEXT: v_mov_b32_e32 v5, s4
|
||||
; GPRIDX-NEXT: v_mov_b32_e32 v4, s3
|
||||
; GPRIDX-NEXT: v_mov_b32_e32 v3, s2
|
||||
; GPRIDX-NEXT: v_mov_b32_e32 v2, s1
|
||||
; GPRIDX-NEXT: v_mov_b32_e32 v1, s0
|
||||
; GPRIDX-NEXT: v_mov_b32_e32 v17, s18
|
||||
; GPRIDX-NEXT: v_mov_b32_e32 v17, s15
|
||||
; GPRIDX-NEXT: v_mov_b32_e32 v16, s14
|
||||
; GPRIDX-NEXT: v_mov_b32_e32 v15, s13
|
||||
; GPRIDX-NEXT: v_mov_b32_e32 v14, s12
|
||||
; GPRIDX-NEXT: v_mov_b32_e32 v13, s11
|
||||
; GPRIDX-NEXT: v_mov_b32_e32 v12, s10
|
||||
; GPRIDX-NEXT: v_mov_b32_e32 v11, s9
|
||||
; GPRIDX-NEXT: v_mov_b32_e32 v10, s8
|
||||
; GPRIDX-NEXT: v_mov_b32_e32 v9, s7
|
||||
; GPRIDX-NEXT: v_mov_b32_e32 v8, s6
|
||||
; GPRIDX-NEXT: v_mov_b32_e32 v7, s5
|
||||
; GPRIDX-NEXT: v_mov_b32_e32 v6, s4
|
||||
; GPRIDX-NEXT: v_mov_b32_e32 v5, s3
|
||||
; GPRIDX-NEXT: v_mov_b32_e32 v4, s2
|
||||
; GPRIDX-NEXT: v_mov_b32_e32 v3, s1
|
||||
; GPRIDX-NEXT: v_mov_b32_e32 v2, s0
|
||||
; GPRIDX-NEXT: v_mov_b32_e32 v1, s18
|
||||
; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
|
||||
; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v0
|
||||
; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v0
|
||||
|
@ -1050,30 +1050,30 @@ define amdgpu_ps void @dyn_insertelement_v8f64_s_s_v(<8 x double> inreg %vec, do
|
|||
; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[8:9], 6, v0
|
||||
; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[10:11], 7, v0
|
||||
; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v0
|
||||
; GPRIDX-NEXT: v_mov_b32_e32 v0, s19
|
||||
; GPRIDX-NEXT: v_cndmask_b32_e64 v1, v1, v17, s[12:13]
|
||||
; GPRIDX-NEXT: v_cndmask_b32_e32 v3, v3, v17, vcc
|
||||
; GPRIDX-NEXT: v_cndmask_b32_e64 v2, v2, v0, s[12:13]
|
||||
; GPRIDX-NEXT: v_cndmask_b32_e32 v4, v4, v0, vcc
|
||||
; GPRIDX-NEXT: v_cndmask_b32_e64 v5, v5, v17, s[0:1]
|
||||
; GPRIDX-NEXT: v_cndmask_b32_e64 v7, v7, v17, s[2:3]
|
||||
; GPRIDX-NEXT: v_cndmask_b32_e64 v9, v9, v17, s[4:5]
|
||||
; GPRIDX-NEXT: v_cndmask_b32_e64 v11, v11, v17, s[6:7]
|
||||
; GPRIDX-NEXT: v_cndmask_b32_e64 v13, v13, v17, s[8:9]
|
||||
; GPRIDX-NEXT: v_cndmask_b32_e64 v15, v15, v17, s[10:11]
|
||||
; GPRIDX-NEXT: v_cndmask_b32_e64 v6, v6, v0, s[0:1]
|
||||
; GPRIDX-NEXT: v_cndmask_b32_e64 v8, v8, v0, s[2:3]
|
||||
; GPRIDX-NEXT: v_cndmask_b32_e64 v10, v10, v0, s[4:5]
|
||||
; GPRIDX-NEXT: v_cndmask_b32_e64 v12, v12, v0, s[6:7]
|
||||
; GPRIDX-NEXT: v_cndmask_b32_e64 v14, v14, v0, s[8:9]
|
||||
; GPRIDX-NEXT: v_cndmask_b32_e64 v16, v16, v0, s[10:11]
|
||||
; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[1:4], off
|
||||
; GPRIDX-NEXT: v_cndmask_b32_e64 v0, v2, v1, s[12:13]
|
||||
; GPRIDX-NEXT: v_cndmask_b32_e32 v2, v4, v1, vcc
|
||||
; GPRIDX-NEXT: v_cndmask_b32_e64 v4, v6, v1, s[0:1]
|
||||
; GPRIDX-NEXT: v_cndmask_b32_e64 v6, v8, v1, s[2:3]
|
||||
; GPRIDX-NEXT: v_cndmask_b32_e64 v8, v10, v1, s[4:5]
|
||||
; GPRIDX-NEXT: v_cndmask_b32_e64 v10, v12, v1, s[6:7]
|
||||
; GPRIDX-NEXT: v_cndmask_b32_e64 v12, v14, v1, s[8:9]
|
||||
; GPRIDX-NEXT: v_cndmask_b32_e64 v14, v16, v1, s[10:11]
|
||||
; GPRIDX-NEXT: v_mov_b32_e32 v16, s19
|
||||
; GPRIDX-NEXT: v_cndmask_b32_e64 v1, v3, v16, s[12:13]
|
||||
; GPRIDX-NEXT: v_cndmask_b32_e32 v3, v5, v16, vcc
|
||||
; GPRIDX-NEXT: v_cndmask_b32_e64 v5, v7, v16, s[0:1]
|
||||
; GPRIDX-NEXT: v_cndmask_b32_e64 v7, v9, v16, s[2:3]
|
||||
; GPRIDX-NEXT: v_cndmask_b32_e64 v9, v11, v16, s[4:5]
|
||||
; GPRIDX-NEXT: v_cndmask_b32_e64 v11, v13, v16, s[6:7]
|
||||
; GPRIDX-NEXT: v_cndmask_b32_e64 v13, v15, v16, s[8:9]
|
||||
; GPRIDX-NEXT: v_cndmask_b32_e64 v15, v17, v16, s[10:11]
|
||||
; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
|
||||
; GPRIDX-NEXT: s_waitcnt vmcnt(0)
|
||||
; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[5:8], off
|
||||
; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[4:7], off
|
||||
; GPRIDX-NEXT: s_waitcnt vmcnt(0)
|
||||
; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[9:12], off
|
||||
; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[8:11], off
|
||||
; GPRIDX-NEXT: s_waitcnt vmcnt(0)
|
||||
; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[13:16], off
|
||||
; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[12:15], off
|
||||
; GPRIDX-NEXT: s_waitcnt vmcnt(0)
|
||||
; GPRIDX-NEXT: s_endpgm
|
||||
;
|
||||
|
@ -1447,22 +1447,22 @@ define amdgpu_ps void @dyn_insertelement_v8f64_s_v_v(<8 x double> inreg %vec, do
|
|||
; GPRIDX-NEXT: s_mov_b32 s10, s12
|
||||
; GPRIDX-NEXT: s_mov_b32 s12, s14
|
||||
; GPRIDX-NEXT: s_mov_b32 s14, s16
|
||||
; GPRIDX-NEXT: v_mov_b32_e32 v18, s15
|
||||
; GPRIDX-NEXT: v_mov_b32_e32 v17, s14
|
||||
; GPRIDX-NEXT: v_mov_b32_e32 v16, s13
|
||||
; GPRIDX-NEXT: v_mov_b32_e32 v15, s12
|
||||
; GPRIDX-NEXT: v_mov_b32_e32 v14, s11
|
||||
; GPRIDX-NEXT: v_mov_b32_e32 v13, s10
|
||||
; GPRIDX-NEXT: v_mov_b32_e32 v12, s9
|
||||
; GPRIDX-NEXT: v_mov_b32_e32 v11, s8
|
||||
; GPRIDX-NEXT: v_mov_b32_e32 v10, s7
|
||||
; GPRIDX-NEXT: v_mov_b32_e32 v9, s6
|
||||
; GPRIDX-NEXT: v_mov_b32_e32 v8, s5
|
||||
; GPRIDX-NEXT: v_mov_b32_e32 v7, s4
|
||||
; GPRIDX-NEXT: v_mov_b32_e32 v6, s3
|
||||
; GPRIDX-NEXT: v_mov_b32_e32 v5, s2
|
||||
; GPRIDX-NEXT: v_mov_b32_e32 v4, s1
|
||||
; GPRIDX-NEXT: v_mov_b32_e32 v3, s0
|
||||
; GPRIDX-NEXT: v_mov_b32_e32 v19, s15
|
||||
; GPRIDX-NEXT: v_mov_b32_e32 v18, s14
|
||||
; GPRIDX-NEXT: v_mov_b32_e32 v17, s13
|
||||
; GPRIDX-NEXT: v_mov_b32_e32 v16, s12
|
||||
; GPRIDX-NEXT: v_mov_b32_e32 v15, s11
|
||||
; GPRIDX-NEXT: v_mov_b32_e32 v14, s10
|
||||
; GPRIDX-NEXT: v_mov_b32_e32 v13, s9
|
||||
; GPRIDX-NEXT: v_mov_b32_e32 v12, s8
|
||||
; GPRIDX-NEXT: v_mov_b32_e32 v11, s7
|
||||
; GPRIDX-NEXT: v_mov_b32_e32 v10, s6
|
||||
; GPRIDX-NEXT: v_mov_b32_e32 v9, s5
|
||||
; GPRIDX-NEXT: v_mov_b32_e32 v8, s4
|
||||
; GPRIDX-NEXT: v_mov_b32_e32 v7, s3
|
||||
; GPRIDX-NEXT: v_mov_b32_e32 v6, s2
|
||||
; GPRIDX-NEXT: v_mov_b32_e32 v5, s1
|
||||
; GPRIDX-NEXT: v_mov_b32_e32 v4, s0
|
||||
; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
|
||||
; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v2
|
||||
; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v2
|
||||
|
@ -1471,29 +1471,29 @@ define amdgpu_ps void @dyn_insertelement_v8f64_s_v_v(<8 x double> inreg %vec, do
|
|||
; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[6:7], 5, v2
|
||||
; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[8:9], 6, v2
|
||||
; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[10:11], 7, v2
|
||||
; GPRIDX-NEXT: v_cndmask_b32_e64 v3, v3, v0, s[12:13]
|
||||
; GPRIDX-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc
|
||||
; GPRIDX-NEXT: v_cndmask_b32_e64 v4, v4, v1, s[12:13]
|
||||
; GPRIDX-NEXT: v_cndmask_b32_e32 v6, v6, v1, vcc
|
||||
; GPRIDX-NEXT: v_cndmask_b32_e64 v7, v7, v0, s[0:1]
|
||||
; GPRIDX-NEXT: v_cndmask_b32_e64 v9, v9, v0, s[2:3]
|
||||
; GPRIDX-NEXT: v_cndmask_b32_e64 v11, v11, v0, s[4:5]
|
||||
; GPRIDX-NEXT: v_cndmask_b32_e64 v13, v13, v0, s[6:7]
|
||||
; GPRIDX-NEXT: v_cndmask_b32_e64 v15, v15, v0, s[8:9]
|
||||
; GPRIDX-NEXT: v_cndmask_b32_e64 v17, v17, v0, s[10:11]
|
||||
; GPRIDX-NEXT: v_cndmask_b32_e64 v8, v8, v1, s[0:1]
|
||||
; GPRIDX-NEXT: v_cndmask_b32_e64 v10, v10, v1, s[2:3]
|
||||
; GPRIDX-NEXT: v_cndmask_b32_e64 v12, v12, v1, s[4:5]
|
||||
; GPRIDX-NEXT: v_cndmask_b32_e64 v14, v14, v1, s[6:7]
|
||||
; GPRIDX-NEXT: v_cndmask_b32_e64 v16, v16, v1, s[8:9]
|
||||
; GPRIDX-NEXT: v_cndmask_b32_e64 v18, v18, v1, s[10:11]
|
||||
; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[3:6], off
|
||||
; GPRIDX-NEXT: v_cndmask_b32_e64 v2, v4, v0, s[12:13]
|
||||
; GPRIDX-NEXT: v_cndmask_b32_e32 v4, v6, v0, vcc
|
||||
; GPRIDX-NEXT: v_cndmask_b32_e64 v3, v5, v1, s[12:13]
|
||||
; GPRIDX-NEXT: v_cndmask_b32_e32 v5, v7, v1, vcc
|
||||
; GPRIDX-NEXT: v_cndmask_b32_e64 v6, v8, v0, s[0:1]
|
||||
; GPRIDX-NEXT: v_cndmask_b32_e64 v8, v10, v0, s[2:3]
|
||||
; GPRIDX-NEXT: v_cndmask_b32_e64 v10, v12, v0, s[4:5]
|
||||
; GPRIDX-NEXT: v_cndmask_b32_e64 v12, v14, v0, s[6:7]
|
||||
; GPRIDX-NEXT: v_cndmask_b32_e64 v14, v16, v0, s[8:9]
|
||||
; GPRIDX-NEXT: v_cndmask_b32_e64 v16, v18, v0, s[10:11]
|
||||
; GPRIDX-NEXT: v_cndmask_b32_e64 v7, v9, v1, s[0:1]
|
||||
; GPRIDX-NEXT: v_cndmask_b32_e64 v9, v11, v1, s[2:3]
|
||||
; GPRIDX-NEXT: v_cndmask_b32_e64 v11, v13, v1, s[4:5]
|
||||
; GPRIDX-NEXT: v_cndmask_b32_e64 v13, v15, v1, s[6:7]
|
||||
; GPRIDX-NEXT: v_cndmask_b32_e64 v15, v17, v1, s[8:9]
|
||||
; GPRIDX-NEXT: v_cndmask_b32_e64 v17, v19, v1, s[10:11]
|
||||
; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[2:5], off
|
||||
; GPRIDX-NEXT: s_waitcnt vmcnt(0)
|
||||
; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[7:10], off
|
||||
; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[6:9], off
|
||||
; GPRIDX-NEXT: s_waitcnt vmcnt(0)
|
||||
; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[11:14], off
|
||||
; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[10:13], off
|
||||
; GPRIDX-NEXT: s_waitcnt vmcnt(0)
|
||||
; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[15:18], off
|
||||
; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[14:17], off
|
||||
; GPRIDX-NEXT: s_waitcnt vmcnt(0)
|
||||
; GPRIDX-NEXT: s_endpgm
|
||||
;
|
||||
|
|
|
@ -878,34 +878,33 @@ entry:
|
|||
define amdgpu_kernel void @double15_inselt(<15 x double> addrspace(1)* %out, <15 x double> %vec, i32 %sel) {
|
||||
; GCN-LABEL: double15_inselt:
|
||||
; GCN: ; %bb.0: ; %entry
|
||||
; GCN-NEXT: s_load_dwordx16 s[8:23], s[0:1], 0xa4
|
||||
; GCN-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0xa4
|
||||
; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x114
|
||||
; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x104
|
||||
; GCN-NEXT: s_load_dwordx4 s[20:23], s[0:1], 0x104
|
||||
; GCN-NEXT: s_load_dwordx8 s[24:31], s[0:1], 0xe4
|
||||
; GCN-NEXT: v_mov_b32_e32 v32, 0x3ff00000
|
||||
; GCN-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GCN-NEXT: v_mov_b32_e32 v0, s8
|
||||
; GCN-NEXT: v_mov_b32_e32 v28, s2
|
||||
; GCN-NEXT: v_mov_b32_e32 v24, s4
|
||||
; GCN-NEXT: v_mov_b32_e32 v0, s4
|
||||
; GCN-NEXT: s_load_dword s4, s[0:1], 0x124
|
||||
; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
|
||||
; GCN-NEXT: v_mov_b32_e32 v1, s9
|
||||
; GCN-NEXT: v_mov_b32_e32 v2, s10
|
||||
; GCN-NEXT: v_mov_b32_e32 v3, s11
|
||||
; GCN-NEXT: v_mov_b32_e32 v28, s2
|
||||
; GCN-NEXT: v_mov_b32_e32 v1, s5
|
||||
; GCN-NEXT: v_mov_b32_e32 v2, s6
|
||||
; GCN-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GCN-NEXT: s_lshl_b32 s2, s4, 1
|
||||
; GCN-NEXT: v_mov_b32_e32 v4, s12
|
||||
; GCN-NEXT: v_mov_b32_e32 v5, s13
|
||||
; GCN-NEXT: v_mov_b32_e32 v6, s14
|
||||
; GCN-NEXT: v_mov_b32_e32 v7, s15
|
||||
; GCN-NEXT: v_mov_b32_e32 v8, s16
|
||||
; GCN-NEXT: v_mov_b32_e32 v9, s17
|
||||
; GCN-NEXT: v_mov_b32_e32 v10, s18
|
||||
; GCN-NEXT: v_mov_b32_e32 v11, s19
|
||||
; GCN-NEXT: v_mov_b32_e32 v12, s20
|
||||
; GCN-NEXT: v_mov_b32_e32 v13, s21
|
||||
; GCN-NEXT: v_mov_b32_e32 v14, s22
|
||||
; GCN-NEXT: v_mov_b32_e32 v15, s23
|
||||
; GCN-NEXT: v_mov_b32_e32 v3, s7
|
||||
; GCN-NEXT: v_mov_b32_e32 v4, s8
|
||||
; GCN-NEXT: v_mov_b32_e32 v5, s9
|
||||
; GCN-NEXT: v_mov_b32_e32 v6, s10
|
||||
; GCN-NEXT: v_mov_b32_e32 v7, s11
|
||||
; GCN-NEXT: v_mov_b32_e32 v8, s12
|
||||
; GCN-NEXT: v_mov_b32_e32 v9, s13
|
||||
; GCN-NEXT: v_mov_b32_e32 v10, s14
|
||||
; GCN-NEXT: v_mov_b32_e32 v11, s15
|
||||
; GCN-NEXT: v_mov_b32_e32 v12, s16
|
||||
; GCN-NEXT: v_mov_b32_e32 v13, s17
|
||||
; GCN-NEXT: v_mov_b32_e32 v14, s18
|
||||
; GCN-NEXT: v_mov_b32_e32 v15, s19
|
||||
; GCN-NEXT: v_mov_b32_e32 v16, s24
|
||||
; GCN-NEXT: v_mov_b32_e32 v17, s25
|
||||
; GCN-NEXT: v_mov_b32_e32 v18, s26
|
||||
|
@ -914,9 +913,10 @@ define amdgpu_kernel void @double15_inselt(<15 x double> addrspace(1)* %out, <15
|
|||
; GCN-NEXT: v_mov_b32_e32 v21, s29
|
||||
; GCN-NEXT: v_mov_b32_e32 v22, s30
|
||||
; GCN-NEXT: v_mov_b32_e32 v23, s31
|
||||
; GCN-NEXT: v_mov_b32_e32 v25, s5
|
||||
; GCN-NEXT: v_mov_b32_e32 v26, s6
|
||||
; GCN-NEXT: v_mov_b32_e32 v27, s7
|
||||
; GCN-NEXT: v_mov_b32_e32 v24, s20
|
||||
; GCN-NEXT: v_mov_b32_e32 v25, s21
|
||||
; GCN-NEXT: v_mov_b32_e32 v26, s22
|
||||
; GCN-NEXT: v_mov_b32_e32 v27, s23
|
||||
; GCN-NEXT: v_mov_b32_e32 v29, s3
|
||||
; GCN-NEXT: s_mov_b32 m0, s2
|
||||
; GCN-NEXT: v_movreld_b32_e32 v0, 0
|
||||
|
|
|
@ -39,13 +39,13 @@ define amdgpu_kernel void @test_iglp_opt_mfma_gemm(<32 x float> addrspace(3)* no
|
|||
; GCN-NEXT: ds_read_b128 a[32:35], v1 offset:8192
|
||||
; GCN-NEXT: ds_read_b128 a[56:59], v1 offset:8288
|
||||
; GCN-NEXT: v_add_u32_e32 v4, 0x6000, v1
|
||||
; GCN-NEXT: ds_read_b128 a[84:87], v1 offset:49264
|
||||
; GCN-NEXT: ds_read_b128 a[80:83], v1 offset:49248
|
||||
; GCN-NEXT: ds_read_b128 a[76:79], v1 offset:49232
|
||||
; GCN-NEXT: ds_read_b128 a[72:75], v1 offset:49216
|
||||
; GCN-NEXT: ds_read_b128 a[68:71], v1 offset:49200
|
||||
; GCN-NEXT: ds_read_b128 a[64:67], v1 offset:49184
|
||||
; GCN-NEXT: ds_read_b128 a[116:119], v4 offset:57456
|
||||
; GCN-NEXT: ds_read_b128 a[116:119], v1 offset:24688
|
||||
; GCN-NEXT: ds_read_b128 a[112:115], v1 offset:24672
|
||||
; GCN-NEXT: ds_read_b128 a[108:111], v1 offset:24656
|
||||
; GCN-NEXT: ds_read_b128 a[104:107], v1 offset:24640
|
||||
; GCN-NEXT: ds_read_b128 a[100:103], v1 offset:24624
|
||||
; GCN-NEXT: ds_read_b128 a[96:99], v1 offset:24608
|
||||
; GCN-NEXT: ds_read_b128 a[92:95], v1 offset:24592
|
||||
; GCN-NEXT: s_nop 3
|
||||
; GCN-NEXT: ds_write_b128 v0, a[28:31] offset:112
|
||||
; GCN-NEXT: ds_write_b128 v0, a[24:27] offset:96
|
||||
|
@ -60,28 +60,28 @@ define amdgpu_kernel void @test_iglp_opt_mfma_gemm(<32 x float> addrspace(3)* no
|
|||
; GCN-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GCN-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v2, v3, a[32:63]
|
||||
; GCN-NEXT: v_mov_b32_e32 v0, s1
|
||||
; GCN-NEXT: ds_read_b128 a[28:31], v1 offset:24688
|
||||
; GCN-NEXT: ds_read_b128 a[24:27], v1 offset:24672
|
||||
; GCN-NEXT: ds_read_b128 a[20:23], v1 offset:24656
|
||||
; GCN-NEXT: ds_read_b128 a[16:19], v1 offset:24640
|
||||
; GCN-NEXT: ds_read_b128 a[12:15], v1 offset:24624
|
||||
; GCN-NEXT: ds_read_b128 a[8:11], v1 offset:24608
|
||||
; GCN-NEXT: ds_read_b128 a[4:7], v1 offset:24592
|
||||
; GCN-NEXT: ds_read_b128 a[0:3], v1 offset:24576
|
||||
; GCN-NEXT: ds_read_b128 a[112:115], v4 offset:57440
|
||||
; GCN-NEXT: ds_read_b128 a[108:111], v4 offset:57424
|
||||
; GCN-NEXT: ds_read_b128 a[104:107], v4 offset:57408
|
||||
; GCN-NEXT: ds_read_b128 a[88:91], v4 offset:57344
|
||||
; GCN-NEXT: ds_read_b128 a[92:95], v4 offset:57360
|
||||
; GCN-NEXT: ds_read_b128 a[96:99], v4 offset:57376
|
||||
; GCN-NEXT: ds_read_b128 a[88:91], v1 offset:24576
|
||||
; GCN-NEXT: ds_read_b128 a[84:87], v1 offset:49264
|
||||
; GCN-NEXT: ds_read_b128 a[80:83], v1 offset:49248
|
||||
; GCN-NEXT: ds_read_b128 a[76:79], v1 offset:49232
|
||||
; GCN-NEXT: ds_read_b128 a[72:75], v1 offset:49216
|
||||
; GCN-NEXT: ds_read_b128 a[68:71], v1 offset:49200
|
||||
; GCN-NEXT: ds_read_b128 a[64:67], v1 offset:49184
|
||||
; GCN-NEXT: ds_read_b128 a[28:31], v4 offset:57456
|
||||
; GCN-NEXT: ds_read_b128 a[24:27], v4 offset:57440
|
||||
; GCN-NEXT: ds_read_b128 a[20:23], v4 offset:57424
|
||||
; GCN-NEXT: ds_read_b128 a[16:19], v4 offset:57408
|
||||
; GCN-NEXT: ds_read_b128 a[0:3], v4 offset:57344
|
||||
; GCN-NEXT: ds_read_b128 a[4:7], v4 offset:57360
|
||||
; GCN-NEXT: ds_read_b128 a[8:11], v4 offset:57376
|
||||
; GCN-NEXT: s_nop 3
|
||||
; GCN-NEXT: ds_write_b128 v0, a[56:59] offset:8288
|
||||
; GCN-NEXT: ds_write_b128 v0, a[60:63] offset:8304
|
||||
; GCN-NEXT: ds_read_b128 a[60:63], v1 offset:49168
|
||||
; GCN-NEXT: ds_read_b128 a[56:59], v1 offset:49152
|
||||
; GCN-NEXT: ds_read_b128 a[100:103], v4 offset:57392
|
||||
; GCN-NEXT: ds_read_b128 a[12:15], v4 offset:57392
|
||||
; GCN-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GCN-NEXT: v_mfma_f32_32x32x1f32 a[88:119], v2, v3, a[88:119]
|
||||
; GCN-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31]
|
||||
; GCN-NEXT: ds_write_b128 v0, a[48:51] offset:8256
|
||||
; GCN-NEXT: ds_write_b128 v0, a[52:55] offset:8272
|
||||
; GCN-NEXT: ds_write_b128 v0, a[40:43] offset:8224
|
||||
|
@ -91,15 +91,15 @@ define amdgpu_kernel void @test_iglp_opt_mfma_gemm(<32 x float> addrspace(3)* no
|
|||
; GCN-NEXT: v_mfma_f32_32x32x1f32 a[56:87], v2, v3, a[56:87]
|
||||
; GCN-NEXT: s_nop 7
|
||||
; GCN-NEXT: s_nop 3
|
||||
; GCN-NEXT: ds_write_b128 v0, a[112:115] offset:32864
|
||||
; GCN-NEXT: ds_write_b128 v0, a[116:119] offset:32880
|
||||
; GCN-NEXT: ds_write_b128 v0, a[104:107] offset:32832
|
||||
; GCN-NEXT: ds_write_b128 v0, a[108:111] offset:32848
|
||||
; GCN-NEXT: ds_write_b128 v0, a[96:99] offset:32800
|
||||
; GCN-NEXT: ds_write_b128 v0, a[100:103] offset:32816
|
||||
; GCN-NEXT: ds_write_b128 v0, a[88:91] offset:32768
|
||||
; GCN-NEXT: ds_write_b128 v0, a[92:95] offset:32784
|
||||
; GCN-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31]
|
||||
; GCN-NEXT: ds_write_b128 v0, a[24:27] offset:32864
|
||||
; GCN-NEXT: ds_write_b128 v0, a[28:31] offset:32880
|
||||
; GCN-NEXT: ds_write_b128 v0, a[16:19] offset:32832
|
||||
; GCN-NEXT: ds_write_b128 v0, a[20:23] offset:32848
|
||||
; GCN-NEXT: ds_write_b128 v0, a[8:11] offset:32800
|
||||
; GCN-NEXT: ds_write_b128 v0, a[12:15] offset:32816
|
||||
; GCN-NEXT: ds_write_b128 v0, a[0:3] offset:32768
|
||||
; GCN-NEXT: ds_write_b128 v0, a[4:7] offset:32784
|
||||
; GCN-NEXT: v_mfma_f32_32x32x1f32 a[88:119], v2, v3, a[88:119]
|
||||
; GCN-NEXT: ds_write_b128 v0, a[80:83] offset:24672
|
||||
; GCN-NEXT: ds_write_b128 v0, a[84:87] offset:24688
|
||||
; GCN-NEXT: ds_write_b128 v0, a[72:75] offset:24640
|
||||
|
@ -110,14 +110,14 @@ define amdgpu_kernel void @test_iglp_opt_mfma_gemm(<32 x float> addrspace(3)* no
|
|||
; GCN-NEXT: ds_write_b128 v0, a[60:63] offset:24592
|
||||
; GCN-NEXT: s_nop 7
|
||||
; GCN-NEXT: s_nop 2
|
||||
; GCN-NEXT: ds_write_b128 v0, a[24:27] offset:16480
|
||||
; GCN-NEXT: ds_write_b128 v0, a[28:31] offset:16496
|
||||
; GCN-NEXT: ds_write_b128 v0, a[16:19] offset:16448
|
||||
; GCN-NEXT: ds_write_b128 v0, a[20:23] offset:16464
|
||||
; GCN-NEXT: ds_write_b128 v0, a[8:11] offset:16416
|
||||
; GCN-NEXT: ds_write_b128 v0, a[12:15] offset:16432
|
||||
; GCN-NEXT: ds_write_b128 v0, a[0:3] offset:16384
|
||||
; GCN-NEXT: ds_write_b128 v0, a[4:7] offset:16400
|
||||
; GCN-NEXT: ds_write_b128 v0, a[112:115] offset:16480
|
||||
; GCN-NEXT: ds_write_b128 v0, a[116:119] offset:16496
|
||||
; GCN-NEXT: ds_write_b128 v0, a[104:107] offset:16448
|
||||
; GCN-NEXT: ds_write_b128 v0, a[108:111] offset:16464
|
||||
; GCN-NEXT: ds_write_b128 v0, a[96:99] offset:16416
|
||||
; GCN-NEXT: ds_write_b128 v0, a[100:103] offset:16432
|
||||
; GCN-NEXT: ds_write_b128 v0, a[88:91] offset:16384
|
||||
; GCN-NEXT: ds_write_b128 v0, a[92:95] offset:16400
|
||||
; GCN-NEXT: s_endpgm
|
||||
entry:
|
||||
call void @llvm.amdgcn.iglp.opt(i32 0)
|
||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -1,7 +1,7 @@
|
|||
; RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GREEDY %s
|
||||
; RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GREEDY %s
|
||||
; RUN: llc -march=amdgcn -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GREEDY %s
|
||||
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GREEDY %s
|
||||
; RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GREEDY,GREEDY908 %s
|
||||
; RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GREEDY,GREEDY90A %s
|
||||
; RUN: llc -march=amdgcn -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GREEDY,GREEDY90A %s
|
||||
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GREEDY,GREEDY90A-GISEL %s
|
||||
; RUN: llc -march=amdgcn -mcpu=gfx90a -sgpr-regalloc=fast -vgpr-regalloc=fast -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,FAST %s
|
||||
|
||||
; Check that Dst and SrcC of MFMA instructions reading more than 4 registers as SrcC
|
||||
|
@ -29,8 +29,15 @@ bb:
|
|||
}
|
||||
|
||||
; GCN-LABEL: {{^}}test_mfma_f32_16x16x1f32:
|
||||
; GREEDY: v_mfma_f32_16x16x1{{.*}} a[0:15], v{{[0-9]+}}, v{{[0-9]+}}, a[0:15]
|
||||
; GREEDY: v_mfma_f32_16x16x1{{.*}} a[16:31], v{{[0-9]+}}, v{{[0-9]+}}, a[0:15]
|
||||
; GREEDY908: v_mfma_f32_16x16x1{{.*}} a[18:33], v{{[0-9]+}}, v{{[0-9]+}}, a[18:33]
|
||||
; GREEDY908: v_mfma_f32_16x16x1{{.*}} a[2:17], v{{[0-9]+}}, v{{[0-9]+}}, a[18:33]
|
||||
|
||||
; GREEDY90A: v_mfma_f32_16x16x1{{.*}} a[16:31], v{{[0-9]+}}, v{{[0-9]+}}, a[16:31]
|
||||
; GREEDY90A: v_mfma_f32_16x16x1{{.*}} a[0:15], v{{[0-9]+}}, v{{[0-9]+}}, a[16:31]
|
||||
|
||||
; GREEDY90A-GISEL: v_mfma_f32_16x16x1{{.*}} a[0:15], v{{[0-9]+}}, v{{[0-9]+}}, a[0:15]
|
||||
; GREEDY90A-GISEL: v_mfma_f32_16x16x1{{.*}} a[16:31], v{{[0-9]+}}, v{{[0-9]+}}, a[0:15]
|
||||
|
||||
; FAST: v_mfma_f32_16x16x1{{.*}} a[32:47], v{{[0-9]+}}, v{{[0-9]+}}, a[32:47]
|
||||
; FAST: v_mfma_f32_16x16x1{{.*}} a[16:31], v{{[0-9]+}}, v{{[0-9]+}}, a[32:47]
|
||||
; GCN: v_mfma_f32_16x16x1{{.*}} a[0:15], v{{[0-9]+}}, v{{[0-9]+}}, a[0:15]
|
||||
|
|
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue