[AMDGPU] Select no-return atomic intrinsics in tblgen

This is to avoid relying on the post-isel hook.

This change also enable the saddr pattern selection for atomic
intrinsics in GlobalISel.

Differential Revision: https://reviews.llvm.org/D123583
This commit is contained in:
Abinav Puthan Purayil 2022-04-08 14:56:18 +05:30
parent ed58a01f66
commit 45ca94334e
7 changed files with 131 additions and 70 deletions

View File

@ -540,6 +540,31 @@ defm atomic_store_#as : binary_atomic_op<atomic_store>;
// TODO: Add GISelPredicateCode for the ret and noret PatFrags once
// GlobalISelEmitter allows pattern matches where src and dst def count
// mismatch.
multiclass ret_noret_op {
let PredicateCode = [{ return !(SDValue(N, 0).use_empty()); }],
GISelPredicateCode = [{ return true; }] in {
def "_ret" : PatFrag<(ops node:$ptr, node:$data),
(!cast<SDPatternOperator>(NAME) node:$ptr, node:$data)>;
}
let PredicateCode = [{ return (SDValue(N, 0).use_empty()); }],
GISelPredicateCode = [{ return false; }] in {
def "_noret" : PatFrag<(ops node:$ptr, node:$data),
(!cast<SDPatternOperator>(NAME) node:$ptr, node:$data)>;
}
}
defm int_amdgcn_flat_atomic_fadd : ret_noret_op;
defm int_amdgcn_flat_atomic_fadd_v2bf16 : ret_noret_op;
defm int_amdgcn_flat_atomic_fmin : ret_noret_op;
defm int_amdgcn_flat_atomic_fmax : ret_noret_op;
defm int_amdgcn_global_atomic_fadd : ret_noret_op;
defm int_amdgcn_global_atomic_fadd_v2bf16 : ret_noret_op;
defm int_amdgcn_global_atomic_fmin : ret_noret_op;
defm int_amdgcn_global_atomic_fmax : ret_noret_op;
defm int_amdgcn_ds_fadd_v2bf16 : ret_noret_op;
multiclass ret_noret_binary_atomic_op<SDNode atomic_op, bit IsInt = 1> {
let PredicateCode = [{ return (SDValue(N, 0).use_empty()); }],
GISelPredicateCode = [{ return false; }] in {

View File

@ -1188,9 +1188,9 @@ let SubtargetPredicate = isGFX90APlus in {
let SubtargetPredicate = isGFX90AOnly;
}
defm BUFFER_ATOMIC_ADD_F64 : MUBUF_Pseudo_Atomics<"buffer_atomic_add_f64", VReg_64, f64, int_amdgcn_global_atomic_fadd>;
defm BUFFER_ATOMIC_MIN_F64 : MUBUF_Pseudo_Atomics<"buffer_atomic_min_f64", VReg_64, f64, int_amdgcn_global_atomic_fmin>;
defm BUFFER_ATOMIC_MAX_F64 : MUBUF_Pseudo_Atomics<"buffer_atomic_max_f64", VReg_64, f64, int_amdgcn_global_atomic_fmax>;
defm BUFFER_ATOMIC_ADD_F64 : MUBUF_Pseudo_Atomics<"buffer_atomic_add_f64", VReg_64, f64>;
defm BUFFER_ATOMIC_MIN_F64 : MUBUF_Pseudo_Atomics<"buffer_atomic_min_f64", VReg_64, f64>;
defm BUFFER_ATOMIC_MAX_F64 : MUBUF_Pseudo_Atomics<"buffer_atomic_max_f64", VReg_64, f64>;
} // End SubtargetPredicate = isGFX90APlus
def BUFFER_INV : MUBUF_Invalidate<"buffer_inv"> {
@ -1381,10 +1381,11 @@ defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_short, i32, "BUFFER_STORE_SHORT">;
// buffer_atomic patterns
//===----------------------------------------------------------------------===//
multiclass BufferAtomicPat<string OpPrefix, ValueType vt, string Inst> {
multiclass BufferAtomicPat<string OpPrefix, ValueType vt, string Inst, bit isIntr = 0> {
foreach RtnMode = ["ret", "noret"] in {
defvar Op = !cast<SDPatternOperator>(OpPrefix # "_" # RtnMode # "_" # vt.Size);
defvar Op = !cast<SDPatternOperator>(OpPrefix # "_" # RtnMode
# !if(isIntr, "", "_" # vt.Size));
defvar InstSuffix = !if(!eq(RtnMode, "ret"), "_RTN", "");
def : GCNPat<
@ -1592,6 +1593,9 @@ defm : BufferAtomicPatterns_NO_RTN<SIbuffer_atomic_fadd, v2f16, "BUFFER_ATOMIC_P
}
let SubtargetPredicate = isGFX90APlus in {
defm : BufferAtomicPat<"int_amdgcn_global_atomic_fadd", f64, "BUFFER_ATOMIC_ADD_F64", 1>;
defm : BufferAtomicPat<"int_amdgcn_global_atomic_fmin", f64, "BUFFER_ATOMIC_MIN_F64", 1>;
defm : BufferAtomicPat<"int_amdgcn_global_atomic_fmax", f64, "BUFFER_ATOMIC_MAX_F64", 1>;
defm : SIBufferAtomicPat<"SIbuffer_atomic_fadd", f32, "BUFFER_ATOMIC_ADD_F32">;
defm : SIBufferAtomicPat<"SIbuffer_atomic_fadd", v2f16, "BUFFER_ATOMIC_PK_ADD_F16">;

View File

@ -1025,9 +1025,13 @@ def : DSAtomicRetPat<DS_ADD_F64, f64, atomic_load_fadd_local_noret_64>;
let SubtargetPredicate = isGFX940Plus in {
def : DSAtomicRetPat<DS_PK_ADD_RTN_F16, v2f16, atomic_load_fadd_v2f16_local_32>;
def : GCNPat <
(v2i16 (int_amdgcn_ds_fadd_v2bf16 i32:$ptr, v2i16:$src)),
(v2i16 (int_amdgcn_ds_fadd_v2bf16_ret i32:$ptr, v2i16:$src)),
(DS_PK_ADD_RTN_BF16 VGPR_32:$ptr, VGPR_32:$src, 0, 0)
>;
def : GCNPat <
(v2i16 (int_amdgcn_ds_fadd_v2bf16_noret i32:$ptr, v2i16:$src)),
(DS_PK_ADD_BF16 VGPR_32:$ptr, VGPR_32:$src, 0, 0)
>;
}
def : Pat <

View File

@ -711,17 +711,17 @@ defm FLAT_ATOMIC_FMAX_X2 : FLAT_Atomic_Pseudo <"flat_atomic_fmax_x2",
} // End SubtargetPredicate = isGFX7GFX10
let SubtargetPredicate = isGFX90APlus in {
defm FLAT_ATOMIC_ADD_F64 : FLAT_Atomic_Pseudo<"flat_atomic_add_f64", VReg_64, f64, int_amdgcn_flat_atomic_fadd>;
defm FLAT_ATOMIC_MIN_F64 : FLAT_Atomic_Pseudo<"flat_atomic_min_f64", VReg_64, f64, int_amdgcn_flat_atomic_fmin>;
defm FLAT_ATOMIC_MAX_F64 : FLAT_Atomic_Pseudo<"flat_atomic_max_f64", VReg_64, f64, int_amdgcn_flat_atomic_fmax>;
defm GLOBAL_ATOMIC_ADD_F64 : FLAT_Global_Atomic_Pseudo<"global_atomic_add_f64", VReg_64, f64, int_amdgcn_global_atomic_fadd>;
defm GLOBAL_ATOMIC_MIN_F64 : FLAT_Global_Atomic_Pseudo<"global_atomic_min_f64", VReg_64, f64, int_amdgcn_global_atomic_fmin>;
defm GLOBAL_ATOMIC_MAX_F64 : FLAT_Global_Atomic_Pseudo<"global_atomic_max_f64", VReg_64, f64, int_amdgcn_global_atomic_fmax>;
defm FLAT_ATOMIC_ADD_F64 : FLAT_Atomic_Pseudo<"flat_atomic_add_f64", VReg_64, f64>;
defm FLAT_ATOMIC_MIN_F64 : FLAT_Atomic_Pseudo<"flat_atomic_min_f64", VReg_64, f64>;
defm FLAT_ATOMIC_MAX_F64 : FLAT_Atomic_Pseudo<"flat_atomic_max_f64", VReg_64, f64>;
defm GLOBAL_ATOMIC_ADD_F64 : FLAT_Global_Atomic_Pseudo<"global_atomic_add_f64", VReg_64, f64>;
defm GLOBAL_ATOMIC_MIN_F64 : FLAT_Global_Atomic_Pseudo<"global_atomic_min_f64", VReg_64, f64>;
defm GLOBAL_ATOMIC_MAX_F64 : FLAT_Global_Atomic_Pseudo<"global_atomic_max_f64", VReg_64, f64>;
} // End SubtargetPredicate = isGFX90APlus
let SubtargetPredicate = isGFX940Plus in {
defm FLAT_ATOMIC_ADD_F32 : FLAT_Atomic_Pseudo<"flat_atomic_add_f32", VGPR_32, f32, int_amdgcn_flat_atomic_fadd>;
defm FLAT_ATOMIC_PK_ADD_F16 : FLAT_Atomic_Pseudo<"flat_atomic_pk_add_f16", VGPR_32, v2f16, int_amdgcn_flat_atomic_fadd>;
defm FLAT_ATOMIC_ADD_F32 : FLAT_Atomic_Pseudo<"flat_atomic_add_f32", VGPR_32, f32>;
defm FLAT_ATOMIC_PK_ADD_F16 : FLAT_Atomic_Pseudo<"flat_atomic_pk_add_f16", VGPR_32, v2f16>;
defm FLAT_ATOMIC_PK_ADD_BF16 : FLAT_Atomic_Pseudo<"flat_atomic_pk_add_bf16", VGPR_32, v2f16>;
defm GLOBAL_ATOMIC_PK_ADD_BF16 : FLAT_Global_Atomic_Pseudo<"global_atomic_pk_add_bf16", VGPR_32, v2f16>;
} // End SubtargetPredicate = isGFX940Plus
@ -897,15 +897,15 @@ let SubtargetPredicate = isGFX10Plus, is_flat_global = 1 in {
defm GLOBAL_ATOMIC_FCMPSWAP :
FLAT_Global_Atomic_Pseudo<"global_atomic_fcmpswap", VGPR_32, f32, null_frag, v2f32, VReg_64>;
defm GLOBAL_ATOMIC_FMIN :
FLAT_Global_Atomic_Pseudo<"global_atomic_fmin", VGPR_32, f32, int_amdgcn_global_atomic_fmin>;
FLAT_Global_Atomic_Pseudo<"global_atomic_fmin", VGPR_32, f32>;
defm GLOBAL_ATOMIC_FMAX :
FLAT_Global_Atomic_Pseudo<"global_atomic_fmax", VGPR_32, f32, int_amdgcn_global_atomic_fmax>;
FLAT_Global_Atomic_Pseudo<"global_atomic_fmax", VGPR_32, f32>;
defm GLOBAL_ATOMIC_FCMPSWAP_X2 :
FLAT_Global_Atomic_Pseudo<"global_atomic_fcmpswap_x2", VReg_64, f64, null_frag, v2f64, VReg_128>;
defm GLOBAL_ATOMIC_FMIN_X2 :
FLAT_Global_Atomic_Pseudo<"global_atomic_fmin_x2", VReg_64, f64, int_amdgcn_global_atomic_fmin>;
FLAT_Global_Atomic_Pseudo<"global_atomic_fmin_x2", VReg_64, f64>;
defm GLOBAL_ATOMIC_FMAX_X2 :
FLAT_Global_Atomic_Pseudo<"global_atomic_fmax_x2", VReg_64, f64, int_amdgcn_global_atomic_fmax>;
FLAT_Global_Atomic_Pseudo<"global_atomic_fmax_x2", VReg_64, f64>;
} // End SubtargetPredicate = isGFX10Plus, is_flat_global = 1
let is_flat_global = 1 in {
@ -920,10 +920,10 @@ let OtherPredicates = [HasAtomicFaddInsts] in {
let OtherPredicates = [isGFX90APlus] in {
defm GLOBAL_ATOMIC_ADD_F32 : FLAT_Global_Atomic_Pseudo_RTN <
"global_atomic_add_f32", VGPR_32, f32, int_amdgcn_global_atomic_fadd
"global_atomic_add_f32", VGPR_32, f32
>;
defm GLOBAL_ATOMIC_PK_ADD_F16 : FLAT_Global_Atomic_Pseudo_RTN <
"global_atomic_pk_add_f16", VGPR_32, v2f16, int_amdgcn_global_atomic_fadd
"global_atomic_pk_add_f16", VGPR_32, v2f16
>;
} // End OtherPredicates = [isGFX90APlus]
} // End is_flat_global = 1
@ -1029,13 +1029,30 @@ multiclass FlatAtomicPat <string inst, string node, ValueType vt,
(!cast<FLAT_Pseudo>(inst) VReg_64:$vaddr, getVregSrcForVT<data_vt>.ret:$data, $offset)>;
}
multiclass FlatSignedAtomicPat <string inst, string node, ValueType vt,
ValueType data_vt = vt, bit isIntr = 0> {
defvar rtnNode = !cast<PatFrags>(node # "_ret" # !if(isIntr, "", "_" # vt.Size));
defvar noRtnNode = !cast<PatFrags>(node # "_noret" # !if(isIntr, "", "_" # vt.Size));
def : GCNPat <(vt (rtnNode (GlobalOffset i64:$vaddr, i16:$offset), data_vt:$data)),
(!cast<FLAT_Pseudo>(inst#"_RTN") VReg_64:$vaddr, getVregSrcForVT<data_vt>.ret:$data, $offset)>;
def : GCNPat <(vt (noRtnNode (GlobalOffset i64:$vaddr, i16:$offset), data_vt:$data)),
(!cast<FLAT_Pseudo>(inst) VReg_64:$vaddr, getVregSrcForVT<data_vt>.ret:$data, $offset)>;
}
multiclass FlatSignedIntrPat <string inst, string node, ValueType vt,
ValueType data_vt = vt> {
defm : FlatSignedAtomicPat<inst, node, vt, data_vt, /* isIntr */ 1>;
}
class FlatSignedAtomicPatNoRtn <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
(node (GlobalOffset i64:$vaddr, i16:$offset), vt:$data),
(inst VReg_64:$vaddr, getVregSrcForVT<vt>.ret:$data, $offset)
>;
class FlatSignedAtomicPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt,
ValueType data_vt = vt> : GCNPat <
class FlatSignedAtomicPatRtn <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt,
ValueType data_vt = vt> : GCNPat <
(vt (node (GlobalOffset i64:$vaddr, i16:$offset), data_vt:$data)),
(inst VReg_64:$vaddr, getVregSrcForVT<data_vt>.ret:$data, $offset)
>;
@ -1237,7 +1254,7 @@ multiclass GlobalFLATAtomicStorePats<FLAT_Pseudo inst, SDPatternOperator node, V
multiclass GlobalFLATAtomicPatsRtn<string nortn_inst_name, SDPatternOperator node,
ValueType vt, ValueType data_vt = vt> {
def : FlatSignedAtomicPat <!cast<FLAT_Pseudo>(nortn_inst_name#"_RTN"), node, vt, data_vt> {
def : FlatSignedAtomicPatRtn <!cast<FLAT_Pseudo>(nortn_inst_name#"_RTN"), node, vt, data_vt> {
let AddedComplexity = 10;
}
@ -1247,13 +1264,12 @@ multiclass GlobalFLATAtomicPatsRtn<string nortn_inst_name, SDPatternOperator nod
}
multiclass GlobalFLATAtomicPats<string inst, string node, ValueType vt,
ValueType data_vt = vt> {
defvar rtnNode = !cast<PatFrags>(node#"_ret_"#vt.Size);
defvar noRtnNode = !cast<PatFrags>(node#"_noret_"#vt.Size);
ValueType data_vt = vt, bit isIntr = 0> {
defvar rtnNode = !cast<PatFrags>(node # "_ret" # !if(isIntr, "", "_" # vt.Size));
defvar noRtnNode = !cast<PatFrags>(node # "_noret" # !if(isIntr, "", "_" # vt.Size));
let AddedComplexity = 10 in {
def : FlatSignedAtomicPat <!cast<FLAT_Pseudo>(inst), noRtnNode, vt, data_vt>;
def : FlatSignedAtomicPat <!cast<FLAT_Pseudo>(inst#"_RTN"), rtnNode, vt, data_vt>;
defm : FlatSignedAtomicPat <inst, node, vt, data_vt, isIntr>;
}
let AddedComplexity = 11 in {
@ -1262,6 +1278,11 @@ multiclass GlobalFLATAtomicPats<string inst, string node, ValueType vt,
}
}
multiclass GlobalFLATAtomicIntrPats<string inst, string node, ValueType vt,
ValueType data_vt = vt> {
defm : GlobalFLATAtomicPats<inst, node, vt, data_vt, /* isIntr */ 1>;
}
multiclass GlobalFLATNoRtnAtomicPats<FLAT_Pseudo inst, SDPatternOperator node,
ValueType vt> {
def : FlatSignedAtomicPatNoRtn <inst, node, vt> {
@ -1427,6 +1448,10 @@ defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_FMIN", "atomic_load_fmin_global", f3
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_FMAX", "atomic_load_fmax_global", f32>;
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_FMIN_X2", "atomic_load_fmin_global", f64>;
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_FMAX_X2", "atomic_load_fmax_global", f64>;
defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_FMIN", "int_amdgcn_global_atomic_fmin", f32>;
defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_FMAX", "int_amdgcn_global_atomic_fmax", f32>;
defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_FMIN_X2", "int_amdgcn_global_atomic_fmin", f64>;
defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_FMAX_X2", "int_amdgcn_global_atomic_fmax", f64>;
}
let OtherPredicates = [HasAtomicFaddInsts] in {
@ -1440,19 +1465,26 @@ defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_PK_ADD_F16", "atomic_load_fadd_v2f16
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_ADD_F64", "atomic_load_fadd_global", f64>;
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_MIN_F64", "atomic_load_fmin_global", f64>;
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_MAX_F64", "atomic_load_fmax_global", f64>;
def : FlatSignedAtomicPat <FLAT_ATOMIC_ADD_F64_RTN, atomic_load_fadd_flat_ret_64, f64>;
def : FlatSignedAtomicPat <FLAT_ATOMIC_ADD_F64, atomic_load_fadd_flat_noret_64, f64>;
def : FlatSignedAtomicPat <FLAT_ATOMIC_MIN_F64_RTN, atomic_load_fmin_flat_ret_64, f64>;
def : FlatSignedAtomicPat <FLAT_ATOMIC_MIN_F64, atomic_load_fmin_flat_noret_64, f64>;
def : FlatSignedAtomicPat <FLAT_ATOMIC_MAX_F64_RTN, atomic_load_fmax_flat_ret_64, f64>;
def : FlatSignedAtomicPat <FLAT_ATOMIC_MAX_F64, atomic_load_fmax_flat_noret_64, f64>;
defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_ADD_F32", "int_amdgcn_global_atomic_fadd", f32>;
defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_ADD_F64", "int_amdgcn_global_atomic_fadd", f64>;
defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_PK_ADD_F16", "int_amdgcn_global_atomic_fadd", v2f16>;
defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_MIN_F64", "int_amdgcn_global_atomic_fmin", f64>;
defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_MAX_F64", "int_amdgcn_global_atomic_fmax", f64>;
defm : FlatSignedAtomicPat <"FLAT_ATOMIC_ADD_F64", "atomic_load_fadd_flat", f64>;
defm : FlatSignedAtomicPat <"FLAT_ATOMIC_MIN_F64", "atomic_load_fmin_flat", f64>;
defm : FlatSignedAtomicPat <"FLAT_ATOMIC_MAX_F64", "atomic_load_fmax_flat", f64>;
defm : FlatSignedIntrPat <"FLAT_ATOMIC_ADD_F64", "int_amdgcn_flat_atomic_fadd", f64>;
defm : FlatSignedIntrPat <"FLAT_ATOMIC_MIN_F64", "int_amdgcn_flat_atomic_fmin", f64>;
defm : FlatSignedIntrPat <"FLAT_ATOMIC_MAX_F64", "int_amdgcn_flat_atomic_fmax", f64>;
}
let OtherPredicates = [isGFX940Plus] in {
def : FlatSignedAtomicPat <FLAT_ATOMIC_ADD_F32_RTN, atomic_load_fadd_flat_32, f32>;
def : FlatSignedAtomicPat <FLAT_ATOMIC_PK_ADD_F16_RTN, atomic_load_fadd_v2f16_flat_32, v2f16>;
def : FlatSignedAtomicPat <FLAT_ATOMIC_PK_ADD_BF16_RTN, int_amdgcn_flat_atomic_fadd_v2bf16, v2i16>;
defm : GlobalFLATAtomicPatsRtn <"GLOBAL_ATOMIC_PK_ADD_BF16", int_amdgcn_global_atomic_fadd_v2bf16, v2i16>;
defm : FlatSignedAtomicPat <"FLAT_ATOMIC_ADD_F32", "atomic_load_fadd_flat", f32>;
defm : FlatSignedAtomicPat <"FLAT_ATOMIC_PK_ADD_F16", "atomic_load_fadd_v2f16_flat", v2f16>;
defm : FlatSignedIntrPat <"FLAT_ATOMIC_ADD_F32", "int_amdgcn_flat_atomic_fadd", f32>;
defm : FlatSignedIntrPat <"FLAT_ATOMIC_PK_ADD_F16", "int_amdgcn_flat_atomic_fadd", v2f16>;
defm : FlatSignedIntrPat <"FLAT_ATOMIC_PK_ADD_BF16", "int_amdgcn_flat_atomic_fadd_v2bf16", v2i16>;
defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_PK_ADD_BF16", "int_amdgcn_global_atomic_fadd_v2bf16", v2i16>;
}
} // End OtherPredicates = [HasFlatGlobalInsts], AddedComplexity = 10

View File

@ -319,10 +319,10 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret(double addrspace(1)* %pt
; GFX90A-LABEL: global_atomic_fadd_f64_noret:
; GFX90A: ; %bb.0: ; %main_body
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX90A-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off glc
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-NEXT: global_atomic_add_f64 v[0:1], v2, v[0:1], s[0:1] glc
; GFX90A-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.global.atomic.fadd.f64.p1f64.f64(double addrspace(1)* %ptr, double %data)
@ -333,10 +333,10 @@ define amdgpu_kernel void @global_atomic_fmin_f64_noret(double addrspace(1)* %pt
; GFX90A-LABEL: global_atomic_fmin_f64_noret:
; GFX90A: ; %bb.0: ; %main_body
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX90A-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-NEXT: global_atomic_min_f64 v[0:1], v[0:1], v[2:3], off glc
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-NEXT: global_atomic_min_f64 v[0:1], v2, v[0:1], s[0:1] glc
; GFX90A-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.global.atomic.fmin.f64.p1f64.f64(double addrspace(1)* %ptr, double %data)
@ -347,10 +347,10 @@ define amdgpu_kernel void @global_atomic_fmax_f64_noret(double addrspace(1)* %pt
; GFX90A-LABEL: global_atomic_fmax_f64_noret:
; GFX90A: ; %bb.0: ; %main_body
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX90A-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-NEXT: global_atomic_max_f64 v[0:1], v[0:1], v[2:3], off glc
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-NEXT: global_atomic_max_f64 v[0:1], v2, v[0:1], s[0:1] glc
; GFX90A-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.global.atomic.fmax.f64.p1f64.f64(double addrspace(1)* %ptr, double %data)

View File

@ -74,10 +74,10 @@ define amdgpu_kernel void @global_atomic_fadd_f32_off_ss(float addrspace(1)* %pt
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_load_dword s2, s[4:5], 0x8
; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-NEXT: v_mov_b32_e32 v1, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v2, s2
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2048 glc
; GFX90A-NEXT: v_mov_b32_e32 v0, s2
; GFX90A-NEXT: global_atomic_add_f32 v0, v1, v0, s[0:1] offset:2048 glc
; GFX90A-NEXT: s_endpgm
%gep = getelementptr float, float addrspace(1)* %ptr, i64 512
%ret = call float @llvm.amdgcn.global.atomic.fadd.f32.p1f32.f32(float addrspace(1)* %gep, float %data)

View File

@ -23,13 +23,12 @@ define amdgpu_kernel void @global_atomic_fmin_f32_noret(float addrspace(1)* %ptr
; G_GFX10-LABEL: global_atomic_fmin_f32_noret:
; G_GFX10: ; %bb.0: ; %main_body
; G_GFX10-NEXT: s_clause 0x1
; G_GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; G_GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c
; G_GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; G_GFX10-NEXT: v_mov_b32_e32 v1, 0
; G_GFX10-NEXT: s_waitcnt lgkmcnt(0)
; G_GFX10-NEXT: v_mov_b32_e32 v0, s2
; G_GFX10-NEXT: v_mov_b32_e32 v1, s3
; G_GFX10-NEXT: v_mov_b32_e32 v2, s4
; G_GFX10-NEXT: global_atomic_fmin v0, v[0:1], v2, off glc
; G_GFX10-NEXT: v_mov_b32_e32 v0, s4
; G_GFX10-NEXT: global_atomic_fmin v0, v1, v0, s[2:3] glc
; G_GFX10-NEXT: s_endpgm
main_body:
%ret = call float @llvm.amdgcn.global.atomic.fmin.f32.p1f32.f32(float addrspace(1)* %ptr, float %data)
@ -51,13 +50,12 @@ define amdgpu_kernel void @global_atomic_fmax_f32_noret(float addrspace(1)* %ptr
; G_GFX10-LABEL: global_atomic_fmax_f32_noret:
; G_GFX10: ; %bb.0: ; %main_body
; G_GFX10-NEXT: s_clause 0x1
; G_GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; G_GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c
; G_GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; G_GFX10-NEXT: v_mov_b32_e32 v1, 0
; G_GFX10-NEXT: s_waitcnt lgkmcnt(0)
; G_GFX10-NEXT: v_mov_b32_e32 v0, s2
; G_GFX10-NEXT: v_mov_b32_e32 v1, s3
; G_GFX10-NEXT: v_mov_b32_e32 v2, s4
; G_GFX10-NEXT: global_atomic_fmax v0, v[0:1], v2, off glc
; G_GFX10-NEXT: v_mov_b32_e32 v0, s4
; G_GFX10-NEXT: global_atomic_fmax v0, v1, v0, s[2:3] glc
; G_GFX10-NEXT: s_endpgm
main_body:
%ret = call float @llvm.amdgcn.global.atomic.fmax.f32.p1f32.f32(float addrspace(1)* %ptr, float %data)
@ -120,12 +118,11 @@ define amdgpu_kernel void @global_atomic_fmin_f64_noret(double addrspace(1)* %pt
; G_GFX10-LABEL: global_atomic_fmin_f64_noret:
; G_GFX10: ; %bb.0: ; %main_body
; G_GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; G_GFX10-NEXT: v_mov_b32_e32 v2, 0
; G_GFX10-NEXT: s_waitcnt lgkmcnt(0)
; G_GFX10-NEXT: v_mov_b32_e32 v0, s0
; G_GFX10-NEXT: v_mov_b32_e32 v2, s2
; G_GFX10-NEXT: v_mov_b32_e32 v1, s1
; G_GFX10-NEXT: v_mov_b32_e32 v3, s3
; G_GFX10-NEXT: global_atomic_fmin_x2 v[0:1], v[0:1], v[2:3], off glc
; G_GFX10-NEXT: v_mov_b32_e32 v0, s2
; G_GFX10-NEXT: v_mov_b32_e32 v1, s3
; G_GFX10-NEXT: global_atomic_fmin_x2 v[0:1], v2, v[0:1], s[0:1] glc
; G_GFX10-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.global.atomic.fmin.f64.p1f64.f64(double addrspace(1)* %ptr, double %data)
@ -146,12 +143,11 @@ define amdgpu_kernel void @global_atomic_fmax_f64_noret(double addrspace(1)* %pt
; G_GFX10-LABEL: global_atomic_fmax_f64_noret:
; G_GFX10: ; %bb.0: ; %main_body
; G_GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; G_GFX10-NEXT: v_mov_b32_e32 v2, 0
; G_GFX10-NEXT: s_waitcnt lgkmcnt(0)
; G_GFX10-NEXT: v_mov_b32_e32 v0, s0
; G_GFX10-NEXT: v_mov_b32_e32 v2, s2
; G_GFX10-NEXT: v_mov_b32_e32 v1, s1
; G_GFX10-NEXT: v_mov_b32_e32 v3, s3
; G_GFX10-NEXT: global_atomic_fmax_x2 v[0:1], v[0:1], v[2:3], off glc
; G_GFX10-NEXT: v_mov_b32_e32 v0, s2
; G_GFX10-NEXT: v_mov_b32_e32 v1, s3
; G_GFX10-NEXT: global_atomic_fmax_x2 v[0:1], v2, v[0:1], s[0:1] glc
; G_GFX10-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.global.atomic.fmax.f64.p1f64.f64(double addrspace(1)* %ptr, double %data)