forked from OSchip/llvm-project
[AMDGPU] Select no-return atomic intrinsics in tblgen
This is to avoid relying on the post-isel hook. This change also enables the saddr pattern selection for atomic intrinsics in GlobalISel. Differential Revision: https://reviews.llvm.org/D123583
This commit is contained in:
parent
ed58a01f66
commit
45ca94334e
|
@ -540,6 +540,31 @@ defm atomic_store_#as : binary_atomic_op<atomic_store>;
|
|||
// TODO: Add GISelPredicateCode for the ret and noret PatFrags once
|
||||
// GlobalISelEmitter allows pattern matches where src and dst def count
|
||||
// mismatch.
|
||||
|
||||
multiclass ret_noret_op {
|
||||
let PredicateCode = [{ return !(SDValue(N, 0).use_empty()); }],
|
||||
GISelPredicateCode = [{ return true; }] in {
|
||||
def "_ret" : PatFrag<(ops node:$ptr, node:$data),
|
||||
(!cast<SDPatternOperator>(NAME) node:$ptr, node:$data)>;
|
||||
}
|
||||
|
||||
let PredicateCode = [{ return (SDValue(N, 0).use_empty()); }],
|
||||
GISelPredicateCode = [{ return false; }] in {
|
||||
def "_noret" : PatFrag<(ops node:$ptr, node:$data),
|
||||
(!cast<SDPatternOperator>(NAME) node:$ptr, node:$data)>;
|
||||
}
|
||||
}
|
||||
|
||||
defm int_amdgcn_flat_atomic_fadd : ret_noret_op;
|
||||
defm int_amdgcn_flat_atomic_fadd_v2bf16 : ret_noret_op;
|
||||
defm int_amdgcn_flat_atomic_fmin : ret_noret_op;
|
||||
defm int_amdgcn_flat_atomic_fmax : ret_noret_op;
|
||||
defm int_amdgcn_global_atomic_fadd : ret_noret_op;
|
||||
defm int_amdgcn_global_atomic_fadd_v2bf16 : ret_noret_op;
|
||||
defm int_amdgcn_global_atomic_fmin : ret_noret_op;
|
||||
defm int_amdgcn_global_atomic_fmax : ret_noret_op;
|
||||
defm int_amdgcn_ds_fadd_v2bf16 : ret_noret_op;
|
||||
|
||||
multiclass ret_noret_binary_atomic_op<SDNode atomic_op, bit IsInt = 1> {
|
||||
let PredicateCode = [{ return (SDValue(N, 0).use_empty()); }],
|
||||
GISelPredicateCode = [{ return false; }] in {
|
||||
|
|
|
@ -1188,9 +1188,9 @@ let SubtargetPredicate = isGFX90APlus in {
|
|||
let SubtargetPredicate = isGFX90AOnly;
|
||||
}
|
||||
|
||||
defm BUFFER_ATOMIC_ADD_F64 : MUBUF_Pseudo_Atomics<"buffer_atomic_add_f64", VReg_64, f64, int_amdgcn_global_atomic_fadd>;
|
||||
defm BUFFER_ATOMIC_MIN_F64 : MUBUF_Pseudo_Atomics<"buffer_atomic_min_f64", VReg_64, f64, int_amdgcn_global_atomic_fmin>;
|
||||
defm BUFFER_ATOMIC_MAX_F64 : MUBUF_Pseudo_Atomics<"buffer_atomic_max_f64", VReg_64, f64, int_amdgcn_global_atomic_fmax>;
|
||||
defm BUFFER_ATOMIC_ADD_F64 : MUBUF_Pseudo_Atomics<"buffer_atomic_add_f64", VReg_64, f64>;
|
||||
defm BUFFER_ATOMIC_MIN_F64 : MUBUF_Pseudo_Atomics<"buffer_atomic_min_f64", VReg_64, f64>;
|
||||
defm BUFFER_ATOMIC_MAX_F64 : MUBUF_Pseudo_Atomics<"buffer_atomic_max_f64", VReg_64, f64>;
|
||||
} // End SubtargetPredicate = isGFX90APlus
|
||||
|
||||
def BUFFER_INV : MUBUF_Invalidate<"buffer_inv"> {
|
||||
|
@ -1381,10 +1381,11 @@ defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_short, i32, "BUFFER_STORE_SHORT">;
|
|||
// buffer_atomic patterns
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
multiclass BufferAtomicPat<string OpPrefix, ValueType vt, string Inst> {
|
||||
multiclass BufferAtomicPat<string OpPrefix, ValueType vt, string Inst, bit isIntr = 0> {
|
||||
foreach RtnMode = ["ret", "noret"] in {
|
||||
|
||||
defvar Op = !cast<SDPatternOperator>(OpPrefix # "_" # RtnMode # "_" # vt.Size);
|
||||
defvar Op = !cast<SDPatternOperator>(OpPrefix # "_" # RtnMode
|
||||
# !if(isIntr, "", "_" # vt.Size));
|
||||
defvar InstSuffix = !if(!eq(RtnMode, "ret"), "_RTN", "");
|
||||
|
||||
def : GCNPat<
|
||||
|
@ -1592,6 +1593,9 @@ defm : BufferAtomicPatterns_NO_RTN<SIbuffer_atomic_fadd, v2f16, "BUFFER_ATOMIC_P
|
|||
}
|
||||
|
||||
let SubtargetPredicate = isGFX90APlus in {
|
||||
defm : BufferAtomicPat<"int_amdgcn_global_atomic_fadd", f64, "BUFFER_ATOMIC_ADD_F64", 1>;
|
||||
defm : BufferAtomicPat<"int_amdgcn_global_atomic_fmin", f64, "BUFFER_ATOMIC_MIN_F64", 1>;
|
||||
defm : BufferAtomicPat<"int_amdgcn_global_atomic_fmax", f64, "BUFFER_ATOMIC_MAX_F64", 1>;
|
||||
defm : SIBufferAtomicPat<"SIbuffer_atomic_fadd", f32, "BUFFER_ATOMIC_ADD_F32">;
|
||||
defm : SIBufferAtomicPat<"SIbuffer_atomic_fadd", v2f16, "BUFFER_ATOMIC_PK_ADD_F16">;
|
||||
|
||||
|
|
|
@ -1025,9 +1025,13 @@ def : DSAtomicRetPat<DS_ADD_F64, f64, atomic_load_fadd_local_noret_64>;
|
|||
let SubtargetPredicate = isGFX940Plus in {
|
||||
def : DSAtomicRetPat<DS_PK_ADD_RTN_F16, v2f16, atomic_load_fadd_v2f16_local_32>;
|
||||
def : GCNPat <
|
||||
(v2i16 (int_amdgcn_ds_fadd_v2bf16 i32:$ptr, v2i16:$src)),
|
||||
(v2i16 (int_amdgcn_ds_fadd_v2bf16_ret i32:$ptr, v2i16:$src)),
|
||||
(DS_PK_ADD_RTN_BF16 VGPR_32:$ptr, VGPR_32:$src, 0, 0)
|
||||
>;
|
||||
def : GCNPat <
|
||||
(v2i16 (int_amdgcn_ds_fadd_v2bf16_noret i32:$ptr, v2i16:$src)),
|
||||
(DS_PK_ADD_BF16 VGPR_32:$ptr, VGPR_32:$src, 0, 0)
|
||||
>;
|
||||
}
|
||||
|
||||
def : Pat <
|
||||
|
|
|
@ -711,17 +711,17 @@ defm FLAT_ATOMIC_FMAX_X2 : FLAT_Atomic_Pseudo <"flat_atomic_fmax_x2",
|
|||
} // End SubtargetPredicate = isGFX7GFX10
|
||||
|
||||
let SubtargetPredicate = isGFX90APlus in {
|
||||
defm FLAT_ATOMIC_ADD_F64 : FLAT_Atomic_Pseudo<"flat_atomic_add_f64", VReg_64, f64, int_amdgcn_flat_atomic_fadd>;
|
||||
defm FLAT_ATOMIC_MIN_F64 : FLAT_Atomic_Pseudo<"flat_atomic_min_f64", VReg_64, f64, int_amdgcn_flat_atomic_fmin>;
|
||||
defm FLAT_ATOMIC_MAX_F64 : FLAT_Atomic_Pseudo<"flat_atomic_max_f64", VReg_64, f64, int_amdgcn_flat_atomic_fmax>;
|
||||
defm GLOBAL_ATOMIC_ADD_F64 : FLAT_Global_Atomic_Pseudo<"global_atomic_add_f64", VReg_64, f64, int_amdgcn_global_atomic_fadd>;
|
||||
defm GLOBAL_ATOMIC_MIN_F64 : FLAT_Global_Atomic_Pseudo<"global_atomic_min_f64", VReg_64, f64, int_amdgcn_global_atomic_fmin>;
|
||||
defm GLOBAL_ATOMIC_MAX_F64 : FLAT_Global_Atomic_Pseudo<"global_atomic_max_f64", VReg_64, f64, int_amdgcn_global_atomic_fmax>;
|
||||
defm FLAT_ATOMIC_ADD_F64 : FLAT_Atomic_Pseudo<"flat_atomic_add_f64", VReg_64, f64>;
|
||||
defm FLAT_ATOMIC_MIN_F64 : FLAT_Atomic_Pseudo<"flat_atomic_min_f64", VReg_64, f64>;
|
||||
defm FLAT_ATOMIC_MAX_F64 : FLAT_Atomic_Pseudo<"flat_atomic_max_f64", VReg_64, f64>;
|
||||
defm GLOBAL_ATOMIC_ADD_F64 : FLAT_Global_Atomic_Pseudo<"global_atomic_add_f64", VReg_64, f64>;
|
||||
defm GLOBAL_ATOMIC_MIN_F64 : FLAT_Global_Atomic_Pseudo<"global_atomic_min_f64", VReg_64, f64>;
|
||||
defm GLOBAL_ATOMIC_MAX_F64 : FLAT_Global_Atomic_Pseudo<"global_atomic_max_f64", VReg_64, f64>;
|
||||
} // End SubtargetPredicate = isGFX90APlus
|
||||
|
||||
let SubtargetPredicate = isGFX940Plus in {
|
||||
defm FLAT_ATOMIC_ADD_F32 : FLAT_Atomic_Pseudo<"flat_atomic_add_f32", VGPR_32, f32, int_amdgcn_flat_atomic_fadd>;
|
||||
defm FLAT_ATOMIC_PK_ADD_F16 : FLAT_Atomic_Pseudo<"flat_atomic_pk_add_f16", VGPR_32, v2f16, int_amdgcn_flat_atomic_fadd>;
|
||||
defm FLAT_ATOMIC_ADD_F32 : FLAT_Atomic_Pseudo<"flat_atomic_add_f32", VGPR_32, f32>;
|
||||
defm FLAT_ATOMIC_PK_ADD_F16 : FLAT_Atomic_Pseudo<"flat_atomic_pk_add_f16", VGPR_32, v2f16>;
|
||||
defm FLAT_ATOMIC_PK_ADD_BF16 : FLAT_Atomic_Pseudo<"flat_atomic_pk_add_bf16", VGPR_32, v2f16>;
|
||||
defm GLOBAL_ATOMIC_PK_ADD_BF16 : FLAT_Global_Atomic_Pseudo<"global_atomic_pk_add_bf16", VGPR_32, v2f16>;
|
||||
} // End SubtargetPredicate = isGFX940Plus
|
||||
|
@ -897,15 +897,15 @@ let SubtargetPredicate = isGFX10Plus, is_flat_global = 1 in {
|
|||
defm GLOBAL_ATOMIC_FCMPSWAP :
|
||||
FLAT_Global_Atomic_Pseudo<"global_atomic_fcmpswap", VGPR_32, f32, null_frag, v2f32, VReg_64>;
|
||||
defm GLOBAL_ATOMIC_FMIN :
|
||||
FLAT_Global_Atomic_Pseudo<"global_atomic_fmin", VGPR_32, f32, int_amdgcn_global_atomic_fmin>;
|
||||
FLAT_Global_Atomic_Pseudo<"global_atomic_fmin", VGPR_32, f32>;
|
||||
defm GLOBAL_ATOMIC_FMAX :
|
||||
FLAT_Global_Atomic_Pseudo<"global_atomic_fmax", VGPR_32, f32, int_amdgcn_global_atomic_fmax>;
|
||||
FLAT_Global_Atomic_Pseudo<"global_atomic_fmax", VGPR_32, f32>;
|
||||
defm GLOBAL_ATOMIC_FCMPSWAP_X2 :
|
||||
FLAT_Global_Atomic_Pseudo<"global_atomic_fcmpswap_x2", VReg_64, f64, null_frag, v2f64, VReg_128>;
|
||||
defm GLOBAL_ATOMIC_FMIN_X2 :
|
||||
FLAT_Global_Atomic_Pseudo<"global_atomic_fmin_x2", VReg_64, f64, int_amdgcn_global_atomic_fmin>;
|
||||
FLAT_Global_Atomic_Pseudo<"global_atomic_fmin_x2", VReg_64, f64>;
|
||||
defm GLOBAL_ATOMIC_FMAX_X2 :
|
||||
FLAT_Global_Atomic_Pseudo<"global_atomic_fmax_x2", VReg_64, f64, int_amdgcn_global_atomic_fmax>;
|
||||
FLAT_Global_Atomic_Pseudo<"global_atomic_fmax_x2", VReg_64, f64>;
|
||||
} // End SubtargetPredicate = isGFX10Plus, is_flat_global = 1
|
||||
|
||||
let is_flat_global = 1 in {
|
||||
|
@ -920,10 +920,10 @@ let OtherPredicates = [HasAtomicFaddInsts] in {
|
|||
|
||||
let OtherPredicates = [isGFX90APlus] in {
|
||||
defm GLOBAL_ATOMIC_ADD_F32 : FLAT_Global_Atomic_Pseudo_RTN <
|
||||
"global_atomic_add_f32", VGPR_32, f32, int_amdgcn_global_atomic_fadd
|
||||
"global_atomic_add_f32", VGPR_32, f32
|
||||
>;
|
||||
defm GLOBAL_ATOMIC_PK_ADD_F16 : FLAT_Global_Atomic_Pseudo_RTN <
|
||||
"global_atomic_pk_add_f16", VGPR_32, v2f16, int_amdgcn_global_atomic_fadd
|
||||
"global_atomic_pk_add_f16", VGPR_32, v2f16
|
||||
>;
|
||||
} // End OtherPredicates = [isGFX90APlus]
|
||||
} // End is_flat_global = 1
|
||||
|
@ -1029,13 +1029,30 @@ multiclass FlatAtomicPat <string inst, string node, ValueType vt,
|
|||
(!cast<FLAT_Pseudo>(inst) VReg_64:$vaddr, getVregSrcForVT<data_vt>.ret:$data, $offset)>;
|
||||
}
|
||||
|
||||
multiclass FlatSignedAtomicPat <string inst, string node, ValueType vt,
|
||||
ValueType data_vt = vt, bit isIntr = 0> {
|
||||
defvar rtnNode = !cast<PatFrags>(node # "_ret" # !if(isIntr, "", "_" # vt.Size));
|
||||
defvar noRtnNode = !cast<PatFrags>(node # "_noret" # !if(isIntr, "", "_" # vt.Size));
|
||||
|
||||
def : GCNPat <(vt (rtnNode (GlobalOffset i64:$vaddr, i16:$offset), data_vt:$data)),
|
||||
(!cast<FLAT_Pseudo>(inst#"_RTN") VReg_64:$vaddr, getVregSrcForVT<data_vt>.ret:$data, $offset)>;
|
||||
|
||||
def : GCNPat <(vt (noRtnNode (GlobalOffset i64:$vaddr, i16:$offset), data_vt:$data)),
|
||||
(!cast<FLAT_Pseudo>(inst) VReg_64:$vaddr, getVregSrcForVT<data_vt>.ret:$data, $offset)>;
|
||||
}
|
||||
|
||||
multiclass FlatSignedIntrPat <string inst, string node, ValueType vt,
|
||||
ValueType data_vt = vt> {
|
||||
defm : FlatSignedAtomicPat<inst, node, vt, data_vt, /* isIntr */ 1>;
|
||||
}
|
||||
|
||||
class FlatSignedAtomicPatNoRtn <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
|
||||
(node (GlobalOffset i64:$vaddr, i16:$offset), vt:$data),
|
||||
(inst VReg_64:$vaddr, getVregSrcForVT<vt>.ret:$data, $offset)
|
||||
>;
|
||||
|
||||
class FlatSignedAtomicPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt,
|
||||
ValueType data_vt = vt> : GCNPat <
|
||||
class FlatSignedAtomicPatRtn <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt,
|
||||
ValueType data_vt = vt> : GCNPat <
|
||||
(vt (node (GlobalOffset i64:$vaddr, i16:$offset), data_vt:$data)),
|
||||
(inst VReg_64:$vaddr, getVregSrcForVT<data_vt>.ret:$data, $offset)
|
||||
>;
|
||||
|
@ -1237,7 +1254,7 @@ multiclass GlobalFLATAtomicStorePats<FLAT_Pseudo inst, SDPatternOperator node, V
|
|||
|
||||
multiclass GlobalFLATAtomicPatsRtn<string nortn_inst_name, SDPatternOperator node,
|
||||
ValueType vt, ValueType data_vt = vt> {
|
||||
def : FlatSignedAtomicPat <!cast<FLAT_Pseudo>(nortn_inst_name#"_RTN"), node, vt, data_vt> {
|
||||
def : FlatSignedAtomicPatRtn <!cast<FLAT_Pseudo>(nortn_inst_name#"_RTN"), node, vt, data_vt> {
|
||||
let AddedComplexity = 10;
|
||||
}
|
||||
|
||||
|
@ -1247,13 +1264,12 @@ multiclass GlobalFLATAtomicPatsRtn<string nortn_inst_name, SDPatternOperator nod
|
|||
}
|
||||
|
||||
multiclass GlobalFLATAtomicPats<string inst, string node, ValueType vt,
|
||||
ValueType data_vt = vt> {
|
||||
defvar rtnNode = !cast<PatFrags>(node#"_ret_"#vt.Size);
|
||||
defvar noRtnNode = !cast<PatFrags>(node#"_noret_"#vt.Size);
|
||||
ValueType data_vt = vt, bit isIntr = 0> {
|
||||
defvar rtnNode = !cast<PatFrags>(node # "_ret" # !if(isIntr, "", "_" # vt.Size));
|
||||
defvar noRtnNode = !cast<PatFrags>(node # "_noret" # !if(isIntr, "", "_" # vt.Size));
|
||||
|
||||
let AddedComplexity = 10 in {
|
||||
def : FlatSignedAtomicPat <!cast<FLAT_Pseudo>(inst), noRtnNode, vt, data_vt>;
|
||||
def : FlatSignedAtomicPat <!cast<FLAT_Pseudo>(inst#"_RTN"), rtnNode, vt, data_vt>;
|
||||
defm : FlatSignedAtomicPat <inst, node, vt, data_vt, isIntr>;
|
||||
}
|
||||
|
||||
let AddedComplexity = 11 in {
|
||||
|
@ -1262,6 +1278,11 @@ multiclass GlobalFLATAtomicPats<string inst, string node, ValueType vt,
|
|||
}
|
||||
}
|
||||
|
||||
multiclass GlobalFLATAtomicIntrPats<string inst, string node, ValueType vt,
|
||||
ValueType data_vt = vt> {
|
||||
defm : GlobalFLATAtomicPats<inst, node, vt, data_vt, /* isIntr */ 1>;
|
||||
}
|
||||
|
||||
multiclass GlobalFLATNoRtnAtomicPats<FLAT_Pseudo inst, SDPatternOperator node,
|
||||
ValueType vt> {
|
||||
def : FlatSignedAtomicPatNoRtn <inst, node, vt> {
|
||||
|
@ -1427,6 +1448,10 @@ defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_FMIN", "atomic_load_fmin_global", f3
|
|||
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_FMAX", "atomic_load_fmax_global", f32>;
|
||||
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_FMIN_X2", "atomic_load_fmin_global", f64>;
|
||||
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_FMAX_X2", "atomic_load_fmax_global", f64>;
|
||||
defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_FMIN", "int_amdgcn_global_atomic_fmin", f32>;
|
||||
defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_FMAX", "int_amdgcn_global_atomic_fmax", f32>;
|
||||
defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_FMIN_X2", "int_amdgcn_global_atomic_fmin", f64>;
|
||||
defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_FMAX_X2", "int_amdgcn_global_atomic_fmax", f64>;
|
||||
}
|
||||
|
||||
let OtherPredicates = [HasAtomicFaddInsts] in {
|
||||
|
@ -1440,19 +1465,26 @@ defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_PK_ADD_F16", "atomic_load_fadd_v2f16
|
|||
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_ADD_F64", "atomic_load_fadd_global", f64>;
|
||||
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_MIN_F64", "atomic_load_fmin_global", f64>;
|
||||
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_MAX_F64", "atomic_load_fmax_global", f64>;
|
||||
def : FlatSignedAtomicPat <FLAT_ATOMIC_ADD_F64_RTN, atomic_load_fadd_flat_ret_64, f64>;
|
||||
def : FlatSignedAtomicPat <FLAT_ATOMIC_ADD_F64, atomic_load_fadd_flat_noret_64, f64>;
|
||||
def : FlatSignedAtomicPat <FLAT_ATOMIC_MIN_F64_RTN, atomic_load_fmin_flat_ret_64, f64>;
|
||||
def : FlatSignedAtomicPat <FLAT_ATOMIC_MIN_F64, atomic_load_fmin_flat_noret_64, f64>;
|
||||
def : FlatSignedAtomicPat <FLAT_ATOMIC_MAX_F64_RTN, atomic_load_fmax_flat_ret_64, f64>;
|
||||
def : FlatSignedAtomicPat <FLAT_ATOMIC_MAX_F64, atomic_load_fmax_flat_noret_64, f64>;
|
||||
defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_ADD_F32", "int_amdgcn_global_atomic_fadd", f32>;
|
||||
defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_ADD_F64", "int_amdgcn_global_atomic_fadd", f64>;
|
||||
defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_PK_ADD_F16", "int_amdgcn_global_atomic_fadd", v2f16>;
|
||||
defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_MIN_F64", "int_amdgcn_global_atomic_fmin", f64>;
|
||||
defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_MAX_F64", "int_amdgcn_global_atomic_fmax", f64>;
|
||||
defm : FlatSignedAtomicPat <"FLAT_ATOMIC_ADD_F64", "atomic_load_fadd_flat", f64>;
|
||||
defm : FlatSignedAtomicPat <"FLAT_ATOMIC_MIN_F64", "atomic_load_fmin_flat", f64>;
|
||||
defm : FlatSignedAtomicPat <"FLAT_ATOMIC_MAX_F64", "atomic_load_fmax_flat", f64>;
|
||||
defm : FlatSignedIntrPat <"FLAT_ATOMIC_ADD_F64", "int_amdgcn_flat_atomic_fadd", f64>;
|
||||
defm : FlatSignedIntrPat <"FLAT_ATOMIC_MIN_F64", "int_amdgcn_flat_atomic_fmin", f64>;
|
||||
defm : FlatSignedIntrPat <"FLAT_ATOMIC_MAX_F64", "int_amdgcn_flat_atomic_fmax", f64>;
|
||||
}
|
||||
|
||||
let OtherPredicates = [isGFX940Plus] in {
|
||||
def : FlatSignedAtomicPat <FLAT_ATOMIC_ADD_F32_RTN, atomic_load_fadd_flat_32, f32>;
|
||||
def : FlatSignedAtomicPat <FLAT_ATOMIC_PK_ADD_F16_RTN, atomic_load_fadd_v2f16_flat_32, v2f16>;
|
||||
def : FlatSignedAtomicPat <FLAT_ATOMIC_PK_ADD_BF16_RTN, int_amdgcn_flat_atomic_fadd_v2bf16, v2i16>;
|
||||
defm : GlobalFLATAtomicPatsRtn <"GLOBAL_ATOMIC_PK_ADD_BF16", int_amdgcn_global_atomic_fadd_v2bf16, v2i16>;
|
||||
defm : FlatSignedAtomicPat <"FLAT_ATOMIC_ADD_F32", "atomic_load_fadd_flat", f32>;
|
||||
defm : FlatSignedAtomicPat <"FLAT_ATOMIC_PK_ADD_F16", "atomic_load_fadd_v2f16_flat", v2f16>;
|
||||
defm : FlatSignedIntrPat <"FLAT_ATOMIC_ADD_F32", "int_amdgcn_flat_atomic_fadd", f32>;
|
||||
defm : FlatSignedIntrPat <"FLAT_ATOMIC_PK_ADD_F16", "int_amdgcn_flat_atomic_fadd", v2f16>;
|
||||
defm : FlatSignedIntrPat <"FLAT_ATOMIC_PK_ADD_BF16", "int_amdgcn_flat_atomic_fadd_v2bf16", v2i16>;
|
||||
defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_PK_ADD_BF16", "int_amdgcn_global_atomic_fadd_v2bf16", v2i16>;
|
||||
}
|
||||
|
||||
} // End OtherPredicates = [HasFlatGlobalInsts], AddedComplexity = 10
|
||||
|
|
|
@ -319,10 +319,10 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret(double addrspace(1)* %pt
|
|||
; GFX90A-LABEL: global_atomic_fadd_f64_noret:
|
||||
; GFX90A: ; %bb.0: ; %main_body
|
||||
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v2, 0
|
||||
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
|
||||
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
|
||||
; GFX90A-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off glc
|
||||
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
|
||||
; GFX90A-NEXT: global_atomic_add_f64 v[0:1], v2, v[0:1], s[0:1] glc
|
||||
; GFX90A-NEXT: s_endpgm
|
||||
main_body:
|
||||
%ret = call double @llvm.amdgcn.global.atomic.fadd.f64.p1f64.f64(double addrspace(1)* %ptr, double %data)
|
||||
|
@ -333,10 +333,10 @@ define amdgpu_kernel void @global_atomic_fmin_f64_noret(double addrspace(1)* %pt
|
|||
; GFX90A-LABEL: global_atomic_fmin_f64_noret:
|
||||
; GFX90A: ; %bb.0: ; %main_body
|
||||
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v2, 0
|
||||
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
|
||||
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
|
||||
; GFX90A-NEXT: global_atomic_min_f64 v[0:1], v[0:1], v[2:3], off glc
|
||||
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
|
||||
; GFX90A-NEXT: global_atomic_min_f64 v[0:1], v2, v[0:1], s[0:1] glc
|
||||
; GFX90A-NEXT: s_endpgm
|
||||
main_body:
|
||||
%ret = call double @llvm.amdgcn.global.atomic.fmin.f64.p1f64.f64(double addrspace(1)* %ptr, double %data)
|
||||
|
@ -347,10 +347,10 @@ define amdgpu_kernel void @global_atomic_fmax_f64_noret(double addrspace(1)* %pt
|
|||
; GFX90A-LABEL: global_atomic_fmax_f64_noret:
|
||||
; GFX90A: ; %bb.0: ; %main_body
|
||||
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v2, 0
|
||||
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
|
||||
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
|
||||
; GFX90A-NEXT: global_atomic_max_f64 v[0:1], v[0:1], v[2:3], off glc
|
||||
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
|
||||
; GFX90A-NEXT: global_atomic_max_f64 v[0:1], v2, v[0:1], s[0:1] glc
|
||||
; GFX90A-NEXT: s_endpgm
|
||||
main_body:
|
||||
%ret = call double @llvm.amdgcn.global.atomic.fmax.f64.p1f64.f64(double addrspace(1)* %ptr, double %data)
|
||||
|
|
|
@ -74,10 +74,10 @@ define amdgpu_kernel void @global_atomic_fadd_f32_off_ss(float addrspace(1)* %pt
|
|||
; GFX90A: ; %bb.0:
|
||||
; GFX90A-NEXT: s_load_dword s2, s[4:5], 0x8
|
||||
; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v1, 0
|
||||
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v2, s2
|
||||
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
|
||||
; GFX90A-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2048 glc
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GFX90A-NEXT: global_atomic_add_f32 v0, v1, v0, s[0:1] offset:2048 glc
|
||||
; GFX90A-NEXT: s_endpgm
|
||||
%gep = getelementptr float, float addrspace(1)* %ptr, i64 512
|
||||
%ret = call float @llvm.amdgcn.global.atomic.fadd.f32.p1f32.f32(float addrspace(1)* %gep, float %data)
|
||||
|
|
|
@ -23,13 +23,12 @@ define amdgpu_kernel void @global_atomic_fmin_f32_noret(float addrspace(1)* %ptr
|
|||
; G_GFX10-LABEL: global_atomic_fmin_f32_noret:
|
||||
; G_GFX10: ; %bb.0: ; %main_body
|
||||
; G_GFX10-NEXT: s_clause 0x1
|
||||
; G_GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
|
||||
; G_GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c
|
||||
; G_GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
|
||||
; G_GFX10-NEXT: v_mov_b32_e32 v1, 0
|
||||
; G_GFX10-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; G_GFX10-NEXT: v_mov_b32_e32 v0, s2
|
||||
; G_GFX10-NEXT: v_mov_b32_e32 v1, s3
|
||||
; G_GFX10-NEXT: v_mov_b32_e32 v2, s4
|
||||
; G_GFX10-NEXT: global_atomic_fmin v0, v[0:1], v2, off glc
|
||||
; G_GFX10-NEXT: v_mov_b32_e32 v0, s4
|
||||
; G_GFX10-NEXT: global_atomic_fmin v0, v1, v0, s[2:3] glc
|
||||
; G_GFX10-NEXT: s_endpgm
|
||||
main_body:
|
||||
%ret = call float @llvm.amdgcn.global.atomic.fmin.f32.p1f32.f32(float addrspace(1)* %ptr, float %data)
|
||||
|
@ -51,13 +50,12 @@ define amdgpu_kernel void @global_atomic_fmax_f32_noret(float addrspace(1)* %ptr
|
|||
; G_GFX10-LABEL: global_atomic_fmax_f32_noret:
|
||||
; G_GFX10: ; %bb.0: ; %main_body
|
||||
; G_GFX10-NEXT: s_clause 0x1
|
||||
; G_GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
|
||||
; G_GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c
|
||||
; G_GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
|
||||
; G_GFX10-NEXT: v_mov_b32_e32 v1, 0
|
||||
; G_GFX10-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; G_GFX10-NEXT: v_mov_b32_e32 v0, s2
|
||||
; G_GFX10-NEXT: v_mov_b32_e32 v1, s3
|
||||
; G_GFX10-NEXT: v_mov_b32_e32 v2, s4
|
||||
; G_GFX10-NEXT: global_atomic_fmax v0, v[0:1], v2, off glc
|
||||
; G_GFX10-NEXT: v_mov_b32_e32 v0, s4
|
||||
; G_GFX10-NEXT: global_atomic_fmax v0, v1, v0, s[2:3] glc
|
||||
; G_GFX10-NEXT: s_endpgm
|
||||
main_body:
|
||||
%ret = call float @llvm.amdgcn.global.atomic.fmax.f32.p1f32.f32(float addrspace(1)* %ptr, float %data)
|
||||
|
@ -120,12 +118,11 @@ define amdgpu_kernel void @global_atomic_fmin_f64_noret(double addrspace(1)* %pt
|
|||
; G_GFX10-LABEL: global_atomic_fmin_f64_noret:
|
||||
; G_GFX10: ; %bb.0: ; %main_body
|
||||
; G_GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
|
||||
; G_GFX10-NEXT: v_mov_b32_e32 v2, 0
|
||||
; G_GFX10-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; G_GFX10-NEXT: v_mov_b32_e32 v0, s0
|
||||
; G_GFX10-NEXT: v_mov_b32_e32 v2, s2
|
||||
; G_GFX10-NEXT: v_mov_b32_e32 v1, s1
|
||||
; G_GFX10-NEXT: v_mov_b32_e32 v3, s3
|
||||
; G_GFX10-NEXT: global_atomic_fmin_x2 v[0:1], v[0:1], v[2:3], off glc
|
||||
; G_GFX10-NEXT: v_mov_b32_e32 v0, s2
|
||||
; G_GFX10-NEXT: v_mov_b32_e32 v1, s3
|
||||
; G_GFX10-NEXT: global_atomic_fmin_x2 v[0:1], v2, v[0:1], s[0:1] glc
|
||||
; G_GFX10-NEXT: s_endpgm
|
||||
main_body:
|
||||
%ret = call double @llvm.amdgcn.global.atomic.fmin.f64.p1f64.f64(double addrspace(1)* %ptr, double %data)
|
||||
|
@ -146,12 +143,11 @@ define amdgpu_kernel void @global_atomic_fmax_f64_noret(double addrspace(1)* %pt
|
|||
; G_GFX10-LABEL: global_atomic_fmax_f64_noret:
|
||||
; G_GFX10: ; %bb.0: ; %main_body
|
||||
; G_GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
|
||||
; G_GFX10-NEXT: v_mov_b32_e32 v2, 0
|
||||
; G_GFX10-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; G_GFX10-NEXT: v_mov_b32_e32 v0, s0
|
||||
; G_GFX10-NEXT: v_mov_b32_e32 v2, s2
|
||||
; G_GFX10-NEXT: v_mov_b32_e32 v1, s1
|
||||
; G_GFX10-NEXT: v_mov_b32_e32 v3, s3
|
||||
; G_GFX10-NEXT: global_atomic_fmax_x2 v[0:1], v[0:1], v[2:3], off glc
|
||||
; G_GFX10-NEXT: v_mov_b32_e32 v0, s2
|
||||
; G_GFX10-NEXT: v_mov_b32_e32 v1, s3
|
||||
; G_GFX10-NEXT: global_atomic_fmax_x2 v[0:1], v2, v[0:1], s[0:1] glc
|
||||
; G_GFX10-NEXT: s_endpgm
|
||||
main_body:
|
||||
%ret = call double @llvm.amdgcn.global.atomic.fmax.f64.p1f64.f64(double addrspace(1)* %ptr, double %data)
|
||||
|
|
Loading…
Reference in New Issue