forked from OSchip/llvm-project
[AMDGPU] Add constrained shift pattern matches.
Motivation: to conform to the OpenCL C rules for shift operators (https://www.khronos.org/registry/OpenCL/specs/3.0-unified/html/OpenCL_C.html#operators-shift), clang emits (<shift> a, (and b, <width> - 1)) for `a <shift> b` in OpenCL, where a is an integer of bit width <width>. The new patterns match this masked form directly, so the shift can be selected without a separate `and` of the shift amount. Differential revision: https://reviews.llvm.org/D110231
This commit is contained in:
parent
56b74613bf
commit
61e3b9fefe
|
@ -237,6 +237,36 @@ def select_oneuse : HasOneUseTernaryOp<select>;
|
||||||
def AMDGPUmul_u24_oneuse : HasOneUseBinOp<AMDGPUmul_u24>;
|
def AMDGPUmul_u24_oneuse : HasOneUseBinOp<AMDGPUmul_u24>;
|
||||||
def AMDGPUmul_i24_oneuse : HasOneUseBinOp<AMDGPUmul_i24>;
|
def AMDGPUmul_i24_oneuse : HasOneUseBinOp<AMDGPUmul_i24>;
|
||||||
|
|
||||||
|
//===----------------------------------------------------------------------===//
|
||||||
|
// PatFrags for shifts
|
||||||
|
//===----------------------------------------------------------------------===//
|
||||||
|
|
||||||
|
// Constrained shift PatFrags: for each width, match a shift whose amount is either used as-is or first ANDed with (width - 1).
|
||||||
|
foreach width = [16, 32, 64] in {
|
||||||
|
defvar mask = !sub(width, 1);
|
||||||
|
|
||||||
|
def cshl_#width : PatFrags<(ops node:$src0, node:$src1),
|
||||||
|
[(shl node:$src0, node:$src1), (shl node:$src0, (and node:$src1, mask))]>;
|
||||||
|
defvar cshl = !cast<SDPatternOperator>("cshl_"#width);
|
||||||
|
def cshl_#width#_oneuse : HasOneUseBinOp<cshl>;
|
||||||
|
def clshl_rev_#width : PatFrag <(ops node:$src0, node:$src1),
|
||||||
|
(cshl $src1, $src0)>;
|
||||||
|
|
||||||
|
def csrl_#width : PatFrags<(ops node:$src0, node:$src1),
|
||||||
|
[(srl node:$src0, node:$src1), (srl node:$src0, (and node:$src1, mask))]>;
|
||||||
|
defvar csrl = !cast<SDPatternOperator>("csrl_"#width);
|
||||||
|
def csrl_#width#_oneuse : HasOneUseBinOp<csrl>;
|
||||||
|
def clshr_rev_#width : PatFrag <(ops node:$src0, node:$src1),
|
||||||
|
(csrl $src1, $src0)>;
|
||||||
|
|
||||||
|
def csra_#width : PatFrags<(ops node:$src0, node:$src1),
|
||||||
|
[(sra node:$src0, node:$src1), (sra node:$src0, (and node:$src1, mask))]>;
|
||||||
|
defvar csra = !cast<SDPatternOperator>("csra_"#width);
|
||||||
|
def csra_#width#_oneuse : HasOneUseBinOp<csra>;
|
||||||
|
def cashr_rev_#width : PatFrag <(ops node:$src0, node:$src1),
|
||||||
|
(csra $src1, $src0)>;
|
||||||
|
} // end foreach width
|
||||||
|
|
||||||
def srl_16 : PatFrag<
|
def srl_16 : PatFrag<
|
||||||
(ops node:$src0), (srl_oneuse node:$src0, (i32 16))
|
(ops node:$src0), (srl_oneuse node:$src0, (i32 16))
|
||||||
>;
|
>;
|
||||||
|
|
|
@ -598,22 +598,22 @@ let AddedComplexity = 1 in {
|
||||||
let Defs = [SCC] in {
|
let Defs = [SCC] in {
|
||||||
// TODO: b64 versions require VOP3 change since v_lshlrev_b64 is VOP3
|
// TODO: b64 versions require VOP3 change since v_lshlrev_b64 is VOP3
|
||||||
def S_LSHL_B32 : SOP2_32 <"s_lshl_b32",
|
def S_LSHL_B32 : SOP2_32 <"s_lshl_b32",
|
||||||
[(set SReg_32:$sdst, (UniformBinFrag<shl> (i32 SSrc_b32:$src0), (i32 SSrc_b32:$src1)))]
|
[(set SReg_32:$sdst, (UniformBinFrag<cshl_32> (i32 SSrc_b32:$src0), (i32 SSrc_b32:$src1)))]
|
||||||
>;
|
>;
|
||||||
def S_LSHL_B64 : SOP2_64_32 <"s_lshl_b64",
|
def S_LSHL_B64 : SOP2_64_32 <"s_lshl_b64",
|
||||||
[(set SReg_64:$sdst, (UniformBinFrag<shl> (i64 SSrc_b64:$src0), (i32 SSrc_b32:$src1)))]
|
[(set SReg_64:$sdst, (UniformBinFrag<cshl_64> (i64 SSrc_b64:$src0), (i32 SSrc_b32:$src1)))]
|
||||||
>;
|
>;
|
||||||
def S_LSHR_B32 : SOP2_32 <"s_lshr_b32",
|
def S_LSHR_B32 : SOP2_32 <"s_lshr_b32",
|
||||||
[(set SReg_32:$sdst, (UniformBinFrag<srl> (i32 SSrc_b32:$src0), (i32 SSrc_b32:$src1)))]
|
[(set SReg_32:$sdst, (UniformBinFrag<csrl_32> (i32 SSrc_b32:$src0), (i32 SSrc_b32:$src1)))]
|
||||||
>;
|
>;
|
||||||
def S_LSHR_B64 : SOP2_64_32 <"s_lshr_b64",
|
def S_LSHR_B64 : SOP2_64_32 <"s_lshr_b64",
|
||||||
[(set SReg_64:$sdst, (UniformBinFrag<srl> (i64 SSrc_b64:$src0), (i32 SSrc_b32:$src1)))]
|
[(set SReg_64:$sdst, (UniformBinFrag<csrl_64> (i64 SSrc_b64:$src0), (i32 SSrc_b32:$src1)))]
|
||||||
>;
|
>;
|
||||||
def S_ASHR_I32 : SOP2_32 <"s_ashr_i32",
|
def S_ASHR_I32 : SOP2_32 <"s_ashr_i32",
|
||||||
[(set SReg_32:$sdst, (UniformBinFrag<sra> (i32 SSrc_b32:$src0), (i32 SSrc_b32:$src1)))]
|
[(set SReg_32:$sdst, (UniformBinFrag<csra_32> (i32 SSrc_b32:$src0), (i32 SSrc_b32:$src1)))]
|
||||||
>;
|
>;
|
||||||
def S_ASHR_I64 : SOP2_64_32 <"s_ashr_i64",
|
def S_ASHR_I64 : SOP2_64_32 <"s_ashr_i64",
|
||||||
[(set SReg_64:$sdst, (UniformBinFrag<sra> (i64 SSrc_b64:$src0), (i32 SSrc_b32:$src1)))]
|
[(set SReg_64:$sdst, (UniformBinFrag<csra_64> (i64 SSrc_b64:$src0), (i32 SSrc_b32:$src1)))]
|
||||||
>;
|
>;
|
||||||
} // End Defs = [SCC]
|
} // End Defs = [SCC]
|
||||||
|
|
||||||
|
|
|
@ -501,9 +501,9 @@ defm V_MIN_I32 : VOP2Inst <"v_min_i32", VOP_PAT_GEN<VOP_I32_I32_I32>, smin>;
|
||||||
defm V_MAX_I32 : VOP2Inst <"v_max_i32", VOP_PAT_GEN<VOP_I32_I32_I32>, smax>;
|
defm V_MAX_I32 : VOP2Inst <"v_max_i32", VOP_PAT_GEN<VOP_I32_I32_I32>, smax>;
|
||||||
defm V_MIN_U32 : VOP2Inst <"v_min_u32", VOP_PAT_GEN<VOP_I32_I32_I32>, umin>;
|
defm V_MIN_U32 : VOP2Inst <"v_min_u32", VOP_PAT_GEN<VOP_I32_I32_I32>, umin>;
|
||||||
defm V_MAX_U32 : VOP2Inst <"v_max_u32", VOP_PAT_GEN<VOP_I32_I32_I32>, umax>;
|
defm V_MAX_U32 : VOP2Inst <"v_max_u32", VOP_PAT_GEN<VOP_I32_I32_I32>, umax>;
|
||||||
defm V_LSHRREV_B32 : VOP2Inst <"v_lshrrev_b32", VOP_I32_I32_I32, lshr_rev, "v_lshr_b32">;
|
defm V_LSHRREV_B32 : VOP2Inst <"v_lshrrev_b32", VOP_I32_I32_I32, clshr_rev_32, "v_lshr_b32">;
|
||||||
defm V_ASHRREV_I32 : VOP2Inst <"v_ashrrev_i32", VOP_I32_I32_I32, ashr_rev, "v_ashr_i32">;
|
defm V_ASHRREV_I32 : VOP2Inst <"v_ashrrev_i32", VOP_I32_I32_I32, cashr_rev_32, "v_ashr_i32">;
|
||||||
defm V_LSHLREV_B32 : VOP2Inst <"v_lshlrev_b32", VOP_I32_I32_I32, lshl_rev, "v_lshl_b32">;
|
defm V_LSHLREV_B32 : VOP2Inst <"v_lshlrev_b32", VOP_I32_I32_I32, clshl_rev_32, "v_lshl_b32">;
|
||||||
defm V_AND_B32 : VOP2Inst <"v_and_b32", VOP_PAT_GEN<VOP_I32_I32_I32>, and>;
|
defm V_AND_B32 : VOP2Inst <"v_and_b32", VOP_PAT_GEN<VOP_I32_I32_I32>, and>;
|
||||||
defm V_OR_B32 : VOP2Inst <"v_or_b32", VOP_PAT_GEN<VOP_I32_I32_I32>, or>;
|
defm V_OR_B32 : VOP2Inst <"v_or_b32", VOP_PAT_GEN<VOP_I32_I32_I32>, or>;
|
||||||
defm V_XOR_B32 : VOP2Inst <"v_xor_b32", VOP_PAT_GEN<VOP_I32_I32_I32>, xor>;
|
defm V_XOR_B32 : VOP2Inst <"v_xor_b32", VOP_PAT_GEN<VOP_I32_I32_I32>, xor>;
|
||||||
|
@ -578,9 +578,9 @@ defm V_MAX_LEGACY_F32 : VOP2Inst <"v_max_legacy_f32", VOP_F32_F32_F32, AMDGPUfma
|
||||||
|
|
||||||
let isCommutable = 1 in {
|
let isCommutable = 1 in {
|
||||||
let SubtargetPredicate = isGFX6GFX7 in {
|
let SubtargetPredicate = isGFX6GFX7 in {
|
||||||
defm V_LSHR_B32 : VOP2Inst <"v_lshr_b32", VOP_PAT_GEN<VOP_I32_I32_I32>, srl>;
|
defm V_LSHR_B32 : VOP2Inst <"v_lshr_b32", VOP_PAT_GEN<VOP_I32_I32_I32>, csrl_32>;
|
||||||
defm V_ASHR_I32 : VOP2Inst <"v_ashr_i32", VOP_PAT_GEN<VOP_I32_I32_I32>, sra>;
|
defm V_ASHR_I32 : VOP2Inst <"v_ashr_i32", VOP_PAT_GEN<VOP_I32_I32_I32>, csra_32>;
|
||||||
defm V_LSHL_B32 : VOP2Inst <"v_lshl_b32", VOP_PAT_GEN<VOP_I32_I32_I32>, shl>;
|
defm V_LSHL_B32 : VOP2Inst <"v_lshl_b32", VOP_PAT_GEN<VOP_I32_I32_I32>, cshl_32>;
|
||||||
} // End SubtargetPredicate = isGFX6GFX7
|
} // End SubtargetPredicate = isGFX6GFX7
|
||||||
} // End isCommutable = 1
|
} // End isCommutable = 1
|
||||||
} // End isReMaterializable = 1
|
} // End isReMaterializable = 1
|
||||||
|
@ -605,9 +605,9 @@ class DivergentClampingBinOp<SDPatternOperator Op, VOP_Pseudo Inst> :
|
||||||
)
|
)
|
||||||
>;
|
>;
|
||||||
|
|
||||||
def : DivergentBinOp<srl, V_LSHRREV_B32_e64>;
|
def : DivergentBinOp<csrl_32, V_LSHRREV_B32_e64>;
|
||||||
def : DivergentBinOp<sra, V_ASHRREV_I32_e64>;
|
def : DivergentBinOp<csra_32, V_ASHRREV_I32_e64>;
|
||||||
def : DivergentBinOp<shl, V_LSHLREV_B32_e64>;
|
def : DivergentBinOp<cshl_32, V_LSHLREV_B32_e64>;
|
||||||
|
|
||||||
let SubtargetPredicate = HasAddNoCarryInsts in {
|
let SubtargetPredicate = HasAddNoCarryInsts in {
|
||||||
def : DivergentClampingBinOp<add, V_ADD_U32_e64>;
|
def : DivergentClampingBinOp<add, V_ADD_U32_e64>;
|
||||||
|
@ -648,9 +648,9 @@ def V_MADMK_F16 : VOP2_Pseudo <"v_madmk_f16", VOP_MADMK_F16, [], "">;
|
||||||
defm V_LDEXP_F16 : VOP2Inst <"v_ldexp_f16", VOP_F16_F16_I32, AMDGPUldexp>;
|
defm V_LDEXP_F16 : VOP2Inst <"v_ldexp_f16", VOP_F16_F16_I32, AMDGPUldexp>;
|
||||||
} // End FPDPRounding = 1
|
} // End FPDPRounding = 1
|
||||||
|
|
||||||
defm V_LSHLREV_B16 : VOP2Inst <"v_lshlrev_b16", VOP_I16_I16_I16, lshl_rev>;
|
defm V_LSHLREV_B16 : VOP2Inst <"v_lshlrev_b16", VOP_I16_I16_I16, clshl_rev_16>;
|
||||||
defm V_LSHRREV_B16 : VOP2Inst <"v_lshrrev_b16", VOP_I16_I16_I16, lshr_rev>;
|
defm V_LSHRREV_B16 : VOP2Inst <"v_lshrrev_b16", VOP_I16_I16_I16, clshr_rev_16>;
|
||||||
defm V_ASHRREV_I16 : VOP2Inst <"v_ashrrev_i16", VOP_I16_I16_I16, ashr_rev>;
|
defm V_ASHRREV_I16 : VOP2Inst <"v_ashrrev_i16", VOP_I16_I16_I16, cashr_rev_16>;
|
||||||
|
|
||||||
let isCommutable = 1 in {
|
let isCommutable = 1 in {
|
||||||
let FPDPRounding = 1 in {
|
let FPDPRounding = 1 in {
|
||||||
|
@ -852,9 +852,9 @@ defm : Arithmetic_i16_0Hi_Pats<smin, V_MIN_I16_e64>;
|
||||||
defm : Arithmetic_i16_0Hi_Pats<smax, V_MAX_I16_e64>;
|
defm : Arithmetic_i16_0Hi_Pats<smax, V_MAX_I16_e64>;
|
||||||
defm : Arithmetic_i16_0Hi_Pats<umin, V_MIN_U16_e64>;
|
defm : Arithmetic_i16_0Hi_Pats<umin, V_MIN_U16_e64>;
|
||||||
defm : Arithmetic_i16_0Hi_Pats<umax, V_MAX_U16_e64>;
|
defm : Arithmetic_i16_0Hi_Pats<umax, V_MAX_U16_e64>;
|
||||||
defm : Arithmetic_i16_0Hi_Pats<lshl_rev, V_LSHLREV_B16_e64>;
|
defm : Arithmetic_i16_0Hi_Pats<clshl_rev_16, V_LSHLREV_B16_e64>;
|
||||||
defm : Arithmetic_i16_0Hi_Pats<lshr_rev, V_LSHRREV_B16_e64>;
|
defm : Arithmetic_i16_0Hi_Pats<clshr_rev_16, V_LSHRREV_B16_e64>;
|
||||||
defm : Arithmetic_i16_0Hi_Pats<ashr_rev, V_ASHRREV_I16_e64>;
|
defm : Arithmetic_i16_0Hi_Pats<cashr_rev_16, V_ASHRREV_I16_e64>;
|
||||||
} // End Predicates = [Has16BitInsts, isGFX7GFX8GFX9]
|
} // End Predicates = [Has16BitInsts, isGFX7GFX8GFX9]
|
||||||
|
|
||||||
def : ZExt_i16_i1_Pat<zext>;
|
def : ZExt_i16_i1_Pat<zext>;
|
||||||
|
|
|
@ -400,15 +400,15 @@ defm V_TRIG_PREOP_F64 : VOP3Inst <"v_trig_preop_f64", VOP3_Profile<VOP_F64_F64_I
|
||||||
|
|
||||||
let SchedRW = [Write64Bit] in {
|
let SchedRW = [Write64Bit] in {
|
||||||
let SubtargetPredicate = isGFX6GFX7 in {
|
let SubtargetPredicate = isGFX6GFX7 in {
|
||||||
defm V_LSHL_B64 : VOP3Inst <"v_lshl_b64", VOP3_Profile<VOP_I64_I64_I32>, shl>;
|
defm V_LSHL_B64 : VOP3Inst <"v_lshl_b64", VOP3_Profile<VOP_I64_I64_I32>, cshl_64>;
|
||||||
defm V_LSHR_B64 : VOP3Inst <"v_lshr_b64", VOP3_Profile<VOP_I64_I64_I32>, srl>;
|
defm V_LSHR_B64 : VOP3Inst <"v_lshr_b64", VOP3_Profile<VOP_I64_I64_I32>, csrl_64>;
|
||||||
defm V_ASHR_I64 : VOP3Inst <"v_ashr_i64", VOP3_Profile<VOP_I64_I64_I32>, sra>;
|
defm V_ASHR_I64 : VOP3Inst <"v_ashr_i64", VOP3_Profile<VOP_I64_I64_I32>, csra_64>;
|
||||||
} // End SubtargetPredicate = isGFX6GFX7
|
} // End SubtargetPredicate = isGFX6GFX7
|
||||||
|
|
||||||
let SubtargetPredicate = isGFX8Plus in {
|
let SubtargetPredicate = isGFX8Plus in {
|
||||||
defm V_LSHLREV_B64 : VOP3Inst <"v_lshlrev_b64", VOP3_Profile<VOP_I64_I32_I64>, lshl_rev>;
|
defm V_LSHLREV_B64 : VOP3Inst <"v_lshlrev_b64", VOP3_Profile<VOP_I64_I32_I64>, clshl_rev_64>;
|
||||||
defm V_LSHRREV_B64 : VOP3Inst <"v_lshrrev_b64", VOP3_Profile<VOP_I64_I32_I64>, lshr_rev>;
|
defm V_LSHRREV_B64 : VOP3Inst <"v_lshrrev_b64", VOP3_Profile<VOP_I64_I32_I64>, clshr_rev_64>;
|
||||||
defm V_ASHRREV_I64 : VOP3Inst <"v_ashrrev_i64", VOP3_Profile<VOP_I64_I32_I64>, ashr_rev>;
|
defm V_ASHRREV_I64 : VOP3Inst <"v_ashrrev_i64", VOP3_Profile<VOP_I64_I32_I64>, cashr_rev_64>;
|
||||||
} // End SubtargetPredicate = isGFX8Plus
|
} // End SubtargetPredicate = isGFX8Plus
|
||||||
} // End SchedRW = [Write64Bit]
|
} // End SchedRW = [Write64Bit]
|
||||||
} // End isReMaterializable = 1
|
} // End isReMaterializable = 1
|
||||||
|
@ -656,10 +656,10 @@ class ThreeOp_i32_Pats <SDPatternOperator op1, SDPatternOperator op2, Instructio
|
||||||
(inst VSrc_b32:$src0, VSrc_b32:$src1, VSrc_b32:$src2)
|
(inst VSrc_b32:$src0, VSrc_b32:$src1, VSrc_b32:$src2)
|
||||||
>;
|
>;
|
||||||
|
|
||||||
def : ThreeOp_i32_Pats<shl, add, V_LSHL_ADD_U32_e64>;
|
def : ThreeOp_i32_Pats<cshl_32, add, V_LSHL_ADD_U32_e64>;
|
||||||
def : ThreeOp_i32_Pats<add, shl, V_ADD_LSHL_U32_e64>;
|
def : ThreeOp_i32_Pats<add, cshl_32, V_ADD_LSHL_U32_e64>;
|
||||||
def : ThreeOp_i32_Pats<add, add, V_ADD3_U32_e64>;
|
def : ThreeOp_i32_Pats<add, add, V_ADD3_U32_e64>;
|
||||||
def : ThreeOp_i32_Pats<shl, or, V_LSHL_OR_B32_e64>;
|
def : ThreeOp_i32_Pats<cshl_32, or, V_LSHL_OR_B32_e64>;
|
||||||
def : ThreeOp_i32_Pats<and, or, V_AND_OR_B32_e64>;
|
def : ThreeOp_i32_Pats<and, or, V_AND_OR_B32_e64>;
|
||||||
def : ThreeOp_i32_Pats<or, or, V_OR3_B32_e64>;
|
def : ThreeOp_i32_Pats<or, or, V_OR3_B32_e64>;
|
||||||
def : ThreeOp_i32_Pats<xor, add, V_XAD_U32_e64>;
|
def : ThreeOp_i32_Pats<xor, add, V_XAD_U32_e64>;
|
||||||
|
|
|
@ -82,9 +82,9 @@ defm V_PK_MAX_U16 : VOP3PInst<"v_pk_max_u16", VOP3_Profile<VOP_V2I16_V2I16_V2I16
|
||||||
defm V_PK_SUB_U16 : VOP3PInst<"v_pk_sub_u16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>>;
|
defm V_PK_SUB_U16 : VOP3PInst<"v_pk_sub_u16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>>;
|
||||||
defm V_PK_SUB_I16 : VOP3PInst<"v_pk_sub_i16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, sub>;
|
defm V_PK_SUB_I16 : VOP3PInst<"v_pk_sub_i16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, sub>;
|
||||||
|
|
||||||
defm V_PK_LSHLREV_B16 : VOP3PInst<"v_pk_lshlrev_b16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, lshl_rev>;
|
defm V_PK_LSHLREV_B16 : VOP3PInst<"v_pk_lshlrev_b16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, clshl_rev_16>;
|
||||||
defm V_PK_ASHRREV_I16 : VOP3PInst<"v_pk_ashrrev_i16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, ashr_rev>;
|
defm V_PK_ASHRREV_I16 : VOP3PInst<"v_pk_ashrrev_i16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, cashr_rev_16>;
|
||||||
defm V_PK_LSHRREV_B16 : VOP3PInst<"v_pk_lshrrev_b16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, lshr_rev>;
|
defm V_PK_LSHRREV_B16 : VOP3PInst<"v_pk_lshrrev_b16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, clshr_rev_16>;
|
||||||
|
|
||||||
|
|
||||||
let SubtargetPredicate = HasVOP3PInsts in {
|
let SubtargetPredicate = HasVOP3PInsts in {
|
||||||
|
|
|
@ -759,10 +759,11 @@ class getNumNodeArgs<SDPatternOperator Op> {
|
||||||
int ret = TP.NumOperands;
|
int ret = TP.NumOperands;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
class getDivergentFrag<SDPatternOperator Op> {
|
class getDivergentFrag<SDPatternOperator Op> {
|
||||||
|
assert !or(!isa<SDNode>(Op), !isa<PatFrags>(Op)), "Expected SDNode or PatFrags";
|
||||||
|
|
||||||
int NumSrcArgs = getNumNodeArgs<Op>.ret;
|
int NumSrcArgs = !if(!isa<SDNode>(Op), getNumNodeArgs<Op>.ret,
|
||||||
|
!size(!cast<PatFrags>(Op).Operands));
|
||||||
PatFrag ret = PatFrag <
|
PatFrag ret = PatFrag <
|
||||||
!if(!eq(NumSrcArgs, 1),
|
!if(!eq(NumSrcArgs, 1),
|
||||||
(ops node:$src0),
|
(ops node:$src0),
|
||||||
|
|
|
@ -2898,24 +2898,20 @@ define i16 @v_fshl_i16(i16 %lhs, i16 %rhs, i16 %amt) {
|
||||||
; GFX8-LABEL: v_fshl_i16:
|
; GFX8-LABEL: v_fshl_i16:
|
||||||
; GFX8: ; %bb.0:
|
; GFX8: ; %bb.0:
|
||||||
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||||
; GFX8-NEXT: v_and_b32_e32 v3, 15, v2
|
; GFX8-NEXT: v_xor_b32_e32 v3, -1, v2
|
||||||
; GFX8-NEXT: v_xor_b32_e32 v2, -1, v2
|
|
||||||
; GFX8-NEXT: v_and_b32_e32 v2, 15, v2
|
|
||||||
; GFX8-NEXT: v_lshrrev_b16_e32 v1, 1, v1
|
; GFX8-NEXT: v_lshrrev_b16_e32 v1, 1, v1
|
||||||
; GFX8-NEXT: v_lshlrev_b16_e32 v0, v3, v0
|
; GFX8-NEXT: v_lshlrev_b16_e32 v0, v2, v0
|
||||||
; GFX8-NEXT: v_lshrrev_b16_e32 v1, v2, v1
|
; GFX8-NEXT: v_lshrrev_b16_e32 v1, v3, v1
|
||||||
; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
|
; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
|
||||||
; GFX8-NEXT: s_setpc_b64 s[30:31]
|
; GFX8-NEXT: s_setpc_b64 s[30:31]
|
||||||
;
|
;
|
||||||
; GFX9-LABEL: v_fshl_i16:
|
; GFX9-LABEL: v_fshl_i16:
|
||||||
; GFX9: ; %bb.0:
|
; GFX9: ; %bb.0:
|
||||||
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||||
; GFX9-NEXT: v_and_b32_e32 v3, 15, v2
|
; GFX9-NEXT: v_xor_b32_e32 v3, -1, v2
|
||||||
; GFX9-NEXT: v_xor_b32_e32 v2, -1, v2
|
|
||||||
; GFX9-NEXT: v_and_b32_e32 v2, 15, v2
|
|
||||||
; GFX9-NEXT: v_lshrrev_b16_e32 v1, 1, v1
|
; GFX9-NEXT: v_lshrrev_b16_e32 v1, 1, v1
|
||||||
; GFX9-NEXT: v_lshlrev_b16_e32 v0, v3, v0
|
; GFX9-NEXT: v_lshlrev_b16_e32 v0, v2, v0
|
||||||
; GFX9-NEXT: v_lshrrev_b16_e32 v1, v2, v1
|
; GFX9-NEXT: v_lshrrev_b16_e32 v1, v3, v1
|
||||||
; GFX9-NEXT: v_or_b32_e32 v0, v0, v1
|
; GFX9-NEXT: v_or_b32_e32 v0, v0, v1
|
||||||
; GFX9-NEXT: s_setpc_b64 s[30:31]
|
; GFX9-NEXT: s_setpc_b64 s[30:31]
|
||||||
;
|
;
|
||||||
|
@ -2924,9 +2920,7 @@ define i16 @v_fshl_i16(i16 %lhs, i16 %rhs, i16 %amt) {
|
||||||
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||||
; GFX10-NEXT: v_xor_b32_e32 v3, -1, v2
|
; GFX10-NEXT: v_xor_b32_e32 v3, -1, v2
|
||||||
; GFX10-NEXT: v_and_b32_e32 v2, 15, v2
|
|
||||||
; GFX10-NEXT: v_lshrrev_b16 v1, 1, v1
|
; GFX10-NEXT: v_lshrrev_b16 v1, 1, v1
|
||||||
; GFX10-NEXT: v_and_b32_e32 v3, 15, v3
|
|
||||||
; GFX10-NEXT: v_lshlrev_b16 v0, v2, v0
|
; GFX10-NEXT: v_lshlrev_b16 v0, v2, v0
|
||||||
; GFX10-NEXT: v_lshrrev_b16 v1, v3, v1
|
; GFX10-NEXT: v_lshrrev_b16 v1, v3, v1
|
||||||
; GFX10-NEXT: v_or_b32_e32 v0, v0, v1
|
; GFX10-NEXT: v_or_b32_e32 v0, v0, v1
|
||||||
|
@ -3025,39 +3019,33 @@ define amdgpu_ps half @v_fshl_i16_ssv(i16 inreg %lhs, i16 inreg %rhs, i16 %amt)
|
||||||
;
|
;
|
||||||
; GFX8-LABEL: v_fshl_i16_ssv:
|
; GFX8-LABEL: v_fshl_i16_ssv:
|
||||||
; GFX8: ; %bb.0:
|
; GFX8: ; %bb.0:
|
||||||
; GFX8-NEXT: v_and_b32_e32 v1, 15, v0
|
; GFX8-NEXT: v_xor_b32_e32 v1, -1, v0
|
||||||
; GFX8-NEXT: v_xor_b32_e32 v0, -1, v0
|
; GFX8-NEXT: v_lshlrev_b16_e64 v0, v0, s0
|
||||||
; GFX8-NEXT: v_lshlrev_b16_e64 v1, v1, s0
|
|
||||||
; GFX8-NEXT: s_bfe_u32 s0, s1, 0x100000
|
; GFX8-NEXT: s_bfe_u32 s0, s1, 0x100000
|
||||||
; GFX8-NEXT: s_bfe_u32 s1, 1, 0x100000
|
; GFX8-NEXT: s_bfe_u32 s1, 1, 0x100000
|
||||||
; GFX8-NEXT: v_and_b32_e32 v0, 15, v0
|
|
||||||
; GFX8-NEXT: s_lshr_b32 s0, s0, s1
|
; GFX8-NEXT: s_lshr_b32 s0, s0, s1
|
||||||
; GFX8-NEXT: v_lshrrev_b16_e64 v0, v0, s0
|
; GFX8-NEXT: v_lshrrev_b16_e64 v1, v1, s0
|
||||||
; GFX8-NEXT: v_or_b32_e32 v0, v1, v0
|
; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
|
||||||
; GFX8-NEXT: ; return to shader part epilog
|
; GFX8-NEXT: ; return to shader part epilog
|
||||||
;
|
;
|
||||||
; GFX9-LABEL: v_fshl_i16_ssv:
|
; GFX9-LABEL: v_fshl_i16_ssv:
|
||||||
; GFX9: ; %bb.0:
|
; GFX9: ; %bb.0:
|
||||||
; GFX9-NEXT: v_and_b32_e32 v1, 15, v0
|
; GFX9-NEXT: v_xor_b32_e32 v1, -1, v0
|
||||||
; GFX9-NEXT: v_xor_b32_e32 v0, -1, v0
|
; GFX9-NEXT: v_lshlrev_b16_e64 v0, v0, s0
|
||||||
; GFX9-NEXT: v_lshlrev_b16_e64 v1, v1, s0
|
|
||||||
; GFX9-NEXT: s_bfe_u32 s0, s1, 0x100000
|
; GFX9-NEXT: s_bfe_u32 s0, s1, 0x100000
|
||||||
; GFX9-NEXT: s_bfe_u32 s1, 1, 0x100000
|
; GFX9-NEXT: s_bfe_u32 s1, 1, 0x100000
|
||||||
; GFX9-NEXT: v_and_b32_e32 v0, 15, v0
|
|
||||||
; GFX9-NEXT: s_lshr_b32 s0, s0, s1
|
; GFX9-NEXT: s_lshr_b32 s0, s0, s1
|
||||||
; GFX9-NEXT: v_lshrrev_b16_e64 v0, v0, s0
|
; GFX9-NEXT: v_lshrrev_b16_e64 v1, v1, s0
|
||||||
; GFX9-NEXT: v_or_b32_e32 v0, v1, v0
|
; GFX9-NEXT: v_or_b32_e32 v0, v0, v1
|
||||||
; GFX9-NEXT: ; return to shader part epilog
|
; GFX9-NEXT: ; return to shader part epilog
|
||||||
;
|
;
|
||||||
; GFX10-LABEL: v_fshl_i16_ssv:
|
; GFX10-LABEL: v_fshl_i16_ssv:
|
||||||
; GFX10: ; %bb.0:
|
; GFX10: ; %bb.0:
|
||||||
; GFX10-NEXT: v_xor_b32_e32 v1, -1, v0
|
; GFX10-NEXT: v_xor_b32_e32 v1, -1, v0
|
||||||
; GFX10-NEXT: v_and_b32_e32 v0, 15, v0
|
|
||||||
; GFX10-NEXT: s_bfe_u32 s1, s1, 0x100000
|
; GFX10-NEXT: s_bfe_u32 s1, s1, 0x100000
|
||||||
; GFX10-NEXT: s_bfe_u32 s2, 1, 0x100000
|
; GFX10-NEXT: s_bfe_u32 s2, 1, 0x100000
|
||||||
; GFX10-NEXT: s_lshr_b32 s1, s1, s2
|
|
||||||
; GFX10-NEXT: v_and_b32_e32 v1, 15, v1
|
|
||||||
; GFX10-NEXT: v_lshlrev_b16 v0, v0, s0
|
; GFX10-NEXT: v_lshlrev_b16 v0, v0, s0
|
||||||
|
; GFX10-NEXT: s_lshr_b32 s1, s1, s2
|
||||||
; GFX10-NEXT: v_lshrrev_b16 v1, v1, s1
|
; GFX10-NEXT: v_lshrrev_b16 v1, v1, s1
|
||||||
; GFX10-NEXT: v_or_b32_e32 v0, v0, v1
|
; GFX10-NEXT: v_or_b32_e32 v0, v0, v1
|
||||||
; GFX10-NEXT: ; return to shader part epilog
|
; GFX10-NEXT: ; return to shader part epilog
|
||||||
|
@ -3310,21 +3298,17 @@ define <2 x i16> @v_fshl_v2i16(<2 x i16> %lhs, <2 x i16> %rhs, <2 x i16> %amt) {
|
||||||
; GFX8-LABEL: v_fshl_v2i16:
|
; GFX8-LABEL: v_fshl_v2i16:
|
||||||
; GFX8: ; %bb.0:
|
; GFX8: ; %bb.0:
|
||||||
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||||
; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v2
|
; GFX8-NEXT: v_xor_b32_e32 v4, -1, v2
|
||||||
; GFX8-NEXT: v_and_b32_e32 v4, 15, v2
|
|
||||||
; GFX8-NEXT: v_xor_b32_e32 v2, -1, v2
|
|
||||||
; GFX8-NEXT: v_and_b32_e32 v2, 15, v2
|
|
||||||
; GFX8-NEXT: v_lshrrev_b16_e32 v5, 1, v1
|
; GFX8-NEXT: v_lshrrev_b16_e32 v5, 1, v1
|
||||||
; GFX8-NEXT: v_lshlrev_b16_e32 v4, v4, v0
|
; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v2
|
||||||
; GFX8-NEXT: v_lshrrev_b16_e32 v2, v2, v5
|
; GFX8-NEXT: v_lshlrev_b16_e32 v2, v2, v0
|
||||||
; GFX8-NEXT: v_or_b32_e32 v2, v4, v2
|
; GFX8-NEXT: v_lshrrev_b16_e32 v4, v4, v5
|
||||||
; GFX8-NEXT: v_and_b32_e32 v4, 15, v3
|
; GFX8-NEXT: v_or_b32_e32 v2, v2, v4
|
||||||
; GFX8-NEXT: v_xor_b32_e32 v3, -1, v3
|
; GFX8-NEXT: v_xor_b32_e32 v4, -1, v3
|
||||||
; GFX8-NEXT: v_lshlrev_b16_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
|
; GFX8-NEXT: v_lshlrev_b16_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
|
||||||
; GFX8-NEXT: v_mov_b32_e32 v4, 1
|
; GFX8-NEXT: v_mov_b32_e32 v3, 1
|
||||||
; GFX8-NEXT: v_and_b32_e32 v3, 15, v3
|
; GFX8-NEXT: v_lshrrev_b16_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
|
||||||
; GFX8-NEXT: v_lshrrev_b16_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
|
; GFX8-NEXT: v_lshrrev_b16_e32 v1, v4, v1
|
||||||
; GFX8-NEXT: v_lshrrev_b16_e32 v1, v3, v1
|
|
||||||
; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
|
; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
|
||||||
; GFX8-NEXT: v_mov_b32_e32 v1, 16
|
; GFX8-NEXT: v_mov_b32_e32 v1, 16
|
||||||
; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||||
|
@ -3447,25 +3431,21 @@ define amdgpu_ps float @v_fshl_v2i16_ssv(<2 x i16> inreg %lhs, <2 x i16> inreg %
|
||||||
;
|
;
|
||||||
; GFX8-LABEL: v_fshl_v2i16_ssv:
|
; GFX8-LABEL: v_fshl_v2i16_ssv:
|
||||||
; GFX8: ; %bb.0:
|
; GFX8: ; %bb.0:
|
||||||
; GFX8-NEXT: v_and_b32_e32 v2, 15, v0
|
|
||||||
; GFX8-NEXT: s_lshr_b32 s2, s0, 16
|
; GFX8-NEXT: s_lshr_b32 s2, s0, 16
|
||||||
; GFX8-NEXT: s_lshr_b32 s3, s1, 16
|
; GFX8-NEXT: s_lshr_b32 s3, s1, 16
|
||||||
; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v0
|
; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v0
|
||||||
; GFX8-NEXT: v_xor_b32_e32 v0, -1, v0
|
; GFX8-NEXT: v_xor_b32_e32 v2, -1, v0
|
||||||
; GFX8-NEXT: v_lshlrev_b16_e64 v2, v2, s0
|
; GFX8-NEXT: v_lshlrev_b16_e64 v0, v0, s0
|
||||||
; GFX8-NEXT: s_bfe_u32 s0, s1, 0x100000
|
; GFX8-NEXT: s_bfe_u32 s0, s1, 0x100000
|
||||||
; GFX8-NEXT: s_bfe_u32 s1, 1, 0x100000
|
; GFX8-NEXT: s_bfe_u32 s1, 1, 0x100000
|
||||||
; GFX8-NEXT: v_and_b32_e32 v0, 15, v0
|
|
||||||
; GFX8-NEXT: s_lshr_b32 s0, s0, s1
|
; GFX8-NEXT: s_lshr_b32 s0, s0, s1
|
||||||
; GFX8-NEXT: v_lshrrev_b16_e64 v0, v0, s0
|
; GFX8-NEXT: v_lshrrev_b16_e64 v2, v2, s0
|
||||||
; GFX8-NEXT: v_or_b32_e32 v0, v2, v0
|
; GFX8-NEXT: v_or_b32_e32 v0, v0, v2
|
||||||
; GFX8-NEXT: v_and_b32_e32 v2, 15, v1
|
; GFX8-NEXT: v_xor_b32_e32 v2, -1, v1
|
||||||
; GFX8-NEXT: v_xor_b32_e32 v1, -1, v1
|
|
||||||
; GFX8-NEXT: v_and_b32_e32 v1, 15, v1
|
|
||||||
; GFX8-NEXT: s_lshr_b32 s0, s3, s1
|
; GFX8-NEXT: s_lshr_b32 s0, s3, s1
|
||||||
; GFX8-NEXT: v_lshlrev_b16_e64 v2, v2, s2
|
; GFX8-NEXT: v_lshlrev_b16_e64 v1, v1, s2
|
||||||
; GFX8-NEXT: v_lshrrev_b16_e64 v1, v1, s0
|
; GFX8-NEXT: v_lshrrev_b16_e64 v2, v2, s0
|
||||||
; GFX8-NEXT: v_or_b32_e32 v1, v2, v1
|
; GFX8-NEXT: v_or_b32_e32 v1, v1, v2
|
||||||
; GFX8-NEXT: v_mov_b32_e32 v2, 16
|
; GFX8-NEXT: v_mov_b32_e32 v2, 16
|
||||||
; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||||
; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||||
|
@ -3944,37 +3924,28 @@ define <4 x half> @v_fshl_v4i16(<4 x i16> %lhs, <4 x i16> %rhs, <4 x i16> %amt)
|
||||||
; GFX8-LABEL: v_fshl_v4i16:
|
; GFX8-LABEL: v_fshl_v4i16:
|
||||||
; GFX8: ; %bb.0:
|
; GFX8: ; %bb.0:
|
||||||
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||||
; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v4
|
; GFX8-NEXT: v_xor_b32_e32 v8, -1, v4
|
||||||
; GFX8-NEXT: v_and_b32_e32 v8, 15, v4
|
|
||||||
; GFX8-NEXT: v_xor_b32_e32 v4, -1, v4
|
|
||||||
; GFX8-NEXT: v_and_b32_e32 v4, 15, v4
|
|
||||||
; GFX8-NEXT: v_lshrrev_b16_e32 v9, 1, v2
|
; GFX8-NEXT: v_lshrrev_b16_e32 v9, 1, v2
|
||||||
; GFX8-NEXT: v_lshlrev_b16_e32 v8, v8, v0
|
; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v4
|
||||||
; GFX8-NEXT: v_lshrrev_b16_e32 v4, v4, v9
|
; GFX8-NEXT: v_lshlrev_b16_e32 v4, v4, v0
|
||||||
; GFX8-NEXT: v_or_b32_e32 v4, v8, v4
|
; GFX8-NEXT: v_lshrrev_b16_e32 v8, v8, v9
|
||||||
; GFX8-NEXT: v_and_b32_e32 v8, 15, v6
|
; GFX8-NEXT: v_or_b32_e32 v4, v4, v8
|
||||||
; GFX8-NEXT: v_xor_b32_e32 v6, -1, v6
|
; GFX8-NEXT: v_xor_b32_e32 v8, -1, v6
|
||||||
; GFX8-NEXT: v_lshlrev_b16_sdwa v0, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
|
; GFX8-NEXT: v_lshlrev_b16_sdwa v0, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
|
||||||
; GFX8-NEXT: v_mov_b32_e32 v8, 1
|
; GFX8-NEXT: v_mov_b32_e32 v6, 1
|
||||||
; GFX8-NEXT: v_and_b32_e32 v6, 15, v6
|
; GFX8-NEXT: v_lshrrev_b16_sdwa v2, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
|
||||||
; GFX8-NEXT: v_lshrrev_b16_sdwa v2, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
|
; GFX8-NEXT: v_lshrrev_b16_e32 v2, v8, v2
|
||||||
; GFX8-NEXT: v_lshrrev_b16_e32 v2, v6, v2
|
|
||||||
; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v5
|
|
||||||
; GFX8-NEXT: v_or_b32_e32 v0, v0, v2
|
; GFX8-NEXT: v_or_b32_e32 v0, v0, v2
|
||||||
; GFX8-NEXT: v_and_b32_e32 v2, 15, v5
|
; GFX8-NEXT: v_xor_b32_e32 v2, -1, v5
|
||||||
; GFX8-NEXT: v_xor_b32_e32 v5, -1, v5
|
; GFX8-NEXT: v_lshrrev_b16_e32 v8, 1, v3
|
||||||
; GFX8-NEXT: v_and_b32_e32 v5, 15, v5
|
; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v5
|
||||||
; GFX8-NEXT: v_lshrrev_b16_e32 v6, 1, v3
|
; GFX8-NEXT: v_lshlrev_b16_e32 v5, v5, v1
|
||||||
; GFX8-NEXT: v_lshlrev_b16_e32 v2, v2, v1
|
; GFX8-NEXT: v_lshrrev_b16_e32 v2, v2, v8
|
||||||
; GFX8-NEXT: v_lshrrev_b16_e32 v5, v5, v6
|
; GFX8-NEXT: v_or_b32_e32 v2, v5, v2
|
||||||
; GFX8-NEXT: v_or_b32_e32 v2, v2, v5
|
; GFX8-NEXT: v_xor_b32_e32 v5, -1, v7
|
||||||
; GFX8-NEXT: v_and_b32_e32 v5, 15, v7
|
; GFX8-NEXT: v_lshrrev_b16_sdwa v3, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
|
||||||
; GFX8-NEXT: v_xor_b32_e32 v6, -1, v7
|
; GFX8-NEXT: v_lshlrev_b16_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
|
||||||
; GFX8-NEXT: v_lshlrev_b16_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
|
; GFX8-NEXT: v_lshrrev_b16_e32 v3, v5, v3
|
||||||
; GFX8-NEXT: v_mov_b32_e32 v5, 1
|
|
||||||
; GFX8-NEXT: v_and_b32_e32 v6, 15, v6
|
|
||||||
; GFX8-NEXT: v_lshrrev_b16_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
|
|
||||||
; GFX8-NEXT: v_lshrrev_b16_e32 v3, v6, v3
|
|
||||||
; GFX8-NEXT: v_or_b32_e32 v1, v1, v3
|
; GFX8-NEXT: v_or_b32_e32 v1, v1, v3
|
||||||
; GFX8-NEXT: v_mov_b32_e32 v3, 16
|
; GFX8-NEXT: v_mov_b32_e32 v3, 16
|
||||||
; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||||
|
|
|
@ -2751,24 +2751,20 @@ define i16 @v_fshr_i16(i16 %lhs, i16 %rhs, i16 %amt) {
|
||||||
; GFX8-LABEL: v_fshr_i16:
|
; GFX8-LABEL: v_fshr_i16:
|
||||||
; GFX8: ; %bb.0:
|
; GFX8: ; %bb.0:
|
||||||
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||||
; GFX8-NEXT: v_and_b32_e32 v3, 15, v2
|
; GFX8-NEXT: v_xor_b32_e32 v3, -1, v2
|
||||||
; GFX8-NEXT: v_xor_b32_e32 v2, -1, v2
|
|
||||||
; GFX8-NEXT: v_and_b32_e32 v2, 15, v2
|
|
||||||
; GFX8-NEXT: v_lshlrev_b16_e32 v0, 1, v0
|
; GFX8-NEXT: v_lshlrev_b16_e32 v0, 1, v0
|
||||||
; GFX8-NEXT: v_lshlrev_b16_e32 v0, v2, v0
|
; GFX8-NEXT: v_lshlrev_b16_e32 v0, v3, v0
|
||||||
; GFX8-NEXT: v_lshrrev_b16_e32 v1, v3, v1
|
; GFX8-NEXT: v_lshrrev_b16_e32 v1, v2, v1
|
||||||
; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
|
; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
|
||||||
; GFX8-NEXT: s_setpc_b64 s[30:31]
|
; GFX8-NEXT: s_setpc_b64 s[30:31]
|
||||||
;
|
;
|
||||||
; GFX9-LABEL: v_fshr_i16:
|
; GFX9-LABEL: v_fshr_i16:
|
||||||
; GFX9: ; %bb.0:
|
; GFX9: ; %bb.0:
|
||||||
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||||
; GFX9-NEXT: v_and_b32_e32 v3, 15, v2
|
; GFX9-NEXT: v_xor_b32_e32 v3, -1, v2
|
||||||
; GFX9-NEXT: v_xor_b32_e32 v2, -1, v2
|
|
||||||
; GFX9-NEXT: v_and_b32_e32 v2, 15, v2
|
|
||||||
; GFX9-NEXT: v_lshlrev_b16_e32 v0, 1, v0
|
; GFX9-NEXT: v_lshlrev_b16_e32 v0, 1, v0
|
||||||
; GFX9-NEXT: v_lshlrev_b16_e32 v0, v2, v0
|
; GFX9-NEXT: v_lshlrev_b16_e32 v0, v3, v0
|
||||||
; GFX9-NEXT: v_lshrrev_b16_e32 v1, v3, v1
|
; GFX9-NEXT: v_lshrrev_b16_e32 v1, v2, v1
|
||||||
; GFX9-NEXT: v_or_b32_e32 v0, v0, v1
|
; GFX9-NEXT: v_or_b32_e32 v0, v0, v1
|
||||||
; GFX9-NEXT: s_setpc_b64 s[30:31]
|
; GFX9-NEXT: s_setpc_b64 s[30:31]
|
||||||
;
|
;
|
||||||
|
@ -2778,8 +2774,6 @@ define i16 @v_fshr_i16(i16 %lhs, i16 %rhs, i16 %amt) {
|
||||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||||
; GFX10-NEXT: v_xor_b32_e32 v3, -1, v2
|
; GFX10-NEXT: v_xor_b32_e32 v3, -1, v2
|
||||||
; GFX10-NEXT: v_lshlrev_b16 v0, 1, v0
|
; GFX10-NEXT: v_lshlrev_b16 v0, 1, v0
|
||||||
; GFX10-NEXT: v_and_b32_e32 v2, 15, v2
|
|
||||||
; GFX10-NEXT: v_and_b32_e32 v3, 15, v3
|
|
||||||
; GFX10-NEXT: v_lshrrev_b16 v1, v2, v1
|
; GFX10-NEXT: v_lshrrev_b16 v1, v2, v1
|
||||||
; GFX10-NEXT: v_lshlrev_b16 v0, v3, v0
|
; GFX10-NEXT: v_lshlrev_b16 v0, v3, v0
|
||||||
; GFX10-NEXT: v_or_b32_e32 v0, v0, v1
|
; GFX10-NEXT: v_or_b32_e32 v0, v0, v1
|
||||||
|
@ -2879,36 +2873,30 @@ define amdgpu_ps half @v_fshr_i16_ssv(i16 inreg %lhs, i16 inreg %rhs, i16 %amt)
|
||||||
;
|
;
|
||||||
; GFX8-LABEL: v_fshr_i16_ssv:
|
; GFX8-LABEL: v_fshr_i16_ssv:
|
||||||
; GFX8: ; %bb.0:
|
; GFX8: ; %bb.0:
|
||||||
; GFX8-NEXT: v_and_b32_e32 v1, 15, v0
|
|
||||||
; GFX8-NEXT: v_xor_b32_e32 v0, -1, v0
|
|
||||||
; GFX8-NEXT: s_bfe_u32 s2, 1, 0x100000
|
; GFX8-NEXT: s_bfe_u32 s2, 1, 0x100000
|
||||||
; GFX8-NEXT: v_and_b32_e32 v0, 15, v0
|
; GFX8-NEXT: v_xor_b32_e32 v1, -1, v0
|
||||||
; GFX8-NEXT: s_lshl_b32 s0, s0, s2
|
; GFX8-NEXT: s_lshl_b32 s0, s0, s2
|
||||||
; GFX8-NEXT: v_lshlrev_b16_e64 v0, v0, s0
|
; GFX8-NEXT: v_lshlrev_b16_e64 v1, v1, s0
|
||||||
; GFX8-NEXT: v_lshrrev_b16_e64 v1, v1, s1
|
; GFX8-NEXT: v_lshrrev_b16_e64 v0, v0, s1
|
||||||
; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
|
; GFX8-NEXT: v_or_b32_e32 v0, v1, v0
|
||||||
; GFX8-NEXT: ; return to shader part epilog
|
; GFX8-NEXT: ; return to shader part epilog
|
||||||
;
|
;
|
||||||
; GFX9-LABEL: v_fshr_i16_ssv:
|
; GFX9-LABEL: v_fshr_i16_ssv:
|
||||||
; GFX9: ; %bb.0:
|
; GFX9: ; %bb.0:
|
||||||
; GFX9-NEXT: v_and_b32_e32 v1, 15, v0
|
|
||||||
; GFX9-NEXT: v_xor_b32_e32 v0, -1, v0
|
|
||||||
; GFX9-NEXT: s_bfe_u32 s2, 1, 0x100000
|
; GFX9-NEXT: s_bfe_u32 s2, 1, 0x100000
|
||||||
; GFX9-NEXT: v_and_b32_e32 v0, 15, v0
|
; GFX9-NEXT: v_xor_b32_e32 v1, -1, v0
|
||||||
; GFX9-NEXT: s_lshl_b32 s0, s0, s2
|
; GFX9-NEXT: s_lshl_b32 s0, s0, s2
|
||||||
; GFX9-NEXT: v_lshlrev_b16_e64 v0, v0, s0
|
; GFX9-NEXT: v_lshlrev_b16_e64 v1, v1, s0
|
||||||
; GFX9-NEXT: v_lshrrev_b16_e64 v1, v1, s1
|
; GFX9-NEXT: v_lshrrev_b16_e64 v0, v0, s1
|
||||||
; GFX9-NEXT: v_or_b32_e32 v0, v0, v1
|
; GFX9-NEXT: v_or_b32_e32 v0, v1, v0
|
||||||
; GFX9-NEXT: ; return to shader part epilog
|
; GFX9-NEXT: ; return to shader part epilog
|
||||||
;
|
;
|
||||||
; GFX10-LABEL: v_fshr_i16_ssv:
|
; GFX10-LABEL: v_fshr_i16_ssv:
|
||||||
; GFX10: ; %bb.0:
|
; GFX10: ; %bb.0:
|
||||||
; GFX10-NEXT: v_xor_b32_e32 v1, -1, v0
|
; GFX10-NEXT: v_xor_b32_e32 v1, -1, v0
|
||||||
; GFX10-NEXT: v_and_b32_e32 v0, 15, v0
|
|
||||||
; GFX10-NEXT: s_bfe_u32 s2, 1, 0x100000
|
; GFX10-NEXT: s_bfe_u32 s2, 1, 0x100000
|
||||||
; GFX10-NEXT: s_lshl_b32 s0, s0, s2
|
|
||||||
; GFX10-NEXT: v_and_b32_e32 v1, 15, v1
|
|
||||||
; GFX10-NEXT: v_lshrrev_b16 v0, v0, s1
|
; GFX10-NEXT: v_lshrrev_b16 v0, v0, s1
|
||||||
|
; GFX10-NEXT: s_lshl_b32 s0, s0, s2
|
||||||
; GFX10-NEXT: v_lshlrev_b16 v1, v1, s0
|
; GFX10-NEXT: v_lshlrev_b16 v1, v1, s0
|
||||||
; GFX10-NEXT: v_or_b32_e32 v0, v1, v0
|
; GFX10-NEXT: v_or_b32_e32 v0, v1, v0
|
||||||
; GFX10-NEXT: ; return to shader part epilog
|
; GFX10-NEXT: ; return to shader part epilog
|
||||||
|
@ -3214,24 +3202,20 @@ define <2 x i16> @v_fshr_v2i16(<2 x i16> %lhs, <2 x i16> %rhs, <2 x i16> %amt) {
|
||||||
; GFX8-NEXT: v_mov_b32_e32 v5, 15
|
; GFX8-NEXT: v_mov_b32_e32 v5, 15
|
||||||
; GFX8-NEXT: v_lshlrev_b16_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
|
; GFX8-NEXT: v_lshlrev_b16_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
|
||||||
; GFX8-NEXT: v_lshrrev_b16_sdwa v5, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
|
; GFX8-NEXT: v_lshrrev_b16_sdwa v5, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
|
||||||
; GFX8-NEXT: v_xor_b32_e32 v2, -1, v2
|
|
||||||
; GFX8-NEXT: v_or_b32_e32 v0, v0, v5
|
; GFX8-NEXT: v_or_b32_e32 v0, v0, v5
|
||||||
; GFX8-NEXT: v_lshlrev_b16_e32 v5, 1, v1
|
; GFX8-NEXT: v_lshlrev_b16_e32 v5, 1, v1
|
||||||
|
; GFX8-NEXT: v_xor_b32_e32 v2, -1, v2
|
||||||
; GFX8-NEXT: v_lshlrev_b16_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
|
; GFX8-NEXT: v_lshlrev_b16_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
|
||||||
; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v2
|
; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v2
|
||||||
; GFX8-NEXT: v_and_b32_e32 v6, 15, v2
|
; GFX8-NEXT: v_xor_b32_e32 v6, -1, v2
|
||||||
; GFX8-NEXT: v_xor_b32_e32 v2, -1, v2
|
; GFX8-NEXT: v_lshlrev_b16_e32 v2, v2, v3
|
||||||
; GFX8-NEXT: v_and_b32_e32 v2, 15, v2
|
; GFX8-NEXT: v_lshrrev_b16_e32 v3, 1, v5
|
||||||
; GFX8-NEXT: v_lshrrev_b16_e32 v5, 1, v5
|
; GFX8-NEXT: v_lshrrev_b16_e32 v3, v6, v3
|
||||||
; GFX8-NEXT: v_lshlrev_b16_e32 v3, v6, v3
|
; GFX8-NEXT: v_or_b32_e32 v2, v2, v3
|
||||||
; GFX8-NEXT: v_lshrrev_b16_e32 v2, v2, v5
|
; GFX8-NEXT: v_xor_b32_e32 v3, -1, v4
|
||||||
; GFX8-NEXT: v_or_b32_e32 v2, v3, v2
|
|
||||||
; GFX8-NEXT: v_and_b32_e32 v3, 15, v4
|
|
||||||
; GFX8-NEXT: v_xor_b32_e32 v4, -1, v4
|
|
||||||
; GFX8-NEXT: v_and_b32_e32 v4, 15, v4
|
|
||||||
; GFX8-NEXT: v_lshrrev_b16_e32 v1, 1, v1
|
; GFX8-NEXT: v_lshrrev_b16_e32 v1, 1, v1
|
||||||
; GFX8-NEXT: v_lshlrev_b16_e32 v0, v3, v0
|
; GFX8-NEXT: v_lshlrev_b16_e32 v0, v4, v0
|
||||||
; GFX8-NEXT: v_lshrrev_b16_e32 v1, v4, v1
|
; GFX8-NEXT: v_lshrrev_b16_e32 v1, v3, v1
|
||||||
; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
|
; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
|
||||||
; GFX8-NEXT: v_mov_b32_e32 v1, 16
|
; GFX8-NEXT: v_mov_b32_e32 v1, 16
|
||||||
; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||||
|
@ -3377,31 +3361,27 @@ define amdgpu_ps float @v_fshr_v2i16_ssv(<2 x i16> inreg %lhs, <2 x i16> inreg %
|
||||||
; GFX8-NEXT: s_lshr_b32 s2, s0, 16
|
; GFX8-NEXT: s_lshr_b32 s2, s0, 16
|
||||||
; GFX8-NEXT: s_lshl_b32 s0, s0, s4
|
; GFX8-NEXT: s_lshl_b32 s0, s0, s4
|
||||||
; GFX8-NEXT: s_lshr_b32 s5, s5, s6
|
; GFX8-NEXT: s_lshr_b32 s5, s5, s6
|
||||||
; GFX8-NEXT: v_xor_b32_e32 v0, -1, v0
|
|
||||||
; GFX8-NEXT: s_lshr_b32 s3, s1, 16
|
; GFX8-NEXT: s_lshr_b32 s3, s1, 16
|
||||||
; GFX8-NEXT: s_or_b32 s0, s0, s5
|
; GFX8-NEXT: s_or_b32 s0, s0, s5
|
||||||
; GFX8-NEXT: s_lshl_b32 s1, s1, s4
|
; GFX8-NEXT: s_lshl_b32 s1, s1, s4
|
||||||
; GFX8-NEXT: v_and_b32_e32 v2, 15, v0
|
|
||||||
; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v0
|
|
||||||
; GFX8-NEXT: v_xor_b32_e32 v0, -1, v0
|
; GFX8-NEXT: v_xor_b32_e32 v0, -1, v0
|
||||||
; GFX8-NEXT: v_lshlrev_b16_e64 v2, v2, s0
|
; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v0
|
||||||
|
; GFX8-NEXT: v_xor_b32_e32 v2, -1, v0
|
||||||
|
; GFX8-NEXT: v_lshlrev_b16_e64 v0, v0, s0
|
||||||
; GFX8-NEXT: s_bfe_u32 s0, s1, 0x100000
|
; GFX8-NEXT: s_bfe_u32 s0, s1, 0x100000
|
||||||
; GFX8-NEXT: v_and_b32_e32 v0, 15, v0
|
|
||||||
; GFX8-NEXT: s_lshr_b32 s0, s0, s4
|
|
||||||
; GFX8-NEXT: s_lshr_b32 s5, s3, s6
|
; GFX8-NEXT: s_lshr_b32 s5, s3, s6
|
||||||
; GFX8-NEXT: s_lshl_b32 s3, s3, s4
|
; GFX8-NEXT: s_lshl_b32 s3, s3, s4
|
||||||
; GFX8-NEXT: v_lshrrev_b16_e64 v0, v0, s0
|
; GFX8-NEXT: s_lshr_b32 s0, s0, s4
|
||||||
; GFX8-NEXT: s_lshl_b32 s2, s2, s4
|
; GFX8-NEXT: s_lshl_b32 s2, s2, s4
|
||||||
; GFX8-NEXT: v_or_b32_e32 v0, v2, v0
|
; GFX8-NEXT: v_lshrrev_b16_e64 v2, v2, s0
|
||||||
; GFX8-NEXT: v_and_b32_e32 v2, 15, v1
|
|
||||||
; GFX8-NEXT: v_xor_b32_e32 v1, -1, v1
|
|
||||||
; GFX8-NEXT: s_bfe_u32 s0, s3, 0x100000
|
; GFX8-NEXT: s_bfe_u32 s0, s3, 0x100000
|
||||||
; GFX8-NEXT: s_or_b32 s2, s2, s5
|
; GFX8-NEXT: s_or_b32 s2, s2, s5
|
||||||
; GFX8-NEXT: v_and_b32_e32 v1, 15, v1
|
; GFX8-NEXT: v_or_b32_e32 v0, v0, v2
|
||||||
|
; GFX8-NEXT: v_xor_b32_e32 v2, -1, v1
|
||||||
; GFX8-NEXT: s_lshr_b32 s0, s0, s4
|
; GFX8-NEXT: s_lshr_b32 s0, s0, s4
|
||||||
; GFX8-NEXT: v_lshlrev_b16_e64 v2, v2, s2
|
; GFX8-NEXT: v_lshlrev_b16_e64 v1, v1, s2
|
||||||
; GFX8-NEXT: v_lshrrev_b16_e64 v1, v1, s0
|
; GFX8-NEXT: v_lshrrev_b16_e64 v2, v2, s0
|
||||||
; GFX8-NEXT: v_or_b32_e32 v1, v2, v1
|
; GFX8-NEXT: v_or_b32_e32 v1, v1, v2
|
||||||
; GFX8-NEXT: v_mov_b32_e32 v2, 16
|
; GFX8-NEXT: v_mov_b32_e32 v2, 16
|
||||||
; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||||
; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||||
|
@ -4025,24 +4005,20 @@ define <4 x half> @v_fshr_v4i16(<4 x i16> %lhs, <4 x i16> %rhs, <4 x i16> %amt)
|
||||||
; GFX8-NEXT: v_mov_b32_e32 v8, 15
|
; GFX8-NEXT: v_mov_b32_e32 v8, 15
|
||||||
; GFX8-NEXT: v_lshlrev_b16_sdwa v0, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
|
; GFX8-NEXT: v_lshlrev_b16_sdwa v0, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
|
||||||
; GFX8-NEXT: v_lshrrev_b16_sdwa v9, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
|
; GFX8-NEXT: v_lshrrev_b16_sdwa v9, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
|
||||||
; GFX8-NEXT: v_xor_b32_e32 v4, -1, v4
|
|
||||||
; GFX8-NEXT: v_or_b32_e32 v0, v0, v9
|
; GFX8-NEXT: v_or_b32_e32 v0, v0, v9
|
||||||
; GFX8-NEXT: v_lshlrev_b16_e32 v9, 1, v2
|
; GFX8-NEXT: v_lshlrev_b16_e32 v9, 1, v2
|
||||||
|
; GFX8-NEXT: v_xor_b32_e32 v4, -1, v4
|
||||||
; GFX8-NEXT: v_lshlrev_b16_sdwa v2, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
|
; GFX8-NEXT: v_lshlrev_b16_sdwa v2, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
|
||||||
; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v4
|
; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v4
|
||||||
; GFX8-NEXT: v_and_b32_e32 v10, 15, v4
|
; GFX8-NEXT: v_xor_b32_e32 v10, -1, v4
|
||||||
; GFX8-NEXT: v_xor_b32_e32 v4, -1, v4
|
; GFX8-NEXT: v_lshlrev_b16_e32 v4, v4, v6
|
||||||
; GFX8-NEXT: v_and_b32_e32 v4, 15, v4
|
; GFX8-NEXT: v_lshrrev_b16_e32 v6, 1, v9
|
||||||
; GFX8-NEXT: v_lshrrev_b16_e32 v9, 1, v9
|
; GFX8-NEXT: v_lshrrev_b16_e32 v6, v10, v6
|
||||||
; GFX8-NEXT: v_lshlrev_b16_e32 v6, v10, v6
|
; GFX8-NEXT: v_or_b32_e32 v4, v4, v6
|
||||||
; GFX8-NEXT: v_lshrrev_b16_e32 v4, v4, v9
|
; GFX8-NEXT: v_xor_b32_e32 v6, -1, v7
|
||||||
; GFX8-NEXT: v_or_b32_e32 v4, v6, v4
|
|
||||||
; GFX8-NEXT: v_and_b32_e32 v6, 15, v7
|
|
||||||
; GFX8-NEXT: v_xor_b32_e32 v7, -1, v7
|
|
||||||
; GFX8-NEXT: v_and_b32_e32 v7, 15, v7
|
|
||||||
; GFX8-NEXT: v_lshrrev_b16_e32 v2, 1, v2
|
; GFX8-NEXT: v_lshrrev_b16_e32 v2, 1, v2
|
||||||
; GFX8-NEXT: v_lshlrev_b16_e32 v0, v6, v0
|
; GFX8-NEXT: v_lshlrev_b16_e32 v0, v7, v0
|
||||||
; GFX8-NEXT: v_lshrrev_b16_e32 v2, v7, v2
|
; GFX8-NEXT: v_lshrrev_b16_e32 v2, v6, v2
|
||||||
; GFX8-NEXT: v_or_b32_e32 v0, v0, v2
|
; GFX8-NEXT: v_or_b32_e32 v0, v0, v2
|
||||||
; GFX8-NEXT: v_mov_b32_e32 v2, 16
|
; GFX8-NEXT: v_mov_b32_e32 v2, 16
|
||||||
; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||||
|
@ -4053,24 +4029,20 @@ define <4 x half> @v_fshr_v4i16(<4 x i16> %lhs, <4 x i16> %rhs, <4 x i16> %amt)
|
||||||
; GFX8-NEXT: v_mov_b32_e32 v6, 1
|
; GFX8-NEXT: v_mov_b32_e32 v6, 1
|
||||||
; GFX8-NEXT: v_lshlrev_b16_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
|
; GFX8-NEXT: v_lshlrev_b16_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
|
||||||
; GFX8-NEXT: v_lshrrev_b16_sdwa v7, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
|
; GFX8-NEXT: v_lshrrev_b16_sdwa v7, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
|
||||||
; GFX8-NEXT: v_xor_b32_e32 v5, -1, v5
|
|
||||||
; GFX8-NEXT: v_or_b32_e32 v1, v1, v7
|
; GFX8-NEXT: v_or_b32_e32 v1, v1, v7
|
||||||
; GFX8-NEXT: v_lshlrev_b16_e32 v7, 1, v3
|
; GFX8-NEXT: v_lshlrev_b16_e32 v7, 1, v3
|
||||||
|
; GFX8-NEXT: v_xor_b32_e32 v5, -1, v5
|
||||||
; GFX8-NEXT: v_lshlrev_b16_sdwa v3, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
|
; GFX8-NEXT: v_lshlrev_b16_sdwa v3, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
|
||||||
; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v5
|
; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v5
|
||||||
; GFX8-NEXT: v_and_b32_e32 v8, 15, v5
|
; GFX8-NEXT: v_xor_b32_e32 v8, -1, v5
|
||||||
; GFX8-NEXT: v_xor_b32_e32 v5, -1, v5
|
; GFX8-NEXT: v_lshlrev_b16_e32 v4, v5, v4
|
||||||
; GFX8-NEXT: v_and_b32_e32 v5, 15, v5
|
; GFX8-NEXT: v_lshrrev_b16_e32 v5, 1, v7
|
||||||
; GFX8-NEXT: v_lshrrev_b16_e32 v7, 1, v7
|
; GFX8-NEXT: v_lshrrev_b16_e32 v5, v8, v5
|
||||||
; GFX8-NEXT: v_lshlrev_b16_e32 v4, v8, v4
|
|
||||||
; GFX8-NEXT: v_lshrrev_b16_e32 v5, v5, v7
|
|
||||||
; GFX8-NEXT: v_or_b32_e32 v4, v4, v5
|
; GFX8-NEXT: v_or_b32_e32 v4, v4, v5
|
||||||
; GFX8-NEXT: v_and_b32_e32 v5, 15, v6
|
; GFX8-NEXT: v_xor_b32_e32 v5, -1, v6
|
||||||
; GFX8-NEXT: v_xor_b32_e32 v6, -1, v6
|
|
||||||
; GFX8-NEXT: v_and_b32_e32 v6, 15, v6
|
|
||||||
; GFX8-NEXT: v_lshrrev_b16_e32 v3, 1, v3
|
; GFX8-NEXT: v_lshrrev_b16_e32 v3, 1, v3
|
||||||
; GFX8-NEXT: v_lshlrev_b16_e32 v1, v5, v1
|
; GFX8-NEXT: v_lshlrev_b16_e32 v1, v6, v1
|
||||||
; GFX8-NEXT: v_lshrrev_b16_e32 v3, v6, v3
|
; GFX8-NEXT: v_lshrrev_b16_e32 v3, v5, v3
|
||||||
; GFX8-NEXT: v_or_b32_e32 v1, v1, v3
|
; GFX8-NEXT: v_or_b32_e32 v1, v1, v3
|
||||||
; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||||
; GFX8-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
; GFX8-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||||
|
|
|
@ -0,0 +1,197 @@
|
||||||
|
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
|
||||||
|
; RUN: llc -march=amdgcn -mcpu=gfx900 < %s | FileCheck %s
|
||||||
|
|
||||||
|
; 16-bit constrained shifts: shl, lshr and ashr of %a all use the same shift
; amount (%b & 15), and the three results are summed so every shift stays live.
; The expected gfx900 assembly passes v1 straight to the three 16-bit shift
; instructions; no separate v_and_b32 of the shift amount appears.
define i16 @csh_16(i16 %a, i16 %b) {
|
||||||
|
; CHECK-LABEL: csh_16:
|
||||||
|
; CHECK: ; %bb.0:
|
||||||
|
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||||
|
; CHECK-NEXT: v_lshlrev_b16_e32 v2, v1, v0
|
||||||
|
; CHECK-NEXT: v_lshrrev_b16_e32 v3, v1, v0
|
||||||
|
; CHECK-NEXT: v_ashrrev_i16_e32 v0, v1, v0
|
||||||
|
; CHECK-NEXT: v_add_u16_e32 v1, v2, v3
|
||||||
|
; CHECK-NEXT: v_add_u16_e32 v0, v1, v0
|
||||||
|
; CHECK-NEXT: s_setpc_b64 s[30:31]
|
||||||
|
%and = and i16 %b, 15
|
||||||
|
%shl = shl i16 %a, %and
|
||||||
|
%lshr = lshr i16 %a, %and
|
||||||
|
%ashr = ashr i16 %a, %and
|
||||||
|
%ret.0 = add i16 %shl, %lshr
|
||||||
|
%ret = add i16 %ret.0, %ashr
|
||||||
|
ret i16 %ret
|
||||||
|
}
|
||||||
|
|
||||||
|
; 32-bit constrained shifts: shl, lshr and ashr of %a by (%b & 31), with the
; three results added together.
; The expected assembly uses v1 directly as the shift amount for all three
; shifts (no v_and_b32) and combines the results with a single v_add3_u32.
define i32 @csh_32(i32 %a, i32 %b) {
|
||||||
|
; CHECK-LABEL: csh_32:
|
||||||
|
; CHECK: ; %bb.0:
|
||||||
|
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||||
|
; CHECK-NEXT: v_lshlrev_b32_e32 v2, v1, v0
|
||||||
|
; CHECK-NEXT: v_lshrrev_b32_e32 v3, v1, v0
|
||||||
|
; CHECK-NEXT: v_ashrrev_i32_e32 v0, v1, v0
|
||||||
|
; CHECK-NEXT: v_add3_u32 v0, v2, v3, v0
|
||||||
|
; CHECK-NEXT: s_setpc_b64 s[30:31]
|
||||||
|
%and = and i32 %b, 31
|
||||||
|
%shl = shl i32 %a, %and
|
||||||
|
%lshr = lshr i32 %a, %and
|
||||||
|
%ashr = ashr i32 %a, %and
|
||||||
|
%ret.0 = add i32 %shl, %lshr
|
||||||
|
%ret = add i32 %ret.0, %ashr
|
||||||
|
ret i32 %ret
|
||||||
|
}
|
||||||
|
|
||||||
|
; Same 32-bit shift pattern as the (%b & 31) case above, but declared as an
; amdgpu_ps function with both arguments marked inreg.
; The expected assembly uses the scalar shifts s_lshl_b32 / s_lshr_b32 /
; s_ashr_i32 with s1 as the shift amount; no s_and_b32 of s1 appears.
define amdgpu_ps i32 @s_csh_32(i32 inreg %a, i32 inreg %b) {
|
||||||
|
; CHECK-LABEL: s_csh_32:
|
||||||
|
; CHECK: ; %bb.0:
|
||||||
|
; CHECK-NEXT: s_lshl_b32 s2, s0, s1
|
||||||
|
; CHECK-NEXT: s_lshr_b32 s3, s0, s1
|
||||||
|
; CHECK-NEXT: s_ashr_i32 s0, s0, s1
|
||||||
|
; CHECK-NEXT: s_add_i32 s1, s2, s3
|
||||||
|
; CHECK-NEXT: s_add_i32 s0, s1, s0
|
||||||
|
; CHECK-NEXT: ; return to shader part epilog
|
||||||
|
%and = and i32 %b, 31
|
||||||
|
%shl = shl i32 %a, %and
|
||||||
|
%lshr = lshr i32 %a, %and
|
||||||
|
%ashr = ashr i32 %a, %and
|
||||||
|
%ret.0 = add i32 %shl, %lshr
|
||||||
|
%ret = add i32 %ret.0, %ashr
|
||||||
|
ret i32 %ret
|
||||||
|
}
|
||||||
|
|
||||||
|
; Vector form of the 32-bit constrained shifts: every lane of %b is masked
; with 31 and used as the shift amount for shl, lshr and ashr of %a; the three
; vector results are added.
; The expected assembly contains one 32-bit shift of each kind per lane, taking
; v4..v7 directly as shift amounts, followed by one v_add3_u32 per lane; no
; v_and_b32 appears.
define <4 x i32> @csh_v4i32(<4 x i32> %a, <4 x i32> %b) {
|
||||||
|
; CHECK-LABEL: csh_v4i32:
|
||||||
|
; CHECK: ; %bb.0:
|
||||||
|
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||||
|
; CHECK-NEXT: v_lshlrev_b32_e32 v8, v7, v3
|
||||||
|
; CHECK-NEXT: v_lshlrev_b32_e32 v9, v6, v2
|
||||||
|
; CHECK-NEXT: v_lshlrev_b32_e32 v10, v5, v1
|
||||||
|
; CHECK-NEXT: v_lshlrev_b32_e32 v11, v4, v0
|
||||||
|
; CHECK-NEXT: v_lshrrev_b32_e32 v12, v7, v3
|
||||||
|
; CHECK-NEXT: v_lshrrev_b32_e32 v13, v6, v2
|
||||||
|
; CHECK-NEXT: v_lshrrev_b32_e32 v14, v5, v1
|
||||||
|
; CHECK-NEXT: v_lshrrev_b32_e32 v15, v4, v0
|
||||||
|
; CHECK-NEXT: v_ashrrev_i32_e32 v3, v7, v3
|
||||||
|
; CHECK-NEXT: v_ashrrev_i32_e32 v2, v6, v2
|
||||||
|
; CHECK-NEXT: v_ashrrev_i32_e32 v1, v5, v1
|
||||||
|
; CHECK-NEXT: v_ashrrev_i32_e32 v0, v4, v0
|
||||||
|
; CHECK-NEXT: v_add3_u32 v0, v11, v15, v0
|
||||||
|
; CHECK-NEXT: v_add3_u32 v1, v10, v14, v1
|
||||||
|
; CHECK-NEXT: v_add3_u32 v2, v9, v13, v2
|
||||||
|
; CHECK-NEXT: v_add3_u32 v3, v8, v12, v3
|
||||||
|
; CHECK-NEXT: s_setpc_b64 s[30:31]
|
||||||
|
%and = and <4 x i32> %b, <i32 31, i32 31, i32 31, i32 31>
|
||||||
|
%shl = shl <4 x i32> %a, %and
|
||||||
|
%lshr = lshr <4 x i32> %a, %and
|
||||||
|
%ashr = ashr <4 x i32> %a, %and
|
||||||
|
%ret.0 = add <4 x i32> %shl, %lshr
|
||||||
|
%ret = add <4 x i32> %ret.0, %ashr
|
||||||
|
ret <4 x i32> %ret
|
||||||
|
}
|
||||||
|
|
||||||
|
; amdgpu_ps / inreg variant of the <4 x i32> constrained-shift test: each lane
; of %b is masked with 31 and used to shl, lshr and ashr the matching lane of
; %a, and the three vector results are added.
; The expected assembly is entirely scalar (s_lshl_b32 / s_lshr_b32 /
; s_ashr_i32 / s_add_i32) and takes s4..s7 directly as shift amounts; no
; s_and_b32 appears.
define amdgpu_ps <4 x i32> @s_csh_v4i32(<4 x i32> inreg %a, <4 x i32> inreg %b) {
|
||||||
|
; CHECK-LABEL: s_csh_v4i32:
|
||||||
|
; CHECK: ; %bb.0:
|
||||||
|
; CHECK-NEXT: s_lshl_b32 s8, s0, s4
|
||||||
|
; CHECK-NEXT: s_lshl_b32 s9, s1, s5
|
||||||
|
; CHECK-NEXT: s_lshl_b32 s10, s2, s6
|
||||||
|
; CHECK-NEXT: s_lshl_b32 s11, s3, s7
|
||||||
|
; CHECK-NEXT: s_lshr_b32 s12, s0, s4
|
||||||
|
; CHECK-NEXT: s_lshr_b32 s13, s1, s5
|
||||||
|
; CHECK-NEXT: s_lshr_b32 s14, s2, s6
|
||||||
|
; CHECK-NEXT: s_lshr_b32 s15, s3, s7
|
||||||
|
; CHECK-NEXT: s_ashr_i32 s3, s3, s7
|
||||||
|
; CHECK-NEXT: s_ashr_i32 s2, s2, s6
|
||||||
|
; CHECK-NEXT: s_ashr_i32 s1, s1, s5
|
||||||
|
; CHECK-NEXT: s_ashr_i32 s0, s0, s4
|
||||||
|
; CHECK-NEXT: s_add_i32 s4, s11, s15
|
||||||
|
; CHECK-NEXT: s_add_i32 s5, s10, s14
|
||||||
|
; CHECK-NEXT: s_add_i32 s6, s9, s13
|
||||||
|
; CHECK-NEXT: s_add_i32 s7, s8, s12
|
||||||
|
; CHECK-NEXT: s_add_i32 s0, s7, s0
|
||||||
|
; CHECK-NEXT: s_add_i32 s1, s6, s1
|
||||||
|
; CHECK-NEXT: s_add_i32 s2, s5, s2
|
||||||
|
; CHECK-NEXT: s_add_i32 s3, s4, s3
|
||||||
|
; CHECK-NEXT: ; return to shader part epilog
|
||||||
|
%and = and <4 x i32> %b, <i32 31, i32 31, i32 31, i32 31>
|
||||||
|
%shl = shl <4 x i32> %a, %and
|
||||||
|
%lshr = lshr <4 x i32> %a, %and
|
||||||
|
%ashr = ashr <4 x i32> %a, %and
|
||||||
|
%ret.0 = add <4 x i32> %shl, %lshr
|
||||||
|
%ret = add <4 x i32> %ret.0, %ashr
|
||||||
|
ret <4 x i32> %ret
|
||||||
|
}
|
||||||
|
|
||||||
|
; 64-bit constrained shifts: shl, lshr and ashr of %a by (%b & 63), with the
; three results added together.
; The expected assembly issues the three 64-bit shifts with v2 as the shift
; amount and no preceding v_and_b32; the two i64 additions each appear as an
; add / add-with-carry pair.
; NOTE(review): v2 is presumably the low dword of %b -- confirm against the
; calling convention.
define i64 @csh_64(i64 %a, i64 %b) {
|
||||||
|
; CHECK-LABEL: csh_64:
|
||||||
|
; CHECK: ; %bb.0:
|
||||||
|
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||||
|
; CHECK-NEXT: v_lshlrev_b64 v[3:4], v2, v[0:1]
|
||||||
|
; CHECK-NEXT: v_lshrrev_b64 v[5:6], v2, v[0:1]
|
||||||
|
; CHECK-NEXT: v_ashrrev_i64 v[0:1], v2, v[0:1]
|
||||||
|
; CHECK-NEXT: v_add_co_u32_e32 v2, vcc, v3, v5
|
||||||
|
; CHECK-NEXT: v_addc_co_u32_e32 v3, vcc, v4, v6, vcc
|
||||||
|
; CHECK-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0
|
||||||
|
; CHECK-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v1, vcc
|
||||||
|
; CHECK-NEXT: s_setpc_b64 s[30:31]
|
||||||
|
%and = and i64 %b, 63
|
||||||
|
%shl = shl i64 %a, %and
|
||||||
|
%lshr = lshr i64 %a, %and
|
||||||
|
%ashr = ashr i64 %a, %and
|
||||||
|
%ret.0 = add i64 %shl, %lshr
|
||||||
|
%ret = add i64 %ret.0, %ashr
|
||||||
|
ret i64 %ret
|
||||||
|
}
|
||||||
|
|
||||||
|
; amdgpu_ps / inreg variant of the 64-bit constrained-shift test: shl, lshr and
; ashr of %a by (%b & 63), results summed.
; The expected assembly uses the scalar 64-bit shifts s_lshl_b64 / s_lshr_b64 /
; s_ashr_i64 with s2 as the shift amount and no s_and of that register; the two
; i64 additions each appear as an s_add_u32 / s_addc_u32 pair.
define amdgpu_ps i64 @s_csh_64(i64 inreg %a, i64 inreg %b) {
|
||||||
|
; CHECK-LABEL: s_csh_64:
|
||||||
|
; CHECK: ; %bb.0:
|
||||||
|
; CHECK-NEXT: s_lshl_b64 s[4:5], s[0:1], s2
|
||||||
|
; CHECK-NEXT: s_lshr_b64 s[6:7], s[0:1], s2
|
||||||
|
; CHECK-NEXT: s_ashr_i64 s[0:1], s[0:1], s2
|
||||||
|
; CHECK-NEXT: s_add_u32 s2, s4, s6
|
||||||
|
; CHECK-NEXT: s_addc_u32 s3, s5, s7
|
||||||
|
; CHECK-NEXT: s_add_u32 s0, s2, s0
|
||||||
|
; CHECK-NEXT: s_addc_u32 s1, s3, s1
|
||||||
|
; CHECK-NEXT: ; return to shader part epilog
|
||||||
|
%and = and i64 %b, 63
|
||||||
|
%shl = shl i64 %a, %and
|
||||||
|
%lshr = lshr i64 %a, %and
|
||||||
|
%ashr = ashr i64 %a, %and
|
||||||
|
%ret.0 = add i64 %shl, %lshr
|
||||||
|
%ret = add i64 %ret.0, %ashr
|
||||||
|
ret i64 %ret
|
||||||
|
}
|
||||||
|
|
||||||
|
; Constrained shift-left feeding an OR: (%a << (%b & 31)) | %a.
; The expected assembly is a single fused v_lshl_or_b32 that takes v1 directly
; as the shift amount, with no separate AND, shift or OR instruction.
define i32 @cshl_or(i32 %a, i32 %b) {
|
||||||
|
; CHECK-LABEL: cshl_or:
|
||||||
|
; CHECK: ; %bb.0:
|
||||||
|
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||||
|
; CHECK-NEXT: v_lshl_or_b32 v0, v0, v1, v0
|
||||||
|
; CHECK-NEXT: s_setpc_b64 s[30:31]
|
||||||
|
%and = and i32 %b, 31
|
||||||
|
%shl = shl i32 %a, %and
|
||||||
|
%or = or i32 %shl, %a
|
||||||
|
ret i32 %or
|
||||||
|
}
|
||||||
|
|
||||||
|
; Constrained shift-left feeding an ADD: (%a << (%b & 31)) + %c.
; The expected assembly is a single fused v_lshl_add_u32 that takes v1 directly
; as the shift amount, with no separate AND, shift or ADD instruction.
define i32 @cshl_add(i32 %a, i32 %b, i32 %c) {
|
||||||
|
; CHECK-LABEL: cshl_add:
|
||||||
|
; CHECK: ; %bb.0:
|
||||||
|
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||||
|
; CHECK-NEXT: v_lshl_add_u32 v0, v0, v1, v2
|
||||||
|
; CHECK-NEXT: s_setpc_b64 s[30:31]
|
||||||
|
%and = and i32 %b, 31
|
||||||
|
%shl = shl i32 %a, %and
|
||||||
|
%add = add i32 %shl, %c
|
||||||
|
ret i32 %add
|
||||||
|
}
|
||||||
|
|
||||||
|
; ADD feeding a constrained shift-left: (%a + %b) << (%b & 31).
; The expected assembly is a single fused v_add_lshl_u32 whose addend and shift
; amount are both v1, with no separate AND, ADD or shift instruction.
define i32 @add_cshl(i32 %a, i32 %b) {
|
||||||
|
; CHECK-LABEL: add_cshl:
|
||||||
|
; CHECK: ; %bb.0:
|
||||||
|
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||||
|
; CHECK-NEXT: v_add_lshl_u32 v0, v0, v1, v1
|
||||||
|
; CHECK-NEXT: s_setpc_b64 s[30:31]
|
||||||
|
%add = add i32 %a, %b
|
||||||
|
|
||||||
|
%and = and i32 %b, 31
|
||||||
|
%shl = shl i32 %add, %and
|
||||||
|
ret i32 %shl
|
||||||
|
}
|
|
@ -639,10 +639,8 @@ define i16 @v_fshr_i16(i16 %src0, i16 %src1, i16 %src2) {
|
||||||
; VI-LABEL: v_fshr_i16:
|
; VI-LABEL: v_fshr_i16:
|
||||||
; VI: ; %bb.0:
|
; VI: ; %bb.0:
|
||||||
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||||
; VI-NEXT: v_xor_b32_e32 v3, -1, v2
|
|
||||||
; VI-NEXT: v_lshlrev_b16_e32 v0, 1, v0
|
; VI-NEXT: v_lshlrev_b16_e32 v0, 1, v0
|
||||||
; VI-NEXT: v_and_b32_e32 v3, 15, v3
|
; VI-NEXT: v_xor_b32_e32 v3, -1, v2
|
||||||
; VI-NEXT: v_and_b32_e32 v2, 15, v2
|
|
||||||
; VI-NEXT: v_lshlrev_b16_e32 v0, v3, v0
|
; VI-NEXT: v_lshlrev_b16_e32 v0, v3, v0
|
||||||
; VI-NEXT: v_lshrrev_b16_e32 v1, v2, v1
|
; VI-NEXT: v_lshrrev_b16_e32 v1, v2, v1
|
||||||
; VI-NEXT: v_or_b32_e32 v0, v0, v1
|
; VI-NEXT: v_or_b32_e32 v0, v0, v1
|
||||||
|
@ -651,10 +649,8 @@ define i16 @v_fshr_i16(i16 %src0, i16 %src1, i16 %src2) {
|
||||||
; GFX9-LABEL: v_fshr_i16:
|
; GFX9-LABEL: v_fshr_i16:
|
||||||
; GFX9: ; %bb.0:
|
; GFX9: ; %bb.0:
|
||||||
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||||
; GFX9-NEXT: v_xor_b32_e32 v3, -1, v2
|
|
||||||
; GFX9-NEXT: v_lshlrev_b16_e32 v0, 1, v0
|
; GFX9-NEXT: v_lshlrev_b16_e32 v0, 1, v0
|
||||||
; GFX9-NEXT: v_and_b32_e32 v3, 15, v3
|
; GFX9-NEXT: v_xor_b32_e32 v3, -1, v2
|
||||||
; GFX9-NEXT: v_and_b32_e32 v2, 15, v2
|
|
||||||
; GFX9-NEXT: v_lshlrev_b16_e32 v0, v3, v0
|
; GFX9-NEXT: v_lshlrev_b16_e32 v0, v3, v0
|
||||||
; GFX9-NEXT: v_lshrrev_b16_e32 v1, v2, v1
|
; GFX9-NEXT: v_lshrrev_b16_e32 v1, v2, v1
|
||||||
; GFX9-NEXT: v_or_b32_e32 v0, v0, v1
|
; GFX9-NEXT: v_or_b32_e32 v0, v0, v1
|
||||||
|
@ -669,10 +665,8 @@ define i16 @v_fshr_i16(i16 %src0, i16 %src1, i16 %src2) {
|
||||||
; GFX10: ; %bb.0:
|
; GFX10: ; %bb.0:
|
||||||
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||||
; GFX10-NEXT: v_xor_b32_e32 v3, -1, v2
|
|
||||||
; GFX10-NEXT: v_lshlrev_b16 v0, 1, v0
|
; GFX10-NEXT: v_lshlrev_b16 v0, 1, v0
|
||||||
; GFX10-NEXT: v_and_b32_e32 v2, 15, v2
|
; GFX10-NEXT: v_xor_b32_e32 v3, -1, v2
|
||||||
; GFX10-NEXT: v_and_b32_e32 v3, 15, v3
|
|
||||||
; GFX10-NEXT: v_lshrrev_b16 v1, v2, v1
|
; GFX10-NEXT: v_lshrrev_b16 v1, v2, v1
|
||||||
; GFX10-NEXT: v_lshlrev_b16 v0, v3, v0
|
; GFX10-NEXT: v_lshlrev_b16 v0, v3, v0
|
||||||
; GFX10-NEXT: v_or_b32_e32 v0, v0, v1
|
; GFX10-NEXT: v_or_b32_e32 v0, v0, v1
|
||||||
|
@ -701,18 +695,14 @@ define <2 x i16> @v_fshr_v2i16(<2 x i16> %src0, <2 x i16> %src1, <2 x i16> %src2
|
||||||
; VI: ; %bb.0:
|
; VI: ; %bb.0:
|
||||||
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||||
; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v2
|
; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v2
|
||||||
; VI-NEXT: v_and_b32_e32 v4, 15, v3
|
|
||||||
; VI-NEXT: v_mov_b32_e32 v5, 1
|
; VI-NEXT: v_mov_b32_e32 v5, 1
|
||||||
; VI-NEXT: v_xor_b32_e32 v3, -1, v3
|
; VI-NEXT: v_lshrrev_b16_sdwa v4, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
|
||||||
; VI-NEXT: v_lshlrev_b16_sdwa v5, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
|
; VI-NEXT: v_lshlrev_b16_sdwa v5, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
|
||||||
; VI-NEXT: v_and_b32_e32 v3, 15, v3
|
; VI-NEXT: v_xor_b32_e32 v3, -1, v3
|
||||||
; VI-NEXT: v_lshrrev_b16_sdwa v4, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
|
|
||||||
; VI-NEXT: v_lshlrev_b16_e32 v3, v3, v5
|
; VI-NEXT: v_lshlrev_b16_e32 v3, v3, v5
|
||||||
; VI-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
; VI-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
||||||
; VI-NEXT: v_xor_b32_e32 v4, -1, v2
|
|
||||||
; VI-NEXT: v_lshlrev_b16_e32 v0, 1, v0
|
; VI-NEXT: v_lshlrev_b16_e32 v0, 1, v0
|
||||||
; VI-NEXT: v_and_b32_e32 v4, 15, v4
|
; VI-NEXT: v_xor_b32_e32 v4, -1, v2
|
||||||
; VI-NEXT: v_and_b32_e32 v2, 15, v2
|
|
||||||
; VI-NEXT: v_lshlrev_b16_e32 v0, v4, v0
|
; VI-NEXT: v_lshlrev_b16_e32 v0, v4, v0
|
||||||
; VI-NEXT: v_lshrrev_b16_e32 v1, v2, v1
|
; VI-NEXT: v_lshrrev_b16_e32 v1, v2, v1
|
||||||
; VI-NEXT: v_or_b32_e32 v0, v0, v1
|
; VI-NEXT: v_or_b32_e32 v0, v0, v1
|
||||||
|
@ -779,27 +769,21 @@ define <3 x i16> @v_fshr_v3i16(<3 x i16> %src0, <3 x i16> %src1, <3 x i16> %src2
|
||||||
; VI: ; %bb.0:
|
; VI: ; %bb.0:
|
||||||
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||||
; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v4
|
; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v4
|
||||||
; VI-NEXT: v_and_b32_e32 v7, 15, v6
|
|
||||||
; VI-NEXT: v_mov_b32_e32 v8, 1
|
; VI-NEXT: v_mov_b32_e32 v8, 1
|
||||||
; VI-NEXT: v_xor_b32_e32 v6, -1, v6
|
; VI-NEXT: v_lshrrev_b16_sdwa v7, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
|
||||||
; VI-NEXT: v_lshlrev_b16_sdwa v8, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
|
; VI-NEXT: v_lshlrev_b16_sdwa v8, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
|
||||||
; VI-NEXT: v_and_b32_e32 v6, 15, v6
|
; VI-NEXT: v_xor_b32_e32 v6, -1, v6
|
||||||
; VI-NEXT: v_lshrrev_b16_sdwa v7, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
|
|
||||||
; VI-NEXT: v_lshlrev_b16_e32 v6, v6, v8
|
; VI-NEXT: v_lshlrev_b16_e32 v6, v6, v8
|
||||||
; VI-NEXT: v_or_b32_sdwa v6, v6, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
; VI-NEXT: v_or_b32_sdwa v6, v6, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
||||||
; VI-NEXT: v_xor_b32_e32 v7, -1, v5
|
|
||||||
; VI-NEXT: v_lshlrev_b16_e32 v1, 1, v1
|
; VI-NEXT: v_lshlrev_b16_e32 v1, 1, v1
|
||||||
; VI-NEXT: v_and_b32_e32 v7, 15, v7
|
; VI-NEXT: v_xor_b32_e32 v7, -1, v5
|
||||||
; VI-NEXT: v_and_b32_e32 v5, 15, v5
|
|
||||||
; VI-NEXT: v_lshlrev_b16_e32 v1, v7, v1
|
; VI-NEXT: v_lshlrev_b16_e32 v1, v7, v1
|
||||||
; VI-NEXT: v_lshrrev_b16_e32 v3, v5, v3
|
; VI-NEXT: v_lshrrev_b16_e32 v3, v5, v3
|
||||||
; VI-NEXT: v_or_b32_e32 v1, v1, v3
|
; VI-NEXT: v_or_b32_e32 v1, v1, v3
|
||||||
; VI-NEXT: v_xor_b32_e32 v3, -1, v4
|
|
||||||
; VI-NEXT: v_lshlrev_b16_e32 v0, 1, v0
|
; VI-NEXT: v_lshlrev_b16_e32 v0, 1, v0
|
||||||
; VI-NEXT: v_and_b32_e32 v3, 15, v3
|
; VI-NEXT: v_xor_b32_e32 v3, -1, v4
|
||||||
; VI-NEXT: v_lshlrev_b16_e32 v0, v3, v0
|
; VI-NEXT: v_lshlrev_b16_e32 v0, v3, v0
|
||||||
; VI-NEXT: v_and_b32_e32 v3, 15, v4
|
; VI-NEXT: v_lshrrev_b16_e32 v2, v4, v2
|
||||||
; VI-NEXT: v_lshrrev_b16_e32 v2, v3, v2
|
|
||||||
; VI-NEXT: v_or_b32_e32 v0, v0, v2
|
; VI-NEXT: v_or_b32_e32 v0, v0, v2
|
||||||
; VI-NEXT: v_or_b32_sdwa v0, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
; VI-NEXT: v_or_b32_sdwa v0, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||||
; VI-NEXT: s_setpc_b64 s[30:31]
|
; VI-NEXT: s_setpc_b64 s[30:31]
|
||||||
|
@ -808,27 +792,21 @@ define <3 x i16> @v_fshr_v3i16(<3 x i16> %src0, <3 x i16> %src1, <3 x i16> %src2
|
||||||
; GFX9: ; %bb.0:
|
; GFX9: ; %bb.0:
|
||||||
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||||
; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v4
|
; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v4
|
||||||
; GFX9-NEXT: v_and_b32_e32 v7, 15, v6
|
|
||||||
; GFX9-NEXT: v_mov_b32_e32 v8, 1
|
; GFX9-NEXT: v_mov_b32_e32 v8, 1
|
||||||
; GFX9-NEXT: v_xor_b32_e32 v6, -1, v6
|
; GFX9-NEXT: v_lshrrev_b16_sdwa v7, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
|
||||||
; GFX9-NEXT: v_lshlrev_b16_sdwa v8, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
|
; GFX9-NEXT: v_lshlrev_b16_sdwa v8, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
|
||||||
; GFX9-NEXT: v_and_b32_e32 v6, 15, v6
|
; GFX9-NEXT: v_xor_b32_e32 v6, -1, v6
|
||||||
; GFX9-NEXT: v_lshrrev_b16_sdwa v7, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
|
|
||||||
; GFX9-NEXT: v_lshlrev_b16_e32 v6, v6, v8
|
; GFX9-NEXT: v_lshlrev_b16_e32 v6, v6, v8
|
||||||
; GFX9-NEXT: v_or_b32_e32 v6, v6, v7
|
; GFX9-NEXT: v_or_b32_e32 v6, v6, v7
|
||||||
; GFX9-NEXT: v_xor_b32_e32 v7, -1, v5
|
|
||||||
; GFX9-NEXT: v_lshlrev_b16_e32 v1, 1, v1
|
; GFX9-NEXT: v_lshlrev_b16_e32 v1, 1, v1
|
||||||
; GFX9-NEXT: v_and_b32_e32 v7, 15, v7
|
; GFX9-NEXT: v_xor_b32_e32 v7, -1, v5
|
||||||
; GFX9-NEXT: v_and_b32_e32 v5, 15, v5
|
|
||||||
; GFX9-NEXT: v_lshlrev_b16_e32 v1, v7, v1
|
; GFX9-NEXT: v_lshlrev_b16_e32 v1, v7, v1
|
||||||
; GFX9-NEXT: v_lshrrev_b16_e32 v3, v5, v3
|
; GFX9-NEXT: v_lshrrev_b16_e32 v3, v5, v3
|
||||||
; GFX9-NEXT: v_or_b32_e32 v1, v1, v3
|
; GFX9-NEXT: v_or_b32_e32 v1, v1, v3
|
||||||
; GFX9-NEXT: v_xor_b32_e32 v3, -1, v4
|
|
||||||
; GFX9-NEXT: v_lshlrev_b16_e32 v0, 1, v0
|
; GFX9-NEXT: v_lshlrev_b16_e32 v0, 1, v0
|
||||||
; GFX9-NEXT: v_and_b32_e32 v3, 15, v3
|
; GFX9-NEXT: v_xor_b32_e32 v3, -1, v4
|
||||||
; GFX9-NEXT: v_lshlrev_b16_e32 v0, v3, v0
|
; GFX9-NEXT: v_lshlrev_b16_e32 v0, v3, v0
|
||||||
; GFX9-NEXT: v_and_b32_e32 v3, 15, v4
|
; GFX9-NEXT: v_lshrrev_b16_e32 v2, v4, v2
|
||||||
; GFX9-NEXT: v_lshrrev_b16_e32 v2, v3, v2
|
|
||||||
; GFX9-NEXT: v_or_b32_e32 v0, v0, v2
|
; GFX9-NEXT: v_or_b32_e32 v0, v0, v2
|
||||||
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||||
; GFX9-NEXT: v_lshl_or_b32 v0, v6, 16, v0
|
; GFX9-NEXT: v_lshl_or_b32 v0, v6, 16, v0
|
||||||
|
@ -844,31 +822,25 @@ define <3 x i16> @v_fshr_v3i16(<3 x i16> %src0, <3 x i16> %src1, <3 x i16> %src2
|
||||||
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||||
; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v4
|
; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v4
|
||||||
; GFX10-NEXT: v_xor_b32_e32 v8, -1, v4
|
; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v0
|
||||||
; GFX10-NEXT: v_lshrrev_b32_e32 v10, 16, v0
|
|
||||||
; GFX10-NEXT: v_lshlrev_b16 v0, 1, v0
|
; GFX10-NEXT: v_lshlrev_b16 v0, 1, v0
|
||||||
; GFX10-NEXT: v_and_b32_e32 v4, 15, v4
|
; GFX10-NEXT: v_xor_b32_e32 v8, -1, v4
|
||||||
; GFX10-NEXT: v_and_b32_e32 v9, 15, v6
|
; GFX10-NEXT: v_lshrrev_b32_e32 v9, 16, v2
|
||||||
; GFX10-NEXT: v_xor_b32_e32 v6, -1, v6
|
; GFX10-NEXT: v_xor_b32_e32 v10, -1, v6
|
||||||
; GFX10-NEXT: v_and_b32_e32 v8, 15, v8
|
; GFX10-NEXT: v_lshlrev_b16 v7, 1, v7
|
||||||
; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v2
|
|
||||||
; GFX10-NEXT: v_lshlrev_b16 v10, 1, v10
|
|
||||||
; GFX10-NEXT: v_xor_b32_e32 v11, -1, v5
|
|
||||||
; GFX10-NEXT: v_and_b32_e32 v6, 15, v6
|
|
||||||
; GFX10-NEXT: v_lshlrev_b16 v0, v8, v0
|
|
||||||
; GFX10-NEXT: v_lshrrev_b16 v2, v4, v2
|
; GFX10-NEXT: v_lshrrev_b16 v2, v4, v2
|
||||||
; GFX10-NEXT: v_lshrrev_b16 v4, v9, v7
|
; GFX10-NEXT: v_lshlrev_b16 v0, v8, v0
|
||||||
|
; GFX10-NEXT: v_lshrrev_b16 v4, v6, v9
|
||||||
; GFX10-NEXT: v_lshlrev_b16 v1, 1, v1
|
; GFX10-NEXT: v_lshlrev_b16 v1, 1, v1
|
||||||
; GFX10-NEXT: v_lshlrev_b16 v6, v6, v10
|
; GFX10-NEXT: v_lshlrev_b16 v6, v10, v7
|
||||||
; GFX10-NEXT: v_and_b32_e32 v7, 15, v11
|
; GFX10-NEXT: v_lshrrev_b16 v3, v5, v3
|
||||||
; GFX10-NEXT: v_or_b32_e32 v0, v0, v2
|
; GFX10-NEXT: v_or_b32_e32 v0, v0, v2
|
||||||
; GFX10-NEXT: v_and_b32_e32 v2, 15, v5
|
; GFX10-NEXT: v_xor_b32_e32 v2, -1, v5
|
||||||
; GFX10-NEXT: v_or_b32_e32 v4, v6, v4
|
; GFX10-NEXT: v_or_b32_e32 v4, v6, v4
|
||||||
; GFX10-NEXT: v_lshlrev_b16 v1, v7, v1
|
|
||||||
; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||||
; GFX10-NEXT: v_lshrrev_b16 v2, v2, v3
|
; GFX10-NEXT: v_lshlrev_b16 v1, v2, v1
|
||||||
; GFX10-NEXT: v_lshl_or_b32 v0, v4, 16, v0
|
; GFX10-NEXT: v_lshl_or_b32 v0, v4, 16, v0
|
||||||
; GFX10-NEXT: v_or_b32_e32 v1, v1, v2
|
; GFX10-NEXT: v_or_b32_e32 v1, v1, v3
|
||||||
; GFX10-NEXT: s_setpc_b64 s[30:31]
|
; GFX10-NEXT: s_setpc_b64 s[30:31]
|
||||||
%ret = call <3 x i16> @llvm.fshr.v3i16(<3 x i16> %src0, <3 x i16> %src1, <3 x i16> %src2)
|
%ret = call <3 x i16> @llvm.fshr.v3i16(<3 x i16> %src0, <3 x i16> %src1, <3 x i16> %src2)
|
||||||
ret <3 x i16> %ret
|
ret <3 x i16> %ret
|
||||||
|
@ -905,34 +877,26 @@ define <4 x i16> @v_fshr_v4i16(<4 x i16> %src0, <4 x i16> %src1, <4 x i16> %src2
|
||||||
; VI: ; %bb.0:
|
; VI: ; %bb.0:
|
||||||
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||||
; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v5
|
; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v5
|
||||||
; VI-NEXT: v_and_b32_e32 v7, 15, v6
|
|
||||||
; VI-NEXT: v_mov_b32_e32 v8, 1
|
; VI-NEXT: v_mov_b32_e32 v8, 1
|
||||||
; VI-NEXT: v_xor_b32_e32 v6, -1, v6
|
; VI-NEXT: v_lshrrev_b16_sdwa v7, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
|
||||||
; VI-NEXT: v_lshlrev_b16_sdwa v9, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
|
; VI-NEXT: v_lshlrev_b16_sdwa v9, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
|
||||||
; VI-NEXT: v_and_b32_e32 v6, 15, v6
|
; VI-NEXT: v_xor_b32_e32 v6, -1, v6
|
||||||
; VI-NEXT: v_lshrrev_b16_sdwa v7, v7, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
|
|
||||||
; VI-NEXT: v_lshlrev_b16_e32 v6, v6, v9
|
; VI-NEXT: v_lshlrev_b16_e32 v6, v6, v9
|
||||||
; VI-NEXT: v_or_b32_sdwa v6, v6, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
; VI-NEXT: v_or_b32_sdwa v6, v6, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
||||||
; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v4
|
; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v4
|
||||||
; VI-NEXT: v_and_b32_e32 v9, 15, v7
|
; VI-NEXT: v_lshrrev_b16_sdwa v9, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
|
||||||
; VI-NEXT: v_xor_b32_e32 v7, -1, v7
|
|
||||||
; VI-NEXT: v_lshlrev_b16_sdwa v8, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
|
; VI-NEXT: v_lshlrev_b16_sdwa v8, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
|
||||||
; VI-NEXT: v_and_b32_e32 v7, 15, v7
|
; VI-NEXT: v_xor_b32_e32 v7, -1, v7
|
||||||
; VI-NEXT: v_lshlrev_b16_e32 v7, v7, v8
|
; VI-NEXT: v_lshlrev_b16_e32 v7, v7, v8
|
||||||
; VI-NEXT: v_xor_b32_e32 v8, -1, v5
|
|
||||||
; VI-NEXT: v_lshlrev_b16_e32 v1, 1, v1
|
; VI-NEXT: v_lshlrev_b16_e32 v1, 1, v1
|
||||||
; VI-NEXT: v_and_b32_e32 v8, 15, v8
|
; VI-NEXT: v_xor_b32_e32 v8, -1, v5
|
||||||
; VI-NEXT: v_and_b32_e32 v5, 15, v5
|
|
||||||
; VI-NEXT: v_lshlrev_b16_e32 v1, v8, v1
|
; VI-NEXT: v_lshlrev_b16_e32 v1, v8, v1
|
||||||
; VI-NEXT: v_lshrrev_b16_e32 v3, v5, v3
|
; VI-NEXT: v_lshrrev_b16_e32 v3, v5, v3
|
||||||
; VI-NEXT: v_or_b32_e32 v1, v1, v3
|
; VI-NEXT: v_or_b32_e32 v1, v1, v3
|
||||||
; VI-NEXT: v_xor_b32_e32 v3, -1, v4
|
|
||||||
; VI-NEXT: v_lshlrev_b16_e32 v0, 1, v0
|
; VI-NEXT: v_lshlrev_b16_e32 v0, 1, v0
|
||||||
; VI-NEXT: v_and_b32_e32 v3, 15, v3
|
; VI-NEXT: v_xor_b32_e32 v3, -1, v4
|
||||||
; VI-NEXT: v_lshlrev_b16_e32 v0, v3, v0
|
; VI-NEXT: v_lshlrev_b16_e32 v0, v3, v0
|
||||||
; VI-NEXT: v_and_b32_e32 v3, 15, v4
|
; VI-NEXT: v_lshrrev_b16_e32 v2, v4, v2
|
||||||
; VI-NEXT: v_lshrrev_b16_sdwa v9, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
|
|
||||||
; VI-NEXT: v_lshrrev_b16_e32 v2, v3, v2
|
|
||||||
; VI-NEXT: v_or_b32_sdwa v7, v7, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
; VI-NEXT: v_or_b32_sdwa v7, v7, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
||||||
; VI-NEXT: v_or_b32_e32 v0, v0, v2
|
; VI-NEXT: v_or_b32_e32 v0, v0, v2
|
||||||
; VI-NEXT: v_or_b32_sdwa v0, v0, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
; VI-NEXT: v_or_b32_sdwa v0, v0, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||||
|
@ -943,34 +907,26 @@ define <4 x i16> @v_fshr_v4i16(<4 x i16> %src0, <4 x i16> %src1, <4 x i16> %src2
|
||||||
; GFX9: ; %bb.0:
|
; GFX9: ; %bb.0:
|
||||||
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||||
; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v5
|
; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v5
|
||||||
; GFX9-NEXT: v_and_b32_e32 v7, 15, v6
|
|
||||||
; GFX9-NEXT: v_mov_b32_e32 v8, 1
|
; GFX9-NEXT: v_mov_b32_e32 v8, 1
|
||||||
; GFX9-NEXT: v_xor_b32_e32 v6, -1, v6
|
; GFX9-NEXT: v_lshrrev_b16_sdwa v7, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
|
||||||
; GFX9-NEXT: v_lshlrev_b16_sdwa v9, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
|
; GFX9-NEXT: v_lshlrev_b16_sdwa v9, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
|
||||||
; GFX9-NEXT: v_and_b32_e32 v6, 15, v6
|
; GFX9-NEXT: v_xor_b32_e32 v6, -1, v6
|
||||||
; GFX9-NEXT: v_lshrrev_b16_sdwa v7, v7, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
|
|
||||||
; GFX9-NEXT: v_lshlrev_b16_e32 v6, v6, v9
|
; GFX9-NEXT: v_lshlrev_b16_e32 v6, v6, v9
|
||||||
; GFX9-NEXT: v_or_b32_e32 v6, v6, v7
|
; GFX9-NEXT: v_or_b32_e32 v6, v6, v7
|
||||||
; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v4
|
; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v4
|
||||||
; GFX9-NEXT: v_and_b32_e32 v9, 15, v7
|
; GFX9-NEXT: v_lshrrev_b16_sdwa v9, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
|
||||||
; GFX9-NEXT: v_xor_b32_e32 v7, -1, v7
|
|
||||||
; GFX9-NEXT: v_lshlrev_b16_sdwa v8, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
|
; GFX9-NEXT: v_lshlrev_b16_sdwa v8, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
|
||||||
; GFX9-NEXT: v_and_b32_e32 v7, 15, v7
|
; GFX9-NEXT: v_xor_b32_e32 v7, -1, v7
|
||||||
; GFX9-NEXT: v_lshlrev_b16_e32 v7, v7, v8
|
; GFX9-NEXT: v_lshlrev_b16_e32 v7, v7, v8
|
||||||
; GFX9-NEXT: v_xor_b32_e32 v8, -1, v5
|
|
||||||
; GFX9-NEXT: v_lshlrev_b16_e32 v1, 1, v1
|
; GFX9-NEXT: v_lshlrev_b16_e32 v1, 1, v1
|
||||||
; GFX9-NEXT: v_and_b32_e32 v8, 15, v8
|
; GFX9-NEXT: v_xor_b32_e32 v8, -1, v5
|
||||||
; GFX9-NEXT: v_and_b32_e32 v5, 15, v5
|
|
||||||
; GFX9-NEXT: v_lshlrev_b16_e32 v1, v8, v1
|
; GFX9-NEXT: v_lshlrev_b16_e32 v1, v8, v1
|
||||||
; GFX9-NEXT: v_lshrrev_b16_e32 v3, v5, v3
|
; GFX9-NEXT: v_lshrrev_b16_e32 v3, v5, v3
|
||||||
; GFX9-NEXT: v_or_b32_e32 v1, v1, v3
|
; GFX9-NEXT: v_or_b32_e32 v1, v1, v3
|
||||||
; GFX9-NEXT: v_xor_b32_e32 v3, -1, v4
|
|
||||||
; GFX9-NEXT: v_lshlrev_b16_e32 v0, 1, v0
|
; GFX9-NEXT: v_lshlrev_b16_e32 v0, 1, v0
|
||||||
; GFX9-NEXT: v_and_b32_e32 v3, 15, v3
|
; GFX9-NEXT: v_xor_b32_e32 v3, -1, v4
|
||||||
; GFX9-NEXT: v_lshlrev_b16_e32 v0, v3, v0
|
; GFX9-NEXT: v_lshlrev_b16_e32 v0, v3, v0
|
||||||
; GFX9-NEXT: v_and_b32_e32 v3, 15, v4
|
; GFX9-NEXT: v_lshrrev_b16_e32 v2, v4, v2
|
||||||
; GFX9-NEXT: v_lshrrev_b16_sdwa v9, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
|
|
||||||
; GFX9-NEXT: v_lshrrev_b16_e32 v2, v3, v2
|
|
||||||
; GFX9-NEXT: v_or_b32_e32 v0, v0, v2
|
; GFX9-NEXT: v_or_b32_e32 v0, v0, v2
|
||||||
; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff
|
; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff
|
||||||
; GFX9-NEXT: v_or_b32_e32 v7, v7, v9
|
; GFX9-NEXT: v_or_b32_e32 v7, v7, v9
|
||||||
|
@ -989,40 +945,32 @@ define <4 x i16> @v_fshr_v4i16(<4 x i16> %src0, <4 x i16> %src1, <4 x i16> %src2
|
||||||
; GFX10: ; %bb.0:
|
; GFX10: ; %bb.0:
|
||||||
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||||
; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v5
|
; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v1
|
||||||
; GFX10-NEXT: v_lshrrev_b32_e32 v8, 16, v1
|
; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v5
|
||||||
; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v3
|
; GFX10-NEXT: v_lshrrev_b32_e32 v8, 16, v3
|
||||||
; GFX10-NEXT: v_lshrrev_b32_e32 v10, 16, v4
|
; GFX10-NEXT: v_lshrrev_b32_e32 v11, 16, v4
|
||||||
; GFX10-NEXT: v_lshrrev_b32_e32 v11, 16, v0
|
|
||||||
; GFX10-NEXT: v_xor_b32_e32 v9, -1, v6
|
|
||||||
; GFX10-NEXT: v_and_b32_e32 v6, 15, v6
|
|
||||||
; GFX10-NEXT: v_lshlrev_b16 v8, 1, v8
|
|
||||||
; GFX10-NEXT: v_and_b32_e32 v13, 15, v10
|
|
||||||
; GFX10-NEXT: v_lshlrev_b16 v1, 1, v1
|
; GFX10-NEXT: v_lshlrev_b16 v1, 1, v1
|
||||||
; GFX10-NEXT: v_and_b32_e32 v9, 15, v9
|
; GFX10-NEXT: v_lshlrev_b16 v6, 1, v6
|
||||||
; GFX10-NEXT: v_lshrrev_b16 v6, v6, v7
|
; GFX10-NEXT: v_xor_b32_e32 v9, -1, v7
|
||||||
|
; GFX10-NEXT: v_lshrrev_b16 v7, v7, v8
|
||||||
|
; GFX10-NEXT: v_lshrrev_b32_e32 v8, 16, v0
|
||||||
; GFX10-NEXT: v_lshlrev_b16 v0, 1, v0
|
; GFX10-NEXT: v_lshlrev_b16 v0, 1, v0
|
||||||
; GFX10-NEXT: v_lshrrev_b32_e32 v12, 16, v2
|
; GFX10-NEXT: v_xor_b32_e32 v12, -1, v5
|
||||||
; GFX10-NEXT: v_lshlrev_b16 v11, 1, v11
|
; GFX10-NEXT: v_lshlrev_b16 v6, v9, v6
|
||||||
; GFX10-NEXT: v_lshlrev_b16 v7, v9, v8
|
; GFX10-NEXT: v_xor_b32_e32 v9, -1, v4
|
||||||
; GFX10-NEXT: v_xor_b32_e32 v8, -1, v4
|
; GFX10-NEXT: v_lshrrev_b32_e32 v10, 16, v2
|
||||||
; GFX10-NEXT: v_xor_b32_e32 v9, -1, v10
|
; GFX10-NEXT: v_lshlrev_b16 v8, 1, v8
|
||||||
; GFX10-NEXT: v_xor_b32_e32 v10, -1, v5
|
; GFX10-NEXT: v_xor_b32_e32 v13, -1, v11
|
||||||
; GFX10-NEXT: v_and_b32_e32 v4, 15, v4
|
|
||||||
; GFX10-NEXT: v_and_b32_e32 v5, 15, v5
|
|
||||||
; GFX10-NEXT: v_and_b32_e32 v8, 15, v8
|
|
||||||
; GFX10-NEXT: v_and_b32_e32 v9, 15, v9
|
|
||||||
; GFX10-NEXT: v_and_b32_e32 v10, 15, v10
|
|
||||||
; GFX10-NEXT: v_lshrrev_b16 v2, v4, v2
|
; GFX10-NEXT: v_lshrrev_b16 v2, v4, v2
|
||||||
|
; GFX10-NEXT: v_lshlrev_b16 v0, v9, v0
|
||||||
|
; GFX10-NEXT: v_lshlrev_b16 v1, v12, v1
|
||||||
; GFX10-NEXT: v_lshrrev_b16 v3, v5, v3
|
; GFX10-NEXT: v_lshrrev_b16 v3, v5, v3
|
||||||
; GFX10-NEXT: v_lshlrev_b16 v0, v8, v0
|
; GFX10-NEXT: v_lshrrev_b16 v4, v11, v10
|
||||||
; GFX10-NEXT: v_lshrrev_b16 v4, v13, v12
|
; GFX10-NEXT: v_lshlrev_b16 v5, v13, v8
|
||||||
; GFX10-NEXT: v_lshlrev_b16 v1, v10, v1
|
|
||||||
; GFX10-NEXT: v_lshlrev_b16 v5, v9, v11
|
|
||||||
; GFX10-NEXT: v_or_b32_e32 v0, v0, v2
|
; GFX10-NEXT: v_or_b32_e32 v0, v0, v2
|
||||||
; GFX10-NEXT: v_mov_b32_e32 v2, 0xffff
|
; GFX10-NEXT: v_mov_b32_e32 v2, 0xffff
|
||||||
; GFX10-NEXT: v_or_b32_e32 v1, v1, v3
|
; GFX10-NEXT: v_or_b32_e32 v1, v1, v3
|
||||||
; GFX10-NEXT: v_or_b32_e32 v3, v7, v6
|
; GFX10-NEXT: v_or_b32_e32 v3, v6, v7
|
||||||
; GFX10-NEXT: v_or_b32_e32 v4, v5, v4
|
; GFX10-NEXT: v_or_b32_e32 v4, v5, v4
|
||||||
; GFX10-NEXT: v_and_b32_e32 v0, v2, v0
|
; GFX10-NEXT: v_and_b32_e32 v0, v2, v0
|
||||||
; GFX10-NEXT: v_and_b32_e32 v1, v2, v1
|
; GFX10-NEXT: v_and_b32_e32 v1, v2, v1
|
||||||
|
@ -1037,11 +985,9 @@ define i64 @v_fshr_i64(i64 %src0, i64 %src1, i64 %src2) {
|
||||||
; SI-LABEL: v_fshr_i64:
|
; SI-LABEL: v_fshr_i64:
|
||||||
; SI: ; %bb.0:
|
; SI: ; %bb.0:
|
||||||
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||||
; SI-NEXT: v_and_b32_e32 v5, 63, v4
|
|
||||||
; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], 1
|
; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], 1
|
||||||
|
; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], v4
|
||||||
; SI-NEXT: v_not_b32_e32 v4, v4
|
; SI-NEXT: v_not_b32_e32 v4, v4
|
||||||
; SI-NEXT: v_and_b32_e32 v4, 63, v4
|
|
||||||
; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], v5
|
|
||||||
; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], v4
|
; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], v4
|
||||||
; SI-NEXT: v_or_b32_e32 v1, v1, v3
|
; SI-NEXT: v_or_b32_e32 v1, v1, v3
|
||||||
; SI-NEXT: v_or_b32_e32 v0, v0, v2
|
; SI-NEXT: v_or_b32_e32 v0, v0, v2
|
||||||
|
@ -1050,11 +996,9 @@ define i64 @v_fshr_i64(i64 %src0, i64 %src1, i64 %src2) {
|
||||||
; VI-LABEL: v_fshr_i64:
|
; VI-LABEL: v_fshr_i64:
|
||||||
; VI: ; %bb.0:
|
; VI: ; %bb.0:
|
||||||
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||||
; VI-NEXT: v_and_b32_e32 v5, 63, v4
|
|
||||||
; VI-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1]
|
; VI-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1]
|
||||||
|
; VI-NEXT: v_lshrrev_b64 v[2:3], v4, v[2:3]
|
||||||
; VI-NEXT: v_not_b32_e32 v4, v4
|
; VI-NEXT: v_not_b32_e32 v4, v4
|
||||||
; VI-NEXT: v_and_b32_e32 v4, 63, v4
|
|
||||||
; VI-NEXT: v_lshrrev_b64 v[2:3], v5, v[2:3]
|
|
||||||
; VI-NEXT: v_lshlrev_b64 v[0:1], v4, v[0:1]
|
; VI-NEXT: v_lshlrev_b64 v[0:1], v4, v[0:1]
|
||||||
; VI-NEXT: v_or_b32_e32 v1, v1, v3
|
; VI-NEXT: v_or_b32_e32 v1, v1, v3
|
||||||
; VI-NEXT: v_or_b32_e32 v0, v0, v2
|
; VI-NEXT: v_or_b32_e32 v0, v0, v2
|
||||||
|
@ -1063,11 +1007,9 @@ define i64 @v_fshr_i64(i64 %src0, i64 %src1, i64 %src2) {
|
||||||
; GFX9-LABEL: v_fshr_i64:
|
; GFX9-LABEL: v_fshr_i64:
|
||||||
; GFX9: ; %bb.0:
|
; GFX9: ; %bb.0:
|
||||||
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||||
; GFX9-NEXT: v_and_b32_e32 v5, 63, v4
|
|
||||||
; GFX9-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1]
|
; GFX9-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1]
|
||||||
|
; GFX9-NEXT: v_lshrrev_b64 v[2:3], v4, v[2:3]
|
||||||
; GFX9-NEXT: v_not_b32_e32 v4, v4
|
; GFX9-NEXT: v_not_b32_e32 v4, v4
|
||||||
; GFX9-NEXT: v_and_b32_e32 v4, 63, v4
|
|
||||||
; GFX9-NEXT: v_lshrrev_b64 v[2:3], v5, v[2:3]
|
|
||||||
; GFX9-NEXT: v_lshlrev_b64 v[0:1], v4, v[0:1]
|
; GFX9-NEXT: v_lshlrev_b64 v[0:1], v4, v[0:1]
|
||||||
; GFX9-NEXT: v_or_b32_e32 v1, v1, v3
|
; GFX9-NEXT: v_or_b32_e32 v1, v1, v3
|
||||||
; GFX9-NEXT: v_or_b32_e32 v0, v0, v2
|
; GFX9-NEXT: v_or_b32_e32 v0, v0, v2
|
||||||
|
@ -1082,10 +1024,8 @@ define i64 @v_fshr_i64(i64 %src0, i64 %src1, i64 %src2) {
|
||||||
; GFX10: ; %bb.0:
|
; GFX10: ; %bb.0:
|
||||||
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||||
; GFX10-NEXT: v_not_b32_e32 v5, v4
|
|
||||||
; GFX10-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1]
|
; GFX10-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1]
|
||||||
; GFX10-NEXT: v_and_b32_e32 v4, 63, v4
|
; GFX10-NEXT: v_not_b32_e32 v5, v4
|
||||||
; GFX10-NEXT: v_and_b32_e32 v5, 63, v5
|
|
||||||
; GFX10-NEXT: v_lshrrev_b64 v[2:3], v4, v[2:3]
|
; GFX10-NEXT: v_lshrrev_b64 v[2:3], v4, v[2:3]
|
||||||
; GFX10-NEXT: v_lshlrev_b64 v[0:1], v5, v[0:1]
|
; GFX10-NEXT: v_lshlrev_b64 v[0:1], v5, v[0:1]
|
||||||
; GFX10-NEXT: v_or_b32_e32 v0, v0, v2
|
; GFX10-NEXT: v_or_b32_e32 v0, v0, v2
|
||||||
|
@ -1099,18 +1039,14 @@ define <2 x i64> @v_fshr_v2i64(<2 x i64> %src0, <2 x i64> %src1, <2 x i64> %src2
|
||||||
; SI-LABEL: v_fshr_v2i64:
|
; SI-LABEL: v_fshr_v2i64:
|
||||||
; SI: ; %bb.0:
|
; SI: ; %bb.0:
|
||||||
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||||
; SI-NEXT: v_and_b32_e32 v9, 63, v8
|
|
||||||
; SI-NEXT: v_not_b32_e32 v8, v8
|
|
||||||
; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], 1
|
; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], 1
|
||||||
; SI-NEXT: v_and_b32_e32 v8, 63, v8
|
; SI-NEXT: v_lshr_b64 v[4:5], v[4:5], v8
|
||||||
; SI-NEXT: v_lshr_b64 v[4:5], v[4:5], v9
|
; SI-NEXT: v_not_b32_e32 v8, v8
|
||||||
; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], v8
|
; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], v8
|
||||||
; SI-NEXT: v_lshl_b64 v[2:3], v[2:3], 1
|
; SI-NEXT: v_lshl_b64 v[2:3], v[2:3], 1
|
||||||
; SI-NEXT: v_or_b32_e32 v1, v1, v5
|
; SI-NEXT: v_or_b32_e32 v1, v1, v5
|
||||||
; SI-NEXT: v_and_b32_e32 v5, 63, v10
|
; SI-NEXT: v_lshr_b64 v[5:6], v[6:7], v10
|
||||||
; SI-NEXT: v_lshr_b64 v[5:6], v[6:7], v5
|
|
||||||
; SI-NEXT: v_not_b32_e32 v7, v10
|
; SI-NEXT: v_not_b32_e32 v7, v10
|
||||||
; SI-NEXT: v_and_b32_e32 v7, 63, v7
|
|
||||||
; SI-NEXT: v_lshl_b64 v[2:3], v[2:3], v7
|
; SI-NEXT: v_lshl_b64 v[2:3], v[2:3], v7
|
||||||
; SI-NEXT: v_or_b32_e32 v0, v0, v4
|
; SI-NEXT: v_or_b32_e32 v0, v0, v4
|
||||||
; SI-NEXT: v_or_b32_e32 v3, v3, v6
|
; SI-NEXT: v_or_b32_e32 v3, v3, v6
|
||||||
|
@ -1120,18 +1056,14 @@ define <2 x i64> @v_fshr_v2i64(<2 x i64> %src0, <2 x i64> %src1, <2 x i64> %src2
|
||||||
; VI-LABEL: v_fshr_v2i64:
|
; VI-LABEL: v_fshr_v2i64:
|
||||||
; VI: ; %bb.0:
|
; VI: ; %bb.0:
|
||||||
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||||
; VI-NEXT: v_and_b32_e32 v9, 63, v8
|
|
||||||
; VI-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1]
|
; VI-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1]
|
||||||
|
; VI-NEXT: v_lshrrev_b64 v[4:5], v8, v[4:5]
|
||||||
; VI-NEXT: v_not_b32_e32 v8, v8
|
; VI-NEXT: v_not_b32_e32 v8, v8
|
||||||
; VI-NEXT: v_and_b32_e32 v8, 63, v8
|
|
||||||
; VI-NEXT: v_lshrrev_b64 v[4:5], v9, v[4:5]
|
|
||||||
; VI-NEXT: v_lshlrev_b64 v[0:1], v8, v[0:1]
|
; VI-NEXT: v_lshlrev_b64 v[0:1], v8, v[0:1]
|
||||||
; VI-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3]
|
; VI-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3]
|
||||||
; VI-NEXT: v_or_b32_e32 v1, v1, v5
|
; VI-NEXT: v_or_b32_e32 v1, v1, v5
|
||||||
; VI-NEXT: v_and_b32_e32 v5, 63, v10
|
; VI-NEXT: v_lshrrev_b64 v[5:6], v10, v[6:7]
|
||||||
; VI-NEXT: v_lshrrev_b64 v[5:6], v5, v[6:7]
|
|
||||||
; VI-NEXT: v_not_b32_e32 v7, v10
|
; VI-NEXT: v_not_b32_e32 v7, v10
|
||||||
; VI-NEXT: v_and_b32_e32 v7, 63, v7
|
|
||||||
; VI-NEXT: v_lshlrev_b64 v[2:3], v7, v[2:3]
|
; VI-NEXT: v_lshlrev_b64 v[2:3], v7, v[2:3]
|
||||||
; VI-NEXT: v_or_b32_e32 v0, v0, v4
|
; VI-NEXT: v_or_b32_e32 v0, v0, v4
|
||||||
; VI-NEXT: v_or_b32_e32 v3, v3, v6
|
; VI-NEXT: v_or_b32_e32 v3, v3, v6
|
||||||
|
@ -1141,18 +1073,14 @@ define <2 x i64> @v_fshr_v2i64(<2 x i64> %src0, <2 x i64> %src1, <2 x i64> %src2
|
||||||
; GFX9-LABEL: v_fshr_v2i64:
|
; GFX9-LABEL: v_fshr_v2i64:
|
||||||
; GFX9: ; %bb.0:
|
; GFX9: ; %bb.0:
|
||||||
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||||
; GFX9-NEXT: v_and_b32_e32 v9, 63, v8
|
|
||||||
; GFX9-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1]
|
; GFX9-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1]
|
||||||
|
; GFX9-NEXT: v_lshrrev_b64 v[4:5], v8, v[4:5]
|
||||||
; GFX9-NEXT: v_not_b32_e32 v8, v8
|
; GFX9-NEXT: v_not_b32_e32 v8, v8
|
||||||
; GFX9-NEXT: v_and_b32_e32 v8, 63, v8
|
|
||||||
; GFX9-NEXT: v_lshrrev_b64 v[4:5], v9, v[4:5]
|
|
||||||
; GFX9-NEXT: v_lshlrev_b64 v[0:1], v8, v[0:1]
|
; GFX9-NEXT: v_lshlrev_b64 v[0:1], v8, v[0:1]
|
||||||
; GFX9-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3]
|
; GFX9-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3]
|
||||||
; GFX9-NEXT: v_or_b32_e32 v1, v1, v5
|
; GFX9-NEXT: v_or_b32_e32 v1, v1, v5
|
||||||
; GFX9-NEXT: v_and_b32_e32 v5, 63, v10
|
; GFX9-NEXT: v_lshrrev_b64 v[5:6], v10, v[6:7]
|
||||||
; GFX9-NEXT: v_lshrrev_b64 v[5:6], v5, v[6:7]
|
|
||||||
; GFX9-NEXT: v_not_b32_e32 v7, v10
|
; GFX9-NEXT: v_not_b32_e32 v7, v10
|
||||||
; GFX9-NEXT: v_and_b32_e32 v7, 63, v7
|
|
||||||
; GFX9-NEXT: v_lshlrev_b64 v[2:3], v7, v[2:3]
|
; GFX9-NEXT: v_lshlrev_b64 v[2:3], v7, v[2:3]
|
||||||
; GFX9-NEXT: v_or_b32_e32 v0, v0, v4
|
; GFX9-NEXT: v_or_b32_e32 v0, v0, v4
|
||||||
; GFX9-NEXT: v_or_b32_e32 v3, v3, v6
|
; GFX9-NEXT: v_or_b32_e32 v3, v3, v6
|
||||||
|
@ -1168,17 +1096,13 @@ define <2 x i64> @v_fshr_v2i64(<2 x i64> %src0, <2 x i64> %src1, <2 x i64> %src2
|
||||||
; GFX10: ; %bb.0:
|
; GFX10: ; %bb.0:
|
||||||
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||||
; GFX10-NEXT: v_not_b32_e32 v9, v8
|
|
||||||
; GFX10-NEXT: v_not_b32_e32 v11, v10
|
|
||||||
; GFX10-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1]
|
; GFX10-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1]
|
||||||
; GFX10-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3]
|
; GFX10-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3]
|
||||||
; GFX10-NEXT: v_and_b32_e32 v8, 63, v8
|
; GFX10-NEXT: v_not_b32_e32 v9, v8
|
||||||
; GFX10-NEXT: v_and_b32_e32 v9, 63, v9
|
; GFX10-NEXT: v_not_b32_e32 v11, v10
|
||||||
; GFX10-NEXT: v_and_b32_e32 v10, 63, v10
|
|
||||||
; GFX10-NEXT: v_and_b32_e32 v11, 63, v11
|
|
||||||
; GFX10-NEXT: v_lshrrev_b64 v[4:5], v8, v[4:5]
|
; GFX10-NEXT: v_lshrrev_b64 v[4:5], v8, v[4:5]
|
||||||
; GFX10-NEXT: v_lshlrev_b64 v[0:1], v9, v[0:1]
|
|
||||||
; GFX10-NEXT: v_lshrrev_b64 v[6:7], v10, v[6:7]
|
; GFX10-NEXT: v_lshrrev_b64 v[6:7], v10, v[6:7]
|
||||||
|
; GFX10-NEXT: v_lshlrev_b64 v[0:1], v9, v[0:1]
|
||||||
; GFX10-NEXT: v_lshlrev_b64 v[2:3], v11, v[2:3]
|
; GFX10-NEXT: v_lshlrev_b64 v[2:3], v11, v[2:3]
|
||||||
; GFX10-NEXT: v_or_b32_e32 v0, v0, v4
|
; GFX10-NEXT: v_or_b32_e32 v0, v0, v4
|
||||||
; GFX10-NEXT: v_or_b32_e32 v1, v1, v5
|
; GFX10-NEXT: v_or_b32_e32 v1, v1, v5
|
||||||
|
|
|
@ -245,8 +245,7 @@ define amdgpu_kernel void @trunc_shl_31_i32_i64_multi_use(i32 addrspace(1)* %out
|
||||||
}
|
}
|
||||||
|
|
||||||
; GCN-LABEL: {{^}}trunc_shl_and31:
|
; GCN-LABEL: {{^}}trunc_shl_and31:
|
||||||
; GCN: s_and_b32 s[[AMT:[0-9]+]], s{{[0-9]+}}, 31
|
; GCN: v_lshlrev_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
|
||||||
; GCN: v_lshlrev_b32_e32 v{{[0-9]+}}, s[[AMT]], v{{[0-9]+}}
|
|
||||||
; GCN-NOT: v_lshl_b64
|
; GCN-NOT: v_lshl_b64
|
||||||
; GCN-NOT: v_lshlrev_b64
|
; GCN-NOT: v_lshlrev_b64
|
||||||
define amdgpu_kernel void @trunc_shl_and31(i64 addrspace(1)* nocapture readonly %arg, i32 addrspace(1)* nocapture %arg1, i32 %arg2) {
|
define amdgpu_kernel void @trunc_shl_and31(i64 addrspace(1)* nocapture readonly %arg, i32 addrspace(1)* nocapture %arg1, i32 %arg2) {
|
||||||
|
|
Loading…
Reference in New Issue