From 8693ddc74371dedc742c9f3d3e4eda1da72c13ea Mon Sep 17 00:00:00 2001
From: Jordan Rupprecht
Date: Tue, 1 Sep 2020 13:28:42 -0700
Subject: [PATCH] Revert "[GlobalISel] Fold xor(cmp(pred, _, _), 1) -> cmp(inverse(pred), _, _)" (and dependent patch "Optimize away a Not feeding a brcond by using tbz instead of tbnz.")

This reverts commit 8ad8f484b63ca507417b58c9016d2761f2b1a1a8. It causes
crashes when running `ninja check-llvm-codegen-aarch64-globalisel`, e.g.
http://lab.llvm.org:8011/builders/clang-with-thin-lto-ubuntu/builds/24132/steps/test-stage1-compiler/logs/stdio.
Note that the crash does not seem to reproduce in debug builds.

5ded4442520d3dbb1aa72e6fe03cddef8828c618 depends on this, so revert that too.
---
 .../llvm/CodeGen/GlobalISel/CombinerHelper.h  |   4 -
 .../llvm/CodeGen/GlobalISel/MIPatternMatch.h  |   6 -
 llvm/include/llvm/CodeGen/GlobalISel/Utils.h  |   9 -
 .../include/llvm/Target/GlobalISel/Combine.td |  11 +-
 .../lib/CodeGen/GlobalISel/CombinerHelper.cpp |  68 --------
 llvm/lib/CodeGen/GlobalISel/Utils.cpp         |  37 ----
 .../GISel/AArch64InstructionSelector.cpp      |  17 +-
 .../prelegalizercombiner-invert-cmp.mir       | 163 ------------------
 .../GlobalISel/select-brcond-of-not.mir       |  76 --------
 .../GlobalISel/divergent-control-flow.ll      |  29 ++--
 .../CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll     |  64 ++++---
 .../CodeGen/AMDGPU/GlobalISel/srem.i64.ll     |  64 ++++---
 .../CodeGen/AMDGPU/GlobalISel/udiv.i64.ll     |  70 ++++----
 .../CodeGen/AMDGPU/GlobalISel/urem.i64.ll     |  70 ++++----
 14 files changed, 178 insertions(+), 510 deletions(-)
 delete mode 100644 llvm/test/CodeGen/AArch64/GlobalISel/prelegalizercombiner-invert-cmp.mir
 delete mode 100644 llvm/test/CodeGen/AArch64/GlobalISel/select-brcond-of-not.mir

diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
index e1f0535affcd..76560b0727f9 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
@@ -355,10 +355,6 @@ public:
   /// \return true if \p MI is a G_SEXT_INREG that can be erased.
   bool matchRedundantSExtInReg(MachineInstr &MI);
 
-  /// Combine inverting a result of a compare into the opposite cond code.
-  bool matchNotCmp(MachineInstr &MI, Register &CmpReg);
-  bool applyNotCmp(MachineInstr &MI, Register &CmpReg);
-
   /// Try to transform \p MI by using all of the above
   /// combine functions. Returns true if changed.
bool tryCombine(MachineInstr &MI); diff --git a/llvm/include/llvm/CodeGen/GlobalISel/MIPatternMatch.h b/llvm/include/llvm/CodeGen/GlobalISel/MIPatternMatch.h index c1c72d0ccbb9..4e216a284088 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/MIPatternMatch.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/MIPatternMatch.h @@ -233,12 +233,6 @@ m_GAnd(const LHS &L, const RHS &R) { return BinaryOp_match(L, R); } -template -inline BinaryOp_match -m_GXor(const LHS &L, const RHS &R) { - return BinaryOp_match(L, R); -} - template inline BinaryOp_match m_GOr(const LHS &L, const RHS &R) { diff --git a/llvm/include/llvm/CodeGen/GlobalISel/Utils.h b/llvm/include/llvm/CodeGen/GlobalISel/Utils.h index b7d2489eda23..23dc05c4e157 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/Utils.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/Utils.h @@ -16,7 +16,6 @@ #include "llvm/ADT/StringRef.h" #include "llvm/CodeGen/Register.h" -#include "llvm/CodeGen/TargetLowering.h" #include "llvm/Support/Alignment.h" #include "llvm/Support/LowLevelTypeImpl.h" #include "llvm/Support/MachineValueType.h" @@ -228,10 +227,6 @@ LLT getGCDType(LLT OrigTy, LLT TargetTy); /// If \p MI is not a splat, returns None. Optional getSplatIndex(MachineInstr &MI); -/// Returns a scalar constant of a G_BUILD_VECTOR splat if it exists. -Optional getBuildVectorConstantSplat(const MachineInstr &MI, - const MachineRegisterInfo &MRI); - /// Return true if the specified instruction is a G_BUILD_VECTOR or /// G_BUILD_VECTOR_TRUNC where all of the elements are 0 or undef. bool isBuildVectorAllZeros(const MachineInstr &MI, @@ -242,9 +237,5 @@ bool isBuildVectorAllZeros(const MachineInstr &MI, bool isBuildVectorAllOnes(const MachineInstr &MI, const MachineRegisterInfo &MRI); -/// Returns true if given the TargetLowering's boolean contents information, -/// the value \p Val contains a true value. -bool isConstTrueVal(const TargetLowering &TLI, int64_t Val, bool IsVector, - bool IsFP); } // End namespace llvm. #endif diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td index 4b0fe43c1868..cbf57046d24f 100644 --- a/llvm/include/llvm/Target/GlobalISel/Combine.td +++ b/llvm/include/llvm/Target/GlobalISel/Combine.td @@ -373,14 +373,6 @@ def ext_ext_fold: GICombineRule < (apply [{ return Helper.applyCombineExtOfExt(*${root}, ${matchinfo}); }]) >; -def not_cmp_fold_matchinfo : GIDefMatchData<"Register">; -def not_cmp_fold : GICombineRule< - (defs root:$d, not_cmp_fold_matchinfo:$info), - (match (wip_match_opcode G_XOR): $d, - [{ return Helper.matchNotCmp(*${d}, ${info}); }]), - (apply [{ return Helper.applyNotCmp(*${d}, ${info}); }]) ->; - // FIXME: These should use the custom predicate feature once it lands. 
def undef_combines : GICombineGroup<[undef_to_fp_zero, undef_to_int_zero, undef_to_negative_one, @@ -408,5 +400,4 @@ def all_combines : GICombineGroup<[trivial_combines, ptr_add_immed_chain, hoist_logic_op_with_same_opcode_hands, shl_ashr_to_sext_inreg, sext_inreg_of_load, width_reduction_combines, select_combines, - known_bits_simplifications, ext_ext_fold, - not_cmp_fold]>; + known_bits_simplifications, ext_ext_fold]>; diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp index 5dbd09670fea..dbdf8c98384d 100644 --- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp @@ -2231,74 +2231,6 @@ bool CombinerHelper::matchRedundantSExtInReg(MachineInstr &MI) { return KB->computeNumSignBits(Src) >= (TypeSize - ExtBits + 1); } -static bool isConstValidTrue(const TargetLowering &TLI, unsigned ScalarSizeBits, - int64_t Cst, bool IsVector, bool IsFP) { - // For i1, Cst will always be -1 regardless of boolean contents. - return (ScalarSizeBits == 1 && Cst == -1) || - isConstTrueVal(TLI, Cst, IsVector, IsFP); -} - -bool CombinerHelper::matchNotCmp(MachineInstr &MI, Register &CmpReg) { - assert(MI.getOpcode() == TargetOpcode::G_XOR); - LLT Ty = MRI.getType(MI.getOperand(0).getReg()); - const auto &TLI = *Builder.getMF().getSubtarget().getTargetLowering(); - Register XorSrc; - Register CstReg; - int64_t Cst; - // We match xor(src, true) here. - if (!mi_match(MI.getOperand(0).getReg(), MRI, - m_GXor(m_Reg(XorSrc), m_Reg(CstReg)))) - return false; - - if (!MRI.hasOneNonDBGUse(XorSrc)) - return false; - - // Now try match src to either icmp or fcmp. - bool IsFP = false; - if (!mi_match(XorSrc, MRI, m_GICmp(m_Pred(), m_Reg(), m_Reg()))) { - // Try fcmp. - if (!mi_match(XorSrc, MRI, m_GFCmp(m_Pred(), m_Reg(), m_Reg()))) - return false; - IsFP = true; - } - - if (Ty.isVector()) { - MachineInstr *CstDef = MRI.getVRegDef(CstReg); - auto MaybeCst = getBuildVectorConstantSplat(*CstDef, MRI); - if (!MaybeCst) - return false; - if (!isConstValidTrue(TLI, Ty.getScalarSizeInBits(), *MaybeCst, true, IsFP)) - return false; - } else { - if (!mi_match(CstReg, MRI, m_ICst(Cst))) - return false; - if (!isConstValidTrue(TLI, Ty.getSizeInBits(), Cst, false, IsFP)) - return false; - } - - CmpReg = XorSrc; - return true; -} - -bool CombinerHelper::applyNotCmp(MachineInstr &MI, Register &CmpReg) { - MachineInstr *CmpDef = MRI.getVRegDef(CmpReg); - assert(CmpDef && "Should have been given an MI reg"); - assert(CmpDef->getOpcode() == TargetOpcode::G_ICMP || - CmpDef->getOpcode() == TargetOpcode::G_FCMP); - - Observer.changingInstr(*CmpDef); - MachineOperand &PredOp = CmpDef->getOperand(1); - CmpInst::Predicate NewP = CmpInst::getInversePredicate( - (CmpInst::Predicate)PredOp.getPredicate()); - PredOp.setPredicate(NewP); - Observer.changedInstr(*CmpDef); - - replaceRegWith(MRI, MI.getOperand(0).getReg(), - CmpDef->getOperand(0).getReg()); - MI.eraseFromParent(); - return true; -} - bool CombinerHelper::tryCombine(MachineInstr &MI) { if (tryCombineCopy(MI)) return true; diff --git a/llvm/lib/CodeGen/GlobalISel/Utils.cpp b/llvm/lib/CodeGen/GlobalISel/Utils.cpp index 303f2d8417b5..d598bf8b5a53 100644 --- a/llvm/lib/CodeGen/GlobalISel/Utils.cpp +++ b/llvm/lib/CodeGen/GlobalISel/Utils.cpp @@ -11,8 +11,6 @@ #include "llvm/CodeGen/GlobalISel/Utils.h" #include "llvm/ADT/APFloat.h" -#include "llvm/ADT/APInt.h" -#include "llvm/ADT/Optional.h" #include "llvm/ADT/Twine.h" #include "llvm/CodeGen/GlobalISel/GISelChangeObserver.h" #include 
"llvm/CodeGen/GlobalISel/MIPatternMatch.h" @@ -696,28 +694,6 @@ static bool isBuildVectorConstantSplat(const MachineInstr &MI, return true; } -Optional -llvm::getBuildVectorConstantSplat(const MachineInstr &MI, - const MachineRegisterInfo &MRI) { - if (!isBuildVectorOp(MI.getOpcode())) - return None; - - const unsigned NumOps = MI.getNumOperands(); - Optional Scalar; - for (unsigned I = 1; I != NumOps; ++I) { - Register Element = MI.getOperand(I).getReg(); - int64_t ElementValue; - if (!mi_match(Element, MRI, m_ICst(ElementValue))) - return None; - if (!Scalar) - Scalar = ElementValue; - else if (*Scalar != ElementValue) - return None; - } - - return Scalar; -} - bool llvm::isBuildVectorAllZeros(const MachineInstr &MI, const MachineRegisterInfo &MRI) { return isBuildVectorConstantSplat(MI, MRI, 0); @@ -727,16 +703,3 @@ bool llvm::isBuildVectorAllOnes(const MachineInstr &MI, const MachineRegisterInfo &MRI) { return isBuildVectorConstantSplat(MI, MRI, -1); } - -bool llvm::isConstTrueVal(const TargetLowering &TLI, int64_t Val, bool IsVector, - bool IsFP) { - switch (TLI.getBooleanContents(IsVector, IsFP)) { - case TargetLowering::UndefinedBooleanContent: - return Val & 0x1; - case TargetLowering::ZeroOrOneBooleanContent: - return Val == 1; - case TargetLowering::ZeroOrNegativeOneBooleanContent: - return Val == -1; - } - llvm_unreachable("Invalid boolean contents"); -} diff --git a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp index a8d68180bb76..5e5f902e1107 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp @@ -41,7 +41,6 @@ #define DEBUG_TYPE "aarch64-isel" using namespace llvm; -using namespace MIPatternMatch; namespace { @@ -1884,7 +1883,7 @@ bool AArch64InstructionSelector::select(MachineInstr &I) { return false; } - Register CondReg = I.getOperand(0).getReg(); + const Register CondReg = I.getOperand(0).getReg(); MachineBasicBlock *DestMBB = I.getOperand(1).getMBB(); // Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z @@ -1894,19 +1893,7 @@ bool AArch64InstructionSelector::select(MachineInstr &I) { return true; if (ProduceNonFlagSettingCondBr) { - unsigned BOpc = AArch64::TBNZW; - // Try to fold a not, i.e. a xor, cond, 1. - Register XorSrc; - int64_t Cst; - if (mi_match(CondReg, MRI, - m_GTrunc(m_GXor(m_Reg(XorSrc), m_ICst(Cst)))) && - Cst == 1) { - CondReg = XorSrc; - BOpc = AArch64::TBZW; - if (MRI.getType(XorSrc).getSizeInBits() > 32) - BOpc = AArch64::TBZX; - } - auto MIB = BuildMI(MBB, I, I.getDebugLoc(), TII.get(BOpc)) + auto MIB = BuildMI(MBB, I, I.getDebugLoc(), TII.get(AArch64::TBNZW)) .addUse(CondReg) .addImm(/*bit offset=*/0) .addMBB(DestMBB); diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizercombiner-invert-cmp.mir b/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizercombiner-invert-cmp.mir deleted file mode 100644 index ec43b2c144ec..000000000000 --- a/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizercombiner-invert-cmp.mir +++ /dev/null @@ -1,163 +0,0 @@ -# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -mtriple aarch64-apple-ios -run-pass=aarch64-prelegalizer-combiner --aarch64prelegalizercombinerhelper-only-enable-rule="not_cmp_fold" %s -o - -verify-machineinstrs | FileCheck %s - -# Check that we fold an compare result inverted into just inverting the condition code. 
---- -name: icmp -tracksRegLiveness: true -body: | - bb.1: - liveins: $x0 - - ; CHECK-LABEL: name: icmp - ; CHECK: liveins: $x0 - ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $x0 - ; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; CHECK: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(sle), [[COPY]](s64), [[C]] - ; CHECK: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[ICMP]](s1) - ; CHECK: $w0 = COPY [[ANYEXT]](s32) - ; CHECK: RET_ReallyLR implicit $w0 - %0:_(s64) = COPY $x0 - %1:_(s64) = G_CONSTANT i64 1 - %2:_(s1) = G_CONSTANT i1 1 - %3:_(s1) = G_ICMP intpred(sgt), %0(s64), %1 - %4:_(s1) = G_XOR %3, %2 - %5:_(s32) = G_ANYEXT %4 - $w0 = COPY %5(s32) - RET_ReallyLR implicit $w0 -... ---- -name: fcmp -tracksRegLiveness: true -body: | - bb.1: - liveins: $x0 - - ; CHECK-LABEL: name: fcmp - ; CHECK: liveins: $x0 - ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $x0 - ; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; CHECK: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(ule), [[COPY]](s64), [[C]] - ; CHECK: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FCMP]](s1) - ; CHECK: $w0 = COPY [[ANYEXT]](s32) - ; CHECK: RET_ReallyLR implicit $w0 - %0:_(s64) = COPY $x0 - %1:_(s64) = G_CONSTANT i64 1 - %2:_(s1) = G_CONSTANT i1 1 - %3:_(s1) = G_FCMP floatpred(ogt), %0(s64), %1 - %4:_(s1) = G_XOR %3, %2 - %5:_(s32) = G_ANYEXT %4 - $w0 = COPY %5(s32) - RET_ReallyLR implicit $w0 -... ---- -name: icmp_not_xor_with_1 -tracksRegLiveness: true -body: | - bb.1: - liveins: $x0 - - ; CHECK-LABEL: name: icmp_not_xor_with_1 - ; CHECK: liveins: $x0 - ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $x0 - ; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; CHECK: [[C1:%[0-9]+]]:_(s1) = G_CONSTANT i1 false - ; CHECK: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(sgt), [[COPY]](s64), [[C]] - ; CHECK: [[XOR:%[0-9]+]]:_(s1) = G_XOR [[ICMP]], [[C1]] - ; CHECK: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[XOR]](s1) - ; CHECK: $w0 = COPY [[ANYEXT]](s32) - ; CHECK: RET_ReallyLR implicit $w0 - %0:_(s64) = COPY $x0 - %1:_(s64) = G_CONSTANT i64 1 - %2:_(s1) = G_CONSTANT i1 0 - %3:_(s1) = G_ICMP intpred(sgt), %0(s64), %1 - %4:_(s1) = G_XOR %3, %2 - %5:_(s32) = G_ANYEXT %4 - $w0 = COPY %5(s32) - RET_ReallyLR implicit $w0 -... ---- -name: icmp_not_xor_with_wrong_bool_contents -tracksRegLiveness: true -body: | - bb.1: - liveins: $x0 - - ; Even though bit 0 of the constant is 1, we require zero in the upper bits - ; for our aarch64's zero-or-one boolean contents. - ; CHECK-LABEL: name: icmp_not_xor_with_wrong_bool_contents - ; CHECK: liveins: $x0 - ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $x0 - ; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; CHECK: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 7 - ; CHECK: [[ICMP:%[0-9]+]]:_(s32) = G_ICMP intpred(sgt), [[COPY]](s64), [[C]] - ; CHECK: [[XOR:%[0-9]+]]:_(s32) = G_XOR [[ICMP]], [[C1]] - ; CHECK: $w0 = COPY [[XOR]](s32) - ; CHECK: RET_ReallyLR implicit $w0 - %0:_(s64) = COPY $x0 - %1:_(s64) = G_CONSTANT i64 1 - %2:_(s32) = G_CONSTANT i32 7 - %3:_(s32) = G_ICMP intpred(sgt), %0(s64), %1 - %4:_(s32) = G_XOR %3, %2 - $w0 = COPY %4(s32) - RET_ReallyLR implicit $w0 -... 
---- -name: icmp_multiple_use -tracksRegLiveness: true -body: | - bb.1: - liveins: $x0 - - ; CHECK-LABEL: name: icmp_multiple_use - ; CHECK: liveins: $x0 - ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $x0 - ; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; CHECK: [[C1:%[0-9]+]]:_(s1) = G_CONSTANT i1 true - ; CHECK: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(sgt), [[COPY]](s64), [[C]] - ; CHECK: [[XOR:%[0-9]+]]:_(s1) = G_XOR [[ICMP]], [[C1]] - ; CHECK: %other_use:_(s1) = G_AND [[ICMP]], [[C1]] - ; CHECK: %other_use_ext:_(s32) = G_ANYEXT %other_use(s1) - ; CHECK: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[XOR]](s1) - ; CHECK: $w0 = COPY [[ANYEXT]](s32) - ; CHECK: $w1 = COPY %other_use_ext(s32) - ; CHECK: RET_ReallyLR implicit $w0 - %0:_(s64) = COPY $x0 - %1:_(s64) = G_CONSTANT i64 1 - %2:_(s1) = G_CONSTANT i1 1 - %3:_(s1) = G_ICMP intpred(sgt), %0(s64), %1 - %4:_(s1) = G_XOR %3, %2 - %other_use:_(s1) = G_AND %3, %2 - %other_use_ext:_(s32) = G_ANYEXT %other_use(s1) - %5:_(s32) = G_ANYEXT %4 - $w0 = COPY %5(s32) - $w1 = COPY %other_use_ext - RET_ReallyLR implicit $w0 -... ---- -name: icmp_vector -tracksRegLiveness: true -body: | - bb.1: - liveins: $q0 - - ; CHECK-LABEL: name: icmp_vector - ; CHECK: liveins: $q0 - ; CHECK: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $q0 - ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 5 - ; CHECK: %splat_op2:_(<4 x s32>) = G_BUILD_VECTOR [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32) - ; CHECK: [[ICMP:%[0-9]+]]:_(<4 x s1>) = G_ICMP intpred(sle), [[COPY]](<4 x s32>), %splat_op2 - ; CHECK: [[ANYEXT:%[0-9]+]]:_(<4 x s32>) = G_ANYEXT [[ICMP]](<4 x s1>) - ; CHECK: $q0 = COPY [[ANYEXT]](<4 x s32>) - ; CHECK: RET_ReallyLR implicit $q0 - %0:_(<4 x s32>) = COPY $q0 - %1:_(s32) = G_CONSTANT i32 5 - %splat_op2:_(<4 x s32>) = G_BUILD_VECTOR %1, %1, %1, %1 - %2:_(s1) = G_CONSTANT i1 1 - %splat_true:_(<4 x s1>) = G_BUILD_VECTOR %2, %2, %2, %2 - %3:_(<4 x s1>) = G_ICMP intpred(sgt), %0(<4 x s32>), %splat_op2 - %4:_(<4 x s1>) = G_XOR %3, %splat_true - %5:_(<4 x s32>) = G_ANYEXT %4 - $q0 = COPY %5(<4 x s32>) - RET_ReallyLR implicit $q0 -... diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/select-brcond-of-not.mir b/llvm/test/CodeGen/AArch64/GlobalISel/select-brcond-of-not.mir deleted file mode 100644 index 41fe50d9bb7d..000000000000 --- a/llvm/test/CodeGen/AArch64/GlobalISel/select-brcond-of-not.mir +++ /dev/null @@ -1,76 +0,0 @@ -# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -mtriple=aarch64-- -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s ---- -name: condbr_of_not -legalized: true -regBankSelected: true -liveins: - - { reg: '$x0' } -body: | - ; CHECK-LABEL: name: condbr_of_not - ; CHECK: bb.0: - ; CHECK: successors: %bb.1(0x40000000), %bb.2(0x40000000) - ; CHECK: [[COPY:%[0-9]+]]:gpr64sp = COPY $x0 - ; CHECK: [[LDRBBui:%[0-9]+]]:gpr32 = LDRBBui [[COPY]], 0 :: (load 1) - ; CHECK: TBZW [[LDRBBui]], 0, %bb.2 - ; CHECK: bb.1: - ; CHECK: RET_ReallyLR - ; CHECK: bb.2: - ; CHECK: RET_ReallyLR - bb.1: - successors: %bb.2, %bb.3 - liveins: $x0 - - %0:gpr(p0) = COPY $x0 - %8:gpr(s8) = G_LOAD %0(p0) :: (load 1) - %4:gpr(s32) = G_ANYEXT %8(s8) - %5:gpr(s32) = G_CONSTANT i32 1 - %6:gpr(s32) = G_XOR %4, %5 - %3:gpr(s1) = G_TRUNC %6(s32) - G_BRCOND %3(s1), %bb.3 - - bb.2: - RET_ReallyLR - - bb.3: - RET_ReallyLR - -... 
---- -name: condbr_of_not_64 -legalized: true -regBankSelected: true -liveins: - - { reg: '$x0' } -body: | - ; CHECK-LABEL: name: condbr_of_not_64 - ; CHECK: bb.0: - ; CHECK: successors: %bb.1(0x40000000), %bb.2(0x40000000) - ; CHECK: [[COPY:%[0-9]+]]:gpr64sp = COPY $x0 - ; CHECK: [[LDRBBui:%[0-9]+]]:gpr32 = LDRBBui [[COPY]], 0 :: (load 1) - ; CHECK: [[SUBREG_TO_REG:%[0-9]+]]:gpr64all = SUBREG_TO_REG 0, [[LDRBBui]], %subreg.sub_32 - ; CHECK: [[COPY1:%[0-9]+]]:gpr64 = COPY [[SUBREG_TO_REG]] - ; CHECK: TBZX [[COPY1]], 0, %bb.2 - ; CHECK: bb.1: - ; CHECK: RET_ReallyLR - ; CHECK: bb.2: - ; CHECK: RET_ReallyLR - bb.1: - successors: %bb.2, %bb.3 - liveins: $x0 - - %0:gpr(p0) = COPY $x0 - %8:gpr(s8) = G_LOAD %0(p0) :: (load 1) - %4:gpr(s64) = G_ANYEXT %8(s8) - %5:gpr(s64) = G_CONSTANT i64 1 - %6:gpr(s64) = G_XOR %4, %5 - %3:gpr(s1) = G_TRUNC %6(s64) - G_BRCOND %3(s1), %bb.3 - - bb.2: - RET_ReallyLR - - bb.3: - RET_ReallyLR - -... diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll index 917697b63962..7564251c755d 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll @@ -136,24 +136,27 @@ define void @constrained_if_register_class() { ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_cmp_lg_u32 s4, 0 ; CHECK-NEXT: s_cselect_b32 s4, 1, 0 +; CHECK-NEXT: s_xor_b32 s4, s4, -1 ; CHECK-NEXT: s_and_b32 s4, s4, 1 ; CHECK-NEXT: s_cmp_lg_u32 s4, 0 -; CHECK-NEXT: s_cbranch_scc1 BB4_6 +; CHECK-NEXT: s_cbranch_scc0 BB4_6 ; CHECK-NEXT: ; %bb.1: ; %bb2 -; CHECK-NEXT: s_getpc_b64 s[6:7] -; CHECK-NEXT: s_add_u32 s6, s6, const.ptr@gotpcrel32@lo+4 -; CHECK-NEXT: s_addc_u32 s7, s7, const.ptr@gotpcrel32@hi+4 -; CHECK-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0 -; CHECK-NEXT: s_mov_b32 s4, -1 +; CHECK-NEXT: s_getpc_b64 s[4:5] +; CHECK-NEXT: s_add_u32 s4, s4, const.ptr@gotpcrel32@lo+4 +; CHECK-NEXT: s_addc_u32 s5, s5, const.ptr@gotpcrel32@hi+4 +; CHECK-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; CHECK-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, 1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0 +; CHECK-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v0, s6 -; CHECK-NEXT: v_mov_b32_e32 v1, s7 +; CHECK-NEXT: v_mov_b32_e32 v0, s4 +; CHECK-NEXT: v_mov_b32_e32 v1, s5 ; CHECK-NEXT: flat_load_dword v0, v[0:1] +; CHECK-NEXT: s_mov_b32 s4, -1 ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_cmp_ngt_f32_e32 vcc, 1.0, v0 -; CHECK-NEXT: s_and_saveexec_b64 s[6:7], vcc +; CHECK-NEXT: v_cmp_gt_f32_e32 vcc, 1.0, v0 +; CHECK-NEXT: s_xor_b64 s[8:9], vcc, s[6:7] +; CHECK-NEXT: s_and_saveexec_b64 s[6:7], s[8:9] ; CHECK-NEXT: ; %bb.2: ; %bb7 ; CHECK-NEXT: s_mov_b32 s4, 0 ; CHECK-NEXT: ; %bb.3: ; %bb8 @@ -214,8 +217,10 @@ define amdgpu_kernel void @break_loop(i32 %arg) { ; CHECK-NEXT: ; %bb.2: ; %bb4 ; CHECK-NEXT: ; in Loop: Header=BB5_1 Depth=1 ; CHECK-NEXT: global_load_dword v2, v[0:1], off +; CHECK-NEXT: v_cmp_ne_u32_e64 s[2:3], 0, 1 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_cmp_ge_i32_e64 s[2:3], v0, v2 +; CHECK-NEXT: v_cmp_lt_i32_e32 vcc, v0, v2 +; CHECK-NEXT: s_xor_b64 s[2:3], vcc, s[2:3] ; CHECK-NEXT: BB5_3: ; %Flow ; CHECK-NEXT: ; in Loop: Header=BB5_1 Depth=1 ; CHECK-NEXT: s_and_b64 s[2:3], exec, s[2:3] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll index d2e7328a384f..a85f1dd6c591 100644 --- 
a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll @@ -10,10 +10,12 @@ define i64 @v_sdiv_i64(i64 %num, i64 %den) { ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: v_or_b32_e32 v5, v1, v3 ; CHECK-NEXT: v_mov_b32_e32 v4, 0 -; CHECK-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; CHECK-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[4:5] +; CHECK-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, 1 +; CHECK-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] ; CHECK-NEXT: ; implicit-def: $vgpr4_vgpr5 -; CHECK-NEXT: s_and_saveexec_b64 s[4:5], vcc -; CHECK-NEXT: s_xor_b64 s[6:7], exec, s[4:5] +; CHECK-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] +; CHECK-NEXT: s_xor_b64 s[6:7], exec, s[6:7] ; CHECK-NEXT: s_cbranch_execz BB0_2 ; CHECK-NEXT: ; %bb.1: ; CHECK-NEXT: v_ashrrev_i32_e32 v4, 31, v3 @@ -202,7 +204,10 @@ define amdgpu_ps i64 @s_sdiv_i64(i64 inreg %num, i64 inreg %den) { ; CHECK-NEXT: s_mov_b32 s0, 0 ; CHECK-NEXT: s_mov_b32 s1, -1 ; CHECK-NEXT: s_and_b64 s[6:7], s[6:7], s[0:1] -; CHECK-NEXT: v_cmp_ne_u64_e64 vcc, s[6:7], 0 +; CHECK-NEXT: v_cmp_eq_u64_e64 s[6:7], s[6:7], 0 +; CHECK-NEXT: v_cmp_ne_u32_e64 s[8:9], 0, 1 +; CHECK-NEXT: s_xor_b64 vcc, s[6:7], s[8:9] +; CHECK-NEXT: ; implicit-def: $vgpr0_vgpr1 ; CHECK-NEXT: s_cbranch_vccz BB1_2 ; CHECK-NEXT: ; %bb.1: ; CHECK-NEXT: s_ashr_i32 s6, s3, 31 @@ -353,14 +358,11 @@ define amdgpu_ps i64 @s_sdiv_i64(i64 inreg %num, i64 inreg %den) { ; CHECK-NEXT: v_xor_b32_e32 v0, s0, v0 ; CHECK-NEXT: v_subrev_i32_e32 v0, vcc, s0, v0 ; CHECK-NEXT: s_mov_b32 s1, 0 -; CHECK-NEXT: s_branch BB1_3 -; CHECK-NEXT: BB1_2: -; CHECK-NEXT: ; implicit-def: $vgpr0_vgpr1 -; CHECK-NEXT: BB1_3: ; %Flow +; CHECK-NEXT: BB1_2: ; %Flow ; CHECK-NEXT: s_and_b32 s0, s1, 1 ; CHECK-NEXT: s_cmp_lg_u32 s0, 0 -; CHECK-NEXT: s_cbranch_scc0 BB1_5 -; CHECK-NEXT: ; %bb.4: +; CHECK-NEXT: s_cbranch_scc0 BB1_4 +; CHECK-NEXT: ; %bb.3: ; CHECK-NEXT: v_cvt_f32_u32_e32 v0, s4 ; CHECK-NEXT: s_sub_i32 s0, 0, s4 ; CHECK-NEXT: v_rcp_iflag_f32_e32 v0, v0 @@ -380,7 +382,7 @@ define amdgpu_ps i64 @s_sdiv_i64(i64 inreg %num, i64 inreg %den) { ; CHECK-NEXT: v_add_i32_e32 v2, vcc, 1, v0 ; CHECK-NEXT: v_cmp_le_u32_e32 vcc, s4, v1 ; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; CHECK-NEXT: BB1_5: +; CHECK-NEXT: BB1_4: ; CHECK-NEXT: v_readfirstlane_b32 s0, v0 ; CHECK-NEXT: s_mov_b32 s1, s0 ; CHECK-NEXT: ; return to shader part epilog @@ -693,10 +695,12 @@ define <2 x i64> @v_sdiv_v2i64(<2 x i64> %num, <2 x i64> %den) { ; CGP-NEXT: v_mov_b32_e32 v8, v0 ; CGP-NEXT: v_or_b32_e32 v1, v9, v5 ; CGP-NEXT: v_mov_b32_e32 v0, 0 -; CGP-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; CGP-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; CGP-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, 1 +; CGP-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] ; CGP-NEXT: ; implicit-def: $vgpr0_vgpr1 -; CGP-NEXT: s_and_saveexec_b64 s[4:5], vcc -; CGP-NEXT: s_xor_b64 s[6:7], exec, s[4:5] +; CGP-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] +; CGP-NEXT: s_xor_b64 s[6:7], exec, s[6:7] ; CGP-NEXT: s_cbranch_execz BB2_2 ; CGP-NEXT: ; %bb.1: ; CGP-NEXT: v_ashrrev_i32_e32 v0, 31, v5 @@ -870,10 +874,12 @@ define <2 x i64> @v_sdiv_v2i64(<2 x i64> %num, <2 x i64> %den) { ; CGP-NEXT: s_or_b64 exec, exec, s[6:7] ; CGP-NEXT: v_or_b32_e32 v5, v3, v7 ; CGP-NEXT: v_mov_b32_e32 v4, 0 -; CGP-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; CGP-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[4:5] +; CGP-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, 1 +; CGP-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] ; CGP-NEXT: ; implicit-def: $vgpr4_vgpr5 -; CGP-NEXT: s_and_saveexec_b64 s[4:5], vcc -; CGP-NEXT: s_xor_b64 s[6:7], exec, 
s[4:5] +; CGP-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] +; CGP-NEXT: s_xor_b64 s[6:7], exec, s[6:7] ; CGP-NEXT: s_cbranch_execz BB2_6 ; CGP-NEXT: ; %bb.5: ; CGP-NEXT: v_ashrrev_i32_e32 v4, 31, v7 @@ -2507,10 +2513,12 @@ define i64 @v_sdiv_i64_pow2_shl_denom(i64 %x, i64 %y) { ; CHECK-NEXT: v_lshl_b64 v[4:5], s[4:5], v2 ; CHECK-NEXT: v_mov_b32_e32 v2, 0 ; CHECK-NEXT: v_or_b32_e32 v3, v1, v5 -; CHECK-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] +; CHECK-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; CHECK-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, 1 +; CHECK-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] ; CHECK-NEXT: ; implicit-def: $vgpr2_vgpr3 -; CHECK-NEXT: s_and_saveexec_b64 s[4:5], vcc -; CHECK-NEXT: s_xor_b64 s[6:7], exec, s[4:5] +; CHECK-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] +; CHECK-NEXT: s_xor_b64 s[6:7], exec, s[6:7] ; CHECK-NEXT: s_cbranch_execz BB7_2 ; CHECK-NEXT: ; %bb.1: ; CHECK-NEXT: v_ashrrev_i32_e32 v2, 31, v5 @@ -2994,11 +3002,13 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; CGP-NEXT: v_mov_b32_e32 v5, v0 ; CGP-NEXT: v_or_b32_e32 v1, v7, v11 ; CGP-NEXT: v_mov_b32_e32 v0, 0 -; CGP-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; CGP-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] ; CGP-NEXT: v_lshl_b64 v[8:9], s[4:5], v6 +; CGP-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, 1 +; CGP-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] ; CGP-NEXT: ; implicit-def: $vgpr0_vgpr1 -; CGP-NEXT: s_and_saveexec_b64 s[4:5], vcc -; CGP-NEXT: s_xor_b64 s[6:7], exec, s[4:5] +; CGP-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] +; CGP-NEXT: s_xor_b64 s[6:7], exec, s[6:7] ; CGP-NEXT: s_cbranch_execz BB8_2 ; CGP-NEXT: ; %bb.1: ; CGP-NEXT: v_ashrrev_i32_e32 v0, 31, v11 @@ -3172,10 +3182,12 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; CGP-NEXT: s_or_b64 exec, exec, s[6:7] ; CGP-NEXT: v_or_b32_e32 v5, v3, v9 ; CGP-NEXT: v_mov_b32_e32 v4, 0 -; CGP-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; CGP-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[4:5] +; CGP-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, 1 +; CGP-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] ; CGP-NEXT: ; implicit-def: $vgpr4_vgpr5 -; CGP-NEXT: s_and_saveexec_b64 s[4:5], vcc -; CGP-NEXT: s_xor_b64 s[6:7], exec, s[4:5] +; CGP-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] +; CGP-NEXT: s_xor_b64 s[6:7], exec, s[6:7] ; CGP-NEXT: s_cbranch_execz BB8_6 ; CGP-NEXT: ; %bb.5: ; CGP-NEXT: v_ashrrev_i32_e32 v4, 31, v9 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll index cbb77b54aba5..7c4753c0939c 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll @@ -10,10 +10,12 @@ define i64 @v_srem_i64(i64 %num, i64 %den) { ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: v_or_b32_e32 v5, v1, v3 ; CHECK-NEXT: v_mov_b32_e32 v4, 0 -; CHECK-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; CHECK-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[4:5] +; CHECK-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, 1 +; CHECK-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] ; CHECK-NEXT: ; implicit-def: $vgpr4_vgpr5 -; CHECK-NEXT: s_and_saveexec_b64 s[4:5], vcc -; CHECK-NEXT: s_xor_b64 s[6:7], exec, s[4:5] +; CHECK-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] +; CHECK-NEXT: s_xor_b64 s[6:7], exec, s[6:7] ; CHECK-NEXT: s_cbranch_execz BB0_2 ; CHECK-NEXT: ; %bb.1: ; CHECK-NEXT: v_ashrrev_i32_e32 v4, 31, v3 @@ -198,7 +200,10 @@ define amdgpu_ps i64 @s_srem_i64(i64 inreg %num, i64 inreg %den) { ; CHECK-NEXT: s_mov_b32 s0, 0 ; CHECK-NEXT: s_mov_b32 s1, -1 ; CHECK-NEXT: s_and_b64 s[6:7], s[6:7], s[0:1] -; CHECK-NEXT: v_cmp_ne_u64_e64 vcc, s[6:7], 0 +; 
CHECK-NEXT: v_cmp_eq_u64_e64 s[6:7], s[6:7], 0 +; CHECK-NEXT: v_cmp_ne_u32_e64 s[8:9], 0, 1 +; CHECK-NEXT: s_xor_b64 vcc, s[6:7], s[8:9] +; CHECK-NEXT: ; implicit-def: $vgpr0_vgpr1 ; CHECK-NEXT: s_cbranch_vccz BB1_2 ; CHECK-NEXT: ; %bb.1: ; CHECK-NEXT: s_ashr_i32 s0, s5, 31 @@ -347,14 +352,11 @@ define amdgpu_ps i64 @s_srem_i64(i64 inreg %num, i64 inreg %den) { ; CHECK-NEXT: v_xor_b32_e32 v0, s6, v0 ; CHECK-NEXT: v_subrev_i32_e32 v0, vcc, s6, v0 ; CHECK-NEXT: s_mov_b32 s1, 0 -; CHECK-NEXT: s_branch BB1_3 -; CHECK-NEXT: BB1_2: -; CHECK-NEXT: ; implicit-def: $vgpr0_vgpr1 -; CHECK-NEXT: BB1_3: ; %Flow +; CHECK-NEXT: BB1_2: ; %Flow ; CHECK-NEXT: s_and_b32 s0, s1, 1 ; CHECK-NEXT: s_cmp_lg_u32 s0, 0 -; CHECK-NEXT: s_cbranch_scc0 BB1_5 -; CHECK-NEXT: ; %bb.4: +; CHECK-NEXT: s_cbranch_scc0 BB1_4 +; CHECK-NEXT: ; %bb.3: ; CHECK-NEXT: v_cvt_f32_u32_e32 v0, s4 ; CHECK-NEXT: s_sub_i32 s0, 0, s4 ; CHECK-NEXT: v_rcp_iflag_f32_e32 v0, v0 @@ -372,7 +374,7 @@ define amdgpu_ps i64 @s_srem_i64(i64 inreg %num, i64 inreg %den) { ; CHECK-NEXT: v_subrev_i32_e32 v1, vcc, s4, v0 ; CHECK-NEXT: v_cmp_le_u32_e32 vcc, s4, v0 ; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; CHECK-NEXT: BB1_5: +; CHECK-NEXT: BB1_4: ; CHECK-NEXT: v_readfirstlane_b32 s0, v0 ; CHECK-NEXT: s_mov_b32 s1, s0 ; CHECK-NEXT: ; return to shader part epilog @@ -681,10 +683,12 @@ define <2 x i64> @v_srem_v2i64(<2 x i64> %num, <2 x i64> %den) { ; CGP-NEXT: v_mov_b32_e32 v8, v0 ; CGP-NEXT: v_or_b32_e32 v1, v9, v5 ; CGP-NEXT: v_mov_b32_e32 v0, 0 -; CGP-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; CGP-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; CGP-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, 1 +; CGP-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] ; CGP-NEXT: ; implicit-def: $vgpr0_vgpr1 -; CGP-NEXT: s_and_saveexec_b64 s[4:5], vcc -; CGP-NEXT: s_xor_b64 s[6:7], exec, s[4:5] +; CGP-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] +; CGP-NEXT: s_xor_b64 s[6:7], exec, s[6:7] ; CGP-NEXT: s_cbranch_execz BB2_2 ; CGP-NEXT: ; %bb.1: ; CGP-NEXT: v_ashrrev_i32_e32 v0, 31, v5 @@ -854,10 +858,12 @@ define <2 x i64> @v_srem_v2i64(<2 x i64> %num, <2 x i64> %den) { ; CGP-NEXT: s_or_b64 exec, exec, s[4:5] ; CGP-NEXT: v_or_b32_e32 v5, v3, v7 ; CGP-NEXT: v_mov_b32_e32 v4, 0 -; CGP-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; CGP-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[4:5] +; CGP-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, 1 +; CGP-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] ; CGP-NEXT: ; implicit-def: $vgpr4_vgpr5 -; CGP-NEXT: s_and_saveexec_b64 s[4:5], vcc -; CGP-NEXT: s_xor_b64 s[6:7], exec, s[4:5] +; CGP-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] +; CGP-NEXT: s_xor_b64 s[6:7], exec, s[6:7] ; CGP-NEXT: s_cbranch_execz BB2_6 ; CGP-NEXT: ; %bb.5: ; CGP-NEXT: v_ashrrev_i32_e32 v4, 31, v7 @@ -2471,10 +2477,12 @@ define i64 @v_srem_i64_pow2_shl_denom(i64 %x, i64 %y) { ; CHECK-NEXT: v_lshl_b64 v[4:5], s[4:5], v2 ; CHECK-NEXT: v_mov_b32_e32 v2, 0 ; CHECK-NEXT: v_or_b32_e32 v3, v1, v5 -; CHECK-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] +; CHECK-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; CHECK-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, 1 +; CHECK-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] ; CHECK-NEXT: ; implicit-def: $vgpr2_vgpr3 -; CHECK-NEXT: s_and_saveexec_b64 s[4:5], vcc -; CHECK-NEXT: s_xor_b64 s[6:7], exec, s[4:5] +; CHECK-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] +; CHECK-NEXT: s_xor_b64 s[6:7], exec, s[6:7] ; CHECK-NEXT: s_cbranch_execz BB7_2 ; CHECK-NEXT: ; %bb.1: ; CHECK-NEXT: v_ashrrev_i32_e32 v2, 31, v5 @@ -2950,11 +2958,13 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; CGP-NEXT: v_mov_b32_e32 v5, v0 ; CGP-NEXT: 
v_or_b32_e32 v1, v7, v11 ; CGP-NEXT: v_mov_b32_e32 v0, 0 -; CGP-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; CGP-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] ; CGP-NEXT: v_lshl_b64 v[8:9], s[4:5], v6 +; CGP-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, 1 +; CGP-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] ; CGP-NEXT: ; implicit-def: $vgpr0_vgpr1 -; CGP-NEXT: s_and_saveexec_b64 s[4:5], vcc -; CGP-NEXT: s_xor_b64 s[6:7], exec, s[4:5] +; CGP-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] +; CGP-NEXT: s_xor_b64 s[6:7], exec, s[6:7] ; CGP-NEXT: s_cbranch_execz BB8_2 ; CGP-NEXT: ; %bb.1: ; CGP-NEXT: v_ashrrev_i32_e32 v0, 31, v11 @@ -3124,10 +3134,12 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; CGP-NEXT: s_or_b64 exec, exec, s[4:5] ; CGP-NEXT: v_or_b32_e32 v5, v3, v9 ; CGP-NEXT: v_mov_b32_e32 v4, 0 -; CGP-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; CGP-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[4:5] +; CGP-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, 1 +; CGP-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] ; CGP-NEXT: ; implicit-def: $vgpr4_vgpr5 -; CGP-NEXT: s_and_saveexec_b64 s[4:5], vcc -; CGP-NEXT: s_xor_b64 s[6:7], exec, s[4:5] +; CGP-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] +; CGP-NEXT: s_xor_b64 s[6:7], exec, s[6:7] ; CGP-NEXT: s_cbranch_execz BB8_6 ; CGP-NEXT: ; %bb.5: ; CGP-NEXT: v_ashrrev_i32_e32 v4, 31, v9 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll index 559d116602e5..2faebe527d29 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll @@ -10,10 +10,12 @@ define i64 @v_udiv_i64(i64 %num, i64 %den) { ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: v_or_b32_e32 v5, v1, v3 ; CHECK-NEXT: v_mov_b32_e32 v4, 0 -; CHECK-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; CHECK-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[4:5] +; CHECK-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, 1 +; CHECK-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] ; CHECK-NEXT: ; implicit-def: $vgpr4_vgpr5 -; CHECK-NEXT: s_and_saveexec_b64 s[4:5], vcc -; CHECK-NEXT: s_xor_b64 s[6:7], exec, s[4:5] +; CHECK-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] +; CHECK-NEXT: s_xor_b64 s[6:7], exec, s[6:7] ; CHECK-NEXT: s_cbranch_execz BB0_2 ; CHECK-NEXT: ; %bb.1: ; CHECK-NEXT: v_cvt_f32_u32_e32 v4, v2 @@ -187,7 +189,10 @@ define amdgpu_ps i64 @s_udiv_i64(i64 inreg %num, i64 inreg %den) { ; CHECK-NEXT: s_mov_b32 s4, 0 ; CHECK-NEXT: s_mov_b32 s5, -1 ; CHECK-NEXT: s_and_b64 s[6:7], s[6:7], s[4:5] -; CHECK-NEXT: v_cmp_ne_u64_e64 vcc, s[6:7], 0 +; CHECK-NEXT: v_cmp_eq_u64_e64 s[6:7], s[6:7], 0 +; CHECK-NEXT: v_cmp_ne_u32_e64 s[8:9], 0, 1 +; CHECK-NEXT: s_xor_b64 vcc, s[6:7], s[8:9] +; CHECK-NEXT: ; implicit-def: $vgpr0_vgpr1 ; CHECK-NEXT: s_cbranch_vccz BB1_2 ; CHECK-NEXT: ; %bb.1: ; CHECK-NEXT: v_cvt_f32_u32_e32 v0, s2 @@ -319,14 +324,11 @@ define amdgpu_ps i64 @s_udiv_i64(i64 inreg %num, i64 inreg %den) { ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; CHECK-NEXT: s_mov_b32 s5, 0 -; CHECK-NEXT: s_branch BB1_3 -; CHECK-NEXT: BB1_2: -; CHECK-NEXT: ; implicit-def: $vgpr0_vgpr1 -; CHECK-NEXT: BB1_3: ; %Flow +; CHECK-NEXT: BB1_2: ; %Flow ; CHECK-NEXT: s_and_b32 s1, s5, 1 ; CHECK-NEXT: s_cmp_lg_u32 s1, 0 -; CHECK-NEXT: s_cbranch_scc0 BB1_5 -; CHECK-NEXT: ; %bb.4: +; CHECK-NEXT: s_cbranch_scc0 BB1_4 +; CHECK-NEXT: ; %bb.3: ; CHECK-NEXT: v_cvt_f32_u32_e32 v0, s2 ; CHECK-NEXT: s_sub_i32 s1, 0, s2 ; CHECK-NEXT: v_rcp_iflag_f32_e32 v0, v0 @@ -346,7 +348,7 @@ define amdgpu_ps i64 @s_udiv_i64(i64 inreg %num, i64 inreg %den) { ; CHECK-NEXT: 
v_add_i32_e32 v2, vcc, 1, v0 ; CHECK-NEXT: v_cmp_le_u32_e32 vcc, s2, v1 ; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; CHECK-NEXT: BB1_5: +; CHECK-NEXT: BB1_4: ; CHECK-NEXT: v_readfirstlane_b32 s0, v0 ; CHECK-NEXT: s_mov_b32 s1, s0 ; CHECK-NEXT: ; return to shader part epilog @@ -629,10 +631,12 @@ define <2 x i64> @v_udiv_v2i64(<2 x i64> %num, <2 x i64> %den) { ; CGP-NEXT: v_mov_b32_e32 v9, v1 ; CGP-NEXT: v_or_b32_e32 v1, v9, v5 ; CGP-NEXT: v_mov_b32_e32 v0, 0 -; CGP-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; CGP-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; CGP-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, 1 +; CGP-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] ; CGP-NEXT: ; implicit-def: $vgpr0_vgpr1 -; CGP-NEXT: s_and_saveexec_b64 s[4:5], vcc -; CGP-NEXT: s_xor_b64 s[6:7], exec, s[4:5] +; CGP-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] +; CGP-NEXT: s_xor_b64 s[6:7], exec, s[6:7] ; CGP-NEXT: s_cbranch_execz BB2_2 ; CGP-NEXT: ; %bb.1: ; CGP-NEXT: v_cvt_f32_u32_e32 v0, v4 @@ -791,10 +795,12 @@ define <2 x i64> @v_udiv_v2i64(<2 x i64> %num, <2 x i64> %den) { ; CGP-NEXT: s_or_b64 exec, exec, s[6:7] ; CGP-NEXT: v_or_b32_e32 v5, v3, v7 ; CGP-NEXT: v_mov_b32_e32 v4, 0 -; CGP-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; CGP-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[4:5] +; CGP-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, 1 +; CGP-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] ; CGP-NEXT: ; implicit-def: $vgpr4_vgpr5 -; CGP-NEXT: s_and_saveexec_b64 s[4:5], vcc -; CGP-NEXT: s_xor_b64 s[6:7], exec, s[4:5] +; CGP-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] +; CGP-NEXT: s_xor_b64 s[6:7], exec, s[6:7] ; CGP-NEXT: s_cbranch_execz BB2_6 ; CGP-NEXT: ; %bb.5: ; CGP-NEXT: v_cvt_f32_u32_e32 v4, v6 @@ -2286,13 +2292,15 @@ define i64 @v_udiv_i64_pow2_shl_denom(i64 %x, i64 %y) { ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: s_movk_i32 s4, 0x1000 ; CHECK-NEXT: s_mov_b32 s5, 0 +; CHECK-NEXT: v_mov_b32_e32 v6, 0 ; CHECK-NEXT: v_lshl_b64 v[4:5], s[4:5], v2 -; CHECK-NEXT: v_or_b32_e32 v3, v1, v5 -; CHECK-NEXT: v_mov_b32_e32 v2, 0 -; CHECK-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] +; CHECK-NEXT: v_or_b32_e32 v7, v1, v5 +; CHECK-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] +; CHECK-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, 1 +; CHECK-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] ; CHECK-NEXT: ; implicit-def: $vgpr2_vgpr3 -; CHECK-NEXT: s_and_saveexec_b64 s[4:5], vcc -; CHECK-NEXT: s_xor_b64 s[6:7], exec, s[4:5] +; CHECK-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] +; CHECK-NEXT: s_xor_b64 s[6:7], exec, s[6:7] ; CHECK-NEXT: s_cbranch_execz BB7_2 ; CHECK-NEXT: ; %bb.1: ; CHECK-NEXT: v_cvt_f32_u32_e32 v2, v4 @@ -2728,14 +2736,16 @@ define <2 x i64> @v_udiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; CGP-NEXT: v_mov_b32_e32 v7, v1 ; CGP-NEXT: s_movk_i32 s4, 0x1000 ; CGP-NEXT: s_mov_b32 s5, 0 +; CGP-NEXT: v_mov_b32_e32 v0, 0 ; CGP-NEXT: v_lshl_b64 v[10:11], s[4:5], v4 ; CGP-NEXT: v_lshl_b64 v[8:9], s[4:5], v6 ; CGP-NEXT: v_or_b32_e32 v1, v7, v11 -; CGP-NEXT: v_mov_b32_e32 v0, 0 -; CGP-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; CGP-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; CGP-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, 1 +; CGP-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] ; CGP-NEXT: ; implicit-def: $vgpr0_vgpr1 -; CGP-NEXT: s_and_saveexec_b64 s[4:5], vcc -; CGP-NEXT: s_xor_b64 s[6:7], exec, s[4:5] +; CGP-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] +; CGP-NEXT: s_xor_b64 s[6:7], exec, s[6:7] ; CGP-NEXT: s_cbranch_execz BB8_2 ; CGP-NEXT: ; %bb.1: ; CGP-NEXT: v_cvt_f32_u32_e32 v0, v10 @@ -2894,10 +2904,12 @@ define <2 x i64> @v_udiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; CGP-NEXT: s_or_b64 exec, exec, 
s[6:7] ; CGP-NEXT: v_or_b32_e32 v5, v3, v9 ; CGP-NEXT: v_mov_b32_e32 v4, 0 -; CGP-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; CGP-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[4:5] +; CGP-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, 1 +; CGP-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] ; CGP-NEXT: ; implicit-def: $vgpr4_vgpr5 -; CGP-NEXT: s_and_saveexec_b64 s[4:5], vcc -; CGP-NEXT: s_xor_b64 s[6:7], exec, s[4:5] +; CGP-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] +; CGP-NEXT: s_xor_b64 s[6:7], exec, s[6:7] ; CGP-NEXT: s_cbranch_execz BB8_6 ; CGP-NEXT: ; %bb.5: ; CGP-NEXT: v_cvt_f32_u32_e32 v4, v8 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll index 92f93185530f..5c7504088408 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll @@ -10,10 +10,12 @@ define i64 @v_urem_i64(i64 %num, i64 %den) { ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: v_or_b32_e32 v5, v1, v3 ; CHECK-NEXT: v_mov_b32_e32 v4, 0 -; CHECK-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; CHECK-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[4:5] +; CHECK-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, 1 +; CHECK-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] ; CHECK-NEXT: ; implicit-def: $vgpr4_vgpr5 -; CHECK-NEXT: s_and_saveexec_b64 s[4:5], vcc -; CHECK-NEXT: s_xor_b64 s[6:7], exec, s[4:5] +; CHECK-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] +; CHECK-NEXT: s_xor_b64 s[6:7], exec, s[6:7] ; CHECK-NEXT: s_cbranch_execz BB0_2 ; CHECK-NEXT: ; %bb.1: ; CHECK-NEXT: v_cvt_f32_u32_e32 v4, v2 @@ -184,7 +186,10 @@ define amdgpu_ps i64 @s_urem_i64(i64 inreg %num, i64 inreg %den) { ; CHECK-NEXT: s_mov_b32 s4, 0 ; CHECK-NEXT: s_mov_b32 s5, -1 ; CHECK-NEXT: s_and_b64 s[6:7], s[6:7], s[4:5] -; CHECK-NEXT: v_cmp_ne_u64_e64 vcc, s[6:7], 0 +; CHECK-NEXT: v_cmp_eq_u64_e64 s[6:7], s[6:7], 0 +; CHECK-NEXT: v_cmp_ne_u32_e64 s[8:9], 0, 1 +; CHECK-NEXT: s_xor_b64 vcc, s[6:7], s[8:9] +; CHECK-NEXT: ; implicit-def: $vgpr0_vgpr1 ; CHECK-NEXT: s_cbranch_vccz BB1_2 ; CHECK-NEXT: ; %bb.1: ; CHECK-NEXT: v_cvt_f32_u32_e32 v0, s2 @@ -315,14 +320,11 @@ define amdgpu_ps i64 @s_urem_i64(i64 inreg %num, i64 inreg %den) { ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 ; CHECK-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; CHECK-NEXT: s_mov_b32 s5, 0 -; CHECK-NEXT: s_branch BB1_3 -; CHECK-NEXT: BB1_2: -; CHECK-NEXT: ; implicit-def: $vgpr0_vgpr1 -; CHECK-NEXT: BB1_3: ; %Flow +; CHECK-NEXT: BB1_2: ; %Flow ; CHECK-NEXT: s_and_b32 s1, s5, 1 ; CHECK-NEXT: s_cmp_lg_u32 s1, 0 -; CHECK-NEXT: s_cbranch_scc0 BB1_5 -; CHECK-NEXT: ; %bb.4: +; CHECK-NEXT: s_cbranch_scc0 BB1_4 +; CHECK-NEXT: ; %bb.3: ; CHECK-NEXT: v_cvt_f32_u32_e32 v0, s2 ; CHECK-NEXT: s_sub_i32 s1, 0, s2 ; CHECK-NEXT: v_rcp_iflag_f32_e32 v0, v0 @@ -340,7 +342,7 @@ define amdgpu_ps i64 @s_urem_i64(i64 inreg %num, i64 inreg %den) { ; CHECK-NEXT: v_subrev_i32_e32 v1, vcc, s2, v0 ; CHECK-NEXT: v_cmp_le_u32_e32 vcc, s2, v0 ; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; CHECK-NEXT: BB1_5: +; CHECK-NEXT: BB1_4: ; CHECK-NEXT: v_readfirstlane_b32 s0, v0 ; CHECK-NEXT: s_mov_b32 s1, s0 ; CHECK-NEXT: ; return to shader part epilog @@ -621,10 +623,12 @@ define <2 x i64> @v_urem_v2i64(<2 x i64> %num, <2 x i64> %den) { ; CGP-NEXT: v_mov_b32_e32 v9, v1 ; CGP-NEXT: v_or_b32_e32 v1, v9, v5 ; CGP-NEXT: v_mov_b32_e32 v0, 0 -; CGP-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; CGP-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; CGP-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, 1 +; CGP-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] ; CGP-NEXT: ; implicit-def: $vgpr0_vgpr1 -; CGP-NEXT: s_and_saveexec_b64 s[4:5], vcc -; 
CGP-NEXT: s_xor_b64 s[6:7], exec, s[4:5] +; CGP-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] +; CGP-NEXT: s_xor_b64 s[6:7], exec, s[6:7] ; CGP-NEXT: s_cbranch_execz BB2_2 ; CGP-NEXT: ; %bb.1: ; CGP-NEXT: v_cvt_f32_u32_e32 v0, v4 @@ -780,10 +784,12 @@ define <2 x i64> @v_urem_v2i64(<2 x i64> %num, <2 x i64> %den) { ; CGP-NEXT: s_or_b64 exec, exec, s[4:5] ; CGP-NEXT: v_or_b32_e32 v5, v3, v7 ; CGP-NEXT: v_mov_b32_e32 v4, 0 -; CGP-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; CGP-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[4:5] +; CGP-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, 1 +; CGP-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] ; CGP-NEXT: ; implicit-def: $vgpr4_vgpr5 -; CGP-NEXT: s_and_saveexec_b64 s[4:5], vcc -; CGP-NEXT: s_xor_b64 s[6:7], exec, s[4:5] +; CGP-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] +; CGP-NEXT: s_xor_b64 s[6:7], exec, s[6:7] ; CGP-NEXT: s_cbranch_execz BB2_6 ; CGP-NEXT: ; %bb.5: ; CGP-NEXT: v_cvt_f32_u32_e32 v4, v6 @@ -2252,13 +2258,15 @@ define i64 @v_urem_i64_pow2_shl_denom(i64 %x, i64 %y) { ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: s_movk_i32 s4, 0x1000 ; CHECK-NEXT: s_mov_b32 s5, 0 +; CHECK-NEXT: v_mov_b32_e32 v6, 0 ; CHECK-NEXT: v_lshl_b64 v[4:5], s[4:5], v2 -; CHECK-NEXT: v_or_b32_e32 v3, v1, v5 -; CHECK-NEXT: v_mov_b32_e32 v2, 0 -; CHECK-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] +; CHECK-NEXT: v_or_b32_e32 v7, v1, v5 +; CHECK-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] +; CHECK-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, 1 +; CHECK-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] ; CHECK-NEXT: ; implicit-def: $vgpr2_vgpr3 -; CHECK-NEXT: s_and_saveexec_b64 s[4:5], vcc -; CHECK-NEXT: s_xor_b64 s[6:7], exec, s[4:5] +; CHECK-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] +; CHECK-NEXT: s_xor_b64 s[6:7], exec, s[6:7] ; CHECK-NEXT: s_cbranch_execz BB7_2 ; CHECK-NEXT: ; %bb.1: ; CHECK-NEXT: v_cvt_f32_u32_e32 v2, v4 @@ -2689,14 +2697,16 @@ define <2 x i64> @v_urem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; CGP-NEXT: v_mov_b32_e32 v7, v1 ; CGP-NEXT: s_movk_i32 s4, 0x1000 ; CGP-NEXT: s_mov_b32 s5, 0 +; CGP-NEXT: v_mov_b32_e32 v0, 0 ; CGP-NEXT: v_lshl_b64 v[10:11], s[4:5], v4 ; CGP-NEXT: v_lshl_b64 v[8:9], s[4:5], v6 ; CGP-NEXT: v_or_b32_e32 v1, v7, v11 -; CGP-NEXT: v_mov_b32_e32 v0, 0 -; CGP-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; CGP-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; CGP-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, 1 +; CGP-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] ; CGP-NEXT: ; implicit-def: $vgpr0_vgpr1 -; CGP-NEXT: s_and_saveexec_b64 s[4:5], vcc -; CGP-NEXT: s_xor_b64 s[6:7], exec, s[4:5] +; CGP-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] +; CGP-NEXT: s_xor_b64 s[6:7], exec, s[6:7] ; CGP-NEXT: s_cbranch_execz BB8_2 ; CGP-NEXT: ; %bb.1: ; CGP-NEXT: v_cvt_f32_u32_e32 v0, v10 @@ -2852,10 +2862,12 @@ define <2 x i64> @v_urem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; CGP-NEXT: s_or_b64 exec, exec, s[4:5] ; CGP-NEXT: v_or_b32_e32 v5, v3, v9 ; CGP-NEXT: v_mov_b32_e32 v4, 0 -; CGP-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; CGP-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[4:5] +; CGP-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, 1 +; CGP-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] ; CGP-NEXT: ; implicit-def: $vgpr4_vgpr5 -; CGP-NEXT: s_and_saveexec_b64 s[4:5], vcc -; CGP-NEXT: s_xor_b64 s[6:7], exec, s[4:5] +; CGP-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] +; CGP-NEXT: s_xor_b64 s[6:7], exec, s[6:7] ; CGP-NEXT: s_cbranch_execz BB8_6 ; CGP-NEXT: ; %bb.5: ; CGP-NEXT: v_cvt_f32_u32_e32 v4, v8
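For readers skimming the revert: the combine being removed (not_cmp_fold) rewrote a G_XOR of a compare result with its "true" constant into the same compare with the inverse predicate. A minimal before/after sketch, adapted from the deleted prelegalizercombiner-invert-cmp.mir test above (register numbers are illustrative):

  ; Before the (now reverted) not_cmp_fold combine runs:
  ;   %2 is G_CONSTANT i1 1, i.e. the "true" value for s1.
  %3:_(s1) = G_ICMP intpred(sgt), %0(s64), %1
  %4:_(s1) = G_XOR %3, %2

  ; After the combine: the G_XOR is folded away by inverting the predicate.
  %3:_(s1) = G_ICMP intpred(sle), %0(s64), %1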