[TargetLowering] Simplify expansion of S{ADD,SUB}O

ISD::SADDO uses the suggested sequence described in section §2.4 of
the RISC-V Spec v2.2. ISD::SSUBO uses the dual approach, but checking
whether the RHS is (non-zero) positive instead.

Differential Revision: https://reviews.llvm.org/D47927

llvm-svn: 373187
This commit is contained in:
Roger Ferrer Ibanez 2019-09-30 07:58:50 +00:00
parent b3438f1cc0
commit 5a2a14db0b
16 changed files with 3475 additions and 6223 deletions

View File

@ -6907,24 +6907,19 @@ void TargetLowering::expandSADDSUBO(
SDValue Zero = DAG.getConstant(0, dl, LHS.getValueType());
// LHSSign -> LHS >= 0
// RHSSign -> RHS >= 0
// SumSign -> Result >= 0
//
// Add:
// Overflow -> (LHSSign == RHSSign) && (LHSSign != SumSign)
// Sub:
// Overflow -> (LHSSign != RHSSign) && (LHSSign != SumSign)
SDValue LHSSign = DAG.getSetCC(dl, OType, LHS, Zero, ISD::SETGE);
SDValue RHSSign = DAG.getSetCC(dl, OType, RHS, Zero, ISD::SETGE);
SDValue SignsMatch = DAG.getSetCC(dl, OType, LHSSign, RHSSign,
IsAdd ? ISD::SETEQ : ISD::SETNE);
// For an addition, the result should be less than one of the operands (LHS)
// if and only if the other operand (RHS) is negative, otherwise there will
// be overflow.
// For a subtraction, the result should be less than one of the operands
// (LHS) if and only if the other operand (RHS) is (non-zero) positive,
// otherwise there will be overflow.
SDValue ResultLowerThanLHS = DAG.getSetCC(dl, OType, Result, LHS, ISD::SETLT);
SDValue ConditionRHS =
DAG.getSetCC(dl, OType, RHS, Zero, IsAdd ? ISD::SETLT : ISD::SETGT);
SDValue SumSign = DAG.getSetCC(dl, OType, Result, Zero, ISD::SETGE);
SDValue SumSignNE = DAG.getSetCC(dl, OType, LHSSign, SumSign, ISD::SETNE);
SDValue Cmp = DAG.getNode(ISD::AND, dl, OType, SignsMatch, SumSignNE);
Overflow = DAG.getBoolExtOrTrunc(Cmp, dl, ResultType, ResultType);
Overflow = DAG.getBoolExtOrTrunc(
DAG.getNode(ISD::XOR, dl, OType, ConditionRHS, ResultLowerThanLHS), dl,
ResultType, ResultType);
}
bool TargetLowering::expandMULO(SDNode *Node, SDValue &Result,

View File

@ -54,17 +54,13 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind {
; CHECK-LABEL: vec:
; CHECK: // %bb.0:
; CHECK-NEXT: add v2.4s, v0.4s, v1.4s
; CHECK-NEXT: cmge v1.4s, v1.4s, #0
; CHECK-NEXT: cmge v0.4s, v0.4s, #0
; CHECK-NEXT: cmge v5.4s, v2.4s, #0
; CHECK-NEXT: cmlt v4.4s, v2.4s, #0
; CHECK-NEXT: cmeq v1.4s, v0.4s, v1.4s
; CHECK-NEXT: cmeq v0.4s, v0.4s, v5.4s
; CHECK-NEXT: mvni v3.4s, #128, lsl #24
; CHECK-NEXT: cmlt v1.4s, v1.4s, #0
; CHECK-NEXT: cmgt v0.4s, v0.4s, v2.4s
; CHECK-NEXT: mvn v5.16b, v4.16b
; CHECK-NEXT: mvn v0.16b, v0.16b
; CHECK-NEXT: bsl v3.16b, v4.16b, v5.16b
; CHECK-NEXT: and v0.16b, v1.16b, v0.16b
; CHECK-NEXT: eor v0.16b, v1.16b, v0.16b
; CHECK-NEXT: bsl v0.16b, v3.16b, v2.16b
; CHECK-NEXT: ret
%tmp = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> %x, <4 x i32> %y);

View File

@ -36,17 +36,13 @@ define <16 x i8> @v16i8(<16 x i8> %x, <16 x i8> %y) nounwind {
; CHECK-LABEL: v16i8:
; CHECK: // %bb.0:
; CHECK-NEXT: add v2.16b, v0.16b, v1.16b
; CHECK-NEXT: cmge v1.16b, v1.16b, #0
; CHECK-NEXT: cmge v0.16b, v0.16b, #0
; CHECK-NEXT: cmge v5.16b, v2.16b, #0
; CHECK-NEXT: cmlt v4.16b, v2.16b, #0
; CHECK-NEXT: cmeq v1.16b, v0.16b, v1.16b
; CHECK-NEXT: cmeq v0.16b, v0.16b, v5.16b
; CHECK-NEXT: movi v3.16b, #127
; CHECK-NEXT: cmlt v1.16b, v1.16b, #0
; CHECK-NEXT: cmgt v0.16b, v0.16b, v2.16b
; CHECK-NEXT: mvn v5.16b, v4.16b
; CHECK-NEXT: mvn v0.16b, v0.16b
; CHECK-NEXT: bsl v3.16b, v4.16b, v5.16b
; CHECK-NEXT: and v0.16b, v1.16b, v0.16b
; CHECK-NEXT: eor v0.16b, v1.16b, v0.16b
; CHECK-NEXT: bsl v0.16b, v3.16b, v2.16b
; CHECK-NEXT: ret
%z = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> %x, <16 x i8> %y)
@ -57,29 +53,21 @@ define <32 x i8> @v32i8(<32 x i8> %x, <32 x i8> %y) nounwind {
; CHECK-LABEL: v32i8:
; CHECK: // %bb.0:
; CHECK-NEXT: add v4.16b, v0.16b, v2.16b
; CHECK-NEXT: cmlt v16.16b, v4.16b, #0
; CHECK-NEXT: cmlt v7.16b, v4.16b, #0
; CHECK-NEXT: movi v6.16b, #127
; CHECK-NEXT: mvn v16.16b, v7.16b
; CHECK-NEXT: bsl v6.16b, v7.16b, v16.16b
; CHECK-NEXT: add v7.16b, v1.16b, v3.16b
; CHECK-NEXT: mvn v17.16b, v16.16b
; CHECK-NEXT: bsl v6.16b, v16.16b, v17.16b
; CHECK-NEXT: cmlt v2.16b, v2.16b, #0
; CHECK-NEXT: cmgt v0.16b, v0.16b, v4.16b
; CHECK-NEXT: cmlt v16.16b, v7.16b, #0
; CHECK-NEXT: movi v5.16b, #127
; CHECK-NEXT: mvn v17.16b, v16.16b
; CHECK-NEXT: bsl v5.16b, v16.16b, v17.16b
; CHECK-NEXT: cmge v2.16b, v2.16b, #0
; CHECK-NEXT: cmge v0.16b, v0.16b, #0
; CHECK-NEXT: cmge v16.16b, v4.16b, #0
; CHECK-NEXT: cmge v3.16b, v3.16b, #0
; CHECK-NEXT: cmge v1.16b, v1.16b, #0
; CHECK-NEXT: cmeq v2.16b, v0.16b, v2.16b
; CHECK-NEXT: cmeq v0.16b, v0.16b, v16.16b
; CHECK-NEXT: cmge v16.16b, v7.16b, #0
; CHECK-NEXT: cmeq v3.16b, v1.16b, v3.16b
; CHECK-NEXT: cmeq v1.16b, v1.16b, v16.16b
; CHECK-NEXT: mvn v0.16b, v0.16b
; CHECK-NEXT: mvn v1.16b, v1.16b
; CHECK-NEXT: and v0.16b, v2.16b, v0.16b
; CHECK-NEXT: and v1.16b, v3.16b, v1.16b
; CHECK-NEXT: cmlt v3.16b, v3.16b, #0
; CHECK-NEXT: cmgt v1.16b, v1.16b, v7.16b
; CHECK-NEXT: eor v0.16b, v2.16b, v0.16b
; CHECK-NEXT: mvn v2.16b, v16.16b
; CHECK-NEXT: eor v1.16b, v3.16b, v1.16b
; CHECK-NEXT: bsl v5.16b, v16.16b, v2.16b
; CHECK-NEXT: bsl v0.16b, v6.16b, v4.16b
; CHECK-NEXT: bsl v1.16b, v5.16b, v7.16b
; CHECK-NEXT: ret
@ -102,42 +90,26 @@ define <64 x i8> @v64i8(<64 x i8> %x, <64 x i8> %y) nounwind {
; CHECK-NEXT: mvn v25.16b, v24.16b
; CHECK-NEXT: bsl v20.16b, v24.16b, v25.16b
; CHECK-NEXT: cmlt v24.16b, v21.16b, #0
; CHECK-NEXT: cmlt v4.16b, v4.16b, #0
; CHECK-NEXT: cmgt v0.16b, v0.16b, v16.16b
; CHECK-NEXT: movi v22.16b, #127
; CHECK-NEXT: add v23.16b, v3.16b, v7.16b
; CHECK-NEXT: mvn v25.16b, v24.16b
; CHECK-NEXT: eor v0.16b, v4.16b, v0.16b
; CHECK-NEXT: cmlt v4.16b, v5.16b, #0
; CHECK-NEXT: cmgt v1.16b, v1.16b, v19.16b
; CHECK-NEXT: bsl v22.16b, v24.16b, v25.16b
; CHECK-NEXT: cmlt v24.16b, v23.16b, #0
; CHECK-NEXT: eor v1.16b, v4.16b, v1.16b
; CHECK-NEXT: cmlt v4.16b, v6.16b, #0
; CHECK-NEXT: cmgt v2.16b, v2.16b, v21.16b
; CHECK-NEXT: movi v17.16b, #127
; CHECK-NEXT: mvn v25.16b, v24.16b
; CHECK-NEXT: eor v2.16b, v4.16b, v2.16b
; CHECK-NEXT: cmlt v4.16b, v7.16b, #0
; CHECK-NEXT: cmgt v3.16b, v3.16b, v23.16b
; CHECK-NEXT: bsl v17.16b, v24.16b, v25.16b
; CHECK-NEXT: cmge v4.16b, v4.16b, #0
; CHECK-NEXT: cmge v0.16b, v0.16b, #0
; CHECK-NEXT: cmge v24.16b, v16.16b, #0
; CHECK-NEXT: cmge v5.16b, v5.16b, #0
; CHECK-NEXT: cmge v1.16b, v1.16b, #0
; CHECK-NEXT: cmeq v4.16b, v0.16b, v4.16b
; CHECK-NEXT: cmeq v0.16b, v0.16b, v24.16b
; CHECK-NEXT: cmge v24.16b, v19.16b, #0
; CHECK-NEXT: cmge v6.16b, v6.16b, #0
; CHECK-NEXT: cmge v2.16b, v2.16b, #0
; CHECK-NEXT: cmeq v5.16b, v1.16b, v5.16b
; CHECK-NEXT: cmeq v1.16b, v1.16b, v24.16b
; CHECK-NEXT: cmge v24.16b, v21.16b, #0
; CHECK-NEXT: cmge v7.16b, v7.16b, #0
; CHECK-NEXT: cmge v3.16b, v3.16b, #0
; CHECK-NEXT: cmeq v6.16b, v2.16b, v6.16b
; CHECK-NEXT: cmeq v2.16b, v2.16b, v24.16b
; CHECK-NEXT: cmge v24.16b, v23.16b, #0
; CHECK-NEXT: cmeq v7.16b, v3.16b, v7.16b
; CHECK-NEXT: cmeq v3.16b, v3.16b, v24.16b
; CHECK-NEXT: mvn v0.16b, v0.16b
; CHECK-NEXT: mvn v1.16b, v1.16b
; CHECK-NEXT: mvn v2.16b, v2.16b
; CHECK-NEXT: mvn v3.16b, v3.16b
; CHECK-NEXT: and v0.16b, v4.16b, v0.16b
; CHECK-NEXT: and v1.16b, v5.16b, v1.16b
; CHECK-NEXT: and v2.16b, v6.16b, v2.16b
; CHECK-NEXT: and v3.16b, v7.16b, v3.16b
; CHECK-NEXT: eor v3.16b, v4.16b, v3.16b
; CHECK-NEXT: bsl v0.16b, v18.16b, v16.16b
; CHECK-NEXT: bsl v1.16b, v20.16b, v19.16b
; CHECK-NEXT: bsl v2.16b, v22.16b, v21.16b
@ -151,17 +123,13 @@ define <8 x i16> @v8i16(<8 x i16> %x, <8 x i16> %y) nounwind {
; CHECK-LABEL: v8i16:
; CHECK: // %bb.0:
; CHECK-NEXT: add v2.8h, v0.8h, v1.8h
; CHECK-NEXT: cmge v1.8h, v1.8h, #0
; CHECK-NEXT: cmge v0.8h, v0.8h, #0
; CHECK-NEXT: cmge v5.8h, v2.8h, #0
; CHECK-NEXT: cmlt v4.8h, v2.8h, #0
; CHECK-NEXT: cmeq v1.8h, v0.8h, v1.8h
; CHECK-NEXT: cmeq v0.8h, v0.8h, v5.8h
; CHECK-NEXT: mvni v3.8h, #128, lsl #8
; CHECK-NEXT: cmlt v1.8h, v1.8h, #0
; CHECK-NEXT: cmgt v0.8h, v0.8h, v2.8h
; CHECK-NEXT: mvn v5.16b, v4.16b
; CHECK-NEXT: mvn v0.16b, v0.16b
; CHECK-NEXT: bsl v3.16b, v4.16b, v5.16b
; CHECK-NEXT: and v0.16b, v1.16b, v0.16b
; CHECK-NEXT: eor v0.16b, v1.16b, v0.16b
; CHECK-NEXT: bsl v0.16b, v3.16b, v2.16b
; CHECK-NEXT: ret
%z = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> %x, <8 x i16> %y)
@ -172,29 +140,21 @@ define <16 x i16> @v16i16(<16 x i16> %x, <16 x i16> %y) nounwind {
; CHECK-LABEL: v16i16:
; CHECK: // %bb.0:
; CHECK-NEXT: add v4.8h, v0.8h, v2.8h
; CHECK-NEXT: cmlt v16.8h, v4.8h, #0
; CHECK-NEXT: cmlt v7.8h, v4.8h, #0
; CHECK-NEXT: mvni v6.8h, #128, lsl #8
; CHECK-NEXT: mvn v16.16b, v7.16b
; CHECK-NEXT: bsl v6.16b, v7.16b, v16.16b
; CHECK-NEXT: add v7.8h, v1.8h, v3.8h
; CHECK-NEXT: mvn v17.16b, v16.16b
; CHECK-NEXT: bsl v6.16b, v16.16b, v17.16b
; CHECK-NEXT: cmlt v2.8h, v2.8h, #0
; CHECK-NEXT: cmgt v0.8h, v0.8h, v4.8h
; CHECK-NEXT: cmlt v16.8h, v7.8h, #0
; CHECK-NEXT: mvni v5.8h, #128, lsl #8
; CHECK-NEXT: mvn v17.16b, v16.16b
; CHECK-NEXT: bsl v5.16b, v16.16b, v17.16b
; CHECK-NEXT: cmge v2.8h, v2.8h, #0
; CHECK-NEXT: cmge v0.8h, v0.8h, #0
; CHECK-NEXT: cmge v16.8h, v4.8h, #0
; CHECK-NEXT: cmge v3.8h, v3.8h, #0
; CHECK-NEXT: cmge v1.8h, v1.8h, #0
; CHECK-NEXT: cmeq v2.8h, v0.8h, v2.8h
; CHECK-NEXT: cmeq v0.8h, v0.8h, v16.8h
; CHECK-NEXT: cmge v16.8h, v7.8h, #0
; CHECK-NEXT: cmeq v3.8h, v1.8h, v3.8h
; CHECK-NEXT: cmeq v1.8h, v1.8h, v16.8h
; CHECK-NEXT: mvn v0.16b, v0.16b
; CHECK-NEXT: mvn v1.16b, v1.16b
; CHECK-NEXT: and v0.16b, v2.16b, v0.16b
; CHECK-NEXT: and v1.16b, v3.16b, v1.16b
; CHECK-NEXT: cmlt v3.8h, v3.8h, #0
; CHECK-NEXT: cmgt v1.8h, v1.8h, v7.8h
; CHECK-NEXT: eor v0.16b, v2.16b, v0.16b
; CHECK-NEXT: mvn v2.16b, v16.16b
; CHECK-NEXT: eor v1.16b, v3.16b, v1.16b
; CHECK-NEXT: bsl v5.16b, v16.16b, v2.16b
; CHECK-NEXT: bsl v0.16b, v6.16b, v4.16b
; CHECK-NEXT: bsl v1.16b, v5.16b, v7.16b
; CHECK-NEXT: ret
@ -217,42 +177,26 @@ define <32 x i16> @v32i16(<32 x i16> %x, <32 x i16> %y) nounwind {
; CHECK-NEXT: mvn v25.16b, v24.16b
; CHECK-NEXT: bsl v20.16b, v24.16b, v25.16b
; CHECK-NEXT: cmlt v24.8h, v21.8h, #0
; CHECK-NEXT: cmlt v4.8h, v4.8h, #0
; CHECK-NEXT: cmgt v0.8h, v0.8h, v16.8h
; CHECK-NEXT: mvni v22.8h, #128, lsl #8
; CHECK-NEXT: add v23.8h, v3.8h, v7.8h
; CHECK-NEXT: mvn v25.16b, v24.16b
; CHECK-NEXT: eor v0.16b, v4.16b, v0.16b
; CHECK-NEXT: cmlt v4.8h, v5.8h, #0
; CHECK-NEXT: cmgt v1.8h, v1.8h, v19.8h
; CHECK-NEXT: bsl v22.16b, v24.16b, v25.16b
; CHECK-NEXT: cmlt v24.8h, v23.8h, #0
; CHECK-NEXT: eor v1.16b, v4.16b, v1.16b
; CHECK-NEXT: cmlt v4.8h, v6.8h, #0
; CHECK-NEXT: cmgt v2.8h, v2.8h, v21.8h
; CHECK-NEXT: mvni v17.8h, #128, lsl #8
; CHECK-NEXT: mvn v25.16b, v24.16b
; CHECK-NEXT: eor v2.16b, v4.16b, v2.16b
; CHECK-NEXT: cmlt v4.8h, v7.8h, #0
; CHECK-NEXT: cmgt v3.8h, v3.8h, v23.8h
; CHECK-NEXT: bsl v17.16b, v24.16b, v25.16b
; CHECK-NEXT: cmge v4.8h, v4.8h, #0
; CHECK-NEXT: cmge v0.8h, v0.8h, #0
; CHECK-NEXT: cmge v24.8h, v16.8h, #0
; CHECK-NEXT: cmge v5.8h, v5.8h, #0
; CHECK-NEXT: cmge v1.8h, v1.8h, #0
; CHECK-NEXT: cmeq v4.8h, v0.8h, v4.8h
; CHECK-NEXT: cmeq v0.8h, v0.8h, v24.8h
; CHECK-NEXT: cmge v24.8h, v19.8h, #0
; CHECK-NEXT: cmge v6.8h, v6.8h, #0
; CHECK-NEXT: cmge v2.8h, v2.8h, #0
; CHECK-NEXT: cmeq v5.8h, v1.8h, v5.8h
; CHECK-NEXT: cmeq v1.8h, v1.8h, v24.8h
; CHECK-NEXT: cmge v24.8h, v21.8h, #0
; CHECK-NEXT: cmge v7.8h, v7.8h, #0
; CHECK-NEXT: cmge v3.8h, v3.8h, #0
; CHECK-NEXT: cmeq v6.8h, v2.8h, v6.8h
; CHECK-NEXT: cmeq v2.8h, v2.8h, v24.8h
; CHECK-NEXT: cmge v24.8h, v23.8h, #0
; CHECK-NEXT: cmeq v7.8h, v3.8h, v7.8h
; CHECK-NEXT: cmeq v3.8h, v3.8h, v24.8h
; CHECK-NEXT: mvn v0.16b, v0.16b
; CHECK-NEXT: mvn v1.16b, v1.16b
; CHECK-NEXT: mvn v2.16b, v2.16b
; CHECK-NEXT: mvn v3.16b, v3.16b
; CHECK-NEXT: and v0.16b, v4.16b, v0.16b
; CHECK-NEXT: and v1.16b, v5.16b, v1.16b
; CHECK-NEXT: and v2.16b, v6.16b, v2.16b
; CHECK-NEXT: and v3.16b, v7.16b, v3.16b
; CHECK-NEXT: eor v3.16b, v4.16b, v3.16b
; CHECK-NEXT: bsl v0.16b, v18.16b, v16.16b
; CHECK-NEXT: bsl v1.16b, v20.16b, v19.16b
; CHECK-NEXT: bsl v2.16b, v22.16b, v21.16b
@ -269,16 +213,12 @@ define void @v8i8(<8 x i8>* %px, <8 x i8>* %py, <8 x i8>* %pz) nounwind {
; CHECK-NEXT: ldr d1, [x1]
; CHECK-NEXT: movi v2.8b, #127
; CHECK-NEXT: add v3.8b, v0.8b, v1.8b
; CHECK-NEXT: cmge v1.8b, v1.8b, #0
; CHECK-NEXT: cmge v0.8b, v0.8b, #0
; CHECK-NEXT: cmge v5.8b, v3.8b, #0
; CHECK-NEXT: cmlt v4.8b, v3.8b, #0
; CHECK-NEXT: cmeq v1.8b, v0.8b, v1.8b
; CHECK-NEXT: cmeq v0.8b, v0.8b, v5.8b
; CHECK-NEXT: cmlt v1.8b, v1.8b, #0
; CHECK-NEXT: cmgt v0.8b, v0.8b, v3.8b
; CHECK-NEXT: mvn v5.8b, v4.8b
; CHECK-NEXT: mvn v0.8b, v0.8b
; CHECK-NEXT: bsl v2.8b, v4.8b, v5.8b
; CHECK-NEXT: and v0.8b, v1.8b, v0.8b
; CHECK-NEXT: eor v0.8b, v1.8b, v0.8b
; CHECK-NEXT: bsl v0.8b, v2.8b, v3.8b
; CHECK-NEXT: str d0, [x2]
; CHECK-NEXT: ret
@ -311,17 +251,13 @@ define void @v4i8(<4 x i8>* %px, <4 x i8>* %py, <4 x i8>* %pz) nounwind {
; CHECK-NEXT: shl v1.4h, v1.4h, #8
; CHECK-NEXT: shl v0.4h, v0.4h, #8
; CHECK-NEXT: add v3.4h, v0.4h, v1.4h
; CHECK-NEXT: cmge v1.4h, v1.4h, #0
; CHECK-NEXT: cmge v0.4h, v0.4h, #0
; CHECK-NEXT: cmge v5.4h, v3.4h, #0
; CHECK-NEXT: cmlt v4.4h, v3.4h, #0
; CHECK-NEXT: cmeq v1.4h, v0.4h, v1.4h
; CHECK-NEXT: cmeq v0.4h, v0.4h, v5.4h
; CHECK-NEXT: mvni v2.4h, #128, lsl #8
; CHECK-NEXT: cmlt v1.4h, v1.4h, #0
; CHECK-NEXT: cmgt v0.4h, v0.4h, v3.4h
; CHECK-NEXT: mvn v5.8b, v4.8b
; CHECK-NEXT: mvn v0.8b, v0.8b
; CHECK-NEXT: bsl v2.8b, v4.8b, v5.8b
; CHECK-NEXT: and v0.8b, v1.8b, v0.8b
; CHECK-NEXT: eor v0.8b, v1.8b, v0.8b
; CHECK-NEXT: bsl v0.8b, v2.8b, v3.8b
; CHECK-NEXT: sshr v0.4h, v0.4h, #8
; CHECK-NEXT: xtn v0.8b, v0.8h
@ -348,17 +284,13 @@ define void @v2i8(<2 x i8>* %px, <2 x i8>* %py, <2 x i8>* %pz) nounwind {
; CHECK-NEXT: shl v2.2s, v2.2s, #24
; CHECK-NEXT: shl v0.2s, v0.2s, #24
; CHECK-NEXT: add v3.2s, v0.2s, v2.2s
; CHECK-NEXT: cmge v2.2s, v2.2s, #0
; CHECK-NEXT: cmge v0.2s, v0.2s, #0
; CHECK-NEXT: cmge v5.2s, v3.2s, #0
; CHECK-NEXT: cmlt v4.2s, v3.2s, #0
; CHECK-NEXT: cmeq v2.2s, v0.2s, v2.2s
; CHECK-NEXT: cmeq v0.2s, v0.2s, v5.2s
; CHECK-NEXT: mvni v1.2s, #128, lsl #24
; CHECK-NEXT: cmlt v2.2s, v2.2s, #0
; CHECK-NEXT: cmgt v0.2s, v0.2s, v3.2s
; CHECK-NEXT: mvn v5.8b, v4.8b
; CHECK-NEXT: mvn v0.8b, v0.8b
; CHECK-NEXT: eor v0.8b, v2.8b, v0.8b
; CHECK-NEXT: bsl v1.8b, v4.8b, v5.8b
; CHECK-NEXT: and v0.8b, v2.8b, v0.8b
; CHECK-NEXT: bsl v0.8b, v1.8b, v3.8b
; CHECK-NEXT: ushr v0.2s, v0.2s, #24
; CHECK-NEXT: mov w8, v0.s[1]
@ -380,16 +312,12 @@ define void @v4i16(<4 x i16>* %px, <4 x i16>* %py, <4 x i16>* %pz) nounwind {
; CHECK-NEXT: ldr d1, [x1]
; CHECK-NEXT: mvni v2.4h, #128, lsl #8
; CHECK-NEXT: add v3.4h, v0.4h, v1.4h
; CHECK-NEXT: cmge v1.4h, v1.4h, #0
; CHECK-NEXT: cmge v0.4h, v0.4h, #0
; CHECK-NEXT: cmge v5.4h, v3.4h, #0
; CHECK-NEXT: cmlt v4.4h, v3.4h, #0
; CHECK-NEXT: cmeq v1.4h, v0.4h, v1.4h
; CHECK-NEXT: cmeq v0.4h, v0.4h, v5.4h
; CHECK-NEXT: cmlt v1.4h, v1.4h, #0
; CHECK-NEXT: cmgt v0.4h, v0.4h, v3.4h
; CHECK-NEXT: mvn v5.8b, v4.8b
; CHECK-NEXT: mvn v0.8b, v0.8b
; CHECK-NEXT: bsl v2.8b, v4.8b, v5.8b
; CHECK-NEXT: and v0.8b, v1.8b, v0.8b
; CHECK-NEXT: eor v0.8b, v1.8b, v0.8b
; CHECK-NEXT: bsl v0.8b, v2.8b, v3.8b
; CHECK-NEXT: str d0, [x2]
; CHECK-NEXT: ret
@ -414,17 +342,13 @@ define void @v2i16(<2 x i16>* %px, <2 x i16>* %py, <2 x i16>* %pz) nounwind {
; CHECK-NEXT: shl v2.2s, v2.2s, #16
; CHECK-NEXT: shl v0.2s, v0.2s, #16
; CHECK-NEXT: add v3.2s, v0.2s, v2.2s
; CHECK-NEXT: cmge v2.2s, v2.2s, #0
; CHECK-NEXT: cmge v0.2s, v0.2s, #0
; CHECK-NEXT: cmge v5.2s, v3.2s, #0
; CHECK-NEXT: cmlt v4.2s, v3.2s, #0
; CHECK-NEXT: cmeq v2.2s, v0.2s, v2.2s
; CHECK-NEXT: cmeq v0.2s, v0.2s, v5.2s
; CHECK-NEXT: mvni v1.2s, #128, lsl #24
; CHECK-NEXT: cmlt v2.2s, v2.2s, #0
; CHECK-NEXT: cmgt v0.2s, v0.2s, v3.2s
; CHECK-NEXT: mvn v5.8b, v4.8b
; CHECK-NEXT: mvn v0.8b, v0.8b
; CHECK-NEXT: eor v0.8b, v2.8b, v0.8b
; CHECK-NEXT: bsl v1.8b, v4.8b, v5.8b
; CHECK-NEXT: and v0.8b, v2.8b, v0.8b
; CHECK-NEXT: bsl v0.8b, v1.8b, v3.8b
; CHECK-NEXT: ushr v0.2s, v0.2s, #16
; CHECK-NEXT: mov w8, v0.s[1]
@ -443,17 +367,13 @@ define <12 x i8> @v12i8(<12 x i8> %x, <12 x i8> %y) nounwind {
; CHECK-LABEL: v12i8:
; CHECK: // %bb.0:
; CHECK-NEXT: add v2.16b, v0.16b, v1.16b
; CHECK-NEXT: cmge v1.16b, v1.16b, #0
; CHECK-NEXT: cmge v0.16b, v0.16b, #0
; CHECK-NEXT: cmge v5.16b, v2.16b, #0
; CHECK-NEXT: cmlt v4.16b, v2.16b, #0
; CHECK-NEXT: cmeq v1.16b, v0.16b, v1.16b
; CHECK-NEXT: cmeq v0.16b, v0.16b, v5.16b
; CHECK-NEXT: movi v3.16b, #127
; CHECK-NEXT: cmlt v1.16b, v1.16b, #0
; CHECK-NEXT: cmgt v0.16b, v0.16b, v2.16b
; CHECK-NEXT: mvn v5.16b, v4.16b
; CHECK-NEXT: mvn v0.16b, v0.16b
; CHECK-NEXT: bsl v3.16b, v4.16b, v5.16b
; CHECK-NEXT: and v0.16b, v1.16b, v0.16b
; CHECK-NEXT: eor v0.16b, v1.16b, v0.16b
; CHECK-NEXT: bsl v0.16b, v3.16b, v2.16b
; CHECK-NEXT: ret
%z = call <12 x i8> @llvm.sadd.sat.v12i8(<12 x i8> %x, <12 x i8> %y)
@ -468,27 +388,19 @@ define void @v12i16(<12 x i16>* %px, <12 x i16>* %py, <12 x i16>* %pz) nounwind
; CHECK-NEXT: mvni v5.8h, #128, lsl #8
; CHECK-NEXT: mvni v4.8h, #128, lsl #8
; CHECK-NEXT: add v6.8h, v1.8h, v2.8h
; CHECK-NEXT: cmlt v16.8h, v6.8h, #0
; CHECK-NEXT: cmlt v7.8h, v6.8h, #0
; CHECK-NEXT: mvn v16.16b, v7.16b
; CHECK-NEXT: bsl v5.16b, v7.16b, v16.16b
; CHECK-NEXT: add v7.8h, v0.8h, v3.8h
; CHECK-NEXT: mvn v17.16b, v16.16b
; CHECK-NEXT: bsl v5.16b, v16.16b, v17.16b
; CHECK-NEXT: cmlt v2.8h, v2.8h, #0
; CHECK-NEXT: cmgt v1.8h, v1.8h, v6.8h
; CHECK-NEXT: cmlt v16.8h, v7.8h, #0
; CHECK-NEXT: mvn v17.16b, v16.16b
; CHECK-NEXT: bsl v4.16b, v16.16b, v17.16b
; CHECK-NEXT: cmge v2.8h, v2.8h, #0
; CHECK-NEXT: cmge v1.8h, v1.8h, #0
; CHECK-NEXT: cmge v16.8h, v6.8h, #0
; CHECK-NEXT: cmge v3.8h, v3.8h, #0
; CHECK-NEXT: cmge v0.8h, v0.8h, #0
; CHECK-NEXT: cmeq v2.8h, v1.8h, v2.8h
; CHECK-NEXT: cmeq v1.8h, v1.8h, v16.8h
; CHECK-NEXT: cmge v16.8h, v7.8h, #0
; CHECK-NEXT: cmeq v3.8h, v0.8h, v3.8h
; CHECK-NEXT: cmeq v0.8h, v0.8h, v16.8h
; CHECK-NEXT: mvn v1.16b, v1.16b
; CHECK-NEXT: mvn v0.16b, v0.16b
; CHECK-NEXT: and v1.16b, v2.16b, v1.16b
; CHECK-NEXT: and v0.16b, v3.16b, v0.16b
; CHECK-NEXT: cmlt v3.8h, v3.8h, #0
; CHECK-NEXT: cmgt v0.8h, v0.8h, v7.8h
; CHECK-NEXT: eor v1.16b, v2.16b, v1.16b
; CHECK-NEXT: mvn v2.16b, v16.16b
; CHECK-NEXT: eor v0.16b, v3.16b, v0.16b
; CHECK-NEXT: bsl v4.16b, v16.16b, v2.16b
; CHECK-NEXT: bsl v1.16b, v5.16b, v6.16b
; CHECK-NEXT: bsl v0.16b, v4.16b, v7.16b
; CHECK-NEXT: str q0, [x2]
@ -508,16 +420,12 @@ define void @v1i8(<1 x i8>* %px, <1 x i8>* %py, <1 x i8>* %pz) nounwind {
; CHECK-NEXT: ldr b1, [x1]
; CHECK-NEXT: movi v2.8b, #127
; CHECK-NEXT: add v3.8b, v0.8b, v1.8b
; CHECK-NEXT: cmge v1.8b, v1.8b, #0
; CHECK-NEXT: cmge v0.8b, v0.8b, #0
; CHECK-NEXT: cmge v5.8b, v3.8b, #0
; CHECK-NEXT: cmlt v4.8b, v3.8b, #0
; CHECK-NEXT: cmeq v1.8b, v0.8b, v1.8b
; CHECK-NEXT: cmeq v0.8b, v0.8b, v5.8b
; CHECK-NEXT: cmlt v1.8b, v1.8b, #0
; CHECK-NEXT: cmgt v0.8b, v0.8b, v3.8b
; CHECK-NEXT: mvn v5.8b, v4.8b
; CHECK-NEXT: mvn v0.8b, v0.8b
; CHECK-NEXT: bsl v2.8b, v4.8b, v5.8b
; CHECK-NEXT: and v0.8b, v1.8b, v0.8b
; CHECK-NEXT: eor v0.8b, v1.8b, v0.8b
; CHECK-NEXT: bsl v0.8b, v2.8b, v3.8b
; CHECK-NEXT: st1 { v0.b }[0], [x2]
; CHECK-NEXT: ret
@ -535,16 +443,12 @@ define void @v1i16(<1 x i16>* %px, <1 x i16>* %py, <1 x i16>* %pz) nounwind {
; CHECK-NEXT: ldr h1, [x1]
; CHECK-NEXT: mvni v2.4h, #128, lsl #8
; CHECK-NEXT: add v3.4h, v0.4h, v1.4h
; CHECK-NEXT: cmge v1.4h, v1.4h, #0
; CHECK-NEXT: cmge v0.4h, v0.4h, #0
; CHECK-NEXT: cmge v5.4h, v3.4h, #0
; CHECK-NEXT: cmlt v4.4h, v3.4h, #0
; CHECK-NEXT: cmeq v1.4h, v0.4h, v1.4h
; CHECK-NEXT: cmeq v0.4h, v0.4h, v5.4h
; CHECK-NEXT: cmlt v1.4h, v1.4h, #0
; CHECK-NEXT: cmgt v0.4h, v0.4h, v3.4h
; CHECK-NEXT: mvn v5.8b, v4.8b
; CHECK-NEXT: mvn v0.8b, v0.8b
; CHECK-NEXT: bsl v2.8b, v4.8b, v5.8b
; CHECK-NEXT: and v0.8b, v1.8b, v0.8b
; CHECK-NEXT: eor v0.8b, v1.8b, v0.8b
; CHECK-NEXT: bsl v0.8b, v2.8b, v3.8b
; CHECK-NEXT: str h0, [x2]
; CHECK-NEXT: ret
@ -561,17 +465,13 @@ define <16 x i4> @v16i4(<16 x i4> %x, <16 x i4> %y) nounwind {
; CHECK-NEXT: shl v1.16b, v1.16b, #4
; CHECK-NEXT: shl v0.16b, v0.16b, #4
; CHECK-NEXT: add v3.16b, v0.16b, v1.16b
; CHECK-NEXT: cmge v1.16b, v1.16b, #0
; CHECK-NEXT: cmge v0.16b, v0.16b, #0
; CHECK-NEXT: cmge v5.16b, v3.16b, #0
; CHECK-NEXT: cmlt v4.16b, v3.16b, #0
; CHECK-NEXT: cmeq v1.16b, v0.16b, v1.16b
; CHECK-NEXT: cmeq v0.16b, v0.16b, v5.16b
; CHECK-NEXT: movi v2.16b, #127
; CHECK-NEXT: cmlt v1.16b, v1.16b, #0
; CHECK-NEXT: cmgt v0.16b, v0.16b, v3.16b
; CHECK-NEXT: mvn v5.16b, v4.16b
; CHECK-NEXT: mvn v0.16b, v0.16b
; CHECK-NEXT: bsl v2.16b, v4.16b, v5.16b
; CHECK-NEXT: and v0.16b, v1.16b, v0.16b
; CHECK-NEXT: eor v0.16b, v1.16b, v0.16b
; CHECK-NEXT: bsl v0.16b, v2.16b, v3.16b
; CHECK-NEXT: sshr v0.16b, v0.16b, #4
; CHECK-NEXT: ret
@ -585,17 +485,13 @@ define <16 x i1> @v16i1(<16 x i1> %x, <16 x i1> %y) nounwind {
; CHECK-NEXT: shl v1.16b, v1.16b, #7
; CHECK-NEXT: shl v0.16b, v0.16b, #7
; CHECK-NEXT: add v3.16b, v0.16b, v1.16b
; CHECK-NEXT: cmge v1.16b, v1.16b, #0
; CHECK-NEXT: cmge v0.16b, v0.16b, #0
; CHECK-NEXT: cmge v5.16b, v3.16b, #0
; CHECK-NEXT: cmlt v4.16b, v3.16b, #0
; CHECK-NEXT: cmeq v1.16b, v0.16b, v1.16b
; CHECK-NEXT: cmeq v0.16b, v0.16b, v5.16b
; CHECK-NEXT: movi v2.16b, #127
; CHECK-NEXT: cmlt v1.16b, v1.16b, #0
; CHECK-NEXT: cmgt v0.16b, v0.16b, v3.16b
; CHECK-NEXT: mvn v5.16b, v4.16b
; CHECK-NEXT: mvn v0.16b, v0.16b
; CHECK-NEXT: bsl v2.16b, v4.16b, v5.16b
; CHECK-NEXT: and v0.16b, v1.16b, v0.16b
; CHECK-NEXT: eor v0.16b, v1.16b, v0.16b
; CHECK-NEXT: bsl v0.16b, v2.16b, v3.16b
; CHECK-NEXT: sshr v0.16b, v0.16b, #7
; CHECK-NEXT: ret
@ -607,17 +503,13 @@ define <2 x i32> @v2i32(<2 x i32> %x, <2 x i32> %y) nounwind {
; CHECK-LABEL: v2i32:
; CHECK: // %bb.0:
; CHECK-NEXT: add v2.2s, v0.2s, v1.2s
; CHECK-NEXT: cmge v1.2s, v1.2s, #0
; CHECK-NEXT: cmge v0.2s, v0.2s, #0
; CHECK-NEXT: cmge v5.2s, v2.2s, #0
; CHECK-NEXT: cmlt v4.2s, v2.2s, #0
; CHECK-NEXT: cmeq v1.2s, v0.2s, v1.2s
; CHECK-NEXT: cmeq v0.2s, v0.2s, v5.2s
; CHECK-NEXT: mvni v3.2s, #128, lsl #24
; CHECK-NEXT: cmlt v1.2s, v1.2s, #0
; CHECK-NEXT: cmgt v0.2s, v0.2s, v2.2s
; CHECK-NEXT: mvn v5.8b, v4.8b
; CHECK-NEXT: mvn v0.8b, v0.8b
; CHECK-NEXT: bsl v3.8b, v4.8b, v5.8b
; CHECK-NEXT: and v0.8b, v1.8b, v0.8b
; CHECK-NEXT: eor v0.8b, v1.8b, v0.8b
; CHECK-NEXT: bsl v0.8b, v3.8b, v2.8b
; CHECK-NEXT: ret
%z = call <2 x i32> @llvm.sadd.sat.v2i32(<2 x i32> %x, <2 x i32> %y)
@ -628,17 +520,13 @@ define <4 x i32> @v4i32(<4 x i32> %x, <4 x i32> %y) nounwind {
; CHECK-LABEL: v4i32:
; CHECK: // %bb.0:
; CHECK-NEXT: add v2.4s, v0.4s, v1.4s
; CHECK-NEXT: cmge v1.4s, v1.4s, #0
; CHECK-NEXT: cmge v0.4s, v0.4s, #0
; CHECK-NEXT: cmge v5.4s, v2.4s, #0
; CHECK-NEXT: cmlt v4.4s, v2.4s, #0
; CHECK-NEXT: cmeq v1.4s, v0.4s, v1.4s
; CHECK-NEXT: cmeq v0.4s, v0.4s, v5.4s
; CHECK-NEXT: mvni v3.4s, #128, lsl #24
; CHECK-NEXT: cmlt v1.4s, v1.4s, #0
; CHECK-NEXT: cmgt v0.4s, v0.4s, v2.4s
; CHECK-NEXT: mvn v5.16b, v4.16b
; CHECK-NEXT: mvn v0.16b, v0.16b
; CHECK-NEXT: bsl v3.16b, v4.16b, v5.16b
; CHECK-NEXT: and v0.16b, v1.16b, v0.16b
; CHECK-NEXT: eor v0.16b, v1.16b, v0.16b
; CHECK-NEXT: bsl v0.16b, v3.16b, v2.16b
; CHECK-NEXT: ret
%z = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> %x, <4 x i32> %y)
@ -649,29 +537,21 @@ define <8 x i32> @v8i32(<8 x i32> %x, <8 x i32> %y) nounwind {
; CHECK-LABEL: v8i32:
; CHECK: // %bb.0:
; CHECK-NEXT: add v4.4s, v0.4s, v2.4s
; CHECK-NEXT: cmlt v16.4s, v4.4s, #0
; CHECK-NEXT: cmlt v7.4s, v4.4s, #0
; CHECK-NEXT: mvni v6.4s, #128, lsl #24
; CHECK-NEXT: mvn v16.16b, v7.16b
; CHECK-NEXT: bsl v6.16b, v7.16b, v16.16b
; CHECK-NEXT: add v7.4s, v1.4s, v3.4s
; CHECK-NEXT: mvn v17.16b, v16.16b
; CHECK-NEXT: bsl v6.16b, v16.16b, v17.16b
; CHECK-NEXT: cmlt v2.4s, v2.4s, #0
; CHECK-NEXT: cmgt v0.4s, v0.4s, v4.4s
; CHECK-NEXT: cmlt v16.4s, v7.4s, #0
; CHECK-NEXT: mvni v5.4s, #128, lsl #24
; CHECK-NEXT: mvn v17.16b, v16.16b
; CHECK-NEXT: bsl v5.16b, v16.16b, v17.16b
; CHECK-NEXT: cmge v2.4s, v2.4s, #0
; CHECK-NEXT: cmge v0.4s, v0.4s, #0
; CHECK-NEXT: cmge v16.4s, v4.4s, #0
; CHECK-NEXT: cmge v3.4s, v3.4s, #0
; CHECK-NEXT: cmge v1.4s, v1.4s, #0
; CHECK-NEXT: cmeq v2.4s, v0.4s, v2.4s
; CHECK-NEXT: cmeq v0.4s, v0.4s, v16.4s
; CHECK-NEXT: cmge v16.4s, v7.4s, #0
; CHECK-NEXT: cmeq v3.4s, v1.4s, v3.4s
; CHECK-NEXT: cmeq v1.4s, v1.4s, v16.4s
; CHECK-NEXT: mvn v0.16b, v0.16b
; CHECK-NEXT: mvn v1.16b, v1.16b
; CHECK-NEXT: and v0.16b, v2.16b, v0.16b
; CHECK-NEXT: and v1.16b, v3.16b, v1.16b
; CHECK-NEXT: cmlt v3.4s, v3.4s, #0
; CHECK-NEXT: cmgt v1.4s, v1.4s, v7.4s
; CHECK-NEXT: eor v0.16b, v2.16b, v0.16b
; CHECK-NEXT: mvn v2.16b, v16.16b
; CHECK-NEXT: eor v1.16b, v3.16b, v1.16b
; CHECK-NEXT: bsl v5.16b, v16.16b, v2.16b
; CHECK-NEXT: bsl v0.16b, v6.16b, v4.16b
; CHECK-NEXT: bsl v1.16b, v5.16b, v7.16b
; CHECK-NEXT: ret
@ -694,42 +574,26 @@ define <16 x i32> @v16i32(<16 x i32> %x, <16 x i32> %y) nounwind {
; CHECK-NEXT: mvn v25.16b, v24.16b
; CHECK-NEXT: bsl v20.16b, v24.16b, v25.16b
; CHECK-NEXT: cmlt v24.4s, v21.4s, #0
; CHECK-NEXT: cmlt v4.4s, v4.4s, #0
; CHECK-NEXT: cmgt v0.4s, v0.4s, v16.4s
; CHECK-NEXT: mvni v22.4s, #128, lsl #24
; CHECK-NEXT: add v23.4s, v3.4s, v7.4s
; CHECK-NEXT: mvn v25.16b, v24.16b
; CHECK-NEXT: eor v0.16b, v4.16b, v0.16b
; CHECK-NEXT: cmlt v4.4s, v5.4s, #0
; CHECK-NEXT: cmgt v1.4s, v1.4s, v19.4s
; CHECK-NEXT: bsl v22.16b, v24.16b, v25.16b
; CHECK-NEXT: cmlt v24.4s, v23.4s, #0
; CHECK-NEXT: eor v1.16b, v4.16b, v1.16b
; CHECK-NEXT: cmlt v4.4s, v6.4s, #0
; CHECK-NEXT: cmgt v2.4s, v2.4s, v21.4s
; CHECK-NEXT: mvni v17.4s, #128, lsl #24
; CHECK-NEXT: mvn v25.16b, v24.16b
; CHECK-NEXT: eor v2.16b, v4.16b, v2.16b
; CHECK-NEXT: cmlt v4.4s, v7.4s, #0
; CHECK-NEXT: cmgt v3.4s, v3.4s, v23.4s
; CHECK-NEXT: bsl v17.16b, v24.16b, v25.16b
; CHECK-NEXT: cmge v4.4s, v4.4s, #0
; CHECK-NEXT: cmge v0.4s, v0.4s, #0
; CHECK-NEXT: cmge v24.4s, v16.4s, #0
; CHECK-NEXT: cmge v5.4s, v5.4s, #0
; CHECK-NEXT: cmge v1.4s, v1.4s, #0
; CHECK-NEXT: cmeq v4.4s, v0.4s, v4.4s
; CHECK-NEXT: cmeq v0.4s, v0.4s, v24.4s
; CHECK-NEXT: cmge v24.4s, v19.4s, #0
; CHECK-NEXT: cmge v6.4s, v6.4s, #0
; CHECK-NEXT: cmge v2.4s, v2.4s, #0
; CHECK-NEXT: cmeq v5.4s, v1.4s, v5.4s
; CHECK-NEXT: cmeq v1.4s, v1.4s, v24.4s
; CHECK-NEXT: cmge v24.4s, v21.4s, #0
; CHECK-NEXT: cmge v7.4s, v7.4s, #0
; CHECK-NEXT: cmge v3.4s, v3.4s, #0
; CHECK-NEXT: cmeq v6.4s, v2.4s, v6.4s
; CHECK-NEXT: cmeq v2.4s, v2.4s, v24.4s
; CHECK-NEXT: cmge v24.4s, v23.4s, #0
; CHECK-NEXT: cmeq v7.4s, v3.4s, v7.4s
; CHECK-NEXT: cmeq v3.4s, v3.4s, v24.4s
; CHECK-NEXT: mvn v0.16b, v0.16b
; CHECK-NEXT: mvn v1.16b, v1.16b
; CHECK-NEXT: mvn v2.16b, v2.16b
; CHECK-NEXT: mvn v3.16b, v3.16b
; CHECK-NEXT: and v0.16b, v4.16b, v0.16b
; CHECK-NEXT: and v1.16b, v5.16b, v1.16b
; CHECK-NEXT: and v2.16b, v6.16b, v2.16b
; CHECK-NEXT: and v3.16b, v7.16b, v3.16b
; CHECK-NEXT: eor v3.16b, v4.16b, v3.16b
; CHECK-NEXT: bsl v0.16b, v18.16b, v16.16b
; CHECK-NEXT: bsl v1.16b, v20.16b, v19.16b
; CHECK-NEXT: bsl v2.16b, v22.16b, v21.16b
@ -743,18 +607,14 @@ define <2 x i64> @v2i64(<2 x i64> %x, <2 x i64> %y) nounwind {
; CHECK-LABEL: v2i64:
; CHECK: // %bb.0:
; CHECK-NEXT: add v2.2d, v0.2d, v1.2d
; CHECK-NEXT: cmge v1.2d, v1.2d, #0
; CHECK-NEXT: cmge v0.2d, v0.2d, #0
; CHECK-NEXT: cmge v5.2d, v2.2d, #0
; CHECK-NEXT: mov x8, #9223372036854775807
; CHECK-NEXT: cmlt v3.2d, v2.2d, #0
; CHECK-NEXT: cmeq v1.2d, v0.2d, v1.2d
; CHECK-NEXT: cmeq v0.2d, v0.2d, v5.2d
; CHECK-NEXT: cmlt v1.2d, v1.2d, #0
; CHECK-NEXT: dup v4.2d, x8
; CHECK-NEXT: cmgt v0.2d, v0.2d, v2.2d
; CHECK-NEXT: mvn v5.16b, v3.16b
; CHECK-NEXT: mvn v0.16b, v0.16b
; CHECK-NEXT: bsl v4.16b, v3.16b, v5.16b
; CHECK-NEXT: and v0.16b, v1.16b, v0.16b
; CHECK-NEXT: eor v0.16b, v1.16b, v0.16b
; CHECK-NEXT: bsl v0.16b, v4.16b, v2.16b
; CHECK-NEXT: ret
%z = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> %x, <2 x i64> %y)
@ -766,31 +626,23 @@ define <4 x i64> @v4i64(<4 x i64> %x, <4 x i64> %y) nounwind {
; CHECK: // %bb.0:
; CHECK-NEXT: add v4.2d, v0.2d, v2.2d
; CHECK-NEXT: mov x8, #9223372036854775807
; CHECK-NEXT: cmlt v6.2d, v4.2d, #0
; CHECK-NEXT: dup v7.2d, x8
; CHECK-NEXT: cmlt v5.2d, v4.2d, #0
; CHECK-NEXT: dup v6.2d, x8
; CHECK-NEXT: mvn v7.16b, v5.16b
; CHECK-NEXT: mov v16.16b, v6.16b
; CHECK-NEXT: bsl v16.16b, v5.16b, v7.16b
; CHECK-NEXT: add v5.2d, v1.2d, v3.2d
; CHECK-NEXT: mvn v16.16b, v6.16b
; CHECK-NEXT: mov v17.16b, v7.16b
; CHECK-NEXT: bsl v17.16b, v6.16b, v16.16b
; CHECK-NEXT: cmlt v6.2d, v5.2d, #0
; CHECK-NEXT: mvn v16.16b, v6.16b
; CHECK-NEXT: bsl v7.16b, v6.16b, v16.16b
; CHECK-NEXT: cmge v2.2d, v2.2d, #0
; CHECK-NEXT: cmge v0.2d, v0.2d, #0
; CHECK-NEXT: cmge v6.2d, v4.2d, #0
; CHECK-NEXT: cmge v3.2d, v3.2d, #0
; CHECK-NEXT: cmge v1.2d, v1.2d, #0
; CHECK-NEXT: cmeq v2.2d, v0.2d, v2.2d
; CHECK-NEXT: cmeq v0.2d, v0.2d, v6.2d
; CHECK-NEXT: cmge v6.2d, v5.2d, #0
; CHECK-NEXT: cmeq v3.2d, v1.2d, v3.2d
; CHECK-NEXT: cmeq v1.2d, v1.2d, v6.2d
; CHECK-NEXT: mvn v0.16b, v0.16b
; CHECK-NEXT: mvn v1.16b, v1.16b
; CHECK-NEXT: and v0.16b, v2.16b, v0.16b
; CHECK-NEXT: and v1.16b, v3.16b, v1.16b
; CHECK-NEXT: bsl v0.16b, v17.16b, v4.16b
; CHECK-NEXT: bsl v1.16b, v7.16b, v5.16b
; CHECK-NEXT: cmlt v2.2d, v2.2d, #0
; CHECK-NEXT: cmgt v0.2d, v0.2d, v4.2d
; CHECK-NEXT: cmlt v7.2d, v5.2d, #0
; CHECK-NEXT: cmlt v3.2d, v3.2d, #0
; CHECK-NEXT: cmgt v1.2d, v1.2d, v5.2d
; CHECK-NEXT: eor v0.16b, v2.16b, v0.16b
; CHECK-NEXT: mvn v2.16b, v7.16b
; CHECK-NEXT: eor v1.16b, v3.16b, v1.16b
; CHECK-NEXT: bsl v6.16b, v7.16b, v2.16b
; CHECK-NEXT: bsl v0.16b, v16.16b, v4.16b
; CHECK-NEXT: bsl v1.16b, v6.16b, v5.16b
; CHECK-NEXT: ret
%z = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> %x, <4 x i64> %y)
ret <4 x i64> %z
@ -812,42 +664,26 @@ define <8 x i64> @v8i64(<8 x i64> %x, <8 x i64> %y) nounwind {
; CHECK-NEXT: bsl v25.16b, v20.16b, v24.16b
; CHECK-NEXT: mvn v20.16b, v22.16b
; CHECK-NEXT: mov v24.16b, v21.16b
; CHECK-NEXT: cmlt v4.2d, v4.2d, #0
; CHECK-NEXT: cmgt v0.2d, v0.2d, v16.2d
; CHECK-NEXT: add v19.2d, v3.2d, v7.2d
; CHECK-NEXT: bsl v24.16b, v22.16b, v20.16b
; CHECK-NEXT: mvn v20.16b, v23.16b
; CHECK-NEXT: mov v22.16b, v21.16b
; CHECK-NEXT: eor v0.16b, v4.16b, v0.16b
; CHECK-NEXT: cmlt v4.2d, v5.2d, #0
; CHECK-NEXT: cmgt v1.2d, v1.2d, v17.2d
; CHECK-NEXT: bsl v22.16b, v23.16b, v20.16b
; CHECK-NEXT: cmlt v20.2d, v19.2d, #0
; CHECK-NEXT: eor v1.16b, v4.16b, v1.16b
; CHECK-NEXT: cmlt v4.2d, v6.2d, #0
; CHECK-NEXT: cmgt v2.2d, v2.2d, v18.2d
; CHECK-NEXT: mvn v23.16b, v20.16b
; CHECK-NEXT: eor v2.16b, v4.16b, v2.16b
; CHECK-NEXT: cmlt v4.2d, v7.2d, #0
; CHECK-NEXT: cmgt v3.2d, v3.2d, v19.2d
; CHECK-NEXT: bsl v21.16b, v20.16b, v23.16b
; CHECK-NEXT: cmge v4.2d, v4.2d, #0
; CHECK-NEXT: cmge v0.2d, v0.2d, #0
; CHECK-NEXT: cmge v20.2d, v16.2d, #0
; CHECK-NEXT: cmge v5.2d, v5.2d, #0
; CHECK-NEXT: cmge v1.2d, v1.2d, #0
; CHECK-NEXT: cmeq v4.2d, v0.2d, v4.2d
; CHECK-NEXT: cmeq v0.2d, v0.2d, v20.2d
; CHECK-NEXT: cmge v20.2d, v17.2d, #0
; CHECK-NEXT: cmge v6.2d, v6.2d, #0
; CHECK-NEXT: cmge v2.2d, v2.2d, #0
; CHECK-NEXT: cmeq v5.2d, v1.2d, v5.2d
; CHECK-NEXT: cmeq v1.2d, v1.2d, v20.2d
; CHECK-NEXT: cmge v20.2d, v18.2d, #0
; CHECK-NEXT: cmge v7.2d, v7.2d, #0
; CHECK-NEXT: cmge v3.2d, v3.2d, #0
; CHECK-NEXT: cmeq v6.2d, v2.2d, v6.2d
; CHECK-NEXT: cmeq v2.2d, v2.2d, v20.2d
; CHECK-NEXT: cmge v20.2d, v19.2d, #0
; CHECK-NEXT: cmeq v7.2d, v3.2d, v7.2d
; CHECK-NEXT: cmeq v3.2d, v3.2d, v20.2d
; CHECK-NEXT: mvn v0.16b, v0.16b
; CHECK-NEXT: mvn v1.16b, v1.16b
; CHECK-NEXT: mvn v2.16b, v2.16b
; CHECK-NEXT: mvn v3.16b, v3.16b
; CHECK-NEXT: and v0.16b, v4.16b, v0.16b
; CHECK-NEXT: and v1.16b, v5.16b, v1.16b
; CHECK-NEXT: and v2.16b, v6.16b, v2.16b
; CHECK-NEXT: and v3.16b, v7.16b, v3.16b
; CHECK-NEXT: eor v3.16b, v4.16b, v3.16b
; CHECK-NEXT: bsl v0.16b, v25.16b, v16.16b
; CHECK-NEXT: bsl v1.16b, v24.16b, v17.16b
; CHECK-NEXT: bsl v2.16b, v22.16b, v18.16b

View File

@ -54,18 +54,13 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind {
; CHECK-LABEL: vec:
; CHECK: // %bb.0:
; CHECK-NEXT: sub v2.4s, v0.4s, v1.4s
; CHECK-NEXT: cmge v1.4s, v1.4s, #0
; CHECK-NEXT: cmge v0.4s, v0.4s, #0
; CHECK-NEXT: cmge v5.4s, v2.4s, #0
; CHECK-NEXT: cmlt v4.4s, v2.4s, #0
; CHECK-NEXT: cmeq v1.4s, v0.4s, v1.4s
; CHECK-NEXT: cmeq v0.4s, v0.4s, v5.4s
; CHECK-NEXT: mvni v3.4s, #128, lsl #24
; CHECK-NEXT: cmgt v1.4s, v1.4s, #0
; CHECK-NEXT: cmgt v0.4s, v0.4s, v2.4s
; CHECK-NEXT: mvn v5.16b, v4.16b
; CHECK-NEXT: mvn v1.16b, v1.16b
; CHECK-NEXT: mvn v0.16b, v0.16b
; CHECK-NEXT: bsl v3.16b, v4.16b, v5.16b
; CHECK-NEXT: and v0.16b, v1.16b, v0.16b
; CHECK-NEXT: eor v0.16b, v1.16b, v0.16b
; CHECK-NEXT: bsl v0.16b, v3.16b, v2.16b
; CHECK-NEXT: ret
%tmp = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> %x, <4 x i32> %y);

View File

@ -37,18 +37,13 @@ define <16 x i8> @v16i8(<16 x i8> %x, <16 x i8> %y) nounwind {
; CHECK-LABEL: v16i8:
; CHECK: // %bb.0:
; CHECK-NEXT: sub v2.16b, v0.16b, v1.16b
; CHECK-NEXT: cmge v1.16b, v1.16b, #0
; CHECK-NEXT: cmge v0.16b, v0.16b, #0
; CHECK-NEXT: cmge v5.16b, v2.16b, #0
; CHECK-NEXT: cmlt v4.16b, v2.16b, #0
; CHECK-NEXT: cmeq v1.16b, v0.16b, v1.16b
; CHECK-NEXT: cmeq v0.16b, v0.16b, v5.16b
; CHECK-NEXT: movi v3.16b, #127
; CHECK-NEXT: cmgt v1.16b, v1.16b, #0
; CHECK-NEXT: cmgt v0.16b, v0.16b, v2.16b
; CHECK-NEXT: mvn v5.16b, v4.16b
; CHECK-NEXT: mvn v1.16b, v1.16b
; CHECK-NEXT: mvn v0.16b, v0.16b
; CHECK-NEXT: bsl v3.16b, v4.16b, v5.16b
; CHECK-NEXT: and v0.16b, v1.16b, v0.16b
; CHECK-NEXT: eor v0.16b, v1.16b, v0.16b
; CHECK-NEXT: bsl v0.16b, v3.16b, v2.16b
; CHECK-NEXT: ret
%z = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> %x, <16 x i8> %y)
@ -59,31 +54,21 @@ define <32 x i8> @v32i8(<32 x i8> %x, <32 x i8> %y) nounwind {
; CHECK-LABEL: v32i8:
; CHECK: // %bb.0:
; CHECK-NEXT: sub v4.16b, v0.16b, v2.16b
; CHECK-NEXT: cmlt v16.16b, v4.16b, #0
; CHECK-NEXT: cmlt v7.16b, v4.16b, #0
; CHECK-NEXT: movi v6.16b, #127
; CHECK-NEXT: mvn v16.16b, v7.16b
; CHECK-NEXT: bsl v6.16b, v7.16b, v16.16b
; CHECK-NEXT: sub v7.16b, v1.16b, v3.16b
; CHECK-NEXT: mvn v17.16b, v16.16b
; CHECK-NEXT: bsl v6.16b, v16.16b, v17.16b
; CHECK-NEXT: cmgt v2.16b, v2.16b, #0
; CHECK-NEXT: cmgt v0.16b, v0.16b, v4.16b
; CHECK-NEXT: cmlt v16.16b, v7.16b, #0
; CHECK-NEXT: movi v5.16b, #127
; CHECK-NEXT: mvn v17.16b, v16.16b
; CHECK-NEXT: bsl v5.16b, v16.16b, v17.16b
; CHECK-NEXT: cmge v2.16b, v2.16b, #0
; CHECK-NEXT: cmge v0.16b, v0.16b, #0
; CHECK-NEXT: cmge v16.16b, v4.16b, #0
; CHECK-NEXT: cmge v3.16b, v3.16b, #0
; CHECK-NEXT: cmge v1.16b, v1.16b, #0
; CHECK-NEXT: cmeq v2.16b, v0.16b, v2.16b
; CHECK-NEXT: cmeq v0.16b, v0.16b, v16.16b
; CHECK-NEXT: cmge v16.16b, v7.16b, #0
; CHECK-NEXT: cmeq v3.16b, v1.16b, v3.16b
; CHECK-NEXT: cmeq v1.16b, v1.16b, v16.16b
; CHECK-NEXT: mvn v2.16b, v2.16b
; CHECK-NEXT: mvn v3.16b, v3.16b
; CHECK-NEXT: mvn v0.16b, v0.16b
; CHECK-NEXT: mvn v1.16b, v1.16b
; CHECK-NEXT: and v0.16b, v2.16b, v0.16b
; CHECK-NEXT: and v1.16b, v3.16b, v1.16b
; CHECK-NEXT: cmgt v3.16b, v3.16b, #0
; CHECK-NEXT: cmgt v1.16b, v1.16b, v7.16b
; CHECK-NEXT: eor v0.16b, v2.16b, v0.16b
; CHECK-NEXT: mvn v2.16b, v16.16b
; CHECK-NEXT: eor v1.16b, v3.16b, v1.16b
; CHECK-NEXT: bsl v5.16b, v16.16b, v2.16b
; CHECK-NEXT: bsl v0.16b, v6.16b, v4.16b
; CHECK-NEXT: bsl v1.16b, v5.16b, v7.16b
; CHECK-NEXT: ret
@ -106,46 +91,26 @@ define <64 x i8> @v64i8(<64 x i8> %x, <64 x i8> %y) nounwind {
; CHECK-NEXT: mvn v25.16b, v24.16b
; CHECK-NEXT: bsl v20.16b, v24.16b, v25.16b
; CHECK-NEXT: cmlt v24.16b, v21.16b, #0
; CHECK-NEXT: cmgt v4.16b, v4.16b, #0
; CHECK-NEXT: cmgt v0.16b, v0.16b, v16.16b
; CHECK-NEXT: movi v22.16b, #127
; CHECK-NEXT: sub v23.16b, v3.16b, v7.16b
; CHECK-NEXT: mvn v25.16b, v24.16b
; CHECK-NEXT: eor v0.16b, v4.16b, v0.16b
; CHECK-NEXT: cmgt v4.16b, v5.16b, #0
; CHECK-NEXT: cmgt v1.16b, v1.16b, v19.16b
; CHECK-NEXT: bsl v22.16b, v24.16b, v25.16b
; CHECK-NEXT: cmlt v24.16b, v23.16b, #0
; CHECK-NEXT: eor v1.16b, v4.16b, v1.16b
; CHECK-NEXT: cmgt v4.16b, v6.16b, #0
; CHECK-NEXT: cmgt v2.16b, v2.16b, v21.16b
; CHECK-NEXT: movi v17.16b, #127
; CHECK-NEXT: mvn v25.16b, v24.16b
; CHECK-NEXT: eor v2.16b, v4.16b, v2.16b
; CHECK-NEXT: cmgt v4.16b, v7.16b, #0
; CHECK-NEXT: cmgt v3.16b, v3.16b, v23.16b
; CHECK-NEXT: bsl v17.16b, v24.16b, v25.16b
; CHECK-NEXT: cmge v4.16b, v4.16b, #0
; CHECK-NEXT: cmge v0.16b, v0.16b, #0
; CHECK-NEXT: cmge v24.16b, v16.16b, #0
; CHECK-NEXT: cmge v5.16b, v5.16b, #0
; CHECK-NEXT: cmge v1.16b, v1.16b, #0
; CHECK-NEXT: cmeq v4.16b, v0.16b, v4.16b
; CHECK-NEXT: cmeq v0.16b, v0.16b, v24.16b
; CHECK-NEXT: cmge v24.16b, v19.16b, #0
; CHECK-NEXT: cmge v6.16b, v6.16b, #0
; CHECK-NEXT: cmge v2.16b, v2.16b, #0
; CHECK-NEXT: cmeq v5.16b, v1.16b, v5.16b
; CHECK-NEXT: cmeq v1.16b, v1.16b, v24.16b
; CHECK-NEXT: cmge v24.16b, v21.16b, #0
; CHECK-NEXT: mvn v4.16b, v4.16b
; CHECK-NEXT: mvn v0.16b, v0.16b
; CHECK-NEXT: cmge v7.16b, v7.16b, #0
; CHECK-NEXT: cmge v3.16b, v3.16b, #0
; CHECK-NEXT: cmeq v6.16b, v2.16b, v6.16b
; CHECK-NEXT: cmeq v2.16b, v2.16b, v24.16b
; CHECK-NEXT: cmge v24.16b, v23.16b, #0
; CHECK-NEXT: and v0.16b, v4.16b, v0.16b
; CHECK-NEXT: mvn v4.16b, v5.16b
; CHECK-NEXT: mvn v1.16b, v1.16b
; CHECK-NEXT: cmeq v7.16b, v3.16b, v7.16b
; CHECK-NEXT: cmeq v3.16b, v3.16b, v24.16b
; CHECK-NEXT: and v1.16b, v4.16b, v1.16b
; CHECK-NEXT: mvn v4.16b, v6.16b
; CHECK-NEXT: mvn v2.16b, v2.16b
; CHECK-NEXT: and v2.16b, v4.16b, v2.16b
; CHECK-NEXT: mvn v4.16b, v7.16b
; CHECK-NEXT: mvn v3.16b, v3.16b
; CHECK-NEXT: and v3.16b, v4.16b, v3.16b
; CHECK-NEXT: eor v3.16b, v4.16b, v3.16b
; CHECK-NEXT: bsl v0.16b, v18.16b, v16.16b
; CHECK-NEXT: bsl v1.16b, v20.16b, v19.16b
; CHECK-NEXT: bsl v2.16b, v22.16b, v21.16b
@ -159,18 +124,13 @@ define <8 x i16> @v8i16(<8 x i16> %x, <8 x i16> %y) nounwind {
; CHECK-LABEL: v8i16:
; CHECK: // %bb.0:
; CHECK-NEXT: sub v2.8h, v0.8h, v1.8h
; CHECK-NEXT: cmge v1.8h, v1.8h, #0
; CHECK-NEXT: cmge v0.8h, v0.8h, #0
; CHECK-NEXT: cmge v5.8h, v2.8h, #0
; CHECK-NEXT: cmlt v4.8h, v2.8h, #0
; CHECK-NEXT: cmeq v1.8h, v0.8h, v1.8h
; CHECK-NEXT: cmeq v0.8h, v0.8h, v5.8h
; CHECK-NEXT: mvni v3.8h, #128, lsl #8
; CHECK-NEXT: cmgt v1.8h, v1.8h, #0
; CHECK-NEXT: cmgt v0.8h, v0.8h, v2.8h
; CHECK-NEXT: mvn v5.16b, v4.16b
; CHECK-NEXT: mvn v1.16b, v1.16b
; CHECK-NEXT: mvn v0.16b, v0.16b
; CHECK-NEXT: bsl v3.16b, v4.16b, v5.16b
; CHECK-NEXT: and v0.16b, v1.16b, v0.16b
; CHECK-NEXT: eor v0.16b, v1.16b, v0.16b
; CHECK-NEXT: bsl v0.16b, v3.16b, v2.16b
; CHECK-NEXT: ret
%z = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> %x, <8 x i16> %y)
@ -181,31 +141,21 @@ define <16 x i16> @v16i16(<16 x i16> %x, <16 x i16> %y) nounwind {
; CHECK-LABEL: v16i16:
; CHECK: // %bb.0:
; CHECK-NEXT: sub v4.8h, v0.8h, v2.8h
; CHECK-NEXT: cmlt v16.8h, v4.8h, #0
; CHECK-NEXT: cmlt v7.8h, v4.8h, #0
; CHECK-NEXT: mvni v6.8h, #128, lsl #8
; CHECK-NEXT: mvn v16.16b, v7.16b
; CHECK-NEXT: bsl v6.16b, v7.16b, v16.16b
; CHECK-NEXT: sub v7.8h, v1.8h, v3.8h
; CHECK-NEXT: mvn v17.16b, v16.16b
; CHECK-NEXT: bsl v6.16b, v16.16b, v17.16b
; CHECK-NEXT: cmgt v2.8h, v2.8h, #0
; CHECK-NEXT: cmgt v0.8h, v0.8h, v4.8h
; CHECK-NEXT: cmlt v16.8h, v7.8h, #0
; CHECK-NEXT: mvni v5.8h, #128, lsl #8
; CHECK-NEXT: mvn v17.16b, v16.16b
; CHECK-NEXT: bsl v5.16b, v16.16b, v17.16b
; CHECK-NEXT: cmge v2.8h, v2.8h, #0
; CHECK-NEXT: cmge v0.8h, v0.8h, #0
; CHECK-NEXT: cmge v16.8h, v4.8h, #0
; CHECK-NEXT: cmge v3.8h, v3.8h, #0
; CHECK-NEXT: cmge v1.8h, v1.8h, #0
; CHECK-NEXT: cmeq v2.8h, v0.8h, v2.8h
; CHECK-NEXT: cmeq v0.8h, v0.8h, v16.8h
; CHECK-NEXT: cmge v16.8h, v7.8h, #0
; CHECK-NEXT: cmeq v3.8h, v1.8h, v3.8h
; CHECK-NEXT: cmeq v1.8h, v1.8h, v16.8h
; CHECK-NEXT: mvn v2.16b, v2.16b
; CHECK-NEXT: mvn v3.16b, v3.16b
; CHECK-NEXT: mvn v0.16b, v0.16b
; CHECK-NEXT: mvn v1.16b, v1.16b
; CHECK-NEXT: and v0.16b, v2.16b, v0.16b
; CHECK-NEXT: and v1.16b, v3.16b, v1.16b
; CHECK-NEXT: cmgt v3.8h, v3.8h, #0
; CHECK-NEXT: cmgt v1.8h, v1.8h, v7.8h
; CHECK-NEXT: eor v0.16b, v2.16b, v0.16b
; CHECK-NEXT: mvn v2.16b, v16.16b
; CHECK-NEXT: eor v1.16b, v3.16b, v1.16b
; CHECK-NEXT: bsl v5.16b, v16.16b, v2.16b
; CHECK-NEXT: bsl v0.16b, v6.16b, v4.16b
; CHECK-NEXT: bsl v1.16b, v5.16b, v7.16b
; CHECK-NEXT: ret
@ -228,46 +178,26 @@ define <32 x i16> @v32i16(<32 x i16> %x, <32 x i16> %y) nounwind {
; CHECK-NEXT: mvn v25.16b, v24.16b
; CHECK-NEXT: bsl v20.16b, v24.16b, v25.16b
; CHECK-NEXT: cmlt v24.8h, v21.8h, #0
; CHECK-NEXT: cmgt v4.8h, v4.8h, #0
; CHECK-NEXT: cmgt v0.8h, v0.8h, v16.8h
; CHECK-NEXT: mvni v22.8h, #128, lsl #8
; CHECK-NEXT: sub v23.8h, v3.8h, v7.8h
; CHECK-NEXT: mvn v25.16b, v24.16b
; CHECK-NEXT: eor v0.16b, v4.16b, v0.16b
; CHECK-NEXT: cmgt v4.8h, v5.8h, #0
; CHECK-NEXT: cmgt v1.8h, v1.8h, v19.8h
; CHECK-NEXT: bsl v22.16b, v24.16b, v25.16b
; CHECK-NEXT: cmlt v24.8h, v23.8h, #0
; CHECK-NEXT: eor v1.16b, v4.16b, v1.16b
; CHECK-NEXT: cmgt v4.8h, v6.8h, #0
; CHECK-NEXT: cmgt v2.8h, v2.8h, v21.8h
; CHECK-NEXT: mvni v17.8h, #128, lsl #8
; CHECK-NEXT: mvn v25.16b, v24.16b
; CHECK-NEXT: eor v2.16b, v4.16b, v2.16b
; CHECK-NEXT: cmgt v4.8h, v7.8h, #0
; CHECK-NEXT: cmgt v3.8h, v3.8h, v23.8h
; CHECK-NEXT: bsl v17.16b, v24.16b, v25.16b
; CHECK-NEXT: cmge v4.8h, v4.8h, #0
; CHECK-NEXT: cmge v0.8h, v0.8h, #0
; CHECK-NEXT: cmge v24.8h, v16.8h, #0
; CHECK-NEXT: cmge v5.8h, v5.8h, #0
; CHECK-NEXT: cmge v1.8h, v1.8h, #0
; CHECK-NEXT: cmeq v4.8h, v0.8h, v4.8h
; CHECK-NEXT: cmeq v0.8h, v0.8h, v24.8h
; CHECK-NEXT: cmge v24.8h, v19.8h, #0
; CHECK-NEXT: cmge v6.8h, v6.8h, #0
; CHECK-NEXT: cmge v2.8h, v2.8h, #0
; CHECK-NEXT: cmeq v5.8h, v1.8h, v5.8h
; CHECK-NEXT: cmeq v1.8h, v1.8h, v24.8h
; CHECK-NEXT: cmge v24.8h, v21.8h, #0
; CHECK-NEXT: mvn v4.16b, v4.16b
; CHECK-NEXT: mvn v0.16b, v0.16b
; CHECK-NEXT: cmge v7.8h, v7.8h, #0
; CHECK-NEXT: cmge v3.8h, v3.8h, #0
; CHECK-NEXT: cmeq v6.8h, v2.8h, v6.8h
; CHECK-NEXT: cmeq v2.8h, v2.8h, v24.8h
; CHECK-NEXT: cmge v24.8h, v23.8h, #0
; CHECK-NEXT: and v0.16b, v4.16b, v0.16b
; CHECK-NEXT: mvn v4.16b, v5.16b
; CHECK-NEXT: mvn v1.16b, v1.16b
; CHECK-NEXT: cmeq v7.8h, v3.8h, v7.8h
; CHECK-NEXT: cmeq v3.8h, v3.8h, v24.8h
; CHECK-NEXT: and v1.16b, v4.16b, v1.16b
; CHECK-NEXT: mvn v4.16b, v6.16b
; CHECK-NEXT: mvn v2.16b, v2.16b
; CHECK-NEXT: and v2.16b, v4.16b, v2.16b
; CHECK-NEXT: mvn v4.16b, v7.16b
; CHECK-NEXT: mvn v3.16b, v3.16b
; CHECK-NEXT: and v3.16b, v4.16b, v3.16b
; CHECK-NEXT: eor v3.16b, v4.16b, v3.16b
; CHECK-NEXT: bsl v0.16b, v18.16b, v16.16b
; CHECK-NEXT: bsl v1.16b, v20.16b, v19.16b
; CHECK-NEXT: bsl v2.16b, v22.16b, v21.16b
@ -284,17 +214,12 @@ define void @v8i8(<8 x i8>* %px, <8 x i8>* %py, <8 x i8>* %pz) nounwind {
; CHECK-NEXT: ldr d1, [x1]
; CHECK-NEXT: movi v2.8b, #127
; CHECK-NEXT: sub v3.8b, v0.8b, v1.8b
; CHECK-NEXT: cmge v1.8b, v1.8b, #0
; CHECK-NEXT: cmge v0.8b, v0.8b, #0
; CHECK-NEXT: cmge v5.8b, v3.8b, #0
; CHECK-NEXT: cmlt v4.8b, v3.8b, #0
; CHECK-NEXT: cmeq v1.8b, v0.8b, v1.8b
; CHECK-NEXT: cmeq v0.8b, v0.8b, v5.8b
; CHECK-NEXT: cmgt v1.8b, v1.8b, #0
; CHECK-NEXT: cmgt v0.8b, v0.8b, v3.8b
; CHECK-NEXT: mvn v5.8b, v4.8b
; CHECK-NEXT: mvn v1.8b, v1.8b
; CHECK-NEXT: mvn v0.8b, v0.8b
; CHECK-NEXT: bsl v2.8b, v4.8b, v5.8b
; CHECK-NEXT: and v0.8b, v1.8b, v0.8b
; CHECK-NEXT: eor v0.8b, v1.8b, v0.8b
; CHECK-NEXT: bsl v0.8b, v2.8b, v3.8b
; CHECK-NEXT: str d0, [x2]
; CHECK-NEXT: ret
@ -327,18 +252,13 @@ define void @v4i8(<4 x i8>* %px, <4 x i8>* %py, <4 x i8>* %pz) nounwind {
; CHECK-NEXT: shl v1.4h, v1.4h, #8
; CHECK-NEXT: shl v0.4h, v0.4h, #8
; CHECK-NEXT: sub v3.4h, v0.4h, v1.4h
; CHECK-NEXT: cmge v1.4h, v1.4h, #0
; CHECK-NEXT: cmge v0.4h, v0.4h, #0
; CHECK-NEXT: cmge v5.4h, v3.4h, #0
; CHECK-NEXT: cmlt v4.4h, v3.4h, #0
; CHECK-NEXT: cmeq v1.4h, v0.4h, v1.4h
; CHECK-NEXT: cmeq v0.4h, v0.4h, v5.4h
; CHECK-NEXT: mvni v2.4h, #128, lsl #8
; CHECK-NEXT: cmgt v1.4h, v1.4h, #0
; CHECK-NEXT: cmgt v0.4h, v0.4h, v3.4h
; CHECK-NEXT: mvn v5.8b, v4.8b
; CHECK-NEXT: mvn v1.8b, v1.8b
; CHECK-NEXT: mvn v0.8b, v0.8b
; CHECK-NEXT: bsl v2.8b, v4.8b, v5.8b
; CHECK-NEXT: and v0.8b, v1.8b, v0.8b
; CHECK-NEXT: eor v0.8b, v1.8b, v0.8b
; CHECK-NEXT: bsl v0.8b, v2.8b, v3.8b
; CHECK-NEXT: sshr v0.4h, v0.4h, #8
; CHECK-NEXT: xtn v0.8b, v0.8h
@ -365,18 +285,13 @@ define void @v2i8(<2 x i8>* %px, <2 x i8>* %py, <2 x i8>* %pz) nounwind {
; CHECK-NEXT: shl v2.2s, v2.2s, #24
; CHECK-NEXT: shl v0.2s, v0.2s, #24
; CHECK-NEXT: sub v3.2s, v0.2s, v2.2s
; CHECK-NEXT: cmge v2.2s, v2.2s, #0
; CHECK-NEXT: cmge v0.2s, v0.2s, #0
; CHECK-NEXT: cmge v5.2s, v3.2s, #0
; CHECK-NEXT: cmlt v4.2s, v3.2s, #0
; CHECK-NEXT: cmeq v2.2s, v0.2s, v2.2s
; CHECK-NEXT: cmeq v0.2s, v0.2s, v5.2s
; CHECK-NEXT: mvni v1.2s, #128, lsl #24
; CHECK-NEXT: cmgt v2.2s, v2.2s, #0
; CHECK-NEXT: cmgt v0.2s, v0.2s, v3.2s
; CHECK-NEXT: mvn v5.8b, v4.8b
; CHECK-NEXT: mvn v2.8b, v2.8b
; CHECK-NEXT: mvn v0.8b, v0.8b
; CHECK-NEXT: eor v0.8b, v2.8b, v0.8b
; CHECK-NEXT: bsl v1.8b, v4.8b, v5.8b
; CHECK-NEXT: and v0.8b, v2.8b, v0.8b
; CHECK-NEXT: bsl v0.8b, v1.8b, v3.8b
; CHECK-NEXT: ushr v0.2s, v0.2s, #24
; CHECK-NEXT: mov w8, v0.s[1]
@ -398,17 +313,12 @@ define void @v4i16(<4 x i16>* %px, <4 x i16>* %py, <4 x i16>* %pz) nounwind {
; CHECK-NEXT: ldr d1, [x1]
; CHECK-NEXT: mvni v2.4h, #128, lsl #8
; CHECK-NEXT: sub v3.4h, v0.4h, v1.4h
; CHECK-NEXT: cmge v1.4h, v1.4h, #0
; CHECK-NEXT: cmge v0.4h, v0.4h, #0
; CHECK-NEXT: cmge v5.4h, v3.4h, #0
; CHECK-NEXT: cmlt v4.4h, v3.4h, #0
; CHECK-NEXT: cmeq v1.4h, v0.4h, v1.4h
; CHECK-NEXT: cmeq v0.4h, v0.4h, v5.4h
; CHECK-NEXT: cmgt v1.4h, v1.4h, #0
; CHECK-NEXT: cmgt v0.4h, v0.4h, v3.4h
; CHECK-NEXT: mvn v5.8b, v4.8b
; CHECK-NEXT: mvn v1.8b, v1.8b
; CHECK-NEXT: mvn v0.8b, v0.8b
; CHECK-NEXT: bsl v2.8b, v4.8b, v5.8b
; CHECK-NEXT: and v0.8b, v1.8b, v0.8b
; CHECK-NEXT: eor v0.8b, v1.8b, v0.8b
; CHECK-NEXT: bsl v0.8b, v2.8b, v3.8b
; CHECK-NEXT: str d0, [x2]
; CHECK-NEXT: ret
@ -433,18 +343,13 @@ define void @v2i16(<2 x i16>* %px, <2 x i16>* %py, <2 x i16>* %pz) nounwind {
; CHECK-NEXT: shl v2.2s, v2.2s, #16
; CHECK-NEXT: shl v0.2s, v0.2s, #16
; CHECK-NEXT: sub v3.2s, v0.2s, v2.2s
; CHECK-NEXT: cmge v2.2s, v2.2s, #0
; CHECK-NEXT: cmge v0.2s, v0.2s, #0
; CHECK-NEXT: cmge v5.2s, v3.2s, #0
; CHECK-NEXT: cmlt v4.2s, v3.2s, #0
; CHECK-NEXT: cmeq v2.2s, v0.2s, v2.2s
; CHECK-NEXT: cmeq v0.2s, v0.2s, v5.2s
; CHECK-NEXT: mvni v1.2s, #128, lsl #24
; CHECK-NEXT: cmgt v2.2s, v2.2s, #0
; CHECK-NEXT: cmgt v0.2s, v0.2s, v3.2s
; CHECK-NEXT: mvn v5.8b, v4.8b
; CHECK-NEXT: mvn v2.8b, v2.8b
; CHECK-NEXT: mvn v0.8b, v0.8b
; CHECK-NEXT: eor v0.8b, v2.8b, v0.8b
; CHECK-NEXT: bsl v1.8b, v4.8b, v5.8b
; CHECK-NEXT: and v0.8b, v2.8b, v0.8b
; CHECK-NEXT: bsl v0.8b, v1.8b, v3.8b
; CHECK-NEXT: ushr v0.2s, v0.2s, #16
; CHECK-NEXT: mov w8, v0.s[1]
@ -463,18 +368,13 @@ define <12 x i8> @v12i8(<12 x i8> %x, <12 x i8> %y) nounwind {
; CHECK-LABEL: v12i8:
; CHECK: // %bb.0:
; CHECK-NEXT: sub v2.16b, v0.16b, v1.16b
; CHECK-NEXT: cmge v1.16b, v1.16b, #0
; CHECK-NEXT: cmge v0.16b, v0.16b, #0
; CHECK-NEXT: cmge v5.16b, v2.16b, #0
; CHECK-NEXT: cmlt v4.16b, v2.16b, #0
; CHECK-NEXT: cmeq v1.16b, v0.16b, v1.16b
; CHECK-NEXT: cmeq v0.16b, v0.16b, v5.16b
; CHECK-NEXT: movi v3.16b, #127
; CHECK-NEXT: cmgt v1.16b, v1.16b, #0
; CHECK-NEXT: cmgt v0.16b, v0.16b, v2.16b
; CHECK-NEXT: mvn v5.16b, v4.16b
; CHECK-NEXT: mvn v1.16b, v1.16b
; CHECK-NEXT: mvn v0.16b, v0.16b
; CHECK-NEXT: bsl v3.16b, v4.16b, v5.16b
; CHECK-NEXT: and v0.16b, v1.16b, v0.16b
; CHECK-NEXT: eor v0.16b, v1.16b, v0.16b
; CHECK-NEXT: bsl v0.16b, v3.16b, v2.16b
; CHECK-NEXT: ret
%z = call <12 x i8> @llvm.ssub.sat.v12i8(<12 x i8> %x, <12 x i8> %y)
@ -489,29 +389,19 @@ define void @v12i16(<12 x i16>* %px, <12 x i16>* %py, <12 x i16>* %pz) nounwind
; CHECK-NEXT: mvni v5.8h, #128, lsl #8
; CHECK-NEXT: mvni v4.8h, #128, lsl #8
; CHECK-NEXT: sub v6.8h, v1.8h, v2.8h
; CHECK-NEXT: cmlt v16.8h, v6.8h, #0
; CHECK-NEXT: cmlt v7.8h, v6.8h, #0
; CHECK-NEXT: mvn v16.16b, v7.16b
; CHECK-NEXT: bsl v5.16b, v7.16b, v16.16b
; CHECK-NEXT: sub v7.8h, v0.8h, v3.8h
; CHECK-NEXT: mvn v17.16b, v16.16b
; CHECK-NEXT: bsl v5.16b, v16.16b, v17.16b
; CHECK-NEXT: cmgt v2.8h, v2.8h, #0
; CHECK-NEXT: cmgt v1.8h, v1.8h, v6.8h
; CHECK-NEXT: cmlt v16.8h, v7.8h, #0
; CHECK-NEXT: mvn v17.16b, v16.16b
; CHECK-NEXT: bsl v4.16b, v16.16b, v17.16b
; CHECK-NEXT: cmge v2.8h, v2.8h, #0
; CHECK-NEXT: cmge v1.8h, v1.8h, #0
; CHECK-NEXT: cmge v16.8h, v6.8h, #0
; CHECK-NEXT: cmge v3.8h, v3.8h, #0
; CHECK-NEXT: cmge v0.8h, v0.8h, #0
; CHECK-NEXT: cmeq v2.8h, v1.8h, v2.8h
; CHECK-NEXT: cmeq v1.8h, v1.8h, v16.8h
; CHECK-NEXT: cmge v16.8h, v7.8h, #0
; CHECK-NEXT: cmeq v3.8h, v0.8h, v3.8h
; CHECK-NEXT: cmeq v0.8h, v0.8h, v16.8h
; CHECK-NEXT: mvn v2.16b, v2.16b
; CHECK-NEXT: mvn v3.16b, v3.16b
; CHECK-NEXT: mvn v1.16b, v1.16b
; CHECK-NEXT: mvn v0.16b, v0.16b
; CHECK-NEXT: and v1.16b, v2.16b, v1.16b
; CHECK-NEXT: and v0.16b, v3.16b, v0.16b
; CHECK-NEXT: cmgt v3.8h, v3.8h, #0
; CHECK-NEXT: cmgt v0.8h, v0.8h, v7.8h
; CHECK-NEXT: eor v1.16b, v2.16b, v1.16b
; CHECK-NEXT: mvn v2.16b, v16.16b
; CHECK-NEXT: eor v0.16b, v3.16b, v0.16b
; CHECK-NEXT: bsl v4.16b, v16.16b, v2.16b
; CHECK-NEXT: bsl v1.16b, v5.16b, v6.16b
; CHECK-NEXT: bsl v0.16b, v4.16b, v7.16b
; CHECK-NEXT: str q0, [x2]
@ -531,17 +421,12 @@ define void @v1i8(<1 x i8>* %px, <1 x i8>* %py, <1 x i8>* %pz) nounwind {
; CHECK-NEXT: ldr b1, [x1]
; CHECK-NEXT: movi v2.8b, #127
; CHECK-NEXT: sub v3.8b, v0.8b, v1.8b
; CHECK-NEXT: cmge v1.8b, v1.8b, #0
; CHECK-NEXT: cmge v0.8b, v0.8b, #0
; CHECK-NEXT: cmge v5.8b, v3.8b, #0
; CHECK-NEXT: cmlt v4.8b, v3.8b, #0
; CHECK-NEXT: cmeq v1.8b, v0.8b, v1.8b
; CHECK-NEXT: cmeq v0.8b, v0.8b, v5.8b
; CHECK-NEXT: cmgt v1.8b, v1.8b, #0
; CHECK-NEXT: cmgt v0.8b, v0.8b, v3.8b
; CHECK-NEXT: mvn v5.8b, v4.8b
; CHECK-NEXT: mvn v1.8b, v1.8b
; CHECK-NEXT: mvn v0.8b, v0.8b
; CHECK-NEXT: bsl v2.8b, v4.8b, v5.8b
; CHECK-NEXT: and v0.8b, v1.8b, v0.8b
; CHECK-NEXT: eor v0.8b, v1.8b, v0.8b
; CHECK-NEXT: bsl v0.8b, v2.8b, v3.8b
; CHECK-NEXT: st1 { v0.b }[0], [x2]
; CHECK-NEXT: ret
@ -559,17 +444,12 @@ define void @v1i16(<1 x i16>* %px, <1 x i16>* %py, <1 x i16>* %pz) nounwind {
; CHECK-NEXT: ldr h1, [x1]
; CHECK-NEXT: mvni v2.4h, #128, lsl #8
; CHECK-NEXT: sub v3.4h, v0.4h, v1.4h
; CHECK-NEXT: cmge v1.4h, v1.4h, #0
; CHECK-NEXT: cmge v0.4h, v0.4h, #0
; CHECK-NEXT: cmge v5.4h, v3.4h, #0
; CHECK-NEXT: cmlt v4.4h, v3.4h, #0
; CHECK-NEXT: cmeq v1.4h, v0.4h, v1.4h
; CHECK-NEXT: cmeq v0.4h, v0.4h, v5.4h
; CHECK-NEXT: cmgt v1.4h, v1.4h, #0
; CHECK-NEXT: cmgt v0.4h, v0.4h, v3.4h
; CHECK-NEXT: mvn v5.8b, v4.8b
; CHECK-NEXT: mvn v1.8b, v1.8b
; CHECK-NEXT: mvn v0.8b, v0.8b
; CHECK-NEXT: bsl v2.8b, v4.8b, v5.8b
; CHECK-NEXT: and v0.8b, v1.8b, v0.8b
; CHECK-NEXT: eor v0.8b, v1.8b, v0.8b
; CHECK-NEXT: bsl v0.8b, v2.8b, v3.8b
; CHECK-NEXT: str h0, [x2]
; CHECK-NEXT: ret
@ -586,18 +466,13 @@ define <16 x i4> @v16i4(<16 x i4> %x, <16 x i4> %y) nounwind {
; CHECK-NEXT: shl v1.16b, v1.16b, #4
; CHECK-NEXT: shl v0.16b, v0.16b, #4
; CHECK-NEXT: sub v3.16b, v0.16b, v1.16b
; CHECK-NEXT: cmge v1.16b, v1.16b, #0
; CHECK-NEXT: cmge v0.16b, v0.16b, #0
; CHECK-NEXT: cmge v5.16b, v3.16b, #0
; CHECK-NEXT: cmlt v4.16b, v3.16b, #0
; CHECK-NEXT: cmeq v1.16b, v0.16b, v1.16b
; CHECK-NEXT: cmeq v0.16b, v0.16b, v5.16b
; CHECK-NEXT: movi v2.16b, #127
; CHECK-NEXT: cmgt v1.16b, v1.16b, #0
; CHECK-NEXT: cmgt v0.16b, v0.16b, v3.16b
; CHECK-NEXT: mvn v5.16b, v4.16b
; CHECK-NEXT: mvn v1.16b, v1.16b
; CHECK-NEXT: mvn v0.16b, v0.16b
; CHECK-NEXT: bsl v2.16b, v4.16b, v5.16b
; CHECK-NEXT: and v0.16b, v1.16b, v0.16b
; CHECK-NEXT: eor v0.16b, v1.16b, v0.16b
; CHECK-NEXT: bsl v0.16b, v2.16b, v3.16b
; CHECK-NEXT: sshr v0.16b, v0.16b, #4
; CHECK-NEXT: ret
@ -611,18 +486,13 @@ define <16 x i1> @v16i1(<16 x i1> %x, <16 x i1> %y) nounwind {
; CHECK-NEXT: shl v1.16b, v1.16b, #7
; CHECK-NEXT: shl v0.16b, v0.16b, #7
; CHECK-NEXT: sub v3.16b, v0.16b, v1.16b
; CHECK-NEXT: cmge v1.16b, v1.16b, #0
; CHECK-NEXT: cmge v0.16b, v0.16b, #0
; CHECK-NEXT: cmge v5.16b, v3.16b, #0
; CHECK-NEXT: cmlt v4.16b, v3.16b, #0
; CHECK-NEXT: cmeq v1.16b, v0.16b, v1.16b
; CHECK-NEXT: cmeq v0.16b, v0.16b, v5.16b
; CHECK-NEXT: movi v2.16b, #127
; CHECK-NEXT: cmgt v1.16b, v1.16b, #0
; CHECK-NEXT: cmgt v0.16b, v0.16b, v3.16b
; CHECK-NEXT: mvn v5.16b, v4.16b
; CHECK-NEXT: mvn v1.16b, v1.16b
; CHECK-NEXT: mvn v0.16b, v0.16b
; CHECK-NEXT: bsl v2.16b, v4.16b, v5.16b
; CHECK-NEXT: and v0.16b, v1.16b, v0.16b
; CHECK-NEXT: eor v0.16b, v1.16b, v0.16b
; CHECK-NEXT: bsl v0.16b, v2.16b, v3.16b
; CHECK-NEXT: sshr v0.16b, v0.16b, #7
; CHECK-NEXT: ret
@ -634,18 +504,13 @@ define <2 x i32> @v2i32(<2 x i32> %x, <2 x i32> %y) nounwind {
; CHECK-LABEL: v2i32:
; CHECK: // %bb.0:
; CHECK-NEXT: sub v2.2s, v0.2s, v1.2s
; CHECK-NEXT: cmge v1.2s, v1.2s, #0
; CHECK-NEXT: cmge v0.2s, v0.2s, #0
; CHECK-NEXT: cmge v5.2s, v2.2s, #0
; CHECK-NEXT: cmlt v4.2s, v2.2s, #0
; CHECK-NEXT: cmeq v1.2s, v0.2s, v1.2s
; CHECK-NEXT: cmeq v0.2s, v0.2s, v5.2s
; CHECK-NEXT: mvni v3.2s, #128, lsl #24
; CHECK-NEXT: cmgt v1.2s, v1.2s, #0
; CHECK-NEXT: cmgt v0.2s, v0.2s, v2.2s
; CHECK-NEXT: mvn v5.8b, v4.8b
; CHECK-NEXT: mvn v1.8b, v1.8b
; CHECK-NEXT: mvn v0.8b, v0.8b
; CHECK-NEXT: bsl v3.8b, v4.8b, v5.8b
; CHECK-NEXT: and v0.8b, v1.8b, v0.8b
; CHECK-NEXT: eor v0.8b, v1.8b, v0.8b
; CHECK-NEXT: bsl v0.8b, v3.8b, v2.8b
; CHECK-NEXT: ret
%z = call <2 x i32> @llvm.ssub.sat.v2i32(<2 x i32> %x, <2 x i32> %y)
@ -656,18 +521,13 @@ define <4 x i32> @v4i32(<4 x i32> %x, <4 x i32> %y) nounwind {
; CHECK-LABEL: v4i32:
; CHECK: // %bb.0:
; CHECK-NEXT: sub v2.4s, v0.4s, v1.4s
; CHECK-NEXT: cmge v1.4s, v1.4s, #0
; CHECK-NEXT: cmge v0.4s, v0.4s, #0
; CHECK-NEXT: cmge v5.4s, v2.4s, #0
; CHECK-NEXT: cmlt v4.4s, v2.4s, #0
; CHECK-NEXT: cmeq v1.4s, v0.4s, v1.4s
; CHECK-NEXT: cmeq v0.4s, v0.4s, v5.4s
; CHECK-NEXT: mvni v3.4s, #128, lsl #24
; CHECK-NEXT: cmgt v1.4s, v1.4s, #0
; CHECK-NEXT: cmgt v0.4s, v0.4s, v2.4s
; CHECK-NEXT: mvn v5.16b, v4.16b
; CHECK-NEXT: mvn v1.16b, v1.16b
; CHECK-NEXT: mvn v0.16b, v0.16b
; CHECK-NEXT: bsl v3.16b, v4.16b, v5.16b
; CHECK-NEXT: and v0.16b, v1.16b, v0.16b
; CHECK-NEXT: eor v0.16b, v1.16b, v0.16b
; CHECK-NEXT: bsl v0.16b, v3.16b, v2.16b
; CHECK-NEXT: ret
%z = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> %x, <4 x i32> %y)
@ -678,31 +538,21 @@ define <8 x i32> @v8i32(<8 x i32> %x, <8 x i32> %y) nounwind {
; CHECK-LABEL: v8i32:
; CHECK: // %bb.0:
; CHECK-NEXT: sub v4.4s, v0.4s, v2.4s
; CHECK-NEXT: cmlt v16.4s, v4.4s, #0
; CHECK-NEXT: cmlt v7.4s, v4.4s, #0
; CHECK-NEXT: mvni v6.4s, #128, lsl #24
; CHECK-NEXT: mvn v16.16b, v7.16b
; CHECK-NEXT: bsl v6.16b, v7.16b, v16.16b
; CHECK-NEXT: sub v7.4s, v1.4s, v3.4s
; CHECK-NEXT: mvn v17.16b, v16.16b
; CHECK-NEXT: bsl v6.16b, v16.16b, v17.16b
; CHECK-NEXT: cmgt v2.4s, v2.4s, #0
; CHECK-NEXT: cmgt v0.4s, v0.4s, v4.4s
; CHECK-NEXT: cmlt v16.4s, v7.4s, #0
; CHECK-NEXT: mvni v5.4s, #128, lsl #24
; CHECK-NEXT: mvn v17.16b, v16.16b
; CHECK-NEXT: bsl v5.16b, v16.16b, v17.16b
; CHECK-NEXT: cmge v2.4s, v2.4s, #0
; CHECK-NEXT: cmge v0.4s, v0.4s, #0
; CHECK-NEXT: cmge v16.4s, v4.4s, #0
; CHECK-NEXT: cmge v3.4s, v3.4s, #0
; CHECK-NEXT: cmge v1.4s, v1.4s, #0
; CHECK-NEXT: cmeq v2.4s, v0.4s, v2.4s
; CHECK-NEXT: cmeq v0.4s, v0.4s, v16.4s
; CHECK-NEXT: cmge v16.4s, v7.4s, #0
; CHECK-NEXT: cmeq v3.4s, v1.4s, v3.4s
; CHECK-NEXT: cmeq v1.4s, v1.4s, v16.4s
; CHECK-NEXT: mvn v2.16b, v2.16b
; CHECK-NEXT: mvn v3.16b, v3.16b
; CHECK-NEXT: mvn v0.16b, v0.16b
; CHECK-NEXT: mvn v1.16b, v1.16b
; CHECK-NEXT: and v0.16b, v2.16b, v0.16b
; CHECK-NEXT: and v1.16b, v3.16b, v1.16b
; CHECK-NEXT: cmgt v3.4s, v3.4s, #0
; CHECK-NEXT: cmgt v1.4s, v1.4s, v7.4s
; CHECK-NEXT: eor v0.16b, v2.16b, v0.16b
; CHECK-NEXT: mvn v2.16b, v16.16b
; CHECK-NEXT: eor v1.16b, v3.16b, v1.16b
; CHECK-NEXT: bsl v5.16b, v16.16b, v2.16b
; CHECK-NEXT: bsl v0.16b, v6.16b, v4.16b
; CHECK-NEXT: bsl v1.16b, v5.16b, v7.16b
; CHECK-NEXT: ret
@ -725,46 +575,26 @@ define <16 x i32> @v16i32(<16 x i32> %x, <16 x i32> %y) nounwind {
; CHECK-NEXT: mvn v25.16b, v24.16b
; CHECK-NEXT: bsl v20.16b, v24.16b, v25.16b
; CHECK-NEXT: cmlt v24.4s, v21.4s, #0
; CHECK-NEXT: cmgt v4.4s, v4.4s, #0
; CHECK-NEXT: cmgt v0.4s, v0.4s, v16.4s
; CHECK-NEXT: mvni v22.4s, #128, lsl #24
; CHECK-NEXT: sub v23.4s, v3.4s, v7.4s
; CHECK-NEXT: mvn v25.16b, v24.16b
; CHECK-NEXT: eor v0.16b, v4.16b, v0.16b
; CHECK-NEXT: cmgt v4.4s, v5.4s, #0
; CHECK-NEXT: cmgt v1.4s, v1.4s, v19.4s
; CHECK-NEXT: bsl v22.16b, v24.16b, v25.16b
; CHECK-NEXT: cmlt v24.4s, v23.4s, #0
; CHECK-NEXT: eor v1.16b, v4.16b, v1.16b
; CHECK-NEXT: cmgt v4.4s, v6.4s, #0
; CHECK-NEXT: cmgt v2.4s, v2.4s, v21.4s
; CHECK-NEXT: mvni v17.4s, #128, lsl #24
; CHECK-NEXT: mvn v25.16b, v24.16b
; CHECK-NEXT: eor v2.16b, v4.16b, v2.16b
; CHECK-NEXT: cmgt v4.4s, v7.4s, #0
; CHECK-NEXT: cmgt v3.4s, v3.4s, v23.4s
; CHECK-NEXT: bsl v17.16b, v24.16b, v25.16b
; CHECK-NEXT: cmge v4.4s, v4.4s, #0
; CHECK-NEXT: cmge v0.4s, v0.4s, #0
; CHECK-NEXT: cmge v24.4s, v16.4s, #0
; CHECK-NEXT: cmge v5.4s, v5.4s, #0
; CHECK-NEXT: cmge v1.4s, v1.4s, #0
; CHECK-NEXT: cmeq v4.4s, v0.4s, v4.4s
; CHECK-NEXT: cmeq v0.4s, v0.4s, v24.4s
; CHECK-NEXT: cmge v24.4s, v19.4s, #0
; CHECK-NEXT: cmge v6.4s, v6.4s, #0
; CHECK-NEXT: cmge v2.4s, v2.4s, #0
; CHECK-NEXT: cmeq v5.4s, v1.4s, v5.4s
; CHECK-NEXT: cmeq v1.4s, v1.4s, v24.4s
; CHECK-NEXT: cmge v24.4s, v21.4s, #0
; CHECK-NEXT: mvn v4.16b, v4.16b
; CHECK-NEXT: mvn v0.16b, v0.16b
; CHECK-NEXT: cmge v7.4s, v7.4s, #0
; CHECK-NEXT: cmge v3.4s, v3.4s, #0
; CHECK-NEXT: cmeq v6.4s, v2.4s, v6.4s
; CHECK-NEXT: cmeq v2.4s, v2.4s, v24.4s
; CHECK-NEXT: cmge v24.4s, v23.4s, #0
; CHECK-NEXT: and v0.16b, v4.16b, v0.16b
; CHECK-NEXT: mvn v4.16b, v5.16b
; CHECK-NEXT: mvn v1.16b, v1.16b
; CHECK-NEXT: cmeq v7.4s, v3.4s, v7.4s
; CHECK-NEXT: cmeq v3.4s, v3.4s, v24.4s
; CHECK-NEXT: and v1.16b, v4.16b, v1.16b
; CHECK-NEXT: mvn v4.16b, v6.16b
; CHECK-NEXT: mvn v2.16b, v2.16b
; CHECK-NEXT: and v2.16b, v4.16b, v2.16b
; CHECK-NEXT: mvn v4.16b, v7.16b
; CHECK-NEXT: mvn v3.16b, v3.16b
; CHECK-NEXT: and v3.16b, v4.16b, v3.16b
; CHECK-NEXT: eor v3.16b, v4.16b, v3.16b
; CHECK-NEXT: bsl v0.16b, v18.16b, v16.16b
; CHECK-NEXT: bsl v1.16b, v20.16b, v19.16b
; CHECK-NEXT: bsl v2.16b, v22.16b, v21.16b
@ -778,19 +608,14 @@ define <2 x i64> @v2i64(<2 x i64> %x, <2 x i64> %y) nounwind {
; CHECK-LABEL: v2i64:
; CHECK: // %bb.0:
; CHECK-NEXT: sub v2.2d, v0.2d, v1.2d
; CHECK-NEXT: cmge v1.2d, v1.2d, #0
; CHECK-NEXT: cmge v0.2d, v0.2d, #0
; CHECK-NEXT: cmge v5.2d, v2.2d, #0
; CHECK-NEXT: mov x8, #9223372036854775807
; CHECK-NEXT: cmlt v3.2d, v2.2d, #0
; CHECK-NEXT: cmeq v1.2d, v0.2d, v1.2d
; CHECK-NEXT: cmeq v0.2d, v0.2d, v5.2d
; CHECK-NEXT: cmgt v1.2d, v1.2d, #0
; CHECK-NEXT: dup v4.2d, x8
; CHECK-NEXT: cmgt v0.2d, v0.2d, v2.2d
; CHECK-NEXT: mvn v5.16b, v3.16b
; CHECK-NEXT: mvn v1.16b, v1.16b
; CHECK-NEXT: mvn v0.16b, v0.16b
; CHECK-NEXT: bsl v4.16b, v3.16b, v5.16b
; CHECK-NEXT: and v0.16b, v1.16b, v0.16b
; CHECK-NEXT: eor v0.16b, v1.16b, v0.16b
; CHECK-NEXT: bsl v0.16b, v4.16b, v2.16b
; CHECK-NEXT: ret
%z = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> %x, <2 x i64> %y)
@ -802,33 +627,23 @@ define <4 x i64> @v4i64(<4 x i64> %x, <4 x i64> %y) nounwind {
; CHECK: // %bb.0:
; CHECK-NEXT: sub v4.2d, v0.2d, v2.2d
; CHECK-NEXT: mov x8, #9223372036854775807
; CHECK-NEXT: cmlt v6.2d, v4.2d, #0
; CHECK-NEXT: dup v7.2d, x8
; CHECK-NEXT: cmlt v5.2d, v4.2d, #0
; CHECK-NEXT: dup v6.2d, x8
; CHECK-NEXT: mvn v7.16b, v5.16b
; CHECK-NEXT: mov v16.16b, v6.16b
; CHECK-NEXT: bsl v16.16b, v5.16b, v7.16b
; CHECK-NEXT: sub v5.2d, v1.2d, v3.2d
; CHECK-NEXT: mvn v16.16b, v6.16b
; CHECK-NEXT: mov v17.16b, v7.16b
; CHECK-NEXT: bsl v17.16b, v6.16b, v16.16b
; CHECK-NEXT: cmlt v6.2d, v5.2d, #0
; CHECK-NEXT: mvn v16.16b, v6.16b
; CHECK-NEXT: bsl v7.16b, v6.16b, v16.16b
; CHECK-NEXT: cmge v2.2d, v2.2d, #0
; CHECK-NEXT: cmge v0.2d, v0.2d, #0
; CHECK-NEXT: cmge v6.2d, v4.2d, #0
; CHECK-NEXT: cmge v3.2d, v3.2d, #0
; CHECK-NEXT: cmge v1.2d, v1.2d, #0
; CHECK-NEXT: cmeq v2.2d, v0.2d, v2.2d
; CHECK-NEXT: cmeq v0.2d, v0.2d, v6.2d
; CHECK-NEXT: cmge v6.2d, v5.2d, #0
; CHECK-NEXT: cmeq v3.2d, v1.2d, v3.2d
; CHECK-NEXT: cmeq v1.2d, v1.2d, v6.2d
; CHECK-NEXT: mvn v2.16b, v2.16b
; CHECK-NEXT: mvn v3.16b, v3.16b
; CHECK-NEXT: mvn v0.16b, v0.16b
; CHECK-NEXT: mvn v1.16b, v1.16b
; CHECK-NEXT: and v0.16b, v2.16b, v0.16b
; CHECK-NEXT: and v1.16b, v3.16b, v1.16b
; CHECK-NEXT: bsl v0.16b, v17.16b, v4.16b
; CHECK-NEXT: bsl v1.16b, v7.16b, v5.16b
; CHECK-NEXT: cmgt v2.2d, v2.2d, #0
; CHECK-NEXT: cmgt v0.2d, v0.2d, v4.2d
; CHECK-NEXT: cmlt v7.2d, v5.2d, #0
; CHECK-NEXT: cmgt v3.2d, v3.2d, #0
; CHECK-NEXT: cmgt v1.2d, v1.2d, v5.2d
; CHECK-NEXT: eor v0.16b, v2.16b, v0.16b
; CHECK-NEXT: mvn v2.16b, v7.16b
; CHECK-NEXT: eor v1.16b, v3.16b, v1.16b
; CHECK-NEXT: bsl v6.16b, v7.16b, v2.16b
; CHECK-NEXT: bsl v0.16b, v16.16b, v4.16b
; CHECK-NEXT: bsl v1.16b, v6.16b, v5.16b
; CHECK-NEXT: ret
%z = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> %x, <4 x i64> %y)
ret <4 x i64> %z
@ -850,46 +665,26 @@ define <8 x i64> @v8i64(<8 x i64> %x, <8 x i64> %y) nounwind {
; CHECK-NEXT: bsl v25.16b, v20.16b, v24.16b
; CHECK-NEXT: mvn v20.16b, v22.16b
; CHECK-NEXT: mov v24.16b, v21.16b
; CHECK-NEXT: cmgt v4.2d, v4.2d, #0
; CHECK-NEXT: cmgt v0.2d, v0.2d, v16.2d
; CHECK-NEXT: sub v19.2d, v3.2d, v7.2d
; CHECK-NEXT: bsl v24.16b, v22.16b, v20.16b
; CHECK-NEXT: mvn v20.16b, v23.16b
; CHECK-NEXT: mov v22.16b, v21.16b
; CHECK-NEXT: eor v0.16b, v4.16b, v0.16b
; CHECK-NEXT: cmgt v4.2d, v5.2d, #0
; CHECK-NEXT: cmgt v1.2d, v1.2d, v17.2d
; CHECK-NEXT: bsl v22.16b, v23.16b, v20.16b
; CHECK-NEXT: cmlt v20.2d, v19.2d, #0
; CHECK-NEXT: eor v1.16b, v4.16b, v1.16b
; CHECK-NEXT: cmgt v4.2d, v6.2d, #0
; CHECK-NEXT: cmgt v2.2d, v2.2d, v18.2d
; CHECK-NEXT: mvn v23.16b, v20.16b
; CHECK-NEXT: eor v2.16b, v4.16b, v2.16b
; CHECK-NEXT: cmgt v4.2d, v7.2d, #0
; CHECK-NEXT: cmgt v3.2d, v3.2d, v19.2d
; CHECK-NEXT: bsl v21.16b, v20.16b, v23.16b
; CHECK-NEXT: cmge v4.2d, v4.2d, #0
; CHECK-NEXT: cmge v0.2d, v0.2d, #0
; CHECK-NEXT: cmge v20.2d, v16.2d, #0
; CHECK-NEXT: cmge v5.2d, v5.2d, #0
; CHECK-NEXT: cmge v1.2d, v1.2d, #0
; CHECK-NEXT: cmeq v4.2d, v0.2d, v4.2d
; CHECK-NEXT: cmeq v0.2d, v0.2d, v20.2d
; CHECK-NEXT: cmge v20.2d, v17.2d, #0
; CHECK-NEXT: cmge v6.2d, v6.2d, #0
; CHECK-NEXT: cmge v2.2d, v2.2d, #0
; CHECK-NEXT: cmeq v5.2d, v1.2d, v5.2d
; CHECK-NEXT: cmeq v1.2d, v1.2d, v20.2d
; CHECK-NEXT: cmge v20.2d, v18.2d, #0
; CHECK-NEXT: mvn v4.16b, v4.16b
; CHECK-NEXT: mvn v0.16b, v0.16b
; CHECK-NEXT: cmge v7.2d, v7.2d, #0
; CHECK-NEXT: cmge v3.2d, v3.2d, #0
; CHECK-NEXT: cmeq v6.2d, v2.2d, v6.2d
; CHECK-NEXT: cmeq v2.2d, v2.2d, v20.2d
; CHECK-NEXT: cmge v20.2d, v19.2d, #0
; CHECK-NEXT: and v0.16b, v4.16b, v0.16b
; CHECK-NEXT: mvn v4.16b, v5.16b
; CHECK-NEXT: mvn v1.16b, v1.16b
; CHECK-NEXT: cmeq v7.2d, v3.2d, v7.2d
; CHECK-NEXT: cmeq v3.2d, v3.2d, v20.2d
; CHECK-NEXT: and v1.16b, v4.16b, v1.16b
; CHECK-NEXT: mvn v4.16b, v6.16b
; CHECK-NEXT: mvn v2.16b, v2.16b
; CHECK-NEXT: and v2.16b, v4.16b, v2.16b
; CHECK-NEXT: mvn v4.16b, v7.16b
; CHECK-NEXT: mvn v3.16b, v3.16b
; CHECK-NEXT: and v3.16b, v4.16b, v3.16b
; CHECK-NEXT: eor v3.16b, v4.16b, v3.16b
; CHECK-NEXT: bsl v0.16b, v25.16b, v16.16b
; CHECK-NEXT: bsl v1.16b, v24.16b, v17.16b
; CHECK-NEXT: bsl v2.16b, v22.16b, v18.16b

View File

@ -13,29 +13,25 @@ declare { <2 x i32>, <2 x i1> } @llvm.sadd.with.overflow.v2i32(<2 x i32>, <2 x i
define amdgpu_kernel void @saddo_i64_zext(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
; SI-LABEL: saddo_i64_zext:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s4, s8
; SI-NEXT: v_cmp_gt_i64_e64 s[2:3], s[0:1], -1
; SI-NEXT: s_mov_b32 s5, s9
; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3]
; SI-NEXT: v_cmp_gt_i64_e64 s[2:3], s[10:11], -1
; SI-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[2:3]
; SI-NEXT: s_add_u32 s2, s10, s0
; SI-NEXT: s_addc_u32 s3, s11, s1
; SI-NEXT: v_cmp_gt_i64_e64 s[0:1], s[2:3], -1
; SI-NEXT: v_cmp_eq_u32_e32 vcc, v1, v0
; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; SI-NEXT: v_cmp_ne_u32_e64 s[0:1], v1, v0
; SI-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; SI-NEXT: v_mov_b32_e32 v1, s3
; SI-NEXT: v_add_i32_e32 v0, vcc, s2, v0
; SI-NEXT: v_mov_b32_e32 v0, s6
; SI-NEXT: s_add_u32 s10, s6, s8
; SI-NEXT: s_addc_u32 s11, s7, s9
; SI-NEXT: v_mov_b32_e32 v1, s7
; SI-NEXT: v_cmp_lt_i64_e32 vcc, s[10:11], v[0:1]
; SI-NEXT: v_cmp_lt_i64_e64 s[6:7], s[8:9], 0
; SI-NEXT: s_mov_b32 s0, s4
; SI-NEXT: s_mov_b32 s1, s5
; SI-NEXT: s_xor_b64 s[4:5], s[6:7], vcc
; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; SI-NEXT: v_mov_b32_e32 v1, s11
; SI-NEXT: v_add_i32_e32 v0, vcc, s10, v0
; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: saddo_i64_zext:
@ -43,22 +39,18 @@ define amdgpu_kernel void @saddo_i64_zext(i64 addrspace(1)* %out, i64 %a, i64 %b
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s6
; VI-NEXT: s_add_u32 s8, s6, s0
; VI-NEXT: s_addc_u32 s9, s7, s1
; VI-NEXT: v_mov_b32_e32 v2, s7
; VI-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[1:2]
; VI-NEXT: v_cmp_lt_i64_e64 s[2:3], s[0:1], 0
; VI-NEXT: v_mov_b32_e32 v3, s9
; VI-NEXT: s_xor_b64 s[0:1], s[2:3], vcc
; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
; VI-NEXT: v_add_u32_e32 v2, vcc, s8, v2
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_cmp_gt_i64_e64 s[2:3], s[0:1], -1
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[2:3]
; VI-NEXT: v_cmp_gt_i64_e64 s[2:3], s[6:7], -1
; VI-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[2:3]
; VI-NEXT: s_add_u32 s2, s6, s0
; VI-NEXT: s_addc_u32 s3, s7, s1
; VI-NEXT: v_cmp_gt_i64_e64 s[0:1], s[2:3], -1
; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
; VI-NEXT: v_cmp_ne_u32_e64 s[0:1], v3, v2
; VI-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: v_add_u32_e32 v2, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT: s_endpgm
@ -68,22 +60,18 @@ define amdgpu_kernel void @saddo_i64_zext(i64 addrspace(1)* %out, i64 %a, i64 %b
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s6
; GFX9-NEXT: s_add_u32 s8, s6, s0
; GFX9-NEXT: s_addc_u32 s9, s7, s1
; GFX9-NEXT: v_mov_b32_e32 v2, s7
; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[1:2]
; GFX9-NEXT: v_cmp_lt_i64_e64 s[2:3], s[0:1], 0
; GFX9-NEXT: v_mov_b32_e32 v3, s9
; GFX9-NEXT: s_xor_b64 s[0:1], s[2:3], vcc
; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s8, v2
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: v_cmp_gt_i64_e64 s[2:3], s[0:1], -1
; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[2:3]
; GFX9-NEXT: v_cmp_gt_i64_e64 s[2:3], s[6:7], -1
; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[2:3]
; GFX9-NEXT: s_add_u32 s2, s6, s0
; GFX9-NEXT: s_addc_u32 s3, s7, s1
; GFX9-NEXT: v_cmp_gt_i64_e64 s[0:1], s[2:3], -1
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], v3, v2
; GFX9-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
; GFX9-NEXT: v_mov_b32_e32 v3, s3
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v2
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT: global_store_dwordx2 v[0:1], v[2:3], off
; GFX9-NEXT: s_endpgm
@ -99,32 +87,27 @@ define amdgpu_kernel void @saddo_i64_zext(i64 addrspace(1)* %out, i64 %a, i64 %b
define amdgpu_kernel void @s_saddo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 %a, i32 %b) nounwind {
; SI-LABEL: s_saddo_i32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s4, s8
; SI-NEXT: v_cmp_gt_i32_e64 s[2:3], s1, -1
; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3]
; SI-NEXT: v_cmp_gt_i32_e64 s[2:3], s0, -1
; SI-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[2:3]
; SI-NEXT: s_add_i32 s2, s0, s1
; SI-NEXT: v_cmp_gt_i32_e64 s[0:1], s2, -1
; SI-NEXT: v_cmp_eq_u32_e32 vcc, v1, v0
; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; SI-NEXT: v_cmp_ne_u32_e64 s[0:1], v1, v0
; SI-NEXT: s_mov_b32 s5, s9
; SI-NEXT: v_mov_b32_e32 v0, s2
; SI-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
; SI-NEXT: s_mov_b32 s8, s10
; SI-NEXT: s_mov_b32 s9, s11
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_mov_b32 s10, s6
; SI-NEXT: s_mov_b32 s11, s7
; SI-NEXT: s_mov_b32 s0, s4
; SI-NEXT: v_cmp_lt_i32_e64 s[10:11], s9, 0
; SI-NEXT: s_add_i32 s9, s8, s9
; SI-NEXT: v_mov_b32_e32 v0, s8
; SI-NEXT: s_mov_b32 s1, s5
; SI-NEXT: v_cmp_lt_i32_e32 vcc, s9, v0
; SI-NEXT: v_mov_b32_e32 v0, s9
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_xor_b64 s[0:1], s[10:11], vcc
; SI-NEXT: s_mov_b32 s4, s6
; SI-NEXT: s_mov_b32 s5, s7
; SI-NEXT: s_mov_b32 s6, s2
; SI-NEXT: s_mov_b32 s7, s3
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; SI-NEXT: buffer_store_byte v0, off, s[8:11], 0
; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: s_saddo_i32:
@ -133,18 +116,13 @@ define amdgpu_kernel void @s_saddo_i32(i32 addrspace(1)* %out, i1 addrspace(1)*
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_cmp_gt_i32_e64 s[2:3], s1, -1
; VI-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[2:3]
; VI-NEXT: v_cmp_gt_i32_e64 s[2:3], s0, -1
; VI-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[2:3]
; VI-NEXT: s_add_i32 s2, s0, s1
; VI-NEXT: v_cmp_gt_i32_e64 s[0:1], s2, -1
; VI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
; VI-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[0:1]
; VI-NEXT: v_cmp_ne_u32_e64 s[0:1], v5, v4
; VI-NEXT: v_cmp_lt_i32_e64 s[2:3], s1, 0
; VI-NEXT: s_add_i32 s1, s0, s1
; VI-NEXT: v_mov_b32_e32 v4, s0
; VI-NEXT: v_cmp_lt_i32_e32 vcc, s1, v4
; VI-NEXT: v_mov_b32_e32 v4, s1
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_mov_b32_e32 v4, s2
; VI-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
; VI-NEXT: s_xor_b64 s[0:1], s[2:3], vcc
; VI-NEXT: flat_store_dword v[0:1], v4
; VI-NEXT: v_mov_b32_e32 v2, s6
; VI-NEXT: v_mov_b32_e32 v3, s7
@ -158,18 +136,13 @@ define amdgpu_kernel void @s_saddo_i32(i32 addrspace(1)* %out, i1 addrspace(1)*
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: v_cmp_gt_i32_e64 s[2:3], s1, -1
; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[2:3]
; GFX9-NEXT: v_cmp_gt_i32_e64 s[2:3], s0, -1
; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[2:3]
; GFX9-NEXT: s_add_i32 s2, s0, s1
; GFX9-NEXT: v_cmp_gt_i32_e64 s[0:1], s2, -1
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[0:1]
; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], v5, v4
; GFX9-NEXT: v_cmp_lt_i32_e64 s[2:3], s1, 0
; GFX9-NEXT: s_add_i32 s1, s0, s1
; GFX9-NEXT: v_mov_b32_e32 v4, s0
; GFX9-NEXT: v_cmp_lt_i32_e32 vcc, s1, v4
; GFX9-NEXT: v_mov_b32_e32 v4, s1
; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: v_mov_b32_e32 v4, s2
; GFX9-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT: s_xor_b64 s[0:1], s[2:3], vcc
; GFX9-NEXT: global_store_dword v[0:1], v4, off
; GFX9-NEXT: v_mov_b32_e32 v2, s6
; GFX9-NEXT: v_mov_b32_e32 v3, s7
@ -204,19 +177,12 @@ define amdgpu_kernel void @v_saddo_i32(i32 addrspace(1)* %out, i1 addrspace(1)*
; SI-NEXT: s_mov_b32 s6, s14
; SI-NEXT: s_mov_b32 s7, s15
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v1
; SI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v0
; SI-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; SI-NEXT: v_add_i32_e32 v0, vcc, v1, v0
; SI-NEXT: v_cmp_lt_i32_e64 s[0:1], -1, v0
; SI-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1]
; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
; SI-NEXT: v_cmp_ne_u32_e64 s[0:1], v3, v1
; SI-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_add_i32_e32 v2, vcc, v1, v0
; SI-NEXT: v_cmp_gt_i32_e32 vcc, 0, v1
; SI-NEXT: v_cmp_lt_i32_e64 s[0:1], v2, v0
; SI-NEXT: s_xor_b64 s[0:1], vcc, s[0:1]
; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0
; SI-NEXT: buffer_store_byte v0, off, s[12:15], 0
; SI-NEXT: s_endpgm
;
@ -235,17 +201,11 @@ define amdgpu_kernel void @v_saddo_i32(i32 addrspace(1)* %out, i1 addrspace(1)*
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v4
; VI-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
; VI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v6
; VI-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v6
; VI-NEXT: v_cmp_lt_i32_e64 s[0:1], -1, v4
; VI-NEXT: v_cmp_eq_u32_e32 vcc, v7, v5
; VI-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[0:1]
; VI-NEXT: v_cmp_ne_u32_e64 s[0:1], v7, v5
; VI-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
; VI-NEXT: flat_store_dword v[2:3], v4
; VI-NEXT: v_add_u32_e32 v5, vcc, v4, v6
; VI-NEXT: v_cmp_gt_i32_e32 vcc, 0, v4
; VI-NEXT: v_cmp_lt_i32_e64 s[0:1], v5, v6
; VI-NEXT: s_xor_b64 s[0:1], vcc, s[0:1]
; VI-NEXT: flat_store_dword v[2:3], v5
; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
; VI-NEXT: flat_store_byte v[0:1], v2
; VI-NEXT: s_endpgm
@ -265,17 +225,11 @@ define amdgpu_kernel void @v_saddo_i32(i32 addrspace(1)* %out, i1 addrspace(1)*
; GFX9-NEXT: v_mov_b32_e32 v0, s2
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cmp_lt_i32_e32 vcc, -1, v4
; GFX9-NEXT: v_add_u32_e32 v4, v6, v4
; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
; GFX9-NEXT: v_cmp_lt_i32_e32 vcc, -1, v6
; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
; GFX9-NEXT: v_cmp_lt_i32_e64 s[0:1], -1, v4
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v7, v5
; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[0:1]
; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], v7, v5
; GFX9-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT: global_store_dword v[2:3], v4, off
; GFX9-NEXT: v_add_u32_e32 v5, v6, v4
; GFX9-NEXT: v_cmp_gt_i32_e32 vcc, 0, v4
; GFX9-NEXT: v_cmp_lt_i32_e64 s[0:1], v5, v6
; GFX9-NEXT: s_xor_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT: global_store_dword v[2:3], v5, off
; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
; GFX9-NEXT: global_store_byte v[0:1], v2, off
; GFX9-NEXT: s_endpgm
@ -292,31 +246,27 @@ define amdgpu_kernel void @v_saddo_i32(i32 addrspace(1)* %out, i1 addrspace(1)*
define amdgpu_kernel void @s_saddo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 %a, i64 %b) nounwind {
; SI-LABEL: s_saddo_i64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s15, 0xf000
; SI-NEXT: s_mov_b32 s14, -1
; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s11, 0xf000
; SI-NEXT: s_mov_b32 s10, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_cmp_gt_i64_e64 s[0:1], s[10:11], -1
; SI-NEXT: s_add_u32 s2, s8, s10
; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; SI-NEXT: v_cmp_gt_i64_e64 s[0:1], s[8:9], -1
; SI-NEXT: s_addc_u32 s3, s9, s11
; SI-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1]
; SI-NEXT: v_cmp_gt_i64_e64 s[0:1], s[2:3], -1
; SI-NEXT: v_cmp_eq_u32_e32 vcc, v1, v0
; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; SI-NEXT: v_cmp_ne_u32_e64 s[0:1], v1, v0
; SI-NEXT: v_mov_b32_e32 v0, s2
; SI-NEXT: s_mov_b32 s12, s6
; SI-NEXT: s_mov_b32 s13, s7
; SI-NEXT: s_mov_b32 s6, s14
; SI-NEXT: s_mov_b32 s7, s15
; SI-NEXT: v_mov_b32_e32 v1, s3
; SI-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT: s_add_u32 s12, s4, s6
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: s_addc_u32 s13, s5, s7
; SI-NEXT: v_mov_b32_e32 v1, s5
; SI-NEXT: v_cmp_lt_i64_e32 vcc, s[12:13], v[0:1]
; SI-NEXT: v_mov_b32_e32 v0, s12
; SI-NEXT: v_cmp_lt_i64_e64 s[4:5], s[6:7], 0
; SI-NEXT: s_mov_b32 s8, s2
; SI-NEXT: s_mov_b32 s9, s3
; SI-NEXT: s_mov_b32 s2, s10
; SI-NEXT: s_mov_b32 s3, s11
; SI-NEXT: v_mov_b32_e32 v1, s13
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT: s_xor_b64 s[0:1], s[4:5], vcc
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; SI-NEXT: buffer_store_byte v0, off, s[12:15], 0
; SI-NEXT: buffer_store_byte v0, off, s[8:11], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: s_saddo_i64:
@ -324,22 +274,18 @@ define amdgpu_kernel void @s_saddo_i64(i64 addrspace(1)* %out, i1 addrspace(1)*
; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: v_mov_b32_e32 v4, s4
; VI-NEXT: s_add_u32 s0, s4, s6
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_cmp_gt_i64_e64 s[0:1], s[6:7], -1
; VI-NEXT: s_addc_u32 s1, s5, s7
; VI-NEXT: v_mov_b32_e32 v5, s5
; VI-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[4:5]
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[0:1]
; VI-NEXT: v_cmp_gt_i64_e64 s[0:1], s[4:5], -1
; VI-NEXT: s_add_u32 s2, s4, s6
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: s_addc_u32 s3, s5, s7
; VI-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[0:1]
; VI-NEXT: v_cmp_gt_i64_e64 s[0:1], s[2:3], -1
; VI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
; VI-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[0:1]
; VI-NEXT: v_cmp_ne_u32_e64 s[0:1], v5, v4
; VI-NEXT: v_mov_b32_e32 v5, s3
; VI-NEXT: v_mov_b32_e32 v4, s2
; VI-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
; VI-NEXT: v_cmp_lt_i64_e64 s[2:3], s[6:7], 0
; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: v_mov_b32_e32 v4, s0
; VI-NEXT: s_xor_b64 s[0:1], s[2:3], vcc
; VI-NEXT: flat_store_dwordx2 v[2:3], v[4:5]
; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
; VI-NEXT: flat_store_byte v[0:1], v2
@ -350,22 +296,18 @@ define amdgpu_kernel void @s_saddo_i64(i64 addrspace(1)* %out, i1 addrspace(1)*
; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v2, s0
; GFX9-NEXT: v_mov_b32_e32 v4, s4
; GFX9-NEXT: s_add_u32 s0, s4, s6
; GFX9-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NEXT: v_cmp_gt_i64_e64 s[0:1], s[6:7], -1
; GFX9-NEXT: s_addc_u32 s1, s5, s7
; GFX9-NEXT: v_mov_b32_e32 v5, s5
; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[4:5]
; GFX9-NEXT: v_mov_b32_e32 v0, s2
; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[0:1]
; GFX9-NEXT: v_cmp_gt_i64_e64 s[0:1], s[4:5], -1
; GFX9-NEXT: s_add_u32 s2, s4, s6
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: s_addc_u32 s3, s5, s7
; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[0:1]
; GFX9-NEXT: v_cmp_gt_i64_e64 s[0:1], s[2:3], -1
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[0:1]
; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], v5, v4
; GFX9-NEXT: v_mov_b32_e32 v5, s3
; GFX9-NEXT: v_mov_b32_e32 v4, s2
; GFX9-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT: v_cmp_lt_i64_e64 s[2:3], s[6:7], 0
; GFX9-NEXT: v_mov_b32_e32 v5, s1
; GFX9-NEXT: v_mov_b32_e32 v4, s0
; GFX9-NEXT: s_xor_b64 s[0:1], s[2:3], vcc
; GFX9-NEXT: global_store_dwordx2 v[2:3], v[4:5], off
; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
; GFX9-NEXT: global_store_byte v[0:1], v2, off
@ -398,19 +340,12 @@ define amdgpu_kernel void @v_saddo_i64(i64 addrspace(1)* %out, i1 addrspace(1)*
; SI-NEXT: s_mov_b32 s6, s14
; SI-NEXT: s_mov_b32 s7, s15
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cmp_lt_i64_e32 vcc, -1, v[2:3]
; SI-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
; SI-NEXT: v_cmp_lt_i64_e32 vcc, -1, v[0:1]
; SI-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
; SI-NEXT: v_add_i32_e32 v0, vcc, v0, v2
; SI-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc
; SI-NEXT: v_cmp_lt_i64_e64 s[0:1], -1, v[0:1]
; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
; SI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
; SI-NEXT: v_cmp_ne_u32_e64 s[0:1], v5, v2
; SI-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_add_i32_e32 v4, vcc, v0, v2
; SI-NEXT: v_addc_u32_e32 v5, vcc, v1, v3, vcc
; SI-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[2:3]
; SI-NEXT: v_cmp_lt_i64_e64 s[0:1], v[4:5], v[0:1]
; SI-NEXT: buffer_store_dwordx2 v[4:5], off, s[4:7], 0
; SI-NEXT: s_xor_b64 s[0:1], vcc, s[0:1]
; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; SI-NEXT: buffer_store_byte v0, off, s[12:15], 0
; SI-NEXT: s_endpgm
@ -430,18 +365,12 @@ define amdgpu_kernel void @v_saddo_i64(i64 addrspace(1)* %out, i1 addrspace(1)*
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: v_cmp_lt_i64_e32 vcc, -1, v[4:5]
; VI-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
; VI-NEXT: v_cmp_lt_i64_e32 vcc, -1, v[6:7]
; VI-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
; VI-NEXT: v_add_u32_e32 v4, vcc, v6, v4
; VI-NEXT: v_addc_u32_e32 v5, vcc, v7, v5, vcc
; VI-NEXT: v_cmp_lt_i64_e64 s[0:1], -1, v[4:5]
; VI-NEXT: v_cmp_eq_u32_e32 vcc, v9, v8
; VI-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[0:1]
; VI-NEXT: v_cmp_ne_u32_e64 s[0:1], v9, v6
; VI-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
; VI-NEXT: flat_store_dwordx2 v[2:3], v[4:5]
; VI-NEXT: v_add_u32_e32 v8, vcc, v6, v4
; VI-NEXT: v_addc_u32_e32 v9, vcc, v7, v5, vcc
; VI-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[4:5]
; VI-NEXT: v_cmp_lt_i64_e64 s[0:1], v[8:9], v[6:7]
; VI-NEXT: flat_store_dwordx2 v[2:3], v[8:9]
; VI-NEXT: s_xor_b64 s[0:1], vcc, s[0:1]
; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
; VI-NEXT: flat_store_byte v[0:1], v2
; VI-NEXT: s_endpgm
@ -461,18 +390,12 @@ define amdgpu_kernel void @v_saddo_i64(i64 addrspace(1)* %out, i1 addrspace(1)*
; GFX9-NEXT: v_mov_b32_e32 v0, s2
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, -1, v[4:5]
; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, -1, v[6:7]
; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v6, v4
; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v7, v5, vcc
; GFX9-NEXT: v_cmp_lt_i64_e64 s[0:1], -1, v[4:5]
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v9, v8
; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[0:1]
; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], v9, v6
; GFX9-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT: global_store_dwordx2 v[2:3], v[4:5], off
; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v6, v4
; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, v7, v5, vcc
; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[4:5]
; GFX9-NEXT: v_cmp_lt_i64_e64 s[0:1], v[8:9], v[6:7]
; GFX9-NEXT: global_store_dwordx2 v[2:3], v[8:9], off
; GFX9-NEXT: s_xor_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
; GFX9-NEXT: global_store_byte v[0:1], v2, off
; GFX9-NEXT: s_endpgm
@ -489,48 +412,35 @@ define amdgpu_kernel void @v_saddo_i64(i64 addrspace(1)* %out, i1 addrspace(1)*
define amdgpu_kernel void @v_saddo_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %carryout, <2 x i32> addrspace(1)* %aptr, <2 x i32> addrspace(1)* %bptr) nounwind {
; SI-LABEL: v_saddo_v2i32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s15, 0xf000
; SI-NEXT: s_mov_b32 s14, -1
; SI-NEXT: s_mov_b32 s2, s14
; SI-NEXT: s_mov_b32 s3, s15
; SI-NEXT: s_load_dwordx8 s[8:15], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s19, 0xf000
; SI-NEXT: s_mov_b32 s18, -1
; SI-NEXT: s_mov_b32 s2, s18
; SI-NEXT: s_mov_b32 s3, s19
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s0, s10
; SI-NEXT: s_mov_b32 s1, s11
; SI-NEXT: s_mov_b32 s10, s14
; SI-NEXT: s_mov_b32 s11, s15
; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
; SI-NEXT: s_mov_b32 s0, s14
; SI-NEXT: s_mov_b32 s1, s15
; SI-NEXT: s_mov_b32 s14, s18
; SI-NEXT: s_mov_b32 s15, s19
; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[12:15], 0
; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[0:3], 0
; SI-NEXT: s_mov_b32 s12, s6
; SI-NEXT: s_mov_b32 s13, s7
; SI-NEXT: s_mov_b32 s6, s14
; SI-NEXT: s_mov_b32 s7, s15
; SI-NEXT: s_mov_b32 s16, s10
; SI-NEXT: s_mov_b32 s17, s11
; SI-NEXT: s_mov_b32 s10, s18
; SI-NEXT: s_mov_b32 s11, s19
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v2
; SI-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v0
; SI-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
; SI-NEXT: v_add_i32_e32 v0, vcc, v0, v2
; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v3
; SI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v1
; SI-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
; SI-NEXT: v_add_i32_e32 v1, vcc, v1, v3
; SI-NEXT: v_cmp_lt_i32_e64 s[0:1], -1, v0
; SI-NEXT: v_cmp_lt_i32_e64 s[2:3], -1, v1
; SI-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[0:1]
; SI-NEXT: v_cmp_eq_u32_e64 s[0:1], v6, v2
; SI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[2:3]
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT: v_cmp_ne_u32_e64 s[4:5], v6, v2
; SI-NEXT: s_and_b64 s[0:1], s[0:1], s[4:5]
; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
; SI-NEXT: v_cmp_ne_u32_e64 s[2:3], v5, v3
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_add_i32_e32 v5, vcc, v1, v3
; SI-NEXT: v_add_i32_e32 v4, vcc, v0, v2
; SI-NEXT: v_cmp_gt_i32_e64 s[0:1], 0, v3
; SI-NEXT: v_cmp_lt_i32_e64 s[4:5], v5, v1
; SI-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5]
; SI-NEXT: v_cmp_gt_i32_e32 vcc, 0, v2
; SI-NEXT: v_cmp_lt_i32_e64 s[2:3], v4, v0
; SI-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1]
; SI-NEXT: s_and_b64 s[0:1], vcc, s[2:3]
; SI-NEXT: s_xor_b64 s[0:1], vcc, s[2:3]
; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[12:15], 0
; SI-NEXT: buffer_store_dwordx2 v[4:5], off, s[8:11], 0
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[16:19], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_saddo_v2i32:
@ -543,33 +453,21 @@ define amdgpu_kernel void @v_saddo_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32>
; VI-NEXT: v_mov_b32_e32 v7, s5
; VI-NEXT: flat_load_dwordx2 v[6:7], v[6:7]
; VI-NEXT: flat_load_dwordx2 v[4:5], v[4:5]
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v4
; VI-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
; VI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v6
; VI-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
; VI-NEXT: v_add_u32_e32 v4, vcc, v6, v4
; VI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v5
; VI-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
; VI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v7
; VI-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
; VI-NEXT: v_add_u32_e32 v5, vcc, v7, v5
; VI-NEXT: v_cmp_lt_i32_e64 s[0:1], -1, v4
; VI-NEXT: v_cmp_lt_i32_e64 s[2:3], -1, v5
; VI-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[0:1]
; VI-NEXT: v_cmp_eq_u32_e64 s[0:1], v10, v6
; VI-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[2:3]
; VI-NEXT: v_cmp_ne_u32_e64 s[4:5], v10, v6
; VI-NEXT: v_cmp_eq_u32_e32 vcc, v9, v8
; VI-NEXT: v_cmp_ne_u32_e64 s[2:3], v9, v7
; VI-NEXT: s_and_b64 s[0:1], s[0:1], s[4:5]
; VI-NEXT: flat_store_dwordx2 v[2:3], v[4:5]
; VI-NEXT: v_add_u32_e32 v9, vcc, v7, v5
; VI-NEXT: v_add_u32_e32 v8, vcc, v6, v4
; VI-NEXT: v_cmp_gt_i32_e64 s[0:1], 0, v5
; VI-NEXT: v_cmp_lt_i32_e64 s[4:5], v9, v7
; VI-NEXT: v_cmp_gt_i32_e32 vcc, 0, v4
; VI-NEXT: v_cmp_lt_i32_e64 s[2:3], v8, v6
; VI-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5]
; VI-NEXT: flat_store_dwordx2 v[2:3], v[8:9]
; VI-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[0:1]
; VI-NEXT: s_and_b64 s[0:1], vcc, s[2:3]
; VI-NEXT: s_xor_b64 s[0:1], vcc, s[2:3]
; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT: s_endpgm
@ -584,33 +482,21 @@ define amdgpu_kernel void @v_saddo_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32>
; GFX9-NEXT: v_mov_b32_e32 v7, s5
; GFX9-NEXT: global_load_dwordx2 v[6:7], v[6:7], off
; GFX9-NEXT: global_load_dwordx2 v[4:5], v[4:5], off
; GFX9-NEXT: v_mov_b32_e32 v0, s2
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: v_mov_b32_e32 v2, s0
; GFX9-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NEXT: v_mov_b32_e32 v0, s2
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cmp_lt_i32_e32 vcc, -1, v4
; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
; GFX9-NEXT: v_cmp_lt_i32_e32 vcc, -1, v6
; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
; GFX9-NEXT: v_cmp_lt_i32_e32 vcc, -1, v5
; GFX9-NEXT: v_add_u32_e32 v4, v6, v4
; GFX9-NEXT: v_add_u32_e32 v5, v7, v5
; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
; GFX9-NEXT: v_cmp_lt_i32_e32 vcc, -1, v7
; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
; GFX9-NEXT: v_cmp_lt_i32_e64 s[0:1], -1, v4
; GFX9-NEXT: v_cmp_lt_i32_e64 s[2:3], -1, v5
; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[0:1]
; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], v10, v6
; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[2:3]
; GFX9-NEXT: v_cmp_ne_u32_e64 s[4:5], v10, v6
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v9, v8
; GFX9-NEXT: v_cmp_ne_u32_e64 s[2:3], v9, v7
; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], s[4:5]
; GFX9-NEXT: global_store_dwordx2 v[2:3], v[4:5], off
; GFX9-NEXT: v_add_u32_e32 v9, v7, v5
; GFX9-NEXT: v_add_u32_e32 v8, v6, v4
; GFX9-NEXT: v_cmp_gt_i32_e64 s[0:1], 0, v5
; GFX9-NEXT: v_cmp_lt_i32_e64 s[4:5], v9, v7
; GFX9-NEXT: v_cmp_gt_i32_e32 vcc, 0, v4
; GFX9-NEXT: v_cmp_lt_i32_e64 s[2:3], v8, v6
; GFX9-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5]
; GFX9-NEXT: global_store_dwordx2 v[2:3], v[8:9], off
; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[0:1]
; GFX9-NEXT: s_and_b64 s[0:1], vcc, s[2:3]
; GFX9-NEXT: s_xor_b64 s[0:1], vcc, s[2:3]
; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
; GFX9-NEXT: global_store_dwordx2 v[0:1], v[2:3], off
; GFX9-NEXT: s_endpgm

View File

@ -95,76 +95,48 @@ define <2 x i1> @usubo(<2 x i64> *%ptr, <2 x i64> *%ptr2) {
define <2 x i1> @saddo(<2 x i64> *%ptr, <2 x i64> *%ptr2) {
; CHECK-LABEL: saddo:
; CHECK: @ %bb.0:
; CHECK-NEXT: push {r4, r5, r6, r7, lr}
; CHECK-NEXT: vld1.64 {d16, d17}, [r1]
; CHECK-NEXT: movs r5, #0
; CHECK-NEXT: movs r6, #0
; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, lr}
; CHECK-NEXT: vld1.64 {d20, d21}, [r0]
; CHECK-NEXT: movs r3, #0
; CHECK-NEXT: vmov.32 r1, d16[1]
; CHECK-NEXT: vld1.64 {d18, d19}, [r0]
; CHECK-NEXT: vmov.32 r2, d17[1]
; CHECK-NEXT: vadd.i64 q8, q9, q8
; CHECK-NEXT: vmov.32 r12, d18[1]
; CHECK-NEXT: vmov.32 r4, d19[1]
; CHECK-NEXT: vmov.32 lr, d16[1]
; CHECK-NEXT: vmov.32 r7, d17[1]
; CHECK-NEXT: cmp.w r1, #-1
; CHECK-NEXT: vld1.64 {d18, d19}, [r1]
; CHECK-NEXT: vadd.i64 q8, q10, q9
; CHECK-NEXT: vmov.32 r2, d20[0]
; CHECK-NEXT: vmov.32 r1, d20[1]
; CHECK-NEXT: vmov.32 r12, d16[0]
; CHECK-NEXT: vmov.32 r8, d16[1]
; CHECK-NEXT: vmov.32 lr, d17[0]
; CHECK-NEXT: vmov.32 r4, d21[0]
; CHECK-NEXT: vmov.32 r5, d17[1]
; CHECK-NEXT: vmov.32 r6, d18[1]
; CHECK-NEXT: vmov.32 r7, d21[1]
; CHECK-NEXT: subs.w r2, r12, r2
; CHECK-NEXT: vmov.32 r2, d19[1]
; CHECK-NEXT: sbcs.w r1, r8, r1
; CHECK-NEXT: mov.w r1, #0
; CHECK-NEXT: it gt
; CHECK-NEXT: movgt r1, #1
; CHECK-NEXT: cmp r1, #0
; CHECK-NEXT: it ne
; CHECK-NEXT: movne.w r1, #-1
; CHECK-NEXT: cmp.w r2, #-1
; CHECK-NEXT: mov.w r2, #0
; CHECK-NEXT: it gt
; CHECK-NEXT: movgt r2, #1
; CHECK-NEXT: cmp.w r12, #-1
; CHECK-NEXT: it gt
; CHECK-NEXT: movgt r5, #1
; CHECK-NEXT: cmp r5, #0
; CHECK-NEXT: it ne
; CHECK-NEXT: movne.w r5, #-1
; CHECK-NEXT: cmp.w r4, #-1
; CHECK-NEXT: mov.w r4, #0
; CHECK-NEXT: it gt
; CHECK-NEXT: movgt r4, #1
; CHECK-NEXT: cmp.w lr, #-1
; CHECK-NEXT: it gt
; CHECK-NEXT: movgt r6, #1
; CHECK-NEXT: cmp r6, #0
; CHECK-NEXT: it ne
; CHECK-NEXT: movne.w r6, #-1
; CHECK-NEXT: cmp.w r7, #-1
; CHECK-NEXT: it gt
; CHECK-NEXT: movgt r3, #1
; CHECK-NEXT: it lt
; CHECK-NEXT: movlt r1, #1
; CHECK-NEXT: subs.w r4, lr, r4
; CHECK-NEXT: sbcs.w r7, r5, r7
; CHECK-NEXT: it lt
; CHECK-NEXT: movlt r3, #1
; CHECK-NEXT: cmp r3, #0
; CHECK-NEXT: it ne
; CHECK-NEXT: movne.w r3, #-1
; CHECK-NEXT: cmp r4, #0
; CHECK-NEXT: vdup.32 d19, r3
; CHECK-NEXT: asrs r7, r6, #31
; CHECK-NEXT: vdup.32 d21, r3
; CHECK-NEXT: cmp r1, #0
; CHECK-NEXT: it ne
; CHECK-NEXT: movne.w r4, #-1
; CHECK-NEXT: cmp r2, #0
; CHECK-NEXT: it ne
; CHECK-NEXT: movne.w r2, #-1
; CHECK-NEXT: vdup.32 d23, r2
; CHECK-NEXT: vdup.32 d21, r4
; CHECK-NEXT: vdup.32 d18, r6
; CHECK-NEXT: vdup.32 d22, r1
; CHECK-NEXT: vdup.32 d20, r5
; CHECK-NEXT: vceq.i32 q9, q10, q9
; CHECK-NEXT: movne.w r1, #-1
; CHECK-NEXT: vdup.32 d20, r1
; CHECK-NEXT: vst1.64 {d16, d17}, [r0]
; CHECK-NEXT: vceq.i32 q10, q10, q11
; CHECK-NEXT: vrev64.32 q11, q9
; CHECK-NEXT: vrev64.32 q12, q10
; CHECK-NEXT: vand q9, q9, q11
; CHECK-NEXT: vand q10, q10, q12
; CHECK-NEXT: vbic q9, q10, q9
; CHECK-NEXT: asrs r2, r2, #31
; CHECK-NEXT: vdup.32 d19, r2
; CHECK-NEXT: vdup.32 d18, r7
; CHECK-NEXT: veor q9, q9, q10
; CHECK-NEXT: vmovn.i64 d18, q9
; CHECK-NEXT: vmov r2, r1, d18
; CHECK-NEXT: mov r0, r2
; CHECK-NEXT: pop {r4, r5, r6, r7, pc}
; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, pc}
%x = load <2 x i64>, <2 x i64>* %ptr, align 8
%y = load <2 x i64>, <2 x i64>* %ptr2, align 8
%s = call {<2 x i64>, <2 x i1>} @llvm.sadd.with.overflow.v2i64(<2 x i64> %x, <2 x i64> %y)
@ -177,77 +149,64 @@ define <2 x i1> @saddo(<2 x i64> *%ptr, <2 x i64> *%ptr2) {
define <2 x i1> @ssubo(<2 x i64> *%ptr, <2 x i64> *%ptr2) {
; CHECK-LABEL: ssubo:
; CHECK: @ %bb.0:
; CHECK-NEXT: push {r4, r5, r6, r7, lr}
; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, lr}
; CHECK-NEXT: vld1.64 {d18, d19}, [r1]
; CHECK-NEXT: movs r5, #0
; CHECK-NEXT: movs r6, #0
; CHECK-NEXT: movs r3, #0
; CHECK-NEXT: movs r2, #0
; CHECK-NEXT: vld1.64 {d20, d21}, [r0]
; CHECK-NEXT: vsub.i64 q8, q10, q9
; CHECK-NEXT: vmov.32 r1, d20[0]
; CHECK-NEXT: vmov.32 r12, d20[1]
; CHECK-NEXT: vmov.32 lr, d21[1]
; CHECK-NEXT: vmov.32 r1, d16[1]
; CHECK-NEXT: vmov.32 r2, d17[1]
; CHECK-NEXT: vmov.32 r4, d18[1]
; CHECK-NEXT: vmov.32 r7, d19[1]
; CHECK-NEXT: cmp.w r1, #-1
; CHECK-NEXT: vmov.32 r3, d16[0]
; CHECK-NEXT: vmov.32 lr, d16[1]
; CHECK-NEXT: vmov.32 r4, d21[0]
; CHECK-NEXT: vmov.32 r5, d17[0]
; CHECK-NEXT: vmov.32 r6, d21[1]
; CHECK-NEXT: vmov.32 r7, d17[1]
; CHECK-NEXT: vmov.32 r8, d18[1]
; CHECK-NEXT: subs r1, r3, r1
; CHECK-NEXT: vmov.32 r3, d18[0]
; CHECK-NEXT: sbcs.w r1, lr, r12
; CHECK-NEXT: vmov.32 r12, d19[0]
; CHECK-NEXT: mov.w r1, #0
; CHECK-NEXT: it gt
; CHECK-NEXT: movgt r1, #1
; CHECK-NEXT: it lt
; CHECK-NEXT: movlt r1, #1
; CHECK-NEXT: subs r5, r5, r4
; CHECK-NEXT: vmov.32 r5, d19[1]
; CHECK-NEXT: sbcs r7, r6
; CHECK-NEXT: mov.w r7, #0
; CHECK-NEXT: it lt
; CHECK-NEXT: movlt r7, #1
; CHECK-NEXT: cmp r7, #0
; CHECK-NEXT: it ne
; CHECK-NEXT: movne.w r7, #-1
; CHECK-NEXT: vdup.32 d21, r7
; CHECK-NEXT: rsbs r3, r3, #0
; CHECK-NEXT: sbcs.w r3, r2, r8
; CHECK-NEXT: mov.w r3, #0
; CHECK-NEXT: it lt
; CHECK-NEXT: movlt r3, #1
; CHECK-NEXT: rsbs.w r6, r12, #0
; CHECK-NEXT: sbcs.w r6, r2, r5
; CHECK-NEXT: it lt
; CHECK-NEXT: movlt r2, #1
; CHECK-NEXT: cmp r2, #0
; CHECK-NEXT: it ne
; CHECK-NEXT: movne.w r2, #-1
; CHECK-NEXT: cmp r3, #0
; CHECK-NEXT: vdup.32 d19, r2
; CHECK-NEXT: it ne
; CHECK-NEXT: movne.w r3, #-1
; CHECK-NEXT: cmp r1, #0
; CHECK-NEXT: it ne
; CHECK-NEXT: movne.w r1, #-1
; CHECK-NEXT: cmp.w r2, #-1
; CHECK-NEXT: mov.w r2, #0
; CHECK-NEXT: it gt
; CHECK-NEXT: movgt r2, #1
; CHECK-NEXT: cmp.w r12, #-1
; CHECK-NEXT: it gt
; CHECK-NEXT: movgt r5, #1
; CHECK-NEXT: cmp r5, #0
; CHECK-NEXT: it ne
; CHECK-NEXT: movne.w r5, #-1
; CHECK-NEXT: cmp.w lr, #-1
; CHECK-NEXT: it gt
; CHECK-NEXT: movgt r6, #1
; CHECK-NEXT: cmp.w r4, #-1
; CHECK-NEXT: mov.w r4, #0
; CHECK-NEXT: it gt
; CHECK-NEXT: movgt r4, #1
; CHECK-NEXT: cmp r4, #0
; CHECK-NEXT: it ne
; CHECK-NEXT: movne.w r4, #-1
; CHECK-NEXT: cmp.w r7, #-1
; CHECK-NEXT: it gt
; CHECK-NEXT: movgt r3, #1
; CHECK-NEXT: cmp r3, #0
; CHECK-NEXT: it ne
; CHECK-NEXT: movne.w r3, #-1
; CHECK-NEXT: vdup.32 d19, r3
; CHECK-NEXT: cmp r6, #0
; CHECK-NEXT: it ne
; CHECK-NEXT: movne.w r6, #-1
; CHECK-NEXT: vdup.32 d21, r6
; CHECK-NEXT: cmp r2, #0
; CHECK-NEXT: vdup.32 d18, r4
; CHECK-NEXT: it ne
; CHECK-NEXT: movne.w r2, #-1
; CHECK-NEXT: vdup.32 d23, r2
; CHECK-NEXT: vdup.32 d20, r5
; CHECK-NEXT: vdup.32 d22, r1
; CHECK-NEXT: vceq.i32 q9, q10, q9
; CHECK-NEXT: vdup.32 d18, r3
; CHECK-NEXT: vdup.32 d20, r1
; CHECK-NEXT: veor q9, q9, q10
; CHECK-NEXT: vst1.64 {d16, d17}, [r0]
; CHECK-NEXT: vceq.i32 q10, q10, q11
; CHECK-NEXT: vrev64.32 q11, q9
; CHECK-NEXT: vrev64.32 q12, q10
; CHECK-NEXT: vand q9, q9, q11
; CHECK-NEXT: vand q10, q10, q12
; CHECK-NEXT: vmvn q9, q9
; CHECK-NEXT: vbic q9, q9, q10
; CHECK-NEXT: vmovn.i64 d18, q9
; CHECK-NEXT: vmov r2, r1, d18
; CHECK-NEXT: mov r0, r2
; CHECK-NEXT: pop {r4, r5, r6, r7, pc}
; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, pc}
%x = load <2 x i64>, <2 x i64>* %ptr, align 8
%y = load <2 x i64>, <2 x i64>* %ptr2, align 8
%s = call {<2 x i64>, <2 x i1>} @llvm.ssub.with.overflow.v2i64(<2 x i64> %x, <2 x i64> %y)

View File

@ -10,17 +10,11 @@ declare {i32, i1} @llvm.usub.with.overflow.i32(i32 %a, i32 %b)
define i1 @sadd(i32 %a, i32 %b, i32* %c) nounwind {
; RV32I-LABEL: sadd:
; RV32I: # %bb.0: # %entry
; RV32I-NEXT: addi a3, zero, -1
; RV32I-NEXT: slt a4, a3, a1
; RV32I-NEXT: slt a5, a3, a0
; RV32I-NEXT: xor a4, a5, a4
; RV32I-NEXT: seqz a4, a4
; RV32I-NEXT: add a1, a0, a1
; RV32I-NEXT: slt a0, a3, a1
; RV32I-NEXT: xor a0, a5, a0
; RV32I-NEXT: snez a0, a0
; RV32I-NEXT: and a0, a4, a0
; RV32I-NEXT: sw a1, 0(a2)
; RV32I-NEXT: add a3, a0, a1
; RV32I-NEXT: slt a0, a3, a0
; RV32I-NEXT: slti a1, a1, 0
; RV32I-NEXT: xor a0, a1, a0
; RV32I-NEXT: sw a3, 0(a2)
; RV32I-NEXT: ret
entry:
%x = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %a, i32 %b)
@ -33,16 +27,10 @@ entry:
define i1 @ssub(i32 %a, i32 %b, i32* %c) nounwind {
; RV32I-LABEL: ssub:
; RV32I: # %bb.0: # %entry
; RV32I-NEXT: addi a3, zero, -1
; RV32I-NEXT: slt a4, a3, a1
; RV32I-NEXT: slt a5, a3, a0
; RV32I-NEXT: xor a4, a5, a4
; RV32I-NEXT: snez a4, a4
; RV32I-NEXT: sgtz a3, a1
; RV32I-NEXT: sub a1, a0, a1
; RV32I-NEXT: slt a0, a3, a1
; RV32I-NEXT: xor a0, a5, a0
; RV32I-NEXT: snez a0, a0
; RV32I-NEXT: and a0, a4, a0
; RV32I-NEXT: slt a0, a1, a0
; RV32I-NEXT: xor a0, a3, a0
; RV32I-NEXT: sw a1, 0(a2)
; RV32I-NEXT: ret
entry:

View File

@ -34,30 +34,21 @@ define <4 x i32> @combine_vec_smul_two(<4 x i32> %a0, <4 x i32> %a1) {
; SSE-LABEL: combine_vec_smul_two:
; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm0, %xmm2
; SSE-NEXT: pxor %xmm0, %xmm0
; SSE-NEXT: pxor %xmm3, %xmm3
; SSE-NEXT: paddd %xmm0, %xmm2
; SSE-NEXT: movdqa %xmm0, %xmm3
; SSE-NEXT: pcmpgtd %xmm2, %xmm3
; SSE-NEXT: pcmpeqd %xmm4, %xmm4
; SSE-NEXT: pxor %xmm4, %xmm3
; SSE-NEXT: paddd %xmm2, %xmm2
; SSE-NEXT: pcmpgtd %xmm2, %xmm0
; SSE-NEXT: pxor %xmm4, %xmm0
; SSE-NEXT: pcmpeqd %xmm3, %xmm0
; SSE-NEXT: blendvps %xmm0, %xmm2, %xmm1
; SSE-NEXT: movaps %xmm1, %xmm0
; SSE-NEXT: pxor %xmm0, %xmm3
; SSE-NEXT: movdqa %xmm3, %xmm0
; SSE-NEXT: blendvps %xmm0, %xmm1, %xmm2
; SSE-NEXT: movaps %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_smul_two:
; AVX: # %bb.0:
; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX-NEXT: vpcmpgtd %xmm0, %xmm2, %xmm3
; AVX-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
; AVX-NEXT: vpxor %xmm4, %xmm3, %xmm3
; AVX-NEXT: vpaddd %xmm0, %xmm0, %xmm0
; AVX-NEXT: vpcmpgtd %xmm0, %xmm2, %xmm2
; AVX-NEXT: vpxor %xmm4, %xmm2, %xmm2
; AVX-NEXT: vpcmpeqd %xmm2, %xmm3, %xmm2
; AVX-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
; AVX-NEXT: vpaddd %xmm0, %xmm0, %xmm2
; AVX-NEXT: vpcmpgtd %xmm2, %xmm0, %xmm3
; AVX-NEXT: vpxor %xmm3, %xmm0, %xmm0
; AVX-NEXT: vblendvps %xmm0, %xmm1, %xmm2, %xmm0
; AVX-NEXT: retq
%1 = call {<4 x i32>, <4 x i1>} @llvm.smul.with.overflow.v4i32(<4 x i32> %a0, <4 x i32> <i32 2, i32 2, i32 2, i32 2>)
%2 = extractvalue {<4 x i32>, <4 x i1>} %1, 0

View File

@ -98,15 +98,10 @@ define <4 x i32> @smul_v4i32_1(<4 x i32> %a, <4 x i32> %b) nounwind {
define <4 x i32> @smul_v4i32_2(<4 x i32> %a, <4 x i32> %b) nounwind {
; AVX-LABEL: smul_v4i32_2:
; AVX: # %bb.0:
; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX-NEXT: vpcmpgtd %xmm0, %xmm2, %xmm3
; AVX-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
; AVX-NEXT: vpxor %xmm4, %xmm3, %xmm3
; AVX-NEXT: vpaddd %xmm0, %xmm0, %xmm0
; AVX-NEXT: vpcmpgtd %xmm0, %xmm2, %xmm2
; AVX-NEXT: vpxor %xmm4, %xmm2, %xmm2
; AVX-NEXT: vpcmpeqd %xmm2, %xmm3, %xmm2
; AVX-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
; AVX-NEXT: vpaddd %xmm0, %xmm0, %xmm2
; AVX-NEXT: vpcmpgtd %xmm2, %xmm0, %xmm3
; AVX-NEXT: vpxor %xmm3, %xmm0, %xmm0
; AVX-NEXT: vblendvps %xmm0, %xmm1, %xmm2, %xmm0
; AVX-NEXT: retq
%x = call { <4 x i32>, <4 x i1> } @llvm.smul.with.overflow.v4i32(<4 x i32> %a, <4 x i32> <i32 2, i32 2, i32 2, i32 2>)
%y = extractvalue { <4 x i32>, <4 x i1> } %x, 0

View File

@ -183,28 +183,20 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind {
;
; X64-LABEL: vec:
; X64: # %bb.0:
; X64-NEXT: pxor %xmm2, %xmm2
; X64-NEXT: pxor %xmm3, %xmm3
; X64-NEXT: pxor %xmm4, %xmm4
; X64-NEXT: pcmpgtd %xmm1, %xmm4
; X64-NEXT: pcmpeqd %xmm2, %xmm2
; X64-NEXT: pxor %xmm2, %xmm4
; X64-NEXT: pxor %xmm5, %xmm5
; X64-NEXT: pcmpgtd %xmm0, %xmm5
; X64-NEXT: pxor %xmm2, %xmm5
; X64-NEXT: pcmpeqd %xmm5, %xmm4
; X64-NEXT: paddd %xmm1, %xmm0
; X64-NEXT: pcmpgtd %xmm0, %xmm3
; X64-NEXT: pxor %xmm3, %xmm2
; X64-NEXT: pcmpeqd %xmm5, %xmm2
; X64-NEXT: pandn %xmm4, %xmm2
; X64-NEXT: movdqa %xmm3, %xmm1
; X64-NEXT: pandn {{.*}}(%rip), %xmm1
; X64-NEXT: psrld $1, %xmm3
; X64-NEXT: por %xmm1, %xmm3
; X64-NEXT: pand %xmm2, %xmm3
; X64-NEXT: pandn %xmm0, %xmm2
; X64-NEXT: pcmpgtd %xmm1, %xmm3
; X64-NEXT: paddd %xmm0, %xmm1
; X64-NEXT: pcmpgtd %xmm1, %xmm0
; X64-NEXT: pxor %xmm3, %xmm0
; X64-NEXT: pcmpgtd %xmm1, %xmm2
; X64-NEXT: movdqa %xmm2, %xmm3
; X64-NEXT: pandn {{.*}}(%rip), %xmm3
; X64-NEXT: psrld $1, %xmm2
; X64-NEXT: por %xmm3, %xmm2
; X64-NEXT: movdqa %xmm2, %xmm0
; X64-NEXT: pand %xmm0, %xmm2
; X64-NEXT: pandn %xmm1, %xmm0
; X64-NEXT: por %xmm2, %xmm0
; X64-NEXT: retq
%tmp = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> %x, <4 x i32> %y);
ret <4 x i32> %tmp;

File diff suppressed because it is too large Load Diff

View File

@ -183,30 +183,20 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind {
;
; X64-LABEL: vec:
; X64: # %bb.0:
; X64-NEXT: movdqa %xmm0, %xmm2
; X64-NEXT: pxor %xmm3, %xmm3
; X64-NEXT: pxor %xmm0, %xmm0
; X64-NEXT: pcmpgtd %xmm1, %xmm0
; X64-NEXT: pcmpeqd %xmm4, %xmm4
; X64-NEXT: pxor %xmm4, %xmm0
; X64-NEXT: pxor %xmm5, %xmm5
; X64-NEXT: pcmpgtd %xmm2, %xmm5
; X64-NEXT: pxor %xmm4, %xmm5
; X64-NEXT: pcmpeqd %xmm5, %xmm0
; X64-NEXT: psubd %xmm1, %xmm2
; X64-NEXT: pcmpgtd %xmm2, %xmm3
; X64-NEXT: movdqa %xmm3, %xmm1
; X64-NEXT: pxor %xmm4, %xmm1
; X64-NEXT: pcmpeqd %xmm5, %xmm1
; X64-NEXT: pxor %xmm4, %xmm1
; X64-NEXT: pandn %xmm1, %xmm0
; X64-NEXT: movdqa %xmm3, %xmm1
; X64-NEXT: pxor %xmm2, %xmm2
; X64-NEXT: movdqa %xmm0, %xmm3
; X64-NEXT: psubd %xmm1, %xmm3
; X64-NEXT: pcmpgtd %xmm2, %xmm1
; X64-NEXT: pcmpgtd %xmm3, %xmm0
; X64-NEXT: pxor %xmm1, %xmm0
; X64-NEXT: pcmpgtd %xmm3, %xmm2
; X64-NEXT: movdqa %xmm2, %xmm1
; X64-NEXT: pandn {{.*}}(%rip), %xmm1
; X64-NEXT: psrld $1, %xmm3
; X64-NEXT: por %xmm1, %xmm3
; X64-NEXT: pand %xmm0, %xmm3
; X64-NEXT: pandn %xmm2, %xmm0
; X64-NEXT: por %xmm3, %xmm0
; X64-NEXT: psrld $1, %xmm2
; X64-NEXT: por %xmm1, %xmm2
; X64-NEXT: pand %xmm0, %xmm2
; X64-NEXT: pandn %xmm3, %xmm0
; X64-NEXT: por %xmm2, %xmm0
; X64-NEXT: retq
%tmp = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> %x, <4 x i32> %y)
ret <4 x i32> %tmp

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff