[FastISel][X86] Extend support for {s|u}{add|sub|mul}.with.overflow intrinsics.

llvm-svn: 210610
2014-06-10 23:52:44 +00:00 · 2014-06-10 23:52:44 +00:00 · 2dace6e54b
parent d7e1fe40e1
commit 2dace6e54b
3 changed files with 430 additions and 34 deletions
--- a/llvm/include/llvm/CodeGen/FastISel.h
+++ b/llvm/include/llvm/CodeGen/FastISel.h
@ -373,6 +373,9 @@ protected:
  /// - \c Add has a constant operand.
  bool canFoldAddIntoGEP(const User *GEP, const Value *Add);

+  /// Test whether the given value has exactly one use.
+  bool hasTrivialKill(const Value *V) const;
+
 private:
  bool SelectBinaryOp(const User *I, unsigned ISDOpcode);

@ -408,9 +411,6 @@ private:
  /// beginning of the block. It helps to avoid spilling cached variables across
  /// heavy instructions like calls.
  void flushLocalValueMap();
-
-  /// Test whether the given value has exactly one use.
-  bool hasTrivialKill(const Value *V) const;
 };

 }
--- a/llvm/lib/Target/X86/X86FastISel.cpp
+++ b/llvm/lib/Target/X86/X86FastISel.cpp
@ -1637,6 +1637,18 @@ bool X86FastISel::TryEmitSmallMemcpy(X86AddressMode DestAM,
  return true;
 }

+static bool isCommutativeIntrinsic(IntrinsicInst const &I) {
+  switch (I.getIntrinsicID()) {
+  case Intrinsic::sadd_with_overflow:
+  case Intrinsic::uadd_with_overflow:
+  case Intrinsic::smul_with_overflow:
+  case Intrinsic::umul_with_overflow:
+    return true;
+  default:
+    return false;
+  }
+}
+
 bool X86FastISel::X86VisitIntrinsicCall(const IntrinsicInst &I) {
  // FIXME: Handle more intrinsics.
  switch (I.getIntrinsicID()) {
@ -1718,47 +1730,94 @@ bool X86FastISel::X86VisitIntrinsicCall(const IntrinsicInst &I) {
    return true;
  }
  case Intrinsic::sadd_with_overflow:
-  case Intrinsic::uadd_with_overflow: {
-    // FIXME: Should fold immediates.
-
-    // Replace "add with overflow" intrinsics with an "add" instruction followed
-    // by a seto/setc instruction.
+  case Intrinsic::uadd_with_overflow:
+  case Intrinsic::ssub_with_overflow:
+  case Intrinsic::usub_with_overflow:
+  case Intrinsic::smul_with_overflow:
+  case Intrinsic::umul_with_overflow: {
+    // This implements the basic lowering of the xalu with overflow intrinsics
+    // into add/sub/mul folowed by either seto or setb.
    const Function *Callee = I.getCalledFunction();
-    Type *RetTy =
-      cast<StructType>(Callee->getReturnType())->getTypeAtIndex(unsigned(0));
+    auto *Ty = cast<StructType>(Callee->getReturnType());
+    Type *RetTy = Ty->getTypeAtIndex(0U);
+    Type *CondTy = Ty->getTypeAtIndex(1);

    MVT VT;
    if (!isTypeLegal(RetTy, VT))
      return false;

-    const Value *Op1 = I.getArgOperand(0);
-    const Value *Op2 = I.getArgOperand(1);
-    unsigned Reg1 = getRegForValue(Op1);
-    unsigned Reg2 = getRegForValue(Op2);
-
-    if (Reg1 == 0 || Reg2 == 0)
-      // FIXME: Handle values *not* in registers.
+    if (VT < MVT::i8 || VT > MVT::i64)
      return false;

-    unsigned OpC = 0;
-    if (VT == MVT::i32)
-      OpC = X86::ADD32rr;
-    else if (VT == MVT::i64)
-      OpC = X86::ADD64rr;
-    else
+    const Value *LHS = I.getArgOperand(0);
+    const Value *RHS = I.getArgOperand(1);
+
+    // Canonicalize immediates to the RHS.
+    if (isa<ConstantInt>(LHS) && !isa<ConstantInt>(RHS) &&
+        isCommutativeIntrinsic(I))
+      std::swap(LHS, RHS);
+
+    unsigned BaseOpc, CondOpc;
+    switch (I.getIntrinsicID()) {
+    default: llvm_unreachable("Unexpected intrinsic!");
+    case Intrinsic::sadd_with_overflow:
+      BaseOpc = ISD::ADD; CondOpc = X86::SETOr; break;
+    case Intrinsic::uadd_with_overflow:
+      BaseOpc = ISD::ADD; CondOpc = X86::SETBr; break;
+    case Intrinsic::ssub_with_overflow:
+      BaseOpc = ISD::SUB; CondOpc = X86::SETOr; break;
+    case Intrinsic::usub_with_overflow:
+      BaseOpc = ISD::SUB; CondOpc = X86::SETBr; break;
+    case Intrinsic::smul_with_overflow:
+      BaseOpc = ISD::MUL; CondOpc = X86::SETOr; break;
+    case Intrinsic::umul_with_overflow:
+      BaseOpc = X86ISD::UMUL; CondOpc = X86::SETOr; break;
+    }
+
+    unsigned LHSReg = getRegForValue(LHS);
+    if (LHSReg == 0)
+      return false;
+    bool LHSIsKill = hasTrivialKill(LHS);
+
+    unsigned ResultReg = 0;
+    // Check if we have an immediate version.
+    if (auto const *C = dyn_cast<ConstantInt>(RHS)) {
+      ResultReg = FastEmit_ri(VT, VT, BaseOpc, LHSReg, LHSIsKill,
+                              C->getZExtValue());
+    }
+
+    unsigned RHSReg;
+    bool RHSIsKill;
+    if (!ResultReg) {
+      RHSReg = getRegForValue(RHS);
+      if (RHSReg == 0)
+        return false;
+      RHSIsKill = hasTrivialKill(RHS);
+      ResultReg = FastEmit_rr(VT, VT, BaseOpc, LHSReg, LHSIsKill, RHSReg,
+                              RHSIsKill);
+    }
+
+    // FastISel doesn't have a pattern for X86::MUL*r. Emit it manually.
+    if (BaseOpc == X86ISD::UMUL && !ResultReg) {
+      static const unsigned MULOpc[] =
+      { X86::MUL8r, X86::MUL16r, X86::MUL32r, X86::MUL64r };
+      static const unsigned Reg[] = { X86::AL, X86::AX, X86::EAX, X86::RAX };
+      // First copy the first operand into RAX, which is an implicit input to
+      // the X86::MUL*r instruction.
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+              TII.get(TargetOpcode::COPY), Reg[VT.SimpleTy-MVT::i8])
+        .addReg(LHSReg, getKillRegState(LHSIsKill));
+      ResultReg = FastEmitInst_r(MULOpc[VT.SimpleTy-MVT::i8],
+                                 TLI.getRegClassFor(VT), RHSReg, RHSIsKill);
+    }
+
+    if (!ResultReg)
      return false;

-    // The call to CreateRegs builds two sequential registers, to store the
-    // both the returned values.
-    unsigned ResultReg = FuncInfo.CreateRegs(I.getType());
-    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(OpC), ResultReg)
-      .addReg(Reg1).addReg(Reg2);
-
-    unsigned Opc = X86::SETBr;
-    if (I.getIntrinsicID() == Intrinsic::sadd_with_overflow)
-      Opc = X86::SETOr;
-    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc),
-            ResultReg + 1);
+    unsigned ResultReg2 = FuncInfo.CreateRegs(CondTy);
+    assert((ResultReg+1) == ResultReg2 && "Nonconsecutive result registers.");
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(CondOpc),
+            ResultReg2);

    UpdateValueMap(&I, ResultReg, 2);
    return true;
--- a/llvm/test/CodeGen/X86/xaluo.ll
+++ b/llvm/test/CodeGen/X86/xaluo.ll
@ -0,0 +1,337 @@
+; RUN: llc -mtriple=x86_64-unknown-unknown < %s | FileCheck %s -check-prefix=DAG
+; RUN: llc -mtriple=x86_64-unknown-unknown -fast-isel -fast-isel-abort < %s | FileCheck %s --check-prefix=FAST
+
+;
+; Get the actual value of the overflow bit.
+;
+; SADDO reg, reg
+define zeroext i1 @saddo.i8(i8 signext %v1, i8 signext %v2, i8* %res) {
+entry:
+; DAG-LABEL:    saddo.i8
+; DAG:          addb %sil, %dil
+; DAG-NEXT:     seto %al
+; FAST-LABEL:   saddo.i8
+; FAST:         addb %sil, %dil
+; FAST-NEXT:    seto %al
+  %t = call {i8, i1} @llvm.sadd.with.overflow.i8(i8 %v1, i8 %v2)
+  %val = extractvalue {i8, i1} %t, 0
+  %obit = extractvalue {i8, i1} %t, 1
+  store i8 %val, i8* %res
+  ret i1 %obit
+}
+
+define zeroext i1 @saddo.i16(i16 %v1, i16 %v2, i16* %res) {
+entry:
+; DAG-LABEL:    saddo.i16
+; DAG:          addw %si, %di
+; DAG-NEXT:     seto %al
+; FAST-LABEL:   saddo.i16
+; FAST:         addw %si, %di
+; FAST-NEXT:    seto %al
+  %t = call {i16, i1} @llvm.sadd.with.overflow.i16(i16 %v1, i16 %v2)
+  %val = extractvalue {i16, i1} %t, 0
+  %obit = extractvalue {i16, i1} %t, 1
+  store i16 %val, i16* %res
+  ret i1 %obit
+}
+
+define zeroext i1 @saddo.i32(i32 %v1, i32 %v2, i32* %res) {
+entry:
+; DAG-LABEL:    saddo.i32
+; DAG:          addl %esi, %edi
+; DAG-NEXT:     seto %al
+; FAST-LABEL:   saddo.i32
+; FAST:         addl %esi, %edi
+; FAST-NEXT:    seto %al
+  %t = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %v1, i32 %v2)
+  %val = extractvalue {i32, i1} %t, 0
+  %obit = extractvalue {i32, i1} %t, 1
+  store i32 %val, i32* %res
+  ret i1 %obit
+}
+
+define zeroext i1 @saddo.i64(i64 %v1, i64 %v2, i64* %res) {
+entry:
+; DAG-LABEL:    saddo.i64
+; DAG:          addq %rsi, %rdi
+; DAG-NEXT:     seto %al
+; FAST-LABEL:   saddo.i64
+; FAST:         addq %rsi, %rdi
+; FAST-NEXT:    seto %al
+  %t = call {i64, i1} @llvm.sadd.with.overflow.i64(i64 %v1, i64 %v2)
+  %val = extractvalue {i64, i1} %t, 0
+  %obit = extractvalue {i64, i1} %t, 1
+  store i64 %val, i64* %res
+  ret i1 %obit
+}
+
+; SADDO reg, imm | imm, reg
+; FIXME: INC isn't supported in FastISel yet
+define zeroext i1 @saddo.i64imm1(i64 %v1, i64* %res) {
+entry:
+; DAG-LABEL:    saddo.i64imm1
+; DAG:          incq %rdi
+; DAG-NEXT:     seto %al
+; FAST-LABEL:   saddo.i64imm1
+; FAST:         addq $1, %rdi
+; FAST-NEXT:    seto %al
+  %t = call {i64, i1} @llvm.sadd.with.overflow.i64(i64 %v1, i64 1)
+  %val = extractvalue {i64, i1} %t, 0
+  %obit = extractvalue {i64, i1} %t, 1
+  store i64 %val, i64* %res
+  ret i1 %obit
+}
+
+; FIXME: DAG doesn't optimize immediates on the LHS.
+define zeroext i1 @saddo.i64imm2(i64 %v1, i64* %res) {
+entry:
+; DAG-LABEL:    saddo.i64imm2
+; DAG:          mov
+; DAG-NEXT:     addq
+; DAG-NEXT:     seto
+; FAST-LABEL:   saddo.i64imm2
+; FAST:         addq $1, %rdi
+; FAST-NEXT:    seto %al
+  %t = call {i64, i1} @llvm.sadd.with.overflow.i64(i64 1, i64 %v1)
+  %val = extractvalue {i64, i1} %t, 0
+  %obit = extractvalue {i64, i1} %t, 1
+  store i64 %val, i64* %res
+  ret i1 %obit
+}
+
+; Check boundary conditions for large immediates.
+define zeroext i1 @saddo.i64imm3(i64 %v1, i64* %res) {
+entry:
+; DAG-LABEL:    saddo.i64imm3
+; DAG:          addq $-2147483648, %rdi
+; DAG-NEXT:     seto %al
+; FAST-LABEL:   saddo.i64imm3
+; FAST:         addq $-2147483648, %rdi
+; FAST-NEXT:    seto %al
+  %t = call {i64, i1} @llvm.sadd.with.overflow.i64(i64 %v1, i64 -2147483648)
+  %val = extractvalue {i64, i1} %t, 0
+  %obit = extractvalue {i64, i1} %t, 1
+  store i64 %val, i64* %res
+  ret i1 %obit
+}
+
+define zeroext i1 @saddo.i64imm4(i64 %v1, i64* %res) {
+entry:
+; DAG-LABEL:    saddo.i64imm4
+; DAG:          movabsq $-21474836489, %[[REG:[a-z]+]]
+; DAG-NEXT:     addq %rdi, %[[REG]]
+; DAG-NEXT:     seto
+; FAST-LABEL:   saddo.i64imm4
+; FAST:         movabsq $-21474836489, %[[REG:[a-z]+]]
+; FAST-NEXT:    addq %rdi, %[[REG]]
+; FAST-NEXT:    seto
+  %t = call {i64, i1} @llvm.sadd.with.overflow.i64(i64 %v1, i64 -21474836489)
+  %val = extractvalue {i64, i1} %t, 0
+  %obit = extractvalue {i64, i1} %t, 1
+  store i64 %val, i64* %res
+  ret i1 %obit
+}
+
+define zeroext i1 @saddo.i64imm5(i64 %v1, i64* %res) {
+entry:
+; DAG-LABEL:    saddo.i64imm5
+; DAG:          addq $2147483647, %rdi
+; DAG-NEXT:     seto
+; FAST-LABEL:   saddo.i64imm5
+; FAST:         addq $2147483647, %rdi
+; FAST-NEXT:    seto
+  %t = call {i64, i1} @llvm.sadd.with.overflow.i64(i64 %v1, i64 2147483647)
+  %val = extractvalue {i64, i1} %t, 0
+  %obit = extractvalue {i64, i1} %t, 1
+  store i64 %val, i64* %res
+  ret i1 %obit
+}
+
+; TODO: FastISel shouldn't use movabsq.
+define zeroext i1 @saddo.i64imm6(i64 %v1, i64* %res) {
+entry:
+; DAG-LABEL:    saddo.i64imm6
+; DAG:          movl $2147483648, %ecx
+; DAG:          addq %rdi, %rcx
+; DAG-NEXT:     seto
+; FAST-LABEL:   saddo.i64imm6
+; FAST:         movabsq $2147483648, %[[REG:[a-z]+]]
+; FAST:         addq %rdi, %[[REG]]
+; FAST-NEXT:     seto
+  %t = call {i64, i1} @llvm.sadd.with.overflow.i64(i64 %v1, i64 2147483648)
+  %val = extractvalue {i64, i1} %t, 0
+  %obit = extractvalue {i64, i1} %t, 1
+  store i64 %val, i64* %res
+  ret i1 %obit
+}
+
+; UADDO
+define zeroext i1 @uaddo.i32(i32 %v1, i32 %v2, i32* %res) {
+entry:
+; DAG-LABEL:    uaddo.i32
+; DAG:          addl %esi, %edi
+; DAG-NEXT:     setb %al
+; FAST-LABEL:   uaddo.i32
+; FAST:         addl %esi, %edi
+; FAST-NEXT:    setb %al
+  %t = call {i32, i1} @llvm.uadd.with.overflow.i32(i32 %v1, i32 %v2)
+  %val = extractvalue {i32, i1} %t, 0
+  %obit = extractvalue {i32, i1} %t, 1
+  store i32 %val, i32* %res
+  ret i1 %obit
+}
+
+define zeroext i1 @uaddo.i64(i64 %v1, i64 %v2, i64* %res) {
+entry:
+; DAG-LABEL:    uaddo.i64
+; DAG:          addq %rsi, %rdi
+; DAG-NEXT:     setb %al
+; FAST-LABEL:   uaddo.i64
+; FAST:         addq %rsi, %rdi
+; FAST-NEXT:    setb %al
+  %t = call {i64, i1} @llvm.uadd.with.overflow.i64(i64 %v1, i64 %v2)
+  %val = extractvalue {i64, i1} %t, 0
+  %obit = extractvalue {i64, i1} %t, 1
+  store i64 %val, i64* %res
+  ret i1 %obit
+}
+
+; SSUBO
+define zeroext i1 @ssubo.i32(i32 %v1, i32 %v2, i32* %res) {
+entry:
+; DAG-LABEL:    ssubo.i32
+; DAG:          subl %esi, %edi
+; DAG-NEXT:     seto %al
+; FAST-LABEL:   ssubo.i32
+; FAST:         subl %esi, %edi
+; FAST-NEXT:    seto %al
+  %t = call {i32, i1} @llvm.ssub.with.overflow.i32(i32 %v1, i32 %v2)
+  %val = extractvalue {i32, i1} %t, 0
+  %obit = extractvalue {i32, i1} %t, 1
+  store i32 %val, i32* %res
+  ret i1 %obit
+}
+
+define zeroext i1 @ssubo.i64(i64 %v1, i64 %v2, i64* %res) {
+entry:
+; DAG-LABEL:    ssubo.i64
+; DAG:          subq %rsi, %rdi
+; DAG-NEXT:     seto %al
+; FAST-LABEL:   ssubo.i64
+; FAST:         subq %rsi, %rdi
+; FAST-NEXT:    seto %al
+  %t = call {i64, i1} @llvm.ssub.with.overflow.i64(i64 %v1, i64 %v2)
+  %val = extractvalue {i64, i1} %t, 0
+  %obit = extractvalue {i64, i1} %t, 1
+  store i64 %val, i64* %res
+  ret i1 %obit
+}
+
+; USUBO
+define zeroext i1 @usubo.i32(i32 %v1, i32 %v2, i32* %res) {
+entry:
+; DAG-LABEL:    usubo.i32
+; DAG:          subl %esi, %edi
+; DAG-NEXT:     setb %al
+; FAST-LABEL:   usubo.i32
+; FAST:         subl %esi, %edi
+; FAST-NEXT:    setb %al
+  %t = call {i32, i1} @llvm.usub.with.overflow.i32(i32 %v1, i32 %v2)
+  %val = extractvalue {i32, i1} %t, 0
+  %obit = extractvalue {i32, i1} %t, 1
+  store i32 %val, i32* %res
+  ret i1 %obit
+}
+
+define zeroext i1 @usubo.i64(i64 %v1, i64 %v2, i64* %res) {
+entry:
+; DAG-LABEL:    usubo.i64
+; DAG:          subq %rsi, %rdi
+; DAG-NEXT:     setb %al
+; FAST-LABEL:   usubo.i64
+; FAST:         subq %rsi, %rdi
+; FAST-NEXT:    setb %al
+  %t = call {i64, i1} @llvm.usub.with.overflow.i64(i64 %v1, i64 %v2)
+  %val = extractvalue {i64, i1} %t, 0
+  %obit = extractvalue {i64, i1} %t, 1
+  store i64 %val, i64* %res
+  ret i1 %obit
+}
+
+; SMULO
+define zeroext i1 @smulo.i32(i32 %v1, i32 %v2, i32* %res) {
+entry:
+; DAG-LABEL:    smulo.i32
+; DAG:          imull %esi, %edi
+; DAG-NEXT:     seto %al
+; FAST-LABEL:   smulo.i32
+; FAST:         imull %esi, %edi
+; FAST-NEXT:    seto %al
+  %t = call {i32, i1} @llvm.smul.with.overflow.i32(i32 %v1, i32 %v2)
+  %val = extractvalue {i32, i1} %t, 0
+  %obit = extractvalue {i32, i1} %t, 1
+  store i32 %val, i32* %res
+  ret i1 %obit
+}
+
+define zeroext i1 @smulo.i64(i64 %v1, i64 %v2, i64* %res) {
+entry:
+; DAG-LABEL:    smulo.i64
+; DAG:          imulq %rsi, %rdi
+; DAG-NEXT:     seto %al
+; FAST-LABEL:   smulo.i64
+; FAST:         imulq %rsi, %rdi
+; FAST-NEXT:    seto %al
+  %t = call {i64, i1} @llvm.smul.with.overflow.i64(i64 %v1, i64 %v2)
+  %val = extractvalue {i64, i1} %t, 0
+  %obit = extractvalue {i64, i1} %t, 1
+  store i64 %val, i64* %res
+  ret i1 %obit
+}
+
+; UMULO
+define zeroext i1 @umulo.i32(i32 %v1, i32 %v2, i32* %res) {
+entry:
+; DAG-LABEL:    umulo.i32
+; DAG:          mull %esi
+; DAG-NEXT:     seto
+; FAST-LABEL:   umulo.i32
+; FAST:         mull %esi
+; FAST-NEXT:    seto
+  %t = call {i32, i1} @llvm.umul.with.overflow.i32(i32 %v1, i32 %v2)
+  %val = extractvalue {i32, i1} %t, 0
+  %obit = extractvalue {i32, i1} %t, 1
+  store i32 %val, i32* %res
+  ret i1 %obit
+}
+
+define zeroext i1 @umulo.i64(i64 %v1, i64 %v2, i64* %res) {
+entry:
+; DAG-LABEL:    umulo.i64
+; DAG:          mulq %rsi
+; DAG-NEXT:     seto
+; FAST-LABEL:   umulo.i64
+; FAST:         mulq %rsi
+; FAST-NEXT:    seto
+  %t = call {i64, i1} @llvm.umul.with.overflow.i64(i64 %v1, i64 %v2)
+  %val = extractvalue {i64, i1} %t, 0
+  %obit = extractvalue {i64, i1} %t, 1
+  store i64 %val, i64* %res
+  ret i1 %obit
+}
+
+declare {i8, i1} @llvm.sadd.with.overflow.i8(i8, i8) nounwind readnone
+declare {i16, i1} @llvm.sadd.with.overflow.i16(i16, i16) nounwind readnone
+declare {i32, i1} @llvm.sadd.with.overflow.i32(i32, i32) nounwind readnone
+declare {i64, i1} @llvm.sadd.with.overflow.i64(i64, i64) nounwind readnone
+declare {i32, i1} @llvm.uadd.with.overflow.i32(i32, i32) nounwind readnone
+declare {i64, i1} @llvm.uadd.with.overflow.i64(i64, i64) nounwind readnone
+declare {i32, i1} @llvm.ssub.with.overflow.i32(i32, i32) nounwind readnone
+declare {i64, i1} @llvm.ssub.with.overflow.i64(i64, i64) nounwind readnone
+declare {i32, i1} @llvm.usub.with.overflow.i32(i32, i32) nounwind readnone
+declare {i64, i1} @llvm.usub.with.overflow.i64(i64, i64) nounwind readnone
+declare {i32, i1} @llvm.smul.with.overflow.i32(i32, i32) nounwind readnone
+declare {i64, i1} @llvm.smul.with.overflow.i64(i64, i64) nounwind readnone
+declare {i32, i1} @llvm.umul.with.overflow.i32(i32, i32) nounwind readnone
+declare {i64, i1} @llvm.umul.with.overflow.i64(i64, i64) nounwind readnone
+