[FastISel][X86] Extend support for {s|u}{add|sub|mul}.with.overflow intrinsics.

llvm-svn: 210610
This commit is contained in:
Juergen Ributzka 2014-06-10 23:52:44 +00:00
parent d7e1fe40e1
commit 2dace6e54b
3 changed files with 430 additions and 34 deletions

View File

@ -373,6 +373,9 @@ protected:
/// - \c Add has a constant operand.
bool canFoldAddIntoGEP(const User *GEP, const Value *Add);
/// Test whether the given value has exactly one use.
bool hasTrivialKill(const Value *V) const;
private:
bool SelectBinaryOp(const User *I, unsigned ISDOpcode);
@ -408,9 +411,6 @@ private:
/// beginning of the block. It helps to avoid spilling cached variables across
/// heavy instructions like calls.
void flushLocalValueMap();
/// Test whether the given value has exactly one use.
bool hasTrivialKill(const Value *V) const;
};
}

View File

@ -1637,6 +1637,18 @@ bool X86FastISel::TryEmitSmallMemcpy(X86AddressMode DestAM,
return true;
}
/// Returns true when \p I is one of the {s|u}{add|mul}.with.overflow
/// intrinsics, i.e. when its intrinsic ID is sadd, uadd, smul or umul with
/// overflow; returns false for every other intrinsic ID.
static bool isCommutativeIntrinsic(IntrinsicInst const &I) {
  const auto ID = I.getIntrinsicID();
  return ID == Intrinsic::sadd_with_overflow ||
         ID == Intrinsic::uadd_with_overflow ||
         ID == Intrinsic::smul_with_overflow ||
         ID == Intrinsic::umul_with_overflow;
}
bool X86FastISel::X86VisitIntrinsicCall(const IntrinsicInst &I) {
// FIXME: Handle more intrinsics.
switch (I.getIntrinsicID()) {
@ -1718,47 +1730,94 @@ bool X86FastISel::X86VisitIntrinsicCall(const IntrinsicInst &I) {
return true;
}
case Intrinsic::sadd_with_overflow:
case Intrinsic::uadd_with_overflow: {
// FIXME: Should fold immediates.
// Replace "add with overflow" intrinsics with an "add" instruction followed
// by a seto/setc instruction.
case Intrinsic::uadd_with_overflow:
case Intrinsic::ssub_with_overflow:
case Intrinsic::usub_with_overflow:
case Intrinsic::smul_with_overflow:
case Intrinsic::umul_with_overflow: {
// This implements the basic lowering of the xalu with overflow intrinsics
// into add/sub/mul followed by either seto or setb.
const Function *Callee = I.getCalledFunction();
Type *RetTy =
cast<StructType>(Callee->getReturnType())->getTypeAtIndex(unsigned(0));
auto *Ty = cast<StructType>(Callee->getReturnType());
Type *RetTy = Ty->getTypeAtIndex(0U);
Type *CondTy = Ty->getTypeAtIndex(1);
MVT VT;
if (!isTypeLegal(RetTy, VT))
return false;
const Value *Op1 = I.getArgOperand(0);
const Value *Op2 = I.getArgOperand(1);
unsigned Reg1 = getRegForValue(Op1);
unsigned Reg2 = getRegForValue(Op2);
if (Reg1 == 0 || Reg2 == 0)
// FIXME: Handle values *not* in registers.
if (VT < MVT::i8 || VT > MVT::i64)
return false;
unsigned OpC = 0;
if (VT == MVT::i32)
OpC = X86::ADD32rr;
else if (VT == MVT::i64)
OpC = X86::ADD64rr;
else
const Value *LHS = I.getArgOperand(0);
const Value *RHS = I.getArgOperand(1);
// Canonicalize immediates to the RHS.
if (isa<ConstantInt>(LHS) && !isa<ConstantInt>(RHS) &&
isCommutativeIntrinsic(I))
std::swap(LHS, RHS);
unsigned BaseOpc, CondOpc;
switch (I.getIntrinsicID()) {
default: llvm_unreachable("Unexpected intrinsic!");
case Intrinsic::sadd_with_overflow:
BaseOpc = ISD::ADD; CondOpc = X86::SETOr; break;
case Intrinsic::uadd_with_overflow:
BaseOpc = ISD::ADD; CondOpc = X86::SETBr; break;
case Intrinsic::ssub_with_overflow:
BaseOpc = ISD::SUB; CondOpc = X86::SETOr; break;
case Intrinsic::usub_with_overflow:
BaseOpc = ISD::SUB; CondOpc = X86::SETBr; break;
case Intrinsic::smul_with_overflow:
BaseOpc = ISD::MUL; CondOpc = X86::SETOr; break;
case Intrinsic::umul_with_overflow:
BaseOpc = X86ISD::UMUL; CondOpc = X86::SETOr; break;
}
unsigned LHSReg = getRegForValue(LHS);
if (LHSReg == 0)
return false;
bool LHSIsKill = hasTrivialKill(LHS);
unsigned ResultReg = 0;
// Check if we have an immediate version.
if (auto const *C = dyn_cast<ConstantInt>(RHS)) {
ResultReg = FastEmit_ri(VT, VT, BaseOpc, LHSReg, LHSIsKill,
C->getZExtValue());
}
unsigned RHSReg;
bool RHSIsKill;
if (!ResultReg) {
RHSReg = getRegForValue(RHS);
if (RHSReg == 0)
return false;
RHSIsKill = hasTrivialKill(RHS);
ResultReg = FastEmit_rr(VT, VT, BaseOpc, LHSReg, LHSIsKill, RHSReg,
RHSIsKill);
}
// FastISel doesn't have a pattern for X86::MUL*r. Emit it manually.
if (BaseOpc == X86ISD::UMUL && !ResultReg) {
static const unsigned MULOpc[] =
{ X86::MUL8r, X86::MUL16r, X86::MUL32r, X86::MUL64r };
static const unsigned Reg[] = { X86::AL, X86::AX, X86::EAX, X86::RAX };
// First copy the first operand into RAX, which is an implicit input to
// the X86::MUL*r instruction.
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(TargetOpcode::COPY), Reg[VT.SimpleTy-MVT::i8])
.addReg(LHSReg, getKillRegState(LHSIsKill));
ResultReg = FastEmitInst_r(MULOpc[VT.SimpleTy-MVT::i8],
TLI.getRegClassFor(VT), RHSReg, RHSIsKill);
}
if (!ResultReg)
return false;
// The call to CreateRegs builds two sequential registers, to store
// both of the returned values.
unsigned ResultReg = FuncInfo.CreateRegs(I.getType());
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(OpC), ResultReg)
.addReg(Reg1).addReg(Reg2);
unsigned Opc = X86::SETBr;
if (I.getIntrinsicID() == Intrinsic::sadd_with_overflow)
Opc = X86::SETOr;
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc),
ResultReg + 1);
unsigned ResultReg2 = FuncInfo.CreateRegs(CondTy);
assert((ResultReg+1) == ResultReg2 && "Nonconsecutive result registers.");
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(CondOpc),
ResultReg2);
UpdateValueMap(&I, ResultReg, 2);
return true;

View File

@ -0,0 +1,337 @@
; RUN: llc -mtriple=x86_64-unknown-unknown < %s | FileCheck %s -check-prefix=DAG
; RUN: llc -mtriple=x86_64-unknown-unknown -fast-isel -fast-isel-abort < %s | FileCheck %s --check-prefix=FAST
;
; Get the actual value of the overflow bit.
;
; SADDO reg, reg
; Signed add-with-overflow on two i8 register arguments (both signext).
; The sum is stored through %res and the overflow bit is returned. Both llc
; runs are expected to select addb immediately followed by seto.
define zeroext i1 @saddo.i8(i8 signext %v1, i8 signext %v2, i8* %res) {
entry:
; DAG-LABEL: saddo.i8
; DAG: addb %sil, %dil
; DAG-NEXT: seto %al
; FAST-LABEL: saddo.i8
; FAST: addb %sil, %dil
; FAST-NEXT: seto %al
%t = call {i8, i1} @llvm.sadd.with.overflow.i8(i8 %v1, i8 %v2)
%val = extractvalue {i8, i1} %t, 0
%obit = extractvalue {i8, i1} %t, 1
store i8 %val, i8* %res
ret i1 %obit
}
; Signed add-with-overflow on two i16 register arguments. The sum is stored
; through %res and the overflow bit is returned. Both llc runs are expected
; to select addw immediately followed by seto.
define zeroext i1 @saddo.i16(i16 %v1, i16 %v2, i16* %res) {
entry:
; DAG-LABEL: saddo.i16
; DAG: addw %si, %di
; DAG-NEXT: seto %al
; FAST-LABEL: saddo.i16
; FAST: addw %si, %di
; FAST-NEXT: seto %al
%t = call {i16, i1} @llvm.sadd.with.overflow.i16(i16 %v1, i16 %v2)
%val = extractvalue {i16, i1} %t, 0
%obit = extractvalue {i16, i1} %t, 1
store i16 %val, i16* %res
ret i1 %obit
}
; Signed add-with-overflow on two i32 register arguments. The sum is stored
; through %res and the overflow bit is returned. Both llc runs are expected
; to select addl immediately followed by seto.
define zeroext i1 @saddo.i32(i32 %v1, i32 %v2, i32* %res) {
entry:
; DAG-LABEL: saddo.i32
; DAG: addl %esi, %edi
; DAG-NEXT: seto %al
; FAST-LABEL: saddo.i32
; FAST: addl %esi, %edi
; FAST-NEXT: seto %al
%t = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %v1, i32 %v2)
%val = extractvalue {i32, i1} %t, 0
%obit = extractvalue {i32, i1} %t, 1
store i32 %val, i32* %res
ret i1 %obit
}
; Signed add-with-overflow on two i64 register arguments. The sum is stored
; through %res and the overflow bit is returned. Both llc runs are expected
; to select addq immediately followed by seto.
define zeroext i1 @saddo.i64(i64 %v1, i64 %v2, i64* %res) {
entry:
; DAG-LABEL: saddo.i64
; DAG: addq %rsi, %rdi
; DAG-NEXT: seto %al
; FAST-LABEL: saddo.i64
; FAST: addq %rsi, %rdi
; FAST-NEXT: seto %al
%t = call {i64, i1} @llvm.sadd.with.overflow.i64(i64 %v1, i64 %v2)
%val = extractvalue {i64, i1} %t, 0
%obit = extractvalue {i64, i1} %t, 1
store i64 %val, i64* %res
ret i1 %obit
}
; SADDO reg, imm | imm, reg
; FIXME: INC isn't supported in FastISel yet
; Signed add-with-overflow of an i64 argument and the constant 1 on the RHS.
; The default llc run is expected to select incq, while the -fast-isel run is
; expected to fold the constant as an immediate (addq $1); both are then
; followed directly by seto.
define zeroext i1 @saddo.i64imm1(i64 %v1, i64* %res) {
entry:
; DAG-LABEL: saddo.i64imm1
; DAG: incq %rdi
; DAG-NEXT: seto %al
; FAST-LABEL: saddo.i64imm1
; FAST: addq $1, %rdi
; FAST-NEXT: seto %al
%t = call {i64, i1} @llvm.sadd.with.overflow.i64(i64 %v1, i64 1)
%val = extractvalue {i64, i1} %t, 0
%obit = extractvalue {i64, i1} %t, 1
store i64 %val, i64* %res
ret i1 %obit
}
; FIXME: DAG doesn't optimize immediates on the LHS.
; Same as the constant-1 case, but with the constant as the first (LHS)
; operand of the intrinsic. The -fast-isel run is still expected to fold it
; as an immediate (addq $1 then seto); the default run is only checked for a
; mov / addq / seto sequence.
define zeroext i1 @saddo.i64imm2(i64 %v1, i64* %res) {
entry:
; DAG-LABEL: saddo.i64imm2
; DAG: mov
; DAG-NEXT: addq
; DAG-NEXT: seto
; FAST-LABEL: saddo.i64imm2
; FAST: addq $1, %rdi
; FAST-NEXT: seto %al
%t = call {i64, i1} @llvm.sadd.with.overflow.i64(i64 1, i64 %v1)
%val = extractvalue {i64, i1} %t, 0
%obit = extractvalue {i64, i1} %t, 1
store i64 %val, i64* %res
ret i1 %obit
}
; Check boundary conditions for large immediates.
; Signed add-with-overflow of an i64 argument and -2147483648 (INT32_MIN).
; Both llc runs are expected to encode the constant directly as the addq
; immediate, followed by seto.
define zeroext i1 @saddo.i64imm3(i64 %v1, i64* %res) {
entry:
; DAG-LABEL: saddo.i64imm3
; DAG: addq $-2147483648, %rdi
; DAG-NEXT: seto %al
; FAST-LABEL: saddo.i64imm3
; FAST: addq $-2147483648, %rdi
; FAST-NEXT: seto %al
%t = call {i64, i1} @llvm.sadd.with.overflow.i64(i64 %v1, i64 -2147483648)
%val = extractvalue {i64, i1} %t, 0
%obit = extractvalue {i64, i1} %t, 1
store i64 %val, i64* %res
ret i1 %obit
}
; Signed add-with-overflow of an i64 argument and -21474836489. Both llc runs
; are expected to materialize the constant into a register with movabsq and
; then use a register-register addq followed by seto.
; NOTE(review): the constant is -21474836489 rather than INT32_MIN - 1
; (-2147483649); it is still below INT32_MIN, so it presumably exercises the
; same "does not fit an immediate" path -- confirm the intended value.
define zeroext i1 @saddo.i64imm4(i64 %v1, i64* %res) {
entry:
; DAG-LABEL: saddo.i64imm4
; DAG: movabsq $-21474836489, %[[REG:[a-z]+]]
; DAG-NEXT: addq %rdi, %[[REG]]
; DAG-NEXT: seto
; FAST-LABEL: saddo.i64imm4
; FAST: movabsq $-21474836489, %[[REG:[a-z]+]]
; FAST-NEXT: addq %rdi, %[[REG]]
; FAST-NEXT: seto
%t = call {i64, i1} @llvm.sadd.with.overflow.i64(i64 %v1, i64 -21474836489)
%val = extractvalue {i64, i1} %t, 0
%obit = extractvalue {i64, i1} %t, 1
store i64 %val, i64* %res
ret i1 %obit
}
; Signed add-with-overflow of an i64 argument and 2147483647 (INT32_MAX).
; Both llc runs are expected to encode the constant directly as the addq
; immediate, followed by seto.
define zeroext i1 @saddo.i64imm5(i64 %v1, i64* %res) {
entry:
; DAG-LABEL: saddo.i64imm5
; DAG: addq $2147483647, %rdi
; DAG-NEXT: seto
; FAST-LABEL: saddo.i64imm5
; FAST: addq $2147483647, %rdi
; FAST-NEXT: seto
%t = call {i64, i1} @llvm.sadd.with.overflow.i64(i64 %v1, i64 2147483647)
%val = extractvalue {i64, i1} %t, 0
%obit = extractvalue {i64, i1} %t, 1
store i64 %val, i64* %res
ret i1 %obit
}
; TODO: FastISel shouldn't use movabsq.
; Signed add-with-overflow of an i64 argument and 2147483648 (INT32_MAX + 1).
; Neither run is expected to use the constant as an addq immediate: the
; default llc run is checked for a movl into %ecx, the -fast-isel run for a
; movabsq into some register, each followed by a register-register addq and
; then seto.
define zeroext i1 @saddo.i64imm6(i64 %v1, i64* %res) {
entry:
; DAG-LABEL: saddo.i64imm6
; DAG: movl $2147483648, %ecx
; DAG: addq %rdi, %rcx
; DAG-NEXT: seto
; FAST-LABEL: saddo.i64imm6
; FAST: movabsq $2147483648, %[[REG:[a-z]+]]
; FAST: addq %rdi, %[[REG]]
; FAST-NEXT: seto
%t = call {i64, i1} @llvm.sadd.with.overflow.i64(i64 %v1, i64 2147483648)
%val = extractvalue {i64, i1} %t, 0
%obit = extractvalue {i64, i1} %t, 1
store i64 %val, i64* %res
ret i1 %obit
}
; UADDO
; Unsigned add-with-overflow on two i32 register arguments. The sum is stored
; through %res and the overflow bit is returned. Both llc runs are expected
; to select addl immediately followed by setb.
define zeroext i1 @uaddo.i32(i32 %v1, i32 %v2, i32* %res) {
entry:
; DAG-LABEL: uaddo.i32
; DAG: addl %esi, %edi
; DAG-NEXT: setb %al
; FAST-LABEL: uaddo.i32
; FAST: addl %esi, %edi
; FAST-NEXT: setb %al
%t = call {i32, i1} @llvm.uadd.with.overflow.i32(i32 %v1, i32 %v2)
%val = extractvalue {i32, i1} %t, 0
%obit = extractvalue {i32, i1} %t, 1
store i32 %val, i32* %res
ret i1 %obit
}
; Unsigned add-with-overflow on two i64 register arguments. The sum is stored
; through %res and the overflow bit is returned. Both llc runs are expected
; to select addq immediately followed by setb.
define zeroext i1 @uaddo.i64(i64 %v1, i64 %v2, i64* %res) {
entry:
; DAG-LABEL: uaddo.i64
; DAG: addq %rsi, %rdi
; DAG-NEXT: setb %al
; FAST-LABEL: uaddo.i64
; FAST: addq %rsi, %rdi
; FAST-NEXT: setb %al
%t = call {i64, i1} @llvm.uadd.with.overflow.i64(i64 %v1, i64 %v2)
%val = extractvalue {i64, i1} %t, 0
%obit = extractvalue {i64, i1} %t, 1
store i64 %val, i64* %res
ret i1 %obit
}
; SSUBO
; Signed subtract-with-overflow on two i32 register arguments. The difference
; is stored through %res and the overflow bit is returned. Both llc runs are
; expected to select subl immediately followed by seto.
define zeroext i1 @ssubo.i32(i32 %v1, i32 %v2, i32* %res) {
entry:
; DAG-LABEL: ssubo.i32
; DAG: subl %esi, %edi
; DAG-NEXT: seto %al
; FAST-LABEL: ssubo.i32
; FAST: subl %esi, %edi
; FAST-NEXT: seto %al
%t = call {i32, i1} @llvm.ssub.with.overflow.i32(i32 %v1, i32 %v2)
%val = extractvalue {i32, i1} %t, 0
%obit = extractvalue {i32, i1} %t, 1
store i32 %val, i32* %res
ret i1 %obit
}
; Signed subtract-with-overflow on two i64 register arguments. The difference
; is stored through %res and the overflow bit is returned. Both llc runs are
; expected to select subq immediately followed by seto.
define zeroext i1 @ssubo.i64(i64 %v1, i64 %v2, i64* %res) {
entry:
; DAG-LABEL: ssubo.i64
; DAG: subq %rsi, %rdi
; DAG-NEXT: seto %al
; FAST-LABEL: ssubo.i64
; FAST: subq %rsi, %rdi
; FAST-NEXT: seto %al
%t = call {i64, i1} @llvm.ssub.with.overflow.i64(i64 %v1, i64 %v2)
%val = extractvalue {i64, i1} %t, 0
%obit = extractvalue {i64, i1} %t, 1
store i64 %val, i64* %res
ret i1 %obit
}
; USUBO
; Unsigned subtract-with-overflow on two i32 register arguments. The
; difference is stored through %res and the overflow bit is returned. Both
; llc runs are expected to select subl immediately followed by setb.
define zeroext i1 @usubo.i32(i32 %v1, i32 %v2, i32* %res) {
entry:
; DAG-LABEL: usubo.i32
; DAG: subl %esi, %edi
; DAG-NEXT: setb %al
; FAST-LABEL: usubo.i32
; FAST: subl %esi, %edi
; FAST-NEXT: setb %al
%t = call {i32, i1} @llvm.usub.with.overflow.i32(i32 %v1, i32 %v2)
%val = extractvalue {i32, i1} %t, 0
%obit = extractvalue {i32, i1} %t, 1
store i32 %val, i32* %res
ret i1 %obit
}
; Unsigned subtract-with-overflow on two i64 register arguments. The
; difference is stored through %res and the overflow bit is returned. Both
; llc runs are expected to select subq immediately followed by setb.
define zeroext i1 @usubo.i64(i64 %v1, i64 %v2, i64* %res) {
entry:
; DAG-LABEL: usubo.i64
; DAG: subq %rsi, %rdi
; DAG-NEXT: setb %al
; FAST-LABEL: usubo.i64
; FAST: subq %rsi, %rdi
; FAST-NEXT: setb %al
%t = call {i64, i1} @llvm.usub.with.overflow.i64(i64 %v1, i64 %v2)
%val = extractvalue {i64, i1} %t, 0
%obit = extractvalue {i64, i1} %t, 1
store i64 %val, i64* %res
ret i1 %obit
}
; SMULO
; Signed multiply-with-overflow on two i32 register arguments. The product is
; stored through %res and the overflow bit is returned. Both llc runs are
; expected to select the two-operand imull immediately followed by seto.
define zeroext i1 @smulo.i32(i32 %v1, i32 %v2, i32* %res) {
entry:
; DAG-LABEL: smulo.i32
; DAG: imull %esi, %edi
; DAG-NEXT: seto %al
; FAST-LABEL: smulo.i32
; FAST: imull %esi, %edi
; FAST-NEXT: seto %al
%t = call {i32, i1} @llvm.smul.with.overflow.i32(i32 %v1, i32 %v2)
%val = extractvalue {i32, i1} %t, 0
%obit = extractvalue {i32, i1} %t, 1
store i32 %val, i32* %res
ret i1 %obit
}
; Signed multiply-with-overflow on two i64 register arguments. The product is
; stored through %res and the overflow bit is returned. Both llc runs are
; expected to select the two-operand imulq immediately followed by seto.
define zeroext i1 @smulo.i64(i64 %v1, i64 %v2, i64* %res) {
entry:
; DAG-LABEL: smulo.i64
; DAG: imulq %rsi, %rdi
; DAG-NEXT: seto %al
; FAST-LABEL: smulo.i64
; FAST: imulq %rsi, %rdi
; FAST-NEXT: seto %al
%t = call {i64, i1} @llvm.smul.with.overflow.i64(i64 %v1, i64 %v2)
%val = extractvalue {i64, i1} %t, 0
%obit = extractvalue {i64, i1} %t, 1
store i64 %val, i64* %res
ret i1 %obit
}
; UMULO
; Unsigned multiply-with-overflow on two i32 register arguments. The product
; is stored through %res and the overflow bit is returned. Both llc runs are
; expected to select the one-operand mull (taking %esi) immediately followed
; by seto.
define zeroext i1 @umulo.i32(i32 %v1, i32 %v2, i32* %res) {
entry:
; DAG-LABEL: umulo.i32
; DAG: mull %esi
; DAG-NEXT: seto
; FAST-LABEL: umulo.i32
; FAST: mull %esi
; FAST-NEXT: seto
%t = call {i32, i1} @llvm.umul.with.overflow.i32(i32 %v1, i32 %v2)
%val = extractvalue {i32, i1} %t, 0
%obit = extractvalue {i32, i1} %t, 1
store i32 %val, i32* %res
ret i1 %obit
}
; Unsigned multiply-with-overflow on two i64 register arguments. The product
; is stored through %res and the overflow bit is returned. Both llc runs are
; expected to select the one-operand mulq (taking %rsi) immediately followed
; by seto.
define zeroext i1 @umulo.i64(i64 %v1, i64 %v2, i64* %res) {
entry:
; DAG-LABEL: umulo.i64
; DAG: mulq %rsi
; DAG-NEXT: seto
; FAST-LABEL: umulo.i64
; FAST: mulq %rsi
; FAST-NEXT: seto
%t = call {i64, i1} @llvm.umul.with.overflow.i64(i64 %v1, i64 %v2)
%val = extractvalue {i64, i1} %t, 0
%obit = extractvalue {i64, i1} %t, 1
store i64 %val, i64* %res
ret i1 %obit
}
declare {i8, i1} @llvm.sadd.with.overflow.i8(i8, i8) nounwind readnone
declare {i16, i1} @llvm.sadd.with.overflow.i16(i16, i16) nounwind readnone
declare {i32, i1} @llvm.sadd.with.overflow.i32(i32, i32) nounwind readnone
declare {i64, i1} @llvm.sadd.with.overflow.i64(i64, i64) nounwind readnone
declare {i32, i1} @llvm.uadd.with.overflow.i32(i32, i32) nounwind readnone
declare {i64, i1} @llvm.uadd.with.overflow.i64(i64, i64) nounwind readnone
declare {i32, i1} @llvm.ssub.with.overflow.i32(i32, i32) nounwind readnone
declare {i64, i1} @llvm.ssub.with.overflow.i64(i64, i64) nounwind readnone
declare {i32, i1} @llvm.usub.with.overflow.i32(i32, i32) nounwind readnone
declare {i64, i1} @llvm.usub.with.overflow.i64(i64, i64) nounwind readnone
declare {i32, i1} @llvm.smul.with.overflow.i32(i32, i32) nounwind readnone
declare {i64, i1} @llvm.smul.with.overflow.i64(i64, i64) nounwind readnone
declare {i32, i1} @llvm.umul.with.overflow.i32(i32, i32) nounwind readnone
declare {i64, i1} @llvm.umul.with.overflow.i64(i64, i64) nounwind readnone