Teach constant folding to perform conversions from constant floating

point values to their integer representation through the SSE intrinsic
calls. This is the last part of a README.txt entry for which I have real
world examples.

llvm-svn: 123206
This commit is contained in:
Chandler Carruth 2011-01-11 01:07:24 +00:00
parent fdf4969149
commit b1e7f557b7
3 changed files with 89 additions and 55 deletions

View File

@ -1047,6 +1047,14 @@ llvm::canConstantFoldCallTo(const Function *F) {
case Intrinsic::smul_with_overflow:
case Intrinsic::convert_from_fp16:
case Intrinsic::convert_to_fp16:
case Intrinsic::x86_sse_cvtss2si:
case Intrinsic::x86_sse_cvtss2si64:
case Intrinsic::x86_sse_cvttss2si:
case Intrinsic::x86_sse_cvttss2si64:
case Intrinsic::x86_sse2_cvtsd2si:
case Intrinsic::x86_sse2_cvtsd2si64:
case Intrinsic::x86_sse2_cvttsd2si:
case Intrinsic::x86_sse2_cvttsd2si64:
return true;
default:
return false;
@ -1116,6 +1124,36 @@ static Constant *ConstantFoldBinaryFP(double (*NativeFP)(double, double),
return 0; // dummy return to suppress warning
}
/// ConstantFoldConvertToInt - Attempt to fold an SSE floating point to integer
/// conversion whose input is a constant floating point value.
///
/// \param Op               the constant floating point operand; must be
///                         non-null.
/// \param roundTowardZero  when true the value is truncated (round toward
///                         zero); when false it is rounded to nearest with
///                         ties going to even, i.e. the default IEEE mode used
///                         by the non-truncating SSE conversions.
/// \param Ty               the integer result type; its bit width (at most 64)
///                         bounds the range of representable results.
///
/// \returns the folded integer Constant, or null when the conversion reports
/// any status other than "ok" or "inexact" and therefore cannot be folded.
static Constant *ConstantFoldConvertToInt(ConstantFP *Op, bool roundTowardZero,
                                          const Type *Ty) {
  assert(Op && "Called with NULL operand");

  // Every conversion intrinsic handled here yields an integer no wider than
  // 64 bits, so a single uint64_t is enough to receive the result.
  const unsigned NumBits = cast<IntegerType>(Ty)->getBitWidth();
  assert(NumBits <= 64 &&
         "Can only constant fold conversions to 64 and 32 bit ints");

  // Pick the rounding behaviour requested by the caller.
  APFloat::roundingMode RM = APFloat::rmNearestTiesToEven;
  if (roundTowardZero)
    RM = APFloat::rmTowardZero;

  // Work on a private copy of the operand's value.
  APFloat Src(Op->getValueAPF());
  uint64_t Bits;
  bool Exact = false;
  const APFloat::opStatus Result =
      Src.convertToInteger(&Bits, NumBits, /*isSigned=*/true, RM, &Exact);

  // Losing the fractional part is acceptable; anything else is not folded.
  const bool Foldable =
      Result == APFloat::opOK || Result == APFloat::opInexact;
  if (!Foldable)
    return 0;

  return ConstantInt::get(Ty, Bits, /*isSigned=*/true);
}
/// ConstantFoldCall - Attempt to constant fold a call to the specified function
/// with the specified arguments, returning null if unsuccessful.
Constant *
@ -1246,6 +1284,24 @@ llvm::ConstantFoldCall(Function *F,
}
}
if (ConstantVector *Op = dyn_cast<ConstantVector>(Operands[0])) {
switch (F->getIntrinsicID()) {
default: break;
case Intrinsic::x86_sse_cvtss2si:
case Intrinsic::x86_sse_cvtss2si64:
case Intrinsic::x86_sse2_cvtsd2si:
case Intrinsic::x86_sse2_cvtsd2si64:
if (ConstantFP *FPOp = dyn_cast<ConstantFP>(Op->getOperand(0)))
return ConstantFoldConvertToInt(FPOp, /*roundTowardZero=*/false, Ty);
case Intrinsic::x86_sse_cvttss2si:
case Intrinsic::x86_sse_cvttss2si64:
case Intrinsic::x86_sse2_cvttsd2si:
case Intrinsic::x86_sse2_cvttsd2si64:
if (ConstantFP *FPOp = dyn_cast<ConstantFP>(Op->getOperand(0)))
return ConstantFoldConvertToInt(FPOp, /*roundTowardZero=*/true, Ty);
}
}
if (isa<UndefValue>(Operands[0])) {
if (F->getIntrinsicID() == Intrinsic::bswap)
return Operands[0];

View File

@ -2259,58 +2259,3 @@ Since we know that x+2.0 doesn't care about the sign of any zeros in X, we can
transform the fmul to 0.0, and then the fadd to 2.0.
//===---------------------------------------------------------------------===//
clang -O3 currently compiles this code:
#include <emmintrin.h>
int f(double x) { return _mm_cvtsd_si32(_mm_set_sd(x)); }
int g(double x) { return _mm_cvttsd_si32(_mm_set_sd(x)); }
into
define i32 @_Z1fd(double %x) nounwind readnone {
entry:
%vecinit.i = insertelement <2 x double> undef, double %x, i32 0
%vecinit1.i = insertelement <2 x double> %vecinit.i, double 0.000000e+00,i32 1
%0 = tail call i32 @llvm.x86.sse2.cvtsd2si(<2 x double> %vecinit1.i) nounwind
ret i32 %0
}
define i32 @_Z1gd(double %x) nounwind readnone {
entry:
%conv.i = fptosi double %x to i32
ret i32 %conv.i
}
This difference carries over to the assembly produced, resulting in:
_Z1fd: # @_Z1fd
# BB#0: # %entry
pushq %rbp
movq %rsp, %rbp
xorps %xmm1, %xmm1
movsd %xmm0, %xmm1
cvtsd2sil %xmm1, %eax
popq %rbp
ret
_Z1gd: # @_Z1gd
# BB#0: # %entry
pushq %rbp
movq %rsp, %rbp
cvttsd2si %xmm0, %eax
popq %rbp
ret
The problem is that we can't see through the intrinsic call used for cvtsd2si,
and fold away the unnecessary manipulation of the function parameter. When
these functions are inlined, it forms a barrier preventing many further
optimizations. LLVM IR doesn't have a good way to model the logic of
'cvtsd2si', its only FP -> int conversion path forces truncation. We should add
a rounding flag onto fptosi so that it can represent this type of rounding
naturally in the IR rather than using intrinsics. We might need to use a
'system_rounding_mode' flag to encode that the semantics of the rounding mode
can be changed by the program, but ideally we could just say that isn't
supported, and hard code the rounding.
//===---------------------------------------------------------------------===//

View File

@ -21,3 +21,36 @@ define double @T() {
%c = fadd double %b, %D
ret double %c
}
; Exercises constant folding of the SSE/SSE2 scalar float-to-int conversion
; intrinsics when the argument is a constant vector. Element 0 of every
; argument is 1.75; the remaining lanes are undef.
; The rounding variants (cvtss2si / cvtsd2si and their 64-bit forms) are
; expected to fold to 2, and the truncating variants (cvttss2si / cvttsd2si and
; their 64-bit forms) to 1. The four i32 results and the four i64 results are
; summed separately (6 each), the i32 sum is sign-extended, and the two sums
; are compared, so the function should reduce to a constant true result with
; no intrinsic invocation remaining.
define i1 @test_sse_cvt() nounwind readnone {
; CHECK: @test_sse_cvt
; CHECK-NOT: call
; CHECK: ret i1 true
entry:
; Single-precision inputs: rounding and truncating forms, i32 then i64 results.
%i0 = tail call i32 @llvm.x86.sse.cvtss2si(<4 x float> <float 1.75, float undef, float undef, float undef>) nounwind
%i1 = tail call i32 @llvm.x86.sse.cvttss2si(<4 x float> <float 1.75, float undef, float undef, float undef>) nounwind
%i2 = tail call i64 @llvm.x86.sse.cvtss2si64(<4 x float> <float 1.75, float undef, float undef, float undef>) nounwind
%i3 = tail call i64 @llvm.x86.sse.cvttss2si64(<4 x float> <float 1.75, float undef, float undef, float undef>) nounwind
; Double-precision inputs: rounding and truncating forms, i32 then i64 results.
%i4 = call i32 @llvm.x86.sse2.cvtsd2si(<2 x double> <double 1.75, double undef>) nounwind
%i5 = call i32 @llvm.x86.sse2.cvttsd2si(<2 x double> <double 1.75, double undef>) nounwind
%i6 = call i64 @llvm.x86.sse2.cvtsd2si64(<2 x double> <double 1.75, double undef>) nounwind
%i7 = call i64 @llvm.x86.sse2.cvttsd2si64(<2 x double> <double 1.75, double undef>) nounwind
; Sum of all i32 results.
%sum11 = add i32 %i0, %i1
%sum12 = add i32 %i4, %i5
%sum1 = add i32 %sum11, %sum12
; Sum of all i64 results.
%sum21 = add i64 %i2, %i3
%sum22 = add i64 %i6, %i7
%sum2 = add i64 %sum21, %sum22
; Widen the i32 sum so both totals can be compared at i64.
%sum1.sext = sext i32 %sum1 to i64
%b = icmp eq i64 %sum1.sext, %sum2
ret i1 %b
}
declare i32 @llvm.x86.sse.cvtss2si(<4 x float>) nounwind readnone
declare i32 @llvm.x86.sse.cvttss2si(<4 x float>) nounwind readnone
declare i64 @llvm.x86.sse.cvtss2si64(<4 x float>) nounwind readnone
declare i64 @llvm.x86.sse.cvttss2si64(<4 x float>) nounwind readnone
declare i32 @llvm.x86.sse2.cvtsd2si(<2 x double>) nounwind readnone
declare i32 @llvm.x86.sse2.cvttsd2si(<2 x double>) nounwind readnone
declare i64 @llvm.x86.sse2.cvtsd2si64(<2 x double>) nounwind readnone
declare i64 @llvm.x86.sse2.cvttsd2si64(<2 x double>) nounwind readnone