From b1e7f557b7a18e2fe31623077aa88429a60f5746 Mon Sep 17 00:00:00 2001
From: Chandler Carruth <chandlerc@gmail.com>
Date: Tue, 11 Jan 2011 01:07:24 +0000
Subject: [PATCH] Teach constant folding to perform conversions from constant
 floating point values to their integer representation through the SSE
 intrinsic calls. This is the last part of a README.txt entry for which I
 have real world examples.

llvm-svn: 123206
---
 llvm/lib/Analysis/ConstantFolding.cpp   | 56 +++++++++++++++++++++++++
 llvm/lib/Target/README.txt              | 55 ------------------------
 llvm/test/Transforms/ConstProp/calls.ll | 33 +++++++++++++++
 3 files changed, 89 insertions(+), 55 deletions(-)

diff --git a/llvm/lib/Analysis/ConstantFolding.cpp b/llvm/lib/Analysis/ConstantFolding.cpp
index 1b38c027dada..300821026a7e 100644
--- a/llvm/lib/Analysis/ConstantFolding.cpp
+++ b/llvm/lib/Analysis/ConstantFolding.cpp
@@ -1047,6 +1047,14 @@ llvm::canConstantFoldCallTo(const Function *F) {
   case Intrinsic::smul_with_overflow:
   case Intrinsic::convert_from_fp16:
   case Intrinsic::convert_to_fp16:
+  case Intrinsic::x86_sse_cvtss2si:
+  case Intrinsic::x86_sse_cvtss2si64:
+  case Intrinsic::x86_sse_cvttss2si:
+  case Intrinsic::x86_sse_cvttss2si64:
+  case Intrinsic::x86_sse2_cvtsd2si:
+  case Intrinsic::x86_sse2_cvtsd2si64:
+  case Intrinsic::x86_sse2_cvttsd2si:
+  case Intrinsic::x86_sse2_cvttsd2si64:
     return true;
   default:
     return false;
@@ -1116,6 +1124,36 @@ static Constant *ConstantFoldBinaryFP(double (*NativeFP)(double, double),
   return 0; // dummy return to suppress warning
 }
 
+/// ConstantFoldConvertToInt - Attempt to fold an SSE floating point to
+/// integer conversion of a constant floating point. If roundTowardZero is
+/// false, the default IEEE rounding is used (toward nearest, ties to even).
+/// This matches the behavior of the non-truncating SSE instructions in the
+/// default rounding mode. The desired integer type Ty is used to select how
+/// many bits are available for the result. Returns null if the conversion
+/// cannot be performed, otherwise returns the Constant value resulting from
+/// the conversion.
+static Constant *ConstantFoldConvertToInt(ConstantFP *Op, bool roundTowardZero,
+                                          const Type *Ty) {
+  assert(Op && "Called with NULL operand");
+  APFloat Val(Op->getValueAPF());
+
+  // All of these conversion intrinsics form an integer of at most 64 bits.
+  unsigned ResultWidth = cast<IntegerType>(Ty)->getBitWidth();
+  assert(ResultWidth <= 64 &&
+         "Can only constant fold conversions to 64 and 32 bit ints");
+
+  uint64_t UIntVal;
+  bool isExact = false;
+  APFloat::roundingMode mode = roundTowardZero? APFloat::rmTowardZero
+                                              : APFloat::rmNearestTiesToEven;
+  APFloat::opStatus status = Val.convertToInteger(&UIntVal, ResultWidth,
+                                                  /*isSigned=*/true, mode,
+                                                  &isExact);
+  if (status != APFloat::opOK && status != APFloat::opInexact)
+    return 0;
+  return ConstantInt::get(Ty, UIntVal, /*isSigned=*/true);
+}
+
 /// ConstantFoldCall - Attempt to constant fold a call to the specified function
 /// with the specified arguments, returning null if unsuccessful.
 Constant *
@@ -1246,6 +1284,24 @@ llvm::ConstantFoldCall(Function *F,
     }
   }
 
+  if (ConstantVector *Op = dyn_cast<ConstantVector>(Operands[0])) {
+    switch (F->getIntrinsicID()) {
+    default: break;
+    case Intrinsic::x86_sse_cvtss2si:
+    case Intrinsic::x86_sse_cvtss2si64:
+    case Intrinsic::x86_sse2_cvtsd2si:
+    case Intrinsic::x86_sse2_cvtsd2si64:
+      if (ConstantFP *FPOp = dyn_cast<ConstantFP>(Op->getOperand(0)))
+        return ConstantFoldConvertToInt(FPOp, /*roundTowardZero=*/false, Ty);
+    case Intrinsic::x86_sse_cvttss2si:
+    case Intrinsic::x86_sse_cvttss2si64:
+    case Intrinsic::x86_sse2_cvttsd2si:
+    case Intrinsic::x86_sse2_cvttsd2si64:
+      if (ConstantFP *FPOp = dyn_cast<ConstantFP>(Op->getOperand(0)))
+        return ConstantFoldConvertToInt(FPOp, /*roundTowardZero=*/true, Ty);
+    }
+  }
+
   if (isa<UndefValue>(Operands[0])) {
     if (F->getIntrinsicID() == Intrinsic::bswap)
       return Operands[0];
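
Aside (not part of the patch): the heart of the fold above is
APFloat::convertToInteger. The standalone sketch below mirrors the call
pattern of ConstantFoldConvertToInt against the pointer-based overload that
existed at the time of this commit (the signature changed in later LLVMs);
the helper name, the driver, and the sample value are illustrative only:

#include "llvm/ADT/APFloat.h"
#include <cstdint>
#include <cstdio>

using namespace llvm;

// Mirrors ConstantFoldConvertToInt: convert a float constant to a 32-bit
// signed integer, picking the rounding mode the way cvt/cvtt do.
static bool convertLikeSSE(float F, bool roundTowardZero, int64_t &Result) {
  APFloat Val(F);
  uint64_t UIntVal;
  bool isExact = false;
  APFloat::roundingMode mode = roundTowardZero ? APFloat::rmTowardZero
                                               : APFloat::rmNearestTiesToEven;
  APFloat::opStatus status = Val.convertToInteger(&UIntVal, /*Width=*/32,
                                                  /*isSigned=*/true, mode,
                                                  &isExact);
  // opInexact just means rounding happened; anything else (overflow, NaN)
  // means the fold is not safe and must be skipped.
  if (status != APFloat::opOK && status != APFloat::opInexact)
    return false;
  Result = static_cast<int32_t>(UIntVal); // value lives in the low 32 bits
  return true;
}

int main() {
  int64_t R;
  if (convertLikeSSE(1.75f, /*roundTowardZero=*/false, R))
    printf("cvt  1.75 -> %lld\n", (long long)R);  // prints 2
  if (convertLikeSSE(1.75f, /*roundTowardZero=*/true, R))
    printf("cvtt 1.75 -> %lld\n", (long long)R);  // prints 1
  return 0;
}
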
diff --git a/llvm/lib/Target/README.txt b/llvm/lib/Target/README.txt
index 194a19219cb1..c3a9330ba6ed 100644
--- a/llvm/lib/Target/README.txt
+++ b/llvm/lib/Target/README.txt
@@ -2259,58 +2259,3 @@ Since we know that x+2.0 doesn't care about the sign of any zeros in X, we can
 transform the fmul to 0.0, and then the fadd to 2.0.
 
 //===---------------------------------------------------------------------===//
-
-clang -O3 currently compiles this code:
-
-#include <emmintrin.h>
-int f(double x) { return _mm_cvtsd_si32(_mm_set_sd(x)); }
-int g(double x) { return _mm_cvttsd_si32(_mm_set_sd(x)); }
-
-into
-
-define i32 @_Z1fd(double %x) nounwind readnone {
-entry:
-  %vecinit.i = insertelement <2 x double> undef, double %x, i32 0
-  %vecinit1.i = insertelement <2 x double> %vecinit.i, double 0.000000e+00,i32 1
-  %0 = tail call i32 @llvm.x86.sse2.cvtsd2si(<2 x double> %vecinit1.i) nounwind
-  ret i32 %0
-}
-
-define i32 @_Z1gd(double %x) nounwind readnone {
-entry:
-  %conv.i = fptosi double %x to i32
-  ret i32 %conv.i
-}
-
-This difference carries over to the assembly produced, resulting in:
-
-_Z1fd:                                  # @_Z1fd
-# BB#0:                                 # %entry
-  pushq %rbp
-  movq  %rsp, %rbp
-  xorps %xmm1, %xmm1
-  movsd %xmm0, %xmm1
-  cvtsd2sil %xmm1, %eax
-  popq  %rbp
-  ret
-
-_Z1gd:                                  # @_Z1gd
-# BB#0:                                 # %entry
-  pushq %rbp
-  movq  %rsp, %rbp
-  cvttsd2si %xmm0, %eax
-  popq  %rbp
-  ret
-
-The problem is that we can't see through the intrinsic call used for cvtsd2si,
-and fold away the unnecessary manipulation of the function parameter. When
-these functions are inlined, it forms a barrier preventing many further
-optimizations. LLVM IR doesn't have a good way to model the logic of
-'cvtsd2si', its only FP -> int conversion path forces truncation. We should add
-a rounding flag onto fptosi so that it can represent this type of rounding
-naturally in the IR rather than using intrinsics. We might need to use a
-'system_rounding_mode' flag to encode that the semantics of the rounding mode
-can be changed by the program, but ideally we could just say that isn't
-supported, and hard code the rounding.
-
-//===---------------------------------------------------------------------===//
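
Aside (not part of the patch): the README entry deleted above turns on the
fact that cvtsd2si rounds to nearest (ties to even) while cvttsd2si truncates
toward zero. A small host-side demonstration of that difference, using the
same <emmintrin.h> intrinsics as the README example; the driver program and
the sample values are illustrative, and it assumes the default MXCSR rounding
mode:

#include <emmintrin.h>
#include <stdio.h>

int main() {
  const double vals[] = {1.75, 2.5, -1.5};
  for (int i = 0; i < 3; ++i) {
    __m128d v = _mm_set_sd(vals[i]);
    // cvtsd2si uses the current (default: nearest-even) rounding mode;
    // cvttsd2si always truncates toward zero.
    printf("%5.2f  cvtsd2si=%d  cvttsd2si=%d\n", vals[i],
           _mm_cvtsd_si32(v), _mm_cvttsd_si32(v));
  }
  // Expected: 1.75 -> 2/1,  2.5 -> 2/2 (tie goes to even),  -1.5 -> -2/-1
  return 0;
}
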
diff --git a/llvm/test/Transforms/ConstProp/calls.ll b/llvm/test/Transforms/ConstProp/calls.ll
index a12fc82d646a..82d73245ad15 100644
--- a/llvm/test/Transforms/ConstProp/calls.ll
+++ b/llvm/test/Transforms/ConstProp/calls.ll
@@ -21,3 +21,36 @@ define double @T() {
   %c = fadd double %b, %D
   ret double %c
 }
+
+define i1 @test_sse_cvt() nounwind readnone {
+; CHECK: @test_sse_cvt
+; CHECK-NOT: call
+; CHECK: ret i1 true
+entry:
+  %i0 = tail call i32 @llvm.x86.sse.cvtss2si(<4 x float> <float 1.75, float undef, float undef, float undef>) nounwind
+  %i1 = tail call i32 @llvm.x86.sse.cvttss2si(<4 x float> <float 1.75, float undef, float undef, float undef>) nounwind
+  %i2 = tail call i64 @llvm.x86.sse.cvtss2si64(<4 x float> <float 1.75, float undef, float undef, float undef>) nounwind
+  %i3 = tail call i64 @llvm.x86.sse.cvttss2si64(<4 x float> <float 1.75, float undef, float undef, float undef>) nounwind
+  %i4 = call i32 @llvm.x86.sse2.cvtsd2si(<2 x double> <double 1.75, double undef>) nounwind
+  %i5 = call i32 @llvm.x86.sse2.cvttsd2si(<2 x double> <double 1.75, double undef>) nounwind
+  %i6 = call i64 @llvm.x86.sse2.cvtsd2si64(<2 x double> <double 1.75, double undef>) nounwind
+  %i7 = call i64 @llvm.x86.sse2.cvttsd2si64(<2 x double> <double 1.75, double undef>) nounwind
+  %sum11 = add i32 %i0, %i1
+  %sum12 = add i32 %i4, %i5
+  %sum1 = add i32 %sum11, %sum12
+  %sum21 = add i64 %i2, %i3
+  %sum22 = add i64 %i6, %i7
+  %sum2 = add i64 %sum21, %sum22
+  %sum1.sext = sext i32 %sum1 to i64
+  %b = icmp eq i64 %sum1.sext, %sum2
+  ret i1 %b
+}
+
+declare i32 @llvm.x86.sse.cvtss2si(<4 x float>) nounwind readnone
+declare i32 @llvm.x86.sse.cvttss2si(<4 x float>) nounwind readnone
+declare i64 @llvm.x86.sse.cvtss2si64(<4 x float>) nounwind readnone
+declare i64 @llvm.x86.sse.cvttss2si64(<4 x float>) nounwind readnone
+declare i32 @llvm.x86.sse2.cvtsd2si(<2 x double>) nounwind readnone
+declare i32 @llvm.x86.sse2.cvttsd2si(<2 x double>) nounwind readnone
+declare i64 @llvm.x86.sse2.cvtsd2si64(<2 x double>) nounwind readnone
+declare i64 @llvm.x86.sse2.cvttsd2si64(<2 x double>) nounwind readnone
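
Aside (not part of the patch): a quick sanity check of why @test_sse_cvt
should fold to 'ret i1 true'. With 1.75 in element zero (the vector constants
above are reconstructed; the extraction dropped them), each rounding
conversion yields 2 and each truncating conversion yields 1, so every
cvt/cvtt pair sums to 3 and both %sum1 and %sum2 come out to 6. The
standalone C++ sketch below mocks that arithmetic, with nearbyint/trunc
standing in for the cvt/cvtt semantics; it is a model of the expected fold,
not LLVM code:

#include <cassert>
#include <cfenv>
#include <cmath>
#include <cstdint>

int main() {
  std::fesetround(FE_TONEAREST);               // SSE's default rounding mode
  const double x = 1.75;                       // element 0 of every operand
  int32_t cvt  = (int32_t)std::nearbyint(x);   // cvt*2si semantics  -> 2
  int32_t cvtt = (int32_t)std::trunc(x);       // cvtt*2si semantics -> 1
  int32_t sum1 = (cvt + cvtt) + (cvt + cvtt);           // %sum1 == 6
  int64_t sum2 = (int64_t)(cvt + cvtt) + (cvt + cvtt);  // %sum2 == 6
  assert((int64_t)sum1 == sum2);               // the icmp folds to true
  return 0;
}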