forked from OSchip/llvm-project
[InstCombine] CVTPH2PS Vector Demanded Elements + Constant Folding
Improved InstCombine support for CVTPH2PS (F16C half-to-float conversion): <4 x float> @llvm.x86.vcvtph2ps.128(<8 x i16>) only uses the bottom 4 i16 elements of its argument for the conversion. Added constant folding support. Differential Revision: http://reviews.llvm.org/D12731 llvm-svn: 247504
This commit is contained in:
parent
bc9c327f3f
commit
20c607b110
|
@ -796,6 +796,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
|
|||
return new StoreInst(II->getArgOperand(0), Ptr);
|
||||
}
|
||||
break;
|
||||
|
||||
case Intrinsic::x86_sse_storeu_ps:
|
||||
case Intrinsic::x86_sse2_storeu_pd:
|
||||
case Intrinsic::x86_sse2_storeu_dq:
|
||||
|
@ -809,6 +810,52 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
|
|||
}
|
||||
break;
|
||||
|
||||
case Intrinsic::x86_vcvtph2ps_128:
|
||||
case Intrinsic::x86_vcvtph2ps_256: {
|
||||
auto Arg = II->getArgOperand(0);
|
||||
auto ArgType = cast<VectorType>(Arg->getType());
|
||||
auto RetType = cast<VectorType>(II->getType());
|
||||
unsigned ArgWidth = ArgType->getNumElements();
|
||||
unsigned RetWidth = RetType->getNumElements();
|
||||
assert(RetWidth <= ArgWidth && "Unexpected input/return vector widths");
|
||||
assert(ArgType->isIntOrIntVectorTy() &&
|
||||
ArgType->getScalarSizeInBits() == 16 &&
|
||||
"CVTPH2PS input type should be 16-bit integer vector");
|
||||
assert(RetType->getScalarType()->isFloatTy() &&
|
||||
"CVTPH2PS output type should be 32-bit float vector");
|
||||
|
||||
// Constant folding: Convert to generic half to single conversion.
|
||||
if (auto CIZero = dyn_cast<ConstantAggregateZero>(Arg))
|
||||
return ReplaceInstUsesWith(*II, ConstantAggregateZero::get(RetType));
|
||||
|
||||
if (auto CIHalf = dyn_cast<ConstantDataVector>(Arg)) {
|
||||
auto VectorHalfAsShorts = Arg;
|
||||
if (RetWidth < ArgWidth) {
|
||||
SmallVector<int, 8> SubVecMask;
|
||||
for (unsigned i = 0; i != RetWidth; ++i)
|
||||
SubVecMask.push_back((int)i);
|
||||
VectorHalfAsShorts = Builder->CreateShuffleVector(
|
||||
Arg, UndefValue::get(ArgType), SubVecMask);
|
||||
}
|
||||
|
||||
auto VectorHalfType =
|
||||
VectorType::get(Type::getHalfTy(II->getContext()), RetWidth);
|
||||
auto VectorHalfs =
|
||||
Builder->CreateBitCast(VectorHalfAsShorts, VectorHalfType);
|
||||
auto VectorFloats = Builder->CreateFPExt(VectorHalfs, RetType);
|
||||
return ReplaceInstUsesWith(*II, VectorFloats);
|
||||
}
|
||||
|
||||
// We only use the lowest lanes of the argument.
|
||||
APInt DemandedElts = APInt::getLowBitsSet(ArgWidth, RetWidth);
|
||||
APInt UndefElts(ArgWidth, 0);
|
||||
if (Value *V = SimplifyDemandedVectorElts(Arg, DemandedElts, UndefElts)) {
|
||||
II->setArgOperand(0, V);
|
||||
return II;
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
case Intrinsic::x86_sse_cvtss2si:
|
||||
case Intrinsic::x86_sse_cvtss2si64:
|
||||
case Intrinsic::x86_sse_cvttss2si:
|
||||
|
|
|
@ -0,0 +1,61 @@
|
|||
; RUN: opt < %s -instcombine -S | FileCheck %s
|
||||
|
||||
declare <4 x float> @llvm.x86.vcvtph2ps.128(<8 x i16>)
|
||||
declare <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16>)
|
||||
|
||||
;
|
||||
; Vector Demanded Elements
|
||||
;
|
||||
|
||||
; Only bottom 4 elements required.
|
||||
define <4 x float> @demand_vcvtph2ps_128(<8 x i16> %A) {
|
||||
; CHECK-LABEL: @demand_vcvtph2ps_128
|
||||
; CHECK-NEXT: %1 = tail call <4 x float> @llvm.x86.vcvtph2ps.128(<8 x i16> %A)
|
||||
; CHECK-NEXT: ret <4 x float> %1
|
||||
%1 = shufflevector <8 x i16> %A, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
|
||||
%2 = tail call <4 x float> @llvm.x86.vcvtph2ps.128(<8 x i16> %1)
|
||||
ret <4 x float> %2
|
||||
}
|
||||
|
||||
; All 8 elements required.
|
||||
define <8 x float> @demand_vcvtph2ps_256(<8 x i16> %A) {
|
||||
; CHECK-LABEL: @demand_vcvtph2ps_256
|
||||
; CHECK-NEXT: %1 = shufflevector <8 x i16> %A, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
|
||||
; CHECK-NEXT: %2 = tail call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %1)
|
||||
; CHECK-NEXT: ret <8 x float> %2
|
||||
%1 = shufflevector <8 x i16> %A, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
|
||||
%2 = tail call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %1)
|
||||
ret <8 x float> %2
|
||||
}
|
||||
|
||||
;
|
||||
; Constant Folding
|
||||
;
|
||||
|
||||
define <4 x float> @fold_vcvtph2ps_128() {
|
||||
; CHECK-LABEL: @fold_vcvtph2ps_128
|
||||
; CHECK-NEXT: ret <4 x float> <float 0.000000e+00, float 5.000000e-01, float 1.000000e+00, float -0.000000e+00>
|
||||
%1 = tail call <4 x float> @llvm.x86.vcvtph2ps.128(<8 x i16> <i16 0, i16 14336, i16 15360, i16 32768, i16 16384, i16 31743, i16 48128, i16 49152>)
|
||||
ret <4 x float> %1
|
||||
}
|
||||
|
||||
define <8 x float> @fold_vcvtph2ps_256() {
|
||||
; CHECK-LABEL: @fold_vcvtph2ps_256
|
||||
; CHECK-NEXT: ret <8 x float> <float 0.000000e+00, float 5.000000e-01, float 1.000000e+00, float -0.000000e+00, float 2.000000e+00, float 6.550400e+04, float -1.000000e+00, float -2.000000e+00>
|
||||
%1 = tail call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> <i16 0, i16 14336, i16 15360, i16 32768, i16 16384, i16 31743, i16 48128, i16 49152>)
|
||||
ret <8 x float> %1
|
||||
}
|
||||
|
||||
define <4 x float> @fold_vcvtph2ps_128_zero() {
|
||||
; CHECK-LABEL: @fold_vcvtph2ps_128_zero
|
||||
; CHECK-NEXT: ret <4 x float> zeroinitializer
|
||||
%1 = tail call <4 x float> @llvm.x86.vcvtph2ps.128(<8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>)
|
||||
ret <4 x float> %1
|
||||
}
|
||||
|
||||
define <8 x float> @fold_vcvtph2ps_256_zero() {
|
||||
; CHECK-LABEL: @fold_vcvtph2ps_256_zero
|
||||
; CHECK-NEXT: ret <8 x float> zeroinitializer
|
||||
%1 = tail call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>)
|
||||
ret <8 x float> %1
|
||||
}
|
Loading…
Reference in New Issue