Optimize vector select from all 0s or all 1s

As packed comparisons in AVX/SSE produce all 0s or all 1s in each SIMD lane,
vector select could be simplified to AND/OR or removed if one or both values
being selected is all 0s or all 1s.

llvm-svn: 179267
This commit is contained in:
Michael Liao 2013-04-11 05:15:54 +00:00
parent 95d9440348
commit 55658d4222
2 changed files with 117 additions and 0 deletions

View File

@ -15787,6 +15787,51 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
if (unsigned Op = matchIntegerMINMAX(Cond, VT, LHS, RHS, DAG, Subtarget))
return DAG.getNode(Op, DL, N->getValueType(0), LHS, RHS);
// Simplify vector selection if the selector will be produced by CMPP*/PCMP*.
if (!DCI.isBeforeLegalize() && N->getOpcode() == ISD::VSELECT &&
Cond.getOpcode() == ISD::SETCC) {
assert(Cond.getValueType().isVector() &&
"vector select expects a vector selector!");
EVT IntVT = Cond.getValueType();
bool TValIsAllOnes = ISD::isBuildVectorAllOnes(LHS.getNode());
bool FValIsAllZeros = ISD::isBuildVectorAllZeros(RHS.getNode());
if (!TValIsAllOnes && !FValIsAllZeros) {
// Try invert the condition if true value is not all 1s and false value
// is not all 0s.
bool TValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode());
bool FValIsAllOnes = ISD::isBuildVectorAllOnes(RHS.getNode());
if (TValIsAllZeros || FValIsAllOnes) {
SDValue CC = Cond.getOperand(2);
ISD::CondCode NewCC =
ISD::getSetCCInverse(cast<CondCodeSDNode>(CC)->get(),
Cond.getOperand(0).getValueType().isInteger());
Cond = DAG.getSetCC(DL, IntVT, Cond.getOperand(0), Cond.getOperand(1), NewCC);
std::swap(LHS, RHS);
TValIsAllOnes = FValIsAllOnes;
FValIsAllZeros = TValIsAllZeros;
}
}
if (TValIsAllOnes || FValIsAllZeros) {
SDValue Ret;
if (TValIsAllOnes && FValIsAllZeros)
Ret = Cond;
else if (TValIsAllOnes)
Ret = DAG.getNode(ISD::OR, DL, IntVT, Cond,
DAG.getNode(ISD::BITCAST, DL, IntVT, RHS));
else if (FValIsAllZeros)
Ret = DAG.getNode(ISD::AND, DL, IntVT, Cond,
DAG.getNode(ISD::BITCAST, DL, IntVT, LHS));
return DAG.getNode(ISD::BITCAST, DL, VT, Ret);
}
}
// If we know that this node is legal then we know that it is going to be
// matched by one of the SSE/AVX BLEND instructions. These instructions only
// depend on the highest bit in each word. Try to use SimplifyDemandedBits

View File

@ -0,0 +1,72 @@
; RUN: opt < %s -O3 | \
; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx | FileCheck %s
define <4 x i32> @test1(<4 x float> %a, <4 x float> %b, <4 x i32> %c) {
%f = fcmp ult <4 x float> %a, %b
%r = select <4 x i1> %f, <4 x i32> %c, <4 x i32> zeroinitializer
ret <4 x i32> %r
; CHECK: test1
; CHECK: cmpnle
; CHECK-NEXT: andps
; CHECK: ret
}
define <4 x i32> @test2(<4 x float> %a, <4 x float> %b, <4 x i32> %c) {
%f = fcmp ult <4 x float> %a, %b
%r = select <4 x i1> %f, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, <4 x i32> %c
ret <4 x i32> %r
; CHECK: test2
; CHECK: cmpnle
; CHECK-NEXT: orps
; CHECK: ret
}
define <4 x i32> @test3(<4 x float> %a, <4 x float> %b, <4 x i32> %c) {
%f = fcmp ult <4 x float> %a, %b
%r = select <4 x i1> %f, <4 x i32> zeroinitializer, <4 x i32> %c
ret <4 x i32> %r
; CHECK: test3
; CHECK: cmple
; CHECK-NEXT: andps
; CHECK: ret
}
define <4 x i32> @test4(<4 x float> %a, <4 x float> %b, <4 x i32> %c) {
%f = fcmp ult <4 x float> %a, %b
%r = select <4 x i1> %f, <4 x i32> %c, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>
ret <4 x i32> %r
; CHECK: test4
; CHECK: cmple
; CHECK-NEXT: orps
; CHECK: ret
}
define <4 x i32> @test5(<4 x float> %a, <4 x float> %b, <4 x i32> %c) {
%f = fcmp ult <4 x float> %a, %b
%r = select <4 x i1> %f, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, <4 x i32> zeroinitializer
ret <4 x i32> %r
; CHECK: test5
; CHECK: cmpnle
; CHECK-NEXT: ret
}
define <4 x i32> @test6(<4 x float> %a, <4 x float> %b, <4 x i32> %c) {
%f = fcmp ult <4 x float> %a, %b
%r = select <4 x i1> %f, <4 x i32> zeroinitializer, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>
ret <4 x i32> %r
; CHECK: test6
; CHECK: cmple
; CHECK-NEXT: ret
}
define <4 x i32> @test7(<4 x float> %a, <4 x float> %b, <4 x i32>* %p) {
%f = fcmp ult <4 x float> %a, %b
%s = sext <4 x i1> %f to <4 x i32>
%l = load <4 x i32>* %p
%r = and <4 x i32> %l, %s
ret <4 x i32> %r
; CHECK: test7
; CHECK: cmpnle
; CHECK-NEXT: andps
; CHECK: ret
}