From f09a357765de1f6c868c83a54d3070928996db9e Mon Sep 17 00:00:00 2001
From: Andrea Di Biagio
Date: Mon, 27 Jan 2014 18:45:30 +0000
Subject: [PATCH] [DAGCombiner] Teach how to fold sext/aext/zext of constant
 build vectors.

This patch teaches the DAGCombiner how to fold a sext/aext/zext dag node
when the input operand is a build vector of constants (or UNDEFs).

The inability to fold a sext/zext of a constant build_vector was the root
cause of some codegen bugs affecting vselect expansion on x86-64 with AVX
support.

Before this change, the DAGCombiner only knew how to fold a sext/zext/aext
of a ConstantSDNode.

llvm-svn: 200234
---
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp |  73 ++++-
 llvm/test/CodeGen/Mips/msa/compare_float.ll   |  16 +-
 llvm/test/CodeGen/X86/avx-blend.ll            |  13 +-
 .../test/CodeGen/X86/fold-vector-sext-zext.ll | 291 ++++++++++++++++++
 4 files changed, 372 insertions(+), 21 deletions(-)
 create mode 100644 llvm/test/CodeGen/X86/fold-vector-sext-zext.ll

diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 97c19d0e9b18..9a64b71d947a 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -4577,6 +4577,62 @@ SDValue DAGCombiner::visitSETCC(SDNode *N) {
                        SDLoc(N));
 }
 
+// tryToFoldExtendOfConstant - Try to fold a sext/zext/aext
+// dag node into a ConstantSDNode or a build_vector of constants.
+// This function is called by the DAGCombiner when visiting sext/zext/aext
+// dag nodes (see for example method DAGCombiner::visitSIGN_EXTEND).
+// Vector extends are not folded if operations are legal; this is to
+// avoid introducing illegal build_vector dag nodes.
+static SDNode *tryToFoldExtendOfConstant(SDNode *N, SelectionDAG &DAG,
+                                         bool LegalOperations) {
+  unsigned Opcode = N->getOpcode();
+  SDValue N0 = N->getOperand(0);
+  EVT VT = N->getValueType(0);
+
+  assert((Opcode == ISD::SIGN_EXTEND || Opcode == ISD::ZERO_EXTEND ||
+          Opcode == ISD::ANY_EXTEND) && "Expected EXTEND dag node in input!");
+
+  // fold (sext c1) -> c1
+  // fold (zext c1) -> c1
+  // fold (aext c1) -> c1
+  if (isa<ConstantSDNode>(N0))
+    return DAG.getNode(Opcode, SDLoc(N), VT, N0).getNode();
+
+  // fold (sext (build_vector AllConstants)) -> (build_vector AllConstants)
+  // fold (zext (build_vector AllConstants)) -> (build_vector AllConstants)
+  // fold (aext (build_vector AllConstants)) -> (build_vector AllConstants)
+  if (!(VT.isVector() && !LegalOperations &&
+        ISD::isBuildVectorOfConstantSDNodes(N0.getNode())))
+    return 0;
+
+  // We can fold this node into a build_vector.
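+  // For example, (v4i16 (sext (v4i8 (build_vector 0, -1, 2, -3)))) folds
+  // directly to (v4i16 (build_vector 0, -1, 2, -3)), with each element
+  // sign-extended to 16 bits. Below, each constant element is widened by
+  // placing its value into an APInt of the destination scalar width, then
+  // shifting left and back right by the difference in bit width
+  // (arithmetically for sext, logically for zext/aext).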
+  unsigned VTBits = VT.getScalarType().getSizeInBits();
+  unsigned EVTBits = N0->getValueType(0).getScalarType().getSizeInBits();
+  unsigned ShAmt = VTBits - EVTBits;
+  SmallVector<SDValue, 8> Elts;
+  unsigned NumElts = N0->getNumOperands();
+  SDLoc DL(N);
+
+  for (unsigned i = 0; i != NumElts; ++i) {
+    SDValue Op = N0->getOperand(i);
+    if (Op->getOpcode() == ISD::UNDEF) {
+      Elts.push_back(DAG.getUNDEF(VT.getScalarType()));
+      continue;
+    }
+
+    ConstantSDNode *CurrentND = cast<ConstantSDNode>(Op);
+    const APInt &C = APInt(VTBits, CurrentND->getAPIntValue().getZExtValue());
+    if (Opcode == ISD::SIGN_EXTEND)
+      Elts.push_back(DAG.getConstant(C.shl(ShAmt).ashr(ShAmt).getZExtValue(),
+                                     VT.getScalarType()));
+    else
+      Elts.push_back(DAG.getConstant(C.shl(ShAmt).lshr(ShAmt).getZExtValue(),
+                                     VT.getScalarType()));
+  }
+
+  return DAG.getNode(ISD::BUILD_VECTOR, DL, VT, &Elts[0], NumElts).getNode();
+}
+
 // ExtendUsesToFormExtLoad - Trying to extend uses of a load to enable this:
 // "fold ({s|z|a}ext (load x)) -> ({s|z|a}ext (truncate ({s|z|a}extload x)))"
 // transformation. Returns true if extension are possible and the above
@@ -4667,9 +4723,8 @@ SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) {
   SDValue N0 = N->getOperand(0);
   EVT VT = N->getValueType(0);
 
-  // fold (sext c1) -> c1
-  if (isa<ConstantSDNode>(N0))
-    return DAG.getNode(ISD::SIGN_EXTEND, SDLoc(N), VT, N0);
+  if (SDNode *Res = tryToFoldExtendOfConstant(N, DAG, LegalOperations))
+    return SDValue(Res, 0);
 
   // fold (sext (sext x)) -> (sext x)
   // fold (sext (aext x)) -> (sext x)
@@ -4917,9 +4972,9 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) {
   SDValue N0 = N->getOperand(0);
   EVT VT = N->getValueType(0);
 
-  // fold (zext c1) -> c1
-  if (isa<ConstantSDNode>(N0))
-    return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), VT, N0);
+  if (SDNode *Res = tryToFoldExtendOfConstant(N, DAG, LegalOperations))
+    return SDValue(Res, 0);
+
   // fold (zext (zext x)) -> (zext x)
   // fold (zext (aext x)) -> (zext x)
   if (N0.getOpcode() == ISD::ZERO_EXTEND || N0.getOpcode() == ISD::ANY_EXTEND)
@@ -5186,9 +5241,9 @@ SDValue DAGCombiner::visitANY_EXTEND(SDNode *N) {
   SDValue N0 = N->getOperand(0);
   EVT VT = N->getValueType(0);
 
-  // fold (aext c1) -> c1
-  if (isa<ConstantSDNode>(N0))
-    return DAG.getNode(ISD::ANY_EXTEND, SDLoc(N), VT, N0);
+  if (SDNode *Res = tryToFoldExtendOfConstant(N, DAG, LegalOperations))
+    return SDValue(Res, 0);
+
   // fold (aext (aext x)) -> (aext x)
   // fold (aext (zext x)) -> (zext x)
   // fold (aext (sext x)) -> (sext x)
diff --git a/llvm/test/CodeGen/Mips/msa/compare_float.ll b/llvm/test/CodeGen/Mips/msa/compare_float.ll
index 2fc61f89c7fa..f5e8d9d9d6c6 100644
--- a/llvm/test/CodeGen/Mips/msa/compare_float.ll
+++ b/llvm/test/CodeGen/Mips/msa/compare_float.ll
@@ -32,12 +32,9 @@ define void @false_v2f64(<2 x i64>* %c, <2 x double>* %a, <2 x double>* %b) nounwind {
   store <2 x i64> %4, <2 x i64>* %c
   ret void
 
-  ; FIXME: This code is correct, but poor. Ideally it would be similar to
-  ; the code in @false_v4f32
+  ; (setcc $a, $b, SETFALSE) is always folded
   ; CHECK-DAG: ldi.b [[R1:\$w[0-9]+]], 0
-  ; CHECK-DAG: slli.d [[R3:\$w[0-9]+]], [[R1]], 63
-  ; CHECK-DAG: srai.d [[R4:\$w[0-9]+]], [[R3]], 63
-  ; CHECK-DAG: st.d [[R4]], 0($4)
+  ; CHECK-DAG: st.w [[R1]], 0($4)
   ; CHECK: .size false_v2f64
 }
 
@@ -509,12 +506,9 @@ define void @true_v2f64(<2 x i64>* %c, <2 x double>* %a, <2 x double>* %b) nounwind {
   store <2 x i64> %4, <2 x i64>* %c
   ret void
 
-  ; FIXME: This code is correct, but poor. Ideally it would be similar to
-  ; the code in @true_v4f32
-  ; CHECK-DAG: ldi.d [[R1:\$w[0-9]+]], 1
-  ; CHECK-DAG: slli.d [[R3:\$w[0-9]+]], [[R1]], 63
-  ; CHECK-DAG: srai.d [[R4:\$w[0-9]+]], [[R3]], 63
-  ; CHECK-DAG: st.d [[R4]], 0($4)
+  ; (setcc $a, $b, SETTRUE) is always folded.
+  ; CHECK-DAG: ldi.b [[R1:\$w[0-9]+]], -1
+  ; CHECK-DAG: st.w [[R1]], 0($4)
   ; CHECK: .size true_v2f64
 }
 
diff --git a/llvm/test/CodeGen/X86/avx-blend.ll b/llvm/test/CodeGen/X86/avx-blend.ll
index e9bfce663f6a..5fcd5ff5f4c3 100644
--- a/llvm/test/CodeGen/X86/avx-blend.ll
+++ b/llvm/test/CodeGen/X86/avx-blend.ll
@@ -51,6 +51,7 @@ define <16 x i8> @vsel_i8(<16 x i8> %v1, <16 x i8> %v2) {
 
 ;CHECK-LABEL: vsel_float8:
+;CHECK-NOT: vinsertf128
 ;CHECK: vblendvps
 ;CHECK: ret
 define <8 x float> @vsel_float8(<8 x float> %v1, <8 x float> %v2) {
@@ -59,8 +60,9 @@ define <8 x float> @vsel_float8(<8 x float> %v1, <8 x float> %v2) {
 }
 
 ;CHECK-LABEL: vsel_i328:
+;CHECK-NOT: vinsertf128
 ;CHECK: vblendvps
-;CHECK: ret
+;CHECK-NEXT: ret
 define <8 x i32> @vsel_i328(<8 x i32> %v1, <8 x i32> %v2) {
   %vsel = select <8 x i1> , <8 x i32> %v1, <8 x i32> %v2
   ret <8 x i32> %vsel
@@ -82,6 +84,15 @@ define <8 x i64> @vsel_i648(<8 x i64> %v1, <8 x i64> %v2) {
   ret <8 x i64> %vsel
 }
 
+;CHECK-LABEL: vsel_double4:
+;CHECK-NOT: vinsertf128
+;CHECK: vblendvpd
+;CHECK-NEXT: ret
+define <4 x double> @vsel_double4(<4 x double> %v1, <4 x double> %v2) {
+  %vsel = select <4 x i1> , <4 x double> %v1, <4 x double> %v2
+  ret <4 x double> %vsel
+}
+
 ;; TEST blend + compares
 ; CHECK: testa
 define <2 x double> @testa(<2 x double> %x, <2 x double> %y) {
diff --git a/llvm/test/CodeGen/X86/fold-vector-sext-zext.ll b/llvm/test/CodeGen/X86/fold-vector-sext-zext.ll
new file mode 100644
index 000000000000..aeaab4479085
--- /dev/null
+++ b/llvm/test/CodeGen/X86/fold-vector-sext-zext.ll
@@ -0,0 +1,291 @@
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7-avx -mattr=+avx | FileCheck %s
+
+; Verify that the backend correctly folds a sign/zero extend of a vector where
+; the elements are all constant values or UNDEFs.
+; The backend should be able to optimize every test function below into a
+; simple load of the expected result from the constant pool, because the
+; resulting vector is known at compile time.
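+;
+; For example, the sext in @test1 below folds to the constant vector
+; <4 x i16> <i16 0, i16 -1, i16 2, i16 -3>, so each test only needs to
+; check for a single vmovaps load followed by a ret.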
+
+define <4 x i16> @test1() {
+  %1 = insertelement <4 x i8> undef, i8 0, i32 0
+  %2 = insertelement <4 x i8> %1, i8 -1, i32 1
+  %3 = insertelement <4 x i8> %2, i8 2, i32 2
+  %4 = insertelement <4 x i8> %3, i8 -3, i32 3
+  %5 = sext <4 x i8> %4 to <4 x i16>
+  ret <4 x i16> %5
+}
+; CHECK-LABEL: test1
+; CHECK: vmovaps
+; CHECK-NEXT: ret
+
+define <4 x i16> @test2() {
+  %1 = insertelement <4 x i8> undef, i8 undef, i32 0
+  %2 = insertelement <4 x i8> %1, i8 -1, i32 1
+  %3 = insertelement <4 x i8> %2, i8 undef, i32 2
+  %4 = insertelement <4 x i8> %3, i8 -3, i32 3
+  %5 = sext <4 x i8> %4 to <4 x i16>
+  ret <4 x i16> %5
+}
+; CHECK-LABEL: test2
+; CHECK: vmovaps
+; CHECK-NEXT: ret
+
+define <4 x i32> @test3() {
+  %1 = insertelement <4 x i8> undef, i8 0, i32 0
+  %2 = insertelement <4 x i8> %1, i8 -1, i32 1
+  %3 = insertelement <4 x i8> %2, i8 2, i32 2
+  %4 = insertelement <4 x i8> %3, i8 -3, i32 3
+  %5 = sext <4 x i8> %4 to <4 x i32>
+  ret <4 x i32> %5
+}
+; CHECK-LABEL: test3
+; CHECK: vmovaps
+; CHECK-NEXT: ret
+
+define <4 x i32> @test4() {
+  %1 = insertelement <4 x i8> undef, i8 undef, i32 0
+  %2 = insertelement <4 x i8> %1, i8 -1, i32 1
+  %3 = insertelement <4 x i8> %2, i8 undef, i32 2
+  %4 = insertelement <4 x i8> %3, i8 -3, i32 3
+  %5 = sext <4 x i8> %4 to <4 x i32>
+  ret <4 x i32> %5
+}
+; CHECK-LABEL: test4
+; CHECK: vmovaps
+; CHECK-NEXT: ret
+
+define <4 x i64> @test5() {
+  %1 = insertelement <4 x i8> undef, i8 0, i32 0
+  %2 = insertelement <4 x i8> %1, i8 -1, i32 1
+  %3 = insertelement <4 x i8> %2, i8 2, i32 2
+  %4 = insertelement <4 x i8> %3, i8 -3, i32 3
+  %5 = sext <4 x i8> %4 to <4 x i64>
+  ret <4 x i64> %5
+}
+; CHECK-LABEL: test5
+; CHECK-NOT: vinsertf128
+; CHECK: vmovaps
+; CHECK-NEXT: ret
+
+define <4 x i64> @test6() {
+  %1 = insertelement <4 x i8> undef, i8 undef, i32 0
+  %2 = insertelement <4 x i8> %1, i8 -1, i32 1
+  %3 = insertelement <4 x i8> %2, i8 undef, i32 2
+  %4 = insertelement <4 x i8> %3, i8 -3, i32 3
+  %5 = sext <4 x i8> %4 to <4 x i64>
+  ret <4 x i64> %5
+}
+; CHECK-LABEL: test6
+; CHECK-NOT: vinsertf128
+; CHECK: vmovaps
+; CHECK-NEXT: ret
+
+define <8 x i16> @test7() {
+  %1 = insertelement <8 x i8> undef, i8 0, i32 0
+  %2 = insertelement <8 x i8> %1, i8 -1, i32 1
+  %3 = insertelement <8 x i8> %2, i8 2, i32 2
+  %4 = insertelement <8 x i8> %3, i8 -3, i32 3
+  %5 = insertelement <8 x i8> %4, i8 4, i32 4
+  %6 = insertelement <8 x i8> %5, i8 -5, i32 5
+  %7 = insertelement <8 x i8> %6, i8 6, i32 6
+  %8 = insertelement <8 x i8> %7, i8 -7, i32 7
+  %9 = sext <8 x i8> %8 to <8 x i16>
+  ret <8 x i16> %9
+}
+; CHECK-LABEL: test7
+; CHECK: vmovaps
+; CHECK-NEXT: ret
+
+define <8 x i32> @test8() {
+  %1 = insertelement <8 x i8> undef, i8 0, i32 0
+  %2 = insertelement <8 x i8> %1, i8 -1, i32 1
+  %3 = insertelement <8 x i8> %2, i8 2, i32 2
+  %4 = insertelement <8 x i8> %3, i8 -3, i32 3
+  %5 = insertelement <8 x i8> %4, i8 4, i32 4
+  %6 = insertelement <8 x i8> %5, i8 -5, i32 5
+  %7 = insertelement <8 x i8> %6, i8 6, i32 6
+  %8 = insertelement <8 x i8> %7, i8 -7, i32 7
+  %9 = sext <8 x i8> %8 to <8 x i32>
+  ret <8 x i32> %9
+}
+; CHECK-LABEL: test8
+; CHECK-NOT: vinsertf128
+; CHECK: vmovaps
+; CHECK-NEXT: ret
+
+define <8 x i16> @test9() {
+  %1 = insertelement <8 x i8> undef, i8 undef, i32 0
+  %2 = insertelement <8 x i8> %1, i8 -1, i32 1
+  %3 = insertelement <8 x i8> %2, i8 undef, i32 2
+  %4 = insertelement <8 x i8> %3, i8 -3, i32 3
+  %5 = insertelement <8 x i8> %4, i8 undef, i32 4
+  %6 = insertelement <8 x i8> %5, i8 -5, i32 5
+  %7 = insertelement <8 x i8> %6, i8 undef, i32 6
+  %8 = insertelement <8 x i8> %7, i8 -7, i32 7
+  %9 = sext <8 x i8> %8 to <8 x i16>
+  ret <8 x i16> %9
+}
+; CHECK-LABEL: test9
+; CHECK: vmovaps
+; CHECK-NEXT: ret
+
+define <8 x i32> @test10() {
+  %1 = insertelement <8 x i8> undef, i8 0, i32 0
+  %2 = insertelement <8 x i8> %1, i8 undef, i32 1
+  %3 = insertelement <8 x i8> %2, i8 2, i32 2
+  %4 = insertelement <8 x i8> %3, i8 undef, i32 3
+  %5 = insertelement <8 x i8> %4, i8 4, i32 4
+  %6 = insertelement <8 x i8> %5, i8 undef, i32 5
+  %7 = insertelement <8 x i8> %6, i8 6, i32 6
+  %8 = insertelement <8 x i8> %7, i8 undef, i32 7
+  %9 = sext <8 x i8> %8 to <8 x i32>
+  ret <8 x i32> %9
+}
+; CHECK-LABEL: test10
+; CHECK-NOT: vinsertf128
+; CHECK: vmovaps
+; CHECK-NEXT: ret
+
+define <4 x i16> @test11() {
+  %1 = insertelement <4 x i8> undef, i8 0, i32 0
+  %2 = insertelement <4 x i8> %1, i8 -1, i32 1
+  %3 = insertelement <4 x i8> %2, i8 2, i32 2
+  %4 = insertelement <4 x i8> %3, i8 -3, i32 3
+  %5 = zext <4 x i8> %4 to <4 x i16>
+  ret <4 x i16> %5
+}
+; CHECK-LABEL: test11
+; CHECK: vmovaps
+; CHECK-NEXT: ret
+
+define <4 x i32> @test12() {
+  %1 = insertelement <4 x i8> undef, i8 0, i32 0
+  %2 = insertelement <4 x i8> %1, i8 -1, i32 1
+  %3 = insertelement <4 x i8> %2, i8 2, i32 2
+  %4 = insertelement <4 x i8> %3, i8 -3, i32 3
+  %5 = zext <4 x i8> %4 to <4 x i32>
+  ret <4 x i32> %5
+}
+; CHECK-LABEL: test12
+; CHECK: vmovaps
+; CHECK-NEXT: ret
+
+define <4 x i64> @test13() {
+  %1 = insertelement <4 x i8> undef, i8 0, i32 0
+  %2 = insertelement <4 x i8> %1, i8 -1, i32 1
+  %3 = insertelement <4 x i8> %2, i8 2, i32 2
+  %4 = insertelement <4 x i8> %3, i8 -3, i32 3
+  %5 = zext <4 x i8> %4 to <4 x i64>
+  ret <4 x i64> %5
+}
+; CHECK-LABEL: test13
+; CHECK-NOT: vinsertf128
+; CHECK: vmovaps
+; CHECK-NEXT: ret
+
+define <4 x i16> @test14() {
+  %1 = insertelement <4 x i8> undef, i8 undef, i32 0
+  %2 = insertelement <4 x i8> %1, i8 -1, i32 1
+  %3 = insertelement <4 x i8> %2, i8 undef, i32 2
+  %4 = insertelement <4 x i8> %3, i8 -3, i32 3
+  %5 = zext <4 x i8> %4 to <4 x i16>
+  ret <4 x i16> %5
+}
+; CHECK-LABEL: test14
+; CHECK: vmovaps
+; CHECK-NEXT: ret
+
+define <4 x i32> @test15() {
+  %1 = insertelement <4 x i8> undef, i8 0, i32 0
+  %2 = insertelement <4 x i8> %1, i8 undef, i32 1
+  %3 = insertelement <4 x i8> %2, i8 2, i32 2
+  %4 = insertelement <4 x i8> %3, i8 undef, i32 3
+  %5 = zext <4 x i8> %4 to <4 x i32>
+  ret <4 x i32> %5
+}
+; CHECK-LABEL: test15
+; CHECK: vmovaps
+; CHECK-NEXT: ret
+
+define <4 x i64> @test16() {
+  %1 = insertelement <4 x i8> undef, i8 undef, i32 0
+  %2 = insertelement <4 x i8> %1, i8 -1, i32 1
+  %3 = insertelement <4 x i8> %2, i8 2, i32 2
+  %4 = insertelement <4 x i8> %3, i8 undef, i32 3
+  %5 = zext <4 x i8> %4 to <4 x i64>
+  ret <4 x i64> %5
+}
+; CHECK-LABEL: test16
+; CHECK-NOT: vinsertf128
+; CHECK: vmovaps
+; CHECK-NEXT: ret
+
+define <8 x i16> @test17() {
+  %1 = insertelement <8 x i8> undef, i8 0, i32 0
+  %2 = insertelement <8 x i8> %1, i8 -1, i32 1
+  %3 = insertelement <8 x i8> %2, i8 2, i32 2
+  %4 = insertelement <8 x i8> %3, i8 -3, i32 3
+  %5 = insertelement <8 x i8> %4, i8 4, i32 4
+  %6 = insertelement <8 x i8> %5, i8 -5, i32 5
+  %7 = insertelement <8 x i8> %6, i8 6, i32 6
+  %8 = insertelement <8 x i8> %7, i8 -7, i32 7
+  %9 = zext <8 x i8> %8 to <8 x i16>
+  ret <8 x i16> %9
+}
+; CHECK-LABEL: test17
+; CHECK: vmovaps
+; CHECK-NEXT: ret
+
+define <8 x i32> @test18() {
+  %1 = insertelement <8 x i8> undef, i8 0, i32 0
+  %2 = insertelement <8 x i8> %1, i8 -1, i32 1
+  %3 = insertelement <8 x i8> %2, i8 2, i32 2
+  %4 = insertelement <8 x i8> %3, i8 -3, i32 3
+  %5 = insertelement <8 x i8> %4, i8 4, i32 4
+  %6 = insertelement <8 x i8> %5, i8 -5, i32 5
+  %7 = insertelement <8 x i8> %6, i8 6, i32 6
+  %8 = insertelement <8 x i8> %7, i8 -7, i32 7
+  %9 = zext <8 x i8> %8 to <8 x i32>
+  ret <8 x i32> %9
+}
+; CHECK-LABEL: test18
+; CHECK-NOT: vinsertf128
+; CHECK: vmovaps
+; CHECK-NEXT: ret
+
+define <8 x i16> @test19() {
+  %1 = insertelement <8 x i8> undef, i8 undef, i32 0
+  %2 = insertelement <8 x i8> %1, i8 -1, i32 1
+  %3 = insertelement <8 x i8> %2, i8 undef, i32 2
+  %4 = insertelement <8 x i8> %3, i8 -3, i32 3
+  %5 = insertelement <8 x i8> %4, i8 undef, i32 4
+  %6 = insertelement <8 x i8> %5, i8 -5, i32 5
+  %7 = insertelement <8 x i8> %6, i8 undef, i32 6
+  %8 = insertelement <8 x i8> %7, i8 -7, i32 7
+  %9 = zext <8 x i8> %8 to <8 x i16>
+  ret <8 x i16> %9
+}
+; CHECK-LABEL: test19
+; CHECK: vmovaps
+; CHECK-NEXT: ret
+
+define <8 x i32> @test20() {
+  %1 = insertelement <8 x i8> undef, i8 0, i32 0
+  %2 = insertelement <8 x i8> %1, i8 undef, i32 1
+  %3 = insertelement <8 x i8> %2, i8 2, i32 2
+  %4 = insertelement <8 x i8> %3, i8 -3, i32 3
+  %5 = insertelement <8 x i8> %4, i8 4, i32 4
+  %6 = insertelement <8 x i8> %5, i8 undef, i32 5
+  %7 = insertelement <8 x i8> %6, i8 6, i32 6
+  %8 = insertelement <8 x i8> %7, i8 undef, i32 7
+  %9 = zext <8 x i8> %8 to <8 x i32>
+  ret <8 x i32> %9
+}
+; CHECK-LABEL: test20
+; CHECK-NOT: vinsertf128
+; CHECK: vmovaps
+; CHECK-NEXT: ret