From 43f5e0848ece8c39752516019588124aa3f895a1 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Tue, 29 Sep 2015 08:19:11 +0000 Subject: [PATCH] [InstCombine] Improve Vector Demanded Bits Through Bitcasts Currently SimplifyDemandedVectorElts can only peek through bitcasts if the vectors have the same number of elements. This patch fixes and enables some existing (disabled) code to support bitcasting to vectors with more/fewer elements. It currently only accepts cases when vectors alias cleanly (i.e. number of elements are an exact multiple of the other vector). This was added to improve the demanded vector elements support for SSE vector shifts which require the __m128i (<2 x i64>) argument type to be bitcast to the vector type for the builtin shift. I've added extra tests for various additional bitcasts. Differential Revision: http://reviews.llvm.org/D12935 llvm-svn: 248784 --- .../InstCombineSimplifyDemanded.cpp | 69 +++++++++---------- .../InstCombine/x86-vector-shifts.ll | 44 ++++++++++++ 2 files changed, 78 insertions(+), 35 deletions(-) diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp index c68a03157847..a1fbda7f7e6d 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp @@ -1082,6 +1082,7 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts, if (!VTy) break; unsigned InVWidth = VTy->getNumElements(); APInt InputDemandedElts(InVWidth, 0); + UndefElts2 = APInt(InVWidth, 0); unsigned Ratio; if (VWidth == InVWidth) { @@ -1089,29 +1090,25 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts, // elements as are demanded of us. Ratio = 1; InputDemandedElts = DemandedElts; - } else if (VWidth > InVWidth) { - // Untested so far. - break; - - // If there are more elements in the result than there are in the source, - // then an input element is live if any of the corresponding output - // elements are live. - Ratio = VWidth/InVWidth; - for (unsigned OutIdx = 0; OutIdx != VWidth; ++OutIdx) { + } else if ((VWidth % InVWidth) == 0) { + // If the number of elements in the output is a multiple of the number of + // elements in the input then an input element is live if any of the + // corresponding output elements are live. + Ratio = VWidth / InVWidth; + for (unsigned OutIdx = 0; OutIdx != VWidth; ++OutIdx) if (DemandedElts[OutIdx]) - InputDemandedElts.setBit(OutIdx/Ratio); - } - } else { - // Untested so far. - break; - - // If there are more elements in the source than there are in the result, - // then an input element is live if the corresponding output element is - // live. - Ratio = InVWidth/VWidth; + InputDemandedElts.setBit(OutIdx / Ratio); + } else if ((InVWidth % VWidth) == 0) { + // If the number of elements in the input is a multiple of the number of + // elements in the output then an input element is live if the + // corresponding output element is live. + Ratio = InVWidth / VWidth; for (unsigned InIdx = 0; InIdx != InVWidth; ++InIdx) - if (DemandedElts[InIdx/Ratio]) + if (DemandedElts[InIdx / Ratio]) InputDemandedElts.setBit(InIdx); + } else { + // Unsupported so far. + break; } // div/rem demand all inputs, because they don't want divide by zero. @@ -1122,24 +1119,26 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts, MadeChange = true; } - UndefElts = UndefElts2; - if (VWidth > InVWidth) { - llvm_unreachable("Unimp"); - // If there are more elements in the result than there are in the source, - // then an output element is undef if the corresponding input element is - // undef. + if (VWidth == InVWidth) { + UndefElts = UndefElts2; + } else if ((VWidth % InVWidth) == 0) { + // If the number of elements in the output is a multiple of the number of + // elements in the input then an output element is undef if the + // corresponding input element is undef. for (unsigned OutIdx = 0; OutIdx != VWidth; ++OutIdx) - if (UndefElts2[OutIdx/Ratio]) + if (UndefElts2[OutIdx / Ratio]) UndefElts.setBit(OutIdx); - } else if (VWidth < InVWidth) { + } else if ((InVWidth % VWidth) == 0) { + // If the number of elements in the input is a multiple of the number of + // elements in the output then an output element is undef if all of the + // corresponding input elements are undef. + for (unsigned OutIdx = 0; OutIdx != VWidth; ++OutIdx) { + APInt SubUndef = UndefElts2.lshr(OutIdx * Ratio).zextOrTrunc(Ratio); + if (SubUndef.countPopulation() == Ratio) + UndefElts.setBit(OutIdx); + } + } else { llvm_unreachable("Unimp"); - // If there are more elements in the source than there are in the result, - // then a result element is undef if all of the corresponding input - // elements are undef. - UndefElts = ~0ULL >> (64-VWidth); // Start out all undef. - for (unsigned InIdx = 0; InIdx != InVWidth; ++InIdx) - if (!UndefElts2[InIdx]) // Not undef? - UndefElts.clearBit(InIdx/Ratio); // Clear undef bit. } break; } diff --git a/llvm/test/Transforms/InstCombine/x86-vector-shifts.ll b/llvm/test/Transforms/InstCombine/x86-vector-shifts.ll index d07d90de049c..59e445a40bef 100644 --- a/llvm/test/Transforms/InstCombine/x86-vector-shifts.ll +++ b/llvm/test/Transforms/InstCombine/x86-vector-shifts.ll @@ -838,6 +838,17 @@ define <8 x i16> @sse2_psra_w_var(<8 x i16> %v, <8 x i16> %a) { ret <8 x i16> %2 } +define <8 x i16> @sse2_psra_w_var_bc(<8 x i16> %v, <2 x i64> %a) { +; CHECK-LABEL: @sse2_psra_w_var_bc +; CHECK-NEXT: %1 = bitcast <2 x i64> %a to <8 x i16> +; CHECK-NEXT: %2 = tail call <8 x i16> @llvm.x86.sse2.psra.w(<8 x i16> %v, <8 x i16> %1) +; CHECK-NEXT: ret <8 x i16> %2 + %1 = shufflevector <2 x i64> %a, <2 x i64> undef, <2 x i32> + %2 = bitcast <2 x i64> %1 to <8 x i16> + %3 = tail call <8 x i16> @llvm.x86.sse2.psra.w(<8 x i16> %v, <8 x i16> %2) + ret <8 x i16> %3 +} + define <4 x i32> @sse2_psra_d_var(<4 x i32> %v, <4 x i32> %a) { ; CHECK-LABEL: @sse2_psra_d_var ; CHECK-NEXT: %1 = tail call <4 x i32> @llvm.x86.sse2.psra.d(<4 x i32> %v, <4 x i32> %a) @@ -847,6 +858,17 @@ define <4 x i32> @sse2_psra_d_var(<4 x i32> %v, <4 x i32> %a) { ret <4 x i32> %2 } +define <4 x i32> @sse2_psra_d_var_bc(<4 x i32> %v, <8 x i16> %a) { +; CHECK-LABEL: @sse2_psra_d_var_bc +; CHECK-NEXT: %1 = bitcast <8 x i16> %a to <4 x i32> +; CHECK-NEXT: %2 = tail call <4 x i32> @llvm.x86.sse2.psra.d(<4 x i32> %v, <4 x i32> %1) +; CHECK-NEXT: ret <4 x i32> %2 + %1 = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> + %2 = bitcast <8 x i16> %1 to <4 x i32> + %3 = tail call <4 x i32> @llvm.x86.sse2.psra.d(<4 x i32> %v, <4 x i32> %2) + ret <4 x i32> %3 +} + define <16 x i16> @avx2_psra_w_var(<16 x i16> %v, <8 x i16> %a) { ; CHECK-LABEL: @avx2_psra_w_var ; CHECK-NEXT: %1 = tail call <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16> %v, <8 x i16> %a) @@ -901,6 +923,17 @@ define <16 x i16> @avx2_psrl_w_var(<16 x i16> %v, <8 x i16> %a) { ret <16 x i16> %2 } +define <16 x i16> @avx2_psrl_w_var_bc(<16 x i16> %v, <16 x i8> %a) { +; CHECK-LABEL: @avx2_psrl_w_var_bc +; CHECK-NEXT: %1 = bitcast <16 x i8> %a to <8 x i16> +; CHECK-NEXT: %2 = tail call <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16> %v, <8 x i16> %1) +; CHECK-NEXT: ret <16 x i16> %2 + %1 = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> + %2 = bitcast <16 x i8> %1 to <8 x i16> + %3 = tail call <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16> %v, <8 x i16> %2) + ret <16 x i16> %3 +} + define <8 x i32> @avx2_psrl_d_var(<8 x i32> %v, <4 x i32> %a) { ; CHECK-LABEL: @avx2_psrl_d_var ; CHECK-NEXT: %1 = tail call <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32> %v, <4 x i32> %a) @@ -910,6 +943,17 @@ define <8 x i32> @avx2_psrl_d_var(<8 x i32> %v, <4 x i32> %a) { ret <8 x i32> %2 } +define <8 x i32> @avx2_psrl_d_var_bc(<8 x i32> %v, <2 x i64> %a) { +; CHECK-LABEL: @avx2_psrl_d_var_bc +; CHECK-NEXT: %1 = bitcast <2 x i64> %a to <4 x i32> +; CHECK-NEXT: %2 = tail call <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32> %v, <4 x i32> %1) +; CHECK-NEXT: ret <8 x i32> %2 + %1 = shufflevector <2 x i64> %a, <2 x i64> undef, <2 x i32> + %2 = bitcast <2 x i64> %1 to <4 x i32> + %3 = tail call <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32> %v, <4 x i32> %2) + ret <8 x i32> %3 +} + define <4 x i64> @avx2_psrl_q_var(<4 x i64> %v, <2 x i64> %a) { ; CHECK-LABEL: @avx2_psrl_q_var ; CHECK-NEXT: %1 = tail call <4 x i64> @llvm.x86.avx2.psrl.q(<4 x i64> %v, <2 x i64> %a)