From 432c199e84730e4bb0aec964aa1d13f0226455ac Mon Sep 17 00:00:00 2001
From: Sanjay Patel <spatel@rotateright.com>
Date: Tue, 3 May 2022 16:04:44 -0400
Subject: [PATCH] [InstCombine] move shuffle after min/max with same-shuffled
 operands

This is an intrinsic version of the existing fold for binops.
As a first step, I only allowed min/max, but the code is set up
to make adding more intrinsics easy (with more or less than
2 arguments).

This (and possible follow-ups) are discussed in issue #46238.
---
 .../InstCombine/InstCombineCalls.cpp          | 41 +++++++++++++++++++
 .../InstCombine/minmax-intrinsics.ll          | 26 +++++++-----
 2 files changed, 57 insertions(+), 10 deletions(-)

diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
index 957b954c2afb..f35800e9f83a 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -1080,6 +1080,43 @@ static Instruction *factorizeMinMaxTree(IntrinsicInst *II) {
   return CallInst::Create(MinMax, { MinMaxOp, ThirdOp });
 }
 
+/// If all arguments of the intrinsic are unary shuffles with the same mask,
+/// try to shuffle after the intrinsic.
+static Instruction *
+foldShuffledIntrinsicOperands(IntrinsicInst *II,
+                              InstCombiner::BuilderTy &Builder) {
+  // TODO: This should be extended to handle other intrinsics like fshl, ctpop,
+  //       etc. Use llvm::isTriviallyVectorizable() and related to determine
+  //       which intrinsics are safe to shuffle?
+  if (!match(II, m_MaxOrMin(m_Value(), m_Value())))
+    return nullptr;
+
+  Value *X;
+  ArrayRef<int> Mask;
+  if (!match(II->getArgOperand(0),
+             m_Shuffle(m_Value(X), m_Undef(), m_Mask(Mask))))
+    return nullptr;
+
+  // At least 1 operand must have 1 use because we are creating 2 instructions.
+  if (none_of(II->args(), [](Value *V) { return V->hasOneUse(); }))
+    return nullptr;
+
+  // See if all arguments are shuffled with the same mask.
+  SmallVector<Value *> NewArgs(II->arg_size());
+  NewArgs[0] = X;
+  for (unsigned i = 1, e = II->arg_size(); i != e; ++i) {
+    if (!match(II->getArgOperand(i),
+               m_Shuffle(m_Value(X), m_Undef(), m_SpecificMask(Mask))))
+      return nullptr;
+    NewArgs[i] = X;
+  }
+
+  // intrinsic (shuf X, M), (shuf Y, M), ... --> shuf (intrinsic X, Y, ...), M
+  Value *NewIntrinsic =
+      Builder.CreateIntrinsic(II->getIntrinsicID(), X->getType(), NewArgs);
+  return new ShuffleVectorInst(NewIntrinsic, Mask);
+}
+
 /// CallInst simplification. This mostly only handles folding of intrinsic
 /// instructions. For normal calls, it allows visitCallBase to do the heavy
 /// lifting.
@@ -2622,6 +2659,10 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) {
       break;
     }
   }
+
+  if (Instruction *Shuf = foldShuffledIntrinsicOperands(II, Builder))
+    return Shuf;
+
   // Some intrinsics (like experimental_gc_statepoint) can be used in invoke
   // context, so it is handled in visitCallBase and we should trigger it.
   return visitCallBase(*II);
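Note on the transform: in IR terms, the new fold replaces two shuffles plus a
min/max of the shuffled values with a min/max of the original values followed
by a single shuffle. A hand-written sketch (the value names and the mask here
are illustrative, not taken from the tests):

  %sx = shufflevector <3 x i8> %x, <3 x i8> poison, <3 x i32> <i32 2, i32 0, i32 1>
  %sy = shufflevector <3 x i8> %y, <3 x i8> poison, <3 x i32> <i32 2, i32 0, i32 1>
  %r  = call <3 x i8> @llvm.smax.v3i8(<3 x i8> %sx, <3 x i8> %sy)

  -->

  %m = call <3 x i8> @llvm.smax.v3i8(<3 x i8> %x, <3 x i8> %y)
  %r = shufflevector <3 x i8> %m, <3 x i8> poison, <3 x i32> <i32 2, i32 0, i32 1>

This is sound because min/max operate lane-wise, so permuting both inputs with
the same mask and then taking the min/max gives the same lanes as taking the
min/max first and permuting the result. The one-use check in the code above
guarantees at least one of the original shuffles dies, so the rewrite never
increases the instruction count.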
diff --git a/llvm/test/Transforms/InstCombine/minmax-intrinsics.ll b/llvm/test/Transforms/InstCombine/minmax-intrinsics.ll
index c41dae0e64e5..7fb4f80c5cad 100644
--- a/llvm/test/Transforms/InstCombine/minmax-intrinsics.ll
+++ b/llvm/test/Transforms/InstCombine/minmax-intrinsics.ll
@@ -2364,9 +2364,8 @@ define i8 @umax_umax_reassoc_constantexpr_sink(i8 %x, i8 %y) {
 
 define <3 x i8> @smax_unary_shuffle_ops(<3 x i8> %x, <3 x i8> %y) {
 ; CHECK-LABEL: @smax_unary_shuffle_ops(
-; CHECK-NEXT:    [[SX:%.*]] = shufflevector <3 x i8> [[X:%.*]], <3 x i8> poison, <3 x i32> <i32 1, i32 0, i32 2>
-; CHECK-NEXT:    [[SY:%.*]] = shufflevector <3 x i8> [[Y:%.*]], <3 x i8> poison, <3 x i32> <i32 1, i32 0, i32 2>
-; CHECK-NEXT:    [[R:%.*]] = call <3 x i8> @llvm.smax.v3i8(<3 x i8> [[SX]], <3 x i8> [[SY]])
+; CHECK-NEXT:    [[TMP1:%.*]] = call <3 x i8> @llvm.smax.v3i8(<3 x i8> [[X:%.*]], <3 x i8> [[Y:%.*]])
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <3 x i8> [[TMP1]], <3 x i8> poison, <3 x i32> <i32 1, i32 0, i32 2>
 ; CHECK-NEXT:    ret <3 x i8> [[R]]
 ;
   %sx = shufflevector <3 x i8> %x, <3 x i8> poison, <3 x i32> <i32 1, i32 0, i32 2>
@@ -2379,8 +2378,8 @@ define <3 x i8> @smin_unary_shuffle_ops_use_poison_mask_elt(<3 x i8> %x, <3 x i8
 ; CHECK-LABEL: @smin_unary_shuffle_ops_use_poison_mask_elt(
 ; CHECK-NEXT:    [[SX:%.*]] = shufflevector <3 x i8> [[X:%.*]], <3 x i8> poison, <3 x i32> <i32 poison, i32 0, i32 2>
 ; CHECK-NEXT:    call void @use_vec(<3 x i8> [[SX]])
-; CHECK-NEXT:    [[SY:%.*]] = shufflevector <3 x i8> [[Y:%.*]], <3 x i8> poison, <3 x i32> <i32 poison, i32 0, i32 2>
-; CHECK-NEXT:    [[R:%.*]] = call <3 x i8> @llvm.smin.v3i8(<3 x i8> [[SX]], <3 x i8> [[SY]])
+; CHECK-NEXT:    [[TMP1:%.*]] = call <3 x i8> @llvm.smin.v3i8(<3 x i8> [[X]], <3 x i8> [[Y:%.*]])
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <3 x i8> [[TMP1]], <3 x i8> poison, <3 x i32> <i32 poison, i32 0, i32 2>
 ; CHECK-NEXT:    ret <3 x i8> [[R]]
 ;
   %sx = shufflevector <3 x i8> %x, <3 x i8> poison, <3 x i32> <i32 poison, i32 0, i32 2>
@@ -2392,10 +2391,10 @@ define <3 x i8> @smin_unary_shuffle_ops_use_poison_mask_elt(<3 x i8> %x, <3 x i8
 
 define <3 x i8> @umax_unary_shuffle_ops_use_widening(<2 x i8> %x, <2 x i8> %y) {
 ; CHECK-LABEL: @umax_unary_shuffle_ops_use_widening(
-; CHECK-NEXT:    [[SX:%.*]] = shufflevector <2 x i8> [[X:%.*]], <2 x i8> poison, <3 x i32> <i32 1, i32 0, i32 1>
 ; CHECK-NEXT:    [[SY:%.*]] = shufflevector <2 x i8> [[Y:%.*]], <2 x i8> poison, <3 x i32> <i32 1, i32 0, i32 1>
 ; CHECK-NEXT:    call void @use_vec(<3 x i8> [[SY]])
-; CHECK-NEXT:    [[R:%.*]] = call <3 x i8> @llvm.umax.v3i8(<3 x i8> [[SX]], <3 x i8> [[SY]])
+; CHECK-NEXT:    [[TMP1:%.*]] = call <2 x i8> @llvm.umax.v2i8(<2 x i8> [[X:%.*]], <2 x i8> [[Y]])
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <2 x i8> [[TMP1]], <2 x i8> poison, <3 x i32> <i32 1, i32 0, i32 1>
 ; CHECK-NEXT:    ret <3 x i8> [[R]]
 ;
   %sx = shufflevector <2 x i8> %x, <2 x i8> poison, <3 x i32> <i32 1, i32 0, i32 1>
@@ -2407,9 +2406,8 @@ define <3 x i8> @umax_unary_shuffle_ops_use_widening(<2 x i8> %x, <2 x i8> %y) {
 
 define <3 x i8> @umin_unary_shuffle_ops_narrowing(<4 x i8> %x, <4 x i8> %y) {
 ; CHECK-LABEL: @umin_unary_shuffle_ops_narrowing(
-; CHECK-NEXT:    [[SX:%.*]] = shufflevector <4 x i8> [[X:%.*]], <4 x i8> poison, <3 x i32> <i32 1, i32 3, i32 2>
-; CHECK-NEXT:    [[SY:%.*]] = shufflevector <4 x i8> [[Y:%.*]], <4 x i8> poison, <3 x i32> <i32 1, i32 3, i32 2>
-; CHECK-NEXT:    [[R:%.*]] = call <3 x i8> @llvm.umin.v3i8(<3 x i8> [[SX]], <3 x i8> [[SY]])
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i8> @llvm.umin.v4i8(<4 x i8> [[X:%.*]], <4 x i8> [[Y:%.*]])
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> poison, <3 x i32> <i32 1, i32 3, i32 2>
 ; CHECK-NEXT:    ret <3 x i8> [[R]]
 ;
   %sx = shufflevector <4 x i8> %x, <4 x i8> poison, <3 x i32> <i32 1, i32 3, i32 2>
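The widening and narrowing tests show that the mask may change the vector
length: the rebuilt intrinsic is created with the type of the unshuffled
sources (X->getType()), so the min/max itself can become wider or narrower
than the original call. A sketch of the narrowing direction (illustrative
mask; the extra source lane is computed but then simply dropped by the
trailing shuffle):

  %sx = shufflevector <4 x i8> %x, <4 x i8> poison, <3 x i32> <i32 0, i32 2, i32 3>
  %sy = shufflevector <4 x i8> %y, <4 x i8> poison, <3 x i32> <i32 0, i32 2, i32 3>
  %r  = call <3 x i8> @llvm.umin.v3i8(<3 x i8> %sx, <3 x i8> %sy)

  -->

  %m = call <4 x i8> @llvm.umin.v4i8(<4 x i8> %x, <4 x i8> %y)
  %r = shufflevector <4 x i8> %m, <4 x i8> poison, <3 x i32> <i32 0, i32 2, i32 3>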
@@ -2418,6 +2416,8 @@ define <3 x i8> @umin_unary_shuffle_ops_narrowing(<4 x i8> %x, <4 x i8> %y) {
   ret <3 x i8> %r
 }
 
+; negative test - must have 2 shuffles
+
 define <3 x i8> @smax_unary_shuffle_ops_unshuffled_op(<3 x i8> %x, <3 x i8> %y) {
 ; CHECK-LABEL: @smax_unary_shuffle_ops_unshuffled_op(
 ; CHECK-NEXT:    [[SX:%.*]] = shufflevector <3 x i8> [[X:%.*]], <3 x i8> poison, <3 x i32> <i32 1, i32 0, i32 2>
@@ -2429,6 +2429,8 @@ define <3 x i8> @smax_unary_shuffle_ops_unshuffled_op(<3 x i8> %x, <3 x i8> %y)
   ret <3 x i8> %r
 }
 
+; negative test - must have identical masks
+
 define <3 x i8> @smax_unary_shuffle_ops_wrong_mask(<3 x i8> %x, <3 x i8> %y) {
 ; CHECK-LABEL: @smax_unary_shuffle_ops_wrong_mask(
 ; CHECK-NEXT:    [[SX:%.*]] = shufflevector <3 x i8> [[X:%.*]], <3 x i8> poison, <3 x i32> <i32 1, i32 0, i32 2>
@@ -2442,6 +2444,8 @@ define <3 x i8> @smax_unary_shuffle_ops_wrong_mask(<3 x i8> %x, <3 x i8> %y) {
   ret <3 x i8> %r
 }
 
+; negative test - must be unary shuffles
+
 define <3 x i8> @smax_unary_shuffle_ops_wrong_shuf(<3 x i8> %x, <3 x i8> %y, <3 x i8> %z) {
 ; CHECK-LABEL: @smax_unary_shuffle_ops_wrong_shuf(
 ; CHECK-NEXT:    [[SX:%.*]] = shufflevector <3 x i8> [[X:%.*]], <3 x i8> [[Z:%.*]], <3 x i32> <i32 1, i32 0, i32 4>
@@ -2455,6 +2459,8 @@ define <3 x i8> @smax_unary_shuffle_ops_wrong_shuf(<3 x i8> %x, <3 x i8> %y, <3
   ret <3 x i8> %r
 }
 
+; negative test - too many uses
+
 define <3 x i8> @smin_unary_shuffle_ops_uses(<3 x i8> %x, <3 x i8> %y) {
 ; CHECK-LABEL: @smin_unary_shuffle_ops_uses(
 ; CHECK-NEXT:    [[SX:%.*]] = shufflevector <3 x i8> [[X:%.*]], <3 x i8> poison, <3 x i32> <i32 1, i32 0, i32 2>
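As the TODO in foldShuffledIntrinsicOperands says, the argument loop is
written to be arity-agnostic, so a follow-up could enable the same fold for
other trivially vectorizable intrinsics. A hypothetical sketch of what that
extension would mean for the three-argument funnel shift (not implemented by
this patch; names and mask are made up for illustration):

  %a = shufflevector <2 x i32> %x, <2 x i32> poison, <2 x i32> <i32 1, i32 0>
  %b = shufflevector <2 x i32> %y, <2 x i32> poison, <2 x i32> <i32 1, i32 0>
  %c = shufflevector <2 x i32> %z, <2 x i32> poison, <2 x i32> <i32 1, i32 0>
  %r = call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c)

  -->

  %f = call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> %z)
  %r = shufflevector <2 x i32> %f, <2 x i32> poison, <2 x i32> <i32 1, i32 0>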