From bef6e67e95fb39a8c0558624e9eeed0edc2f50e4 Mon Sep 17 00:00:00 2001 From: Sanjay Patel Date: Sun, 19 Apr 2020 08:06:17 -0400 Subject: [PATCH] [VectorCombine] transform bitcasted shuffle to wider elements bitcast (shuf V, MaskC) --> shuf (bitcast V), MaskC' This is the widen shuffle elements enhancement to D76727. It builds on the analysis and simplifications in D77881 and rG6a7e958a423e. The phase ordering tests show that we can simplify inverse shuffles across a binop in both directions (widen/narrow or narrow/widen) now. There's another potential transform visible in some of the remaining TODOs - move a bitcasted operand of a shuffle after the shuffle. Differential Revision: https://reviews.llvm.org/D78371 --- .../Transforms/Vectorize/VectorCombine.cpp | 37 +++++++----- .../Transforms/PhaseOrdering/X86/shuffle.ll | 58 +++++++++---------- .../Transforms/VectorCombine/X86/shuffle.ll | 8 +-- 3 files changed, 52 insertions(+), 51 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp index e1d3a0590d29..b473d946b7ea 100644 --- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp +++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp @@ -259,9 +259,9 @@ static bool foldExtractExtract(Instruction &I, const TargetTransformInfo &TTI) { return true; } -/// If this is a bitcast to narrow elements from a shuffle of wider elements, -/// try to bitcast the source vector to the narrow type followed by shuffle. -/// This can enable further transforms by moving bitcasts or shuffles together. +/// If this is a bitcast of a shuffle, try to bitcast the source vector to the +/// destination type followed by shuffle. This can enable further transforms by +/// moving bitcasts or shuffles together. static bool foldBitcastShuf(Instruction &I, const TargetTransformInfo &TTI) { Value *V; ArrayRef Mask; @@ -269,32 +269,39 @@ static bool foldBitcastShuf(Instruction &I, const TargetTransformInfo &TTI) { m_Mask(Mask)))))) return false; + // Disallow non-vector casts and length-changing shuffles. + // TODO: We could allow any shuffle. auto *DestTy = dyn_cast(I.getType()); auto *SrcTy = cast(V->getType()); if (!DestTy || I.getOperand(0)->getType() != SrcTy) return false; - // TODO: Handle bitcast from narrow element type to wide element type. - unsigned DestNumElts = DestTy->getNumElements(); - unsigned SrcNumElts = SrcTy->getNumElements(); - if (SrcNumElts > DestNumElts) - return false; - // The new shuffle must not cost more than the old shuffle. The bitcast is // moved ahead of the shuffle, so assume that it has the same cost as before. if (TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, DestTy) > TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, SrcTy)) return false; - // Bitcast the source vector and expand the shuffle mask to the equivalent for - // narrow elements. + unsigned DestNumElts = DestTy->getNumElements(); + unsigned SrcNumElts = SrcTy->getNumElements(); + SmallVector NewMask; + if (SrcNumElts <= DestNumElts) { + // The bitcast is from wide to narrow/equal elements. The shuffle mask can + // always be expanded to the equivalent form choosing narrower elements. + assert(DestNumElts % SrcNumElts == 0 && "Unexpected shuffle mask"); + unsigned ScaleFactor = DestNumElts / SrcNumElts; + narrowShuffleMaskElts(ScaleFactor, Mask, NewMask); + } else { + // The bitcast is from narrow elements to wide elements. The shuffle mask + // must choose consecutive elements to allow casting first. + assert(SrcNumElts % DestNumElts == 0 && "Unexpected shuffle mask"); + unsigned ScaleFactor = SrcNumElts / DestNumElts; + if (!widenShuffleMaskElts(ScaleFactor, Mask, NewMask)) + return false; + } // bitcast (shuf V, MaskC) --> shuf (bitcast V), MaskC' IRBuilder<> Builder(&I); Value *CastV = Builder.CreateBitCast(V, DestTy); - SmallVector NewMask; - assert(DestNumElts % SrcNumElts == 0 && "Unexpected shuffle mask"); - unsigned ScaleFactor = DestNumElts / SrcNumElts; - narrowShuffleMaskElts(ScaleFactor, Mask, NewMask); Value *Shuf = Builder.CreateShuffleVector(CastV, UndefValue::get(DestTy), NewMask); I.replaceAllUsesWith(Shuf); diff --git a/llvm/test/Transforms/PhaseOrdering/X86/shuffle.ll b/llvm/test/Transforms/PhaseOrdering/X86/shuffle.ll index a199c0d51cde..1b2ee9fa3ade 100644 --- a/llvm/test/Transforms/PhaseOrdering/X86/shuffle.ll +++ b/llvm/test/Transforms/PhaseOrdering/X86/shuffle.ll @@ -47,17 +47,13 @@ define <2 x i64> @shuffle_32_add_8_shuffle_32_masks_are_eq(<2 x i64> %v) { ret <2 x i64> %bc5 } -; TODO: Eliminate redundant shuffles +; Eliminate redundant shuffles define <2 x i64> @shuffle_8_add_32_shuffle_8_masks_are_eq(<2 x i64> %v) { ; CHECK-LABEL: @shuffle_8_add_32_shuffle_8_masks_are_eq( -; CHECK-NEXT: [[BC0:%.*]] = bitcast <2 x i64> [[V:%.*]] to <16 x i8> -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <16 x i8> [[BC0]], <16 x i8> undef, <16 x i32> -; CHECK-NEXT: [[BC2:%.*]] = bitcast <16 x i8> [[SHUFFLE]] to <4 x i32> -; CHECK-NEXT: [[ADD_I:%.*]] = shl <4 x i32> [[BC2]], -; CHECK-NEXT: [[BC4:%.*]] = bitcast <4 x i32> [[ADD_I]] to <16 x i8> -; CHECK-NEXT: [[SHUFFLE4:%.*]] = shufflevector <16 x i8> [[BC4]], <16 x i8> undef, <16 x i32> -; CHECK-NEXT: [[BC5:%.*]] = bitcast <16 x i8> [[SHUFFLE4]] to <2 x i64> +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[V:%.*]] to <4 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shl <4 x i32> [[TMP1]], +; CHECK-NEXT: [[BC5:%.*]] = bitcast <4 x i32> [[TMP2]] to <2 x i64> ; CHECK-NEXT: ret <2 x i64> [[BC5]] ; %bc0 = bitcast <2 x i64> %v to <16 x i8> @@ -126,15 +122,14 @@ define <16 x i8> @shuffle_16_add_8_masks_are_eq(<8 x i16> %v1, <8 x i16> %v2) { ret <16 x i8> %add } -; TODO: Sink single shuffle. +; Sink single shuffle. define <4 x i32> @shuffle_16_add_32_masks_are_eq_and_can_be_converted_up(<8 x i16> %v1, <8 x i16> %v2) { ; CHECK-LABEL: @shuffle_16_add_32_masks_are_eq_and_can_be_converted_up( -; CHECK-NEXT: [[SHUFFLE1:%.*]] = shufflevector <8 x i16> [[V1:%.*]], <8 x i16> undef, <8 x i32> -; CHECK-NEXT: [[SHUFFLE2:%.*]] = shufflevector <8 x i16> [[V2:%.*]], <8 x i16> undef, <8 x i32> -; CHECK-NEXT: [[BC1:%.*]] = bitcast <8 x i16> [[SHUFFLE1]] to <4 x i32> -; CHECK-NEXT: [[BC2:%.*]] = bitcast <8 x i16> [[SHUFFLE2]] to <4 x i32> -; CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[BC2]], [[BC1]] +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[V1:%.*]] to <4 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[V2:%.*]] to <4 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = add <4 x i32> [[TMP2]], [[TMP1]] +; CHECK-NEXT: [[ADD:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> undef, <4 x i32> ; CHECK-NEXT: ret <4 x i32> [[ADD]] ; %shuffle1 = shufflevector <8 x i16> %v1, <8 x i16> undef, <8 x i32> @@ -145,15 +140,14 @@ define <4 x i32> @shuffle_16_add_32_masks_are_eq_and_can_be_converted_up(<8 x i1 ret <4 x i32> %add } -; TODO: Sink single shuffle. +; Sink single shuffle. define <4 x i32> @shuffle_8_add_32_masks_are_eq_and_can_be_converted_up(<16 x i8> %v1, <16 x i8> %v2) { ; CHECK-LABEL: @shuffle_8_add_32_masks_are_eq_and_can_be_converted_up( -; CHECK-NEXT: [[SHUFFLE1:%.*]] = shufflevector <16 x i8> [[V1:%.*]], <16 x i8> undef, <16 x i32> -; CHECK-NEXT: [[SHUFFLE2:%.*]] = shufflevector <16 x i8> [[V2:%.*]], <16 x i8> undef, <16 x i32> -; CHECK-NEXT: [[BC1:%.*]] = bitcast <16 x i8> [[SHUFFLE1]] to <4 x i32> -; CHECK-NEXT: [[BC2:%.*]] = bitcast <16 x i8> [[SHUFFLE2]] to <4 x i32> -; CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[BC2]], [[BC1]] +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[V1:%.*]] to <4 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[V2:%.*]] to <4 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = add <4 x i32> [[TMP2]], [[TMP1]] +; CHECK-NEXT: [[ADD:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> undef, <4 x i32> ; CHECK-NEXT: ret <4 x i32> [[ADD]] ; %shuffle1 = shufflevector <16 x i8> %v1, <16 x i8> undef, <16 x i32> @@ -229,13 +223,13 @@ define <16 x i8> @shuffle_32_bitcast_8_shuffle_8_can_not_be_converted_up(<4 x i3 } ; shuffle<4 x i32>( bitcast<4 x i32>( shuffle<16 x i8>(v))) -; TODO: Narrow, squash shuffles, and widen type? +; TODO: squash shuffles? define <4 x i32> @shuffle_8_bitcast_32_shuffle_32_can_be_converted_up(<16 x i8> %v1) { ; CHECK-LABEL: @shuffle_8_bitcast_32_shuffle_32_can_be_converted_up( -; CHECK-NEXT: [[SHUFFLE1:%.*]] = shufflevector <16 x i8> [[V1:%.*]], <16 x i8> undef, <16 x i32> -; CHECK-NEXT: [[BC1:%.*]] = bitcast <16 x i8> [[SHUFFLE1]] to <4 x i32> -; CHECK-NEXT: [[SHUFFLE2:%.*]] = shufflevector <4 x i32> [[BC1]], <4 x i32> undef, <4 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[V1:%.*]] to <4 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> undef, <4 x i32> +; CHECK-NEXT: [[SHUFFLE2:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> undef, <4 x i32> ; CHECK-NEXT: ret <4 x i32> [[SHUFFLE2]] ; %shuffle1 = shufflevector <16 x i8> %v1, <16 x i8> undef, <16 x i32> @@ -245,13 +239,13 @@ define <4 x i32> @shuffle_8_bitcast_32_shuffle_32_can_be_converted_up(<16 x i8> } ; shuffle<4 x i32>( bitcast<4 x i32>( shuffle<8 x i16>(v))) -; TODO: Narrow, squash shuffles, and widen type? +; TODO: squash shuffles? define <4 x i32> @shuffle_16_bitcast_32_shuffle_32_can_be_converted_up(<8 x i16> %v1) { ; CHECK-LABEL: @shuffle_16_bitcast_32_shuffle_32_can_be_converted_up( -; CHECK-NEXT: [[SHUFFLE1:%.*]] = shufflevector <8 x i16> [[V1:%.*]], <8 x i16> undef, <8 x i32> -; CHECK-NEXT: [[BC1:%.*]] = bitcast <8 x i16> [[SHUFFLE1]] to <4 x i32> -; CHECK-NEXT: [[SHUFFLE2:%.*]] = shufflevector <4 x i32> [[BC1]], <4 x i32> undef, <4 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[V1:%.*]] to <4 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> undef, <4 x i32> +; CHECK-NEXT: [[SHUFFLE2:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> undef, <4 x i32> ; CHECK-NEXT: ret <4 x i32> [[SHUFFLE2]] ; %shuffle1 = shufflevector <8 x i16> %v1, <8 x i16> undef, <8 x i32> @@ -293,13 +287,13 @@ define <4 x i32> @shuffle_16_bitcast_32_shuffle_32_can_not_be_converted_up(<8 x } ; shuffle<8 x i16>( bitcast<8 x i16>( shuffle<16 x i8>(v))) -; TODO: Narrow, squash shuffles, and widen type? +; TODO: squash shuffles and widen type? define <8 x i16> @shuffle_8_bitcast_16_shuffle_16_can__be_converted_up(<16 x i8> %v1) { ; CHECK-LABEL: @shuffle_8_bitcast_16_shuffle_16_can__be_converted_up( -; CHECK-NEXT: [[SHUFFLE1:%.*]] = shufflevector <16 x i8> [[V1:%.*]], <16 x i8> undef, <16 x i32> -; CHECK-NEXT: [[BC1:%.*]] = bitcast <16 x i8> [[SHUFFLE1]] to <8 x i16> -; CHECK-NEXT: [[SHUFFLE2:%.*]] = shufflevector <8 x i16> [[BC1]], <8 x i16> undef, <8 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[V1:%.*]] to <8 x i16> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> undef, <8 x i32> +; CHECK-NEXT: [[SHUFFLE2:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> undef, <8 x i32> ; CHECK-NEXT: ret <8 x i16> [[SHUFFLE2]] ; %shuffle1 = shufflevector <16 x i8> %v1, <16 x i8> undef, <16 x i32> diff --git a/llvm/test/Transforms/VectorCombine/X86/shuffle.ll b/llvm/test/Transforms/VectorCombine/X86/shuffle.ll index 5a758d78d937..50b6f9e35dd4 100644 --- a/llvm/test/Transforms/VectorCombine/X86/shuffle.ll +++ b/llvm/test/Transforms/VectorCombine/X86/shuffle.ll @@ -59,13 +59,13 @@ define i128 @bitcast_shuf_narrow_element_wrong_type(<4 x i32> %v) { ret i128 %r } -; Negative test - but might want to try this +; Widen shuffle elements define <4 x i32> @bitcast_shuf_wide_element(<8 x i16> %v) { ; CHECK-LABEL: @bitcast_shuf_wide_element( -; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> undef, <8 x i32> -; CHECK-NEXT: [[R:%.*]] = bitcast <8 x i16> [[SHUF]] to <4 x i32> -; CHECK-NEXT: ret <4 x i32> [[R]] +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[V:%.*]] to <4 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> undef, <4 x i32> +; CHECK-NEXT: ret <4 x i32> [[TMP2]] ; %shuf = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> %r = bitcast <8 x i16> %shuf to <4 x i32>