# Patch summary (free-form preamble; everything before the first "diff --git"
# header is commentary and not part of any hunk).
#
# 1. llvm/lib/Transforms/Vectorize/VectorCombine.cpp
#    In VectorCombine::foldSelectShuffle, the loop that walks the users of
#    each entry in `Shuffles` previously appended a user `SSV` whenever the
#    cast succeeded and the isa<> check on SSV->getOperand(1) held. The
#    changed line adds a third condition, `SSV->getType() == VT`, so a user
#    shuffle whose result type differs from VT is no longer appended.
#
# 2. llvm/test/Transforms/VectorCombine/AArch64/select-shuffle.ll
#    Adds @testlargerextrashuffle2: a sub/add pair on <16 x i32> is combined
#    by a <16 x i32> shufflevector (%5), whose result feeds a second
#    shufflevector (%6) producing <64 x i32>. The CHECK-NEXT lines list the
#    same instruction sequence as the input body.
#
# NOTE(review): this copy of the patch has its line breaks collapsed, and
# angle-bracket payloads appear to be missing (e.g. `dyn_cast(U)`, `isa(...)`,
# and every shufflevector is followed by a type but no mask constant).
# Confirm against the upstream commit before attempting to apply it.
diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp index d12624ffb824..a38936644bd3 100644 --- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp +++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp @@ -1302,7 +1302,7 @@ bool VectorCombine::foldSelectShuffle(Instruction &I, bool FromReduction) { for (ShuffleVectorInst *SV : Shuffles) { for (auto U : SV->users()) { ShuffleVectorInst *SSV = dyn_cast(U); - if (SSV && isa(SSV->getOperand(1))) + if (SSV && isa(SSV->getOperand(1)) && SSV->getType() == VT) Shuffles.push_back(SSV); } } diff --git a/llvm/test/Transforms/VectorCombine/AArch64/select-shuffle.ll b/llvm/test/Transforms/VectorCombine/AArch64/select-shuffle.ll index f275cb02c23d..939e11e26866 100644 --- a/llvm/test/Transforms/VectorCombine/AArch64/select-shuffle.ll +++ b/llvm/test/Transforms/VectorCombine/AArch64/select-shuffle.ll @@ -976,4 +976,27 @@ define <16 x i32> @testoutofbounds(<16 x i32> %x, <16 x i32> %y) { ret <16 x i32> %add } +define <64 x i32> @testlargerextrashuffle2(i32 %call.i, <16 x i32> %0) { +; CHECK-LABEL: @testlargerextrashuffle2( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <16 x i32> [[TMP0:%.*]], i32 [[CALL_I:%.*]], i32 15 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <16 x i32> [[TMP0]], i32 [[CALL_I]], i32 15 +; CHECK-NEXT: [[TMP3:%.*]] = sub <16 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = add <16 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <16 x i32> [[TMP3]], <16 x i32> [[TMP4]], <16 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <16 x i32> [[TMP5]], <16 x i32> poison, <64 x i32> +; CHECK-NEXT: ret <64 x i32> [[TMP6]] +; +entry: + %1 = insertelement <16 x i32> %0, i32 %call.i, i32 15 + %2 = insertelement <16 x i32> %0, i32 %call.i, i32 15 + %3 = sub <16 x i32> %1, %2 + %4 = add <16 x i32> %1, %2 + %5 = shufflevector <16 x i32> %3, <16 x i32> %4, <16 x i32> + %6 = shufflevector <16 x i32> %5, <16 x i32> poison, <64 x 
i32> + ret <64 x i32> %6 +} + + + declare i32 @llvm.vector.reduce.add.v16i32(<16 x i32>)