Precommit transform tests that have poison as insertelement's placeholder

This commit copies existing tests at llvm/Transforms and replaces 'insertelement undef' in those files with 'insertelement poison'. (see https://reviews.llvm.org/D93586) Tests listed using this script: grep -R -E '^[^;]*insertelement <.*> undef,' . | cut -d":" -f1 | uniq | wc -l Tests updated: file_org=llvm/test/Transforms/$1 file=${file_org%.ll}-inseltpoison.ll cp $file_org $file sed -i -E 's/^([^;]*)insertelement <(.*)> undef/\1insertelement <\2> poison/g' $file head -1 $file | grep "Assertions have been autogenerated by utils/update_test_checks.py" -q if [ "$?" == 1 ]; then echo "$file : should be manually updated" # I manually updated the script exit 1 fi python3 ./llvm/utils/update_test_checks.py --opt-binary=./build-releaseassert/bin/opt $file
2020-12-24 11:41:27 +09:00 · 2020-12-24 11:41:27 +09:00 · db7a2f347f
parent 48ad8194a5
commit db7a2f347f
120 changed files with 44309 additions and 0 deletions
--- a/llvm/test/Transforms/Attributor/dereferenceable-2-inseltpoison.ll
+++ b/llvm/test/Transforms/Attributor/dereferenceable-2-inseltpoison.ll
@ -0,0 +1,847 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --check-attributes
+; RUN: opt -attributor -enable-new-pm=0 -attributor-manifest-internal  -attributor-max-iterations-verify -attributor-annotate-decl-cs -attributor-max-iterations=1 -S < %s | FileCheck %s --check-prefixes=CHECK,NOT_CGSCC_NPM,NOT_CGSCC_OPM,NOT_TUNIT_NPM,IS__TUNIT____,IS________OPM,IS__TUNIT_OPM
+; RUN: opt -aa-pipeline=basic-aa -passes=attributor -attributor-manifest-internal  -attributor-max-iterations-verify -attributor-annotate-decl-cs -attributor-max-iterations=1 -S < %s | FileCheck %s --check-prefixes=CHECK,NOT_CGSCC_OPM,NOT_CGSCC_NPM,NOT_TUNIT_OPM,IS__TUNIT____,IS________NPM,IS__TUNIT_NPM
+; RUN: opt -attributor-cgscc -enable-new-pm=0 -attributor-manifest-internal  -attributor-annotate-decl-cs -S < %s | FileCheck %s --check-prefixes=CHECK,NOT_TUNIT_NPM,NOT_TUNIT_OPM,NOT_CGSCC_NPM,IS__CGSCC____,IS________OPM,IS__CGSCC_OPM
+; RUN: opt -aa-pipeline=basic-aa -passes=attributor-cgscc -attributor-manifest-internal  -attributor-annotate-decl-cs -S < %s | FileCheck %s --check-prefixes=CHECK,NOT_TUNIT_NPM,NOT_TUNIT_OPM,NOT_CGSCC_OPM,IS__CGSCC____,IS________NPM,IS__CGSCC_NPM
+
+; Determine dereference-ability before unused loads get deleted:
+; https://bugs.llvm.org/show_bug.cgi?id=21780
+
+define <4 x double> @PR21780(double* %ptr) {
+; IS__TUNIT____: Function Attrs: argmemonly nofree nosync nounwind readonly willreturn
+; IS__TUNIT____-LABEL: define {{[^@]+}}@PR21780
+; IS__TUNIT____-SAME: (double* nocapture nofree nonnull readonly align 8 dereferenceable(32) [[PTR:%.*]]) [[ATTR0:#.*]] {
+; IS__TUNIT____-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds double, double* [[PTR]], i64 1
+; IS__TUNIT____-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds double, double* [[PTR]], i64 2
+; IS__TUNIT____-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds double, double* [[PTR]], i64 3
+; IS__TUNIT____-NEXT:    [[T0:%.*]] = load double, double* [[PTR]], align 8
+; IS__TUNIT____-NEXT:    [[T1:%.*]] = load double, double* [[ARRAYIDX1]], align 8
+; IS__TUNIT____-NEXT:    [[T2:%.*]] = load double, double* [[ARRAYIDX2]], align 8
+; IS__TUNIT____-NEXT:    [[T3:%.*]] = load double, double* [[ARRAYIDX3]], align 8
+; IS__TUNIT____-NEXT:    [[VECINIT0:%.*]] = insertelement <4 x double> poison, double [[T0]], i32 0
+; IS__TUNIT____-NEXT:    [[VECINIT1:%.*]] = insertelement <4 x double> [[VECINIT0]], double [[T1]], i32 1
+; IS__TUNIT____-NEXT:    [[VECINIT2:%.*]] = insertelement <4 x double> [[VECINIT1]], double [[T2]], i32 2
+; IS__TUNIT____-NEXT:    [[VECINIT3:%.*]] = insertelement <4 x double> [[VECINIT2]], double [[T3]], i32 3
+; IS__TUNIT____-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x double> [[VECINIT3]], <4 x double> [[VECINIT3]], <4 x i32> <i32 0, i32 0, i32 2, i32 2>
+; IS__TUNIT____-NEXT:    ret <4 x double> [[SHUFFLE]]
+;
+; IS__CGSCC____: Function Attrs: argmemonly nofree norecurse nosync nounwind readonly willreturn
+; IS__CGSCC____-LABEL: define {{[^@]+}}@PR21780
+; IS__CGSCC____-SAME: (double* nocapture nofree nonnull readonly align 8 dereferenceable(32) [[PTR:%.*]]) [[ATTR0:#.*]] {
+; IS__CGSCC____-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds double, double* [[PTR]], i64 1
+; IS__CGSCC____-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds double, double* [[PTR]], i64 2
+; IS__CGSCC____-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds double, double* [[PTR]], i64 3
+; IS__CGSCC____-NEXT:    [[T0:%.*]] = load double, double* [[PTR]], align 8
+; IS__CGSCC____-NEXT:    [[T1:%.*]] = load double, double* [[ARRAYIDX1]], align 8
+; IS__CGSCC____-NEXT:    [[T2:%.*]] = load double, double* [[ARRAYIDX2]], align 8
+; IS__CGSCC____-NEXT:    [[T3:%.*]] = load double, double* [[ARRAYIDX3]], align 8
+; IS__CGSCC____-NEXT:    [[VECINIT0:%.*]] = insertelement <4 x double> poison, double [[T0]], i32 0
+; IS__CGSCC____-NEXT:    [[VECINIT1:%.*]] = insertelement <4 x double> [[VECINIT0]], double [[T1]], i32 1
+; IS__CGSCC____-NEXT:    [[VECINIT2:%.*]] = insertelement <4 x double> [[VECINIT1]], double [[T2]], i32 2
+; IS__CGSCC____-NEXT:    [[VECINIT3:%.*]] = insertelement <4 x double> [[VECINIT2]], double [[T3]], i32 3
+; IS__CGSCC____-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x double> [[VECINIT3]], <4 x double> [[VECINIT3]], <4 x i32> <i32 0, i32 0, i32 2, i32 2>
+; IS__CGSCC____-NEXT:    ret <4 x double> [[SHUFFLE]]
+;
+
+  ; GEP of index 0 is simplified away.
+  %arrayidx1 = getelementptr inbounds double, double* %ptr, i64 1
+  %arrayidx2 = getelementptr inbounds double, double* %ptr, i64 2
+  %arrayidx3 = getelementptr inbounds double, double* %ptr, i64 3
+
+  %t0 = load double, double* %ptr, align 8
+  %t1 = load double, double* %arrayidx1, align 8
+  %t2 = load double, double* %arrayidx2, align 8
+  %t3 = load double, double* %arrayidx3, align 8
+
+  %vecinit0 = insertelement <4 x double> poison, double %t0, i32 0
+  %vecinit1 = insertelement <4 x double> %vecinit0, double %t1, i32 1
+  %vecinit2 = insertelement <4 x double> %vecinit1, double %t2, i32 2
+  %vecinit3 = insertelement <4 x double> %vecinit2, double %t3, i32 3
+  %shuffle = shufflevector <4 x double> %vecinit3, <4 x double> %vecinit3, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
+  ret <4 x double> %shuffle
+}
+
+
+define double @PR21780_only_access3_with_inbounds(double* %ptr) {
+; IS__TUNIT____: Function Attrs: argmemonly nofree nosync nounwind readonly willreturn
+; IS__TUNIT____-LABEL: define {{[^@]+}}@PR21780_only_access3_with_inbounds
+; IS__TUNIT____-SAME: (double* nocapture nofree nonnull readonly align 8 dereferenceable(32) [[PTR:%.*]]) [[ATTR0]] {
+; IS__TUNIT____-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds double, double* [[PTR]], i64 3
+; IS__TUNIT____-NEXT:    [[T3:%.*]] = load double, double* [[ARRAYIDX3]], align 8
+; IS__TUNIT____-NEXT:    ret double [[T3]]
+;
+; IS__CGSCC____: Function Attrs: argmemonly nofree norecurse nosync nounwind readonly willreturn
+; IS__CGSCC____-LABEL: define {{[^@]+}}@PR21780_only_access3_with_inbounds
+; IS__CGSCC____-SAME: (double* nocapture nofree nonnull readonly align 8 dereferenceable(32) [[PTR:%.*]]) [[ATTR0]] {
+; IS__CGSCC____-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds double, double* [[PTR]], i64 3
+; IS__CGSCC____-NEXT:    [[T3:%.*]] = load double, double* [[ARRAYIDX3]], align 8
+; IS__CGSCC____-NEXT:    ret double [[T3]]
+;
+
+  %arrayidx3 = getelementptr inbounds double, double* %ptr, i64 3
+  %t3 = load double, double* %arrayidx3, align 8
+  ret double %t3
+}
+
+define double @PR21780_only_access3_without_inbounds(double* %ptr) {
+; IS__TUNIT____: Function Attrs: argmemonly nofree nosync nounwind readonly willreturn
+; IS__TUNIT____-LABEL: define {{[^@]+}}@PR21780_only_access3_without_inbounds
+; IS__TUNIT____-SAME: (double* nocapture nofree readonly align 8 [[PTR:%.*]]) [[ATTR0]] {
+; IS__TUNIT____-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr double, double* [[PTR]], i64 3
+; IS__TUNIT____-NEXT:    [[T3:%.*]] = load double, double* [[ARRAYIDX3]], align 8
+; IS__TUNIT____-NEXT:    ret double [[T3]]
+;
+; IS__CGSCC____: Function Attrs: argmemonly nofree norecurse nosync nounwind readonly willreturn
+; IS__CGSCC____-LABEL: define {{[^@]+}}@PR21780_only_access3_without_inbounds
+; IS__CGSCC____-SAME: (double* nocapture nofree readonly align 8 [[PTR:%.*]]) [[ATTR0]] {
+; IS__CGSCC____-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr double, double* [[PTR]], i64 3
+; IS__CGSCC____-NEXT:    [[T3:%.*]] = load double, double* [[ARRAYIDX3]], align 8
+; IS__CGSCC____-NEXT:    ret double [[T3]]
+;
+  %arrayidx3 = getelementptr double, double* %ptr, i64 3
+  %t3 = load double, double* %arrayidx3, align 8
+  ret double %t3
+}
+
+define double @PR21780_without_inbounds(double* %ptr) {
+; IS__TUNIT____: Function Attrs: argmemonly nofree nosync nounwind readonly willreturn
+; IS__TUNIT____-LABEL: define {{[^@]+}}@PR21780_without_inbounds
+; IS__TUNIT____-SAME: (double* nocapture nofree nonnull readonly align 8 dereferenceable(32) [[PTR:%.*]]) [[ATTR0]] {
+; IS__TUNIT____-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr double, double* [[PTR]], i64 3
+; IS__TUNIT____-NEXT:    [[T3:%.*]] = load double, double* [[ARRAYIDX3]], align 8
+; IS__TUNIT____-NEXT:    ret double [[T3]]
+;
+; IS__CGSCC____: Function Attrs: argmemonly nofree norecurse nosync nounwind readonly willreturn
+; IS__CGSCC____-LABEL: define {{[^@]+}}@PR21780_without_inbounds
+; IS__CGSCC____-SAME: (double* nocapture nofree nonnull readonly align 8 dereferenceable(32) [[PTR:%.*]]) [[ATTR0]] {
+; IS__CGSCC____-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr double, double* [[PTR]], i64 3
+; IS__CGSCC____-NEXT:    [[T3:%.*]] = load double, double* [[ARRAYIDX3]], align 8
+; IS__CGSCC____-NEXT:    ret double [[T3]]
+;
+
+  %arrayidx1 = getelementptr double, double* %ptr, i64 1
+  %arrayidx2 = getelementptr double, double* %ptr, i64 2
+  %arrayidx3 = getelementptr double, double* %ptr, i64 3
+
+  %t0 = load double, double* %ptr, align 8
+  %t1 = load double, double* %arrayidx1, align 8
+  %t2 = load double, double* %arrayidx2, align 8
+  %t3 = load double, double* %arrayidx3, align 8
+
+  ret double %t3
+}
+
+; Unsimplified, but still valid. Also, throw in some bogus arguments.
+
+define void @gep0(i8* %unused, i8* %other, i8* %ptr) {
+; IS__TUNIT____: Function Attrs: argmemonly nofree nosync nounwind willreturn
+; IS__TUNIT____-LABEL: define {{[^@]+}}@gep0
+; IS__TUNIT____-SAME: (i8* nocapture nofree readnone [[UNUSED:%.*]], i8* nocapture nofree nonnull writeonly dereferenceable(1) [[OTHER:%.*]], i8* nocapture nofree nonnull readonly dereferenceable(3) [[PTR:%.*]]) [[ATTR1:#.*]] {
+; IS__TUNIT____-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr i8, i8* [[PTR]], i64 2
+; IS__TUNIT____-NEXT:    [[T2:%.*]] = load i8, i8* [[ARRAYIDX2]], align 1
+; IS__TUNIT____-NEXT:    store i8 [[T2]], i8* [[OTHER]], align 1
+; IS__TUNIT____-NEXT:    ret void
+;
+; IS__CGSCC____: Function Attrs: argmemonly nofree norecurse nosync nounwind willreturn
+; IS__CGSCC____-LABEL: define {{[^@]+}}@gep0
+; IS__CGSCC____-SAME: (i8* nocapture nofree readnone [[UNUSED:%.*]], i8* nocapture nofree nonnull writeonly dereferenceable(1) [[OTHER:%.*]], i8* nocapture nofree nonnull readonly dereferenceable(3) [[PTR:%.*]]) [[ATTR1:#.*]] {
+; IS__CGSCC____-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr i8, i8* [[PTR]], i64 2
+; IS__CGSCC____-NEXT:    [[T2:%.*]] = load i8, i8* [[ARRAYIDX2]], align 1
+; IS__CGSCC____-NEXT:    store i8 [[T2]], i8* [[OTHER]], align 1
+; IS__CGSCC____-NEXT:    ret void
+;
+  %arrayidx0 = getelementptr i8, i8* %ptr, i64 0
+  %arrayidx1 = getelementptr i8, i8* %ptr, i64 1
+  %arrayidx2 = getelementptr i8, i8* %ptr, i64 2
+  %t0 = load i8, i8* %arrayidx0
+  %t1 = load i8, i8* %arrayidx1
+  %t2 = load i8, i8* %arrayidx2
+  store i8 %t2, i8* %other
+  ret void
+}
+
+; Order of accesses does not change computation.
+; Multiple arguments may be dereferenceable.
+
+define void @ordering(i8* %ptr1, i32* %ptr2) {
+; IS__TUNIT____: Function Attrs: nofree nosync nounwind readnone willreturn
+; IS__TUNIT____-LABEL: define {{[^@]+}}@ordering
+; IS__TUNIT____-SAME: (i8* nocapture nofree nonnull readnone dereferenceable(3) [[PTR1:%.*]], i32* nocapture nofree nonnull readnone align 4 dereferenceable(8) [[PTR2:%.*]]) [[ATTR2:#.*]] {
+; IS__TUNIT____-NEXT:    ret void
+;
+; IS__CGSCC____: Function Attrs: nofree norecurse nosync nounwind readnone willreturn
+; IS__CGSCC____-LABEL: define {{[^@]+}}@ordering
+; IS__CGSCC____-SAME: (i8* nocapture nofree nonnull readnone dereferenceable(3) [[PTR1:%.*]], i32* nocapture nofree nonnull readnone align 4 dereferenceable(8) [[PTR2:%.*]]) [[ATTR2:#.*]] {
+; IS__CGSCC____-NEXT:    ret void
+;
+  %a20 = getelementptr i32, i32* %ptr2, i64 0
+  %a12 = getelementptr i8, i8* %ptr1, i64 2
+  %t12 = load i8, i8* %a12
+  %a11 = getelementptr i8, i8* %ptr1, i64 1
+  %t20 = load i32, i32* %a20
+  %a10 = getelementptr i8, i8* %ptr1, i64 0
+  %t10 = load i8, i8* %a10
+  %t11 = load i8, i8* %a11
+  %a21 = getelementptr i32, i32* %ptr2, i64 1
+  %t21 = load i32, i32* %a21
+  ret void
+}
+
+; Not in entry block.
+
+define void @not_entry_but_guaranteed_to_execute(i8* %ptr) {
+; IS__TUNIT____: Function Attrs: nofree nosync nounwind readnone willreturn
+; IS__TUNIT____-LABEL: define {{[^@]+}}@not_entry_but_guaranteed_to_execute
+; IS__TUNIT____-SAME: (i8* nocapture nofree nonnull readnone dereferenceable(3) [[PTR:%.*]]) [[ATTR2]] {
+; IS__TUNIT____-NEXT:  entry:
+; IS__TUNIT____-NEXT:    br label [[EXIT:%.*]]
+; IS__TUNIT____:       exit:
+; IS__TUNIT____-NEXT:    ret void
+;
+; IS__CGSCC____: Function Attrs: nofree norecurse nosync nounwind readnone willreturn
+; IS__CGSCC____-LABEL: define {{[^@]+}}@not_entry_but_guaranteed_to_execute
+; IS__CGSCC____-SAME: (i8* nocapture nofree nonnull readnone dereferenceable(3) [[PTR:%.*]]) [[ATTR2]] {
+; IS__CGSCC____-NEXT:  entry:
+; IS__CGSCC____-NEXT:    br label [[EXIT:%.*]]
+; IS__CGSCC____:       exit:
+; IS__CGSCC____-NEXT:    ret void
+;
+entry:
+  br label %exit
+exit:
+  %arrayidx0 = getelementptr i8, i8* %ptr, i64 0
+  %arrayidx1 = getelementptr i8, i8* %ptr, i64 1
+  %arrayidx2 = getelementptr i8, i8* %ptr, i64 2
+  %t0 = load i8, i8* %arrayidx0
+  %t1 = load i8, i8* %arrayidx1
+  %t2 = load i8, i8* %arrayidx2
+  ret void
+}
+
+; Not in entry block and not guaranteed to execute.
+
+define void @not_entry_not_guaranteed_to_execute(i8* %ptr, i1 %cond) {
+; IS__TUNIT____: Function Attrs: nofree nosync nounwind readnone willreturn
+; IS__TUNIT____-LABEL: define {{[^@]+}}@not_entry_not_guaranteed_to_execute
+; IS__TUNIT____-SAME: (i8* nocapture nofree readnone [[PTR:%.*]], i1 [[COND:%.*]]) [[ATTR2]] {
+; IS__TUNIT____-NEXT:  entry:
+; IS__TUNIT____-NEXT:    br i1 [[COND]], label [[LOADS:%.*]], label [[EXIT:%.*]]
+; IS__TUNIT____:       loads:
+; IS__TUNIT____-NEXT:    ret void
+; IS__TUNIT____:       exit:
+; IS__TUNIT____-NEXT:    ret void
+;
+; IS__CGSCC____: Function Attrs: nofree norecurse nosync nounwind readnone willreturn
+; IS__CGSCC____-LABEL: define {{[^@]+}}@not_entry_not_guaranteed_to_execute
+; IS__CGSCC____-SAME: (i8* nocapture nofree readnone [[PTR:%.*]], i1 [[COND:%.*]]) [[ATTR2]] {
+; IS__CGSCC____-NEXT:  entry:
+; IS__CGSCC____-NEXT:    br i1 [[COND]], label [[LOADS:%.*]], label [[EXIT:%.*]]
+; IS__CGSCC____:       loads:
+; IS__CGSCC____-NEXT:    ret void
+; IS__CGSCC____:       exit:
+; IS__CGSCC____-NEXT:    ret void
+;
+entry:
+  br i1 %cond, label %loads, label %exit
+loads:
+  %arrayidx0 = getelementptr i8, i8* %ptr, i64 0
+  %arrayidx1 = getelementptr i8, i8* %ptr, i64 1
+  %arrayidx2 = getelementptr i8, i8* %ptr, i64 2
+  %t0 = load i8, i8* %arrayidx0
+  %t1 = load i8, i8* %arrayidx1
+  %t2 = load i8, i8* %arrayidx2
+  ret void
+exit:
+  ret void
+}
+
+; The last load may not execute, so derefenceable bytes only covers the 1st two loads.
+
+define void @partial_in_entry(i16* %ptr, i1 %cond) {
+; IS__TUNIT____: Function Attrs: nofree nosync nounwind readnone willreturn
+; IS__TUNIT____-LABEL: define {{[^@]+}}@partial_in_entry
+; IS__TUNIT____-SAME: (i16* nocapture nofree nonnull readnone align 2 dereferenceable(4) [[PTR:%.*]], i1 [[COND:%.*]]) [[ATTR2]] {
+; IS__TUNIT____-NEXT:  entry:
+; IS__TUNIT____-NEXT:    br i1 [[COND]], label [[LOADS:%.*]], label [[EXIT:%.*]]
+; IS__TUNIT____:       loads:
+; IS__TUNIT____-NEXT:    ret void
+; IS__TUNIT____:       exit:
+; IS__TUNIT____-NEXT:    ret void
+;
+; IS__CGSCC____: Function Attrs: nofree norecurse nosync nounwind readnone willreturn
+; IS__CGSCC____-LABEL: define {{[^@]+}}@partial_in_entry
+; IS__CGSCC____-SAME: (i16* nocapture nofree nonnull readnone align 2 dereferenceable(4) [[PTR:%.*]], i1 [[COND:%.*]]) [[ATTR2]] {
+; IS__CGSCC____-NEXT:  entry:
+; IS__CGSCC____-NEXT:    br i1 [[COND]], label [[LOADS:%.*]], label [[EXIT:%.*]]
+; IS__CGSCC____:       loads:
+; IS__CGSCC____-NEXT:    ret void
+; IS__CGSCC____:       exit:
+; IS__CGSCC____-NEXT:    ret void
+;
+entry:
+  %arrayidx0 = getelementptr i16, i16* %ptr, i64 0
+  %arrayidx1 = getelementptr i16, i16* %ptr, i64 1
+  %arrayidx2 = getelementptr i16, i16* %ptr, i64 2
+  %t0 = load i16, i16* %arrayidx0
+  %t1 = load i16, i16* %arrayidx1
+  br i1 %cond, label %loads, label %exit
+loads:
+  %t2 = load i16, i16* %arrayidx2
+  ret void
+exit:
+  ret void
+}
+
+; The volatile load can't be used to prove a non-volatile access is allowed.
+; The 2nd and 3rd loads may never execute.
+
+define void @volatile_is_not_dereferenceable(i16* %ptr) {
+; IS__TUNIT____: Function Attrs: argmemonly nofree nounwind willreturn
+; IS__TUNIT____-LABEL: define {{[^@]+}}@volatile_is_not_dereferenceable
+; IS__TUNIT____-SAME: (i16* nofree align 2 [[PTR:%.*]]) [[ATTR3:#.*]] {
+; IS__TUNIT____-NEXT:    [[ARRAYIDX0:%.*]] = getelementptr i16, i16* [[PTR]], i64 0
+; IS__TUNIT____-NEXT:    [[T0:%.*]] = load volatile i16, i16* [[ARRAYIDX0]], align 2
+; IS__TUNIT____-NEXT:    ret void
+;
+; IS__CGSCC____: Function Attrs: argmemonly nofree norecurse nounwind willreturn
+; IS__CGSCC____-LABEL: define {{[^@]+}}@volatile_is_not_dereferenceable
+; IS__CGSCC____-SAME: (i16* nofree align 2 [[PTR:%.*]]) [[ATTR3:#.*]] {
+; IS__CGSCC____-NEXT:    [[ARRAYIDX0:%.*]] = getelementptr i16, i16* [[PTR]], i64 0
+; IS__CGSCC____-NEXT:    [[T0:%.*]] = load volatile i16, i16* [[ARRAYIDX0]], align 2
+; IS__CGSCC____-NEXT:    ret void
+;
+  %arrayidx0 = getelementptr i16, i16* %ptr, i64 0
+  %arrayidx1 = getelementptr i16, i16* %ptr, i64 1
+  %arrayidx2 = getelementptr i16, i16* %ptr, i64 2
+  %t0 = load volatile i16, i16* %arrayidx0
+  %t1 = load i16, i16* %arrayidx1
+  %t2 = load i16, i16* %arrayidx2
+  ret void
+}
+
+; TODO: We should allow inference for atomic (but not volatile) ops.
+
+define void @atomic_is_alright(i16* %ptr) {
+; IS__TUNIT____: Function Attrs: nofree nosync nounwind readnone willreturn
+; IS__TUNIT____-LABEL: define {{[^@]+}}@atomic_is_alright
+; IS__TUNIT____-SAME: (i16* nocapture nofree nonnull readnone align 2 dereferenceable(6) [[PTR:%.*]]) [[ATTR2]] {
+; IS__TUNIT____-NEXT:    ret void
+;
+; IS__CGSCC____: Function Attrs: nofree norecurse nosync nounwind readnone willreturn
+; IS__CGSCC____-LABEL: define {{[^@]+}}@atomic_is_alright
+; IS__CGSCC____-SAME: (i16* nocapture nofree nonnull readnone align 2 dereferenceable(6) [[PTR:%.*]]) [[ATTR2]] {
+; IS__CGSCC____-NEXT:    ret void
+;
+  %arrayidx0 = getelementptr i16, i16* %ptr, i64 0
+  %arrayidx1 = getelementptr i16, i16* %ptr, i64 1
+  %arrayidx2 = getelementptr i16, i16* %ptr, i64 2
+  %t0 = load atomic i16, i16* %arrayidx0 unordered, align 2
+  %t1 = load i16, i16* %arrayidx1
+  %t2 = load i16, i16* %arrayidx2
+  ret void
+}
+
+declare void @may_not_return()
+
+define void @not_guaranteed_to_transfer_execution(i16* %ptr) {
+; CHECK-LABEL: define {{[^@]+}}@not_guaranteed_to_transfer_execution
+; CHECK-SAME: (i16* nocapture nofree nonnull readnone align 2 dereferenceable(2) [[PTR:%.*]]) {
+; CHECK-NEXT:    call void @may_not_return()
+; CHECK-NEXT:    ret void
+;
+  %arrayidx0 = getelementptr i16, i16* %ptr, i64 0
+  %arrayidx1 = getelementptr i16, i16* %ptr, i64 1
+  %arrayidx2 = getelementptr i16, i16* %ptr, i64 2
+  %t0 = load i16, i16* %arrayidx0
+  call void @may_not_return()
+  %t1 = load i16, i16* %arrayidx1
+  %t2 = load i16, i16* %arrayidx2
+  ret void
+}
+
+; We must have consecutive accesses.
+
+define void @variable_gep_index(i8* %unused, i8* %ptr, i64 %variable_index) {
+; IS__TUNIT____: Function Attrs: nofree nosync nounwind readnone willreturn
+; IS__TUNIT____-LABEL: define {{[^@]+}}@variable_gep_index
+; IS__TUNIT____-SAME: (i8* nocapture nofree readnone [[UNUSED:%.*]], i8* nocapture nofree nonnull readnone dereferenceable(1) [[PTR:%.*]], i64 [[VARIABLE_INDEX:%.*]]) [[ATTR2]] {
+; IS__TUNIT____-NEXT:    ret void
+;
+; IS__CGSCC____: Function Attrs: nofree norecurse nosync nounwind readnone willreturn
+; IS__CGSCC____-LABEL: define {{[^@]+}}@variable_gep_index
+; IS__CGSCC____-SAME: (i8* nocapture nofree readnone [[UNUSED:%.*]], i8* nocapture nofree nonnull readnone dereferenceable(1) [[PTR:%.*]], i64 [[VARIABLE_INDEX:%.*]]) [[ATTR2]] {
+; IS__CGSCC____-NEXT:    ret void
+;
+  %arrayidx1 = getelementptr i8, i8* %ptr, i64 %variable_index
+  %arrayidx2 = getelementptr i8, i8* %ptr, i64 2
+  %t0 = load i8, i8* %ptr
+  %t1 = load i8, i8* %arrayidx1
+  %t2 = load i8, i8* %arrayidx2
+  ret void
+}
+
+; Deal with >1 GEP index.
+
+define void @multi_index_gep(<4 x i8>* %ptr) {
+; FIXME: %ptr should be dereferenceable(4)
+; IS__TUNIT____: Function Attrs: nofree nosync nounwind readnone willreturn
+; IS__TUNIT____-LABEL: define {{[^@]+}}@multi_index_gep
+; IS__TUNIT____-SAME: (<4 x i8>* nocapture nofree nonnull readnone dereferenceable(1) [[PTR:%.*]]) [[ATTR2]] {
+; IS__TUNIT____-NEXT:    ret void
+;
+; IS__CGSCC____: Function Attrs: nofree norecurse nosync nounwind readnone willreturn
+; IS__CGSCC____-LABEL: define {{[^@]+}}@multi_index_gep
+; IS__CGSCC____-SAME: (<4 x i8>* nocapture nofree nonnull readnone dereferenceable(1) [[PTR:%.*]]) [[ATTR2]] {
+; IS__CGSCC____-NEXT:    ret void
+;
+  %arrayidx00 = getelementptr <4 x i8>, <4 x i8>* %ptr, i64 0, i64 0
+  %t0 = load i8, i8* %arrayidx00
+  ret void
+}
+
+; Could round weird bitwidths down?
+
+define void @not_byte_multiple(i9* %ptr) {
+; IS__TUNIT____: Function Attrs: nofree nosync nounwind readnone willreturn
+; IS__TUNIT____-LABEL: define {{[^@]+}}@not_byte_multiple
+; IS__TUNIT____-SAME: (i9* nocapture nofree nonnull readnone align 2 dereferenceable(2) [[PTR:%.*]]) [[ATTR2]] {
+; IS__TUNIT____-NEXT:    ret void
+;
+; IS__CGSCC____: Function Attrs: nofree norecurse nosync nounwind readnone willreturn
+; IS__CGSCC____-LABEL: define {{[^@]+}}@not_byte_multiple
+; IS__CGSCC____-SAME: (i9* nocapture nofree nonnull readnone align 2 dereferenceable(2) [[PTR:%.*]]) [[ATTR2]] {
+; IS__CGSCC____-NEXT:    ret void
+;
+  %arrayidx0 = getelementptr i9, i9* %ptr, i64 0
+  %t0 = load i9, i9* %arrayidx0
+  ret void
+}
+
+; Missing direct access from the pointer.
+
+define void @no_pointer_deref(i16* %ptr) {
+; IS__TUNIT____: Function Attrs: nofree nosync nounwind readnone willreturn
+; IS__TUNIT____-LABEL: define {{[^@]+}}@no_pointer_deref
+; IS__TUNIT____-SAME: (i16* nocapture nofree readnone align 2 [[PTR:%.*]]) [[ATTR2]] {
+; IS__TUNIT____-NEXT:    ret void
+;
+; IS__CGSCC____: Function Attrs: nofree norecurse nosync nounwind readnone willreturn
+; IS__CGSCC____-LABEL: define {{[^@]+}}@no_pointer_deref
+; IS__CGSCC____-SAME: (i16* nocapture nofree readnone align 2 [[PTR:%.*]]) [[ATTR2]] {
+; IS__CGSCC____-NEXT:    ret void
+;
+  %arrayidx1 = getelementptr i16, i16* %ptr, i64 1
+  %arrayidx2 = getelementptr i16, i16* %ptr, i64 2
+  %t1 = load i16, i16* %arrayidx1
+  %t2 = load i16, i16* %arrayidx2
+  ret void
+}
+
+; Out-of-order is ok, but missing access concludes dereferenceable range.
+
+define void @non_consecutive(i32* %ptr) {
+; IS__TUNIT____: Function Attrs: nofree nosync nounwind readnone willreturn
+; IS__TUNIT____-LABEL: define {{[^@]+}}@non_consecutive
+; IS__TUNIT____-SAME: (i32* nocapture nofree nonnull readnone align 4 dereferenceable(8) [[PTR:%.*]]) [[ATTR2]] {
+; IS__TUNIT____-NEXT:    ret void
+;
+; IS__CGSCC____: Function Attrs: nofree norecurse nosync nounwind readnone willreturn
+; IS__CGSCC____-LABEL: define {{[^@]+}}@non_consecutive
+; IS__CGSCC____-SAME: (i32* nocapture nofree nonnull readnone align 4 dereferenceable(8) [[PTR:%.*]]) [[ATTR2]] {
+; IS__CGSCC____-NEXT:    ret void
+;
+  %arrayidx1 = getelementptr i32, i32* %ptr, i64 1
+  %arrayidx0 = getelementptr i32, i32* %ptr, i64 0
+  %arrayidx3 = getelementptr i32, i32* %ptr, i64 3
+  %t1 = load i32, i32* %arrayidx1
+  %t0 = load i32, i32* %arrayidx0
+  %t3 = load i32, i32* %arrayidx3
+  ret void
+}
+
+; Improve on existing dereferenceable attribute.
+
+define void @more_bytes(i32* dereferenceable(8) %ptr) {
+; IS__TUNIT____: Function Attrs: nofree nosync nounwind readnone willreturn
+; IS__TUNIT____-LABEL: define {{[^@]+}}@more_bytes
+; IS__TUNIT____-SAME: (i32* nocapture nofree nonnull readnone align 4 dereferenceable(16) [[PTR:%.*]]) [[ATTR2]] {
+; IS__TUNIT____-NEXT:    ret void
+;
+; IS__CGSCC____: Function Attrs: nofree norecurse nosync nounwind readnone willreturn
+; IS__CGSCC____-LABEL: define {{[^@]+}}@more_bytes
+; IS__CGSCC____-SAME: (i32* nocapture nofree nonnull readnone align 4 dereferenceable(16) [[PTR:%.*]]) [[ATTR2]] {
+; IS__CGSCC____-NEXT:    ret void
+;
+  %arrayidx3 = getelementptr i32, i32* %ptr, i64 3
+  %arrayidx1 = getelementptr i32, i32* %ptr, i64 1
+  %arrayidx0 = getelementptr i32, i32* %ptr, i64 0
+  %arrayidx2 = getelementptr i32, i32* %ptr, i64 2
+  %t3 = load i32, i32* %arrayidx3
+  %t1 = load i32, i32* %arrayidx1
+  %t2 = load i32, i32* %arrayidx2
+  %t0 = load i32, i32* %arrayidx0
+  ret void
+}
+
+; Improve on existing dereferenceable_or_null attribute.
+
+define void @more_bytes_and_not_null(i32* dereferenceable_or_null(8) %ptr) {
+; IS__TUNIT____: Function Attrs: nofree nosync nounwind readnone willreturn
+; IS__TUNIT____-LABEL: define {{[^@]+}}@more_bytes_and_not_null
+; IS__TUNIT____-SAME: (i32* nocapture nofree nonnull readnone align 4 dereferenceable(16) [[PTR:%.*]]) [[ATTR2]] {
+; IS__TUNIT____-NEXT:    ret void
+;
+; IS__CGSCC____: Function Attrs: nofree norecurse nosync nounwind readnone willreturn
+; IS__CGSCC____-LABEL: define {{[^@]+}}@more_bytes_and_not_null
+; IS__CGSCC____-SAME: (i32* nocapture nofree nonnull readnone align 4 dereferenceable(16) [[PTR:%.*]]) [[ATTR2]] {
+; IS__CGSCC____-NEXT:    ret void
+;
+  %arrayidx3 = getelementptr i32, i32* %ptr, i64 3
+  %arrayidx1 = getelementptr i32, i32* %ptr, i64 1
+  %arrayidx0 = getelementptr i32, i32* %ptr, i64 0
+  %arrayidx2 = getelementptr i32, i32* %ptr, i64 2
+  %t3 = load i32, i32* %arrayidx3
+  %t1 = load i32, i32* %arrayidx1
+  %t2 = load i32, i32* %arrayidx2
+  %t0 = load i32, i32* %arrayidx0
+  ret void
+}
+
+; But don't pessimize existing dereferenceable attribute.
+
+define void @better_bytes(i32* dereferenceable(100) %ptr) {
+; IS__TUNIT____: Function Attrs: nofree nosync nounwind readnone willreturn
+; IS__TUNIT____-LABEL: define {{[^@]+}}@better_bytes
+; IS__TUNIT____-SAME: (i32* nocapture nofree nonnull readnone align 4 dereferenceable(100) [[PTR:%.*]]) [[ATTR2]] {
+; IS__TUNIT____-NEXT:    ret void
+;
+; IS__CGSCC____: Function Attrs: nofree norecurse nosync nounwind readnone willreturn
+; IS__CGSCC____-LABEL: define {{[^@]+}}@better_bytes
+; IS__CGSCC____-SAME: (i32* nocapture nofree nonnull readnone align 4 dereferenceable(100) [[PTR:%.*]]) [[ATTR2]] {
+; IS__CGSCC____-NEXT:    ret void
+;
+  %arrayidx3 = getelementptr i32, i32* %ptr, i64 3
+  %arrayidx1 = getelementptr i32, i32* %ptr, i64 1
+  %arrayidx0 = getelementptr i32, i32* %ptr, i64 0
+  %arrayidx2 = getelementptr i32, i32* %ptr, i64 2
+  %t3 = load i32, i32* %arrayidx3
+  %t1 = load i32, i32* %arrayidx1
+  %t2 = load i32, i32* %arrayidx2
+  %t0 = load i32, i32* %arrayidx0
+  ret void
+}
+
+define void @bitcast(i32* %arg) {
+; IS__TUNIT____: Function Attrs: nofree nosync nounwind readnone willreturn
+; IS__TUNIT____-LABEL: define {{[^@]+}}@bitcast
+; IS__TUNIT____-SAME: (i32* nocapture nofree nonnull readnone align 4 dereferenceable(8) [[ARG:%.*]]) [[ATTR2]] {
+; IS__TUNIT____-NEXT:    ret void
+;
+; IS__CGSCC____: Function Attrs: nofree norecurse nosync nounwind readnone willreturn
+; IS__CGSCC____-LABEL: define {{[^@]+}}@bitcast
+; IS__CGSCC____-SAME: (i32* nocapture nofree nonnull readnone align 4 dereferenceable(8) [[ARG:%.*]]) [[ATTR2]] {
+; IS__CGSCC____-NEXT:    ret void
+;
+  %ptr = bitcast i32* %arg to float*
+  %arrayidx0 = getelementptr float, float* %ptr, i64 0
+  %arrayidx1 = getelementptr float, float* %ptr, i64 1
+  %t0 = load float, float* %arrayidx0
+  %t1 = load float, float* %arrayidx1
+  ret void
+}
+
+define void @bitcast_different_sizes(double* %arg1, i8* %arg2) {
+; IS__TUNIT____: Function Attrs: nofree nosync nounwind readnone willreturn
+; IS__TUNIT____-LABEL: define {{[^@]+}}@bitcast_different_sizes
+; IS__TUNIT____-SAME: (double* nocapture nofree nonnull readnone align 4 dereferenceable(12) [[ARG1:%.*]], i8* nocapture nofree nonnull readnone align 4 dereferenceable(16) [[ARG2:%.*]]) [[ATTR2]] {
+; IS__TUNIT____-NEXT:    ret void
+;
+; IS__CGSCC____: Function Attrs: nofree norecurse nosync nounwind readnone willreturn
+; IS__CGSCC____-LABEL: define {{[^@]+}}@bitcast_different_sizes
+; IS__CGSCC____-SAME: (double* nocapture nofree nonnull readnone align 4 dereferenceable(12) [[ARG1:%.*]], i8* nocapture nofree nonnull readnone align 4 dereferenceable(16) [[ARG2:%.*]]) [[ATTR2]] {
+; IS__CGSCC____-NEXT:    ret void
+;
+  %ptr1 = bitcast double* %arg1 to float*
+  %a10 = getelementptr float, float* %ptr1, i64 0
+  %a11 = getelementptr float, float* %ptr1, i64 1
+  %a12 = getelementptr float, float* %ptr1, i64 2
+  %ld10 = load float, float* %a10
+  %ld11 = load float, float* %a11
+  %ld12 = load float, float* %a12
+
+  %ptr2 = bitcast i8* %arg2 to i64*
+  %a20 = getelementptr i64, i64* %ptr2, i64 0
+  %a21 = getelementptr i64, i64* %ptr2, i64 1
+  %ld20 = load i64, i64* %a20
+  %ld21 = load i64, i64* %a21
+  ret void
+}
+
+define void @negative_offset(i32* %arg) {
+; IS__TUNIT____: Function Attrs: nofree nosync nounwind readnone willreturn
+; IS__TUNIT____-LABEL: define {{[^@]+}}@negative_offset
+; IS__TUNIT____-SAME: (i32* nocapture nofree nonnull readnone align 4 dereferenceable(4) [[ARG:%.*]]) [[ATTR2]] {
+; IS__TUNIT____-NEXT:    ret void
+;
+; IS__CGSCC____: Function Attrs: nofree norecurse nosync nounwind readnone willreturn
+; IS__CGSCC____-LABEL: define {{[^@]+}}@negative_offset
+; IS__CGSCC____-SAME: (i32* nocapture nofree nonnull readnone align 4 dereferenceable(4) [[ARG:%.*]]) [[ATTR2]] {
+; IS__CGSCC____-NEXT:    ret void
+;
+  %ptr = bitcast i32* %arg to float*
+  %arrayidx0 = getelementptr float, float* %ptr, i64 0
+  %arrayidx1 = getelementptr float, float* %ptr, i64 -1
+  %t0 = load float, float* %arrayidx0
+  %t1 = load float, float* %arrayidx1
+  ret void
+}
+
+define void @stores(i32* %arg) {
+; IS__TUNIT____: Function Attrs: argmemonly nofree nosync nounwind willreturn writeonly
+; IS__TUNIT____-LABEL: define {{[^@]+}}@stores
+; IS__TUNIT____-SAME: (i32* nocapture nofree nonnull writeonly align 4 dereferenceable(8) [[ARG:%.*]]) [[ATTR4:#.*]] {
+; IS__TUNIT____-NEXT:    [[PTR:%.*]] = bitcast i32* [[ARG]] to float*
+; IS__TUNIT____-NEXT:    [[ARRAYIDX0:%.*]] = getelementptr float, float* [[PTR]], i64 0
+; IS__TUNIT____-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr float, float* [[PTR]], i64 1
+; IS__TUNIT____-NEXT:    store float 1.000000e+00, float* [[ARRAYIDX0]], align 4
+; IS__TUNIT____-NEXT:    store float 2.000000e+00, float* [[ARRAYIDX1]], align 4
+; IS__TUNIT____-NEXT:    ret void
+;
+; IS__CGSCC____: Function Attrs: argmemonly nofree norecurse nosync nounwind willreturn writeonly
+; IS__CGSCC____-LABEL: define {{[^@]+}}@stores
+; IS__CGSCC____-SAME: (i32* nocapture nofree nonnull writeonly align 4 dereferenceable(8) [[ARG:%.*]]) [[ATTR4:#.*]] {
+; IS__CGSCC____-NEXT:    [[PTR:%.*]] = bitcast i32* [[ARG]] to float*
+; IS__CGSCC____-NEXT:    [[ARRAYIDX0:%.*]] = getelementptr float, float* [[PTR]], i64 0
+; IS__CGSCC____-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr float, float* [[PTR]], i64 1
+; IS__CGSCC____-NEXT:    store float 1.000000e+00, float* [[ARRAYIDX0]], align 4
+; IS__CGSCC____-NEXT:    store float 2.000000e+00, float* [[ARRAYIDX1]], align 4
+; IS__CGSCC____-NEXT:    ret void
+;
+  %ptr = bitcast i32* %arg to float*
+  %arrayidx0 = getelementptr float, float* %ptr, i64 0
+  %arrayidx1 = getelementptr float, float* %ptr, i64 1
+  store float 1.0, float* %arrayidx0
+  store float 2.0, float* %arrayidx1
+  ret void
+}
+
+define void @load_store(i32* %arg) {
+; IS__TUNIT____: Function Attrs: argmemonly nofree nosync nounwind willreturn writeonly
+; IS__TUNIT____-LABEL: define {{[^@]+}}@load_store
+; IS__TUNIT____-SAME: (i32* nocapture nofree nonnull writeonly align 4 dereferenceable(8) [[ARG:%.*]]) [[ATTR4]] {
+; IS__TUNIT____-NEXT:    [[PTR:%.*]] = bitcast i32* [[ARG]] to float*
+; IS__TUNIT____-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr float, float* [[PTR]], i64 1
+; IS__TUNIT____-NEXT:    store float 2.000000e+00, float* [[ARRAYIDX1]], align 4
+; IS__TUNIT____-NEXT:    ret void
+;
+; IS__CGSCC____: Function Attrs: argmemonly nofree norecurse nosync nounwind willreturn writeonly
+; IS__CGSCC____-LABEL: define {{[^@]+}}@load_store
+; IS__CGSCC____-SAME: (i32* nocapture nofree nonnull writeonly align 4 dereferenceable(8) [[ARG:%.*]]) [[ATTR4]] {
+; IS__CGSCC____-NEXT:    [[PTR:%.*]] = bitcast i32* [[ARG]] to float*
+; IS__CGSCC____-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr float, float* [[PTR]], i64 1
+; IS__CGSCC____-NEXT:    store float 2.000000e+00, float* [[ARRAYIDX1]], align 4
+; IS__CGSCC____-NEXT:    ret void
+;
+  %ptr = bitcast i32* %arg to float*
+  %arrayidx0 = getelementptr float, float* %ptr, i64 0
+  %arrayidx1 = getelementptr float, float* %ptr, i64 1
+  %t1 = load float, float* %arrayidx0
+  store float 2.0, float* %arrayidx1
+  ret void
+}
+
+define void @different_size1(i32* %arg) {
+; IS__TUNIT____: Function Attrs: argmemonly nofree nosync nounwind willreturn writeonly
+; IS__TUNIT____-LABEL: define {{[^@]+}}@different_size1
+; IS__TUNIT____-SAME: (i32* nocapture nofree nonnull writeonly align 8 dereferenceable(8) [[ARG:%.*]]) [[ATTR4]] {
+; IS__TUNIT____-NEXT:    [[ARG_CAST:%.*]] = bitcast i32* [[ARG]] to double*
+; IS__TUNIT____-NEXT:    store double 0.000000e+00, double* [[ARG_CAST]], align 8
+; IS__TUNIT____-NEXT:    store i32 0, i32* [[ARG]], align 8
+; IS__TUNIT____-NEXT:    ret void
+;
+; IS__CGSCC____: Function Attrs: argmemonly nofree norecurse nosync nounwind willreturn writeonly
+; IS__CGSCC____-LABEL: define {{[^@]+}}@different_size1
+; IS__CGSCC____-SAME: (i32* nocapture nofree nonnull writeonly align 8 dereferenceable(8) [[ARG:%.*]]) [[ATTR4]] {
+; IS__CGSCC____-NEXT:    [[ARG_CAST:%.*]] = bitcast i32* [[ARG]] to double*
+; IS__CGSCC____-NEXT:    store double 0.000000e+00, double* [[ARG_CAST]], align 8
+; IS__CGSCC____-NEXT:    store i32 0, i32* [[ARG]], align 8
+; IS__CGSCC____-NEXT:    ret void
+;
+  %arg-cast = bitcast i32* %arg to double*
+  store double 0.000000e+00, double* %arg-cast
+  store i32 0, i32* %arg
+  ret void
+}
+
+define void @different_size2(i32* %arg) {
+; IS__TUNIT____: Function Attrs: argmemonly nofree nosync nounwind willreturn writeonly
+; IS__TUNIT____-LABEL: define {{[^@]+}}@different_size2
+; IS__TUNIT____-SAME: (i32* nocapture nofree nonnull writeonly align 8 dereferenceable(8) [[ARG:%.*]]) [[ATTR4]] {
+; IS__TUNIT____-NEXT:    store i32 0, i32* [[ARG]], align 8
+; IS__TUNIT____-NEXT:    [[ARG_CAST:%.*]] = bitcast i32* [[ARG]] to double*
+; IS__TUNIT____-NEXT:    store double 0.000000e+00, double* [[ARG_CAST]], align 8
+; IS__TUNIT____-NEXT:    ret void
+;
+; IS__CGSCC____: Function Attrs: argmemonly nofree norecurse nosync nounwind willreturn writeonly
+; IS__CGSCC____-LABEL: define {{[^@]+}}@different_size2
+; IS__CGSCC____-SAME: (i32* nocapture nofree nonnull writeonly align 8 dereferenceable(8) [[ARG:%.*]]) [[ATTR4]] {
+; IS__CGSCC____-NEXT:    store i32 0, i32* [[ARG]], align 8
+; IS__CGSCC____-NEXT:    [[ARG_CAST:%.*]] = bitcast i32* [[ARG]] to double*
+; IS__CGSCC____-NEXT:    store double 0.000000e+00, double* [[ARG_CAST]], align 8
+; IS__CGSCC____-NEXT:    ret void
+;
+  store i32 0, i32* %arg
+  %arg-cast = bitcast i32* %arg to double*
+  store double 0.000000e+00, double* %arg-cast
+  ret void
+}
+
+; Make use of MustBeExecuted Explorer
+;
+; [CFG]
+; entry
+;  / \
+; l1 l2
+; | X |
+; l3 l4
+;  \ /
+;  l5
+;  / \
+; l6 l7
+;  \ /
+;  end
+; According to the above CFG, we can see that instructions in l5 Block must be executed.
+; Therefore, %p must be dereferenced.
+;
+; ATTRIBUTOR_CGSCC_NPM-LABEL: define i32 @require_cfg_analysis(i32 %c, i32* {{.*}} dereferenceable(4) %p)
+define i32 @require_cfg_analysis(i32 %c, i32* %p) {
+; IS__TUNIT_OPM: Function Attrs: argmemonly nofree nosync nounwind willreturn writeonly
+; IS__TUNIT_OPM-LABEL: define {{[^@]+}}@require_cfg_analysis
+; IS__TUNIT_OPM-SAME: (i32 [[C:%.*]], i32* nocapture nofree writeonly [[P:%.*]]) [[ATTR4:#.*]] {
+; IS__TUNIT_OPM-NEXT:    [[TOBOOL1:%.*]] = icmp eq i32 [[C]], 0
+; IS__TUNIT_OPM-NEXT:    br i1 [[TOBOOL1]], label [[L1:%.*]], label [[L2:%.*]]
+; IS__TUNIT_OPM:       l1:
+; IS__TUNIT_OPM-NEXT:    [[TOBOOL2:%.*]] = icmp eq i32 [[C]], 1
+; IS__TUNIT_OPM-NEXT:    br i1 [[TOBOOL2]], label [[L3:%.*]], label [[L4:%.*]]
+; IS__TUNIT_OPM:       l2:
+; IS__TUNIT_OPM-NEXT:    [[TOBOOL3:%.*]] = icmp eq i32 [[C]], 2
+; IS__TUNIT_OPM-NEXT:    br i1 [[TOBOOL3]], label [[L3]], label [[L4]]
+; IS__TUNIT_OPM:       l3:
+; IS__TUNIT_OPM-NEXT:    br label [[L5:%.*]]
+; IS__TUNIT_OPM:       l4:
+; IS__TUNIT_OPM-NEXT:    br label [[L5]]
+; IS__TUNIT_OPM:       l5:
+; IS__TUNIT_OPM-NEXT:    [[TOBOOL4:%.*]] = icmp eq i32 [[C]], 4
+; IS__TUNIT_OPM-NEXT:    br i1 [[TOBOOL4]], label [[L6:%.*]], label [[L7:%.*]]
+; IS__TUNIT_OPM:       l6:
+; IS__TUNIT_OPM-NEXT:    store i32 0, i32* [[P]], align 4
+; IS__TUNIT_OPM-NEXT:    br label [[END:%.*]]
+; IS__TUNIT_OPM:       l7:
+; IS__TUNIT_OPM-NEXT:    store i32 1, i32* [[P]], align 4
+; IS__TUNIT_OPM-NEXT:    br label [[END]]
+; IS__TUNIT_OPM:       end:
+; IS__TUNIT_OPM-NEXT:    ret i32 1
+;
+; IS__TUNIT_NPM: Function Attrs: argmemonly nofree nosync nounwind willreturn writeonly
+; IS__TUNIT_NPM-LABEL: define {{[^@]+}}@require_cfg_analysis
+; IS__TUNIT_NPM-SAME: (i32 [[C:%.*]], i32* nocapture nofree nonnull writeonly align 4 dereferenceable(4) [[P:%.*]]) [[ATTR4:#.*]] {
+; IS__TUNIT_NPM-NEXT:    [[TOBOOL1:%.*]] = icmp eq i32 [[C]], 0
+; IS__TUNIT_NPM-NEXT:    br i1 [[TOBOOL1]], label [[L1:%.*]], label [[L2:%.*]]
+; IS__TUNIT_NPM:       l1:
+; IS__TUNIT_NPM-NEXT:    br label [[L4:%.*]]
+; IS__TUNIT_NPM:       l2:
+; IS__TUNIT_NPM-NEXT:    [[TOBOOL3:%.*]] = icmp eq i32 [[C]], 2
+; IS__TUNIT_NPM-NEXT:    br i1 [[TOBOOL3]], label [[L3:%.*]], label [[L4]]
+; IS__TUNIT_NPM:       l3:
+; IS__TUNIT_NPM-NEXT:    br label [[L5:%.*]]
+; IS__TUNIT_NPM:       l4:
+; IS__TUNIT_NPM-NEXT:    br label [[L5]]
+; IS__TUNIT_NPM:       l5:
+; IS__TUNIT_NPM-NEXT:    [[TOBOOL4:%.*]] = icmp eq i32 [[C]], 4
+; IS__TUNIT_NPM-NEXT:    br i1 [[TOBOOL4]], label [[L6:%.*]], label [[L7:%.*]]
+; IS__TUNIT_NPM:       l6:
+; IS__TUNIT_NPM-NEXT:    store i32 0, i32* [[P]], align 4
+; IS__TUNIT_NPM-NEXT:    br label [[END:%.*]]
+; IS__TUNIT_NPM:       l7:
+; IS__TUNIT_NPM-NEXT:    store i32 1, i32* [[P]], align 4
+; IS__TUNIT_NPM-NEXT:    br label [[END]]
+; IS__TUNIT_NPM:       end:
+; IS__TUNIT_NPM-NEXT:    ret i32 1
+;
+; IS__CGSCC_OPM: Function Attrs: argmemonly nofree norecurse nosync nounwind willreturn writeonly
+; IS__CGSCC_OPM-LABEL: define {{[^@]+}}@require_cfg_analysis
+; IS__CGSCC_OPM-SAME: (i32 [[C:%.*]], i32* nocapture nofree writeonly [[P:%.*]]) [[ATTR4:#.*]] {
+; IS__CGSCC_OPM-NEXT:    [[TOBOOL1:%.*]] = icmp eq i32 [[C]], 0
+; IS__CGSCC_OPM-NEXT:    br i1 [[TOBOOL1]], label [[L1:%.*]], label [[L2:%.*]]
+; IS__CGSCC_OPM:       l1:
+; IS__CGSCC_OPM-NEXT:    [[TOBOOL2:%.*]] = icmp eq i32 [[C]], 1
+; IS__CGSCC_OPM-NEXT:    br i1 [[TOBOOL2]], label [[L3:%.*]], label [[L4:%.*]]
+; IS__CGSCC_OPM:       l2:
+; IS__CGSCC_OPM-NEXT:    [[TOBOOL3:%.*]] = icmp eq i32 [[C]], 2
+; IS__CGSCC_OPM-NEXT:    br i1 [[TOBOOL3]], label [[L3]], label [[L4]]
+; IS__CGSCC_OPM:       l3:
+; IS__CGSCC_OPM-NEXT:    br label [[L5:%.*]]
+; IS__CGSCC_OPM:       l4:
+; IS__CGSCC_OPM-NEXT:    br label [[L5]]
+; IS__CGSCC_OPM:       l5:
+; IS__CGSCC_OPM-NEXT:    [[TOBOOL4:%.*]] = icmp eq i32 [[C]], 4
+; IS__CGSCC_OPM-NEXT:    br i1 [[TOBOOL4]], label [[L6:%.*]], label [[L7:%.*]]
+; IS__CGSCC_OPM:       l6:
+; IS__CGSCC_OPM-NEXT:    store i32 0, i32* [[P]], align 4
+; IS__CGSCC_OPM-NEXT:    br label [[END:%.*]]
+; IS__CGSCC_OPM:       l7:
+; IS__CGSCC_OPM-NEXT:    store i32 1, i32* [[P]], align 4
+; IS__CGSCC_OPM-NEXT:    br label [[END]]
+; IS__CGSCC_OPM:       end:
+; IS__CGSCC_OPM-NEXT:    ret i32 1
+;
+; IS__CGSCC_NPM: Function Attrs: argmemonly nofree norecurse nosync nounwind willreturn writeonly
+; IS__CGSCC_NPM-LABEL: define {{[^@]+}}@require_cfg_analysis
+; IS__CGSCC_NPM-SAME: (i32 [[C:%.*]], i32* nocapture nofree nonnull writeonly align 4 dereferenceable(4) [[P:%.*]]) [[ATTR4:#.*]] {
+; IS__CGSCC_NPM-NEXT:    [[TOBOOL1:%.*]] = icmp eq i32 [[C]], 0
+; IS__CGSCC_NPM-NEXT:    br i1 [[TOBOOL1]], label [[L1:%.*]], label [[L2:%.*]]
+; IS__CGSCC_NPM:       l1:
+; IS__CGSCC_NPM-NEXT:    br label [[L4:%.*]]
+; IS__CGSCC_NPM:       l2:
+; IS__CGSCC_NPM-NEXT:    [[TOBOOL3:%.*]] = icmp eq i32 [[C]], 2
+; IS__CGSCC_NPM-NEXT:    br i1 [[TOBOOL3]], label [[L3:%.*]], label [[L4]]
+; IS__CGSCC_NPM:       l3:
+; IS__CGSCC_NPM-NEXT:    br label [[L5:%.*]]
+; IS__CGSCC_NPM:       l4:
+; IS__CGSCC_NPM-NEXT:    br label [[L5]]
+; IS__CGSCC_NPM:       l5:
+; IS__CGSCC_NPM-NEXT:    [[TOBOOL4:%.*]] = icmp eq i32 [[C]], 4
+; IS__CGSCC_NPM-NEXT:    br i1 [[TOBOOL4]], label [[L6:%.*]], label [[L7:%.*]]
+; IS__CGSCC_NPM:       l6:
+; IS__CGSCC_NPM-NEXT:    store i32 0, i32* [[P]], align 4
+; IS__CGSCC_NPM-NEXT:    br label [[END:%.*]]
+; IS__CGSCC_NPM:       l7:
+; IS__CGSCC_NPM-NEXT:    store i32 1, i32* [[P]], align 4
+; IS__CGSCC_NPM-NEXT:    br label [[END]]
+; IS__CGSCC_NPM:       end:
+; IS__CGSCC_NPM-NEXT:    ret i32 1
+;
+  %tobool1 = icmp eq i32 %c, 0
+  br i1 %tobool1, label %l1, label %l2
+l1:
+  %tobool2 = icmp eq i32 %c, 1
+  br i1 %tobool2, label %l3, label %l4
+l2:
+  %tobool3 = icmp eq i32 %c, 2
+  br i1 %tobool3, label %l3, label %l4
+l3:
+  br label %l5
+l4:
+  br label %l5
+l5:
+  %tobool4 = icmp eq i32 %c, 4
+  br i1 %tobool4, label %l6, label %l7
+l6:
+  store i32 0, i32* %p
+  br label %end
+l7:
+  store i32 1, i32* %p
+  br label %end
+end:
+  ret i32 1
+}
--- a/llvm/test/Transforms/BDCE/vectors-inseltpoison.ll
+++ b/llvm/test/Transforms/BDCE/vectors-inseltpoison.ll
@ -0,0 +1,102 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -bdce < %s | FileCheck %s
+
+; BDCE applied to integer vectors.
+
+define <2 x i32> @test_basic(<2 x i32> %a, <2 x i32> %b) {
+; CHECK-LABEL: @test_basic(
+; CHECK-NEXT:    [[A3:%.*]] = and <2 x i32> zeroinitializer, <i32 4, i32 4>
+; CHECK-NEXT:    [[B2:%.*]] = add <2 x i32> [[B:%.*]], <i32 1, i32 1>
+; CHECK-NEXT:    [[B3:%.*]] = and <2 x i32> [[B2]], <i32 8, i32 8>
+; CHECK-NEXT:    [[C:%.*]] = or <2 x i32> [[A3]], [[B3]]
+; CHECK-NEXT:    [[D:%.*]] = ashr <2 x i32> [[C]], <i32 3, i32 3>
+; CHECK-NEXT:    ret <2 x i32> [[D]]
+;
+  %a2 = add <2 x i32> %a, <i32 1, i32 1>
+  %a3 = and <2 x i32> %a2, <i32 4, i32 4>
+  %b2 = add <2 x i32> %b, <i32 1, i32 1>
+  %b3 = and <2 x i32> %b2, <i32 8, i32 8>
+  %c = or <2 x i32> %a3, %b3
+  %d = ashr <2 x i32> %c, <i32 3, i32 3>
+  ret <2 x i32> %d
+}
+
+; Going vector -> scalar
+define i32 @test_extractelement(<2 x i32> %a, <2 x i32> %b) {
+; CHECK-LABEL: @test_extractelement(
+; CHECK-NEXT:    [[A3:%.*]] = and <2 x i32> zeroinitializer, <i32 4, i32 4>
+; CHECK-NEXT:    [[B2:%.*]] = add <2 x i32> [[B:%.*]], <i32 1, i32 1>
+; CHECK-NEXT:    [[B3:%.*]] = and <2 x i32> [[B2]], <i32 8, i32 8>
+; CHECK-NEXT:    [[C:%.*]] = or <2 x i32> [[A3]], [[B3]]
+; CHECK-NEXT:    [[D:%.*]] = extractelement <2 x i32> [[C]], i32 0
+; CHECK-NEXT:    [[E:%.*]] = ashr i32 [[D]], 3
+; CHECK-NEXT:    ret i32 [[E]]
+;
+  %a2 = add <2 x i32> %a, <i32 1, i32 1>
+  %a3 = and <2 x i32> %a2, <i32 4, i32 4>
+  %b2 = add <2 x i32> %b, <i32 1, i32 1>
+  %b3 = and <2 x i32> %b2, <i32 8, i32 8>
+  %c = or <2 x i32> %a3, %b3
+  %d = extractelement <2 x i32> %c, i32 0
+  %e = ashr i32 %d, 3
+  ret i32 %e
+}
+
+; Going scalar -> vector
+define <2 x i32> @test_insertelement(i32 %a, i32 %b) {
+; CHECK-LABEL: @test_insertelement(
+; CHECK-NEXT:    [[X3:%.*]] = and <2 x i32> zeroinitializer, <i32 4, i32 4>
+; CHECK-NEXT:    [[Y:%.*]] = insertelement <2 x i32> poison, i32 [[B:%.*]], i32 0
+; CHECK-NEXT:    [[Y2:%.*]] = insertelement <2 x i32> [[Y]], i32 [[A:%.*]], i32 1
+; CHECK-NEXT:    [[Y3:%.*]] = and <2 x i32> [[Y2]], <i32 8, i32 8>
+; CHECK-NEXT:    [[Z:%.*]] = or <2 x i32> [[X3]], [[Y3]]
+; CHECK-NEXT:    [[U:%.*]] = ashr <2 x i32> [[Z]], <i32 3, i32 3>
+; CHECK-NEXT:    ret <2 x i32> [[U]]
+;
+  %x = insertelement <2 x i32> poison, i32 %a, i32 0
+  %x2 = insertelement <2 x i32> %x, i32 %b, i32 1
+  %x3 = and <2 x i32> %x2, <i32 4, i32 4>
+  %y = insertelement <2 x i32> poison, i32 %b, i32 0
+  %y2 = insertelement <2 x i32> %y, i32 %a, i32 1
+  %y3 = and <2 x i32> %y2, <i32 8, i32 8>
+  %z = or <2 x i32> %x3, %y3
+  %u = ashr <2 x i32> %z, <i32 3, i32 3>
+  ret <2 x i32> %u
+}
+
+; Some non-int vectors and conversions
+define <2 x i32> @test_conversion(<2 x i32> %a) {
+; CHECK-LABEL: @test_conversion(
+; CHECK-NEXT:    [[A2:%.*]] = add <2 x i32> [[A:%.*]], <i32 1, i32 1>
+; CHECK-NEXT:    [[A3:%.*]] = and <2 x i32> [[A2]], <i32 2, i32 2>
+; CHECK-NEXT:    [[X:%.*]] = uitofp <2 x i32> [[A3]] to <2 x double>
+; CHECK-NEXT:    [[Y:%.*]] = fadd <2 x double> [[X]], <double 1.000000e+00, double 1.000000e+00>
+; CHECK-NEXT:    [[Z:%.*]] = fptoui <2 x double> [[Y]] to <2 x i32>
+; CHECK-NEXT:    [[U:%.*]] = ashr <2 x i32> [[Z]], <i32 3, i32 3>
+; CHECK-NEXT:    ret <2 x i32> [[U]]
+;
+  %a2 = add <2 x i32> %a, <i32 1, i32 1>
+  %a3 = and <2 x i32> %a2, <i32 2, i32 2>
+  %x = uitofp <2 x i32> %a3 to <2 x double>
+  %y = fadd <2 x double> %x, <double 1.0, double 1.0>
+  %z = fptoui <2 x double> %y to <2 x i32>
+  %u = ashr <2 x i32> %z, <i32 3, i32 3>
+  ret <2 x i32> %u
+}
+
+; Assumption invalidation (adapted from invalidate-assumptions.ll)
+define <2 x i1> @test_assumption_invalidation(<2 x i1> %b, <2 x i8> %x) {
+; CHECK-LABEL: @test_assumption_invalidation(
+; CHECK-NEXT:    [[LITTLE_NUMBER:%.*]] = zext <2 x i1> [[B:%.*]] to <2 x i8>
+; CHECK-NEXT:    [[BIG_NUMBER:%.*]] = shl <2 x i8> zeroinitializer, <i8 1, i8 1>
+; CHECK-NEXT:    [[SUB:%.*]] = sub <2 x i8> [[BIG_NUMBER]], [[LITTLE_NUMBER]]
+; CHECK-NEXT:    [[TRUNC:%.*]] = trunc <2 x i8> [[SUB]] to <2 x i1>
+; CHECK-NEXT:    ret <2 x i1> [[TRUNC]]
+;
+  %setbit = or <2 x i8> %x, <i8 64, i8 64>
+  %little_number = zext <2 x i1> %b to <2 x i8>
+  %big_number = shl <2 x i8> %setbit, <i8 1, i8 1>
+  %sub = sub nuw <2 x i8> %big_number, %little_number
+  %trunc = trunc <2 x i8> %sub to <2 x i1>
+  ret <2 x i1> %trunc
+}
--- a/llvm/test/Transforms/CodeGenPrepare/AArch64/gather-scatter-opt-inseltpoison.ll
+++ b/llvm/test/Transforms/CodeGenPrepare/AArch64/gather-scatter-opt-inseltpoison.ll
@ -0,0 +1,113 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -codegenprepare < %s | FileCheck %s
+
+target triple = "aarch64-unknown-linux-gnu"
+
+%struct.a = type { i32, i32 }
+@c = external dso_local global %struct.a, align 4
+@glob_array = internal unnamed_addr constant [16 x i32] [i32 1, i32 1, i32 2, i32 3, i32 5, i32 8, i32 13, i32 21, i32 34, i32 55, i32 89, i32 144, i32 233, i32 377, i32 610, i32 987], align 16
+
+define <vscale x 4 x i32> @splat_base(i32* %base, <vscale x 4 x i64> %index, <vscale x 4 x i1> %mask) #0 {
+; CHECK-LABEL: @splat_base(
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr i32, i32* [[BASE:%.*]], <vscale x 4 x i64> [[INDEX:%.*]]
+; CHECK-NEXT:    [[RES:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0i32(<vscale x 4 x i32*> [[TMP1]], i32 4, <vscale x 4 x i1> [[MASK:%.*]], <vscale x 4 x i32> undef)
+; CHECK-NEXT:    ret <vscale x 4 x i32> [[RES]]
+;
+  %broadcast.splatinsert = insertelement <vscale x 4 x i32*> poison, i32* %base, i32 0
+  %broadcast.splat = shufflevector <vscale x 4 x i32*> %broadcast.splatinsert, <vscale x 4 x i32*> undef, <vscale x 4 x i32> zeroinitializer
+  %gep = getelementptr i32, <vscale x 4 x i32*> %broadcast.splat, <vscale x 4 x i64> %index
+  %res = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0i32(<vscale x 4 x i32*> %gep, i32 4, <vscale x 4 x i1> %mask, <vscale x 4 x i32> undef)
+  ret <vscale x 4 x i32> %res
+}
+
+define <vscale x 4 x i32> @splat_struct(%struct.a* %base, <vscale x 4 x i1> %mask) #0 {
+; CHECK-LABEL: @splat_struct(
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr [[STRUCT_A:%.*]], %struct.a* [[BASE:%.*]], i64 0, i32 1
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i32, i32* [[TMP1]], <vscale x 4 x i64> zeroinitializer
+; CHECK-NEXT:    [[RES:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0i32(<vscale x 4 x i32*> [[TMP2]], i32 4, <vscale x 4 x i1> [[MASK:%.*]], <vscale x 4 x i32> undef)
+; CHECK-NEXT:    ret <vscale x 4 x i32> [[RES]]
+;
+  %gep = getelementptr %struct.a, %struct.a* %base, <vscale x 4 x i64> zeroinitializer, i32 1
+  %res = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0i32(<vscale x 4 x i32*> %gep, i32 4, <vscale x 4 x i1> %mask, <vscale x 4 x i32> undef)
+  ret <vscale x 4 x i32> %res
+}
+
+define <vscale x 4 x i32> @scalar_index(i32* %base, i64 %index, <vscale x 4 x i1> %mask) #0 {
+; CHECK-LABEL: @scalar_index(
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr i32, i32* [[BASE:%.*]], i64 [[INDEX:%.*]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i32, i32* [[TMP1]], <vscale x 4 x i64> zeroinitializer
+; CHECK-NEXT:    [[RES:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0i32(<vscale x 4 x i32*> [[TMP2]], i32 4, <vscale x 4 x i1> [[MASK:%.*]], <vscale x 4 x i32> undef)
+; CHECK-NEXT:    ret <vscale x 4 x i32> [[RES]]
+;
+  %broadcast.splatinsert = insertelement <vscale x 4 x i32*> poison, i32* %base, i32 0
+  %broadcast.splat = shufflevector <vscale x 4 x i32*> %broadcast.splatinsert, <vscale x 4 x i32*> undef, <vscale x 4 x i32> zeroinitializer
+  %gep = getelementptr i32, <vscale x 4 x i32*> %broadcast.splat, i64 %index
+  %res = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0i32(<vscale x 4 x i32*> %gep, i32 4, <vscale x 4 x i1> %mask, <vscale x 4 x i32> undef)
+  ret <vscale x 4 x i32> %res
+}
+
+define <vscale x 4 x i32> @splat_index(i32* %base, i64 %index, <vscale x 4 x i1> %mask) #0 {
+; CHECK-LABEL: @splat_index(
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr i32, i32* [[BASE:%.*]], i64 [[INDEX:%.*]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i32, i32* [[TMP1]], <vscale x 4 x i64> zeroinitializer
+; CHECK-NEXT:    [[RES:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0i32(<vscale x 4 x i32*> [[TMP2]], i32 4, <vscale x 4 x i1> [[MASK:%.*]], <vscale x 4 x i32> undef)
+; CHECK-NEXT:    ret <vscale x 4 x i32> [[RES]]
+;
+  %broadcast.splatinsert = insertelement <vscale x 4 x i64> poison, i64 %index, i32 0
+  %broadcast.splat = shufflevector <vscale x 4 x i64> %broadcast.splatinsert, <vscale x 4 x i64> undef, <vscale x 4 x i32> zeroinitializer
+  %gep = getelementptr i32, i32* %base, <vscale x 4 x i64> %broadcast.splat
+  %res = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0i32(<vscale x 4 x i32*> %gep, i32 4, <vscale x 4 x i1> %mask, <vscale x 4 x i32> undef)
+  ret <vscale x 4 x i32> %res
+}
+
+define <vscale x 4 x i32> @test_global_array(<vscale x 4 x i64> %indxs, <vscale x 4 x i1> %mask) #0 {
+; CHECK-LABEL: @test_global_array(
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @glob_array, i64 0, i64 0), <vscale x 4 x i64> [[INDXS:%.*]]
+; CHECK-NEXT:    [[G:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0i32(<vscale x 4 x i32*> [[TMP1]], i32 4, <vscale x 4 x i1> [[MASK:%.*]], <vscale x 4 x i32> undef)
+; CHECK-NEXT:    ret <vscale x 4 x i32> [[G]]
+;
+  %p = getelementptr inbounds [16 x i32], [16 x i32]* @glob_array, i64 0, <vscale x 4 x i64> %indxs
+  %g = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0i32(<vscale x 4 x i32*> %p, i32 4, <vscale x 4 x i1> %mask, <vscale x 4 x i32> undef)
+  ret <vscale x 4 x i32> %g
+}
+
+define <vscale x 4 x i32> @global_struct_splat(<vscale x 4 x i1> %mask) #0 {
+; CHECK-LABEL: @global_struct_splat(
+; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0i32(<vscale x 4 x i32*> shufflevector (<vscale x 4 x i32*> insertelement (<vscale x 4 x i32*> undef, i32* getelementptr inbounds (%struct.a, %struct.a* @c, i64 0, i32 1), i32 0), <vscale x 4 x i32*> undef, <vscale x 4 x i32> zeroinitializer), i32 4, <vscale x 4 x i1> [[MASK:%.*]], <vscale x 4 x i32> undef)
+; CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP1]]
+;
+  %1 = insertelement <vscale x 4 x %struct.a*> poison, %struct.a* @c, i32 0
+  %2 = shufflevector <vscale x 4 x %struct.a*> %1, <vscale x 4 x %struct.a*> undef, <vscale x 4 x i32> zeroinitializer
+  %3 = getelementptr %struct.a, <vscale x 4 x %struct.a*> %2, <vscale x 4 x i64> zeroinitializer, i32 1
+  %4 = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0i32(<vscale x 4 x i32*> %3, i32 4, <vscale x 4 x i1> %mask, <vscale x 4 x i32> undef)
+  ret <vscale x 4 x i32> %4
+}
+
+define <vscale x 4 x i32> @splat_ptr_gather(i32* %ptr, <vscale x 4 x i1> %mask, <vscale x 4 x i32> %passthru) #0 {
+; CHECK-LABEL: @splat_ptr_gather(
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr i32, i32* [[PTR:%.*]], <vscale x 4 x i64> zeroinitializer
+; CHECK-NEXT:    [[TMP2:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0i32(<vscale x 4 x i32*> [[TMP1]], i32 4, <vscale x 4 x i1> [[MASK:%.*]], <vscale x 4 x i32> [[PASSTHRU:%.*]])
+; CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP2]]
+;
+  %1 = insertelement <vscale x 4 x i32*> poison, i32* %ptr, i32 0
+  %2 = shufflevector <vscale x 4 x i32*> %1, <vscale x 4 x i32*> undef, <vscale x 4 x i32> zeroinitializer
+  %3 = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0i32(<vscale x 4 x i32*> %2, i32 4, <vscale x 4 x i1> %mask, <vscale x 4 x i32> %passthru)
+  ret <vscale x 4 x i32> %3
+}
+
+define void @splat_ptr_scatter(i32* %ptr, <vscale x 4 x i1> %mask, <vscale x 4 x i32> %val) #0 {
+; CHECK-LABEL: @splat_ptr_scatter(
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr i32, i32* [[PTR:%.*]], <vscale x 4 x i64> zeroinitializer
+; CHECK-NEXT:    call void @llvm.masked.scatter.nxv4i32.nxv4p0i32(<vscale x 4 x i32> [[VAL:%.*]], <vscale x 4 x i32*> [[TMP1]], i32 4, <vscale x 4 x i1> [[MASK:%.*]])
+; CHECK-NEXT:    ret void
+;
+  %1 = insertelement <vscale x 4 x i32*> poison, i32* %ptr, i32 0
+  %2 = shufflevector <vscale x 4 x i32*> %1, <vscale x 4 x i32*> undef, <vscale x 4 x i32> zeroinitializer
+  call void @llvm.masked.scatter.nxv4i32.nxv4p0i32(<vscale x 4 x i32> %val, <vscale x 4 x i32*> %2, i32 4, <vscale x 4 x i1> %mask)
+  ret void
+}
+
+declare <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0i32(<vscale x 4 x i32*>, i32, <vscale x 4 x i1>, <vscale x 4 x i32>)
+declare void @llvm.masked.scatter.nxv4i32.nxv4p0i32(<vscale x 4 x i32>, <vscale x 4 x i32*>, i32, <vscale x 4 x i1>)
+
+attributes #0 = { "target-features"="+sve" }
--- a/llvm/test/Transforms/CodeGenPrepare/AMDGPU/bypass-slow-div-debug-info-inseltpoison.ll
+++ b/llvm/test/Transforms/CodeGenPrepare/AMDGPU/bypass-slow-div-debug-info-inseltpoison.ll
@ -0,0 +1,76 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -codegenprepare %s | FileCheck %s
+; Make sure BypassSlowDivision doesn't drop debug info
+
+define i64 @sdiv64(i64 %a, i64 %b) {
+; CHECK-LABEL: @sdiv64(
+; CHECK-NEXT:    [[TMP1:%.*]] = or i64 [[A:%.*]], [[B:%.*]], [[DBG6:!dbg !.*]]
+; CHECK-NEXT:    [[TMP2:%.*]] = and i64 [[TMP1]], -4294967296, [[DBG6]]
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp eq i64 [[TMP2]], 0, [[DBG6]]
+; CHECK-NEXT:    br i1 [[TMP3]], label [[TMP4:%.*]], label [[TMP9:%.*]], [[DBG6]]
+; CHECK:       4:
+; CHECK-NEXT:    [[TMP5:%.*]] = trunc i64 [[B]] to i32, [[DBG6]]
+; CHECK-NEXT:    [[TMP6:%.*]] = trunc i64 [[A]] to i32, [[DBG6]]
+; CHECK-NEXT:    [[TMP7:%.*]] = udiv i32 [[TMP6]], [[TMP5]], [[DBG6]]
+; CHECK-NEXT:    [[TMP8:%.*]] = zext i32 [[TMP7]] to i64, [[DBG6]]
+; CHECK-NEXT:    br label [[TMP11:%.*]], [[DBG6]]
+; CHECK:       9:
+; CHECK-NEXT:    [[TMP10:%.*]] = sdiv i64 [[A]], [[B]], [[DBG6]]
+; CHECK-NEXT:    br label [[TMP11]], [[DBG6]]
+; CHECK:       11:
+; CHECK-NEXT:    [[TMP12:%.*]] = phi i64 [ [[TMP8]], [[TMP4]] ], [ [[TMP10]], [[TMP9]] ], [[DBG6]]
+; CHECK-NEXT:    ret i64 [[TMP12]]
+;
+  %d = sdiv i64 %a, %b, !dbg !6
+  ret i64 %d
+}
+
+; FIXME: The debugloc for the rem parts end up with the dbg of the
+; division.
+define <2 x i64> @sdivrem64(i64 %a, i64 %b) {
+; CHECK-LABEL: @sdivrem64(
+; CHECK-NEXT:    [[TMP1:%.*]] = or i64 [[A:%.*]], [[B:%.*]], [[DBG6]]
+; CHECK-NEXT:    [[TMP2:%.*]] = and i64 [[TMP1]], -4294967296, [[DBG6]]
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp eq i64 [[TMP2]], 0, [[DBG6]]
+; CHECK-NEXT:    br i1 [[TMP3]], label [[TMP4:%.*]], label [[TMP11:%.*]], [[DBG6]]
+; CHECK:       4:
+; CHECK-NEXT:    [[TMP5:%.*]] = trunc i64 [[B]] to i32, [[DBG6]]
+; CHECK-NEXT:    [[TMP6:%.*]] = trunc i64 [[A]] to i32, [[DBG6]]
+; CHECK-NEXT:    [[TMP7:%.*]] = udiv i32 [[TMP6]], [[TMP5]], [[DBG6]]
+; CHECK-NEXT:    [[TMP8:%.*]] = urem i32 [[TMP6]], [[TMP5]], [[DBG6]]
+; CHECK-NEXT:    [[TMP9:%.*]] = zext i32 [[TMP7]] to i64, [[DBG6]]
+; CHECK-NEXT:    [[TMP10:%.*]] = zext i32 [[TMP8]] to i64, [[DBG6]]
+; CHECK-NEXT:    br label [[TMP14:%.*]], [[DBG6]]
+; CHECK:       11:
+; CHECK-NEXT:    [[TMP12:%.*]] = sdiv i64 [[A]], [[B]], [[DBG6]]
+; CHECK-NEXT:    [[TMP13:%.*]] = srem i64 [[A]], [[B]], [[DBG6]]
+; CHECK-NEXT:    br label [[TMP14]], [[DBG6]]
+; CHECK:       14:
+; CHECK-NEXT:    [[TMP15:%.*]] = phi i64 [ [[TMP9]], [[TMP4]] ], [ [[TMP12]], [[TMP11]] ], [[DBG6]]
+; CHECK-NEXT:    [[TMP16:%.*]] = phi i64 [ [[TMP10]], [[TMP4]] ], [ [[TMP13]], [[TMP11]] ], [[DBG6]]
+; CHECK-NEXT:    [[INS0:%.*]] = insertelement <2 x i64> poison, i64 [[TMP15]], i32 0
+; CHECK-NEXT:    [[INS1:%.*]] = insertelement <2 x i64> [[INS0]], i64 [[TMP16]], i32 1
+; CHECK-NEXT:    ret <2 x i64> [[INS1]]
+;
+  %d = sdiv i64 %a, %b, !dbg !6
+  %r = srem i64 %a, %b, !dbg !10
+  %ins0 = insertelement <2 x i64> poison, i64 %d, i32 0
+  %ins1 = insertelement <2 x i64> %ins0, i64 %r, i32 1
+  ret <2 x i64> %ins1
+}
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!3, !4}
+!llvm.ident = !{!5}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 3.5 ", isOptimized: false, runtimeVersion: 0, emissionKind: NoDebug, enums: !2, retainedTypes: !2, globals: !2, imports: !2)
+!1 = !DIFile(filename: "basic.c", directory: ".")
+!2 = !{}
+!3 = !{i32 2, !"Dwarf Version", i32 4}
+!4 = !{i32 1, !"Debug Info Version", i32 3}
+!5 = !{!"clang version 3.5 "}
+!6 = !DILocation(line: 3, scope: !7)
+!7 = distinct !DILexicalBlock(scope: !8, file: !1, line: 3)
+!8 = distinct !DISubprogram(name: "foo", scope: !1, file: !1, line: 1, type: !9, scopeLine: 1, virtualIndex: 6, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: !0, retainedNodes: !2)
+!9 = !DISubroutineType(types: !2)
+!10 = !DILocation(line: 4, scope: !7)
--- a/llvm/test/Transforms/CodeGenPrepare/ARM/sink-add-mul-shufflevector-inseltpoison.ll
+++ b/llvm/test/Transforms/CodeGenPrepare/ARM/sink-add-mul-shufflevector-inseltpoison.ll
@ -0,0 +1,219 @@
+; RUN: opt -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp < %s -codegenprepare -S | FileCheck -check-prefix=CHECK %s
+
+define void @sink_add_mul(i32* %s1, i32 %x, i32* %d, i32 %n) {
+; CHECK-LABEL: @sink_add_mul(
+; CHECK:    vector.ph:
+; CHECK-NOT:  [[BROADCAST_SPLATINSERT8:%.*]] = insertelement <4 x i32> undef, i32 [[X:%.*]], i32 0
+; CHECK-NOT:  [[BROADCAST_SPLAT9:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT8]], <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK:    vector.body:
+; CHECK:      [[TMP2:%.*]] = insertelement <4 x i32> poison, i32 [[X:%.*]], i32 0
+; CHECK:      [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> undef, <4 x i32> zeroinitializer
+;
+entry:
+  %cmp6 = icmp sgt i32 %n, 0
+  br i1 %cmp6, label %vector.ph, label %for.cond.cleanup
+
+vector.ph:                                        ; preds = %for.body.preheader
+  %n.vec = and i32 %n, -4
+  %broadcast.splatinsert8 = insertelement <4 x i32> poison, i32 %x, i32 0
+  %broadcast.splat9 = shufflevector <4 x i32> %broadcast.splatinsert8, <4 x i32> undef, <4 x i32> zeroinitializer
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %0 = getelementptr inbounds i32, i32* %s1, i32 %index
+  %1 = bitcast i32* %0 to <4 x i32>*
+  %wide.load = load <4 x i32>, <4 x i32>* %1, align 4
+  %2 = mul nsw <4 x i32> %wide.load, %broadcast.splat9
+  %3 = getelementptr inbounds i32, i32* %d, i32 %index
+  %4 = bitcast i32* %3 to <4 x i32>*
+  %wide.load10 = load <4 x i32>, <4 x i32>* %4, align 4
+  %5 = add nsw <4 x i32> %wide.load10, %2
+  %6 = bitcast i32* %3 to <4 x i32>*
+  store <4 x i32> %5, <4 x i32>* %6, align 4
+  %index.next = add i32 %index, 4
+  %7 = icmp eq i32 %index.next, %n.vec
+  br i1 %7, label %for.cond.cleanup, label %vector.body
+
+for.cond.cleanup:                                 ; preds = %for.body, %middle.block, %entry
+  ret void
+}
+
+define void @sink_add_mul_multiple(i32* %s1, i32* %s2, i32 %x, i32* %d, i32* %d2, i32 %n) {
+; CHECK-LABEL: @sink_add_mul_multiple(
+; CHECK:    vector.ph:
+; CHECK-NOT:  [[BROADCAST_SPLATINSERT8:%.*]] = insertelement <4 x i32> undef, i32 [[X:%.*]], i32 0
+; CHECK-NOT:  [[BROADCAST_SPLAT9:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT8]], <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK:    vector.body:
+; CHECK:      [[TMP2:%.*]] = insertelement <4 x i32> poison, i32 %x, i32 0
+; CHECK:      [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK:      mul nsw <4 x i32> %wide.load, [[TMP3]]
+; CHECK:      [[TMP2b:%.*]] = insertelement <4 x i32> poison, i32 %x, i32 0
+; CHECK:      [[TMP3b:%.*]] = shufflevector <4 x i32> [[TMP2b]], <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK:      mul nsw <4 x i32> %wide.load18, [[TMP3b]]
+;
+entry:
+  %cmp13 = icmp sgt i32 %n, 0
+  br i1 %cmp13, label %vector.ph, label %for.cond.cleanup
+
+vector.ph:                                        ; preds = %for.body.preheader
+  %n.vec = and i32 %n, -4
+  %broadcast.splatinsert15 = insertelement <4 x i32> poison, i32 %x, i32 0
+  %broadcast.splat16 = shufflevector <4 x i32> %broadcast.splatinsert15, <4 x i32> undef, <4 x i32> zeroinitializer
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %0 = getelementptr inbounds i32, i32* %s1, i32 %index
+  %1 = bitcast i32* %0 to <4 x i32>*
+  %wide.load = load <4 x i32>, <4 x i32>* %1, align 4
+  %2 = mul nsw <4 x i32> %wide.load, %broadcast.splat16
+  %3 = getelementptr inbounds i32, i32* %d, i32 %index
+  %4 = bitcast i32* %3 to <4 x i32>*
+  %wide.load17 = load <4 x i32>, <4 x i32>* %4, align 4
+  %5 = add nsw <4 x i32> %wide.load17, %2
+  %6 = bitcast i32* %3 to <4 x i32>*
+  store <4 x i32> %5, <4 x i32>* %6, align 4
+  %7 = getelementptr inbounds i32, i32* %s2, i32 %index
+  %8 = bitcast i32* %7 to <4 x i32>*
+  %wide.load18 = load <4 x i32>, <4 x i32>* %8, align 4
+  %9 = mul nsw <4 x i32> %wide.load18, %broadcast.splat16
+  %10 = getelementptr inbounds i32, i32* %d2, i32 %index
+  %11 = bitcast i32* %10 to <4 x i32>*
+  %wide.load19 = load <4 x i32>, <4 x i32>* %11, align 4
+  %12 = add nsw <4 x i32> %wide.load19, %9
+  %13 = bitcast i32* %10 to <4 x i32>*
+  store <4 x i32> %12, <4 x i32>* %13, align 4
+  %index.next = add i32 %index, 4
+  %14 = icmp eq i32 %index.next, %n.vec
+  br i1 %14, label %for.cond.cleanup, label %vector.body
+
+for.cond.cleanup:                                 ; preds = %for.body, %middle.block, %entry
+  ret void
+}
+
+
+define void @sink_add_sub_unsinkable(i32* %s1, i32* %s2, i32 %x, i32* %d, i32* %d2, i32 %n) {
+; CHECK-LABEL: @sink_add_sub_unsinkable(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP13:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; CHECK-NEXT:    br i1 [[CMP13]], label [[VECTOR_PH:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[N_VEC:%.*]] = and i32 [[N]], -4
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT15:%.*]] = insertelement <4 x i32> poison, i32 [[X:%.*]], i32 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT16:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT15]], <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+;
+entry:
+  %cmp13 = icmp sgt i32 %n, 0
+  br i1 %cmp13, label %vector.ph, label %for.cond.cleanup
+
+vector.ph:                                        ; preds = %for.body.preheader
+  %n.vec = and i32 %n, -4
+  %broadcast.splatinsert15 = insertelement <4 x i32> poison, i32 %x, i32 0
+  %broadcast.splat16 = shufflevector <4 x i32> %broadcast.splatinsert15, <4 x i32> undef, <4 x i32> zeroinitializer
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %0 = getelementptr inbounds i32, i32* %s1, i32 %index
+  %1 = bitcast i32* %0 to <4 x i32>*
+  %wide.load = load <4 x i32>, <4 x i32>* %1, align 4
+  %2 = mul nsw <4 x i32> %wide.load, %broadcast.splat16
+  %3 = getelementptr inbounds i32, i32* %d, i32 %index
+  %4 = bitcast i32* %3 to <4 x i32>*
+  %wide.load17 = load <4 x i32>, <4 x i32>* %4, align 4
+  %5 = add nsw <4 x i32> %wide.load17, %2
+  %6 = bitcast i32* %3 to <4 x i32>*
+  store <4 x i32> %5, <4 x i32>* %6, align 4
+  %7 = getelementptr inbounds i32, i32* %s2, i32 %index
+  %8 = bitcast i32* %7 to <4 x i32>*
+  %wide.load18 = load <4 x i32>, <4 x i32>* %8, align 4
+  %9 = sub nsw <4 x i32> %broadcast.splat16, %wide.load18
+  %10 = getelementptr inbounds i32, i32* %d2, i32 %index
+  %11 = bitcast i32* %10 to <4 x i32>*
+  %wide.load19 = load <4 x i32>, <4 x i32>* %11, align 4
+  %12 = add nsw <4 x i32> %wide.load19, %9
+  %13 = bitcast i32* %10 to <4 x i32>*
+  store <4 x i32> %12, <4 x i32>* %13, align 4
+  %index.next = add i32 %index, 4
+  %14 = icmp eq i32 %index.next, %n.vec
+  br i1 %14, label %for.cond.cleanup, label %vector.body
+
+for.cond.cleanup:                                 ; preds = %for.body, %middle.block, %entry
+  ret void
+}
+
+define void @sink_sub(i32* %s1, i32 %x, i32* %d, i32 %n) {
+; CHECK-LABEL: @sink_sub(
+; CHECK:    vector.ph:
+; CHECK-NOT:  [[BROADCAST_SPLATINSERT8:%.*]] = insertelement <4 x i32> poison, i32 [[X:%.*]], i32 0
+; CHECK-NOT:  [[BROADCAST_SPLAT9:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT8]], <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK:    vector.body:
+; CHECK:      [[TMP2:%.*]] = insertelement <4 x i32> poison, i32 [[X:%.*]], i32 0
+; CHECK:      [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> undef, <4 x i32> zeroinitializer
+;
+entry:
+  %cmp6 = icmp sgt i32 %n, 0
+  br i1 %cmp6, label %vector.ph, label %for.cond.cleanup
+
+vector.ph:                                        ; preds = %for.body.preheader
+  %n.vec = and i32 %n, -4
+  %broadcast.splatinsert8 = insertelement <4 x i32> poison, i32 %x, i32 0
+  %broadcast.splat9 = shufflevector <4 x i32> %broadcast.splatinsert8, <4 x i32> undef, <4 x i32> zeroinitializer
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %0 = getelementptr inbounds i32, i32* %s1, i32 %index
+  %1 = bitcast i32* %0 to <4 x i32>*
+  %wide.load = load <4 x i32>, <4 x i32>* %1, align 4
+  %2 = sub nsw <4 x i32> %wide.load, %broadcast.splat9
+  %3 = getelementptr inbounds i32, i32* %d, i32 %index
+  %4 = bitcast i32* %3 to <4 x i32>*
+  store <4 x i32> %2, <4 x i32>* %4, align 4
+  %index.next = add i32 %index, 4
+  %5 = icmp eq i32 %index.next, %n.vec
+  br i1 %5, label %for.cond.cleanup, label %vector.body
+
+for.cond.cleanup:                                 ; preds = %for.body, %middle.block, %entry
+  ret void
+}
+
+define void @sink_sub_unsinkable(i32* %s1, i32 %x, i32* %d, i32 %n) {
+entry:
+; CHECK-LABEL: @sink_sub_unsinkable(
+; CHECK:      vector.ph:
+; CHECK-NEXT:   [[N_VEC:%.*]] = and i32 [[N]], -4
+; CHECK-NEXT:   [[BROADCAST_SPLATINSERT15:%.*]] = insertelement <4 x i32> poison, i32 [[X:%.*]], i32 0
+; CHECK-NEXT:   [[BROADCAST_SPLAT16:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT15]], <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT:   br label [[VECTOR_BODY:%.*]]
+; CHECK:      vector.body:
+; CHECK-NOT:    [[TMP2:%.*]] = insertelement <4 x i32> poison, i32 [[X:%.*]], i32 0
+; CHECK-NOT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> undef, <4 x i32> zeroinitializer
+;
+  %cmp6 = icmp sgt i32 %n, 0
+  br i1 %cmp6, label %vector.ph, label %for.cond.cleanup
+
+vector.ph:                                        ; preds = %for.body.preheader
+  %n.vec = and i32 %n, -4
+  %broadcast.splatinsert8 = insertelement <4 x i32> poison, i32 %x, i32 0
+  %broadcast.splat9 = shufflevector <4 x i32> %broadcast.splatinsert8, <4 x i32> undef, <4 x i32> zeroinitializer
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %0 = getelementptr inbounds i32, i32* %s1, i32 %index
+  %1 = bitcast i32* %0 to <4 x i32>*
+  %wide.load = load <4 x i32>, <4 x i32>* %1, align 4
+  %2 = sub nsw <4 x i32> %broadcast.splat9, %wide.load
+  %3 = getelementptr inbounds i32, i32* %d, i32 %index
+  %4 = bitcast i32* %3 to <4 x i32>*
+  store <4 x i32> %2, <4 x i32>* %4, align 4
+  %index.next = add i32 %index, 4
+  %5 = icmp eq i32 %index.next, %n.vec
+  br i1 %5, label %for.cond.cleanup, label %vector.body
+
+for.cond.cleanup:                                 ; preds = %for.body, %middle.block, %entry
+  ret void
+}
--- a/llvm/test/Transforms/CodeGenPrepare/ARM/sinkchain-inseltpoison.ll
+++ b/llvm/test/Transforms/CodeGenPrepare/ARM/sinkchain-inseltpoison.ll
@ -0,0 +1,107 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp < %s -codegenprepare -S | FileCheck -check-prefix=CHECK %s
+
+; Sink the shufflevector/insertelement pair, followed by the trunc. The sunk instruction end up dead.
+define signext i8 @dead(i16* noalias nocapture readonly %s1, i16 zeroext %x, i8* noalias nocapture %d, i32 %n) {
+; CHECK-LABEL: @dead(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[N_VEC:%.*]] = and i32 [[N:%.*]], -8
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = trunc i16 [[X:%.*]] to i8
+; CHECK-NEXT:    [[L6:%.*]] = getelementptr inbounds i16, i16* [[S1:%.*]], i32 [[INDEX]]
+; CHECK-NEXT:    [[L7:%.*]] = bitcast i16* [[L6]] to <8 x i16>*
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x i16>, <8 x i16>* [[L7]], align 2
+; CHECK-NEXT:    [[L8:%.*]] = trunc <8 x i16> [[WIDE_LOAD]] to <8 x i8>
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <8 x i8> poison, i8 [[TMP0]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> undef, <8 x i32> zeroinitializer
+; CHECK-NEXT:    [[L9:%.*]] = mul <8 x i8> [[TMP2]], [[L8]]
+; CHECK-NEXT:    [[L13:%.*]] = getelementptr inbounds i8, i8* [[D:%.*]], i32 [[INDEX]]
+; CHECK-NEXT:    [[L14:%.*]] = bitcast i8* [[L13]] to <8 x i8>*
+; CHECK-NEXT:    store <8 x i8> [[L9]], <8 x i8>* [[L14]], align 1
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 8
+; CHECK-NEXT:    [[L15:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[L15]], label [[EXIT:%.*]], label [[VECTOR_BODY]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret i8 0
+;
+entry:
+  %n.vec = and i32 %n, -8
+  %l0 = trunc i16 %x to i8
+  %l1 = insertelement <8 x i8> poison, i8 %l0, i32 0
+  %broadcast.splat26 = shufflevector <8 x i8> %l1, <8 x i8> undef, <8 x i32> zeroinitializer
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %entry
+  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
+  %l6 = getelementptr inbounds i16, i16* %s1, i32 %index
+  %l7 = bitcast i16* %l6 to <8 x i16>*
+  %wide.load = load <8 x i16>, <8 x i16>* %l7, align 2
+  %l8 = trunc <8 x i16> %wide.load to <8 x i8>
+  %l9 = mul <8 x i8> %broadcast.splat26, %l8
+  %l13 = getelementptr inbounds i8, i8* %d, i32 %index
+  %l14 = bitcast i8* %l13 to <8 x i8>*
+  store <8 x i8> %l9, <8 x i8>* %l14, align 1
+  %index.next = add i32 %index, 8
+  %l15 = icmp eq i32 %index.next, %n.vec
+  br i1 %l15, label %exit, label %vector.body
+
+exit:                                     ; preds = %vector.body
+  ret i8 0
+}
+
+; Same as above, but the shuffle has an extra use meaning it shouldnt be deleted
+define signext i8 @alive(i16* noalias nocapture readonly %s1, i16 zeroext %x, i8* noalias nocapture %d, i32 %n) {
+; CHECK-LABEL: @alive(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[N_VEC:%.*]] = and i32 [[N:%.*]], -8
+; CHECK-NEXT:    [[L0:%.*]] = trunc i16 [[X:%.*]] to i8
+; CHECK-NEXT:    [[L1:%.*]] = insertelement <8 x i8> poison, i8 [[L0]], i32 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT26:%.*]] = shufflevector <8 x i8> [[L1]], <8 x i8> undef, <8 x i32> zeroinitializer
+; CHECK-NEXT:    [[L2:%.*]] = sub <8 x i8> zeroinitializer, [[BROADCAST_SPLAT26]]
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = trunc i16 [[X]] to i8
+; CHECK-NEXT:    [[L6:%.*]] = getelementptr inbounds i16, i16* [[S1:%.*]], i32 [[INDEX]]
+; CHECK-NEXT:    [[L7:%.*]] = bitcast i16* [[L6]] to <8 x i16>*
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x i16>, <8 x i16>* [[L7]], align 2
+; CHECK-NEXT:    [[L8:%.*]] = trunc <8 x i16> [[WIDE_LOAD]] to <8 x i8>
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <8 x i8> poison, i8 [[TMP0]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> undef, <8 x i32> zeroinitializer
+; CHECK-NEXT:    [[L9:%.*]] = mul <8 x i8> [[TMP2]], [[L8]]
+; CHECK-NEXT:    [[L13:%.*]] = getelementptr inbounds i8, i8* [[D:%.*]], i32 [[INDEX]]
+; CHECK-NEXT:    [[L14:%.*]] = bitcast i8* [[L13]] to <8 x i8>*
+; CHECK-NEXT:    store <8 x i8> [[L9]], <8 x i8>* [[L14]], align 1
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 8
+; CHECK-NEXT:    [[L15:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[L15]], label [[EXIT:%.*]], label [[VECTOR_BODY]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret i8 0
+;
+entry:
+  %n.vec = and i32 %n, -8
+  %l0 = trunc i16 %x to i8
+  %l1 = insertelement <8 x i8> poison, i8 %l0, i32 0
+  %broadcast.splat26 = shufflevector <8 x i8> %l1, <8 x i8> undef, <8 x i32> zeroinitializer
+  %l2 = sub <8 x i8> zeroinitializer, %broadcast.splat26
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %entry
+  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
+  %l6 = getelementptr inbounds i16, i16* %s1, i32 %index
+  %l7 = bitcast i16* %l6 to <8 x i16>*
+  %wide.load = load <8 x i16>, <8 x i16>* %l7, align 2
+  %l8 = trunc <8 x i16> %wide.load to <8 x i8>
+  %l9 = mul <8 x i8> %broadcast.splat26, %l8
+  %l13 = getelementptr inbounds i8, i8* %d, i32 %index
+  %l14 = bitcast i8* %l13 to <8 x i8>*
+  store <8 x i8> %l9, <8 x i8>* %l14, align 1
+  %index.next = add i32 %index, 8
+  %l15 = icmp eq i32 %index.next, %n.vec
+  br i1 %l15, label %exit, label %vector.body
+
+exit:                                     ; preds = %vector.body
+  ret i8 0
+}
--- a/llvm/test/Transforms/CodeGenPrepare/X86/gather-scatter-opt-inseltpoison.ll
+++ b/llvm/test/Transforms/CodeGenPrepare/X86/gather-scatter-opt-inseltpoison.ll
@ -0,0 +1,113 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -codegenprepare < %s | FileCheck %s
+
+target datalayout =
+"e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
+target triple = "x86_64-unknown-linux-gnu"
+
+%struct.a = type { i32, i32 }
+@c = external dso_local global %struct.a, align 4
+@glob_array = internal unnamed_addr constant [16 x i32] [i32 1, i32 1, i32 2, i32 3, i32 5, i32 8, i32 13, i32 21, i32 34, i32 55, i32 89, i32 144, i32 233, i32 377, i32 610, i32 987], align 16
+
+define <4 x i32> @splat_base(i32* %base, <4 x i64> %index) {
+; CHECK-LABEL: @splat_base(
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr i32, i32* [[BASE:%.*]], <4 x i64> [[INDEX:%.*]]
+; CHECK-NEXT:    [[RES:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP1]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
+; CHECK-NEXT:    ret <4 x i32> [[RES]]
+;
+  %broadcast.splatinsert = insertelement <4 x i32*> poison, i32* %base, i32 0
+  %broadcast.splat = shufflevector <4 x i32*> %broadcast.splatinsert, <4 x i32*> undef, <4 x i32> zeroinitializer
+  %gep = getelementptr i32, <4 x i32*> %broadcast.splat, <4 x i64> %index
+  %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %gep, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
+  ret <4 x i32> %res
+}
+
+define <4 x i32> @splat_struct(%struct.a* %base) {
+; CHECK-LABEL: @splat_struct(
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr [[STRUCT_A:%.*]], %struct.a* [[BASE:%.*]], i64 0, i32 1
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i32, i32* [[TMP1]], <4 x i64> zeroinitializer
+; CHECK-NEXT:    [[RES:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP2]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
+; CHECK-NEXT:    ret <4 x i32> [[RES]]
+;
+  %gep = getelementptr %struct.a, %struct.a* %base, <4 x i64> zeroinitializer, i32 1
+  %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %gep, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
+  ret <4 x i32> %res
+}
+
+define <4 x i32> @scalar_index(i32* %base, i64 %index) {
+; CHECK-LABEL: @scalar_index(
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr i32, i32* [[BASE:%.*]], i64 [[INDEX:%.*]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i32, i32* [[TMP1]], <4 x i64> zeroinitializer
+; CHECK-NEXT:    [[RES:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP2]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
+; CHECK-NEXT:    ret <4 x i32> [[RES]]
+;
+  %broadcast.splatinsert = insertelement <4 x i32*> poison, i32* %base, i32 0
+  %broadcast.splat = shufflevector <4 x i32*> %broadcast.splatinsert, <4 x i32*> undef, <4 x i32> zeroinitializer
+  %gep = getelementptr i32, <4 x i32*> %broadcast.splat, i64 %index
+  %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %gep, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
+  ret <4 x i32> %res
+}
+
+define <4 x i32> @splat_index(i32* %base, i64 %index) {
+; CHECK-LABEL: @splat_index(
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr i32, i32* [[BASE:%.*]], i64 [[INDEX:%.*]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i32, i32* [[TMP1]], <4 x i64> zeroinitializer
+; CHECK-NEXT:    [[RES:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP2]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
+; CHECK-NEXT:    ret <4 x i32> [[RES]]
+;
+  %broadcast.splatinsert = insertelement <4 x i64> poison, i64 %index, i32 0
+  %broadcast.splat = shufflevector <4 x i64> %broadcast.splatinsert, <4 x i64> undef, <4 x i32> zeroinitializer
+  %gep = getelementptr i32, i32* %base, <4 x i64> %broadcast.splat
+  %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %gep, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
+  ret <4 x i32> %res
+}
+
+define <4 x i32> @test_global_array(<4 x i64> %indxs) {
+; CHECK-LABEL: @test_global_array(
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @glob_array, i64 0, i64 0), <4 x i64> [[INDXS:%.*]]
+; CHECK-NEXT:    [[G:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP1]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
+; CHECK-NEXT:    ret <4 x i32> [[G]]
+;
+  %p = getelementptr inbounds [16 x i32], [16 x i32]* @glob_array, i64 0, <4 x i64> %indxs
+  %g = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %p, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
+  ret <4 x i32> %g
+}
+
+define <4 x i32> @global_struct_splat() {
+; CHECK-LABEL: @global_struct_splat(
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> <i32* getelementptr inbounds (%struct.a, %struct.a* @c, i64 0, i32 1), i32* getelementptr inbounds (%struct.a, %struct.a* @c, i64 0, i32 1), i32* getelementptr inbounds (%struct.a, %struct.a* @c, i64 0, i32 1), i32* getelementptr inbounds (%struct.a, %struct.a* @c, i64 0, i32 1)>, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
+; CHECK-NEXT:    ret <4 x i32> [[TMP1]]
+;
+  %1 = insertelement <4 x %struct.a*> poison, %struct.a* @c, i32 0
+  %2 = shufflevector <4 x %struct.a*> %1, <4 x %struct.a*> undef, <4 x i32> zeroinitializer
+  %3 = getelementptr %struct.a, <4 x %struct.a*> %2, <4 x i64> zeroinitializer, i32 1
+  %4 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %3, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
+  ret <4 x i32> %4
+}
+
+define <4 x i32> @splat_ptr_gather(i32* %ptr, <4 x i1> %mask, <4 x i32> %passthru) {
+; CHECK-LABEL: @splat_ptr_gather(
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr i32, i32* [[PTR:%.*]], <4 x i64> zeroinitializer
+; CHECK-NEXT:    [[TMP2:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP1]], i32 4, <4 x i1> [[MASK:%.*]], <4 x i32> [[PASSTHRU:%.*]])
+; CHECK-NEXT:    ret <4 x i32> [[TMP2]]
+;
+  %1 = insertelement <4 x i32*> poison, i32* %ptr, i32 0
+  %2 = shufflevector <4 x i32*> %1, <4 x i32*> undef, <4 x i32> zeroinitializer
+  %3 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %2, i32 4, <4 x i1> %mask, <4 x i32> %passthru)
+  ret <4 x i32> %3
+}
+
+define void @splat_ptr_scatter(i32* %ptr, <4 x i1> %mask, <4 x i32> %val) {
+; CHECK-LABEL: @splat_ptr_scatter(
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr i32, i32* [[PTR:%.*]], <4 x i64> zeroinitializer
+; CHECK-NEXT:    call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> [[VAL:%.*]], <4 x i32*> [[TMP1]], i32 4, <4 x i1> [[MASK:%.*]])
+; CHECK-NEXT:    ret void
+;
+  %1 = insertelement <4 x i32*> poison, i32* %ptr, i32 0
+  %2 = shufflevector <4 x i32*> %1, <4 x i32*> undef, <4 x i32> zeroinitializer
+  call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %val, <4 x i32*> %2, i32 4, <4 x i1> %mask)
+  ret void
+}
+
+declare <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*>, i32, <4 x i1>, <4 x i32>)
+declare void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32>, <4 x i32*>, i32, <4 x i1>)
--- a/llvm/test/Transforms/CodeGenPrepare/X86/sink-addrmode-inseltpoison.ll
+++ b/llvm/test/Transforms/CodeGenPrepare/X86/sink-addrmode-inseltpoison.ll
@ -0,0 +1,321 @@
+; RUN: opt -S -codegenprepare < %s | FileCheck %s
+
+target datalayout =
+"e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@x = external global [1 x [2 x <4 x float>]]
+
+; Can we sink single addressing mode computation to use?
+define void @test1(i1 %cond, i64* %base) {
+; CHECK-LABEL: @test1
+; CHECK: getelementptr inbounds i8, {{.+}} 40
+entry:
+  %addr = getelementptr inbounds i64, i64* %base, i64 5
+  %casted = bitcast i64* %addr to i32*
+  br i1 %cond, label %if.then, label %fallthrough
+
+if.then:
+  %v = load i32, i32* %casted, align 4
+  br label %fallthrough
+
+fallthrough:
+  ret void
+}
+
+declare void @foo(i32)
+
+; Make sure sinking two copies of addressing mode into different blocks works
+define void @test2(i1 %cond, i64* %base) {
+; CHECK-LABEL: @test2
+entry:
+  %addr = getelementptr inbounds i64, i64* %base, i64 5
+  %casted = bitcast i64* %addr to i32*
+  br i1 %cond, label %if.then, label %fallthrough
+
+if.then:
+; CHECK-LABEL: if.then:
+; CHECK: getelementptr inbounds i8, {{.+}} 40
+  %v1 = load i32, i32* %casted, align 4
+  call void @foo(i32 %v1)
+  %cmp = icmp eq i32 %v1, 0
+  br i1 %cmp, label %next, label %fallthrough
+
+next:
+; CHECK-LABEL: next:
+; CHECK: getelementptr inbounds i8, {{.+}} 40
+  %v2 = load i32, i32* %casted, align 4
+  call void @foo(i32 %v2)
+  br label %fallthrough
+
+fallthrough:
+  ret void
+}
+
+; If we have two loads in the same block, only need one copy of addressing mode
+; - instruction selection will duplicate if needed
+define void @test3(i1 %cond, i64* %base) {
+; CHECK-LABEL: @test3
+entry:
+  %addr = getelementptr inbounds i64, i64* %base, i64 5
+  %casted = bitcast i64* %addr to i32*
+  br i1 %cond, label %if.then, label %fallthrough
+
+if.then:
+; CHECK-LABEL: if.then:
+; CHECK: getelementptr inbounds i8, {{.+}} 40
+  %v1 = load i32, i32* %casted, align 4
+  call void @foo(i32 %v1)
+; CHECK-NOT: getelementptr inbounds i8, {{.+}} 40
+  %v2 = load i32, i32* %casted, align 4
+  call void @foo(i32 %v2)
+  br label %fallthrough
+
+fallthrough:
+  ret void
+}
+
+; Can we still sink addressing mode if there's a cold use of the
+; address itself?  
+define void @test4(i1 %cond, i64* %base) {
+; CHECK-LABEL: @test4
+entry:
+  %addr = getelementptr inbounds i64, i64* %base, i64 5
+  %casted = bitcast i64* %addr to i32*
+  br i1 %cond, label %if.then, label %fallthrough
+
+if.then:
+; CHECK-LABEL: if.then:
+; CHECK: getelementptr inbounds i8, {{.+}} 40
+  %v1 = load i32, i32* %casted, align 4
+  call void @foo(i32 %v1)
+  %cmp = icmp eq i32 %v1, 0
+  br i1 %cmp, label %rare.1, label %fallthrough
+
+fallthrough:
+  ret void
+
+rare.1:
+; CHECK-LABEL: rare.1:
+; CHECK: getelementptr inbounds i8, {{.+}} 40
+  call void @slowpath(i32 %v1, i32* %casted) cold
+  br label %fallthrough
+}
+
+; Negative test - don't want to duplicate addressing into hot path
+define void @test5(i1 %cond, i64* %base) {
+; CHECK-LABEL: @test5
+entry:
+; CHECK: %addr = getelementptr inbounds
+  %addr = getelementptr inbounds i64, i64* %base, i64 5
+  %casted = bitcast i64* %addr to i32*
+  br i1 %cond, label %if.then, label %fallthrough
+
+if.then:
+; CHECK-LABEL: if.then:
+; CHECK-NOT: getelementptr inbounds i8, {{.+}} 40
+  %v1 = load i32, i32* %casted, align 4
+  call void @foo(i32 %v1)
+  %cmp = icmp eq i32 %v1, 0
+  br i1 %cmp, label %rare.1, label %fallthrough
+
+fallthrough:
+  ret void
+
+rare.1:
+  call void @slowpath(i32 %v1, i32* %casted) ;; NOT COLD
+  br label %fallthrough
+}
+
+; Negative test - opt for size
+define void @test6(i1 %cond, i64* %base) minsize {
+; CHECK-LABEL: @test6
+entry:
+; CHECK: %addr = getelementptr
+  %addr = getelementptr inbounds i64, i64* %base, i64 5
+  %casted = bitcast i64* %addr to i32*
+  br i1 %cond, label %if.then, label %fallthrough
+
+if.then:
+; CHECK-LABEL: if.then:
+; CHECK-NOT: getelementptr inbounds i8, {{.+}} 40
+  %v1 = load i32, i32* %casted, align 4
+  call void @foo(i32 %v1)
+  %cmp = icmp eq i32 %v1, 0
+  br i1 %cmp, label %rare.1, label %fallthrough
+
+fallthrough:
+  ret void
+
+rare.1:
+  call void @slowpath(i32 %v1, i32* %casted) cold
+  br label %fallthrough
+}
+
+; Negative test - opt for size
+define void @test6_pgso(i1 %cond, i64* %base) !prof !14 {
+; CHECK-LABEL: @test6
+entry:
+; CHECK: %addr = getelementptr
+  %addr = getelementptr inbounds i64, i64* %base, i64 5
+  %casted = bitcast i64* %addr to i32*
+  br i1 %cond, label %if.then, label %fallthrough
+
+if.then:
+; CHECK-LABEL: if.then:
+; CHECK-NOT: getelementptr inbounds i8, {{.+}} 40
+  %v1 = load i32, i32* %casted, align 4
+  call void @foo(i32 %v1)
+  %cmp = icmp eq i32 %v1, 0
+  br i1 %cmp, label %rare.1, label %fallthrough
+
+fallthrough:
+  ret void
+
+rare.1:
+  call void @slowpath(i32 %v1, i32* %casted) cold
+  br label %fallthrough
+}
+
+; Make sure sinking two copies of addressing mode into different blocks works
+; when there are cold paths for each.
+define void @test7(i1 %cond, i64* %base) {
+; CHECK-LABEL: @test7
+entry:
+  %addr = getelementptr inbounds i64, i64* %base, i64 5
+  %casted = bitcast i64* %addr to i32*
+  br i1 %cond, label %if.then, label %fallthrough
+
+if.then:
+; CHECK-LABEL: if.then:
+; CHECK: getelementptr inbounds i8, {{.+}} 40
+  %v1 = load i32, i32* %casted, align 4
+  call void @foo(i32 %v1)
+  %cmp = icmp eq i32 %v1, 0
+  br i1 %cmp, label %rare.1, label %next
+
+next:
+; CHECK-LABEL: next:
+; CHECK: getelementptr inbounds i8, {{.+}} 40
+  %v2 = load i32, i32* %casted, align 4
+  call void @foo(i32 %v2)
+  %cmp2 = icmp eq i32 %v2, 0
+  br i1 %cmp2, label %rare.1, label %fallthrough
+
+fallthrough:
+  ret void
+
+rare.1:
+; CHECK-LABEL: rare.1:
+; CHECK: getelementptr inbounds i8, {{.+}} 40
+  call void @slowpath(i32 %v1, i32* %casted) cold
+  br label %next
+
+rare.2:
+; CHECK-LABEL: rare.2:
+; CHECK: getelementptr inbounds i8, {{.+}} 40
+  call void @slowpath(i32 %v2, i32* %casted) cold
+  br label %fallthrough
+}
+
+declare void @slowpath(i32, i32*)
+
+; Make sure we don't end up in an infinite loop after we fail to sink.
+; CHECK-LABEL: define void @test8
+; CHECK: %ptr = getelementptr i8, i8* %aFOO_load_ptr2int_2void, i32 undef
+define void @test8() {
+allocas:
+  %aFOO_load = load float*, float** undef
+  %aFOO_load_ptr2int = ptrtoint float* %aFOO_load to i64
+  %aFOO_load_ptr2int_broadcast_init = insertelement <4 x i64> poison, i64 %aFOO_load_ptr2int, i32 0
+  %aFOO_load_ptr2int_2void = inttoptr i64 %aFOO_load_ptr2int to i8*
+  %ptr = getelementptr i8, i8* %aFOO_load_ptr2int_2void, i32 undef
+  br label %load.i145
+
+load.i145:
+  %ptr.i143 = bitcast i8* %ptr to <4 x float>*
+  %valall.i144 = load <4 x float>, <4 x float>* %ptr.i143, align 4
+  %x_offset = getelementptr [1 x [2 x <4 x float>]], [1 x [2 x <4 x float>]]* @x, i32 0, i64 0
+  br label %pl_loop.i.i122
+
+pl_loop.i.i122:
+  br label %pl_loop.i.i122
+}
+
+; Make sure we can sink address computation even
+; if there is a cycle in phi nodes.
+define void @test9(i1 %cond, i64* %base) {
+; CHECK-LABEL: @test9
+entry:
+  %addr = getelementptr inbounds i64, i64* %base, i64 5
+  %casted = bitcast i64* %addr to i32*
+  br label %header
+
+header:
+  %iv = phi i32 [0, %entry], [%iv.inc, %backedge]
+  %casted.loop = phi i32* [%casted, %entry], [%casted.merged, %backedge]
+  br i1 %cond, label %if.then, label %backedge
+
+if.then:
+  call void @foo(i32 %iv)
+  %addr.1 = getelementptr inbounds i64, i64* %base, i64 5
+  %casted.1 = bitcast i64* %addr.1 to i32*
+  br label %backedge
+
+backedge:
+; CHECK-LABEL: backedge:
+; CHECK: getelementptr inbounds i8, {{.+}} 40
+  %casted.merged = phi i32* [%casted.loop, %header], [%casted.1, %if.then]
+  %v = load i32, i32* %casted.merged, align 4
+  call void @foo(i32 %v)
+  %iv.inc = add i32 %iv, 1
+  %cmp = icmp slt i32 %iv.inc, 1000
+  br i1 %cmp, label %header, label %exit
+
+exit:
+  ret void
+}
+
+; Make sure we can eliminate a select when both arguments perform equivalent
+; address computation.
+define void @test10(i1 %cond, i64* %base) {
+; CHECK-LABEL: @test10
+; CHECK: getelementptr inbounds i8, {{.+}} 40
+; CHECK-NOT: select
+entry:
+  %gep1 = getelementptr inbounds i64, i64* %base, i64 5
+  %gep1.casted = bitcast i64* %gep1 to i32*
+  %base.casted = bitcast i64* %base to i32*
+  %gep2 = getelementptr inbounds i32, i32* %base.casted, i64 10
+  %casted.merged = select i1 %cond, i32* %gep1.casted, i32* %gep2
+  %v = load i32, i32* %casted.merged, align 4
+  call void @foo(i32 %v)
+  ret void
+}
+
+; Found by fuzzer, getSExtValue of > 64 bit constant
+define void @i96_mul(i1* %base, i96 %offset) {
+BB:
+  ;; RHS = 0x7FFFFFFFFFFFFFFFFFFFFFFF
+  %B84 = mul i96 %offset, 39614081257132168796771975167
+  %G23 = getelementptr i1, i1* %base, i96 %B84
+  store i1 false, i1* %G23
+  ret void
+}
+
+!llvm.module.flags = !{!0}
+!0 = !{i32 1, !"ProfileSummary", !1}
+!1 = !{!2, !3, !4, !5, !6, !7, !8, !9}
+!2 = !{!"ProfileFormat", !"InstrProf"}
+!3 = !{!"TotalCount", i64 10000}
+!4 = !{!"MaxCount", i64 10}
+!5 = !{!"MaxInternalCount", i64 1}
+!6 = !{!"MaxFunctionCount", i64 1000}
+!7 = !{!"NumCounts", i64 3}
+!8 = !{!"NumFunctions", i64 3}
+!9 = !{!"DetailedSummary", !10}
+!10 = !{!11, !12, !13}
+!11 = !{i32 10000, i64 100, i32 1}
+!12 = !{i32 999000, i64 100, i32 1}
+!13 = !{i32 999999, i64 1, i32 2}
+!14 = !{!"function_entry_count", i64 0}
--- a/llvm/test/Transforms/CodeGenPrepare/X86/vec-shift-inseltpoison.ll
+++ b/llvm/test/Transforms/CodeGenPrepare/X86/vec-shift-inseltpoison.ll
@ -0,0 +1,406 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -codegenprepare -mtriple=x86_64-- -mattr=+avx -S < %s | FileCheck %s --check-prefixes=ALL,AVX1
+; RUN: opt -codegenprepare -mtriple=x86_64-- -mattr=+avx2 -S < %s | FileCheck %s --check-prefixes=ALL,AVX2
+; RUN: opt -codegenprepare -mtriple=x86_64-- -mattr=+avx512bw -S < %s | FileCheck %s --check-prefixes=ALL,AVX512BW
+; RUN: opt -codegenprepare -mtriple=x86_64-- -mattr=+avx,+xop -S < %s | FileCheck %s --check-prefixes=ALL,XOP
+; RUN: opt -codegenprepare -mtriple=x86_64-- -mattr=+avx2,+xop -S < %s | FileCheck %s --check-prefixes=ALL,XOP
+; RUN: opt -codegenprepare -mtriple=x86_64-- -mattr=+avx -S -enable-debugify < %s 2>&1 | FileCheck %s -check-prefix=DEBUG
+
+define <4 x i32> @vector_variable_shift_right_v4i32(<4 x i1> %cond, <4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
+; AVX1-LABEL: @vector_variable_shift_right_v4i32(
+; AVX1-NEXT:    [[SPLAT1:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> undef, <4 x i32> zeroinitializer
+; AVX1-NEXT:    [[SPLAT2:%.*]] = shufflevector <4 x i32> [[Y:%.*]], <4 x i32> undef, <4 x i32> zeroinitializer
+; AVX1-NEXT:    [[SEL:%.*]] = select <4 x i1> [[COND:%.*]], <4 x i32> [[SPLAT1]], <4 x i32> [[SPLAT2]]
+; AVX1-NEXT:    [[TMP1:%.*]] = lshr <4 x i32> [[Z:%.*]], [[SPLAT1]]
+; AVX1-NEXT:    [[TMP2:%.*]] = lshr <4 x i32> [[Z]], [[SPLAT2]]
+; AVX1-NEXT:    [[TMP3:%.*]] = select <4 x i1> [[COND]], <4 x i32> [[TMP1]], <4 x i32> [[TMP2]]
+; AVX1-NEXT:    ret <4 x i32> [[TMP3]]
+;
+; AVX2-LABEL: @vector_variable_shift_right_v4i32(
+; AVX2-NEXT:    [[SPLAT1:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> undef, <4 x i32> zeroinitializer
+; AVX2-NEXT:    [[SPLAT2:%.*]] = shufflevector <4 x i32> [[Y:%.*]], <4 x i32> undef, <4 x i32> zeroinitializer
+; AVX2-NEXT:    [[SEL:%.*]] = select <4 x i1> [[COND:%.*]], <4 x i32> [[SPLAT1]], <4 x i32> [[SPLAT2]]
+; AVX2-NEXT:    [[SH:%.*]] = lshr <4 x i32> [[Z:%.*]], [[SEL]]
+; AVX2-NEXT:    ret <4 x i32> [[SH]]
+;
+; AVX512BW-LABEL: @vector_variable_shift_right_v4i32(
+; AVX512BW-NEXT:    [[SPLAT1:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> undef, <4 x i32> zeroinitializer
+; AVX512BW-NEXT:    [[SPLAT2:%.*]] = shufflevector <4 x i32> [[Y:%.*]], <4 x i32> undef, <4 x i32> zeroinitializer
+; AVX512BW-NEXT:    [[SEL:%.*]] = select <4 x i1> [[COND:%.*]], <4 x i32> [[SPLAT1]], <4 x i32> [[SPLAT2]]
+; AVX512BW-NEXT:    [[SH:%.*]] = lshr <4 x i32> [[Z:%.*]], [[SEL]]
+; AVX512BW-NEXT:    ret <4 x i32> [[SH]]
+;
+; XOP-LABEL: @vector_variable_shift_right_v4i32(
+; XOP-NEXT:    [[SPLAT1:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> undef, <4 x i32> zeroinitializer
+; XOP-NEXT:    [[SPLAT2:%.*]] = shufflevector <4 x i32> [[Y:%.*]], <4 x i32> undef, <4 x i32> zeroinitializer
+; XOP-NEXT:    [[SEL:%.*]] = select <4 x i1> [[COND:%.*]], <4 x i32> [[SPLAT1]], <4 x i32> [[SPLAT2]]
+; XOP-NEXT:    [[SH:%.*]] = lshr <4 x i32> [[Z:%.*]], [[SEL]]
+; XOP-NEXT:    ret <4 x i32> [[SH]]
+;
+  %splat1 = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> zeroinitializer
+  %splat2 = shufflevector <4 x i32> %y, <4 x i32> undef, <4 x i32> zeroinitializer
+  %sel = select <4 x i1> %cond, <4 x i32> %splat1, <4 x i32> %splat2
+  %sh = lshr <4 x i32> %z, %sel
+  ret <4 x i32> %sh
+}
+
+define <16 x i16> @vector_variable_shift_right_v16i16(<16 x i1> %cond, <16 x i16> %x, <16 x i16> %y, <16 x i16> %z) {
+; AVX1-LABEL: @vector_variable_shift_right_v16i16(
+; AVX1-NEXT:    [[SPLAT1:%.*]] = shufflevector <16 x i16> [[X:%.*]], <16 x i16> undef, <16 x i32> zeroinitializer
+; AVX1-NEXT:    [[SPLAT2:%.*]] = shufflevector <16 x i16> [[Y:%.*]], <16 x i16> undef, <16 x i32> zeroinitializer
+; AVX1-NEXT:    [[SEL:%.*]] = select <16 x i1> [[COND:%.*]], <16 x i16> [[SPLAT1]], <16 x i16> [[SPLAT2]]
+; AVX1-NEXT:    [[TMP1:%.*]] = lshr <16 x i16> [[Z:%.*]], [[SPLAT1]]
+; AVX1-NEXT:    [[TMP2:%.*]] = lshr <16 x i16> [[Z]], [[SPLAT2]]
+; AVX1-NEXT:    [[TMP3:%.*]] = select <16 x i1> [[COND]], <16 x i16> [[TMP1]], <16 x i16> [[TMP2]]
+; AVX1-NEXT:    ret <16 x i16> [[TMP3]]
+;
+; AVX2-LABEL: @vector_variable_shift_right_v16i16(
+; AVX2-NEXT:    [[SPLAT1:%.*]] = shufflevector <16 x i16> [[X:%.*]], <16 x i16> undef, <16 x i32> zeroinitializer
+; AVX2-NEXT:    [[SPLAT2:%.*]] = shufflevector <16 x i16> [[Y:%.*]], <16 x i16> undef, <16 x i32> zeroinitializer
+; AVX2-NEXT:    [[SEL:%.*]] = select <16 x i1> [[COND:%.*]], <16 x i16> [[SPLAT1]], <16 x i16> [[SPLAT2]]
+; AVX2-NEXT:    [[TMP1:%.*]] = lshr <16 x i16> [[Z:%.*]], [[SPLAT1]]
+; AVX2-NEXT:    [[TMP2:%.*]] = lshr <16 x i16> [[Z]], [[SPLAT2]]
+; AVX2-NEXT:    [[TMP3:%.*]] = select <16 x i1> [[COND]], <16 x i16> [[TMP1]], <16 x i16> [[TMP2]]
+; AVX2-NEXT:    ret <16 x i16> [[TMP3]]
+;
+; AVX512BW-LABEL: @vector_variable_shift_right_v16i16(
+; AVX512BW-NEXT:    [[SPLAT1:%.*]] = shufflevector <16 x i16> [[X:%.*]], <16 x i16> undef, <16 x i32> zeroinitializer
+; AVX512BW-NEXT:    [[SPLAT2:%.*]] = shufflevector <16 x i16> [[Y:%.*]], <16 x i16> undef, <16 x i32> zeroinitializer
+; AVX512BW-NEXT:    [[SEL:%.*]] = select <16 x i1> [[COND:%.*]], <16 x i16> [[SPLAT1]], <16 x i16> [[SPLAT2]]
+; AVX512BW-NEXT:    [[SH:%.*]] = lshr <16 x i16> [[Z:%.*]], [[SEL]]
+; AVX512BW-NEXT:    ret <16 x i16> [[SH]]
+;
+; XOP-LABEL: @vector_variable_shift_right_v16i16(
+; XOP-NEXT:    [[SPLAT1:%.*]] = shufflevector <16 x i16> [[X:%.*]], <16 x i16> undef, <16 x i32> zeroinitializer
+; XOP-NEXT:    [[SPLAT2:%.*]] = shufflevector <16 x i16> [[Y:%.*]], <16 x i16> undef, <16 x i32> zeroinitializer
+; XOP-NEXT:    [[SEL:%.*]] = select <16 x i1> [[COND:%.*]], <16 x i16> [[SPLAT1]], <16 x i16> [[SPLAT2]]
+; XOP-NEXT:    [[SH:%.*]] = lshr <16 x i16> [[Z:%.*]], [[SEL]]
+; XOP-NEXT:    ret <16 x i16> [[SH]]
+;
+  %splat1 = shufflevector <16 x i16> %x, <16 x i16> undef, <16 x i32> zeroinitializer
+  %splat2 = shufflevector <16 x i16> %y, <16 x i16> undef, <16 x i32> zeroinitializer
+  %sel = select <16 x i1> %cond, <16 x i16> %splat1, <16 x i16> %splat2
+  %sh = lshr <16 x i16> %z, %sel
+  ret <16 x i16> %sh
+}
+
+define <32 x i8> @vector_variable_shift_right_v32i8(<32 x i1> %cond, <32 x i8> %x, <32 x i8> %y, <32 x i8> %z) {
+; ALL-LABEL: @vector_variable_shift_right_v32i8(
+; ALL-NEXT:    [[SPLAT1:%.*]] = shufflevector <32 x i8> [[X:%.*]], <32 x i8> undef, <32 x i32> zeroinitializer
+; ALL-NEXT:    [[SPLAT2:%.*]] = shufflevector <32 x i8> [[Y:%.*]], <32 x i8> undef, <32 x i32> zeroinitializer
+; ALL-NEXT:    [[SEL:%.*]] = select <32 x i1> [[COND:%.*]], <32 x i8> [[SPLAT1]], <32 x i8> [[SPLAT2]]
+; ALL-NEXT:    [[SH:%.*]] = lshr <32 x i8> [[Z:%.*]], [[SEL]]
+; ALL-NEXT:    ret <32 x i8> [[SH]]
+;
+  %splat1 = shufflevector <32 x i8> %x, <32 x i8> undef, <32 x i32> zeroinitializer
+  %splat2 = shufflevector <32 x i8> %y, <32 x i8> undef, <32 x i32> zeroinitializer
+  %sel = select <32 x i1> %cond, <32 x i8> %splat1, <32 x i8> %splat2
+  %sh = lshr <32 x i8> %z, %sel
+  ret <32 x i8> %sh
+}
+
+; PR37428 - https://bugs.llvm.org/show_bug.cgi?id=37428
+
+define void @vector_variable_shift_left_loop(i32* nocapture %arr, i8* nocapture readonly %control, i32 %count, i32 %amt0, i32 %amt1, i32 %x) {
+; AVX1-LABEL: @vector_variable_shift_left_loop(
+; AVX1-NEXT:  entry:
+; AVX1-NEXT:    [[CMP16:%.*]] = icmp sgt i32 [[COUNT:%.*]], 0
+; AVX1-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[COUNT]] to i64
+; AVX1-NEXT:    br i1 [[CMP16]], label [[VECTOR_PH:%.*]], label [[EXIT:%.*]]
+; AVX1:       vector.ph:
+; AVX1-NEXT:    [[N_VEC:%.*]] = and i64 [[WIDE_TRIP_COUNT]], 4294967292
+; AVX1-NEXT:    [[SPLATINSERT18:%.*]] = insertelement <4 x i32> poison, i32 [[AMT0:%.*]], i32 0
+; AVX1-NEXT:    [[SPLAT1:%.*]] = shufflevector <4 x i32> [[SPLATINSERT18]], <4 x i32> undef, <4 x i32> zeroinitializer
+; AVX1-NEXT:    [[SPLATINSERT20:%.*]] = insertelement <4 x i32> poison, i32 [[AMT1:%.*]], i32 0
+; AVX1-NEXT:    [[SPLAT2:%.*]] = shufflevector <4 x i32> [[SPLATINSERT20]], <4 x i32> undef, <4 x i32> zeroinitializer
+; AVX1-NEXT:    [[SPLATINSERT22:%.*]] = insertelement <4 x i32> poison, i32 [[X:%.*]], i32 0
+; AVX1-NEXT:    [[SPLAT3:%.*]] = shufflevector <4 x i32> [[SPLATINSERT22]], <4 x i32> undef, <4 x i32> zeroinitializer
+; AVX1-NEXT:    br label [[VECTOR_BODY:%.*]]
+; AVX1:       vector.body:
+; AVX1-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; AVX1-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[CONTROL:%.*]], i64 [[INDEX]]
+; AVX1-NEXT:    [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <4 x i8>*
+; AVX1-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1
+; AVX1-NEXT:    [[TMP2:%.*]] = icmp eq <4 x i8> [[WIDE_LOAD]], zeroinitializer
+; AVX1-NEXT:    [[TMP3:%.*]] = select <4 x i1> [[TMP2]], <4 x i32> [[SPLAT1]], <4 x i32> [[SPLAT2]]
+; AVX1-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i32> [[SPLATINSERT18]], <4 x i32> undef, <4 x i32> zeroinitializer
+; AVX1-NEXT:    [[TMP5:%.*]] = shl <4 x i32> [[SPLAT3]], [[TMP4]]
+; AVX1-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i32> [[SPLATINSERT20]], <4 x i32> undef, <4 x i32> zeroinitializer
+; AVX1-NEXT:    [[TMP7:%.*]] = shl <4 x i32> [[SPLAT3]], [[TMP6]]
+; AVX1-NEXT:    [[TMP8:%.*]] = select <4 x i1> [[TMP2]], <4 x i32> [[TMP5]], <4 x i32> [[TMP7]]
+; AVX1-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[ARR:%.*]], i64 [[INDEX]]
+; AVX1-NEXT:    [[TMP10:%.*]] = bitcast i32* [[TMP9]] to <4 x i32>*
+; AVX1-NEXT:    store <4 x i32> [[TMP8]], <4 x i32>* [[TMP10]], align 4
+; AVX1-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
+; AVX1-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; AVX1-NEXT:    br i1 [[TMP11]], label [[EXIT]], label [[VECTOR_BODY]]
+; AVX1:       exit:
+; AVX1-NEXT:    ret void
+;
+; AVX2-LABEL: @vector_variable_shift_left_loop(
+; AVX2-NEXT:  entry:
+; AVX2-NEXT:    [[CMP16:%.*]] = icmp sgt i32 [[COUNT:%.*]], 0
+; AVX2-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[COUNT]] to i64
+; AVX2-NEXT:    br i1 [[CMP16]], label [[VECTOR_PH:%.*]], label [[EXIT:%.*]]
+; AVX2:       vector.ph:
+; AVX2-NEXT:    [[N_VEC:%.*]] = and i64 [[WIDE_TRIP_COUNT]], 4294967292
+; AVX2-NEXT:    [[SPLATINSERT18:%.*]] = insertelement <4 x i32> poison, i32 [[AMT0:%.*]], i32 0
+; AVX2-NEXT:    [[SPLAT1:%.*]] = shufflevector <4 x i32> [[SPLATINSERT18]], <4 x i32> undef, <4 x i32> zeroinitializer
+; AVX2-NEXT:    [[SPLATINSERT20:%.*]] = insertelement <4 x i32> poison, i32 [[AMT1:%.*]], i32 0
+; AVX2-NEXT:    [[SPLAT2:%.*]] = shufflevector <4 x i32> [[SPLATINSERT20]], <4 x i32> undef, <4 x i32> zeroinitializer
+; AVX2-NEXT:    [[SPLATINSERT22:%.*]] = insertelement <4 x i32> poison, i32 [[X:%.*]], i32 0
+; AVX2-NEXT:    [[SPLAT3:%.*]] = shufflevector <4 x i32> [[SPLATINSERT22]], <4 x i32> undef, <4 x i32> zeroinitializer
+; AVX2-NEXT:    br label [[VECTOR_BODY:%.*]]
+; AVX2:       vector.body:
+; AVX2-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; AVX2-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[CONTROL:%.*]], i64 [[INDEX]]
+; AVX2-NEXT:    [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <4 x i8>*
+; AVX2-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1
+; AVX2-NEXT:    [[TMP2:%.*]] = icmp eq <4 x i8> [[WIDE_LOAD]], zeroinitializer
+; AVX2-NEXT:    [[TMP3:%.*]] = select <4 x i1> [[TMP2]], <4 x i32> [[SPLAT1]], <4 x i32> [[SPLAT2]]
+; AVX2-NEXT:    [[TMP4:%.*]] = shl <4 x i32> [[SPLAT3]], [[TMP3]]
+; AVX2-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[ARR:%.*]], i64 [[INDEX]]
+; AVX2-NEXT:    [[TMP6:%.*]] = bitcast i32* [[TMP5]] to <4 x i32>*
+; AVX2-NEXT:    store <4 x i32> [[TMP4]], <4 x i32>* [[TMP6]], align 4
+; AVX2-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
+; AVX2-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; AVX2-NEXT:    br i1 [[TMP7]], label [[EXIT]], label [[VECTOR_BODY]]
+; AVX2:       exit:
+; AVX2-NEXT:    ret void
+;
+; AVX512BW-LABEL: @vector_variable_shift_left_loop(
+; AVX512BW-NEXT:  entry:
+; AVX512BW-NEXT:    [[CMP16:%.*]] = icmp sgt i32 [[COUNT:%.*]], 0
+; AVX512BW-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[COUNT]] to i64
+; AVX512BW-NEXT:    br i1 [[CMP16]], label [[VECTOR_PH:%.*]], label [[EXIT:%.*]]
+; AVX512BW:       vector.ph:
+; AVX512BW-NEXT:    [[N_VEC:%.*]] = and i64 [[WIDE_TRIP_COUNT]], 4294967292
+; AVX512BW-NEXT:    [[SPLATINSERT18:%.*]] = insertelement <4 x i32> poison, i32 [[AMT0:%.*]], i32 0
+; AVX512BW-NEXT:    [[SPLAT1:%.*]] = shufflevector <4 x i32> [[SPLATINSERT18]], <4 x i32> undef, <4 x i32> zeroinitializer
+; AVX512BW-NEXT:    [[SPLATINSERT20:%.*]] = insertelement <4 x i32> poison, i32 [[AMT1:%.*]], i32 0
+; AVX512BW-NEXT:    [[SPLAT2:%.*]] = shufflevector <4 x i32> [[SPLATINSERT20]], <4 x i32> undef, <4 x i32> zeroinitializer
+; AVX512BW-NEXT:    [[SPLATINSERT22:%.*]] = insertelement <4 x i32> poison, i32 [[X:%.*]], i32 0
+; AVX512BW-NEXT:    [[SPLAT3:%.*]] = shufflevector <4 x i32> [[SPLATINSERT22]], <4 x i32> undef, <4 x i32> zeroinitializer
+; AVX512BW-NEXT:    br label [[VECTOR_BODY:%.*]]
+; AVX512BW:       vector.body:
+; AVX512BW-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; AVX512BW-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[CONTROL:%.*]], i64 [[INDEX]]
+; AVX512BW-NEXT:    [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <4 x i8>*
+; AVX512BW-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1
+; AVX512BW-NEXT:    [[TMP2:%.*]] = icmp eq <4 x i8> [[WIDE_LOAD]], zeroinitializer
+; AVX512BW-NEXT:    [[TMP3:%.*]] = select <4 x i1> [[TMP2]], <4 x i32> [[SPLAT1]], <4 x i32> [[SPLAT2]]
+; AVX512BW-NEXT:    [[TMP4:%.*]] = shl <4 x i32> [[SPLAT3]], [[TMP3]]
+; AVX512BW-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[ARR:%.*]], i64 [[INDEX]]
+; AVX512BW-NEXT:    [[TMP6:%.*]] = bitcast i32* [[TMP5]] to <4 x i32>*
+; AVX512BW-NEXT:    store <4 x i32> [[TMP4]], <4 x i32>* [[TMP6]], align 4
+; AVX512BW-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
+; AVX512BW-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; AVX512BW-NEXT:    br i1 [[TMP7]], label [[EXIT]], label [[VECTOR_BODY]]
+; AVX512BW:       exit:
+; AVX512BW-NEXT:    ret void
+;
+; XOP-LABEL: @vector_variable_shift_left_loop(
+; XOP-NEXT:  entry:
+; XOP-NEXT:    [[CMP16:%.*]] = icmp sgt i32 [[COUNT:%.*]], 0
+; XOP-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[COUNT]] to i64
+; XOP-NEXT:    br i1 [[CMP16]], label [[VECTOR_PH:%.*]], label [[EXIT:%.*]]
+; XOP:       vector.ph:
+; XOP-NEXT:    [[N_VEC:%.*]] = and i64 [[WIDE_TRIP_COUNT]], 4294967292
+; XOP-NEXT:    [[SPLATINSERT18:%.*]] = insertelement <4 x i32> poison, i32 [[AMT0:%.*]], i32 0
+; XOP-NEXT:    [[SPLAT1:%.*]] = shufflevector <4 x i32> [[SPLATINSERT18]], <4 x i32> undef, <4 x i32> zeroinitializer
+; XOP-NEXT:    [[SPLATINSERT20:%.*]] = insertelement <4 x i32> poison, i32 [[AMT1:%.*]], i32 0
+; XOP-NEXT:    [[SPLAT2:%.*]] = shufflevector <4 x i32> [[SPLATINSERT20]], <4 x i32> undef, <4 x i32> zeroinitializer
+; XOP-NEXT:    [[SPLATINSERT22:%.*]] = insertelement <4 x i32> poison, i32 [[X:%.*]], i32 0
+; XOP-NEXT:    [[SPLAT3:%.*]] = shufflevector <4 x i32> [[SPLATINSERT22]], <4 x i32> undef, <4 x i32> zeroinitializer
+; XOP-NEXT:    br label [[VECTOR_BODY:%.*]]
+; XOP:       vector.body:
+; XOP-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; XOP-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[CONTROL:%.*]], i64 [[INDEX]]
+; XOP-NEXT:    [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <4 x i8>*
+; XOP-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1
+; XOP-NEXT:    [[TMP2:%.*]] = icmp eq <4 x i8> [[WIDE_LOAD]], zeroinitializer
+; XOP-NEXT:    [[TMP3:%.*]] = select <4 x i1> [[TMP2]], <4 x i32> [[SPLAT1]], <4 x i32> [[SPLAT2]]
+; XOP-NEXT:    [[TMP4:%.*]] = shl <4 x i32> [[SPLAT3]], [[TMP3]]
+; XOP-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[ARR:%.*]], i64 [[INDEX]]
+; XOP-NEXT:    [[TMP6:%.*]] = bitcast i32* [[TMP5]] to <4 x i32>*
+; XOP-NEXT:    store <4 x i32> [[TMP4]], <4 x i32>* [[TMP6]], align 4
+; XOP-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
+; XOP-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; XOP-NEXT:    br i1 [[TMP7]], label [[EXIT]], label [[VECTOR_BODY]]
+; XOP:       exit:
+; XOP-NEXT:    ret void
+;
+entry:
+  %cmp16 = icmp sgt i32 %count, 0
+  %wide.trip.count = zext i32 %count to i64
+  br i1 %cmp16, label %vector.ph, label %exit
+
+vector.ph:
+  %n.vec = and i64 %wide.trip.count, 4294967292
+  %splatinsert18 = insertelement <4 x i32> poison, i32 %amt0, i32 0
+  %splat1 = shufflevector <4 x i32> %splatinsert18, <4 x i32> undef, <4 x i32> zeroinitializer
+  %splatinsert20 = insertelement <4 x i32> poison, i32 %amt1, i32 0
+  %splat2 = shufflevector <4 x i32> %splatinsert20, <4 x i32> undef, <4 x i32> zeroinitializer
+  %splatinsert22 = insertelement <4 x i32> poison, i32 %x, i32 0
+  %splat3 = shufflevector <4 x i32> %splatinsert22, <4 x i32> undef, <4 x i32> zeroinitializer
+  br label %vector.body
+
+vector.body:
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %0 = getelementptr inbounds i8, i8* %control, i64 %index
+  %1 = bitcast i8* %0 to <4 x i8>*
+  %wide.load = load <4 x i8>, <4 x i8>* %1, align 1
+  %2 = icmp eq <4 x i8> %wide.load, zeroinitializer
+  %3 = select <4 x i1> %2, <4 x i32> %splat1, <4 x i32> %splat2
+  %4 = shl <4 x i32> %splat3, %3
+  %5 = getelementptr inbounds i32, i32* %arr, i64 %index
+  %6 = bitcast i32* %5 to <4 x i32>*
+  store <4 x i32> %4, <4 x i32>* %6, align 4
+  %index.next = add i64 %index, 4
+  %7 = icmp eq i64 %index.next, %n.vec
+  br i1 %7, label %exit, label %vector.body
+
+exit:
+  ret void
+}
+
+; PR37426 - https://bugs.llvm.org/show_bug.cgi?id=37426
+; If we don't have real vector shift instructions (AVX1), convert the funnel
+; shift into 2 funnel shifts and sink the splat shuffles into the loop.
+
+define void @fancierRotate2(i32* %arr, i8* %control, i32 %rot0, i32 %rot1) {
+; AVX1-LABEL: @fancierRotate2(
+; AVX1-NEXT:  entry:
+; AVX1-NEXT:    [[I0:%.*]] = insertelement <8 x i32> poison, i32 [[ROT0:%.*]], i32 0
+; AVX1-NEXT:    [[S0:%.*]] = shufflevector <8 x i32> [[I0]], <8 x i32> undef, <8 x i32> zeroinitializer
+; AVX1-NEXT:    [[I1:%.*]] = insertelement <8 x i32> poison, i32 [[ROT1:%.*]], i32 0
+; AVX1-NEXT:    [[S1:%.*]] = shufflevector <8 x i32> [[I1]], <8 x i32> undef, <8 x i32> zeroinitializer
+; AVX1-NEXT:    br label [[LOOP:%.*]]
+; AVX1:       loop:
+; AVX1-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[LOOP]] ]
+; AVX1-NEXT:    [[T0:%.*]] = getelementptr inbounds i8, i8* [[CONTROL:%.*]], i64 [[INDEX]]
+; AVX1-NEXT:    [[T1:%.*]] = bitcast i8* [[T0]] to <8 x i8>*
+; AVX1-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x i8>, <8 x i8>* [[T1]], align 1
+; AVX1-NEXT:    [[T2:%.*]] = icmp eq <8 x i8> [[WIDE_LOAD]], zeroinitializer
+; AVX1-NEXT:    [[SHAMT:%.*]] = select <8 x i1> [[T2]], <8 x i32> [[S0]], <8 x i32> [[S1]]
+; AVX1-NEXT:    [[T4:%.*]] = getelementptr inbounds i32, i32* [[ARR:%.*]], i64 [[INDEX]]
+; AVX1-NEXT:    [[T5:%.*]] = bitcast i32* [[T4]] to <8 x i32>*
+; AVX1-NEXT:    [[WIDE_LOAD21:%.*]] = load <8 x i32>, <8 x i32>* [[T5]], align 4
+; AVX1-NEXT:    [[TMP0:%.*]] = shufflevector <8 x i32> [[I0]], <8 x i32> undef, <8 x i32> zeroinitializer
+; AVX1-NEXT:    [[TMP1:%.*]] = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> [[WIDE_LOAD21]], <8 x i32> [[WIDE_LOAD21]], <8 x i32> [[TMP0]])
+; AVX1-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[I1]], <8 x i32> undef, <8 x i32> zeroinitializer
+; AVX1-NEXT:    [[TMP3:%.*]] = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> [[WIDE_LOAD21]], <8 x i32> [[WIDE_LOAD21]], <8 x i32> [[TMP2]])
+; AVX1-NEXT:    [[TMP4:%.*]] = select <8 x i1> [[T2]], <8 x i32> [[TMP1]], <8 x i32> [[TMP3]]
+; AVX1-NEXT:    store <8 x i32> [[TMP4]], <8 x i32>* [[T5]], align 4
+; AVX1-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 8
+; AVX1-NEXT:    [[T7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; AVX1-NEXT:    br i1 [[T7]], label [[EXIT:%.*]], label [[LOOP]]
+; AVX1:       exit:
+; AVX1-NEXT:    ret void
+;
+; AVX2-LABEL: @fancierRotate2(
+; AVX2-NEXT:  entry:
+; AVX2-NEXT:    [[I0:%.*]] = insertelement <8 x i32> poison, i32 [[ROT0:%.*]], i32 0
+; AVX2-NEXT:    [[S0:%.*]] = shufflevector <8 x i32> [[I0]], <8 x i32> undef, <8 x i32> zeroinitializer
+; AVX2-NEXT:    [[I1:%.*]] = insertelement <8 x i32> poison, i32 [[ROT1:%.*]], i32 0
+; AVX2-NEXT:    [[S1:%.*]] = shufflevector <8 x i32> [[I1]], <8 x i32> undef, <8 x i32> zeroinitializer
+; AVX2-NEXT:    br label [[LOOP:%.*]]
+; AVX2:       loop:
+; AVX2-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[LOOP]] ]
+; AVX2-NEXT:    [[T0:%.*]] = getelementptr inbounds i8, i8* [[CONTROL:%.*]], i64 [[INDEX]]
+; AVX2-NEXT:    [[T1:%.*]] = bitcast i8* [[T0]] to <8 x i8>*
+; AVX2-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x i8>, <8 x i8>* [[T1]], align 1
+; AVX2-NEXT:    [[T2:%.*]] = icmp eq <8 x i8> [[WIDE_LOAD]], zeroinitializer
+; AVX2-NEXT:    [[SHAMT:%.*]] = select <8 x i1> [[T2]], <8 x i32> [[S0]], <8 x i32> [[S1]]
+; AVX2-NEXT:    [[T4:%.*]] = getelementptr inbounds i32, i32* [[ARR:%.*]], i64 [[INDEX]]
+; AVX2-NEXT:    [[T5:%.*]] = bitcast i32* [[T4]] to <8 x i32>*
+; AVX2-NEXT:    [[WIDE_LOAD21:%.*]] = load <8 x i32>, <8 x i32>* [[T5]], align 4
+; AVX2-NEXT:    [[ROT:%.*]] = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> [[WIDE_LOAD21]], <8 x i32> [[WIDE_LOAD21]], <8 x i32> [[SHAMT]])
+; AVX2-NEXT:    store <8 x i32> [[ROT]], <8 x i32>* [[T5]], align 4
+; AVX2-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 8
+; AVX2-NEXT:    [[T7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; AVX2-NEXT:    br i1 [[T7]], label [[EXIT:%.*]], label [[LOOP]]
+; AVX2:       exit:
+; AVX2-NEXT:    ret void
+;
+; AVX512BW-LABEL: @fancierRotate2(
+; AVX512BW-NEXT:  entry:
+; AVX512BW-NEXT:    [[I0:%.*]] = insertelement <8 x i32> poison, i32 [[ROT0:%.*]], i32 0
+; AVX512BW-NEXT:    [[S0:%.*]] = shufflevector <8 x i32> [[I0]], <8 x i32> undef, <8 x i32> zeroinitializer
+; AVX512BW-NEXT:    [[I1:%.*]] = insertelement <8 x i32> poison, i32 [[ROT1:%.*]], i32 0
+; AVX512BW-NEXT:    [[S1:%.*]] = shufflevector <8 x i32> [[I1]], <8 x i32> undef, <8 x i32> zeroinitializer
+; AVX512BW-NEXT:    br label [[LOOP:%.*]]
+; AVX512BW:       loop:
+; AVX512BW-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[LOOP]] ]
+; AVX512BW-NEXT:    [[T0:%.*]] = getelementptr inbounds i8, i8* [[CONTROL:%.*]], i64 [[INDEX]]
+; AVX512BW-NEXT:    [[T1:%.*]] = bitcast i8* [[T0]] to <8 x i8>*
+; AVX512BW-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x i8>, <8 x i8>* [[T1]], align 1
+; AVX512BW-NEXT:    [[T2:%.*]] = icmp eq <8 x i8> [[WIDE_LOAD]], zeroinitializer
+; AVX512BW-NEXT:    [[SHAMT:%.*]] = select <8 x i1> [[T2]], <8 x i32> [[S0]], <8 x i32> [[S1]]
+; AVX512BW-NEXT:    [[T4:%.*]] = getelementptr inbounds i32, i32* [[ARR:%.*]], i64 [[INDEX]]
+; AVX512BW-NEXT:    [[T5:%.*]] = bitcast i32* [[T4]] to <8 x i32>*
+; AVX512BW-NEXT:    [[WIDE_LOAD21:%.*]] = load <8 x i32>, <8 x i32>* [[T5]], align 4
+; AVX512BW-NEXT:    [[ROT:%.*]] = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> [[WIDE_LOAD21]], <8 x i32> [[WIDE_LOAD21]], <8 x i32> [[SHAMT]])
+; AVX512BW-NEXT:    store <8 x i32> [[ROT]], <8 x i32>* [[T5]], align 4
+; AVX512BW-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 8
+; AVX512BW-NEXT:    [[T7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; AVX512BW-NEXT:    br i1 [[T7]], label [[EXIT:%.*]], label [[LOOP]]
+; AVX512BW:       exit:
+; AVX512BW-NEXT:    ret void
+;
+; XOP-LABEL: @fancierRotate2(
+; XOP-NEXT:  entry:
+; XOP-NEXT:    [[I0:%.*]] = insertelement <8 x i32> poison, i32 [[ROT0:%.*]], i32 0
+; XOP-NEXT:    [[S0:%.*]] = shufflevector <8 x i32> [[I0]], <8 x i32> undef, <8 x i32> zeroinitializer
+; XOP-NEXT:    [[I1:%.*]] = insertelement <8 x i32> poison, i32 [[ROT1:%.*]], i32 0
+; XOP-NEXT:    [[S1:%.*]] = shufflevector <8 x i32> [[I1]], <8 x i32> undef, <8 x i32> zeroinitializer
+; XOP-NEXT:    br label [[LOOP:%.*]]
+; XOP:       loop:
+; XOP-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[LOOP]] ]
+; XOP-NEXT:    [[T0:%.*]] = getelementptr inbounds i8, i8* [[CONTROL:%.*]], i64 [[INDEX]]
+; XOP-NEXT:    [[T1:%.*]] = bitcast i8* [[T0]] to <8 x i8>*
+; XOP-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x i8>, <8 x i8>* [[T1]], align 1
+; XOP-NEXT:    [[T2:%.*]] = icmp eq <8 x i8> [[WIDE_LOAD]], zeroinitializer
+; XOP-NEXT:    [[SHAMT:%.*]] = select <8 x i1> [[T2]], <8 x i32> [[S0]], <8 x i32> [[S1]]
+; XOP-NEXT:    [[T4:%.*]] = getelementptr inbounds i32, i32* [[ARR:%.*]], i64 [[INDEX]]
+; XOP-NEXT:    [[T5:%.*]] = bitcast i32* [[T4]] to <8 x i32>*
+; XOP-NEXT:    [[WIDE_LOAD21:%.*]] = load <8 x i32>, <8 x i32>* [[T5]], align 4
+; XOP-NEXT:    [[ROT:%.*]] = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> [[WIDE_LOAD21]], <8 x i32> [[WIDE_LOAD21]], <8 x i32> [[SHAMT]])
+; XOP-NEXT:    store <8 x i32> [[ROT]], <8 x i32>* [[T5]], align 4
+; XOP-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 8
+; XOP-NEXT:    [[T7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; XOP-NEXT:    br i1 [[T7]], label [[EXIT:%.*]], label [[LOOP]]
+; XOP:       exit:
+; XOP-NEXT:    ret void
+;
+entry:
+  %i0 = insertelement <8 x i32> poison, i32 %rot0, i32 0
+  %s0 = shufflevector <8 x i32> %i0, <8 x i32> undef, <8 x i32> zeroinitializer
+  %i1 = insertelement <8 x i32> poison, i32 %rot1, i32 0
+  %s1 = shufflevector <8 x i32> %i1, <8 x i32> undef, <8 x i32> zeroinitializer
+  br label %loop
+
+loop:
+  %index = phi i64 [ 0, %entry ], [ %index.next, %loop ]
+  %t0 = getelementptr inbounds i8, i8* %control, i64 %index
+  %t1 = bitcast i8* %t0 to <8 x i8>*
+  %wide.load = load <8 x i8>, <8 x i8>* %t1, align 1
+  %t2 = icmp eq <8 x i8> %wide.load, zeroinitializer
+  %shamt = select <8 x i1> %t2, <8 x i32> %s0, <8 x i32> %s1
+  %t4 = getelementptr inbounds i32, i32* %arr, i64 %index
+  %t5 = bitcast i32* %t4 to <8 x i32>*
+  %wide.load21 = load <8 x i32>, <8 x i32>* %t5, align 4
+  %rot = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %wide.load21, <8 x i32> %wide.load21, <8 x i32> %shamt)
+  store <8 x i32> %rot, <8 x i32>* %t5, align 4
+  %index.next = add i64 %index, 8
+  %t7 = icmp eq i64 %index.next, 1024
+  br i1 %t7, label %exit, label %loop
+
+exit:
+  ret void
+}
+
+declare <8 x i32> @llvm.fshl.v8i32(<8 x i32>, <8 x i32>, <8 x i32>) #1
+
+; Check that every instruction inserted by -codegenprepare has a debug location.
+; DEBUG: CheckModuleDebugify: PASS
--- a/llvm/test/Transforms/CodeGenPrepare/X86/x86-shuffle-sink-inseltpoison.ll
+++ b/llvm/test/Transforms/CodeGenPrepare/X86/x86-shuffle-sink-inseltpoison.ll
@ -0,0 +1,257 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -codegenprepare -mcpu=corei7 %s | FileCheck %s --check-prefixes=CHECK,CHECK-SSE2
+; RUN: opt -S -codegenprepare -mcpu=bdver2 %s | FileCheck %s --check-prefixes=CHECK,CHECK-XOP
+; RUN: opt -S -codegenprepare -mcpu=core-avx2 %s | FileCheck %s --check-prefixes=CHECK,CHECK-AVX,CHECK-AVX2
+; RUN: opt -S -codegenprepare -mcpu=skylake-avx512 %s | FileCheck %s --check-prefixes=CHECK,CHECK-AVX,CHECK-AVX512BW
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-darwin10.9.0"
+
+define <16 x i8> @test_8bit(<16 x i8> %lhs, <16 x i8> %tmp, i1 %tst) {
+; CHECK-LABEL: @test_8bit(
+; CHECK-NEXT:    [[MASK:%.*]] = shufflevector <16 x i8> [[TMP:%.*]], <16 x i8> undef, <16 x i32> zeroinitializer
+; CHECK-NEXT:    br i1 [[TST:%.*]], label [[IF_TRUE:%.*]], label [[IF_FALSE:%.*]]
+; CHECK:       if_true:
+; CHECK-NEXT:    ret <16 x i8> [[MASK]]
+; CHECK:       if_false:
+; CHECK-NEXT:    [[RES:%.*]] = shl <16 x i8> [[LHS:%.*]], [[MASK]]
+; CHECK-NEXT:    ret <16 x i8> [[RES]]
+;
+  %mask = shufflevector <16 x i8> %tmp, <16 x i8> undef, <16 x i32> zeroinitializer
+  br i1 %tst, label %if_true, label %if_false
+
+if_true:
+  ret <16 x i8> %mask
+
+if_false:
+  %res = shl <16 x i8> %lhs, %mask
+  ret <16 x i8> %res
+}
+
+define <8 x i16> @test_16bit(<8 x i16> %lhs, <8 x i16> %tmp, i1 %tst) {
+; CHECK-SSE2-LABEL: @test_16bit(
+; CHECK-SSE2-NEXT:    [[MASK:%.*]] = shufflevector <8 x i16> [[TMP:%.*]], <8 x i16> undef, <8 x i32> zeroinitializer
+; CHECK-SSE2-NEXT:    br i1 [[TST:%.*]], label [[IF_TRUE:%.*]], label [[IF_FALSE:%.*]]
+; CHECK-SSE2:       if_true:
+; CHECK-SSE2-NEXT:    ret <8 x i16> [[MASK]]
+; CHECK-SSE2:       if_false:
+; CHECK-SSE2-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i16> [[TMP]], <8 x i16> undef, <8 x i32> zeroinitializer
+; CHECK-SSE2-NEXT:    [[RES:%.*]] = shl <8 x i16> [[LHS:%.*]], [[TMP1]]
+; CHECK-SSE2-NEXT:    ret <8 x i16> [[RES]]
+;
+; CHECK-XOP-LABEL: @test_16bit(
+; CHECK-XOP-NEXT:    [[MASK:%.*]] = shufflevector <8 x i16> [[TMP:%.*]], <8 x i16> undef, <8 x i32> zeroinitializer
+; CHECK-XOP-NEXT:    br i1 [[TST:%.*]], label [[IF_TRUE:%.*]], label [[IF_FALSE:%.*]]
+; CHECK-XOP:       if_true:
+; CHECK-XOP-NEXT:    ret <8 x i16> [[MASK]]
+; CHECK-XOP:       if_false:
+; CHECK-XOP-NEXT:    [[RES:%.*]] = shl <8 x i16> [[LHS:%.*]], [[MASK]]
+; CHECK-XOP-NEXT:    ret <8 x i16> [[RES]]
+;
+; CHECK-AVX2-LABEL: @test_16bit(
+; CHECK-AVX2-NEXT:    [[MASK:%.*]] = shufflevector <8 x i16> [[TMP:%.*]], <8 x i16> undef, <8 x i32> zeroinitializer
+; CHECK-AVX2-NEXT:    br i1 [[TST:%.*]], label [[IF_TRUE:%.*]], label [[IF_FALSE:%.*]]
+; CHECK-AVX2:       if_true:
+; CHECK-AVX2-NEXT:    ret <8 x i16> [[MASK]]
+; CHECK-AVX2:       if_false:
+; CHECK-AVX2-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i16> [[TMP]], <8 x i16> undef, <8 x i32> zeroinitializer
+; CHECK-AVX2-NEXT:    [[RES:%.*]] = shl <8 x i16> [[LHS:%.*]], [[TMP1]]
+; CHECK-AVX2-NEXT:    ret <8 x i16> [[RES]]
+;
+; CHECK-AVX512BW-LABEL: @test_16bit(
+; CHECK-AVX512BW-NEXT:    [[MASK:%.*]] = shufflevector <8 x i16> [[TMP:%.*]], <8 x i16> undef, <8 x i32> zeroinitializer
+; CHECK-AVX512BW-NEXT:    br i1 [[TST:%.*]], label [[IF_TRUE:%.*]], label [[IF_FALSE:%.*]]
+; CHECK-AVX512BW:       if_true:
+; CHECK-AVX512BW-NEXT:    ret <8 x i16> [[MASK]]
+; CHECK-AVX512BW:       if_false:
+; CHECK-AVX512BW-NEXT:    [[RES:%.*]] = shl <8 x i16> [[LHS:%.*]], [[MASK]]
+; CHECK-AVX512BW-NEXT:    ret <8 x i16> [[RES]]
+;
+  %mask = shufflevector <8 x i16> %tmp, <8 x i16> undef, <8 x i32> zeroinitializer
+  br i1 %tst, label %if_true, label %if_false
+
+if_true:
+  ret <8 x i16> %mask
+
+if_false:
+  %res = shl <8 x i16> %lhs, %mask
+  ret <8 x i16> %res
+}
+
+define <4 x i32> @test_notsplat(<4 x i32> %lhs, <4 x i32> %tmp, i1 %tst) {
+; CHECK-LABEL: @test_notsplat(
+; CHECK-NEXT:    [[MASK:%.*]] = shufflevector <4 x i32> [[TMP:%.*]], <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 1, i32 0>
+; CHECK-NEXT:    br i1 [[TST:%.*]], label [[IF_TRUE:%.*]], label [[IF_FALSE:%.*]]
+; CHECK:       if_true:
+; CHECK-NEXT:    ret <4 x i32> [[MASK]]
+; CHECK:       if_false:
+; CHECK-NEXT:    [[RES:%.*]] = shl <4 x i32> [[LHS:%.*]], [[MASK]]
+; CHECK-NEXT:    ret <4 x i32> [[RES]]
+;
+  %mask = shufflevector <4 x i32> %tmp, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 1, i32 0>
+  br i1 %tst, label %if_true, label %if_false
+
+if_true:
+  ret <4 x i32> %mask
+
+if_false:
+  %res = shl <4 x i32> %lhs, %mask
+  ret <4 x i32> %res
+}
+
+define <4 x i32> @test_32bit(<4 x i32> %lhs, <4 x i32> %tmp, i1 %tst) {
+; CHECK-SSE2-LABEL: @test_32bit(
+; CHECK-SSE2-NEXT:    [[MASK:%.*]] = shufflevector <4 x i32> [[TMP:%.*]], <4 x i32> undef, <4 x i32> <i32 0, i32 undef, i32 0, i32 0>
+; CHECK-SSE2-NEXT:    br i1 [[TST:%.*]], label [[IF_TRUE:%.*]], label [[IF_FALSE:%.*]]
+; CHECK-SSE2:       if_true:
+; CHECK-SSE2-NEXT:    ret <4 x i32> [[MASK]]
+; CHECK-SSE2:       if_false:
+; CHECK-SSE2-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[TMP]], <4 x i32> undef, <4 x i32> <i32 0, i32 undef, i32 0, i32 0>
+; CHECK-SSE2-NEXT:    [[RES:%.*]] = ashr <4 x i32> [[LHS:%.*]], [[TMP1]]
+; CHECK-SSE2-NEXT:    ret <4 x i32> [[RES]]
+;
+; CHECK-XOP-LABEL: @test_32bit(
+; CHECK-XOP-NEXT:    [[MASK:%.*]] = shufflevector <4 x i32> [[TMP:%.*]], <4 x i32> undef, <4 x i32> <i32 0, i32 undef, i32 0, i32 0>
+; CHECK-XOP-NEXT:    br i1 [[TST:%.*]], label [[IF_TRUE:%.*]], label [[IF_FALSE:%.*]]
+; CHECK-XOP:       if_true:
+; CHECK-XOP-NEXT:    ret <4 x i32> [[MASK]]
+; CHECK-XOP:       if_false:
+; CHECK-XOP-NEXT:    [[RES:%.*]] = ashr <4 x i32> [[LHS:%.*]], [[MASK]]
+; CHECK-XOP-NEXT:    ret <4 x i32> [[RES]]
+;
+; CHECK-AVX-LABEL: @test_32bit(
+; CHECK-AVX-NEXT:    [[MASK:%.*]] = shufflevector <4 x i32> [[TMP:%.*]], <4 x i32> undef, <4 x i32> <i32 0, i32 undef, i32 0, i32 0>
+; CHECK-AVX-NEXT:    br i1 [[TST:%.*]], label [[IF_TRUE:%.*]], label [[IF_FALSE:%.*]]
+; CHECK-AVX:       if_true:
+; CHECK-AVX-NEXT:    ret <4 x i32> [[MASK]]
+; CHECK-AVX:       if_false:
+; CHECK-AVX-NEXT:    [[RES:%.*]] = ashr <4 x i32> [[LHS:%.*]], [[MASK]]
+; CHECK-AVX-NEXT:    ret <4 x i32> [[RES]]
+;
+  %mask = shufflevector <4 x i32> %tmp, <4 x i32> undef, <4 x i32> <i32 0, i32 undef, i32 0, i32 0>
+  br i1 %tst, label %if_true, label %if_false
+
+if_true:
+  ret <4 x i32> %mask
+
+if_false:
+  %res = ashr <4 x i32> %lhs, %mask
+  ret <4 x i32> %res
+}
+
+define <2 x i64> @test_64bit(<2 x i64> %lhs, <2 x i64> %tmp, i1 %tst) {
+; CHECK-SSE2-LABEL: @test_64bit(
+; CHECK-SSE2-NEXT:    [[MASK:%.*]] = shufflevector <2 x i64> [[TMP:%.*]], <2 x i64> undef, <2 x i32> zeroinitializer
+; CHECK-SSE2-NEXT:    br i1 [[TST:%.*]], label [[IF_TRUE:%.*]], label [[IF_FALSE:%.*]]
+; CHECK-SSE2:       if_true:
+; CHECK-SSE2-NEXT:    ret <2 x i64> [[MASK]]
+; CHECK-SSE2:       if_false:
+; CHECK-SSE2-NEXT:    [[TMP1:%.*]] = shufflevector <2 x i64> [[TMP]], <2 x i64> undef, <2 x i32> zeroinitializer
+; CHECK-SSE2-NEXT:    [[RES:%.*]] = lshr <2 x i64> [[LHS:%.*]], [[TMP1]]
+; CHECK-SSE2-NEXT:    ret <2 x i64> [[RES]]
+;
+; CHECK-XOP-LABEL: @test_64bit(
+; CHECK-XOP-NEXT:    [[MASK:%.*]] = shufflevector <2 x i64> [[TMP:%.*]], <2 x i64> undef, <2 x i32> zeroinitializer
+; CHECK-XOP-NEXT:    br i1 [[TST:%.*]], label [[IF_TRUE:%.*]], label [[IF_FALSE:%.*]]
+; CHECK-XOP:       if_true:
+; CHECK-XOP-NEXT:    ret <2 x i64> [[MASK]]
+; CHECK-XOP:       if_false:
+; CHECK-XOP-NEXT:    [[RES:%.*]] = lshr <2 x i64> [[LHS:%.*]], [[MASK]]
+; CHECK-XOP-NEXT:    ret <2 x i64> [[RES]]
+;
+; CHECK-AVX-LABEL: @test_64bit(
+; CHECK-AVX-NEXT:    [[MASK:%.*]] = shufflevector <2 x i64> [[TMP:%.*]], <2 x i64> undef, <2 x i32> zeroinitializer
+; CHECK-AVX-NEXT:    br i1 [[TST:%.*]], label [[IF_TRUE:%.*]], label [[IF_FALSE:%.*]]
+; CHECK-AVX:       if_true:
+; CHECK-AVX-NEXT:    ret <2 x i64> [[MASK]]
+; CHECK-AVX:       if_false:
+; CHECK-AVX-NEXT:    [[RES:%.*]] = lshr <2 x i64> [[LHS:%.*]], [[MASK]]
+; CHECK-AVX-NEXT:    ret <2 x i64> [[RES]]
+;
+  %mask = shufflevector <2 x i64> %tmp, <2 x i64> undef, <2 x i32> zeroinitializer
+  br i1 %tst, label %if_true, label %if_false
+
+if_true:
+  ret <2 x i64> %mask
+
+if_false:
+  %res = lshr <2 x i64> %lhs, %mask
+  ret <2 x i64> %res
+}
+
+define void @funnel_splatvar(i32* nocapture %arr, i32 %rot) {
+; CHECK-SSE2-LABEL: @funnel_splatvar(
+; CHECK-SSE2-NEXT:  entry:
+; CHECK-SSE2-NEXT:    [[BROADCAST_SPLATINSERT15:%.*]] = insertelement <8 x i32> poison, i32 [[ROT:%.*]], i32 0
+; CHECK-SSE2-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK-SSE2:       vector.body:
+; CHECK-SSE2-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-SSE2-NEXT:    [[T0:%.*]] = getelementptr inbounds i32, i32* [[ARR:%.*]], i64 [[INDEX]]
+; CHECK-SSE2-NEXT:    [[T1:%.*]] = bitcast i32* [[T0]] to <8 x i32>*
+; CHECK-SSE2-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32>* [[T1]], align 4
+; CHECK-SSE2-NEXT:    [[TMP0:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT15]], <8 x i32> undef, <8 x i32> zeroinitializer
+; CHECK-SSE2-NEXT:    [[T2:%.*]] = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> [[WIDE_LOAD]], <8 x i32> [[WIDE_LOAD]], <8 x i32> [[TMP0]])
+; CHECK-SSE2-NEXT:    store <8 x i32> [[T2]], <8 x i32>* [[T1]], align 4
+; CHECK-SSE2-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 8
+; CHECK-SSE2-NEXT:    [[T3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 65536
+; CHECK-SSE2-NEXT:    br i1 [[T3]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]]
+; CHECK-SSE2:       for.cond.cleanup:
+; CHECK-SSE2-NEXT:    ret void
+;
+; CHECK-XOP-LABEL: @funnel_splatvar(
+; CHECK-XOP-NEXT:  entry:
+; CHECK-XOP-NEXT:    [[BROADCAST_SPLATINSERT15:%.*]] = insertelement <8 x i32> poison, i32 [[ROT:%.*]], i32 0
+; CHECK-XOP-NEXT:    [[BROADCAST_SPLAT16:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT15]], <8 x i32> undef, <8 x i32> zeroinitializer
+; CHECK-XOP-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK-XOP:       vector.body:
+; CHECK-XOP-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-XOP-NEXT:    [[T0:%.*]] = getelementptr inbounds i32, i32* [[ARR:%.*]], i64 [[INDEX]]
+; CHECK-XOP-NEXT:    [[T1:%.*]] = bitcast i32* [[T0]] to <8 x i32>*
+; CHECK-XOP-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32>* [[T1]], align 4
+; CHECK-XOP-NEXT:    [[T2:%.*]] = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> [[WIDE_LOAD]], <8 x i32> [[WIDE_LOAD]], <8 x i32> [[BROADCAST_SPLAT16]])
+; CHECK-XOP-NEXT:    store <8 x i32> [[T2]], <8 x i32>* [[T1]], align 4
+; CHECK-XOP-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 8
+; CHECK-XOP-NEXT:    [[T3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 65536
+; CHECK-XOP-NEXT:    br i1 [[T3]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]]
+; CHECK-XOP:       for.cond.cleanup:
+; CHECK-XOP-NEXT:    ret void
+;
+; CHECK-AVX-LABEL: @funnel_splatvar(
+; CHECK-AVX-NEXT:  entry:
+; CHECK-AVX-NEXT:    [[BROADCAST_SPLATINSERT15:%.*]] = insertelement <8 x i32> poison, i32 [[ROT:%.*]], i32 0
+; CHECK-AVX-NEXT:    [[BROADCAST_SPLAT16:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT15]], <8 x i32> undef, <8 x i32> zeroinitializer
+; CHECK-AVX-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK-AVX:       vector.body:
+; CHECK-AVX-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-AVX-NEXT:    [[T0:%.*]] = getelementptr inbounds i32, i32* [[ARR:%.*]], i64 [[INDEX]]
+; CHECK-AVX-NEXT:    [[T1:%.*]] = bitcast i32* [[T0]] to <8 x i32>*
+; CHECK-AVX-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32>* [[T1]], align 4
+; CHECK-AVX-NEXT:    [[T2:%.*]] = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> [[WIDE_LOAD]], <8 x i32> [[WIDE_LOAD]], <8 x i32> [[BROADCAST_SPLAT16]])
+; CHECK-AVX-NEXT:    store <8 x i32> [[T2]], <8 x i32>* [[T1]], align 4
+; CHECK-AVX-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 8
+; CHECK-AVX-NEXT:    [[T3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 65536
+; CHECK-AVX-NEXT:    br i1 [[T3]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]]
+; CHECK-AVX:       for.cond.cleanup:
+; CHECK-AVX-NEXT:    ret void
+;
+entry:
+  %broadcast.splatinsert15 = insertelement <8 x i32> poison, i32 %rot, i32 0
+  %broadcast.splat16 = shufflevector <8 x i32> %broadcast.splatinsert15, <8 x i32> undef, <8 x i32> zeroinitializer
+  br label %vector.body
+
+vector.body:
+  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
+  %t0 = getelementptr inbounds i32, i32* %arr, i64 %index
+  %t1 = bitcast i32* %t0 to <8 x i32>*
+  %wide.load = load <8 x i32>, <8 x i32>* %t1, align 4
+  %t2 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %wide.load, <8 x i32> %wide.load, <8 x i32> %broadcast.splat16)
+  store <8 x i32> %t2, <8 x i32>* %t1, align 4
+  %index.next = add i64 %index, 8
+  %t3 = icmp eq i64 %index.next, 65536
+  br i1 %t3, label %for.cond.cleanup, label %vector.body
+
+for.cond.cleanup:
+  ret void
+}
+
+declare <8 x i32> @llvm.fshl.v8i32(<8 x i32>, <8 x i32>, <8 x i32>)
--- a/llvm/test/Transforms/GVN/2016-08-30-MaskedScatterGather-inseltpoison.ll
+++ b/llvm/test/Transforms/GVN/2016-08-30-MaskedScatterGather-inseltpoison.ll
@ -0,0 +1,42 @@
+; RUN: opt < %s -basic-aa -gvn -S | FileCheck %s
+
+declare void @llvm.masked.scatter.v2i32.v2p0i32(<2 x i32> , <2 x i32*> , i32 , <2 x i1> )
+declare <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*>, i32, <2 x i1>, <2 x i32>)
+
+; This test ensures that masked scatter and gather operations, which take vectors of pointers,
+; do not have pointer aliasing ignored when being processed.
+; No scatter/gather calls should end up eliminated
+; CHECK: llvm.masked.gather
+; CHECK: llvm.masked.gather
+; CHECK: llvm.masked.scatter
+; CHECK: llvm.masked.gather
+; CHECK: llvm.masked.scatter
+; CHECK: llvm.masked.gather
+define spir_kernel void @test(<2 x i32*> %in1, <2 x i32*> %in2, i32* %out) {
+entry:
+  ; Just some temporary storage
+  %tmp.0 = alloca i32
+  %tmp.1 = alloca i32
+  %tmp.i = insertelement <2 x i32*> poison, i32* %tmp.0, i32 0
+  %tmp = insertelement <2 x i32*> %tmp.i, i32* %tmp.1, i32 1
+  ; Read from in1 and in2
+  %in1.v = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> %in1, i32 1, <2 x i1> <i1 true, i1 true>, <2 x i32> undef) #1
+  %in2.v = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> %in2, i32 1, <2 x i1> <i1 true, i1 true>, <2 x i32> undef) #1
+  ; Store in1 to the allocas
+  call void @llvm.masked.scatter.v2i32.v2p0i32(<2 x i32> %in1.v, <2 x i32*> %tmp, i32 1, <2 x i1> <i1 true, i1 true>);
+  ; Read in1 from the allocas
+  ; This gather should alias the scatter we just saw
+  %tmp.v.0 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> %tmp, i32 1, <2 x i1> <i1 true, i1 true>, <2 x i32> undef) #1
+  ; Store in2 to the allocas
+  call void @llvm.masked.scatter.v2i32.v2p0i32(<2 x i32> %in2.v, <2 x i32*> %tmp, i32 1, <2 x i1> <i1 true, i1 true>);
+  ; Read in2 from the allocas
+  ; This gather should alias the scatter we just saw, and not be eliminated
+  %tmp.v.1 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> %tmp, i32 1, <2 x i1> <i1 true, i1 true>, <2 x i32> undef) #1
+  ; Store in2 to out for good measure
+  %tmp.v.1.0 = extractelement <2 x i32> %tmp.v.1, i32 0
+  %tmp.v.1.1 = extractelement <2 x i32> %tmp.v.1, i32 1
+  store i32 %tmp.v.1.0, i32* %out
+  %out.1 = getelementptr i32, i32* %out, i32 1
+  store i32 %tmp.v.1.1, i32* %out.1
+  ret void
+}
--- a/llvm/test/Transforms/GVN/constexpr-vector-constainsundef-crash-inseltpoison.ll
+++ b/llvm/test/Transforms/GVN/constexpr-vector-constainsundef-crash-inseltpoison.ll
@ -0,0 +1,25 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -gvn -S %s | FileCheck %s
+
+; Reduced test case from
+; https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=24278
+
+; Make sure we do not crash when dealing with a vector constant expression.
+define <4 x i64*> @test(i64* %ptr) {
+; CHECK-LABEL: @test(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[L3:%.*]] = load i64, i64* [[PTR:%.*]], align 4
+; CHECK-NEXT:    [[I6:%.*]] = insertelement <4 x i64*> getelementptr (i64, i64* null, <4 x i64> <i64 poison, i64 poison, i64 poison, i64 -128>), i64* undef, i64 [[L3]]
+; CHECK-NEXT:    ret <4 x i64*> [[I6]]
+;
+entry:
+  %B9 = sdiv i16 -32768, 256
+  %L3 = load i64, i64* %ptr, align 4
+  %B3 = sub i16 0, %B9
+  %0 = insertelement <4 x i16> poison, i16 %B3, i32 3
+  %1 = sub <4 x i16> zeroinitializer, %0
+  %2 = sext <4 x i16> %1 to <4 x i32>
+  %3 = getelementptr inbounds i64, i64* null, <4 x i32> %2
+  %I6 = insertelement <4 x i64*> %3, i64* undef, i64 %L3
+  ret <4 x i64*> %I6
+}
--- a/llvm/test/Transforms/GVN/non-integral-pointers-inseltpoison.ll
+++ b/llvm/test/Transforms/GVN/non-integral-pointers-inseltpoison.ll
@ -0,0 +1,456 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -gvn -S < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128-ni:4:5"
+target triple = "x86_64-unknown-linux-gnu"
+
+define void @f0(i1 %alwaysFalse, i64 %val, i64* %loc) {
+; CHECK-LABEL: @f0(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    store i64 [[VAL:%.*]], i64* [[LOC:%.*]], align 8
+; CHECK-NEXT:    br i1 [[ALWAYSFALSE:%.*]], label [[NEVERTAKEN:%.*]], label [[ALWAYSTAKEN:%.*]]
+; CHECK:       neverTaken:
+; CHECK-NEXT:    [[LOC_BC:%.*]] = bitcast i64* [[LOC]] to i8 addrspace(4)**
+; CHECK-NEXT:    [[PTR:%.*]] = load i8 addrspace(4)*, i8 addrspace(4)** [[LOC_BC]], align 8
+; CHECK-NEXT:    store i8 5, i8 addrspace(4)* [[PTR]], align 1
+; CHECK-NEXT:    ret void
+; CHECK:       alwaysTaken:
+; CHECK-NEXT:    ret void
+;
+  entry:
+  store i64 %val, i64* %loc
+  br i1 %alwaysFalse, label %neverTaken, label %alwaysTaken
+
+  neverTaken:
+  %loc.bc = bitcast i64* %loc to i8 addrspace(4)**
+  %ptr = load i8 addrspace(4)*, i8 addrspace(4)** %loc.bc
+  store i8 5, i8 addrspace(4)* %ptr
+  ret void
+
+  alwaysTaken:
+  ret void
+}
+
+define i64 @f1(i1 %alwaysFalse, i8 addrspace(4)* %val, i8 addrspace(4)** %loc) {
+; CHECK-LABEL: @f1(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    store i8 addrspace(4)* [[VAL:%.*]], i8 addrspace(4)** [[LOC:%.*]], align 8
+; CHECK-NEXT:    br i1 [[ALWAYSFALSE:%.*]], label [[NEVERTAKEN:%.*]], label [[ALWAYSTAKEN:%.*]]
+; CHECK:       neverTaken:
+; CHECK-NEXT:    [[LOC_BC:%.*]] = bitcast i8 addrspace(4)** [[LOC]] to i64*
+; CHECK-NEXT:    [[INT:%.*]] = load i64, i64* [[LOC_BC]], align 8
+; CHECK-NEXT:    ret i64 [[INT]]
+; CHECK:       alwaysTaken:
+; CHECK-NEXT:    ret i64 42
+;
+  entry:
+  store i8 addrspace(4)* %val, i8 addrspace(4)** %loc
+  br i1 %alwaysFalse, label %neverTaken, label %alwaysTaken
+
+  neverTaken:
+  %loc.bc = bitcast i8 addrspace(4)** %loc to i64*
+  %int = load i64, i64* %loc.bc
+  ret i64 %int
+
+  alwaysTaken:
+  ret i64 42
+}
+
+;; Note: For terseness, we stop using the %alwaysfalse trick for the
+;; tests below and just exercise the bits of forwarding logic directly.
+
+declare void @llvm.memset.p4i8.i64(i8 addrspace(4)* nocapture, i8, i64, i1) nounwind
+
+; Can't forward as the load might be dead.  (Pretend we wrote out the alwaysfalse idiom above.)
+define i8 addrspace(4)* @neg_forward_memset(i8 addrspace(4)* addrspace(4)* %loc) {
+; CHECK-LABEL: @neg_forward_memset(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[LOC_BC:%.*]] = bitcast i8 addrspace(4)* addrspace(4)* [[LOC:%.*]] to i8 addrspace(4)*
+; CHECK-NEXT:    call void @llvm.memset.p4i8.i64(i8 addrspace(4)* align 4 [[LOC_BC]], i8 7, i64 8, i1 false)
+; CHECK-NEXT:    [[REF:%.*]] = load i8 addrspace(4)*, i8 addrspace(4)* addrspace(4)* [[LOC]], align 8
+; CHECK-NEXT:    ret i8 addrspace(4)* [[REF]]
+;
+  entry:
+  %loc.bc = bitcast i8 addrspace(4)* addrspace(4)* %loc to i8 addrspace(4)*
+  call void @llvm.memset.p4i8.i64(i8 addrspace(4)* align 4 %loc.bc, i8 7, i64 8, i1 false)
+  %ref = load i8 addrspace(4)*, i8 addrspace(4)* addrspace(4)* %loc
+  ret i8 addrspace(4)* %ref
+}
+
+define <1 x i8 addrspace(4)*> @neg_forward_memset_vload(<1 x i8 addrspace(4)*> addrspace(4)* %loc) {
+; CHECK-LABEL: @neg_forward_memset_vload(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[LOC_BC:%.*]] = bitcast <1 x i8 addrspace(4)*> addrspace(4)* [[LOC:%.*]] to i8 addrspace(4)*
+; CHECK-NEXT:    call void @llvm.memset.p4i8.i64(i8 addrspace(4)* align 4 [[LOC_BC]], i8 7, i64 8, i1 false)
+; CHECK-NEXT:    [[REF:%.*]] = load <1 x i8 addrspace(4)*>, <1 x i8 addrspace(4)*> addrspace(4)* [[LOC]], align 8
+; CHECK-NEXT:    ret <1 x i8 addrspace(4)*> [[REF]]
+;
+  entry:
+  %loc.bc = bitcast <1 x i8 addrspace(4)*> addrspace(4)* %loc to i8 addrspace(4)*
+  call void @llvm.memset.p4i8.i64(i8 addrspace(4)* align 4 %loc.bc, i8 7, i64 8, i1 false)
+  %ref = load <1 x i8 addrspace(4)*>, <1 x i8 addrspace(4)*> addrspace(4)* %loc
+  ret <1 x i8 addrspace(4)*> %ref
+}
+
+
+; Can forward since we can do so w/o breaking types
+define i8 addrspace(4)* @forward_memset_zero(i8 addrspace(4)* addrspace(4)* %loc) {
+; CHECK-LABEL: @forward_memset_zero(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[LOC_BC:%.*]] = bitcast i8 addrspace(4)* addrspace(4)* [[LOC:%.*]] to i8 addrspace(4)*
+; CHECK-NEXT:    call void @llvm.memset.p4i8.i64(i8 addrspace(4)* align 4 [[LOC_BC]], i8 0, i64 8, i1 false)
+; CHECK-NEXT:    ret i8 addrspace(4)* null
+;
+  entry:
+  %loc.bc = bitcast i8 addrspace(4)* addrspace(4)* %loc to i8 addrspace(4)*
+  call void @llvm.memset.p4i8.i64(i8 addrspace(4)* align 4 %loc.bc, i8 0, i64 8, i1 false)
+  %ref = load i8 addrspace(4)*, i8 addrspace(4)* addrspace(4)* %loc
+  ret i8 addrspace(4)* %ref
+}
+
+; Can't forward as the load might be dead.  (Pretend we wrote out the alwaysfalse idiom above.)
+define i8 addrspace(4)* @neg_forward_store(i8 addrspace(4)* addrspace(4)* %loc) {
+; CHECK-LABEL: @neg_forward_store(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[LOC_BC:%.*]] = bitcast i8 addrspace(4)* addrspace(4)* [[LOC:%.*]] to i64 addrspace(4)*
+; CHECK-NEXT:    store i64 5, i64 addrspace(4)* [[LOC_BC]], align 8
+; CHECK-NEXT:    [[REF:%.*]] = load i8 addrspace(4)*, i8 addrspace(4)* addrspace(4)* [[LOC]], align 8
+; CHECK-NEXT:    ret i8 addrspace(4)* [[REF]]
+;
+  entry:
+  %loc.bc = bitcast i8 addrspace(4)* addrspace(4)* %loc to i64 addrspace(4)*
+  store i64 5, i64 addrspace(4)* %loc.bc
+  %ref = load i8 addrspace(4)*, i8 addrspace(4)* addrspace(4)* %loc
+  ret i8 addrspace(4)* %ref
+}
+
+define <1 x i8 addrspace(4)*> @neg_forward_store_vload(<1 x i8 addrspace(4)*> addrspace(4)* %loc) {
+; CHECK-LABEL: @neg_forward_store_vload(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[LOC_BC:%.*]] = bitcast <1 x i8 addrspace(4)*> addrspace(4)* [[LOC:%.*]] to i64 addrspace(4)*
+; CHECK-NEXT:    store i64 5, i64 addrspace(4)* [[LOC_BC]], align 8
+; CHECK-NEXT:    [[REF:%.*]] = load <1 x i8 addrspace(4)*>, <1 x i8 addrspace(4)*> addrspace(4)* [[LOC]], align 8
+; CHECK-NEXT:    ret <1 x i8 addrspace(4)*> [[REF]]
+;
+  entry:
+  %loc.bc = bitcast <1 x i8 addrspace(4)*> addrspace(4)* %loc to i64 addrspace(4)*
+  store i64 5, i64 addrspace(4)* %loc.bc
+  %ref = load <1 x i8 addrspace(4)*>, <1 x i8 addrspace(4)*> addrspace(4)* %loc
+  ret <1 x i8 addrspace(4)*> %ref
+}
+
+; Nulls have known bit patterns, so we can forward
+define i8 addrspace(4)* @forward_store_zero(i8 addrspace(4)* addrspace(4)* %loc) {
+; CHECK-LABEL: @forward_store_zero(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[LOC_BC:%.*]] = bitcast i8 addrspace(4)* addrspace(4)* [[LOC:%.*]] to i64 addrspace(4)*
+; CHECK-NEXT:    store i64 0, i64 addrspace(4)* [[LOC_BC]], align 8
+; CHECK-NEXT:    ret i8 addrspace(4)* null
+;
+  entry:
+  %loc.bc = bitcast i8 addrspace(4)* addrspace(4)* %loc to i64 addrspace(4)*
+  store i64 0, i64 addrspace(4)* %loc.bc
+  %ref = load i8 addrspace(4)*, i8 addrspace(4)* addrspace(4)* %loc
+  ret i8 addrspace(4)* %ref
+}
+
+; Nulls have known bit patterns, so we can forward
+define i8 addrspace(4)* @forward_store_zero2(i8 addrspace(4)* addrspace(4)* %loc) {
+; CHECK-LABEL: @forward_store_zero2(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[LOC_BC:%.*]] = bitcast i8 addrspace(4)* addrspace(4)* [[LOC:%.*]] to <2 x i32> addrspace(4)*
+; CHECK-NEXT:    store <2 x i32> zeroinitializer, <2 x i32> addrspace(4)* [[LOC_BC]], align 8
+; CHECK-NEXT:    ret i8 addrspace(4)* null
+;
+  entry:
+  %loc.bc = bitcast i8 addrspace(4)* addrspace(4)* %loc to <2 x i32> addrspace(4)*
+  store <2 x i32> zeroinitializer, <2 x i32> addrspace(4)* %loc.bc
+  %ref = load i8 addrspace(4)*, i8 addrspace(4)* addrspace(4)* %loc
+  ret i8 addrspace(4)* %ref
+}
+
+
+
+@NonZeroConstant = constant <4 x i64> <i64 3, i64 3, i64 3, i64 3>
+@NonZeroConstant2 = constant <4 x i64 addrspace(4)*> <
+  i64 addrspace(4)* getelementptr (i64, i64 addrspace(4)* null, i32 3),
+  i64 addrspace(4)* getelementptr (i64, i64 addrspace(4)* null, i32 3),
+  i64 addrspace(4)* getelementptr (i64, i64 addrspace(4)* null, i32 3),
+  i64 addrspace(4)* getelementptr (i64, i64 addrspace(4)* null, i32 3)>
+@ZeroConstant = constant <4 x i64> zeroinitializer
+
+
+; Can't forward as the load might be dead.  (Pretend we wrote out the alwaysfalse idiom above.)
+define i8 addrspace(4)* @neg_forward_memcopy(i8 addrspace(4)* addrspace(4)* %loc) {
+; CHECK-LABEL: @neg_forward_memcopy(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[LOC_BC:%.*]] = bitcast i8 addrspace(4)* addrspace(4)* [[LOC:%.*]] to i8 addrspace(4)*
+; CHECK-NEXT:    call void @llvm.memcpy.p4i8.p0i8.i64(i8 addrspace(4)* align 4 [[LOC_BC]], i8* bitcast (<4 x i64>* @NonZeroConstant to i8*), i64 8, i1 false)
+; CHECK-NEXT:    [[REF:%.*]] = load i8 addrspace(4)*, i8 addrspace(4)* addrspace(4)* [[LOC]], align 8
+; CHECK-NEXT:    ret i8 addrspace(4)* [[REF]]
+;
+entry:
+  %loc.bc = bitcast i8 addrspace(4)* addrspace(4)* %loc to i8 addrspace(4)*
+  %src.bc = bitcast <4 x i64>* @NonZeroConstant to i8*
+  call void @llvm.memcpy.p4i8.p0i8.i64(i8 addrspace(4)* align 4 %loc.bc, i8* %src.bc, i64 8, i1 false)
+  %ref = load i8 addrspace(4)*, i8 addrspace(4)* addrspace(4)* %loc
+  ret i8 addrspace(4)* %ref
+}
+
+define i64 addrspace(4)* @neg_forward_memcopy2(i64 addrspace(4)* addrspace(4)* %loc) {
+; CHECK-LABEL: @neg_forward_memcopy2(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[LOC_BC:%.*]] = bitcast i64 addrspace(4)* addrspace(4)* [[LOC:%.*]] to i8 addrspace(4)*
+; CHECK-NEXT:    call void @llvm.memcpy.p4i8.p0i8.i64(i8 addrspace(4)* align 4 [[LOC_BC]], i8* bitcast (<4 x i64>* @NonZeroConstant to i8*), i64 8, i1 false)
+; CHECK-NEXT:    [[REF:%.*]] = load i64 addrspace(4)*, i64 addrspace(4)* addrspace(4)* [[LOC]], align 8
+; CHECK-NEXT:    ret i64 addrspace(4)* [[REF]]
+;
+entry:
+  %loc.bc = bitcast i64 addrspace(4)* addrspace(4)* %loc to i8 addrspace(4)*
+  %src.bc = bitcast <4 x i64>* @NonZeroConstant to i8*
+  call void @llvm.memcpy.p4i8.p0i8.i64(i8 addrspace(4)* align 4 %loc.bc, i8* %src.bc, i64 8, i1 false)
+  %ref = load i64 addrspace(4)*, i64 addrspace(4)* addrspace(4)* %loc
+  ret i64 addrspace(4)* %ref
+}
+
+; TODO: missed optimization
+define i8 addrspace(4)* @forward_memcopy(i8 addrspace(4)* addrspace(4)* %loc) {
+; CHECK-LABEL: @forward_memcopy(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[LOC_BC:%.*]] = bitcast i8 addrspace(4)* addrspace(4)* [[LOC:%.*]] to i8 addrspace(4)*
+; CHECK-NEXT:    call void @llvm.memcpy.p4i8.p0i8.i64(i8 addrspace(4)* align 4 [[LOC_BC]], i8* bitcast (<4 x i64 addrspace(4)*>* @NonZeroConstant2 to i8*), i64 8, i1 false)
+; CHECK-NEXT:    [[REF:%.*]] = load i8 addrspace(4)*, i8 addrspace(4)* addrspace(4)* [[LOC]], align 8
+; CHECK-NEXT:    ret i8 addrspace(4)* [[REF]]
+;
+entry:
+  %loc.bc = bitcast i8 addrspace(4)* addrspace(4)* %loc to i8 addrspace(4)*
+  %src.bc = bitcast <4 x i64 addrspace(4)*>* @NonZeroConstant2 to i8*
+  call void @llvm.memcpy.p4i8.p0i8.i64(i8 addrspace(4)* align 4 %loc.bc, i8* %src.bc, i64 8, i1 false)
+  %ref = load i8 addrspace(4)*, i8 addrspace(4)* addrspace(4)* %loc
+  ret i8 addrspace(4)* %ref
+}
+
+define i64 addrspace(4)* @forward_memcopy2(i64 addrspace(4)* addrspace(4)* %loc) {
+; CHECK-LABEL: @forward_memcopy2(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[LOC_BC:%.*]] = bitcast i64 addrspace(4)* addrspace(4)* [[LOC:%.*]] to i8 addrspace(4)*
+; CHECK-NEXT:    call void @llvm.memcpy.p4i8.p0i8.i64(i8 addrspace(4)* align 4 [[LOC_BC]], i8* bitcast (<4 x i64 addrspace(4)*>* @NonZeroConstant2 to i8*), i64 8, i1 false)
+; CHECK-NEXT:    ret i64 addrspace(4)* getelementptr (i64, i64 addrspace(4)* null, i32 3)
+;
+entry:
+  %loc.bc = bitcast i64 addrspace(4)* addrspace(4)* %loc to i8 addrspace(4)*
+  %src.bc = bitcast <4 x i64 addrspace(4)*>* @NonZeroConstant2 to i8*
+  call void @llvm.memcpy.p4i8.p0i8.i64(i8 addrspace(4)* align 4 %loc.bc, i8* %src.bc, i64 8, i1 false)
+  %ref = load i64 addrspace(4)*, i64 addrspace(4)* addrspace(4)* %loc
+  ret i64 addrspace(4)* %ref
+}
+
+define <1 x i8 addrspace(4)*> @neg_forward_memcpy_vload(<1 x i8 addrspace(4)*> addrspace(4)* %loc) {
+; CHECK-LABEL: @neg_forward_memcpy_vload(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[LOC_BC:%.*]] = bitcast <1 x i8 addrspace(4)*> addrspace(4)* [[LOC:%.*]] to i8 addrspace(4)*
+; CHECK-NEXT:    call void @llvm.memcpy.p4i8.p0i8.i64(i8 addrspace(4)* align 4 [[LOC_BC]], i8* bitcast (<4 x i64>* @NonZeroConstant to i8*), i64 8, i1 false)
+; CHECK-NEXT:    [[REF:%.*]] = load <1 x i8 addrspace(4)*>, <1 x i8 addrspace(4)*> addrspace(4)* [[LOC]], align 8
+; CHECK-NEXT:    ret <1 x i8 addrspace(4)*> [[REF]]
+;
+entry:
+  %loc.bc = bitcast <1 x i8 addrspace(4)*> addrspace(4)* %loc to i8 addrspace(4)*
+  %src.bc = bitcast <4 x i64>* @NonZeroConstant to i8*
+  call void @llvm.memcpy.p4i8.p0i8.i64(i8 addrspace(4)* align 4 %loc.bc, i8* %src.bc, i64 8, i1 false)
+  %ref = load <1 x i8 addrspace(4)*>, <1 x i8 addrspace(4)*> addrspace(4)* %loc
+  ret <1 x i8 addrspace(4)*> %ref
+}
+
+define <4 x i64 addrspace(4)*> @neg_forward_memcpy_vload2(<4 x i64 addrspace(4)*> addrspace(4)* %loc) {
+; CHECK-LABEL: @neg_forward_memcpy_vload2(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[LOC_BC:%.*]] = bitcast <4 x i64 addrspace(4)*> addrspace(4)* [[LOC:%.*]] to i8 addrspace(4)*
+; CHECK-NEXT:    call void @llvm.memcpy.p4i8.p0i8.i64(i8 addrspace(4)* align 4 [[LOC_BC]], i8* bitcast (<4 x i64>* @NonZeroConstant to i8*), i64 32, i1 false)
+; CHECK-NEXT:    [[REF:%.*]] = load <4 x i64 addrspace(4)*>, <4 x i64 addrspace(4)*> addrspace(4)* [[LOC]], align 32
+; CHECK-NEXT:    ret <4 x i64 addrspace(4)*> [[REF]]
+;
+entry:
+  %loc.bc = bitcast <4 x i64 addrspace(4)*> addrspace(4)* %loc to i8 addrspace(4)*
+  %src.bc = bitcast <4 x i64>* @NonZeroConstant to i8*
+  call void @llvm.memcpy.p4i8.p0i8.i64(i8 addrspace(4)* align 4 %loc.bc, i8* %src.bc, i64 32, i1 false)
+  %ref = load <4 x i64 addrspace(4)*>, <4 x i64 addrspace(4)*> addrspace(4)* %loc
+  ret <4 x i64 addrspace(4)*> %ref
+}
+
+define <4 x i64> @neg_forward_memcpy_vload3(<4 x i64> addrspace(4)* %loc) {
+; CHECK-LABEL: @neg_forward_memcpy_vload3(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[LOC_BC:%.*]] = bitcast <4 x i64> addrspace(4)* [[LOC:%.*]] to i8 addrspace(4)*
+; CHECK-NEXT:    call void @llvm.memcpy.p4i8.p0i8.i64(i8 addrspace(4)* align 4 [[LOC_BC]], i8* bitcast (<4 x i64 addrspace(4)*>* @NonZeroConstant2 to i8*), i64 32, i1 false)
+; CHECK-NEXT:    [[REF:%.*]] = load <4 x i64>, <4 x i64> addrspace(4)* [[LOC]], align 32
+; CHECK-NEXT:    ret <4 x i64> [[REF]]
+;
+entry:
+  %loc.bc = bitcast <4 x i64> addrspace(4)* %loc to i8 addrspace(4)*
+  %src.bc = bitcast <4 x i64 addrspace(4)*>* @NonZeroConstant2 to i8*
+  call void @llvm.memcpy.p4i8.p0i8.i64(i8 addrspace(4)* align 4 %loc.bc, i8* %src.bc, i64 32, i1 false)
+  %ref = load <4 x i64>, <4 x i64> addrspace(4)* %loc
+  ret <4 x i64> %ref
+}
+
+define <1 x i64 addrspace(4)*> @forward_memcpy_vload3(<4 x i64 addrspace(4)*> addrspace(4)* %loc) {
+; CHECK-LABEL: @forward_memcpy_vload3(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[LOC_BC:%.*]] = bitcast <4 x i64 addrspace(4)*> addrspace(4)* [[LOC:%.*]] to i8 addrspace(4)*
+; CHECK-NEXT:    call void @llvm.memcpy.p4i8.p0i8.i64(i8 addrspace(4)* align 4 [[LOC_BC]], i8* bitcast (<4 x i64 addrspace(4)*>* @NonZeroConstant2 to i8*), i64 32, i1 false)
+; CHECK-NEXT:    ret <1 x i64 addrspace(4)*> <i64 addrspace(4)* getelementptr (i64, i64 addrspace(4)* null, i32 3)>
+;
+entry:
+  %loc.bc = bitcast <4 x i64 addrspace(4)*> addrspace(4)* %loc to i8 addrspace(4)*
+  %src.bc = bitcast <4 x i64 addrspace(4)*>* @NonZeroConstant2 to i8*
+  call void @llvm.memcpy.p4i8.p0i8.i64(i8 addrspace(4)* align 4 %loc.bc, i8* %src.bc, i64 32, i1 false)
+  %ref = load <4 x i64 addrspace(4)*>, <4 x i64 addrspace(4)*> addrspace(4)* %loc
+  %val = extractelement <4 x i64 addrspace(4)*> %ref, i32 0
+  %ret = insertelement <1 x i64 addrspace(4)*> poison, i64 addrspace(4)* %val, i32 0
+  ret <1 x i64 addrspace(4)*> %ret
+}
+
+; Can forward since we can do so w/o breaking types
+define i8 addrspace(4)* @forward_memcpy_zero(i8 addrspace(4)* addrspace(4)* %loc) {
+; CHECK-LABEL: @forward_memcpy_zero(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[LOC_BC:%.*]] = bitcast i8 addrspace(4)* addrspace(4)* [[LOC:%.*]] to i8 addrspace(4)*
+; CHECK-NEXT:    call void @llvm.memcpy.p4i8.p0i8.i64(i8 addrspace(4)* align 4 [[LOC_BC]], i8* bitcast (<4 x i64>* @ZeroConstant to i8*), i64 8, i1 false)
+; CHECK-NEXT:    ret i8 addrspace(4)* null
+;
+entry:
+  %loc.bc = bitcast i8 addrspace(4)* addrspace(4)* %loc to i8 addrspace(4)*
+  %src.bc = bitcast <4 x i64>* @ZeroConstant to i8*
+  call void @llvm.memcpy.p4i8.p0i8.i64(i8 addrspace(4)* align 4 %loc.bc, i8* %src.bc, i64 8, i1 false)
+  %ref = load i8 addrspace(4)*, i8 addrspace(4)* addrspace(4)* %loc
+  ret i8 addrspace(4)* %ref
+}
+
+declare void @llvm.memcpy.p4i8.p0i8.i64(i8 addrspace(4)* nocapture, i8* nocapture, i64, i1) nounwind
+
+
+; Same as the neg_forward_store cases, but for non defs.
+; (Pretend we wrote out the alwaysfalse idiom above.)
+define i8 addrspace(4)* @neg_store_clobber(i8 addrspace(4)* addrspace(4)* %loc) {
+; CHECK-LABEL: @neg_store_clobber(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[LOC_BC:%.*]] = bitcast i8 addrspace(4)* addrspace(4)* [[LOC:%.*]] to <2 x i64> addrspace(4)*
+; CHECK-NEXT:    store <2 x i64> <i64 4, i64 4>, <2 x i64> addrspace(4)* [[LOC_BC]], align 16
+; CHECK-NEXT:    [[LOC_OFF:%.*]] = getelementptr i8 addrspace(4)*, i8 addrspace(4)* addrspace(4)* [[LOC]], i64 1
+; CHECK-NEXT:    [[REF:%.*]] = load i8 addrspace(4)*, i8 addrspace(4)* addrspace(4)* [[LOC_OFF]], align 8
+; CHECK-NEXT:    ret i8 addrspace(4)* [[REF]]
+;
+entry:
+  %loc.bc = bitcast i8 addrspace(4)* addrspace(4)* %loc to <2 x i64> addrspace(4)*
+  store <2 x i64> <i64 4, i64 4>, <2 x i64> addrspace(4)* %loc.bc
+  %loc.off = getelementptr i8 addrspace(4)*, i8 addrspace(4)* addrspace(4)* %loc, i64 1
+  %ref = load i8 addrspace(4)*, i8 addrspace(4)* addrspace(4)* %loc.off
+  ret i8 addrspace(4)* %ref
+}
+
+declare void @use(<2 x i64>) inaccessiblememonly
+
+; Same as the neg_forward_store cases, but for non defs.
+; (Pretend we wrote out the alwaysfalse idiom above.)
+define i8 addrspace(4)* @neg_load_clobber(i8 addrspace(4)* addrspace(4)* %loc) {
+; CHECK-LABEL: @neg_load_clobber(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[LOC_BC:%.*]] = bitcast i8 addrspace(4)* addrspace(4)* [[LOC:%.*]] to <2 x i64> addrspace(4)*
+; CHECK-NEXT:    [[V:%.*]] = load <2 x i64>, <2 x i64> addrspace(4)* [[LOC_BC]], align 16
+; CHECK-NEXT:    call void @use(<2 x i64> [[V]])
+; CHECK-NEXT:    [[LOC_OFF:%.*]] = getelementptr i8 addrspace(4)*, i8 addrspace(4)* addrspace(4)* [[LOC]], i64 1
+; CHECK-NEXT:    [[REF:%.*]] = load i8 addrspace(4)*, i8 addrspace(4)* addrspace(4)* [[LOC_OFF]], align 8
+; CHECK-NEXT:    ret i8 addrspace(4)* [[REF]]
+;
+entry:
+  %loc.bc = bitcast i8 addrspace(4)* addrspace(4)* %loc to <2 x i64> addrspace(4)*
+  %v = load <2 x i64>, <2 x i64> addrspace(4)* %loc.bc
+  call void @use(<2 x i64> %v)
+  %loc.off = getelementptr i8 addrspace(4)*, i8 addrspace(4)* addrspace(4)* %loc, i64 1
+  %ref = load i8 addrspace(4)*, i8 addrspace(4)* addrspace(4)* %loc.off
+  ret i8 addrspace(4)* %ref
+}
+
+define i8 addrspace(4)* @store_clobber_zero(i8 addrspace(4)* addrspace(4)* %loc) {
+; CHECK-LABEL: @store_clobber_zero(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[LOC_BC:%.*]] = bitcast i8 addrspace(4)* addrspace(4)* [[LOC:%.*]] to <2 x i64> addrspace(4)*
+; CHECK-NEXT:    store <2 x i64> zeroinitializer, <2 x i64> addrspace(4)* [[LOC_BC]], align 16
+; CHECK-NEXT:    [[LOC_OFF:%.*]] = getelementptr i8 addrspace(4)*, i8 addrspace(4)* addrspace(4)* [[LOC]], i64 1
+; CHECK-NEXT:    ret i8 addrspace(4)* null
+;
+entry:
+  %loc.bc = bitcast i8 addrspace(4)* addrspace(4)* %loc to <2 x i64> addrspace(4)*
+  store <2 x i64> zeroinitializer, <2 x i64> addrspace(4)* %loc.bc
+  %loc.off = getelementptr i8 addrspace(4)*, i8 addrspace(4)* addrspace(4)* %loc, i64 1
+  %ref = load i8 addrspace(4)*, i8 addrspace(4)* addrspace(4)* %loc.off
+  ret i8 addrspace(4)* %ref
+}
+
+
+define void @smaller_vector(i8* %p) {
+; CHECK-LABEL: @smaller_vector(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[A:%.*]] = bitcast i8* [[P:%.*]] to <4 x i64 addrspace(4)*>*
+; CHECK-NEXT:    [[B:%.*]] = bitcast i8* [[P]] to <2 x i64 addrspace(4)*>*
+; CHECK-NEXT:    [[V4:%.*]] = load <4 x i64 addrspace(4)*>, <4 x i64 addrspace(4)*>* [[A]], align 32
+; CHECK-NEXT:    [[V2:%.*]] = load <2 x i64 addrspace(4)*>, <2 x i64 addrspace(4)*>* [[B]], align 32
+; CHECK-NEXT:    call void @use.v2(<2 x i64 addrspace(4)*> [[V2]])
+; CHECK-NEXT:    call void @use.v4(<4 x i64 addrspace(4)*> [[V4]])
+; CHECK-NEXT:    ret void
+;
+entry:
+  %a = bitcast i8* %p to <4 x i64 addrspace(4)*>*
+  %b = bitcast i8* %p to <2 x i64 addrspace(4)*>*
+  %v4 = load <4 x i64 addrspace(4)*>, <4 x i64 addrspace(4)*>* %a, align 32
+  %v2 = load <2 x i64 addrspace(4)*>, <2 x i64 addrspace(4)*>* %b, align 32
+  call void @use.v2(<2 x i64 addrspace(4)*> %v2)
+  call void @use.v4(<4 x i64 addrspace(4)*> %v4)
+  ret void
+}
+
+define i64 addrspace(4)* @vector_extract(i8* %p) {
+; CHECK-LABEL: @vector_extract(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[A:%.*]] = bitcast i8* [[P:%.*]] to <4 x i64 addrspace(4)*>*
+; CHECK-NEXT:    [[B:%.*]] = bitcast i8* [[P]] to i64 addrspace(4)**
+; CHECK-NEXT:    [[V4:%.*]] = load <4 x i64 addrspace(4)*>, <4 x i64 addrspace(4)*>* [[A]], align 32
+; CHECK-NEXT:    [[RES:%.*]] = load i64 addrspace(4)*, i64 addrspace(4)** [[B]], align 32
+; CHECK-NEXT:    call void @use.v4(<4 x i64 addrspace(4)*> [[V4]])
+; CHECK-NEXT:    ret i64 addrspace(4)* [[RES]]
+;
+entry:
+  %a = bitcast i8* %p to <4 x i64 addrspace(4)*>*
+  %b = bitcast i8* %p to i64 addrspace(4)**
+  %v4 = load <4 x i64 addrspace(4)*>, <4 x i64 addrspace(4)*>* %a, align 32
+  %res = load i64 addrspace(4)*, i64 addrspace(4)** %b, align 32
+  call void @use.v4(<4 x i64 addrspace(4)*> %v4)
+  ret i64 addrspace(4)* %res
+}
+
+declare void @use.v2(<2 x i64 addrspace(4)*>)
+declare void @use.v4(<4 x i64 addrspace(4)*>)
+ define i8 addrspace(5)* @multini(i1 %alwaysFalse, i8 addrspace(4)* %val, i8 addrspace(4)** %loc) {
+; CHECK-LABEL: @multini(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    store i8 addrspace(4)* [[VAL:%.*]], i8 addrspace(4)** [[LOC:%.*]], align 8
+; CHECK-NEXT:    br i1 [[ALWAYSFALSE:%.*]], label [[NEVERTAKEN:%.*]], label [[ALWAYSTAKEN:%.*]]
+; CHECK:       neverTaken:
+; CHECK-NEXT:    [[LOC_BC:%.*]] = bitcast i8 addrspace(4)** [[LOC]] to i8 addrspace(5)**
+; CHECK-NEXT:    [[DIFFERENTAS:%.*]] = load i8 addrspace(5)*, i8 addrspace(5)** [[LOC_BC]], align 8
+; CHECK-NEXT:    ret i8 addrspace(5)* [[DIFFERENTAS]]
+; CHECK:       alwaysTaken:
+; CHECK-NEXT:    ret i8 addrspace(5)* null
+;
+  entry:
+  store i8 addrspace(4)* %val, i8 addrspace(4)** %loc
+  br i1 %alwaysFalse, label %neverTaken, label %alwaysTaken
+
+  neverTaken:
+  %loc.bc = bitcast i8 addrspace(4)** %loc to i8 addrspace(5)**
+  %differentas = load i8 addrspace(5)*, i8 addrspace(5)** %loc.bc
+  ret i8 addrspace(5)* %differentas
+
+  alwaysTaken:
+  ret i8 addrspace(5)* null
+  }
--- a/llvm/test/Transforms/InferAddressSpaces/AMDGPU/old-pass-regressions-inseltpoison.ll
+++ b/llvm/test/Transforms/InferAddressSpaces/AMDGPU/old-pass-regressions-inseltpoison.ll
@ -0,0 +1,143 @@
+; RUN: opt -data-layout=A5 -S -mtriple=amdgcn-amd-amdhsa -infer-address-spaces %s | FileCheck %s
+
+; Regression tests from old HSAIL addrspacecast optimization pass
+
+@data = internal addrspace(1) global [100 x double] [double 0.00, double 1.000000e-01, double 2.000000e-01, double 3.000000e-01, double 4.000000e-01, double 5.000000e-01, double 6.000000e-01, double 7.000000e-01, double 8.000000e-01, double 9.000000e-01, double 1.00, double 1.10, double 1.20, double 1.30, double 1.40, double 1.50, double 1.60, double 1.70, double 1.80, double 1.90, double 2.00, double 2.10, double 2.20, double 2.30, double 2.40, double 2.50, double 2.60, double 2.70, double 2.80, double 2.90, double 3.00, double 3.10, double 3.20, double 3.30, double 3.40, double 3.50, double 3.60, double 3.70, double 3.80, double 3.90, double 4.00, double 4.10, double 4.20, double 4.30, double 4.40, double 4.50, double 4.60, double 4.70, double 4.80, double 4.90, double 5.00, double 5.10, double 5.20, double 5.30, double 5.40, double 5.50, double 5.60, double 5.70, double 5.80, double 5.90, double 6.00, double 6.10, double 6.20, double 6.30, double 6.40, double 6.50, double 6.60, double 6.70, double 6.80, double 6.90, double 7.00, double 7.10, double 7.20, double 7.30, double 7.40, double 7.50, double 7.60, double 7.70, double 7.80, double 7.90, double 8.00, double 8.10, double 8.20, double 8.30, double 8.40, double 8.50, double 8.60, double 8.70, double 8.80, double 8.90, double 9.00, double 9.10, double 9.20, double 9.30, double 9.40, double 9.50, double 9.60, double 9.70, double 9.80, double 9.90], align 8
+
+
+; Should generate flat load
+
+; CHECK-LABEL: @generic_address_bitcast_const(
+; CHECK: %vecload1 = load <2 x double>, <2 x double> addrspace(1)* bitcast (double addrspace(1)* getelementptr inbounds ([100 x double], [100 x double] addrspace(1)* @data, i64 0, i64 4) to <2 x double> addrspace(1)*), align 8
+define amdgpu_kernel void @generic_address_bitcast_const(i64 %arg0, i32 addrspace(1)* nocapture %results) #0 {
+entry:
+  %tmp1 = call i32 @llvm.amdgcn.workitem.id.x()
+  %tmp2 = zext i32 %tmp1 to i64
+  %tmp3 = add i64 %tmp2, %arg0
+  %vecload1 = load <2 x double>, <2 x double>* bitcast (double* getelementptr ([100 x double], [100 x double]* addrspacecast ([100 x double] addrspace(1)* @data to [100 x double]*), i64 0, i64 4) to <2 x double>*), align 8
+  %cmp = fcmp ord <2 x double> %vecload1, zeroinitializer
+  %sext = sext <2 x i1> %cmp to <2 x i64>
+  %tmp4 = extractelement <2 x i64> %sext, i64 0
+  %tmp5 = extractelement <2 x i64> %sext, i64 1
+  %tmp6 = and i64 %tmp4, %tmp5
+  %tmp7 = lshr i64 %tmp6, 63
+  %tmp8 = trunc i64 %tmp7 to i32
+  %idxprom = and i64 %tmp3, 4294967295
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %results, i64 %idxprom
+  store i32 %tmp8, i32 addrspace(1)* %arrayidx, align 4
+  ret void
+}
+
+@generic_address_bug9749.val = internal addrspace(1) global float 0.0, align 4
+
+declare i32 @_Z9get_fencePv(i8*)
+%opencl.pipe_t = type opaque
+
+; This is a compile time assert bug, but we still want to check optimization
+; is performed to generate ld_global.
+; CHECK-LABEL: @generic_address_pipe_bug9673(
+; CHECK: %tmp1 = bitcast %opencl.pipe_t addrspace(3)* %in_pipe to i32 addrspace(3)*
+; CHECK: %add.ptr = getelementptr inbounds i32, i32 addrspace(3)* %tmp1, i32 2
+; CHECK: %tmp2 = load i32, i32 addrspace(3)* %add.ptr, align 4
+define amdgpu_kernel void @generic_address_pipe_bug9673(%opencl.pipe_t addrspace(3)* nocapture %in_pipe, i32 addrspace(1)* nocapture %dst) #0 {
+entry:
+  %tmp = call i32 @llvm.amdgcn.workitem.id.x()
+  %tmp1 = bitcast %opencl.pipe_t addrspace(3)* %in_pipe to i32 addrspace(3)*
+  %add.ptr = getelementptr inbounds i32, i32 addrspace(3)* %tmp1, i32 2
+  %tmp2 = load i32, i32 addrspace(3)* %add.ptr, align 4
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %dst, i32 %tmp
+  store i32 %tmp2, i32 addrspace(1)* %arrayidx, align 4
+  ret void
+}
+
+; Should generate flat load
+; CHECK-LABEL: @generic_address_bug9749(
+; CHECK: br i1
+; CHECK: load float, float*
+; CHECK: br label
+define amdgpu_kernel void @generic_address_bug9749(i32 addrspace(1)* nocapture %results) #0 {
+entry:
+  %ptr = alloca float*, align 8, addrspace(5)
+  %tmp = call i32 @llvm.amdgcn.workitem.id.x()
+  %tmp1 = zext i32 %tmp to i64
+  store float 0x3FB99999A0000000, float addrspace(1)* @generic_address_bug9749.val, align 4
+  store volatile float* addrspacecast (float addrspace(1)* @generic_address_bug9749.val to float*), float* addrspace(5)* %ptr, align 8
+  %tmp2 = load volatile float*, float* addrspace(5)* %ptr, align 8
+  %tmp3 = load float, float addrspace(1)* @generic_address_bug9749.val, align 4
+  %tmp4 = bitcast float* %tmp2 to i8*
+  %call.i = call i32 @_Z9get_fencePv(i8* %tmp4) #1
+  %switch.i.i = icmp ult i32 %call.i, 4
+  br i1 %switch.i.i, label %if.end.i, label %helperFunction.exit
+
+if.end.i:                                         ; preds = %entry
+  %tmp5 = load float, float* %tmp2, align 4
+  %not.cmp.i = fcmp oeq float %tmp5, %tmp3
+  %phitmp = zext i1 %not.cmp.i to i32
+  br label %helperFunction.exit
+
+helperFunction.exit:                              ; preds = %if.end.i, %entry
+  %retval.0.i = phi i32 [ 0, %entry ], [ %phitmp, %if.end.i ]
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %results, i64 %tmp1
+  store i32 %retval.0.i, i32 addrspace(1)* %arrayidx, align 4
+  ret void
+}
+
+; CHECK-LABEL: @generic_address_opt_phi_bug9776_simple_phi_kernel(
+; CHECK: phi i32 addrspace(3)*
+; CHECK: store i32 %i.03, i32 addrspace(3)* %
+define amdgpu_kernel void @generic_address_opt_phi_bug9776_simple_phi_kernel(i32 addrspace(3)* nocapture %in, i32 %numElems) #0 {
+entry:
+  %cmp1 = icmp eq i32 %numElems, 0
+  br i1 %cmp1, label %for.end, label %for.body.lr.ph
+
+for.body.lr.ph:                                   ; preds = %entry
+  %tmp = addrspacecast i32 addrspace(3)* %in to i32*
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %for.body.lr.ph
+  %i.03 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
+  %ptr.02 = phi i32* [ %tmp, %for.body.lr.ph ], [ %add.ptr, %for.body ]
+  store i32 %i.03, i32* %ptr.02, align 4
+  %add.ptr = getelementptr inbounds i32, i32* %ptr.02, i64 4
+  %inc = add nuw i32 %i.03, 1
+  %exitcond = icmp eq i32 %inc, %numElems
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
+
+; CHECK-LABEL: @generic_address_bug9899(
+; CHECK: %vecload = load <2 x i32>, <2 x i32> addrspace(3)*
+; CHECK: store <2 x i32> %tmp16, <2 x i32> addrspace(3)*
+define amdgpu_kernel void @generic_address_bug9899(i64 %arg0, i32 addrspace(3)* nocapture %sourceA, i32 addrspace(3)* nocapture %destValues) #0 {
+entry:
+  %tmp1 = call i32 @llvm.amdgcn.workitem.id.x()
+  %tmp2 = zext i32 %tmp1 to i64
+  %tmp3 = add i64 %tmp2, %arg0
+  %sext = shl i64 %tmp3, 32
+  %tmp4 = addrspacecast i32 addrspace(3)* %destValues to i32*
+  %tmp5 = addrspacecast i32 addrspace(3)* %sourceA to i32*
+  %tmp6 = ashr exact i64 %sext, 31
+  %tmp7 = getelementptr inbounds i32, i32* %tmp5, i64 %tmp6
+  %arrayidx_v4 = bitcast i32* %tmp7 to <2 x i32>*
+  %vecload = load <2 x i32>, <2 x i32>* %arrayidx_v4, align 4
+  %tmp8 = extractelement <2 x i32> %vecload, i32 0
+  %tmp9 = extractelement <2 x i32> %vecload, i32 1
+  %tmp10 = icmp eq i32 %tmp8, 0
+  %tmp11 = select i1 %tmp10, i32 32, i32 %tmp8
+  %tmp12 = icmp eq i32 %tmp9, 0
+  %tmp13 = select i1 %tmp12, i32 32, i32 %tmp9
+  %tmp14 = getelementptr inbounds i32, i32* %tmp4, i64 %tmp6
+  %tmp15 = insertelement <2 x i32> poison, i32 %tmp11, i32 0
+  %tmp16 = insertelement <2 x i32> %tmp15, i32 %tmp13, i32 1
+  %arrayidx_v41 = bitcast i32* %tmp14 to <2 x i32>*
+  store <2 x i32> %tmp16, <2 x i32>* %arrayidx_v41, align 4
+  ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #2
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readonly }
+attributes #2 = { nounwind readnone }
--- a/llvm/test/Transforms/InferFunctionAttrs/dereferenceable-inseltpoison.ll
+++ b/llvm/test/Transforms/InferFunctionAttrs/dereferenceable-inseltpoison.ll
@ -0,0 +1,357 @@
+; RUN: opt < %s -inferattrs -S | FileCheck %s
+
+
+
+; Determine dereference-ability before unused loads get deleted:
+; https://bugs.llvm.org/show_bug.cgi?id=21780
+
+define <4 x double> @PR21780(double* %ptr) {
+; CHECK-LABEL: @PR21780(double* %ptr)
+
+  ; GEP of index 0 is simplified away.
+  %arrayidx1 = getelementptr inbounds double, double* %ptr, i64 1
+  %arrayidx2 = getelementptr inbounds double, double* %ptr, i64 2
+  %arrayidx3 = getelementptr inbounds double, double* %ptr, i64 3
+
+  %t0 = load double, double* %ptr, align 8
+  %t1 = load double, double* %arrayidx1, align 8
+  %t2 = load double, double* %arrayidx2, align 8
+  %t3 = load double, double* %arrayidx3, align 8
+
+  %vecinit0 = insertelement <4 x double> poison, double %t0, i32 0
+  %vecinit1 = insertelement <4 x double> %vecinit0, double %t1, i32 1
+  %vecinit2 = insertelement <4 x double> %vecinit1, double %t2, i32 2
+  %vecinit3 = insertelement <4 x double> %vecinit2, double %t3, i32 3
+  %shuffle = shufflevector <4 x double> %vecinit3, <4 x double> %vecinit3, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
+  ret <4 x double> %shuffle
+}
+
+
+define double @PR21780_only_access3_with_inbounds(double* %ptr) {
+; CHECK-LABEL: @PR21780_only_access3_with_inbounds(double* %ptr)
+
+  %arrayidx3 = getelementptr inbounds double, double* %ptr, i64 3
+  %t3 = load double, double* %arrayidx3, align 8
+  ret double %t3
+}
+
+define double @PR21780_only_access3_without_inbounds(double* %ptr) {
+; CHECK-LABEL: @PR21780_only_access3_without_inbounds(double* %ptr)
+  %arrayidx3 = getelementptr double, double* %ptr, i64 3
+  %t3 = load double, double* %arrayidx3, align 8
+  ret double %t3
+}
+
+define double @PR21780_without_inbounds(double* %ptr) {
+; CHECK-LABEL: @PR21780_without_inbounds(double* %ptr)
+
+  %arrayidx1 = getelementptr double, double* %ptr, i64 1
+  %arrayidx2 = getelementptr double, double* %ptr, i64 2
+  %arrayidx3 = getelementptr double, double* %ptr, i64 3
+
+  %t0 = load double, double* %ptr, align 8
+  %t1 = load double, double* %arrayidx1, align 8
+  %t2 = load double, double* %arrayidx2, align 8
+  %t3 = load double, double* %arrayidx3, align 8
+
+  ret double %t3
+}
+
+; Unsimplified, but still valid. Also, throw in some bogus arguments.
+
+define void @gep0(i8* %unused, i8* %other, i8* %ptr) {
+; CHECK-LABEL: @gep0(i8* %unused, i8* %other, i8* %ptr)
+  %arrayidx0 = getelementptr i8, i8* %ptr, i64 0
+  %arrayidx1 = getelementptr i8, i8* %ptr, i64 1
+  %arrayidx2 = getelementptr i8, i8* %ptr, i64 2
+  %t0 = load i8, i8* %arrayidx0
+  %t1 = load i8, i8* %arrayidx1
+  %t2 = load i8, i8* %arrayidx2
+  store i8 %t2, i8* %other
+  ret void
+}
+
+; Order of accesses does not change computation.
+; Multiple arguments may be dereferenceable.
+
+define void @ordering(i8* %ptr1, i32* %ptr2) {
+; CHECK-LABEL: @ordering(i8* %ptr1, i32* %ptr2)
+  %a20 = getelementptr i32, i32* %ptr2, i64 0
+  %a12 = getelementptr i8, i8* %ptr1, i64 2
+  %t12 = load i8, i8* %a12
+  %a11 = getelementptr i8, i8* %ptr1, i64 1
+  %t20 = load i32, i32* %a20
+  %a10 = getelementptr i8, i8* %ptr1, i64 0
+  %t10 = load i8, i8* %a10
+  %t11 = load i8, i8* %a11
+  %a21 = getelementptr i32, i32* %ptr2, i64 1
+  %t21 = load i32, i32* %a21
+  ret void
+}
+
+; Not in entry block.
+
+define void @not_entry_but_guaranteed_to_execute(i8* %ptr) {
+; CHECK-LABEL: @not_entry_but_guaranteed_to_execute(i8* %ptr)
+entry:
+  br label %exit
+exit:
+  %arrayidx0 = getelementptr i8, i8* %ptr, i64 0
+  %arrayidx1 = getelementptr i8, i8* %ptr, i64 1
+  %arrayidx2 = getelementptr i8, i8* %ptr, i64 2
+  %t0 = load i8, i8* %arrayidx0
+  %t1 = load i8, i8* %arrayidx1
+  %t2 = load i8, i8* %arrayidx2
+  ret void
+}
+
+; Not in entry block and not guaranteed to execute.
+
+define void @not_entry_not_guaranteed_to_execute(i8* %ptr, i1 %cond) {
+; CHECK-LABEL: @not_entry_not_guaranteed_to_execute(i8* %ptr, i1 %cond)
+entry:
+  br i1 %cond, label %loads, label %exit
+loads:
+  %arrayidx0 = getelementptr i8, i8* %ptr, i64 0
+  %arrayidx1 = getelementptr i8, i8* %ptr, i64 1
+  %arrayidx2 = getelementptr i8, i8* %ptr, i64 2
+  %t0 = load i8, i8* %arrayidx0
+  %t1 = load i8, i8* %arrayidx1
+  %t2 = load i8, i8* %arrayidx2
+  ret void
+exit:
+  ret void
+}
+
+; The last load may not execute, so derefenceable bytes only covers the 1st two loads.
+
+define void @partial_in_entry(i16* %ptr, i1 %cond) {
+; CHECK-LABEL: @partial_in_entry(i16* %ptr, i1 %cond)
+entry:
+  %arrayidx0 = getelementptr i16, i16* %ptr, i64 0
+  %arrayidx1 = getelementptr i16, i16* %ptr, i64 1
+  %arrayidx2 = getelementptr i16, i16* %ptr, i64 2
+  %t0 = load i16, i16* %arrayidx0
+  %t1 = load i16, i16* %arrayidx1
+  br i1 %cond, label %loads, label %exit
+loads:
+  %t2 = load i16, i16* %arrayidx2
+  ret void
+exit:
+  ret void
+}
+
+; The volatile load can't be used to prove a non-volatile access is allowed.
+; The 2nd and 3rd loads may never execute.
+
+define void @volatile_is_not_dereferenceable(i16* %ptr) {
+; CHECK-LABEL: @volatile_is_not_dereferenceable(i16* %ptr)
+  %arrayidx0 = getelementptr i16, i16* %ptr, i64 0
+  %arrayidx1 = getelementptr i16, i16* %ptr, i64 1
+  %arrayidx2 = getelementptr i16, i16* %ptr, i64 2
+  %t0 = load volatile i16, i16* %arrayidx0
+  %t1 = load i16, i16* %arrayidx1
+  %t2 = load i16, i16* %arrayidx2
+  ret void
+}
+
+; TODO: We should allow inference for atomic (but not volatile) ops.
+
+define void @atomic_is_alright(i16* %ptr) {
+; CHECK-LABEL: @atomic_is_alright(i16* %ptr)
+  %arrayidx0 = getelementptr i16, i16* %ptr, i64 0
+  %arrayidx1 = getelementptr i16, i16* %ptr, i64 1
+  %arrayidx2 = getelementptr i16, i16* %ptr, i64 2
+  %t0 = load atomic i16, i16* %arrayidx0 unordered, align 2
+  %t1 = load i16, i16* %arrayidx1
+  %t2 = load i16, i16* %arrayidx2
+  ret void
+}
+
+declare void @may_not_return()
+
+define void @not_guaranteed_to_transfer_execution(i16* %ptr) {
+; CHECK-LABEL: @not_guaranteed_to_transfer_execution(i16* %ptr)
+  %arrayidx0 = getelementptr i16, i16* %ptr, i64 0
+  %arrayidx1 = getelementptr i16, i16* %ptr, i64 1
+  %arrayidx2 = getelementptr i16, i16* %ptr, i64 2
+  %t0 = load i16, i16* %arrayidx0
+  call void @may_not_return()
+  %t1 = load i16, i16* %arrayidx1
+  %t2 = load i16, i16* %arrayidx2
+  ret void
+}
+
+; We must have consecutive accesses.
+
+define void @variable_gep_index(i8* %unused, i8* %ptr, i64 %variable_index) {
+; CHECK-LABEL: @variable_gep_index(i8* %unused, i8* %ptr, i64 %variable_index)
+  %arrayidx1 = getelementptr i8, i8* %ptr, i64 %variable_index
+  %arrayidx2 = getelementptr i8, i8* %ptr, i64 2
+  %t0 = load i8, i8* %ptr
+  %t1 = load i8, i8* %arrayidx1
+  %t2 = load i8, i8* %arrayidx2
+  ret void
+}
+
+; Deal with >1 GEP index.
+
+define void @multi_index_gep(<4 x i8>* %ptr) {
+; CHECK-LABEL: @multi_index_gep(<4 x i8>* %ptr)
+; FIXME: %ptr should be dereferenceable(4)
+  %arrayidx00 = getelementptr <4 x i8>, <4 x i8>* %ptr, i64 0, i64 0
+  %t0 = load i8, i8* %arrayidx00
+  ret void
+}
+
+; Could round weird bitwidths down?
+
+define void @not_byte_multiple(i9* %ptr) {
+; CHECK-LABEL: @not_byte_multiple(i9* %ptr)
+  %arrayidx0 = getelementptr i9, i9* %ptr, i64 0
+  %t0 = load i9, i9* %arrayidx0
+  ret void
+}
+
+; Missing direct access from the pointer.
+
+define void @no_pointer_deref(i16* %ptr) {
+; CHECK-LABEL: @no_pointer_deref(i16* %ptr)
+  %arrayidx1 = getelementptr i16, i16* %ptr, i64 1
+  %arrayidx2 = getelementptr i16, i16* %ptr, i64 2
+  %t1 = load i16, i16* %arrayidx1
+  %t2 = load i16, i16* %arrayidx2
+  ret void
+}
+
+; Out-of-order is ok, but missing access concludes dereferenceable range.
+
+define void @non_consecutive(i32* %ptr) {
+; CHECK-LABEL: @non_consecutive(i32* %ptr)
+  %arrayidx1 = getelementptr i32, i32* %ptr, i64 1
+  %arrayidx0 = getelementptr i32, i32* %ptr, i64 0
+  %arrayidx3 = getelementptr i32, i32* %ptr, i64 3
+  %t1 = load i32, i32* %arrayidx1
+  %t0 = load i32, i32* %arrayidx0
+  %t3 = load i32, i32* %arrayidx3
+  ret void
+}
+
+; Improve on existing dereferenceable attribute.
+
+define void @more_bytes(i32* dereferenceable(8) %ptr) {
+; CHECK-LABEL: @more_bytes(i32* dereferenceable(8) %ptr)
+  %arrayidx3 = getelementptr i32, i32* %ptr, i64 3
+  %arrayidx1 = getelementptr i32, i32* %ptr, i64 1
+  %arrayidx0 = getelementptr i32, i32* %ptr, i64 0
+  %arrayidx2 = getelementptr i32, i32* %ptr, i64 2
+  %t3 = load i32, i32* %arrayidx3
+  %t1 = load i32, i32* %arrayidx1
+  %t2 = load i32, i32* %arrayidx2
+  %t0 = load i32, i32* %arrayidx0
+  ret void
+}
+
+; Improve on existing dereferenceable_or_null attribute.
+
+define void @more_bytes_and_not_null(i32* dereferenceable_or_null(8) %ptr) {
+; CHECK-LABEL: @more_bytes_and_not_null(i32* dereferenceable_or_null(8) %ptr)
+  %arrayidx3 = getelementptr i32, i32* %ptr, i64 3
+  %arrayidx1 = getelementptr i32, i32* %ptr, i64 1
+  %arrayidx0 = getelementptr i32, i32* %ptr, i64 0
+  %arrayidx2 = getelementptr i32, i32* %ptr, i64 2
+  %t3 = load i32, i32* %arrayidx3
+  %t1 = load i32, i32* %arrayidx1
+  %t2 = load i32, i32* %arrayidx2
+  %t0 = load i32, i32* %arrayidx0
+  ret void
+}
+
+; But don't pessimize existing dereferenceable attribute.
+
+define void @better_bytes(i32* dereferenceable(100) %ptr) {
+; CHECK-LABEL: @better_bytes(i32* dereferenceable(100) %ptr)
+  %arrayidx3 = getelementptr i32, i32* %ptr, i64 3
+  %arrayidx1 = getelementptr i32, i32* %ptr, i64 1
+  %arrayidx0 = getelementptr i32, i32* %ptr, i64 0
+  %arrayidx2 = getelementptr i32, i32* %ptr, i64 2
+  %t3 = load i32, i32* %arrayidx3
+  %t1 = load i32, i32* %arrayidx1
+  %t2 = load i32, i32* %arrayidx2
+  %t0 = load i32, i32* %arrayidx0
+  ret void
+}
+
+define void @bitcast(i32* %arg) {
+; CHECK-LABEL: @bitcast(i32* %arg)
+  %ptr = bitcast i32* %arg to float*
+  %arrayidx0 = getelementptr float, float* %ptr, i64 0
+  %arrayidx1 = getelementptr float, float* %ptr, i64 1
+  %t0 = load float, float* %arrayidx0
+  %t1 = load float, float* %arrayidx1
+  ret void
+}
+
+define void @bitcast_different_sizes(double* %arg1, i8* %arg2) {
+; CHECK-LABEL: @bitcast_different_sizes(double* %arg1, i8* %arg2)
+  %ptr1 = bitcast double* %arg1 to float*
+  %a10 = getelementptr float, float* %ptr1, i64 0
+  %a11 = getelementptr float, float* %ptr1, i64 1
+  %a12 = getelementptr float, float* %ptr1, i64 2
+  %ld10 = load float, float* %a10
+  %ld11 = load float, float* %a11
+  %ld12 = load float, float* %a12
+
+  %ptr2 = bitcast i8* %arg2 to i64*
+  %a20 = getelementptr i64, i64* %ptr2, i64 0
+  %a21 = getelementptr i64, i64* %ptr2, i64 1
+  %ld20 = load i64, i64* %a20
+  %ld21 = load i64, i64* %a21
+  ret void
+}
+
+define void @negative_offset(i32* %arg) {
+; CHECK-LABEL: @negative_offset(i32* %arg)
+  %ptr = bitcast i32* %arg to float*
+  %arrayidx0 = getelementptr float, float* %ptr, i64 0
+  %arrayidx1 = getelementptr float, float* %ptr, i64 -1
+  %t0 = load float, float* %arrayidx0
+  %t1 = load float, float* %arrayidx1
+  ret void
+}
+
+define void @stores(i32* %arg) {
+; CHECK-LABEL: @stores(i32* %arg)
+  %ptr = bitcast i32* %arg to float*
+  %arrayidx0 = getelementptr float, float* %ptr, i64 0
+  %arrayidx1 = getelementptr float, float* %ptr, i64 1
+  store float 1.0, float* %arrayidx0
+  store float 2.0, float* %arrayidx1
+  ret void
+}
+
+define void @load_store(i32* %arg) {
+; CHECK-LABEL: @load_store(i32* %arg)
+  %ptr = bitcast i32* %arg to float*
+  %arrayidx0 = getelementptr float, float* %ptr, i64 0
+  %arrayidx1 = getelementptr float, float* %ptr, i64 1
+  %t1 = load float, float* %arrayidx0
+  store float 2.0, float* %arrayidx1
+  ret void
+}
+
+define void @different_size1(i32* %arg) {
+; CHECK-LABEL: @different_size1(i32* %arg)
+  %arg-cast = bitcast i32* %arg to double*
+  store double 0.000000e+00, double* %arg-cast
+  store i32 0, i32* %arg
+  ret void
+}
+
+define void @different_size2(i32* %arg) {
+; CHECK-LABEL: @different_size2(i32* %arg)
+  store i32 0, i32* %arg
+  %arg-cast = bitcast i32* %arg to double*
+  store double 0.000000e+00, double* %arg-cast
+  ret void
+}
--- a/llvm/test/Transforms/InstCombine/AArch64/sve-bitcast-inseltpoison.ll
+++ b/llvm/test/Transforms/InstCombine/AArch64/sve-bitcast-inseltpoison.ll
@ -0,0 +1,13 @@
+; RUN: opt -instcombine -mtriple=aarch64-linux-gnu -mattr=+sve -S < %s | FileCheck %s
+
+; We shouldn't fold bitcast(insert <vscale x 1 x iX> .., iX %val, i32 0)
+; into bitcast(iX %val) for scalable vectors.
+define <vscale x 2 x i8> @bitcast_of_insert_i8_i16(i16 %val) #0 {
+; CHECK-LABEL: @bitcast_of_insert_i8_i16(
+; CHECK-NOT:   bitcast i16 %val to <vscale x 2 x i8>
+; CHECK:       bitcast <vscale x 1 x i16> %op2 to <vscale x 2 x i8>
+entry:
+  %op2 = insertelement <vscale x 1 x i16> poison, i16 %val, i32 0
+  %0 = bitcast <vscale x 1 x i16> %op2 to <vscale x 2 x i8>
+  ret <vscale x 2 x i8> %0
+}
--- a/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-demanded-vector-elts-inseltpoison.ll
+++ b/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-demanded-vector-elts-inseltpoison.ll
--- a/llvm/test/Transforms/InstCombine/X86/x86-addsub-inseltpoison.ll
+++ b/llvm/test/Transforms/InstCombine/X86/x86-addsub-inseltpoison.ll
@ -0,0 +1,194 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -instcombine -mtriple=x86_64-unknown-unknown -S | FileCheck %s
+
+declare <2 x double> @llvm.x86.sse3.addsub.pd(<2 x double>, <2 x double>)
+declare <4 x float> @llvm.x86.sse3.addsub.ps(<4 x float>, <4 x float>)
+declare <4 x double> @llvm.x86.avx.addsub.pd.256(<4 x double>, <4 x double>)
+declare <8 x float> @llvm.x86.avx.addsub.ps.256(<8 x float>, <8 x float>)
+declare <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double>, <2 x double>, i8 immarg) #0
+
+;
+; Demanded Elts
+;
+
+define double @elts_addsub_v2f64(<2 x double> %0, <2 x double> %1) {
+; CHECK-LABEL: @elts_addsub_v2f64(
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <2 x double> [[TMP1:%.*]], <2 x double> undef, <2 x i32> <i32 1, i32 undef>
+; CHECK-NEXT:    [[TMP4:%.*]] = fsub <2 x double> [[TMP0:%.*]], [[TMP3]]
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x double> [[TMP4]], i32 0
+; CHECK-NEXT:    ret double [[TMP5]]
+;
+  %3 = shufflevector <2 x double> %0, <2 x double> undef, <2 x i32> <i32 0, i32 0>
+  %4 = shufflevector <2 x double> %1, <2 x double> undef, <2 x i32> <i32 1, i32 1>
+  %5 = tail call <2 x double> @llvm.x86.sse3.addsub.pd(<2 x double> %3, <2 x double> %4)
+  %6 = extractelement <2 x double> %5, i32 0
+  ret double %6
+}
+
+define double @elts_addsub_v2f64_sub(<2 x double> %0, <2 x double> %1) {
+; CHECK-LABEL: @elts_addsub_v2f64_sub(
+; CHECK-NEXT:    [[TMP3:%.*]] = fsub <2 x double> [[TMP0:%.*]], [[TMP1:%.*]]
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x double> [[TMP3]], i32 0
+; CHECK-NEXT:    ret double [[TMP4]]
+;
+  %3 = shufflevector <2 x double> %0, <2 x double> undef, <2 x i32> <i32 0, i32 0>
+  %4 = shufflevector <2 x double> %1, <2 x double> undef, <2 x i32> <i32 0, i32 0>
+  %5 = tail call <2 x double> @llvm.x86.sse3.addsub.pd(<2 x double> %3, <2 x double> %4)
+  %6 = extractelement <2 x double> %5, i32 0
+  ret double %6
+}
+
+define float @elts_addsub_v4f32(<4 x float> %0, <4 x float> %1) {
+; CHECK-LABEL: @elts_addsub_v4f32(
+; CHECK-NEXT:    [[TMP3:%.*]] = tail call <4 x float> @llvm.x86.sse3.addsub.ps(<4 x float> [[TMP0:%.*]], <4 x float> [[TMP1:%.*]])
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[TMP3]], i32 0
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x float> [[TMP3]], i32 1
+; CHECK-NEXT:    [[TMP6:%.*]] = fadd float [[TMP4]], [[TMP5]]
+; CHECK-NEXT:    ret float [[TMP6]]
+;
+  %3 = shufflevector <4 x float> %0, <4 x float> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+  %4 = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+  %5 = tail call <4 x float> @llvm.x86.sse3.addsub.ps(<4 x float> %3, <4 x float> %4)
+  %6 = extractelement <4 x float> %5, i32 0
+  %7 = extractelement <4 x float> %5, i32 1
+  %8 = fadd float %6, %7
+  ret float %8
+}
+
+define float @elts_addsub_v4f32_add(<4 x float> %0, <4 x float> %1) {
+; CHECK-LABEL: @elts_addsub_v4f32_add(
+; CHECK-NEXT:    [[TMP3:%.*]] = fadd <4 x float> [[TMP0:%.*]], [[TMP1:%.*]]
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[TMP3]], i32 1
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x float> [[TMP3]], i32 1
+; CHECK-NEXT:    [[TMP6:%.*]] = fadd float [[TMP4]], [[TMP5]]
+; CHECK-NEXT:    ret float [[TMP6]]
+;
+  %3 = shufflevector <4 x float> %0, <4 x float> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+  %4 = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+  %5 = tail call <4 x float> @llvm.x86.sse3.addsub.ps(<4 x float> %3, <4 x float> %4)
+  %6 = extractelement <4 x float> %5, i32 1
+  %7 = extractelement <4 x float> %5, i32 3
+  %8 = fadd float %6, %7
+  ret float %8
+}
+
+define double @elts_addsub_v4f64(<4 x double> %0, <4 x double> %1) {
+; CHECK-LABEL: @elts_addsub_v4f64(
+; CHECK-NEXT:    [[TMP3:%.*]] = tail call <4 x double> @llvm.x86.avx.addsub.pd.256(<4 x double> [[TMP0:%.*]], <4 x double> [[TMP1:%.*]])
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x double> [[TMP3]], i32 0
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x double> [[TMP3]], i32 1
+; CHECK-NEXT:    [[TMP6:%.*]] = fadd double [[TMP4]], [[TMP5]]
+; CHECK-NEXT:    ret double [[TMP6]]
+;
+  %3 = shufflevector <4 x double> %0, <4 x double> undef, <4 x i32> <i32 0, i32 1, i32 3, i32 3>
+  %4 = shufflevector <4 x double> %1, <4 x double> undef, <4 x i32> <i32 0, i32 1, i32 3, i32 3>
+  %5 = tail call <4 x double> @llvm.x86.avx.addsub.pd.256(<4 x double> %3, <4 x double> %4)
+  %6 = extractelement <4 x double> %5, i32 0
+  %7 = extractelement <4 x double> %5, i32 1
+  %8 = fadd double %6, %7
+  ret double %8
+}
+
+define double @elts_addsub_v4f64_add(<4 x double> %0, <4 x double> %1) {
+; CHECK-LABEL: @elts_addsub_v4f64_add(
+; CHECK-NEXT:    [[TMP3:%.*]] = fadd <4 x double> [[TMP0:%.*]], [[TMP1:%.*]]
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x double> [[TMP3]], i32 1
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x double> [[TMP3]], i32 3
+; CHECK-NEXT:    [[TMP6:%.*]] = fadd double [[TMP4]], [[TMP5]]
+; CHECK-NEXT:    ret double [[TMP6]]
+;
+  %3 = shufflevector <4 x double> %0, <4 x double> undef, <4 x i32> <i32 0, i32 1, i32 3, i32 3>
+  %4 = shufflevector <4 x double> %1, <4 x double> undef, <4 x i32> <i32 0, i32 1, i32 3, i32 3>
+  %5 = tail call <4 x double> @llvm.x86.avx.addsub.pd.256(<4 x double> %3, <4 x double> %4)
+  %6 = extractelement <4 x double> %5, i32 1
+  %7 = extractelement <4 x double> %5, i32 3
+  %8 = fadd double %6, %7
+  ret double %8
+}
+
+define float @elts_addsub_v8f32(<8 x float> %0, <8 x float> %1) {
+; CHECK-LABEL: @elts_addsub_v8f32(
+; CHECK-NEXT:    [[TMP3:%.*]] = tail call <8 x float> @llvm.x86.avx.addsub.ps.256(<8 x float> [[TMP0:%.*]], <8 x float> [[TMP1:%.*]])
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <8 x float> [[TMP3]], i32 0
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <8 x float> [[TMP3]], i32 1
+; CHECK-NEXT:    [[TMP6:%.*]] = fadd float [[TMP4]], [[TMP5]]
+; CHECK-NEXT:    ret float [[TMP6]]
+;
+  %3 = shufflevector <8 x float> %0, <8 x float> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 4, i32 4, i32 4>
+  %4 = shufflevector <8 x float> %1, <8 x float> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 4, i32 4, i32 4>
+  %5 = tail call <8 x float> @llvm.x86.avx.addsub.ps.256(<8 x float> %3, <8 x float> %4)
+  %6 = extractelement <8 x float> %5, i32 0
+  %7 = extractelement <8 x float> %5, i32 1
+  %8 = fadd float %6, %7
+  ret float %8
+}
+
+define float @elts_addsub_v8f32_sub(<8 x float> %0, <8 x float> %1) {
+; CHECK-LABEL: @elts_addsub_v8f32_sub(
+; CHECK-NEXT:    [[TMP3:%.*]] = fsub <8 x float> [[TMP0:%.*]], [[TMP1:%.*]]
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <8 x float> [[TMP3]], i32 0
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <8 x float> [[TMP3]], i32 4
+; CHECK-NEXT:    [[TMP6:%.*]] = fadd float [[TMP4]], [[TMP5]]
+; CHECK-NEXT:    ret float [[TMP6]]
+;
+  %3 = shufflevector <8 x float> %0, <8 x float> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 4, i32 4, i32 4>
+  %4 = shufflevector <8 x float> %1, <8 x float> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 4, i32 4, i32 4>
+  %5 = tail call <8 x float> @llvm.x86.avx.addsub.ps.256(<8 x float> %3, <8 x float> %4)
+  %6 = extractelement <8 x float> %5, i32 0
+  %7 = extractelement <8 x float> %5, i32 4
+  %8 = fadd float %6, %7
+  ret float %8
+}
+
+define void @PR46277(float %0, float %1, float %2, float %3, <4 x float> %4, float* %5) {
+; CHECK-LABEL: @PR46277(
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <4 x float> poison, float [[TMP0:%.*]], i32 0
+; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <4 x float> [[TMP7]], float [[TMP1:%.*]], i32 1
+; CHECK-NEXT:    [[TMP9:%.*]] = tail call <4 x float> @llvm.x86.sse3.addsub.ps(<4 x float> [[TMP8]], <4 x float> [[TMP4:%.*]])
+; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <4 x float> [[TMP9]], i32 0
+; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds float, float* [[TMP5:%.*]], i64 1
+; CHECK-NEXT:    store float [[TMP10]], float* [[TMP5]], align 4
+; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <4 x float> [[TMP9]], i32 1
+; CHECK-NEXT:    store float [[TMP12]], float* [[TMP11]], align 4
+; CHECK-NEXT:    ret void
+;
+  %7 = insertelement <4 x float> poison, float %0, i32 0
+  %8 = insertelement <4 x float> %7, float %1, i32 1
+  %9 = insertelement <4 x float> %8, float %2, i32 2
+  %10 = insertelement <4 x float> %9, float %3, i32 3
+  %11 = tail call <4 x float> @llvm.x86.sse3.addsub.ps(<4 x float> %10, <4 x float> %4)
+  %12 = extractelement <4 x float> %11, i32 0
+  %13 = getelementptr inbounds float, float* %5, i64 1
+  store float %12, float* %5, align 4
+  %14 = extractelement <4 x float> %11, i32 1
+  store float %14, float* %13, align 4
+  ret void
+}
+
+define double @PR48476_fsub(<2 x double> %x) {
+; CHECK-LABEL: @PR48476_fsub(
+; CHECK-NEXT:    [[TMP1:%.*]] = fsub <2 x double> <double 0.000000e+00, double undef>, [[X:%.*]]
+; CHECK-NEXT:    [[T2:%.*]] = call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> [[TMP1]], <2 x double> [[X]], i8 6)
+; CHECK-NEXT:    [[VECEXT:%.*]] = extractelement <2 x double> [[T2]], i32 0
+; CHECK-NEXT:    ret double [[VECEXT]]
+;
+  %t1 = call <2 x double> @llvm.x86.sse3.addsub.pd(<2 x double> zeroinitializer, <2 x double> %x)
+  %t2 = call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %t1, <2 x double> %x, i8 6)
+  %vecext = extractelement <2 x double> %t2, i32 0
+  ret double %vecext
+}
+
+define double @PR48476_fadd_fsub(<2 x double> %x) {
+; CHECK-LABEL: @PR48476_fadd_fsub(
+; CHECK-NEXT:    [[TMP1:%.*]] = fadd <2 x double> [[X:%.*]], <double undef, double 0.000000e+00>
+; CHECK-NEXT:    [[S:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> undef, <2 x i32> <i32 1, i32 undef>
+; CHECK-NEXT:    [[TMP2:%.*]] = fsub <2 x double> [[S]], [[X]]
+; CHECK-NEXT:    [[VECEXT:%.*]] = extractelement <2 x double> [[TMP2]], i32 0
+; CHECK-NEXT:    ret double [[VECEXT]]
+;
+  %t1 = call <2 x double> @llvm.x86.sse3.addsub.pd(<2 x double> zeroinitializer, <2 x double> %x)
+  %s = shufflevector <2 x double> %t1, <2 x double> undef, <2 x i32> <i32 1, i32 0>
+  %t2 = call <2 x double> @llvm.x86.sse3.addsub.pd(<2 x double> %s, <2 x double> %x)
+  %vecext = extractelement <2 x double> %t2, i32 0
+  ret double %vecext
+}
--- a/llvm/test/Transforms/InstCombine/X86/x86-avx512-inseltpoison.ll
+++ b/llvm/test/Transforms/InstCombine/X86/x86-avx512-inseltpoison.ll
--- a/llvm/test/Transforms/InstCombine/X86/x86-pack-inseltpoison.ll
+++ b/llvm/test/Transforms/InstCombine/X86/x86-pack-inseltpoison.ll
@ -0,0 +1,635 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -instcombine -mtriple=x86_64-unknown-unknown -S | FileCheck %s
+
+;
+; UNDEF Elts
+;
+
+define <8 x i16> @undef_packssdw_128() {
+; CHECK-LABEL: @undef_packssdw_128(
+; CHECK-NEXT:    ret <8 x i16> undef
+;
+  %1 = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> undef, <4 x i32> undef)
+  ret <8 x i16> %1
+}
+
+define <8 x i16> @undef_packusdw_128() {
+; CHECK-LABEL: @undef_packusdw_128(
+; CHECK-NEXT:    ret <8 x i16> undef
+;
+  %1 = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> undef, <4 x i32> undef)
+  ret <8 x i16> %1
+}
+
+define <16 x i8> @undef_packsswb_128() {
+; CHECK-LABEL: @undef_packsswb_128(
+; CHECK-NEXT:    ret <16 x i8> undef
+;
+  %1 = call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> undef, <8 x i16> undef)
+  ret <16 x i8> %1
+}
+
+define <16 x i8> @undef_packuswb_128() {
+; CHECK-LABEL: @undef_packuswb_128(
+; CHECK-NEXT:    ret <16 x i8> undef
+;
+  %1 = call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> undef, <8 x i16> undef)
+  ret <16 x i8> %1
+}
+
+define <16 x i16> @undef_packssdw_256() {
+; CHECK-LABEL: @undef_packssdw_256(
+; CHECK-NEXT:    ret <16 x i16> undef
+;
+  %1 = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> undef, <8 x i32> undef)
+  ret <16 x i16> %1
+}
+
+define <16 x i16> @undef_packusdw_256() {
+; CHECK-LABEL: @undef_packusdw_256(
+; CHECK-NEXT:    ret <16 x i16> undef
+;
+  %1 = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> undef, <8 x i32> undef)
+  ret <16 x i16> %1
+}
+
+define <32 x i8> @undef_packsswb_256() {
+; CHECK-LABEL: @undef_packsswb_256(
+; CHECK-NEXT:    ret <32 x i8> undef
+;
+  %1 = call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> undef, <16 x i16> undef)
+  ret <32 x i8> %1
+}
+
+define <32 x i8> @undef_packuswb_256() {
+; CHECK-LABEL: @undef_packuswb_256(
+; CHECK-NEXT:    ret <32 x i8> undef
+;
+  %1 = call <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> undef, <16 x i16> undef)
+  ret <32 x i8> %1
+}
+
+define <32 x i16> @undef_packssdw_512() {
+; CHECK-LABEL: @undef_packssdw_512(
+; CHECK-NEXT:    ret <32 x i16> undef
+;
+  %1 = call <32 x i16> @llvm.x86.avx512.packssdw.512(<16 x i32> undef, <16 x i32> undef)
+  ret <32 x i16> %1
+}
+
+define <32 x i16> @undef_packusdw_512() {
+; CHECK-LABEL: @undef_packusdw_512(
+; CHECK-NEXT:    ret <32 x i16> undef
+;
+  %1 = call <32 x i16> @llvm.x86.avx512.packusdw.512(<16 x i32> undef, <16 x i32> undef)
+  ret <32 x i16> %1
+}
+
+define <64 x i8> @undef_packsswb_512() {
+; CHECK-LABEL: @undef_packsswb_512(
+; CHECK-NEXT:    ret <64 x i8> undef
+;
+  %1 = call <64 x i8> @llvm.x86.avx512.packsswb.512(<32 x i16> undef, <32 x i16> undef)
+  ret <64 x i8> %1
+}
+
+define <64 x i8> @undef_packuswb_512() {
+; CHECK-LABEL: @undef_packuswb_512(
+; CHECK-NEXT:    ret <64 x i8> undef
+;
+  %1 = call <64 x i8> @llvm.x86.avx512.packuswb.512(<32 x i16> undef, <32 x i16> undef)
+  ret <64 x i8> %1
+}
+
+;
+; Constant Folding
+;
+
+define <8 x i16> @fold_packssdw_128() {
+; CHECK-LABEL: @fold_packssdw_128(
+; CHECK-NEXT:    ret <8 x i16> <i16 0, i16 -1, i16 32767, i16 -32768, i16 0, i16 0, i16 0, i16 0>
+;
+  %1 = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> <i32 0, i32 -1, i32 65536, i32 -131072>, <4 x i32> zeroinitializer)
+  ret <8 x i16> %1
+}
+
+define <8 x i16> @fold_packusdw_128() {
+; CHECK-LABEL: @fold_packusdw_128(
+; CHECK-NEXT:    ret <8 x i16> <i16 undef, i16 undef, i16 undef, i16 undef, i16 0, i16 0, i16 -32768, i16 -1>
+;
+  %1 = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> undef, <4 x i32> <i32 0, i32 -1, i32 32768, i32 65537>)
+  ret <8 x i16> %1
+}
+
+define <16 x i8> @fold_packsswb_128() {
+; CHECK-LABEL: @fold_packsswb_128(
+; CHECK-NEXT:    ret <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef>
+;
+  %1 = call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> zeroinitializer, <8 x i16> undef)
+  ret <16 x i8> %1
+}
+
+define <16 x i8> @fold_packuswb_128() {
+; CHECK-LABEL: @fold_packuswb_128(
+; CHECK-NEXT:    ret <16 x i8> <i8 0, i8 1, i8 0, i8 -1, i8 0, i8 0, i8 0, i8 15, i8 0, i8 127, i8 0, i8 1, i8 0, i8 1, i8 0, i8 0>
+;
+  %1 = call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> <i16 0, i16 1, i16 -1, i16 255, i16 65535, i16 -32768, i16 -127, i16 15>, <8 x i16> <i16 -15, i16 127, i16 32768, i16 -65535, i16 -255, i16 1, i16 -1, i16 0>)
+  ret <16 x i8> %1
+}
+
+define <16 x i16> @fold_packssdw_256() {
+; CHECK-LABEL: @fold_packssdw_256(
+; CHECK-NEXT:    ret <16 x i16> <i16 0, i16 256, i16 32767, i16 -32768, i16 undef, i16 undef, i16 undef, i16 undef, i16 -127, i16 -32768, i16 -32767, i16 32767, i16 undef, i16 undef, i16 undef, i16 undef>
+;
+  %1 = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> <i32 0, i32 256, i32 65535, i32 -65536, i32 -127, i32 -32768, i32 -32767, i32 32767>, <8 x i32> undef)
+  ret <16 x i16> %1
+}
+
+define <16 x i16> @fold_packusdw_256() {
+; CHECK-LABEL: @fold_packusdw_256(
+; CHECK-NEXT:    ret <16 x i16> <i16 0, i16 0, i16 0, i16 -1, i16 0, i16 256, i16 -1, i16 0, i16 127, i16 -32768, i16 32767, i16 0, i16 0, i16 0, i16 0, i16 32767>
+;
+  %1 = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> <i32 0, i32 -256, i32 -65535, i32 65536, i32 127, i32 32768, i32 32767, i32 -32767>, <8 x i32> <i32 0, i32 256, i32 65535, i32 -65536, i32 -127, i32 -32768, i32 -32767, i32 32767>)
+  ret <16 x i16> %1
+}
+
+define <32 x i8> @fold_packsswb_256() {
+; CHECK-LABEL: @fold_packsswb_256(
+; CHECK-NEXT:    ret <32 x i8> <i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>
+;
+  %1 = call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> undef, <16 x i16> zeroinitializer)
+  ret <32 x i8> %1
+}
+
+define <32 x i8> @fold_packuswb_256() {
+; CHECK-LABEL: @fold_packuswb_256(
+; CHECK-NEXT:    ret <32 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 -1, i8 -1, i8 -1, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 1, i8 2, i8 4, i8 8, i8 16, i8 32, i8 64>
+;
+  %1 = call <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> zeroinitializer, <16 x i16> <i16 0, i16 -127, i16 -128, i16 -32768, i16 65536, i16 255, i16 256, i16 512, i16 -1, i16 1, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64>)
+  ret <32 x i8> %1
+}
+
+define <32 x i16> @fold_packssdw_512() {
+; CHECK-LABEL: @fold_packssdw_512(
+; CHECK-NEXT:    ret <32 x i16> <i16 0, i16 512, i16 32767, i16 -32768, i16 undef, i16 undef, i16 undef, i16 undef, i16 -127, i16 -32768, i16 -32767, i16 32767, i16 undef, i16 undef, i16 undef, i16 undef, i16 0, i16 512, i16 32767, i16 -32768, i16 undef, i16 undef, i16 undef, i16 undef, i16 -127, i16 -32768, i16 -32767, i16 32767, i16 undef, i16 undef, i16 undef, i16 undef>
+;
+  %1 = call <32 x i16> @llvm.x86.avx512.packssdw.512(<16 x i32> <i32 0, i32 512, i32 65535, i32 -65536, i32 -127, i32 -32768, i32 -32767, i32 32767, i32 0, i32 512, i32 65535, i32 -65536, i32 -127, i32 -32768, i32 -32767, i32 32767>, <16 x i32> undef)
+  ret <32 x i16> %1
+}
+
+define <32 x i16> @fold_packusdw_512() {
+; CHECK-LABEL: @fold_packusdw_512(
+; CHECK-NEXT:    ret <32 x i16> <i16 0, i16 0, i16 0, i16 -1, i16 0, i16 512, i16 -1, i16 0, i16 127, i16 -32768, i16 32767, i16 0, i16 0, i16 0, i16 0, i16 32767, i16 0, i16 0, i16 0, i16 -1, i16 0, i16 512, i16 -1, i16 0, i16 127, i16 -32768, i16 32767, i16 0, i16 0, i16 0, i16 0, i16 32767>
+;
+  %1 = call <32 x i16> @llvm.x86.avx512.packusdw.512(<16 x i32> <i32 0, i32 -512, i32 -65535, i32 65536, i32 127, i32 32768, i32 32767, i32 -32767, i32 0, i32 -512, i32 -65535, i32 65536, i32 127, i32 32768, i32 32767, i32 -32767>, <16 x i32> <i32 0, i32 512, i32 65535, i32 -65536, i32 -127, i32 -32768, i32 -32767, i32 32767, i32 0, i32 512, i32 65535, i32 -65536, i32 -127, i32 -32768, i32 -32767, i32 32767>)
+  ret <32 x i16> %1
+}
+
+define <64 x i8> @fold_packsswb_512() {
+; CHECK-LABEL: @fold_packsswb_512(
+; CHECK-NEXT:    ret <64 x i8> <i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>
+;
+  %1 = call <64 x i8> @llvm.x86.avx512.packsswb.512(<32 x i16> undef, <32 x i16> zeroinitializer)
+  ret <64 x i8> %1
+}
+
+define <64 x i8> @fold_packuswb_512() {
+; CHECK-LABEL: @fold_packuswb_512(
+; CHECK-NEXT:    ret <64 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 -1, i8 -1, i8 -1, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 1, i8 2, i8 4, i8 8, i8 16, i8 32, i8 64, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 -1, i8 -1, i8 -1, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 1, i8 2, i8 4, i8 8, i8 16, i8 32, i8 64>
+;
+  %1 = call <64 x i8> @llvm.x86.avx512.packuswb.512(<32 x i16> zeroinitializer, <32 x i16> <i16 0, i16 -127, i16 -128, i16 -32768, i16 65536, i16 255, i16 512, i16 512, i16 -1, i16 1, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 0, i16 -127, i16 -128, i16 -32768, i16 65536, i16 255, i16 512, i16 512, i16 -1, i16 1, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64>)
+  ret <64 x i8> %1
+}
+
+;
+; Demanded Elts
+;
+
+define <8 x i16> @elts_packssdw_128(<4 x i32> %a0, <4 x i32> %a1) {
+; CHECK-LABEL: @elts_packssdw_128(
+; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> [[A0:%.*]], <4 x i32> undef)
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> undef, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    ret <8 x i16> [[TMP2]]
+;
+  %1 = shufflevector <4 x i32> %a0, <4 x i32> undef, <4 x i32> <i32 3, i32 1, i32 undef, i32 undef>
+  %2 = shufflevector <4 x i32> %a1, <4 x i32> undef, <4 x i32> <i32 undef, i32 2, i32 1, i32 undef>
+  %3 = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %1, <4 x i32> %2)
+  %4 = shufflevector <8 x i16> %3, <8 x i16> undef, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 7, i32 7, i32 7, i32 7>
+  ret <8 x i16> %4
+}
+
+define <8 x i16> @elts_packusdw_128(<4 x i32> %a0, <4 x i32> %a1) {
+; CHECK-LABEL: @elts_packusdw_128(
+; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> [[A0:%.*]], <4 x i32> [[A1:%.*]])
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> undef, <8 x i32> <i32 undef, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 undef>
+; CHECK-NEXT:    ret <8 x i16> [[TMP2]]
+;
+  %1 = insertelement <4 x i32> %a0, i32 0, i32 0
+  %2 = insertelement <4 x i32> %a1, i32 0, i32 3
+  %3 = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> %1, <4 x i32> %2)
+  %4 = shufflevector <8 x i16> %3, <8 x i16> undef, <8 x i32> <i32 undef, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 undef>
+  ret <8 x i16> %4
+}
+
+define <16 x i8> @elts_packsswb_128(<8 x i16> %a0, <8 x i16> %a1) {
+; CHECK-LABEL: @elts_packsswb_128(
+; CHECK-NEXT:    ret <16 x i8> zeroinitializer
+;
+  %1 = insertelement <8 x i16> %a0, i16 0, i32 0
+  %2 = insertelement <8 x i16> %a1, i16 0, i32 0
+  %3 = call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> %1, <8 x i16> %2)
+  %4 = shufflevector <16 x i8> %3, <16 x i8> undef, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
+  ret <16 x i8> %4
+}
+
+define <16 x i8> @elts_packuswb_128(<8 x i16> %a0, <8 x i16> %a1) {
+; CHECK-LABEL: @elts_packuswb_128(
+; CHECK-NEXT:    ret <16 x i8> undef
+;
+  %1 = insertelement <8 x i16> poison, i16 0, i32 0
+  %2 = insertelement <8 x i16> poison, i16 0, i32 0
+  %3 = call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> %1, <8 x i16> %2)
+  %4 = shufflevector <16 x i8> %3, <16 x i8> undef, <16 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
+  ret <16 x i8> %4
+}
+
+define <16 x i16> @elts_packssdw_256(<8 x i32> %a0, <8 x i32> %a1) {
+; CHECK-LABEL: @elts_packssdw_256(
+; CHECK-NEXT:    [[TMP1:%.*]] = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> [[A0:%.*]], <8 x i32> undef)
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i16> [[TMP1]], <16 x i16> undef, <16 x i32> <i32 undef, i32 undef, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 8, i32 undef, i32 undef, i32 11, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    ret <16 x i16> [[TMP2]]
+;
+  %1 = shufflevector <8 x i32> %a0, <8 x i32> undef, <8 x i32> <i32 1, i32 0, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %2 = shufflevector <8 x i32> %a1, <8 x i32> undef, <8 x i32> <i32 undef, i32 2, i32 1, i32 undef, i32 undef, i32 6, i32 5, i32 undef>
+  %3 = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> %1, <8 x i32> %2)
+  %4 = shufflevector <16 x i16> %3, <16 x i16> undef, <16 x i32> <i32 undef, i32 undef, i32 2, i32 3, i32 4, i32 undef, i32 undef, i32 7, i32 8, i32 undef, i32 undef, i32 11, i32 12, i32 undef, i32 undef, i32 15>
+  ret <16 x i16> %4
+}
+
+define <16 x i16> @elts_packusdw_256(<8 x i32> %a0, <8 x i32> %a1) {
+; CHECK-LABEL: @elts_packusdw_256(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A1:%.*]], <8 x i32> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    [[TMP2:%.*]] = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> undef, <8 x i32> [[TMP1]])
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <16 x i16> [[TMP2]], <16 x i16> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    ret <16 x i16> [[TMP3]]
+;
+  %1 = shufflevector <8 x i32> %a0, <8 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %2 = shufflevector <8 x i32> %a1, <8 x i32> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+  %3 = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> %1, <8 x i32> %2)
+  %4 = shufflevector <16 x i16> %3, <16 x i16> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef>
+  ret <16 x i16> %4
+}
+
+define <32 x i8> @elts_packsswb_256(<16 x i16> %a0, <16 x i16> %a1) {
+; CHECK-LABEL: @elts_packsswb_256(
+; CHECK-NEXT:    ret <32 x i8> zeroinitializer
+;
+  %1 = insertelement <16 x i16> %a0, i16 0, i32 0
+  %2 = insertelement <16 x i16> %a1, i16 0, i32 8
+  %3 = call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> %1, <16 x i16> %2)
+  %4 = shufflevector <32 x i8> %3, <32 x i8> undef, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24>
+  ret <32 x i8> %4
+}
+
+define <32 x i8> @elts_packuswb_256(<16 x i16> %a0, <16 x i16> %a1) {
+; CHECK-LABEL: @elts_packuswb_256(
+; CHECK-NEXT:    ret <32 x i8> undef
+;
+  %1 = insertelement <16 x i16> poison, i16 0, i32 1
+  %2 = insertelement <16 x i16> poison, i16 0, i32 0
+  %3 = call <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> %1, <16 x i16> %2)
+  %4 = shufflevector <32 x i8> %3, <32 x i8> undef, <32 x i32> zeroinitializer
+  ret <32 x i8> %4
+}
+
+define <32 x i16> @elts_packssdw_512(<16 x i32> %a0, <16 x i32> %a1) {
+; CHECK-LABEL: @elts_packssdw_512(
+; CHECK-NEXT:    [[TMP1:%.*]] = call <32 x i16> @llvm.x86.avx512.packssdw.512(<16 x i32> [[A0:%.*]], <16 x i32> undef)
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <32 x i16> [[TMP1]], <32 x i16> undef, <32 x i32> <i32 undef, i32 undef, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 8, i32 undef, i32 undef, i32 11, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 18, i32 19, i32 undef, i32 undef, i32 undef, i32 undef, i32 24, i32 undef, i32 undef, i32 27, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    ret <32 x i16> [[TMP2]]
+;
+  %1 = shufflevector <16 x i32> %a0, <16 x i32> undef, <16 x i32> <i32 1, i32 0, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 9, i32 8, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %2 = shufflevector <16 x i32> %a1, <16 x i32> undef, <16 x i32> <i32 undef, i32 2, i32 1, i32 undef, i32 undef, i32 6, i32 5, i32 undef, i32 undef, i32 10, i32 9, i32 undef, i32 undef, i32 14, i32 13, i32 undef>
+  %3 = call <32 x i16> @llvm.x86.avx512.packssdw.512(<16 x i32> %1, <16 x i32> %2)
+  %4 = shufflevector <32 x i16> %3, <32 x i16> undef, <32 x i32> <i32 undef, i32 undef, i32 2, i32 3, i32 4, i32 undef, i32 undef, i32 7, i32 8, i32 undef, i32 undef, i32 11, i32 12, i32 undef, i32 undef, i32 15, i32 undef, i32 undef, i32 18, i32 19, i32 20, i32 undef, i32 undef, i32 23, i32 24, i32 undef, i32 undef, i32 27, i32 28, i32 undef, i32 undef, i32 31>
+  ret <32 x i16> %4
+}
+
+define <32 x i16> @elts_packusdw_512(<16 x i32> %a0, <16 x i32> %a1) {
+; CHECK-LABEL: @elts_packusdw_512(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i32> [[A1:%.*]], <16 x i32> undef, <16 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8>
+; CHECK-NEXT:    [[TMP2:%.*]] = call <32 x i16> @llvm.x86.avx512.packusdw.512(<16 x i32> undef, <16 x i32> [[TMP1]])
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <32 x i16> [[TMP2]], <32 x i16> undef, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 20, i32 21, i32 22, i32 23, i32 undef, i32 undef, i32 undef, i32 undef, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    ret <32 x i16> [[TMP3]]
+;
+  %1 = shufflevector <16 x i32> %a0, <16 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %2 = shufflevector <16 x i32> %a1, <16 x i32> undef, <16 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8>
+  %3 = call <32 x i16> @llvm.x86.avx512.packusdw.512(<16 x i32> %1, <16 x i32> %2)
+  %4 = shufflevector <32 x i16> %3, <32 x i16> undef, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 20, i32 21, i32 22, i32 23, i32 undef, i32 undef, i32 undef, i32 undef, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef>
+  ret <32 x i16> %4
+}
+
+define <64 x i8> @elts_packsswb_512(<32 x i16> %a0, <32 x i16> %a1) {
+; CHECK-LABEL: @elts_packsswb_512(
+; CHECK-NEXT:    ret <64 x i8> zeroinitializer
+;
+  %1 = insertelement <32 x i16> %a0, i16 0, i32 0
+  %2 = insertelement <32 x i16> %a1, i16 0, i32 8
+  %3 = insertelement <32 x i16> %1, i16 0, i32 16
+  %4 = insertelement <32 x i16> %2, i16 0, i32 24
+  %5 = call <64 x i8> @llvm.x86.avx512.packsswb.512(<32 x i16> %3, <32 x i16> %4)
+  %6 = shufflevector <64 x i8> %5, <64 x i8> undef, <64 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56>
+  ret <64 x i8> %6
+}
+
+define <64 x i8> @elts_packuswb_512(<32 x i16> %a0, <32 x i16> %a1) {
+; CHECK-LABEL: @elts_packuswb_512(
+; CHECK-NEXT:    ret <64 x i8> undef
+;
+  %1 = insertelement <32 x i16> poison, i16 0, i32 1
+  %2 = insertelement <32 x i16> poison, i16 0, i32 0
+  %3 = call <64 x i8> @llvm.x86.avx512.packuswb.512(<32 x i16> %1, <32 x i16> %2)
+  %4 = shufflevector <64 x i8> %3, <64 x i8> undef, <64 x i32> zeroinitializer
+  ret <64 x i8> %4
+}
+
+;
+; Truncation (without Saturation)
+;
+
+define <8 x i16> @trunc_packssdw_128(<4 x i32> %a0, <4 x i32> %a1) {
+; CHECK-LABEL: @trunc_packssdw_128(
+; CHECK-NEXT:    [[TMP1:%.*]] = ashr <4 x i32> [[A0:%.*]], <i32 17, i32 17, i32 17, i32 17>
+; CHECK-NEXT:    [[TMP2:%.*]] = and <4 x i32> [[A1:%.*]], <i32 15, i32 15, i32 15, i32 15>
+; CHECK-NEXT:    [[TMP3:%.*]] = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> [[TMP1]], <4 x i32> [[TMP2]])
+; CHECK-NEXT:    ret <8 x i16> [[TMP3]]
+;
+  %1 = ashr <4 x i32> %a0, <i32 17, i32 17, i32 17, i32 17>
+  %2 = and  <4 x i32> %a1, <i32 15, i32 15, i32 15, i32 15>
+  %3 = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %1, <4 x i32> %2)
+  ret <8 x i16> %3
+}
+
+define <8 x i16> @trunc_packusdw_128(<4 x i32> %a0, <4 x i32> %a1) {
+; CHECK-LABEL: @trunc_packusdw_128(
+; CHECK-NEXT:    [[TMP1:%.*]] = lshr <4 x i32> [[A0:%.*]], <i32 17, i32 17, i32 17, i32 17>
+; CHECK-NEXT:    [[TMP2:%.*]] = and <4 x i32> [[A1:%.*]], <i32 15, i32 15, i32 15, i32 15>
+; CHECK-NEXT:    [[TMP3:%.*]] = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> [[TMP1]], <4 x i32> [[TMP2]])
+; CHECK-NEXT:    ret <8 x i16> [[TMP3]]
+;
+  %1 = lshr <4 x i32> %a0, <i32 17, i32 17, i32 17, i32 17>
+  %2 = and  <4 x i32> %a1, <i32 15, i32 15, i32 15, i32 15>
+  %3 = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %1, <4 x i32> %2)
+  ret <8 x i16> %3
+}
+
+define <16 x i8> @trunc_packsswb_128(<8 x i16> %a0, <8 x i16> %a1) {
+; CHECK-LABEL: @trunc_packsswb_128(
+; CHECK-NEXT:    [[TMP1:%.*]] = ashr <8 x i16> [[A0:%.*]], <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
+; CHECK-NEXT:    [[TMP2:%.*]] = and <8 x i16> [[A1:%.*]], <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+; CHECK-NEXT:    [[TMP3:%.*]] = call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> [[TMP1]], <8 x i16> [[TMP2]])
+; CHECK-NEXT:    ret <16 x i8> [[TMP3]]
+;
+  %1 = ashr <8 x i16> %a0, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
+  %2 = and  <8 x i16> %a1, <i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1>
+  %3 = call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> %1, <8 x i16> %2)
+  ret <16 x i8> %3
+}
+
+define <16 x i8> @trunc_packuswb_128(<8 x i16> %a0, <8 x i16> %a1) {
+; CHECK-LABEL: @trunc_packuswb_128(
+; CHECK-NEXT:    [[TMP1:%.*]] = lshr <8 x i16> [[A0:%.*]], <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
+; CHECK-NEXT:    [[TMP2:%.*]] = and <8 x i16> [[A1:%.*]], <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+; CHECK-NEXT:    [[TMP3:%.*]] = call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> [[TMP1]], <8 x i16> [[TMP2]])
+; CHECK-NEXT:    ret <16 x i8> [[TMP3]]
+;
+  %1 = lshr <8 x i16> %a0, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
+  %2 = and  <8 x i16> %a1, <i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1>
+  %3 = call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> %1, <8 x i16> %2)
+  ret <16 x i8> %3
+}
+
+define <16 x i16> @trunc_packssdw_256(<8 x i32> %a0, <8 x i32> %a1) {
+; CHECK-LABEL: @trunc_packssdw_256(
+; CHECK-NEXT:    [[TMP1:%.*]] = ashr <8 x i32> [[A0:%.*]], <i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17>
+; CHECK-NEXT:    [[TMP2:%.*]] = ashr <8 x i32> [[A1:%.*]], <i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23>
+; CHECK-NEXT:    [[TMP3:%.*]] = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> [[TMP1]], <8 x i32> [[TMP2]])
+; CHECK-NEXT:    ret <16 x i16> [[TMP3]]
+;
+  %1 = ashr <8 x i32> %a0, <i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17>
+  %2 = ashr <8 x i32> %a1, <i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23>
+  %3 = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> %1, <8 x i32> %2)
+  ret <16 x i16> %3
+}
+
+define <16 x i16> @trunc_packusdw_256(<8 x i32> %a0, <8 x i32> %a1) {
+; CHECK-LABEL: @trunc_packusdw_256(
+; CHECK-NEXT:    [[TMP1:%.*]] = lshr <8 x i32> [[A0:%.*]], <i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17>
+; CHECK-NEXT:    [[TMP2:%.*]] = and <8 x i32> [[A1:%.*]], <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
+; CHECK-NEXT:    [[TMP3:%.*]] = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> [[TMP1]], <8 x i32> [[TMP2]])
+; CHECK-NEXT:    ret <16 x i16> [[TMP3]]
+;
+  %1 = lshr <8 x i32> %a0, <i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17>
+  %2 = and  <8 x i32> %a1, <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
+  %3 = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> %1, <8 x i32> %2)
+  ret <16 x i16> %3
+}
+
+define <32 x i8> @trunc_packsswb_256(<16 x i16> %a0, <16 x i16> %a1) {
+; CHECK-LABEL: @trunc_packsswb_256(
+; CHECK-NEXT:    [[TMP1:%.*]] = ashr <16 x i16> [[A0:%.*]], <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
+; CHECK-NEXT:    [[TMP2:%.*]] = and <16 x i16> [[A1:%.*]], <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+; CHECK-NEXT:    [[TMP3:%.*]] = call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> [[TMP1]], <16 x i16> [[TMP2]])
+; CHECK-NEXT:    ret <32 x i8> [[TMP3]]
+;
+  %1 = ashr <16 x i16> %a0, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
+  %2 = and  <16 x i16> %a1, <i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1>
+  %3 = call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> %1, <16 x i16> %2)
+  ret <32 x i8> %3
+}
+
+define <32 x i8> @trunc_packuswb_256(<16 x i16> %a0, <16 x i16> %a1) {
+; CHECK-LABEL: @trunc_packuswb_256(
+; CHECK-NEXT:    [[TMP1:%.*]] = lshr <16 x i16> [[A0:%.*]], <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
+; CHECK-NEXT:    [[TMP2:%.*]] = and <16 x i16> [[A1:%.*]], <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+; CHECK-NEXT:    [[TMP3:%.*]] = call <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> [[TMP1]], <16 x i16> [[TMP2]])
+; CHECK-NEXT:    ret <32 x i8> [[TMP3]]
+;
+  %1 = lshr <16 x i16> %a0, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
+  %2 = and  <16 x i16> %a1, <i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1>
+  %3 = call <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> %1, <16 x i16> %2)
+  ret <32 x i8> %3
+}
+
+define <32 x i16> @trunc_packssdw_512(<16 x i32> %a0, <16 x i32> %a1) {
+; CHECK-LABEL: @trunc_packssdw_512(
+; CHECK-NEXT:    [[TMP1:%.*]] = ashr <16 x i32> [[A0:%.*]], <i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17>
+; CHECK-NEXT:    [[TMP2:%.*]] = ashr <16 x i32> [[A1:%.*]], <i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23>
+; CHECK-NEXT:    [[TMP3:%.*]] = call <32 x i16> @llvm.x86.avx512.packssdw.512(<16 x i32> [[TMP1]], <16 x i32> [[TMP2]])
+; CHECK-NEXT:    ret <32 x i16> [[TMP3]]
+;
+  %1 = ashr <16 x i32> %a0, <i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17>
+  %2 = ashr <16 x i32> %a1, <i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23>
+  %3 = call <32 x i16> @llvm.x86.avx512.packssdw.512(<16 x i32> %1, <16 x i32> %2)
+  ret <32 x i16> %3
+}
+
+define <32 x i16> @trunc_packusdw_512(<16 x i32> %a0, <16 x i32> %a1) {
+; CHECK-LABEL: @trunc_packusdw_512(
+; CHECK-NEXT:    [[TMP1:%.*]] = lshr <16 x i32> [[A0:%.*]], <i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17>
+; CHECK-NEXT:    [[TMP2:%.*]] = and <16 x i32> [[A1:%.*]], <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
+; CHECK-NEXT:    [[TMP3:%.*]] = call <32 x i16> @llvm.x86.avx512.packssdw.512(<16 x i32> [[TMP1]], <16 x i32> [[TMP2]])
+; CHECK-NEXT:    ret <32 x i16> [[TMP3]]
+;
+  %1 = lshr <16 x i32> %a0, <i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17>
+  %2 = and  <16 x i32> %a1, <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
+  %3 = call <32 x i16> @llvm.x86.avx512.packssdw.512(<16 x i32> %1, <16 x i32> %2)
+  ret <32 x i16> %3
+}
+
+define <64 x i8> @trunc_packsswb_512(<32 x i16> %a0, <32 x i16> %a1) {
+; CHECK-LABEL: @trunc_packsswb_512(
+; CHECK-NEXT:    [[TMP1:%.*]] = ashr <32 x i16> [[A0:%.*]], <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
+; CHECK-NEXT:    [[TMP2:%.*]] = and <32 x i16> [[A1:%.*]], <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+; CHECK-NEXT:    [[TMP3:%.*]] = call <64 x i8> @llvm.x86.avx512.packsswb.512(<32 x i16> [[TMP1]], <32 x i16> [[TMP2]])
+; CHECK-NEXT:    ret <64 x i8> [[TMP3]]
+;
+  %1 = ashr <32 x i16> %a0, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
+  %2 = and  <32 x i16> %a1, <i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1>
+  %3 = call <64 x i8> @llvm.x86.avx512.packsswb.512(<32 x i16> %1, <32 x i16> %2)
+  ret <64 x i8> %3
+}
+
+define <64 x i8> @trunc_packuswb_512(<32 x i16> %a0, <32 x i16> %a1) {
+; CHECK-LABEL: @trunc_packuswb_512(
+; CHECK-NEXT:    [[TMP1:%.*]] = lshr <32 x i16> [[A0:%.*]], <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
+; CHECK-NEXT:    [[TMP2:%.*]] = and <32 x i16> [[A1:%.*]], <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+; CHECK-NEXT:    [[TMP3:%.*]] = call <64 x i8> @llvm.x86.avx512.packuswb.512(<32 x i16> [[TMP1]], <32 x i16> [[TMP2]])
+; CHECK-NEXT:    ret <64 x i8> [[TMP3]]
+;
+  %1 = lshr <32 x i16> %a0, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
+  %2 = and  <32 x i16> %a1, <i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1>
+  %3 = call <64 x i8> @llvm.x86.avx512.packuswb.512(<32 x i16> %1, <32 x i16> %2)
+  ret <64 x i8> %3
+}
+
+;
+; Signed Pack Comparison Results
+;
+
+define <8 x i16> @cmp_packssdw_128(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2, <4 x i32> %a3) {
+; CHECK-LABEL: @cmp_packssdw_128(
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp eq <4 x i32> [[A0:%.*]], [[A1:%.*]]
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp eq <4 x i32> [[A2:%.*]], [[A3:%.*]]
+; CHECK-NEXT:    [[TMP3:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i32>
+; CHECK-NEXT:    [[TMP4:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i32>
+; CHECK-NEXT:    [[TMP5:%.*]] = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> [[TMP3]], <4 x i32> [[TMP4]])
+; CHECK-NEXT:    ret <8 x i16> [[TMP5]]
+;
+  %1 = icmp eq <4 x i32> %a0, %a1
+  %2 = icmp eq <4 x i32> %a2, %a3
+  %3 = sext <4 x i1> %1 to <4 x i32>
+  %4 = sext <4 x i1> %2 to <4 x i32>
+  %5 = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %3, <4 x i32> %4)
+  ret <8 x i16> %5
+}
+
+define <16 x i8> @cmp_packsswb_128(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> %a2, <8 x i16> %a3) {
+; CHECK-LABEL: @cmp_packsswb_128(
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp eq <8 x i16> [[A0:%.*]], [[A1:%.*]]
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp eq <8 x i16> [[A2:%.*]], [[A3:%.*]]
+; CHECK-NEXT:    [[TMP3:%.*]] = sext <8 x i1> [[TMP1]] to <8 x i16>
+; CHECK-NEXT:    [[TMP4:%.*]] = sext <8 x i1> [[TMP2]] to <8 x i16>
+; CHECK-NEXT:    [[TMP5:%.*]] = call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> [[TMP3]], <8 x i16> [[TMP4]])
+; CHECK-NEXT:    ret <16 x i8> [[TMP5]]
+;
+  %1 = icmp eq <8 x i16> %a0, %a1
+  %2 = icmp eq <8 x i16> %a2, %a3
+  %3 = sext <8 x i1> %1 to <8 x i16>
+  %4 = sext <8 x i1> %2 to <8 x i16>
+  %5 = call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> %3, <8 x i16> %4)
+  ret <16 x i8> %5
+}
+
+define <16 x i16> @cmp_packssdw_256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2, <8 x i32> %a3) {
+; CHECK-LABEL: @cmp_packssdw_256(
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp eq <8 x i32> [[A0:%.*]], [[A1:%.*]]
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp eq <8 x i32> [[A2:%.*]], [[A3:%.*]]
+; CHECK-NEXT:    [[TMP3:%.*]] = sext <8 x i1> [[TMP1]] to <8 x i32>
+; CHECK-NEXT:    [[TMP4:%.*]] = sext <8 x i1> [[TMP2]] to <8 x i32>
+; CHECK-NEXT:    [[TMP5:%.*]] = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> [[TMP3]], <8 x i32> [[TMP4]])
+; CHECK-NEXT:    ret <16 x i16> [[TMP5]]
+;
+  %1 = icmp eq <8 x i32> %a0, %a1
+  %2 = icmp eq <8 x i32> %a2, %a3
+  %3 = sext <8 x i1> %1 to <8 x i32>
+  %4 = sext <8 x i1> %2 to <8 x i32>
+  %5 = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> %3, <8 x i32> %4)
+  ret <16 x i16> %5
+}
+
+define <32 x i8> @cmp_packsswb_256(<16 x i16> %a0, <16 x i16> %a1, <16 x i16> %a2, <16 x i16> %a3) {
+; CHECK-LABEL: @cmp_packsswb_256(
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp eq <16 x i16> [[A0:%.*]], [[A1:%.*]]
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp eq <16 x i16> [[A2:%.*]], [[A3:%.*]]
+; CHECK-NEXT:    [[TMP3:%.*]] = sext <16 x i1> [[TMP1]] to <16 x i16>
+; CHECK-NEXT:    [[TMP4:%.*]] = sext <16 x i1> [[TMP2]] to <16 x i16>
+; CHECK-NEXT:    [[TMP5:%.*]] = call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> [[TMP3]], <16 x i16> [[TMP4]])
+; CHECK-NEXT:    ret <32 x i8> [[TMP5]]
+;
+  %1 = icmp eq <16 x i16> %a0, %a1
+  %2 = icmp eq <16 x i16> %a2, %a3
+  %3 = sext <16 x i1> %1 to <16 x i16>
+  %4 = sext <16 x i1> %2 to <16 x i16>
+  %5 = call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> %3, <16 x i16> %4)
+  ret <32 x i8> %5
+}
+
+define <32 x i16> @cmp_packssdw_512(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> %a2, <16 x i32> %a3) {
+; CHECK-LABEL: @cmp_packssdw_512(
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp eq <16 x i32> [[A0:%.*]], [[A1:%.*]]
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp eq <16 x i32> [[A2:%.*]], [[A3:%.*]]
+; CHECK-NEXT:    [[TMP3:%.*]] = sext <16 x i1> [[TMP1]] to <16 x i32>
+; CHECK-NEXT:    [[TMP4:%.*]] = sext <16 x i1> [[TMP2]] to <16 x i32>
+; CHECK-NEXT:    [[TMP5:%.*]] = call <32 x i16> @llvm.x86.avx512.packssdw.512(<16 x i32> [[TMP3]], <16 x i32> [[TMP4]])
+; CHECK-NEXT:    ret <32 x i16> [[TMP5]]
+;
+  %1 = icmp eq <16 x i32> %a0, %a1
+  %2 = icmp eq <16 x i32> %a2, %a3
+  %3 = sext <16 x i1> %1 to <16 x i32>
+  %4 = sext <16 x i1> %2 to <16 x i32>
+  %5 = call <32 x i16> @llvm.x86.avx512.packssdw.512(<16 x i32> %3, <16 x i32> %4)
+  ret <32 x i16> %5
+}
+
+define <64 x i8> @cmp_packsswb_512(<32 x i16> %a0, <32 x i16> %a1, <32 x i16> %a2, <32 x i16> %a3) {
+; CHECK-LABEL: @cmp_packsswb_512(
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp eq <32 x i16> [[A0:%.*]], [[A1:%.*]]
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp eq <32 x i16> [[A2:%.*]], [[A3:%.*]]
+; CHECK-NEXT:    [[TMP3:%.*]] = sext <32 x i1> [[TMP1]] to <32 x i16>
+; CHECK-NEXT:    [[TMP4:%.*]] = sext <32 x i1> [[TMP2]] to <32 x i16>
+; CHECK-NEXT:    [[TMP5:%.*]] = call <64 x i8> @llvm.x86.avx512.packsswb.512(<32 x i16> [[TMP3]], <32 x i16> [[TMP4]])
+; CHECK-NEXT:    ret <64 x i8> [[TMP5]]
+;
+  %1 = icmp eq <32 x i16> %a0, %a1
+  %2 = icmp eq <32 x i16> %a2, %a3
+  %3 = sext <32 x i1> %1 to <32 x i16>
+  %4 = sext <32 x i1> %2 to <32 x i16>
+  %5 = call <64 x i8> @llvm.x86.avx512.packsswb.512(<32 x i16> %3, <32 x i16> %4)
+  ret <64 x i8> %5
+}
+
+declare <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32>, <4 x i32>) nounwind readnone
+declare <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16>, <8 x i16>) nounwind readnone
+declare <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16>, <8 x i16>) nounwind readnone
+declare <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32>, <4 x i32>) nounwind readnone
+
+declare <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32>, <8 x i32>) nounwind readnone
+declare <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32>, <8 x i32>) nounwind readnone
+declare <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16>, <16 x i16>) nounwind readnone
+declare <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16>, <16 x i16>) nounwind readnone
+
+declare <32 x i16> @llvm.x86.avx512.packssdw.512(<16 x i32>, <16 x i32>) nounwind readnone
+declare <32 x i16> @llvm.x86.avx512.packusdw.512(<16 x i32>, <16 x i32>) nounwind readnone
+declare <64 x i8> @llvm.x86.avx512.packsswb.512(<32 x i16>, <32 x i16>) nounwind readnone
+declare <64 x i8> @llvm.x86.avx512.packuswb.512(<32 x i16>, <32 x i16>) nounwind readnone
--- a/llvm/test/Transforms/InstCombine/X86/x86-sse-inseltpoison.ll
+++ b/llvm/test/Transforms/InstCombine/X86/x86-sse-inseltpoison.ll
@ -0,0 +1,694 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -instcombine -mtriple=x86_64-unknown-unknown -S | FileCheck %s
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+define float @test_rcp_ss_0(float %a) {
+; CHECK-LABEL: @test_rcp_ss_0(
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x float> undef, float [[A:%.*]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call <4 x float> @llvm.x86.sse.rcp.ss(<4 x float> [[TMP1]])
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[TMP2]], i32 0
+; CHECK-NEXT:    ret float [[TMP3]]
+;
+  %1 = insertelement <4 x float> poison, float %a, i32 0
+  %2 = insertelement <4 x float> %1, float 1.000000e+00, i32 1
+  %3 = insertelement <4 x float> %2, float 2.000000e+00, i32 2
+  %4 = insertelement <4 x float> %3, float 3.000000e+00, i32 3
+  %5 = tail call <4 x float> @llvm.x86.sse.rcp.ss(<4 x float> %4)
+  %6 = extractelement <4 x float> %5, i32 0
+  ret float %6
+}
+
+define float @test_rcp_ss_1(float %a) {
+; CHECK-LABEL: @test_rcp_ss_1(
+; CHECK-NEXT:    ret float 1.000000e+00
+;
+  %1 = insertelement <4 x float> poison, float %a, i32 0
+  %2 = insertelement <4 x float> %1, float 1.000000e+00, i32 1
+  %3 = insertelement <4 x float> %2, float 2.000000e+00, i32 2
+  %4 = insertelement <4 x float> %3, float 3.000000e+00, i32 3
+  %5 = tail call <4 x float> @llvm.x86.sse.rcp.ss(<4 x float> %4)
+  %6 = extractelement <4 x float> %5, i32 1
+  ret float %6
+}
+
+define float @test_sqrt_ss_0(float %a) {
+; CHECK-LABEL: @test_sqrt_ss_0(
+; CHECK-NEXT:    [[TMP1:%.*]] = call float @llvm.sqrt.f32(float [[A:%.*]])
+; CHECK-NEXT:    ret float [[TMP1]]
+;
+  %1 = insertelement <4 x float> poison, float %a, i32 0
+  %2 = insertelement <4 x float> %1, float 1.000000e+00, i32 1
+  %3 = insertelement <4 x float> %2, float 2.000000e+00, i32 2
+  %4 = insertelement <4 x float> %3, float 3.000000e+00, i32 3
+  %5 = tail call <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float> %4)
+  %6 = extractelement <4 x float> %5, i32 0
+  ret float %6
+}
+
+define float @test_sqrt_ss_2(float %a) {
+; CHECK-LABEL: @test_sqrt_ss_2(
+; CHECK-NEXT:    ret float 2.000000e+00
+;
+  %1 = insertelement <4 x float> poison, float %a, i32 0
+  %2 = insertelement <4 x float> %1, float 1.000000e+00, i32 1
+  %3 = insertelement <4 x float> %2, float 2.000000e+00, i32 2
+  %4 = insertelement <4 x float> %3, float 3.000000e+00, i32 3
+  %5 = tail call <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float> %4)
+  %6 = extractelement <4 x float> %5, i32 2
+  ret float %6
+}
+
+define float @test_rsqrt_ss_0(float %a) {
+; CHECK-LABEL: @test_rsqrt_ss_0(
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x float> undef, float [[A:%.*]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float> [[TMP1]])
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[TMP2]], i32 0
+; CHECK-NEXT:    ret float [[TMP3]]
+;
+  %1 = insertelement <4 x float> poison, float %a, i32 0
+  %2 = insertelement <4 x float> %1, float 1.000000e+00, i32 1
+  %3 = insertelement <4 x float> %2, float 2.000000e+00, i32 2
+  %4 = insertelement <4 x float> %3, float 3.000000e+00, i32 3
+  %5 = tail call <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float> %4)
+  %6 = extractelement <4 x float> %5, i32 0
+  ret float %6
+}
+
+define float @test_rsqrt_ss_3(float %a) {
+; CHECK-LABEL: @test_rsqrt_ss_3(
+; CHECK-NEXT:    ret float 3.000000e+00
+;
+  %1 = insertelement <4 x float> poison, float %a, i32 0
+  %2 = insertelement <4 x float> %1, float 1.000000e+00, i32 1
+  %3 = insertelement <4 x float> %2, float 2.000000e+00, i32 2
+  %4 = insertelement <4 x float> %3, float 3.000000e+00, i32 3
+  %5 = tail call <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float> %4)
+  %6 = extractelement <4 x float> %5, i32 3
+  ret float %6
+}
+
+define float @test_add_ss_0(float %a, float %b) {
+; CHECK-LABEL: @test_add_ss_0(
+; CHECK-NEXT:    [[TMP1:%.*]] = fadd float [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT:    ret float [[TMP1]]
+;
+  %1 = insertelement <4 x float> poison, float %a, i32 0
+  %2 = insertelement <4 x float> %1, float 1.000000e+00, i32 1
+  %3 = insertelement <4 x float> %2, float 2.000000e+00, i32 2
+  %4 = insertelement <4 x float> %3, float 3.000000e+00, i32 3
+  %5 = insertelement <4 x float> poison, float %b, i32 0
+  %6 = insertelement <4 x float> %5, float 4.000000e+00, i32 1
+  %7 = insertelement <4 x float> %6, float 5.000000e+00, i32 2
+  %8 = insertelement <4 x float> %7, float 6.000000e+00, i32 3
+  %9 = tail call <4 x float> @llvm.x86.sse.add.ss(<4 x float> %4, <4 x float> %8)
+  %r = extractelement <4 x float> %9, i32 0
+  ret float %r
+}
+
+define float @test_add_ss_1(float %a, float %b) {
+; CHECK-LABEL: @test_add_ss_1(
+; CHECK-NEXT:    ret float 1.000000e+00
+;
+  %1 = insertelement <4 x float> poison, float %a, i32 0
+  %2 = insertelement <4 x float> %1, float 1.000000e+00, i32 1
+  %3 = insertelement <4 x float> %2, float 2.000000e+00, i32 2
+  %4 = insertelement <4 x float> %3, float 3.000000e+00, i32 3
+  %5 = insertelement <4 x float> poison, float %b, i32 0
+  %6 = tail call <4 x float> @llvm.x86.sse.add.ss(<4 x float> %4, <4 x float> %5)
+  %7 = extractelement <4 x float> %6, i32 1
+  ret float %7
+}
+
+define float @test_add_ss_2(float %a) {
+; CHECK-LABEL: @test_add_ss_2(
+; CHECK-NEXT:    [[TMP1:%.*]] = fadd float [[A:%.*]], [[A]]
+; CHECK-NEXT:    ret float [[TMP1]]
+;
+  %1 = insertelement <4 x float> zeroinitializer, float %a, i32 0
+  %2 = tail call <4 x float> @llvm.x86.sse.add.ss(<4 x float> %1, <4 x float> %1)
+  %3 = extractelement <4 x float> %2, i32 0
+  ret float %3
+}
+
+define float @test_sub_ss_0(float %a, float %b) {
+; CHECK-LABEL: @test_sub_ss_0(
+; CHECK-NEXT:    [[TMP1:%.*]] = fsub float [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT:    ret float [[TMP1]]
+;
+  %1 = insertelement <4 x float> poison, float %a, i32 0
+  %2 = insertelement <4 x float> %1, float 1.000000e+00, i32 1
+  %3 = insertelement <4 x float> %2, float 2.000000e+00, i32 2
+  %4 = insertelement <4 x float> %3, float 3.000000e+00, i32 3
+  %5 = insertelement <4 x float> poison, float %b, i32 0
+  %6 = insertelement <4 x float> %5, float 4.000000e+00, i32 1
+  %7 = insertelement <4 x float> %6, float 5.000000e+00, i32 2
+  %8 = insertelement <4 x float> %7, float 6.000000e+00, i32 3
+  %9 = tail call <4 x float> @llvm.x86.sse.sub.ss(<4 x float> %4, <4 x float> %8)
+  %r = extractelement <4 x float> %9, i32 0
+  ret float %r
+}
+
+define float @test_sub_ss_2(float %a, float %b) {
+; CHECK-LABEL: @test_sub_ss_2(
+; CHECK-NEXT:    ret float 2.000000e+00
+;
+  %1 = insertelement <4 x float> poison, float %a, i32 0
+  %2 = insertelement <4 x float> %1, float 1.000000e+00, i32 1
+  %3 = insertelement <4 x float> %2, float 2.000000e+00, i32 2
+  %4 = insertelement <4 x float> %3, float 3.000000e+00, i32 3
+  %5 = insertelement <4 x float> poison, float %b, i32 0
+  %6 = tail call <4 x float> @llvm.x86.sse.sub.ss(<4 x float> %4, <4 x float> %5)
+  %7 = extractelement <4 x float> %6, i32 2
+  ret float %7
+}
+
+define float @test_sub_ss_3(float %a) {
+; CHECK-LABEL: @test_sub_ss_3(
+; CHECK-NEXT:    [[TMP1:%.*]] = fsub float [[A:%.*]], [[A]]
+; CHECK-NEXT:    ret float [[TMP1]]
+;
+  %1 = insertelement <4 x float> zeroinitializer, float %a, i32 0
+  %2 = tail call <4 x float> @llvm.x86.sse.sub.ss(<4 x float> %1, <4 x float> %1)
+  %3 = extractelement <4 x float> %2, i32 0
+  ret float %3
+}
+
+define float @test_mul_ss_0(float %a, float %b) {
+; CHECK-LABEL: @test_mul_ss_0(
+; CHECK-NEXT:    [[TMP1:%.*]] = fmul float [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT:    ret float [[TMP1]]
+;
+  %1 = insertelement <4 x float> poison, float %a, i32 0
+  %2 = insertelement <4 x float> %1, float 1.000000e+00, i32 1
+  %3 = insertelement <4 x float> %2, float 2.000000e+00, i32 2
+  %4 = insertelement <4 x float> %3, float 3.000000e+00, i32 3
+  %5 = insertelement <4 x float> poison, float %b, i32 0
+  %6 = insertelement <4 x float> %5, float 4.000000e+00, i32 1
+  %7 = insertelement <4 x float> %6, float 5.000000e+00, i32 2
+  %8 = insertelement <4 x float> %7, float 6.000000e+00, i32 3
+  %9 = tail call <4 x float> @llvm.x86.sse.mul.ss(<4 x float> %4, <4 x float> %8)
+  %r = extractelement <4 x float> %9, i32 0
+  ret float %r
+}
+
+define float @test_mul_ss_3(float %a, float %b) {
+; CHECK-LABEL: @test_mul_ss_3(
+; CHECK-NEXT:    ret float 3.000000e+00
+;
+  %1 = insertelement <4 x float> poison, float %a, i32 0
+  %2 = insertelement <4 x float> %1, float 1.000000e+00, i32 1
+  %3 = insertelement <4 x float> %2, float 2.000000e+00, i32 2
+  %4 = insertelement <4 x float> %3, float 3.000000e+00, i32 3
+  %5 = insertelement <4 x float> poison, float %b, i32 0
+  %6 = tail call <4 x float> @llvm.x86.sse.mul.ss(<4 x float> %4, <4 x float> %5)
+  %7 = extractelement <4 x float> %6, i32 3
+  ret float %7
+}
+
+define float @test_mul_ss_4(float %a) {
+; CHECK-LABEL: @test_mul_ss_4(
+; CHECK-NEXT:    [[TMP1:%.*]] = fmul float [[A:%.*]], [[A]]
+; CHECK-NEXT:    ret float [[TMP1]]
+;
+  %1 = insertelement <4 x float> zeroinitializer, float %a, i32 0
+  %2 = tail call <4 x float> @llvm.x86.sse.mul.ss(<4 x float> %1, <4 x float> %1)
+  %3 = extractelement <4 x float> %2, i32 0
+  ret float %3
+}
+
+define float @test_div_ss_0(float %a, float %b) {
+; CHECK-LABEL: @test_div_ss_0(
+; CHECK-NEXT:    [[TMP1:%.*]] = fdiv float [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT:    ret float [[TMP1]]
+;
+  %1 = insertelement <4 x float> poison, float %a, i32 0
+  %2 = insertelement <4 x float> %1, float 1.000000e+00, i32 1
+  %3 = insertelement <4 x float> %2, float 2.000000e+00, i32 2
+  %4 = insertelement <4 x float> %3, float 3.000000e+00, i32 3
+  %5 = insertelement <4 x float> poison, float %b, i32 0
+  %6 = insertelement <4 x float> %5, float 4.000000e+00, i32 1
+  %7 = insertelement <4 x float> %6, float 5.000000e+00, i32 2
+  %8 = insertelement <4 x float> %7, float 6.000000e+00, i32 3
+  %9 = tail call <4 x float> @llvm.x86.sse.div.ss(<4 x float> %4, <4 x float> %8)
+  %r = extractelement <4 x float> %9, i32 0
+  ret float %r
+}
+
+define float @test_div_ss_1(float %a, float %b) {
+; CHECK-LABEL: @test_div_ss_1(
+; CHECK-NEXT:    ret float 1.000000e+00
+;
+  %1 = insertelement <4 x float> poison, float %a, i32 0
+  %2 = insertelement <4 x float> %1, float 1.000000e+00, i32 1
+  %3 = insertelement <4 x float> %2, float 2.000000e+00, i32 2
+  %4 = insertelement <4 x float> %3, float 3.000000e+00, i32 3
+  %5 = insertelement <4 x float> poison, float %b, i32 0
+  %6 = tail call <4 x float> @llvm.x86.sse.div.ss(<4 x float> %4, <4 x float> %5)
+  %7 = extractelement <4 x float> %6, i32 1
+  ret float %7
+}
+
+define float @test_div_ss_2(float %a) {
+; CHECK-LABEL: @test_div_ss_2(
+; CHECK-NEXT:    [[TMP1:%.*]] = fdiv float [[A:%.*]], [[A]]
+; CHECK-NEXT:    ret float [[TMP1]]
+;
+  %1 = insertelement <4 x float> zeroinitializer, float %a, i32 0
+  %2 = tail call <4 x float> @llvm.x86.sse.div.ss(<4 x float> %1, <4 x float> %1)
+  %3 = extractelement <4 x float> %2, i32 0
+  ret float %3
+}
+
+define <4 x float> @test_min_ss(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: @test_min_ss(
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call <4 x float> @llvm.x86.sse.min.ss(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]])
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
+;
+  %1 = insertelement <4 x float> %b, float 1.000000e+00, i32 1
+  %2 = insertelement <4 x float> %1, float 2.000000e+00, i32 2
+  %3 = insertelement <4 x float> %2, float 3.000000e+00, i32 3
+  %4 = tail call <4 x float> @llvm.x86.sse.min.ss(<4 x float> %a, <4 x float> %3)
+  ret <4 x float> %4
+}
+
+define float @test_min_ss_0(float %a, float %b) {
+; CHECK-LABEL: @test_min_ss_0(
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x float> undef, float [[A:%.*]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x float> undef, float [[B:%.*]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = tail call <4 x float> @llvm.x86.sse.min.ss(<4 x float> [[TMP1]], <4 x float> [[TMP2]])
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[TMP3]], i32 0
+; CHECK-NEXT:    ret float [[TMP4]]
+;
+  %1 = insertelement <4 x float> poison, float %a, i32 0
+  %2 = insertelement <4 x float> %1, float 1.000000e+00, i32 1
+  %3 = insertelement <4 x float> %2, float 2.000000e+00, i32 2
+  %4 = insertelement <4 x float> %3, float 3.000000e+00, i32 3
+  %5 = insertelement <4 x float> poison, float %b, i32 0
+  %6 = insertelement <4 x float> %5, float 4.000000e+00, i32 1
+  %7 = insertelement <4 x float> %6, float 5.000000e+00, i32 2
+  %8 = insertelement <4 x float> %7, float 6.000000e+00, i32 3
+  %9 = tail call <4 x float> @llvm.x86.sse.min.ss(<4 x float> %4, <4 x float> %8)
+  %10 = extractelement <4 x float> %9, i32 0
+  ret float %10
+}
+
+define float @test_min_ss_2(float %a, float %b) {
+; CHECK-LABEL: @test_min_ss_2(
+; CHECK-NEXT:    ret float 2.000000e+00
+;
+  %1 = insertelement <4 x float> poison, float %a, i32 0
+  %2 = insertelement <4 x float> %1, float 1.000000e+00, i32 1
+  %3 = insertelement <4 x float> %2, float 2.000000e+00, i32 2
+  %4 = insertelement <4 x float> %3, float 3.000000e+00, i32 3
+  %5 = insertelement <4 x float> poison, float %b, i32 0
+  %6 = tail call <4 x float> @llvm.x86.sse.min.ss(<4 x float> %4, <4 x float> %5)
+  %7 = extractelement <4 x float> %6, i32 2
+  ret float %7
+}
+
+define float @test_min_ss_3(float %a) {
+; CHECK-LABEL: @test_min_ss_3(
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x float> <float undef, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>, float [[A:%.*]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call <4 x float> @llvm.x86.sse.min.ss(<4 x float> [[TMP1]], <4 x float> [[TMP1]])
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[TMP2]], i32 0
+; CHECK-NEXT:    ret float [[TMP3]]
+;
+  %1 = insertelement <4 x float> zeroinitializer, float %a, i32 0
+  %2 = tail call <4 x float> @llvm.x86.sse.min.ss(<4 x float> %1, <4 x float> %1)
+  %3 = extractelement <4 x float> %2, i32 0
+  ret float %3
+}
+
+define <4 x float> @test_max_ss(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: @test_max_ss(
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call <4 x float> @llvm.x86.sse.max.ss(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]])
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
+;
+  %1 = insertelement <4 x float> %b, float 1.000000e+00, i32 1
+  %2 = insertelement <4 x float> %1, float 2.000000e+00, i32 2
+  %3 = insertelement <4 x float> %2, float 3.000000e+00, i32 3
+  %4 = tail call <4 x float> @llvm.x86.sse.max.ss(<4 x float> %a, <4 x float> %3)
+  ret <4 x float> %4
+}
+
+define float @test_max_ss_0(float %a, float %b) {
+; CHECK-LABEL: @test_max_ss_0(
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x float> undef, float [[A:%.*]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x float> undef, float [[B:%.*]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = tail call <4 x float> @llvm.x86.sse.max.ss(<4 x float> [[TMP1]], <4 x float> [[TMP2]])
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[TMP3]], i32 0
+; CHECK-NEXT:    ret float [[TMP4]]
+;
+  %1 = insertelement <4 x float> poison, float %a, i32 0
+  %2 = insertelement <4 x float> %1, float 1.000000e+00, i32 1
+  %3 = insertelement <4 x float> %2, float 2.000000e+00, i32 2
+  %4 = insertelement <4 x float> %3, float 3.000000e+00, i32 3
+  %5 = insertelement <4 x float> poison, float %b, i32 0
+  %6 = insertelement <4 x float> %5, float 4.000000e+00, i32 1
+  %7 = insertelement <4 x float> %6, float 5.000000e+00, i32 2
+  %8 = insertelement <4 x float> %7, float 6.000000e+00, i32 3
+  %9 = tail call <4 x float> @llvm.x86.sse.max.ss(<4 x float> %4, <4 x float> %8)
+  %10 = extractelement <4 x float> %9, i32 0
+  ret float %10
+}
+
+define float @test_max_ss_3(float %a, float %b) {
+; CHECK-LABEL: @test_max_ss_3(
+; CHECK-NEXT:    ret float 3.000000e+00
+;
+  %1 = insertelement <4 x float> poison, float %a, i32 0
+  %2 = insertelement <4 x float> %1, float 1.000000e+00, i32 1
+  %3 = insertelement <4 x float> %2, float 2.000000e+00, i32 2
+  %4 = insertelement <4 x float> %3, float 3.000000e+00, i32 3
+  %5 = insertelement <4 x float> poison, float %b, i32 0
+  %6 = tail call <4 x float> @llvm.x86.sse.max.ss(<4 x float> %4, <4 x float> %5)
+  %7 = extractelement <4 x float> %6, i32 3
+  ret float %7
+}
+
+define float @test_max_ss_4(float %a) {
+; CHECK-LABEL: @test_max_ss_4(
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x float> <float undef, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>, float [[A:%.*]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call <4 x float> @llvm.x86.sse.max.ss(<4 x float> [[TMP1]], <4 x float> [[TMP1]])
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[TMP2]], i32 0
+; CHECK-NEXT:    ret float [[TMP3]]
+;
+  %1 = insertelement <4 x float> zeroinitializer, float %a, i32 0
+  %2 = tail call <4 x float> @llvm.x86.sse.max.ss(<4 x float> %1, <4 x float> %1)
+  %3 = extractelement <4 x float> %2, i32 0
+  ret float %3
+}
+
+define <4 x float> @test_cmp_ss(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: @test_cmp_ss(
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], i8 0)
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
+;
+  %1 = insertelement <4 x float> %b, float 1.000000e+00, i32 1
+  %2 = insertelement <4 x float> %1, float 2.000000e+00, i32 2
+  %3 = insertelement <4 x float> %2, float 3.000000e+00, i32 3
+  %4 = tail call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a, <4 x float> %3, i8 0)
+  ret <4 x float> %4
+}
+
+define float @test_cmp_ss_0(float %a, float %b) {
+; CHECK-LABEL: @test_cmp_ss_0(
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x float> undef, float [[A:%.*]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x float> undef, float [[B:%.*]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = tail call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> [[TMP1]], <4 x float> [[TMP2]], i8 0)
+; CHECK-NEXT:    [[R:%.*]] = extractelement <4 x float> [[TMP3]], i32 0
+; CHECK-NEXT:    ret float [[R]]
+;
+  %1 = insertelement <4 x float> poison, float %a, i32 0
+  %2 = insertelement <4 x float> %1, float 1.000000e+00, i32 1
+  %3 = insertelement <4 x float> %2, float 2.000000e+00, i32 2
+  %4 = insertelement <4 x float> %3, float 3.000000e+00, i32 3
+  %5 = insertelement <4 x float> poison, float %b, i32 0
+  %6 = insertelement <4 x float> %5, float 4.000000e+00, i32 1
+  %7 = insertelement <4 x float> %6, float 5.000000e+00, i32 2
+  %8 = insertelement <4 x float> %7, float 6.000000e+00, i32 3
+  %9 = tail call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %4, <4 x float> %8, i8 0)
+  %r = extractelement <4 x float> %9, i32 0
+  ret float %r
+}
+
+define float @test_cmp_ss_1(float %a, float %b) {
+; CHECK-LABEL: @test_cmp_ss_1(
+; CHECK-NEXT:    ret float 1.000000e+00
+;
+  %1 = insertelement <4 x float> poison, float %a, i32 0
+  %2 = insertelement <4 x float> %1, float 1.000000e+00, i32 1
+  %3 = insertelement <4 x float> %2, float 2.000000e+00, i32 2
+  %4 = insertelement <4 x float> %3, float 3.000000e+00, i32 3
+  %5 = insertelement <4 x float> poison, float %b, i32 0
+  %6 = tail call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %4, <4 x float> %5, i8 0)
+  %7 = extractelement <4 x float> %6, i32 1
+  ret float %7
+}
+
+define float @test_cmp_ss_2(float %a) {
+; CHECK-LABEL: @test_cmp_ss_2(
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x float> <float undef, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>, float [[A:%.*]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> [[TMP1]], <4 x float> [[TMP1]], i8 3)
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[TMP2]], i32 0
+; CHECK-NEXT:    ret float [[TMP3]]
+;
+  %1 = insertelement <4 x float> zeroinitializer, float %a, i32 0
+  %2 = tail call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %1, <4 x float> %1, i8 3)
+  %3 = extractelement <4 x float> %2, i32 0
+  ret float %3
+}
+
+define i32 @test_comieq_ss_0(float %a, float %b) {
+; CHECK-LABEL: @test_comieq_ss_0(
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x float> undef, float [[A:%.*]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x float> undef, float [[B:%.*]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = tail call i32 @llvm.x86.sse.comieq.ss(<4 x float> [[TMP1]], <4 x float> [[TMP2]])
+; CHECK-NEXT:    ret i32 [[TMP3]]
+;
+  %1 = insertelement <4 x float> poison, float %a, i32 0
+  %2 = insertelement <4 x float> %1, float 1.000000e+00, i32 1
+  %3 = insertelement <4 x float> %2, float 2.000000e+00, i32 2
+  %4 = insertelement <4 x float> %3, float 3.000000e+00, i32 3
+  %5 = insertelement <4 x float> poison, float %b, i32 0
+  %6 = insertelement <4 x float> %5, float 4.000000e+00, i32 1
+  %7 = insertelement <4 x float> %6, float 5.000000e+00, i32 2
+  %8 = insertelement <4 x float> %7, float 6.000000e+00, i32 3
+  %9 = tail call i32 @llvm.x86.sse.comieq.ss(<4 x float> %4, <4 x float> %8)
+  ret i32 %9
+}
+
+define i32 @test_comige_ss_0(float %a, float %b) {
+; CHECK-LABEL: @test_comige_ss_0(
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x float> undef, float [[A:%.*]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x float> undef, float [[B:%.*]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = tail call i32 @llvm.x86.sse.comige.ss(<4 x float> [[TMP1]], <4 x float> [[TMP2]])
+; CHECK-NEXT:    ret i32 [[TMP3]]
+;
+  %1 = insertelement <4 x float> poison, float %a, i32 0
+  %2 = insertelement <4 x float> %1, float 1.000000e+00, i32 1
+  %3 = insertelement <4 x float> %2, float 2.000000e+00, i32 2
+  %4 = insertelement <4 x float> %3, float 3.000000e+00, i32 3
+  %5 = insertelement <4 x float> poison, float %b, i32 0
+  %6 = insertelement <4 x float> %5, float 4.000000e+00, i32 1
+  %7 = insertelement <4 x float> %6, float 5.000000e+00, i32 2
+  %8 = insertelement <4 x float> %7, float 6.000000e+00, i32 3
+  %9 = tail call i32 @llvm.x86.sse.comige.ss(<4 x float> %4, <4 x float> %8)
+  ret i32 %9
+}
+
+define i32 @test_comigt_ss_0(float %a, float %b) {
+; CHECK-LABEL: @test_comigt_ss_0(
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x float> undef, float [[A:%.*]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x float> undef, float [[B:%.*]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = tail call i32 @llvm.x86.sse.comigt.ss(<4 x float> [[TMP1]], <4 x float> [[TMP2]])
+; CHECK-NEXT:    ret i32 [[TMP3]]
+;
+  %1 = insertelement <4 x float> poison, float %a, i32 0
+  %2 = insertelement <4 x float> %1, float 1.000000e+00, i32 1
+  %3 = insertelement <4 x float> %2, float 2.000000e+00, i32 2
+  %4 = insertelement <4 x float> %3, float 3.000000e+00, i32 3
+  %5 = insertelement <4 x float> poison, float %b, i32 0
+  %6 = insertelement <4 x float> %5, float 4.000000e+00, i32 1
+  %7 = insertelement <4 x float> %6, float 5.000000e+00, i32 2
+  %8 = insertelement <4 x float> %7, float 6.000000e+00, i32 3
+  %9 = tail call i32 @llvm.x86.sse.comigt.ss(<4 x float> %4, <4 x float> %8)
+  ret i32 %9
+}
+
+define i32 @test_comile_ss_0(float %a, float %b) {
+; CHECK-LABEL: @test_comile_ss_0(
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x float> undef, float [[A:%.*]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x float> undef, float [[B:%.*]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = tail call i32 @llvm.x86.sse.comile.ss(<4 x float> [[TMP1]], <4 x float> [[TMP2]])
+; CHECK-NEXT:    ret i32 [[TMP3]]
+;
+  %1 = insertelement <4 x float> poison, float %a, i32 0
+  %2 = insertelement <4 x float> %1, float 1.000000e+00, i32 1
+  %3 = insertelement <4 x float> %2, float 2.000000e+00, i32 2
+  %4 = insertelement <4 x float> %3, float 3.000000e+00, i32 3
+  %5 = insertelement <4 x float> poison, float %b, i32 0
+  %6 = insertelement <4 x float> %5, float 4.000000e+00, i32 1
+  %7 = insertelement <4 x float> %6, float 5.000000e+00, i32 2
+  %8 = insertelement <4 x float> %7, float 6.000000e+00, i32 3
+  %9 = tail call i32 @llvm.x86.sse.comile.ss(<4 x float> %4, <4 x float> %8)
+  ret i32 %9
+}
+
+define i32 @test_comilt_ss_0(float %a, float %b) {
+; CHECK-LABEL: @test_comilt_ss_0(
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x float> undef, float [[A:%.*]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x float> undef, float [[B:%.*]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = tail call i32 @llvm.x86.sse.comilt.ss(<4 x float> [[TMP1]], <4 x float> [[TMP2]])
+; CHECK-NEXT:    ret i32 [[TMP3]]
+;
+  %1 = insertelement <4 x float> poison, float %a, i32 0
+  %2 = insertelement <4 x float> %1, float 1.000000e+00, i32 1
+  %3 = insertelement <4 x float> %2, float 2.000000e+00, i32 2
+  %4 = insertelement <4 x float> %3, float 3.000000e+00, i32 3
+  %5 = insertelement <4 x float> poison, float %b, i32 0
+  %6 = insertelement <4 x float> %5, float 4.000000e+00, i32 1
+  %7 = insertelement <4 x float> %6, float 5.000000e+00, i32 2
+  %8 = insertelement <4 x float> %7, float 6.000000e+00, i32 3
+  %9 = tail call i32 @llvm.x86.sse.comilt.ss(<4 x float> %4, <4 x float> %8)
+  ret i32 %9
+}
+
+define i32 @test_comineq_ss_0(float %a, float %b) {
+; CHECK-LABEL: @test_comineq_ss_0(
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x float> undef, float [[A:%.*]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x float> undef, float [[B:%.*]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = tail call i32 @llvm.x86.sse.comineq.ss(<4 x float> [[TMP1]], <4 x float> [[TMP2]])
+; CHECK-NEXT:    ret i32 [[TMP3]]
+;
+  %1 = insertelement <4 x float> poison, float %a, i32 0
+  %2 = insertelement <4 x float> %1, float 1.000000e+00, i32 1
+  %3 = insertelement <4 x float> %2, float 2.000000e+00, i32 2
+  %4 = insertelement <4 x float> %3, float 3.000000e+00, i32 3
+  %5 = insertelement <4 x float> poison, float %b, i32 0
+  %6 = insertelement <4 x float> %5, float 4.000000e+00, i32 1
+  %7 = insertelement <4 x float> %6, float 5.000000e+00, i32 2
+  %8 = insertelement <4 x float> %7, float 6.000000e+00, i32 3
+  %9 = tail call i32 @llvm.x86.sse.comineq.ss(<4 x float> %4, <4 x float> %8)
+  ret i32 %9
+}
+
+define i32 @test_ucomieq_ss_0(float %a, float %b) {
+; CHECK-LABEL: @test_ucomieq_ss_0(
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x float> undef, float [[A:%.*]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x float> undef, float [[B:%.*]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = tail call i32 @llvm.x86.sse.ucomieq.ss(<4 x float> [[TMP1]], <4 x float> [[TMP2]])
+; CHECK-NEXT:    ret i32 [[TMP3]]
+;
+  %1 = insertelement <4 x float> poison, float %a, i32 0
+  %2 = insertelement <4 x float> %1, float 1.000000e+00, i32 1
+  %3 = insertelement <4 x float> %2, float 2.000000e+00, i32 2
+  %4 = insertelement <4 x float> %3, float 3.000000e+00, i32 3
+  %5 = insertelement <4 x float> poison, float %b, i32 0
+  %6 = insertelement <4 x float> %5, float 4.000000e+00, i32 1
+  %7 = insertelement <4 x float> %6, float 5.000000e+00, i32 2
+  %8 = insertelement <4 x float> %7, float 6.000000e+00, i32 3
+  %9 = tail call i32 @llvm.x86.sse.ucomieq.ss(<4 x float> %4, <4 x float> %8)
+  ret i32 %9
+}
+
+define i32 @test_ucomige_ss_0(float %a, float %b) {
+; CHECK-LABEL: @test_ucomige_ss_0(
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x float> undef, float [[A:%.*]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x float> undef, float [[B:%.*]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = tail call i32 @llvm.x86.sse.ucomige.ss(<4 x float> [[TMP1]], <4 x float> [[TMP2]])
+; CHECK-NEXT:    ret i32 [[TMP3]]
+;
+  %1 = insertelement <4 x float> poison, float %a, i32 0
+  %2 = insertelement <4 x float> %1, float 1.000000e+00, i32 1
+  %3 = insertelement <4 x float> %2, float 2.000000e+00, i32 2
+  %4 = insertelement <4 x float> %3, float 3.000000e+00, i32 3
+  %5 = insertelement <4 x float> poison, float %b, i32 0
+  %6 = insertelement <4 x float> %5, float 4.000000e+00, i32 1
+  %7 = insertelement <4 x float> %6, float 5.000000e+00, i32 2
+  %8 = insertelement <4 x float> %7, float 6.000000e+00, i32 3
+  %9 = tail call i32 @llvm.x86.sse.ucomige.ss(<4 x float> %4, <4 x float> %8)
+  ret i32 %9
+}
+
+define i32 @test_ucomigt_ss_0(float %a, float %b) {
+; CHECK-LABEL: @test_ucomigt_ss_0(
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x float> undef, float [[A:%.*]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x float> undef, float [[B:%.*]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = tail call i32 @llvm.x86.sse.ucomigt.ss(<4 x float> [[TMP1]], <4 x float> [[TMP2]])
+; CHECK-NEXT:    ret i32 [[TMP3]]
+;
+  %1 = insertelement <4 x float> poison, float %a, i32 0
+  %2 = insertelement <4 x float> %1, float 1.000000e+00, i32 1
+  %3 = insertelement <4 x float> %2, float 2.000000e+00, i32 2
+  %4 = insertelement <4 x float> %3, float 3.000000e+00, i32 3
+  %5 = insertelement <4 x float> poison, float %b, i32 0
+  %6 = insertelement <4 x float> %5, float 4.000000e+00, i32 1
+  %7 = insertelement <4 x float> %6, float 5.000000e+00, i32 2
+  %8 = insertelement <4 x float> %7, float 6.000000e+00, i32 3
+  %9 = tail call i32 @llvm.x86.sse.ucomigt.ss(<4 x float> %4, <4 x float> %8)
+  ret i32 %9
+}
+
+define i32 @test_ucomile_ss_0(float %a, float %b) {
+; CHECK-LABEL: @test_ucomile_ss_0(
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x float> undef, float [[A:%.*]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x float> undef, float [[B:%.*]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = tail call i32 @llvm.x86.sse.ucomile.ss(<4 x float> [[TMP1]], <4 x float> [[TMP2]])
+; CHECK-NEXT:    ret i32 [[TMP3]]
+;
+  %1 = insertelement <4 x float> poison, float %a, i32 0
+  %2 = insertelement <4 x float> %1, float 1.000000e+00, i32 1
+  %3 = insertelement <4 x float> %2, float 2.000000e+00, i32 2
+  %4 = insertelement <4 x float> %3, float 3.000000e+00, i32 3
+  %5 = insertelement <4 x float> poison, float %b, i32 0
+  %6 = insertelement <4 x float> %5, float 4.000000e+00, i32 1
+  %7 = insertelement <4 x float> %6, float 5.000000e+00, i32 2
+  %8 = insertelement <4 x float> %7, float 6.000000e+00, i32 3
+  %9 = tail call i32 @llvm.x86.sse.ucomile.ss(<4 x float> %4, <4 x float> %8)
+  ret i32 %9
+}
+
+define i32 @test_ucomilt_ss_0(float %a, float %b) {
+; CHECK-LABEL: @test_ucomilt_ss_0(
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x float> undef, float [[A:%.*]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x float> undef, float [[B:%.*]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = tail call i32 @llvm.x86.sse.ucomilt.ss(<4 x float> [[TMP1]], <4 x float> [[TMP2]])
+; CHECK-NEXT:    ret i32 [[TMP3]]
+;
+  %1 = insertelement <4 x float> poison, float %a, i32 0
+  %2 = insertelement <4 x float> %1, float 1.000000e+00, i32 1
+  %3 = insertelement <4 x float> %2, float 2.000000e+00, i32 2
+  %4 = insertelement <4 x float> %3, float 3.000000e+00, i32 3
+  %5 = insertelement <4 x float> poison, float %b, i32 0
+  %6 = insertelement <4 x float> %5, float 4.000000e+00, i32 1
+  %7 = insertelement <4 x float> %6, float 5.000000e+00, i32 2
+  %8 = insertelement <4 x float> %7, float 6.000000e+00, i32 3
+  %9 = tail call i32 @llvm.x86.sse.ucomilt.ss(<4 x float> %4, <4 x float> %8)
+  ret i32 %9
+}
+
+define i32 @test_ucomineq_ss_0(float %a, float %b) {
+; CHECK-LABEL: @test_ucomineq_ss_0(
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x float> undef, float [[A:%.*]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x float> undef, float [[B:%.*]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = tail call i32 @llvm.x86.sse.ucomineq.ss(<4 x float> [[TMP1]], <4 x float> [[TMP2]])
+; CHECK-NEXT:    ret i32 [[TMP3]]
+;
+  %1 = insertelement <4 x float> poison, float %a, i32 0
+  %2 = insertelement <4 x float> %1, float 1.000000e+00, i32 1
+  %3 = insertelement <4 x float> %2, float 2.000000e+00, i32 2
+  %4 = insertelement <4 x float> %3, float 3.000000e+00, i32 3
+  %5 = insertelement <4 x float> poison, float %b, i32 0
+  %6 = insertelement <4 x float> %5, float 4.000000e+00, i32 1
+  %7 = insertelement <4 x float> %6, float 5.000000e+00, i32 2
+  %8 = insertelement <4 x float> %7, float 6.000000e+00, i32 3
+  %9 = tail call i32 @llvm.x86.sse.ucomineq.ss(<4 x float> %4, <4 x float> %8)
+  ret i32 %9
+}
+
+declare <4 x float> @llvm.x86.sse.rcp.ss(<4 x float>)
+declare <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float>)
+declare <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float>)
+
+declare <4 x float> @llvm.x86.sse.add.ss(<4 x float>, <4 x float>)
+declare <4 x float> @llvm.x86.sse.sub.ss(<4 x float>, <4 x float>)
+declare <4 x float> @llvm.x86.sse.mul.ss(<4 x float>, <4 x float>)
+declare <4 x float> @llvm.x86.sse.div.ss(<4 x float>, <4 x float>)
+declare <4 x float> @llvm.x86.sse.min.ss(<4 x float>, <4 x float>)
+declare <4 x float> @llvm.x86.sse.max.ss(<4 x float>, <4 x float>)
+declare <4 x float> @llvm.x86.sse.cmp.ss(<4 x float>, <4 x float>, i8)
+
+declare i32 @llvm.x86.sse.comieq.ss(<4 x float>, <4 x float>)
+declare i32 @llvm.x86.sse.comige.ss(<4 x float>, <4 x float>)
+declare i32 @llvm.x86.sse.comigt.ss(<4 x float>, <4 x float>)
+declare i32 @llvm.x86.sse.comile.ss(<4 x float>, <4 x float>)
+declare i32 @llvm.x86.sse.comilt.ss(<4 x float>, <4 x float>)
+declare i32 @llvm.x86.sse.comineq.ss(<4 x float>, <4 x float>)
+
+declare i32 @llvm.x86.sse.ucomieq.ss(<4 x float>, <4 x float>)
+declare i32 @llvm.x86.sse.ucomige.ss(<4 x float>, <4 x float>)
+declare i32 @llvm.x86.sse.ucomigt.ss(<4 x float>, <4 x float>)
+declare i32 @llvm.x86.sse.ucomile.ss(<4 x float>, <4 x float>)
+declare i32 @llvm.x86.sse.ucomilt.ss(<4 x float>, <4 x float>)
+declare i32 @llvm.x86.sse.ucomineq.ss(<4 x float>, <4 x float>)
--- a/llvm/test/Transforms/InstCombine/X86/x86-sse2-inseltpoison.ll
+++ b/llvm/test/Transforms/InstCombine/X86/x86-sse2-inseltpoison.ll
@ -0,0 +1,541 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -instcombine -mtriple=x86_64-unknown-unknown -S | FileCheck %s
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+define double @test_sqrt_sd_0(double %a) {
+; CHECK-LABEL: @test_sqrt_sd_0(
+; CHECK-NEXT:    [[TMP1:%.*]] = call double @llvm.sqrt.f64(double [[A:%.*]])
+; CHECK-NEXT:    ret double [[TMP1]]
+;
+  %1 = insertelement <2 x double> poison, double %a, i32 0
+  %2 = insertelement <2 x double> %1, double 1.000000e+00, i32 1
+  %3 = tail call <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double> %2)
+  %4 = extractelement <2 x double> %3, i32 0
+  ret double %4
+}
+
+define double @test_sqrt_sd_1(double %a) {
+; CHECK-LABEL: @test_sqrt_sd_1(
+; CHECK-NEXT:    ret double 1.000000e+00
+;
+  %1 = insertelement <2 x double> poison, double %a, i32 0
+  %2 = insertelement <2 x double> %1, double 1.000000e+00, i32 1
+  %3 = tail call <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double> %2)
+  %4 = extractelement <2 x double> %3, i32 1
+  ret double %4
+}
+
+define double @test_add_sd_0(double %a, double %b) {
+; CHECK-LABEL: @test_add_sd_0(
+; CHECK-NEXT:    [[TMP1:%.*]] = fadd double [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT:    ret double [[TMP1]]
+;
+  %1 = insertelement <2 x double> poison, double %a, i32 0
+  %2 = insertelement <2 x double> %1, double 1.000000e+00, i32 1
+  %3 = insertelement <2 x double> poison, double %b, i32 0
+  %4 = insertelement <2 x double> %3, double 2.000000e+00, i32 1
+  %5 = tail call <2 x double> @llvm.x86.sse2.add.sd(<2 x double> %2, <2 x double> %4)
+  %6 = extractelement <2 x double> %5, i32 0
+  ret double %6
+}
+
+define double @test_add_sd_1(double %a, double %b) {
+; CHECK-LABEL: @test_add_sd_1(
+; CHECK-NEXT:    ret double 1.000000e+00
+;
+  %1 = insertelement <2 x double> poison, double %a, i32 0
+  %2 = insertelement <2 x double> %1, double 1.000000e+00, i32 1
+  %3 = insertelement <2 x double> poison, double %b, i32 0
+  %4 = insertelement <2 x double> %3, double 2.000000e+00, i32 1
+  %5 = tail call <2 x double> @llvm.x86.sse2.add.sd(<2 x double> %2, <2 x double> %4)
+  %6 = extractelement <2 x double> %5, i32 1
+  ret double %6
+}
+
+define double @test_add_sd_2(double %a) {
+; CHECK-LABEL: @test_add_sd_2(
+; CHECK-NEXT:    [[TMP1:%.*]] = fadd double [[A:%.*]], [[A]]
+; CHECK-NEXT:    ret double [[TMP1]]
+;
+  %1 = insertelement <2 x double> zeroinitializer, double %a, i32 0
+  %2 = tail call <2 x double> @llvm.x86.sse2.add.sd(<2 x double> %1, <2 x double> %1)
+  %3 = extractelement <2 x double> %2, i32 0
+  ret double %3
+}
+
+define double @test_sub_sd_0(double %a, double %b) {
+; CHECK-LABEL: @test_sub_sd_0(
+; CHECK-NEXT:    [[TMP1:%.*]] = fsub double [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT:    ret double [[TMP1]]
+;
+  %1 = insertelement <2 x double> poison, double %a, i32 0
+  %2 = insertelement <2 x double> %1, double 1.000000e+00, i32 1
+  %3 = insertelement <2 x double> poison, double %b, i32 0
+  %4 = insertelement <2 x double> %3, double 2.000000e+00, i32 1
+  %5 = tail call <2 x double> @llvm.x86.sse2.sub.sd(<2 x double> %2, <2 x double> %4)
+  %6 = extractelement <2 x double> %5, i32 0
+  ret double %6
+}
+
+define double @test_sub_sd_1(double %a, double %b) {
+; CHECK-LABEL: @test_sub_sd_1(
+; CHECK-NEXT:    ret double 1.000000e+00
+;
+  %1 = insertelement <2 x double> poison, double %a, i32 0
+  %2 = insertelement <2 x double> %1, double 1.000000e+00, i32 1
+  %3 = insertelement <2 x double> poison, double %b, i32 0
+  %4 = insertelement <2 x double> %3, double 2.000000e+00, i32 1
+  %5 = tail call <2 x double> @llvm.x86.sse2.sub.sd(<2 x double> %2, <2 x double> %4)
+  %6 = extractelement <2 x double> %5, i32 1
+  ret double %6
+}
+
+define double @test_sub_sd_2(double %a) {
+; CHECK-LABEL: @test_sub_sd_2(
+; CHECK-NEXT:    [[TMP1:%.*]] = fsub double [[A:%.*]], [[A]]
+; CHECK-NEXT:    ret double [[TMP1]]
+;
+  %1 = insertelement <2 x double> zeroinitializer, double %a, i32 0
+  %2 = tail call <2 x double> @llvm.x86.sse2.sub.sd(<2 x double> %1, <2 x double> %1)
+  %3 = extractelement <2 x double> %2, i32 0
+  ret double %3
+}
+
+define double @test_mul_sd_0(double %a, double %b) {
+; CHECK-LABEL: @test_mul_sd_0(
+; CHECK-NEXT:    [[TMP1:%.*]] = fmul double [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT:    ret double [[TMP1]]
+;
+  %1 = insertelement <2 x double> poison, double %a, i32 0
+  %2 = insertelement <2 x double> %1, double 1.000000e+00, i32 1
+  %3 = insertelement <2 x double> poison, double %b, i32 0
+  %4 = insertelement <2 x double> %3, double 2.000000e+00, i32 1
+  %5 = tail call <2 x double> @llvm.x86.sse2.mul.sd(<2 x double> %2, <2 x double> %4)
+  %6 = extractelement <2 x double> %5, i32 0
+  ret double %6
+}
+
+define double @test_mul_sd_1(double %a, double %b) {
+; CHECK-LABEL: @test_mul_sd_1(
+; CHECK-NEXT:    ret double 1.000000e+00
+;
+  %1 = insertelement <2 x double> poison, double %a, i32 0
+  %2 = insertelement <2 x double> %1, double 1.000000e+00, i32 1
+  %3 = insertelement <2 x double> poison, double %b, i32 0
+  %4 = insertelement <2 x double> %3, double 2.000000e+00, i32 1
+  %5 = tail call <2 x double> @llvm.x86.sse2.mul.sd(<2 x double> %2, <2 x double> %4)
+  %6 = extractelement <2 x double> %5, i32 1
+  ret double %6
+}
+
+define double @test_mul_sd_2(double %a) {
+; CHECK-LABEL: @test_mul_sd_2(
+; CHECK-NEXT:    [[TMP1:%.*]] = fmul double [[A:%.*]], [[A]]
+; CHECK-NEXT:    ret double [[TMP1]]
+;
+  %1 = insertelement <2 x double> zeroinitializer, double %a, i32 0
+  %2 = tail call <2 x double> @llvm.x86.sse2.mul.sd(<2 x double> %1, <2 x double> %1)
+  %3 = extractelement <2 x double> %2, i32 0
+  ret double %3
+}
+
+define double @test_div_sd_0(double %a, double %b) {
+; CHECK-LABEL: @test_div_sd_0(
+; CHECK-NEXT:    [[TMP1:%.*]] = fdiv double [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT:    ret double [[TMP1]]
+;
+  %1 = insertelement <2 x double> poison, double %a, i32 0
+  %2 = insertelement <2 x double> %1, double 1.000000e+00, i32 1
+  %3 = insertelement <2 x double> poison, double %b, i32 0
+  %4 = insertelement <2 x double> %3, double 2.000000e+00, i32 1
+  %5 = tail call <2 x double> @llvm.x86.sse2.div.sd(<2 x double> %2, <2 x double> %4)
+  %6 = extractelement <2 x double> %5, i32 0
+  ret double %6
+}
+
+define double @test_div_sd_1(double %a, double %b) {
+; CHECK-LABEL: @test_div_sd_1(
+; CHECK-NEXT:    ret double 1.000000e+00
+;
+  %1 = insertelement <2 x double> poison, double %a, i32 0
+  %2 = insertelement <2 x double> %1, double 1.000000e+00, i32 1
+  %3 = insertelement <2 x double> poison, double %b, i32 0
+  %4 = insertelement <2 x double> %3, double 2.000000e+00, i32 1
+  %5 = tail call <2 x double> @llvm.x86.sse2.div.sd(<2 x double> %2, <2 x double> %4)
+  %6 = extractelement <2 x double> %5, i32 1
+  ret double %6
+}
+
+define double @test_div_sd_2(double %a) {
+; CHECK-LABEL: @test_div_sd_2(
+; CHECK-NEXT:    [[TMP1:%.*]] = fdiv double [[A:%.*]], [[A]]
+; CHECK-NEXT:    ret double [[TMP1]]
+;
+  %1 = insertelement <2 x double> zeroinitializer, double %a, i32 0
+  %2 = tail call <2 x double> @llvm.x86.sse2.div.sd(<2 x double> %1, <2 x double> %1)
+  %3 = extractelement <2 x double> %2, i32 0
+  ret double %3
+}
+
+define <2 x double> @test_min_sd(<2 x double> %a, <2 x double> %b) {
+; CHECK-LABEL: @test_min_sd(
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call <2 x double> @llvm.x86.sse2.min.sd(<2 x double> [[A:%.*]], <2 x double> [[B:%.*]])
+; CHECK-NEXT:    ret <2 x double> [[TMP1]]
+;
+  %1 = insertelement <2 x double> %b, double 2.000000e+00, i32 1
+  %2 = tail call <2 x double> @llvm.x86.sse2.min.sd(<2 x double> %a, <2 x double> %1)
+  ret <2 x double> %2
+}
+
+define double @test_min_sd_0(double %a, double %b) {
+; CHECK-LABEL: @test_min_sd_0(
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x double> undef, double [[A:%.*]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> undef, double [[B:%.*]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = tail call <2 x double> @llvm.x86.sse2.min.sd(<2 x double> [[TMP1]], <2 x double> [[TMP2]])
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x double> [[TMP3]], i32 0
+; CHECK-NEXT:    ret double [[TMP4]]
+;
+  %1 = insertelement <2 x double> poison, double %a, i32 0
+  %2 = insertelement <2 x double> %1, double 1.000000e+00, i32 1
+  %3 = insertelement <2 x double> poison, double %b, i32 0
+  %4 = insertelement <2 x double> %3, double 2.000000e+00, i32 1
+  %5 = tail call <2 x double> @llvm.x86.sse2.min.sd(<2 x double> %2, <2 x double> %4)
+  %6 = extractelement <2 x double> %5, i32 0
+  ret double %6
+}
+
+define double @test_min_sd_1(double %a, double %b) {
+; CHECK-LABEL: @test_min_sd_1(
+; CHECK-NEXT:    ret double 1.000000e+00
+;
+  %1 = insertelement <2 x double> poison, double %a, i32 0
+  %2 = insertelement <2 x double> %1, double 1.000000e+00, i32 1
+  %3 = insertelement <2 x double> poison, double %b, i32 0
+  %4 = insertelement <2 x double> %3, double 2.000000e+00, i32 1
+  %5 = tail call <2 x double> @llvm.x86.sse2.min.sd(<2 x double> %2, <2 x double> %4)
+  %6 = extractelement <2 x double> %5, i32 1
+  ret double %6
+}
+
+define double @test_min_sd_2(double %a) {
+; CHECK-LABEL: @test_min_sd_2(
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x double> <double undef, double 0.000000e+00>, double [[A:%.*]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call <2 x double> @llvm.x86.sse2.min.sd(<2 x double> [[TMP1]], <2 x double> [[TMP1]])
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x double> [[TMP2]], i32 0
+; CHECK-NEXT:    ret double [[TMP3]]
+;
+  %1 = insertelement <2 x double> zeroinitializer, double %a, i32 0
+  %2 = tail call <2 x double> @llvm.x86.sse2.min.sd(<2 x double> %1, <2 x double> %1)
+  %3 = extractelement <2 x double> %2, i32 0
+  ret double %3
+}
+
+define <2 x double> @test_max_sd(<2 x double> %a, <2 x double> %b) {
+; CHECK-LABEL: @test_max_sd(
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call <2 x double> @llvm.x86.sse2.max.sd(<2 x double> [[A:%.*]], <2 x double> [[B:%.*]])
+; CHECK-NEXT:    ret <2 x double> [[TMP1]]
+;
+  %1 = insertelement <2 x double> %b, double 2.000000e+00, i32 1
+  %2 = tail call <2 x double> @llvm.x86.sse2.max.sd(<2 x double> %a, <2 x double> %1)
+  ret <2 x double> %2
+}
+
+define double @test_max_sd_0(double %a, double %b) {
+; CHECK-LABEL: @test_max_sd_0(
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x double> undef, double [[A:%.*]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> undef, double [[B:%.*]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = tail call <2 x double> @llvm.x86.sse2.max.sd(<2 x double> [[TMP1]], <2 x double> [[TMP2]])
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x double> [[TMP3]], i32 0
+; CHECK-NEXT:    ret double [[TMP4]]
+;
+  %1 = insertelement <2 x double> poison, double %a, i32 0
+  %2 = insertelement <2 x double> %1, double 1.000000e+00, i32 1
+  %3 = insertelement <2 x double> poison, double %b, i32 0
+  %4 = insertelement <2 x double> %3, double 2.000000e+00, i32 1
+  %5 = tail call <2 x double> @llvm.x86.sse2.max.sd(<2 x double> %2, <2 x double> %4)
+  %6 = extractelement <2 x double> %5, i32 0
+  ret double %6
+}
+
+define double @test_max_sd_1(double %a, double %b) {
+; CHECK-LABEL: @test_max_sd_1(
+; CHECK-NEXT:    ret double 1.000000e+00
+;
+  %1 = insertelement <2 x double> poison, double %a, i32 0
+  %2 = insertelement <2 x double> %1, double 1.000000e+00, i32 1
+  %3 = insertelement <2 x double> poison, double %b, i32 0
+  %4 = insertelement <2 x double> %3, double 2.000000e+00, i32 1
+  %5 = tail call <2 x double> @llvm.x86.sse2.max.sd(<2 x double> %2, <2 x double> %4)
+  %6 = extractelement <2 x double> %5, i32 1
+  ret double %6
+}
+
+define double @test_max_sd_2(double %a) {
+; CHECK-LABEL: @test_max_sd_2(
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x double> <double undef, double 0.000000e+00>, double [[A:%.*]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call <2 x double> @llvm.x86.sse2.max.sd(<2 x double> [[TMP1]], <2 x double> [[TMP1]])
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x double> [[TMP2]], i32 0
+; CHECK-NEXT:    ret double [[TMP3]]
+;
+  %1 = insertelement <2 x double> zeroinitializer, double %a, i32 0
+  %2 = tail call <2 x double> @llvm.x86.sse2.max.sd(<2 x double> %1, <2 x double> %1)
+  %3 = extractelement <2 x double> %2, i32 0
+  ret double %3
+}
+
+define <2 x double> @test_cmp_sd(<2 x double> %a, <2 x double> %b) {
+; CHECK-LABEL: @test_cmp_sd(
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> [[A:%.*]], <2 x double> [[B:%.*]], i8 0)
+; CHECK-NEXT:    ret <2 x double> [[TMP1]]
+;
+  %1 = insertelement <2 x double> %b, double 2.000000e+00, i32 1
+  %2 = tail call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %a, <2 x double> %1, i8 0)
+  ret <2 x double> %2
+}
+
+define double @test_cmp_sd_0(double %a, double %b) {
+; CHECK-LABEL: @test_cmp_sd_0(
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x double> undef, double [[A:%.*]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> undef, double [[B:%.*]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = tail call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> [[TMP1]], <2 x double> [[TMP2]], i8 0)
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x double> [[TMP3]], i32 0
+; CHECK-NEXT:    ret double [[TMP4]]
+;
+  %1 = insertelement <2 x double> poison, double %a, i32 0
+  %2 = insertelement <2 x double> %1, double 1.000000e+00, i32 1
+  %3 = insertelement <2 x double> poison, double %b, i32 0
+  %4 = insertelement <2 x double> %3, double 2.000000e+00, i32 1
+  %5 = tail call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %2, <2 x double> %4, i8 0)
+  %6 = extractelement <2 x double> %5, i32 0
+  ret double %6
+}
+
+define double @test_cmp_sd_1(double %a, double %b) {
+; CHECK-LABEL: @test_cmp_sd_1(
+; CHECK-NEXT:    ret double 1.000000e+00
+;
+  %1 = insertelement <2 x double> poison, double %a, i32 0
+  %2 = insertelement <2 x double> %1, double 1.000000e+00, i32 1
+  %3 = insertelement <2 x double> poison, double %b, i32 0
+  %4 = insertelement <2 x double> %3, double 2.000000e+00, i32 1
+  %5 = tail call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %2, <2 x double> %4, i8 0)
+  %6 = extractelement <2 x double> %5, i32 1
+  ret double %6
+}
+
+define double @test_cmp_sd_2(double %a) {
+; CHECK-LABEL: @test_cmp_sd_2(
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x double> <double undef, double 0.000000e+00>, double [[A:%.*]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> [[TMP1]], <2 x double> [[TMP1]], i8 3)
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x double> [[TMP2]], i32 0
+; CHECK-NEXT:    ret double [[TMP3]]
+;
+  %1 = insertelement <2 x double> zeroinitializer, double %a, i32 0
+  %2 = tail call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %1, <2 x double> %1, i8 3)
+  %3 = extractelement <2 x double> %2, i32 0
+  ret double %3
+}
+
+define i32 @test_comieq_sd_0(double %a, double %b) {
+; CHECK-LABEL: @test_comieq_sd_0(
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x double> undef, double [[A:%.*]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> undef, double [[B:%.*]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = tail call i32 @llvm.x86.sse2.comieq.sd(<2 x double> [[TMP1]], <2 x double> [[TMP2]])
+; CHECK-NEXT:    ret i32 [[TMP3]]
+;
+  %1 = insertelement <2 x double> poison, double %a, i32 0
+  %2 = insertelement <2 x double> %1, double 1.000000e+00, i32 1
+  %3 = insertelement <2 x double> poison, double %b, i32 0
+  %4 = insertelement <2 x double> %3, double 2.000000e+00, i32 1
+  %5 = tail call i32 @llvm.x86.sse2.comieq.sd(<2 x double> %2, <2 x double> %4)
+  ret i32 %5
+}
+
+define i32 @test_comige_sd_0(double %a, double %b) {
+; CHECK-LABEL: @test_comige_sd_0(
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x double> undef, double [[A:%.*]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> undef, double [[B:%.*]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = tail call i32 @llvm.x86.sse2.comige.sd(<2 x double> [[TMP1]], <2 x double> [[TMP2]])
+; CHECK-NEXT:    ret i32 [[TMP3]]
+;
+  %1 = insertelement <2 x double> poison, double %a, i32 0
+  %2 = insertelement <2 x double> %1, double 1.000000e+00, i32 1
+  %3 = insertelement <2 x double> poison, double %b, i32 0
+  %4 = insertelement <2 x double> %3, double 2.000000e+00, i32 1
+  %5 = tail call i32 @llvm.x86.sse2.comige.sd(<2 x double> %2, <2 x double> %4)
+  ret i32 %5
+}
+
+define i32 @test_comigt_sd_0(double %a, double %b) {
+; CHECK-LABEL: @test_comigt_sd_0(
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x double> undef, double [[A:%.*]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> undef, double [[B:%.*]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = tail call i32 @llvm.x86.sse2.comigt.sd(<2 x double> [[TMP1]], <2 x double> [[TMP2]])
+; CHECK-NEXT:    ret i32 [[TMP3]]
+;
+  %1 = insertelement <2 x double> poison, double %a, i32 0
+  %2 = insertelement <2 x double> %1, double 1.000000e+00, i32 1
+  %3 = insertelement <2 x double> poison, double %b, i32 0
+  %4 = insertelement <2 x double> %3, double 2.000000e+00, i32 1
+  %5 = tail call i32 @llvm.x86.sse2.comigt.sd(<2 x double> %2, <2 x double> %4)
+  ret i32 %5
+}
+
+define i32 @test_comile_sd_0(double %a, double %b) {
+; CHECK-LABEL: @test_comile_sd_0(
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x double> undef, double [[A:%.*]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> undef, double [[B:%.*]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = tail call i32 @llvm.x86.sse2.comile.sd(<2 x double> [[TMP1]], <2 x double> [[TMP2]])
+; CHECK-NEXT:    ret i32 [[TMP3]]
+;
+  %1 = insertelement <2 x double> poison, double %a, i32 0
+  %2 = insertelement <2 x double> %1, double 1.000000e+00, i32 1
+  %3 = insertelement <2 x double> poison, double %b, i32 0
+  %4 = insertelement <2 x double> %3, double 2.000000e+00, i32 1
+  %5 = tail call i32 @llvm.x86.sse2.comile.sd(<2 x double> %2, <2 x double> %4)
+  ret i32 %5
+}
+
+define i32 @test_comilt_sd_0(double %a, double %b) {
+; CHECK-LABEL: @test_comilt_sd_0(
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x double> undef, double [[A:%.*]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> undef, double [[B:%.*]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = tail call i32 @llvm.x86.sse2.comilt.sd(<2 x double> [[TMP1]], <2 x double> [[TMP2]])
+; CHECK-NEXT:    ret i32 [[TMP3]]
+;
+  %1 = insertelement <2 x double> poison, double %a, i32 0
+  %2 = insertelement <2 x double> %1, double 1.000000e+00, i32 1
+  %3 = insertelement <2 x double> poison, double %b, i32 0
+  %4 = insertelement <2 x double> %3, double 2.000000e+00, i32 1
+  %5 = tail call i32 @llvm.x86.sse2.comilt.sd(<2 x double> %2, <2 x double> %4)
+  ret i32 %5
+}
+
+define i32 @test_comineq_sd_0(double %a, double %b) {
+; CHECK-LABEL: @test_comineq_sd_0(
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x double> undef, double [[A:%.*]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> undef, double [[B:%.*]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = tail call i32 @llvm.x86.sse2.comineq.sd(<2 x double> [[TMP1]], <2 x double> [[TMP2]])
+; CHECK-NEXT:    ret i32 [[TMP3]]
+;
+  %1 = insertelement <2 x double> poison, double %a, i32 0
+  %2 = insertelement <2 x double> %1, double 1.000000e+00, i32 1
+  %3 = insertelement <2 x double> poison, double %b, i32 0
+  %4 = insertelement <2 x double> %3, double 2.000000e+00, i32 1
+  %5 = tail call i32 @llvm.x86.sse2.comineq.sd(<2 x double> %2, <2 x double> %4)
+  ret i32 %5
+}
+
+define i32 @test_ucomieq_sd_0(double %a, double %b) {
+; CHECK-LABEL: @test_ucomieq_sd_0(
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x double> undef, double [[A:%.*]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> undef, double [[B:%.*]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = tail call i32 @llvm.x86.sse2.ucomieq.sd(<2 x double> [[TMP1]], <2 x double> [[TMP2]])
+; CHECK-NEXT:    ret i32 [[TMP3]]
+;
+  %1 = insertelement <2 x double> poison, double %a, i32 0
+  %2 = insertelement <2 x double> %1, double 1.000000e+00, i32 1
+  %3 = insertelement <2 x double> poison, double %b, i32 0
+  %4 = insertelement <2 x double> %3, double 2.000000e+00, i32 1
+  %5 = tail call i32 @llvm.x86.sse2.ucomieq.sd(<2 x double> %2, <2 x double> %4)
+  ret i32 %5
+}
+
+define i32 @test_ucomige_sd_0(double %a, double %b) {
+; CHECK-LABEL: @test_ucomige_sd_0(
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x double> undef, double [[A:%.*]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> undef, double [[B:%.*]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = tail call i32 @llvm.x86.sse2.ucomige.sd(<2 x double> [[TMP1]], <2 x double> [[TMP2]])
+; CHECK-NEXT:    ret i32 [[TMP3]]
+;
+  %1 = insertelement <2 x double> poison, double %a, i32 0
+  %2 = insertelement <2 x double> %1, double 1.000000e+00, i32 1
+  %3 = insertelement <2 x double> poison, double %b, i32 0
+  %4 = insertelement <2 x double> %3, double 2.000000e+00, i32 1
+  %5 = tail call i32 @llvm.x86.sse2.ucomige.sd(<2 x double> %2, <2 x double> %4)
+  ret i32 %5
+}
+
+define i32 @test_ucomigt_sd_0(double %a, double %b) {
+; CHECK-LABEL: @test_ucomigt_sd_0(
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x double> undef, double [[A:%.*]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> undef, double [[B:%.*]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = tail call i32 @llvm.x86.sse2.ucomigt.sd(<2 x double> [[TMP1]], <2 x double> [[TMP2]])
+; CHECK-NEXT:    ret i32 [[TMP3]]
+;
+  %1 = insertelement <2 x double> poison, double %a, i32 0
+  %2 = insertelement <2 x double> %1, double 1.000000e+00, i32 1
+  %3 = insertelement <2 x double> poison, double %b, i32 0
+  %4 = insertelement <2 x double> %3, double 2.000000e+00, i32 1
+  %5 = tail call i32 @llvm.x86.sse2.ucomigt.sd(<2 x double> %2, <2 x double> %4)
+  ret i32 %5
+}
+
+define i32 @test_ucomile_sd_0(double %a, double %b) {
+; CHECK-LABEL: @test_ucomile_sd_0(
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x double> undef, double [[A:%.*]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> undef, double [[B:%.*]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = tail call i32 @llvm.x86.sse2.ucomile.sd(<2 x double> [[TMP1]], <2 x double> [[TMP2]])
+; CHECK-NEXT:    ret i32 [[TMP3]]
+;
+  %1 = insertelement <2 x double> poison, double %a, i32 0
+  %2 = insertelement <2 x double> %1, double 1.000000e+00, i32 1
+  %3 = insertelement <2 x double> poison, double %b, i32 0
+  %4 = insertelement <2 x double> %3, double 2.000000e+00, i32 1
+  %5 = tail call i32 @llvm.x86.sse2.ucomile.sd(<2 x double> %2, <2 x double> %4)
+  ret i32 %5
+}
+
+define i32 @test_ucomilt_sd_0(double %a, double %b) {
+; CHECK-LABEL: @test_ucomilt_sd_0(
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x double> undef, double [[A:%.*]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> undef, double [[B:%.*]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = tail call i32 @llvm.x86.sse2.ucomilt.sd(<2 x double> [[TMP1]], <2 x double> [[TMP2]])
+; CHECK-NEXT:    ret i32 [[TMP3]]
+;
+  %1 = insertelement <2 x double> poison, double %a, i32 0
+  %2 = insertelement <2 x double> %1, double 1.000000e+00, i32 1
+  %3 = insertelement <2 x double> poison, double %b, i32 0
+  %4 = insertelement <2 x double> %3, double 2.000000e+00, i32 1
+  %5 = tail call i32 @llvm.x86.sse2.ucomilt.sd(<2 x double> %2, <2 x double> %4)
+  ret i32 %5
+}
+
+define i32 @test_ucomineq_sd_0(double %a, double %b) {
+; CHECK-LABEL: @test_ucomineq_sd_0(
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x double> undef, double [[A:%.*]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> undef, double [[B:%.*]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = tail call i32 @llvm.x86.sse2.ucomineq.sd(<2 x double> [[TMP1]], <2 x double> [[TMP2]])
+; CHECK-NEXT:    ret i32 [[TMP3]]
+;
+  %1 = insertelement <2 x double> poison, double %a, i32 0
+  %2 = insertelement <2 x double> %1, double 1.000000e+00, i32 1
+  %3 = insertelement <2 x double> poison, double %b, i32 0
+  %4 = insertelement <2 x double> %3, double 2.000000e+00, i32 1
+  %5 = tail call i32 @llvm.x86.sse2.ucomineq.sd(<2 x double> %2, <2 x double> %4)
+  ret i32 %5
+}
+
+declare <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double>) nounwind readnone
+
+declare <2 x double> @llvm.x86.sse2.add.sd(<2 x double>, <2 x double>)
+declare <2 x double> @llvm.x86.sse2.sub.sd(<2 x double>, <2 x double>)
+declare <2 x double> @llvm.x86.sse2.mul.sd(<2 x double>, <2 x double>)
+declare <2 x double> @llvm.x86.sse2.div.sd(<2 x double>, <2 x double>)
+declare <2 x double> @llvm.x86.sse2.min.sd(<2 x double>, <2 x double>)
+declare <2 x double> @llvm.x86.sse2.max.sd(<2 x double>, <2 x double>)
+declare <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double>, <2 x double>, i8)
+
+declare i32 @llvm.x86.sse2.comieq.sd(<2 x double>, <2 x double>)
+declare i32 @llvm.x86.sse2.comige.sd(<2 x double>, <2 x double>)
+declare i32 @llvm.x86.sse2.comigt.sd(<2 x double>, <2 x double>)
+declare i32 @llvm.x86.sse2.comile.sd(<2 x double>, <2 x double>)
+declare i32 @llvm.x86.sse2.comilt.sd(<2 x double>, <2 x double>)
+declare i32 @llvm.x86.sse2.comineq.sd(<2 x double>, <2 x double>)
+
+declare i32 @llvm.x86.sse2.ucomieq.sd(<2 x double>, <2 x double>)
+declare i32 @llvm.x86.sse2.ucomige.sd(<2 x double>, <2 x double>)
+declare i32 @llvm.x86.sse2.ucomigt.sd(<2 x double>, <2 x double>)
+declare i32 @llvm.x86.sse2.ucomile.sd(<2 x double>, <2 x double>)
+declare i32 @llvm.x86.sse2.ucomilt.sd(<2 x double>, <2 x double>)
+declare i32 @llvm.x86.sse2.ucomineq.sd(<2 x double>, <2 x double>)
--- a/llvm/test/Transforms/InstCombine/X86/x86-sse41-inseltpoison.ll
+++ b/llvm/test/Transforms/InstCombine/X86/x86-sse41-inseltpoison.ll
@ -0,0 +1,124 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -instcombine -mtriple=x86_64-unknown-unknown -S | FileCheck %s
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+define <2 x double> @test_round_sd(<2 x double> %a, <2 x double> %b) {
+; CHECK-LABEL: @test_round_sd(
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> [[A:%.*]], <2 x double> [[B:%.*]], i32 10)
+; CHECK-NEXT:    ret <2 x double> [[TMP1]]
+;
+  %1 = insertelement <2 x double> %a, double 1.000000e+00, i32 0
+  %2 = insertelement <2 x double> %b, double 2.000000e+00, i32 1
+  %3 = tail call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %1, <2 x double> %2, i32 10)
+  ret <2 x double> %3
+}
+
+define double @test_round_sd_0(double %a, double %b) {
+; CHECK-LABEL: @test_round_sd_0(
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x double> undef, double [[B:%.*]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> undef, <2 x double> [[TMP1]], i32 10)
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x double> [[TMP2]], i32 0
+; CHECK-NEXT:    ret double [[TMP3]]
+;
+  %1 = insertelement <2 x double> poison, double %a, i32 0
+  %2 = insertelement <2 x double> %1, double 1.000000e+00, i32 1
+  %3 = insertelement <2 x double> poison, double %b, i32 0
+  %4 = insertelement <2 x double> %3, double 2.000000e+00, i32 1
+  %5 = tail call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %2, <2 x double> %4, i32 10)
+  %6 = extractelement <2 x double> %5, i32 0
+  ret double %6
+}
+
+define double @test_round_sd_1(double %a, double %b) {
+; CHECK-LABEL: @test_round_sd_1(
+; CHECK-NEXT:    ret double 1.000000e+00
+;
+  %1 = insertelement <2 x double> poison, double %a, i32 0
+  %2 = insertelement <2 x double> %1, double 1.000000e+00, i32 1
+  %3 = insertelement <2 x double> poison, double %b, i32 0
+  %4 = insertelement <2 x double> %3, double 2.000000e+00, i32 1
+  %5 = tail call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %2, <2 x double> %4, i32 10)
+  %6 = extractelement <2 x double> %5, i32 1
+  ret double %6
+}
+
+define double @test_round_sd_2(double %a) {
+; CHECK-LABEL: @test_round_sd_2(
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x double> undef, double [[A:%.*]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> undef, <2 x double> [[TMP1]], i32 10)
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x double> [[TMP2]], i32 0
+; CHECK-NEXT:    ret double [[TMP3]]
+;
+  %1 = insertelement <2 x double> zeroinitializer, double %a, i32 0
+  %2 = tail call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %1, <2 x double> %1, i32 10)
+  %3 = extractelement <2 x double> %2, i32 0
+  ret double %3
+}
+
+define <4 x float> @test_round_ss(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: @test_round_ss(
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> <float undef, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>, <4 x float> [[B:%.*]], i32 10)
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
+;
+  %1 = insertelement <4 x float> %a, float 1.000000e+00, i32 1
+  %2 = insertelement <4 x float> %1, float 2.000000e+00, i32 2
+  %3 = insertelement <4 x float> %2, float 3.000000e+00, i32 3
+  %4 = insertelement <4 x float> %b, float 1.000000e+00, i32 1
+  %5 = insertelement <4 x float> %4, float 2.000000e+00, i32 2
+  %6 = insertelement <4 x float> %5, float 3.000000e+00, i32 3
+  %7 = tail call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %3, <4 x float> %6, i32 10)
+  ret <4 x float> %7
+}
+
+define float @test_round_ss_0(float %a, float %b) {
+; CHECK-LABEL: @test_round_ss_0(
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x float> undef, float [[B:%.*]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> undef, <4 x float> [[TMP1]], i32 10)
+; CHECK-NEXT:    [[R:%.*]] = extractelement <4 x float> [[TMP2]], i32 0
+; CHECK-NEXT:    ret float [[R]]
+;
+  %1 = insertelement <4 x float> poison, float %a, i32 0
+  %2 = insertelement <4 x float> %1, float 1.000000e+00, i32 1
+  %3 = insertelement <4 x float> %2, float 2.000000e+00, i32 2
+  %4 = insertelement <4 x float> %3, float 3.000000e+00, i32 3
+  %5 = insertelement <4 x float> poison, float %b, i32 0
+  %6 = insertelement <4 x float> %5, float 4.000000e+00, i32 1
+  %7 = insertelement <4 x float> %6, float 5.000000e+00, i32 2
+  %8 = insertelement <4 x float> %7, float 6.000000e+00, i32 3
+  %9 = tail call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %4, <4 x float> %8, i32 10)
+  %r = extractelement <4 x float> %9, i32 0
+  ret float %r
+}
+
+define float @test_round_ss_2(float %a, float %b) {
+; CHECK-LABEL: @test_round_ss_2(
+; CHECK-NEXT:    ret float 2.000000e+00
+;
+  %1 = insertelement <4 x float> poison, float %a, i32 0
+  %2 = insertelement <4 x float> %1, float 1.000000e+00, i32 1
+  %3 = insertelement <4 x float> %2, float 2.000000e+00, i32 2
+  %4 = insertelement <4 x float> %3, float 3.000000e+00, i32 3
+  %5 = insertelement <4 x float> poison, float %b, i32 0
+  %6 = insertelement <4 x float> %5, float 4.000000e+00, i32 1
+  %7 = insertelement <4 x float> %6, float 5.000000e+00, i32 2
+  %8 = insertelement <4 x float> %7, float 6.000000e+00, i32 3
+  %9 = tail call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %4, <4 x float> %8, i32 10)
+  %r = extractelement <4 x float> %9, i32 2
+  ret float %r
+}
+
+define float @test_round_ss_3(float %a) {
+; CHECK-LABEL: @test_round_ss_3(
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x float> undef, float [[A:%.*]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> undef, <4 x float> [[TMP1]], i32 10)
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[TMP2]], i32 0
+; CHECK-NEXT:    ret float [[TMP3]]
+;
+  %1 = insertelement <4 x float> zeroinitializer, float %a, i32 0
+  %2 = tail call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %1, <4 x float> %1, i32 10)
+  %3 = extractelement <4 x float> %2, i32 0
+  ret float %3
+}
+
+declare <2 x double> @llvm.x86.sse41.round.sd(<2 x double>, <2 x double>, i32) nounwind readnone
+declare <4 x float> @llvm.x86.sse41.round.ss(<4 x float>, <4 x float>, i32) nounwind readnone
--- a/llvm/test/Transforms/InstCombine/X86/x86-vec_demanded_elts-inseltpoison.ll
+++ b/llvm/test/Transforms/InstCombine/X86/x86-vec_demanded_elts-inseltpoison.ll
@ -0,0 +1,110 @@
+; RUN: opt < %s -instcombine -mtriple=x86_64-unknown-unknown -S | FileCheck %s
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+define i16 @test1(float %f) {
+; CHECK-LABEL: @test1(
+; CHECK-NEXT:    [[TMP281:%.*]] = fadd float %f, -1.000000e+00
+; CHECK-NEXT:    [[TMP373:%.*]] = fmul float [[TMP281]], 5.000000e-01
+; CHECK-NEXT:    [[TMP374:%.*]] = insertelement <4 x float> undef, float [[TMP373]], i32 0
+; CHECK-NEXT:    [[TMP48:%.*]] = tail call <4 x float> @llvm.x86.sse.min.ss(<4 x float> [[TMP374]], <4 x float> <float 6.553500e+04, float undef, float undef, float undef>)
+; CHECK-NEXT:    [[TMP59:%.*]] = tail call <4 x float> @llvm.x86.sse.max.ss(<4 x float> [[TMP48]], <4 x float> <float 0.000000e+00, float undef, float undef, float undef>)
+; CHECK-NEXT:    [[TMP_UPGRD_1:%.*]] = tail call i32 @llvm.x86.sse.cvttss2si(<4 x float> [[TMP59]])
+; CHECK-NEXT:    [[TMP69:%.*]] = trunc i32 [[TMP_UPGRD_1]] to i16
+; CHECK-NEXT:    ret i16 [[TMP69]]
+;
+  %tmp = insertelement <4 x float> poison, float %f, i32 0
+  %tmp10 = insertelement <4 x float> %tmp, float 0.000000e+00, i32 1
+  %tmp11 = insertelement <4 x float> %tmp10, float 0.000000e+00, i32 2
+  %tmp12 = insertelement <4 x float> %tmp11, float 0.000000e+00, i32 3
+  %tmp28 = tail call <4 x float> @llvm.x86.sse.sub.ss( <4 x float> %tmp12, <4 x float> < float 1.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00 > )
+  %tmp37 = tail call <4 x float> @llvm.x86.sse.mul.ss( <4 x float> %tmp28, <4 x float> < float 5.000000e-01, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00 > )
+  %tmp48 = tail call <4 x float> @llvm.x86.sse.min.ss( <4 x float> %tmp37, <4 x float> < float 6.553500e+04, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00 > )
+  %tmp59 = tail call <4 x float> @llvm.x86.sse.max.ss( <4 x float> %tmp48, <4 x float> zeroinitializer )
+  %tmp.upgrd.1 = tail call i32 @llvm.x86.sse.cvttss2si( <4 x float> %tmp59 )
+  %tmp69 = trunc i32 %tmp.upgrd.1 to i16
+  ret i16 %tmp69
+}
+
+define i64 @test3(float %f, double %d) {
+; CHECK-LABEL: @test3(
+; CHECK-NEXT:    [[V00:%.*]] = insertelement <4 x float> undef, float %f, i32 0
+; CHECK-NEXT:    [[TMP0:%.*]] = tail call i32 @llvm.x86.sse.cvtss2si(<4 x float> [[V00]])
+; CHECK-NEXT:    [[V10:%.*]] = insertelement <4 x float> undef, float %f, i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.x86.sse.cvtss2si64(<4 x float> [[V10]])
+; CHECK-NEXT:    [[V20:%.*]] = insertelement <4 x float> undef, float %f, i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call i32 @llvm.x86.sse.cvttss2si(<4 x float> [[V20]])
+; CHECK-NEXT:    [[V30:%.*]] = insertelement <4 x float> undef, float %f, i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = tail call i64 @llvm.x86.sse.cvttss2si64(<4 x float> [[V30]])
+; CHECK-NEXT:    [[V40:%.*]] = insertelement <2 x double> undef, double %d, i32 0
+; CHECK-NEXT:    [[TMP4:%.*]] = tail call i32 @llvm.x86.sse2.cvtsd2si(<2 x double> [[V40]])
+; CHECK-NEXT:    [[V50:%.*]] = insertelement <2 x double> undef, double %d, i32 0
+; CHECK-NEXT:    [[TMP5:%.*]] = tail call i64 @llvm.x86.sse2.cvtsd2si64(<2 x double> [[V50]])
+; CHECK-NEXT:    [[V60:%.*]] = insertelement <2 x double> undef, double %d, i32 0
+; CHECK-NEXT:    [[TMP6:%.*]] = tail call i32 @llvm.x86.sse2.cvttsd2si(<2 x double> [[V60]])
+; CHECK-NEXT:    [[V70:%.*]] = insertelement <2 x double> undef, double %d, i32 0
+; CHECK-NEXT:    [[TMP7:%.*]] = tail call i64 @llvm.x86.sse2.cvttsd2si64(<2 x double> [[V70]])
+; CHECK-NEXT:    [[TMP8:%.*]] = add i32 [[TMP0]], [[TMP2]]
+; CHECK-NEXT:    [[TMP9:%.*]] = add i32 [[TMP4]], [[TMP6]]
+; CHECK-NEXT:    [[TMP10:%.*]] = add i32 [[TMP8]], [[TMP9]]
+; CHECK-NEXT:    [[TMP11:%.*]] = sext i32 [[TMP10]] to i64
+; CHECK-NEXT:    [[TMP12:%.*]] = add i64 [[TMP1]], [[TMP3]]
+; CHECK-NEXT:    [[TMP13:%.*]] = add i64 [[TMP5]], [[TMP7]]
+; CHECK-NEXT:    [[TMP14:%.*]] = add i64 [[TMP12]], [[TMP13]]
+; CHECK-NEXT:    [[TMP15:%.*]] = add i64 [[TMP14]], [[TMP11]]
+; CHECK-NEXT:    ret i64 [[TMP15]]
+;
+  %v00 = insertelement <4 x float> poison, float %f, i32 0
+  %v01 = insertelement <4 x float> %v00, float 0.000000e+00, i32 1
+  %v02 = insertelement <4 x float> %v01, float 0.000000e+00, i32 2
+  %v03 = insertelement <4 x float> %v02, float 0.000000e+00, i32 3
+  %tmp0 = tail call i32 @llvm.x86.sse.cvtss2si(<4 x float> %v03)
+  %v10 = insertelement <4 x float> poison, float %f, i32 0
+  %v11 = insertelement <4 x float> %v10, float 0.000000e+00, i32 1
+  %v12 = insertelement <4 x float> %v11, float 0.000000e+00, i32 2
+  %v13 = insertelement <4 x float> %v12, float 0.000000e+00, i32 3
+  %tmp1 = tail call i64 @llvm.x86.sse.cvtss2si64(<4 x float> %v13)
+  %v20 = insertelement <4 x float> poison, float %f, i32 0
+  %v21 = insertelement <4 x float> %v20, float 0.000000e+00, i32 1
+  %v22 = insertelement <4 x float> %v21, float 0.000000e+00, i32 2
+  %v23 = insertelement <4 x float> %v22, float 0.000000e+00, i32 3
+  %tmp2 = tail call i32 @llvm.x86.sse.cvttss2si(<4 x float> %v23)
+  %v30 = insertelement <4 x float> poison, float %f, i32 0
+  %v31 = insertelement <4 x float> %v30, float 0.000000e+00, i32 1
+  %v32 = insertelement <4 x float> %v31, float 0.000000e+00, i32 2
+  %v33 = insertelement <4 x float> %v32, float 0.000000e+00, i32 3
+  %tmp3 = tail call i64 @llvm.x86.sse.cvttss2si64(<4 x float> %v33)
+  %v40 = insertelement <2 x double> poison, double %d, i32 0
+  %v41 = insertelement <2 x double> %v40, double 0.000000e+00, i32 1
+  %tmp4 = tail call i32 @llvm.x86.sse2.cvtsd2si(<2 x double> %v41)
+  %v50 = insertelement <2 x double> poison, double %d, i32 0
+  %v51 = insertelement <2 x double> %v50, double 0.000000e+00, i32 1
+  %tmp5 = tail call i64 @llvm.x86.sse2.cvtsd2si64(<2 x double> %v51)
+  %v60 = insertelement <2 x double> poison, double %d, i32 0
+  %v61 = insertelement <2 x double> %v60, double 0.000000e+00, i32 1
+  %tmp6 = tail call i32 @llvm.x86.sse2.cvttsd2si(<2 x double> %v61)
+  %v70 = insertelement <2 x double> poison, double %d, i32 0
+  %v71 = insertelement <2 x double> %v70, double 0.000000e+00, i32 1
+  %tmp7 = tail call i64 @llvm.x86.sse2.cvttsd2si64(<2 x double> %v71)
+  %tmp8 = add i32 %tmp0, %tmp2
+  %tmp9 = add i32 %tmp4, %tmp6
+  %tmp10 = add i32 %tmp8, %tmp9
+  %tmp11 = sext i32 %tmp10 to i64
+  %tmp12 = add i64 %tmp1, %tmp3
+  %tmp13 = add i64 %tmp5, %tmp7
+  %tmp14 = add i64 %tmp12, %tmp13
+  %tmp15 = add i64 %tmp11, %tmp14
+  ret i64 %tmp15
+}
+
+declare <4 x float> @llvm.x86.sse.sub.ss(<4 x float>, <4 x float>)
+declare <4 x float> @llvm.x86.sse.mul.ss(<4 x float>, <4 x float>)
+declare <4 x float> @llvm.x86.sse.min.ss(<4 x float>, <4 x float>)
+declare <4 x float> @llvm.x86.sse.max.ss(<4 x float>, <4 x float>)
+declare i32 @llvm.x86.sse.cvtss2si(<4 x float>)
+declare i64 @llvm.x86.sse.cvtss2si64(<4 x float>)
+declare i32 @llvm.x86.sse.cvttss2si(<4 x float>)
+declare i64 @llvm.x86.sse.cvttss2si64(<4 x float>)
+declare i32 @llvm.x86.sse2.cvtsd2si(<2 x double>)
+declare i64 @llvm.x86.sse2.cvtsd2si64(<2 x double>)
+declare i32 @llvm.x86.sse2.cvttsd2si(<2 x double>)
+declare i64 @llvm.x86.sse2.cvttsd2si64(<2 x double>)
--- a/llvm/test/Transforms/InstCombine/X86/x86-vector-shifts-inseltpoison.ll
+++ b/llvm/test/Transforms/InstCombine/X86/x86-vector-shifts-inseltpoison.ll
--- a/llvm/test/Transforms/InstCombine/X86/x86-xop-inseltpoison.ll
+++ b/llvm/test/Transforms/InstCombine/X86/x86-xop-inseltpoison.ll
@ -0,0 +1,305 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -instcombine -mtriple=x86_64-unknown-unknown -S | FileCheck %s
+
+define <2 x double> @test_vfrcz_sd(<2 x double> %a) {
+; CHECK-LABEL: @test_vfrcz_sd(
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call <2 x double> @llvm.x86.xop.vfrcz.sd(<2 x double> [[A:%.*]])
+; CHECK-NEXT:    ret <2 x double> [[TMP1]]
+;
+  %1 = insertelement <2 x double> %a, double 1.000000e+00, i32 1
+  %2 = tail call <2 x double> @llvm.x86.xop.vfrcz.sd(<2 x double> %1)
+  ret <2 x double> %2
+}
+
+define double @test_vfrcz_sd_0(double %a) {
+; CHECK-LABEL: @test_vfrcz_sd_0(
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x double> undef, double [[A:%.*]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call <2 x double> @llvm.x86.xop.vfrcz.sd(<2 x double> [[TMP1]])
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x double> [[TMP2]], i32 0
+; CHECK-NEXT:    ret double [[TMP3]]
+;
+  %1 = insertelement <2 x double> poison, double %a, i32 0
+  %2 = insertelement <2 x double> %1, double 1.000000e+00, i32 1
+  %3 = tail call <2 x double> @llvm.x86.xop.vfrcz.sd(<2 x double> %2)
+  %4 = extractelement <2 x double> %3, i32 0
+  ret double %4
+}
+
+define double @test_vfrcz_sd_1(double %a) {
+; CHECK-LABEL: @test_vfrcz_sd_1(
+; CHECK-NEXT:    ret double 0.000000e+00
+;
+  %1 = insertelement <2 x double> poison, double %a, i32 0
+  %2 = insertelement <2 x double> %1, double 1.000000e+00, i32 1
+  %3 = tail call <2 x double> @llvm.x86.xop.vfrcz.sd(<2 x double> %2)
+  %4 = extractelement <2 x double> %3, i32 1
+  ret double %4
+}
+
+define <4 x float> @test_vfrcz_ss(<4 x float> %a) {
+; CHECK-LABEL: @test_vfrcz_ss(
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call <4 x float> @llvm.x86.xop.vfrcz.ss(<4 x float> [[A:%.*]])
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
+;
+  %1 = insertelement <4 x float> %a, float 1.000000e+00, i32 1
+  %2 = insertelement <4 x float> %1, float 2.000000e+00, i32 2
+  %3 = insertelement <4 x float> %2, float 3.000000e+00, i32 3
+  %4 = tail call <4 x float> @llvm.x86.xop.vfrcz.ss(<4 x float> %3)
+  ret <4 x float> %4
+}
+
+define float @test_vfrcz_ss_0(float %a) {
+; CHECK-LABEL: @test_vfrcz_ss_0(
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x float> undef, float [[A:%.*]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call <4 x float> @llvm.x86.xop.vfrcz.ss(<4 x float> [[TMP1]])
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[TMP2]], i32 0
+; CHECK-NEXT:    ret float [[TMP3]]
+;
+  %1 = insertelement <4 x float> poison, float %a, i32 0
+  %2 = insertelement <4 x float> %1, float 1.000000e+00, i32 1
+  %3 = insertelement <4 x float> %2, float 2.000000e+00, i32 2
+  %4 = insertelement <4 x float> %3, float 3.000000e+00, i32 3
+  %5 = tail call <4 x float> @llvm.x86.xop.vfrcz.ss(<4 x float> %4)
+  %6 = extractelement <4 x float> %5, i32 0
+  ret float %6
+}
+
+define float @test_vfrcz_ss_3(float %a) {
+; CHECK-LABEL: @test_vfrcz_ss_3(
+; CHECK-NEXT:    ret float 0.000000e+00
+;
+  %1 = insertelement <4 x float> poison, float %a, i32 0
+  %2 = insertelement <4 x float> %1, float 1.000000e+00, i32 1
+  %3 = insertelement <4 x float> %2, float 2.000000e+00, i32 2
+  %4 = insertelement <4 x float> %3, float 3.000000e+00, i32 3
+  %5 = tail call <4 x float> @llvm.x86.xop.vfrcz.ss(<4 x float> %4)
+  %6 = extractelement <4 x float> %5, i32 3
+  ret float %6
+}
+
+define <2 x i64> @cmp_slt_v2i64(<2 x i64> %a, <2 x i64> %b) {
+; CHECK-LABEL: @cmp_slt_v2i64(
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp slt <2 x i64> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT:    [[TMP2:%.*]] = sext <2 x i1> [[TMP1]] to <2 x i64>
+; CHECK-NEXT:    ret <2 x i64> [[TMP2]]
+;
+  %1 = tail call <2 x i64> @llvm.x86.xop.vpcomltq(<2 x i64> %a, <2 x i64> %b)
+  ret <2 x i64> %1
+}
+
+define <2 x i64> @cmp_ult_v2i64(<2 x i64> %a, <2 x i64> %b) {
+; CHECK-LABEL: @cmp_ult_v2i64(
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ult <2 x i64> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT:    [[TMP2:%.*]] = sext <2 x i1> [[TMP1]] to <2 x i64>
+; CHECK-NEXT:    ret <2 x i64> [[TMP2]]
+;
+  %1 = tail call <2 x i64> @llvm.x86.xop.vpcomltuq(<2 x i64> %a, <2 x i64> %b)
+  ret <2 x i64> %1
+}
+
+define <2 x i64> @cmp_sle_v2i64(<2 x i64> %a, <2 x i64> %b) {
+; CHECK-LABEL: @cmp_sle_v2i64(
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp sle <2 x i64> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT:    [[TMP2:%.*]] = sext <2 x i1> [[TMP1]] to <2 x i64>
+; CHECK-NEXT:    ret <2 x i64> [[TMP2]]
+;
+  %1 = tail call <2 x i64> @llvm.x86.xop.vpcomleq(<2 x i64> %a, <2 x i64> %b)
+  ret <2 x i64> %1
+}
+
+define <2 x i64> @cmp_ule_v2i64(<2 x i64> %a, <2 x i64> %b) {
+; CHECK-LABEL: @cmp_ule_v2i64(
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ule <2 x i64> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT:    [[TMP2:%.*]] = sext <2 x i1> [[TMP1]] to <2 x i64>
+; CHECK-NEXT:    ret <2 x i64> [[TMP2]]
+;
+  %1 = tail call <2 x i64> @llvm.x86.xop.vpcomleuq(<2 x i64> %a, <2 x i64> %b)
+  ret <2 x i64> %1
+}
+
+define <4 x i32> @cmp_sgt_v4i32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: @cmp_sgt_v4i32(
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp sgt <4 x i32> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT:    [[TMP2:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i32>
+; CHECK-NEXT:    ret <4 x i32> [[TMP2]]
+;
+  %1 = tail call <4 x i32> @llvm.x86.xop.vpcomgtd(<4 x i32> %a, <4 x i32> %b)
+  ret <4 x i32> %1
+}
+
+define <4 x i32> @cmp_ugt_v4i32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: @cmp_ugt_v4i32(
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ugt <4 x i32> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT:    [[TMP2:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i32>
+; CHECK-NEXT:    ret <4 x i32> [[TMP2]]
+;
+  %1 = tail call <4 x i32> @llvm.x86.xop.vpcomgtud(<4 x i32> %a, <4 x i32> %b)
+  ret <4 x i32> %1
+}
+
+define <4 x i32> @cmp_sge_v4i32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: @cmp_sge_v4i32(
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp sge <4 x i32> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT:    [[TMP2:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i32>
+; CHECK-NEXT:    ret <4 x i32> [[TMP2]]
+;
+  %1 = tail call <4 x i32> @llvm.x86.xop.vpcomged(<4 x i32> %a, <4 x i32> %b)
+  ret <4 x i32> %1
+}
+
+define <4 x i32> @cmp_uge_v4i32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: @cmp_uge_v4i32(
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp uge <4 x i32> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT:    [[TMP2:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i32>
+; CHECK-NEXT:    ret <4 x i32> [[TMP2]]
+;
+  %1 = tail call <4 x i32> @llvm.x86.xop.vpcomgeud(<4 x i32> %a, <4 x i32> %b)
+  ret <4 x i32> %1
+}
+
+define <8 x i16> @cmp_seq_v8i16(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-LABEL: @cmp_seq_v8i16(
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp eq <8 x i16> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT:    [[TMP2:%.*]] = sext <8 x i1> [[TMP1]] to <8 x i16>
+; CHECK-NEXT:    ret <8 x i16> [[TMP2]]
+;
+  %1 = tail call <8 x i16> @llvm.x86.xop.vpcomeqw(<8 x i16> %a, <8 x i16> %b)
+  ret <8 x i16> %1
+}
+
+define <8 x i16> @cmp_ueq_v8i16(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-LABEL: @cmp_ueq_v8i16(
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp eq <8 x i16> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT:    [[TMP2:%.*]] = sext <8 x i1> [[TMP1]] to <8 x i16>
+; CHECK-NEXT:    ret <8 x i16> [[TMP2]]
+;
+  %1 = tail call <8 x i16> @llvm.x86.xop.vpcomequw(<8 x i16> %a, <8 x i16> %b)
+  ret <8 x i16> %1
+}
+
+define <8 x i16> @cmp_sne_v8i16(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-LABEL: @cmp_sne_v8i16(
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ne <8 x i16> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT:    [[TMP2:%.*]] = sext <8 x i1> [[TMP1]] to <8 x i16>
+; CHECK-NEXT:    ret <8 x i16> [[TMP2]]
+;
+  %1 = tail call <8 x i16> @llvm.x86.xop.vpcomnew(<8 x i16> %a, <8 x i16> %b)
+  ret <8 x i16> %1
+}
+
+define <8 x i16> @cmp_une_v8i16(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-LABEL: @cmp_une_v8i16(
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ne <8 x i16> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT:    [[TMP2:%.*]] = sext <8 x i1> [[TMP1]] to <8 x i16>
+; CHECK-NEXT:    ret <8 x i16> [[TMP2]]
+;
+  %1 = tail call <8 x i16> @llvm.x86.xop.vpcomneuw(<8 x i16> %a, <8 x i16> %b)
+  ret <8 x i16> %1
+}
+
+define <16 x i8> @cmp_strue_v16i8(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-LABEL: @cmp_strue_v16i8(
+; CHECK-NEXT:    ret <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+;
+  %1 = tail call <16 x i8> @llvm.x86.xop.vpcomtrueb(<16 x i8> %a, <16 x i8> %b)
+  ret <16 x i8> %1
+}
+
+define <16 x i8> @cmp_utrue_v16i8(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-LABEL: @cmp_utrue_v16i8(
+; CHECK-NEXT:    ret <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+;
+  %1 = tail call <16 x i8> @llvm.x86.xop.vpcomtrueub(<16 x i8> %a, <16 x i8> %b)
+  ret <16 x i8> %1
+}
+
+define <16 x i8> @cmp_sfalse_v16i8(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-LABEL: @cmp_sfalse_v16i8(
+; CHECK-NEXT:    ret <16 x i8> zeroinitializer
+;
+  %1 = tail call <16 x i8> @llvm.x86.xop.vpcomfalseb(<16 x i8> %a, <16 x i8> %b)
+  ret <16 x i8> %1
+}
+
+define <16 x i8> @cmp_ufalse_v16i8(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-LABEL: @cmp_ufalse_v16i8(
+; CHECK-NEXT:    ret <16 x i8> zeroinitializer
+;
+  %1 = tail call <16 x i8> @llvm.x86.xop.vpcomfalseub(<16 x i8> %a, <16 x i8> %b)
+  ret <16 x i8> %1
+}
+
+declare <2 x double> @llvm.x86.xop.vfrcz.sd(<2 x double>) nounwind readnone
+declare <4 x float> @llvm.x86.xop.vfrcz.ss(<4 x float>) nounwind readnone
+
+declare <16 x i8> @llvm.x86.xop.vpcomltb(<16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i16> @llvm.x86.xop.vpcomltw(<8 x i16>, <8 x i16>) nounwind readnone
+declare <4 x i32> @llvm.x86.xop.vpcomltd(<4 x i32>, <4 x i32>) nounwind readnone
+declare <2 x i64> @llvm.x86.xop.vpcomltq(<2 x i64>, <2 x i64>) nounwind readnone
+declare <16 x i8> @llvm.x86.xop.vpcomltub(<16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i16> @llvm.x86.xop.vpcomltuw(<8 x i16>, <8 x i16>) nounwind readnone
+declare <4 x i32> @llvm.x86.xop.vpcomltud(<4 x i32>, <4 x i32>) nounwind readnone
+declare <2 x i64> @llvm.x86.xop.vpcomltuq(<2 x i64>, <2 x i64>) nounwind readnone
+
+declare <16 x i8> @llvm.x86.xop.vpcomleb(<16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i16> @llvm.x86.xop.vpcomlew(<8 x i16>, <8 x i16>) nounwind readnone
+declare <4 x i32> @llvm.x86.xop.vpcomled(<4 x i32>, <4 x i32>) nounwind readnone
+declare <2 x i64> @llvm.x86.xop.vpcomleq(<2 x i64>, <2 x i64>) nounwind readnone
+declare <16 x i8> @llvm.x86.xop.vpcomleub(<16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i16> @llvm.x86.xop.vpcomleuw(<8 x i16>, <8 x i16>) nounwind readnone
+declare <4 x i32> @llvm.x86.xop.vpcomleud(<4 x i32>, <4 x i32>) nounwind readnone
+declare <2 x i64> @llvm.x86.xop.vpcomleuq(<2 x i64>, <2 x i64>) nounwind readnone
+
+declare <16 x i8> @llvm.x86.xop.vpcomgtb(<16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i16> @llvm.x86.xop.vpcomgtw(<8 x i16>, <8 x i16>) nounwind readnone
+declare <4 x i32> @llvm.x86.xop.vpcomgtd(<4 x i32>, <4 x i32>) nounwind readnone
+declare <2 x i64> @llvm.x86.xop.vpcomgtq(<2 x i64>, <2 x i64>) nounwind readnone
+declare <16 x i8> @llvm.x86.xop.vpcomgtub(<16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i16> @llvm.x86.xop.vpcomgtuw(<8 x i16>, <8 x i16>) nounwind readnone
+declare <4 x i32> @llvm.x86.xop.vpcomgtud(<4 x i32>, <4 x i32>) nounwind readnone
+declare <2 x i64> @llvm.x86.xop.vpcomgtuq(<2 x i64>, <2 x i64>) nounwind readnone
+
+declare <16 x i8> @llvm.x86.xop.vpcomgeb(<16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i16> @llvm.x86.xop.vpcomgew(<8 x i16>, <8 x i16>) nounwind readnone
+declare <4 x i32> @llvm.x86.xop.vpcomged(<4 x i32>, <4 x i32>) nounwind readnone
+declare <2 x i64> @llvm.x86.xop.vpcomgeq(<2 x i64>, <2 x i64>) nounwind readnone
+declare <16 x i8> @llvm.x86.xop.vpcomgeub(<16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i16> @llvm.x86.xop.vpcomgeuw(<8 x i16>, <8 x i16>) nounwind readnone
+declare <4 x i32> @llvm.x86.xop.vpcomgeud(<4 x i32>, <4 x i32>) nounwind readnone
+declare <2 x i64> @llvm.x86.xop.vpcomgeuq(<2 x i64>, <2 x i64>) nounwind readnone
+
+declare <16 x i8> @llvm.x86.xop.vpcomeqb(<16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i16> @llvm.x86.xop.vpcomeqw(<8 x i16>, <8 x i16>) nounwind readnone
+declare <4 x i32> @llvm.x86.xop.vpcomeqd(<4 x i32>, <4 x i32>) nounwind readnone
+declare <2 x i64> @llvm.x86.xop.vpcomeqq(<2 x i64>, <2 x i64>) nounwind readnone
+declare <16 x i8> @llvm.x86.xop.vpcomequb(<16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i16> @llvm.x86.xop.vpcomequw(<8 x i16>, <8 x i16>) nounwind readnone
+declare <4 x i32> @llvm.x86.xop.vpcomequd(<4 x i32>, <4 x i32>) nounwind readnone
+declare <2 x i64> @llvm.x86.xop.vpcomequq(<2 x i64>, <2 x i64>) nounwind readnone
+
+declare <16 x i8> @llvm.x86.xop.vpcomneb(<16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i16> @llvm.x86.xop.vpcomnew(<8 x i16>, <8 x i16>) nounwind readnone
+declare <4 x i32> @llvm.x86.xop.vpcomned(<4 x i32>, <4 x i32>) nounwind readnone
+declare <2 x i64> @llvm.x86.xop.vpcomneq(<2 x i64>, <2 x i64>) nounwind readnone
+declare <16 x i8> @llvm.x86.xop.vpcomneub(<16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i16> @llvm.x86.xop.vpcomneuw(<8 x i16>, <8 x i16>) nounwind readnone
+declare <4 x i32> @llvm.x86.xop.vpcomneud(<4 x i32>, <4 x i32>) nounwind readnone
+declare <2 x i64> @llvm.x86.xop.vpcomneuq(<2 x i64>, <2 x i64>) nounwind readnone
+
+declare <16 x i8> @llvm.x86.xop.vpcomfalseb(<16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i16> @llvm.x86.xop.vpcomfalsew(<8 x i16>, <8 x i16>) nounwind readnone
+declare <4 x i32> @llvm.x86.xop.vpcomfalsed(<4 x i32>, <4 x i32>) nounwind readnone
+declare <2 x i64> @llvm.x86.xop.vpcomfalseq(<2 x i64>, <2 x i64>) nounwind readnone
+declare <16 x i8> @llvm.x86.xop.vpcomfalseub(<16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i16> @llvm.x86.xop.vpcomfalseuw(<8 x i16>, <8 x i16>) nounwind readnone
+declare <4 x i32> @llvm.x86.xop.vpcomfalseud(<4 x i32>, <4 x i32>) nounwind readnone
+declare <2 x i64> @llvm.x86.xop.vpcomfalseuq(<2 x i64>, <2 x i64>) nounwind readnone
+
+declare <16 x i8> @llvm.x86.xop.vpcomtrueb(<16 x i8>, <16 x i8>) nounwind readnone
+declare <4 x i32> @llvm.x86.xop.vpcomtrued(<4 x i32>, <4 x i32>) nounwind readnone
+declare <2 x i64> @llvm.x86.xop.vpcomtrueq(<2 x i64>, <2 x i64>) nounwind readnone
+declare <8 x i16> @llvm.x86.xop.vpcomtruew(<8 x i16>, <8 x i16>) nounwind readnone
+declare <16 x i8> @llvm.x86.xop.vpcomtrueub(<16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i16> @llvm.x86.xop.vpcomtrueuw(<8 x i16>, <8 x i16>) nounwind readnone
+declare <4 x i32> @llvm.x86.xop.vpcomtrueud(<4 x i32>, <4 x i32>) nounwind readnone
+declare <2 x i64> @llvm.x86.xop.vpcomtrueuq(<2 x i64>, <2 x i64>) nounwind readnone
--- a/llvm/test/Transforms/InstCombine/bitcast-inseltpoison.ll
+++ b/llvm/test/Transforms/InstCombine/bitcast-inseltpoison.ll
@ -0,0 +1,573 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
+target triple = "x86_64-apple-darwin10.0.0"
+
+; Bitcasts between vectors and scalars are valid.
+; PR4487
+define i32 @test1(i64 %a) {
+; CHECK-LABEL: @test1(
+; CHECK-NEXT:    ret i32 0
+;
+  %t1 = bitcast i64 %a to <2 x i32>
+  %t2 = bitcast i64 %a to <2 x i32>
+  %t3 = xor <2 x i32> %t1, %t2
+  %t4 = extractelement <2 x i32> %t3, i32 0
+  ret i32 %t4
+}
+
+; Perform the bitwise logic in the source type of the operands to eliminate bitcasts.
+
+define <2 x i32> @xor_two_vector_bitcasts(<1 x i64> %a, <1 x i64> %b) {
+; CHECK-LABEL: @xor_two_vector_bitcasts(
+; CHECK-NEXT:    [[T31:%.*]] = xor <1 x i64> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT:    [[T3:%.*]] = bitcast <1 x i64> [[T31]] to <2 x i32>
+; CHECK-NEXT:    ret <2 x i32> [[T3]]
+;
+  %t1 = bitcast <1 x i64> %a to <2 x i32>
+  %t2 = bitcast <1 x i64> %b to <2 x i32>
+  %t3 = xor <2 x i32> %t1, %t2
+  ret <2 x i32> %t3
+}
+
+; No change. Bitcasts are canonicalized above bitwise logic.
+
+define <2 x i32> @xor_bitcast_vec_to_vec(<1 x i64> %a) {
+; CHECK-LABEL: @xor_bitcast_vec_to_vec(
+; CHECK-NEXT:    [[T1:%.*]] = bitcast <1 x i64> [[A:%.*]] to <2 x i32>
+; CHECK-NEXT:    [[T2:%.*]] = xor <2 x i32> [[T1]], <i32 1, i32 2>
+; CHECK-NEXT:    ret <2 x i32> [[T2]]
+;
+  %t1 = bitcast <1 x i64> %a to <2 x i32>
+  %t2 = xor <2 x i32> <i32 1, i32 2>, %t1
+  ret <2 x i32> %t2
+}
+
+; No change. Bitcasts are canonicalized above bitwise logic.
+
+define i64 @and_bitcast_vec_to_int(<2 x i32> %a) {
+; CHECK-LABEL: @and_bitcast_vec_to_int(
+; CHECK-NEXT:    [[T1:%.*]] = bitcast <2 x i32> [[A:%.*]] to i64
+; CHECK-NEXT:    [[T2:%.*]] = and i64 [[T1]], 3
+; CHECK-NEXT:    ret i64 [[T2]]
+;
+  %t1 = bitcast <2 x i32> %a to i64
+  %t2 = and i64 %t1, 3
+  ret i64 %t2
+}
+
+; No change. Bitcasts are canonicalized above bitwise logic.
+
+define <2 x i32> @or_bitcast_int_to_vec(i64 %a) {
+; CHECK-LABEL: @or_bitcast_int_to_vec(
+; CHECK-NEXT:    [[T1:%.*]] = bitcast i64 [[A:%.*]] to <2 x i32>
+; CHECK-NEXT:    [[T2:%.*]] = or <2 x i32> [[T1]], <i32 1, i32 2>
+; CHECK-NEXT:    ret <2 x i32> [[T2]]
+;
+  %t1 = bitcast i64 %a to <2 x i32>
+  %t2 = or <2 x i32> %t1, <i32 1, i32 2>
+  ret <2 x i32> %t2
+}
+
+; PR26702 - https://bugs.llvm.org//show_bug.cgi?id=26702
+; Bitcast is canonicalized above logic, so we can see the not-not pattern.
+
+define <2 x i64> @is_negative(<4 x i32> %x) {
+; CHECK-LABEL: @is_negative(
+; CHECK-NEXT:    [[LOBIT:%.*]] = ashr <4 x i32> [[X:%.*]], <i32 31, i32 31, i32 31, i32 31>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32> [[LOBIT]] to <2 x i64>
+; CHECK-NEXT:    ret <2 x i64> [[TMP1]]
+;
+  %lobit = ashr <4 x i32> %x, <i32 31, i32 31, i32 31, i32 31>
+  %not = xor <4 x i32> %lobit, <i32 -1, i32 -1, i32 -1, i32 -1>
+  %bc = bitcast <4 x i32> %not to <2 x i64>
+  %notnot = xor <2 x i64> %bc, <i64 -1, i64 -1>
+  ret <2 x i64> %notnot
+}
+
+; This variation has an extra bitcast at the end. This means that the 2nd xor
+; can be done in <4 x i32> to eliminate a bitcast regardless of canonicalizaion.
+
+define <4 x i32> @is_negative_bonus_bitcast(<4 x i32> %x) {
+; CHECK-LABEL: @is_negative_bonus_bitcast(
+; CHECK-NEXT:    [[LOBIT:%.*]] = ashr <4 x i32> [[X:%.*]], <i32 31, i32 31, i32 31, i32 31>
+; CHECK-NEXT:    ret <4 x i32> [[LOBIT]]
+;
+  %lobit = ashr <4 x i32> %x, <i32 31, i32 31, i32 31, i32 31>
+  %not = xor <4 x i32> %lobit, <i32 -1, i32 -1, i32 -1, i32 -1>
+  %bc = bitcast <4 x i32> %not to <2 x i64>
+  %notnot = xor <2 x i64> %bc, <i64 -1, i64 -1>
+  %bc2 = bitcast <2 x i64> %notnot to <4 x i32>
+  ret <4 x i32> %bc2
+}
+
+; Bitcasts are canonicalized above bitwise logic.
+
+define <2 x i8> @canonicalize_bitcast_logic_with_constant(<4 x i4> %x) {
+; CHECK-LABEL: @canonicalize_bitcast_logic_with_constant(
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i4> [[X:%.*]] to <2 x i8>
+; CHECK-NEXT:    [[B:%.*]] = and <2 x i8> [[TMP1]], <i8 -128, i8 -128>
+; CHECK-NEXT:    ret <2 x i8> [[B]]
+;
+  %a = and <4 x i4> %x, <i4 0, i4 8, i4 0, i4 8>
+  %b = bitcast <4 x i4> %a to <2 x i8>
+  ret <2 x i8> %b
+}
+
+; PR27925 - https://llvm.org/bugs/show_bug.cgi?id=27925
+
+define <4 x i32> @bitcasts_and_bitcast(<4 x i32> %a, <8 x i16> %b) {
+; CHECK-LABEL: @bitcasts_and_bitcast(
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i16> [[B:%.*]] to <4 x i32>
+; CHECK-NEXT:    [[BC3:%.*]] = and <4 x i32> [[TMP1]], [[A:%.*]]
+; CHECK-NEXT:    ret <4 x i32> [[BC3]]
+;
+  %bc1 = bitcast <4 x i32> %a to <2 x i64>
+  %bc2 = bitcast <8 x i16> %b to <2 x i64>
+  %and = and <2 x i64> %bc2, %bc1
+  %bc3 = bitcast <2 x i64> %and to <4 x i32>
+  ret <4 x i32> %bc3
+}
+
+; The destination must have an integer element type.
+; FIXME: We can still eliminate one bitcast in this test by doing the logic op
+; in the type of the input that has an integer element type.
+
+define <4 x float> @bitcasts_and_bitcast_to_fp(<4 x float> %a, <8 x i16> %b) {
+; CHECK-LABEL: @bitcasts_and_bitcast_to_fp(
+; CHECK-NEXT:    [[BC1:%.*]] = bitcast <4 x float> [[A:%.*]] to <2 x i64>
+; CHECK-NEXT:    [[BC2:%.*]] = bitcast <8 x i16> [[B:%.*]] to <2 x i64>
+; CHECK-NEXT:    [[AND:%.*]] = and <2 x i64> [[BC2]], [[BC1]]
+; CHECK-NEXT:    [[BC3:%.*]] = bitcast <2 x i64> [[AND]] to <4 x float>
+; CHECK-NEXT:    ret <4 x float> [[BC3]]
+;
+  %bc1 = bitcast <4 x float> %a to <2 x i64>
+  %bc2 = bitcast <8 x i16> %b to <2 x i64>
+  %and = and <2 x i64> %bc2, %bc1
+  %bc3 = bitcast <2 x i64> %and to <4 x float>
+  ret <4 x float> %bc3
+}
+
+; FIXME: Transform limited from changing vector op to integer op to avoid codegen problems.
+
+define i128 @bitcast_or_bitcast(i128 %a, <2 x i64> %b) {
+; CHECK-LABEL: @bitcast_or_bitcast(
+; CHECK-NEXT:    [[BC1:%.*]] = bitcast i128 [[A:%.*]] to <2 x i64>
+; CHECK-NEXT:    [[OR:%.*]] = or <2 x i64> [[BC1]], [[B:%.*]]
+; CHECK-NEXT:    [[BC2:%.*]] = bitcast <2 x i64> [[OR]] to i128
+; CHECK-NEXT:    ret i128 [[BC2]]
+;
+  %bc1 = bitcast i128 %a to <2 x i64>
+  %or = or <2 x i64> %b, %bc1
+  %bc2 = bitcast <2 x i64> %or to i128
+  ret i128 %bc2
+}
+
+; FIXME: Transform limited from changing integer op to vector op to avoid codegen problems.
+
+define <4 x i32> @bitcast_xor_bitcast(<4 x i32> %a, i128 %b) {
+; CHECK-LABEL: @bitcast_xor_bitcast(
+; CHECK-NEXT:    [[BC1:%.*]] = bitcast <4 x i32> [[A:%.*]] to i128
+; CHECK-NEXT:    [[XOR:%.*]] = xor i128 [[BC1]], [[B:%.*]]
+; CHECK-NEXT:    [[BC2:%.*]] = bitcast i128 [[XOR]] to <4 x i32>
+; CHECK-NEXT:    ret <4 x i32> [[BC2]]
+;
+  %bc1 = bitcast <4 x i32> %a to i128
+  %xor = xor i128 %bc1, %b
+  %bc2 = bitcast i128 %xor to <4 x i32>
+  ret <4 x i32> %bc2
+}
+
+; https://llvm.org/bugs/show_bug.cgi?id=6137#c6
+
+define <4 x float> @bitcast_vector_select(<4 x float> %x, <2 x i64> %y, <4 x i1> %cmp) {
+; CHECK-LABEL: @bitcast_vector_select(
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i64> [[Y:%.*]] to <4 x float>
+; CHECK-NEXT:    [[T7:%.*]] = select <4 x i1> [[CMP:%.*]], <4 x float> [[X:%.*]], <4 x float> [[TMP1]]
+; CHECK-NEXT:    ret <4 x float> [[T7]]
+;
+  %t4 = bitcast <4 x float> %x to <4 x i32>
+  %t5 = bitcast <2 x i64> %y to <4 x i32>
+  %t6 = select <4 x i1> %cmp, <4 x i32> %t4, <4 x i32> %t5
+  %t7 = bitcast <4 x i32> %t6 to <4 x float>
+  ret <4 x float> %t7
+}
+
+define float @bitcast_scalar_select_of_scalars(float %x, i32 %y, i1 %cmp) {
+; CHECK-LABEL: @bitcast_scalar_select_of_scalars(
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32 [[Y:%.*]] to float
+; CHECK-NEXT:    [[T7:%.*]] = select i1 [[CMP:%.*]], float [[X:%.*]], float [[TMP1]]
+; CHECK-NEXT:    ret float [[T7]]
+;
+  %t4 = bitcast float %x to i32
+  %t6 = select i1 %cmp, i32 %t4, i32 %y
+  %t7 = bitcast i32 %t6 to float
+  ret float %t7
+}
+
+; FIXME: We should change the select operand types to scalars, but we need to make
+; sure the backend can reverse that transform if needed.
+
+define float @bitcast_scalar_select_type_mismatch1(float %x, <4 x i8> %y, i1 %cmp) {
+; CHECK-LABEL: @bitcast_scalar_select_type_mismatch1(
+; CHECK-NEXT:    [[T4:%.*]] = bitcast float [[X:%.*]] to <4 x i8>
+; CHECK-NEXT:    [[T6:%.*]] = select i1 [[CMP:%.*]], <4 x i8> [[T4]], <4 x i8> [[Y:%.*]]
+; CHECK-NEXT:    [[T7:%.*]] = bitcast <4 x i8> [[T6]] to float
+; CHECK-NEXT:    ret float [[T7]]
+;
+  %t4 = bitcast float %x to <4 x i8>
+  %t6 = select i1 %cmp, <4 x i8> %t4, <4 x i8> %y
+  %t7 = bitcast <4 x i8> %t6 to float
+  ret float %t7
+}
+
+; FIXME: We should change the select operand types to vectors, but we need to make
+; sure the backend can reverse that transform if needed.
+
+define <4 x i8> @bitcast_scalar_select_type_mismatch2(<4 x i8> %x, float %y, i1 %cmp) {
+; CHECK-LABEL: @bitcast_scalar_select_type_mismatch2(
+; CHECK-NEXT:    [[T4:%.*]] = bitcast <4 x i8> [[X:%.*]] to float
+; CHECK-NEXT:    [[T6:%.*]] = select i1 [[CMP:%.*]], float [[T4]], float [[Y:%.*]]
+; CHECK-NEXT:    [[T7:%.*]] = bitcast float [[T6]] to <4 x i8>
+; CHECK-NEXT:    ret <4 x i8> [[T7]]
+;
+  %t4 = bitcast <4 x i8> %x to float
+  %t6 = select i1 %cmp, float %t4, float %y
+  %t7 = bitcast float %t6 to <4 x i8>
+  ret <4 x i8> %t7
+}
+
+define <4 x float> @bitcast_scalar_select_of_vectors(<4 x float> %x, <2 x i64> %y, i1 %cmp) {
+; CHECK-LABEL: @bitcast_scalar_select_of_vectors(
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i64> [[Y:%.*]] to <4 x float>
+; CHECK-NEXT:    [[T7:%.*]] = select i1 [[CMP:%.*]], <4 x float> [[X:%.*]], <4 x float> [[TMP1]]
+; CHECK-NEXT:    ret <4 x float> [[T7]]
+;
+  %t4 = bitcast <4 x float> %x to <4 x i32>
+  %t5 = bitcast <2 x i64> %y to <4 x i32>
+  %t6 = select i1 %cmp, <4 x i32> %t4, <4 x i32> %t5
+  %t7 = bitcast <4 x i32> %t6 to <4 x float>
+  ret <4 x float> %t7
+}
+
+; Can't change the type of the vector select if the dest type is scalar.
+
+define float @bitcast_vector_select_no_fold1(float %x, <2 x i16> %y, <4 x i1> %cmp) {
+; CHECK-LABEL: @bitcast_vector_select_no_fold1(
+; CHECK-NEXT:    [[T4:%.*]] = bitcast float [[X:%.*]] to <4 x i8>
+; CHECK-NEXT:    [[T5:%.*]] = bitcast <2 x i16> [[Y:%.*]] to <4 x i8>
+; CHECK-NEXT:    [[T6:%.*]] = select <4 x i1> [[CMP:%.*]], <4 x i8> [[T4]], <4 x i8> [[T5]]
+; CHECK-NEXT:    [[T7:%.*]] = bitcast <4 x i8> [[T6]] to float
+; CHECK-NEXT:    ret float [[T7]]
+;
+  %t4 = bitcast float %x to <4 x i8>
+  %t5 = bitcast <2 x i16> %y to <4 x i8>
+  %t6 = select <4 x i1> %cmp, <4 x i8> %t4, <4 x i8> %t5
+  %t7 = bitcast <4 x i8> %t6 to float
+  ret float %t7
+}
+
+; Can't change the type of the vector select if the number of elements in the dest type is not the same.
+
+define <2 x float> @bitcast_vector_select_no_fold2(<2 x float> %x, <4 x i16> %y, <8 x i1> %cmp) {
+; CHECK-LABEL: @bitcast_vector_select_no_fold2(
+; CHECK-NEXT:    [[T4:%.*]] = bitcast <2 x float> [[X:%.*]] to <8 x i8>
+; CHECK-NEXT:    [[T5:%.*]] = bitcast <4 x i16> [[Y:%.*]] to <8 x i8>
+; CHECK-NEXT:    [[T6:%.*]] = select <8 x i1> [[CMP:%.*]], <8 x i8> [[T4]], <8 x i8> [[T5]]
+; CHECK-NEXT:    [[T7:%.*]] = bitcast <8 x i8> [[T6]] to <2 x float>
+; CHECK-NEXT:    ret <2 x float> [[T7]]
+;
+  %t4 = bitcast <2 x float> %x to <8 x i8>
+  %t5 = bitcast <4 x i16> %y to <8 x i8>
+  %t6 = select <8 x i1> %cmp, <8 x i8> %t4, <8 x i8> %t5
+  %t7 = bitcast <8 x i8> %t6 to <2 x float>
+  ret <2 x float> %t7
+}
+
+; Optimize bitcasts that are extracting low element of vector.  This happens because of SRoA.
+; rdar://7892780
+define float @test2(<2 x float> %A, <2 x i32> %B) {
+; CHECK-LABEL: @test2(
+; CHECK-NEXT:    [[TMP24:%.*]] = extractelement <2 x float> [[A:%.*]], i32 0
+; CHECK-NEXT:    [[BC:%.*]] = bitcast <2 x i32> [[B:%.*]] to <2 x float>
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x float> [[BC]], i32 0
+; CHECK-NEXT:    [[ADD:%.*]] = fadd float [[TMP24]], [[TMP4]]
+; CHECK-NEXT:    ret float [[ADD]]
+;
+  %tmp28 = bitcast <2 x float> %A to i64  ; <i64> [#uses=2]
+  %tmp23 = trunc i64 %tmp28 to i32                ; <i32> [#uses=1]
+  %tmp24 = bitcast i32 %tmp23 to float            ; <float> [#uses=1]
+
+  %tmp = bitcast <2 x i32> %B to i64
+  %tmp2 = trunc i64 %tmp to i32                ; <i32> [#uses=1]
+  %tmp4 = bitcast i32 %tmp2 to float            ; <float> [#uses=1]
+
+  %add = fadd float %tmp24, %tmp4
+  ret float %add
+}
+
+; Optimize bitcasts that are extracting other elements of a vector.  This happens because of SRoA.
+; rdar://7892780
+define float @test3(<2 x float> %A, <2 x i64> %B) {
+; CHECK-LABEL: @test3(
+; CHECK-NEXT:    [[TMP24:%.*]] = extractelement <2 x float> [[A:%.*]], i32 1
+; CHECK-NEXT:    [[BC2:%.*]] = bitcast <2 x i64> [[B:%.*]] to <4 x float>
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[BC2]], i32 2
+; CHECK-NEXT:    [[ADD:%.*]] = fadd float [[TMP24]], [[TMP4]]
+; CHECK-NEXT:    ret float [[ADD]]
+;
+  %tmp28 = bitcast <2 x float> %A to i64
+  %tmp29 = lshr i64 %tmp28, 32
+  %tmp23 = trunc i64 %tmp29 to i32
+  %tmp24 = bitcast i32 %tmp23 to float
+
+  %tmp = bitcast <2 x i64> %B to i128
+  %tmp1 = lshr i128 %tmp, 64
+  %tmp2 = trunc i128 %tmp1 to i32
+  %tmp4 = bitcast i32 %tmp2 to float
+
+  %add = fadd float %tmp24, %tmp4
+  ret float %add
+}
+
+; Both bitcasts are unnecessary; change the extractelement.
+
+define float @bitcast_extelt1(<2 x float> %A) {
+; CHECK-LABEL: @bitcast_extelt1(
+; CHECK-NEXT:    [[BC2:%.*]] = extractelement <2 x float> [[A:%.*]], i32 0
+; CHECK-NEXT:    ret float [[BC2]]
+;
+  %bc1 = bitcast <2 x float> %A to <2 x i32>
+  %ext = extractelement <2 x i32> %bc1, i32 0
+  %bc2 = bitcast i32 %ext to float
+  ret float %bc2
+}
+
+; Second bitcast can be folded into the first.
+
+define i64 @bitcast_extelt2(<4 x float> %A) {
+; CHECK-LABEL: @bitcast_extelt2(
+; CHECK-NEXT:    [[BC:%.*]] = bitcast <4 x float> [[A:%.*]] to <2 x i64>
+; CHECK-NEXT:    [[BC2:%.*]] = extractelement <2 x i64> [[BC]], i32 1
+; CHECK-NEXT:    ret i64 [[BC2]]
+;
+  %bc1 = bitcast <4 x float> %A to <2 x double>
+  %ext = extractelement <2 x double> %bc1, i32 1
+  %bc2 = bitcast double %ext to i64
+  ret i64 %bc2
+}
+
+; TODO: This should return %A.
+
+define <2 x i32> @bitcast_extelt3(<2 x i32> %A) {
+; CHECK-LABEL: @bitcast_extelt3(
+; CHECK-NEXT:    [[BC1:%.*]] = bitcast <2 x i32> [[A:%.*]] to <1 x i64>
+; CHECK-NEXT:    [[EXT:%.*]] = extractelement <1 x i64> [[BC1]], i32 0
+; CHECK-NEXT:    [[BC2:%.*]] = bitcast i64 [[EXT]] to <2 x i32>
+; CHECK-NEXT:    ret <2 x i32> [[BC2]]
+;
+  %bc1 = bitcast <2 x i32> %A to <1 x i64>
+  %ext = extractelement <1 x i64> %bc1, i32 0
+  %bc2 = bitcast i64 %ext to <2 x i32>
+  ret <2 x i32> %bc2
+}
+
+; Handle the case where the input is not a vector.
+
+define double @bitcast_extelt4(i128 %A) {
+; CHECK-LABEL: @bitcast_extelt4(
+; CHECK-NEXT:    [[BC:%.*]] = bitcast i128 [[A:%.*]] to <2 x double>
+; CHECK-NEXT:    [[BC2:%.*]] = extractelement <2 x double> [[BC]], i32 0
+; CHECK-NEXT:    ret double [[BC2]]
+;
+  %bc1 = bitcast i128 %A to <2 x i64>
+  %ext = extractelement <2 x i64> %bc1, i32 0
+  %bc2 = bitcast i64 %ext to double
+  ret double %bc2
+}
+
+define <2 x i32> @test4(i32 %A, i32 %B){
+; CHECK-LABEL: @test4(
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x i32> undef, i32 [[A:%.*]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[B:%.*]], i32 1
+; CHECK-NEXT:    ret <2 x i32> [[TMP2]]
+;
+  %tmp38 = zext i32 %A to i64
+  %tmp32 = zext i32 %B to i64
+  %tmp33 = shl i64 %tmp32, 32
+  %ins35 = or i64 %tmp33, %tmp38
+  %tmp43 = bitcast i64 %ins35 to <2 x i32>
+  ret <2 x i32> %tmp43
+}
+
+; rdar://8360454
+define <2 x float> @test5(float %A, float %B) {
+; CHECK-LABEL: @test5(
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x float> undef, float [[A:%.*]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x float> [[TMP1]], float [[B:%.*]], i32 1
+; CHECK-NEXT:    ret <2 x float> [[TMP2]]
+;
+  %tmp37 = bitcast float %A to i32
+  %tmp38 = zext i32 %tmp37 to i64
+  %tmp31 = bitcast float %B to i32
+  %tmp32 = zext i32 %tmp31 to i64
+  %tmp33 = shl i64 %tmp32, 32
+  %ins35 = or i64 %tmp33, %tmp38
+  %tmp43 = bitcast i64 %ins35 to <2 x float>
+  ret <2 x float> %tmp43
+}
+
+define <2 x float> @test6(float %A){
+; CHECK-LABEL: @test6(
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x float> <float 4.200000e+01, float undef>, float [[A:%.*]], i32 1
+; CHECK-NEXT:    ret <2 x float> [[TMP1]]
+;
+  %tmp23 = bitcast float %A to i32
+  %tmp24 = zext i32 %tmp23 to i64
+  %tmp25 = shl i64 %tmp24, 32
+  %mask20 = or i64 %tmp25, 1109917696
+  %tmp35 = bitcast i64 %mask20 to <2 x float>
+  ret <2 x float> %tmp35
+}
+
+define i64 @ISPC0(i64 %in) {
+; CHECK-LABEL: @ISPC0(
+; CHECK-NEXT:    ret i64 0
+;
+  %out = and i64 %in, xor (i64 bitcast (<4 x i16> <i16 -1, i16 -1, i16 -1, i16 -1> to i64), i64 -1)
+  ret i64 %out
+}
+
+
+define i64 @Vec2(i64 %in) {
+; CHECK-LABEL: @Vec2(
+; CHECK-NEXT:    ret i64 0
+;
+  %out = and i64 %in, xor (i64 bitcast (<4 x i16> <i16 0, i16 0, i16 0, i16 0> to i64), i64 0)
+  ret i64 %out
+}
+
+define i64 @All11(i64 %in) {
+; CHECK-LABEL: @All11(
+; CHECK-NEXT:    ret i64 0
+;
+  %out = and i64 %in, xor (i64 bitcast (<2 x float> bitcast (i64 -1 to <2 x float>) to i64), i64 -1)
+  ret i64 %out
+}
+
+
+define i32 @All111(i32 %in) {
+; CHECK-LABEL: @All111(
+; CHECK-NEXT:    ret i32 0
+;
+  %out = and i32 %in, xor (i32 bitcast (<1 x float> bitcast (i32 -1 to <1 x float>) to i32), i32 -1)
+  ret i32 %out
+}
+
+define <2 x i16> @BitcastInsert(i32 %a) {
+; CHECK-LABEL: @BitcastInsert(
+; CHECK-NEXT:    [[R:%.*]] = bitcast i32 [[A:%.*]] to <2 x i16>
+; CHECK-NEXT:    ret <2 x i16> [[R]]
+;
+  %v = insertelement <1 x i32> poison, i32 %a, i32 0
+  %r = bitcast <1 x i32> %v to <2 x i16>
+  ret <2 x i16> %r
+}
+
+; PR17293
+define <2 x i64> @test7(<2 x i8*>* %arg) nounwind {
+; CHECK-LABEL: @test7(
+; CHECK-NEXT:    [[CAST:%.*]] = bitcast <2 x i8*>* [[ARG:%.*]] to <2 x i64>*
+; CHECK-NEXT:    [[LOAD:%.*]] = load <2 x i64>, <2 x i64>* [[CAST]], align 16
+; CHECK-NEXT:    ret <2 x i64> [[LOAD]]
+;
+  %cast = bitcast <2 x i8*>* %arg to <2 x i64>*
+  %load = load <2 x i64>, <2 x i64>* %cast, align 16
+  ret <2 x i64> %load
+}
+
+define i8 @test8() {
+; CHECK-LABEL: @test8(
+; CHECK-NEXT:    ret i8 -85
+;
+  %res = bitcast <8 x i1> <i1 true, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true> to i8
+  ret i8 %res
+}
+
+@g = internal unnamed_addr global i32 undef
+
+define void @constant_fold_vector_to_double() {
+; CHECK-LABEL: @constant_fold_vector_to_double(
+; CHECK-NEXT:    store volatile double 1.000000e+00, double* undef, align 8
+; CHECK-NEXT:    store volatile double 1.000000e+00, double* undef, align 8
+; CHECK-NEXT:    store volatile double 1.000000e+00, double* undef, align 8
+; CHECK-NEXT:    store volatile double 1.000000e+00, double* undef, align 8
+; CHECK-NEXT:    store volatile double 0xFFFFFFFFFFFFFFFF, double* undef, align 8
+; CHECK-NEXT:    store volatile double 0x162E000004D2, double* undef, align 8
+; CHECK-NEXT:    store volatile double bitcast (<2 x i32> <i32 1234, i32 ptrtoint (i32* @g to i32)> to double), double* undef, align 8
+; CHECK-NEXT:    store volatile double 0x400000003F800000, double* undef, align 8
+; CHECK-NEXT:    store volatile double 0.000000e+00, double* undef, align 8
+; CHECK-NEXT:    store volatile double 0.000000e+00, double* undef, align 8
+; CHECK-NEXT:    store volatile double 0.000000e+00, double* undef, align 8
+; CHECK-NEXT:    store volatile double 0.000000e+00, double* undef, align 8
+; CHECK-NEXT:    store volatile double 0.000000e+00, double* undef, align 8
+; CHECK-NEXT:    store volatile double 0.000000e+00, double* undef, align 8
+; CHECK-NEXT:    ret void
+;
+  store volatile double bitcast (<1 x i64> <i64 4607182418800017408> to double), double* undef
+  store volatile double bitcast (<2 x i32> <i32 0, i32 1072693248> to double), double* undef
+  store volatile double bitcast (<4 x i16> <i16 0, i16 0, i16 0, i16 16368> to double), double* undef
+  store volatile double bitcast (<8 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 240, i8 63> to double), double* undef
+
+  store volatile double bitcast (<2 x i32> <i32 -1, i32 -1> to double), double* undef
+  store volatile double bitcast (<2 x i32> <i32 1234, i32 5678> to double), double* undef
+
+  store volatile double bitcast (<2 x i32> <i32 1234, i32 ptrtoint (i32* @g to i32)> to double), double* undef
+  store volatile double bitcast (<2 x float> <float 1.0, float 2.0> to double), double* undef
+
+  store volatile double bitcast (<2 x i32> zeroinitializer to double), double* undef
+  store volatile double bitcast (<4 x i16> zeroinitializer to double), double* undef
+  store volatile double bitcast (<8 x i8> zeroinitializer to double), double* undef
+  store volatile double bitcast (<16 x i4> zeroinitializer to double), double* undef
+  store volatile double bitcast (<32 x i2> zeroinitializer to double), double* undef
+  store volatile double bitcast (<64 x i1> zeroinitializer to double), double* undef
+  ret void
+}
+
+define void @constant_fold_vector_to_float() {
+; CHECK-LABEL: @constant_fold_vector_to_float(
+; CHECK-NEXT:    store volatile float 1.000000e+00, float* undef, align 4
+; CHECK-NEXT:    store volatile float 1.000000e+00, float* undef, align 4
+; CHECK-NEXT:    store volatile float 1.000000e+00, float* undef, align 4
+; CHECK-NEXT:    store volatile float 1.000000e+00, float* undef, align 4
+; CHECK-NEXT:    ret void
+;
+  store volatile float bitcast (<1 x i32> <i32 1065353216> to float), float* undef
+  store volatile float bitcast (<2 x i16> <i16 0, i16 16256> to float), float* undef
+  store volatile float bitcast (<4 x i8> <i8 0, i8 0, i8 128, i8 63> to float), float* undef
+  store volatile float bitcast (<32 x i1> <i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 0> to float), float* undef
+
+  ret void
+}
+
+define void @constant_fold_vector_to_half() {
+; CHECK-LABEL: @constant_fold_vector_to_half(
+; CHECK-NEXT:    store volatile half 0xH4000, half* undef, align 2
+; CHECK-NEXT:    store volatile half 0xH4000, half* undef, align 2
+; CHECK-NEXT:    ret void
+;
+  store volatile half bitcast (<2 x i8> <i8 0, i8 64> to half), half* undef
+  store volatile half bitcast (<4 x i4> <i4 0, i4 0, i4 0, i4 4> to half), half* undef
+  ret void
+}
+
+; Ensure that we do not crash when looking at such a weird bitcast.
+define i8* @bitcast_from_single_element_pointer_vector_to_pointer(<1 x i8*> %ptrvec) {
+; CHECK-LABEL: @bitcast_from_single_element_pointer_vector_to_pointer(
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <1 x i8*> [[PTRVEC:%.*]], i32 0
+; CHECK-NEXT:    ret i8* [[TMP1]]
+;
+  %ptr = bitcast <1 x i8*> %ptrvec to i8*
+  ret i8* %ptr
+}
--- a/llvm/test/Transforms/InstCombine/bitcast-vec-canon-inseltpoison.ll
+++ b/llvm/test/Transforms/InstCombine/bitcast-vec-canon-inseltpoison.ll
@ -0,0 +1,167 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+define double @a(<1 x i64> %y) {
+; CHECK-LABEL: @a(
+; CHECK-NEXT:    [[BC:%.*]] = bitcast <1 x i64> [[Y:%.*]] to <1 x double>
+; CHECK-NEXT:    [[C:%.*]] = extractelement <1 x double> [[BC]], i32 0
+; CHECK-NEXT:    ret double [[C]]
+;
+  %c = bitcast <1 x i64> %y to double
+  ret double %c
+}
+
+define i64 @b(<1 x i64> %y) {
+; CHECK-LABEL: @b(
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <1 x i64> [[Y:%.*]], i32 0
+; CHECK-NEXT:    ret i64 [[TMP1]]
+;
+  %c = bitcast <1 x i64> %y to i64
+  ret i64 %c
+}
+
+define <1 x i64> @c(double %y) {
+; CHECK-LABEL: @c(
+; CHECK-NEXT:    [[C:%.*]] = bitcast double [[Y:%.*]] to <1 x i64>
+; CHECK-NEXT:    ret <1 x i64> [[C]]
+;
+  %c = bitcast double %y to <1 x i64>
+  ret <1 x i64> %c
+}
+
+define <1 x i64> @d(i64 %y) {
+; CHECK-LABEL: @d(
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <1 x i64> undef, i64 [[Y:%.*]], i32 0
+; CHECK-NEXT:    ret <1 x i64> [[TMP1]]
+;
+  %c = bitcast i64 %y to <1 x i64>
+  ret <1 x i64> %c
+}
+
+define x86_mmx @e(<1 x i64> %y) {
+; CHECK-LABEL: @e(
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <1 x i64> [[Y:%.*]], i32 0
+; CHECK-NEXT:    [[C:%.*]] = bitcast i64 [[TMP1]] to x86_mmx
+; CHECK-NEXT:    ret x86_mmx [[C]]
+;
+  %c = bitcast <1 x i64> %y to x86_mmx
+  ret x86_mmx %c
+}
+
+define <1 x i64> @f(x86_mmx %y) {
+; CHECK-LABEL: @f(
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast x86_mmx [[Y:%.*]] to i64
+; CHECK-NEXT:    [[C:%.*]] = insertelement <1 x i64> undef, i64 [[TMP1]], i32 0
+; CHECK-NEXT:    ret <1 x i64> [[C]]
+;
+  %c = bitcast x86_mmx %y to <1 x i64>
+  ret <1 x i64> %c
+}
+
+define double @g(x86_mmx %x) {
+; CHECK-LABEL: @g(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast x86_mmx [[X:%.*]] to double
+; CHECK-NEXT:    ret double [[TMP0]]
+;
+entry:
+  %0 = bitcast x86_mmx %x to <1 x i64>
+  %1 = bitcast <1 x i64> %0 to double
+  ret double %1
+}
+
+; FP source is ok.
+
+define <3 x i64> @bitcast_inselt_undef(double %x, i32 %idx) {
+; CHECK-LABEL: @bitcast_inselt_undef(
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <3 x double> undef, double [[X:%.*]], i32 [[IDX:%.*]]
+; CHECK-NEXT:    [[I:%.*]] = bitcast <3 x double> [[TMP1]] to <3 x i64>
+; CHECK-NEXT:    ret <3 x i64> [[I]]
+;
+  %xb = bitcast double %x to i64
+  %i = insertelement <3 x i64> poison, i64 %xb, i32 %idx
+  ret <3 x i64> %i
+}
+
+; Integer source is ok; index is anything.
+
+define <3 x float> @bitcast_inselt_undef_fp(i32 %x, i567 %idx) {
+; CHECK-LABEL: @bitcast_inselt_undef_fp(
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <3 x i32> undef, i32 [[X:%.*]], i567 [[IDX:%.*]]
+; CHECK-NEXT:    [[I:%.*]] = bitcast <3 x i32> [[TMP1]] to <3 x float>
+; CHECK-NEXT:    ret <3 x float> [[I]]
+;
+  %xb = bitcast i32 %x to float
+  %i = insertelement <3 x float> poison, float %xb, i567 %idx
+  ret <3 x float> %i
+}
+
+define <vscale x 3 x float> @bitcast_inselt_undef_vscale(i32 %x, i567 %idx) {
+; CHECK-LABEL: @bitcast_inselt_undef_vscale(
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <vscale x 3 x i32> undef, i32 [[X:%.*]], i567 [[IDX:%.*]]
+; CHECK-NEXT:    [[I:%.*]] = bitcast <vscale x 3 x i32> [[TMP1]] to <vscale x 3 x float>
+; CHECK-NEXT:    ret <vscale x 3 x float> [[I]]
+;
+  %xb = bitcast i32 %x to float
+  %i = insertelement <vscale x 3 x float> poison, float %xb, i567 %idx
+  ret <vscale x 3 x float> %i
+}
+
+declare void @use(i64)
+
+; Negative test - extra use prevents canonicalization
+
+define <3 x i64> @bitcast_inselt_undef_extra_use(double %x, i32 %idx) {
+; CHECK-LABEL: @bitcast_inselt_undef_extra_use(
+; CHECK-NEXT:    [[XB:%.*]] = bitcast double [[X:%.*]] to i64
+; CHECK-NEXT:    call void @use(i64 [[XB]])
+; CHECK-NEXT:    [[I:%.*]] = insertelement <3 x i64> poison, i64 [[XB]], i32 [[IDX:%.*]]
+; CHECK-NEXT:    ret <3 x i64> [[I]]
+;
+  %xb = bitcast double %x to i64
+  call void @use(i64 %xb)
+  %i = insertelement <3 x i64> poison, i64 %xb, i32 %idx
+  ret <3 x i64> %i
+}
+
+; Negative test - source type must be scalar
+
+define <3 x i64> @bitcast_inselt_undef_vec_src(<2 x i32> %x, i32 %idx) {
+; CHECK-LABEL: @bitcast_inselt_undef_vec_src(
+; CHECK-NEXT:    [[XB:%.*]] = bitcast <2 x i32> [[X:%.*]] to i64
+; CHECK-NEXT:    [[I:%.*]] = insertelement <3 x i64> poison, i64 [[XB]], i32 [[IDX:%.*]]
+; CHECK-NEXT:    ret <3 x i64> [[I]]
+;
+  %xb = bitcast <2 x i32> %x to i64
+  %i = insertelement <3 x i64> poison, i64 %xb, i32 %idx
+  ret <3 x i64> %i
+}
+
+; Negative test - source type must be scalar
+
+define <3 x i64> @bitcast_inselt_undef_from_mmx(x86_mmx %x, i32 %idx) {
+; CHECK-LABEL: @bitcast_inselt_undef_from_mmx(
+; CHECK-NEXT:    [[XB:%.*]] = bitcast x86_mmx [[X:%.*]] to i64
+; CHECK-NEXT:    [[I:%.*]] = insertelement <3 x i64> poison, i64 [[XB]], i32 [[IDX:%.*]]
+; CHECK-NEXT:    ret <3 x i64> [[I]]
+;
+  %xb = bitcast x86_mmx %x to i64
+  %i = insertelement <3 x i64> poison, i64 %xb, i32 %idx
+  ret <3 x i64> %i
+}
+
+; Reduce number of casts
+
+define <2 x i64> @PR45748(double %x, double %y) {
+; CHECK-LABEL: @PR45748(
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x double> undef, double [[X:%.*]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[Y:%.*]], i32 1
+; CHECK-NEXT:    [[I1:%.*]] = bitcast <2 x double> [[TMP2]] to <2 x i64>
+; CHECK-NEXT:    ret <2 x i64> [[I1]]
+;
+  %xb = bitcast double %x to i64
+  %i0 = insertelement <2 x i64> poison, i64 %xb, i32 0
+  %yb = bitcast double %y to i64
+  %i1 = insertelement <2 x i64> %i0, i64 %yb, i32 1
+  ret <2 x i64> %i1
+}
--- a/llvm/test/Transforms/InstCombine/broadcast-inseltpoison.ll
+++ b/llvm/test/Transforms/InstCombine/broadcast-inseltpoison.ll
@ -0,0 +1,179 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -instcombine -S < %s | FileCheck %s
+
+define <4 x float> @good1(float %arg) {
+; CHECK-LABEL: @good1(
+; CHECK-NEXT:    [[T:%.*]] = insertelement <4 x float> poison, float [[ARG:%.*]], i32 0
+; CHECK-NEXT:    [[T6:%.*]] = shufflevector <4 x float> [[T]], <4 x float> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT:    ret <4 x float> [[T6]]
+;
+  %t = insertelement <4 x float> poison, float %arg, i32 0
+  %t4 = insertelement <4 x float> %t, float %arg, i32 1
+  %t5 = insertelement <4 x float> %t4, float %arg, i32 2
+  %t6 = insertelement <4 x float> %t5, float %arg, i32 3
+  ret <4 x float> %t6
+}
+
+define <4 x float> @good2(float %arg) {
+; CHECK-LABEL: @good2(
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x float> undef, float [[ARG:%.*]], i32 0
+; CHECK-NEXT:    [[T6:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT:    ret <4 x float> [[T6]]
+;
+  %t = insertelement <4 x float> poison, float %arg, i32 1
+  %t4 = insertelement <4 x float> %t, float %arg, i32 2
+  %t5 = insertelement <4 x float> %t4, float %arg, i32 0
+  %t6 = insertelement <4 x float> %t5, float %arg, i32 3
+  ret <4 x float> %t6
+}
+
+define <4 x float> @good3(float %arg) {
+; CHECK-LABEL: @good3(
+; CHECK-NEXT:    [[T:%.*]] = insertelement <4 x float> undef, float [[ARG:%.*]], i32 0
+; CHECK-NEXT:    [[T6:%.*]] = shufflevector <4 x float> [[T]], <4 x float> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT:    ret <4 x float> [[T6]]
+;
+  %t = insertelement <4 x float> zeroinitializer, float %arg, i32 0
+  %t4 = insertelement <4 x float> %t, float %arg, i32 1
+  %t5 = insertelement <4 x float> %t4, float %arg, i32 2
+  %t6 = insertelement <4 x float> %t5, float %arg, i32 3
+  ret <4 x float> %t6
+}
+
+define <4 x float> @good4(float %arg) {
+; CHECK-LABEL: @good4(
+; CHECK-NEXT:    [[T:%.*]] = insertelement <4 x float> undef, float [[ARG:%.*]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = fadd <4 x float> [[T]], [[T]]
+; CHECK-NEXT:    [[T7:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT:    ret <4 x float> [[T7]]
+;
+  %t = insertelement <4 x float> zeroinitializer, float %arg, i32 0
+  %t4 = insertelement <4 x float> %t, float %arg, i32 1
+  %t5 = insertelement <4 x float> %t4, float %arg, i32 2
+  %t6 = insertelement <4 x float> %t5, float %arg, i32 3
+  %t7 = fadd <4 x float> %t6, %t6
+  ret <4 x float> %t7
+}
+
+define <4 x float> @good5(float %v) {
+; CHECK-LABEL: @good5(
+; CHECK-NEXT:    [[INS1:%.*]] = insertelement <4 x float> poison, float [[V:%.*]], i32 0
+; CHECK-NEXT:    [[A1:%.*]] = fadd <4 x float> [[INS1]], [[INS1]]
+; CHECK-NEXT:    [[INS4:%.*]] = shufflevector <4 x float> [[INS1]], <4 x float> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[RES:%.*]] = fadd <4 x float> [[A1]], [[INS4]]
+; CHECK-NEXT:    ret <4 x float> [[RES]]
+;
+  %ins1 = insertelement <4 x float> poison, float %v, i32 0
+  %a1 = fadd <4 x float> %ins1, %ins1
+  %ins2 = insertelement<4 x float> %ins1, float %v, i32 1
+  %ins3 = insertelement<4 x float> %ins2, float %v, i32 2
+  %ins4 = insertelement<4 x float> %ins3, float %v, i32 3
+  %res = fadd <4 x float> %a1, %ins4
+  ret <4 x float> %res
+}
+
+; The insert is changed to allow the canonical shuffle-splat pattern from element 0.
+
+define <4 x float> @splat_undef1(float %arg) {
+; CHECK-LABEL: @splat_undef1(
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x float> undef, float [[ARG:%.*]], i32 0
+; CHECK-NEXT:    [[T6:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> undef, <4 x i32> <i32 undef, i32 0, i32 0, i32 0>
+; CHECK-NEXT:    ret <4 x float> [[T6]]
+;
+  %t = insertelement <4 x float> poison, float %arg, i32 1
+  %t4 = insertelement <4 x float> %t, float %arg, i32 1
+  %t5 = insertelement <4 x float> %t4, float %arg, i32 2
+  %t6 = insertelement <4 x float> %t5, float %arg, i32 3
+  ret <4 x float> %t6
+}
+
+; Re-uses the existing first insertelement.
+
+define <4 x float> @splat_undef2(float %arg) {
+; CHECK-LABEL: @splat_undef2(
+; CHECK-NEXT:    [[T:%.*]] = insertelement <4 x float> poison, float [[ARG:%.*]], i32 0
+; CHECK-NEXT:    [[T6:%.*]] = shufflevector <4 x float> [[T]], <4 x float> undef, <4 x i32> <i32 0, i32 undef, i32 0, i32 0>
+; CHECK-NEXT:    ret <4 x float> [[T6]]
+;
+  %t = insertelement <4 x float> poison, float %arg, i32 0
+  %t5 = insertelement <4 x float> %t, float %arg, i32 2
+  %t6 = insertelement <4 x float> %t5, float %arg, i32 3
+  ret <4 x float> %t6
+}
+
+define <4 x float> @bad3(float %arg, float %arg2) {
+; CHECK-LABEL: @bad3(
+; CHECK-NEXT:    [[T:%.*]] = insertelement <4 x float> poison, float [[ARG:%.*]], i32 0
+; CHECK-NEXT:    [[T4:%.*]] = insertelement <4 x float> [[T]], float [[ARG2:%.*]], i32 1
+; CHECK-NEXT:    [[T5:%.*]] = insertelement <4 x float> [[T4]], float [[ARG]], i32 2
+; CHECK-NEXT:    [[T6:%.*]] = insertelement <4 x float> [[T5]], float [[ARG]], i32 3
+; CHECK-NEXT:    ret <4 x float> [[T6]]
+;
+  %t = insertelement <4 x float> poison, float %arg, i32 0
+  %t4 = insertelement <4 x float> %t, float %arg2, i32 1
+  %t5 = insertelement <4 x float> %t4, float %arg, i32 2
+  %t6 = insertelement <4 x float> %t5, float %arg, i32 3
+  ret <4 x float> %t6
+}
+
+define <1 x float> @bad4(float %arg) {
+; CHECK-LABEL: @bad4(
+; CHECK-NEXT:    [[T:%.*]] = insertelement <1 x float> poison, float [[ARG:%.*]], i32 0
+; CHECK-NEXT:    ret <1 x float> [[T]]
+;
+  %t = insertelement <1 x float> poison, float %arg, i32 0
+  ret <1 x float> %t
+}
+
+; Multiple undef elements are ok.
+; TODO: Multiple uses triggers the transform at %t4, but we should sink/scalarize/CSE the splats?
+
+define <4 x float> @splat_undef3(float %arg) {
+; CHECK-LABEL: @splat_undef3(
+; CHECK-NEXT:    [[T:%.*]] = insertelement <4 x float> poison, float [[ARG:%.*]], i32 0
+; CHECK-NEXT:    [[T4:%.*]] = shufflevector <4 x float> [[T]], <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 undef, i32 undef>
+; CHECK-NEXT:    [[T6:%.*]] = shufflevector <4 x float> [[T]], <4 x float> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[T7:%.*]] = fadd <4 x float> [[T6]], [[T4]]
+; CHECK-NEXT:    ret <4 x float> [[T7]]
+;
+  %t = insertelement <4 x float> poison, float %arg, i32 0
+  %t4 = insertelement <4 x float> %t, float %arg, i32 1
+  %t5 = insertelement <4 x float> %t4, float %arg, i32 2
+  %t6 = insertelement <4 x float> %t5, float %arg, i32 3
+  %t7 = fadd <4 x float> %t6, %t4
+  ret <4 x float> %t7
+}
+
+define <4 x float> @bad6(float %arg, i32 %k) {
+; CHECK-LABEL: @bad6(
+; CHECK-NEXT:    [[T:%.*]] = insertelement <4 x float> poison, float [[ARG:%.*]], i32 0
+; CHECK-NEXT:    [[T4:%.*]] = insertelement <4 x float> [[T]], float [[ARG]], i32 1
+; CHECK-NEXT:    [[T5:%.*]] = insertelement <4 x float> [[T4]], float [[ARG]], i32 [[K:%.*]]
+; CHECK-NEXT:    [[T6:%.*]] = insertelement <4 x float> [[T5]], float [[ARG]], i32 3
+; CHECK-NEXT:    ret <4 x float> [[T6]]
+;
+  %t = insertelement <4 x float> poison, float %arg, i32 0
+  %t4 = insertelement <4 x float> %t, float %arg, i32 1
+  %t5 = insertelement <4 x float> %t4, float %arg, i32 %k
+  %t6 = insertelement <4 x float> %t5, float %arg, i32 3
+  ret <4 x float> %t6
+}
+
+define <4 x float> @bad7(float %v) {
+; CHECK-LABEL: @bad7(
+; CHECK-NEXT:    [[INS1:%.*]] = insertelement <4 x float> poison, float [[V:%.*]], i32 1
+; CHECK-NEXT:    [[A1:%.*]] = fadd <4 x float> [[INS1]], [[INS1]]
+; CHECK-NEXT:    [[INS2:%.*]] = insertelement <4 x float> [[INS1]], float [[V]], i32 2
+; CHECK-NEXT:    [[INS3:%.*]] = insertelement <4 x float> [[INS2]], float [[V]], i32 3
+; CHECK-NEXT:    [[INS4:%.*]] = insertelement <4 x float> [[INS3]], float [[V]], i32 0
+; CHECK-NEXT:    [[RES:%.*]] = fadd <4 x float> [[A1]], [[INS4]]
+; CHECK-NEXT:    ret <4 x float> [[RES]]
+;
+  %ins1 = insertelement <4 x float> poison, float %v, i32 1
+  %a1 = fadd <4 x float> %ins1, %ins1
+  %ins2 = insertelement<4 x float> %ins1, float %v, i32 2
+  %ins3 = insertelement<4 x float> %ins2, float %v, i32 3
+  %ins4 = insertelement<4 x float> %ins3, float %v, i32 0
+  %res = fadd <4 x float> %a1, %ins4
+  ret <4 x float> %res
+}
--- a/llvm/test/Transforms/InstCombine/extractelement-inseltpoison.ll
+++ b/llvm/test/Transforms/InstCombine/extractelement-inseltpoison.ll
@ -0,0 +1,332 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -instcombine -S -data-layout="e" | FileCheck %s --check-prefixes=ANY,LE
+; RUN: opt < %s -instcombine -S -data-layout="E" | FileCheck %s --check-prefixes=ANY,BE
+
+define i32 @extractelement_out_of_range(<2 x i32> %x) {
+; ANY-LABEL: @extractelement_out_of_range(
+; ANY-NEXT:    ret i32 undef
+;
+  %E1 = extractelement <2 x i32> %x, i8 16
+  ret i32 %E1
+}
+
+define i32 @extractelement_type_out_of_range(<2 x i32> %x) {
+; ANY-LABEL: @extractelement_type_out_of_range(
+; ANY-NEXT:    [[E1:%.*]] = extractelement <2 x i32> [[X:%.*]], i128 0
+; ANY-NEXT:    ret i32 [[E1]]
+;
+  %E1 = extractelement <2 x i32> %x, i128 0
+  ret i32 %E1
+}
+
+define i32 @bitcasted_inselt_equal_num_elts(float %f) {
+; ANY-LABEL: @bitcasted_inselt_equal_num_elts(
+; ANY-NEXT:    [[R:%.*]] = bitcast float [[F:%.*]] to i32
+; ANY-NEXT:    ret i32 [[R]]
+;
+  %vf = insertelement <4 x float> poison, float %f, i32 0
+  %vi = bitcast <4 x float> %vf to <4 x i32>
+  %r = extractelement <4 x i32> %vi, i32 0
+  ret i32 %r
+}
+
+define i64 @test2(i64 %in) {
+; ANY-LABEL: @test2(
+; ANY-NEXT:    ret i64 [[IN:%.*]]
+;
+  %vec = insertelement <8 x i64> poison, i64 %in, i32 0
+  %splat = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> zeroinitializer
+  %add = add <8 x i64> %splat, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
+  %r = extractelement <8 x i64> %add, i32 0
+  ret i64 %r
+}
+
+define i32 @bitcasted_inselt_wide_source_zero_elt(i64 %x) {
+; LE-LABEL: @bitcasted_inselt_wide_source_zero_elt(
+; LE-NEXT:    [[R:%.*]] = trunc i64 [[X:%.*]] to i32
+; LE-NEXT:    ret i32 [[R]]
+;
+; BE-LABEL: @bitcasted_inselt_wide_source_zero_elt(
+; BE-NEXT:    [[TMP1:%.*]] = lshr i64 [[X:%.*]], 32
+; BE-NEXT:    [[R:%.*]] = trunc i64 [[TMP1]] to i32
+; BE-NEXT:    ret i32 [[R]]
+;
+  %i = insertelement <2 x i64> zeroinitializer, i64 %x, i32 0
+  %b = bitcast <2 x i64> %i to <4 x i32>
+  %r = extractelement <4 x i32> %b, i32 0
+  ret i32 %r
+}
+
+define i16 @bitcasted_inselt_wide_source_modulo_elt(i64 %x) {
+; LE-LABEL: @bitcasted_inselt_wide_source_modulo_elt(
+; LE-NEXT:    [[R:%.*]] = trunc i64 [[X:%.*]] to i16
+; LE-NEXT:    ret i16 [[R]]
+;
+; BE-LABEL: @bitcasted_inselt_wide_source_modulo_elt(
+; BE-NEXT:    [[TMP1:%.*]] = lshr i64 [[X:%.*]], 48
+; BE-NEXT:    [[R:%.*]] = trunc i64 [[TMP1]] to i16
+; BE-NEXT:    ret i16 [[R]]
+;
+  %i = insertelement <2 x i64> poison, i64 %x, i32 1
+  %b = bitcast <2 x i64> %i to <8 x i16>
+  %r = extractelement <8 x i16> %b, i32 4
+  ret i16 %r
+}
+
+define i32 @bitcasted_inselt_wide_source_not_modulo_elt(i64 %x) {
+; LE-LABEL: @bitcasted_inselt_wide_source_not_modulo_elt(
+; LE-NEXT:    [[TMP1:%.*]] = lshr i64 [[X:%.*]], 32
+; LE-NEXT:    [[R:%.*]] = trunc i64 [[TMP1]] to i32
+; LE-NEXT:    ret i32 [[R]]
+;
+; BE-LABEL: @bitcasted_inselt_wide_source_not_modulo_elt(
+; BE-NEXT:    [[R:%.*]] = trunc i64 [[X:%.*]] to i32
+; BE-NEXT:    ret i32 [[R]]
+;
+  %i = insertelement <2 x i64> poison, i64 %x, i32 0
+  %b = bitcast <2 x i64> %i to <4 x i32>
+  %r = extractelement <4 x i32> %b, i32 1
+  ret i32 %r
+}
+
+define i8 @bitcasted_inselt_wide_source_not_modulo_elt_not_half(i32 %x) {
+; LE-LABEL: @bitcasted_inselt_wide_source_not_modulo_elt_not_half(
+; LE-NEXT:    [[TMP1:%.*]] = lshr i32 [[X:%.*]], 16
+; LE-NEXT:    [[R:%.*]] = trunc i32 [[TMP1]] to i8
+; LE-NEXT:    ret i8 [[R]]
+;
+; BE-LABEL: @bitcasted_inselt_wide_source_not_modulo_elt_not_half(
+; BE-NEXT:    [[TMP1:%.*]] = lshr i32 [[X:%.*]], 8
+; BE-NEXT:    [[R:%.*]] = trunc i32 [[TMP1]] to i8
+; BE-NEXT:    ret i8 [[R]]
+;
+  %i = insertelement <2 x i32> poison, i32 %x, i32 0
+  %b = bitcast <2 x i32> %i to <8 x i8>
+  %r = extractelement <8 x i8> %b, i32 2
+  ret i8 %r
+}
+
+define i3 @bitcasted_inselt_wide_source_not_modulo_elt_not_half_weird_types(i15 %x) {
+; LE-LABEL: @bitcasted_inselt_wide_source_not_modulo_elt_not_half_weird_types(
+; LE-NEXT:    [[TMP1:%.*]] = lshr i15 [[X:%.*]], 3
+; LE-NEXT:    [[R:%.*]] = trunc i15 [[TMP1]] to i3
+; LE-NEXT:    ret i3 [[R]]
+;
+; BE-LABEL: @bitcasted_inselt_wide_source_not_modulo_elt_not_half_weird_types(
+; BE-NEXT:    [[TMP1:%.*]] = lshr i15 [[X:%.*]], 9
+; BE-NEXT:    [[R:%.*]] = trunc i15 [[TMP1]] to i3
+; BE-NEXT:    ret i3 [[R]]
+;
+  %i = insertelement <3 x i15> poison, i15 %x, i32 0
+  %b = bitcast <3 x i15> %i to <15 x i3>
+  %r = extractelement <15 x i3> %b, i32 1
+  ret i3 %r
+}
+
+; Negative test for the above fold, but we can remove the insert here.
+
+define i8 @bitcasted_inselt_wide_source_wrong_insert(<2 x i32> %v, i32 %x) {
+; ANY-LABEL: @bitcasted_inselt_wide_source_wrong_insert(
+; ANY-NEXT:    [[B:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
+; ANY-NEXT:    [[R:%.*]] = extractelement <8 x i8> [[B]], i32 2
+; ANY-NEXT:    ret i8 [[R]]
+;
+  %i = insertelement <2 x i32> %v, i32 %x, i32 1
+  %b = bitcast <2 x i32> %i to <8 x i8>
+  %r = extractelement <8 x i8> %b, i32 2
+  ret i8 %r
+}
+
+; Partial negative test for the above fold, extra uses are not allowed if shift is needed.
+
+declare void @use(<8 x i8>)
+
+define i8 @bitcasted_inselt_wide_source_uses(i32 %x) {
+; LE-LABEL: @bitcasted_inselt_wide_source_uses(
+; LE-NEXT:    [[I:%.*]] = insertelement <2 x i32> poison, i32 [[X:%.*]], i32 0
+; LE-NEXT:    [[B:%.*]] = bitcast <2 x i32> [[I]] to <8 x i8>
+; LE-NEXT:    call void @use(<8 x i8> [[B]])
+; LE-NEXT:    [[R:%.*]] = extractelement <8 x i8> [[B]], i32 3
+; LE-NEXT:    ret i8 [[R]]
+;
+; BE-LABEL: @bitcasted_inselt_wide_source_uses(
+; BE-NEXT:    [[I:%.*]] = insertelement <2 x i32> poison, i32 [[X:%.*]], i32 0
+; BE-NEXT:    [[B:%.*]] = bitcast <2 x i32> [[I]] to <8 x i8>
+; BE-NEXT:    call void @use(<8 x i8> [[B]])
+; BE-NEXT:    [[R:%.*]] = trunc i32 [[X]] to i8
+; BE-NEXT:    ret i8 [[R]]
+;
+  %i = insertelement <2 x i32> poison, i32 %x, i32 0
+  %b = bitcast <2 x i32> %i to <8 x i8>
+  call void @use(<8 x i8> %b)
+  %r = extractelement <8 x i8> %b, i32 3
+  ret i8 %r
+}
+
+define float @bitcasted_inselt_to_FP(i64 %x) {
+; LE-LABEL: @bitcasted_inselt_to_FP(
+; LE-NEXT:    [[TMP1:%.*]] = lshr i64 [[X:%.*]], 32
+; LE-NEXT:    [[TMP2:%.*]] = trunc i64 [[TMP1]] to i32
+; LE-NEXT:    [[R:%.*]] = bitcast i32 [[TMP2]] to float
+; LE-NEXT:    ret float [[R]]
+;
+; BE-LABEL: @bitcasted_inselt_to_FP(
+; BE-NEXT:    [[TMP1:%.*]] = trunc i64 [[X:%.*]] to i32
+; BE-NEXT:    [[R:%.*]] = bitcast i32 [[TMP1]] to float
+; BE-NEXT:    ret float [[R]]
+;
+  %i = insertelement <2 x i64> poison, i64 %x, i32 0
+  %b = bitcast <2 x i64> %i to <4 x float>
+  %r = extractelement <4 x float> %b, i32 1
+  ret float %r
+}
+
+declare void @use_v2i128(<2 x i128>)
+declare void @use_v8f32(<8 x float>)
+
+define float @bitcasted_inselt_to_FP_uses(i128 %x) {
+; ANY-LABEL: @bitcasted_inselt_to_FP_uses(
+; ANY-NEXT:    [[I:%.*]] = insertelement <2 x i128> poison, i128 [[X:%.*]], i32 0
+; ANY-NEXT:    call void @use_v2i128(<2 x i128> [[I]])
+; ANY-NEXT:    [[B:%.*]] = bitcast <2 x i128> [[I]] to <8 x float>
+; ANY-NEXT:    [[R:%.*]] = extractelement <8 x float> [[B]], i32 1
+; ANY-NEXT:    ret float [[R]]
+;
+  %i = insertelement <2 x i128> poison, i128 %x, i32 0
+  call void @use_v2i128(<2 x i128> %i)
+  %b = bitcast <2 x i128> %i to <8 x float>
+  %r = extractelement <8 x float> %b, i32 1
+  ret float %r
+}
+
+define float @bitcasted_inselt_to_FP_uses2(i128 %x) {
+; ANY-LABEL: @bitcasted_inselt_to_FP_uses2(
+; ANY-NEXT:    [[I:%.*]] = insertelement <2 x i128> poison, i128 [[X:%.*]], i32 0
+; ANY-NEXT:    [[B:%.*]] = bitcast <2 x i128> [[I]] to <8 x float>
+; ANY-NEXT:    call void @use_v8f32(<8 x float> [[B]])
+; ANY-NEXT:    [[R:%.*]] = extractelement <8 x float> [[B]], i32 1
+; ANY-NEXT:    ret float [[R]]
+;
+  %i = insertelement <2 x i128> poison, i128 %x, i32 0
+  %b = bitcast <2 x i128> %i to <8 x float>
+  call void @use_v8f32(<8 x float> %b)
+  %r = extractelement <8 x float> %b, i32 1
+  ret float %r
+}
+
+define i32 @bitcasted_inselt_from_FP(double %x) {
+; LE-LABEL: @bitcasted_inselt_from_FP(
+; LE-NEXT:    [[TMP1:%.*]] = bitcast double [[X:%.*]] to i64
+; LE-NEXT:    [[TMP2:%.*]] = lshr i64 [[TMP1]], 32
+; LE-NEXT:    [[R:%.*]] = trunc i64 [[TMP2]] to i32
+; LE-NEXT:    ret i32 [[R]]
+;
+; BE-LABEL: @bitcasted_inselt_from_FP(
+; BE-NEXT:    [[TMP1:%.*]] = bitcast double [[X:%.*]] to i64
+; BE-NEXT:    [[R:%.*]] = trunc i64 [[TMP1]] to i32
+; BE-NEXT:    ret i32 [[R]]
+;
+  %i = insertelement <2 x double> poison, double %x, i32 0
+  %b = bitcast <2 x double> %i to <4 x i32>
+  %r = extractelement <4 x i32> %b, i32 1
+  ret i32 %r
+}
+
+declare void @use_v2f64(<2 x double>)
+declare void @use_v8i16(<8 x i16>)
+
+define i16 @bitcasted_inselt_from_FP_uses(double %x) {
+; ANY-LABEL: @bitcasted_inselt_from_FP_uses(
+; ANY-NEXT:    [[I:%.*]] = insertelement <2 x double> poison, double [[X:%.*]], i32 0
+; ANY-NEXT:    call void @use_v2f64(<2 x double> [[I]])
+; ANY-NEXT:    [[B:%.*]] = bitcast <2 x double> [[I]] to <8 x i16>
+; ANY-NEXT:    [[R:%.*]] = extractelement <8 x i16> [[B]], i32 1
+; ANY-NEXT:    ret i16 [[R]]
+;
+  %i = insertelement <2 x double> poison, double %x, i32 0
+  call void @use_v2f64(<2 x double> %i)
+  %b = bitcast <2 x double> %i to <8 x i16>
+  %r = extractelement <8 x i16> %b, i32 1
+  ret i16 %r
+}
+
+define i16 @bitcasted_inselt_from_FP_uses2(double %x) {
+; ANY-LABEL: @bitcasted_inselt_from_FP_uses2(
+; ANY-NEXT:    [[I:%.*]] = insertelement <2 x double> poison, double [[X:%.*]], i32 0
+; ANY-NEXT:    [[B:%.*]] = bitcast <2 x double> [[I]] to <8 x i16>
+; ANY-NEXT:    call void @use_v8i16(<8 x i16> [[B]])
+; ANY-NEXT:    [[R:%.*]] = extractelement <8 x i16> [[B]], i32 1
+; ANY-NEXT:    ret i16 [[R]]
+;
+  %i = insertelement <2 x double> poison, double %x, i32 0
+  %b = bitcast <2 x double> %i to <8 x i16>
+  call void @use_v8i16(<8 x i16> %b)
+  %r = extractelement <8 x i16> %b, i32 1
+  ret i16 %r
+}
+
+define float @bitcasted_inselt_to_and_from_FP(double %x) {
+; ANY-LABEL: @bitcasted_inselt_to_and_from_FP(
+; ANY-NEXT:    [[I:%.*]] = insertelement <2 x double> poison, double [[X:%.*]], i32 0
+; ANY-NEXT:    [[B:%.*]] = bitcast <2 x double> [[I]] to <4 x float>
+; ANY-NEXT:    [[R:%.*]] = extractelement <4 x float> [[B]], i32 1
+; ANY-NEXT:    ret float [[R]]
+;
+  %i = insertelement <2 x double> poison, double %x, i32 0
+  %b = bitcast <2 x double> %i to <4 x float>
+  %r = extractelement <4 x float> %b, i32 1
+  ret float %r
+}
+
+define float @bitcasted_inselt_to_and_from_FP_uses(double %x) {
+; ANY-LABEL: @bitcasted_inselt_to_and_from_FP_uses(
+; ANY-NEXT:    [[I:%.*]] = insertelement <2 x double> poison, double [[X:%.*]], i32 0
+; ANY-NEXT:    call void @use_v2f64(<2 x double> [[I]])
+; ANY-NEXT:    [[B:%.*]] = bitcast <2 x double> [[I]] to <4 x float>
+; ANY-NEXT:    [[R:%.*]] = extractelement <4 x float> [[B]], i32 1
+; ANY-NEXT:    ret float [[R]]
+;
+  %i = insertelement <2 x double> poison, double %x, i32 0
+  call void @use_v2f64(<2 x double> %i)
+  %b = bitcast <2 x double> %i to <4 x float>
+  %r = extractelement <4 x float> %b, i32 1
+  ret float %r
+}
+
+declare void @use_v4f32(<4 x float>)
+
+define float @bitcasted_inselt_to_and_from_FP_uses2(double %x) {
+; ANY-LABEL: @bitcasted_inselt_to_and_from_FP_uses2(
+; ANY-NEXT:    [[I:%.*]] = insertelement <2 x double> poison, double [[X:%.*]], i32 0
+; ANY-NEXT:    [[B:%.*]] = bitcast <2 x double> [[I]] to <4 x float>
+; ANY-NEXT:    call void @use_v4f32(<4 x float> [[B]])
+; ANY-NEXT:    [[R:%.*]] = extractelement <4 x float> [[B]], i32 1
+; ANY-NEXT:    ret float [[R]]
+;
+  %i = insertelement <2 x double> poison, double %x, i32 0
+  %b = bitcast <2 x double> %i to <4 x float>
+  call void @use_v4f32(<4 x float> %b)
+  %r = extractelement <4 x float> %b, i32 1
+  ret float %r
+}
+
+; This would crash/assert because the logic for collectShuffleElements()
+; does not consider the possibility of invalid insert/extract operands.
+
+define <4 x double> @invalid_extractelement(<2 x double> %a, <4 x double> %b, double* %p) {
+; ANY-LABEL: @invalid_extractelement(
+; ANY-NEXT:    [[TMP1:%.*]] = shufflevector <2 x double> [[A:%.*]], <2 x double> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
+; ANY-NEXT:    [[T4:%.*]] = shufflevector <4 x double> [[B:%.*]], <4 x double> [[TMP1]], <4 x i32> <i32 undef, i32 1, i32 4, i32 3>
+; ANY-NEXT:    [[E:%.*]] = extractelement <4 x double> [[B]], i32 1
+; ANY-NEXT:    store double [[E]], double* [[P:%.*]], align 8
+; ANY-NEXT:    [[R:%.*]] = insertelement <4 x double> [[T4]], double undef, i64 0
+; ANY-NEXT:    ret <4 x double> [[R]]
+;
+  %t3 = extractelement <2 x double> %a, i32 0
+  %t4 = insertelement <4 x double> %b, double %t3, i32 2
+  %e = extractelement <4 x double> %t4, i32 1
+  store double %e, double* %p
+  %e1 = extractelement <2 x double> %a, i32 4 ; invalid index
+  %r = insertelement <4 x double> %t4, double %e1, i64 0
+  ret <4 x double> %r
+}
--- a/llvm/test/Transforms/InstCombine/fold-vector-zero-inseltpoison.ll
+++ b/llvm/test/Transforms/InstCombine/fold-vector-zero-inseltpoison.ll
@ -0,0 +1,35 @@
+; RUN: opt < %s -instcombine -S | not grep zeroinitializer
+
+define void @foo(i64 %A, i64 %B) {
+bb8:
+	br label %bb30
+
+bb30:
+	%s0 = phi i64 [ 0, %bb8 ], [ %r21, %bb30 ]
+	%l0 = phi i64 [ -2222, %bb8 ], [ %r23, %bb30 ]
+	%r2 = add i64 %s0, %B
+	%r3 = inttoptr i64 %r2 to <2 x double>*
+	%r4 = load <2 x double>, <2 x double>* %r3, align 8
+	%r6 = bitcast <2 x double> %r4 to <2 x i64>
+	%r7 = bitcast <2 x double> zeroinitializer to <2 x i64>
+	%r8 = insertelement <2 x i64> poison, i64 9223372036854775807, i32 0
+	%r9 = insertelement <2 x i64> poison, i64 -9223372036854775808, i32 0
+	%r10 = insertelement <2 x i64> %r8, i64 9223372036854775807, i32 1
+	%r11 = insertelement <2 x i64> %r9, i64 -9223372036854775808, i32 1
+	%r12 = and <2 x i64> %r6, %r10
+	%r13 = and <2 x i64> %r7, %r11
+	%r14 = or <2 x i64> %r12, %r13
+	%r15 = bitcast <2 x i64> %r14 to <2 x double>
+	%r18 = add i64 %s0, %A
+	%r19 = inttoptr i64 %r18 to <2 x double>*
+	store <2 x double> %r15, <2 x double>* %r19, align 8
+	%r21 = add i64 16, %s0
+	%r23 = add i64 1, %l0
+	%r25 = icmp slt i64 %r23, 0
+	%r26 = zext i1 %r25 to i64
+	%r27 = icmp ne i64 %r26, 0
+	br i1 %r27, label %bb30, label %bb5
+
+bb5:
+	ret void
+}
--- a/llvm/test/Transforms/InstCombine/icmp-bc-vec-inseltpoison.ll
+++ b/llvm/test/Transforms/InstCombine/icmp-bc-vec-inseltpoison.ll
@ -0,0 +1,127 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+; Tests to verify proper functioning of the icmp folding implemented in
+;  InstCombiner::foldICmpBitCastConstant
+; Specifically, folding:
+;   icmp <pred> iN X, C
+;  where X = bitcast <M x iK> (shufflevector <M x iK> %vec, undef, SC)) to iN
+;    and C is a splat of a K-bit pattern
+;    and SC is a constant vector = <C', C', C', ..., C'>
+; Into:
+;  %E = extractelement <M x iK> %vec, i32 C'
+;  icmp <pred> iK %E, trunc(C)
+
+define i1 @test_i1_0(i1 %val) {
+; CHECK-LABEL: @test_i1_0(
+; CHECK-NEXT:    [[COND:%.*]] = xor i1 [[VAL:%.*]], true
+; CHECK-NEXT:    ret i1 [[COND]]
+;
+  %insvec = insertelement <4 x i1> poison, i1 %val, i32 0
+  %vec = shufflevector <4 x i1> %insvec, <4 x i1> undef, <4 x i32> zeroinitializer
+  %cast = bitcast <4 x i1> %vec to i4
+  %cond = icmp eq i4 %cast, 0
+  ret i1 %cond
+}
+
+define i1 @test_i1_0_2(i1 %val) {
+; CHECK-LABEL: @test_i1_0_2(
+; CHECK-NEXT:    [[COND:%.*]] = xor i1 [[VAL:%.*]], true
+; CHECK-NEXT:    ret i1 [[COND]]
+;
+  %insvec = insertelement <4 x i1> poison, i1 %val, i32 2
+  %vec = shufflevector <4 x i1> %insvec, <4 x i1> undef, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
+  %cast = bitcast <4 x i1> %vec to i4
+  %cond = icmp eq i4 %cast, 0
+  ret i1 %cond
+}
+
+define i1 @test_i1_m1(i1 %val) {
+; CHECK-LABEL: @test_i1_m1(
+; CHECK-NEXT:    ret i1 [[VAL:%.*]]
+;
+  %insvec = insertelement <4 x i1> poison, i1 %val, i32 0
+  %vec = shufflevector <4 x i1> %insvec, <4 x i1> undef, <4 x i32> zeroinitializer
+  %cast = bitcast <4 x i1> %vec to i4
+  %cond = icmp eq i4 %cast, -1
+  ret i1 %cond
+}
+
+define i1 @test_i8_pattern(i8 %val) {
+; CHECK-LABEL: @test_i8_pattern(
+; CHECK-NEXT:    [[COND:%.*]] = icmp eq i8 [[VAL:%.*]], 72
+; CHECK-NEXT:    ret i1 [[COND]]
+;
+  %insvec = insertelement <4 x i8> poison, i8 %val, i32 0
+  %vec = shufflevector <4 x i8> %insvec, <4 x i8> undef, <4 x i32> zeroinitializer
+  %cast = bitcast <4 x i8> %vec to i32
+  %cond = icmp eq i32 %cast, 1212696648
+  ret i1 %cond
+}
+
+define i1 @test_i8_pattern_2(i8 %val) {
+; CHECK-LABEL: @test_i8_pattern_2(
+; CHECK-NEXT:    [[COND:%.*]] = icmp eq i8 [[VAL:%.*]], 72
+; CHECK-NEXT:    ret i1 [[COND]]
+;
+  %insvec = insertelement <4 x i8> poison, i8 %val, i32 2
+  %vec = shufflevector <4 x i8> %insvec, <4 x i8> undef, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
+  %cast = bitcast <4 x i8> %vec to i32
+  %cond = icmp eq i32 %cast, 1212696648
+  ret i1 %cond
+}
+
+; Make sure we don't try to fold if the shufflemask has differing element values
+define i1 @test_i8_pattern_3(<4 x i8> %invec) {
+; CHECK-LABEL: @test_i8_pattern_3(
+; CHECK-NEXT:    [[VEC:%.*]] = shufflevector <4 x i8> [[INVEC:%.*]], <4 x i8> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+; CHECK-NEXT:    [[CAST:%.*]] = bitcast <4 x i8> [[VEC]] to i32
+; CHECK-NEXT:    [[COND:%.*]] = icmp eq i32 [[CAST]], 1212696648
+; CHECK-NEXT:    ret i1 [[COND]]
+;
+  %vec = shufflevector <4 x i8> %invec, <4 x i8> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+  %cast = bitcast <4 x i8> %vec to i32
+  %cond = icmp eq i32 %cast, 1212696648
+  ret i1 %cond
+}
+
+; Make sure we don't try to fold if the compared-to constant isn't a splatted value
+define i1 @test_i8_nopattern(i8 %val) {
+; CHECK-LABEL: @test_i8_nopattern(
+; CHECK-NEXT:    [[INSVEC:%.*]] = insertelement <4 x i8> poison, i8 [[VAL:%.*]], i32 0
+; CHECK-NEXT:    [[VEC:%.*]] = shufflevector <4 x i8> [[INSVEC]], <4 x i8> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[CAST:%.*]] = bitcast <4 x i8> [[VEC]] to i32
+; CHECK-NEXT:    [[COND:%.*]] = icmp eq i32 [[CAST]], 1212696647
+; CHECK-NEXT:    ret i1 [[COND]]
+;
+  %insvec = insertelement <4 x i8> poison, i8 %val, i32 0
+  %vec = shufflevector <4 x i8> %insvec, <4 x i8> undef, <4 x i32> zeroinitializer
+  %cast = bitcast <4 x i8> %vec to i32
+  %cond = icmp eq i32 %cast, 1212696647
+  ret i1 %cond
+}
+
+; Verify that we fold more than just the eq predicate
+define i1 @test_i8_ult_pattern(i8 %val) {
+; CHECK-LABEL: @test_i8_ult_pattern(
+; CHECK-NEXT:    [[COND:%.*]] = icmp ult i8 [[VAL:%.*]], 72
+; CHECK-NEXT:    ret i1 [[COND]]
+;
+  %insvec = insertelement <4 x i8> poison, i8 %val, i32 0
+  %vec = shufflevector <4 x i8> %insvec, <4 x i8> undef, <4 x i32> zeroinitializer
+  %cast = bitcast <4 x i8> %vec to i32
+  %cond = icmp ult i32 %cast, 1212696648
+  ret i1 %cond
+}
+
+define i1 @extending_shuffle_with_weird_types(<2 x i9> %v) {
+; CHECK-LABEL: @extending_shuffle_with_weird_types(
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i9> [[V:%.*]], i32 0
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i9 [[TMP1]], 1
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+  %splat = shufflevector <2 x i9> %v, <2 x i9> undef, <3 x i32> zeroinitializer
+  %cast = bitcast <3 x i9> %splat to i27
+  %cmp = icmp slt i27 %cast, 262657 ; 0x040201
+  ret i1 %cmp
+}
--- a/llvm/test/Transforms/InstCombine/inselt-binop-inseltpoison.ll
+++ b/llvm/test/Transforms/InstCombine/inselt-binop-inseltpoison.ll
@ -0,0 +1,635 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -instcombine %s | FileCheck %s
+
+define <2 x i8> @add_constant(i8 %x) {
+; CHECK-LABEL: @add_constant(
+; CHECK-NEXT:    [[INS:%.*]] = insertelement <2 x i8> poison, i8 [[X:%.*]], i32 0
+; CHECK-NEXT:    [[BO:%.*]] = add <2 x i8> [[INS]], <i8 42, i8 undef>
+; CHECK-NEXT:    ret <2 x i8> [[BO]]
+;
+  %ins = insertelement <2 x i8> poison, i8 %x, i32 0
+  %bo = add <2 x i8> %ins, <i8 42, i8 undef>
+  ret <2 x i8> %bo
+}
+
+define <2 x i8> @add_constant_not_undef_lane(i8 %x) {
+; CHECK-LABEL: @add_constant_not_undef_lane(
+; CHECK-NEXT:    [[INS:%.*]] = insertelement <2 x i8> poison, i8 [[X:%.*]], i32 0
+; CHECK-NEXT:    [[BO:%.*]] = add <2 x i8> [[INS]], <i8 42, i8 -42>
+; CHECK-NEXT:    ret <2 x i8> [[BO]]
+;
+  %ins = insertelement <2 x i8> poison, i8 %x, i32 0
+  %bo = add <2 x i8> %ins, <i8 42, i8 -42>
+  ret <2 x i8> %bo
+}
+
+; IR flags are not required, but they should propagate.
+
+define <2 x i8> @sub_constant_op0(i8 %x) {
+; CHECK-LABEL: @sub_constant_op0(
+; CHECK-NEXT:    [[INS:%.*]] = insertelement <2 x i8> poison, i8 [[X:%.*]], i32 1
+; CHECK-NEXT:    [[BO:%.*]] = sub nuw nsw <2 x i8> <i8 undef, i8 -42>, [[INS]]
+; CHECK-NEXT:    ret <2 x i8> [[BO]]
+;
+  %ins = insertelement <2 x i8> poison, i8 %x, i32 1
+  %bo = sub nsw nuw <2 x i8> <i8 undef, i8 -42>, %ins
+  ret <2 x i8> %bo
+}
+
+define <2 x i8> @sub_constant_op0_not_undef_lane(i8 %x) {
+; CHECK-LABEL: @sub_constant_op0_not_undef_lane(
+; CHECK-NEXT:    [[INS:%.*]] = insertelement <2 x i8> poison, i8 [[X:%.*]], i32 1
+; CHECK-NEXT:    [[BO:%.*]] = sub nuw <2 x i8> <i8 42, i8 -42>, [[INS]]
+; CHECK-NEXT:    ret <2 x i8> [[BO]]
+;
+  %ins = insertelement <2 x i8> poison, i8 %x, i32 1
+  %bo = sub nuw <2 x i8> <i8 42, i8 -42>, %ins
+  ret <2 x i8> %bo
+}
+
+define <2 x i8> @sub_constant_op1(i8 %x) {
+; CHECK-LABEL: @sub_constant_op1(
+; CHECK-NEXT:    [[INS:%.*]] = insertelement <2 x i8> poison, i8 [[X:%.*]], i32 0
+; CHECK-NEXT:    [[BO:%.*]] = add <2 x i8> [[INS]], <i8 -42, i8 undef>
+; CHECK-NEXT:    ret <2 x i8> [[BO]]
+;
+  %ins = insertelement <2 x i8> poison, i8 %x, i32 0
+  %bo = sub nuw <2 x i8> %ins, <i8 42, i8 undef>
+  ret <2 x i8> %bo
+}
+
+define <2 x i8> @sub_constant_op1_not_undef_lane(i8 %x) {
+; CHECK-LABEL: @sub_constant_op1_not_undef_lane(
+; CHECK-NEXT:    [[INS:%.*]] = insertelement <2 x i8> poison, i8 [[X:%.*]], i32 0
+; CHECK-NEXT:    [[BO:%.*]] = add <2 x i8> [[INS]], <i8 -42, i8 42>
+; CHECK-NEXT:    ret <2 x i8> [[BO]]
+;
+  %ins = insertelement <2 x i8> poison, i8 %x, i32 0
+  %bo = sub nuw <2 x i8> %ins, <i8 42, i8 -42>
+  ret <2 x i8> %bo
+}
+
+define <3 x i8> @mul_constant(i8 %x) {
+; CHECK-LABEL: @mul_constant(
+; CHECK-NEXT:    [[INS:%.*]] = insertelement <3 x i8> poison, i8 [[X:%.*]], i32 2
+; CHECK-NEXT:    [[BO:%.*]] = mul <3 x i8> [[INS]], <i8 undef, i8 undef, i8 -42>
+; CHECK-NEXT:    ret <3 x i8> [[BO]]
+;
+  %ins = insertelement <3 x i8> poison, i8 %x, i32 2
+  %bo = mul <3 x i8> %ins, <i8 undef, i8 undef, i8 -42>
+  ret <3 x i8> %bo
+}
+
+define <3 x i8> @mul_constant_not_undef_lane(i8 %x) {
+; CHECK-LABEL: @mul_constant_not_undef_lane(
+; CHECK-NEXT:    [[INS:%.*]] = insertelement <3 x i8> poison, i8 [[X:%.*]], i32 2
+; CHECK-NEXT:    [[BO:%.*]] = mul <3 x i8> [[INS]], <i8 42, i8 undef, i8 -42>
+; CHECK-NEXT:    ret <3 x i8> [[BO]]
+;
+  %ins = insertelement <3 x i8> poison, i8 %x, i32 2
+  %bo = mul <3 x i8> %ins, <i8 42, i8 undef, i8 -42>
+  ret <3 x i8> %bo
+}
+
+define <2 x i8> @shl_constant_op0(i8 %x) {
+; CHECK-LABEL: @shl_constant_op0(
+; CHECK-NEXT:    [[INS:%.*]] = insertelement <2 x i8> poison, i8 [[X:%.*]], i32 1
+; CHECK-NEXT:    [[BO:%.*]] = shl <2 x i8> <i8 undef, i8 2>, [[INS]]
+; CHECK-NEXT:    ret <2 x i8> [[BO]]
+;
+  %ins = insertelement <2 x i8> poison, i8 %x, i32 1
+  %bo = shl <2 x i8> <i8 undef, i8 2>, %ins
+  ret <2 x i8> %bo
+}
+
+define <2 x i8> @shl_constant_op0_not_undef_lane(i8 %x) {
+; CHECK-LABEL: @shl_constant_op0_not_undef_lane(
+; CHECK-NEXT:    [[INS:%.*]] = insertelement <2 x i8> poison, i8 [[X:%.*]], i32 1
+; CHECK-NEXT:    [[BO:%.*]] = shl <2 x i8> <i8 5, i8 2>, [[INS]]
+; CHECK-NEXT:    ret <2 x i8> [[BO]]
+;
+  %ins = insertelement <2 x i8> poison, i8 %x, i32 1
+  %bo = shl <2 x i8> <i8 5, i8 2>, %ins
+  ret <2 x i8> %bo
+}
+
+define <2 x i8> @shl_constant_op1(i8 %x) {
+; CHECK-LABEL: @shl_constant_op1(
+; CHECK-NEXT:    [[INS:%.*]] = insertelement <2 x i8> poison, i8 [[X:%.*]], i32 0
+; CHECK-NEXT:    [[BO:%.*]] = shl nuw <2 x i8> [[INS]], <i8 5, i8 undef>
+; CHECK-NEXT:    ret <2 x i8> [[BO]]
+;
+  %ins = insertelement <2 x i8> poison, i8 %x, i32 0
+  %bo = shl nuw <2 x i8> %ins, <i8 5, i8 undef>
+  ret <2 x i8> %bo
+}
+
+define <2 x i8> @shl_constant_op1_not_undef_lane(i8 %x) {
+; CHECK-LABEL: @shl_constant_op1_not_undef_lane(
+; CHECK-NEXT:    [[INS:%.*]] = insertelement <2 x i8> poison, i8 [[X:%.*]], i32 0
+; CHECK-NEXT:    [[BO:%.*]] = shl nuw <2 x i8> [[INS]], <i8 5, i8 2>
+; CHECK-NEXT:    ret <2 x i8> [[BO]]
+;
+  %ins = insertelement <2 x i8> poison, i8 %x, i32 0
+  %bo = shl nuw <2 x i8> %ins, <i8 5, i8 2>
+  ret <2 x i8> %bo
+}
+
+define <2 x i8> @ashr_constant_op0(i8 %x) {
+; CHECK-LABEL: @ashr_constant_op0(
+; CHECK-NEXT:    [[INS:%.*]] = insertelement <2 x i8> poison, i8 [[X:%.*]], i32 1
+; CHECK-NEXT:    [[BO:%.*]] = ashr exact <2 x i8> <i8 undef, i8 2>, [[INS]]
+; CHECK-NEXT:    ret <2 x i8> [[BO]]
+;
+  %ins = insertelement <2 x i8> poison, i8 %x, i32 1
+  %bo = ashr exact <2 x i8> <i8 undef, i8 2>, %ins
+  ret <2 x i8> %bo
+}
+
+define <2 x i8> @ashr_constant_op0_not_undef_lane(i8 %x) {
+; CHECK-LABEL: @ashr_constant_op0_not_undef_lane(
+; CHECK-NEXT:    [[INS:%.*]] = insertelement <2 x i8> poison, i8 [[X:%.*]], i32 1
+; CHECK-NEXT:    [[BO:%.*]] = lshr <2 x i8> <i8 5, i8 2>, [[INS]]
+; CHECK-NEXT:    ret <2 x i8> [[BO]]
+;
+  %ins = insertelement <2 x i8> poison, i8 %x, i32 1
+  %bo = ashr exact <2 x i8> <i8 5, i8 2>, %ins
+  ret <2 x i8> %bo
+}
+
+define <2 x i8> @ashr_constant_op1(i8 %x) {
+; CHECK-LABEL: @ashr_constant_op1(
+; CHECK-NEXT:    [[INS:%.*]] = insertelement <2 x i8> poison, i8 [[X:%.*]], i32 0
+; CHECK-NEXT:    [[BO:%.*]] = ashr <2 x i8> [[INS]], <i8 5, i8 undef>
+; CHECK-NEXT:    ret <2 x i8> [[BO]]
+;
+  %ins = insertelement <2 x i8> poison, i8 %x, i32 0
+  %bo = ashr <2 x i8> %ins, <i8 5, i8 undef>
+  ret <2 x i8> %bo
+}
+
+define <2 x i8> @ashr_constant_op1_not_undef_lane(i8 %x) {
+; CHECK-LABEL: @ashr_constant_op1_not_undef_lane(
+; CHECK-NEXT:    [[INS:%.*]] = insertelement <2 x i8> poison, i8 [[X:%.*]], i32 0
+; CHECK-NEXT:    [[BO:%.*]] = ashr <2 x i8> [[INS]], <i8 5, i8 2>
+; CHECK-NEXT:    ret <2 x i8> [[BO]]
+;
+  %ins = insertelement <2 x i8> poison, i8 %x, i32 0
+  %bo = ashr <2 x i8> %ins, <i8 5, i8 2>
+  ret <2 x i8> %bo
+}
+
+define <2 x i8> @lshr_constant_op0(i8 %x) {
+; CHECK-LABEL: @lshr_constant_op0(
+; CHECK-NEXT:    [[INS:%.*]] = insertelement <2 x i8> poison, i8 [[X:%.*]], i32 0
+; CHECK-NEXT:    [[BO:%.*]] = lshr <2 x i8> <i8 5, i8 undef>, [[INS]]
+; CHECK-NEXT:    ret <2 x i8> [[BO]]
+;
+  %ins = insertelement <2 x i8> poison, i8 %x, i32 0
+  %bo = lshr <2 x i8> <i8 5, i8 undef>, %ins
+  ret <2 x i8> %bo
+}
+
+define <2 x i8> @lshr_constant_op0_not_undef_lane(i8 %x) {
+; CHECK-LABEL: @lshr_constant_op0_not_undef_lane(
+; CHECK-NEXT:    [[INS:%.*]] = insertelement <2 x i8> poison, i8 [[X:%.*]], i32 0
+; CHECK-NEXT:    [[BO:%.*]] = lshr <2 x i8> <i8 5, i8 2>, [[INS]]
+; CHECK-NEXT:    ret <2 x i8> [[BO]]
+;
+  %ins = insertelement <2 x i8> poison, i8 %x, i32 0
+  %bo = lshr <2 x i8> <i8 5, i8 2>, %ins
+  ret <2 x i8> %bo
+}
+
+define <2 x i8> @lshr_constant_op1(i8 %x) {
+; CHECK-LABEL: @lshr_constant_op1(
+; CHECK-NEXT:    [[INS:%.*]] = insertelement <2 x i8> poison, i8 [[X:%.*]], i32 1
+; CHECK-NEXT:    [[BO:%.*]] = lshr exact <2 x i8> [[INS]], <i8 undef, i8 2>
+; CHECK-NEXT:    ret <2 x i8> [[BO]]
+;
+  %ins = insertelement <2 x i8> poison, i8 %x, i32 1
+  %bo = lshr exact <2 x i8> %ins, <i8 undef, i8 2>
+  ret <2 x i8> %bo
+}
+
+define <2 x i8> @lshr_constant_op1_not_undef_lane(i8 %x) {
+; CHECK-LABEL: @lshr_constant_op1_not_undef_lane(
+; CHECK-NEXT:    [[INS:%.*]] = insertelement <2 x i8> poison, i8 [[X:%.*]], i32 1
+; CHECK-NEXT:    [[BO:%.*]] = lshr exact <2 x i8> [[INS]], <i8 5, i8 2>
+; CHECK-NEXT:    ret <2 x i8> [[BO]]
+;
+  %ins = insertelement <2 x i8> poison, i8 %x, i32 1
+  %bo = lshr exact <2 x i8> %ins, <i8 5, i8 2>
+  ret <2 x i8> %bo
+}
+
+define <2 x i8> @urem_constant_op0(i8 %x) {
+; CHECK-LABEL: @urem_constant_op0(
+; CHECK-NEXT:    [[INS:%.*]] = insertelement <2 x i8> poison, i8 [[X:%.*]], i32 0
+; CHECK-NEXT:    [[BO:%.*]] = urem <2 x i8> <i8 5, i8 undef>, [[INS]]
+; CHECK-NEXT:    ret <2 x i8> [[BO]]
+;
+  %ins = insertelement <2 x i8> poison, i8 %x, i32 0
+  %bo = urem <2 x i8> <i8 5, i8 undef>, %ins
+  ret <2 x i8> %bo
+}
+
+define <2 x i8> @urem_constant_op0_not_undef_lane(i8 %x) {
+; CHECK-LABEL: @urem_constant_op0_not_undef_lane(
+; CHECK-NEXT:    [[INS:%.*]] = insertelement <2 x i8> poison, i8 [[X:%.*]], i32 0
+; CHECK-NEXT:    [[BO:%.*]] = urem <2 x i8> <i8 5, i8 2>, [[INS]]
+; CHECK-NEXT:    ret <2 x i8> [[BO]]
+;
+  %ins = insertelement <2 x i8> poison, i8 %x, i32 0
+  %bo = urem <2 x i8> <i8 5, i8 2>, %ins
+  ret <2 x i8> %bo
+}
+
+define <2 x i8> @urem_constant_op1(i8 %x) {
+; CHECK-LABEL: @urem_constant_op1(
+; CHECK-NEXT:    ret <2 x i8> undef
+;
+  %ins = insertelement <2 x i8> poison, i8 %x, i32 1
+  %bo = urem <2 x i8> %ins, <i8 undef, i8 2>
+  ret <2 x i8> %bo
+}
+
+define <2 x i8> @urem_constant_op1_not_undef_lane(i8 %x) {
+; CHECK-LABEL: @urem_constant_op1_not_undef_lane(
+; CHECK-NEXT:    [[INS:%.*]] = insertelement <2 x i8> poison, i8 [[X:%.*]], i32 1
+; CHECK-NEXT:    [[BO:%.*]] = urem <2 x i8> [[INS]], <i8 5, i8 2>
+; CHECK-NEXT:    ret <2 x i8> [[BO]]
+;
+  %ins = insertelement <2 x i8> poison, i8 %x, i32 1
+  %bo = urem <2 x i8> %ins, <i8 5, i8 2>
+  ret <2 x i8> %bo
+}
+
+define <2 x i8> @srem_constant_op0(i8 %x) {
+; CHECK-LABEL: @srem_constant_op0(
+; CHECK-NEXT:    [[INS:%.*]] = insertelement <2 x i8> poison, i8 [[X:%.*]], i32 0
+; CHECK-NEXT:    [[BO:%.*]] = srem <2 x i8> <i8 5, i8 undef>, [[INS]]
+; CHECK-NEXT:    ret <2 x i8> [[BO]]
+;
+  %ins = insertelement <2 x i8> poison, i8 %x, i32 0
+  %bo = srem <2 x i8> <i8 5, i8 undef>, %ins
+  ret <2 x i8> %bo
+}
+
+define <2 x i8> @srem_constant_op0_not_undef_lane(i8 %x) {
+; CHECK-LABEL: @srem_constant_op0_not_undef_lane(
+; CHECK-NEXT:    [[INS:%.*]] = insertelement <2 x i8> poison, i8 [[X:%.*]], i32 0
+; CHECK-NEXT:    [[BO:%.*]] = srem <2 x i8> <i8 5, i8 2>, [[INS]]
+; CHECK-NEXT:    ret <2 x i8> [[BO]]
+;
+  %ins = insertelement <2 x i8> poison, i8 %x, i32 0
+  %bo = srem <2 x i8> <i8 5, i8 2>, %ins
+  ret <2 x i8> %bo
+}
+
+define <2 x i8> @srem_constant_op1(i8 %x) {
+; CHECK-LABEL: @srem_constant_op1(
+; CHECK-NEXT:    ret <2 x i8> undef
+;
+  %ins = insertelement <2 x i8> poison, i8 %x, i32 1
+  %bo = srem <2 x i8> %ins, <i8 undef, i8 2>
+  ret <2 x i8> %bo
+}
+
+define <2 x i8> @srem_constant_op1_not_undef_lane(i8 %x) {
+; CHECK-LABEL: @srem_constant_op1_not_undef_lane(
+; CHECK-NEXT:    [[INS:%.*]] = insertelement <2 x i8> poison, i8 [[X:%.*]], i32 1
+; CHECK-NEXT:    [[BO:%.*]] = srem <2 x i8> [[INS]], <i8 5, i8 2>
+; CHECK-NEXT:    ret <2 x i8> [[BO]]
+;
+  %ins = insertelement <2 x i8> poison, i8 %x, i32 1
+  %bo = srem <2 x i8> %ins, <i8 5, i8 2>
+  ret <2 x i8> %bo
+}
+
+define <2 x i8> @udiv_constant_op0(i8 %x) {
+; CHECK-LABEL: @udiv_constant_op0(
+; CHECK-NEXT:    [[INS:%.*]] = insertelement <2 x i8> poison, i8 [[X:%.*]], i32 0
+; CHECK-NEXT:    [[BO:%.*]] = udiv exact <2 x i8> <i8 5, i8 undef>, [[INS]]
+; CHECK-NEXT:    ret <2 x i8> [[BO]]
+;
+  %ins = insertelement <2 x i8> poison, i8 %x, i32 0
+  %bo = udiv exact <2 x i8> <i8 5, i8 undef>, %ins
+  ret <2 x i8> %bo
+}
+
+define <2 x i8> @udiv_constant_op0_not_undef_lane(i8 %x) {
+; CHECK-LABEL: @udiv_constant_op0_not_undef_lane(
+; CHECK-NEXT:    [[INS:%.*]] = insertelement <2 x i8> poison, i8 [[X:%.*]], i32 0
+; CHECK-NEXT:    [[BO:%.*]] = udiv exact <2 x i8> <i8 5, i8 2>, [[INS]]
+; CHECK-NEXT:    ret <2 x i8> [[BO]]
+;
+  %ins = insertelement <2 x i8> poison, i8 %x, i32 0
+  %bo = udiv exact <2 x i8> <i8 5, i8 2>, %ins
+  ret <2 x i8> %bo
+}
+
+define <2 x i8> @udiv_constant_op1(i8 %x) {
+; CHECK-LABEL: @udiv_constant_op1(
+; CHECK-NEXT:    ret <2 x i8> undef
+;
+  %ins = insertelement <2 x i8> poison, i8 %x, i32 1
+  %bo = udiv <2 x i8> %ins, <i8 undef, i8 2>
+  ret <2 x i8> %bo
+}
+
+define <2 x i8> @udiv_constant_op1_not_undef_lane(i8 %x) {
+; CHECK-LABEL: @udiv_constant_op1_not_undef_lane(
+; CHECK-NEXT:    [[INS:%.*]] = insertelement <2 x i8> poison, i8 [[X:%.*]], i32 1
+; CHECK-NEXT:    [[BO:%.*]] = udiv <2 x i8> [[INS]], <i8 5, i8 2>
+; CHECK-NEXT:    ret <2 x i8> [[BO]]
+;
+  %ins = insertelement <2 x i8> poison, i8 %x, i32 1
+  %bo = udiv <2 x i8> %ins, <i8 5, i8 2>
+  ret <2 x i8> %bo
+}
+
+define <2 x i8> @sdiv_constant_op0(i8 %x) {
+; CHECK-LABEL: @sdiv_constant_op0(
+; CHECK-NEXT:    [[INS:%.*]] = insertelement <2 x i8> poison, i8 [[X:%.*]], i32 0
+; CHECK-NEXT:    [[BO:%.*]] = sdiv <2 x i8> <i8 5, i8 undef>, [[INS]]
+; CHECK-NEXT:    ret <2 x i8> [[BO]]
+;
+  %ins = insertelement <2 x i8> poison, i8 %x, i32 0
+  %bo = sdiv <2 x i8> <i8 5, i8 undef>, %ins
+  ret <2 x i8> %bo
+}
+
+define <2 x i8> @sdiv_constant_op0_not_undef_lane(i8 %x) {
+; CHECK-LABEL: @sdiv_constant_op0_not_undef_lane(
+; CHECK-NEXT:    [[INS:%.*]] = insertelement <2 x i8> poison, i8 [[X:%.*]], i32 0
+; CHECK-NEXT:    [[BO:%.*]] = sdiv <2 x i8> <i8 5, i8 2>, [[INS]]
+; CHECK-NEXT:    ret <2 x i8> [[BO]]
+;
+  %ins = insertelement <2 x i8> poison, i8 %x, i32 0
+  %bo = sdiv <2 x i8> <i8 5, i8 2>, %ins
+  ret <2 x i8> %bo
+}
+
+define <2 x i8> @sdiv_constant_op1(i8 %x) {
+; CHECK-LABEL: @sdiv_constant_op1(
+; CHECK-NEXT:    ret <2 x i8> undef
+;
+  %ins = insertelement <2 x i8> poison, i8 %x, i32 1
+  %bo = sdiv exact <2 x i8> %ins, <i8 undef, i8 2>
+  ret <2 x i8> %bo
+}
+
+define <2 x i8> @sdiv_constant_op1_not_undef_lane(i8 %x) {
+; CHECK-LABEL: @sdiv_constant_op1_not_undef_lane(
+; CHECK-NEXT:    [[INS:%.*]] = insertelement <2 x i8> poison, i8 [[X:%.*]], i32 1
+; CHECK-NEXT:    [[BO:%.*]] = sdiv exact <2 x i8> [[INS]], <i8 5, i8 2>
+; CHECK-NEXT:    ret <2 x i8> [[BO]]
+;
+  %ins = insertelement <2 x i8> poison, i8 %x, i32 1
+  %bo = sdiv exact <2 x i8> %ins, <i8 5, i8 2>
+  ret <2 x i8> %bo
+}
+
+define <2 x i8> @and_constant(i8 %x) {
+; CHECK-LABEL: @and_constant(
+; CHECK-NEXT:    [[INS:%.*]] = insertelement <2 x i8> poison, i8 [[X:%.*]], i32 0
+; CHECK-NEXT:    [[BO:%.*]] = and <2 x i8> [[INS]], <i8 42, i8 undef>
+; CHECK-NEXT:    ret <2 x i8> [[BO]]
+;
+  %ins = insertelement <2 x i8> poison, i8 %x, i32 0
+  %bo = and <2 x i8> %ins, <i8 42, i8 undef>
+  ret <2 x i8> %bo
+}
+
+define <2 x i8> @and_constant_not_undef_lane(i8 %x) {
+; CHECK-LABEL: @and_constant_not_undef_lane(
+; CHECK-NEXT:    [[INS:%.*]] = insertelement <2 x i8> poison, i8 [[X:%.*]], i32 0
+; CHECK-NEXT:    [[BO:%.*]] = and <2 x i8> [[INS]], <i8 42, i8 -42>
+; CHECK-NEXT:    ret <2 x i8> [[BO]]
+;
+  %ins = insertelement <2 x i8> poison, i8 %x, i32 0
+  %bo = and <2 x i8> %ins, <i8 42, i8 -42>
+  ret <2 x i8> %bo
+}
+
+define <2 x i8> @or_constant(i8 %x) {
+; CHECK-LABEL: @or_constant(
+; CHECK-NEXT:    [[INS:%.*]] = insertelement <2 x i8> poison, i8 [[X:%.*]], i32 1
+; CHECK-NEXT:    [[BO:%.*]] = or <2 x i8> [[INS]], <i8 undef, i8 -42>
+; CHECK-NEXT:    ret <2 x i8> [[BO]]
+;
+  %ins = insertelement <2 x i8> poison, i8 %x, i32 1
+  %bo = or <2 x i8> %ins, <i8 undef, i8 -42>
+  ret <2 x i8> %bo
+}
+
+define <2 x i8> @or_constant_not_undef_lane(i8 %x) {
+; CHECK-LABEL: @or_constant_not_undef_lane(
+; CHECK-NEXT:    [[INS:%.*]] = insertelement <2 x i8> poison, i8 [[X:%.*]], i32 1
+; CHECK-NEXT:    [[BO:%.*]] = or <2 x i8> [[INS]], <i8 42, i8 -42>
+; CHECK-NEXT:    ret <2 x i8> [[BO]]
+;
+  %ins = insertelement <2 x i8> poison, i8 %x, i32 1
+  %bo = or <2 x i8> %ins, <i8 42, i8 -42>
+  ret <2 x i8> %bo
+}
+
+define <2 x i8> @xor_constant(i8 %x) {
+; CHECK-LABEL: @xor_constant(
+; CHECK-NEXT:    [[INS:%.*]] = insertelement <2 x i8> poison, i8 [[X:%.*]], i32 0
+; CHECK-NEXT:    [[BO:%.*]] = xor <2 x i8> [[INS]], <i8 42, i8 undef>
+; CHECK-NEXT:    ret <2 x i8> [[BO]]
+;
+  %ins = insertelement <2 x i8> poison, i8 %x, i32 0
+  %bo = xor <2 x i8> %ins, <i8 42, i8 undef>
+  ret <2 x i8> %bo
+}
+
+define <2 x i8> @xor_constant_not_undef_lane(i8 %x) {
+; CHECK-LABEL: @xor_constant_not_undef_lane(
+; CHECK-NEXT:    [[INS:%.*]] = insertelement <2 x i8> poison, i8 [[X:%.*]], i32 0
+; CHECK-NEXT:    [[BO:%.*]] = xor <2 x i8> [[INS]], <i8 42, i8 -42>
+; CHECK-NEXT:    ret <2 x i8> [[BO]]
+;
+  %ins = insertelement <2 x i8> poison, i8 %x, i32 0
+  %bo = xor <2 x i8> %ins, <i8 42, i8 -42>
+  ret <2 x i8> %bo
+}
+
+define <2 x float> @fadd_constant(float %x) {
+; CHECK-LABEL: @fadd_constant(
+; CHECK-NEXT:    [[INS:%.*]] = insertelement <2 x float> poison, float [[X:%.*]], i32 0
+; CHECK-NEXT:    [[BO:%.*]] = fadd <2 x float> [[INS]], <float 4.200000e+01, float undef>
+; CHECK-NEXT:    ret <2 x float> [[BO]]
+;
+  %ins = insertelement <2 x float> poison, float %x, i32 0
+  %bo = fadd <2 x float> %ins, <float 42.0, float undef>
+  ret <2 x float> %bo
+}
+
+define <2 x float> @fadd_constant_not_undef_lane(float %x) {
+; CHECK-LABEL: @fadd_constant_not_undef_lane(
+; CHECK-NEXT:    [[INS:%.*]] = insertelement <2 x float> poison, float [[X:%.*]], i32 1
+; CHECK-NEXT:    [[BO:%.*]] = fadd <2 x float> [[INS]], <float 4.200000e+01, float -4.200000e+01>
+; CHECK-NEXT:    ret <2 x float> [[BO]]
+;
+  %ins = insertelement <2 x float> poison, float %x, i32 1
+  %bo = fadd <2 x float> %ins, <float 42.0, float -42.0>
+  ret <2 x float> %bo
+}
+
+define <2 x float> @fsub_constant_op0(float %x) {
+; CHECK-LABEL: @fsub_constant_op0(
+; CHECK-NEXT:    [[INS:%.*]] = insertelement <2 x float> poison, float [[X:%.*]], i32 0
+; CHECK-NEXT:    [[BO:%.*]] = fsub fast <2 x float> <float 4.200000e+01, float undef>, [[INS]]
+; CHECK-NEXT:    ret <2 x float> [[BO]]
+;
+  %ins = insertelement <2 x float> poison, float %x, i32 0
+  %bo = fsub fast <2 x float> <float 42.0, float undef>, %ins
+  ret <2 x float> %bo
+}
+
+define <2 x float> @fsub_constant_op0_not_undef_lane(float %x) {
+; CHECK-LABEL: @fsub_constant_op0_not_undef_lane(
+; CHECK-NEXT:    [[INS:%.*]] = insertelement <2 x float> poison, float [[X:%.*]], i32 1
+; CHECK-NEXT:    [[BO:%.*]] = fsub nsz <2 x float> <float 4.200000e+01, float -4.200000e+01>, [[INS]]
+; CHECK-NEXT:    ret <2 x float> [[BO]]
+;
+  %ins = insertelement <2 x float> poison, float %x, i32 1
+  %bo = fsub nsz <2 x float> <float 42.0, float -42.0>, %ins
+  ret <2 x float> %bo
+}
+
+define <2 x float> @fsub_constant_op1(float %x) {
+; CHECK-LABEL: @fsub_constant_op1(
+; CHECK-NEXT:    [[INS:%.*]] = insertelement <2 x float> poison, float [[X:%.*]], i32 1
+; CHECK-NEXT:    [[BO:%.*]] = fadd <2 x float> [[INS]], <float undef, float -4.200000e+01>
+; CHECK-NEXT:    ret <2 x float> [[BO]]
+;
+  %ins = insertelement <2 x float> poison, float %x, i32 1
+  %bo = fsub <2 x float> %ins, <float undef, float 42.0>
+  ret <2 x float> %bo
+}
+
+define <2 x float> @fsub_constant_op1_not_undef_lane(float %x) {
+; CHECK-LABEL: @fsub_constant_op1_not_undef_lane(
+; CHECK-NEXT:    [[INS:%.*]] = insertelement <2 x float> poison, float [[X:%.*]], i32 0
+; CHECK-NEXT:    [[BO:%.*]] = fadd <2 x float> [[INS]], <float -4.200000e+01, float 4.200000e+01>
+; CHECK-NEXT:    ret <2 x float> [[BO]]
+;
+  %ins = insertelement <2 x float> poison, float %x, i32 0
+  %bo = fsub <2 x float> %ins, <float 42.0, float -42.0>
+  ret <2 x float> %bo
+}
+
+define <2 x float> @fmul_constant(float %x) {
+; CHECK-LABEL: @fmul_constant(
+; CHECK-NEXT:    [[INS:%.*]] = insertelement <2 x float> poison, float [[X:%.*]], i32 0
+; CHECK-NEXT:    [[BO:%.*]] = fmul reassoc <2 x float> [[INS]], <float 4.200000e+01, float undef>
+; CHECK-NEXT:    ret <2 x float> [[BO]]
+;
+  %ins = insertelement <2 x float> poison, float %x, i32 0
+  %bo = fmul reassoc <2 x float> %ins, <float 42.0, float undef>
+  ret <2 x float> %bo
+}
+
+define <2 x float> @fmul_constant_not_undef_lane(float %x) {
+; CHECK-LABEL: @fmul_constant_not_undef_lane(
+; CHECK-NEXT:    [[INS:%.*]] = insertelement <2 x float> poison, float [[X:%.*]], i32 1
+; CHECK-NEXT:    [[BO:%.*]] = fmul <2 x float> [[INS]], <float 4.200000e+01, float -4.200000e+01>
+; CHECK-NEXT:    ret <2 x float> [[BO]]
+;
+  %ins = insertelement <2 x float> poison, float %x, i32 1
+  %bo = fmul <2 x float> %ins, <float 42.0, float -42.0>
+  ret <2 x float> %bo
+}
+
+define <2 x float> @fdiv_constant_op0(float %x) {
+; CHECK-LABEL: @fdiv_constant_op0(
+; CHECK-NEXT:    [[INS:%.*]] = insertelement <2 x float> poison, float [[X:%.*]], i32 1
+; CHECK-NEXT:    [[BO:%.*]] = fdiv nnan <2 x float> <float undef, float 4.200000e+01>, [[INS]]
+; CHECK-NEXT:    ret <2 x float> [[BO]]
+;
+  %ins = insertelement <2 x float> poison, float %x, i32 1
+  %bo = fdiv nnan <2 x float> <float undef, float 42.0>, %ins
+  ret <2 x float> %bo
+}
+
+define <2 x float> @fdiv_constant_op0_not_undef_lane(float %x) {
+; CHECK-LABEL: @fdiv_constant_op0_not_undef_lane(
+; CHECK-NEXT:    [[INS:%.*]] = insertelement <2 x float> poison, float [[X:%.*]], i32 0
+; CHECK-NEXT:    [[BO:%.*]] = fdiv ninf <2 x float> <float 4.200000e+01, float -4.200000e+01>, [[INS]]
+; CHECK-NEXT:    ret <2 x float> [[BO]]
+;
+  %ins = insertelement <2 x float> poison, float %x, i32 0
+  %bo = fdiv ninf <2 x float> <float 42.0, float -42.0>, %ins
+  ret <2 x float> %bo
+}
+
+define <2 x float> @fdiv_constant_op1(float %x) {
+; CHECK-LABEL: @fdiv_constant_op1(
+; CHECK-NEXT:    [[INS:%.*]] = insertelement <2 x float> poison, float [[X:%.*]], i32 0
+; CHECK-NEXT:    [[BO:%.*]] = fdiv <2 x float> [[INS]], <float 4.200000e+01, float undef>
+; CHECK-NEXT:    ret <2 x float> [[BO]]
+;
+  %ins = insertelement <2 x float> poison, float %x, i32 0
+  %bo = fdiv <2 x float> %ins, <float 42.0, float undef>
+  ret <2 x float> %bo
+}
+
+define <2 x float> @fdiv_constant_op1_not_undef_lane(float %x) {
+; CHECK-LABEL: @fdiv_constant_op1_not_undef_lane(
+; CHECK-NEXT:    [[INS:%.*]] = insertelement <2 x float> poison, float [[X:%.*]], i32 0
+; CHECK-NEXT:    [[BO:%.*]] = fdiv <2 x float> [[INS]], <float 4.200000e+01, float -4.200000e+01>
+; CHECK-NEXT:    ret <2 x float> [[BO]]
+;
+  %ins = insertelement <2 x float> poison, float %x, i32 0
+  %bo = fdiv <2 x float> %ins, <float 42.0, float -42.0>
+  ret <2 x float> %bo
+}
+
+define <2 x float> @frem_constant_op0(float %x) {
+; CHECK-LABEL: @frem_constant_op0(
+; CHECK-NEXT:    [[INS:%.*]] = insertelement <2 x float> poison, float [[X:%.*]], i32 0
+; CHECK-NEXT:    [[BO:%.*]] = frem fast <2 x float> <float 4.200000e+01, float undef>, [[INS]]
+; CHECK-NEXT:    ret <2 x float> [[BO]]
+;
+  %ins = insertelement <2 x float> poison, float %x, i32 0
+  %bo = frem fast <2 x float> <float 42.0, float undef>, %ins
+  ret <2 x float> %bo
+}
+
+define <2 x float> @frem_constant_op0_not_undef_lane(float %x) {
+; CHECK-LABEL: @frem_constant_op0_not_undef_lane(
+; CHECK-NEXT:    [[INS:%.*]] = insertelement <2 x float> poison, float [[X:%.*]], i32 1
+; CHECK-NEXT:    [[BO:%.*]] = frem <2 x float> <float 4.200000e+01, float -4.200000e+01>, [[INS]]
+; CHECK-NEXT:    ret <2 x float> [[BO]]
+;
+  %ins = insertelement <2 x float> poison, float %x, i32 1
+  %bo = frem <2 x float> <float 42.0, float -42.0>, %ins
+  ret <2 x float> %bo
+}
+
+define <2 x float> @frem_constant_op1(float %x) {
+; CHECK-LABEL: @frem_constant_op1(
+; CHECK-NEXT:    [[INS:%.*]] = insertelement <2 x float> poison, float [[X:%.*]], i32 1
+; CHECK-NEXT:    [[BO:%.*]] = frem ninf <2 x float> [[INS]], <float undef, float 4.200000e+01>
+; CHECK-NEXT:    ret <2 x float> [[BO]]
+;
+  %ins = insertelement <2 x float> poison, float %x, i32 1
+  %bo = frem ninf <2 x float> %ins, <float undef, float 42.0>
+  ret <2 x float> %bo
+}
+
+define <2 x float> @frem_constant_op1_not_undef_lane(float %x) {
+; CHECK-LABEL: @frem_constant_op1_not_undef_lane(
+; CHECK-NEXT:    [[INS:%.*]] = insertelement <2 x float> poison, float [[X:%.*]], i32 0
+; CHECK-NEXT:    [[BO:%.*]] = frem nnan <2 x float> [[INS]], <float 4.200000e+01, float -4.200000e+01>
+; CHECK-NEXT:    ret <2 x float> [[BO]]
+;
+  %ins = insertelement <2 x float> poison, float %x, i32 0
+  %bo = frem nnan <2 x float> %ins, <float 42.0, float -42.0>
+  ret <2 x float> %bo
+}
+
--- a/llvm/test/Transforms/InstCombine/insert-extract-shuffle-inseltpoison.ll
+++ b/llvm/test/Transforms/InstCombine/insert-extract-shuffle-inseltpoison.ll
@ -0,0 +1,735 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -instcombine %s | FileCheck %s
+
+define <1 x i8> @test1(<8 x i8> %in) {
+; CHECK-LABEL: @test1(
+; CHECK-NEXT:    [[VEC:%.*]] = shufflevector <8 x i8> [[IN:%.*]], <8 x i8> undef, <1 x i32> <i32 5>
+; CHECK-NEXT:    ret <1 x i8> [[VEC]]
+;
+  %val = extractelement <8 x i8> %in, i32 5
+  %vec = insertelement <1 x i8> poison, i8 %val, i32 0
+  ret <1 x i8> %vec
+}
+
+define <4 x i16> @test2(<8 x i16> %in, <8 x i16> %in2) {
+; CHECK-LABEL: @test2(
+; CHECK-NEXT:    [[VEC_3:%.*]] = shufflevector <8 x i16> [[IN2:%.*]], <8 x i16> [[IN:%.*]], <4 x i32> <i32 11, i32 9, i32 0, i32 10>
+; CHECK-NEXT:    ret <4 x i16> [[VEC_3]]
+;
+  %elt0 = extractelement <8 x i16> %in, i32 3
+  %elt1 = extractelement <8 x i16> %in, i32 1
+  %elt2 = extractelement <8 x i16> %in2, i32 0
+  %elt3 = extractelement <8 x i16> %in, i32 2
+
+  %vec.0 = insertelement <4 x i16> poison, i16 %elt0, i32 0
+  %vec.1 = insertelement <4 x i16> %vec.0, i16 %elt1, i32 1
+  %vec.2 = insertelement <4 x i16> %vec.1, i16 %elt2, i32 2
+  %vec.3 = insertelement <4 x i16> %vec.2, i16 %elt3, i32 3
+
+  ret <4 x i16> %vec.3
+}
+
+define <2 x i64> @test_vcopyq_lane_p64(<2 x i64> %a, <1 x i64> %b) {
+; CHECK-LABEL: @test_vcopyq_lane_p64(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <1 x i64> [[B:%.*]], <1 x i64> undef, <2 x i32> <i32 0, i32 undef>
+; CHECK-NEXT:    [[RES:%.*]] = shufflevector <2 x i64> [[A:%.*]], <2 x i64> [[TMP1]], <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    ret <2 x i64> [[RES]]
+;
+  %elt = extractelement <1 x i64> %b, i32 0
+  %res = insertelement <2 x i64> %a, i64 %elt, i32 1
+  ret <2 x i64> %res
+}
+
+; PR2109: https://llvm.org/bugs/show_bug.cgi?id=2109
+
+define <4 x float> @widen_extract2(<4 x float> %ins, <2 x float> %ext) {
+; CHECK-LABEL: @widen_extract2(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x float> [[EXT:%.*]], <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+; CHECK-NEXT:    [[I2:%.*]] = shufflevector <4 x float> [[INS:%.*]], <4 x float> [[TMP1]], <4 x i32> <i32 0, i32 4, i32 2, i32 5>
+; CHECK-NEXT:    ret <4 x float> [[I2]]
+;
+  %e1 = extractelement <2 x float> %ext, i32 0
+  %e2 = extractelement <2 x float> %ext, i32 1
+  %i1 = insertelement <4 x float> %ins, float %e1, i32 1
+  %i2 = insertelement <4 x float> %i1, float %e2, i32 3
+  ret <4 x float> %i2
+}
+
+define <4 x float> @widen_extract3(<4 x float> %ins, <3 x float> %ext) {
+; CHECK-LABEL: @widen_extract3(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <3 x float> [[EXT:%.*]], <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 undef>
+; CHECK-NEXT:    [[I3:%.*]] = shufflevector <4 x float> [[INS:%.*]], <4 x float> [[TMP1]], <4 x i32> <i32 6, i32 5, i32 4, i32 3>
+; CHECK-NEXT:    ret <4 x float> [[I3]]
+;
+  %e1 = extractelement <3 x float> %ext, i32 0
+  %e2 = extractelement <3 x float> %ext, i32 1
+  %e3 = extractelement <3 x float> %ext, i32 2
+  %i1 = insertelement <4 x float> %ins, float %e1, i32 2
+  %i2 = insertelement <4 x float> %i1, float %e2, i32 1
+  %i3 = insertelement <4 x float> %i2, float %e3, i32 0
+  ret <4 x float> %i3
+}
+
+define <8 x float> @widen_extract4(<8 x float> %ins, <2 x float> %ext) {
+; CHECK-LABEL: @widen_extract4(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x float> [[EXT:%.*]], <2 x float> undef, <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[I1:%.*]] = shufflevector <8 x float> [[INS:%.*]], <8 x float> [[TMP1]], <8 x i32> <i32 0, i32 1, i32 8, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    ret <8 x float> [[I1]]
+;
+  %e1 = extractelement <2 x float> %ext, i32 0
+  %i1 = insertelement <8 x float> %ins, float %e1, i32 2
+  ret <8 x float> %i1
+}
+
+; PR26015: https://llvm.org/bugs/show_bug.cgi?id=26015
+; The widening shuffle must be inserted before any uses.
+
+define <8 x i16> @pr26015(<4 x i16> %t0) {
+; CHECK-LABEL: @pr26015(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i16> [[T0:%.*]], <4 x i16> undef, <8 x i32> <i32 undef, i32 undef, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[T5:%.*]] = shufflevector <8 x i16> <i16 0, i16 0, i16 0, i16 undef, i16 0, i16 0, i16 0, i16 undef>, <8 x i16> [[TMP1]], <8 x i32> <i32 0, i32 1, i32 2, i32 10, i32 4, i32 5, i32 6, i32 11>
+; CHECK-NEXT:    ret <8 x i16> [[T5]]
+;
+  %t1 = extractelement <4 x i16> %t0, i32 2
+  %t2 = insertelement <8 x i16> zeroinitializer, i16 %t1, i32 3
+  %t3 = insertelement <8 x i16> %t2, i16 0, i32 6
+  %t4 = extractelement <4 x i16> %t0, i32 3
+  %t5 = insertelement <8 x i16> %t3, i16 %t4, i32 7
+  ret <8 x i16> %t5
+}
+
+; PR25999: https://llvm.org/bugs/show_bug.cgi?id=25999
+; TODO: The widening shuffle could be inserted at the start of the function to allow the first extract to use it.
+
+define <8 x i16> @pr25999(<4 x i16> %t0, i1 %b) {
+; CHECK-LABEL: @pr25999(
+; CHECK-NEXT:    [[T1:%.*]] = extractelement <4 x i16> [[T0:%.*]], i32 2
+; CHECK-NEXT:    br i1 [[B:%.*]], label [[IF:%.*]], label [[END:%.*]]
+; CHECK:       if:
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i16> [[T0]], <4 x i16> undef, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[T3:%.*]] = insertelement <8 x i16> <i16 0, i16 0, i16 0, i16 undef, i16 0, i16 0, i16 0, i16 undef>, i16 [[T1]], i32 3
+; CHECK-NEXT:    [[T5:%.*]] = shufflevector <8 x i16> [[T3]], <8 x i16> [[TMP1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 11>
+; CHECK-NEXT:    ret <8 x i16> [[T5]]
+; CHECK:       end:
+; CHECK-NEXT:    [[A1:%.*]] = add i16 [[T1]], 4
+; CHECK-NEXT:    [[T6:%.*]] = insertelement <8 x i16> <i16 undef, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, i16 [[A1]], i32 0
+; CHECK-NEXT:    ret <8 x i16> [[T6]]
+;
+
+  %t1 = extractelement <4 x i16> %t0, i32 2
+  br i1 %b, label %if, label %end
+
+if:
+  %t2 = insertelement <8 x i16> zeroinitializer, i16 %t1, i32 3
+  %t3 = insertelement <8 x i16> %t2, i16 0, i32 6
+  %t4 = extractelement <4 x i16> %t0, i32 3
+  %t5 = insertelement <8 x i16> %t3, i16 %t4, i32 7
+  ret <8 x i16> %t5
+
+end:
+  %a1 = add i16 %t1, 4
+  %t6 = insertelement <8 x i16> zeroinitializer, i16 %a1, i32 0
+  ret <8 x i16> %t6
+}
+
+; The widening shuffle must be inserted at a valid point (after the PHIs).
+
+define <4 x double> @pr25999_phis1(i1 %c, <2 x double> %a, <4 x double> %b) {
+; CHECK-LABEL: @pr25999_phis1(
+; CHECK-NEXT:  bb1:
+; CHECK-NEXT:    br i1 [[C:%.*]], label [[BB2:%.*]], label [[BB3:%.*]]
+; CHECK:       bb2:
+; CHECK-NEXT:    [[R:%.*]] = call <2 x double> @dummy(<2 x double> [[A:%.*]])
+; CHECK-NEXT:    br label [[BB3]]
+; CHECK:       bb3:
+; CHECK-NEXT:    [[TMP1:%.*]] = phi <2 x double> [ [[A]], [[BB1:%.*]] ], [ [[R]], [[BB2]] ]
+; CHECK-NEXT:    [[TMP2:%.*]] = phi <4 x double> [ [[B:%.*]], [[BB1]] ], [ zeroinitializer, [[BB2]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x double> [[TMP2]], <4 x double> [[TMP0]], <4 x i32> <i32 0, i32 1, i32 4, i32 3>
+; CHECK-NEXT:    ret <4 x double> [[TMP4]]
+;
+bb1:
+  br i1 %c, label %bb2, label %bb3
+
+bb2:
+  %r = call <2 x double> @dummy(<2 x double> %a)
+  br label %bb3
+
+bb3:
+  %tmp1 = phi <2 x double> [ %a, %bb1 ], [ %r, %bb2 ]
+  %tmp2 = phi <4 x double> [ %b, %bb1 ], [ zeroinitializer, %bb2 ]
+  %tmp3 = extractelement <2 x double> %tmp1, i32 0
+  %tmp4 = insertelement <4 x double> %tmp2, double %tmp3, i32 2
+  ret <4 x double> %tmp4
+}
+
+declare <2 x double> @dummy(<2 x double>)
+
+define <4 x double> @pr25999_phis2(i1 %c, <2 x double> %a, <4 x double> %b) {
+; CHECK-LABEL: @pr25999_phis2(
+; CHECK-NEXT:  bb1:
+; CHECK-NEXT:    br i1 [[C:%.*]], label [[BB2:%.*]], label [[BB3:%.*]]
+; CHECK:       bb2:
+; CHECK-NEXT:    [[R:%.*]] = call <2 x double> @dummy(<2 x double> [[A:%.*]])
+; CHECK-NEXT:    br label [[BB3]]
+; CHECK:       bb3:
+; CHECK-NEXT:    [[TMP1:%.*]] = phi <2 x double> [ [[A]], [[BB1:%.*]] ], [ [[R]], [[BB2]] ]
+; CHECK-NEXT:    [[TMP2:%.*]] = phi <4 x double> [ [[B:%.*]], [[BB1]] ], [ zeroinitializer, [[BB2]] ]
+; CHECK-NEXT:    [[D:%.*]] = fadd <2 x double> [[TMP1]], [[TMP1]]
+; CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <2 x double> [[D]], <2 x double> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x double> [[TMP2]], <4 x double> [[TMP0]], <4 x i32> <i32 0, i32 1, i32 4, i32 3>
+; CHECK-NEXT:    ret <4 x double> [[TMP4]]
+;
+bb1:
+  br i1 %c, label %bb2, label %bb3
+
+bb2:
+  %r = call <2 x double> @dummy(<2 x double> %a)
+  br label %bb3
+
+bb3:
+  %tmp1 = phi <2 x double> [ %a, %bb1 ], [ %r, %bb2 ]
+  %tmp2 = phi <4 x double> [ %b, %bb1 ], [ zeroinitializer, %bb2 ]
+  %d = fadd <2 x double> %tmp1, %tmp1
+  %tmp3 = extractelement <2 x double> %d, i32 0
+  %tmp4 = insertelement <4 x double> %tmp2, double %tmp3, i32 2
+  ret <4 x double> %tmp4
+}
+
+; PR26354: https://llvm.org/bugs/show_bug.cgi?id=26354
+; Don't create a shufflevector if we know that we're not going to replace the insertelement.
+
+define double @pr26354(<2 x double>* %tmp, i1 %B) {
+; CHECK-LABEL: @pr26354(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[LD:%.*]] = load <2 x double>, <2 x double>* [[TMP:%.*]], align 16
+; CHECK-NEXT:    br i1 [[B:%.*]], label [[IF:%.*]], label [[END:%.*]]
+; CHECK:       if:
+; CHECK-NEXT:    [[E2:%.*]] = extractelement <2 x double> [[LD]], i32 1
+; CHECK-NEXT:    [[I1:%.*]] = insertelement <4 x double> <double 0.000000e+00, double 0.000000e+00, double 0.000000e+00, double undef>, double [[E2]], i32 3
+; CHECK-NEXT:    br label [[END]]
+; CHECK:       end:
+; CHECK-NEXT:    [[PH:%.*]] = phi <4 x double> [ undef, [[ENTRY:%.*]] ], [ [[I1]], [[IF]] ]
+; CHECK-NEXT:    [[E1:%.*]] = extractelement <2 x double> [[LD]], i32 0
+; CHECK-NEXT:    [[E3:%.*]] = extractelement <4 x double> [[PH]], i32 1
+; CHECK-NEXT:    [[MU:%.*]] = fmul double [[E1]], [[E3]]
+; CHECK-NEXT:    ret double [[MU]]
+;
+
+entry:
+  %ld = load <2 x double>, <2 x double>* %tmp
+  %e1 = extractelement <2 x double> %ld, i32 0
+  %e2 = extractelement <2 x double> %ld, i32 1
+  br i1 %B, label %if, label %end
+
+if:
+  %i1 = insertelement <4 x double> zeroinitializer, double %e2, i32 3
+  br label %end
+
+end:
+  %ph = phi <4 x double> [ undef, %entry ], [ %i1, %if ]
+  %e3 = extractelement <4 x double> %ph, i32 1
+  %mu = fmul double %e1, %e3
+  ret double %mu
+}
+
+; https://llvm.org/bugs/show_bug.cgi?id=30923
+; Delete the widening shuffle if we're not going to reduce the extract/insert to a shuffle.
+
+define <4 x float> @PR30923(<2 x float> %x) {
+; CHECK-LABEL: @PR30923(
+; CHECK-NEXT:  bb1:
+; CHECK-NEXT:    [[EXT1:%.*]] = extractelement <2 x float> [[X:%.*]], i32 1
+; CHECK-NEXT:    store float [[EXT1]], float* undef, align 4
+; CHECK-NEXT:    br label [[BB2:%.*]]
+; CHECK:       bb2:
+; CHECK-NEXT:    [[EXT2:%.*]] = extractelement <2 x float> [[X]], i32 0
+; CHECK-NEXT:    [[INS1:%.*]] = insertelement <4 x float> <float 0.000000e+00, float 0.000000e+00, float undef, float undef>, float [[EXT2]], i32 2
+; CHECK-NEXT:    [[INS2:%.*]] = insertelement <4 x float> [[INS1]], float [[EXT1]], i32 3
+; CHECK-NEXT:    ret <4 x float> [[INS2]]
+;
+bb1:
+  %ext1 = extractelement <2 x float> %x, i32 1
+  store float %ext1, float* undef, align 4
+  br label %bb2
+
+bb2:
+  %widen = shufflevector <2 x float> %x, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+  %ext2 = extractelement <4 x float> %widen, i32 0
+  %ins1 = insertelement <4 x float> <float 0.0, float 0.0, float undef, float undef>, float %ext2, i32 2
+  %ins2 = insertelement <4 x float> %ins1, float %ext1, i32 3
+  ret <4 x float> %ins2
+}
+
+; Don't insert extractelements from the wider vector before the def of the index operand.
+
+define <4 x i32> @extractelt_insertion(<2 x i32> %x, i32 %y) {
+; CHECK-LABEL: @extractelt_insertion(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <2 x i32> [[X:%.*]], <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+; CHECK-NEXT:    [[B:%.*]] = shufflevector <4 x i32> <i32 0, i32 0, i32 0, i32 undef>, <4 x i32> [[TMP0]], <4 x i32> <i32 0, i32 1, i32 2, i32 5>
+; CHECK-NEXT:    [[C:%.*]] = add i32 [[Y:%.*]], 3
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x i32> [[TMP0]], i32 [[C]]
+; CHECK-NEXT:    [[E:%.*]] = icmp eq i32 [[TMP1]], 0
+; CHECK-NEXT:    [[RET:%.*]] = select i1 [[E]], <4 x i32> [[B]], <4 x i32> zeroinitializer
+; CHECK-NEXT:    ret <4 x i32> [[RET]]
+;
+entry:
+  %a = extractelement <2 x i32> %x, i32 1
+  %b = insertelement <4 x i32> zeroinitializer, i32 %a, i64 3
+  %c = add i32 %y, 3
+  %d = extractelement <2 x i32> %x, i32 %c
+  %e = icmp eq i32 %d, 0
+  %ret = select i1 %e, <4 x i32> %b, <4 x i32> zeroinitializer
+  ret <4 x i32> %ret
+}
+
+; PR34724: https://bugs.llvm.org/show_bug.cgi?id=34724
+
+define <4 x float> @collectShuffleElts(<2 x float> %x, float %y) {
+; CHECK-LABEL: @collectShuffleElts(
+; CHECK-NEXT:    [[X0:%.*]] = extractelement <2 x float> [[X:%.*]], i32 0
+; CHECK-NEXT:    [[X1:%.*]] = extractelement <2 x float> [[X]], i32 1
+; CHECK-NEXT:    [[V1:%.*]] = insertelement <4 x float> poison, float [[X0]], i32 1
+; CHECK-NEXT:    [[V2:%.*]] = insertelement <4 x float> [[V1]], float [[X1]], i32 2
+; CHECK-NEXT:    [[V3:%.*]] = insertelement <4 x float> [[V2]], float [[Y:%.*]], i32 3
+; CHECK-NEXT:    ret <4 x float> [[V3]]
+;
+  %x0 = extractelement <2 x float> %x, i32 0
+  %x1 = extractelement <2 x float> %x, i32 1
+  %v1 = insertelement <4 x float> poison, float %x0, i32 1
+  %v2 = insertelement <4 x float> %v1, float %x1, i32 2
+  %v3 = insertelement <4 x float> %v2, float %y, i32 3
+  ret <4 x float> %v3
+}
+
+; Simplest case - insert scalar into undef, then shuffle that value in place into another vector.
+
+define <4 x float> @insert_shuffle(float %x, <4 x float> %y) {
+; CHECK-LABEL: @insert_shuffle(
+; CHECK-NEXT:    [[R:%.*]] = insertelement <4 x float> [[Y:%.*]], float [[X:%.*]], i32 0
+; CHECK-NEXT:    ret <4 x float> [[R]]
+;
+  %xv = insertelement <4 x float> poison, float %x, i32 0
+  %r = shufflevector <4 x float> %xv, <4 x float> %y, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
+  ret <4 x float> %r
+}
+
+; Insert scalar into some element of a dummy vector, then move it to a different element in another vector.
+
+define <4 x float> @insert_shuffle_translate(float %x, <4 x float> %y) {
+; CHECK-LABEL: @insert_shuffle_translate(
+; CHECK-NEXT:    [[R:%.*]] = insertelement <4 x float> [[Y:%.*]], float [[X:%.*]], i32 1
+; CHECK-NEXT:    ret <4 x float> [[R]]
+;
+  %xv = insertelement <4 x float> poison, float %x, i32 0
+  %r = shufflevector <4 x float> %xv, <4 x float> %y, <4 x i32> <i32 4, i32 0, i32 6, i32 7>
+  ret <4 x float> %r
+}
+
+; The vector operand of the insert is irrelevant.
+
+define <4 x float> @insert_not_undef_shuffle_translate(float %x, <4 x float> %y, <4 x float> %q) {
+; CHECK-LABEL: @insert_not_undef_shuffle_translate(
+; CHECK-NEXT:    [[R:%.*]] = insertelement <4 x float> [[Y:%.*]], float [[X:%.*]], i32 2
+; CHECK-NEXT:    ret <4 x float> [[R]]
+;
+  %xv = insertelement <4 x float> %q, float %x, i32 3
+  %r = shufflevector <4 x float> %xv, <4 x float> %y, <4 x i32> <i32 4, i32 5, i32 3, i32 7>
+  ret <4 x float> %r
+}
+
+; The insert may be the 2nd operand of the shuffle. The shuffle mask can include undef elements.
+
+define <4 x float> @insert_not_undef_shuffle_translate_commute(float %x, <4 x float> %y, <4 x float> %q) {
+; CHECK-LABEL: @insert_not_undef_shuffle_translate_commute(
+; CHECK-NEXT:    [[R:%.*]] = insertelement <4 x float> [[Y:%.*]], float [[X:%.*]], i32 1
+; CHECK-NEXT:    ret <4 x float> [[R]]
+;
+  %xv = insertelement <4 x float> %q, float %x, i32 2
+  %r = shufflevector <4 x float> %y, <4 x float> %xv, <4 x i32> <i32 0, i32 6, i32 2, i32 undef>
+  ret <4 x float> %r
+}
+
+; Both shuffle operands may be inserts - choose the correct side.
+
+define <4 x float> @insert_insert_shuffle_translate(float %x1, float %x2, <4 x float> %q) {
+; CHECK-LABEL: @insert_insert_shuffle_translate(
+; CHECK-NEXT:    [[XV2:%.*]] = insertelement <4 x float> [[Q:%.*]], float [[X2:%.*]], i32 2
+; CHECK-NEXT:    [[R:%.*]] = insertelement <4 x float> [[XV2]], float [[X1:%.*]], i32 1
+; CHECK-NEXT:    ret <4 x float> [[R]]
+;
+  %xv1 = insertelement <4 x float> %q, float %x1, i32 0
+  %xv2 = insertelement <4 x float> %q, float %x2, i32 2
+  %r = shufflevector <4 x float> %xv1, <4 x float> %xv2, <4 x i32> <i32 4, i32 0, i32 6, i32 7>
+  ret <4 x float> %r
+}
+
+; Both shuffle operands may be inserts - choose the correct side.
+
+define <4 x float> @insert_insert_shuffle_translate_commute(float %x1, float %x2, <4 x float> %q) {
+; CHECK-LABEL: @insert_insert_shuffle_translate_commute(
+; CHECK-NEXT:    [[XV1:%.*]] = insertelement <4 x float> [[Q:%.*]], float [[X1:%.*]], i32 0
+; CHECK-NEXT:    [[R:%.*]] = insertelement <4 x float> [[XV1]], float [[X2:%.*]], i32 1
+; CHECK-NEXT:    ret <4 x float> [[R]]
+;
+  %xv1 = insertelement <4 x float> %q, float %x1, i32 0
+  %xv2 = insertelement <4 x float> %q, float %x2, i32 2
+  %r = shufflevector <4 x float> %xv1, <4 x float> %xv2, <4 x i32> <i32 0, i32 6, i32 2, i32 3>
+  ret <4 x float> %r
+}
+
+; Negative test - this only works if the shuffle is choosing exactly 1 element from 1 of the inputs.
+; TODO: But this could be a special-case because we're inserting into the same base vector.
+
+define <4 x float> @insert_insert_shuffle_translate_wrong_mask(float %x1, float %x2, <4 x float> %q) {
+; CHECK-LABEL: @insert_insert_shuffle_translate_wrong_mask(
+; CHECK-NEXT:    [[XV1:%.*]] = insertelement <4 x float> [[Q:%.*]], float [[X1:%.*]], i32 0
+; CHECK-NEXT:    [[XV2:%.*]] = insertelement <4 x float> [[Q]], float [[X2:%.*]], i32 2
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x float> [[XV1]], <4 x float> [[XV2]], <4 x i32> <i32 0, i32 6, i32 2, i32 7>
+; CHECK-NEXT:    ret <4 x float> [[R]]
+;
+  %xv1 = insertelement <4 x float> %q, float %x1, i32 0
+  %xv2 = insertelement <4 x float> %q, float %x2, i32 2
+  %r = shufflevector <4 x float> %xv1, <4 x float> %xv2, <4 x i32> <i32 0, i32 6, i32 2, i32 7>
+  ret <4 x float> %r
+}
+
+; The insert may have other uses.
+
+declare void @use(<4 x float>)
+
+define <4 x float> @insert_not_undef_shuffle_translate_commute_uses(float %x, <4 x float> %y, <4 x float> %q) {
+; CHECK-LABEL: @insert_not_undef_shuffle_translate_commute_uses(
+; CHECK-NEXT:    [[XV:%.*]] = insertelement <4 x float> [[Q:%.*]], float [[X:%.*]], i32 2
+; CHECK-NEXT:    call void @use(<4 x float> [[XV]])
+; CHECK-NEXT:    [[R:%.*]] = insertelement <4 x float> [[Y:%.*]], float [[X]], i32 0
+; CHECK-NEXT:    ret <4 x float> [[R]]
+;
+  %xv = insertelement <4 x float> %q, float %x, i32 2
+  call void @use(<4 x float> %xv)
+  %r = shufflevector <4 x float> %y, <4 x float> %xv, <4 x i32> <i32 6, i32 undef, i32 2, i32 3>
+  ret <4 x float> %r
+}
+
+; Negative test - size-changing shuffle.
+
+define <5 x float> @insert_not_undef_shuffle_translate_commute_lengthen(float %x, <4 x float> %y, <4 x float> %q) {
+; CHECK-LABEL: @insert_not_undef_shuffle_translate_commute_lengthen(
+; CHECK-NEXT:    [[XV:%.*]] = insertelement <4 x float> undef, float [[X:%.*]], i32 2
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x float> [[Y:%.*]], <4 x float> [[XV]], <5 x i32> <i32 0, i32 6, i32 2, i32 undef, i32 undef>
+; CHECK-NEXT:    ret <5 x float> [[R]]
+;
+  %xv = insertelement <4 x float> %q, float %x, i32 2
+  %r = shufflevector <4 x float> %y, <4 x float> %xv, <5 x i32> <i32 0, i32 6, i32 2, i32 undef, i32 undef>
+  ret <5 x float> %r
+}
+
+define <4 x float> @insert_nonzero_index_splat(float %x) {
+; CHECK-LABEL: @insert_nonzero_index_splat(
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x float> undef, float [[X:%.*]], i32 0
+; CHECK-NEXT:    [[SPLAT:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> undef, <4 x i32> <i32 undef, i32 0, i32 0, i32 undef>
+; CHECK-NEXT:    ret <4 x float> [[SPLAT]]
+;
+  %xv = insertelement <4 x float> poison, float %x, i32 2
+  %splat = shufflevector <4 x float> %xv, <4 x float> undef, <4 x i32> <i32 undef, i32 2, i32 2, i32 undef>
+  ret <4 x float> %splat
+}
+
+define <3 x double> @insert_nonzero_index_splat_narrow(double %x) {
+; CHECK-LABEL: @insert_nonzero_index_splat_narrow(
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <3 x double> undef, double [[X:%.*]], i32 0
+; CHECK-NEXT:    [[SPLAT:%.*]] = shufflevector <3 x double> [[TMP1]], <3 x double> undef, <3 x i32> <i32 0, i32 undef, i32 0>
+; CHECK-NEXT:    ret <3 x double> [[SPLAT]]
+;
+  %xv = insertelement <4 x double> poison, double %x, i32 3
+  %splat = shufflevector <4 x double> %xv, <4 x double> undef, <3 x i32> <i32 3, i32 undef, i32 3>
+  ret <3 x double> %splat
+}
+
+define <5 x i7> @insert_nonzero_index_splat_widen(i7 %x) {
+; CHECK-LABEL: @insert_nonzero_index_splat_widen(
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <5 x i7> undef, i7 [[X:%.*]], i32 0
+; CHECK-NEXT:    [[SPLAT:%.*]] = shufflevector <5 x i7> [[TMP1]], <5 x i7> undef, <5 x i32> <i32 undef, i32 0, i32 0, i32 undef, i32 0>
+; CHECK-NEXT:    ret <5 x i7> [[SPLAT]]
+;
+  %xv = insertelement <4 x i7> poison, i7 %x, i32 1
+  %splat = shufflevector <4 x i7> %xv, <4 x i7> undef, <5 x i32> <i32 undef, i32 1, i32 1, i32 undef, i32 1>
+  ret <5 x i7> %splat
+}
+
+; Negative test - don't increase instruction count
+
+define <4 x float> @insert_nonzero_index_splat_extra_use(float %x) {
+; CHECK-LABEL: @insert_nonzero_index_splat_extra_use(
+; CHECK-NEXT:    [[XV:%.*]] = insertelement <4 x float> poison, float [[X:%.*]], i32 2
+; CHECK-NEXT:    call void @use(<4 x float> [[XV]])
+; CHECK-NEXT:    [[SPLAT:%.*]] = shufflevector <4 x float> [[XV]], <4 x float> undef, <4 x i32> <i32 undef, i32 2, i32 2, i32 undef>
+; CHECK-NEXT:    ret <4 x float> [[SPLAT]]
+;
+  %xv = insertelement <4 x float> poison, float %x, i32 2
+  call void @use(<4 x float> %xv)
+  %splat = shufflevector <4 x float> %xv, <4 x float> undef, <4 x i32> <i32 undef, i32 2, i32 2, i32 undef>
+  ret <4 x float> %splat
+}
+
+; Negative test - non-undef base vector
+
+define <4 x float> @insert_nonzero_index_splat_wrong_base(float %x, <4 x float> %y) {
+; CHECK-LABEL: @insert_nonzero_index_splat_wrong_base(
+; CHECK-NEXT:    [[XV:%.*]] = insertelement <4 x float> [[Y:%.*]], float [[X:%.*]], i32 2
+; CHECK-NEXT:    [[SPLAT:%.*]] = shufflevector <4 x float> [[XV]], <4 x float> undef, <4 x i32> <i32 undef, i32 2, i32 3, i32 undef>
+; CHECK-NEXT:    ret <4 x float> [[SPLAT]]
+;
+  %xv = insertelement <4 x float> %y, float %x, i32 2
+  %splat = shufflevector <4 x float> %xv, <4 x float> undef, <4 x i32> <i32 undef, i32 2, i32 3, i32 undef>
+  ret <4 x float> %splat
+}
+
+; Negative test - non-constant insert index
+
+define <4 x float> @insert_nonzero_index_splat_wrong_index(float %x, i32 %index) {
+; CHECK-LABEL: @insert_nonzero_index_splat_wrong_index(
+; CHECK-NEXT:    [[XV:%.*]] = insertelement <4 x float> poison, float [[X:%.*]], i32 [[INDEX:%.*]]
+; CHECK-NEXT:    [[SPLAT:%.*]] = shufflevector <4 x float> [[XV]], <4 x float> undef, <4 x i32> <i32 undef, i32 1, i32 1, i32 undef>
+; CHECK-NEXT:    ret <4 x float> [[SPLAT]]
+;
+  %xv = insertelement <4 x float> poison, float %x, i32 %index
+  %splat = shufflevector <4 x float> %xv, <4 x float> undef, <4 x i32> <i32 undef, i32 1, i32 1, i32 undef>
+  ret <4 x float> %splat
+}
+
+define <4 x float> @insert_in_splat(float %x) {
+; CHECK-LABEL: @insert_in_splat(
+; CHECK-NEXT:    [[XV:%.*]] = insertelement <4 x float> poison, float [[X:%.*]], i32 0
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x float> [[XV]], <4 x float> undef, <4 x i32> <i32 undef, i32 0, i32 0, i32 0>
+; CHECK-NEXT:    ret <4 x float> [[R]]
+;
+  %xv = insertelement <4 x float> poison, float %x, i32 0
+  %splat = shufflevector <4 x float> %xv, <4 x float> undef, <4 x i32> <i32 undef, i32 0, i32 0, i32 undef>
+  %r = insertelement <4 x float> %splat, float %x, i32 3
+  ret <4 x float> %r
+}
+
+define <4 x float> @insert_in_splat_extra_uses(float %x) {
+; CHECK-LABEL: @insert_in_splat_extra_uses(
+; CHECK-NEXT:    [[XV:%.*]] = insertelement <4 x float> poison, float [[X:%.*]], i32 0
+; CHECK-NEXT:    call void @use(<4 x float> [[XV]])
+; CHECK-NEXT:    [[SPLAT:%.*]] = shufflevector <4 x float> [[XV]], <4 x float> undef, <4 x i32> <i32 undef, i32 0, i32 0, i32 undef>
+; CHECK-NEXT:    call void @use(<4 x float> [[SPLAT]])
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x float> [[XV]], <4 x float> undef, <4 x i32> <i32 undef, i32 0, i32 0, i32 0>
+; CHECK-NEXT:    ret <4 x float> [[R]]
+;
+  %xv = insertelement <4 x float> poison, float %x, i32 0
+  call void @use(<4 x float> %xv)
+  %splat = shufflevector <4 x float> %xv, <4 x float> undef, <4 x i32> <i32 undef, i32 0, i32 0, i32 undef>
+  call void @use(<4 x float> %splat)
+  %r = insertelement <4 x float> %splat, float %x, i32 3
+  ret <4 x float> %r
+}
+
+; Negative test - not a constant index insert
+
+define <4 x float> @insert_in_splat_variable_index(float %x, i32 %y) {
+; CHECK-LABEL: @insert_in_splat_variable_index(
+; CHECK-NEXT:    [[XV:%.*]] = insertelement <4 x float> poison, float [[X:%.*]], i32 0
+; CHECK-NEXT:    [[SPLAT:%.*]] = shufflevector <4 x float> [[XV]], <4 x float> undef, <4 x i32> <i32 undef, i32 0, i32 0, i32 undef>
+; CHECK-NEXT:    [[R:%.*]] = insertelement <4 x float> [[SPLAT]], float [[X]], i32 [[Y:%.*]]
+; CHECK-NEXT:    ret <4 x float> [[R]]
+;
+  %xv = insertelement <4 x float> poison, float %x, i32 0
+  %splat = shufflevector <4 x float> %xv, <4 x float> undef, <4 x i32> <i32 undef, i32 0, i32 0, i32 undef>
+  %r = insertelement <4 x float> %splat, float %x, i32 %y
+  ret <4 x float> %r
+}
+
+; Negative test - not a splat shuffle
+
+define <4 x float> @insert_in_nonsplat(float %x, <4 x float> %y) {
+; CHECK-LABEL: @insert_in_nonsplat(
+; CHECK-NEXT:    [[XV:%.*]] = insertelement <4 x float> poison, float [[X:%.*]], i32 0
+; CHECK-NEXT:    [[SPLAT:%.*]] = shufflevector <4 x float> [[XV]], <4 x float> [[Y:%.*]], <4 x i32> <i32 undef, i32 0, i32 4, i32 undef>
+; CHECK-NEXT:    [[R:%.*]] = insertelement <4 x float> [[SPLAT]], float [[X]], i32 3
+; CHECK-NEXT:    ret <4 x float> [[R]]
+;
+  %xv = insertelement <4 x float> poison, float %x, i32 0
+  %splat = shufflevector <4 x float> %xv, <4 x float> %y, <4 x i32> <i32 undef, i32 0, i32 4, i32 undef>
+  %r = insertelement <4 x float> %splat, float %x, i32 3
+  ret <4 x float> %r
+}
+
+; Negative test - not a splat shuffle
+
+define <4 x float> @insert_in_nonsplat2(float %x, <4 x float> %y) {
+; CHECK-LABEL: @insert_in_nonsplat2(
+; CHECK-NEXT:    [[XV:%.*]] = insertelement <4 x float> [[Y:%.*]], float [[X:%.*]], i32 0
+; CHECK-NEXT:    [[SPLAT:%.*]] = shufflevector <4 x float> [[XV]], <4 x float> undef, <4 x i32> <i32 undef, i32 0, i32 1, i32 undef>
+; CHECK-NEXT:    [[R:%.*]] = insertelement <4 x float> [[SPLAT]], float [[X]], i32 3
+; CHECK-NEXT:    ret <4 x float> [[R]]
+;
+  %xv = insertelement <4 x float> %y, float %x, i32 0
+  %splat = shufflevector <4 x float> %xv, <4 x float> undef, <4 x i32> <i32 undef, i32 0, i32 1, i32 undef>
+  %r = insertelement <4 x float> %splat, float %x, i32 3
+  ret <4 x float> %r
+}
+
+define <4 x i8> @shuf_identity_padding(<2 x i8> %x, i8 %y) {
+; CHECK-LABEL: @shuf_identity_padding(
+; CHECK-NEXT:    [[V1:%.*]] = shufflevector <2 x i8> [[X:%.*]], <2 x i8> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+; CHECK-NEXT:    [[V2:%.*]] = insertelement <4 x i8> [[V1]], i8 [[Y:%.*]], i32 2
+; CHECK-NEXT:    ret <4 x i8> [[V2]]
+;
+  %v0 = shufflevector <2 x i8> %x, <2 x i8> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+  %x1 = extractelement <2 x i8> %x, i32 1
+  %v1 = insertelement <4 x i8> %v0, i8 %x1, i32 1
+  %v2 = insertelement <4 x i8> %v1, i8 %y, i32 2
+  ret <4 x i8> %v2
+}
+
+define <3 x i8> @shuf_identity_extract(<4 x i8> %x, i8 %y) {
+; CHECK-LABEL: @shuf_identity_extract(
+; CHECK-NEXT:    [[V1:%.*]] = shufflevector <4 x i8> [[X:%.*]], <4 x i8> undef, <3 x i32> <i32 0, i32 1, i32 undef>
+; CHECK-NEXT:    [[V2:%.*]] = insertelement <3 x i8> [[V1]], i8 [[Y:%.*]], i32 2
+; CHECK-NEXT:    ret <3 x i8> [[V2]]
+;
+  %v0 = shufflevector <4 x i8> %x, <4 x i8> undef, <3 x i32> <i32 0, i32 undef, i32 undef>
+  %x1 = extractelement <4 x i8> %x, i32 1
+  %v1 = insertelement <3 x i8> %v0, i8 %x1, i32 1
+  %v2 = insertelement <3 x i8> %v1, i8 %y, i32 2
+  ret <3 x i8> %v2
+}
+
+define <4 x float> @shuf_identity_extract_extra_use(<6 x float> %x, float %y) {
+; CHECK-LABEL: @shuf_identity_extract_extra_use(
+; CHECK-NEXT:    [[V0:%.*]] = shufflevector <6 x float> [[X:%.*]], <6 x float> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 3>
+; CHECK-NEXT:    call void @use(<4 x float> [[V0]])
+; CHECK-NEXT:    [[V1:%.*]] = shufflevector <6 x float> [[X]], <6 x float> undef, <4 x i32> <i32 0, i32 undef, i32 2, i32 3>
+; CHECK-NEXT:    [[V2:%.*]] = insertelement <4 x float> [[V1]], float [[Y:%.*]], i32 1
+; CHECK-NEXT:    ret <4 x float> [[V2]]
+;
+  %v0 = shufflevector <6 x float> %x, <6 x float> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 3>
+  call void @use(<4 x float> %v0)
+  %x1 = extractelement <6 x float> %x, i32 2
+  %v1 = insertelement <4 x float> %v0, float %x1, i32 2
+  %v2 = insertelement <4 x float> %v1, float %y, i32 1
+  ret <4 x float> %v2
+}
+
+; Negative test - can't map variable index to shuffle mask.
+
+define <4 x i8> @shuf_identity_padding_variable_index(<2 x i8> %x, i8 %y, i32 %index) {
+; CHECK-LABEL: @shuf_identity_padding_variable_index(
+; CHECK-NEXT:    [[V0:%.*]] = shufflevector <2 x i8> [[X:%.*]], <2 x i8> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+; CHECK-NEXT:    [[X1:%.*]] = extractelement <2 x i8> [[X]], i32 [[INDEX:%.*]]
+; CHECK-NEXT:    [[V1:%.*]] = insertelement <4 x i8> [[V0]], i8 [[X1]], i32 [[INDEX]]
+; CHECK-NEXT:    [[V2:%.*]] = insertelement <4 x i8> [[V1]], i8 [[Y:%.*]], i32 2
+; CHECK-NEXT:    ret <4 x i8> [[V2]]
+;
+  %v0 = shufflevector <2 x i8> %x, <2 x i8> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+  %x1 = extractelement <2 x i8> %x, i32 %index
+  %v1 = insertelement <4 x i8> %v0, i8 %x1, i32 %index
+  %v2 = insertelement <4 x i8> %v1, i8 %y, i32 2
+  ret <4 x i8> %v2
+}
+
+; Negative test - don't create arbitrary shuffle masks.
+
+define <4 x i8> @shuf_identity_padding_wrong_source_vec(<2 x i8> %x, i8 %y, <2 x i8> %other) {
+; CHECK-LABEL: @shuf_identity_padding_wrong_source_vec(
+; CHECK-NEXT:    [[V0:%.*]] = shufflevector <2 x i8> [[X:%.*]], <2 x i8> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[X1:%.*]] = extractelement <2 x i8> [[OTHER:%.*]], i32 1
+; CHECK-NEXT:    [[V1:%.*]] = insertelement <4 x i8> [[V0]], i8 [[X1]], i32 1
+; CHECK-NEXT:    [[V2:%.*]] = insertelement <4 x i8> [[V1]], i8 [[Y:%.*]], i32 2
+; CHECK-NEXT:    ret <4 x i8> [[V2]]
+;
+  %v0 = shufflevector <2 x i8> %x, <2 x i8> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+  %x1 = extractelement <2 x i8> %other, i32 1
+  %v1 = insertelement <4 x i8> %v0, i8 %x1, i32 1
+  %v2 = insertelement <4 x i8> %v1, i8 %y, i32 2
+  ret <4 x i8> %v2
+}
+
+; Negative test - don't create arbitrary shuffle masks.
+
+define <4 x i8> @shuf_identity_padding_wrong_index(<2 x i8> %x, i8 %y) {
+; CHECK-LABEL: @shuf_identity_padding_wrong_index(
+; CHECK-NEXT:    [[V0:%.*]] = shufflevector <2 x i8> [[X:%.*]], <2 x i8> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+; CHECK-NEXT:    [[X1:%.*]] = extractelement <2 x i8> [[X]], i32 1
+; CHECK-NEXT:    [[V1:%.*]] = insertelement <4 x i8> [[V0]], i8 [[X1]], i32 2
+; CHECK-NEXT:    [[V2:%.*]] = insertelement <4 x i8> [[V1]], i8 [[Y:%.*]], i32 3
+; CHECK-NEXT:    ret <4 x i8> [[V2]]
+;
+  %v0 = shufflevector <2 x i8> %x, <2 x i8> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+  %x1 = extractelement <2 x i8> %x, i32 1
+  %v1 = insertelement <4 x i8> %v0, i8 %x1, i32 2
+  %v2 = insertelement <4 x i8> %v1, i8 %y, i32 3
+  ret <4 x i8> %v2
+}
+
+define <4 x float> @insert_undemanded_element_op0(<4 x float> %x, <4 x float> %y) {
+; CHECK-LABEL: @insert_undemanded_element_op0(
+; CHECK-NEXT:    [[INS:%.*]] = insertelement <4 x float> [[X:%.*]], float 4.200000e+01, i32 3
+; CHECK-NEXT:    call void @use(<4 x float> [[INS]])
+; CHECK-NEXT:    [[S:%.*]] = shufflevector <4 x float> [[X]], <4 x float> [[Y:%.*]], <4 x i32> <i32 0, i32 7, i32 1, i32 4>
+; CHECK-NEXT:    ret <4 x float> [[S]]
+;
+  %ins = insertelement <4 x float> %x, float 42.0, i32 3
+  call void @use(<4 x float> %ins)
+  %s = shufflevector <4 x float> %ins, <4 x float> %y, <4 x i32> <i32 0, i32 7, i32 1, i32 4>
+  ret <4 x float> %s
+}
+
+define <4 x float> @insert_undemanded_element_op1(<4 x float> %x, <4 x float> %y) {
+; CHECK-LABEL: @insert_undemanded_element_op1(
+; CHECK-NEXT:    [[INS:%.*]] = insertelement <4 x float> [[X:%.*]], float 4.200000e+01, i32 3
+; CHECK-NEXT:    call void @use(<4 x float> [[INS]])
+; CHECK-NEXT:    [[S:%.*]] = shufflevector <4 x float> [[Y:%.*]], <4 x float> [[X]], <4 x i32> <i32 3, i32 2, i32 1, i32 4>
+; CHECK-NEXT:    ret <4 x float> [[S]]
+;
+  %ins = insertelement <4 x float> %x, float 42.0, i32 3
+  call void @use(<4 x float> %ins)
+  %s = shufflevector <4 x float> %y, <4 x float> %ins, <4 x i32> <i32 3, i32 2, i32 1, i32 4>
+  ret <4 x float> %s
+}
+
+; Negative test - shuffle chooses the inserted constant.
+
+define <4 x float> @insert_demanded_element_op0(<4 x float> %x, <4 x float> %y) {
+; CHECK-LABEL: @insert_demanded_element_op0(
+; CHECK-NEXT:    [[INS:%.*]] = insertelement <4 x float> [[X:%.*]], float 4.200000e+01, i32 3
+; CHECK-NEXT:    call void @use(<4 x float> [[INS]])
+; CHECK-NEXT:    [[S:%.*]] = shufflevector <4 x float> [[INS]], <4 x float> [[Y:%.*]], <4 x i32> <i32 3, i32 2, i32 1, i32 4>
+; CHECK-NEXT:    ret <4 x float> [[S]]
+;
+  %ins = insertelement <4 x float> %x, float 42.0, i32 3
+  call void @use(<4 x float> %ins)
+  %s = shufflevector <4 x float> %ins, <4 x float> %y, <4 x i32> <i32 3, i32 2, i32 1, i32 4>
+  ret <4 x float> %s
+}
+
+; Negative test - shuffle chooses the inserted constant.
+
+define <4 x float> @insert_demanded_element_op1(<4 x float> %x, <4 x float> %y) {
+; CHECK-LABEL: @insert_demanded_element_op1(
+; CHECK-NEXT:    [[INS:%.*]] = insertelement <4 x float> [[X:%.*]], float 4.300000e+01, i32 3
+; CHECK-NEXT:    call void @use(<4 x float> [[INS]])
+; CHECK-NEXT:    [[S:%.*]] = shufflevector <4 x float> [[Y:%.*]], <4 x float> [[INS]], <4 x i32> <i32 0, i32 7, i32 1, i32 4>
+; CHECK-NEXT:    ret <4 x float> [[S]]
+;
+  %ins = insertelement <4 x float> %x, float 43.0, i32 3
+  call void @use(<4 x float> %ins)
+  %s = shufflevector <4 x float> %y, <4 x float> %ins, <4 x i32> <i32 0, i32 7, i32 1, i32 4>
+  ret <4 x float> %s
+}
+
+define <4 x float> @splat_constant(<4 x float> %x) {
+; CHECK-LABEL: @splat_constant(
+; CHECK-NEXT:    [[INS3:%.*]] = insertelement <4 x float> [[X:%.*]], float 3.000000e+00, i32 3
+; CHECK-NEXT:    [[R:%.*]] = fadd <4 x float> [[INS3]], <float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00>
+; CHECK-NEXT:    ret <4 x float> [[R]]
+;
+  %ins3 = insertelement <4 x float> %x, float 3.0, i32 3
+  %splat3 = shufflevector <4 x float> %ins3, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+  %r = fadd <4 x float> %ins3, %splat3
+  ret <4 x float> %r
+}
--- a/llvm/test/Transforms/InstCombine/masked_intrinsics-inseltpoison.ll
+++ b/llvm/test/Transforms/InstCombine/masked_intrinsics-inseltpoison.ll
@ -0,0 +1,271 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -instcombine -S < %s | FileCheck %s
+
+declare <2 x double> @llvm.masked.load.v2f64.p0v2f64(<2 x double>* %ptrs, i32, <2 x i1> %mask, <2 x double> %src0)
+declare void @llvm.masked.store.v2f64.p0v2f64(<2 x double> %val, <2 x double>* %ptrs, i32, <2 x i1> %mask)
+declare <2 x double> @llvm.masked.gather.v2f64.v2p0f64(<2 x double*> %ptrs, i32, <2 x i1> %mask, <2 x double> %passthru)
+declare <4 x double> @llvm.masked.gather.v4f64.v4p0f64(<4 x double*> %ptrs, i32, <4 x i1> %mask, <4 x double> %passthru)
+declare void @llvm.masked.scatter.v2f64.v2p0f64(<2 x double> %val, <2 x double*> %ptrs, i32, <2 x i1> %mask)
+
+define <2 x double> @load_zeromask(<2 x double>* %ptr, <2 x double> %passthru)  {
+; CHECK-LABEL: @load_zeromask(
+; CHECK-NEXT:    ret <2 x double> [[PASSTHRU:%.*]]
+;
+  %res = call <2 x double> @llvm.masked.load.v2f64.p0v2f64(<2 x double>* %ptr, i32 1, <2 x i1> zeroinitializer, <2 x double> %passthru)
+  ret <2 x double> %res
+}
+
+define <2 x double> @load_onemask(<2 x double>* %ptr, <2 x double> %passthru)  {
+; CHECK-LABEL: @load_onemask(
+; CHECK-NEXT:    [[UNMASKEDLOAD:%.*]] = load <2 x double>, <2 x double>* [[PTR:%.*]], align 2
+; CHECK-NEXT:    ret <2 x double> [[UNMASKEDLOAD]]
+;
+  %res = call <2 x double> @llvm.masked.load.v2f64.p0v2f64(<2 x double>* %ptr, i32 2, <2 x i1> <i1 1, i1 1>, <2 x double> %passthru)
+  ret <2 x double> %res
+}
+
+define <2 x double> @load_undefmask(<2 x double>* %ptr, <2 x double> %passthru)  {
+; CHECK-LABEL: @load_undefmask(
+; CHECK-NEXT:    [[UNMASKEDLOAD:%.*]] = load <2 x double>, <2 x double>* [[PTR:%.*]], align 2
+; CHECK-NEXT:    ret <2 x double> [[UNMASKEDLOAD]]
+;
+  %res = call <2 x double> @llvm.masked.load.v2f64.p0v2f64(<2 x double>* %ptr, i32 2, <2 x i1> <i1 1, i1 undef>, <2 x double> %passthru)
+  ret <2 x double> %res
+}
+
+@G = external global i8
+
+define <2 x double> @load_cemask(<2 x double>* %ptr, <2 x double> %passthru)  {
+; CHECK-LABEL: @load_cemask(
+; CHECK-NEXT:    [[RES:%.*]] = call <2 x double> @llvm.masked.load.v2f64.p0v2f64(<2 x double>* [[PTR:%.*]], i32 2, <2 x i1> <i1 true, i1 ptrtoint (i8* @G to i1)>, <2 x double> [[PASSTHRU:%.*]])
+; CHECK-NEXT:    ret <2 x double> [[RES]]
+;
+  %res = call <2 x double> @llvm.masked.load.v2f64.p0v2f64(<2 x double>* %ptr, i32 2, <2 x i1> <i1 1, i1 ptrtoint (i8* @G to i1)>, <2 x double> %passthru)
+  ret <2 x double> %res
+}
+
+define <2 x double> @load_lane0(<2 x double>* %ptr, double %pt)  {
+; CHECK-LABEL: @load_lane0(
+; CHECK-NEXT:    [[PTV2:%.*]] = insertelement <2 x double> undef, double [[PT:%.*]], i64 1
+; CHECK-NEXT:    [[RES:%.*]] = call <2 x double> @llvm.masked.load.v2f64.p0v2f64(<2 x double>* [[PTR:%.*]], i32 2, <2 x i1> <i1 true, i1 false>, <2 x double> [[PTV2]])
+; CHECK-NEXT:    ret <2 x double> [[RES]]
+;
+  %ptv1 = insertelement <2 x double> poison, double %pt, i64 0
+  %ptv2 = insertelement <2 x double> %ptv1, double %pt, i64 1
+  %res = call <2 x double> @llvm.masked.load.v2f64.p0v2f64(<2 x double>* %ptr, i32 2, <2 x i1> <i1 true, i1 false>, <2 x double> %ptv2)
+  ret <2 x double> %res
+}
+
+define double @load_all(double* %base, double %pt)  {
+; CHECK-LABEL: @load_all(
+; CHECK-NEXT:    [[PTRS:%.*]] = getelementptr double, double* [[BASE:%.*]], <4 x i64> <i64 0, i64 undef, i64 2, i64 3>
+; CHECK-NEXT:    [[RES:%.*]] = call <4 x double> @llvm.masked.gather.v4f64.v4p0f64(<4 x double*> [[PTRS]], i32 4, <4 x i1> <i1 true, i1 false, i1 true, i1 true>, <4 x double> undef)
+; CHECK-NEXT:    [[ELT:%.*]] = extractelement <4 x double> [[RES]], i64 2
+; CHECK-NEXT:    ret double [[ELT]]
+;
+  %ptrs = getelementptr double, double* %base, <4 x i64> <i64 0, i64 1, i64 2, i64 3>
+  %res = call <4 x double> @llvm.masked.gather.v4f64.v4p0f64(<4 x double*> %ptrs, i32 4, <4 x i1> <i1 true, i1 false, i1 true, i1 true>, <4 x double> undef)
+  %elt = extractelement <4 x double> %res, i64 2
+  ret double %elt
+}
+
+define <2 x double> @load_generic(<2 x double>* %ptr, double %pt, <2 x i1> %mask)  {
+; CHECK-LABEL: @load_generic(
+; CHECK-NEXT:    [[PTV1:%.*]] = insertelement <2 x double> poison, double [[PT:%.*]], i64 0
+; CHECK-NEXT:    [[PTV2:%.*]] = shufflevector <2 x double> [[PTV1]], <2 x double> undef, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[RES:%.*]] = call <2 x double> @llvm.masked.load.v2f64.p0v2f64(<2 x double>* [[PTR:%.*]], i32 4, <2 x i1> [[MASK:%.*]], <2 x double> [[PTV2]])
+; CHECK-NEXT:    ret <2 x double> [[RES]]
+;
+  %ptv1 = insertelement <2 x double> poison, double %pt, i64 0
+  %ptv2 = insertelement <2 x double> %ptv1, double %pt, i64 1
+  %res = call <2 x double> @llvm.masked.load.v2f64.p0v2f64(<2 x double>* %ptr, i32 4, <2 x i1> %mask, <2 x double> %ptv2)
+  ret <2 x double> %res
+}
+
+define <2 x double> @load_speculative(<2 x double>* dereferenceable(16) align 4 %ptr, double %pt, <2 x i1> %mask)  {
+; CHECK-LABEL: @load_speculative(
+; CHECK-NEXT:    [[PTV1:%.*]] = insertelement <2 x double> poison, double [[PT:%.*]], i64 0
+; CHECK-NEXT:    [[PTV2:%.*]] = shufflevector <2 x double> [[PTV1]], <2 x double> undef, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[UNMASKEDLOAD:%.*]] = load <2 x double>, <2 x double>* [[PTR:%.*]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = select <2 x i1> [[MASK:%.*]], <2 x double> [[UNMASKEDLOAD]], <2 x double> [[PTV2]]
+; CHECK-NEXT:    ret <2 x double> [[TMP1]]
+;
+  %ptv1 = insertelement <2 x double> poison, double %pt, i64 0
+  %ptv2 = insertelement <2 x double> %ptv1, double %pt, i64 1
+  %res = call <2 x double> @llvm.masked.load.v2f64.p0v2f64(<2 x double>* %ptr, i32 4, <2 x i1> %mask, <2 x double> %ptv2)
+  ret <2 x double> %res
+}
+
+define <2 x double> @load_speculative_less_aligned(<2 x double>* dereferenceable(16) %ptr, double %pt, <2 x i1> %mask)  {
+; CHECK-LABEL: @load_speculative_less_aligned(
+; CHECK-NEXT:    [[PTV1:%.*]] = insertelement <2 x double> poison, double [[PT:%.*]], i64 0
+; CHECK-NEXT:    [[PTV2:%.*]] = shufflevector <2 x double> [[PTV1]], <2 x double> undef, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[UNMASKEDLOAD:%.*]] = load <2 x double>, <2 x double>* [[PTR:%.*]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = select <2 x i1> [[MASK:%.*]], <2 x double> [[UNMASKEDLOAD]], <2 x double> [[PTV2]]
+; CHECK-NEXT:    ret <2 x double> [[TMP1]]
+;
+  %ptv1 = insertelement <2 x double> poison, double %pt, i64 0
+  %ptv2 = insertelement <2 x double> %ptv1, double %pt, i64 1
+  %res = call <2 x double> @llvm.masked.load.v2f64.p0v2f64(<2 x double>* %ptr, i32 4, <2 x i1> %mask, <2 x double> %ptv2)
+  ret <2 x double> %res
+}
+
+; Can't speculate since only half of required size is known deref
+
+define <2 x double> @load_spec_neg_size(<2 x double>* dereferenceable(8) %ptr, double %pt, <2 x i1> %mask)  {
+; CHECK-LABEL: @load_spec_neg_size(
+; CHECK-NEXT:    [[PTV1:%.*]] = insertelement <2 x double> poison, double [[PT:%.*]], i64 0
+; CHECK-NEXT:    [[PTV2:%.*]] = shufflevector <2 x double> [[PTV1]], <2 x double> undef, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[RES:%.*]] = call <2 x double> @llvm.masked.load.v2f64.p0v2f64(<2 x double>* nonnull [[PTR:%.*]], i32 4, <2 x i1> [[MASK:%.*]], <2 x double> [[PTV2]])
+; CHECK-NEXT:    ret <2 x double> [[RES]]
+;
+  %ptv1 = insertelement <2 x double> poison, double %pt, i64 0
+  %ptv2 = insertelement <2 x double> %ptv1, double %pt, i64 1
+  %res = call <2 x double> @llvm.masked.load.v2f64.p0v2f64(<2 x double>* %ptr, i32 4, <2 x i1> %mask, <2 x double> %ptv2)
+  ret <2 x double> %res
+}
+
+; Can only speculate one lane (but it's the only one active)
+define <2 x double> @load_spec_lan0(<2 x double>* dereferenceable(8) %ptr, double %pt, <2 x i1> %mask)  {
+; CHECK-LABEL: @load_spec_lan0(
+; CHECK-NEXT:    [[PTV1:%.*]] = insertelement <2 x double> poison, double [[PT:%.*]], i64 0
+; CHECK-NEXT:    [[PTV2:%.*]] = shufflevector <2 x double> [[PTV1]], <2 x double> undef, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[MASK2:%.*]] = insertelement <2 x i1> [[MASK:%.*]], i1 false, i64 1
+; CHECK-NEXT:    [[RES:%.*]] = call <2 x double> @llvm.masked.load.v2f64.p0v2f64(<2 x double>* nonnull [[PTR:%.*]], i32 4, <2 x i1> [[MASK2]], <2 x double> [[PTV2]])
+; CHECK-NEXT:    ret <2 x double> [[RES]]
+;
+  %ptv1 = insertelement <2 x double> poison, double %pt, i64 0
+  %ptv2 = insertelement <2 x double> %ptv1, double %pt, i64 1
+  %mask2 = insertelement <2 x i1> %mask, i1 false, i64 1
+  %res = call <2 x double> @llvm.masked.load.v2f64.p0v2f64(<2 x double>* %ptr, i32 4, <2 x i1> %mask2, <2 x double> %ptv2)
+  ret <2 x double> %res
+}
+
+define void @store_zeromask(<2 x double>* %ptr, <2 x double> %val)  {
+; CHECK-LABEL: @store_zeromask(
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.masked.store.v2f64.p0v2f64(<2 x double> %val, <2 x double>* %ptr, i32 4, <2 x i1> zeroinitializer)
+  ret void
+}
+
+define void @store_onemask(<2 x double>* %ptr, <2 x double> %val)  {
+; CHECK-LABEL: @store_onemask(
+; CHECK-NEXT:    store <2 x double> [[VAL:%.*]], <2 x double>* [[PTR:%.*]], align 4
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.masked.store.v2f64.p0v2f64(<2 x double> %val, <2 x double>* %ptr, i32 4, <2 x i1> <i1 1, i1 1>)
+  ret void
+}
+
+define void @store_demandedelts(<2 x double>* %ptr, double %val)  {
+; CHECK-LABEL: @store_demandedelts(
+; CHECK-NEXT:    [[VALVEC1:%.*]] = insertelement <2 x double> poison, double [[VAL:%.*]], i32 0
+; CHECK-NEXT:    call void @llvm.masked.store.v2f64.p0v2f64(<2 x double> [[VALVEC1]], <2 x double>* [[PTR:%.*]], i32 4, <2 x i1> <i1 true, i1 false>)
+; CHECK-NEXT:    ret void
+;
+  %valvec1 = insertelement <2 x double> poison, double %val, i32 0
+  %valvec2 = insertelement <2 x double> %valvec1, double %val, i32 1
+  call void @llvm.masked.store.v2f64.p0v2f64(<2 x double> %valvec2, <2 x double>* %ptr, i32 4, <2 x i1> <i1 true, i1 false>)
+  ret void
+}
+
+define <2 x double> @gather_generic(<2 x double*> %ptrs, <2 x i1> %mask, <2 x double> %passthru)  {
+; CHECK-LABEL: @gather_generic(
+; CHECK-NEXT:    [[RES:%.*]] = call <2 x double> @llvm.masked.gather.v2f64.v2p0f64(<2 x double*> [[PTRS:%.*]], i32 4, <2 x i1> [[MASK:%.*]], <2 x double> [[PASSTHRU:%.*]])
+; CHECK-NEXT:    ret <2 x double> [[RES]]
+;
+  %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0f64(<2 x double*> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %passthru)
+  ret <2 x double> %res
+}
+
+
+define <2 x double> @gather_zeromask(<2 x double*> %ptrs, <2 x double> %passthru)  {
+; CHECK-LABEL: @gather_zeromask(
+; CHECK-NEXT:    ret <2 x double> [[PASSTHRU:%.*]]
+;
+  %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0f64(<2 x double*> %ptrs, i32 4, <2 x i1> zeroinitializer, <2 x double> %passthru)
+  ret <2 x double> %res
+}
+
+
+define <2 x double> @gather_onemask(<2 x double*> %ptrs, <2 x double> %passthru)  {
+; CHECK-LABEL: @gather_onemask(
+; CHECK-NEXT:    [[RES:%.*]] = call <2 x double> @llvm.masked.gather.v2f64.v2p0f64(<2 x double*> [[PTRS:%.*]], i32 4, <2 x i1> <i1 true, i1 true>, <2 x double> undef)
+; CHECK-NEXT:    ret <2 x double> [[RES]]
+;
+  %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0f64(<2 x double*> %ptrs, i32 4, <2 x i1> <i1 true, i1 true>, <2 x double> %passthru)
+  ret <2 x double> %res
+}
+
+define <4 x double> @gather_lane2(double* %base, double %pt)  {
+; CHECK-LABEL: @gather_lane2(
+; CHECK-NEXT:    [[PTRS:%.*]] = getelementptr double, double* [[BASE:%.*]], <4 x i64> <i64 undef, i64 undef, i64 2, i64 undef>
+; CHECK-NEXT:    [[PT_V1:%.*]] = insertelement <4 x double> poison, double [[PT:%.*]], i64 0
+; CHECK-NEXT:    [[PT_V2:%.*]] = shufflevector <4 x double> [[PT_V1]], <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 undef, i32 0>
+; CHECK-NEXT:    [[RES:%.*]] = call <4 x double> @llvm.masked.gather.v4f64.v4p0f64(<4 x double*> [[PTRS]], i32 4, <4 x i1> <i1 false, i1 false, i1 true, i1 false>, <4 x double> [[PT_V2]])
+; CHECK-NEXT:    ret <4 x double> [[RES]]
+;
+  %ptrs = getelementptr double, double *%base, <4 x i64> <i64 0, i64 1, i64 2, i64 3>
+  %pt_v1 = insertelement <4 x double> poison, double %pt, i64 0
+  %pt_v2 = shufflevector <4 x double> %pt_v1, <4 x double> undef, <4 x i32> zeroinitializer
+  %res = call <4 x double> @llvm.masked.gather.v4f64.v4p0f64(<4 x double*> %ptrs, i32 4, <4 x i1> <i1 false, i1 false, i1 true, i1 false>, <4 x double> %pt_v2)
+  ret <4 x double> %res
+}
+
+define <2 x double> @gather_lane0_maybe(double* %base, double %pt, <2 x i1> %mask)  {
+; CHECK-LABEL: @gather_lane0_maybe(
+; CHECK-NEXT:    [[PTRS:%.*]] = getelementptr double, double* [[BASE:%.*]], <2 x i64> <i64 0, i64 1>
+; CHECK-NEXT:    [[PT_V1:%.*]] = insertelement <2 x double> poison, double [[PT:%.*]], i64 0
+; CHECK-NEXT:    [[PT_V2:%.*]] = shufflevector <2 x double> [[PT_V1]], <2 x double> undef, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[MASK2:%.*]] = insertelement <2 x i1> [[MASK:%.*]], i1 false, i64 1
+; CHECK-NEXT:    [[RES:%.*]] = call <2 x double> @llvm.masked.gather.v2f64.v2p0f64(<2 x double*> [[PTRS]], i32 4, <2 x i1> [[MASK2]], <2 x double> [[PT_V2]])
+; CHECK-NEXT:    ret <2 x double> [[RES]]
+;
+  %ptrs = getelementptr double, double *%base, <2 x i64> <i64 0, i64 1>
+  %pt_v1 = insertelement <2 x double> poison, double %pt, i64 0
+  %pt_v2 = insertelement <2 x double> %pt_v1, double %pt, i64 1
+  %mask2 = insertelement <2 x i1> %mask, i1 false, i64 1
+  %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0f64(<2 x double*> %ptrs, i32 4, <2 x i1> %mask2, <2 x double> %pt_v2)
+  ret <2 x double> %res
+}
+
+define <2 x double> @gather_lane0_maybe_spec(double* %base, double %pt, <2 x i1> %mask)  {
+; CHECK-LABEL: @gather_lane0_maybe_spec(
+; CHECK-NEXT:    [[PTRS:%.*]] = getelementptr double, double* [[BASE:%.*]], <2 x i64> <i64 0, i64 1>
+; CHECK-NEXT:    [[PT_V1:%.*]] = insertelement <2 x double> poison, double [[PT:%.*]], i64 0
+; CHECK-NEXT:    [[PT_V2:%.*]] = shufflevector <2 x double> [[PT_V1]], <2 x double> undef, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[MASK2:%.*]] = insertelement <2 x i1> [[MASK:%.*]], i1 false, i64 1
+; CHECK-NEXT:    [[RES:%.*]] = call <2 x double> @llvm.masked.gather.v2f64.v2p0f64(<2 x double*> [[PTRS]], i32 4, <2 x i1> [[MASK2]], <2 x double> [[PT_V2]])
+; CHECK-NEXT:    ret <2 x double> [[RES]]
+;
+  %ptrs = getelementptr double, double *%base, <2 x i64> <i64 0, i64 1>
+  %pt_v1 = insertelement <2 x double> poison, double %pt, i64 0
+  %pt_v2 = insertelement <2 x double> %pt_v1, double %pt, i64 1
+  %mask2 = insertelement <2 x i1> %mask, i1 false, i64 1
+  %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0f64(<2 x double*> %ptrs, i32 4, <2 x i1> %mask2, <2 x double> %pt_v2)
+  ret <2 x double> %res
+}
+
+
+define void @scatter_zeromask(<2 x double*> %ptrs, <2 x double> %val)  {
+; CHECK-LABEL: @scatter_zeromask(
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.masked.scatter.v2f64.v2p0f64(<2 x double> %val, <2 x double*> %ptrs, i32 8, <2 x i1> zeroinitializer)
+  ret void
+}
+
+define void @scatter_demandedelts(double* %ptr, double %val)  {
+; CHECK-LABEL: @scatter_demandedelts(
+; CHECK-NEXT:    [[PTRS:%.*]] = getelementptr double, double* [[PTR:%.*]], <2 x i64> <i64 0, i64 undef>
+; CHECK-NEXT:    [[VALVEC1:%.*]] = insertelement <2 x double> poison, double [[VAL:%.*]], i32 0
+; CHECK-NEXT:    call void @llvm.masked.scatter.v2f64.v2p0f64(<2 x double> [[VALVEC1]], <2 x double*> [[PTRS]], i32 8, <2 x i1> <i1 true, i1 false>)
+; CHECK-NEXT:    ret void
+;
+  %ptrs = getelementptr double, double* %ptr, <2 x i64> <i64 0, i64 1>
+  %valvec1 = insertelement <2 x double> poison, double %val, i32 0
+  %valvec2 = insertelement <2 x double> %valvec1, double %val, i32 1
+  call void @llvm.masked.scatter.v2f64.v2p0f64(<2 x double> %valvec2, <2 x double*> %ptrs, i32 8, <2 x i1> <i1 true, i1 false>)
+  ret void
+}
--- a/llvm/test/Transforms/InstCombine/pr38984-inseltpoison.ll
+++ b/llvm/test/Transforms/InstCombine/pr38984-inseltpoison.ll
@ -0,0 +1,41 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -instcombine -S | FileCheck %s
+target datalayout = "p:16:16"
+
+@a = external global [21 x i16], align 1
+@offsets = external global [4 x i16], align 1
+
+; The "same gep" optimization should work with vector icmp.
+define <4 x i1> @PR38984_1() {
+; CHECK-LABEL: @PR38984_1(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    ret <4 x i1> <i1 true, i1 true, i1 true, i1 true>
+;
+entry:
+  %0 = load i16, i16* getelementptr ([4 x i16], [4 x i16]* @offsets, i16 0, i16 undef), align 1
+  %1 = insertelement <4 x i16> poison, i16 %0, i32 3
+  %2 = getelementptr i32, i32* null, <4 x i16> %1
+  %3 = getelementptr i32, i32* null, <4 x i16> %1
+  %4 = icmp eq <4 x i32*> %2, %3
+  ret <4 x i1> %4
+}
+
+; The "compare base pointers" optimization should not kick in for vector icmp.
+define <4 x i1> @PR38984_2() {
+; CHECK-LABEL: @PR38984_2(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i16, i16* getelementptr ([4 x i16], [4 x i16]* @offsets, i16 0, i16 undef), align 2
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x i16> poison, i16 [[TMP0]], i32 3
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i16, i16* getelementptr inbounds ([21 x i16], [21 x i16]* @a, i16 1, i16 0), <4 x i16> [[TMP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i16, i16* null, <4 x i16> [[TMP1]]
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq <4 x i16*> [[TMP2]], [[TMP3]]
+; CHECK-NEXT:    ret <4 x i1> [[TMP4]]
+;
+entry:
+  %0 = load i16, i16* getelementptr ([4 x i16], [4 x i16]* @offsets, i16 0, i16 undef)
+  %1 = insertelement <4 x i16> poison, i16 %0, i32 3
+  %2 = getelementptr i16, i16* getelementptr ([21 x i16], [21 x i16]* @a, i64 1, i32 0), <4 x i16> %1
+  %3 = getelementptr i16, i16* null, <4 x i16> %1
+  %4 = icmp eq <4 x i16*> %2, %3
+  ret <4 x i1> %4
+}
--- a/llvm/test/Transforms/InstCombine/scalarization-inseltpoison.ll
+++ b/llvm/test/Transforms/InstCombine/scalarization-inseltpoison.ll
@ -0,0 +1,335 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -instcombine -S < %s | FileCheck %s
+
+define i32 @extract_load(<4 x i32>* %p) {
+; CHECK-LABEL: @extract_load(
+; CHECK-NEXT:    [[X:%.*]] = load <4 x i32>, <4 x i32>* [[P:%.*]], align 4
+; CHECK-NEXT:    [[EXT:%.*]] = extractelement <4 x i32> [[X]], i32 1
+; CHECK-NEXT:    ret i32 [[EXT]]
+;
+  %x = load <4 x i32>, <4 x i32>* %p, align 4
+  %ext = extractelement <4 x i32> %x, i32 1
+  ret i32 %ext
+}
+
+define double @extract_load_fp(<4 x double>* %p) {
+; CHECK-LABEL: @extract_load_fp(
+; CHECK-NEXT:    [[X:%.*]] = load <4 x double>, <4 x double>* [[P:%.*]], align 32
+; CHECK-NEXT:    [[EXT:%.*]] = extractelement <4 x double> [[X]], i32 3
+; CHECK-NEXT:    ret double [[EXT]]
+;
+  %x = load <4 x double>, <4 x double>* %p, align 32
+  %ext = extractelement <4 x double> %x, i32 3
+  ret double %ext
+}
+
+define double @extract_load_volatile(<4 x double>* %p) {
+; CHECK-LABEL: @extract_load_volatile(
+; CHECK-NEXT:    [[X:%.*]] = load volatile <4 x double>, <4 x double>* [[P:%.*]], align 32
+; CHECK-NEXT:    [[EXT:%.*]] = extractelement <4 x double> [[X]], i32 2
+; CHECK-NEXT:    ret double [[EXT]]
+;
+  %x = load volatile <4 x double>, <4 x double>* %p
+  %ext = extractelement <4 x double> %x, i32 2
+  ret double %ext
+}
+
+define double @extract_load_extra_use(<4 x double>* %p, <4 x double>* %p2) {
+; CHECK-LABEL: @extract_load_extra_use(
+; CHECK-NEXT:    [[X:%.*]] = load <4 x double>, <4 x double>* [[P:%.*]], align 8
+; CHECK-NEXT:    [[EXT:%.*]] = extractelement <4 x double> [[X]], i32 0
+; CHECK-NEXT:    store <4 x double> [[X]], <4 x double>* [[P2:%.*]], align 32
+; CHECK-NEXT:    ret double [[EXT]]
+;
+  %x = load <4 x double>, <4 x double>* %p, align 8
+  %ext = extractelement <4 x double> %x, i32 0
+  store <4 x double> %x, <4 x double>* %p2
+  ret double %ext
+}
+
+define double @extract_load_variable_index(<4 x double>* %p, i32 %y) {
+; CHECK-LABEL: @extract_load_variable_index(
+; CHECK-NEXT:    [[X:%.*]] = load <4 x double>, <4 x double>* [[P:%.*]], align 32
+; CHECK-NEXT:    [[EXT:%.*]] = extractelement <4 x double> [[X]], i32 [[Y:%.*]]
+; CHECK-NEXT:    ret double [[EXT]]
+;
+  %x = load <4 x double>, <4 x double>* %p
+  %ext = extractelement <4 x double> %x, i32 %y
+  ret double %ext
+}
+
+define void @scalarize_phi(i32 * %n, float * %inout) {
+; CHECK-LABEL: @scalarize_phi(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[T0:%.*]] = load volatile float, float* [[INOUT:%.*]], align 4
+; CHECK-NEXT:    br label [[FOR_COND:%.*]]
+; CHECK:       for.cond:
+; CHECK-NEXT:    [[TMP0:%.*]] = phi float [ [[T0]], [[ENTRY:%.*]] ], [ [[TMP1:%.*]], [[FOR_BODY:%.*]] ]
+; CHECK-NEXT:    [[I_0:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[T1:%.*]] = load i32, i32* [[N:%.*]], align 4
+; CHECK-NEXT:    [[CMP_NOT:%.*]] = icmp eq i32 [[I_0]], [[T1]]
+; CHECK-NEXT:    br i1 [[CMP_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]]
+; CHECK:       for.body:
+; CHECK-NEXT:    store volatile float [[TMP0]], float* [[INOUT]], align 4
+; CHECK-NEXT:    [[TMP1]] = fmul float [[TMP0]], 0x4002A3D700000000
+; CHECK-NEXT:    [[INC]] = add nuw nsw i32 [[I_0]], 1
+; CHECK-NEXT:    br label [[FOR_COND]]
+; CHECK:       for.end:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %t0 = load volatile float, float * %inout, align 4
+  %insert = insertelement <4 x float> poison, float %t0, i32 0
+  %splat = shufflevector <4 x float> %insert, <4 x float> undef, <4 x i32> zeroinitializer
+  %insert1 = insertelement <4 x float> poison, float 3.0, i32 0
+  br label %for.cond
+
+for.cond:
+  %x.0 = phi <4 x float> [ %splat, %entry ], [ %mul, %for.body ]
+  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+  %t1 = load i32, i32 * %n, align 4
+  %cmp = icmp ne i32 %i.0, %t1
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:
+  %t2 = extractelement <4 x float> %x.0, i32 1
+  store volatile float %t2, float * %inout, align 4
+  %mul = fmul <4 x float> %x.0, <float 0x4002A3D700000000, float 0x4002A3D700000000, float 0x4002A3D700000000, float 0x4002A3D700000000>
+  %inc = add nsw i32 %i.0, 1
+  br label %for.cond
+
+for.end:
+  ret void
+}
+
+define float @extract_element_binop_splat_constant_index(<4 x float> %x) {
+; CHECK-LABEL: @extract_element_binop_splat_constant_index(
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x float> [[X:%.*]], i32 2
+; CHECK-NEXT:    [[R:%.*]] = fadd float [[TMP1]], 0x4002A3D700000000
+; CHECK-NEXT:    ret float [[R]]
+;
+  %b = fadd <4 x float> %x, <float 0x4002A3D700000000, float 0x4002A3D700000000, float 0x4002A3D700000000, float 0x4002A3D700000000>
+  %r = extractelement <4 x float> %b, i32 2
+  ret float %r
+}
+
+define double @extract_element_binop_splat_with_undef_constant_index(<2 x double> %x) {
+; CHECK-LABEL: @extract_element_binop_splat_with_undef_constant_index(
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x double> [[X:%.*]], i32 0
+; CHECK-NEXT:    [[R:%.*]] = fdiv double 4.200000e+01, [[TMP1]]
+; CHECK-NEXT:    ret double [[R]]
+;
+  %b = fdiv <2 x double> <double 42.0, double undef>, %x
+  %r = extractelement <2 x double> %b, i32 0
+  ret double %r
+}
+
+define float @extract_element_binop_nonsplat_constant_index(<2 x float> %x) {
+; CHECK-LABEL: @extract_element_binop_nonsplat_constant_index(
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x float> [[X:%.*]], i32 1
+; CHECK-NEXT:    [[R:%.*]] = fmul float [[TMP1]], 4.300000e+01
+; CHECK-NEXT:    ret float [[R]]
+;
+  %b = fmul <2 x float> %x, <float 42.0, float 43.0>
+  %r = extractelement <2 x float> %b, i32 1
+  ret float %r
+}
+
+define i8 @extract_element_binop_splat_variable_index(<4 x i8> %x, i32 %y) {
+; CHECK-LABEL: @extract_element_binop_splat_variable_index(
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x i8> [[X:%.*]], i32 [[Y:%.*]]
+; CHECK-NEXT:    [[R:%.*]] = sdiv i8 [[TMP1]], 42
+; CHECK-NEXT:    ret i8 [[R]]
+;
+  %b = sdiv <4 x i8> %x, <i8 42, i8 42, i8 42, i8 42>
+  %r = extractelement <4 x i8> %b, i32 %y
+  ret i8 %r
+}
+
+define i8 @extract_element_binop_splat_with_undef_variable_index(<4 x i8> %x, i32 %y) {
+; CHECK-LABEL: @extract_element_binop_splat_with_undef_variable_index(
+; CHECK-NEXT:    [[B:%.*]] = mul <4 x i8> [[X:%.*]], <i8 42, i8 42, i8 undef, i8 42>
+; CHECK-NEXT:    [[R:%.*]] = extractelement <4 x i8> [[B]], i32 [[Y:%.*]]
+; CHECK-NEXT:    ret i8 [[R]]
+;
+  %b = mul <4 x i8> %x, <i8 42, i8 42, i8 undef, i8 42>
+  %r = extractelement <4 x i8> %b, i32 %y
+  ret i8 %r
+}
+
+define i8 @extract_element_binop_nonsplat_variable_index(<4 x i8> %x, i32 %y) {
+; CHECK-LABEL: @extract_element_binop_nonsplat_variable_index(
+; CHECK-NEXT:    [[B:%.*]] = lshr <4 x i8> [[X:%.*]], <i8 4, i8 3, i8 undef, i8 2>
+; CHECK-NEXT:    [[R:%.*]] = extractelement <4 x i8> [[B]], i32 [[Y:%.*]]
+; CHECK-NEXT:    ret i8 [[R]]
+;
+  %b = lshr <4 x i8> %x, <i8 4, i8 3, i8 undef, i8 2>
+  %r = extractelement <4 x i8> %b, i32 %y
+  ret i8 %r
+}
+
+define float @extract_element_load(<4 x float> %x, <4 x float>* %ptr) {
+; CHECK-LABEL: @extract_element_load(
+; CHECK-NEXT:    [[LOAD:%.*]] = load <4 x float>, <4 x float>* [[PTR:%.*]], align 16
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x float> [[LOAD]], i32 2
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[X:%.*]], i32 2
+; CHECK-NEXT:    [[R:%.*]] = fadd float [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret float [[R]]
+;
+  %load = load <4 x float>, <4 x float>* %ptr
+  %add = fadd <4 x float> %x, %load
+  %r = extractelement <4 x float> %add, i32 2
+  ret float %r
+}
+
+define float @extract_element_multi_Use_load(<4 x float> %x, <4 x float>* %ptr0, <4 x float>* %ptr1) {
+; CHECK-LABEL: @extract_element_multi_Use_load(
+; CHECK-NEXT:    [[LOAD:%.*]] = load <4 x float>, <4 x float>* [[PTR0:%.*]], align 16
+; CHECK-NEXT:    store <4 x float> [[LOAD]], <4 x float>* [[PTR1:%.*]], align 16
+; CHECK-NEXT:    [[ADD:%.*]] = fadd <4 x float> [[LOAD]], [[X:%.*]]
+; CHECK-NEXT:    [[R:%.*]] = extractelement <4 x float> [[ADD]], i32 2
+; CHECK-NEXT:    ret float [[R]]
+;
+  %load = load <4 x float>, <4 x float>* %ptr0
+  store <4 x float> %load, <4 x float>* %ptr1
+  %add = fadd <4 x float> %x, %load
+  %r = extractelement <4 x float> %add, i32 2
+  ret float %r
+}
+
+define float @extract_element_variable_index(<4 x float> %x, i32 %y) {
+; CHECK-LABEL: @extract_element_variable_index(
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x float> [[X:%.*]], i32 [[Y:%.*]]
+; CHECK-NEXT:    [[R:%.*]] = fadd float [[TMP1]], 1.000000e+00
+; CHECK-NEXT:    ret float [[R]]
+;
+  %add = fadd <4 x float> %x, <float 1.0, float 1.0, float 1.0, float 1.0>
+  %r = extractelement <4 x float> %add, i32 %y
+  ret float %r
+}
+
+define float @extelt_binop_insertelt(<4 x float> %A, <4 x float> %B, float %f) {
+; CHECK-LABEL: @extelt_binop_insertelt(
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x float> [[B:%.*]], i32 0
+; CHECK-NEXT:    [[E:%.*]] = fmul nnan float [[TMP1]], [[F:%.*]]
+; CHECK-NEXT:    ret float [[E]]
+;
+  %C = insertelement <4 x float> %A, float %f, i32 0
+  %D = fmul nnan <4 x float> %C, %B
+  %E = extractelement <4 x float> %D, i32 0
+  ret float %E
+}
+
+; We recurse to find a scalarizable operand.
+; FIXME: We should propagate the IR flags including wrapping flags.
+
+define i32 @extelt_binop_binop_insertelt(<4 x i32> %A, <4 x i32> %B, i32 %f) {
+; CHECK-LABEL: @extelt_binop_binop_insertelt(
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x i32> [[B:%.*]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = add i32 [[TMP1]], [[F:%.*]]
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x i32> [[B]], i32 0
+; CHECK-NEXT:    [[E:%.*]] = mul i32 [[TMP2]], [[TMP3]]
+; CHECK-NEXT:    ret i32 [[E]]
+;
+  %v = insertelement <4 x i32> %A, i32 %f, i32 0
+  %C = add <4 x i32> %v, %B
+  %D = mul nsw <4 x i32> %C, %B
+  %E = extractelement <4 x i32> %D, i32 0
+  ret i32 %E
+}
+
+define float @extract_element_constant_vector_variable_index(i32 %y) {
+; CHECK-LABEL: @extract_element_constant_vector_variable_index(
+; CHECK-NEXT:    [[R:%.*]] = extractelement <4 x float> <float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00>, i32 [[Y:%.*]]
+; CHECK-NEXT:    ret float [[R]]
+;
+  %r = extractelement <4 x float> <float 1.0, float 2.0, float 3.0, float 4.0>, i32 %y
+  ret float %r
+}
+
+define i1 @cheap_to_extract_icmp(<4 x i32> %x, <4 x i1> %y) {
+; CHECK-LABEL: @cheap_to_extract_icmp(
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x i32> [[X:%.*]], i32 2
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i32 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x i1> [[Y:%.*]], i32 2
+; CHECK-NEXT:    [[R:%.*]] = and i1 [[TMP2]], [[TMP3]]
+; CHECK-NEXT:    ret i1 [[R]]
+;
+  %cmp = icmp eq <4 x i32> %x, zeroinitializer
+  %and = and <4 x i1> %cmp, %y
+  %r = extractelement <4 x i1> %and, i32 2
+  ret i1 %r
+}
+
+define i1 @cheap_to_extract_fcmp(<4 x float> %x, <4 x i1> %y) {
+; CHECK-LABEL: @cheap_to_extract_fcmp(
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x float> [[X:%.*]], i32 2
+; CHECK-NEXT:    [[TMP2:%.*]] = fcmp oeq float [[TMP1]], 0.000000e+00
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x i1> [[Y:%.*]], i32 2
+; CHECK-NEXT:    [[R:%.*]] = and i1 [[TMP2]], [[TMP3]]
+; CHECK-NEXT:    ret i1 [[R]]
+;
+  %cmp = fcmp oeq <4 x float> %x, zeroinitializer
+  %and = and <4 x i1> %cmp, %y
+  %r = extractelement <4 x i1> %and, i32 2
+  ret i1 %r
+}
+
+define i1 @extractelt_vector_icmp_constrhs(<2 x i32> %arg) {
+; CHECK-LABEL: @extractelt_vector_icmp_constrhs(
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i32> [[ARG:%.*]], i32 0
+; CHECK-NEXT:    [[EXT:%.*]] = icmp eq i32 [[TMP1]], 0
+; CHECK-NEXT:    ret i1 [[EXT]]
+;
+  %cmp = icmp eq <2 x i32> %arg, zeroinitializer
+  %ext = extractelement <2 x i1> %cmp, i32 0
+  ret i1 %ext
+}
+
+define i1 @extractelt_vector_fcmp_constrhs(<2 x float> %arg) {
+; CHECK-LABEL: @extractelt_vector_fcmp_constrhs(
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x float> [[ARG:%.*]], i32 0
+; CHECK-NEXT:    [[EXT:%.*]] = fcmp oeq float [[TMP1]], 0.000000e+00
+; CHECK-NEXT:    ret i1 [[EXT]]
+;
+  %cmp = fcmp oeq <2 x float> %arg, zeroinitializer
+  %ext = extractelement <2 x i1> %cmp, i32 0
+  ret i1 %ext
+}
+
+define i1 @extractelt_vector_icmp_constrhs_dynidx(<2 x i32> %arg, i32 %idx) {
+; CHECK-LABEL: @extractelt_vector_icmp_constrhs_dynidx(
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i32> [[ARG:%.*]], i32 [[IDX:%.*]]
+; CHECK-NEXT:    [[EXT:%.*]] = icmp eq i32 [[TMP1]], 0
+; CHECK-NEXT:    ret i1 [[EXT]]
+;
+  %cmp = icmp eq <2 x i32> %arg, zeroinitializer
+  %ext = extractelement <2 x i1> %cmp, i32 %idx
+  ret i1 %ext
+}
+
+define i1 @extractelt_vector_fcmp_constrhs_dynidx(<2 x float> %arg, i32 %idx) {
+; CHECK-LABEL: @extractelt_vector_fcmp_constrhs_dynidx(
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x float> [[ARG:%.*]], i32 [[IDX:%.*]]
+; CHECK-NEXT:    [[EXT:%.*]] = fcmp oeq float [[TMP1]], 0.000000e+00
+; CHECK-NEXT:    ret i1 [[EXT]]
+;
+  %cmp = fcmp oeq <2 x float> %arg, zeroinitializer
+  %ext = extractelement <2 x i1> %cmp, i32 %idx
+  ret i1 %ext
+}
+
+define i1 @extractelt_vector_fcmp_not_cheap_to_scalarize_multi_use(<2 x float> %arg0, <2 x float> %arg1, <2 x float> %arg2, i32 %idx) {
+; CHECK-LABEL: @extractelt_vector_fcmp_not_cheap_to_scalarize_multi_use(
+; CHECK-NEXT:    [[ADD:%.*]] = fadd <2 x float> [[ARG1:%.*]], [[ARG2:%.*]]
+; CHECK-NEXT:    store volatile <2 x float> [[ADD]], <2 x float>* undef, align 8
+; CHECK-NEXT:    [[CMP:%.*]] = fcmp oeq <2 x float> [[ADD]], [[ARG0:%.*]]
+; CHECK-NEXT:    [[EXT:%.*]] = extractelement <2 x i1> [[CMP]], i32 0
+; CHECK-NEXT:    ret i1 [[EXT]]
+;
+  %add = fadd <2 x float> %arg1, %arg2
+  store volatile <2 x float> %add, <2 x float>* undef
+  %cmp = fcmp oeq <2 x float> %arg0, %add
+  %ext = extractelement <2 x i1> %cmp, i32 0
+  ret i1 %ext
+}
--- a/llvm/test/Transforms/InstCombine/select-extractelement-inseltpoison.ll
+++ b/llvm/test/Transforms/InstCombine/select-extractelement-inseltpoison.ll
@ -0,0 +1,213 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -instcombine < %s | FileCheck %s
+
+declare void @v4float_user(<4 x float>) #0
+
+define float @extract_one_select(<4 x float> %a, <4 x float> %b, i32 %c) #0 {
+; CHECK-LABEL: @extract_one_select(
+; CHECK-NEXT:    [[CMP_NOT:%.*]] = icmp eq i32 [[C:%.*]], 0
+; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[CMP_NOT]], <4 x float> [[B:%.*]], <4 x float> [[A:%.*]]
+; CHECK-NEXT:    [[EXTRACT:%.*]] = extractelement <4 x float> [[SEL]], i32 2
+; CHECK-NEXT:    ret float [[EXTRACT]]
+;
+  %cmp = icmp ne i32 %c, 0
+  %sel = select i1 %cmp, <4 x float> %a, <4 x float> %b
+  %extract = extractelement <4 x float> %sel, i32 2
+  ret float %extract
+}
+
+; Multiple extractelements
+define <2 x float> @extract_two_select(<4 x float> %a, <4 x float> %b, i32 %c) #0 {
+; CHECK-LABEL: @extract_two_select(
+; CHECK-NEXT:    [[CMP_NOT:%.*]] = icmp eq i32 [[C:%.*]], 0
+; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[CMP_NOT]], <4 x float> [[B:%.*]], <4 x float> [[A:%.*]]
+; CHECK-NEXT:    [[BUILD2:%.*]] = shufflevector <4 x float> [[SEL]], <4 x float> undef, <2 x i32> <i32 1, i32 2>
+; CHECK-NEXT:    ret <2 x float> [[BUILD2]]
+;
+  %cmp = icmp ne i32 %c, 0
+  %sel = select i1 %cmp, <4 x float> %a, <4 x float> %b
+  %extract1 = extractelement <4 x float> %sel, i32 1
+  %extract2 = extractelement <4 x float> %sel, i32 2
+  %build1 = insertelement <2 x float> poison, float %extract1, i32 0
+  %build2 = insertelement <2 x float> %build1, float %extract2, i32 1
+  ret <2 x float> %build2
+}
+
+; Select has an extra non-extractelement user, don't change it
+define float @extract_one_select_user(<4 x float> %a, <4 x float> %b, i32 %c) #0 {
+; CHECK-LABEL: @extract_one_select_user(
+; CHECK-NEXT:    [[CMP_NOT:%.*]] = icmp eq i32 [[C:%.*]], 0
+; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[CMP_NOT]], <4 x float> [[B:%.*]], <4 x float> [[A:%.*]]
+; CHECK-NEXT:    [[EXTRACT:%.*]] = extractelement <4 x float> [[SEL]], i32 2
+; CHECK-NEXT:    call void @v4float_user(<4 x float> [[SEL]])
+; CHECK-NEXT:    ret float [[EXTRACT]]
+;
+  %cmp = icmp ne i32 %c, 0
+  %sel = select i1 %cmp, <4 x float> %a, <4 x float> %b
+  %extract = extractelement <4 x float> %sel, i32 2
+  call void @v4float_user(<4 x float> %sel)
+  ret float %extract
+}
+
+define float @extract_one_vselect_user(<4 x float> %a, <4 x float> %b, <4 x i32> %c) #0 {
+; CHECK-LABEL: @extract_one_vselect_user(
+; CHECK-NEXT:    [[CMP_NOT:%.*]] = icmp eq <4 x i32> [[C:%.*]], zeroinitializer
+; CHECK-NEXT:    [[SEL:%.*]] = select <4 x i1> [[CMP_NOT]], <4 x float> [[B:%.*]], <4 x float> [[A:%.*]]
+; CHECK-NEXT:    [[EXTRACT:%.*]] = extractelement <4 x float> [[SEL]], i32 2
+; CHECK-NEXT:    call void @v4float_user(<4 x float> [[SEL]])
+; CHECK-NEXT:    ret float [[EXTRACT]]
+;
+  %cmp = icmp ne <4 x i32> %c, zeroinitializer
+  %sel = select <4 x i1> %cmp, <4 x float> %a, <4 x float> %b
+  %extract = extractelement <4 x float> %sel, i32 2
+  call void @v4float_user(<4 x float> %sel)
+  ret float %extract
+}
+
+; Do not convert the vector select into a scalar select. That would increase
+; the instruction count and potentially obfuscate a vector min/max idiom.
+
+define float @extract_one_vselect(<4 x float> %a, <4 x float> %b, <4 x i32> %c) #0 {
+; CHECK-LABEL: @extract_one_vselect(
+; CHECK-NEXT:    [[CMP_NOT:%.*]] = icmp eq <4 x i32> [[C:%.*]], zeroinitializer
+; CHECK-NEXT:    [[SELECT:%.*]] = select <4 x i1> [[CMP_NOT]], <4 x float> [[B:%.*]], <4 x float> [[A:%.*]]
+; CHECK-NEXT:    [[EXTRACT:%.*]] = extractelement <4 x float> [[SELECT]], i32 0
+; CHECK-NEXT:    ret float [[EXTRACT]]
+;
+  %cmp = icmp ne <4 x i32> %c, zeroinitializer
+  %select = select <4 x i1> %cmp, <4 x float> %a, <4 x float> %b
+  %extract = extractelement <4 x float> %select, i32 0
+  ret float %extract
+}
+
+; Multiple extractelements from a vector select
+define <2 x float> @extract_two_vselect(<4 x float> %a, <4 x float> %b, <4 x i32> %c) #0 {
+; CHECK-LABEL: @extract_two_vselect(
+; CHECK-NEXT:    [[CMP_NOT:%.*]] = icmp eq <4 x i32> [[C:%.*]], zeroinitializer
+; CHECK-NEXT:    [[SEL:%.*]] = select <4 x i1> [[CMP_NOT]], <4 x float> [[B:%.*]], <4 x float> [[A:%.*]]
+; CHECK-NEXT:    [[BUILD2:%.*]] = shufflevector <4 x float> [[SEL]], <4 x float> undef, <2 x i32> <i32 1, i32 2>
+; CHECK-NEXT:    ret <2 x float> [[BUILD2]]
+;
+  %cmp = icmp ne <4 x i32> %c, zeroinitializer
+  %sel = select <4 x i1> %cmp, <4 x float> %a, <4 x float> %b
+  %extract1 = extractelement <4 x float> %sel, i32 1
+  %extract2 = extractelement <4 x float> %sel, i32 2
+  %build1 = insertelement <2 x float> poison, float %extract1, i32 0
+  %build2 = insertelement <2 x float> %build1, float %extract2, i32 1
+  ret <2 x float> %build2
+}
+
+; The vector selects are not decomposed into scalar selects because that would increase
+; the instruction count. Extract+insert is converted to non-lane-crossing shuffles.
+; Test multiple extractelements
+define <4 x float> @simple_vector_select(<4 x float> %a, <4 x float> %b, <4 x i32> %c) #0 {
+; CHECK-LABEL: @simple_vector_select(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = extractelement <4 x i32> [[C:%.*]], i32 0
+; CHECK-NEXT:    [[TOBOOL_NOT:%.*]] = icmp eq i32 [[TMP0]], 0
+; CHECK-NEXT:    [[A_SINK:%.*]] = select i1 [[TOBOOL_NOT]], <4 x float> [[B:%.*]], <4 x float> [[A:%.*]]
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x i32> [[C]], i32 1
+; CHECK-NEXT:    [[TOBOOL1_NOT:%.*]] = icmp eq i32 [[TMP1]], 0
+; CHECK-NEXT:    [[A_SINK1:%.*]] = select i1 [[TOBOOL1_NOT]], <4 x float> [[B]], <4 x float> [[A]]
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[A_SINK]], <4 x float> [[A_SINK1]], <4 x i32> <i32 0, i32 5, i32 undef, i32 undef>
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x i32> [[C]], i32 2
+; CHECK-NEXT:    [[TOBOOL6_NOT:%.*]] = icmp eq i32 [[TMP3]], 0
+; CHECK-NEXT:    [[A_SINK2:%.*]] = select i1 [[TOBOOL6_NOT]], <4 x float> [[B]], <4 x float> [[A]]
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> [[A_SINK2]], <4 x i32> <i32 0, i32 1, i32 6, i32 undef>
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x i32> [[C]], i32 3
+; CHECK-NEXT:    [[TOBOOL11_NOT:%.*]] = icmp eq i32 [[TMP5]], 0
+; CHECK-NEXT:    [[A_SINK3:%.*]] = select i1 [[TOBOOL11_NOT]], <4 x float> [[B]], <4 x float> [[A]]
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[A_SINK3]], <4 x i32> <i32 0, i32 1, i32 2, i32 7>
+; CHECK-NEXT:    ret <4 x float> [[TMP6]]
+;
+entry:
+  %0 = extractelement <4 x i32> %c, i32 0
+  %tobool = icmp ne i32 %0, 0
+  %a.sink = select i1 %tobool, <4 x float> %a, <4 x float> %b
+  %1 = extractelement <4 x float> %a.sink, i32 0
+  %2 = insertelement <4 x float> poison, float %1, i32 0
+  %3 = extractelement <4 x i32> %c, i32 1
+  %tobool1 = icmp ne i32 %3, 0
+  %a.sink1 = select i1 %tobool1, <4 x float> %a, <4 x float> %b
+  %4 = extractelement <4 x float> %a.sink1, i32 1
+  %5 = insertelement <4 x float> %2, float %4, i32 1
+  %6 = extractelement <4 x i32> %c, i32 2
+  %tobool6 = icmp ne i32 %6, 0
+  %a.sink2 = select i1 %tobool6, <4 x float> %a, <4 x float> %b
+  %7 = extractelement <4 x float> %a.sink2, i32 2
+  %8 = insertelement <4 x float> %5, float %7, i32 2
+  %9 = extractelement <4 x i32> %c, i32 3
+  %tobool11 = icmp ne i32 %9, 0
+  %a.sink3 = select i1 %tobool11, <4 x float> %a, <4 x float> %b
+  %10 = extractelement <4 x float> %a.sink3, i32 3
+  %11 = insertelement <4 x float> %8, float %10, i32 3
+  ret <4 x float> %11
+}
+
+define <4 x i32> @extract_cond(<4 x i32> %x, <4 x i32> %y, <4 x i1> %condv) {
+; CHECK-LABEL: @extract_cond(
+; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <4 x i1> [[CONDV:%.*]], <4 x i1> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+; CHECK-NEXT:    [[R:%.*]] = select <4 x i1> [[DOTSPLAT]], <4 x i32> [[X:%.*]], <4 x i32> [[Y:%.*]]
+; CHECK-NEXT:    ret <4 x i32> [[R]]
+;
+  %cond = extractelement <4 x i1> %condv, i32 3
+  %r = select i1 %cond, <4 x i32> %x, <4 x i32> %y
+  ret <4 x i32> %r
+}
+
+define <4 x i32> @splat_cond(<4 x i32> %x, <4 x i32> %y, <4 x i1> %condv) {
+; CHECK-LABEL: @splat_cond(
+; CHECK-NEXT:    [[SPLATCOND:%.*]] = shufflevector <4 x i1> [[CONDV:%.*]], <4 x i1> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+; CHECK-NEXT:    [[R:%.*]] = select <4 x i1> [[SPLATCOND]], <4 x i32> [[X:%.*]], <4 x i32> [[Y:%.*]]
+; CHECK-NEXT:    ret <4 x i32> [[R]]
+;
+  %splatcond = shufflevector <4 x i1> %condv, <4 x i1> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+  %r = select <4 x i1> %splatcond, <4 x i32> %x, <4 x i32> %y
+  ret <4 x i32> %r
+}
+
+declare void @extra_use(i1)
+
+; Negative test
+
+define <4 x i32> @extract_cond_extra_use(<4 x i32> %x, <4 x i32> %y, <4 x i1> %condv) {
+; CHECK-LABEL: @extract_cond_extra_use(
+; CHECK-NEXT:    [[COND:%.*]] = extractelement <4 x i1> [[CONDV:%.*]], i32 3
+; CHECK-NEXT:    call void @extra_use(i1 [[COND]])
+; CHECK-NEXT:    [[R:%.*]] = select i1 [[COND]], <4 x i32> [[X:%.*]], <4 x i32> [[Y:%.*]]
+; CHECK-NEXT:    ret <4 x i32> [[R]]
+;
+  %cond = extractelement <4 x i1> %condv, i32 3
+  call void @extra_use(i1 %cond)
+  %r = select i1 %cond, <4 x i32> %x, <4 x i32> %y
+  ret <4 x i32> %r
+}
+
+; Negative test
+
+define <4 x i32> @extract_cond_variable_index(<4 x i32> %x, <4 x i32> %y, <4 x i1> %condv, i32 %index) {
+; CHECK-LABEL: @extract_cond_variable_index(
+; CHECK-NEXT:    [[COND:%.*]] = extractelement <4 x i1> [[CONDV:%.*]], i32 [[INDEX:%.*]]
+; CHECK-NEXT:    [[R:%.*]] = select i1 [[COND]], <4 x i32> [[X:%.*]], <4 x i32> [[Y:%.*]]
+; CHECK-NEXT:    ret <4 x i32> [[R]]
+;
+  %cond = extractelement <4 x i1> %condv, i32 %index
+  %r = select i1 %cond, <4 x i32> %x, <4 x i32> %y
+  ret <4 x i32> %r
+}
+
+; IR shuffle can alter the number of elements in the vector, so this is ok.
+
+define <4 x i32> @extract_cond_type_mismatch(<4 x i32> %x, <4 x i32> %y, <5 x i1> %condv) {
+; CHECK-LABEL: @extract_cond_type_mismatch(
+; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <5 x i1> [[CONDV:%.*]], <5 x i1> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT:    [[R:%.*]] = select <4 x i1> [[DOTSPLAT]], <4 x i32> [[X:%.*]], <4 x i32> [[Y:%.*]]
+; CHECK-NEXT:    ret <4 x i32> [[R]]
+;
+  %cond = extractelement <5 x i1> %condv, i32 1
+  %r = select i1 %cond, <4 x i32> %x, <4 x i32> %y
+  ret <4 x i32> %r
+}
+
+
+attributes #0 = { nounwind ssp uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
--- a/llvm/test/Transforms/InstCombine/shift-add-inseltpoison.ll
+++ b/llvm/test/Transforms/InstCombine/shift-add-inseltpoison.ll
@ -0,0 +1,122 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; This test makes sure that these instructions are properly eliminated.
+;
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+define i32 @shl_C1_add_A_C2_i32(i16 %A) {
+; CHECK-LABEL: @shl_C1_add_A_C2_i32(
+; CHECK-NEXT:    [[B:%.*]] = zext i16 [[A:%.*]] to i32
+; CHECK-NEXT:    [[D:%.*]] = shl i32 192, [[B]]
+; CHECK-NEXT:    ret i32 [[D]]
+;
+  %B = zext i16 %A to i32
+  %C = add i32 %B, 5
+  %D = shl i32 6, %C
+  ret i32 %D
+}
+
+define i32 @ashr_C1_add_A_C2_i32(i32 %A) {
+; CHECK-LABEL: @ashr_C1_add_A_C2_i32(
+; CHECK-NEXT:    ret i32 0
+;
+  %B = and i32 %A, 65535
+  %C = add i32 %B, 5
+  %D = ashr i32 6, %C
+  ret i32 %D
+}
+
+define i32 @lshr_C1_add_A_C2_i32(i32 %A) {
+; CHECK-LABEL: @lshr_C1_add_A_C2_i32(
+; CHECK-NEXT:    [[B:%.*]] = and i32 [[A:%.*]], 65535
+; CHECK-NEXT:    [[D:%.*]] = shl i32 192, [[B]]
+; CHECK-NEXT:    ret i32 [[D]]
+;
+  %B = and i32 %A, 65535
+  %C = add i32 %B, 5
+  %D = shl i32 6, %C
+  ret i32 %D
+}
+
+define <4 x i32> @shl_C1_add_A_C2_v4i32(<4 x i16> %A) {
+; CHECK-LABEL: @shl_C1_add_A_C2_v4i32(
+; CHECK-NEXT:    [[B:%.*]] = zext <4 x i16> [[A:%.*]] to <4 x i32>
+; CHECK-NEXT:    [[D:%.*]] = shl <4 x i32> <i32 6, i32 4, i32 poison, i32 -458752>, [[B]]
+; CHECK-NEXT:    ret <4 x i32> [[D]]
+;
+  %B = zext <4 x i16> %A to <4 x i32>
+  %C = add <4 x i32> %B, <i32 0, i32 1, i32 50, i32 16>
+  %D = shl <4 x i32> <i32 6, i32 2, i32 1, i32 -7>, %C
+  ret <4 x i32> %D
+}
+
+define <4 x i32> @ashr_C1_add_A_C2_v4i32(<4 x i32> %A) {
+; CHECK-LABEL: @ashr_C1_add_A_C2_v4i32(
+; CHECK-NEXT:    [[B:%.*]] = and <4 x i32> [[A:%.*]], <i32 0, i32 15, i32 255, i32 65535>
+; CHECK-NEXT:    [[D:%.*]] = ashr <4 x i32> <i32 6, i32 1, i32 poison, i32 -1>, [[B]]
+; CHECK-NEXT:    ret <4 x i32> [[D]]
+;
+  %B = and <4 x i32> %A, <i32 0, i32 15, i32 255, i32 65535>
+  %C = add <4 x i32> %B, <i32 0, i32 1, i32 50, i32 16>
+  %D = ashr <4 x i32> <i32 6, i32 2, i32 1, i32 -7>, %C
+  ret <4 x i32> %D
+}
+
+define <4 x i32> @lshr_C1_add_A_C2_v4i32(<4 x i32> %A) {
+; CHECK-LABEL: @lshr_C1_add_A_C2_v4i32(
+; CHECK-NEXT:    [[B:%.*]] = and <4 x i32> [[A:%.*]], <i32 0, i32 15, i32 255, i32 65535>
+; CHECK-NEXT:    [[D:%.*]] = lshr <4 x i32> <i32 6, i32 1, i32 poison, i32 65535>, [[B]]
+; CHECK-NEXT:    ret <4 x i32> [[D]]
+;
+  %B = and <4 x i32> %A, <i32 0, i32 15, i32 255, i32 65535>
+  %C = add <4 x i32> %B, <i32 0, i32 1, i32 50, i32 16>
+  %D = lshr <4 x i32> <i32 6, i32 2, i32 1, i32 -7>, %C
+  ret <4 x i32> %D
+}
+
+define <4 x i32> @shl_C1_add_A_C2_v4i32_splat(i16 %I) {
+; CHECK-LABEL: @shl_C1_add_A_C2_v4i32_splat(
+; CHECK-NEXT:    [[A:%.*]] = zext i16 [[I:%.*]] to i32
+; CHECK-NEXT:    [[B:%.*]] = insertelement <4 x i32> poison, i32 [[A]], i32 0
+; CHECK-NEXT:    [[C:%.*]] = shufflevector <4 x i32> [[B]], <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[E:%.*]] = shl <4 x i32> <i32 6, i32 4, i32 poison, i32 -458752>, [[C]]
+; CHECK-NEXT:    ret <4 x i32> [[E]]
+;
+  %A = zext i16 %I to i32
+  %B = insertelement <4 x i32> poison, i32 %A, i32 0
+  %C = shufflevector <4 x i32> %B, <4 x i32> undef, <4 x i32> zeroinitializer
+  %D = add <4 x i32> %C, <i32 0, i32 1, i32 50, i32 16>
+  %E = shl <4 x i32> <i32 6, i32 2, i32 1, i32 -7>, %D
+  ret <4 x i32> %E
+}
+
+define <4 x i32> @ashr_C1_add_A_C2_v4i32_splat(i16 %I) {
+; CHECK-LABEL: @ashr_C1_add_A_C2_v4i32_splat(
+; CHECK-NEXT:    [[A:%.*]] = zext i16 [[I:%.*]] to i32
+; CHECK-NEXT:    [[B:%.*]] = insertelement <4 x i32> poison, i32 [[A]], i32 0
+; CHECK-NEXT:    [[C:%.*]] = shufflevector <4 x i32> [[B]], <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[E:%.*]] = ashr <4 x i32> <i32 6, i32 1, i32 poison, i32 -1>, [[C]]
+; CHECK-NEXT:    ret <4 x i32> [[E]]
+;
+  %A = zext i16 %I to i32
+  %B = insertelement <4 x i32> poison, i32 %A, i32 0
+  %C = shufflevector <4 x i32> %B, <4 x i32> undef, <4 x i32> zeroinitializer
+  %D = add <4 x i32> %C, <i32 0, i32 1, i32 50, i32 16>
+  %E = ashr <4 x i32> <i32 6, i32 2, i32 1, i32 -7>, %D
+  ret <4 x i32> %E
+}
+
+define <4 x i32> @lshr_C1_add_A_C2_v4i32_splat(i16 %I) {
+; CHECK-LABEL: @lshr_C1_add_A_C2_v4i32_splat(
+; CHECK-NEXT:    [[A:%.*]] = zext i16 [[I:%.*]] to i32
+; CHECK-NEXT:    [[B:%.*]] = insertelement <4 x i32> poison, i32 [[A]], i32 0
+; CHECK-NEXT:    [[C:%.*]] = shufflevector <4 x i32> [[B]], <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[E:%.*]] = lshr <4 x i32> <i32 6, i32 1, i32 poison, i32 65535>, [[C]]
+; CHECK-NEXT:    ret <4 x i32> [[E]]
+;
+  %A = zext i16 %I to i32
+  %B = insertelement <4 x i32> poison, i32 %A, i32 0
+  %C = shufflevector <4 x i32> %B, <4 x i32> undef, <4 x i32> zeroinitializer
+  %D = add <4 x i32> %C, <i32 0, i32 1, i32 50, i32 16>
+  %E = lshr <4 x i32> <i32 6, i32 2, i32 1, i32 -7>, %D
+  ret <4 x i32> %E
+}
--- a/llvm/test/Transforms/InstCombine/shufflevector-div-rem-inseltpoison.ll
+++ b/llvm/test/Transforms/InstCombine/shufflevector-div-rem-inseltpoison.ll
@ -0,0 +1,122 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -instcombine -S -o - | FileCheck %s
+
+; This test case was added as a reproducer for a miscompile, where instcombine
+; introduced an
+;   srem <2 x i16> %1, <i16 undef, i16 2>
+; instruction, which makes the whole srem undefined (even if we only end up
+; extracting the second element in the vector).
+define i16 @test_srem_orig(i16 %a, i1 %cmp) {
+; CHECK-LABEL: @test_srem_orig(
+; CHECK-NEXT:    [[SPLATINSERT:%.*]] = insertelement <2 x i16> poison, i16 [[A:%.*]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = srem <2 x i16> [[SPLATINSERT]], <i16 2, i16 1>
+; CHECK-NEXT:    [[SPLAT_OP:%.*]] = shufflevector <2 x i16> [[TMP1]], <2 x i16> undef, <2 x i32> <i32 undef, i32 0>
+; CHECK-NEXT:    [[T2:%.*]] = select i1 [[CMP:%.*]], <2 x i16> <i16 undef, i16 1>, <2 x i16> [[SPLAT_OP]]
+; CHECK-NEXT:    [[T3:%.*]] = extractelement <2 x i16> [[T2]], i32 1
+; CHECK-NEXT:    ret i16 [[T3]]
+;
+  %splatinsert = insertelement <2 x i16> poison, i16 %a, i32 0
+  %splat = shufflevector <2 x i16> %splatinsert, <2 x i16> undef, <2 x i32> zeroinitializer
+  %t1 = select i1 %cmp, <2 x i16> <i16 1, i16 1>, <2 x i16> %splat
+  %t2 = srem <2 x i16> %t1, <i16 2, i16 2>
+  %t3 = extractelement <2 x i16> %t2, i32 1
+  ret i16 %t3
+}
+
+; This is basically a reduced version of test_srem_orig (based on what the
+; code would look like after a few iterations of instcombine, just before we
+; try to transform the shufflevector by doing
+; "evaluateInDifferentElementOrder".
+define <2 x i16> @test_srem(i16 %a, i1 %cmp) {
+; CHECK-LABEL: @test_srem(
+; CHECK-NEXT:    [[SPLATINSERT:%.*]] = insertelement <2 x i16> poison, i16 [[A:%.*]], i32 0
+; CHECK-NEXT:    [[T1:%.*]] = srem <2 x i16> [[SPLATINSERT]], <i16 2, i16 1>
+; CHECK-NEXT:    [[SPLAT_OP:%.*]] = shufflevector <2 x i16> [[T1]], <2 x i16> undef, <2 x i32> <i32 undef, i32 0>
+; CHECK-NEXT:    [[T2:%.*]] = select i1 [[CMP:%.*]], <2 x i16> <i16 77, i16 99>, <2 x i16> [[SPLAT_OP]]
+; CHECK-NEXT:    ret <2 x i16> [[T2]]
+;
+  %splatinsert = insertelement <2 x i16> poison, i16 %a, i32 0
+  %t1 = srem <2 x i16> %splatinsert, <i16 2, i16 1>
+  %splat.op = shufflevector <2 x i16> %t1, <2 x i16> undef, <2 x i32> <i32 undef, i32 0>
+  %t2 = select i1 %cmp, <2 x i16> <i16 77, i16 99>, <2 x i16> %splat.op
+  ret <2 x i16> %t2
+}
+
+define <2 x i16> @test_urem(i16 %a, i1 %cmp) {
+; CHECK-LABEL: @test_urem(
+; CHECK-NEXT:    [[SPLATINSERT:%.*]] = insertelement <2 x i16> poison, i16 [[A:%.*]], i32 0
+; CHECK-NEXT:    [[T1:%.*]] = urem <2 x i16> [[SPLATINSERT]], <i16 3, i16 1>
+; CHECK-NEXT:    [[SPLAT_OP:%.*]] = shufflevector <2 x i16> [[T1]], <2 x i16> undef, <2 x i32> <i32 undef, i32 0>
+; CHECK-NEXT:    [[T2:%.*]] = select i1 [[CMP:%.*]], <2 x i16> <i16 77, i16 99>, <2 x i16> [[SPLAT_OP]]
+; CHECK-NEXT:    ret <2 x i16> [[T2]]
+;
+  %splatinsert = insertelement <2 x i16> poison, i16 %a, i32 0
+  %t1 = urem <2 x i16> %splatinsert, <i16 3, i16 1>
+  %splat.op = shufflevector <2 x i16> %t1, <2 x i16> undef, <2 x i32> <i32 undef, i32 0>
+  %t2 = select i1 %cmp, <2 x i16> <i16 77, i16 99>, <2 x i16> %splat.op
+  ret <2 x i16> %t2
+}
+
+define <2 x i16> @test_sdiv(i16 %a, i1 %cmp) {
+; CHECK-LABEL: @test_sdiv(
+; CHECK-NEXT:    [[SPLATINSERT:%.*]] = insertelement <2 x i16> poison, i16 [[A:%.*]], i32 0
+; CHECK-NEXT:    [[T1:%.*]] = sdiv <2 x i16> [[SPLATINSERT]], <i16 2, i16 1>
+; CHECK-NEXT:    [[SPLAT_OP:%.*]] = shufflevector <2 x i16> [[T1]], <2 x i16> undef, <2 x i32> <i32 undef, i32 0>
+; CHECK-NEXT:    [[T2:%.*]] = select i1 [[CMP:%.*]], <2 x i16> <i16 77, i16 99>, <2 x i16> [[SPLAT_OP]]
+; CHECK-NEXT:    ret <2 x i16> [[T2]]
+;
+  %splatinsert = insertelement <2 x i16> poison, i16 %a, i32 0
+  %t1 = sdiv <2 x i16> %splatinsert, <i16 2, i16 1>
+  %splat.op = shufflevector <2 x i16> %t1, <2 x i16> undef, <2 x i32> <i32 undef, i32 0>
+  %t2 = select i1 %cmp, <2 x i16> <i16 77, i16 99>, <2 x i16> %splat.op
+  ret <2 x i16> %t2
+}
+
+define <2 x i16> @test_udiv(i16 %a, i1 %cmp) {
+; CHECK-LABEL: @test_udiv(
+; CHECK-NEXT:    [[SPLATINSERT:%.*]] = insertelement <2 x i16> poison, i16 [[A:%.*]], i32 0
+; CHECK-NEXT:    [[T1:%.*]] = udiv <2 x i16> [[SPLATINSERT]], <i16 3, i16 1>
+; CHECK-NEXT:    [[SPLAT_OP:%.*]] = shufflevector <2 x i16> [[T1]], <2 x i16> undef, <2 x i32> <i32 undef, i32 0>
+; CHECK-NEXT:    [[T2:%.*]] = select i1 [[CMP:%.*]], <2 x i16> <i16 77, i16 99>, <2 x i16> [[SPLAT_OP]]
+; CHECK-NEXT:    ret <2 x i16> [[T2]]
+;
+  %splatinsert = insertelement <2 x i16> poison, i16 %a, i32 0
+  %t1 = udiv <2 x i16> %splatinsert, <i16 3, i16 1>
+  %splat.op = shufflevector <2 x i16> %t1, <2 x i16> undef, <2 x i32> <i32 undef, i32 0>
+  %t2 = select i1 %cmp, <2 x i16> <i16 77, i16 99>, <2 x i16> %splat.op
+  ret <2 x i16> %t2
+}
+
+; For fdiv we do not need to worry about div by undef. Verify that the
+; shufflevector is eliminated here.
+define <2 x float> @test_fdiv(float %a, float %b, i1 %cmp) {
+; CHECK-LABEL: @test_fdiv(
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x float> undef, float [[A:%.*]], i32 1
+; CHECK-NEXT:    [[TMP2:%.*]] = fdiv <2 x float> [[TMP1]], <float undef, float 3.000000e+00>
+; CHECK-NEXT:    [[T2:%.*]] = select i1 [[CMP:%.*]], <2 x float> <float 7.700000e+01, float 9.900000e+01>, <2 x float> [[TMP2]]
+; CHECK-NEXT:    ret <2 x float> [[T2]]
+;
+  %splatinsert = insertelement <2 x float> poison, float %a, i32 0
+  %denom = insertelement <2 x float> <float 3.0, float undef>, float 1.0, i32 1
+  %t1 = fdiv <2 x float> %splatinsert, %denom
+  %splat.op = shufflevector <2 x float> %t1, <2 x float> undef, <2 x i32> <i32 undef, i32 0>
+  %t2 = select i1 %cmp, <2 x float> <float 77.0, float 99.0>, <2 x float> %splat.op
+  ret <2 x float> %t2
+}
+
+; For frem we do not need to worry about div by undef. Verify that the
+; shufflevector is eliminated here.
+define <2 x float> @test_frem(float %a, float %b, i1 %cmp) {
+; CHECK-LABEL: @test_frem(
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x float> undef, float [[A:%.*]], i32 1
+; CHECK-NEXT:    [[TMP2:%.*]] = frem <2 x float> [[TMP1]], <float undef, float 3.000000e+00>
+; CHECK-NEXT:    [[T2:%.*]] = select i1 [[CMP:%.*]], <2 x float> <float 7.700000e+01, float 9.900000e+01>, <2 x float> [[TMP2]]
+; CHECK-NEXT:    ret <2 x float> [[T2]]
+;
+  %splatinsert = insertelement <2 x float> poison, float %a, i32 0
+  %denom = insertelement <2 x float> <float 3.0, float undef>, float 1.0, i32 1
+  %t1 = frem <2 x float> %splatinsert, %denom
+  %splat.op = shufflevector <2 x float> %t1, <2 x float> undef, <2 x i32> <i32 undef, i32 0>
+  %t2 = select i1 %cmp, <2 x float> <float 77.0, float 99.0>, <2 x float> %splat.op
+  ret <2 x float> %t2
+}
--- a/llvm/test/Transforms/InstCombine/trunc-extractelement-inseltpoison.ll
+++ b/llvm/test/Transforms/InstCombine/trunc-extractelement-inseltpoison.ll
@ -0,0 +1,195 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -instcombine -S -data-layout="e" | FileCheck %s --check-prefixes=ANY,LE
+; RUN: opt < %s -instcombine -S -data-layout="E" | FileCheck %s --check-prefixes=ANY,BE
+
+define i32 @shrinkExtractElt_i64_to_i32_0(<3 x i64> %x) {
+; LE-LABEL: @shrinkExtractElt_i64_to_i32_0(
+; LE-NEXT:    [[TMP1:%.*]] = bitcast <3 x i64> [[X:%.*]] to <6 x i32>
+; LE-NEXT:    [[T:%.*]] = extractelement <6 x i32> [[TMP1]], i32 0
+; LE-NEXT:    ret i32 [[T]]
+;
+; BE-LABEL: @shrinkExtractElt_i64_to_i32_0(
+; BE-NEXT:    [[TMP1:%.*]] = bitcast <3 x i64> [[X:%.*]] to <6 x i32>
+; BE-NEXT:    [[T:%.*]] = extractelement <6 x i32> [[TMP1]], i32 1
+; BE-NEXT:    ret i32 [[T]]
+;
+  %e = extractelement <3 x i64> %x, i32 0
+  %t = trunc i64 %e to i32
+  ret i32 %t
+}
+
+define i32 @vscale_shrinkExtractElt_i64_to_i32_0(<vscale x 3 x i64> %x) {
+; LE-LABEL: @vscale_shrinkExtractElt_i64_to_i32_0(
+; LE-NEXT:    [[TMP1:%.*]] = bitcast <vscale x 3 x i64> [[X:%.*]] to <vscale x 6 x i32>
+; LE-NEXT:    [[T:%.*]] = extractelement <vscale x 6 x i32> [[TMP1]], i32 0
+; LE-NEXT:    ret i32 [[T]]
+;
+; BE-LABEL: @vscale_shrinkExtractElt_i64_to_i32_0(
+; BE-NEXT:    [[TMP1:%.*]] = bitcast <vscale x 3 x i64> [[X:%.*]] to <vscale x 6 x i32>
+; BE-NEXT:    [[T:%.*]] = extractelement <vscale x 6 x i32> [[TMP1]], i32 1
+; BE-NEXT:    ret i32 [[T]]
+;
+  %e = extractelement <vscale x 3 x i64> %x, i32 0
+  %t = trunc i64 %e to i32
+  ret i32 %t
+}
+
+
+define i32 @shrinkExtractElt_i64_to_i32_1(<3 x i64> %x) {
+; LE-LABEL: @shrinkExtractElt_i64_to_i32_1(
+; LE-NEXT:    [[TMP1:%.*]] = bitcast <3 x i64> [[X:%.*]] to <6 x i32>
+; LE-NEXT:    [[T:%.*]] = extractelement <6 x i32> [[TMP1]], i32 2
+; LE-NEXT:    ret i32 [[T]]
+;
+; BE-LABEL: @shrinkExtractElt_i64_to_i32_1(
+; BE-NEXT:    [[TMP1:%.*]] = bitcast <3 x i64> [[X:%.*]] to <6 x i32>
+; BE-NEXT:    [[T:%.*]] = extractelement <6 x i32> [[TMP1]], i32 3
+; BE-NEXT:    ret i32 [[T]]
+;
+  %e = extractelement <3 x i64> %x, i32 1
+  %t = trunc i64 %e to i32
+  ret i32 %t
+}
+
+define i32 @shrinkExtractElt_i64_to_i32_2(<3 x i64> %x) {
+; LE-LABEL: @shrinkExtractElt_i64_to_i32_2(
+; LE-NEXT:    [[TMP1:%.*]] = bitcast <3 x i64> [[X:%.*]] to <6 x i32>
+; LE-NEXT:    [[T:%.*]] = extractelement <6 x i32> [[TMP1]], i32 4
+; LE-NEXT:    ret i32 [[T]]
+;
+; BE-LABEL: @shrinkExtractElt_i64_to_i32_2(
+; BE-NEXT:    [[TMP1:%.*]] = bitcast <3 x i64> [[X:%.*]] to <6 x i32>
+; BE-NEXT:    [[T:%.*]] = extractelement <6 x i32> [[TMP1]], i32 5
+; BE-NEXT:    ret i32 [[T]]
+;
+  %e = extractelement <3 x i64> %x, i32 2
+  %t = trunc i64 %e to i32
+  ret i32 %t
+}
+
+define i16 @shrinkExtractElt_i64_to_i16_0(<3 x i64> %x) {
+; LE-LABEL: @shrinkExtractElt_i64_to_i16_0(
+; LE-NEXT:    [[TMP1:%.*]] = bitcast <3 x i64> [[X:%.*]] to <12 x i16>
+; LE-NEXT:    [[T:%.*]] = extractelement <12 x i16> [[TMP1]], i32 0
+; LE-NEXT:    ret i16 [[T]]
+;
+; BE-LABEL: @shrinkExtractElt_i64_to_i16_0(
+; BE-NEXT:    [[TMP1:%.*]] = bitcast <3 x i64> [[X:%.*]] to <12 x i16>
+; BE-NEXT:    [[T:%.*]] = extractelement <12 x i16> [[TMP1]], i32 3
+; BE-NEXT:    ret i16 [[T]]
+;
+  %e = extractelement <3 x i64> %x, i16 0
+  %t = trunc i64 %e to i16
+  ret i16 %t
+}
+
+define i16 @shrinkExtractElt_i64_to_i16_1(<3 x i64> %x) {
+; LE-LABEL: @shrinkExtractElt_i64_to_i16_1(
+; LE-NEXT:    [[TMP1:%.*]] = bitcast <3 x i64> [[X:%.*]] to <12 x i16>
+; LE-NEXT:    [[T:%.*]] = extractelement <12 x i16> [[TMP1]], i32 4
+; LE-NEXT:    ret i16 [[T]]
+;
+; BE-LABEL: @shrinkExtractElt_i64_to_i16_1(
+; BE-NEXT:    [[TMP1:%.*]] = bitcast <3 x i64> [[X:%.*]] to <12 x i16>
+; BE-NEXT:    [[T:%.*]] = extractelement <12 x i16> [[TMP1]], i32 7
+; BE-NEXT:    ret i16 [[T]]
+;
+  %e = extractelement <3 x i64> %x, i16 1
+  %t = trunc i64 %e to i16
+  ret i16 %t
+}
+
+define i16 @shrinkExtractElt_i64_to_i16_2(<3 x i64> %x) {
+; LE-LABEL: @shrinkExtractElt_i64_to_i16_2(
+; LE-NEXT:    [[TMP1:%.*]] = bitcast <3 x i64> [[X:%.*]] to <12 x i16>
+; LE-NEXT:    [[T:%.*]] = extractelement <12 x i16> [[TMP1]], i32 8
+; LE-NEXT:    ret i16 [[T]]
+;
+; BE-LABEL: @shrinkExtractElt_i64_to_i16_2(
+; BE-NEXT:    [[TMP1:%.*]] = bitcast <3 x i64> [[X:%.*]] to <12 x i16>
+; BE-NEXT:    [[T:%.*]] = extractelement <12 x i16> [[TMP1]], i32 11
+; BE-NEXT:    ret i16 [[T]]
+;
+  %e = extractelement <3 x i64> %x, i16 2
+  %t = trunc i64 %e to i16
+  ret i16 %t
+}
+
+; Crazy types may be ok.
+define i11 @shrinkExtractElt_i33_to_11_2(<3 x i33> %x) {
+; LE-LABEL: @shrinkExtractElt_i33_to_11_2(
+; LE-NEXT:    [[TMP1:%.*]] = bitcast <3 x i33> [[X:%.*]] to <9 x i11>
+; LE-NEXT:    [[T:%.*]] = extractelement <9 x i11> [[TMP1]], i32 6
+; LE-NEXT:    ret i11 [[T]]
+;
+; BE-LABEL: @shrinkExtractElt_i33_to_11_2(
+; BE-NEXT:    [[TMP1:%.*]] = bitcast <3 x i33> [[X:%.*]] to <9 x i11>
+; BE-NEXT:    [[T:%.*]] = extractelement <9 x i11> [[TMP1]], i32 8
+; BE-NEXT:    ret i11 [[T]]
+;
+  %e = extractelement <3 x i33> %x, i16 2
+  %t = trunc i33 %e to i11
+  ret i11 %t
+}
+
+; Do not optimize if it would result in an invalid bitcast instruction.
+define i13 @shrinkExtractElt_i67_to_i13_2(<3 x i67> %x) {
+; ANY-LABEL: @shrinkExtractElt_i67_to_i13_2(
+; ANY-NEXT:    [[E:%.*]] = extractelement <3 x i67> [[X:%.*]], i459 2
+; ANY-NEXT:    [[T:%.*]] = trunc i67 [[E]] to i13
+; ANY-NEXT:    ret i13 [[T]]
+;
+  %e = extractelement <3 x i67> %x, i459 2
+  %t = trunc i67 %e to i13
+  ret i13 %t
+}
+
+; Do not optimize if the bitcast instruction would be valid, but the
+; transform would be wrong.
+define i30 @shrinkExtractElt_i40_to_i30_1(<3 x i40> %x) {
+; ANY-LABEL: @shrinkExtractElt_i40_to_i30_1(
+; ANY-NEXT:    [[E:%.*]] = extractelement <3 x i40> [[X:%.*]], i32 1
+; ANY-NEXT:    [[T:%.*]] = trunc i40 [[E]] to i30
+; ANY-NEXT:    ret i30 [[T]]
+;
+  %e = extractelement <3 x i40> %x, i32 1
+  %t = trunc i40 %e to i30
+  ret i30 %t
+}
+
+; Do not canonicalize if that would increase the instruction count.
+declare void @use(i64)
+define i16 @shrinkExtractElt_i64_to_i16_2_extra_use(<3 x i64> %x) {
+; ANY-LABEL: @shrinkExtractElt_i64_to_i16_2_extra_use(
+; ANY-NEXT:    [[E:%.*]] = extractelement <3 x i64> [[X:%.*]], i64 2
+; ANY-NEXT:    call void @use(i64 [[E]])
+; ANY-NEXT:    [[T:%.*]] = trunc i64 [[E]] to i16
+; ANY-NEXT:    ret i16 [[T]]
+;
+  %e = extractelement <3 x i64> %x, i64 2
+  call void @use(i64 %e)
+  %t = trunc i64 %e to i16
+  ret i16 %t
+}
+
+; Check to ensure PR45314 remains fixed.
+define <4 x i64> @PR45314(<4 x i64> %x) {
+; LE-LABEL: @PR45314(
+; LE-NEXT:    [[TMP1:%.*]] = bitcast <4 x i64> [[X:%.*]] to <8 x i32>
+; LE-NEXT:    [[S:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> undef, <8 x i32> zeroinitializer
+; LE-NEXT:    [[B:%.*]] = bitcast <8 x i32> [[S]] to <4 x i64>
+; LE-NEXT:    ret <4 x i64> [[B]]
+;
+; BE-LABEL: @PR45314(
+; BE-NEXT:    [[TMP1:%.*]] = bitcast <4 x i64> [[X:%.*]] to <8 x i32>
+; BE-NEXT:    [[S:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> undef, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+; BE-NEXT:    [[B:%.*]] = bitcast <8 x i32> [[S]] to <4 x i64>
+; BE-NEXT:    ret <4 x i64> [[B]]
+;
+  %e = extractelement <4 x i64> %x, i32 0
+  %t = trunc i64 %e to i32
+  %i = insertelement <8 x i32> poison, i32 %t, i32 0
+  %s = shufflevector <8 x i32> %i, <8 x i32> undef, <8 x i32> zeroinitializer
+  %b = bitcast <8 x i32> %s to <4 x i64>
+  ret <4 x i64> %b
+}
--- a/llvm/test/Transforms/InstCombine/udiv-pow2-vscale-inseltpoison.ll
+++ b/llvm/test/Transforms/InstCombine/udiv-pow2-vscale-inseltpoison.ll
@ -0,0 +1,27 @@
+; RUN: opt -instcombine -S < %s | FileCheck %s
+
+; This vscale udiv with a power-of-2 spalt on the rhs should not crash opt
+
+; CHECK: define <vscale x 2 x i32> @udiv_pow2_vscale(<vscale x 2 x i32> %lhs)
+define <vscale x 2 x i32> @udiv_pow2_vscale(<vscale x 2 x i32> %lhs) {
+  %splatter = insertelement <vscale x 2 x i32> poison, i32 2, i32 0
+  %rhs = shufflevector <vscale x 2 x i32> %splatter,
+                       <vscale x 2 x i32> undef,
+                       <vscale x 2 x i32> zeroinitializer
+  %res = udiv <vscale x 2 x i32> %lhs, %rhs
+  ret <vscale x 2 x i32> %res
+}
+
+; This fixed width udiv with a power-of-2 splat on the rhs should also not
+; crash, and instcombine should eliminate the udiv
+
+; CHECK-LABEL: define <2 x i32> @udiv_pow2_fixed(<2 x i32> %lhs)
+; CHECK-NOT: udiv
+define <2 x i32> @udiv_pow2_fixed(<2 x i32> %lhs) {
+  %splatter = insertelement <2 x i32> poison, i32 2, i32 0
+  %rhs = shufflevector <2 x i32> %splatter,
+                       <2 x i32> undef,
+                       <2 x i32> zeroinitializer
+  %res = udiv <2 x i32> %lhs, %rhs
+  ret <2 x i32> %res
+}
--- a/llvm/test/Transforms/InstCombine/vec_demanded_elts-inseltpoison.ll
+++ b/llvm/test/Transforms/InstCombine/vec_demanded_elts-inseltpoison.ll
@ -0,0 +1,850 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -instcombine -S | FileCheck %s
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+define i32 @test2(float %f) {
+; CHECK-LABEL: @test2(
+; CHECK-NEXT:    [[T5:%.*]] = fmul float [[F:%.*]], [[F]]
+; CHECK-NEXT:    [[T21:%.*]] = bitcast float [[T5]] to i32
+; CHECK-NEXT:    ret i32 [[T21]]
+;
+  %t5 = fmul float %f, %f
+  %t9 = insertelement <4 x float> poison, float %t5, i32 0
+  %t10 = insertelement <4 x float> %t9, float 0.000000e+00, i32 1
+  %t11 = insertelement <4 x float> %t10, float 0.000000e+00, i32 2
+  %t12 = insertelement <4 x float> %t11, float 0.000000e+00, i32 3
+  %t19 = bitcast <4 x float> %t12 to <4 x i32>
+  %t21 = extractelement <4 x i32> %t19, i32 0
+  ret i32 %t21
+}
+
+define void @get_image() nounwind {
+; CHECK-LABEL: @get_image(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = call i32 @fgetc(i8* null) [[ATTR0:#.*]]
+; CHECK-NEXT:    br i1 false, label [[BB2:%.*]], label [[BB3:%.*]]
+; CHECK:       bb2:
+; CHECK-NEXT:    br label [[BB3]]
+; CHECK:       bb3:
+; CHECK-NEXT:    unreachable
+;
+entry:
+  %0 = call i32 @fgetc(i8* null) nounwind
+  %1 = trunc i32 %0 to i8
+  %t2 = insertelement <100 x i8> zeroinitializer, i8 %1, i32 1
+  %t1 = extractelement <100 x i8> %t2, i32 0
+  %2 = icmp eq i8 %t1, 80
+  br i1 %2, label %bb2, label %bb3
+
+bb2:            ; preds = %entry
+  br label %bb3
+
+bb3:            ; preds = %bb2, %entry
+  unreachable
+}
+
+; PR4340
+define void @vac(<4 x float>* nocapture %a) nounwind {
+; CHECK-LABEL: @vac(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    store <4 x float> zeroinitializer, <4 x float>* [[A:%.*]], align 16
+; CHECK-NEXT:    ret void
+;
+entry:
+  %t1 = load <4 x float>, <4 x float>* %a		; <<4 x float>> [#uses=1]
+  %vecins = insertelement <4 x float> %t1, float 0.000000e+00, i32 0	; <<4 x float>> [#uses=1]
+  %vecins4 = insertelement <4 x float> %vecins, float 0.000000e+00, i32 1; <<4 x float>> [#uses=1]
+  %vecins6 = insertelement <4 x float> %vecins4, float 0.000000e+00, i32 2; <<4 x float>> [#uses=1]
+  %vecins8 = insertelement <4 x float> %vecins6, float 0.000000e+00, i32 3; <<4 x float>> [#uses=1]
+  store <4 x float> %vecins8, <4 x float>* %a
+  ret void
+}
+
+declare i32 @fgetc(i8*)
+
+define <4 x float> @dead_shuffle_elt(<4 x float> %x, <2 x float> %y) nounwind {
+; CHECK-LABEL: @dead_shuffle_elt(
+; CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <2 x float> [[Y:%.*]], <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+; CHECK-NEXT:    [[SHUFFLE9_I:%.*]] = shufflevector <4 x float> [[SHUFFLE_I]], <4 x float> [[X:%.*]], <4 x i32> <i32 0, i32 1, i32 6, i32 7>
+; CHECK-NEXT:    ret <4 x float> [[SHUFFLE9_I]]
+;
+  %shuffle.i = shufflevector <2 x float> %y, <2 x float> %y, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+  %shuffle9.i = shufflevector <4 x float> %x, <4 x float> %shuffle.i, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
+  ret <4 x float> %shuffle9.i
+}
+
+define <2 x float> @test_fptrunc(double %f) {
+; CHECK-LABEL: @test_fptrunc(
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x double> <double undef, double 0.000000e+00>, double [[F:%.*]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = fptrunc <2 x double> [[TMP1]] to <2 x float>
+; CHECK-NEXT:    ret <2 x float> [[TMP2]]
+;
+  %t9 = insertelement <4 x double> poison, double %f, i32 0
+  %t10 = insertelement <4 x double> %t9, double 0.000000e+00, i32 1
+  %t11 = insertelement <4 x double> %t10, double 0.000000e+00, i32 2
+  %t12 = insertelement <4 x double> %t11, double 0.000000e+00, i32 3
+  %t5 = fptrunc <4 x double> %t12 to <4 x float>
+  %ret = shufflevector <4 x float> %t5, <4 x float> undef, <2 x i32> <i32 0, i32 1>
+  ret <2 x float> %ret
+}
+
+define <2 x double> @test_fpext(float %f) {
+; CHECK-LABEL: @test_fpext(
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x float> <float undef, float 0.000000e+00>, float [[F:%.*]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = fpext <2 x float> [[TMP1]] to <2 x double>
+; CHECK-NEXT:    ret <2 x double> [[TMP2]]
+;
+  %t9 = insertelement <4 x float> poison, float %f, i32 0
+  %t10 = insertelement <4 x float> %t9, float 0.000000e+00, i32 1
+  %t11 = insertelement <4 x float> %t10, float 0.000000e+00, i32 2
+  %t12 = insertelement <4 x float> %t11, float 0.000000e+00, i32 3
+  %t5 = fpext <4 x float> %t12 to <4 x double>
+  %ret = shufflevector <4 x double> %t5, <4 x double> undef, <2 x i32> <i32 0, i32 1>
+  ret <2 x double> %ret
+}
+
+define <4 x double> @test_shuffle(<4 x double> %f) {
+; CHECK-LABEL: @test_shuffle(
+; CHECK-NEXT:    [[RET1:%.*]] = insertelement <4 x double> [[F:%.*]], double 1.000000e+00, i32 3
+; CHECK-NEXT:    ret <4 x double> [[RET1]]
+;
+  %ret = shufflevector <4 x double> %f, <4 x double> <double undef, double 1.0, double undef, double undef>, <4 x i32> <i32 0, i32 1, i32 2, i32 5>
+  ret <4 x double> %ret
+}
+
+define <4 x float> @test_select(float %f, float %g) {
+; CHECK-LABEL: @test_select(
+; CHECK-NEXT:    [[A3:%.*]] = insertelement <4 x float> <float undef, float undef, float undef, float 3.000000e+00>, float [[F:%.*]], i32 0
+; CHECK-NEXT:    [[RET:%.*]] = shufflevector <4 x float> [[A3]], <4 x float> <float undef, float 4.000000e+00, float 5.000000e+00, float undef>, <4 x i32> <i32 0, i32 5, i32 6, i32 3>
+; CHECK-NEXT:    ret <4 x float> [[RET]]
+;
+  %a0 = insertelement <4 x float> poison, float %f, i32 0
+  %a1 = insertelement <4 x float> %a0, float 1.000000e+00, i32 1
+  %a2 = insertelement <4 x float> %a1, float 2.000000e+00, i32 2
+  %a3 = insertelement <4 x float> %a2, float 3.000000e+00, i32 3
+  %b0 = insertelement <4 x float> poison, float %g, i32 0
+  %b1 = insertelement <4 x float> %b0, float 4.000000e+00, i32 1
+  %b2 = insertelement <4 x float> %b1, float 5.000000e+00, i32 2
+  %b3 = insertelement <4 x float> %b2, float 6.000000e+00, i32 3
+  %ret = select <4 x i1> <i1 true, i1 false, i1 false, i1 true>, <4 x float> %a3, <4 x float> %b3
+  ret <4 x float> %ret
+}
+
+; Check that instcombine doesn't wrongly fold away the select completely.
+
+define <2 x i64> @PR24922(<2 x i64> %v) {
+; CHECK-LABEL: @PR24922(
+; CHECK-NEXT:    [[RESULT1:%.*]] = insertelement <2 x i64> [[V:%.*]], i64 0, i32 0
+; CHECK-NEXT:    ret <2 x i64> [[RESULT1]]
+;
+  %result = select <2 x i1> <i1 icmp eq (i64 extractelement (<2 x i64> bitcast (<4 x i32> <i32 15, i32 15, i32 15, i32 15> to <2 x i64>), i64 0), i64 0), i1 true>, <2 x i64> %v, <2 x i64> zeroinitializer
+  ret <2 x i64> %result
+}
+
+; The shuffle only demands the 0th (undef) element of 'out123', so everything should fold away.
+
+define <4 x float> @inselt_shuf_no_demand(float %a1, float %a2, float %a3) {
+; CHECK-LABEL: @inselt_shuf_no_demand(
+; CHECK-NEXT:    ret <4 x float> undef
+;
+  %out1 = insertelement <4 x float> poison, float %a1, i32 1
+  %out12 = insertelement <4 x float> %out1, float %a2, i32 2
+  %out123 = insertelement <4 x float> %out12, float %a3, i32 3
+  %shuffle = shufflevector <4 x float> %out123, <4 x float> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
+  ret <4 x float> %shuffle
+}
+
+; The shuffle only demands the 0th (undef) element of 'out123', so everything should fold away.
+
+define <4 x float> @inselt_shuf_no_demand_commute(float %a1, float %a2, float %a3) {
+; CHECK-LABEL: @inselt_shuf_no_demand_commute(
+; CHECK-NEXT:    ret <4 x float> undef
+;
+  %out1 = insertelement <4 x float> poison, float %a1, i32 1
+  %out12 = insertelement <4 x float> %out1, float %a2, i32 2
+  %out123 = insertelement <4 x float> %out12, float %a3, i32 3
+  %shuffle = shufflevector <4 x float> undef, <4 x float> %out123, <4 x i32> <i32 4, i32 undef, i32 undef, i32 undef>
+  ret <4 x float> %shuffle
+}
+
+; The add uses 'out012' giving it multiple uses after the shuffle is transformed to also
+; use 'out012'. The analysis should be able to see past that.
+
+define <4 x i32> @inselt_shuf_no_demand_multiuse(i32 %a0, i32 %a1, <4 x i32> %b) {
+; CHECK-LABEL: @inselt_shuf_no_demand_multiuse(
+; CHECK-NEXT:    [[OUT0:%.*]] = insertelement <4 x i32> poison, i32 [[A0:%.*]], i32 0
+; CHECK-NEXT:    [[OUT01:%.*]] = insertelement <4 x i32> [[OUT0]], i32 [[A1:%.*]], i32 1
+; CHECK-NEXT:    [[FOO:%.*]] = add <4 x i32> [[OUT01]], [[B:%.*]]
+; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[FOO]], <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+; CHECK-NEXT:    ret <4 x i32> [[SHUFFLE]]
+;
+  %out0 = insertelement <4 x i32> poison, i32 %a0, i32 0
+  %out01 = insertelement <4 x i32> %out0, i32 %a1, i32 1
+  %out012 = insertelement <4 x i32> %out01, i32 %a0, i32 2
+  %foo = add <4 x i32> %out012, %b
+  %out0123 = insertelement <4 x i32> %foo, i32 %a1, i32 3
+  %shuffle = shufflevector <4 x i32> %out0123, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+  ret <4 x i32> %shuffle
+}
+
+define <4 x float> @inselt_shuf_no_demand_bogus_insert_index_in_chain(float %a1, float %a2, float %a3, i32 %variable_index) {
+; CHECK-LABEL: @inselt_shuf_no_demand_bogus_insert_index_in_chain(
+; CHECK-NEXT:    [[OUT12:%.*]] = insertelement <4 x float> poison, float [[A2:%.*]], i32 [[VARIABLE_INDEX:%.*]]
+; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x float> [[OUT12]], <4 x float> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    ret <4 x float> [[SHUFFLE]]
+;
+  %out1 = insertelement <4 x float> poison, float %a1, i32 1
+  %out12 = insertelement <4 x float> %out1, float %a2, i32 %variable_index ; something unexpected
+  %out123 = insertelement <4 x float> %out12, float %a3, i32 3
+  %shuffle = shufflevector <4 x float> %out123, <4 x float> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
+  ret <4 x float> %shuffle
+}
+
+; Test undef replacement in constant vector elements with binops.
+
+define <3 x i8> @shuf_add(<3 x i8> %x) {
+; CHECK-LABEL: @shuf_add(
+; CHECK-NEXT:    [[BO:%.*]] = add <3 x i8> [[X:%.*]], <i8 undef, i8 2, i8 3>
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <3 x i8> [[BO]], <3 x i8> undef, <3 x i32> <i32 1, i32 undef, i32 2>
+; CHECK-NEXT:    ret <3 x i8> [[R]]
+;
+  %bo = add nsw <3 x i8> %x, <i8 1, i8 2, i8 3>
+  %r = shufflevector <3 x i8> %bo, <3 x i8> undef, <3 x i32> <i32 1, i32 undef, i32 2>
+  ret <3 x i8> %r
+}
+
+define <3 x i8> @shuf_sub(<3 x i8> %x) {
+; CHECK-LABEL: @shuf_sub(
+; CHECK-NEXT:    [[BO:%.*]] = sub <3 x i8> <i8 1, i8 undef, i8 3>, [[X:%.*]]
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <3 x i8> [[BO]], <3 x i8> undef, <3 x i32> <i32 0, i32 undef, i32 2>
+; CHECK-NEXT:    ret <3 x i8> [[R]]
+;
+  %bo = sub nuw <3 x i8> <i8 1, i8 2, i8 3>, %x
+  %r = shufflevector <3 x i8> %bo, <3 x i8> undef, <3 x i32> <i32 0, i32 undef, i32 2>
+  ret <3 x i8> %r
+}
+
+define <3 x i8> @shuf_mul(<3 x i8> %x) {
+; CHECK-LABEL: @shuf_mul(
+; CHECK-NEXT:    [[BO:%.*]] = mul <3 x i8> [[X:%.*]], <i8 1, i8 undef, i8 3>
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <3 x i8> [[BO]], <3 x i8> undef, <3 x i32> <i32 0, i32 2, i32 0>
+; CHECK-NEXT:    ret <3 x i8> [[R]]
+;
+  %bo = mul nsw <3 x i8> %x, <i8 1, i8 2, i8 3>
+  %r = shufflevector <3 x i8> %bo, <3 x i8> undef, <3 x i32> <i32 0, i32 2, i32 0>
+  ret <3 x i8> %r
+}
+
+define <3 x i8> @shuf_and(<3 x i8> %x) {
+; CHECK-LABEL: @shuf_and(
+; CHECK-NEXT:    [[BO:%.*]] = and <3 x i8> [[X:%.*]], <i8 1, i8 2, i8 undef>
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <3 x i8> [[BO]], <3 x i8> undef, <3 x i32> <i32 1, i32 1, i32 0>
+; CHECK-NEXT:    ret <3 x i8> [[R]]
+;
+  %bo = and <3 x i8> %x, <i8 1, i8 2, i8 3>
+  %r = shufflevector <3 x i8> %bo, <3 x i8> undef, <3 x i32> <i32 1, i32 1, i32 0>
+  ret <3 x i8> %r
+}
+
+define <3 x i8> @shuf_or(<3 x i8> %x) {
+; CHECK-LABEL: @shuf_or(
+; CHECK-NEXT:    [[BO:%.*]] = or <3 x i8> [[X:%.*]], <i8 1, i8 2, i8 undef>
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <3 x i8> [[BO]], <3 x i8> undef, <3 x i32> <i32 1, i32 undef, i32 0>
+; CHECK-NEXT:    ret <3 x i8> [[R]]
+;
+  %bo = or <3 x i8> %x, <i8 1, i8 2, i8 3>
+  %r = shufflevector <3 x i8> %bo, <3 x i8> undef, <3 x i32> <i32 1, i32 undef, i32 0>
+  ret <3 x i8> %r
+}
+
+define <3 x i8> @shuf_xor(<3 x i8> %x) {
+; CHECK-LABEL: @shuf_xor(
+; CHECK-NEXT:    [[BO:%.*]] = xor <3 x i8> [[X:%.*]], <i8 1, i8 undef, i8 3>
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <3 x i8> [[BO]], <3 x i8> undef, <3 x i32> <i32 2, i32 undef, i32 0>
+; CHECK-NEXT:    ret <3 x i8> [[R]]
+;
+  %bo = xor <3 x i8> %x, <i8 1, i8 2, i8 3>
+  %r = shufflevector <3 x i8> %bo, <3 x i8> undef, <3 x i32> <i32 2, i32 undef, i32 0>
+  ret <3 x i8> %r
+}
+
+define <3 x i8> @shuf_lshr_const_op0(<3 x i8> %x) {
+; CHECK-LABEL: @shuf_lshr_const_op0(
+; CHECK-NEXT:    [[BO:%.*]] = lshr <3 x i8> <i8 1, i8 2, i8 3>, [[X:%.*]]
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <3 x i8> [[BO]], <3 x i8> undef, <3 x i32> <i32 2, i32 1, i32 undef>
+; CHECK-NEXT:    ret <3 x i8> [[R]]
+;
+  %bo = lshr <3 x i8> <i8 1, i8 2, i8 3>, %x
+  %r = shufflevector <3 x i8> %bo, <3 x i8> undef, <3 x i32> <i32 2, i32 1, i32 undef>
+  ret <3 x i8> %r
+}
+
+define <3 x i8> @shuf_lshr_const_op1(<3 x i8> %x) {
+; CHECK-LABEL: @shuf_lshr_const_op1(
+; CHECK-NEXT:    [[BO:%.*]] = lshr exact <3 x i8> [[X:%.*]], <i8 1, i8 2, i8 3>
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <3 x i8> [[BO]], <3 x i8> undef, <3 x i32> <i32 2, i32 1, i32 undef>
+; CHECK-NEXT:    ret <3 x i8> [[R]]
+;
+  %bo = lshr exact <3 x i8> %x, <i8 1, i8 2, i8 3>
+  %r = shufflevector <3 x i8> %bo, <3 x i8> undef, <3 x i32> <i32 2, i32 1, i32 undef>
+  ret <3 x i8> %r
+}
+
+define <3 x i8> @shuf_ashr_const_op0(<3 x i8> %x) {
+; CHECK-LABEL: @shuf_ashr_const_op0(
+; CHECK-NEXT:    [[BO:%.*]] = lshr <3 x i8> <i8 1, i8 2, i8 3>, [[X:%.*]]
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <3 x i8> [[BO]], <3 x i8> undef, <3 x i32> <i32 0, i32 undef, i32 1>
+; CHECK-NEXT:    ret <3 x i8> [[R]]
+;
+  %bo = ashr <3 x i8> <i8 1, i8 2, i8 3>, %x
+  %r = shufflevector <3 x i8> %bo, <3 x i8> undef, <3 x i32> <i32 0, i32 undef, i32 1>
+  ret <3 x i8> %r
+}
+
+define <3 x i8> @shuf_ashr_const_op1(<3 x i8> %x) {
+; CHECK-LABEL: @shuf_ashr_const_op1(
+; CHECK-NEXT:    [[BO:%.*]] = ashr exact <3 x i8> [[X:%.*]], <i8 1, i8 2, i8 3>
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <3 x i8> [[BO]], <3 x i8> undef, <3 x i32> <i32 0, i32 undef, i32 1>
+; CHECK-NEXT:    ret <3 x i8> [[R]]
+;
+  %bo = ashr exact <3 x i8> %x, <i8 1, i8 2, i8 3>
+  %r = shufflevector <3 x i8> %bo, <3 x i8> undef, <3 x i32> <i32 0, i32 undef, i32 1>
+  ret <3 x i8> %r
+}
+
+define <3 x i8> @shuf_shl_const_op0(<3 x i8> %x) {
+; CHECK-LABEL: @shuf_shl_const_op0(
+; CHECK-NEXT:    [[BO:%.*]] = shl nsw <3 x i8> <i8 1, i8 2, i8 3>, [[X:%.*]]
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <3 x i8> [[BO]], <3 x i8> undef, <3 x i32> <i32 2, i32 undef, i32 0>
+; CHECK-NEXT:    ret <3 x i8> [[R]]
+;
+  %bo = shl nsw <3 x i8> <i8 1, i8 2, i8 3>, %x
+  %r = shufflevector <3 x i8> %bo, <3 x i8> undef, <3 x i32> <i32 2, i32 undef, i32 0>
+  ret <3 x i8> %r
+}
+
+define <3 x i8> @shuf_shl_const_op1(<3 x i8> %x) {
+; CHECK-LABEL: @shuf_shl_const_op1(
+; CHECK-NEXT:    [[BO:%.*]] = shl nuw <3 x i8> [[X:%.*]], <i8 1, i8 2, i8 3>
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <3 x i8> [[BO]], <3 x i8> undef, <3 x i32> <i32 2, i32 undef, i32 0>
+; CHECK-NEXT:    ret <3 x i8> [[R]]
+;
+  %bo = shl nuw <3 x i8> %x, <i8 1, i8 2, i8 3>
+  %r = shufflevector <3 x i8> %bo, <3 x i8> undef, <3 x i32> <i32 2, i32 undef, i32 0>
+  ret <3 x i8> %r
+}
+
+define <3 x i8> @shuf_sdiv_const_op0(<3 x i8> %x) {
+; CHECK-LABEL: @shuf_sdiv_const_op0(
+; CHECK-NEXT:    [[BO:%.*]] = sdiv exact <3 x i8> <i8 1, i8 2, i8 3>, [[X:%.*]]
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <3 x i8> [[BO]], <3 x i8> undef, <3 x i32> <i32 0, i32 undef, i32 1>
+; CHECK-NEXT:    ret <3 x i8> [[R]]
+;
+  %bo = sdiv exact <3 x i8> <i8 1, i8 2, i8 3>, %x
+  %r = shufflevector <3 x i8> %bo, <3 x i8> undef, <3 x i32> <i32 0, i32 undef, i32 1>
+  ret <3 x i8> %r
+}
+
+define <3 x i8> @shuf_sdiv_const_op1(<3 x i8> %x) {
+; CHECK-LABEL: @shuf_sdiv_const_op1(
+; CHECK-NEXT:    [[BO:%.*]] = sdiv <3 x i8> [[X:%.*]], <i8 1, i8 2, i8 3>
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <3 x i8> [[BO]], <3 x i8> undef, <3 x i32> <i32 1, i32 undef, i32 0>
+; CHECK-NEXT:    ret <3 x i8> [[R]]
+;
+  %bo = sdiv <3 x i8> %x, <i8 1, i8 2, i8 3>
+  %r = shufflevector <3 x i8> %bo, <3 x i8> undef, <3 x i32> <i32 1, i32 undef, i32 0>
+  ret <3 x i8> %r
+}
+
+define <3 x i8> @shuf_srem_const_op0(<3 x i8> %x) {
+; CHECK-LABEL: @shuf_srem_const_op0(
+; CHECK-NEXT:    [[BO:%.*]] = srem <3 x i8> <i8 1, i8 2, i8 3>, [[X:%.*]]
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <3 x i8> [[BO]], <3 x i8> undef, <3 x i32> <i32 1, i32 undef, i32 2>
+; CHECK-NEXT:    ret <3 x i8> [[R]]
+;
+  %bo = srem <3 x i8> <i8 1, i8 2, i8 3>, %x
+  %r = shufflevector <3 x i8> %bo, <3 x i8> undef, <3 x i32> <i32 1, i32 undef, i32 2>
+  ret <3 x i8> %r
+}
+
+define <3 x i8> @shuf_srem_const_op1(<3 x i8> %x) {
+; CHECK-LABEL: @shuf_srem_const_op1(
+; CHECK-NEXT:    [[BO:%.*]] = srem <3 x i8> [[X:%.*]], <i8 1, i8 2, i8 3>
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <3 x i8> [[BO]], <3 x i8> undef, <3 x i32> <i32 2, i32 undef, i32 1>
+; CHECK-NEXT:    ret <3 x i8> [[R]]
+;
+  %bo = srem <3 x i8> %x, <i8 1, i8 2, i8 3>
+  %r = shufflevector <3 x i8> %bo, <3 x i8> undef, <3 x i32> <i32 2, i32 undef, i32 1>
+  ret <3 x i8> %r
+}
+
+define <3 x i8> @shuf_udiv_const_op0(<3 x i8> %x) {
+; CHECK-LABEL: @shuf_udiv_const_op0(
+; CHECK-NEXT:    [[BO:%.*]] = udiv exact <3 x i8> <i8 1, i8 2, i8 3>, [[X:%.*]]
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <3 x i8> [[BO]], <3 x i8> undef, <3 x i32> <i32 2, i32 undef, i32 0>
+; CHECK-NEXT:    ret <3 x i8> [[R]]
+;
+  %bo = udiv exact <3 x i8> <i8 1, i8 2, i8 3>, %x
+  %r = shufflevector <3 x i8> %bo, <3 x i8> undef, <3 x i32> <i32 2, i32 undef, i32 0>
+  ret <3 x i8> %r
+}
+
+define <3 x i8> @shuf_udiv_const_op1(<3 x i8> %x) {
+; CHECK-LABEL: @shuf_udiv_const_op1(
+; CHECK-NEXT:    [[BO:%.*]] = udiv <3 x i8> [[X:%.*]], <i8 1, i8 2, i8 3>
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <3 x i8> [[BO]], <3 x i8> undef, <3 x i32> <i32 2, i32 undef, i32 0>
+; CHECK-NEXT:    ret <3 x i8> [[R]]
+;
+  %bo = udiv <3 x i8> %x, <i8 1, i8 2, i8 3>
+  %r = shufflevector <3 x i8> %bo, <3 x i8> undef, <3 x i32> <i32 2, i32 undef, i32 0>
+  ret <3 x i8> %r
+}
+
+define <3 x i8> @shuf_urem_const_op0(<3 x i8> %x) {
+; CHECK-LABEL: @shuf_urem_const_op0(
+; CHECK-NEXT:    [[BO:%.*]] = urem <3 x i8> <i8 1, i8 2, i8 3>, [[X:%.*]]
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <3 x i8> [[BO]], <3 x i8> undef, <3 x i32> <i32 2, i32 1, i32 undef>
+; CHECK-NEXT:    ret <3 x i8> [[R]]
+;
+  %bo = urem <3 x i8> <i8 1, i8 2, i8 3>, %x
+  %r = shufflevector <3 x i8> %bo, <3 x i8> undef, <3 x i32> <i32 2, i32 1, i32 undef>
+  ret <3 x i8> %r
+}
+
+define <3 x i8> @shuf_urem_const_op1(<3 x i8> %x) {
+; CHECK-LABEL: @shuf_urem_const_op1(
+; CHECK-NEXT:    [[BO:%.*]] = urem <3 x i8> [[X:%.*]], <i8 1, i8 2, i8 3>
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <3 x i8> [[BO]], <3 x i8> undef, <3 x i32> <i32 undef, i32 1, i32 0>
+; CHECK-NEXT:    ret <3 x i8> [[R]]
+;
+  %bo = urem <3 x i8> %x, <i8 1, i8 2, i8 3>
+  %r = shufflevector <3 x i8> %bo, <3 x i8> undef, <3 x i32> <i32 undef, i32 1, i32 0>
+  ret <3 x i8> %r
+}
+
+define <3 x float> @shuf_fadd(<3 x float> %x) {
+; CHECK-LABEL: @shuf_fadd(
+; CHECK-NEXT:    [[BO:%.*]] = fadd <3 x float> [[X:%.*]], <float 1.000000e+00, float 2.000000e+00, float undef>
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <3 x float> [[BO]], <3 x float> undef, <3 x i32> <i32 undef, i32 1, i32 0>
+; CHECK-NEXT:    ret <3 x float> [[R]]
+;
+  %bo = fadd <3 x float> %x, <float 1.0, float 2.0, float 3.0>
+  %r = shufflevector <3 x float> %bo, <3 x float> undef, <3 x i32> <i32 undef, i32 1, i32 0>
+  ret <3 x float> %r
+}
+
+define <3 x float> @shuf_fsub(<3 x float> %x) {
+; CHECK-LABEL: @shuf_fsub(
+; CHECK-NEXT:    [[BO:%.*]] = fsub fast <3 x float> <float 1.000000e+00, float undef, float 3.000000e+00>, [[X:%.*]]
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <3 x float> [[BO]], <3 x float> undef, <3 x i32> <i32 undef, i32 0, i32 2>
+; CHECK-NEXT:    ret <3 x float> [[R]]
+;
+  %bo = fsub fast <3 x float> <float 1.0, float 2.0, float 3.0>, %x
+  %r = shufflevector <3 x float> %bo, <3 x float> undef, <3 x i32> <i32 undef, i32 0, i32 2>
+  ret <3 x float> %r
+}
+
+define <3 x float> @shuf_fmul(<3 x float> %x) {
+; CHECK-LABEL: @shuf_fmul(
+; CHECK-NEXT:    [[BO:%.*]] = fmul reassoc <3 x float> [[X:%.*]], <float 1.000000e+00, float 2.000000e+00, float undef>
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <3 x float> [[BO]], <3 x float> undef, <3 x i32> <i32 undef, i32 1, i32 0>
+; CHECK-NEXT:    ret <3 x float> [[R]]
+;
+  %bo = fmul reassoc <3 x float> %x, <float 1.0, float 2.0, float 3.0>
+  %r = shufflevector <3 x float> %bo, <3 x float> undef, <3 x i32> <i32 undef, i32 1, i32 0>
+  ret <3 x float> %r
+}
+
+define <3 x float> @shuf_fdiv_const_op0(<3 x float> %x) {
+; CHECK-LABEL: @shuf_fdiv_const_op0(
+; CHECK-NEXT:    [[BO:%.*]] = fdiv reassoc ninf <3 x float> <float 1.000000e+00, float undef, float 3.000000e+00>, [[X:%.*]]
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <3 x float> [[BO]], <3 x float> undef, <3 x i32> <i32 undef, i32 0, i32 2>
+; CHECK-NEXT:    ret <3 x float> [[R]]
+;
+  %bo = fdiv ninf reassoc <3 x float> <float 1.0, float 2.0, float 3.0>, %x
+  %r = shufflevector <3 x float> %bo, <3 x float> undef, <3 x i32> <i32 undef, i32 0, i32 2>
+  ret <3 x float> %r
+}
+
+define <3 x float> @shuf_fdiv_const_op1(<3 x float> %x) {
+; CHECK-LABEL: @shuf_fdiv_const_op1(
+; CHECK-NEXT:    [[BO:%.*]] = fdiv nnan ninf <3 x float> [[X:%.*]], <float 1.000000e+00, float 2.000000e+00, float undef>
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <3 x float> [[BO]], <3 x float> undef, <3 x i32> <i32 undef, i32 1, i32 0>
+; CHECK-NEXT:    ret <3 x float> [[R]]
+;
+  %bo = fdiv ninf nnan <3 x float> %x, <float 1.0, float 2.0, float 3.0>
+  %r = shufflevector <3 x float> %bo, <3 x float> undef, <3 x i32> <i32 undef, i32 1, i32 0>
+  ret <3 x float> %r
+}
+
+define <3 x float> @shuf_frem_const_op0(<3 x float> %x) {
+; CHECK-LABEL: @shuf_frem_const_op0(
+; CHECK-NEXT:    [[BO:%.*]] = frem nnan <3 x float> <float 1.000000e+00, float undef, float 3.000000e+00>, [[X:%.*]]
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <3 x float> [[BO]], <3 x float> undef, <3 x i32> <i32 undef, i32 2, i32 0>
+; CHECK-NEXT:    ret <3 x float> [[R]]
+;
+  %bo = frem nnan <3 x float> <float 1.0, float 2.0, float 3.0>, %x
+  %r = shufflevector <3 x float> %bo, <3 x float> undef, <3 x i32> <i32 undef, i32 2, i32 0>
+  ret <3 x float> %r
+}
+
+define <3 x float> @shuf_frem_const_op1(<3 x float> %x) {
+; CHECK-LABEL: @shuf_frem_const_op1(
+; CHECK-NEXT:    [[BO:%.*]] = frem reassoc ninf <3 x float> [[X:%.*]], <float undef, float 2.000000e+00, float 3.000000e+00>
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <3 x float> [[BO]], <3 x float> undef, <3 x i32> <i32 1, i32 undef, i32 2>
+; CHECK-NEXT:    ret <3 x float> [[R]]
+;
+  %bo = frem ninf reassoc <3 x float> %x, <float 1.0, float 2.0, float 3.0>
+  %r = shufflevector <3 x float> %bo, <3 x float> undef, <3 x i32> <i32 1, i32 undef, i32 2>
+  ret <3 x float> %r
+}
+
+;; TODO: getelementptr tests below show missing simplifications for
+;; vector demanded elements on vector geps.
+
+define i32* @gep_vbase_w_s_idx(<2 x i32*> %base) {
+; CHECK-LABEL: @gep_vbase_w_s_idx(
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr i32, <2 x i32*> [[BASE:%.*]], i64 1
+; CHECK-NEXT:    [[EE:%.*]] = extractelement <2 x i32*> [[GEP]], i32 1
+; CHECK-NEXT:    ret i32* [[EE]]
+;
+  %gep = getelementptr i32, <2 x i32*> %base, i64 1
+  %ee = extractelement <2 x i32*> %gep, i32 1
+  ret i32* %ee
+}
+
+define i32* @gep_splat_base_w_s_idx(i32* %base) {
+; CHECK-LABEL: @gep_splat_base_w_s_idx(
+; CHECK-NEXT:    [[BASEVEC2:%.*]] = insertelement <2 x i32*> undef, i32* [[BASE:%.*]], i32 1
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr i32, <2 x i32*> [[BASEVEC2]], i64 1
+; CHECK-NEXT:    [[EE:%.*]] = extractelement <2 x i32*> [[GEP]], i32 1
+; CHECK-NEXT:    ret i32* [[EE]]
+;
+  %basevec1 = insertelement <2 x i32*> poison, i32* %base, i32 0
+  %basevec2 = shufflevector <2 x i32*> %basevec1, <2 x i32*> undef, <2 x i32> zeroinitializer
+  %gep = getelementptr i32, <2 x i32*> %basevec2, i64 1
+  %ee = extractelement <2 x i32*> %gep, i32 1
+  ret i32* %ee
+}
+
+
+define i32* @gep_splat_base_w_cv_idx(i32* %base) {
+; CHECK-LABEL: @gep_splat_base_w_cv_idx(
+; CHECK-NEXT:    [[BASEVEC2:%.*]] = insertelement <2 x i32*> undef, i32* [[BASE:%.*]], i32 1
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr i32, <2 x i32*> [[BASEVEC2]], <2 x i64> <i64 undef, i64 1>
+; CHECK-NEXT:    [[EE:%.*]] = extractelement <2 x i32*> [[GEP]], i32 1
+; CHECK-NEXT:    ret i32* [[EE]]
+;
+  %basevec1 = insertelement <2 x i32*> poison, i32* %base, i32 0
+  %basevec2 = shufflevector <2 x i32*> %basevec1, <2 x i32*> undef, <2 x i32> zeroinitializer
+  %gep = getelementptr i32, <2 x i32*> %basevec2, <2 x i64> <i64 0, i64 1>
+  %ee = extractelement <2 x i32*> %gep, i32 1
+  ret i32* %ee
+}
+
+define i32* @gep_splat_base_w_vidx(i32* %base, <2 x i64> %idxvec) {
+; CHECK-LABEL: @gep_splat_base_w_vidx(
+; CHECK-NEXT:    [[BASEVEC2:%.*]] = insertelement <2 x i32*> undef, i32* [[BASE:%.*]], i32 1
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr i32, <2 x i32*> [[BASEVEC2]], <2 x i64> [[IDXVEC:%.*]]
+; CHECK-NEXT:    [[EE:%.*]] = extractelement <2 x i32*> [[GEP]], i32 1
+; CHECK-NEXT:    ret i32* [[EE]]
+;
+  %basevec1 = insertelement <2 x i32*> poison, i32* %base, i32 0
+  %basevec2 = shufflevector <2 x i32*> %basevec1, <2 x i32*> undef, <2 x i32> zeroinitializer
+  %gep = getelementptr i32, <2 x i32*> %basevec2, <2 x i64> %idxvec
+  %ee = extractelement <2 x i32*> %gep, i32 1
+  ret i32* %ee
+}
+
+
+@GLOBAL = internal global i32 zeroinitializer
+
+define i32* @gep_cvbase_w_s_idx(<2 x i32*> %base, i64 %raw_addr) {
+; CHECK-LABEL: @gep_cvbase_w_s_idx(
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr i32, <2 x i32*> <i32* undef, i32* @GLOBAL>, i64 [[RAW_ADDR:%.*]]
+; CHECK-NEXT:    [[EE:%.*]] = extractelement <2 x i32*> [[GEP]], i32 1
+; CHECK-NEXT:    ret i32* [[EE]]
+;
+  %gep = getelementptr i32, <2 x i32*> <i32* @GLOBAL, i32* @GLOBAL>, i64 %raw_addr
+  %ee = extractelement <2 x i32*> %gep, i32 1
+  ret i32* %ee
+}
+
+define i32* @gep_cvbase_w_cv_idx(<2 x i32*> %base, i64 %raw_addr) {
+; CHECK-LABEL: @gep_cvbase_w_cv_idx(
+; CHECK-NEXT:    ret i32* getelementptr inbounds (i32, i32* @GLOBAL, i64 1)
+;
+  %gep = getelementptr i32, <2 x i32*> <i32* @GLOBAL, i32* @GLOBAL>, <2 x i64> <i64 0, i64 1>
+  %ee = extractelement <2 x i32*> %gep, i32 1
+  ret i32* %ee
+}
+
+
+define i32* @gep_sbase_w_cv_idx(i32* %base) {
+; CHECK-LABEL: @gep_sbase_w_cv_idx(
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr i32, i32* [[BASE:%.*]], <2 x i64> <i64 undef, i64 1>
+; CHECK-NEXT:    [[EE:%.*]] = extractelement <2 x i32*> [[GEP]], i32 1
+; CHECK-NEXT:    ret i32* [[EE]]
+;
+  %gep = getelementptr i32, i32* %base, <2 x i64> <i64 0, i64 1>
+  %ee = extractelement <2 x i32*> %gep, i32 1
+  ret i32* %ee
+}
+
+define i32* @gep_sbase_w_splat_idx(i32* %base, i64 %idx) {
+; CHECK-LABEL: @gep_sbase_w_splat_idx(
+; CHECK-NEXT:    [[IDXVEC2:%.*]] = insertelement <2 x i64> undef, i64 [[IDX:%.*]], i32 1
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr i32, i32* [[BASE:%.*]], <2 x i64> [[IDXVEC2]]
+; CHECK-NEXT:    [[EE:%.*]] = extractelement <2 x i32*> [[GEP]], i32 1
+; CHECK-NEXT:    ret i32* [[EE]]
+;
+  %idxvec1 = insertelement <2 x i64> poison, i64 %idx, i32 0
+  %idxvec2 = shufflevector <2 x i64> %idxvec1, <2 x i64> undef, <2 x i32> zeroinitializer
+  %gep = getelementptr i32, i32* %base, <2 x i64> %idxvec2
+  %ee = extractelement <2 x i32*> %gep, i32 1
+  ret i32* %ee
+}
+define i32* @gep_splat_both(i32* %base, i64 %idx) {
+; CHECK-LABEL: @gep_splat_both(
+; CHECK-NEXT:    [[BASEVEC2:%.*]] = insertelement <2 x i32*> undef, i32* [[BASE:%.*]], i32 1
+; CHECK-NEXT:    [[IDXVEC2:%.*]] = insertelement <2 x i64> undef, i64 [[IDX:%.*]], i32 1
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr i32, <2 x i32*> [[BASEVEC2]], <2 x i64> [[IDXVEC2]]
+; CHECK-NEXT:    [[EE:%.*]] = extractelement <2 x i32*> [[GEP]], i32 1
+; CHECK-NEXT:    ret i32* [[EE]]
+;
+  %basevec1 = insertelement <2 x i32*> poison, i32* %base, i32 0
+  %basevec2 = shufflevector <2 x i32*> %basevec1, <2 x i32*> undef, <2 x i32> zeroinitializer
+  %idxvec1 = insertelement <2 x i64> poison, i64 %idx, i32 0
+  %idxvec2 = shufflevector <2 x i64> %idxvec1, <2 x i64> undef, <2 x i32> zeroinitializer
+  %gep = getelementptr i32, <2 x i32*> %basevec2, <2 x i64> %idxvec2
+  %ee = extractelement <2 x i32*> %gep, i32 1
+  ret i32* %ee
+}
+
+define <2 x i32*> @gep_all_lanes_undef(i32* %base, i64 %idx) {;
+; CHECK-LABEL: @gep_all_lanes_undef(
+; CHECK-NEXT:    ret <2 x i32*> undef
+;
+  %basevec = insertelement <2 x i32*> poison, i32* %base, i32 0
+  %idxvec = insertelement <2 x i64> poison, i64 %idx, i32 1
+  %gep = getelementptr i32, <2 x i32*> %basevec, <2 x i64> %idxvec
+  ret <2 x i32*> %gep
+}
+
+define i32* @gep_demanded_lane_undef(i32* %base, i64 %idx) {
+; CHECK-LABEL: @gep_demanded_lane_undef(
+; CHECK-NEXT:    ret i32* undef
+;
+  %basevec = insertelement <2 x i32*> poison, i32* %base, i32 0
+  %idxvec = insertelement <2 x i64> poison, i64 %idx, i32 1
+  %gep = getelementptr i32, <2 x i32*> %basevec, <2 x i64> %idxvec
+  %ee = extractelement <2 x i32*> %gep, i32 1
+  ret i32* %ee
+}
+
+
+;; LangRef has an odd quirk around FCAs which make it illegal to use undef
+;; indices.
+define i32* @PR41624(<2 x { i32, i32 }*> %a) {
+; CHECK-LABEL: @PR41624(
+; CHECK-NEXT:    [[W:%.*]] = getelementptr { i32, i32 }, <2 x { i32, i32 }*> [[A:%.*]], <2 x i64> <i64 5, i64 5>, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[R:%.*]] = extractelement <2 x i32*> [[W]], i32 0
+; CHECK-NEXT:    ret i32* [[R]]
+;
+  %w = getelementptr { i32, i32 }, <2 x { i32, i32 }*> %a, <2 x i64> <i64 5, i64 5>, <2 x i32> zeroinitializer
+  %r = extractelement <2 x i32*> %w, i32 0
+  ret i32* %r
+}
+
+@global = external global [0 x i32], align 4
+
+; Make sure we don't get stuck in a loop turning the zeroinitializer into
+; <0, undef, undef, undef> and then changing it back.
+define i32* @zero_sized_type_extract(<4 x i64> %arg, i64 %arg1) {
+; CHECK-LABEL: @zero_sized_type_extract(
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    [[T:%.*]] = getelementptr inbounds [0 x i32], <4 x [0 x i32]*> <[0 x i32]* @global, [0 x i32]* undef, [0 x i32]* undef, [0 x i32]* undef>, <4 x i64> <i64 0, i64 undef, i64 undef, i64 undef>, <4 x i64> [[ARG:%.*]]
+; CHECK-NEXT:    [[T2:%.*]] = extractelement <4 x i32*> [[T]], i64 0
+; CHECK-NEXT:    ret i32* [[T2]]
+;
+bb:
+  %t = getelementptr inbounds [0 x i32], <4 x [0 x i32]*> <[0 x i32]* @global, [0 x i32]* @global, [0 x i32]* @global, [0 x i32]* @global>, <4 x i64> zeroinitializer, <4 x i64> %arg
+  %t2 = extractelement <4 x i32*> %t, i64 0
+  ret i32* %t2
+}
+
+; The non-zero elements of the result are always 'y', so the splat is unnecessary.
+
+define <4 x i8> @select_cond_with_eq_true_false_elts(<4 x i8> %x, <4 x i8> %y, <4 x i1> %cmp) {
+; CHECK-LABEL: @select_cond_with_eq_true_false_elts(
+; CHECK-NEXT:    [[SEL:%.*]] = select <4 x i1> [[CMP:%.*]], <4 x i8> [[X:%.*]], <4 x i8> [[Y:%.*]]
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x i8> [[SEL]], <4 x i8> [[Y]], <4 x i32> <i32 0, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    ret <4 x i8> [[R]]
+;
+  %tval = shufflevector <4 x i8> %x, <4 x i8> %y, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
+  %splat = shufflevector <4 x i1> %cmp, <4 x i1> undef, <4 x i32> zeroinitializer
+  %r = select <4 x i1> %splat, <4 x i8> %tval, <4 x i8> %y
+  ret <4 x i8> %r
+}
+
+; First element of the result is always x[0], so first element of select condition is unnecessary.
+
+define <4 x i8> @select_cond_with_eq_true_false_elts2(<4 x i8> %x, <4 x i8> %y, <4 x i1> %cmp) {
+; CHECK-LABEL: @select_cond_with_eq_true_false_elts2(
+; CHECK-NEXT:    [[COND:%.*]] = shufflevector <4 x i1> [[CMP:%.*]], <4 x i1> undef, <4 x i32> <i32 undef, i32 1, i32 0, i32 1>
+; CHECK-NEXT:    [[SEL:%.*]] = select <4 x i1> [[COND]], <4 x i8> [[Y:%.*]], <4 x i8> [[X:%.*]]
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x i8> [[X]], <4 x i8> [[SEL]], <4 x i32> <i32 0, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    ret <4 x i8> [[R]]
+;
+  %tval = shufflevector <4 x i8> %x, <4 x i8> %y, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
+  %cond = shufflevector <4 x i1> %cmp, <4 x i1> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+  %r = select <4 x i1> %cond, <4 x i8> %tval, <4 x i8> %x
+  ret <4 x i8> %r
+}
+
+; Second element of the result is always x[3], so second element of select condition is unnecessary.
+; Fourth element of the result is always undef, so fourth element of select condition is unnecessary.
+
+define <4 x float> @select_cond_with_eq_true_false_elts3(<4 x float> %x, <4 x float> %y, <4 x i1> %cmp) {
+; CHECK-LABEL: @select_cond_with_eq_true_false_elts3(
+; CHECK-NEXT:    [[TVAL:%.*]] = shufflevector <4 x float> [[X:%.*]], <4 x float> [[Y:%.*]], <4 x i32> <i32 1, i32 3, i32 5, i32 undef>
+; CHECK-NEXT:    [[FVAL:%.*]] = shufflevector <4 x float> [[Y]], <4 x float> [[X]], <4 x i32> <i32 0, i32 7, i32 6, i32 undef>
+; CHECK-NEXT:    [[COND:%.*]] = shufflevector <4 x i1> [[CMP:%.*]], <4 x i1> undef, <4 x i32> <i32 undef, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[R:%.*]] = select <4 x i1> [[COND]], <4 x float> [[TVAL]], <4 x float> [[FVAL]]
+; CHECK-NEXT:    ret <4 x float> [[R]]
+;
+  %tval = shufflevector <4 x float> %x, <4 x float> %y, <4 x i32> <i32 1, i32 3, i32 5, i32 undef>
+  %fval = shufflevector <4 x float> %y, <4 x float> %x, <4 x i32> <i32 0, i32 7, i32 6, i32 undef>
+  %cond = shufflevector <4 x i1> %cmp, <4 x i1> undef, <4 x i32> <i32 undef, i32 1, i32 2, i32 3>
+  %r = select <4 x i1> %cond, <4 x float> %tval, <4 x float> %fval
+  ret <4 x float> %r
+}
+
+define <4 x i8> @select_cond_with_undef_true_false_elts(<4 x i8> %x, <4 x i8> %y, <4 x i1> %cmp) {
+; CHECK-LABEL: @select_cond_with_undef_true_false_elts(
+; CHECK-NEXT:    [[TVAL:%.*]] = shufflevector <4 x i8> [[Y:%.*]], <4 x i8> undef, <4 x i32> <i32 undef, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[COND:%.*]] = shufflevector <4 x i1> [[CMP:%.*]], <4 x i1> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+; CHECK-NEXT:    [[R:%.*]] = select <4 x i1> [[COND]], <4 x i8> [[TVAL]], <4 x i8> [[X:%.*]]
+; CHECK-NEXT:    ret <4 x i8> [[R]]
+;
+  %tval = shufflevector <4 x i8> %x, <4 x i8> %y, <4 x i32> <i32 undef, i32 5, i32 6, i32 7>
+  %cond = shufflevector <4 x i1> %cmp, <4 x i1> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+  %r = select <4 x i1> %cond, <4 x i8> %tval, <4 x i8> %x
+  ret <4 x i8> %r
+}
+
+; The insert can be safely eliminated because the shuffle blocks poison from cmp[0].
+
+define <4 x i8> @select_cond_(<4 x i8> %x, <4 x i8> %min, <4 x i1> %cmp, i1 %poison_blocker) {
+; CHECK-LABEL: @select_cond_(
+; CHECK-NEXT:    [[SEL:%.*]] = select <4 x i1> [[CMP:%.*]], <4 x i8> [[MIN:%.*]], <4 x i8> [[X:%.*]]
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x i8> [[X]], <4 x i8> [[SEL]], <4 x i32> <i32 0, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    ret <4 x i8> [[R]]
+;
+  %ins = insertelement <4 x i1> %cmp, i1 %poison_blocker, i32 0
+  %vecins = shufflevector <4 x i8> %x, <4 x i8> %min, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
+  %r = select <4 x i1> %ins, <4 x i8> %vecins, <4 x i8> %x
+  ret <4 x i8> %r
+}
+
+define <4 x float> @ins_of_ext(<4 x float> %x, float %y) {
+; CHECK-LABEL: @ins_of_ext(
+; CHECK-NEXT:    [[I1:%.*]] = insertelement <4 x float> [[X:%.*]], float [[Y:%.*]], i32 1
+; CHECK-NEXT:    [[I2:%.*]] = insertelement <4 x float> [[I1]], float [[Y]], i32 2
+; CHECK-NEXT:    [[I3:%.*]] = insertelement <4 x float> [[I2]], float [[Y]], i32 3
+; CHECK-NEXT:    ret <4 x float> [[I3]]
+;
+  %e0 = extractelement <4 x float> %x, i32 0
+  %i0 = insertelement <4 x float> poison, float %e0, i32 0
+  %i1 = insertelement <4 x float> %i0, float %y, i32 1
+  %i2 = insertelement <4 x float> %i1, float %y, i32 2
+  %i3 = insertelement <4 x float> %i2, float %y, i32 3
+  ret <4 x float> %i3
+}
+
+define <4 x float> @ins_of_ext_twice(<4 x float> %x, float %y) {
+; CHECK-LABEL: @ins_of_ext_twice(
+; CHECK-NEXT:    [[I2:%.*]] = insertelement <4 x float> [[X:%.*]], float [[Y:%.*]], i32 2
+; CHECK-NEXT:    [[I3:%.*]] = insertelement <4 x float> [[I2]], float [[Y]], i32 3
+; CHECK-NEXT:    ret <4 x float> [[I3]]
+;
+  %e0 = extractelement <4 x float> %x, i32 0
+  %i0 = insertelement <4 x float> poison, float %e0, i32 0
+  %e1 = extractelement <4 x float> %x, i32 1
+  %i1 = insertelement <4 x float> %i0, float %e1, i32 1
+  %i2 = insertelement <4 x float> %i1, float %y, i32 2
+  %i3 = insertelement <4 x float> %i2, float %y, i32 3
+  ret <4 x float> %i3
+}
+
+; Negative test - element 3 of the result must be undef to be poison safe.
+; TODO: Could convert insert/extract to identity shuffle with undef mask elements.
+
+define <4 x float> @ins_of_ext_wrong_demand(<4 x float> %x, float %y) {
+; CHECK-LABEL: @ins_of_ext_wrong_demand(
+; CHECK-NEXT:    [[E0:%.*]] = extractelement <4 x float> [[X:%.*]], i32 0
+; CHECK-NEXT:    [[I0:%.*]] = insertelement <4 x float> poison, float [[E0]], i32 0
+; CHECK-NEXT:    [[I1:%.*]] = insertelement <4 x float> [[I0]], float [[Y:%.*]], i32 1
+; CHECK-NEXT:    [[I2:%.*]] = insertelement <4 x float> [[I1]], float [[Y]], i32 2
+; CHECK-NEXT:    ret <4 x float> [[I2]]
+;
+  %e0 = extractelement <4 x float> %x, i32 0
+  %i0 = insertelement <4 x float> poison, float %e0, i32 0
+  %i1 = insertelement <4 x float> %i0, float %y, i32 1
+  %i2 = insertelement <4 x float> %i1, float %y, i32 2
+  ret <4 x float> %i2
+}
+
+; Negative test - can't replace i0 with x.
+; TODO: Could convert insert/extract to identity shuffle with undef mask elements.
+
+define <4 x float> @ins_of_ext_wrong_type(<5 x float> %x, float %y) {
+; CHECK-LABEL: @ins_of_ext_wrong_type(
+; CHECK-NEXT:    [[E0:%.*]] = extractelement <5 x float> [[X:%.*]], i32 0
+; CHECK-NEXT:    [[I0:%.*]] = insertelement <4 x float> poison, float [[E0]], i32 0
+; CHECK-NEXT:    [[I1:%.*]] = insertelement <4 x float> [[I0]], float [[Y:%.*]], i32 1
+; CHECK-NEXT:    [[I2:%.*]] = insertelement <4 x float> [[I1]], float [[Y]], i32 2
+; CHECK-NEXT:    [[I3:%.*]] = insertelement <4 x float> [[I2]], float [[Y]], i32 3
+; CHECK-NEXT:    ret <4 x float> [[I3]]
+;
+  %e0 = extractelement <5 x float> %x, i32 0
+  %i0 = insertelement <4 x float> poison, float %e0, i32 0
+  %i1 = insertelement <4 x float> %i0, float %y, i32 1
+  %i2 = insertelement <4 x float> %i1, float %y, i32 2
+  %i3 = insertelement <4 x float> %i2, float %y, i32 3
+  ret <4 x float> %i3
+}
+
+; This should reduce, but the shuffle mask must remain as-is (no extra undef).
+
+define <4 x i4> @ins_of_ext_undef_elts_propagation(<4 x i4> %v, <4 x i4> %v2, i4 %x) {
+; CHECK-LABEL: @ins_of_ext_undef_elts_propagation(
+; CHECK-NEXT:    [[T2:%.*]] = insertelement <4 x i4> [[V:%.*]], i4 [[X:%.*]], i32 2
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x i4> [[T2]], <4 x i4> [[V2:%.*]], <4 x i32> <i32 0, i32 6, i32 2, i32 7>
+; CHECK-NEXT:    ret <4 x i4> [[R]]
+;
+  %v0 = extractelement <4 x i4> %v, i32 0
+  %t0 = insertelement <4 x i4> poison, i4 %v0, i32 0
+  %t2 = insertelement <4 x i4> %t0, i4 %x, i32 2
+  %r = shufflevector <4 x i4> %t2, <4 x i4> %v2, <4 x i32> <i32 0, i32 6, i32 2, i32 7>
+  ret <4 x i4> %r
+}
+
+; Similar to above, but more ops/uses to verify things work in more complicated cases.
+
+define <8 x i4> @ins_of_ext_undef_elts_propagation2(<8 x i4> %v, <8 x i4> %v2, i4 %x) {
+; CHECK-LABEL: @ins_of_ext_undef_elts_propagation2(
+; CHECK-NEXT:    [[I19:%.*]] = insertelement <8 x i4> [[V:%.*]], i4 [[X:%.*]], i32 2
+; CHECK-NEXT:    [[I20:%.*]] = shufflevector <8 x i4> [[I19]], <8 x i4> [[V2:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 11, i32 10, i32 9, i32 8, i32 undef>
+; CHECK-NEXT:    [[I21:%.*]] = shufflevector <8 x i4> [[I20]], <8 x i4> [[V]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 15>
+; CHECK-NEXT:    ret <8 x i4> [[I21]]
+;
+  %i15 = extractelement <8 x i4> %v, i32 0
+  %i16 = insertelement <8 x i4> poison, i4 %i15, i32 0
+  %i17 = extractelement <8 x i4> %v, i32 1
+  %i18 = insertelement <8 x i4> %i16, i4 %i17, i32 1
+  %i19 = insertelement <8 x i4> %i18, i4 %x, i32 2
+  %i20 = shufflevector <8 x i4> %i19, <8 x i4> %v2, <8 x i32> <i32 0, i32 1, i32 2, i32 11, i32 10, i32 9, i32 8, i32 undef>
+  %i21 = shufflevector <8 x i4> %i20, <8 x i4> %v, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 15>
+  ret <8 x i4> %i21
+}
--- a/llvm/test/Transforms/InstCombine/vec_extract_var_elt-inseltpoison.ll
+++ b/llvm/test/Transforms/InstCombine/vec_extract_var_elt-inseltpoison.ll
@ -0,0 +1,26 @@
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+define void @test (float %b, <8 x float> * %p)  {
+; CHECK: extractelement
+; CHECK: fptosi
+  %1 = load <8 x float> , <8 x float> * %p
+  %2 = bitcast <8 x float> %1 to <8 x i32>
+  %3 = bitcast <8 x i32> %2 to <8 x float>
+  %a = fptosi <8 x float> %3 to <8 x i32>
+  %4 = fptosi float %b to i32
+  %5 = add i32 %4, -2
+  %6 = extractelement <8 x i32> %a, i32 %5
+  %7 = insertelement <8 x i32> poison, i32 %6, i32 7
+  %8 = sitofp <8 x i32> %7 to <8 x float>
+  store <8 x float> %8, <8 x float>* %p
+  ret void    
+}
+
+; PR18600
+define i32 @test2(i32 %i) {
+  %e = extractelement <4 x i32> bitcast (<2 x i64> <i64 1, i64 2> to <4 x i32>), i32 %i
+  ret i32 %e
+
+; CHECK-LABEL: @test2
+; CHECK: extractelement
+}
--- a/llvm/test/Transforms/InstCombine/vec_gep_scalar_arg-inseltpoison.ll
+++ b/llvm/test/Transforms/InstCombine/vec_gep_scalar_arg-inseltpoison.ll
@ -0,0 +1,16 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -instcombine -S < %s | FileCheck %s
+
+define <4 x i16*> @PR41270([4 x i16]* %x) {
+; CHECK-LABEL: @PR41270(
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x [4 x i16]*> undef, [4 x i16]* [[X:%.*]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [4 x i16], <4 x [4 x i16]*> [[TMP1]], i64 0, i64 3
+; CHECK-NEXT:    ret <4 x i16*> [[TMP2]]
+;
+  %ins = insertelement <4 x [4 x i16]*> poison, [4 x i16]* %x, i32 0
+  %splat = shufflevector <4 x [4 x i16]*> %ins, <4 x [4 x i16]*> undef, <4 x i32> zeroinitializer
+  %t2 = getelementptr inbounds [4 x i16], <4 x [4 x i16]*> %splat, i32 0, i32 3
+  %t3 = extractelement <4 x i16*> %t2, i32 3
+  %ins2 = insertelement <4 x i16*> poison, i16* %t3, i32 0
+  ret <4 x i16*> %ins2
+}
--- a/llvm/test/Transforms/InstCombine/vec_phi_extract-inseltpoison.ll
+++ b/llvm/test/Transforms/InstCombine/vec_phi_extract-inseltpoison.ll
@ -0,0 +1,107 @@
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+define void @f(i64 %val, i32  %limit, i32 *%ptr) {
+; CHECK-LABEL: @f
+; CHECK: %0 = trunc i64 %val to i32
+; CHECK: %1 = phi i32 [ %0, %entry ], [ {{.*}}, %loop ]
+entry:
+  %tempvector = insertelement <16 x i64> poison, i64 %val, i32 0
+  %vector = shufflevector <16 x i64> %tempvector, <16 x i64> undef, <16 x i32> zeroinitializer
+  %0 = add <16 x i64> %vector, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>
+  %1 = trunc <16 x i64> %0 to <16 x i32>
+  br label %loop
+
+loop:
+  %2 = phi <16 x i32> [ %1, %entry ], [ %inc, %loop ]
+  %elt = extractelement <16 x i32> %2, i32 0
+  %end = icmp ult i32 %elt, %limit
+  %3 = add i32 10, %elt
+  %4 = sext i32 %elt to i64
+  %5 = getelementptr i32, i32* %ptr, i64 %4
+  store i32 %3, i32* %5
+  %inc = add <16 x i32> %2, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
+  br i1 %end, label %loop, label %ret
+
+ret:
+  ret void
+}
+
+define void @copy(i64 %val, i32  %limit, i32 *%ptr) {
+; CHECK-LABEL: @copy
+; CHECK: %0 = trunc i64 %val to i32
+; CHECK: %1 = phi i32 [ %0, %entry ], [ {{.*}}, %loop ]
+entry:
+  %tempvector = insertelement <16 x i64> poison, i64 %val, i32 0
+  %vector = shufflevector <16 x i64> %tempvector, <16 x i64> undef, <16 x i32> zeroinitializer
+  %0 = add <16 x i64> %vector, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>
+  %1 = trunc <16 x i64> %0 to <16 x i32>
+  br label %loop
+
+loop:
+  %2 = phi <16 x i32> [ %1, %entry ], [ %inc, %loop ]
+  %elt = extractelement <16 x i32> %2, i32 0
+  %eltcopy = extractelement <16 x i32> %2, i32 0
+  %end = icmp ult i32 %elt, %limit
+  %3 = add i32 10, %eltcopy
+  %4 = sext i32 %elt to i64
+  %5 = getelementptr i32, i32* %ptr, i64 %4
+  store i32 %3, i32* %5
+  %inc = add <16 x i32> %2, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
+  br i1 %end, label %loop, label %ret
+
+ret:
+  ret void
+}
+
+define void @nocopy(i64 %val, i32  %limit, i32 *%ptr) {
+; CHECK-LABEL: @nocopy
+; CHECK-NOT: phi i32
+; CHECK: phi <16 x i32> [ %3, %entry ], [ %inc, %loop ]
+entry:
+  %tempvector = insertelement <16 x i64> poison, i64 %val, i32 0
+  %vector = shufflevector <16 x i64> %tempvector, <16 x i64> undef, <16 x i32> zeroinitializer
+  %0 = add <16 x i64> %vector, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>
+  %1 = trunc <16 x i64> %0 to <16 x i32>
+  br label %loop
+
+loop:
+  %2 = phi <16 x i32> [ %1, %entry ], [ %inc, %loop ]
+  %elt = extractelement <16 x i32> %2, i32 0
+  %eltcopy = extractelement <16 x i32> %2, i32 1
+  %end = icmp ult i32 %elt, %limit
+  %3 = add i32 10, %eltcopy
+  %4 = sext i32 %elt to i64
+  %5 = getelementptr i32, i32* %ptr, i64 %4
+  store i32 %3, i32* %5
+  %inc = add <16 x i32> %2, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
+  br i1 %end, label %loop, label %ret
+
+ret:
+  ret void
+}
+
+define i1 @g(<3 x i32> %input_2) {
+; CHECK-LABEL: @g
+; CHECK: extractelement <3 x i32> %input_2, i32 0
+entry:
+  br label %for.cond
+
+for.cond:
+  %input_2.addr.0 = phi <3 x i32> [ %input_2, %entry ], [ %div45, %for.body ]
+  %input_1.addr.1 = phi <3 x i32> [ undef, %entry ], [ %dec43, %for.body ]
+  br i1 undef, label %for.end, label %for.body
+
+; CHECK-NOT: extractelement <3 x i32> %{{.*}}, i32 0
+for.body:
+  %dec43 = add <3 x i32> %input_1.addr.1, <i32 -1, i32 -1, i32 -1>
+  %sub44 = sub <3 x i32> <i32 -1, i32 -1, i32 -1>, %dec43
+  %div45 = sdiv <3 x i32> %input_2.addr.0, %sub44
+  br label %for.cond
+
+for.end:
+  %0 = extractelement <3 x i32> %input_2.addr.0, i32 0
+  %.89 = select i1 false, i32 0, i32 %0
+  %tobool313 = icmp eq i32 %.89, 0
+  ret i1 %tobool313
+}
+
--- a/llvm/test/Transforms/InstCombine/vec_shuffle-inseltpoison.ll
+++ b/llvm/test/Transforms/InstCombine/vec_shuffle-inseltpoison.ll
--- a/llvm/test/Transforms/InstCombine/vector-casts-inseltpoison.ll
+++ b/llvm/test/Transforms/InstCombine/vector-casts-inseltpoison.ll
@ -0,0 +1,413 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+; Can't get smaller than this.
+
+define <2 x i1> @trunc(<2 x i64> %a) {
+; CHECK-LABEL: @trunc(
+; CHECK-NEXT:    [[T:%.*]] = trunc <2 x i64> [[A:%.*]] to <2 x i1>
+; CHECK-NEXT:    ret <2 x i1> [[T]]
+;
+  %t = trunc <2 x i64> %a to <2 x i1>
+  ret <2 x i1> %t
+}
+
+; This is trunc.
+
+define <2 x i1> @and_cmp_is_trunc(<2 x i64> %a) {
+; CHECK-LABEL: @and_cmp_is_trunc(
+; CHECK-NEXT:    [[R:%.*]] = trunc <2 x i64> [[A:%.*]] to <2 x i1>
+; CHECK-NEXT:    ret <2 x i1> [[R]]
+;
+  %t = and <2 x i64> %a, <i64 1, i64 1>
+  %r = icmp ne <2 x i64> %t, zeroinitializer
+  ret <2 x i1> %r
+}
+
+; This is trunc.
+
+define <2 x i1> @and_cmp_is_trunc_even_with_undef_elt(<2 x i64> %a) {
+; CHECK-LABEL: @and_cmp_is_trunc_even_with_undef_elt(
+; CHECK-NEXT:    [[R:%.*]] = trunc <2 x i64> [[A:%.*]] to <2 x i1>
+; CHECK-NEXT:    ret <2 x i1> [[R]]
+;
+  %t = and <2 x i64> %a, <i64 undef, i64 1>
+  %r = icmp ne <2 x i64> %t, zeroinitializer
+  ret <2 x i1> %r
+}
+
+; TODO: This could be just 1 instruction (trunc), but our undef matching is incomplete.
+
+define <2 x i1> @and_cmp_is_trunc_even_with_undef_elts(<2 x i64> %a) {
+; CHECK-LABEL: @and_cmp_is_trunc_even_with_undef_elts(
+; CHECK-NEXT:    [[T:%.*]] = and <2 x i64> [[A:%.*]], <i64 undef, i64 1>
+; CHECK-NEXT:    [[R:%.*]] = icmp ne <2 x i64> [[T]], <i64 undef, i64 0>
+; CHECK-NEXT:    ret <2 x i1> [[R]]
+;
+  %t = and <2 x i64> %a, <i64 undef, i64 1>
+  %r = icmp ne <2 x i64> %t, <i64 undef, i64 0>
+  ret <2 x i1> %r
+}
+
+; The ashr turns into an lshr.
+define <2 x i64> @test2(<2 x i64> %a) {
+; CHECK-LABEL: @test2(
+; CHECK-NEXT:    [[B:%.*]] = lshr <2 x i64> [[A:%.*]], <i64 1, i64 1>
+; CHECK-NEXT:    [[TMP1:%.*]] = and <2 x i64> [[B]], <i64 32767, i64 32767>
+; CHECK-NEXT:    ret <2 x i64> [[TMP1]]
+;
+  %b = and <2 x i64> %a, <i64 65535, i64 65535>
+  %t = ashr <2 x i64> %b, <i64 1, i64 1>
+  ret <2 x i64> %t
+}
+
+define <2 x i64> @test3(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: @test3(
+; CHECK-NEXT:    [[TMP1:%.*]] = fcmp ord <4 x float> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT:    [[AND:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i32>
+; CHECK-NEXT:    [[CONV:%.*]] = bitcast <4 x i32> [[AND]] to <2 x i64>
+; CHECK-NEXT:    ret <2 x i64> [[CONV]]
+;
+  %cmp = fcmp ord <4 x float> %a, zeroinitializer
+  %sext = sext <4 x i1> %cmp to <4 x i32>
+  %cmp4 = fcmp ord <4 x float> %b, zeroinitializer
+  %sext5 = sext <4 x i1> %cmp4 to <4 x i32>
+  %and = and <4 x i32> %sext, %sext5
+  %conv = bitcast <4 x i32> %and to <2 x i64>
+  ret <2 x i64> %conv
+}
+
+define <2 x i64> @test4(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: @test4(
+; CHECK-NEXT:    [[TMP1:%.*]] = fcmp uno <4 x float> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT:    [[OR:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i32>
+; CHECK-NEXT:    [[CONV:%.*]] = bitcast <4 x i32> [[OR]] to <2 x i64>
+; CHECK-NEXT:    ret <2 x i64> [[CONV]]
+;
+  %cmp = fcmp uno <4 x float> %a, zeroinitializer
+  %sext = sext <4 x i1> %cmp to <4 x i32>
+  %cmp4 = fcmp uno <4 x float> %b, zeroinitializer
+  %sext5 = sext <4 x i1> %cmp4 to <4 x i32>
+  %or = or <4 x i32> %sext, %sext5
+  %conv = bitcast <4 x i32> %or to <2 x i64>
+  ret <2 x i64> %conv
+}
+
+; rdar://7434900
+define <2 x i64> @test5(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: @test5(
+; CHECK-NEXT:    [[CMP:%.*]] = fcmp ult <4 x float> [[A:%.*]], zeroinitializer
+; CHECK-NEXT:    [[CMP4:%.*]] = fcmp ult <4 x float> [[B:%.*]], zeroinitializer
+; CHECK-NEXT:    [[AND1:%.*]] = and <4 x i1> [[CMP]], [[CMP4]]
+; CHECK-NEXT:    [[AND:%.*]] = sext <4 x i1> [[AND1]] to <4 x i32>
+; CHECK-NEXT:    [[CONV:%.*]] = bitcast <4 x i32> [[AND]] to <2 x i64>
+; CHECK-NEXT:    ret <2 x i64> [[CONV]]
+;
+  %cmp = fcmp ult <4 x float> %a, zeroinitializer
+  %sext = sext <4 x i1> %cmp to <4 x i32>
+  %cmp4 = fcmp ult <4 x float> %b, zeroinitializer
+  %sext5 = sext <4 x i1> %cmp4 to <4 x i32>
+  %and = and <4 x i32> %sext, %sext5
+  %conv = bitcast <4 x i32> %and to <2 x i64>
+  ret <2 x i64> %conv
+}
+
+define <2 x i64> @test6(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: @test6(
+; CHECK-NEXT:    [[CMP:%.*]] = fcmp ult <4 x float> [[A:%.*]], zeroinitializer
+; CHECK-NEXT:    [[CMP4:%.*]] = fcmp ult <4 x float> [[B:%.*]], zeroinitializer
+; CHECK-NEXT:    [[AND1:%.*]] = or <4 x i1> [[CMP]], [[CMP4]]
+; CHECK-NEXT:    [[AND:%.*]] = sext <4 x i1> [[AND1]] to <4 x i32>
+; CHECK-NEXT:    [[CONV:%.*]] = bitcast <4 x i32> [[AND]] to <2 x i64>
+; CHECK-NEXT:    ret <2 x i64> [[CONV]]
+;
+  %cmp = fcmp ult <4 x float> %a, zeroinitializer
+  %sext = sext <4 x i1> %cmp to <4 x i32>
+  %cmp4 = fcmp ult <4 x float> %b, zeroinitializer
+  %sext5 = sext <4 x i1> %cmp4 to <4 x i32>
+  %and = or <4 x i32> %sext, %sext5
+  %conv = bitcast <4 x i32> %and to <2 x i64>
+  ret <2 x i64> %conv
+}
+
+define <2 x i64> @test7(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: @test7(
+; CHECK-NEXT:    [[CMP:%.*]] = fcmp ult <4 x float> [[A:%.*]], zeroinitializer
+; CHECK-NEXT:    [[CMP4:%.*]] = fcmp ult <4 x float> [[B:%.*]], zeroinitializer
+; CHECK-NEXT:    [[AND1:%.*]] = xor <4 x i1> [[CMP]], [[CMP4]]
+; CHECK-NEXT:    [[AND:%.*]] = sext <4 x i1> [[AND1]] to <4 x i32>
+; CHECK-NEXT:    [[CONV:%.*]] = bitcast <4 x i32> [[AND]] to <2 x i64>
+; CHECK-NEXT:    ret <2 x i64> [[CONV]]
+;
+  %cmp = fcmp ult <4 x float> %a, zeroinitializer
+  %sext = sext <4 x i1> %cmp to <4 x i32>
+  %cmp4 = fcmp ult <4 x float> %b, zeroinitializer
+  %sext5 = sext <4 x i1> %cmp4 to <4 x i32>
+  %and = xor <4 x i32> %sext, %sext5
+  %conv = bitcast <4 x i32> %and to <2 x i64>
+  ret <2 x i64> %conv
+}
+
+define void @convert(<2 x i32>* %dst.addr, <2 x i64> %src) {
+; CHECK-LABEL: @convert(
+; CHECK-NEXT:    [[VAL:%.*]] = trunc <2 x i64> [[SRC:%.*]] to <2 x i32>
+; CHECK-NEXT:    [[ADD:%.*]] = add <2 x i32> [[VAL]], <i32 1, i32 1>
+; CHECK-NEXT:    store <2 x i32> [[ADD]], <2 x i32>* [[DST_ADDR:%.*]], align 8
+; CHECK-NEXT:    ret void
+;
+  %val = trunc <2 x i64> %src to <2 x i32>
+  %add = add <2 x i32> %val, <i32 1, i32 1>
+  store <2 x i32> %add, <2 x i32>* %dst.addr
+  ret void
+}
+
+define <2 x i65> @foo(<2 x i64> %t) {
+; CHECK-LABEL: @foo(
+; CHECK-NEXT:    [[A_MASK:%.*]] = and <2 x i64> [[T:%.*]], <i64 4294967295, i64 4294967295>
+; CHECK-NEXT:    [[B:%.*]] = zext <2 x i64> [[A_MASK]] to <2 x i65>
+; CHECK-NEXT:    ret <2 x i65> [[B]]
+;
+  %a = trunc <2 x i64> %t to <2 x i32>
+  %b = zext <2 x i32> %a to <2 x i65>
+  ret <2 x i65> %b
+}
+
+define <2 x i64> @bar(<2 x i65> %t) {
+; CHECK-LABEL: @bar(
+; CHECK-NEXT:    [[TMP1:%.*]] = trunc <2 x i65> [[T:%.*]] to <2 x i64>
+; CHECK-NEXT:    [[B:%.*]] = and <2 x i64> [[TMP1]], <i64 4294967295, i64 4294967295>
+; CHECK-NEXT:    ret <2 x i64> [[B]]
+;
+  %a = trunc <2 x i65> %t to <2 x i32>
+  %b = zext <2 x i32> %a to <2 x i64>
+  ret <2 x i64> %b
+}
+
+define <2 x i64> @bars(<2 x i65> %t) {
+; CHECK-LABEL: @bars(
+; CHECK-NEXT:    [[A:%.*]] = trunc <2 x i65> [[T:%.*]] to <2 x i32>
+; CHECK-NEXT:    [[B:%.*]] = sext <2 x i32> [[A]] to <2 x i64>
+; CHECK-NEXT:    ret <2 x i64> [[B]]
+;
+  %a = trunc <2 x i65> %t to <2 x i32>
+  %b = sext <2 x i32> %a to <2 x i64>
+  ret <2 x i64> %b
+}
+
+define <2 x i64> @quxs(<2 x i64> %t) {
+; CHECK-LABEL: @quxs(
+; CHECK-NEXT:    [[TMP1:%.*]] = shl <2 x i64> [[T:%.*]], <i64 32, i64 32>
+; CHECK-NEXT:    [[B:%.*]] = ashr exact <2 x i64> [[TMP1]], <i64 32, i64 32>
+; CHECK-NEXT:    ret <2 x i64> [[B]]
+;
+  %a = trunc <2 x i64> %t to <2 x i32>
+  %b = sext <2 x i32> %a to <2 x i64>
+  ret <2 x i64> %b
+}
+
+define <2 x i64> @quxt(<2 x i64> %t) {
+; CHECK-LABEL: @quxt(
+; CHECK-NEXT:    [[A:%.*]] = shl <2 x i64> [[T:%.*]], <i64 32, i64 32>
+; CHECK-NEXT:    [[B:%.*]] = ashr exact <2 x i64> [[A]], <i64 32, i64 32>
+; CHECK-NEXT:    ret <2 x i64> [[B]]
+;
+  %a = shl <2 x i64> %t, <i64 32, i64 32>
+  %b = ashr <2 x i64> %a, <i64 32, i64 32>
+  ret <2 x i64> %b
+}
+
+define <2 x double> @fa(<2 x double> %t) {
+; CHECK-LABEL: @fa(
+; CHECK-NEXT:    [[A:%.*]] = fptrunc <2 x double> [[T:%.*]] to <2 x float>
+; CHECK-NEXT:    [[B:%.*]] = fpext <2 x float> [[A]] to <2 x double>
+; CHECK-NEXT:    ret <2 x double> [[B]]
+;
+  %a = fptrunc <2 x double> %t to <2 x float>
+  %b = fpext <2 x float> %a to <2 x double>
+  ret <2 x double> %b
+}
+
+define <2 x double> @fb(<2 x double> %t) {
+; CHECK-LABEL: @fb(
+; CHECK-NEXT:    [[A:%.*]] = fptoui <2 x double> [[T:%.*]] to <2 x i64>
+; CHECK-NEXT:    [[B:%.*]] = uitofp <2 x i64> [[A]] to <2 x double>
+; CHECK-NEXT:    ret <2 x double> [[B]]
+;
+  %a = fptoui <2 x double> %t to <2 x i64>
+  %b = uitofp <2 x i64> %a to <2 x double>
+  ret <2 x double> %b
+}
+
+define <2 x double> @fc(<2 x double> %t) {
+; CHECK-LABEL: @fc(
+; CHECK-NEXT:    [[A:%.*]] = fptosi <2 x double> [[T:%.*]] to <2 x i64>
+; CHECK-NEXT:    [[B:%.*]] = sitofp <2 x i64> [[A]] to <2 x double>
+; CHECK-NEXT:    ret <2 x double> [[B]]
+;
+  %a = fptosi <2 x double> %t to <2 x i64>
+  %b = sitofp <2 x i64> %a to <2 x double>
+  ret <2 x double> %b
+}
+
+; PR9228
+define <4 x float> @f(i32 %a) {
+; CHECK-LABEL: @f(
+; CHECK-NEXT:    ret <4 x float> undef
+;
+  %dim = insertelement <4 x i32> poison, i32 %a, i32 0
+  %dim30 = insertelement <4 x i32> %dim, i32 %a, i32 1
+  %dim31 = insertelement <4 x i32> %dim30, i32 %a, i32 2
+  %dim32 = insertelement <4 x i32> %dim31, i32 %a, i32 3
+
+  %offset_ptr = getelementptr <4 x float>, <4 x float>* null, i32 1
+  %offset_int = ptrtoint <4 x float>* %offset_ptr to i64
+  %sizeof32 = trunc i64 %offset_int to i32
+
+  %smearinsert33 = insertelement <4 x i32> poison, i32 %sizeof32, i32 0
+  %smearinsert34 = insertelement <4 x i32> %smearinsert33, i32 %sizeof32, i32 1
+  %smearinsert35 = insertelement <4 x i32> %smearinsert34, i32 %sizeof32, i32 2
+  %smearinsert36 = insertelement <4 x i32> %smearinsert35, i32 %sizeof32, i32 3
+
+  %delta_scale = mul <4 x i32> %dim32, %smearinsert36
+  %offset_delta = add <4 x i32> zeroinitializer, %delta_scale
+
+  %offset_varying_delta = add <4 x i32> %offset_delta, undef
+
+  ret <4 x float> undef
+}
+
+define <8 x i32> @pr24458(<8 x float> %n) {
+; CHECK-LABEL: @pr24458(
+; CHECK-NEXT:    ret <8 x i32> <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
+;
+  %notequal_b_load_.i = fcmp une <8 x float> %n, zeroinitializer
+  %equal_a_load72_.i = fcmp ueq <8 x float> %n, zeroinitializer
+  %notequal_b_load__to_boolvec.i = sext <8 x i1> %notequal_b_load_.i to <8 x i32>
+  %equal_a_load72__to_boolvec.i = sext <8 x i1> %equal_a_load72_.i to <8 x i32>
+  %wrong = or <8 x i32> %notequal_b_load__to_boolvec.i, %equal_a_load72__to_boolvec.i
+  ret <8 x i32> %wrong
+}
+
+; Hoist a trunc to a scalar if we're inserting into an undef vector.
+; trunc (inselt undef, X, Index) --> inselt undef, (trunc X), Index
+
+define <3 x i16> @trunc_inselt_undef(i32 %x) {
+; CHECK-LABEL: @trunc_inselt_undef(
+; CHECK-NEXT:    [[TMP1:%.*]] = trunc i32 [[X:%.*]] to i16
+; CHECK-NEXT:    [[TRUNC:%.*]] = insertelement <3 x i16> undef, i16 [[TMP1]], i32 1
+; CHECK-NEXT:    ret <3 x i16> [[TRUNC]]
+;
+  %vec = insertelement <3 x i32> poison, i32 %x, i32 1
+  %trunc = trunc <3 x i32> %vec to <3 x i16>
+  ret <3 x i16> %trunc
+}
+
+; Hoist a trunc to a scalar if we're inserting into an undef vector.
+; trunc (inselt undef, X, Index) --> inselt undef, (trunc X), Index
+
+define <2 x float> @fptrunc_inselt_undef(double %x, i32 %index) {
+; CHECK-LABEL: @fptrunc_inselt_undef(
+; CHECK-NEXT:    [[TMP1:%.*]] = fptrunc double [[X:%.*]] to float
+; CHECK-NEXT:    [[TRUNC:%.*]] = insertelement <2 x float> undef, float [[TMP1]], i32 [[INDEX:%.*]]
+; CHECK-NEXT:    ret <2 x float> [[TRUNC]]
+;
+  %vec = insertelement <2 x double> <double undef, double undef>, double %x, i32 %index
+  %trunc = fptrunc <2 x double> %vec to <2 x float>
+  ret <2 x float> %trunc
+}
+
+; TODO: Strengthen the backend, so we can have this canonicalization.
+; Insert a scalar int into a constant vector and truncate:
+; trunc (inselt C, X, Index) --> inselt C, (trunc X), Index
+
+define <3 x i16> @trunc_inselt1(i32 %x) {
+; CHECK-LABEL: @trunc_inselt1(
+; CHECK-NEXT:    [[VEC:%.*]] = insertelement <3 x i32> <i32 3, i32 undef, i32 65536>, i32 [[X:%.*]], i32 1
+; CHECK-NEXT:    [[TRUNC:%.*]] = trunc <3 x i32> [[VEC]] to <3 x i16>
+; CHECK-NEXT:    ret <3 x i16> [[TRUNC]]
+;
+  %vec = insertelement <3 x i32> <i32 3, i32 -2, i32 65536>, i32 %x, i32 1
+  %trunc = trunc <3 x i32> %vec to <3 x i16>
+  ret <3 x i16> %trunc
+}
+
+; TODO: Strengthen the backend, so we can have this canonicalization.
+; Insert a scalar FP into a constant vector and FP truncate:
+; fptrunc (inselt C, X, Index) --> inselt C, (fptrunc X), Index
+
+define <2 x float> @fptrunc_inselt1(double %x, i32 %index) {
+; CHECK-LABEL: @fptrunc_inselt1(
+; CHECK-NEXT:    [[VEC:%.*]] = insertelement <2 x double> <double undef, double 3.000000e+00>, double [[X:%.*]], i32 [[INDEX:%.*]]
+; CHECK-NEXT:    [[TRUNC:%.*]] = fptrunc <2 x double> [[VEC]] to <2 x float>
+; CHECK-NEXT:    ret <2 x float> [[TRUNC]]
+;
+  %vec = insertelement <2 x double> <double undef, double 3.0>, double %x, i32 %index
+  %trunc = fptrunc <2 x double> %vec to <2 x float>
+  ret <2 x float> %trunc
+}
+
+; TODO: Strengthen the backend, so we can have this canonicalization.
+; Insert a scalar int constant into a vector and truncate:
+; trunc (inselt X, C, Index) --> inselt (trunc X), C', Index
+
+define <8 x i16> @trunc_inselt2(<8 x i32> %x, i32 %index) {
+; CHECK-LABEL: @trunc_inselt2(
+; CHECK-NEXT:    [[VEC:%.*]] = insertelement <8 x i32> [[X:%.*]], i32 1048576, i32 [[INDEX:%.*]]
+; CHECK-NEXT:    [[TRUNC:%.*]] = trunc <8 x i32> [[VEC]] to <8 x i16>
+; CHECK-NEXT:    ret <8 x i16> [[TRUNC]]
+;
+  %vec = insertelement <8 x i32> %x, i32 1048576, i32 %index
+  %trunc = trunc <8 x i32> %vec to <8 x i16>
+  ret <8 x i16> %trunc
+}
+
+; TODO: Strengthen the backend, so we can have this canonicalization.
+; Insert a scalar FP constant into a vector and FP truncate:
+; fptrunc (inselt X, C, Index) --> inselt (fptrunc X), C', Index
+
+define <3 x float> @fptrunc_inselt2(<3 x double> %x) {
+; CHECK-LABEL: @fptrunc_inselt2(
+; CHECK-NEXT:    [[VEC:%.*]] = insertelement <3 x double> [[X:%.*]], double 4.000000e+00, i32 2
+; CHECK-NEXT:    [[TRUNC:%.*]] = fptrunc <3 x double> [[VEC]] to <3 x float>
+; CHECK-NEXT:    ret <3 x float> [[TRUNC]]
+;
+  %vec = insertelement <3 x double> %x, double 4.0, i32 2
+  %trunc = fptrunc <3 x double> %vec to <3 x float>
+  ret <3 x float> %trunc
+}
+
+; Converting to a wide type might reduce instruction count,
+; but we can not do that unless the backend can recover from
+; the creation of a potentially illegal op (like a 64-bit vmul).
+; PR40032 - https://bugs.llvm.org/show_bug.cgi?id=40032
+
+define <2 x i64> @sext_less_casting_with_wideop(<2 x i64> %x, <2 x i64> %y) {
+; CHECK-LABEL: @sext_less_casting_with_wideop(
+; CHECK-NEXT:    [[XNARROW:%.*]] = trunc <2 x i64> [[X:%.*]] to <2 x i32>
+; CHECK-NEXT:    [[YNARROW:%.*]] = trunc <2 x i64> [[Y:%.*]] to <2 x i32>
+; CHECK-NEXT:    [[MUL:%.*]] = mul <2 x i32> [[XNARROW]], [[YNARROW]]
+; CHECK-NEXT:    [[R:%.*]] = sext <2 x i32> [[MUL]] to <2 x i64>
+; CHECK-NEXT:    ret <2 x i64> [[R]]
+;
+  %xnarrow = trunc <2 x i64> %x to <2 x i32>
+  %ynarrow = trunc <2 x i64> %y to <2 x i32>
+  %mul = mul <2 x i32> %xnarrow, %ynarrow
+  %r = sext <2 x i32> %mul to <2 x i64>
+  ret <2 x i64> %r
+}
+
+define <2 x i64> @zext_less_casting_with_wideop(<2 x i64> %x, <2 x i64> %y) {
+; CHECK-LABEL: @zext_less_casting_with_wideop(
+; CHECK-NEXT:    [[XNARROW:%.*]] = trunc <2 x i64> [[X:%.*]] to <2 x i32>
+; CHECK-NEXT:    [[YNARROW:%.*]] = trunc <2 x i64> [[Y:%.*]] to <2 x i32>
+; CHECK-NEXT:    [[MUL:%.*]] = mul <2 x i32> [[XNARROW]], [[YNARROW]]
+; CHECK-NEXT:    [[R:%.*]] = zext <2 x i32> [[MUL]] to <2 x i64>
+; CHECK-NEXT:    ret <2 x i64> [[R]]
+;
+  %xnarrow = trunc <2 x i64> %x to <2 x i32>
+  %ynarrow = trunc <2 x i64> %y to <2 x i32>
+  %mul = mul <2 x i32> %xnarrow, %ynarrow
+  %r = zext <2 x i32> %mul to <2 x i64>
+  ret <2 x i64> %r
+}
+
--- a/llvm/test/Transforms/InstCombine/vector_gep1-inseltpoison.ll
+++ b/llvm/test/Transforms/InstCombine/vector_gep1-inseltpoison.ll
@ -0,0 +1,74 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@G1 = global i8 zeroinitializer
+
+define <2 x i1> @test(<2 x i8*> %a, <2 x i8*> %b) {
+; CHECK-LABEL: @test(
+; CHECK-NEXT:    [[C:%.*]] = icmp eq <2 x i8*> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT:    ret <2 x i1> [[C]]
+;
+  %c = icmp eq <2 x i8*> %a, %b
+  ret <2 x i1> %c
+}
+
+define <2 x i1> @test2(<2 x i8*> %a) {
+; CHECK-LABEL: @test2(
+; CHECK-NEXT:    ret <2 x i1> zeroinitializer
+;
+  %c = inttoptr <2 x i32> <i32 1, i32 2> to <2 x i8*>
+  %d = icmp ult <2 x i8*> %c, zeroinitializer
+  ret <2 x i1> %d
+}
+
+define <2 x i1> @test3(<2 x i8*> %a) {
+; CHECK-LABEL: @test3(
+; CHECK-NEXT:    ret <2 x i1> zeroinitializer
+;
+  %g = getelementptr i8, <2 x i8*> %a, <2 x i32> <i32 1, i32 0>
+  %B = icmp ult <2 x i8*> %g, zeroinitializer
+  ret <2 x i1> %B
+}
+
+define <1 x i1> @test4(<1 x i8*> %a) {
+; CHECK-LABEL: @test4(
+; CHECK-NEXT:    ret <1 x i1> zeroinitializer
+;
+  %g = getelementptr i8, <1 x i8*> %a, <1 x i32> <i32 1>
+  %B = icmp ult <1 x i8*> %g, zeroinitializer
+  ret <1 x i1> %B
+}
+
+define <2 x i1> @test5(<2 x i8*> %a) {
+; CHECK-LABEL: @test5(
+; CHECK-NEXT:    ret <2 x i1> zeroinitializer
+;
+  %w = getelementptr i8, <2 x i8*> %a, <2 x i32> zeroinitializer
+  %e = getelementptr i8, <2 x i8*> %w, <2 x i32> <i32 5, i32 9>
+  %g = getelementptr i8, <2 x i8*> %e, <2 x i32> <i32 1, i32 0>
+  %B = icmp ult <2 x i8*> %g, zeroinitializer
+  ret <2 x i1> %B
+}
+
+define <2 x i32*> @test7(<2 x {i32, i32}*> %a) {
+; CHECK-LABEL: @test7(
+; CHECK-NEXT:    [[W:%.*]] = getelementptr { i32, i32 }, <2 x { i32, i32 }*> [[A:%.*]], <2 x i64> <i64 5, i64 9>, <2 x i32> zeroinitializer
+; CHECK-NEXT:    ret <2 x i32*> [[W]]
+;
+  %w = getelementptr {i32, i32}, <2 x {i32, i32}*> %a, <2 x i32> <i32 5, i32 9>, <2 x i32> zeroinitializer
+  ret <2 x i32*> %w
+}
+
+define <vscale x 2 x i1> @test8() {
+; CHECK-LABEL: @test8(
+; CHECK-NEXT:    ret <vscale x 2 x i1> icmp ult (<vscale x 2 x i64> zext (<vscale x 2 x i32> shufflevector (<vscale x 2 x i32> insertelement (<vscale x 2 x i32> poison, i32 1, i32 0), <vscale x 2 x i32> undef, <vscale x 2 x i32> zeroinitializer) to <vscale x 2 x i64>), <vscale x 2 x i64> zeroinitializer)
+;
+  %ins = insertelement <vscale x 2 x i32> poison, i32 1, i32 0
+  %b = shufflevector <vscale x 2 x i32> %ins, <vscale x 2 x i32> undef, <vscale x 2 x i32> zeroinitializer
+  %c = inttoptr <vscale x 2 x i32> %b to <vscale x 2 x i8*>
+  %d = icmp ult <vscale x 2 x i8*> %c, zeroinitializer
+  ret <vscale x 2 x i1> %d
+}
--- a/llvm/test/Transforms/InstCombine/vector_insertelt_shuffle-inseltpoison.ll
+++ b/llvm/test/Transforms/InstCombine/vector_insertelt_shuffle-inseltpoison.ll
@ -0,0 +1,93 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+; insertelements should fold to shuffle
+define <4 x float> @foo(<4 x float> %x) {
+; CHECK-LABEL: @foo(
+; CHECK-NEXT:    [[INS2:%.*]] = shufflevector <4 x float> [[X:%.*]], <4 x float> <float undef, float 1.000000e+00, float 2.000000e+00, float undef>, <4 x i32> <i32 0, i32 5, i32 6, i32 3>
+; CHECK-NEXT:    ret <4 x float> [[INS2]]
+;
+  %ins1 = insertelement<4 x float> %x, float 1.0, i32 1
+  %ins2 = insertelement<4 x float> %ins1, float 2.0, i32 2
+  ret <4 x float> %ins2
+}
+
+; Insert of a constant is canonicalized ahead of insert of a variable.
+
+define <4 x float> @bar(<4 x float> %x, float %a) {
+; CHECK-LABEL: @bar(
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x float> [[X:%.*]], float 2.000000e+00, i32 2
+; CHECK-NEXT:    [[INS2:%.*]] = insertelement <4 x float> [[TMP1]], float [[A:%.*]], i32 1
+; CHECK-NEXT:    ret <4 x float> [[INS2]]
+;
+  %ins1 = insertelement<4 x float> %x, float %a, i32 1
+  %ins2 = insertelement<4 x float> %ins1, float 2.0, i32 2
+  ret <4 x float> %ins2
+}
+
+define <4 x float> @baz(<4 x float> %x, i32 %a) {
+; CHECK-LABEL: @baz(
+; CHECK-NEXT:    [[INS1:%.*]] = insertelement <4 x float> [[X:%.*]], float 1.000000e+00, i32 1
+; CHECK-NEXT:    [[INS2:%.*]] = insertelement <4 x float> [[INS1]], float 2.000000e+00, i32 [[A:%.*]]
+; CHECK-NEXT:    ret <4 x float> [[INS2]]
+;
+  %ins1 = insertelement<4 x float> %x, float 1.0, i32 1
+  %ins2 = insertelement<4 x float> %ins1, float 2.0, i32 %a
+  ret <4 x float> %ins2
+}
+
+; insertelements should fold to shuffle
+define <4 x float> @bazz(<4 x float> %x, i32 %a) {
+; CHECK-LABEL: @bazz(
+; CHECK-NEXT:    [[INS1:%.*]] = insertelement <4 x float> [[X:%.*]], float 1.000000e+00, i32 3
+; CHECK-NEXT:    [[INS2:%.*]] = insertelement <4 x float> [[INS1]], float 5.000000e+00, i32 [[A:%.*]]
+; CHECK-NEXT:    [[INS5:%.*]] = shufflevector <4 x float> [[INS2]], <4 x float> <float undef, float 1.000000e+00, float 2.000000e+00, float undef>, <4 x i32> <i32 0, i32 5, i32 6, i32 3>
+; CHECK-NEXT:    [[INS6:%.*]] = insertelement <4 x float> [[INS5]], float 7.000000e+00, i32 [[A]]
+; CHECK-NEXT:    ret <4 x float> [[INS6]]
+;
+  %ins1 = insertelement<4 x float> %x, float 1.0, i32 3
+  %ins2 = insertelement<4 x float> %ins1, float 5.0, i32 %a
+  %ins3 = insertelement<4 x float> %ins2, float 3.0, i32 2
+  %ins4 = insertelement<4 x float> %ins3, float 1.0, i32 1
+  %ins5 = insertelement<4 x float> %ins4, float 2.0, i32 2
+  %ins6 = insertelement<4 x float> %ins5, float 7.0, i32 %a
+  ret <4 x float> %ins6
+}
+
+; Out of bounds index folds to undef
+define <4 x float> @bazzz(<4 x float> %x) {
+; CHECK-LABEL: @bazzz(
+; CHECK-NEXT:    ret <4 x float> <float undef, float undef, float 2.000000e+00, float undef>
+;
+  %ins1 = insertelement<4 x float> %x, float 1.0, i32 5
+  %ins2 = insertelement<4 x float> %ins1, float 2.0, i32 2
+  ret <4 x float> %ins2
+}
+
+define <4 x float> @bazzzz(<4 x float> %x) {
+; CHECK-LABEL: @bazzzz(
+; CHECK-NEXT:    ret <4 x float> <float undef, float undef, float 2.000000e+00, float undef>
+;
+  %ins1 = insertelement<4 x float> %x, float 1.0, i32 undef
+  %ins2 = insertelement<4 x float> %ins1, float 2.0, i32 2
+  ret <4 x float> %ins2
+}
+
+define <4 x float> @bazzzzz() {
+; CHECK-LABEL: @bazzzzz(
+; CHECK-NEXT:    ret <4 x float> <float 1.000000e+00, float 5.000000e+00, float 1.000000e+01, float 4.000000e+00>
+;
+  %ins1 = insertelement <4 x float> insertelement (<4 x float> <float 1.0, float 2.0, float 3.0, float undef>, float 4.0, i32 3), float 5.0, i32 1
+  %ins2 = insertelement<4 x float> %ins1, float 10.0, i32 2
+  ret <4 x float> %ins2
+}
+
+define <4 x float> @bazzzzzz(<4 x float> %x, i32 %a) {
+; CHECK-LABEL: @bazzzzzz(
+; CHECK-NEXT:    ret <4 x float> <float poison, float 5.000000e+00, float undef, float 4.000000e+00>
+;
+  %ins1 = insertelement <4 x float> insertelement (<4 x float> shufflevector (<4 x float> poison, <4 x float> <float 1.0, float 2.0, float 3.0, float 4.0> , <4 x i32> <i32 0, i32 5, i32 undef, i32 6> ), float 4.0, i32 3), float 5.0, i32 1
+  ret <4 x float> %ins1
+}
+
+
--- a/llvm/test/Transforms/InstCombine/vscale_extractelement-inseltpoison.ll
+++ b/llvm/test/Transforms/InstCombine/vscale_extractelement-inseltpoison.ll
@ -0,0 +1,185 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+define i32 @extractelement_in_range(<vscale x 4 x i32> %a) {
+; CHECK-LABEL: @extractelement_in_range(
+; CHECK-NEXT:    [[R:%.*]] = extractelement <vscale x 4 x i32> [[A:%.*]], i64 1
+; CHECK-NEXT:    ret i32 [[R]]
+;
+  %r = extractelement <vscale x 4 x i32> %a, i64 1
+  ret i32 %r
+}
+
+define i32 @extractelement_maybe_out_of_range(<vscale x 4 x i32> %a) {
+; CHECK-LABEL: @extractelement_maybe_out_of_range(
+; CHECK-NEXT:    [[R:%.*]] = extractelement <vscale x 4 x i32> [[A:%.*]], i64 4
+; CHECK-NEXT:    ret i32 [[R]]
+;
+  %r = extractelement <vscale x 4 x i32> %a, i64 4
+  ret i32 %r
+}
+
+define i32 @extractelement_bitcast(float %f) {
+; CHECK-LABEL: @extractelement_bitcast(
+; CHECK-NEXT:    [[R:%.*]] = bitcast float [[F:%.*]] to i32
+; CHECK-NEXT:    ret i32 [[R]]
+;
+  %vec_float = insertelement <vscale x 4 x float> poison, float %f, i32 0
+  %vec_int = bitcast <vscale x 4 x float> %vec_float to <vscale x 4 x i32>
+  %r = extractelement <vscale x 4 x i32> %vec_int, i32 0
+  ret i32 %r
+}
+
+define i8 @extractelement_bitcast_to_trunc(<vscale x 2 x i32> %a, i32 %x) {
+; CHECK-LABEL: @extractelement_bitcast_to_trunc(
+; CHECK-NEXT:    [[R:%.*]] = trunc i32 [[X:%.*]] to i8
+; CHECK-NEXT:    ret i8 [[R]]
+;
+  %vec = insertelement <vscale x 2 x i32> %a, i32 %x, i32 1
+  %vec_cast = bitcast <vscale x 2 x i32> %vec to <vscale x 8 x i8>
+  %r = extractelement <vscale x 8 x i8> %vec_cast, i32 4
+  ret i8 %r
+}
+
+; TODO: Instcombine could remove the insert.
+define i8 @extractelement_bitcast_wrong_insert(<vscale x 2 x i32> %a, i32 %x) {
+; CHECK-LABEL: @extractelement_bitcast_wrong_insert(
+; CHECK-NEXT:    [[VEC:%.*]] = insertelement <vscale x 2 x i32> [[A:%.*]], i32 [[X:%.*]], i32 1
+; CHECK-NEXT:    [[VEC_CAST:%.*]] = bitcast <vscale x 2 x i32> [[VEC]] to <vscale x 8 x i8>
+; CHECK-NEXT:    [[R:%.*]] = extractelement <vscale x 8 x i8> [[VEC_CAST]], i32 2
+; CHECK-NEXT:    ret i8 [[R]]
+;
+  %vec = insertelement <vscale x 2 x i32> %a, i32 %x, i32 1 ; <- This insert could be removed.
+  %vec_cast = bitcast <vscale x 2 x i32> %vec to <vscale x 8 x i8>
+  %r = extractelement <vscale x 8 x i8> %vec_cast, i32 2
+  ret i8 %r
+}
+
+; TODO: Instcombine could optimize to return %v.
+define i32 @extractelement_shuffle_in_range(i32 %v) {
+; CHECK-LABEL: @extractelement_shuffle_in_range(
+; CHECK-NEXT:    [[IN:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[V:%.*]], i32 0
+; CHECK-NEXT:    [[SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[IN]], <vscale x 4 x i32> undef, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT:    [[R:%.*]] = extractelement <vscale x 4 x i32> [[SPLAT]], i32 1
+; CHECK-NEXT:    ret i32 [[R]]
+;
+  %in = insertelement <vscale x 4 x i32> poison, i32 %v, i32 0
+  %splat = shufflevector <vscale x 4 x i32> %in, <vscale x 4 x i32> undef, <vscale x 4 x i32> zeroinitializer
+  %r = extractelement <vscale x 4 x i32> %splat, i32 1
+  ret i32 %r
+}
+
+define i32 @extractelement_shuffle_maybe_out_of_range(i32 %v) {
+; CHECK-LABEL: @extractelement_shuffle_maybe_out_of_range(
+; CHECK-NEXT:    [[IN:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[V:%.*]], i32 0
+; CHECK-NEXT:    [[SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[IN]], <vscale x 4 x i32> undef, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT:    [[R:%.*]] = extractelement <vscale x 4 x i32> [[SPLAT]], i32 4
+; CHECK-NEXT:    ret i32 [[R]]
+;
+  %in = insertelement <vscale x 4 x i32> poison, i32 %v, i32 0
+  %splat = shufflevector <vscale x 4 x i32> %in, <vscale x 4 x i32> undef, <vscale x 4 x i32> zeroinitializer
+  %r = extractelement <vscale x 4 x i32> %splat, i32 4
+  ret i32 %r
+}
+
+define i32 @extractelement_shuffle_invalid_index(i32 %v) {
+; CHECK-LABEL: @extractelement_shuffle_invalid_index(
+; CHECK-NEXT:    [[IN:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[V:%.*]], i32 0
+; CHECK-NEXT:    [[SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[IN]], <vscale x 4 x i32> undef, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT:    [[R:%.*]] = extractelement <vscale x 4 x i32> [[SPLAT]], i32 -1
+; CHECK-NEXT:    ret i32 [[R]]
+;
+  %in = insertelement <vscale x 4 x i32> poison, i32 %v, i32 0
+  %splat = shufflevector <vscale x 4 x i32> %in, <vscale x 4 x i32> undef, <vscale x 4 x i32> zeroinitializer
+  %r = extractelement <vscale x 4 x i32> %splat, i32 -1
+  ret i32 %r
+}
+
+
+define i32 @extractelement_shuffle_symbolic_index(i32 %v, i32 %idx) {
+; CHECK-LABEL: @extractelement_shuffle_symbolic_index(
+; CHECK-NEXT:    [[IN:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[V:%.*]], i32 0
+; CHECK-NEXT:    [[SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[IN]], <vscale x 4 x i32> undef, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT:    [[R:%.*]] = extractelement <vscale x 4 x i32> [[SPLAT]], i32 [[IDX:%.*]]
+; CHECK-NEXT:    ret i32 [[R]]
+;
+  %in = insertelement <vscale x 4 x i32> poison, i32 %v, i32 0
+  %splat = shufflevector <vscale x 4 x i32> %in, <vscale x 4 x i32> undef, <vscale x 4 x i32> zeroinitializer
+  %r = extractelement <vscale x 4 x i32> %splat, i32 %idx
+  ret i32 %r
+}
+
+define <vscale x 4 x i32> @extractelement_insertelement_same_positions(<vscale x 4 x i32> %vec) {
+; CHECK-LABEL: @extractelement_insertelement_same_positions(
+; CHECK-NEXT:    ret <vscale x 4 x i32> [[VEC:%.*]]
+;
+  %vec.e0 = extractelement <vscale x 4 x i32> %vec, i32 0
+  %vec.e1 = extractelement <vscale x 4 x i32> %vec, i32 1
+  %vec.e2 = extractelement <vscale x 4 x i32> %vec, i32 2
+  %vec.e3 = extractelement <vscale x 4 x i32> %vec, i32 3
+  %1 = insertelement <vscale x 4 x i32> %vec, i32 %vec.e0, i32 0
+  %2 = insertelement <vscale x 4 x i32> %1, i32 %vec.e1, i32 1
+  %3 = insertelement <vscale x 4 x i32> %2, i32 %vec.e2, i32 2
+  %4 = insertelement <vscale x 4 x i32> %3, i32 %vec.e3, i32 3
+  ret <vscale x 4 x i32> %4
+}
+
+define <vscale x 4 x i32> @extractelement_insertelement_diff_positions(<vscale x 4 x i32> %vec) {
+; CHECK-LABEL: @extractelement_insertelement_diff_positions(
+; CHECK-NEXT:    [[VEC_E0:%.*]] = extractelement <vscale x 4 x i32> [[VEC:%.*]], i32 4
+; CHECK-NEXT:    [[VEC_E1:%.*]] = extractelement <vscale x 4 x i32> [[VEC]], i32 5
+; CHECK-NEXT:    [[VEC_E2:%.*]] = extractelement <vscale x 4 x i32> [[VEC]], i32 6
+; CHECK-NEXT:    [[VEC_E3:%.*]] = extractelement <vscale x 4 x i32> [[VEC]], i32 7
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <vscale x 4 x i32> [[VEC]], i32 [[VEC_E0]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <vscale x 4 x i32> [[TMP1]], i32 [[VEC_E1]], i32 1
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <vscale x 4 x i32> [[TMP2]], i32 [[VEC_E2]], i32 2
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <vscale x 4 x i32> [[TMP3]], i32 [[VEC_E3]], i32 3
+; CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP4]]
+;
+  %vec.e0 = extractelement <vscale x 4 x i32> %vec, i32 4
+  %vec.e1 = extractelement <vscale x 4 x i32> %vec, i32 5
+  %vec.e2 = extractelement <vscale x 4 x i32> %vec, i32 6
+  %vec.e3 = extractelement <vscale x 4 x i32> %vec, i32 7
+  %1 = insertelement <vscale x 4 x i32> %vec, i32 %vec.e0, i32 0
+  %2 = insertelement <vscale x 4 x i32> %1, i32 %vec.e1, i32 1
+  %3 = insertelement <vscale x 4 x i32> %2, i32 %vec.e2, i32 2
+  %4 = insertelement <vscale x 4 x i32> %3, i32 %vec.e3, i32 3
+  ret <vscale x 4 x i32> %4
+}
+
+define i32 @bitcast_of_extractelement( <vscale x 2 x float> %d) {
+; CHECK-LABEL: @bitcast_of_extractelement(
+; CHECK-NEXT:    [[BC:%.*]] = bitcast <vscale x 2 x float> [[D:%.*]] to <vscale x 2 x i32>
+; CHECK-NEXT:    [[CAST:%.*]] = extractelement <vscale x 2 x i32> [[BC]], i32 0
+; CHECK-NEXT:    ret i32 [[CAST]]
+;
+  %ext = extractelement <vscale x 2 x float> %d, i32 0
+  %cast = bitcast float %ext to i32
+  ret i32 %cast
+}
+
+define i1 @extractelement_is_zero(<vscale x 2 x i32> %d, i1 %b, i32 %z) {
+; CHECK-LABEL: @extractelement_is_zero(
+; CHECK-NEXT:    [[EXT:%.*]] = extractelement <vscale x 2 x i32> [[D:%.*]], i32 0
+; CHECK-NEXT:    [[BB:%.*]] = icmp eq i32 [[EXT]], 0
+; CHECK-NEXT:    ret i1 [[BB]]
+;
+  %ext = extractelement <vscale x 2 x i32> %d, i32 0
+  %bb = icmp eq i32 %ext, 0
+  ret i1 %bb
+}
+
+; OSS-Fuzz #25272
+; https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=25272
+define i32 @ossfuzz_25272(float %f) {
+; CHECK-LABEL: @ossfuzz_25272(
+; CHECK-NEXT:    [[VEC_FLOAT:%.*]] = insertelement <vscale x 4 x float> poison, float [[F:%.*]], i32 0
+; CHECK-NEXT:    [[VEC_INT:%.*]] = bitcast <vscale x 4 x float> [[VEC_FLOAT]] to <vscale x 4 x i32>
+; CHECK-NEXT:    [[E:%.*]] = extractelement <vscale x 4 x i32> [[VEC_INT]], i32 2147483647
+; CHECK-NEXT:    ret i32 [[E]]
+;
+  %vec_float = insertelement <vscale x 4 x float> poison, float %f, i32 0
+  %vec_int = bitcast <vscale x 4 x float> %vec_float to <vscale x 4 x i32>
+  %E = extractelement <vscale x 4 x i32> %vec_int, i32 2147483647
+  ret i32 %E
+}
--- a/llvm/test/Transforms/InstCombine/vscale_insertelement-inseltpoison.ll
+++ b/llvm/test/Transforms/InstCombine/vscale_insertelement-inseltpoison.ll
@ -0,0 +1,102 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -instcombine < %s | FileCheck %s
+
+; This test checks that bitcast is moved after insertelement when both vector and scalar are
+; bitcast from the same element type.
+; inselt (bitcast VecSrc), (bitcast ScalarSrc), IdxOp
+;  --> bitcast (inselt VecSrc, ScalarSrc, IdxOp)
+define <vscale x 4 x float> @insertelement_bitcast(<vscale x 4 x i32> %vec, i32 %x) {
+; CHECK-LABEL: @insertelement_bitcast(
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <vscale x 4 x i32> [[VEC:%.*]], i32 [[X:%.*]], i32 0
+; CHECK-NEXT:    [[R:%.*]] = bitcast <vscale x 4 x i32> [[TMP1]] to <vscale x 4 x float>
+; CHECK-NEXT:    ret <vscale x 4 x float> [[R]]
+;
+  %x_cast = bitcast i32 %x to float
+  %vec_cast = bitcast <vscale x 4 x i32> %vec to <vscale x 4 x float>
+  %r = insertelement <vscale x 4 x float> %vec_cast, float %x_cast, i32 0
+  ret <vscale x 4 x float> %r
+}
+
+; This test checks that code-path "Try to form a shuffle from a chain of extract-insert ops" is
+; not taken when both extract and insert are scalable type.
+; For scalable type, the vector length needed to create shuffle mask is not a compile-time constant.
+; Meanwhile, for scalable type shuffle mask only support splat and undef in the current code base.
+; Otherwise we crash at:
+; "Assertion `isValidOperands(V1, V2, Mask) && "Invalid shuffle vector instruction operands!"' failed."
+define <vscale x 4 x i32> @insertelement_extractelement(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) {
+; CHECK-LABEL: @insertelement_extractelement(
+; CHECK-NEXT:    [[T0:%.*]] = extractelement <vscale x 4 x i32> [[A:%.*]], i32 1
+; CHECK-NEXT:    [[T1:%.*]] = insertelement <vscale x 4 x i32> [[B:%.*]], i32 [[T0]], i32 0
+; CHECK-NEXT:    ret <vscale x 4 x i32> [[T1]]
+;
+  %t0 = extractelement <vscale x 4 x i32> %a, i32 1
+  %t1 = insertelement <vscale x 4 x i32> %b, i32 %t0, i32 0
+  ret <vscale x 4 x i32> %t1
+}
+
+; This test checks that we are not attempting to create a shuffle from extract/insert chain,
+; when extract is from a scalable type, and the insert vector is fixed-length.
+define <4 x i32> @insertelement_extractelement_fixed_vec_extract_from_scalable(<vscale x 4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: @insertelement_extractelement_fixed_vec_extract_from_scalable(
+; CHECK-NEXT:    [[T0:%.*]] = extractelement <vscale x 4 x i32> [[A:%.*]], i32 1
+; CHECK-NEXT:    [[T1:%.*]] = insertelement <4 x i32> [[B:%.*]], i32 [[T0]], i32 0
+; CHECK-NEXT:    ret <4 x i32> [[T1]]
+;
+  %t0 = extractelement <vscale x 4 x i32> %a, i32 1
+  %t1 = insertelement <4 x i32> %b, i32 %t0, i32 0
+  ret <4 x i32> %t1
+}
+
+; This test checks that the optimization "foldConstantInsEltInfoShuffle" is not taken for scalable type.
+; Particularly the fold:
+; insertelt (insertelt X, C1, CIndex1), C, CIndex
+;  --> shufflevector X, CVec', Mask'
+; For scalable type, the vector length needed to create shuffle mask is not a compile-time constant.
+; Meanwhile, for scalable type shuffle mask only support splat and undef in the current code base.
+; Otherwise we crash at:
+; "Assertion `isValidOperands(V1, V2, Mask) && "Invalid shuffle vector instruction operands!"' failed."
+define <vscale x 4 x i32> @insertelement_insertelement(<vscale x 4 x i32> %vec) {
+; CHECK-LABEL: @insertelement_insertelement(
+; CHECK-NEXT:    [[T0:%.*]] = insertelement <vscale x 4 x i32> [[VEC:%.*]], i32 1, i32 1
+; CHECK-NEXT:    [[T1:%.*]] = insertelement <vscale x 4 x i32> [[T0]], i32 2, i32 2
+; CHECK-NEXT:    ret <vscale x 4 x i32> [[T1]]
+;
+  %t0 = insertelement <vscale x 4 x i32> %vec, i32 1, i32 1
+  %t1 = insertelement <vscale x 4 x i32> %t0, i32 2, i32 2
+  ret <vscale x 4 x i32> %t1
+}
+
+; This test checks that the following insertelement sequence is not folded into shuffle splat.
+; The length of scalable vector is unknown at compile-time. Therefore the following insertelements
+; may not form a valid splat.
+define <vscale x 4 x float> @insertelement_sequene_may_not_be_splat(float %x) {
+; CHECK-LABEL: @insertelement_sequene_may_not_be_splat(
+; CHECK-NEXT:    [[T0:%.*]] = insertelement <vscale x 4 x float> poison, float [[X:%.*]], i32 0
+; CHECK-NEXT:    [[T1:%.*]] = insertelement <vscale x 4 x float> [[T0]], float [[X]], i32 1
+; CHECK-NEXT:    [[T2:%.*]] = insertelement <vscale x 4 x float> [[T1]], float [[X]], i32 2
+; CHECK-NEXT:    [[T3:%.*]] = insertelement <vscale x 4 x float> [[T2]], float [[X]], i32 3
+; CHECK-NEXT:    ret <vscale x 4 x float> [[T3]]
+;
+  %t0 = insertelement <vscale x 4 x float> poison, float %x, i32 0
+  %t1 = insertelement <vscale x 4 x float> %t0, float %x, i32 1
+  %t2 = insertelement <vscale x 4 x float> %t1, float %x, i32 2
+  %t3 = insertelement <vscale x 4 x float> %t2, float %x, i32 3
+  ret <vscale x 4 x float> %t3
+}
+
+; OSS-Fuzz #27416
+; https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=27416
+define void @ossfuzz_27416(i32 %v) {
+; CHECK-LABEL: @ossfuzz_27416(
+; CHECK-NEXT:    [[IN:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[V:%.*]], i32 0
+; CHECK-NEXT:    [[SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[IN]], <vscale x 4 x i32> undef, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT:    [[I1:%.*]] = insertelement <vscale x 4 x i32> [[SPLAT]], i32 undef, i8 -128
+; CHECK-NEXT:    store <vscale x 4 x i32> [[I1]], <vscale x 4 x i32>* undef, align 16
+; CHECK-NEXT:    ret void
+;
+  %in = insertelement <vscale x 4 x i32> poison, i32 %v, i32 0
+  %splat = shufflevector <vscale x 4 x i32> %in, <vscale x 4 x i32> undef, <vscale x 4 x i32> zeroinitializer
+  %I1 = insertelement <vscale x 4 x i32> %splat, i32 undef, i8 -128
+  store <vscale x 4 x i32> %I1, <vscale x 4 x i32>* undef, align 16
+  ret void
+}
--- a/llvm/test/Transforms/InstSimplify/ConstProp/InsertElement-inseltpoison.ll
+++ b/llvm/test/Transforms/InstSimplify/ConstProp/InsertElement-inseltpoison.ll
@ -0,0 +1,52 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -instsimplify -S | FileCheck %s
+
+define i32 @test1() {
+; CHECK-LABEL: @test1(
+; CHECK-NEXT:    ret i32 2139171423
+;
+  %A = bitcast i32 2139171423 to float
+  %B = insertelement <1 x float> poison, float %A, i32 0
+  %C = extractelement <1 x float> %B, i32 0
+  %D = bitcast float %C to i32
+  ret i32 %D
+}
+
+define <4 x i64> @insertelement() {
+; CHECK-LABEL: @insertelement(
+; CHECK-NEXT:    ret <4 x i64> <i64 -1, i64 -2, i64 -3, i64 -4>
+;
+  %vec1 = insertelement <4 x i64> poison, i64 -1, i32 0
+  %vec2 = insertelement <4 x i64> %vec1, i64 -2, i32 1
+  %vec3 = insertelement <4 x i64> %vec2, i64 -3, i32 2
+  %vec4 = insertelement <4 x i64> %vec3, i64 -4, i32 3
+  ret <4 x i64> %vec4
+}
+
+define <4 x i64> @insertelement_undef() {
+; CHECK-LABEL: @insertelement_undef(
+; CHECK-NEXT:    ret <4 x i64> poison
+;
+  %vec1 = insertelement <4 x i64> poison, i64 -1, i32 0
+  %vec2 = insertelement <4 x i64> %vec1, i64 -2, i32 1
+  %vec3 = insertelement <4 x i64> %vec2, i64 -3, i32 2
+  %vec4 = insertelement <4 x i64> %vec3, i64 -4, i32 3
+  %vec5 = insertelement <4 x i64> %vec3, i64 -5, i32 4
+  ret <4 x i64> %vec5
+}
+
+define i64 @extract_undef_index_from_zero_vec() {
+; CHECK-LABEL: @extract_undef_index_from_zero_vec(
+; CHECK-NEXT:    ret i64 poison
+;
+  %E = extractelement <2 x i64> zeroinitializer, i64 undef
+  ret i64 %E
+}
+
+define i64 @extract_undef_index_from_nonzero_vec() {
+; CHECK-LABEL: @extract_undef_index_from_nonzero_vec(
+; CHECK-NEXT:    ret i64 poison
+;
+  %E = extractelement <2 x i64> <i64 -1, i64 -1>, i64 undef
+  ret i64 %E
+}
--- a/llvm/test/Transforms/InstSimplify/ConstProp/vscale-inseltpoison.ll
+++ b/llvm/test/Transforms/InstSimplify/ConstProp/vscale-inseltpoison.ll
@ -0,0 +1,301 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -instsimplify -S -verify | FileCheck %s
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Unary Operations
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+define <vscale x 2 x double> @fneg(<vscale x 2 x double> %val) {
+; CHECK-LABEL: @fneg(
+; CHECK-NEXT:    ret <vscale x 2 x double> undef
+;
+  %r = fneg <vscale x 2 x double> undef
+  ret <vscale x 2 x double> %r
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Binary Operations
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+define <vscale x 4 x i32> @add() {
+; CHECK-LABEL: @add(
+; CHECK-NEXT:    ret <vscale x 4 x i32> undef
+;
+  %r = add <vscale x 4 x i32> undef, undef
+  ret <vscale x 4 x i32> %r
+}
+
+define <vscale x 4 x float> @fadd() {
+; CHECK-LABEL: @fadd(
+; CHECK-NEXT:    ret <vscale x 4 x float> undef
+;
+  %r = fadd <vscale x 4 x float> undef, undef
+  ret <vscale x 4 x float> %r
+}
+
+define <vscale x 4 x i32> @sub() {
+; CHECK-LABEL: @sub(
+; CHECK-NEXT:    ret <vscale x 4 x i32> undef
+;
+  %r = sub <vscale x 4 x i32> undef, undef
+  ret <vscale x 4 x i32> %r
+}
+
+define <vscale x 4 x i32> @sub_splat() {
+; CHECK-LABEL: @sub_splat(
+; CHECK-NEXT:    ret <vscale x 4 x i32> shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> undef, i32 -16, i32 0), <vscale x 4 x i32> undef, <vscale x 4 x i32> zeroinitializer)
+;
+  %r = sub <vscale x 4 x i32> zeroinitializer, shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> undef, i32 16, i32 0), <vscale x 4 x i32> undef, <vscale x 4 x i32> zeroinitializer)
+  ret <vscale x 4 x i32> %r
+}
+
+define <vscale x 4 x float> @fsub() {
+; CHECK-LABEL: @fsub(
+; CHECK-NEXT:    ret <vscale x 4 x float> undef
+;
+  %r = fsub <vscale x 4 x float> undef, undef
+  ret <vscale x 4 x float> %r
+}
+
+define <vscale x 4 x i32> @mul() {
+; CHECK-LABEL: @mul(
+; CHECK-NEXT:    ret <vscale x 4 x i32> undef
+;
+  %r = mul <vscale x 4 x i32> undef, undef
+  ret <vscale x 4 x i32> %r
+}
+
+define <vscale x 4 x float> @fmul() {
+; CHECK-LABEL: @fmul(
+; CHECK-NEXT:    ret <vscale x 4 x float> undef
+;
+  %r = fmul <vscale x 4 x float> undef, undef
+  ret <vscale x 4 x float> %r
+}
+
+define <vscale x 4 x i32> @udiv() {
+; CHECK-LABEL: @udiv(
+; CHECK-NEXT:    ret <vscale x 4 x i32> poison
+;
+  %r = udiv <vscale x 4 x i32> undef, undef
+  ret <vscale x 4 x i32> %r
+}
+
+define <vscale x 4 x i32> @udiv_splat_zero() {
+; CHECK-LABEL: @udiv_splat_zero(
+; CHECK-NEXT:    ret <vscale x 4 x i32> poison
+;
+  %r = udiv <vscale x 4 x i32> zeroinitializer, zeroinitializer
+  ret <vscale x 4 x i32> %r
+}
+
+define <vscale x 4 x i32> @sdiv() {
+; CHECK-LABEL: @sdiv(
+; CHECK-NEXT:    ret <vscale x 4 x i32> poison
+;
+  %r = sdiv <vscale x 4 x i32> undef, undef
+  ret <vscale x 4 x i32> %r
+}
+
+define <vscale x 4 x float> @fdiv() {
+; CHECK-LABEL: @fdiv(
+; CHECK-NEXT:    ret <vscale x 4 x float> undef
+;
+  %r = fdiv <vscale x 4 x float> undef, undef
+  ret <vscale x 4 x float> %r
+}
+
+define <vscale x 4 x i32> @urem() {
+; CHECK-LABEL: @urem(
+; CHECK-NEXT:    ret <vscale x 4 x i32> poison
+;
+  %r = urem <vscale x 4 x i32> undef, undef
+  ret <vscale x 4 x i32> %r
+}
+
+define <vscale x 4 x i32> @srem() {
+; CHECK-LABEL: @srem(
+; CHECK-NEXT:    ret <vscale x 4 x i32> poison
+;
+  %r = srem <vscale x 4 x i32> undef, undef
+  ret <vscale x 4 x i32> %r
+}
+
+define <vscale x 4 x float> @frem() {
+; CHECK-LABEL: @frem(
+; CHECK-NEXT:    ret <vscale x 4 x float> undef
+;
+  %r = frem <vscale x 4 x float> undef, undef
+  ret <vscale x 4 x float> %r
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Bitwise Binary Operations
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+define <vscale x 4 x i32> @shl() {
+; CHECK-LABEL: @shl(
+; CHECK-NEXT:    ret <vscale x 4 x i32> poison
+;
+  %r = shl <vscale x 4 x i32> undef, undef
+  ret <vscale x 4 x i32> %r
+}
+
+define <vscale x 4 x i32> @lshr() {
+; CHECK-LABEL: @lshr(
+; CHECK-NEXT:    ret <vscale x 4 x i32> poison
+;
+  %r = lshr <vscale x 4 x i32> undef, undef
+  ret <vscale x 4 x i32> %r
+}
+
+define <vscale x 4 x i32> @ashr() {
+; CHECK-LABEL: @ashr(
+; CHECK-NEXT:    ret <vscale x 4 x i32> poison
+;
+  %r = ashr <vscale x 4 x i32> undef, undef
+  ret <vscale x 4 x i32> %r
+}
+
+define <vscale x 4 x i32> @and() {
+; CHECK-LABEL: @and(
+; CHECK-NEXT:    ret <vscale x 4 x i32> undef
+;
+  %r = and <vscale x 4 x i32> undef, undef
+  ret <vscale x 4 x i32> %r
+}
+
+define <vscale x 4 x i32> @or() {
+; CHECK-LABEL: @or(
+; CHECK-NEXT:    ret <vscale x 4 x i32> undef
+;
+  %r = or <vscale x 4 x i32> undef, undef
+  ret <vscale x 4 x i32> %r
+}
+
+define <vscale x 4 x i32> @xor() {
+; CHECK-LABEL: @xor(
+; CHECK-NEXT:    ret <vscale x 4 x i32> zeroinitializer
+;
+  %r = xor <vscale x 4 x i32> undef, undef
+  ret <vscale x 4 x i32> %r
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Vector Operations
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+define <vscale x 4 x i32> @insertelement() {
+; CHECK-LABEL: @insertelement(
+; CHECK-NEXT:    ret <vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i32 0)
+;
+  %i = insertelement <vscale x 4 x i32> poison, i32 1, i32 0
+  ret <vscale x 4 x i32> %i
+}
+
+define <vscale x 4 x i32> @shufflevector() {
+; CHECK-LABEL: @shufflevector(
+; CHECK-NEXT:    ret <vscale x 4 x i32> shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i32 0), <vscale x 4 x i32> undef, <vscale x 4 x i32> zeroinitializer)
+;
+  %i = insertelement <vscale x 4 x i32> poison, i32 1, i32 0
+  %i2 = shufflevector <vscale x 4 x i32> %i, <vscale x 4 x i32> undef, <vscale x 4 x i32> zeroinitializer
+  ret <vscale x 4 x i32> %i2
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Memory Access and Addressing Operations
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+define <vscale x 2 x double> @load() {
+; CHECK-LABEL: @load(
+; CHECK-NEXT:    [[R:%.*]] = load <vscale x 2 x double>, <vscale x 2 x double>* getelementptr (<vscale x 2 x double>, <vscale x 2 x double>* null, i64 1), align 16
+; CHECK-NEXT:    ret <vscale x 2 x double> [[R]]
+;
+  %r = load <vscale x 2 x double>, <vscale x 2 x double>* getelementptr (<vscale x 2 x double>, <vscale x 2 x double>* null, i64 1)
+  ret <vscale x 2 x double> %r
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Conversion Operations
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+define <vscale x 4 x float> @bitcast() {
+; CHECK-LABEL: @bitcast(
+; CHECK-NEXT:    ret <vscale x 4 x float> bitcast (<vscale x 4 x i32> shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i32 0), <vscale x 4 x i32> undef, <vscale x 4 x i32> zeroinitializer) to <vscale x 4 x float>)
+;
+  %i1 = insertelement <vscale x 4 x i32> poison, i32 1, i32 0
+  %i2 = shufflevector <vscale x 4 x i32> %i1, <vscale x 4 x i32> undef, <vscale x 4 x i32> zeroinitializer
+  %i3 = bitcast <vscale x 4 x i32> %i2 to <vscale x 4 x float>
+  ret <vscale x 4 x float> %i3
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Other Operations
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+define <vscale x 4 x i32> @select() {
+; CHECK-LABEL: @select(
+; CHECK-NEXT:    ret <vscale x 4 x i32> undef
+;
+  %r = select <vscale x 4 x i1> undef, <vscale x 4 x i32> zeroinitializer, <vscale x 4 x i32> undef
+  ret <vscale x 4 x i32> %r
+}
+
+declare <vscale x 16 x i8> @llvm.something(<vscale x 16 x i8>, <vscale x 16 x i8>)
+
+define <vscale x 16 x i8> @call() {
+; CHECK-LABEL: @call(
+; CHECK-NEXT:    [[R:%.*]] = call <vscale x 16 x i8> @llvm.something(<vscale x 16 x i8> undef, <vscale x 16 x i8> undef)
+; CHECK-NEXT:    ret <vscale x 16 x i8> [[R]]
+;
+  %r =  call <vscale x 16 x i8> @llvm.something(<vscale x 16 x i8> undef, <vscale x 16 x i8> undef)
+  ret <vscale x 16 x i8> %r
+}
+
+define <vscale x 4 x i1> @icmp_undef() {
+; CHECK-LABEL: @icmp_undef(
+; CHECK-NEXT:    ret <vscale x 4 x i1> undef
+;
+  %r = icmp eq <vscale x 4 x i32> undef, undef
+  ret <vscale x 4 x i1> %r
+}
+
+define <vscale x 4 x i1> @icmp_zero() {
+; CHECK-LABEL: @icmp_zero(
+; CHECK-NEXT:    ret <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> undef, i1 true, i32 0), <vscale x 4 x i1> undef, <vscale x 4 x i32> zeroinitializer)
+;
+  %r = icmp eq <vscale x 4 x i32> zeroinitializer, zeroinitializer
+  ret <vscale x 4 x i1> %r
+}
+
+define <vscale x 4 x i1> @fcmp_true() {
+; CHECK-LABEL: @fcmp_true(
+; CHECK-NEXT:    ret <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> undef, i1 true, i32 0), <vscale x 4 x i1> undef, <vscale x 4 x i32> zeroinitializer)
+;
+  %r = fcmp true <vscale x 4 x float> undef, undef
+  ret <vscale x 4 x i1> %r
+}
+
+define <vscale x 4 x i1> @fcmp_false() {
+; CHECK-LABEL: @fcmp_false(
+; CHECK-NEXT:    ret <vscale x 4 x i1> zeroinitializer
+;
+  %r = fcmp false <vscale x 4 x float> undef, undef
+  ret <vscale x 4 x i1> %r
+}
+
+define <vscale x 4 x i1> @fcmp_undef() {
+; CHECK-LABEL: @fcmp_undef(
+; CHECK-NEXT:    ret <vscale x 4 x i1> undef
+;
+  %r = icmp ne <vscale x 4 x i32> undef, undef
+  ret <vscale x 4 x i1> %r
+}
+
+define <vscale x 4 x i1> @fcmp_not_equality() {
+; CHECK-LABEL: @fcmp_not_equality(
+; CHECK-NEXT:    ret <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> undef, i1 true, i32 0), <vscale x 4 x i1> undef, <vscale x 4 x i32> zeroinitializer)
+;
+  %r = icmp ule <vscale x 4 x i32> undef, zeroinitializer
+  ret <vscale x 4 x i1> %r
+}
--- a/llvm/test/Transforms/InstSimplify/ConstProp/vscale-shufflevector-inseltpoison.ll
+++ b/llvm/test/Transforms/InstSimplify/ConstProp/vscale-shufflevector-inseltpoison.ll
@ -0,0 +1,39 @@
+; RUN: opt -early-cse -earlycse-debug-hash -S < %s | FileCheck %s
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64"
+
+; This test checks that SimplifyInstruction does not blow up in the face of
+; a scalable shufflevector. vscale is a constant value known only at runtime.
+; Therefore, it is not possible to know the concrete value of, or the length
+; of the mask at compile time. Simplifications that depend on the value
+; of the mask cannot be performed.
+
+; Given the fact that the value of the mask is unknown at compile time for
+; scalable vectors, very few simplifications will be done. Here, we want to
+; see that the instruction can be passed to SimplifyInstruction and not crash
+; the compiler. It happens to be the case that this will be the result.
+
+; CHECK-LABEL: define <vscale x 8 x i1> @vscale_version()
+; CHECK-NEXT: ret <vscale x 8 x i1> shufflevector (<vscale x 8 x i1> insertelement (<vscale x 8 x i1> poison, i1 true, i32 0), <vscale x 8 x i1> undef, <vscale x 8 x i32> zeroinitializer)
+
+define <vscale x 8 x i1> @vscale_version() {
+  %splatter = insertelement <vscale x 8 x i1> poison, i1 true, i32 0
+  %foo = shufflevector <vscale x 8 x i1> %splatter,
+                       <vscale x 8 x i1> undef,
+                       <vscale x 8 x i32> zeroinitializer
+  ret <vscale x 8 x i1> %foo
+}
+
+; The non-scalable version should be optimized as normal.
+
+; CHECK-LABEL: define <8 x i1> @fixed_length_version() {
+; CHECK-NEXT:  ret <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>
+define <8 x i1> @fixed_length_version() {
+  %splatter = insertelement <8 x i1> poison, i1 true, i32 0
+  %foo = shufflevector <8 x i1> %splatter,
+                       <8 x i1> undef,
+                       <8 x i32> zeroinitializer
+  ret <8 x i1> %foo
+}
+
--- a/llvm/test/Transforms/InstSimplify/select-inseltpoison.ll
+++ b/llvm/test/Transforms/InstSimplify/select-inseltpoison.ll
--- a/llvm/test/Transforms/InstSimplify/vscale-inseltpoison.ll
+++ b/llvm/test/Transforms/InstSimplify/vscale-inseltpoison.ll
@ -0,0 +1,199 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -instsimplify -S -verify | FileCheck %s
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Vector Operations
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; insertelement
+
+define <vscale x 4 x i32> @insertelement_idx_undef(<vscale x 4 x i32> %a) {
+; CHECK-LABEL: @insertelement_idx_undef(
+; CHECK-NEXT:    ret <vscale x 4 x i32> undef
+;
+  %r = insertelement <vscale x 4 x i32> %a, i32 5, i64 undef
+  ret <vscale x 4 x i32> %r
+}
+
+define <vscale x 4 x i32> @insertelement_value_undef(<vscale x 4 x i32> %a) {
+; CHECK-LABEL: @insertelement_value_undef(
+; CHECK-NEXT:    [[R:%.*]] = insertelement <vscale x 4 x i32> [[A:%.*]], i32 undef, i64 0
+; CHECK-NEXT:    ret <vscale x 4 x i32> [[R]]
+;
+  %r = insertelement <vscale x 4 x i32> %a, i32 undef, i64 0
+  ret <vscale x 4 x i32> %r
+}
+
+define <vscale x 4 x i32> @insertelement_idx_maybe_out_of_bound(<vscale x 4 x i32> %a) {
+; CHECK-LABEL: @insertelement_idx_maybe_out_of_bound(
+; CHECK-NEXT:    [[R:%.*]] = insertelement <vscale x 4 x i32> [[A:%.*]], i32 5, i64 4
+; CHECK-NEXT:    ret <vscale x 4 x i32> [[R]]
+;
+  %r = insertelement <vscale x 4 x i32> %a, i32 5, i64 4
+  ret <vscale x 4 x i32> %r
+}
+
+define <vscale x 4 x i32> @insertelement_idx_large_bound(<vscale x 4 x i32> %a) {
+; CHECK-LABEL: @insertelement_idx_large_bound(
+; CHECK-NEXT:    [[R:%.*]] = insertelement <vscale x 4 x i32> [[A:%.*]], i32 5, i64 12345
+; CHECK-NEXT:    ret <vscale x 4 x i32> [[R]]
+;
+  %r = insertelement <vscale x 4 x i32> %a, i32 5, i64 12345
+  ret <vscale x 4 x i32> %r
+}
+
+define <vscale x 4 x i32> @insert_extract_element_same_vec_idx_1(<vscale x 4 x i32> %a) {
+; CHECK-LABEL: @insert_extract_element_same_vec_idx_1(
+; CHECK-NEXT:    ret <vscale x 4 x i32> [[A:%.*]]
+;
+  %v = extractelement <vscale x 4 x i32> %a, i64 1
+  %r = insertelement <vscale x 4 x i32> %a, i32 %v, i64 1
+  ret <vscale x 4 x i32> %r
+}
+
+define <vscale x 4 x i32> @insertelement_inline_to_ret() {
+; CHECK-LABEL: @insertelement_inline_to_ret(
+; CHECK-NEXT:    ret <vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i32 0)
+;
+  %i = insertelement <vscale x 4 x i32> poison, i32 1, i32 0
+  ret <vscale x 4 x i32> %i
+}
+
+define <vscale x 4 x i32> @insertelement_shufflevector_inline_to_ret() {
+; CHECK-LABEL: @insertelement_shufflevector_inline_to_ret(
+; CHECK-NEXT:    ret <vscale x 4 x i32> shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i32 0), <vscale x 4 x i32> undef, <vscale x 4 x i32> zeroinitializer)
+;
+  %i = insertelement <vscale x 4 x i32> poison, i32 1, i32 0
+  %i2 = shufflevector <vscale x 4 x i32> %i, <vscale x 4 x i32> undef, <vscale x 4 x i32> zeroinitializer
+  ret <vscale x 4 x i32> %i2
+}
+
+; extractelement
+
+define i32 @extractelement_idx_undef(<vscale x 4 x i32> %a) {
+; CHECK-LABEL: @extractelement_idx_undef(
+; CHECK-NEXT:    ret i32 undef
+;
+  %r = extractelement <vscale x 4 x i32> %a, i64 undef
+  ret i32 %r
+}
+
+define i32 @extractelement_vec_undef(<vscale x 4 x i32> %a) {
+; CHECK-LABEL: @extractelement_vec_undef(
+; CHECK-NEXT:    ret i32 undef
+;
+  %r = extractelement <vscale x 4 x i32> undef, i64 1
+  ret i32 %r
+}
+
+define i32 @extractelement_idx_maybe_out_of_bound(<vscale x 4 x i32> %a) {
+; CHECK-LABEL: @extractelement_idx_maybe_out_of_bound(
+; CHECK-NEXT:    [[R:%.*]] = extractelement <vscale x 4 x i32> [[A:%.*]], i64 4
+; CHECK-NEXT:    ret i32 [[R]]
+;
+  %r = extractelement <vscale x 4 x i32> %a, i64 4
+  ret i32 %r
+}
+define i32 @extractelement_idx_large_bound(<vscale x 4 x i32> %a) {
+; CHECK-LABEL: @extractelement_idx_large_bound(
+; CHECK-NEXT:    [[R:%.*]] = extractelement <vscale x 4 x i32> [[A:%.*]], i64 12345
+; CHECK-NEXT:    ret i32 [[R]]
+;
+  %r = extractelement <vscale x 4 x i32> %a, i64 12345
+  ret i32 %r
+}
+
+define i32 @insert_extract_element_same_vec_idx_2() {
+; CHECK-LABEL: @insert_extract_element_same_vec_idx_2(
+; CHECK-NEXT:    ret i32 1
+;
+  %v = insertelement <vscale x 4 x i32> poison, i32 1, i64 4
+  %r = extractelement <vscale x 4 x i32> %v, i64 4
+  ret i32 %r
+}
+
+define i32 @insert_extract_element_same_vec_idx_3() {
+; CHECK-LABEL: @insert_extract_element_same_vec_idx_3(
+; CHECK-NEXT:    ret i32 1
+;
+  %r = extractelement <vscale x 4 x i32> insertelement (<vscale x 4 x i32> undef, i32 1, i64 4), i64 4
+  ret i32 %r
+}
+
+define i32 @insert_extract_element_same_vec_idx_4() {
+; CHECK-LABEL: @insert_extract_element_same_vec_idx_4(
+; CHECK-NEXT:    ret i32 1
+;
+  %r = extractelement <vscale x 4 x i32> insertelement (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> undef, i32 1, i32 4), i32 2, i64 3), i64 4
+  ret i32 %r
+}
+
+; more complicated expressions
+
+define <vscale x 2 x i1> @cmp_le_smax_always_true(<vscale x 2 x i64> %x) {
+; CHECK-LABEL: @cmp_le_smax_always_true(
+; CHECK-NEXT:    ret <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> undef, i1 true, i32 0), <vscale x 2 x i1> undef, <vscale x 2 x i32> zeroinitializer)
+;
+  %cmp = icmp sle <vscale x 2 x i64> %x, shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> undef, i64 9223372036854775807, i32 0), <vscale x 2 x i64> undef, <vscale x 2 x i32> zeroinitializer)
+  ret <vscale x 2 x i1> %cmp
+}
+
+define <vscale x 4 x float> @bitcast() {
+; CHECK-LABEL: @bitcast(
+; CHECK-NEXT:    ret <vscale x 4 x float> bitcast (<vscale x 4 x i32> shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i32 0), <vscale x 4 x i32> undef, <vscale x 4 x i32> zeroinitializer) to <vscale x 4 x float>)
+;
+  %i1 = insertelement <vscale x 4 x i32> poison, i32 1, i32 0
+  %i2 = shufflevector <vscale x 4 x i32> %i1, <vscale x 4 x i32> undef, <vscale x 4 x i32> zeroinitializer
+  %i3 = bitcast <vscale x 4 x i32> %i2 to <vscale x 4 x float>
+  ret <vscale x 4 x float> %i3
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Memory Access and Addressing Operations
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; getelementptr
+
+define <vscale x 4 x i32*> @getelementptr_constant_foldable_1() {
+; CHECK-LABEL: @getelementptr_constant_foldable_1(
+; CHECK-NEXT:    ret <vscale x 4 x i32*> zeroinitializer
+;
+  %ptr = getelementptr i32, <vscale x 4 x i32*> zeroinitializer, <vscale x 4 x i64> undef
+  ret <vscale x 4 x i32*> %ptr
+}
+
+define <vscale x 4 x <vscale x 4 x i32>*> @getelementptr_constant_foldable_2() {
+; CHECK-LABEL: @getelementptr_constant_foldable_2(
+; CHECK-NEXT:    ret <vscale x 4 x <vscale x 4 x i32>*> zeroinitializer
+;
+  %ptr = getelementptr <vscale x 4 x i32>, <vscale x 4 x i32>* null, <vscale x 4 x i64> undef
+  ret <vscale x 4 x <vscale x 4 x i32>*> %ptr
+}
+
+; fold getelementptr P, 0 -> P.
+define <vscale x 4 x i32>* @getelementptr_constant_foldable_3() {
+; CHECK-LABEL: @getelementptr_constant_foldable_3(
+; CHECK-NEXT:    ret <vscale x 4 x i32>* null
+;
+  %ptr = getelementptr <vscale x 4 x i32>, <vscale x 4 x i32>* null, i64 0
+  ret <vscale x 4 x i32>* %ptr
+}
+
+define <vscale x 4 x i32>* @getelementptr_not_constant_foldable(i64 %x) {
+; CHECK-LABEL: @getelementptr_not_constant_foldable(
+; CHECK-NEXT:    [[PTR:%.*]] = getelementptr <vscale x 4 x i32>, <vscale x 4 x i32>* null, i64 [[X:%.*]]
+; CHECK-NEXT:    ret <vscale x 4 x i32>* [[PTR]]
+;
+  %ptr = getelementptr <vscale x 4 x i32>, <vscale x 4 x i32>* null, i64 %x
+  ret <vscale x 4 x i32>* %ptr
+}
+
+; Check GEP's result is known to be non-null.
+define i1 @getelementptr_check_non_null(<vscale x 16 x i8>* %ptr) {
+; CHECK-LABEL: @getelementptr_check_non_null(
+; CHECK-NEXT:    ret i1 false
+;
+  %x = getelementptr inbounds <vscale x 16 x i8>, <vscale x 16 x i8>* %ptr, i32 1
+  %cmp = icmp eq <vscale x 16 x i8>* %x, null
+  ret i1 %cmp
+}
--- a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/selects-inseltpoison.ll
+++ b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/selects-inseltpoison.ll
@ -0,0 +1,95 @@
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -load-store-vectorizer -dce -S -o - %s | FileCheck %s
+
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"
+
+define void @base_case(i1 %cnd, i32 addrspace(1)* %a, i32 addrspace(1)* %b, <3 x i32> addrspace(1)* %out) {
+; CHECK-LABEL: @base_case
+; CHECK: load <3 x i32>
+entry:
+  %gep1 = getelementptr inbounds i32, i32 addrspace(1)* %a, i64 1
+  %gep2 = getelementptr inbounds i32, i32 addrspace(1)* %a, i64 2
+  %gep4 = getelementptr inbounds i32, i32 addrspace(1)* %b, i64 1
+  %gep5 = getelementptr inbounds i32, i32 addrspace(1)* %b, i64 2
+  %selected = select i1 %cnd, i32 addrspace(1)* %a, i32 addrspace(1)* %b
+  %selected14 = select i1 %cnd, i32 addrspace(1)* %gep1, i32 addrspace(1)* %gep4
+  %selected25 = select i1 %cnd, i32 addrspace(1)* %gep2, i32 addrspace(1)* %gep5
+  %val0 = load i32, i32 addrspace(1)* %selected, align 4
+  %val1 = load i32, i32 addrspace(1)* %selected14, align 4
+  %val2 = load i32, i32 addrspace(1)* %selected25, align 4
+  %t0 = insertelement <3 x i32> poison, i32 %val0, i32 0
+  %t1 = insertelement <3 x i32> %t0, i32 %val1, i32 1
+  %t2 = insertelement <3 x i32> %t1, i32 %val2, i32 2
+  store <3 x i32> %t2, <3 x i32> addrspace(1)* %out
+  ret void
+}
+
+define void @scev_targeting_complex_case(i1 %cnd, i32 addrspace(1)* %a, i32 addrspace(1)* %b, i32 %base, <2 x i32> addrspace(1)* %out) {
+; CHECK-LABEL: @scev_targeting_complex_case
+; CHECK: load <2 x i32>
+entry:
+  %base.x4 = shl i32 %base, 2
+  %base.x4.p1 = add i32 %base.x4, 1
+  %base.x4.p2 = add i32 %base.x4, 2
+  %base.x4.p3 = add i32 %base.x4, 3
+  %zext.x4 = zext i32 %base.x4 to i64
+  %zext.x4.p1 = zext i32 %base.x4.p1 to i64
+  %zext.x4.p2 = zext i32 %base.x4.p2 to i64
+  %zext.x4.p3 = zext i32 %base.x4.p3 to i64
+  %base.x16 = mul i64 %zext.x4, 4
+  %base.x16.p4 = shl i64 %zext.x4.p1, 2
+  %base.x16.p8 = shl i64 %zext.x4.p2, 2
+  %base.x16.p12 = mul i64 %zext.x4.p3, 4
+  %a.pi8 = bitcast i32 addrspace(1)* %a to i8 addrspace(1)*
+  %b.pi8 = bitcast i32 addrspace(1)* %b to i8 addrspace(1)*
+  %gep.a.base.x16 = getelementptr inbounds i8, i8 addrspace(1)* %a.pi8, i64 %base.x16
+  %gep.b.base.x16.p4 = getelementptr inbounds i8, i8 addrspace(1)* %b.pi8, i64 %base.x16.p4
+  %gep.a.base.x16.p8 = getelementptr inbounds i8, i8 addrspace(1)* %a.pi8, i64 %base.x16.p8
+  %gep.b.base.x16.p12 = getelementptr inbounds i8, i8 addrspace(1)* %b.pi8, i64 %base.x16.p12
+  %a.base.x16 = bitcast i8 addrspace(1)* %gep.a.base.x16 to i32 addrspace(1)*
+  %b.base.x16.p4 = bitcast i8 addrspace(1)* %gep.b.base.x16.p4 to i32 addrspace(1)*
+  %selected.base.x16.p0.or.4 = select i1 %cnd, i32 addrspace(1)* %a.base.x16, i32 addrspace(1)* %b.base.x16.p4
+  %gep.selected.base.x16.p8.or.12 = select i1 %cnd, i8 addrspace(1)* %gep.a.base.x16.p8, i8 addrspace(1)* %gep.b.base.x16.p12
+  %selected.base.x16.p8.or.12 = bitcast i8 addrspace(1)* %gep.selected.base.x16.p8.or.12 to i32 addrspace(1)*
+  %selected.base.x16.p40.or.44 = getelementptr inbounds i32, i32 addrspace(1)* %selected.base.x16.p0.or.4, i64 10
+  %selected.base.x16.p44.or.48 = getelementptr inbounds i32, i32 addrspace(1)* %selected.base.x16.p8.or.12, i64 9
+  %val0 = load i32, i32 addrspace(1)* %selected.base.x16.p40.or.44, align 4
+  %val1 = load i32, i32 addrspace(1)* %selected.base.x16.p44.or.48, align 4
+  %t0 = insertelement <2 x i32> poison, i32 %val0, i32 0
+  %t1 = insertelement <2 x i32> %t0, i32 %val1, i32 1
+  store <2 x i32> %t1, <2 x i32> addrspace(1)* %out
+  ret void
+}
+
+define void @nested_selects(i1 %cnd0, i1 %cnd1, i32 addrspace(1)* %a, i32 addrspace(1)* %b, i32 %base, <2 x i32> addrspace(1)* %out) {
+; CHECK-LABEL: @nested_selects
+; CHECK: load <2 x i32>
+entry:
+  %base.p1 = add nsw i32 %base, 1
+  %base.p2 = add i32 %base, 2
+  %base.p3 = add nsw i32 %base, 3
+  %base.x4 = mul i32 %base, 4
+  %base.x4.p5 = add i32 %base.x4, 5
+  %base.x4.p6 = add i32 %base.x4, 6
+  %sext = sext i32 %base to i64
+  %sext.p1 = sext i32 %base.p1 to i64
+  %sext.p2 = sext i32 %base.p2 to i64
+  %sext.p3 = sext i32 %base.p3 to i64
+  %sext.x4.p5 = sext i32 %base.x4.p5 to i64
+  %sext.x4.p6 = sext i32 %base.x4.p6 to i64
+  %gep.a.base = getelementptr inbounds i32, i32 addrspace(1)* %a, i64 %sext
+  %gep.a.base.p1 = getelementptr inbounds i32, i32 addrspace(1)* %a, i64 %sext.p1
+  %gep.a.base.p2 = getelementptr inbounds i32, i32 addrspace(1)* %a, i64 %sext.p2
+  %gep.a.base.p3 = getelementptr inbounds i32, i32 addrspace(1)* %a, i64 %sext.p3
+  %gep.b.base.x4.p5 = getelementptr inbounds i32, i32 addrspace(1)* %a, i64 %sext.x4.p5
+  %gep.b.base.x4.p6 = getelementptr inbounds i32, i32 addrspace(1)* %a, i64 %sext.x4.p6
+  %selected.1.L = select i1 %cnd1, i32 addrspace(1)* %gep.a.base.p2, i32 addrspace(1)* %gep.b.base.x4.p5
+  %selected.1.R = select i1 %cnd1, i32 addrspace(1)* %gep.a.base.p3, i32 addrspace(1)* %gep.b.base.x4.p6
+  %selected.0.L = select i1 %cnd0, i32 addrspace(1)* %gep.a.base, i32 addrspace(1)* %selected.1.L
+  %selected.0.R = select i1 %cnd0, i32 addrspace(1)* %gep.a.base.p1, i32 addrspace(1)* %selected.1.R
+  %val0 = load i32, i32 addrspace(1)* %selected.0.L, align 4
+  %val1 = load i32, i32 addrspace(1)* %selected.0.R, align 4
+  %t0 = insertelement <2 x i32> poison, i32 %val0, i32 0
+  %t1 = insertelement <2 x i32> %t0, i32 %val1, i32 1
+  store <2 x i32> %t1, <2 x i32> addrspace(1)* %out
+  ret void
+}
--- a/llvm/test/Transforms/LoadStoreVectorizer/X86/load-width-inseltpoison.ll
+++ b/llvm/test/Transforms/LoadStoreVectorizer/X86/load-width-inseltpoison.ll
@ -0,0 +1,40 @@
+; RUN: opt -mtriple=x86_64-unknown-linux-gnu -load-store-vectorizer -mcpu haswell -S -o - %s | FileCheck --check-prefix=CHECK-HSW %s
+; RUN: opt -mtriple=x86_64-unknown-linux-gnu -load-store-vectorizer -mcpu knl -S -o - %s | FileCheck --check-prefix=CHECK-KNL %s
+; RUN: opt -mtriple=x86_64-unknown-linux-gnu -aa-pipeline=basic-aa -passes='function(load-store-vectorizer)' -mcpu haswell -S -o - %s | FileCheck --check-prefix=CHECK-HSW %s
+; RUN: opt -mtriple=x86_64-unknown-linux-gnu -aa-pipeline=basic-aa -passes='function(load-store-vectorizer)' -mcpu knl -S -o - %s | FileCheck --check-prefix=CHECK-KNL %s
+
+define <8 x double> @loadwidth_insert_extract(double* %ptr) {
+    %a = bitcast double* %ptr to <2 x double> *
+    %b = getelementptr <2 x double>, <2 x double>* %a, i32 1
+    %c = getelementptr <2 x double>, <2 x double>* %a, i32 2
+    %d = getelementptr <2 x double>, <2 x double>* %a, i32 3
+; CHECK-HSW: load <4 x double>
+; CHECK-HSW: load <4 x double>
+; CHECK-HSW-NOT: load
+; CHECK-KNL: load <8 x double>
+; CHECK-KNL-NOT: load
+    %la = load <2 x double>, <2 x double> *%a
+    %lb = load <2 x double>, <2 x double> *%b
+    %lc = load <2 x double>, <2 x double> *%c
+    %ld = load <2 x double>, <2 x double> *%d
+    ; Scalarize everything - Explicitly not a shufflevector to test this code
+    ; path in the LSV
+    %v1 = extractelement <2 x double> %la, i32 0
+    %v2 = extractelement <2 x double> %la, i32 1
+    %v3 = extractelement <2 x double> %lb, i32 0
+    %v4 = extractelement <2 x double> %lb, i32 1
+    %v5 = extractelement <2 x double> %lc, i32 0
+    %v6 = extractelement <2 x double> %lc, i32 1
+    %v7 = extractelement <2 x double> %ld, i32 0
+    %v8 = extractelement <2 x double> %ld, i32 1
+    ; Make a vector again
+    %i1 = insertelement <8 x double> poison, double %v1, i32 0
+    %i2 = insertelement <8 x double> %i1, double %v2, i32 1
+    %i3 = insertelement <8 x double> %i2, double %v3, i32 2
+    %i4 = insertelement <8 x double> %i3, double %v4, i32 3
+    %i5 = insertelement <8 x double> %i4, double %v5, i32 4
+    %i6 = insertelement <8 x double> %i5, double %v6, i32 5
+    %i7 = insertelement <8 x double> %i6, double %v7, i32 6
+    %i8 = insertelement <8 x double> %i7, double %v8, i32 7
+    ret <8 x double> %i8
+}
--- a/llvm/test/Transforms/LoadStoreVectorizer/X86/vectorize-i8-nested-add-inseltpoison.ll
+++ b/llvm/test/Transforms/LoadStoreVectorizer/X86/vectorize-i8-nested-add-inseltpoison.ll
@ -0,0 +1,165 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -o - -S -load-store-vectorizer -dce %s | FileCheck %s
+
+; Make sure LoadStoreVectorizer vectorizes the loads below.
+; In order to prove that the vectorization is safe, it tries to
+; match nested adds and find an expression that adds a constant
+; value to an existing index and the result doesn't overflow.
+
+target triple = "x86_64--"
+
+define void @ld_v4i8_add_nsw(i32 %v0, i32 %v1, i8* %src, <4 x i8>* %dst) {
+; CHECK-LABEL: @ld_v4i8_add_nsw(
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    [[TMP:%.*]] = add nsw i32 [[V0:%.*]], -1
+; CHECK-NEXT:    [[TMP1:%.*]] = add nsw i32 [[V1:%.*]], [[TMP]]
+; CHECK-NEXT:    [[TMP2:%.*]] = sext i32 [[TMP1]] to i64
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, i8* [[SRC:%.*]], i64 [[TMP2]]
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[TMP3]] to <4 x i8>*
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i8>, <4 x i8>* [[TMP0]], align 1
+; CHECK-NEXT:    [[TMP41:%.*]] = extractelement <4 x i8> [[TMP1]], i32 0
+; CHECK-NEXT:    [[TMP82:%.*]] = extractelement <4 x i8> [[TMP1]], i32 1
+; CHECK-NEXT:    [[TMP133:%.*]] = extractelement <4 x i8> [[TMP1]], i32 2
+; CHECK-NEXT:    [[TMP184:%.*]] = extractelement <4 x i8> [[TMP1]], i32 3
+; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <4 x i8> poison, i8 [[TMP41]], i32 0
+; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <4 x i8> [[TMP19]], i8 [[TMP82]], i32 1
+; CHECK-NEXT:    [[TMP21:%.*]] = insertelement <4 x i8> [[TMP20]], i8 [[TMP133]], i32 2
+; CHECK-NEXT:    [[TMP22:%.*]] = insertelement <4 x i8> [[TMP21]], i8 [[TMP184]], i32 3
+; CHECK-NEXT:    store <4 x i8> [[TMP22]], <4 x i8>* [[DST:%.*]], align 4
+; CHECK-NEXT:    ret void
+;
+bb:
+  %tmp = add nsw i32 %v0, -1
+  %tmp1 = add nsw i32 %v1, %tmp
+  %tmp2 = sext i32 %tmp1 to i64
+  %tmp3 = getelementptr inbounds i8, i8* %src, i64 %tmp2
+  %tmp4 = load i8, i8* %tmp3, align 1
+  %tmp5 = add nsw i32 %v1, %v0
+  %tmp6 = sext i32 %tmp5 to i64
+  %tmp7 = getelementptr inbounds i8, i8* %src, i64 %tmp6
+  %tmp8 = load i8, i8* %tmp7, align 1
+  %tmp9 = add nsw i32 %v0, 1
+  %tmp10 = add nsw i32 %v1, %tmp9
+  %tmp11 = sext i32 %tmp10 to i64
+  %tmp12 = getelementptr inbounds i8, i8* %src, i64 %tmp11
+  %tmp13 = load i8, i8* %tmp12, align 1
+  %tmp14 = add nsw i32 %v0, 2
+  %tmp15 = add nsw i32 %v1, %tmp14
+  %tmp16 = sext i32 %tmp15 to i64
+  %tmp17 = getelementptr inbounds i8, i8* %src, i64 %tmp16
+  %tmp18 = load i8, i8* %tmp17, align 1
+  %tmp19 = insertelement <4 x i8> poison, i8 %tmp4, i32 0
+  %tmp20 = insertelement <4 x i8> %tmp19, i8 %tmp8, i32 1
+  %tmp21 = insertelement <4 x i8> %tmp20, i8 %tmp13, i32 2
+  %tmp22 = insertelement <4 x i8> %tmp21, i8 %tmp18, i32 3
+  store <4 x i8> %tmp22, <4 x i8>* %dst
+  ret void
+}
+
+define void @ld_v4i8_add_nuw(i32 %v0, i32 %v1, i8* %src, <4 x i8>* %dst) {
+; CHECK-LABEL: @ld_v4i8_add_nuw(
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    [[TMP:%.*]] = add nuw i32 [[V0:%.*]], -1
+; CHECK-NEXT:    [[TMP1:%.*]] = add nuw i32 [[V1:%.*]], [[TMP]]
+; CHECK-NEXT:    [[TMP2:%.*]] = zext i32 [[TMP1]] to i64
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, i8* [[SRC:%.*]], i64 [[TMP2]]
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[TMP3]] to <4 x i8>*
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i8>, <4 x i8>* [[TMP0]], align 1
+; CHECK-NEXT:    [[TMP41:%.*]] = extractelement <4 x i8> [[TMP1]], i32 0
+; CHECK-NEXT:    [[TMP82:%.*]] = extractelement <4 x i8> [[TMP1]], i32 1
+; CHECK-NEXT:    [[TMP133:%.*]] = extractelement <4 x i8> [[TMP1]], i32 2
+; CHECK-NEXT:    [[TMP184:%.*]] = extractelement <4 x i8> [[TMP1]], i32 3
+; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <4 x i8> poison, i8 [[TMP41]], i32 0
+; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <4 x i8> [[TMP19]], i8 [[TMP82]], i32 1
+; CHECK-NEXT:    [[TMP21:%.*]] = insertelement <4 x i8> [[TMP20]], i8 [[TMP133]], i32 2
+; CHECK-NEXT:    [[TMP22:%.*]] = insertelement <4 x i8> [[TMP21]], i8 [[TMP184]], i32 3
+; CHECK-NEXT:    store <4 x i8> [[TMP22]], <4 x i8>* [[DST:%.*]], align 4
+; CHECK-NEXT:    ret void
+;
+bb:
+  %tmp = add nuw i32 %v0, -1
+  %tmp1 = add nuw i32 %v1, %tmp
+  %tmp2 = zext i32 %tmp1 to i64
+  %tmp3 = getelementptr inbounds i8, i8* %src, i64 %tmp2
+  %tmp4 = load i8, i8* %tmp3, align 1
+  %tmp5 = add nuw i32 %v1, %v0
+  %tmp6 = zext i32 %tmp5 to i64
+  %tmp7 = getelementptr inbounds i8, i8* %src, i64 %tmp6
+  %tmp8 = load i8, i8* %tmp7, align 1
+  %tmp9 = add nuw i32 %v0, 1
+  %tmp10 = add nuw i32 %v1, %tmp9
+  %tmp11 = zext i32 %tmp10 to i64
+  %tmp12 = getelementptr inbounds i8, i8* %src, i64 %tmp11
+  %tmp13 = load i8, i8* %tmp12, align 1
+  %tmp14 = add nuw i32 %v0, 2
+  %tmp15 = add nuw i32 %v1, %tmp14
+  %tmp16 = zext i32 %tmp15 to i64
+  %tmp17 = getelementptr inbounds i8, i8* %src, i64 %tmp16
+  %tmp18 = load i8, i8* %tmp17, align 1
+  %tmp19 = insertelement <4 x i8> poison, i8 %tmp4, i32 0
+  %tmp20 = insertelement <4 x i8> %tmp19, i8 %tmp8, i32 1
+  %tmp21 = insertelement <4 x i8> %tmp20, i8 %tmp13, i32 2
+  %tmp22 = insertelement <4 x i8> %tmp21, i8 %tmp18, i32 3
+  store <4 x i8> %tmp22, <4 x i8>* %dst
+  ret void
+}
+
+; Make sure we don't vectorize the loads below because the source of
+; sext instructions doesn't have the nsw flag.
+
+define void @ld_v4i8_add_not_safe(i32 %v0, i32 %v1, i8* %src, <4 x i8>* %dst) {
+; CHECK-LABEL: @ld_v4i8_add_not_safe(
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    [[TMP:%.*]] = add nsw i32 [[V0:%.*]], -1
+; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[V1:%.*]], [[TMP]]
+; CHECK-NEXT:    [[TMP2:%.*]] = sext i32 [[TMP1]] to i64
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, i8* [[SRC:%.*]], i64 [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = load i8, i8* [[TMP3]], align 1
+; CHECK-NEXT:    [[TMP5:%.*]] = add i32 [[V1]], [[V0]]
+; CHECK-NEXT:    [[TMP6:%.*]] = sext i32 [[TMP5]] to i64
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, i8* [[SRC]], i64 [[TMP6]]
+; CHECK-NEXT:    [[TMP8:%.*]] = load i8, i8* [[TMP7]], align 1
+; CHECK-NEXT:    [[TMP9:%.*]] = add nsw i32 [[V0]], 1
+; CHECK-NEXT:    [[TMP10:%.*]] = add i32 [[V1]], [[TMP9]]
+; CHECK-NEXT:    [[TMP11:%.*]] = sext i32 [[TMP10]] to i64
+; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i8, i8* [[SRC]], i64 [[TMP11]]
+; CHECK-NEXT:    [[TMP13:%.*]] = load i8, i8* [[TMP12]], align 1
+; CHECK-NEXT:    [[TMP14:%.*]] = add nsw i32 [[V0]], 2
+; CHECK-NEXT:    [[TMP15:%.*]] = add i32 [[V1]], [[TMP14]]
+; CHECK-NEXT:    [[TMP16:%.*]] = sext i32 [[TMP15]] to i64
+; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i8, i8* [[SRC]], i64 [[TMP16]]
+; CHECK-NEXT:    [[TMP18:%.*]] = load i8, i8* [[TMP17]], align 1
+; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <4 x i8> poison, i8 [[TMP4]], i32 0
+; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <4 x i8> [[TMP19]], i8 [[TMP8]], i32 1
+; CHECK-NEXT:    [[TMP21:%.*]] = insertelement <4 x i8> [[TMP20]], i8 [[TMP13]], i32 2
+; CHECK-NEXT:    [[TMP22:%.*]] = insertelement <4 x i8> [[TMP21]], i8 [[TMP18]], i32 3
+; CHECK-NEXT:    store <4 x i8> [[TMP22]], <4 x i8>* [[DST:%.*]], align 4
+; CHECK-NEXT:    ret void
+;
+bb:
+  %tmp = add nsw i32 %v0, -1
+  %tmp1 = add i32 %v1, %tmp
+  %tmp2 = sext i32 %tmp1 to i64
+  %tmp3 = getelementptr inbounds i8, i8* %src, i64 %tmp2
+  %tmp4 = load i8, i8* %tmp3, align 1
+  %tmp5 = add i32 %v1, %v0
+  %tmp6 = sext i32 %tmp5 to i64
+  %tmp7 = getelementptr inbounds i8, i8* %src, i64 %tmp6
+  %tmp8 = load i8, i8* %tmp7, align 1
+  %tmp9 = add nsw i32 %v0, 1
+  %tmp10 = add i32 %v1, %tmp9
+  %tmp11 = sext i32 %tmp10 to i64
+  %tmp12 = getelementptr inbounds i8, i8* %src, i64 %tmp11
+  %tmp13 = load i8, i8* %tmp12, align 1
+  %tmp14 = add nsw i32 %v0, 2
+  %tmp15 = add i32 %v1, %tmp14
+  %tmp16 = sext i32 %tmp15 to i64
+  %tmp17 = getelementptr inbounds i8, i8* %src, i64 %tmp16
+  %tmp18 = load i8, i8* %tmp17, align 1
+  %tmp19 = insertelement <4 x i8> poison, i8 %tmp4, i32 0
+  %tmp20 = insertelement <4 x i8> %tmp19, i8 %tmp8, i32 1
+  %tmp21 = insertelement <4 x i8> %tmp20, i8 %tmp13, i32 2
+  %tmp22 = insertelement <4 x i8> %tmp21, i8 %tmp18, i32 3
+  store <4 x i8> %tmp22, <4 x i8>* %dst
+  ret void
+}
--- a/llvm/test/Transforms/LoopStrengthReduce/ARM/vctp-chains-inseltpoison.ll
+++ b/llvm/test/Transforms/LoopStrengthReduce/ARM/vctp-chains-inseltpoison.ll
@ -0,0 +1,257 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -mtriple=thumbv8.1m.main -mattr=+mve %s -S -loop-reduce -o - | FileCheck %s
+target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
+target triple = "thumbv8.1m-arm-none-eabi"
+
+define float @vctp8(float* %0, i32 %1) {
+; CHECK-LABEL: @vctp8(
+; CHECK-NEXT:    [[TMP3:%.*]] = tail call { <4 x i32>, i32 } @llvm.arm.mve.vidup.v4i32(i32 0, i32 8)
+; CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <4 x i32>, i32 } [[TMP3]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = add nsw i32 [[TMP1:%.*]], -1
+; CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint float* [[TMP0:%.*]] to i32
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <4 x i32> poison, i32 [[TMP6]], i32 0
+; CHECK-NEXT:    [[TMP8:%.*]] = add <4 x i32> [[TMP7]], <i32 -32, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP10:%.*]] = add <4 x i32> [[TMP4]], [[TMP9]]
+; CHECK-NEXT:    br label [[TMP11:%.*]]
+; CHECK:       11:
+; CHECK-NEXT:    [[TMP12:%.*]] = phi i32 [ [[TMP5]], [[TMP2:%.*]] ], [ [[TMP21:%.*]], [[TMP11]] ]
+; CHECK-NEXT:    [[TMP13:%.*]] = phi <4 x float> [ zeroinitializer, [[TMP2]] ], [ [[TMP19:%.*]], [[TMP11]] ]
+; CHECK-NEXT:    [[TMP14:%.*]] = phi <4 x i32> [ [[TMP10]], [[TMP2]] ], [ [[TMP17:%.*]], [[TMP11]] ]
+; CHECK-NEXT:    [[TMP15:%.*]] = tail call <16 x i1> @llvm.arm.mve.vctp8(i32 [[TMP12]])
+; CHECK-NEXT:    [[MASK:%.*]] = tail call <4 x i1> @v16i1_to_v4i1(<16 x i1> [[TMP15]])
+; CHECK-NEXT:    [[TMP16:%.*]] = tail call { <4 x float>, <4 x i32> } @llvm.arm.mve.vldr.gather.base.wb.predicated.v4f32.v4i32.v4i1(<4 x i32> [[TMP14]], i32 32, <4 x i1> [[MASK]])
+; CHECK-NEXT:    [[TMP17]] = extractvalue { <4 x float>, <4 x i32> } [[TMP16]], 1
+; CHECK-NEXT:    [[TMP18:%.*]] = extractvalue { <4 x float>, <4 x i32> } [[TMP16]], 0
+; CHECK-NEXT:    [[TMP19]] = tail call <4 x float> @llvm.arm.mve.add.predicated.v4f32.v4i1(<4 x float> [[TMP13]], <4 x float> [[TMP18]], <4 x i1> [[MASK]], <4 x float> [[TMP13]])
+; CHECK-NEXT:    [[TMP20:%.*]] = icmp sgt i32 [[TMP12]], 4
+; CHECK-NEXT:    [[TMP21]] = add i32 [[TMP12]], -4
+; CHECK-NEXT:    br i1 [[TMP20]], label [[TMP11]], label [[TMP22:%.*]]
+; CHECK:       22:
+; CHECK-NEXT:    [[TMP23:%.*]] = tail call i32 bitcast (i32 (...)* @vecAddAcrossF32Mve to i32 (<4 x float>)*)(<4 x float> [[TMP19]])
+; CHECK-NEXT:    [[TMP24:%.*]] = sitofp i32 [[TMP23]] to float
+; CHECK-NEXT:    [[TMP25:%.*]] = tail call float @llvm.fabs.f32(float [[TMP24]])
+; CHECK-NEXT:    ret float [[TMP25]]
+;
+  %3 = tail call { <4 x i32>, i32 } @llvm.arm.mve.vidup.v4i32(i32 0, i32 8)
+  %4 = extractvalue { <4 x i32>, i32 } %3, 0
+  %5 = add nsw i32 %1, -1
+  %6 = ptrtoint float* %0 to i32
+  %7 = insertelement <4 x i32> poison, i32 %6, i32 0
+  %8 = add <4 x i32> %7, <i32 -32, i32 undef, i32 undef, i32 undef>
+  %9 = shufflevector <4 x i32> %8, <4 x i32> undef, <4 x i32> zeroinitializer
+  %10 = add <4 x i32> %4, %9
+  br label %11
+
+11:                                               ; preds = %11, %2
+  %12 = phi i32 [ %5, %2 ], [ %20, %11 ]
+  %13 = phi <4 x float> [ zeroinitializer, %2 ], [ %19, %11 ]
+  %14 = phi <4 x i32> [ %10, %2 ], [ %17, %11 ]
+  %15 = tail call <16 x i1> @llvm.arm.mve.vctp8(i32 %12)
+  %mask = tail call <4 x i1> @v16i1_to_v4i1(<16 x i1> %15)
+  %16 = tail call { <4 x float>, <4 x i32> } @llvm.arm.mve.vldr.gather.base.wb.predicated.v4f32.v4i32.v4i1(<4 x i32> %14, i32 32, <4 x i1> %mask)
+  %17 = extractvalue { <4 x float>, <4 x i32> } %16, 1
+  %18 = extractvalue { <4 x float>, <4 x i32> } %16, 0
+  %19 = tail call <4 x float> @llvm.arm.mve.add.predicated.v4f32.v4i1(<4 x float> %13, <4 x float> %18, <4 x i1> %mask, <4 x float> %13)
+  %20 = add nsw i32 %12, -4
+  %21 = icmp sgt i32 %12, 4
+  br i1 %21, label %11, label %22
+
+22:                                               ; preds = %11
+  %23 = tail call i32 bitcast (i32 (...)* @vecAddAcrossF32Mve to i32 (<4 x float>)*)(<4 x float> %19)
+  %24 = sitofp i32 %23 to float
+  %25 = tail call float @llvm.fabs.f32(float %24)
+  ret float %25
+}
+
+define float @vctp16(float* %0, i32 %1) {
+; CHECK-LABEL: @vctp16(
+; CHECK-NEXT:    [[TMP3:%.*]] = tail call { <4 x i32>, i32 } @llvm.arm.mve.vidup.v4i32(i32 0, i32 8)
+; CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <4 x i32>, i32 } [[TMP3]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = add nsw i32 [[TMP1:%.*]], -1
+; CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint float* [[TMP0:%.*]] to i32
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <4 x i32> poison, i32 [[TMP6]], i32 0
+; CHECK-NEXT:    [[TMP8:%.*]] = add <4 x i32> [[TMP7]], <i32 -32, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP10:%.*]] = add <4 x i32> [[TMP4]], [[TMP9]]
+; CHECK-NEXT:    br label [[TMP11:%.*]]
+; CHECK:       11:
+; CHECK-NEXT:    [[TMP12:%.*]] = phi i32 [ [[TMP5]], [[TMP2:%.*]] ], [ [[TMP21:%.*]], [[TMP11]] ]
+; CHECK-NEXT:    [[TMP13:%.*]] = phi <4 x float> [ zeroinitializer, [[TMP2]] ], [ [[TMP19:%.*]], [[TMP11]] ]
+; CHECK-NEXT:    [[TMP14:%.*]] = phi <4 x i32> [ [[TMP10]], [[TMP2]] ], [ [[TMP17:%.*]], [[TMP11]] ]
+; CHECK-NEXT:    [[TMP15:%.*]] = tail call <8 x i1> @llvm.arm.mve.vctp16(i32 [[TMP12]])
+; CHECK-NEXT:    [[MASK:%.*]] = tail call <4 x i1> @v8i1_to_v4i1(<8 x i1> [[TMP15]])
+; CHECK-NEXT:    [[TMP16:%.*]] = tail call { <4 x float>, <4 x i32> } @llvm.arm.mve.vldr.gather.base.wb.predicated.v4f32.v4i32.v4i1(<4 x i32> [[TMP14]], i32 32, <4 x i1> [[MASK]])
+; CHECK-NEXT:    [[TMP17]] = extractvalue { <4 x float>, <4 x i32> } [[TMP16]], 1
+; CHECK-NEXT:    [[TMP18:%.*]] = extractvalue { <4 x float>, <4 x i32> } [[TMP16]], 0
+; CHECK-NEXT:    [[TMP19]] = tail call <4 x float> @llvm.arm.mve.add.predicated.v4f32.v4i1(<4 x float> [[TMP13]], <4 x float> [[TMP18]], <4 x i1> [[MASK]], <4 x float> [[TMP13]])
+; CHECK-NEXT:    [[TMP20:%.*]] = icmp sgt i32 [[TMP12]], 4
+; CHECK-NEXT:    [[TMP21]] = add i32 [[TMP12]], -4
+; CHECK-NEXT:    br i1 [[TMP20]], label [[TMP11]], label [[TMP22:%.*]]
+; CHECK:       22:
+; CHECK-NEXT:    [[TMP23:%.*]] = tail call i32 bitcast (i32 (...)* @vecAddAcrossF32Mve to i32 (<4 x float>)*)(<4 x float> [[TMP19]])
+; CHECK-NEXT:    [[TMP24:%.*]] = sitofp i32 [[TMP23]] to float
+; CHECK-NEXT:    [[TMP25:%.*]] = tail call float @llvm.fabs.f32(float [[TMP24]])
+; CHECK-NEXT:    ret float [[TMP25]]
+;
+  %3 = tail call { <4 x i32>, i32 } @llvm.arm.mve.vidup.v4i32(i32 0, i32 8)
+  %4 = extractvalue { <4 x i32>, i32 } %3, 0
+  %5 = add nsw i32 %1, -1
+  %6 = ptrtoint float* %0 to i32
+  %7 = insertelement <4 x i32> poison, i32 %6, i32 0
+  %8 = add <4 x i32> %7, <i32 -32, i32 undef, i32 undef, i32 undef>
+  %9 = shufflevector <4 x i32> %8, <4 x i32> undef, <4 x i32> zeroinitializer
+  %10 = add <4 x i32> %4, %9
+  br label %11
+
+11:                                               ; preds = %11, %2
+  %12 = phi i32 [ %5, %2 ], [ %20, %11 ]
+  %13 = phi <4 x float> [ zeroinitializer, %2 ], [ %19, %11 ]
+  %14 = phi <4 x i32> [ %10, %2 ], [ %17, %11 ]
+  %15 = tail call <8 x i1> @llvm.arm.mve.vctp16(i32 %12)
+  %mask = tail call <4 x i1> @v8i1_to_v4i1(<8 x i1> %15)
+  %16 = tail call { <4 x float>, <4 x i32> } @llvm.arm.mve.vldr.gather.base.wb.predicated.v4f32.v4i32.v4i1(<4 x i32> %14, i32 32, <4 x i1> %mask)
+  %17 = extractvalue { <4 x float>, <4 x i32> } %16, 1
+  %18 = extractvalue { <4 x float>, <4 x i32> } %16, 0
+  %19 = tail call <4 x float> @llvm.arm.mve.add.predicated.v4f32.v4i1(<4 x float> %13, <4 x float> %18, <4 x i1> %mask, <4 x float> %13)
+  %20 = add nsw i32 %12, -4
+  %21 = icmp sgt i32 %12, 4
+  br i1 %21, label %11, label %22
+
+22:                                               ; preds = %11
+  %23 = tail call i32 bitcast (i32 (...)* @vecAddAcrossF32Mve to i32 (<4 x float>)*)(<4 x float> %19)
+  %24 = sitofp i32 %23 to float
+  %25 = tail call float @llvm.fabs.f32(float %24)
+  ret float %25
+}
+
+define float @vctpi32(float* %0, i32 %1) {
+; CHECK-LABEL: @vctpi32(
+; CHECK-NEXT:    [[TMP3:%.*]] = tail call { <4 x i32>, i32 } @llvm.arm.mve.vidup.v4i32(i32 0, i32 8)
+; CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <4 x i32>, i32 } [[TMP3]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = add nsw i32 [[TMP1:%.*]], -1
+; CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint float* [[TMP0:%.*]] to i32
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <4 x i32> poison, i32 [[TMP6]], i32 0
+; CHECK-NEXT:    [[TMP8:%.*]] = add <4 x i32> [[TMP7]], <i32 -32, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP10:%.*]] = add <4 x i32> [[TMP4]], [[TMP9]]
+; CHECK-NEXT:    br label [[TMP11:%.*]]
+; CHECK:       11:
+; CHECK-NEXT:    [[TMP12:%.*]] = phi i32 [ [[TMP5]], [[TMP2:%.*]] ], [ [[TMP21:%.*]], [[TMP11]] ]
+; CHECK-NEXT:    [[TMP13:%.*]] = phi <4 x float> [ zeroinitializer, [[TMP2]] ], [ [[TMP19:%.*]], [[TMP11]] ]
+; CHECK-NEXT:    [[TMP14:%.*]] = phi <4 x i32> [ [[TMP10]], [[TMP2]] ], [ [[TMP17:%.*]], [[TMP11]] ]
+; CHECK-NEXT:    [[TMP15:%.*]] = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 [[TMP12]])
+; CHECK-NEXT:    [[TMP16:%.*]] = tail call { <4 x float>, <4 x i32> } @llvm.arm.mve.vldr.gather.base.wb.predicated.v4f32.v4i32.v4i1(<4 x i32> [[TMP14]], i32 32, <4 x i1> [[TMP15]])
+; CHECK-NEXT:    [[TMP17]] = extractvalue { <4 x float>, <4 x i32> } [[TMP16]], 1
+; CHECK-NEXT:    [[TMP18:%.*]] = extractvalue { <4 x float>, <4 x i32> } [[TMP16]], 0
+; CHECK-NEXT:    [[TMP19]] = tail call <4 x float> @llvm.arm.mve.add.predicated.v4f32.v4i1(<4 x float> [[TMP13]], <4 x float> [[TMP18]], <4 x i1> [[TMP15]], <4 x float> [[TMP13]])
+; CHECK-NEXT:    [[TMP20:%.*]] = icmp sgt i32 [[TMP12]], 4
+; CHECK-NEXT:    [[TMP21]] = add i32 [[TMP12]], -4
+; CHECK-NEXT:    br i1 [[TMP20]], label [[TMP11]], label [[TMP22:%.*]]
+; CHECK:       22:
+; CHECK-NEXT:    [[TMP23:%.*]] = tail call i32 bitcast (i32 (...)* @vecAddAcrossF32Mve to i32 (<4 x float>)*)(<4 x float> [[TMP19]])
+; CHECK-NEXT:    [[TMP24:%.*]] = sitofp i32 [[TMP23]] to float
+; CHECK-NEXT:    [[TMP25:%.*]] = tail call float @llvm.fabs.f32(float [[TMP24]])
+; CHECK-NEXT:    ret float [[TMP25]]
+;
+  %3 = tail call { <4 x i32>, i32 } @llvm.arm.mve.vidup.v4i32(i32 0, i32 8)
+  %4 = extractvalue { <4 x i32>, i32 } %3, 0
+  %5 = add nsw i32 %1, -1
+  %6 = ptrtoint float* %0 to i32
+  %7 = insertelement <4 x i32> poison, i32 %6, i32 0
+  %8 = add <4 x i32> %7, <i32 -32, i32 undef, i32 undef, i32 undef>
+  %9 = shufflevector <4 x i32> %8, <4 x i32> undef, <4 x i32> zeroinitializer
+  %10 = add <4 x i32> %4, %9
+  br label %11
+
+11:                                               ; preds = %11, %2
+  %12 = phi i32 [ %5, %2 ], [ %20, %11 ]
+  %13 = phi <4 x float> [ zeroinitializer, %2 ], [ %19, %11 ]
+  %14 = phi <4 x i32> [ %10, %2 ], [ %17, %11 ]
+  %15 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %12)
+  %16 = tail call { <4 x float>, <4 x i32> } @llvm.arm.mve.vldr.gather.base.wb.predicated.v4f32.v4i32.v4i1(<4 x i32> %14, i32 32, <4 x i1> %15)
+  %17 = extractvalue { <4 x float>, <4 x i32> } %16, 1
+  %18 = extractvalue { <4 x float>, <4 x i32> } %16, 0
+  %19 = tail call <4 x float> @llvm.arm.mve.add.predicated.v4f32.v4i1(<4 x float> %13, <4 x float> %18, <4 x i1> %15, <4 x float> %13)
+  %20 = add nsw i32 %12, -4
+  %21 = icmp sgt i32 %12, 4
+  br i1 %21, label %11, label %22
+
+22:                                               ; preds = %11
+  %23 = tail call i32 bitcast (i32 (...)* @vecAddAcrossF32Mve to i32 (<4 x float>)*)(<4 x float> %19)
+  %24 = sitofp i32 %23 to float
+  %25 = tail call float @llvm.fabs.f32(float %24)
+  ret float %25
+}
+
+
+define float @vctpi64(float* %0, i32 %1) {
+; CHECK-LABEL: @vctpi64(
+; CHECK-NEXT:    [[TMP3:%.*]] = tail call { <4 x i32>, i32 } @llvm.arm.mve.vidup.v4i32(i32 0, i32 8)
+; CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <4 x i32>, i32 } [[TMP3]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = add nsw i32 [[TMP1:%.*]], -1
+; CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint float* [[TMP0:%.*]] to i32
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <4 x i32> poison, i32 [[TMP6]], i32 0
+; CHECK-NEXT:    [[TMP8:%.*]] = add <4 x i32> [[TMP7]], <i32 -32, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP10:%.*]] = add <4 x i32> [[TMP4]], [[TMP9]]
+; CHECK-NEXT:    br label [[TMP11:%.*]]
+; CHECK:       11:
+; CHECK-NEXT:    [[TMP12:%.*]] = phi i32 [ [[TMP5]], [[TMP2:%.*]] ], [ [[TMP21:%.*]], [[TMP11]] ]
+; CHECK-NEXT:    [[TMP13:%.*]] = phi <4 x float> [ zeroinitializer, [[TMP2]] ], [ [[TMP19:%.*]], [[TMP11]] ]
+; CHECK-NEXT:    [[TMP14:%.*]] = phi <4 x i32> [ [[TMP10]], [[TMP2]] ], [ [[TMP17:%.*]], [[TMP11]] ]
+; CHECK-NEXT:    [[TMP15:%.*]] = tail call <4 x i1> @llvm.arm.mve.vctp64(i32 [[TMP12]])
+; CHECK-NEXT:    [[TMP16:%.*]] = tail call { <4 x float>, <4 x i32> } @llvm.arm.mve.vldr.gather.base.wb.predicated.v4f32.v4i32.v4i1(<4 x i32> [[TMP14]], i32 32, <4 x i1> [[TMP15]])
+; CHECK-NEXT:    [[TMP17]] = extractvalue { <4 x float>, <4 x i32> } [[TMP16]], 1
+; CHECK-NEXT:    [[TMP18:%.*]] = extractvalue { <4 x float>, <4 x i32> } [[TMP16]], 0
+; CHECK-NEXT:    [[TMP19]] = tail call <4 x float> @llvm.arm.mve.add.predicated.v4f32.v4i1(<4 x float> [[TMP13]], <4 x float> [[TMP18]], <4 x i1> [[TMP15]], <4 x float> [[TMP13]])
+; CHECK-NEXT:    [[TMP20:%.*]] = icmp sgt i32 [[TMP12]], 4
+; CHECK-NEXT:    [[TMP21]] = add i32 [[TMP12]], -4
+; CHECK-NEXT:    br i1 [[TMP20]], label [[TMP11]], label [[TMP22:%.*]]
+; CHECK:       22:
+; CHECK-NEXT:    [[TMP23:%.*]] = tail call i32 bitcast (i32 (...)* @vecAddAcrossF32Mve to i32 (<4 x float>)*)(<4 x float> [[TMP19]])
+; CHECK-NEXT:    [[TMP24:%.*]] = sitofp i32 [[TMP23]] to float
+; CHECK-NEXT:    [[TMP25:%.*]] = tail call float @llvm.fabs.f32(float [[TMP24]])
+; CHECK-NEXT:    ret float [[TMP25]]
+;
+  %3 = tail call { <4 x i32>, i32 } @llvm.arm.mve.vidup.v4i32(i32 0, i32 8)
+  %4 = extractvalue { <4 x i32>, i32 } %3, 0
+  %5 = add nsw i32 %1, -1
+  %6 = ptrtoint float* %0 to i32
+  %7 = insertelement <4 x i32> poison, i32 %6, i32 0
+  %8 = add <4 x i32> %7, <i32 -32, i32 undef, i32 undef, i32 undef>
+  %9 = shufflevector <4 x i32> %8, <4 x i32> undef, <4 x i32> zeroinitializer
+  %10 = add <4 x i32> %4, %9
+  br label %11
+
+11:                                               ; preds = %11, %2
+  %12 = phi i32 [ %5, %2 ], [ %20, %11 ]
+  %13 = phi <4 x float> [ zeroinitializer, %2 ], [ %19, %11 ]
+  %14 = phi <4 x i32> [ %10, %2 ], [ %17, %11 ]
+  %15 = tail call <4 x i1> @llvm.arm.mve.vctp64(i32 %12)
+  %16 = tail call { <4 x float>, <4 x i32> } @llvm.arm.mve.vldr.gather.base.wb.predicated.v4f32.v4i32.v4i1(<4 x i32> %14, i32 32, <4 x i1> %15)
+  %17 = extractvalue { <4 x float>, <4 x i32> } %16, 1
+  %18 = extractvalue { <4 x float>, <4 x i32> } %16, 0
+  %19 = tail call <4 x float> @llvm.arm.mve.add.predicated.v4f32.v4i1(<4 x float> %13, <4 x float> %18, <4 x i1> %15, <4 x float> %13)
+  %20 = add nsw i32 %12, -4
+  %21 = icmp sgt i32 %12, 4
+  br i1 %21, label %11, label %22
+
+22:                                               ; preds = %11
+  %23 = tail call i32 bitcast (i32 (...)* @vecAddAcrossF32Mve to i32 (<4 x float>)*)(<4 x float> %19)
+  %24 = sitofp i32 %23 to float
+  %25 = tail call float @llvm.fabs.f32(float %24)
+  ret float %25
+}
+
+declare { <4 x i32>, i32 } @llvm.arm.mve.vidup.v4i32(i32, i32)
+declare <16 x i1> @llvm.arm.mve.vctp8(i32)
+declare <8 x i1> @llvm.arm.mve.vctp16(i32)
+declare <4 x i1> @llvm.arm.mve.vctp32(i32)
+declare <4 x i1> @llvm.arm.mve.vctp64(i32)
+declare { <4 x float>, <4 x i32> } @llvm.arm.mve.vldr.gather.base.wb.predicated.v4f32.v4i32.v4i1(<4 x i32>, i32, <4 x i1>)
+declare <4 x float> @llvm.arm.mve.add.predicated.v4f32.v4i1(<4 x float>, <4 x float>, <4 x i1>, <4 x float>)
+declare i32 @vecAddAcrossF32Mve(...)
+declare <4 x i1> @v8i1_to_v4i1(<8 x i1>)
+declare <4 x i1> @v16i1_to_v4i1(<16 x i1>)
+declare float @llvm.fabs.f32(float)
--- a/llvm/test/Transforms/LoopUnroll/PowerPC/p8-unrolling-legalize-vectors-inseltpoison.ll
+++ b/llvm/test/Transforms/LoopUnroll/PowerPC/p8-unrolling-legalize-vectors-inseltpoison.ll
@ -0,0 +1,256 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -S -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr8 -loop-unroll | FileCheck %s
+; RUN: opt < %s -S -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr9 -loop-unroll | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-n32:64"
+target triple = "powerpc64le-unknown-linux-gnu"
+
+; Function Attrs: norecurse nounwind
+define i8* @f(i8* returned %s, i32 zeroext %x, i32 signext %k) local_unnamed_addr #0 {
+; CHECK-LABEL: @f(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP10:%.*]] = icmp sgt i32 [[K:%.*]], 0
+; CHECK-NEXT:    br i1 [[CMP10]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_END:%.*]]
+; CHECK:       for.body.lr.ph:
+; CHECK-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[K]] to i64
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[K]], 16
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[FOR_BODY_PREHEADER:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[WIDE_TRIP_COUNT]], 4294967280
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i32> poison, i32 [[X:%.*]], i32 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i32> [[BROADCAST_SPLATINSERT]], <16 x i32> undef, <16 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP0:%.*]] = add nsw i64 [[N_VEC]], -16
+; CHECK-NEXT:    [[TMP1:%.*]] = lshr i64 [[TMP0]], 4
+; CHECK-NEXT:    [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1
+; CHECK-NEXT:    [[XTRAITER1:%.*]] = and i64 [[TMP2]], 1
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp ult i64 [[TMP1]], 1
+; CHECK-NEXT:    br i1 [[TMP3]], label [[MIDDLE_BLOCK_UNR_LCSSA:%.*]], label [[VECTOR_PH_NEW:%.*]]
+; CHECK:       vector.ph.new:
+; CHECK-NEXT:    [[UNROLL_ITER:%.*]] = sub i64 [[TMP2]], [[XTRAITER1]]
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH_NEW]] ], [ [[INDEX_NEXT_1:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_IND12:%.*]] = phi <16 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>, [[VECTOR_PH_NEW]] ], [ [[VEC_IND_NEXT13_1:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[NITER:%.*]] = phi i64 [ [[UNROLL_ITER]], [[VECTOR_PH_NEW]] ], [ [[NITER_NSUB_1:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP4:%.*]] = shl <16 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>, [[VEC_IND12]]
+; CHECK-NEXT:    [[TMP5:%.*]] = and <16 x i32> [[TMP4]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq <16 x i32> [[TMP5]], zeroinitializer
+; CHECK-NEXT:    [[TMP7:%.*]] = select <16 x i1> [[TMP6]], <16 x i8> <i8 48, i8 48, i8 48, i8 48, i8 48, i8 48, i8 48, i8 48, i8 48, i8 48, i8 48, i8 48, i8 48, i8 48, i8 48, i8 48>, <16 x i8> <i8 49, i8 49, i8 49, i8 49, i8 49, i8 49, i8 49, i8 49, i8 49, i8 49, i8 49, i8 49, i8 49, i8 49, i8 49, i8 49>
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, i8* [[S:%.*]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i8* [[TMP8]] to <16 x i8>*
+; CHECK-NEXT:    store <16 x i8> [[TMP7]], <16 x i8>* [[TMP9]], align 1
+; CHECK-NEXT:    [[INDEX_NEXT:%.*]] = add nuw nsw i64 [[INDEX]], 16
+; CHECK-NEXT:    [[VEC_IND_NEXT13:%.*]] = add <16 x i32> [[VEC_IND12]], <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
+; CHECK-NEXT:    [[NITER_NSUB:%.*]] = sub i64 [[NITER]], 1
+; CHECK-NEXT:    [[TMP10:%.*]] = shl <16 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>, [[VEC_IND_NEXT13]]
+; CHECK-NEXT:    [[TMP11:%.*]] = and <16 x i32> [[TMP10]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    [[TMP12:%.*]] = icmp eq <16 x i32> [[TMP11]], zeroinitializer
+; CHECK-NEXT:    [[TMP13:%.*]] = select <16 x i1> [[TMP12]], <16 x i8> <i8 48, i8 48, i8 48, i8 48, i8 48, i8 48, i8 48, i8 48, i8 48, i8 48, i8 48, i8 48, i8 48, i8 48, i8 48, i8 48>, <16 x i8> <i8 49, i8 49, i8 49, i8 49, i8 49, i8 49, i8 49, i8 49, i8 49, i8 49, i8 49, i8 49, i8 49, i8 49, i8 49, i8 49>
+; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i8, i8* [[S]], i64 [[INDEX_NEXT]]
+; CHECK-NEXT:    [[TMP15:%.*]] = bitcast i8* [[TMP14]] to <16 x i8>*
+; CHECK-NEXT:    store <16 x i8> [[TMP13]], <16 x i8>* [[TMP15]], align 1
+; CHECK-NEXT:    [[INDEX_NEXT_1]] = add i64 [[INDEX_NEXT]], 16
+; CHECK-NEXT:    [[VEC_IND_NEXT13_1]] = add <16 x i32> [[VEC_IND_NEXT13]], <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
+; CHECK-NEXT:    [[NITER_NSUB_1]] = sub i64 [[NITER_NSUB]], 1
+; CHECK-NEXT:    [[NITER_NCMP_1:%.*]] = icmp eq i64 [[NITER_NSUB_1]], 0
+; CHECK-NEXT:    br i1 [[NITER_NCMP_1]], label [[MIDDLE_BLOCK_UNR_LCSSA_LOOPEXIT:%.*]], label [[VECTOR_BODY]]
+; CHECK:       middle.block.unr-lcssa.loopexit:
+; CHECK-NEXT:    [[INDEX_UNR_PH:%.*]] = phi i64 [ [[INDEX_NEXT_1]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_IND12_UNR_PH:%.*]] = phi <16 x i32> [ [[VEC_IND_NEXT13_1]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    br label [[MIDDLE_BLOCK_UNR_LCSSA]]
+; CHECK:       middle.block.unr-lcssa:
+; CHECK-NEXT:    [[INDEX_UNR:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_UNR_PH]], [[MIDDLE_BLOCK_UNR_LCSSA_LOOPEXIT]] ]
+; CHECK-NEXT:    [[VEC_IND12_UNR:%.*]] = phi <16 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>, [[VECTOR_PH]] ], [ [[VEC_IND12_UNR_PH]], [[MIDDLE_BLOCK_UNR_LCSSA_LOOPEXIT]] ]
+; CHECK-NEXT:    [[LCMP_MOD2:%.*]] = icmp ne i64 [[XTRAITER1]], 0
+; CHECK-NEXT:    br i1 [[LCMP_MOD2]], label [[VECTOR_BODY_EPIL_PREHEADER:%.*]], label [[MIDDLE_BLOCK:%.*]]
+; CHECK:       vector.body.epil.preheader:
+; CHECK-NEXT:    br label [[VECTOR_BODY_EPIL:%.*]]
+; CHECK:       vector.body.epil:
+; CHECK-NEXT:    [[INDEX_EPIL:%.*]] = phi i64 [ [[INDEX_UNR]], [[VECTOR_BODY_EPIL_PREHEADER]] ]
+; CHECK-NEXT:    [[VEC_IND12_EPIL:%.*]] = phi <16 x i32> [ [[VEC_IND12_UNR]], [[VECTOR_BODY_EPIL_PREHEADER]] ]
+; CHECK-NEXT:    [[TMP16:%.*]] = shl <16 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>, [[VEC_IND12_EPIL]]
+; CHECK-NEXT:    [[TMP17:%.*]] = and <16 x i32> [[TMP16]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    [[TMP18:%.*]] = icmp eq <16 x i32> [[TMP17]], zeroinitializer
+; CHECK-NEXT:    [[TMP19:%.*]] = select <16 x i1> [[TMP18]], <16 x i8> <i8 48, i8 48, i8 48, i8 48, i8 48, i8 48, i8 48, i8 48, i8 48, i8 48, i8 48, i8 48, i8 48, i8 48, i8 48, i8 48>, <16 x i8> <i8 49, i8 49, i8 49, i8 49, i8 49, i8 49, i8 49, i8 49, i8 49, i8 49, i8 49, i8 49, i8 49, i8 49, i8 49, i8 49>
+; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i8, i8* [[S]], i64 [[INDEX_EPIL]]
+; CHECK-NEXT:    [[TMP21:%.*]] = bitcast i8* [[TMP20]] to <16 x i8>*
+; CHECK-NEXT:    store <16 x i8> [[TMP19]], <16 x i8>* [[TMP21]], align 1
+; CHECK-NEXT:    [[INDEX_NEXT_EPIL:%.*]] = add i64 [[INDEX_EPIL]], 16
+; CHECK-NEXT:    [[VEC_IND_NEXT13_EPIL:%.*]] = add <16 x i32> [[VEC_IND12_EPIL]], <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
+; CHECK-NEXT:    [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT_EPIL]], [[N_VEC]]
+; CHECK-NEXT:    br label [[MIDDLE_BLOCK_EPILOG_LCSSA:%.*]]
+; CHECK:       middle.block.epilog-lcssa:
+; CHECK-NEXT:    br label [[MIDDLE_BLOCK]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[WIDE_TRIP_COUNT]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END]], label [[FOR_BODY_PREHEADER]]
+; CHECK:       for.body.preheader:
+; CHECK-NEXT:    [[INDVARS_IV_PH:%.*]] = phi i64 [ 0, [[FOR_BODY_LR_PH]] ], [ [[N_VEC]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[TMP23:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[INDVARS_IV_PH]]
+; CHECK-NEXT:    [[TMP24:%.*]] = add i64 [[WIDE_TRIP_COUNT]], -1
+; CHECK-NEXT:    [[TMP25:%.*]] = sub i64 [[TMP24]], [[INDVARS_IV_PH]]
+; CHECK-NEXT:    [[XTRAITER:%.*]] = and i64 [[TMP23]], 7
+; CHECK-NEXT:    [[LCMP_MOD:%.*]] = icmp ne i64 [[XTRAITER]], 0
+; CHECK-NEXT:    br i1 [[LCMP_MOD]], label [[FOR_BODY_PROL_PREHEADER:%.*]], label [[FOR_BODY_PROL_LOOPEXIT:%.*]]
+; CHECK:       for.body.prol.preheader:
+; CHECK-NEXT:    br label [[FOR_BODY_PROL:%.*]]
+; CHECK:       for.body.prol:
+; CHECK-NEXT:    [[INDVARS_IV_PROL:%.*]] = phi i64 [ [[INDVARS_IV_NEXT_PROL:%.*]], [[FOR_BODY_PROL]] ], [ [[INDVARS_IV_PH]], [[FOR_BODY_PROL_PREHEADER]] ]
+; CHECK-NEXT:    [[PROL_ITER:%.*]] = phi i64 [ [[XTRAITER]], [[FOR_BODY_PROL_PREHEADER]] ], [ [[PROL_ITER_SUB:%.*]], [[FOR_BODY_PROL]] ]
+; CHECK-NEXT:    [[TMP26:%.*]] = trunc i64 [[INDVARS_IV_PROL]] to i32
+; CHECK-NEXT:    [[SHL_PROL:%.*]] = shl i32 1, [[TMP26]]
+; CHECK-NEXT:    [[AND_PROL:%.*]] = and i32 [[SHL_PROL]], [[X]]
+; CHECK-NEXT:    [[TOBOOL_PROL:%.*]] = icmp eq i32 [[AND_PROL]], 0
+; CHECK-NEXT:    [[CONV_PROL:%.*]] = select i1 [[TOBOOL_PROL]], i8 48, i8 49
+; CHECK-NEXT:    [[ARRAYIDX_PROL:%.*]] = getelementptr inbounds i8, i8* [[S]], i64 [[INDVARS_IV_PROL]]
+; CHECK-NEXT:    store i8 [[CONV_PROL]], i8* [[ARRAYIDX_PROL]], align 1
+; CHECK-NEXT:    [[INDVARS_IV_NEXT_PROL]] = add nuw nsw i64 [[INDVARS_IV_PROL]], 1
+; CHECK-NEXT:    [[EXITCOND_PROL:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT_PROL]], [[WIDE_TRIP_COUNT]]
+; CHECK-NEXT:    [[PROL_ITER_SUB]] = sub i64 [[PROL_ITER]], 1
+; CHECK-NEXT:    [[PROL_ITER_CMP:%.*]] = icmp ne i64 [[PROL_ITER_SUB]], 0
+; CHECK-NEXT:    br i1 [[PROL_ITER_CMP]], label [[FOR_BODY_PROL]], label [[FOR_BODY_PROL_LOOPEXIT_UNR_LCSSA:%.*]], [[LOOP0:!llvm.loop !.*]]
+; CHECK:       for.body.prol.loopexit.unr-lcssa:
+; CHECK-NEXT:    [[INDVARS_IV_UNR_PH:%.*]] = phi i64 [ [[INDVARS_IV_NEXT_PROL]], [[FOR_BODY_PROL]] ]
+; CHECK-NEXT:    br label [[FOR_BODY_PROL_LOOPEXIT]]
+; CHECK:       for.body.prol.loopexit:
+; CHECK-NEXT:    [[INDVARS_IV_UNR:%.*]] = phi i64 [ [[INDVARS_IV_PH]], [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_UNR_PH]], [[FOR_BODY_PROL_LOOPEXIT_UNR_LCSSA]] ]
+; CHECK-NEXT:    [[TMP27:%.*]] = icmp ult i64 [[TMP25]], 7
+; CHECK-NEXT:    br i1 [[TMP27]], label [[FOR_END_LOOPEXIT:%.*]], label [[FOR_BODY_PREHEADER_NEW:%.*]]
+; CHECK:       for.body.preheader.new:
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_UNR]], [[FOR_BODY_PREHEADER_NEW]] ], [ [[INDVARS_IV_NEXT_7:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[TMP28:%.*]] = trunc i64 [[INDVARS_IV]] to i32
+; CHECK-NEXT:    [[SHL:%.*]] = shl i32 1, [[TMP28]]
+; CHECK-NEXT:    [[AND:%.*]] = and i32 [[SHL]], [[X]]
+; CHECK-NEXT:    [[TOBOOL:%.*]] = icmp eq i32 [[AND]], 0
+; CHECK-NEXT:    [[CONV:%.*]] = select i1 [[TOBOOL]], i8 48, i8 49
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, i8* [[S]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    store i8 [[CONV]], i8* [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[INDVARS_IV_NEXT:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[TMP29:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
+; CHECK-NEXT:    [[SHL_1:%.*]] = shl i32 1, [[TMP29]]
+; CHECK-NEXT:    [[AND_1:%.*]] = and i32 [[SHL_1]], [[X]]
+; CHECK-NEXT:    [[TOBOOL_1:%.*]] = icmp eq i32 [[AND_1]], 0
+; CHECK-NEXT:    [[CONV_1:%.*]] = select i1 [[TOBOOL_1]], i8 48, i8 49
+; CHECK-NEXT:    [[ARRAYIDX_1:%.*]] = getelementptr inbounds i8, i8* [[S]], i64 [[INDVARS_IV_NEXT]]
+; CHECK-NEXT:    store i8 [[CONV_1]], i8* [[ARRAYIDX_1]], align 1
+; CHECK-NEXT:    [[INDVARS_IV_NEXT_1:%.*]] = add nuw nsw i64 [[INDVARS_IV_NEXT]], 1
+; CHECK-NEXT:    [[TMP30:%.*]] = trunc i64 [[INDVARS_IV_NEXT_1]] to i32
+; CHECK-NEXT:    [[SHL_2:%.*]] = shl i32 1, [[TMP30]]
+; CHECK-NEXT:    [[AND_2:%.*]] = and i32 [[SHL_2]], [[X]]
+; CHECK-NEXT:    [[TOBOOL_2:%.*]] = icmp eq i32 [[AND_2]], 0
+; CHECK-NEXT:    [[CONV_2:%.*]] = select i1 [[TOBOOL_2]], i8 48, i8 49
+; CHECK-NEXT:    [[ARRAYIDX_2:%.*]] = getelementptr inbounds i8, i8* [[S]], i64 [[INDVARS_IV_NEXT_1]]
+; CHECK-NEXT:    store i8 [[CONV_2]], i8* [[ARRAYIDX_2]], align 1
+; CHECK-NEXT:    [[INDVARS_IV_NEXT_2:%.*]] = add nuw nsw i64 [[INDVARS_IV_NEXT_1]], 1
+; CHECK-NEXT:    [[TMP31:%.*]] = trunc i64 [[INDVARS_IV_NEXT_2]] to i32
+; CHECK-NEXT:    [[SHL_3:%.*]] = shl i32 1, [[TMP31]]
+; CHECK-NEXT:    [[AND_3:%.*]] = and i32 [[SHL_3]], [[X]]
+; CHECK-NEXT:    [[TOBOOL_3:%.*]] = icmp eq i32 [[AND_3]], 0
+; CHECK-NEXT:    [[CONV_3:%.*]] = select i1 [[TOBOOL_3]], i8 48, i8 49
+; CHECK-NEXT:    [[ARRAYIDX_3:%.*]] = getelementptr inbounds i8, i8* [[S]], i64 [[INDVARS_IV_NEXT_2]]
+; CHECK-NEXT:    store i8 [[CONV_3]], i8* [[ARRAYIDX_3]], align 1
+; CHECK-NEXT:    [[INDVARS_IV_NEXT_3:%.*]] = add nuw nsw i64 [[INDVARS_IV_NEXT_2]], 1
+; CHECK-NEXT:    [[TMP32:%.*]] = trunc i64 [[INDVARS_IV_NEXT_3]] to i32
+; CHECK-NEXT:    [[SHL_4:%.*]] = shl i32 1, [[TMP32]]
+; CHECK-NEXT:    [[AND_4:%.*]] = and i32 [[SHL_4]], [[X]]
+; CHECK-NEXT:    [[TOBOOL_4:%.*]] = icmp eq i32 [[AND_4]], 0
+; CHECK-NEXT:    [[CONV_4:%.*]] = select i1 [[TOBOOL_4]], i8 48, i8 49
+; CHECK-NEXT:    [[ARRAYIDX_4:%.*]] = getelementptr inbounds i8, i8* [[S]], i64 [[INDVARS_IV_NEXT_3]]
+; CHECK-NEXT:    store i8 [[CONV_4]], i8* [[ARRAYIDX_4]], align 1
+; CHECK-NEXT:    [[INDVARS_IV_NEXT_4:%.*]] = add nuw nsw i64 [[INDVARS_IV_NEXT_3]], 1
+; CHECK-NEXT:    [[TMP33:%.*]] = trunc i64 [[INDVARS_IV_NEXT_4]] to i32
+; CHECK-NEXT:    [[SHL_5:%.*]] = shl i32 1, [[TMP33]]
+; CHECK-NEXT:    [[AND_5:%.*]] = and i32 [[SHL_5]], [[X]]
+; CHECK-NEXT:    [[TOBOOL_5:%.*]] = icmp eq i32 [[AND_5]], 0
+; CHECK-NEXT:    [[CONV_5:%.*]] = select i1 [[TOBOOL_5]], i8 48, i8 49
+; CHECK-NEXT:    [[ARRAYIDX_5:%.*]] = getelementptr inbounds i8, i8* [[S]], i64 [[INDVARS_IV_NEXT_4]]
+; CHECK-NEXT:    store i8 [[CONV_5]], i8* [[ARRAYIDX_5]], align 1
+; CHECK-NEXT:    [[INDVARS_IV_NEXT_5:%.*]] = add nuw nsw i64 [[INDVARS_IV_NEXT_4]], 1
+; CHECK-NEXT:    [[TMP34:%.*]] = trunc i64 [[INDVARS_IV_NEXT_5]] to i32
+; CHECK-NEXT:    [[SHL_6:%.*]] = shl i32 1, [[TMP34]]
+; CHECK-NEXT:    [[AND_6:%.*]] = and i32 [[SHL_6]], [[X]]
+; CHECK-NEXT:    [[TOBOOL_6:%.*]] = icmp eq i32 [[AND_6]], 0
+; CHECK-NEXT:    [[CONV_6:%.*]] = select i1 [[TOBOOL_6]], i8 48, i8 49
+; CHECK-NEXT:    [[ARRAYIDX_6:%.*]] = getelementptr inbounds i8, i8* [[S]], i64 [[INDVARS_IV_NEXT_5]]
+; CHECK-NEXT:    store i8 [[CONV_6]], i8* [[ARRAYIDX_6]], align 1
+; CHECK-NEXT:    [[INDVARS_IV_NEXT_6:%.*]] = add nuw nsw i64 [[INDVARS_IV_NEXT_5]], 1
+; CHECK-NEXT:    [[TMP35:%.*]] = trunc i64 [[INDVARS_IV_NEXT_6]] to i32
+; CHECK-NEXT:    [[SHL_7:%.*]] = shl i32 1, [[TMP35]]
+; CHECK-NEXT:    [[AND_7:%.*]] = and i32 [[SHL_7]], [[X]]
+; CHECK-NEXT:    [[TOBOOL_7:%.*]] = icmp eq i32 [[AND_7]], 0
+; CHECK-NEXT:    [[CONV_7:%.*]] = select i1 [[TOBOOL_7]], i8 48, i8 49
+; CHECK-NEXT:    [[ARRAYIDX_7:%.*]] = getelementptr inbounds i8, i8* [[S]], i64 [[INDVARS_IV_NEXT_6]]
+; CHECK-NEXT:    store i8 [[CONV_7]], i8* [[ARRAYIDX_7]], align 1
+; CHECK-NEXT:    [[INDVARS_IV_NEXT_7]] = add nuw nsw i64 [[INDVARS_IV_NEXT_6]], 1
+; CHECK-NEXT:    [[EXITCOND_7:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT_7]], [[WIDE_TRIP_COUNT]]
+; CHECK-NEXT:    br i1 [[EXITCOND_7]], label [[FOR_END_LOOPEXIT_UNR_LCSSA:%.*]], label [[FOR_BODY]]
+; CHECK:       for.end.loopexit.unr-lcssa:
+; CHECK-NEXT:    br label [[FOR_END_LOOPEXIT]]
+; CHECK:       for.end.loopexit:
+; CHECK-NEXT:    br label [[FOR_END]]
+; CHECK:       for.end:
+; CHECK-NEXT:    [[IDXPROM1:%.*]] = sext i32 [[K]] to i64
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, i8* [[S]], i64 [[IDXPROM1]]
+; CHECK-NEXT:    store i8 0, i8* [[ARRAYIDX2]], align 1
+; CHECK-NEXT:    ret i8* [[S]]
+;
+entry:
+  %cmp10 = icmp sgt i32 %k, 0
+  br i1 %cmp10, label %for.body.lr.ph, label %for.end
+
+for.body.lr.ph:                                   ; preds = %entry
+  %wide.trip.count = zext i32 %k to i64
+  %min.iters.check = icmp ult i32 %k, 16
+  br i1 %min.iters.check, label %for.body.preheader, label %vector.ph
+
+vector.ph:                                        ; preds = %for.body.lr.ph
+  %n.vec = and i64 %wide.trip.count, 4294967280
+  %broadcast.splatinsert = insertelement <16 x i32> poison, i32 %x, i32 0
+  %broadcast.splat = shufflevector <16 x i32> %broadcast.splatinsert, <16 x i32> undef, <16 x i32> zeroinitializer
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %vec.ind12 = phi <16 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>, %vector.ph ], [ %vec.ind.next13, %vector.body ]
+  %0 = shl <16 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>, %vec.ind12
+  %1 = and <16 x i32> %0, %broadcast.splat
+  %2 = icmp eq <16 x i32> %1, zeroinitializer
+  %3 = select <16 x i1> %2, <16 x i8> <i8 48, i8 48, i8 48, i8 48, i8 48, i8 48, i8 48, i8 48, i8 48, i8 48, i8 48, i8 48, i8 48, i8 48, i8 48, i8 48>, <16 x i8> <i8 49, i8 49, i8 49, i8 49, i8 49, i8 49, i8 49, i8 49, i8 49, i8 49, i8 49, i8 49, i8 49, i8 49, i8 49, i8 49>
+  %4 = getelementptr inbounds i8, i8* %s, i64 %index
+  %5 = bitcast i8* %4 to <16 x i8>*
+  store <16 x i8> %3, <16 x i8>* %5, align 1
+  %index.next = add i64 %index, 16
+  %vec.ind.next13 = add <16 x i32> %vec.ind12, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
+  %6 = icmp eq i64 %index.next, %n.vec
+  br i1 %6, label %middle.block, label %vector.body
+
+middle.block:                                     ; preds = %vector.body
+  %cmp.n = icmp eq i64 %n.vec, %wide.trip.count
+  br i1 %cmp.n, label %for.end, label %for.body.preheader
+
+for.body.preheader:                               ; preds = %middle.block, %for.body.lr.ph
+  %indvars.iv.ph = phi i64 [ 0, %for.body.lr.ph ], [ %n.vec, %middle.block ]
+  br label %for.body
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ %indvars.iv.ph, %for.body.preheader ]
+  %7 = trunc i64 %indvars.iv to i32
+  %shl = shl i32 1, %7
+  %and = and i32 %shl, %x
+  %tobool = icmp eq i32 %and, 0
+  %conv = select i1 %tobool, i8 48, i8 49
+  %arrayidx = getelementptr inbounds i8, i8* %s, i64 %indvars.iv
+  store i8 %conv, i8* %arrayidx, align 1
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %middle.block, %entry
+  %idxprom1 = sext i32 %k to i64
+  %arrayidx2 = getelementptr inbounds i8, i8* %s, i64 %idxprom1
+  store i8 0, i8* %arrayidx2, align 1
+  ret i8* %s
+}
+
--- a/llvm/test/Transforms/NewGVN/2016-08-30-MaskedScatterGather-xfail-inseltpoison.ll
+++ b/llvm/test/Transforms/NewGVN/2016-08-30-MaskedScatterGather-xfail-inseltpoison.ll
@ -0,0 +1,43 @@
+; XFAIL: *
+; RUN: opt < %s -basic-aa -newgvn -S | FileCheck %s
+
+declare void @llvm.masked.scatter.v2i32.v2p0i32(<2 x i32> , <2 x i32*> , i32 , <2 x i1> )
+declare <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*>, i32, <2 x i1>, <2 x i32>)
+
+; This test ensures that masked scatter and gather operations, which take vectors of pointers,
+; do not have pointer aliasing ignored when being processed.
+; No scatter/gather calls should end up eliminated
+; CHECK: llvm.masked.gather
+; CHECK: llvm.masked.gather
+; CHECK: llvm.masked.scatter
+; CHECK: llvm.masked.gather
+; CHECK: llvm.masked.scatter
+; CHECK: llvm.masked.gather
+define spir_kernel void @test(<2 x i32*> %in1, <2 x i32*> %in2, i32* %out) {
+entry:
+  ; Just some temporary storage
+  %tmp.0 = alloca i32
+  %tmp.1 = alloca i32
+  %tmp.i = insertelement <2 x i32*> poison, i32* %tmp.0, i32 0
+  %tmp = insertelement <2 x i32*> %tmp.i, i32* %tmp.1, i32 1
+  ; Read from in1 and in2
+  %in1.v = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> %in1, i32 1, <2 x i1> <i1 true, i1 true>, <2 x i32> undef) #1
+  %in2.v = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> %in2, i32 1, <2 x i1> <i1 true, i1 true>, <2 x i32> undef) #1
+  ; Store in1 to the allocas
+  call void @llvm.masked.scatter.v2i32.v2p0i32(<2 x i32> %in1.v, <2 x i32*> %tmp, i32 1, <2 x i1> <i1 true, i1 true>);
+  ; Read in1 from the allocas
+  ; This gather should alias the scatter we just saw
+  %tmp.v.0 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> %tmp, i32 1, <2 x i1> <i1 true, i1 true>, <2 x i32> undef) #1
+  ; Store in2 to the allocas
+  call void @llvm.masked.scatter.v2i32.v2p0i32(<2 x i32> %in2.v, <2 x i32*> %tmp, i32 1, <2 x i1> <i1 true, i1 true>);
+  ; Read in2 from the allocas
+  ; This gather should alias the scatter we just saw, and not be eliminated
+  %tmp.v.1 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> %tmp, i32 1, <2 x i1> <i1 true, i1 true>, <2 x i32> undef) #1
+  ; Store in2 to out for good measure
+  %tmp.v.1.0 = extractelement <2 x i32> %tmp.v.1, i32 0
+  %tmp.v.1.1 = extractelement <2 x i32> %tmp.v.1, i32 1
+  store i32 %tmp.v.1.0, i32* %out
+  %out.1 = getelementptr i32, i32* %out, i32 1
+  store i32 %tmp.v.1.1, i32* %out.1
+  ret void
+}
--- a/llvm/test/Transforms/PGOProfile/counter_promo_nest-inseltpoison.ll
+++ b/llvm/test/Transforms/PGOProfile/counter_promo_nest-inseltpoison.ll
@ -0,0 +1,165 @@
+; TEST that counter updates are promoted outside the whole loop nest
+; RUN: opt < %s -pgo-instr-gen -instrprof -do-counter-promotion=true -S | FileCheck --check-prefix=PROMO  %s
+; RUN: opt < %s --passes=pgo-instr-gen,instrprof -do-counter-promotion=true -S | FileCheck --check-prefix=PROMO  %s 
+
+@g = common local_unnamed_addr global i32 0, align 4
+@c = local_unnamed_addr global i32 10, align 4
+
+; Function Attrs: noinline norecurse nounwind uwtable
+define void @bar() local_unnamed_addr #0 {
+bb:
+  %tmp2 = load i32, i32* @g, align 4, !tbaa !2
+  %tmp3 = add nsw i32 %tmp2, 1
+  store i32 %tmp3, i32* @g, align 4, !tbaa !2
+  ret void
+}
+
+; Function Attrs: norecurse nounwind uwtable
+define i32 @main() local_unnamed_addr #1 {
+bb:
+  store i32 0, i32* @g, align 4, !tbaa !2
+  %tmp = load i32, i32* @c, align 4, !tbaa !2
+  %tmp1 = icmp sgt i32 %tmp, 0
+  br i1 %tmp1, label %bb2_1, label %bb84
+
+bb2_1:
+  br label %bb2
+
+bb2:                                              ; preds = %bb39, %bb
+  %tmp3 = phi i32 [ %tmp40, %bb39 ], [ %tmp, %bb2_1 ]
+  %tmp5 = phi i32 [ %tmp43, %bb39 ], [ 0, %bb2_1 ]
+  %tmp7 = icmp sgt i32 %tmp3, 0
+  br i1 %tmp7, label %bb14_1, label %bb39
+
+bb8:                                              ; preds = %bb39
+; PROMO-LABEL: bb8
+; PROMO: load {{.*}} @__profc_main{{.*}}
+; PROMO-NEXT: add
+; PROMO-NEXT: store {{.*}}@__profc_main{{.*}}
+; PROMO-NEXT: load {{.*}} @__profc_main{{.*}}
+; PROMO-NEXT: add
+; PROMO-NEXT: store {{.*}}@__profc_main{{.*}}
+; PROMO-NEXT: load {{.*}} @__profc_main{{.*}}
+; PROMO-NEXT: add
+; PROMO-NEXT: store {{.*}}@__profc_main{{.*}}
+; PROMO-NEXT: load {{.*}} @__profc_main{{.*}}
+; PROMO-NEXT: add
+; PROMO-NEXT: store {{.*}}@__profc_main{{.*}}
+; PROMO-NEXT: load {{.*}} @__profc_main{{.*}}
+; PROMO-NEXT: add
+; PROMO-NEXT: store {{.*}}@__profc_main{{.*}}
+
+  %tmp13 = icmp sgt i32 %tmp40, 0
+  br i1 %tmp13, label %bb45, label %bb84
+
+bb14_1:
+  br label %bb14
+
+bb14:                                             ; preds = %bb29, %bb2
+  %tmp15 = phi i32 [ %tmp30, %bb29 ], [ %tmp3, %bb14_1 ]
+  %tmp16 = phi i64 [ %tmp31, %bb29 ], [ 0, %bb14_1 ]
+  %tmp17 = phi i64 [ %tmp32, %bb29 ], [ 0, %bb14_1 ]
+  %tmp18 = phi i32 [ %tmp33, %bb29 ], [ 0, %bb14_1 ]
+  %tmp19 = icmp sgt i32 %tmp15, 0
+  br i1 %tmp19, label %bb20_split, label %bb29
+
+bb20_split:                                             
+ br label %bb20
+
+bb20:                                             ; preds = %bb20, %bb14
+  %tmp21 = phi i64 [ %tmp23, %bb20 ], [ 0, %bb20_split ]
+  %tmp22 = phi i32 [ %tmp24, %bb20 ], [ 0, %bb20_split ]
+  %tmp23 = add nuw i64 %tmp21, 1
+  tail call void @bar()
+  %tmp24 = add nuw nsw i32 %tmp22, 1
+  %tmp25 = load i32, i32* @c, align 4, !tbaa !2
+  %tmp26 = icmp slt i32 %tmp24, %tmp25
+  br i1 %tmp26, label %bb20, label %bb27
+
+bb27:                                             ; preds = %bb20
+  %tmp28 = add i64 %tmp23, %tmp16
+  br label %bb29
+
+bb29:                                             ; preds = %bb27, %bb14
+  %tmp30 = phi i32 [ %tmp25, %bb27 ], [ %tmp15, %bb14 ]
+  %tmp31 = phi i64 [ %tmp28, %bb27 ], [ %tmp16, %bb14 ]
+  %tmp32 = add nuw i64 %tmp17, 1
+  %tmp33 = add nuw nsw i32 %tmp18, 1
+  %tmp34 = icmp slt i32 %tmp33, %tmp30
+  br i1 %tmp34, label %bb14, label %bb35
+
+bb35:                                             ; preds = %bb29
+  %tmp36 = insertelement <2 x i64> poison, i64 %tmp31, i32 0
+  br label %bb39
+
+bb39:                                             ; preds = %bb35, %bb2
+  %tmp40 = phi i32 [ %tmp30, %bb35 ], [ %tmp3, %bb2 ]
+  %tmp43 = add nuw nsw i32 %tmp5, 1
+  %tmp44 = icmp slt i32 %tmp43, %tmp40
+  br i1 %tmp44, label %bb2, label %bb8
+
+bb45:                                             ; preds = %bb67, %bb8
+  %tmp46 = phi i32 [ %tmp68, %bb67 ], [ %tmp40, %bb8 ]
+  %tmp47 = phi i64 [ %tmp69, %bb67 ], [ 0, %bb8 ]
+  %tmp48 = phi i64 [ %tmp70, %bb67 ], [ 0, %bb8 ]
+  %tmp49 = phi i32 [ %tmp71, %bb67 ], [ 0, %bb8 ]
+  %tmp50 = icmp sgt i32 %tmp46, 0
+  br i1 %tmp50, label %bb57, label %bb67
+
+bb51:                                             ; preds = %bb67
+  %tmp56 = icmp sgt i32 %tmp68, 0
+  br i1 %tmp56, label %bb73, label %bb84
+
+bb57:                                             ; preds = %bb57, %bb45
+  %tmp58 = phi i64 [ %tmp60, %bb57 ], [ 0, %bb45 ]
+  %tmp59 = phi i32 [ %tmp61, %bb57 ], [ 0, %bb45 ]
+  %tmp60 = add nuw i64 %tmp58, 1
+  tail call void @bar()
+  %tmp61 = add nuw nsw i32 %tmp59, 1
+  %tmp62 = load i32, i32* @c, align 4, !tbaa !2
+  %tmp63 = mul nsw i32 %tmp62, 10
+  %tmp64 = icmp slt i32 %tmp61, %tmp63
+  br i1 %tmp64, label %bb57, label %bb65
+
+bb65:                                             ; preds = %bb57
+  %tmp66 = add i64 %tmp60, %tmp47
+  br label %bb67
+
+bb67:                                             ; preds = %bb65, %bb45
+  %tmp68 = phi i32 [ %tmp62, %bb65 ], [ %tmp46, %bb45 ]
+  %tmp69 = phi i64 [ %tmp66, %bb65 ], [ %tmp47, %bb45 ]
+  %tmp70 = add nuw i64 %tmp48, 1
+  %tmp71 = add nuw nsw i32 %tmp49, 1
+  %tmp72 = icmp slt i32 %tmp71, %tmp68
+  br i1 %tmp72, label %bb45, label %bb51
+
+bb73:                                             ; preds = %bb73, %bb51
+  %tmp74 = phi i64 [ %tmp76, %bb73 ], [ 0, %bb51 ]
+  %tmp75 = phi i32 [ %tmp77, %bb73 ], [ 0, %bb51 ]
+  %tmp76 = add nuw i64 %tmp74, 1
+  tail call void @bar()
+  %tmp77 = add nuw nsw i32 %tmp75, 1
+  %tmp78 = load i32, i32* @c, align 4, !tbaa !2
+  %tmp79 = mul nsw i32 %tmp78, 100
+  %tmp80 = icmp slt i32 %tmp77, %tmp79
+  br i1 %tmp80, label %bb73, label %bb81
+
+bb81:                                             ; preds = %bb73
+  br label %bb84
+
+bb84:                                             ; preds = %bb81, %bb51, %bb8, %bb
+  ret i32 0
+}
+
+attributes #0 = { noinline }
+attributes #1 = { norecurse nounwind uwtable } 
+
+!llvm.module.flags = !{!0}
+!llvm.ident = !{!1}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{!"clang version 5.0.0 (trunk 307355)"}
+!2 = !{!3, !3, i64 0}
+!3 = !{!"int", !4, i64 0}
+!4 = !{!"omnipotent char", !5, i64 0}
+!5 = !{!"Simple C/C++ TBAA"}
--- a/llvm/test/Transforms/PhaseOrdering/X86/addsub-inseltpoison.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/addsub-inseltpoison.ll
@ -0,0 +1,101 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -O3 -S                                        | FileCheck %s
+; RUN: opt < %s -passes='default<O3>' -aa-pipeline=default -S | FileCheck %s
+
+target triple = "x86_64--"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+; Ideally, this should reach the backend with 1 fsub, 1 fadd, and 1 shuffle.
+; That may require some coordination between VectorCombine, SLP, and other passes.
+; The end goal is to get a single "vaddsubps" instruction for x86 with AVX.
+
+define <4 x float> @PR45015(<4 x float> %arg, <4 x float> %arg1) {
+; CHECK-LABEL: @PR45015(
+; CHECK-NEXT:    [[TMP1:%.*]] = fsub <4 x float> [[ARG:%.*]], [[ARG1:%.*]]
+; CHECK-NEXT:    [[TMP2:%.*]] = fadd <4 x float> [[ARG]], [[ARG1]]
+; CHECK-NEXT:    [[T16:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP2]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; CHECK-NEXT:    ret <4 x float> [[T16]]
+;
+  %t = extractelement <4 x float> %arg, i32 0
+  %t2 = extractelement <4 x float> %arg1, i32 0
+  %t3 = fsub float %t, %t2
+  %t4 = insertelement <4 x float> poison, float %t3, i32 0
+  %t5 = extractelement <4 x float> %arg, i32 1
+  %t6 = extractelement <4 x float> %arg1, i32 1
+  %t7 = fadd float %t5, %t6
+  %t8 = insertelement <4 x float> %t4, float %t7, i32 1
+  %t9 = extractelement <4 x float> %arg, i32 2
+  %t10 = extractelement <4 x float> %arg1, i32 2
+  %t11 = fsub float %t9, %t10
+  %t12 = insertelement <4 x float> %t8, float %t11, i32 2
+  %t13 = extractelement <4 x float> %arg, i32 3
+  %t14 = extractelement <4 x float> %arg1, i32 3
+  %t15 = fadd float %t13, %t14
+  %t16 = insertelement <4 x float> %t12, float %t15, i32 3
+  ret <4 x float> %t16
+}
+
+; PR42022 - https://bugs.llvm.org/show_bug.cgi?id=42022
+
+%struct.Vector4 = type { float, float, float, float }
+
+define { <2 x float>, <2 x float> } @add_aggregate(<2 x float> %a0, <2 x float> %a1, <2 x float> %b0, <2 x float> %b1) {
+; CHECK-LABEL: @add_aggregate(
+; CHECK-NEXT:    [[TMP1:%.*]] = fadd <2 x float> [[A0:%.*]], [[B0:%.*]]
+; CHECK-NEXT:    [[TMP2:%.*]] = fadd <2 x float> [[A1:%.*]], [[B1:%.*]]
+; CHECK-NEXT:    [[FCA_0_INSERT:%.*]] = insertvalue { <2 x float>, <2 x float> } undef, <2 x float> [[TMP1]], 0
+; CHECK-NEXT:    [[FCA_1_INSERT:%.*]] = insertvalue { <2 x float>, <2 x float> } [[FCA_0_INSERT]], <2 x float> [[TMP2]], 1
+; CHECK-NEXT:    ret { <2 x float>, <2 x float> } [[FCA_1_INSERT]]
+;
+  %a00 = extractelement <2 x float> %a0, i32 0
+  %b00 = extractelement <2 x float> %b0, i32 0
+  %add = fadd float %a00, %b00
+  %retval.0.0.insert = insertelement <2 x float> poison, float %add, i32 0
+  %a01 = extractelement <2 x float> %a0, i32 1
+  %b01 = extractelement <2 x float> %b0, i32 1
+  %add4 = fadd float %a01, %b01
+  %retval.0.1.insert = insertelement <2 x float> %retval.0.0.insert, float %add4, i32 1
+  %a10 = extractelement <2 x float> %a1, i32 0
+  %b10 = extractelement <2 x float> %b1, i32 0
+  %add7 = fadd float %a10, %b10
+  %retval.1.0.insert = insertelement <2 x float> poison, float %add7, i32 0
+  %a11 = extractelement <2 x float> %a1, i32 1
+  %b11 = extractelement <2 x float> %b1, i32 1
+  %add10 = fadd float %a11, %b11
+  %retval.1.1.insert = insertelement <2 x float> %retval.1.0.insert, float %add10, i32 1
+  %fca.0.insert = insertvalue { <2 x float>, <2 x float> } undef, <2 x float> %retval.0.1.insert, 0
+  %fca.1.insert = insertvalue { <2 x float>, <2 x float> } %fca.0.insert, <2 x float> %retval.1.1.insert, 1
+  ret { <2 x float>, <2 x float> } %fca.1.insert
+}
+
+define void @add_aggregate_store(<2 x float> %a0, <2 x float> %a1, <2 x float> %b0, <2 x float> %b1, %struct.Vector4* nocapture dereferenceable(16) %r) {
+; CHECK-LABEL: @add_aggregate_store(
+; CHECK-NEXT:    [[TMP1:%.*]] = fadd <2 x float> [[A0:%.*]], [[B0:%.*]]
+; CHECK-NEXT:    [[TMP2:%.*]] = fadd <2 x float> [[A1:%.*]], [[B1:%.*]]
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast %struct.Vector4* [[R:%.*]] to <4 x float>*
+; CHECK-NEXT:    store <4 x float> [[TMP3]], <4 x float>* [[TMP4]], align 4
+; CHECK-NEXT:    ret void
+;
+  %a00 = extractelement <2 x float> %a0, i32 0
+  %b00 = extractelement <2 x float> %b0, i32 0
+  %add = fadd float %a00, %b00
+  %r0 = getelementptr inbounds %struct.Vector4, %struct.Vector4* %r, i64 0, i32 0
+  store float %add, float* %r0, align 4
+  %a01 = extractelement <2 x float> %a0, i32 1
+  %b01 = extractelement <2 x float> %b0, i32 1
+  %add4 = fadd float %a01, %b01
+  %r1 = getelementptr inbounds %struct.Vector4, %struct.Vector4* %r, i64 0, i32 1
+  store float %add4, float* %r1, align 4
+  %a10 = extractelement <2 x float> %a1, i32 0
+  %b10 = extractelement <2 x float> %b1, i32 0
+  %add7 = fadd float %a10, %b10
+  %r2 = getelementptr inbounds %struct.Vector4, %struct.Vector4* %r, i64 0, i32 2
+  store float %add7, float* %r2, align 4
+  %a11 = extractelement <2 x float> %a1, i32 1
+  %b11 = extractelement <2 x float> %b1, i32 1
+  %add10 = fadd float %a11, %b11
+  %r3 = getelementptr inbounds %struct.Vector4, %struct.Vector4* %r, i64 0, i32 3
+  store float %add10, float* %r3, align 4
+  ret void
+}
--- a/llvm/test/Transforms/PhaseOrdering/X86/horiz-math-inseltpoison.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/horiz-math-inseltpoison.ll
@ -0,0 +1,153 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -O3                   -S < %s  | FileCheck %s
+; RUN: opt -passes='default<O3>' -S < %s  | FileCheck %s
+
+target triple = "x86_64--"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+; PR41813 - https://bugs.llvm.org/show_bug.cgi?id=41813
+
+define <4 x float> @hadd_reverse_v4f32(<4 x float> %a, <4 x float> %b) #0 {
+; CHECK-LABEL: @hadd_reverse_v4f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x i32> <i32 3, i32 1, i32 7, i32 5>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> <i32 2, i32 0, i32 6, i32 4>
+; CHECK-NEXT:    [[TMP3:%.*]] = fadd <4 x float> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret <4 x float> [[TMP3]]
+;
+  %shuffle = shufflevector <4 x float> %a, <4 x float> %a, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+  %shuffle1 = shufflevector <4 x float> %b, <4 x float> %b, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+  %vecext = extractelement <4 x float> %shuffle, i32 0
+  %vecext2 = extractelement <4 x float> %shuffle, i32 1
+  %add = fadd float %vecext, %vecext2
+  %vecinit = insertelement <4 x float> poison, float %add, i32 0
+  %vecext3 = extractelement <4 x float> %shuffle, i32 2
+  %vecext4 = extractelement <4 x float> %shuffle, i32 3
+  %add5 = fadd float %vecext3, %vecext4
+  %vecinit6 = insertelement <4 x float> %vecinit, float %add5, i32 1
+  %vecext7 = extractelement <4 x float> %shuffle1, i32 0
+  %vecext8 = extractelement <4 x float> %shuffle1, i32 1
+  %add9 = fadd float %vecext7, %vecext8
+  %vecinit10 = insertelement <4 x float> %vecinit6, float %add9, i32 2
+  %vecext11 = extractelement <4 x float> %shuffle1, i32 2
+  %vecext12 = extractelement <4 x float> %shuffle1, i32 3
+  %add13 = fadd float %vecext11, %vecext12
+  %vecinit14 = insertelement <4 x float> %vecinit10, float %add13, i32 3
+  ret <4 x float> %vecinit14
+}
+
+define <4 x float> @reverse_hadd_v4f32(<4 x float> %a, <4 x float> %b) #0 {
+; CHECK-LABEL: @reverse_hadd_v4f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[B:%.*]], <4 x float> [[A:%.*]], <4 x i32> <i32 2, i32 0, i32 6, i32 4>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[B]], <4 x float> [[A]], <4 x i32> <i32 3, i32 1, i32 7, i32 5>
+; CHECK-NEXT:    [[TMP3:%.*]] = fadd <4 x float> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret <4 x float> [[TMP3]]
+;
+  %vecext = extractelement <4 x float> %a, i32 0
+  %vecext1 = extractelement <4 x float> %a, i32 1
+  %add = fadd float %vecext, %vecext1
+  %vecinit = insertelement <4 x float> poison, float %add, i32 0
+  %vecext2 = extractelement <4 x float> %a, i32 2
+  %vecext3 = extractelement <4 x float> %a, i32 3
+  %add4 = fadd float %vecext2, %vecext3
+  %vecinit5 = insertelement <4 x float> %vecinit, float %add4, i32 1
+  %vecext6 = extractelement <4 x float> %b, i32 0
+  %vecext7 = extractelement <4 x float> %b, i32 1
+  %add8 = fadd float %vecext6, %vecext7
+  %vecinit9 = insertelement <4 x float> %vecinit5, float %add8, i32 2
+  %vecext10 = extractelement <4 x float> %b, i32 2
+  %vecext11 = extractelement <4 x float> %b, i32 3
+  %add12 = fadd float %vecext10, %vecext11
+  %vecinit13 = insertelement <4 x float> %vecinit9, float %add12, i32 3
+  %shuffle = shufflevector <4 x float> %vecinit13, <4 x float> %a, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+  ret <4 x float> %shuffle
+}
+
+define <8 x float> @hadd_reverse_v8f32(<8 x float> %a, <8 x float> %b) #0 {
+; CHECK-LABEL: @hadd_reverse_v8f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x i32> <i32 7, i32 5, i32 15, i32 13, i32 3, i32 1, i32 11, i32 9>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> <i32 6, i32 4, i32 14, i32 12, i32 2, i32 0, i32 10, i32 8>
+; CHECK-NEXT:    [[TMP3:%.*]] = fadd <8 x float> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret <8 x float> [[TMP3]]
+;
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %a, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+  %shuffle1 = shufflevector <8 x float> %b, <8 x float> %b, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+  %vecext = extractelement <8 x float> %shuffle, i32 0
+  %vecext2 = extractelement <8 x float> %shuffle, i32 1
+  %add = fadd float %vecext, %vecext2
+  %vecinit = insertelement <8 x float> poison, float %add, i32 0
+  %vecext3 = extractelement <8 x float> %shuffle, i32 2
+  %vecext4 = extractelement <8 x float> %shuffle, i32 3
+  %add5 = fadd float %vecext3, %vecext4
+  %vecinit6 = insertelement <8 x float> %vecinit, float %add5, i32 1
+  %vecext7 = extractelement <8 x float> %shuffle1, i32 0
+  %vecext8 = extractelement <8 x float> %shuffle1, i32 1
+  %add9 = fadd float %vecext7, %vecext8
+  %vecinit10 = insertelement <8 x float> %vecinit6, float %add9, i32 2
+  %vecext11 = extractelement <8 x float> %shuffle1, i32 2
+  %vecext12 = extractelement <8 x float> %shuffle1, i32 3
+  %add13 = fadd float %vecext11, %vecext12
+  %vecinit14 = insertelement <8 x float> %vecinit10, float %add13, i32 3
+  %vecext15 = extractelement <8 x float> %shuffle, i32 4
+  %vecext16 = extractelement <8 x float> %shuffle, i32 5
+  %add17 = fadd float %vecext15, %vecext16
+  %vecinit18 = insertelement <8 x float> %vecinit14, float %add17, i32 4
+  %vecext19 = extractelement <8 x float> %shuffle, i32 6
+  %vecext20 = extractelement <8 x float> %shuffle, i32 7
+  %add21 = fadd float %vecext19, %vecext20
+  %vecinit22 = insertelement <8 x float> %vecinit18, float %add21, i32 5
+  %vecext23 = extractelement <8 x float> %shuffle1, i32 4
+  %vecext24 = extractelement <8 x float> %shuffle1, i32 5
+  %add25 = fadd float %vecext23, %vecext24
+  %vecinit26 = insertelement <8 x float> %vecinit22, float %add25, i32 6
+  %vecext27 = extractelement <8 x float> %shuffle1, i32 6
+  %vecext28 = extractelement <8 x float> %shuffle1, i32 7
+  %add29 = fadd float %vecext27, %vecext28
+  %vecinit30 = insertelement <8 x float> %vecinit26, float %add29, i32 7
+  ret <8 x float> %vecinit30
+}
+
+define <8 x float> @reverse_hadd_v8f32(<8 x float> %a, <8 x float> %b) #0 {
+; CHECK-LABEL: @reverse_hadd_v8f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 6, i32 12, i32 14>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 7, i32 13, i32 15>
+; CHECK-NEXT:    [[TMP3:%.*]] = fadd <8 x float> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    ret <8 x float> [[SHUFFLE]]
+;
+  %vecext = extractelement <8 x float> %a, i32 0
+  %vecext1 = extractelement <8 x float> %a, i32 1
+  %add = fadd float %vecext, %vecext1
+  %vecinit = insertelement <8 x float> poison, float %add, i32 0
+  %vecext2 = extractelement <8 x float> %a, i32 2
+  %vecext3 = extractelement <8 x float> %a, i32 3
+  %add4 = fadd float %vecext2, %vecext3
+  %vecinit5 = insertelement <8 x float> %vecinit, float %add4, i32 1
+  %vecext6 = extractelement <8 x float> %b, i32 0
+  %vecext7 = extractelement <8 x float> %b, i32 1
+  %add8 = fadd float %vecext6, %vecext7
+  %vecinit9 = insertelement <8 x float> %vecinit5, float %add8, i32 2
+  %vecext10 = extractelement <8 x float> %b, i32 2
+  %vecext11 = extractelement <8 x float> %b, i32 3
+  %add12 = fadd float %vecext10, %vecext11
+  %vecinit13 = insertelement <8 x float> %vecinit9, float %add12, i32 3
+  %vecext14 = extractelement <8 x float> %a, i32 4
+  %vecext15 = extractelement <8 x float> %a, i32 5
+  %add16 = fadd float %vecext14, %vecext15
+  %vecinit17 = insertelement <8 x float> %vecinit13, float %add16, i32 4
+  %vecext18 = extractelement <8 x float> %a, i32 6
+  %vecext19 = extractelement <8 x float> %a, i32 7
+  %add20 = fadd float %vecext18, %vecext19
+  %vecinit21 = insertelement <8 x float> %vecinit17, float %add20, i32 5
+  %vecext22 = extractelement <8 x float> %b, i32 4
+  %vecext23 = extractelement <8 x float> %b, i32 5
+  %add24 = fadd float %vecext22, %vecext23
+  %vecinit25 = insertelement <8 x float> %vecinit21, float %add24, i32 6
+  %vecext26 = extractelement <8 x float> %b, i32 6
+  %vecext27 = extractelement <8 x float> %b, i32 7
+  %add28 = fadd float %vecext26, %vecext27
+  %vecinit29 = insertelement <8 x float> %vecinit25, float %add28, i32 7
+  %shuffle = shufflevector <8 x float> %vecinit29, <8 x float> %a, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+  ret <8 x float> %shuffle
+}
+
+attributes #0 = { "min-legal-vector-width"="128" "target-cpu"="btver2" "target-features"="+avx,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+sse4a,+ssse3" }
--- a/llvm/test/Transforms/PhaseOrdering/X86/scalarization-inseltpoison.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/scalarization-inseltpoison.ll
@ -0,0 +1,71 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -O3                   -S < %s  | FileCheck %s
+; RUN: opt -passes='default<O3>' -S < %s  | FileCheck %s
+
+target triple = "x86_64--"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+; PR42174 - https://bugs.llvm.org/show_bug.cgi?id=42174
+; This test should match the IR produced by clang after running -mem2reg.
+; All math before the final 'add' should be scalarized.
+
+define <4 x i32> @square(<4 x i32> %num, i32 %y, i32 %x, i32 %h, i32 %k, i32 %w, i32 %p, i32 %j, i32 %u) {
+; CHECK-LABEL: @square(
+; CHECK-NEXT:    [[DIV:%.*]] = sdiv i32 [[K:%.*]], 2
+; CHECK-NEXT:    [[MUL:%.*]] = mul nsw i32 [[P:%.*]], 6234
+; CHECK-NEXT:    [[MUL5:%.*]] = mul nsw i32 [[H:%.*]], 75
+; CHECK-NEXT:    [[DIV9:%.*]] = sdiv i32 [[J:%.*]], 3452
+; CHECK-NEXT:    [[MUL13:%.*]] = mul nsw i32 [[W:%.*]], 53
+; CHECK-NEXT:    [[DIV17:%.*]] = sdiv i32 [[X:%.*]], 820
+; CHECK-NEXT:    [[MUL21:%.*]] = shl nsw i32 [[U:%.*]], 2
+; CHECK-NEXT:    [[DOTSCALAR:%.*]] = add i32 [[Y:%.*]], 1
+; CHECK-NEXT:    [[DOTSCALAR1:%.*]] = add i32 [[DOTSCALAR]], [[DIV17]]
+; CHECK-NEXT:    [[DOTSCALAR2:%.*]] = add i32 [[DOTSCALAR1]], [[MUL5]]
+; CHECK-NEXT:    [[DOTSCALAR3:%.*]] = add i32 [[DOTSCALAR2]], [[DIV]]
+; CHECK-NEXT:    [[DOTSCALAR4:%.*]] = add i32 [[DOTSCALAR3]], [[MUL13]]
+; CHECK-NEXT:    [[DOTSCALAR5:%.*]] = add i32 [[DOTSCALAR4]], [[MUL]]
+; CHECK-NEXT:    [[DOTSCALAR6:%.*]] = add i32 [[DOTSCALAR5]], [[DIV9]]
+; CHECK-NEXT:    [[DOTSCALAR7:%.*]] = add i32 [[DOTSCALAR6]], [[MUL21]]
+; CHECK-NEXT:    [[DOTSCALAR8:%.*]] = add i32 [[DOTSCALAR7]], 317425
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[DOTSCALAR8]], i64 0
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[ADD29:%.*]] = add <4 x i32> [[TMP2]], [[NUM:%.*]]
+; CHECK-NEXT:    ret <4 x i32> [[ADD29]]
+;
+  %add = add <4 x i32> %num, <i32 1, i32 1, i32 1, i32 1>
+  %div = sdiv i32 %k, 2
+  %splatinsert = insertelement <4 x i32> poison, i32 %div, i32 0
+  %splat = shufflevector <4 x i32> %splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
+  %add1 = add <4 x i32> %add, %splat
+  %mul = mul nsw i32 %p, 6234
+  %splatinsert2 = insertelement <4 x i32> poison, i32 %mul, i32 0
+  %splat3 = shufflevector <4 x i32> %splatinsert2, <4 x i32> undef, <4 x i32> zeroinitializer
+  %add4 = add <4 x i32> %add1, %splat3
+  %mul5 = mul nsw i32 75, %h
+  %splatinsert6 = insertelement <4 x i32> poison, i32 %mul5, i32 0
+  %splat7 = shufflevector <4 x i32> %splatinsert6, <4 x i32> undef, <4 x i32> zeroinitializer
+  %add8 = add <4 x i32> %add4, %splat7
+  %div9 = sdiv i32 %j, 3452
+  %splatinsert10 = insertelement <4 x i32> poison, i32 %div9, i32 0
+  %splat11 = shufflevector <4 x i32> %splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
+  %add12 = add <4 x i32> %add8, %splat11
+  %mul13 = mul nsw i32 53, %w
+  %splatinsert14 = insertelement <4 x i32> poison, i32 %mul13, i32 0
+  %splat15 = shufflevector <4 x i32> %splatinsert14, <4 x i32> undef, <4 x i32> zeroinitializer
+  %add16 = add <4 x i32> %add12, %splat15
+  %div17 = sdiv i32 %x, 820
+  %splatinsert18 = insertelement <4 x i32> poison, i32 %div17, i32 0
+  %splat19 = shufflevector <4 x i32> %splatinsert18, <4 x i32> undef, <4 x i32> zeroinitializer
+  %add20 = add <4 x i32> %add16, %splat19
+  %mul21 = mul nsw i32 4, %u
+  %splatinsert22 = insertelement <4 x i32> poison, i32 %mul21, i32 0
+  %splat23 = shufflevector <4 x i32> %splatinsert22, <4 x i32> undef, <4 x i32> zeroinitializer
+  %add24 = add <4 x i32> %add20, %splat23
+  %splatinsert25 = insertelement <4 x i32> poison, i32 %y, i32 0
+  %splat26 = shufflevector <4 x i32> %splatinsert25, <4 x i32> undef, <4 x i32> zeroinitializer
+  %add27 = add <4 x i32> %add24, %splat26
+  %add28 = add <4 x i32> %add27, <i32 25, i32 25, i32 25, i32 25>
+  %add29 = add <4 x i32> %add28, <i32 317400, i32 317400, i32 317400, i32 317400>
+  ret <4 x i32> %add29
+}
+
--- a/llvm/test/Transforms/PhaseOrdering/vector-trunc-inseltpoison.ll
+++ b/llvm/test/Transforms/PhaseOrdering/vector-trunc-inseltpoison.ll
@ -0,0 +1,23 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -O2                   -S -data-layout="e" < %s | FileCheck %s
+; RUN: opt -passes='default<O2>' -S -data-layout="e" < %s | FileCheck %s
+
+define <4 x i16> @truncate(<4 x i32> %x) {
+; CHECK-LABEL: @truncate(
+; CHECK-NEXT:    [[V3:%.*]] = trunc <4 x i32> [[X:%.*]] to <4 x i16>
+; CHECK-NEXT:    ret <4 x i16> [[V3]]
+;
+  %x0 = extractelement <4 x i32> %x, i32 0
+  %t0 = trunc i32 %x0 to i16
+  %v0 = insertelement <4 x i16> poison, i16 %t0, i32 0
+  %x1 = extractelement <4 x i32> %x, i32 1
+  %t1 = trunc i32 %x1 to i16
+  %v1 = insertelement <4 x i16> %v0, i16 %t1, i32 1
+  %x2 = extractelement <4 x i32> %x, i32 2
+  %t2 = trunc i32 %x2 to i16
+  %v2 = insertelement <4 x i16> %v1, i16 %t2, i32 2
+  %x3 = extractelement <4 x i32> %x, i32 3
+  %t3 = trunc i32 %x3 to i16
+  %v3 = insertelement <4 x i16> %v2, i16 %t3, i32 3
+  ret <4 x i16> %v3
+}
--- a/llvm/test/Transforms/RewriteStatepointsForGC/base-vector-inseltpoison.ll
+++ b/llvm/test/Transforms/RewriteStatepointsForGC/base-vector-inseltpoison.ll
@ -0,0 +1,279 @@
+; RUN: opt < %s -rewrite-statepoints-for-gc -S | FileCheck  %s
+; RUN: opt < %s -passes=rewrite-statepoints-for-gc -S | FileCheck  %s
+
+
+define i64 addrspace(1)* @test(<2 x i64 addrspace(1)*> %vec, i32 %idx) gc "statepoint-example" {
+; CHECK-LABEL: @test
+; CHECK: extractelement
+; CHECK: extractelement
+; CHECK: statepoint
+; CHECK: gc.relocate
+; CHECK-DAG: ; (%base_ee, %obj)
+; CHECK: gc.relocate
+; CHECK-DAG: ; (%base_ee, %base_ee)
+; Note that the second extractelement is actually redundant here.  A correct output would
+; be to reuse the existing obj as a base since it is actually a base pointer.
+entry:
+  %obj = extractelement <2 x i64 addrspace(1)*> %vec, i32 %idx
+  call void @do_safepoint() [ "deopt"() ]
+  ret i64 addrspace(1)* %obj
+}
+
+define i64 addrspace(1)* @test2(<2 x i64 addrspace(1)*>* %ptr, i1 %cnd, i32 %idx1, i32 %idx2) gc "statepoint-example" {
+; CHECK-LABEL: test2
+entry:
+  br i1 %cnd, label %taken, label %untaken
+
+taken:                                            ; preds = %entry
+  %obja = load <2 x i64 addrspace(1)*>, <2 x i64 addrspace(1)*>* %ptr
+  br label %merge
+
+untaken:                                          ; preds = %entry
+  %objb = load <2 x i64 addrspace(1)*>, <2 x i64 addrspace(1)*>* %ptr
+  br label %merge
+
+merge:                                            ; preds = %untaken, %taken
+  %vec = phi <2 x i64 addrspace(1)*> [ %obja, %taken ], [ %objb, %untaken ]
+  br i1 %cnd, label %taken2, label %untaken2
+
+taken2:                                           ; preds = %merge
+  %obj0 = extractelement <2 x i64 addrspace(1)*> %vec, i32 %idx1
+  br label %merge2
+
+untaken2:                                         ; preds = %merge
+  %obj1 = extractelement <2 x i64 addrspace(1)*> %vec, i32 %idx2
+  br label %merge2
+
+merge2:                                           ; preds = %untaken2, %taken2
+; CHECK-LABEL: merge2:
+; CHECK: %obj.base = phi i64 addrspace(1)*
+; CHECK: %obj = phi i64 addrspace(1)*
+; CHECK: statepoint
+; CHECK: gc.relocate
+; CHECK-DAG: ; (%obj.base, %obj)
+; CHECK: gc.relocate
+; CHECK-DAG: ; (%obj.base, %obj.base)
+  %obj = phi i64 addrspace(1)* [ %obj0, %taken2 ], [ %obj1, %untaken2 ]
+  call void @do_safepoint() [ "deopt"() ]
+  ret i64 addrspace(1)* %obj
+}
+
+define i64 addrspace(1)* @test3(i64 addrspace(1)* %ptr) gc "statepoint-example" {
+; CHECK-LABEL: test3
+; CHECK: insertelement
+; CHECK: extractelement
+; CHECK: statepoint
+; CHECK: gc.relocate
+; CHECK-DAG: (%obj.base, %obj)
+entry:
+  %vec = insertelement <2 x i64 addrspace(1)*> poison, i64 addrspace(1)* %ptr, i32 0
+  %obj = extractelement <2 x i64 addrspace(1)*> %vec, i32 0
+  call void @do_safepoint() [ "deopt"() ]
+  ret i64 addrspace(1)* %obj
+}
+
+define i64 addrspace(1)* @test4(i64 addrspace(1)* %ptr) gc "statepoint-example" {
+; CHECK-LABEL: test4
+; CHECK: statepoint
+; CHECK: gc.relocate
+; CHECK-DAG: ; (%obj.base, %obj)
+; When we can optimize an extractelement from a known
+; index and avoid introducing new base pointer instructions
+entry:
+  %derived = getelementptr i64, i64 addrspace(1)* %ptr, i64 16
+  %veca = insertelement <2 x i64 addrspace(1)*> poison, i64 addrspace(1)* %derived, i32 0
+  %vec = insertelement <2 x i64 addrspace(1)*> %veca, i64 addrspace(1)* %ptr, i32 1
+  %obj = extractelement <2 x i64 addrspace(1)*> %vec, i32 0
+  call void @do_safepoint() [ "deopt"() ]
+  ret i64 addrspace(1)* %obj
+}
+
+declare void @use(i64 addrspace(1)*) "gc-leaf-function"
+declare void @use_vec(<4 x i64 addrspace(1)*>) "gc-leaf-function"
+
+define void @test5(i1 %cnd, i64 addrspace(1)* %obj) gc "statepoint-example" {
+; CHECK-LABEL: @test5
+; CHECK: gc.relocate
+; CHECK-DAG: (%bdv.base, %bdv)
+; When we fundementally have to duplicate
+entry:
+  %gep = getelementptr i64, i64 addrspace(1)* %obj, i64 1
+  %vec = insertelement <2 x i64 addrspace(1)*> poison, i64 addrspace(1)* %gep, i32 0
+  %bdv = extractelement <2 x i64 addrspace(1)*> %vec, i32 0
+  call void @do_safepoint() [ "deopt"(i32 0, i32 -1, i32 0, i32 0, i32 0) ]
+  call void @use(i64 addrspace(1)* %bdv)
+  ret void
+}
+
+define void @test6(i1 %cnd, i64 addrspace(1)* %obj, i64 %idx) gc "statepoint-example" {
+; CHECK-LABEL: @test6
+; CHECK: %gep = getelementptr i64, i64 addrspace(1)* %obj, i64 1
+; CHECK: %vec.base = insertelement <2 x i64 addrspace(1)*> zeroinitializer, i64 addrspace(1)* %obj, i32 0, !is_base_value !0
+; CHECK: %vec = insertelement <2 x i64 addrspace(1)*> poison, i64 addrspace(1)* %gep, i32 0
+; CHECK: %bdv.base = extractelement <2 x i64 addrspace(1)*> %vec.base, i64 %idx, !is_base_value !0
+; CHECK:  %bdv = extractelement <2 x i64 addrspace(1)*> %vec, i64 %idx
+; CHECK: gc.statepoint
+; CHECK: gc.relocate
+; CHECK-DAG: (%bdv.base, %bdv)
+; A more complicated example involving vector and scalar bases.
+; This is derived from a failing test case when we didn't have correct
+; insertelement handling.
+entry:
+  %gep = getelementptr i64, i64 addrspace(1)* %obj, i64 1
+  %vec = insertelement <2 x i64 addrspace(1)*> poison, i64 addrspace(1)* %gep, i32 0
+  %bdv = extractelement <2 x i64 addrspace(1)*> %vec, i64 %idx
+  call void @do_safepoint() [ "deopt"(i32 0, i32 -1, i32 0, i32 0, i32 0) ]
+  call void @use(i64 addrspace(1)* %bdv)
+  ret void
+}
+
+define i64 addrspace(1)* @test7(i1 %cnd, i64 addrspace(1)* %obj, i64 addrspace(1)* %obj2) gc "statepoint-example" {
+; CHECK-LABEL: @test7
+entry:
+  %vec = insertelement <2 x i64 addrspace(1)*> poison, i64 addrspace(1)* %obj2, i32 0
+  br label %merge1
+
+merge1:                                           ; preds = %merge1, %entry
+; CHECK-LABEL: merge1:
+; CHECK: vec2.base
+; CHECK: vec2
+; CHECK: gep
+; CHECK: vec3.base
+; CHECK: vec3
+  %vec2 = phi <2 x i64 addrspace(1)*> [ %vec, %entry ], [ %vec3, %merge1 ]
+  %gep = getelementptr i64, i64 addrspace(1)* %obj2, i64 1
+  %vec3 = insertelement <2 x i64 addrspace(1)*> poison, i64 addrspace(1)* %gep, i32 0
+  br i1 %cnd, label %merge1, label %next1
+
+next1:                                            ; preds = %merge1
+; CHECK-LABEL: next1:
+; CHECK: bdv.base = 
+; CHECK: bdv = 
+  %bdv = extractelement <2 x i64 addrspace(1)*> %vec2, i32 0
+  br label %merge
+
+merge:                                            ; preds = %merge, %next1
+; CHECK-LABEL: merge:
+; CHECK: %objb.base
+; CHECK: %objb
+; CHECK: gc.statepoint
+; CHECK: gc.relocate
+; CHECK-DAG: (%objb.base, %objb)
+  %objb = phi i64 addrspace(1)* [ %obj, %next1 ], [ %bdv, %merge ]
+  br i1 %cnd, label %merge, label %next
+
+next:                                             ; preds = %merge
+  call void @do_safepoint() [ "deopt"(i32 0, i32 -1, i32 0, i32 0, i32 0) ]
+  ret i64 addrspace(1)* %objb
+}
+
+; identify base for shufflevector
+define void @test8(i64 addrspace(1)* %obj, i64 %idx) gc "statepoint-example" {
+; CHECK-LABEL: @test8
+; CHECK: %gep = getelementptr i64, i64 addrspace(1)* %obj, i64 1
+; CHECK: %gep2 = getelementptr i64, i64 addrspace(1)* %obj, i64 2
+; CHECK: %vec1.base = insertelement <4 x i64 addrspace(1)*> zeroinitializer, i64 addrspace(1)* %obj, i32 0, !is_base_value !0
+; CHECK: %vec1 = insertelement <4 x i64 addrspace(1)*> poison, i64 addrspace(1)* %gep, i32 0
+; CHECK: %vec2.base = insertelement <4 x i64 addrspace(1)*> zeroinitializer, i64 addrspace(1)* %obj, i32 2, !is_base_value !0
+; CHECK: %vec2 = insertelement <4 x i64 addrspace(1)*> poison, i64 addrspace(1)* %gep2, i32 2
+; CHECK: %vec.base = shufflevector <4 x i64 addrspace(1)*> %vec1.base, <4 x i64 addrspace(1)*> %vec2.base, <2 x i32> <i32 0, i32 2>, !is_base_value !0
+; CHECK: %vec = shufflevector <4 x i64 addrspace(1)*> %vec1, <4 x i64 addrspace(1)*> %vec2, <2 x i32> <i32 0, i32 2>
+; CHECK: %bdv.base = extractelement <2 x i64 addrspace(1)*> %vec.base, i64 %idx, !is_base_value !0
+; CHECK: %bdv = extractelement <2 x i64 addrspace(1)*> %vec, i64 %idx
+; CHECK: gc.statepoint
+; CHECK: gc.relocate
+; CHECK-DAG: (%bdv.base, %bdv)
+entry:
+  %gep = getelementptr i64, i64 addrspace(1)* %obj, i64 1
+  %gep2 = getelementptr i64, i64 addrspace(1)* %obj, i64 2
+  %vec1 = insertelement <4 x i64 addrspace(1)*> poison, i64 addrspace(1)* %gep, i32 0
+  %vec2 = insertelement <4 x i64 addrspace(1)*> poison, i64 addrspace(1)* %gep2, i32 2
+  %vec = shufflevector <4 x i64 addrspace(1)*> %vec1, <4 x i64 addrspace(1)*> %vec2, <2 x i32> <i32 0, i32 2>
+  %bdv = extractelement <2 x i64 addrspace(1)*> %vec, i64 %idx
+  call void @do_safepoint() [ "deopt"(i32 0, i32 -1, i32 0, i32 0, i32 0) ]
+  call void @use(i64 addrspace(1)* %bdv)
+  ret void
+}
+
+; Since the same 'base' vector is used in the shuffle operands, we do not need
+; create a shufflevector base.
+define void @test9(<4 x i64 addrspace(1)*> %vec1, i64 %idx) gc "statepoint-example" {
+; CHECK-LABEL: @test9
+; CHECK: %vec = shufflevector <4 x i64 addrspace(1)*> %vec1, <4 x i64 addrspace(1)*> %vec1, <2 x i32> <i32 0, i32 2>
+; CHECK: %base_ee = extractelement <4 x i64 addrspace(1)*> %vec1, i64 %idx, !is_base_value !0
+; CHECK: %bdv = extractelement <2 x i64 addrspace(1)*> %vec, i64 %idx
+; CHECK: gc.statepoint
+; CHECK: gc.relocate
+; CHECK-DAG: (%base_ee, %bdv)
+entry:
+ ; shrinking vec1 into vec
+  %vec = shufflevector <4 x i64 addrspace(1)*> %vec1, <4 x i64 addrspace(1)*> %vec1, <2 x i32> <i32 0, i32 2>
+  %bdv = extractelement <2 x i64 addrspace(1)*> %vec, i64 %idx
+  call void @do_safepoint() [ "deopt"(i32 0, i32 -1, i32 0, i32 0, i32 0) ]
+  call void @use(i64 addrspace(1)* %bdv)
+  ret void
+}
+
+; vector operand of shufflevector is a phi
+define i64 addrspace(1)* @test10(i1 %cnd, i64 addrspace(1)* %obj, i64 addrspace(1)* %obj2) gc "statepoint-example" {
+; CHECK-LABEL: @test10
+entry:
+  %vec1 = insertelement <4 x i64 addrspace(1)*> poison, i64 addrspace(1)* %obj, i32 0
+  br i1 %cnd, label %here, label %merge
+
+here:
+  %vec2 = insertelement <4 x i64 addrspace(1)*> poison, i64 addrspace(1)* %obj2, i32 2
+  br label %merge
+
+merge:                                           ; preds = %merge, %entry, %here
+; CHECK-LABEL: merge:
+; CHECK: %vec.base = phi <4 x i64 addrspace(1)*> [ %vec1.base, %entry ], [ %vec2.base, %here ], [ %vec3.base, %merge ], !is_base_value !0
+; CHECK: vec
+; CHECK: vec3.base = shufflevector <4 x i64 addrspace(1)*> %vec.base, <4 x i64 addrspace(1)*> %vec.base
+; CHECK: vec3
+; CHECK: bdv.base
+; CHECK: bdv
+  %vec = phi <4 x i64 addrspace(1)*> [ %vec1, %entry ], [ %vec2, %here], [ %vec3, %merge]
+  %vec3 = shufflevector <4 x i64 addrspace(1)*> %vec, <4 x i64 addrspace(1)*> %vec, <4 x i32> <i32 2, i32 0, i32 1, i32 3>
+  %bdv = extractelement <4 x i64 addrspace(1)*> %vec3, i32 0
+  br i1 %cnd, label %merge, label %next
+
+next:
+; CHECK-LABEL: next:
+; CHECK: gc.statepoint
+; CHECK: gc.relocate
+; CHECK-DAG: (%bdv.base, %bdv)
+  call void @do_safepoint() [ "deopt"(i32 0, i32 -1, i32 0, i32 0, i32 0) ]
+  ret i64 addrspace(1)* %bdv
+}
+declare void @do_safepoint()
+
+define void @test11(<4 x i64 addrspace(1)*> %vec1) gc "statepoint-example" {
+; CHECK-LABEL: @test11(
+; CHECK: @llvm.experimental.gc.statepoint.p0f_isVoidf{{.*}}<4 x i64 addrspace(1)*> %vec1)
+; CHECK: %vec1.relocated = call coldcc <4 x i8 addrspace(1)*> @llvm.experimental.gc.relocate.v4p1i8
+; CHECK: %vec1.relocated.casted = bitcast <4 x i8 addrspace(1)*> %vec1.relocated to <4 x i64 addrspace(1)*>
+; CHECK: %vec2.remat = getelementptr i64, <4 x i64 addrspace(1)*> %vec1.relocated.casted, i32 1024
+; CHECK: call void @use_vec(<4 x i64 addrspace(1)*> %vec2.remat)
+entry:
+  %vec2 = getelementptr i64, <4 x i64 addrspace(1)*> %vec1, i32 1024
+  call void @do_safepoint() [ "deopt"(i32 0, i32 -1, i32 0, i32 0, i32 0) ]
+  call void @use_vec(<4 x i64 addrspace(1) *> %vec2)
+  ret void
+}
+
+declare <4 x i64 addrspace(1)*> @def_vec() "gc-leaf-function"
+
+define void @test12(<4 x i64 addrspace(1)*> %vec1) gc "statepoint-example" {
+; CHECK-LABEL: @test12(
+; CHECK: @llvm.experimental.gc.statepoint.p0f_isVoidf{{.*}}<4 x i64 addrspace(1)*> %vec)
+; CHECK-NEXT: %vec.relocated = call coldcc <4 x i8 addrspace(1)*> @llvm.experimental.gc.relocate.v4p1i8(
+; CHECK-NEXT: %vec.relocated.casted = bitcast <4 x i8 addrspace(1)*> %vec.relocated to <4 x i64 addrspace(1)*>
+; CHECK-NEXT: call void @use_vec(<4 x i64 addrspace(1)*> %vec.relocated.casted)
+; CHECK-NEXT: ret void
+entry:
+  %vec = call <4 x i64 addrspace(1)*> @def_vec()
+  call void @do_safepoint() [ "deopt"() ]
+  call void @use_vec(<4 x i64 addrspace(1)*> %vec)
+  ret void
+}
--- a/llvm/test/Transforms/RewriteStatepointsForGC/check_traversal_order-inseltpoison.ll
+++ b/llvm/test/Transforms/RewriteStatepointsForGC/check_traversal_order-inseltpoison.ll
@ -0,0 +1,38 @@
+; RUN: opt -S -rewrite-statepoints-for-gc < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128-ni:1"
+target triple = "x86_64-unknown-linux-gnu"
+
+declare void @f()
+declare void @g(i8 addrspace(1)*, i8 addrspace(1)*)
+declare i32 @personality_function()
+
+; Make sure that we do not fail assertion because we process call of @g before
+; we process the call of @f.
+
+define void @test_01(i8 addrspace(1)* %p, i1 %cond) gc "statepoint-example" personality i32 ()* @personality_function {
+
+; CHECK-LABEL: @test_01(
+
+entry:
+  %tmp0 = insertelement <2 x i8 addrspace(1)*> poison, i8 addrspace(1)* %p, i32 0
+  %tmp1 = insertelement <2 x i8 addrspace(1)*> %tmp0, i8 addrspace(1)* %p, i32 1
+  %tmp2 = extractelement <2 x i8 addrspace(1)*> %tmp1, i32 1
+  %tmp3 = extractelement <2 x i8 addrspace(1)*> %tmp1, i32 0
+  br label %loop
+
+loop:
+  br i1 %cond, label %cond_block, label %exit
+
+cond_block:
+  br i1 %cond, label %backedge, label %exit
+
+exit:
+  %tmp4 = phi i8 addrspace(1)* [ %tmp2, %loop ], [ %tmp2, %cond_block ]
+  call void @g(i8 addrspace(1)* %tmp3, i8 addrspace(1)* %tmp4)
+  ret void
+
+backedge:
+  call void @f()
+  br label %loop
+}
--- a/llvm/test/Transforms/RewriteStatepointsForGC/live-vector-nosplit-inseltpoison.ll
+++ b/llvm/test/Transforms/RewriteStatepointsForGC/live-vector-nosplit-inseltpoison.ll
@ -0,0 +1,119 @@
+; Test that we can correctly handle vectors of pointers in statepoint 
+; rewriting.  
+; RUN: opt < %s -rewrite-statepoints-for-gc -S | FileCheck  %s
+; RUN: opt < %s -passes=rewrite-statepoints-for-gc -S | FileCheck  %s
+
+; A non-vector relocation for comparison
+define i64 addrspace(1)* @test(i64 addrspace(1)* %obj) gc "statepoint-example" {
+; CHECK-LABEL: test
+; CHECK: gc.statepoint
+; CHECK-NEXT: gc.relocate
+; CHECK-NEXT: bitcast
+; CHECK-NEXT: ret i64 addrspace(1)*
+; A base vector from a argument
+entry:
+  call void @do_safepoint() [ "deopt"() ]
+  ret i64 addrspace(1)* %obj
+}
+
+; A vector argument
+define <2 x i64 addrspace(1)*> @test2(<2 x i64 addrspace(1)*> %obj) gc "statepoint-example" {
+; CHECK-LABEL: test2
+; CHECK-NEXT: gc.statepoint
+; CHECK-NEXT: gc.relocate
+; CHECK-NEXT: bitcast
+; CHECK-NEXT: ret <2 x i64 addrspace(1)*>
+  call void @do_safepoint() [ "deopt"() ]
+  ret <2 x i64 addrspace(1)*> %obj
+}
+
+; A load
+define <2 x i64 addrspace(1)*> @test3(<2 x i64 addrspace(1)*>* %ptr) gc "statepoint-example" {
+; CHECK-LABEL: test3
+; CHECK: load
+; CHECK-NEXT: gc.statepoint
+; CHECK-NEXT: gc.relocate
+; CHECK-NEXT: bitcast
+; CHECK-NEXT: ret <2 x i64 addrspace(1)*>
+entry:
+  %obj = load <2 x i64 addrspace(1)*>, <2 x i64 addrspace(1)*>* %ptr
+  call void @do_safepoint() [ "deopt"() ]
+  ret <2 x i64 addrspace(1)*> %obj
+}
+
+declare i32 @fake_personality_function()
+
+; When a statepoint is an invoke rather than a call
+define <2 x i64 addrspace(1)*> @test4(<2 x i64 addrspace(1)*>* %ptr) gc "statepoint-example" personality i32 ()* @fake_personality_function {
+; CHECK-LABEL: test4
+; CHECK: load
+; CHECK-NEXT: gc.statepoint
+entry:
+  %obj = load <2 x i64 addrspace(1)*>, <2 x i64 addrspace(1)*>* %ptr
+  invoke void @do_safepoint() [ "deopt"() ]
+          to label %normal_return unwind label %exceptional_return
+
+normal_return:                                    ; preds = %entry
+; CHECK-LABEL: normal_return:
+; CHECK: gc.relocate
+; CHECK-NEXT: bitcast
+; CHECK-NEXT: ret <2 x i64 addrspace(1)*>
+  ret <2 x i64 addrspace(1)*> %obj
+
+exceptional_return:                               ; preds = %entry
+; CHECK-LABEL: exceptional_return:
+; CHECK: gc.relocate
+; CHECK-NEXT: bitcast
+; CHECK-NEXT: ret <2 x i64 addrspace(1)*>
+  %landing_pad4 = landingpad token
+          cleanup
+  ret <2 x i64 addrspace(1)*> %obj
+}
+
+; A newly created vector
+define <2 x i64 addrspace(1)*> @test5(i64 addrspace(1)* %p) gc "statepoint-example" {
+; CHECK-LABEL: test5
+; CHECK: insertelement
+; CHECK-NEXT: insertelement
+; CHECK-NEXT: gc.statepoint
+; CHECK-NEXT: gc.relocate
+; CHECK-NEXT: bitcast
+; CHECK-NEXT: gc.relocate
+; CHECK-NEXT: bitcast
+; CHECK-NEXT: ret <2 x i64 addrspace(1)*> %vec.relocated.casted
+entry:
+  %vec = insertelement <2 x i64 addrspace(1)*> poison, i64 addrspace(1)* %p, i32 0
+  call void @do_safepoint() [ "deopt"() ]
+  ret <2 x i64 addrspace(1)*> %vec
+}
+
+; A merge point
+define <2 x i64 addrspace(1)*> @test6(i1 %cnd, <2 x i64 addrspace(1)*>* %ptr) gc "statepoint-example" {
+; CHECK-LABEL: test6
+entry:
+  br i1 %cnd, label %taken, label %untaken
+
+taken:                                            ; preds = %entry
+  %obja = load <2 x i64 addrspace(1)*>, <2 x i64 addrspace(1)*>* %ptr
+  br label %merge
+
+untaken:                                          ; preds = %entry
+  %objb = load <2 x i64 addrspace(1)*>, <2 x i64 addrspace(1)*>* %ptr
+  br label %merge
+
+merge:                                            ; preds = %untaken, %taken
+; CHECK-LABEL: merge:
+; CHECK-NEXT: = phi
+; CHECK-NEXT: = phi
+; CHECK-NEXT: gc.statepoint
+; CHECK-NEXT: gc.relocate
+; CHECK-NEXT: bitcast
+; CHECK-NEXT: gc.relocate
+; CHECK-NEXT: bitcast
+; CHECK-NEXT: ret <2 x i64 addrspace(1)*>
+  %obj = phi <2 x i64 addrspace(1)*> [ %obja, %taken ], [ %objb, %untaken ]
+  call void @do_safepoint() [ "deopt"() ]
+  ret <2 x i64 addrspace(1)*> %obj
+}
+
+declare void @do_safepoint()
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions-inseltpoison.ll
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/insertelement-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/insertelement-inseltpoison.ll
@ -0,0 +1,44 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -slp-vectorizer -S 2>%t | FileCheck %s
+; RUN: FileCheck --check-prefix=WARN --allow-empty %s <%t
+
+; WARN-NOT: warning
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64-unknown-linux-gnu"
+
+define <2 x float> @insertelement-fixed-vector() {
+; CHECK-LABEL: @insertelement-fixed-vector(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <2 x float> @llvm.fabs.v2f32(<2 x float> undef)
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x float> [[TMP1]], i32 0
+; CHECK-NEXT:    [[I0:%.*]] = insertelement <2 x float> poison, float [[TMP2]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x float> [[TMP1]], i32 1
+; CHECK-NEXT:    [[I1:%.*]] = insertelement <2 x float> [[I0]], float [[TMP3]], i32 1
+; CHECK-NEXT:    ret <2 x float> [[I1]]
+;
+  %f0 = tail call fast float @llvm.fabs.f32(float undef)
+  %f1 = tail call fast float @llvm.fabs.f32(float undef)
+  %i0 = insertelement <2 x float> poison, float %f0, i32 0
+  %i1 = insertelement <2 x float> %i0, float %f1, i32 1
+  ret <2 x float> %i1
+}
+
+; TODO: llvm.fabs could be optimized in vector form. It's legal to extract
+; elements from fixed-length vector and insert into scalable vector.
+define <vscale x 2 x float> @insertelement-scalable-vector() {
+; CHECK-LABEL: @insertelement-scalable-vector(
+; CHECK-NEXT:    [[F0:%.*]] = tail call fast float @llvm.fabs.f32(float undef)
+; CHECK-NEXT:    [[F1:%.*]] = tail call fast float @llvm.fabs.f32(float undef)
+; CHECK-NEXT:    [[I0:%.*]] = insertelement <vscale x 2 x float> poison, float [[F0]], i32 0
+; CHECK-NEXT:    [[I1:%.*]] = insertelement <vscale x 2 x float> [[I0]], float [[F1]], i32 1
+; CHECK-NEXT:    ret <vscale x 2 x float> [[I1]]
+;
+  %f0 = tail call fast float @llvm.fabs.f32(float undef)
+  %f1 = tail call fast float @llvm.fabs.f32(float undef)
+  %i0 = insertelement <vscale x 2 x float> poison, float %f0, i32 0
+  %i1 = insertelement <vscale x 2 x float> %i0, float %f1, i32 1
+  ret <vscale x 2 x float> %i1
+}
+
+; Function Attrs: nounwind readnone speculatable willreturn
+declare float @llvm.fabs.f32(float)
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/transpose-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/transpose-inseltpoison.ll
@ -0,0 +1,294 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -slp-vectorizer -instcombine -S | FileCheck %s
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64--linux-gnu"
+
+define <2 x i64> @build_vec_v2i64(<2 x i64> %v0, <2 x i64> %v1) {
+; CHECK-LABEL: @build_vec_v2i64(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x i64> [[V0:%.*]], <2 x i64> undef, <2 x i32> <i32 1, i32 0>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x i64> [[V1:%.*]], <2 x i64> undef, <2 x i32> <i32 1, i32 0>
+; CHECK-NEXT:    [[TMP3:%.*]] = add <2 x i64> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = sub <2 x i64> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP4]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    [[TMP6:%.*]] = add <2 x i64> [[V0]], [[V1]]
+; CHECK-NEXT:    [[TMP7:%.*]] = sub <2 x i64> [[V0]], [[V1]]
+; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <2 x i64> [[TMP6]], <2 x i64> [[TMP7]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    [[TMP9:%.*]] = add <2 x i64> [[TMP8]], [[TMP5]]
+; CHECK-NEXT:    ret <2 x i64> [[TMP9]]
+;
+  %v0.0 = extractelement <2 x i64> %v0, i32 0
+  %v0.1 = extractelement <2 x i64> %v0, i32 1
+  %v1.0 = extractelement <2 x i64> %v1, i32 0
+  %v1.1 = extractelement <2 x i64> %v1, i32 1
+  %tmp0.0 = add i64 %v0.0, %v1.0
+  %tmp0.1 = add i64 %v0.1, %v1.1
+  %tmp1.0 = sub i64 %v0.0, %v1.0
+  %tmp1.1 = sub i64 %v0.1, %v1.1
+  %tmp2.0 = add i64 %tmp0.0, %tmp0.1
+  %tmp2.1 = add i64 %tmp1.0, %tmp1.1
+  %tmp3.0 = insertelement <2 x i64> poison, i64 %tmp2.0, i32 0
+  %tmp3.1 = insertelement <2 x i64> %tmp3.0, i64 %tmp2.1, i32 1
+  ret <2 x i64> %tmp3.1
+}
+
+define void @store_chain_v2i64(i64* %a, i64* %b, i64* %c) {
+; CHECK-LABEL: @store_chain_v2i64(
+; CHECK-NEXT:    [[A_1:%.*]] = getelementptr i64, i64* [[A:%.*]], i64 1
+; CHECK-NEXT:    [[B_1:%.*]] = getelementptr i64, i64* [[B:%.*]], i64 1
+; CHECK-NEXT:    [[C_1:%.*]] = getelementptr i64, i64* [[C:%.*]], i64 1
+; CHECK-NEXT:    [[V0_0:%.*]] = load i64, i64* [[A]], align 8
+; CHECK-NEXT:    [[V0_1:%.*]] = load i64, i64* [[A_1]], align 8
+; CHECK-NEXT:    [[V1_0:%.*]] = load i64, i64* [[B]], align 8
+; CHECK-NEXT:    [[V1_1:%.*]] = load i64, i64* [[B_1]], align 8
+; CHECK-NEXT:    [[TMP0_0:%.*]] = add i64 [[V0_0]], [[V1_0]]
+; CHECK-NEXT:    [[TMP0_1:%.*]] = add i64 [[V0_1]], [[V1_1]]
+; CHECK-NEXT:    [[TMP1_0:%.*]] = sub i64 [[V0_0]], [[V1_0]]
+; CHECK-NEXT:    [[TMP1_1:%.*]] = sub i64 [[V0_1]], [[V1_1]]
+; CHECK-NEXT:    [[TMP2_0:%.*]] = add i64 [[TMP0_0]], [[TMP0_1]]
+; CHECK-NEXT:    [[TMP2_1:%.*]] = add i64 [[TMP1_0]], [[TMP1_1]]
+; CHECK-NEXT:    store i64 [[TMP2_0]], i64* [[C]], align 8
+; CHECK-NEXT:    store i64 [[TMP2_1]], i64* [[C_1]], align 8
+; CHECK-NEXT:    ret void
+;
+  %a.0 = getelementptr i64, i64* %a, i64 0
+  %a.1 = getelementptr i64, i64* %a, i64 1
+  %b.0 = getelementptr i64, i64* %b, i64 0
+  %b.1 = getelementptr i64, i64* %b, i64 1
+  %c.0 = getelementptr i64, i64* %c, i64 0
+  %c.1 = getelementptr i64, i64* %c, i64 1
+  %v0.0 = load i64, i64* %a.0, align 8
+  %v0.1 = load i64, i64* %a.1, align 8
+  %v1.0 = load i64, i64* %b.0, align 8
+  %v1.1 = load i64, i64* %b.1, align 8
+  %tmp0.0 = add i64 %v0.0, %v1.0
+  %tmp0.1 = add i64 %v0.1, %v1.1
+  %tmp1.0 = sub i64 %v0.0, %v1.0
+  %tmp1.1 = sub i64 %v0.1, %v1.1
+  %tmp2.0 = add i64 %tmp0.0, %tmp0.1
+  %tmp2.1 = add i64 %tmp1.0, %tmp1.1
+  store i64 %tmp2.0, i64* %c.0, align 8
+  store i64 %tmp2.1, i64* %c.1, align 8
+  ret void
+}
+
+define <4 x i32> @build_vec_v4i32(<4 x i32> %v0, <4 x i32> %v1) {
+; CHECK-LABEL: @build_vec_v4i32(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[V0:%.*]], <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[V1:%.*]], <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+; CHECK-NEXT:    [[TMP3:%.*]] = add <4 x i32> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = sub <4 x i32> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; CHECK-NEXT:    [[TMP6:%.*]] = add <4 x i32> [[V0]], [[V1]]
+; CHECK-NEXT:    [[TMP7:%.*]] = sub <4 x i32> [[V0]], [[V1]]
+; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> [[TMP7]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; CHECK-NEXT:    [[TMP9:%.*]] = add <4 x i32> [[TMP8]], [[TMP5]]
+; CHECK-NEXT:    ret <4 x i32> [[TMP9]]
+;
+  %v0.0 = extractelement <4 x i32> %v0, i32 0
+  %v0.1 = extractelement <4 x i32> %v0, i32 1
+  %v0.2 = extractelement <4 x i32> %v0, i32 2
+  %v0.3 = extractelement <4 x i32> %v0, i32 3
+  %v1.0 = extractelement <4 x i32> %v1, i32 0
+  %v1.1 = extractelement <4 x i32> %v1, i32 1
+  %v1.2 = extractelement <4 x i32> %v1, i32 2
+  %v1.3 = extractelement <4 x i32> %v1, i32 3
+  %tmp0.0 = add i32 %v0.0, %v1.0
+  %tmp0.1 = add i32 %v0.1, %v1.1
+  %tmp0.2 = add i32 %v0.2, %v1.2
+  %tmp0.3 = add i32 %v0.3, %v1.3
+  %tmp1.0 = sub i32 %v0.0, %v1.0
+  %tmp1.1 = sub i32 %v0.1, %v1.1
+  %tmp1.2 = sub i32 %v0.2, %v1.2
+  %tmp1.3 = sub i32 %v0.3, %v1.3
+  %tmp2.0 = add i32 %tmp0.0, %tmp0.1
+  %tmp2.1 = add i32 %tmp1.0, %tmp1.1
+  %tmp2.2 = add i32 %tmp0.2, %tmp0.3
+  %tmp2.3 = add i32 %tmp1.2, %tmp1.3
+  %tmp3.0 = insertelement <4 x i32> poison, i32 %tmp2.0, i32 0
+  %tmp3.1 = insertelement <4 x i32> %tmp3.0, i32 %tmp2.1, i32 1
+  %tmp3.2 = insertelement <4 x i32> %tmp3.1, i32 %tmp2.2, i32 2
+  %tmp3.3 = insertelement <4 x i32> %tmp3.2, i32 %tmp2.3, i32 3
+  ret <4 x i32> %tmp3.3
+}
+
+define <4 x i32> @build_vec_v4i32_reuse_0(<2 x i32> %v0, <2 x i32> %v1) {
+; CHECK-LABEL: @build_vec_v4i32_reuse_0(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x i32> [[V0:%.*]], <2 x i32> undef, <2 x i32> <i32 1, i32 0>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x i32> [[V1:%.*]], <2 x i32> undef, <2 x i32> <i32 1, i32 0>
+; CHECK-NEXT:    [[TMP3:%.*]] = add <2 x i32> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = sub <2 x i32> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> [[TMP4]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    [[TMP6:%.*]] = add <2 x i32> [[V0]], [[V1]]
+; CHECK-NEXT:    [[TMP7:%.*]] = sub <2 x i32> [[V0]], [[V1]]
+; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <2 x i32> [[TMP6]], <2 x i32> [[TMP7]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    [[TMP9:%.*]] = add <2 x i32> [[TMP8]], [[TMP5]]
+; CHECK-NEXT:    [[TMP3_3:%.*]] = shufflevector <2 x i32> [[TMP9]], <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+; CHECK-NEXT:    ret <4 x i32> [[TMP3_3]]
+;
+  %v0.0 = extractelement <2 x i32> %v0, i32 0
+  %v0.1 = extractelement <2 x i32> %v0, i32 1
+  %v1.0 = extractelement <2 x i32> %v1, i32 0
+  %v1.1 = extractelement <2 x i32> %v1, i32 1
+  %tmp0.0 = add i32 %v0.0, %v1.0
+  %tmp0.1 = add i32 %v0.1, %v1.1
+  %tmp1.0 = sub i32 %v0.0, %v1.0
+  %tmp1.1 = sub i32 %v0.1, %v1.1
+  %tmp2.0 = add i32 %tmp0.0, %tmp0.1
+  %tmp2.1 = add i32 %tmp1.0, %tmp1.1
+  %tmp3.0 = insertelement <4 x i32> poison, i32 %tmp2.0, i32 0
+  %tmp3.1 = insertelement <4 x i32> %tmp3.0, i32 %tmp2.1, i32 1
+  %tmp3.2 = insertelement <4 x i32> %tmp3.1, i32 %tmp2.0, i32 2
+  %tmp3.3 = insertelement <4 x i32> %tmp3.2, i32 %tmp2.1, i32 3
+  ret <4 x i32> %tmp3.3
+}
+
+define <4 x i32> @build_vec_v4i32_reuse_1(<2 x i32> %v0, <2 x i32> %v1) {
+; CHECK-LABEL: @build_vec_v4i32_reuse_1(
+; CHECK-NEXT:    [[V0_0:%.*]] = extractelement <2 x i32> [[V0:%.*]], i32 0
+; CHECK-NEXT:    [[V0_1:%.*]] = extractelement <2 x i32> [[V0]], i32 1
+; CHECK-NEXT:    [[V1_0:%.*]] = extractelement <2 x i32> [[V1:%.*]], i32 0
+; CHECK-NEXT:    [[V1_1:%.*]] = extractelement <2 x i32> [[V1]], i32 1
+; CHECK-NEXT:    [[TMP0_0:%.*]] = add i32 [[V0_0]], [[V1_0]]
+; CHECK-NEXT:    [[TMP0_1:%.*]] = add i32 [[V0_1]], [[V1_1]]
+; CHECK-NEXT:    [[TMP0_2:%.*]] = xor i32 [[V0_0]], [[V1_0]]
+; CHECK-NEXT:    [[TMP0_3:%.*]] = xor i32 [[V0_1]], [[V1_1]]
+; CHECK-NEXT:    [[TMP1_0:%.*]] = sub i32 [[TMP0_0]], [[TMP0_1]]
+; CHECK-NEXT:    [[TMP1_1:%.*]] = sub i32 [[TMP0_0]], [[TMP0_1]]
+; CHECK-NEXT:    [[TMP1_2:%.*]] = sub i32 [[TMP0_2]], [[TMP0_3]]
+; CHECK-NEXT:    [[TMP1_3:%.*]] = sub i32 [[TMP0_3]], [[TMP0_2]]
+; CHECK-NEXT:    [[TMP2_0:%.*]] = insertelement <4 x i32> poison, i32 [[TMP1_0]], i32 0
+; CHECK-NEXT:    [[TMP2_1:%.*]] = insertelement <4 x i32> [[TMP2_0]], i32 [[TMP1_1]], i32 1
+; CHECK-NEXT:    [[TMP2_2:%.*]] = insertelement <4 x i32> [[TMP2_1]], i32 [[TMP1_2]], i32 2
+; CHECK-NEXT:    [[TMP2_3:%.*]] = insertelement <4 x i32> [[TMP2_2]], i32 [[TMP1_3]], i32 3
+; CHECK-NEXT:    ret <4 x i32> [[TMP2_3]]
+;
+  %v0.0 = extractelement <2 x i32> %v0, i32 0
+  %v0.1 = extractelement <2 x i32> %v0, i32 1
+  %v1.0 = extractelement <2 x i32> %v1, i32 0
+  %v1.1 = extractelement <2 x i32> %v1, i32 1
+  %tmp0.0 = add i32 %v0.0, %v1.0
+  %tmp0.1 = add i32 %v0.1, %v1.1
+  %tmp0.2 = xor i32 %v0.0, %v1.0
+  %tmp0.3 = xor i32 %v0.1, %v1.1
+  %tmp1.0 = sub i32 %tmp0.0, %tmp0.1
+  %tmp1.1 = sub i32 %tmp0.0, %tmp0.1
+  %tmp1.2 = sub i32 %tmp0.2, %tmp0.3
+  %tmp1.3 = sub i32 %tmp0.3, %tmp0.2
+  %tmp2.0 = insertelement <4 x i32> poison, i32 %tmp1.0, i32 0
+  %tmp2.1 = insertelement <4 x i32> %tmp2.0, i32 %tmp1.1, i32 1
+  %tmp2.2 = insertelement <4 x i32> %tmp2.1, i32 %tmp1.2, i32 2
+  %tmp2.3 = insertelement <4 x i32> %tmp2.2, i32 %tmp1.3, i32 3
+  ret <4 x i32> %tmp2.3
+}
+
+define <4 x i32> @build_vec_v4i32_3_binops(<2 x i32> %v0, <2 x i32> %v1) {
+; CHECK-LABEL: @build_vec_v4i32_3_binops(
+; CHECK-NEXT:    [[V0_0:%.*]] = extractelement <2 x i32> [[V0:%.*]], i32 0
+; CHECK-NEXT:    [[V0_1:%.*]] = extractelement <2 x i32> [[V0]], i32 1
+; CHECK-NEXT:    [[V1_0:%.*]] = extractelement <2 x i32> [[V1:%.*]], i32 0
+; CHECK-NEXT:    [[V1_1:%.*]] = extractelement <2 x i32> [[V1]], i32 1
+; CHECK-NEXT:    [[TMP0_0:%.*]] = add i32 [[V0_0]], [[V1_0]]
+; CHECK-NEXT:    [[TMP0_1:%.*]] = add i32 [[V0_1]], [[V1_1]]
+; CHECK-NEXT:    [[TMP1_0:%.*]] = mul i32 [[V0_0]], [[V1_0]]
+; CHECK-NEXT:    [[TMP1_1:%.*]] = mul i32 [[V0_1]], [[V1_1]]
+; CHECK-NEXT:    [[TMP1:%.*]] = xor <2 x i32> [[V0]], [[V1]]
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> undef, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP3:%.*]] = xor <2 x i32> [[V0]], [[V1]]
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> undef, <2 x i32> <i32 1, i32 1>
+; CHECK-NEXT:    [[TMP2_0:%.*]] = add i32 [[TMP0_0]], [[TMP0_1]]
+; CHECK-NEXT:    [[TMP2_1:%.*]] = add i32 [[TMP1_0]], [[TMP1_1]]
+; CHECK-NEXT:    [[TMP5:%.*]] = add <2 x i32> [[TMP2]], [[TMP4]]
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+; CHECK-NEXT:    [[TMP3_0:%.*]] = insertelement <4 x i32> poison, i32 [[TMP2_0]], i32 0
+; CHECK-NEXT:    [[TMP3_1:%.*]] = insertelement <4 x i32> [[TMP3_0]], i32 [[TMP2_1]], i32 1
+; CHECK-NEXT:    [[TMP3_3:%.*]] = shufflevector <4 x i32> [[TMP3_1]], <4 x i32> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; CHECK-NEXT:    ret <4 x i32> [[TMP3_3]]
+;
+  %v0.0 = extractelement <2 x i32> %v0, i32 0
+  %v0.1 = extractelement <2 x i32> %v0, i32 1
+  %v1.0 = extractelement <2 x i32> %v1, i32 0
+  %v1.1 = extractelement <2 x i32> %v1, i32 1
+  %tmp0.0 = add i32 %v0.0, %v1.0
+  %tmp0.1 = add i32 %v0.1, %v1.1
+  %tmp0.2 = xor i32 %v0.0, %v1.0
+  %tmp0.3 = xor i32 %v0.1, %v1.1
+  %tmp1.0 = mul i32 %v0.0, %v1.0
+  %tmp1.1 = mul i32 %v0.1, %v1.1
+  %tmp1.2 = xor i32 %v0.0, %v1.0
+  %tmp1.3 = xor i32 %v0.1, %v1.1
+  %tmp2.0 = add i32 %tmp0.0, %tmp0.1
+  %tmp2.1 = add i32 %tmp1.0, %tmp1.1
+  %tmp2.2 = add i32 %tmp0.2, %tmp0.3
+  %tmp2.3 = add i32 %tmp1.2, %tmp1.3
+  %tmp3.0 = insertelement <4 x i32> poison, i32 %tmp2.0, i32 0
+  %tmp3.1 = insertelement <4 x i32> %tmp3.0, i32 %tmp2.1, i32 1
+  %tmp3.2 = insertelement <4 x i32> %tmp3.1, i32 %tmp2.2, i32 2
+  %tmp3.3 = insertelement <4 x i32> %tmp3.2, i32 %tmp2.3, i32 3
+  ret <4 x i32> %tmp3.3
+}
+
+define i32 @reduction_v4i32(<4 x i32> %v0, <4 x i32> %v1) {
+; CHECK-LABEL: @reduction_v4i32(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[V0:%.*]], <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[V1:%.*]], <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+; CHECK-NEXT:    [[TMP3:%.*]] = sub <4 x i32> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = add <4 x i32> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> <i32 0, i32 5, i32 6, i32 3>
+; CHECK-NEXT:    [[TMP6:%.*]] = sub <4 x i32> [[V0]], [[V1]]
+; CHECK-NEXT:    [[TMP7:%.*]] = add <4 x i32> [[V0]], [[V1]]
+; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> [[TMP7]], <4 x i32> <i32 0, i32 5, i32 6, i32 3>
+; CHECK-NEXT:    [[TMP9:%.*]] = add <4 x i32> [[TMP8]], [[TMP5]]
+; CHECK-NEXT:    [[TMP10:%.*]] = lshr <4 x i32> [[TMP9]], <i32 15, i32 15, i32 15, i32 15>
+; CHECK-NEXT:    [[TMP11:%.*]] = and <4 x i32> [[TMP10]], <i32 65537, i32 65537, i32 65537, i32 65537>
+; CHECK-NEXT:    [[TMP12:%.*]] = mul nuw <4 x i32> [[TMP11]], <i32 65535, i32 65535, i32 65535, i32 65535>
+; CHECK-NEXT:    [[TMP13:%.*]] = add <4 x i32> [[TMP12]], [[TMP9]]
+; CHECK-NEXT:    [[TMP14:%.*]] = xor <4 x i32> [[TMP13]], [[TMP12]]
+; CHECK-NEXT:    [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP14]])
+; CHECK-NEXT:    ret i32 [[TMP15]]
+;
+  %v0.0 = extractelement <4 x i32> %v0, i32 0
+  %v0.1 = extractelement <4 x i32> %v0, i32 1
+  %v0.2 = extractelement <4 x i32> %v0, i32 2
+  %v0.3 = extractelement <4 x i32> %v0, i32 3
+  %v1.0 = extractelement <4 x i32> %v1, i32 0
+  %v1.1 = extractelement <4 x i32> %v1, i32 1
+  %v1.2 = extractelement <4 x i32> %v1, i32 2
+  %v1.3 = extractelement <4 x i32> %v1, i32 3
+  %tmp0.0 = add i32 %v0.0, %v1.0
+  %tmp0.1 = add i32 %v0.1, %v1.1
+  %tmp0.2 = add i32 %v0.2, %v1.2
+  %tmp0.3 = add i32 %v0.3, %v1.3
+  %tmp1.0 = sub i32 %v0.0, %v1.0
+  %tmp1.1 = sub i32 %v0.1, %v1.1
+  %tmp1.2 = sub i32 %v0.2, %v1.2
+  %tmp1.3 = sub i32 %v0.3, %v1.3
+  %tmp2.0 = add i32 %tmp0.0, %tmp0.1
+  %tmp2.1 = add i32 %tmp1.0, %tmp1.1
+  %tmp2.2 = add i32 %tmp0.2, %tmp0.3
+  %tmp2.3 = add i32 %tmp1.2, %tmp1.3
+  %tmp3.0 = lshr i32 %tmp2.0, 15
+  %tmp3.1 = lshr i32 %tmp2.1, 15
+  %tmp3.2 = lshr i32 %tmp2.2, 15
+  %tmp3.3 = lshr i32 %tmp2.3, 15
+  %tmp4.0 = and i32 %tmp3.0, 65537
+  %tmp4.1 = and i32 %tmp3.1, 65537
+  %tmp4.2 = and i32 %tmp3.2, 65537
+  %tmp4.3 = and i32 %tmp3.3, 65537
+  %tmp5.0 = mul nuw i32 %tmp4.0, 65535
+  %tmp5.1 = mul nuw i32 %tmp4.1, 65535
+  %tmp5.2 = mul nuw i32 %tmp4.2, 65535
+  %tmp5.3 = mul nuw i32 %tmp4.3, 65535
+  %tmp6.0 = add i32 %tmp5.0, %tmp2.0
+  %tmp6.1 = add i32 %tmp5.1, %tmp2.1
+  %tmp6.2 = add i32 %tmp5.2, %tmp2.2
+  %tmp6.3 = add i32 %tmp5.3, %tmp2.3
+  %tmp7.0 = xor i32 %tmp6.0, %tmp5.0
+  %tmp7.1 = xor i32 %tmp6.1, %tmp5.1
+  %tmp7.2 = xor i32 %tmp6.2, %tmp5.2
+  %tmp7.3 = xor i32 %tmp6.3, %tmp5.3
+  %reduce.0 = add i32 %tmp7.1, %tmp7.0
+  %reduce.1 = add i32 %reduce.0, %tmp7.2
+  %reduce.2 = add i32 %reduce.1, %tmp7.3
+  ret i32 %reduce.2
+}
--- a/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat-inseltpoison.ll
@ -0,0 +1,336 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -slp-vectorizer -instcombine %s | FileCheck -check-prefixes=GCN,GFX7 %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -slp-vectorizer -instcombine %s | FileCheck -check-prefixes=GCN,GFX8 %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -slp-vectorizer -instcombine %s | FileCheck -check-prefixes=GCN,GFX8 %s
+
+define <2 x i16> @uadd_sat_v2i16(<2 x i16> %arg0, <2 x i16> %arg1) {
+; GFX7-LABEL: @uadd_sat_v2i16(
+; GFX7-NEXT:  bb:
+; GFX7-NEXT:    [[ARG0_0:%.*]] = extractelement <2 x i16> [[ARG0:%.*]], i64 0
+; GFX7-NEXT:    [[ARG0_1:%.*]] = extractelement <2 x i16> [[ARG0]], i64 1
+; GFX7-NEXT:    [[ARG1_0:%.*]] = extractelement <2 x i16> [[ARG1:%.*]], i64 0
+; GFX7-NEXT:    [[ARG1_1:%.*]] = extractelement <2 x i16> [[ARG1]], i64 1
+; GFX7-NEXT:    [[ADD_0:%.*]] = call i16 @llvm.uadd.sat.i16(i16 [[ARG0_0]], i16 [[ARG1_0]])
+; GFX7-NEXT:    [[ADD_1:%.*]] = call i16 @llvm.uadd.sat.i16(i16 [[ARG0_1]], i16 [[ARG1_1]])
+; GFX7-NEXT:    [[INS_0:%.*]] = insertelement <2 x i16> poison, i16 [[ADD_0]], i64 0
+; GFX7-NEXT:    [[INS_1:%.*]] = insertelement <2 x i16> [[INS_0]], i16 [[ADD_1]], i64 1
+; GFX7-NEXT:    ret <2 x i16> [[INS_1]]
+;
+; GFX8-LABEL: @uadd_sat_v2i16(
+; GFX8-NEXT:  bb:
+; GFX8-NEXT:    [[TMP0:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[ARG0:%.*]], <2 x i16> [[ARG1:%.*]])
+; GFX8-NEXT:    ret <2 x i16> [[TMP0]]
+;
+bb:
+  %arg0.0 = extractelement <2 x i16> %arg0, i64 0
+  %arg0.1 = extractelement <2 x i16> %arg0, i64 1
+  %arg1.0 = extractelement <2 x i16> %arg1, i64 0
+  %arg1.1 = extractelement <2 x i16> %arg1, i64 1
+  %add.0 = call i16 @llvm.uadd.sat.i16(i16 %arg0.0, i16 %arg1.0)
+  %add.1 = call i16 @llvm.uadd.sat.i16(i16 %arg0.1, i16 %arg1.1)
+  %ins.0 = insertelement <2 x i16> poison, i16 %add.0, i64 0
+  %ins.1 = insertelement <2 x i16> %ins.0, i16 %add.1, i64 1
+  ret <2 x i16> %ins.1
+}
+
+define <2 x i16> @usub_sat_v2i16(<2 x i16> %arg0, <2 x i16> %arg1) {
+; GFX7-LABEL: @usub_sat_v2i16(
+; GFX7-NEXT:  bb:
+; GFX7-NEXT:    [[ARG0_0:%.*]] = extractelement <2 x i16> [[ARG0:%.*]], i64 0
+; GFX7-NEXT:    [[ARG0_1:%.*]] = extractelement <2 x i16> [[ARG0]], i64 1
+; GFX7-NEXT:    [[ARG1_0:%.*]] = extractelement <2 x i16> [[ARG1:%.*]], i64 0
+; GFX7-NEXT:    [[ARG1_1:%.*]] = extractelement <2 x i16> [[ARG1]], i64 1
+; GFX7-NEXT:    [[ADD_0:%.*]] = call i16 @llvm.usub.sat.i16(i16 [[ARG0_0]], i16 [[ARG1_0]])
+; GFX7-NEXT:    [[ADD_1:%.*]] = call i16 @llvm.usub.sat.i16(i16 [[ARG0_1]], i16 [[ARG1_1]])
+; GFX7-NEXT:    [[INS_0:%.*]] = insertelement <2 x i16> poison, i16 [[ADD_0]], i64 0
+; GFX7-NEXT:    [[INS_1:%.*]] = insertelement <2 x i16> [[INS_0]], i16 [[ADD_1]], i64 1
+; GFX7-NEXT:    ret <2 x i16> [[INS_1]]
+;
+; GFX8-LABEL: @usub_sat_v2i16(
+; GFX8-NEXT:  bb:
+; GFX8-NEXT:    [[TMP0:%.*]] = call <2 x i16> @llvm.usub.sat.v2i16(<2 x i16> [[ARG0:%.*]], <2 x i16> [[ARG1:%.*]])
+; GFX8-NEXT:    ret <2 x i16> [[TMP0]]
+;
+bb:
+  %arg0.0 = extractelement <2 x i16> %arg0, i64 0
+  %arg0.1 = extractelement <2 x i16> %arg0, i64 1
+  %arg1.0 = extractelement <2 x i16> %arg1, i64 0
+  %arg1.1 = extractelement <2 x i16> %arg1, i64 1
+  %add.0 = call i16 @llvm.usub.sat.i16(i16 %arg0.0, i16 %arg1.0)
+  %add.1 = call i16 @llvm.usub.sat.i16(i16 %arg0.1, i16 %arg1.1)
+  %ins.0 = insertelement <2 x i16> poison, i16 %add.0, i64 0
+  %ins.1 = insertelement <2 x i16> %ins.0, i16 %add.1, i64 1
+  ret <2 x i16> %ins.1
+}
+
+define <2 x i16> @sadd_sat_v2i16(<2 x i16> %arg0, <2 x i16> %arg1) {
+; GFX7-LABEL: @sadd_sat_v2i16(
+; GFX7-NEXT:  bb:
+; GFX7-NEXT:    [[ARG0_0:%.*]] = extractelement <2 x i16> [[ARG0:%.*]], i64 0
+; GFX7-NEXT:    [[ARG0_1:%.*]] = extractelement <2 x i16> [[ARG0]], i64 1
+; GFX7-NEXT:    [[ARG1_0:%.*]] = extractelement <2 x i16> [[ARG1:%.*]], i64 0
+; GFX7-NEXT:    [[ARG1_1:%.*]] = extractelement <2 x i16> [[ARG1]], i64 1
+; GFX7-NEXT:    [[ADD_0:%.*]] = call i16 @llvm.sadd.sat.i16(i16 [[ARG0_0]], i16 [[ARG1_0]])
+; GFX7-NEXT:    [[ADD_1:%.*]] = call i16 @llvm.sadd.sat.i16(i16 [[ARG0_1]], i16 [[ARG1_1]])
+; GFX7-NEXT:    [[INS_0:%.*]] = insertelement <2 x i16> poison, i16 [[ADD_0]], i64 0
+; GFX7-NEXT:    [[INS_1:%.*]] = insertelement <2 x i16> [[INS_0]], i16 [[ADD_1]], i64 1
+; GFX7-NEXT:    ret <2 x i16> [[INS_1]]
+;
+; GFX8-LABEL: @sadd_sat_v2i16(
+; GFX8-NEXT:  bb:
+; GFX8-NEXT:    [[TMP0:%.*]] = call <2 x i16> @llvm.sadd.sat.v2i16(<2 x i16> [[ARG0:%.*]], <2 x i16> [[ARG1:%.*]])
+; GFX8-NEXT:    ret <2 x i16> [[TMP0]]
+;
+bb:
+  %arg0.0 = extractelement <2 x i16> %arg0, i64 0
+  %arg0.1 = extractelement <2 x i16> %arg0, i64 1
+  %arg1.0 = extractelement <2 x i16> %arg1, i64 0
+  %arg1.1 = extractelement <2 x i16> %arg1, i64 1
+  %add.0 = call i16 @llvm.sadd.sat.i16(i16 %arg0.0, i16 %arg1.0)
+  %add.1 = call i16 @llvm.sadd.sat.i16(i16 %arg0.1, i16 %arg1.1)
+  %ins.0 = insertelement <2 x i16> poison, i16 %add.0, i64 0
+  %ins.1 = insertelement <2 x i16> %ins.0, i16 %add.1, i64 1
+  ret <2 x i16> %ins.1
+}
+
+define <2 x i16> @ssub_sat_v2i16(<2 x i16> %arg0, <2 x i16> %arg1) {
+; GFX7-LABEL: @ssub_sat_v2i16(
+; GFX7-NEXT:  bb:
+; GFX7-NEXT:    [[ARG0_0:%.*]] = extractelement <2 x i16> [[ARG0:%.*]], i64 0
+; GFX7-NEXT:    [[ARG0_1:%.*]] = extractelement <2 x i16> [[ARG0]], i64 1
+; GFX7-NEXT:    [[ARG1_0:%.*]] = extractelement <2 x i16> [[ARG1:%.*]], i64 0
+; GFX7-NEXT:    [[ARG1_1:%.*]] = extractelement <2 x i16> [[ARG1]], i64 1
+; GFX7-NEXT:    [[ADD_0:%.*]] = call i16 @llvm.ssub.sat.i16(i16 [[ARG0_0]], i16 [[ARG1_0]])
+; GFX7-NEXT:    [[ADD_1:%.*]] = call i16 @llvm.ssub.sat.i16(i16 [[ARG0_1]], i16 [[ARG1_1]])
+; GFX7-NEXT:    [[INS_0:%.*]] = insertelement <2 x i16> poison, i16 [[ADD_0]], i64 0
+; GFX7-NEXT:    [[INS_1:%.*]] = insertelement <2 x i16> [[INS_0]], i16 [[ADD_1]], i64 1
+; GFX7-NEXT:    ret <2 x i16> [[INS_1]]
+;
+; GFX8-LABEL: @ssub_sat_v2i16(
+; GFX8-NEXT:  bb:
+; GFX8-NEXT:    [[TMP0:%.*]] = call <2 x i16> @llvm.ssub.sat.v2i16(<2 x i16> [[ARG0:%.*]], <2 x i16> [[ARG1:%.*]])
+; GFX8-NEXT:    ret <2 x i16> [[TMP0]]
+;
+bb:
+  %arg0.0 = extractelement <2 x i16> %arg0, i64 0
+  %arg0.1 = extractelement <2 x i16> %arg0, i64 1
+  %arg1.0 = extractelement <2 x i16> %arg1, i64 0
+  %arg1.1 = extractelement <2 x i16> %arg1, i64 1
+  %add.0 = call i16 @llvm.ssub.sat.i16(i16 %arg0.0, i16 %arg1.0)
+  %add.1 = call i16 @llvm.ssub.sat.i16(i16 %arg0.1, i16 %arg1.1)
+  %ins.0 = insertelement <2 x i16> poison, i16 %add.0, i64 0
+  %ins.1 = insertelement <2 x i16> %ins.0, i16 %add.1, i64 1
+  ret <2 x i16> %ins.1
+}
+
+define <2 x i32> @uadd_sat_v2i32(<2 x i32> %arg0, <2 x i32> %arg1) {
+; GCN-LABEL: @uadd_sat_v2i32(
+; GCN-NEXT:  bb:
+; GCN-NEXT:    [[ARG0_0:%.*]] = extractelement <2 x i32> [[ARG0:%.*]], i64 0
+; GCN-NEXT:    [[ARG0_1:%.*]] = extractelement <2 x i32> [[ARG0]], i64 1
+; GCN-NEXT:    [[ARG1_0:%.*]] = extractelement <2 x i32> [[ARG1:%.*]], i64 0
+; GCN-NEXT:    [[ARG1_1:%.*]] = extractelement <2 x i32> [[ARG1]], i64 1
+; GCN-NEXT:    [[ADD_0:%.*]] = call i32 @llvm.uadd.sat.i32(i32 [[ARG0_0]], i32 [[ARG1_0]])
+; GCN-NEXT:    [[ADD_1:%.*]] = call i32 @llvm.uadd.sat.i32(i32 [[ARG0_1]], i32 [[ARG1_1]])
+; GCN-NEXT:    [[INS_0:%.*]] = insertelement <2 x i32> poison, i32 [[ADD_0]], i64 0
+; GCN-NEXT:    [[INS_1:%.*]] = insertelement <2 x i32> [[INS_0]], i32 [[ADD_1]], i64 1
+; GCN-NEXT:    ret <2 x i32> [[INS_1]]
+;
+bb:
+  %arg0.0 = extractelement <2 x i32> %arg0, i64 0
+  %arg0.1 = extractelement <2 x i32> %arg0, i64 1
+  %arg1.0 = extractelement <2 x i32> %arg1, i64 0
+  %arg1.1 = extractelement <2 x i32> %arg1, i64 1
+  %add.0 = call i32 @llvm.uadd.sat.i32(i32 %arg0.0, i32 %arg1.0)
+  %add.1 = call i32 @llvm.uadd.sat.i32(i32 %arg0.1, i32 %arg1.1)
+  %ins.0 = insertelement <2 x i32> poison, i32 %add.0, i64 0
+  %ins.1 = insertelement <2 x i32> %ins.0, i32 %add.1, i64 1
+  ret <2 x i32> %ins.1
+}
+
+define <2 x i32> @usub_sat_v2i32(<2 x i32> %arg0, <2 x i32> %arg1) {
+; GCN-LABEL: @usub_sat_v2i32(
+; GCN-NEXT:  bb:
+; GCN-NEXT:    [[ARG0_0:%.*]] = extractelement <2 x i32> [[ARG0:%.*]], i64 0
+; GCN-NEXT:    [[ARG0_1:%.*]] = extractelement <2 x i32> [[ARG0]], i64 1
+; GCN-NEXT:    [[ARG1_0:%.*]] = extractelement <2 x i32> [[ARG1:%.*]], i64 0
+; GCN-NEXT:    [[ARG1_1:%.*]] = extractelement <2 x i32> [[ARG1]], i64 1
+; GCN-NEXT:    [[ADD_0:%.*]] = call i32 @llvm.usub.sat.i32(i32 [[ARG0_0]], i32 [[ARG1_0]])
+; GCN-NEXT:    [[ADD_1:%.*]] = call i32 @llvm.usub.sat.i32(i32 [[ARG0_1]], i32 [[ARG1_1]])
+; GCN-NEXT:    [[INS_0:%.*]] = insertelement <2 x i32> poison, i32 [[ADD_0]], i64 0
+; GCN-NEXT:    [[INS_1:%.*]] = insertelement <2 x i32> [[INS_0]], i32 [[ADD_1]], i64 1
+; GCN-NEXT:    ret <2 x i32> [[INS_1]]
+;
+bb:
+  %arg0.0 = extractelement <2 x i32> %arg0, i64 0
+  %arg0.1 = extractelement <2 x i32> %arg0, i64 1
+  %arg1.0 = extractelement <2 x i32> %arg1, i64 0
+  %arg1.1 = extractelement <2 x i32> %arg1, i64 1
+  %add.0 = call i32 @llvm.usub.sat.i32(i32 %arg0.0, i32 %arg1.0)
+  %add.1 = call i32 @llvm.usub.sat.i32(i32 %arg0.1, i32 %arg1.1)
+  %ins.0 = insertelement <2 x i32> poison, i32 %add.0, i64 0
+  %ins.1 = insertelement <2 x i32> %ins.0, i32 %add.1, i64 1
+  ret <2 x i32> %ins.1
+}
+
+define <2 x i32> @sadd_sat_v2i32(<2 x i32> %arg0, <2 x i32> %arg1) {
+; GCN-LABEL: @sadd_sat_v2i32(
+; GCN-NEXT:  bb:
+; GCN-NEXT:    [[ARG0_0:%.*]] = extractelement <2 x i32> [[ARG0:%.*]], i64 0
+; GCN-NEXT:    [[ARG0_1:%.*]] = extractelement <2 x i32> [[ARG0]], i64 1
+; GCN-NEXT:    [[ARG1_0:%.*]] = extractelement <2 x i32> [[ARG1:%.*]], i64 0
+; GCN-NEXT:    [[ARG1_1:%.*]] = extractelement <2 x i32> [[ARG1]], i64 1
+; GCN-NEXT:    [[ADD_0:%.*]] = call i32 @llvm.sadd.sat.i32(i32 [[ARG0_0]], i32 [[ARG1_0]])
+; GCN-NEXT:    [[ADD_1:%.*]] = call i32 @llvm.sadd.sat.i32(i32 [[ARG0_1]], i32 [[ARG1_1]])
+; GCN-NEXT:    [[INS_0:%.*]] = insertelement <2 x i32> poison, i32 [[ADD_0]], i64 0
+; GCN-NEXT:    [[INS_1:%.*]] = insertelement <2 x i32> [[INS_0]], i32 [[ADD_1]], i64 1
+; GCN-NEXT:    ret <2 x i32> [[INS_1]]
+;
+bb:
+  %arg0.0 = extractelement <2 x i32> %arg0, i64 0
+  %arg0.1 = extractelement <2 x i32> %arg0, i64 1
+  %arg1.0 = extractelement <2 x i32> %arg1, i64 0
+  %arg1.1 = extractelement <2 x i32> %arg1, i64 1
+  %add.0 = call i32 @llvm.sadd.sat.i32(i32 %arg0.0, i32 %arg1.0)
+  %add.1 = call i32 @llvm.sadd.sat.i32(i32 %arg0.1, i32 %arg1.1)
+  %ins.0 = insertelement <2 x i32> poison, i32 %add.0, i64 0
+  %ins.1 = insertelement <2 x i32> %ins.0, i32 %add.1, i64 1
+  ret <2 x i32> %ins.1
+}
+
+define <2 x i32> @ssub_sat_v2i32(<2 x i32> %arg0, <2 x i32> %arg1) {
+; GCN-LABEL: @ssub_sat_v2i32(
+; GCN-NEXT:  bb:
+; GCN-NEXT:    [[ARG0_0:%.*]] = extractelement <2 x i32> [[ARG0:%.*]], i64 0
+; GCN-NEXT:    [[ARG0_1:%.*]] = extractelement <2 x i32> [[ARG0]], i64 1
+; GCN-NEXT:    [[ARG1_0:%.*]] = extractelement <2 x i32> [[ARG1:%.*]], i64 0
+; GCN-NEXT:    [[ARG1_1:%.*]] = extractelement <2 x i32> [[ARG1]], i64 1
+; GCN-NEXT:    [[ADD_0:%.*]] = call i32 @llvm.ssub.sat.i32(i32 [[ARG0_0]], i32 [[ARG1_0]])
+; GCN-NEXT:    [[ADD_1:%.*]] = call i32 @llvm.ssub.sat.i32(i32 [[ARG0_1]], i32 [[ARG1_1]])
+; GCN-NEXT:    [[INS_0:%.*]] = insertelement <2 x i32> poison, i32 [[ADD_0]], i64 0
+; GCN-NEXT:    [[INS_1:%.*]] = insertelement <2 x i32> [[INS_0]], i32 [[ADD_1]], i64 1
+; GCN-NEXT:    ret <2 x i32> [[INS_1]]
+;
+bb:
+  %arg0.0 = extractelement <2 x i32> %arg0, i64 0
+  %arg0.1 = extractelement <2 x i32> %arg0, i64 1
+  %arg1.0 = extractelement <2 x i32> %arg1, i64 0
+  %arg1.1 = extractelement <2 x i32> %arg1, i64 1
+  %add.0 = call i32 @llvm.ssub.sat.i32(i32 %arg0.0, i32 %arg1.0)
+  %add.1 = call i32 @llvm.ssub.sat.i32(i32 %arg0.1, i32 %arg1.1)
+  %ins.0 = insertelement <2 x i32> poison, i32 %add.0, i64 0
+  %ins.1 = insertelement <2 x i32> %ins.0, i32 %add.1, i64 1
+  ret <2 x i32> %ins.1
+}
+
+define <3 x i16> @uadd_sat_v3i16(<3 x i16> %arg0, <3 x i16> %arg1) {
+; GFX7-LABEL: @uadd_sat_v3i16(
+; GFX7-NEXT:  bb:
+; GFX7-NEXT:    [[ARG0_0:%.*]] = extractelement <3 x i16> [[ARG0:%.*]], i64 0
+; GFX7-NEXT:    [[ARG0_1:%.*]] = extractelement <3 x i16> [[ARG0]], i64 1
+; GFX7-NEXT:    [[ARG0_2:%.*]] = extractelement <3 x i16> [[ARG0]], i64 2
+; GFX7-NEXT:    [[ARG1_0:%.*]] = extractelement <3 x i16> [[ARG1:%.*]], i64 0
+; GFX7-NEXT:    [[ARG1_1:%.*]] = extractelement <3 x i16> [[ARG1]], i64 1
+; GFX7-NEXT:    [[ARG1_2:%.*]] = extractelement <3 x i16> [[ARG1]], i64 2
+; GFX7-NEXT:    [[ADD_0:%.*]] = call i16 @llvm.uadd.sat.i16(i16 [[ARG0_0]], i16 [[ARG1_0]])
+; GFX7-NEXT:    [[ADD_1:%.*]] = call i16 @llvm.uadd.sat.i16(i16 [[ARG0_1]], i16 [[ARG1_1]])
+; GFX7-NEXT:    [[ADD_2:%.*]] = call i16 @llvm.uadd.sat.i16(i16 [[ARG0_2]], i16 [[ARG1_2]])
+; GFX7-NEXT:    [[INS_0:%.*]] = insertelement <3 x i16> poison, i16 [[ADD_0]], i64 0
+; GFX7-NEXT:    [[INS_1:%.*]] = insertelement <3 x i16> [[INS_0]], i16 [[ADD_1]], i64 1
+; GFX7-NEXT:    [[INS_2:%.*]] = insertelement <3 x i16> [[INS_1]], i16 [[ADD_2]], i64 2
+; GFX7-NEXT:    ret <3 x i16> [[INS_2]]
+;
+; GFX8-LABEL: @uadd_sat_v3i16(
+; GFX8-NEXT:  bb:
+; GFX8-NEXT:    [[ARG0_2:%.*]] = extractelement <3 x i16> [[ARG0:%.*]], i64 2
+; GFX8-NEXT:    [[ARG1_2:%.*]] = extractelement <3 x i16> [[ARG1:%.*]], i64 2
+; GFX8-NEXT:    [[TMP0:%.*]] = shufflevector <3 x i16> [[ARG0]], <3 x i16> undef, <2 x i32> <i32 0, i32 1>
+; GFX8-NEXT:    [[TMP1:%.*]] = shufflevector <3 x i16> [[ARG1]], <3 x i16> undef, <2 x i32> <i32 0, i32 1>
+; GFX8-NEXT:    [[TMP2:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]])
+; GFX8-NEXT:    [[ADD_2:%.*]] = call i16 @llvm.uadd.sat.i16(i16 [[ARG0_2]], i16 [[ARG1_2]])
+; GFX8-NEXT:    [[TMP3:%.*]] = extractelement <2 x i16> [[TMP2]], i32 0
+; GFX8-NEXT:    [[INS_0:%.*]] = insertelement <3 x i16> poison, i16 [[TMP3]], i64 0
+; GFX8-NEXT:    [[TMP4:%.*]] = extractelement <2 x i16> [[TMP2]], i32 1
+; GFX8-NEXT:    [[INS_1:%.*]] = insertelement <3 x i16> [[INS_0]], i16 [[TMP4]], i64 1
+; GFX8-NEXT:    [[INS_2:%.*]] = insertelement <3 x i16> [[INS_1]], i16 [[ADD_2]], i64 2
+; GFX8-NEXT:    ret <3 x i16> [[INS_2]]
+;
+bb:
+  %arg0.0 = extractelement <3 x i16> %arg0, i64 0
+  %arg0.1 = extractelement <3 x i16> %arg0, i64 1
+  %arg0.2 = extractelement <3 x i16> %arg0, i64 2
+  %arg1.0 = extractelement <3 x i16> %arg1, i64 0
+  %arg1.1 = extractelement <3 x i16> %arg1, i64 1
+  %arg1.2 = extractelement <3 x i16> %arg1, i64 2
+  %add.0 = call i16 @llvm.uadd.sat.i16(i16 %arg0.0, i16 %arg1.0)
+  %add.1 = call i16 @llvm.uadd.sat.i16(i16 %arg0.1, i16 %arg1.1)
+  %add.2 = call i16 @llvm.uadd.sat.i16(i16 %arg0.2, i16 %arg1.2)
+  %ins.0 = insertelement <3 x i16> poison, i16 %add.0, i64 0
+  %ins.1 = insertelement <3 x i16> %ins.0, i16 %add.1, i64 1
+  %ins.2 = insertelement <3 x i16> %ins.1, i16 %add.2, i64 2
+  ret <3 x i16> %ins.2
+}
+
+define <4 x i16> @uadd_sat_v4i16(<4 x i16> %arg0, <4 x i16> %arg1) {
+; GFX7-LABEL: @uadd_sat_v4i16(
+; GFX7-NEXT:  bb:
+; GFX7-NEXT:    [[ARG0_0:%.*]] = extractelement <4 x i16> [[ARG0:%.*]], i64 0
+; GFX7-NEXT:    [[ARG0_1:%.*]] = extractelement <4 x i16> [[ARG0]], i64 1
+; GFX7-NEXT:    [[ARG0_2:%.*]] = extractelement <4 x i16> [[ARG0]], i64 2
+; GFX7-NEXT:    [[ARG0_3:%.*]] = extractelement <4 x i16> [[ARG0]], i64 3
+; GFX7-NEXT:    [[ARG1_0:%.*]] = extractelement <4 x i16> [[ARG1:%.*]], i64 0
+; GFX7-NEXT:    [[ARG1_1:%.*]] = extractelement <4 x i16> [[ARG1]], i64 1
+; GFX7-NEXT:    [[ARG1_2:%.*]] = extractelement <4 x i16> [[ARG1]], i64 2
+; GFX7-NEXT:    [[ARG1_3:%.*]] = extractelement <4 x i16> [[ARG1]], i64 3
+; GFX7-NEXT:    [[ADD_0:%.*]] = call i16 @llvm.uadd.sat.i16(i16 [[ARG0_0]], i16 [[ARG1_0]])
+; GFX7-NEXT:    [[ADD_1:%.*]] = call i16 @llvm.uadd.sat.i16(i16 [[ARG0_1]], i16 [[ARG1_1]])
+; GFX7-NEXT:    [[ADD_2:%.*]] = call i16 @llvm.uadd.sat.i16(i16 [[ARG0_2]], i16 [[ARG1_2]])
+; GFX7-NEXT:    [[ADD_3:%.*]] = call i16 @llvm.uadd.sat.i16(i16 [[ARG0_3]], i16 [[ARG1_3]])
+; GFX7-NEXT:    [[INS_0:%.*]] = insertelement <4 x i16> poison, i16 [[ADD_0]], i64 0
+; GFX7-NEXT:    [[INS_1:%.*]] = insertelement <4 x i16> [[INS_0]], i16 [[ADD_1]], i64 1
+; GFX7-NEXT:    [[INS_2:%.*]] = insertelement <4 x i16> [[INS_1]], i16 [[ADD_2]], i64 2
+; GFX7-NEXT:    [[INS_3:%.*]] = insertelement <4 x i16> [[INS_2]], i16 [[ADD_3]], i64 3
+; GFX7-NEXT:    ret <4 x i16> [[INS_3]]
+;
+; GFX8-LABEL: @uadd_sat_v4i16(
+; GFX8-NEXT:  bb:
+; GFX8-NEXT:    [[TMP0:%.*]] = shufflevector <4 x i16> [[ARG0:%.*]], <4 x i16> undef, <2 x i32> <i32 0, i32 1>
+; GFX8-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i16> [[ARG1:%.*]], <4 x i16> undef, <2 x i32> <i32 0, i32 1>
+; GFX8-NEXT:    [[TMP2:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]])
+; GFX8-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i16> [[ARG0]], <4 x i16> undef, <2 x i32> <i32 2, i32 3>
+; GFX8-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i16> [[ARG1]], <4 x i16> undef, <2 x i32> <i32 2, i32 3>
+; GFX8-NEXT:    [[TMP5:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP3]], <2 x i16> [[TMP4]])
+; GFX8-NEXT:    [[INS_3:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; GFX8-NEXT:    ret <4 x i16> [[INS_3]]
+;
+bb:
+  %arg0.0 = extractelement <4 x i16> %arg0, i64 0
+  %arg0.1 = extractelement <4 x i16> %arg0, i64 1
+  %arg0.2 = extractelement <4 x i16> %arg0, i64 2
+  %arg0.3 = extractelement <4 x i16> %arg0, i64 3
+  %arg1.0 = extractelement <4 x i16> %arg1, i64 0
+  %arg1.1 = extractelement <4 x i16> %arg1, i64 1
+  %arg1.2 = extractelement <4 x i16> %arg1, i64 2
+  %arg1.3 = extractelement <4 x i16> %arg1, i64 3
+  %add.0 = call i16 @llvm.uadd.sat.i16(i16 %arg0.0, i16 %arg1.0)
+  %add.1 = call i16 @llvm.uadd.sat.i16(i16 %arg0.1, i16 %arg1.1)
+  %add.2 = call i16 @llvm.uadd.sat.i16(i16 %arg0.2, i16 %arg1.2)
+  %add.3 = call i16 @llvm.uadd.sat.i16(i16 %arg0.3, i16 %arg1.3)
+  %ins.0 = insertelement <4 x i16> poison, i16 %add.0, i64 0
+  %ins.1 = insertelement <4 x i16> %ins.0, i16 %add.1, i64 1
+  %ins.2 = insertelement <4 x i16> %ins.1, i16 %add.2, i64 2
+  %ins.3 = insertelement <4 x i16> %ins.2, i16 %add.3, i64 3
+  ret <4 x i16> %ins.3
+}
+
+declare i16 @llvm.uadd.sat.i16(i16, i16) #0
+declare i16 @llvm.usub.sat.i16(i16, i16) #0
+declare i16 @llvm.sadd.sat.i16(i16, i16) #0
+declare i16 @llvm.ssub.sat.i16(i16, i16) #0
+
+declare i32 @llvm.uadd.sat.i32(i32, i32) #0
+declare i32 @llvm.usub.sat.i32(i32, i32) #0
+declare i32 @llvm.sadd.sat.i32(i32, i32) #0
+declare i32 @llvm.ssub.sat.i32(i32, i32) #0
+
+attributes #0 = { nounwind readnone speculatable willreturn }
--- a/llvm/test/Transforms/SLPVectorizer/AMDGPU/bswap-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AMDGPU/bswap-inseltpoison.ll
@ -0,0 +1,38 @@
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -slp-vectorizer %s | FileCheck -check-prefixes=GCN,GFX7 %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -slp-vectorizer %s | FileCheck -check-prefixes=GCN,GFX8 %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -slp-vectorizer %s | FileCheck -check-prefixes=GCN,GFX8 %s
+
+; GCN-LABEL: @bswap_v2i16(
+; GFX7: call i16 @llvm.bswap.i16(
+; GFX7: call i16 @llvm.bswap.i16(
+
+; GFX8: call <2 x i16> @llvm.bswap.v2i16(
+define <2 x i16> @bswap_v2i16(<2 x i16> %arg) {
+bb:
+  %tmp = extractelement <2 x i16> %arg, i64 0
+  %tmp1 = tail call i16 @llvm.bswap.i16(i16 %tmp)
+  %tmp2 = insertelement <2 x i16> poison, i16 %tmp1, i64 0
+  %tmp3 = extractelement <2 x i16> %arg, i64 1
+  %tmp4 = tail call i16 @llvm.bswap.i16(i16 %tmp3)
+  %tmp5 = insertelement <2 x i16> %tmp2, i16 %tmp4, i64 1
+  ret <2 x i16> %tmp5
+}
+
+; GCN-LABEL: @bswap_v2i32(
+; GCN: call i32 @llvm.bswap.i32
+; GCN: call i32 @llvm.bswap.i32
+define <2 x i32> @bswap_v2i32(<2 x i32> %arg) {
+bb:
+  %tmp = extractelement <2 x i32> %arg, i64 0
+  %tmp1 = tail call i32 @llvm.bswap.i32(i32 %tmp)
+  %tmp2 = insertelement <2 x i32> poison, i32 %tmp1, i64 0
+  %tmp3 = extractelement <2 x i32> %arg, i64 1
+  %tmp4 = tail call i32 @llvm.bswap.i32(i32 %tmp3)
+  %tmp5 = insertelement <2 x i32> %tmp2, i32 %tmp4, i64 1
+  ret <2 x i32> %tmp5
+}
+
+declare i16 @llvm.bswap.i16(i16) #0
+declare i32 @llvm.bswap.i32(i32) #0
+
+attributes #0 = { nounwind readnone speculatable willreturn }
--- a/llvm/test/Transforms/SLPVectorizer/AMDGPU/round-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AMDGPU/round-inseltpoison.ll
@ -0,0 +1,38 @@
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -slp-vectorizer %s | FileCheck -check-prefixes=GCN,GFX7 %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -slp-vectorizer %s | FileCheck -check-prefixes=GCN,GFX8 %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -slp-vectorizer %s | FileCheck -check-prefixes=GCN,GFX8 %s
+
+; GCN-LABEL: @round_v2f16(
+; GFX7: call half @llvm.round.f16(
+; GFX7: call half @llvm.round.f16(
+
+; GFX8: call <2 x half> @llvm.round.v2f16(
+define <2 x half> @round_v2f16(<2 x half> %arg) {
+bb:
+  %tmp = extractelement <2 x half> %arg, i64 0
+  %tmp1 = tail call half @llvm.round.half(half %tmp)
+  %tmp2 = insertelement <2 x half> poison, half %tmp1, i64 0
+  %tmp3 = extractelement <2 x half> %arg, i64 1
+  %tmp4 = tail call half @llvm.round.half(half %tmp3)
+  %tmp5 = insertelement <2 x half> %tmp2, half %tmp4, i64 1
+  ret <2 x half> %tmp5
+}
+
+; GCN-LABEL: @round_v2f32(
+; GCN: call float @llvm.round.f32(
+; GCN: call float @llvm.round.f32(
+define <2 x float> @round_v2f32(<2 x float> %arg) {
+bb:
+  %tmp = extractelement <2 x float> %arg, i64 0
+  %tmp1 = tail call float @llvm.round.f32(float %tmp)
+  %tmp2 = insertelement <2 x float> poison, float %tmp1, i64 0
+  %tmp3 = extractelement <2 x float> %arg, i64 1
+  %tmp4 = tail call float @llvm.round.f32(float %tmp3)
+  %tmp5 = insertelement <2 x float> %tmp2, float %tmp4, i64 1
+  ret <2 x float> %tmp5
+}
+
+declare half @llvm.round.half(half) #0
+declare float @llvm.round.f32(float) #0
+
+attributes #0 = { nounwind readnone speculatable willreturn }
--- a/llvm/test/Transforms/SLPVectorizer/ARM/extract-insert-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/ARM/extract-insert-inseltpoison.ll
@ -0,0 +1,31 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -slp-vectorizer -S -mtriple=thumb7 -mcpu=swift | FileCheck %s
+
+define <4 x i32> @PR13837(<4 x float> %in) {
+; CHECK-LABEL: @PR13837(
+; CHECK-NEXT:    [[TMP1:%.*]] = fptosi <4 x float> [[IN:%.*]] to <4 x i32>
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0
+; CHECK-NEXT:    [[V0:%.*]] = insertelement <4 x i32> poison, i32 [[TMP2]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x i32> [[TMP1]], i32 1
+; CHECK-NEXT:    [[V1:%.*]] = insertelement <4 x i32> [[V0]], i32 [[TMP3]], i32 1
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x i32> [[TMP1]], i32 2
+; CHECK-NEXT:    [[V2:%.*]] = insertelement <4 x i32> [[V1]], i32 [[TMP4]], i32 2
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3
+; CHECK-NEXT:    [[V3:%.*]] = insertelement <4 x i32> [[V2]], i32 [[TMP5]], i32 3
+; CHECK-NEXT:    ret <4 x i32> [[V3]]
+;
+  %t0 = extractelement <4 x float> %in, i64 0
+  %t1 = extractelement <4 x float> %in, i64 1
+  %t2 = extractelement <4 x float> %in, i64 2
+  %t3 = extractelement <4 x float> %in, i64 3
+  %c0 = fptosi float %t0 to i32
+  %c1 = fptosi float %t1 to i32
+  %c2 = fptosi float %t2 to i32
+  %c3 = fptosi float %t3 to i32
+  %v0 = insertelement <4 x i32> poison, i32 %c0, i32 0
+  %v1 = insertelement <4 x i32> %v0, i32 %c1, i32 1
+  %v2 = insertelement <4 x i32> %v1, i32 %c2, i32 2
+  %v3 = insertelement <4 x i32> %v2, i32 %c3, i32 3
+  ret <4 x i32> %v3
+}
+
--- a/llvm/test/Transforms/SLPVectorizer/NVPTX/non-vectorizable-intrinsic-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/NVPTX/non-vectorizable-intrinsic-inseltpoison.ll
@ -0,0 +1,57 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -slp-vectorizer -o - -S -slp-threshold=-1000 | FileCheck %s
+
+target datalayout = "e-p:32:32-i64:64-v16:16-v32:32-n16:32:64"
+target triple = "nvptx--nvidiacl"
+
+; CTLZ cannot be vectorized currently because the second argument is a scalar
+; for both the scalar and vector forms of the intrinsic. In the future it
+; should be possible to vectorize such functions.
+; Test causes an assert if LLVM tries to vectorize CTLZ.
+
+define <2 x i8> @cltz_test(<2 x i8> %x) #0 {
+; CHECK-LABEL: @cltz_test(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = extractelement <2 x i8> [[X:%.*]], i32 0
+; CHECK-NEXT:    [[CALL_I:%.*]] = call i8 @llvm.ctlz.i8(i8 [[TMP0]], i1 false)
+; CHECK-NEXT:    [[VECINIT:%.*]] = insertelement <2 x i8> poison, i8 [[CALL_I]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i8> [[X]], i32 1
+; CHECK-NEXT:    [[CALL_I4:%.*]] = call i8 @llvm.ctlz.i8(i8 [[TMP1]], i1 false)
+; CHECK-NEXT:    [[VECINIT2:%.*]] = insertelement <2 x i8> [[VECINIT]], i8 [[CALL_I4]], i32 1
+; CHECK-NEXT:    ret <2 x i8> [[VECINIT2]]
+;
+entry:
+  %0 = extractelement <2 x i8> %x, i32 0
+  %call.i = call i8 @llvm.ctlz.i8(i8 %0, i1 false)
+  %vecinit = insertelement <2 x i8> poison, i8 %call.i, i32 0
+  %1 = extractelement <2 x i8> %x, i32 1
+  %call.i4 = call i8 @llvm.ctlz.i8(i8 %1, i1 false)
+  %vecinit2 = insertelement <2 x i8> %vecinit, i8 %call.i4, i32 1
+  ret <2 x i8> %vecinit2
+}
+
+define <2 x i8> @cltz_test2(<2 x i8> %x) #1 {
+; CHECK-LABEL: @cltz_test2(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = extractelement <2 x i8> [[X:%.*]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i8> [[X]], i32 1
+; CHECK-NEXT:    [[CALL_I:%.*]] = call i8 @llvm.ctlz.i8(i8 [[TMP0]], i1 false)
+; CHECK-NEXT:    [[CALL_I4:%.*]] = call i8 @llvm.ctlz.i8(i8 [[TMP1]], i1 false)
+; CHECK-NEXT:    [[VECINIT:%.*]] = insertelement <2 x i8> poison, i8 [[CALL_I]], i32 0
+; CHECK-NEXT:    [[VECINIT2:%.*]] = insertelement <2 x i8> [[VECINIT]], i8 [[CALL_I4]], i32 1
+; CHECK-NEXT:    ret <2 x i8> [[VECINIT2]]
+;
+entry:
+  %0 = extractelement <2 x i8> %x, i32 0
+  %1 = extractelement <2 x i8> %x, i32 1
+  %call.i = call i8 @llvm.ctlz.i8(i8 %0, i1 false)
+  %call.i4 = call i8 @llvm.ctlz.i8(i8 %1, i1 false)
+  %vecinit = insertelement <2 x i8> poison, i8 %call.i, i32 0
+  %vecinit2 = insertelement <2 x i8> %vecinit, i8 %call.i4, i32 1
+  ret <2 x i8> %vecinit2
+}
+
+declare i8 @llvm.ctlz.i8(i8, i1) #3
+
+attributes #0 = { alwaysinline nounwind "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind readnone }
--- a/llvm/test/Transforms/SLPVectorizer/X86/PR35865-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/PR35865-inseltpoison.ll
@ -0,0 +1,29 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -slp-vectorizer < %s -S -o - -mtriple=x86_64-apple-macosx10.10.0 -mcpu=core2 | FileCheck %s
+
+define void @_Z10fooConvertPDv4_xS0_S0_PKS_() {
+; CHECK-LABEL: @_Z10fooConvertPDv4_xS0_S0_PKS_(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = extractelement <16 x half> undef, i32 4
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <16 x half> undef, i32 5
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x half> undef, half [[TMP0]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x half> [[TMP2]], half [[TMP1]], i32 1
+; CHECK-NEXT:    [[TMP4:%.*]] = fpext <2 x half> [[TMP3]] to <2 x float>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <2 x float> [[TMP4]] to <2 x i32>
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x i32> [[TMP5]], i32 0
+; CHECK-NEXT:    [[VECINS_I_4_I:%.*]] = insertelement <8 x i32> poison, i32 [[TMP6]], i32 4
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <2 x i32> [[TMP5]], i32 1
+; CHECK-NEXT:    [[VECINS_I_5_I:%.*]] = insertelement <8 x i32> [[VECINS_I_4_I]], i32 [[TMP7]], i32 5
+; CHECK-NEXT:    ret void
+;
+entry:
+  %0 = extractelement <16 x half> undef, i32 4
+  %conv.i.4.i = fpext half %0 to float
+  %1 = bitcast float %conv.i.4.i to i32
+  %vecins.i.4.i = insertelement <8 x i32> poison, i32 %1, i32 4
+  %2 = extractelement <16 x half> undef, i32 5
+  %conv.i.5.i = fpext half %2 to float
+  %3 = bitcast float %conv.i.5.i to i32
+  %vecins.i.5.i = insertelement <8 x i32> %vecins.i.4.i, i32 %3, i32 5
+  ret void
+}
--- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-calls-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-calls-inseltpoison.ll
@ -0,0 +1,65 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -mtriple=x86_64-unknown -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s
+
+define <8 x float> @ceil_floor(<8 x float> %a) {
+; CHECK-LABEL: @ceil_floor(
+; CHECK-NEXT:    [[A0:%.*]] = extractelement <8 x float> [[A:%.*]], i32 0
+; CHECK-NEXT:    [[A1:%.*]] = extractelement <8 x float> [[A]], i32 1
+; CHECK-NEXT:    [[A2:%.*]] = extractelement <8 x float> [[A]], i32 2
+; CHECK-NEXT:    [[A3:%.*]] = extractelement <8 x float> [[A]], i32 3
+; CHECK-NEXT:    [[A4:%.*]] = extractelement <8 x float> [[A]], i32 4
+; CHECK-NEXT:    [[A5:%.*]] = extractelement <8 x float> [[A]], i32 5
+; CHECK-NEXT:    [[A6:%.*]] = extractelement <8 x float> [[A]], i32 6
+; CHECK-NEXT:    [[A7:%.*]] = extractelement <8 x float> [[A]], i32 7
+; CHECK-NEXT:    [[AB0:%.*]] = call float @llvm.ceil.f32(float [[A0]])
+; CHECK-NEXT:    [[AB1:%.*]] = call float @llvm.floor.f32(float [[A1]])
+; CHECK-NEXT:    [[AB2:%.*]] = call float @llvm.floor.f32(float [[A2]])
+; CHECK-NEXT:    [[AB3:%.*]] = call float @llvm.ceil.f32(float [[A3]])
+; CHECK-NEXT:    [[AB4:%.*]] = call float @llvm.ceil.f32(float [[A4]])
+; CHECK-NEXT:    [[AB5:%.*]] = call float @llvm.ceil.f32(float [[A5]])
+; CHECK-NEXT:    [[AB6:%.*]] = call float @llvm.floor.f32(float [[A6]])
+; CHECK-NEXT:    [[AB7:%.*]] = call float @llvm.floor.f32(float [[A7]])
+; CHECK-NEXT:    [[R0:%.*]] = insertelement <8 x float> poison, float [[AB0]], i32 0
+; CHECK-NEXT:    [[R1:%.*]] = insertelement <8 x float> [[R0]], float [[AB1]], i32 1
+; CHECK-NEXT:    [[R2:%.*]] = insertelement <8 x float> [[R1]], float [[AB2]], i32 2
+; CHECK-NEXT:    [[R3:%.*]] = insertelement <8 x float> [[R2]], float [[AB3]], i32 3
+; CHECK-NEXT:    [[R4:%.*]] = insertelement <8 x float> [[R3]], float [[AB4]], i32 4
+; CHECK-NEXT:    [[R5:%.*]] = insertelement <8 x float> [[R4]], float [[AB5]], i32 5
+; CHECK-NEXT:    [[R6:%.*]] = insertelement <8 x float> [[R5]], float [[AB6]], i32 6
+; CHECK-NEXT:    [[R7:%.*]] = insertelement <8 x float> [[R6]], float [[AB7]], i32 7
+; CHECK-NEXT:    ret <8 x float> [[R7]]
+;
+  %a0 = extractelement <8 x float> %a, i32 0
+  %a1 = extractelement <8 x float> %a, i32 1
+  %a2 = extractelement <8 x float> %a, i32 2
+  %a3 = extractelement <8 x float> %a, i32 3
+  %a4 = extractelement <8 x float> %a, i32 4
+  %a5 = extractelement <8 x float> %a, i32 5
+  %a6 = extractelement <8 x float> %a, i32 6
+  %a7 = extractelement <8 x float> %a, i32 7
+  %ab0 = call float @llvm.ceil.f32(float %a0)
+  %ab1 = call float @llvm.floor.f32(float %a1)
+  %ab2 = call float @llvm.floor.f32(float %a2)
+  %ab3 = call float @llvm.ceil.f32(float %a3)
+  %ab4 = call float @llvm.ceil.f32(float %a4)
+  %ab5 = call float @llvm.ceil.f32(float %a5)
+  %ab6 = call float @llvm.floor.f32(float %a6)
+  %ab7 = call float @llvm.floor.f32(float %a7)
+  %r0 = insertelement <8 x float> poison, float %ab0, i32 0
+  %r1 = insertelement <8 x float>   %r0, float %ab1, i32 1
+  %r2 = insertelement <8 x float>   %r1, float %ab2, i32 2
+  %r3 = insertelement <8 x float>   %r2, float %ab3, i32 3
+  %r4 = insertelement <8 x float>   %r3, float %ab4, i32 4
+  %r5 = insertelement <8 x float>   %r4, float %ab5, i32 5
+  %r6 = insertelement <8 x float>   %r5, float %ab6, i32 6
+  %r7 = insertelement <8 x float>   %r6, float %ab7, i32 7
+  ret <8 x float> %r7
+}
+
+declare float @llvm.ceil.f32(float)
+declare float @llvm.floor.f32(float)
--- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast-inseltpoison.ll
@ -0,0 +1,466 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -mtriple=x86_64-unknown -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=SSE
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=SLM
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512
+
+define <8 x float> @sitofp_uitofp(<8 x i32> %a) {
+; SSE-LABEL: @sitofp_uitofp(
+; SSE-NEXT:    [[A0:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 0
+; SSE-NEXT:    [[A1:%.*]] = extractelement <8 x i32> [[A]], i32 1
+; SSE-NEXT:    [[A2:%.*]] = extractelement <8 x i32> [[A]], i32 2
+; SSE-NEXT:    [[A3:%.*]] = extractelement <8 x i32> [[A]], i32 3
+; SSE-NEXT:    [[A4:%.*]] = extractelement <8 x i32> [[A]], i32 4
+; SSE-NEXT:    [[A5:%.*]] = extractelement <8 x i32> [[A]], i32 5
+; SSE-NEXT:    [[A6:%.*]] = extractelement <8 x i32> [[A]], i32 6
+; SSE-NEXT:    [[A7:%.*]] = extractelement <8 x i32> [[A]], i32 7
+; SSE-NEXT:    [[AB0:%.*]] = sitofp i32 [[A0]] to float
+; SSE-NEXT:    [[AB1:%.*]] = sitofp i32 [[A1]] to float
+; SSE-NEXT:    [[AB2:%.*]] = sitofp i32 [[A2]] to float
+; SSE-NEXT:    [[AB3:%.*]] = sitofp i32 [[A3]] to float
+; SSE-NEXT:    [[AB4:%.*]] = uitofp i32 [[A4]] to float
+; SSE-NEXT:    [[AB5:%.*]] = uitofp i32 [[A5]] to float
+; SSE-NEXT:    [[AB6:%.*]] = uitofp i32 [[A6]] to float
+; SSE-NEXT:    [[AB7:%.*]] = uitofp i32 [[A7]] to float
+; SSE-NEXT:    [[R0:%.*]] = insertelement <8 x float> poison, float [[AB0]], i32 0
+; SSE-NEXT:    [[R1:%.*]] = insertelement <8 x float> [[R0]], float [[AB1]], i32 1
+; SSE-NEXT:    [[R2:%.*]] = insertelement <8 x float> [[R1]], float [[AB2]], i32 2
+; SSE-NEXT:    [[R3:%.*]] = insertelement <8 x float> [[R2]], float [[AB3]], i32 3
+; SSE-NEXT:    [[R4:%.*]] = insertelement <8 x float> [[R3]], float [[AB4]], i32 4
+; SSE-NEXT:    [[R5:%.*]] = insertelement <8 x float> [[R4]], float [[AB5]], i32 5
+; SSE-NEXT:    [[R6:%.*]] = insertelement <8 x float> [[R5]], float [[AB6]], i32 6
+; SSE-NEXT:    [[R7:%.*]] = insertelement <8 x float> [[R6]], float [[AB7]], i32 7
+; SSE-NEXT:    ret <8 x float> [[R7]]
+;
+; SLM-LABEL: @sitofp_uitofp(
+; SLM-NEXT:    [[TMP1:%.*]] = sitofp <8 x i32> [[A:%.*]] to <8 x float>
+; SLM-NEXT:    [[TMP2:%.*]] = uitofp <8 x i32> [[A]] to <8 x float>
+; SLM-NEXT:    [[R7:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; SLM-NEXT:    ret <8 x float> [[R7]]
+;
+; AVX-LABEL: @sitofp_uitofp(
+; AVX-NEXT:    [[TMP1:%.*]] = sitofp <8 x i32> [[A:%.*]] to <8 x float>
+; AVX-NEXT:    [[TMP2:%.*]] = uitofp <8 x i32> [[A]] to <8 x float>
+; AVX-NEXT:    [[R7:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; AVX-NEXT:    ret <8 x float> [[R7]]
+;
+; AVX512-LABEL: @sitofp_uitofp(
+; AVX512-NEXT:    [[TMP1:%.*]] = sitofp <8 x i32> [[A:%.*]] to <8 x float>
+; AVX512-NEXT:    [[TMP2:%.*]] = uitofp <8 x i32> [[A]] to <8 x float>
+; AVX512-NEXT:    [[R7:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; AVX512-NEXT:    ret <8 x float> [[R7]]
+;
+  %a0 = extractelement <8 x i32> %a, i32 0
+  %a1 = extractelement <8 x i32> %a, i32 1
+  %a2 = extractelement <8 x i32> %a, i32 2
+  %a3 = extractelement <8 x i32> %a, i32 3
+  %a4 = extractelement <8 x i32> %a, i32 4
+  %a5 = extractelement <8 x i32> %a, i32 5
+  %a6 = extractelement <8 x i32> %a, i32 6
+  %a7 = extractelement <8 x i32> %a, i32 7
+  %ab0 = sitofp i32 %a0 to float
+  %ab1 = sitofp i32 %a1 to float
+  %ab2 = sitofp i32 %a2 to float
+  %ab3 = sitofp i32 %a3 to float
+  %ab4 = uitofp i32 %a4 to float
+  %ab5 = uitofp i32 %a5 to float
+  %ab6 = uitofp i32 %a6 to float
+  %ab7 = uitofp i32 %a7 to float
+  %r0 = insertelement <8 x float> poison, float %ab0, i32 0
+  %r1 = insertelement <8 x float>   %r0, float %ab1, i32 1
+  %r2 = insertelement <8 x float>   %r1, float %ab2, i32 2
+  %r3 = insertelement <8 x float>   %r2, float %ab3, i32 3
+  %r4 = insertelement <8 x float>   %r3, float %ab4, i32 4
+  %r5 = insertelement <8 x float>   %r4, float %ab5, i32 5
+  %r6 = insertelement <8 x float>   %r5, float %ab6, i32 6
+  %r7 = insertelement <8 x float>   %r6, float %ab7, i32 7
+  ret <8 x float> %r7
+}
+
+define <8 x i32> @fptosi_fptoui(<8 x float> %a) {
+; SSE-LABEL: @fptosi_fptoui(
+; SSE-NEXT:    [[A0:%.*]] = extractelement <8 x float> [[A:%.*]], i32 0
+; SSE-NEXT:    [[A1:%.*]] = extractelement <8 x float> [[A]], i32 1
+; SSE-NEXT:    [[A2:%.*]] = extractelement <8 x float> [[A]], i32 2
+; SSE-NEXT:    [[A3:%.*]] = extractelement <8 x float> [[A]], i32 3
+; SSE-NEXT:    [[A4:%.*]] = extractelement <8 x float> [[A]], i32 4
+; SSE-NEXT:    [[A5:%.*]] = extractelement <8 x float> [[A]], i32 5
+; SSE-NEXT:    [[A6:%.*]] = extractelement <8 x float> [[A]], i32 6
+; SSE-NEXT:    [[A7:%.*]] = extractelement <8 x float> [[A]], i32 7
+; SSE-NEXT:    [[AB0:%.*]] = fptosi float [[A0]] to i32
+; SSE-NEXT:    [[AB1:%.*]] = fptosi float [[A1]] to i32
+; SSE-NEXT:    [[AB2:%.*]] = fptosi float [[A2]] to i32
+; SSE-NEXT:    [[AB3:%.*]] = fptosi float [[A3]] to i32
+; SSE-NEXT:    [[AB4:%.*]] = fptoui float [[A4]] to i32
+; SSE-NEXT:    [[AB5:%.*]] = fptoui float [[A5]] to i32
+; SSE-NEXT:    [[AB6:%.*]] = fptoui float [[A6]] to i32
+; SSE-NEXT:    [[AB7:%.*]] = fptoui float [[A7]] to i32
+; SSE-NEXT:    [[R0:%.*]] = insertelement <8 x i32> poison, i32 [[AB0]], i32 0
+; SSE-NEXT:    [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[AB1]], i32 1
+; SSE-NEXT:    [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[AB2]], i32 2
+; SSE-NEXT:    [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[AB3]], i32 3
+; SSE-NEXT:    [[R4:%.*]] = insertelement <8 x i32> [[R3]], i32 [[AB4]], i32 4
+; SSE-NEXT:    [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[AB5]], i32 5
+; SSE-NEXT:    [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6
+; SSE-NEXT:    [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7
+; SSE-NEXT:    ret <8 x i32> [[R7]]
+;
+; SLM-LABEL: @fptosi_fptoui(
+; SLM-NEXT:    [[A0:%.*]] = extractelement <8 x float> [[A:%.*]], i32 0
+; SLM-NEXT:    [[A1:%.*]] = extractelement <8 x float> [[A]], i32 1
+; SLM-NEXT:    [[A2:%.*]] = extractelement <8 x float> [[A]], i32 2
+; SLM-NEXT:    [[A3:%.*]] = extractelement <8 x float> [[A]], i32 3
+; SLM-NEXT:    [[A4:%.*]] = extractelement <8 x float> [[A]], i32 4
+; SLM-NEXT:    [[A5:%.*]] = extractelement <8 x float> [[A]], i32 5
+; SLM-NEXT:    [[A6:%.*]] = extractelement <8 x float> [[A]], i32 6
+; SLM-NEXT:    [[A7:%.*]] = extractelement <8 x float> [[A]], i32 7
+; SLM-NEXT:    [[AB0:%.*]] = fptosi float [[A0]] to i32
+; SLM-NEXT:    [[AB1:%.*]] = fptosi float [[A1]] to i32
+; SLM-NEXT:    [[AB2:%.*]] = fptosi float [[A2]] to i32
+; SLM-NEXT:    [[AB3:%.*]] = fptosi float [[A3]] to i32
+; SLM-NEXT:    [[AB4:%.*]] = fptoui float [[A4]] to i32
+; SLM-NEXT:    [[AB5:%.*]] = fptoui float [[A5]] to i32
+; SLM-NEXT:    [[AB6:%.*]] = fptoui float [[A6]] to i32
+; SLM-NEXT:    [[AB7:%.*]] = fptoui float [[A7]] to i32
+; SLM-NEXT:    [[R0:%.*]] = insertelement <8 x i32> poison, i32 [[AB0]], i32 0
+; SLM-NEXT:    [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[AB1]], i32 1
+; SLM-NEXT:    [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[AB2]], i32 2
+; SLM-NEXT:    [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[AB3]], i32 3
+; SLM-NEXT:    [[R4:%.*]] = insertelement <8 x i32> [[R3]], i32 [[AB4]], i32 4
+; SLM-NEXT:    [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[AB5]], i32 5
+; SLM-NEXT:    [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6
+; SLM-NEXT:    [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7
+; SLM-NEXT:    ret <8 x i32> [[R7]]
+;
+; AVX-LABEL: @fptosi_fptoui(
+; AVX-NEXT:    [[A0:%.*]] = extractelement <8 x float> [[A:%.*]], i32 0
+; AVX-NEXT:    [[A1:%.*]] = extractelement <8 x float> [[A]], i32 1
+; AVX-NEXT:    [[A2:%.*]] = extractelement <8 x float> [[A]], i32 2
+; AVX-NEXT:    [[A3:%.*]] = extractelement <8 x float> [[A]], i32 3
+; AVX-NEXT:    [[A4:%.*]] = extractelement <8 x float> [[A]], i32 4
+; AVX-NEXT:    [[A5:%.*]] = extractelement <8 x float> [[A]], i32 5
+; AVX-NEXT:    [[A6:%.*]] = extractelement <8 x float> [[A]], i32 6
+; AVX-NEXT:    [[A7:%.*]] = extractelement <8 x float> [[A]], i32 7
+; AVX-NEXT:    [[AB0:%.*]] = fptosi float [[A0]] to i32
+; AVX-NEXT:    [[AB1:%.*]] = fptosi float [[A1]] to i32
+; AVX-NEXT:    [[AB2:%.*]] = fptosi float [[A2]] to i32
+; AVX-NEXT:    [[AB3:%.*]] = fptosi float [[A3]] to i32
+; AVX-NEXT:    [[AB4:%.*]] = fptoui float [[A4]] to i32
+; AVX-NEXT:    [[AB5:%.*]] = fptoui float [[A5]] to i32
+; AVX-NEXT:    [[AB6:%.*]] = fptoui float [[A6]] to i32
+; AVX-NEXT:    [[AB7:%.*]] = fptoui float [[A7]] to i32
+; AVX-NEXT:    [[R0:%.*]] = insertelement <8 x i32> poison, i32 [[AB0]], i32 0
+; AVX-NEXT:    [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[AB1]], i32 1
+; AVX-NEXT:    [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[AB2]], i32 2
+; AVX-NEXT:    [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[AB3]], i32 3
+; AVX-NEXT:    [[R4:%.*]] = insertelement <8 x i32> [[R3]], i32 [[AB4]], i32 4
+; AVX-NEXT:    [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[AB5]], i32 5
+; AVX-NEXT:    [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6
+; AVX-NEXT:    [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7
+; AVX-NEXT:    ret <8 x i32> [[R7]]
+;
+; AVX512-LABEL: @fptosi_fptoui(
+; AVX512-NEXT:    [[TMP1:%.*]] = fptosi <8 x float> [[A:%.*]] to <8 x i32>
+; AVX512-NEXT:    [[TMP2:%.*]] = fptoui <8 x float> [[A]] to <8 x i32>
+; AVX512-NEXT:    [[R7:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; AVX512-NEXT:    ret <8 x i32> [[R7]]
+;
+  %a0 = extractelement <8 x float> %a, i32 0
+  %a1 = extractelement <8 x float> %a, i32 1
+  %a2 = extractelement <8 x float> %a, i32 2
+  %a3 = extractelement <8 x float> %a, i32 3
+  %a4 = extractelement <8 x float> %a, i32 4
+  %a5 = extractelement <8 x float> %a, i32 5
+  %a6 = extractelement <8 x float> %a, i32 6
+  %a7 = extractelement <8 x float> %a, i32 7
+  %ab0 = fptosi float %a0 to i32
+  %ab1 = fptosi float %a1 to i32
+  %ab2 = fptosi float %a2 to i32
+  %ab3 = fptosi float %a3 to i32
+  %ab4 = fptoui float %a4 to i32
+  %ab5 = fptoui float %a5 to i32
+  %ab6 = fptoui float %a6 to i32
+  %ab7 = fptoui float %a7 to i32
+  %r0 = insertelement <8 x i32> poison, i32 %ab0, i32 0
+  %r1 = insertelement <8 x i32>   %r0, i32 %ab1, i32 1
+  %r2 = insertelement <8 x i32>   %r1, i32 %ab2, i32 2
+  %r3 = insertelement <8 x i32>   %r2, i32 %ab3, i32 3
+  %r4 = insertelement <8 x i32>   %r3, i32 %ab4, i32 4
+  %r5 = insertelement <8 x i32>   %r4, i32 %ab5, i32 5
+  %r6 = insertelement <8 x i32>   %r5, i32 %ab6, i32 6
+  %r7 = insertelement <8 x i32>   %r6, i32 %ab7, i32 7
+  ret <8 x i32> %r7
+}
+
+define <8 x float> @fneg_fabs(<8 x float> %a) {
+; CHECK-LABEL: @fneg_fabs(
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x float> [[A:%.*]] to <8 x i32>
+; CHECK-NEXT:    [[TMP2:%.*]] = xor <8 x i32> [[TMP1]], <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[TMP3:%.*]] = and <8 x i32> [[TMP1]], <i32 undef, i32 undef, i32 undef, i32 undef, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> [[TMP3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i32> [[TMP4]] to <8 x float>
+; CHECK-NEXT:    ret <8 x float> [[TMP5]]
+;
+  %a0 = extractelement <8 x float> %a, i32 0
+  %a1 = extractelement <8 x float> %a, i32 1
+  %a2 = extractelement <8 x float> %a, i32 2
+  %a3 = extractelement <8 x float> %a, i32 3
+  %a4 = extractelement <8 x float> %a, i32 4
+  %a5 = extractelement <8 x float> %a, i32 5
+  %a6 = extractelement <8 x float> %a, i32 6
+  %a7 = extractelement <8 x float> %a, i32 7
+  %aa0 = bitcast float %a0 to i32
+  %aa1 = bitcast float %a1 to i32
+  %aa2 = bitcast float %a2 to i32
+  %aa3 = bitcast float %a3 to i32
+  %aa4 = bitcast float %a4 to i32
+  %aa5 = bitcast float %a5 to i32
+  %aa6 = bitcast float %a6 to i32
+  %aa7 = bitcast float %a7 to i32
+  %ab0 = xor i32 %aa0, -2147483648
+  %ab1 = xor i32 %aa1, -2147483648
+  %ab2 = xor i32 %aa2, -2147483648
+  %ab3 = xor i32 %aa3, -2147483648
+  %ab4 = and i32 %aa4, 2147483647
+  %ab5 = and i32 %aa5, 2147483647
+  %ab6 = and i32 %aa6, 2147483647
+  %ab7 = and i32 %aa7, 2147483647
+  %ac0 = bitcast i32 %ab0 to float
+  %ac1 = bitcast i32 %ab1 to float
+  %ac2 = bitcast i32 %ab2 to float
+  %ac3 = bitcast i32 %ab3 to float
+  %ac4 = bitcast i32 %ab4 to float
+  %ac5 = bitcast i32 %ab5 to float
+  %ac6 = bitcast i32 %ab6 to float
+  %ac7 = bitcast i32 %ab7 to float
+  %r0 = insertelement <8 x float> poison, float %ac0, i32 0
+  %r1 = insertelement <8 x float>   %r0, float %ac1, i32 1
+  %r2 = insertelement <8 x float>   %r1, float %ac2, i32 2
+  %r3 = insertelement <8 x float>   %r2, float %ac3, i32 3
+  %r4 = insertelement <8 x float>   %r3, float %ac4, i32 4
+  %r5 = insertelement <8 x float>   %r4, float %ac5, i32 5
+  %r6 = insertelement <8 x float>   %r5, float %ac6, i32 6
+  %r7 = insertelement <8 x float>   %r6, float %ac7, i32 7
+  ret <8 x float> %r7
+}
+
+define <8 x i32> @sext_zext(<8 x i16> %a) {
+; CHECK-LABEL: @sext_zext(
+; CHECK-NEXT:    [[TMP1:%.*]] = sext <8 x i16> [[A:%.*]] to <8 x i32>
+; CHECK-NEXT:    [[TMP2:%.*]] = zext <8 x i16> [[A]] to <8 x i32>
+; CHECK-NEXT:    [[R7:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:    ret <8 x i32> [[R7]]
+;
+  %a0 = extractelement <8 x i16> %a, i32 0
+  %a1 = extractelement <8 x i16> %a, i32 1
+  %a2 = extractelement <8 x i16> %a, i32 2
+  %a3 = extractelement <8 x i16> %a, i32 3
+  %a4 = extractelement <8 x i16> %a, i32 4
+  %a5 = extractelement <8 x i16> %a, i32 5
+  %a6 = extractelement <8 x i16> %a, i32 6
+  %a7 = extractelement <8 x i16> %a, i32 7
+  %ab0 = sext i16 %a0 to i32
+  %ab1 = sext i16 %a1 to i32
+  %ab2 = sext i16 %a2 to i32
+  %ab3 = sext i16 %a3 to i32
+  %ab4 = zext i16 %a4 to i32
+  %ab5 = zext i16 %a5 to i32
+  %ab6 = zext i16 %a6 to i32
+  %ab7 = zext i16 %a7 to i32
+  %r0 = insertelement <8 x i32> poison, i32 %ab0, i32 0
+  %r1 = insertelement <8 x i32>   %r0, i32 %ab1, i32 1
+  %r2 = insertelement <8 x i32>   %r1, i32 %ab2, i32 2
+  %r3 = insertelement <8 x i32>   %r2, i32 %ab3, i32 3
+  %r4 = insertelement <8 x i32>   %r3, i32 %ab4, i32 4
+  %r5 = insertelement <8 x i32>   %r4, i32 %ab5, i32 5
+  %r6 = insertelement <8 x i32>   %r5, i32 %ab6, i32 6
+  %r7 = insertelement <8 x i32>   %r6, i32 %ab7, i32 7
+  ret <8 x i32> %r7
+}
+
+define <8 x float> @sitofp_4i32_8i16(<4 x i32> %a, <8 x i16> %b) {
+; CHECK-LABEL: @sitofp_4i32_8i16(
+; CHECK-NEXT:    [[B0:%.*]] = extractelement <8 x i16> [[B:%.*]], i32 0
+; CHECK-NEXT:    [[B1:%.*]] = extractelement <8 x i16> [[B]], i32 1
+; CHECK-NEXT:    [[B2:%.*]] = extractelement <8 x i16> [[B]], i32 2
+; CHECK-NEXT:    [[B3:%.*]] = extractelement <8 x i16> [[B]], i32 3
+; CHECK-NEXT:    [[TMP1:%.*]] = sitofp <4 x i32> [[A:%.*]] to <4 x float>
+; CHECK-NEXT:    [[AB4:%.*]] = sitofp i16 [[B0]] to float
+; CHECK-NEXT:    [[AB5:%.*]] = sitofp i16 [[B1]] to float
+; CHECK-NEXT:    [[AB6:%.*]] = sitofp i16 [[B2]] to float
+; CHECK-NEXT:    [[AB7:%.*]] = sitofp i16 [[B3]] to float
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
+; CHECK-NEXT:    [[R0:%.*]] = insertelement <8 x float> poison, float [[TMP2]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1
+; CHECK-NEXT:    [[R1:%.*]] = insertelement <8 x float> [[R0]], float [[TMP3]], i32 1
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2
+; CHECK-NEXT:    [[R2:%.*]] = insertelement <8 x float> [[R1]], float [[TMP4]], i32 2
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
+; CHECK-NEXT:    [[R3:%.*]] = insertelement <8 x float> [[R2]], float [[TMP5]], i32 3
+; CHECK-NEXT:    [[R4:%.*]] = insertelement <8 x float> [[R3]], float [[AB4]], i32 4
+; CHECK-NEXT:    [[R5:%.*]] = insertelement <8 x float> [[R4]], float [[AB5]], i32 5
+; CHECK-NEXT:    [[R6:%.*]] = insertelement <8 x float> [[R5]], float [[AB6]], i32 6
+; CHECK-NEXT:    [[R7:%.*]] = insertelement <8 x float> [[R6]], float [[AB7]], i32 7
+; CHECK-NEXT:    ret <8 x float> [[R7]]
+;
+  %a0 = extractelement <4 x i32> %a, i32 0
+  %a1 = extractelement <4 x i32> %a, i32 1
+  %a2 = extractelement <4 x i32> %a, i32 2
+  %a3 = extractelement <4 x i32> %a, i32 3
+  %b0 = extractelement <8 x i16> %b, i32 0
+  %b1 = extractelement <8 x i16> %b, i32 1
+  %b2 = extractelement <8 x i16> %b, i32 2
+  %b3 = extractelement <8 x i16> %b, i32 3
+  %ab0 = sitofp i32 %a0 to float
+  %ab1 = sitofp i32 %a1 to float
+  %ab2 = sitofp i32 %a2 to float
+  %ab3 = sitofp i32 %a3 to float
+  %ab4 = sitofp i16 %b0 to float
+  %ab5 = sitofp i16 %b1 to float
+  %ab6 = sitofp i16 %b2 to float
+  %ab7 = sitofp i16 %b3 to float
+  %r0 = insertelement <8 x float> poison, float %ab0, i32 0
+  %r1 = insertelement <8 x float>   %r0, float %ab1, i32 1
+  %r2 = insertelement <8 x float>   %r1, float %ab2, i32 2
+  %r3 = insertelement <8 x float>   %r2, float %ab3, i32 3
+  %r4 = insertelement <8 x float>   %r3, float %ab4, i32 4
+  %r5 = insertelement <8 x float>   %r4, float %ab5, i32 5
+  %r6 = insertelement <8 x float>   %r5, float %ab6, i32 6
+  %r7 = insertelement <8 x float>   %r6, float %ab7, i32 7
+  ret <8 x float> %r7
+}
+
+; Inspired by PR38154
+define <8 x float> @sitofp_uitofp_4i32_8i16_16i8(<4 x i32> %a, <8 x i16> %b, <16 x i8> %c) {
+; SSE-LABEL: @sitofp_uitofp_4i32_8i16_16i8(
+; SSE-NEXT:    [[A0:%.*]] = extractelement <4 x i32> [[A:%.*]], i32 0
+; SSE-NEXT:    [[A1:%.*]] = extractelement <4 x i32> [[A]], i32 1
+; SSE-NEXT:    [[A2:%.*]] = extractelement <4 x i32> [[A]], i32 2
+; SSE-NEXT:    [[A3:%.*]] = extractelement <4 x i32> [[A]], i32 3
+; SSE-NEXT:    [[B0:%.*]] = extractelement <8 x i16> [[B:%.*]], i32 0
+; SSE-NEXT:    [[B1:%.*]] = extractelement <8 x i16> [[B]], i32 1
+; SSE-NEXT:    [[C0:%.*]] = extractelement <16 x i8> [[C:%.*]], i32 0
+; SSE-NEXT:    [[C1:%.*]] = extractelement <16 x i8> [[C]], i32 1
+; SSE-NEXT:    [[AB0:%.*]] = sitofp i32 [[A0]] to float
+; SSE-NEXT:    [[AB1:%.*]] = sitofp i32 [[A1]] to float
+; SSE-NEXT:    [[AB2:%.*]] = uitofp i32 [[A2]] to float
+; SSE-NEXT:    [[AB3:%.*]] = uitofp i32 [[A3]] to float
+; SSE-NEXT:    [[AB4:%.*]] = sitofp i16 [[B0]] to float
+; SSE-NEXT:    [[AB5:%.*]] = uitofp i16 [[B1]] to float
+; SSE-NEXT:    [[AB6:%.*]] = sitofp i8 [[C0]] to float
+; SSE-NEXT:    [[AB7:%.*]] = uitofp i8 [[C1]] to float
+; SSE-NEXT:    [[R0:%.*]] = insertelement <8 x float> poison, float [[AB0]], i32 0
+; SSE-NEXT:    [[R1:%.*]] = insertelement <8 x float> [[R0]], float [[AB1]], i32 1
+; SSE-NEXT:    [[R2:%.*]] = insertelement <8 x float> [[R1]], float [[AB2]], i32 2
+; SSE-NEXT:    [[R3:%.*]] = insertelement <8 x float> [[R2]], float [[AB3]], i32 3
+; SSE-NEXT:    [[R4:%.*]] = insertelement <8 x float> [[R3]], float [[AB4]], i32 4
+; SSE-NEXT:    [[R5:%.*]] = insertelement <8 x float> [[R4]], float [[AB5]], i32 5
+; SSE-NEXT:    [[R6:%.*]] = insertelement <8 x float> [[R5]], float [[AB6]], i32 6
+; SSE-NEXT:    [[R7:%.*]] = insertelement <8 x float> [[R6]], float [[AB7]], i32 7
+; SSE-NEXT:    ret <8 x float> [[R7]]
+;
+; SLM-LABEL: @sitofp_uitofp_4i32_8i16_16i8(
+; SLM-NEXT:    [[B0:%.*]] = extractelement <8 x i16> [[B:%.*]], i32 0
+; SLM-NEXT:    [[B1:%.*]] = extractelement <8 x i16> [[B]], i32 1
+; SLM-NEXT:    [[C0:%.*]] = extractelement <16 x i8> [[C:%.*]], i32 0
+; SLM-NEXT:    [[C1:%.*]] = extractelement <16 x i8> [[C]], i32 1
+; SLM-NEXT:    [[TMP1:%.*]] = sitofp <4 x i32> [[A:%.*]] to <4 x float>
+; SLM-NEXT:    [[TMP2:%.*]] = uitofp <4 x i32> [[A]] to <4 x float>
+; SLM-NEXT:    [[AB4:%.*]] = sitofp i16 [[B0]] to float
+; SLM-NEXT:    [[AB5:%.*]] = uitofp i16 [[B1]] to float
+; SLM-NEXT:    [[AB6:%.*]] = sitofp i8 [[C0]] to float
+; SLM-NEXT:    [[AB7:%.*]] = uitofp i8 [[C1]] to float
+; SLM-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
+; SLM-NEXT:    [[R0:%.*]] = insertelement <8 x float> poison, float [[TMP3]], i32 0
+; SLM-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 1
+; SLM-NEXT:    [[R1:%.*]] = insertelement <8 x float> [[R0]], float [[TMP4]], i32 1
+; SLM-NEXT:    [[TMP5:%.*]] = extractelement <4 x float> [[TMP2]], i32 2
+; SLM-NEXT:    [[R2:%.*]] = insertelement <8 x float> [[R1]], float [[TMP5]], i32 2
+; SLM-NEXT:    [[TMP6:%.*]] = extractelement <4 x float> [[TMP2]], i32 3
+; SLM-NEXT:    [[R3:%.*]] = insertelement <8 x float> [[R2]], float [[TMP6]], i32 3
+; SLM-NEXT:    [[R4:%.*]] = insertelement <8 x float> [[R3]], float [[AB4]], i32 4
+; SLM-NEXT:    [[R5:%.*]] = insertelement <8 x float> [[R4]], float [[AB5]], i32 5
+; SLM-NEXT:    [[R6:%.*]] = insertelement <8 x float> [[R5]], float [[AB6]], i32 6
+; SLM-NEXT:    [[R7:%.*]] = insertelement <8 x float> [[R6]], float [[AB7]], i32 7
+; SLM-NEXT:    ret <8 x float> [[R7]]
+;
+; AVX-LABEL: @sitofp_uitofp_4i32_8i16_16i8(
+; AVX-NEXT:    [[A0:%.*]] = extractelement <4 x i32> [[A:%.*]], i32 0
+; AVX-NEXT:    [[A1:%.*]] = extractelement <4 x i32> [[A]], i32 1
+; AVX-NEXT:    [[A2:%.*]] = extractelement <4 x i32> [[A]], i32 2
+; AVX-NEXT:    [[A3:%.*]] = extractelement <4 x i32> [[A]], i32 3
+; AVX-NEXT:    [[B0:%.*]] = extractelement <8 x i16> [[B:%.*]], i32 0
+; AVX-NEXT:    [[B1:%.*]] = extractelement <8 x i16> [[B]], i32 1
+; AVX-NEXT:    [[C0:%.*]] = extractelement <16 x i8> [[C:%.*]], i32 0
+; AVX-NEXT:    [[C1:%.*]] = extractelement <16 x i8> [[C]], i32 1
+; AVX-NEXT:    [[AB0:%.*]] = sitofp i32 [[A0]] to float
+; AVX-NEXT:    [[AB1:%.*]] = sitofp i32 [[A1]] to float
+; AVX-NEXT:    [[AB2:%.*]] = uitofp i32 [[A2]] to float
+; AVX-NEXT:    [[AB3:%.*]] = uitofp i32 [[A3]] to float
+; AVX-NEXT:    [[AB4:%.*]] = sitofp i16 [[B0]] to float
+; AVX-NEXT:    [[AB5:%.*]] = uitofp i16 [[B1]] to float
+; AVX-NEXT:    [[AB6:%.*]] = sitofp i8 [[C0]] to float
+; AVX-NEXT:    [[AB7:%.*]] = uitofp i8 [[C1]] to float
+; AVX-NEXT:    [[R0:%.*]] = insertelement <8 x float> poison, float [[AB0]], i32 0
+; AVX-NEXT:    [[R1:%.*]] = insertelement <8 x float> [[R0]], float [[AB1]], i32 1
+; AVX-NEXT:    [[R2:%.*]] = insertelement <8 x float> [[R1]], float [[AB2]], i32 2
+; AVX-NEXT:    [[R3:%.*]] = insertelement <8 x float> [[R2]], float [[AB3]], i32 3
+; AVX-NEXT:    [[R4:%.*]] = insertelement <8 x float> [[R3]], float [[AB4]], i32 4
+; AVX-NEXT:    [[R5:%.*]] = insertelement <8 x float> [[R4]], float [[AB5]], i32 5
+; AVX-NEXT:    [[R6:%.*]] = insertelement <8 x float> [[R5]], float [[AB6]], i32 6
+; AVX-NEXT:    [[R7:%.*]] = insertelement <8 x float> [[R6]], float [[AB7]], i32 7
+; AVX-NEXT:    ret <8 x float> [[R7]]
+;
+; AVX512-LABEL: @sitofp_uitofp_4i32_8i16_16i8(
+; AVX512-NEXT:    [[B0:%.*]] = extractelement <8 x i16> [[B:%.*]], i32 0
+; AVX512-NEXT:    [[B1:%.*]] = extractelement <8 x i16> [[B]], i32 1
+; AVX512-NEXT:    [[C0:%.*]] = extractelement <16 x i8> [[C:%.*]], i32 0
+; AVX512-NEXT:    [[C1:%.*]] = extractelement <16 x i8> [[C]], i32 1
+; AVX512-NEXT:    [[TMP1:%.*]] = sitofp <4 x i32> [[A:%.*]] to <4 x float>
+; AVX512-NEXT:    [[TMP2:%.*]] = uitofp <4 x i32> [[A]] to <4 x float>
+; AVX512-NEXT:    [[AB4:%.*]] = sitofp i16 [[B0]] to float
+; AVX512-NEXT:    [[AB5:%.*]] = uitofp i16 [[B1]] to float
+; AVX512-NEXT:    [[AB6:%.*]] = sitofp i8 [[C0]] to float
+; AVX512-NEXT:    [[AB7:%.*]] = uitofp i8 [[C1]] to float
+; AVX512-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
+; AVX512-NEXT:    [[R0:%.*]] = insertelement <8 x float> poison, float [[TMP3]], i32 0
+; AVX512-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 1
+; AVX512-NEXT:    [[R1:%.*]] = insertelement <8 x float> [[R0]], float [[TMP4]], i32 1
+; AVX512-NEXT:    [[TMP5:%.*]] = extractelement <4 x float> [[TMP2]], i32 2
+; AVX512-NEXT:    [[R2:%.*]] = insertelement <8 x float> [[R1]], float [[TMP5]], i32 2
+; AVX512-NEXT:    [[TMP6:%.*]] = extractelement <4 x float> [[TMP2]], i32 3
+; AVX512-NEXT:    [[R3:%.*]] = insertelement <8 x float> [[R2]], float [[TMP6]], i32 3
+; AVX512-NEXT:    [[R4:%.*]] = insertelement <8 x float> [[R3]], float [[AB4]], i32 4
+; AVX512-NEXT:    [[R5:%.*]] = insertelement <8 x float> [[R4]], float [[AB5]], i32 5
+; AVX512-NEXT:    [[R6:%.*]] = insertelement <8 x float> [[R5]], float [[AB6]], i32 6
+; AVX512-NEXT:    [[R7:%.*]] = insertelement <8 x float> [[R6]], float [[AB7]], i32 7
+; AVX512-NEXT:    ret <8 x float> [[R7]]
+;
+  %a0 = extractelement <4 x i32> %a, i32 0
+  %a1 = extractelement <4 x i32> %a, i32 1
+  %a2 = extractelement <4 x i32> %a, i32 2
+  %a3 = extractelement <4 x i32> %a, i32 3
+  %b0 = extractelement <8 x i16> %b, i32 0
+  %b1 = extractelement <8 x i16> %b, i32 1
+  %c0 = extractelement <16 x i8> %c, i32 0
+  %c1 = extractelement <16 x i8> %c, i32 1
+  %ab0 = sitofp i32 %a0 to float
+  %ab1 = sitofp i32 %a1 to float
+  %ab2 = uitofp i32 %a2 to float
+  %ab3 = uitofp i32 %a3 to float
+  %ab4 = sitofp i16 %b0 to float
+  %ab5 = uitofp i16 %b1 to float
+  %ab6 = sitofp  i8 %c0 to float
+  %ab7 = uitofp  i8 %c1 to float
+  %r0 = insertelement <8 x float> poison, float %ab0, i32 0
+  %r1 = insertelement <8 x float>   %r0, float %ab1, i32 1
+  %r2 = insertelement <8 x float>   %r1, float %ab2, i32 2
+  %r3 = insertelement <8 x float>   %r2, float %ab3, i32 3
+  %r4 = insertelement <8 x float>   %r3, float %ab4, i32 4
+  %r5 = insertelement <8 x float>   %r4, float %ab5, i32 5
+  %r6 = insertelement <8 x float>   %r5, float %ab6, i32 6
+  %r7 = insertelement <8 x float>   %r6, float %ab7, i32 7
+  ret <8 x float> %r7
+}
--- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-fp-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-fp-inseltpoison.ll
@ -0,0 +1,179 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -mtriple=x86_64-unknown -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=SSE
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=SLM
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512
+
+define <8 x float> @fadd_fsub_v8f32(<8 x float> %a, <8 x float> %b) {
+; CHECK-LABEL: @fadd_fsub_v8f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = fadd <8 x float> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT:    [[TMP2:%.*]] = fsub <8 x float> [[A]], [[B]]
+; CHECK-NEXT:    [[R7:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> <i32 0, i32 9, i32 10, i32 3, i32 4, i32 13, i32 14, i32 7>
+; CHECK-NEXT:    ret <8 x float> [[R7]]
+;
+  %a0 = extractelement <8 x float> %a, i32 0
+  %a1 = extractelement <8 x float> %a, i32 1
+  %a2 = extractelement <8 x float> %a, i32 2
+  %a3 = extractelement <8 x float> %a, i32 3
+  %a4 = extractelement <8 x float> %a, i32 4
+  %a5 = extractelement <8 x float> %a, i32 5
+  %a6 = extractelement <8 x float> %a, i32 6
+  %a7 = extractelement <8 x float> %a, i32 7
+  %b0 = extractelement <8 x float> %b, i32 0
+  %b1 = extractelement <8 x float> %b, i32 1
+  %b2 = extractelement <8 x float> %b, i32 2
+  %b3 = extractelement <8 x float> %b, i32 3
+  %b4 = extractelement <8 x float> %b, i32 4
+  %b5 = extractelement <8 x float> %b, i32 5
+  %b6 = extractelement <8 x float> %b, i32 6
+  %b7 = extractelement <8 x float> %b, i32 7
+  %ab0 = fadd float %a0, %b0
+  %ab1 = fsub float %a1, %b1
+  %ab2 = fsub float %a2, %b2
+  %ab3 = fadd float %a3, %b3
+  %ab4 = fadd float %a4, %b4
+  %ab5 = fsub float %a5, %b5
+  %ab6 = fsub float %a6, %b6
+  %ab7 = fadd float %a7, %b7
+  %r0 = insertelement <8 x float> poison, float %ab0, i32 0
+  %r1 = insertelement <8 x float>   %r0, float %ab1, i32 1
+  %r2 = insertelement <8 x float>   %r1, float %ab2, i32 2
+  %r3 = insertelement <8 x float>   %r2, float %ab3, i32 3
+  %r4 = insertelement <8 x float>   %r3, float %ab4, i32 4
+  %r5 = insertelement <8 x float>   %r4, float %ab5, i32 5
+  %r6 = insertelement <8 x float>   %r5, float %ab6, i32 6
+  %r7 = insertelement <8 x float>   %r6, float %ab7, i32 7
+  ret <8 x float> %r7
+}
+
+define <8 x float> @fmul_fdiv_v8f32(<8 x float> %a, <8 x float> %b) {
+; SSE-LABEL: @fmul_fdiv_v8f32(
+; SSE-NEXT:    [[TMP1:%.*]] = fmul <8 x float> [[A:%.*]], [[B:%.*]]
+; SSE-NEXT:    [[TMP2:%.*]] = fdiv <8 x float> [[A]], [[B]]
+; SSE-NEXT:    [[R7:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> <i32 0, i32 9, i32 10, i32 3, i32 4, i32 13, i32 14, i32 7>
+; SSE-NEXT:    ret <8 x float> [[R7]]
+;
+; SLM-LABEL: @fmul_fdiv_v8f32(
+; SLM-NEXT:    [[A0:%.*]] = extractelement <8 x float> [[A:%.*]], i32 0
+; SLM-NEXT:    [[A1:%.*]] = extractelement <8 x float> [[A]], i32 1
+; SLM-NEXT:    [[A2:%.*]] = extractelement <8 x float> [[A]], i32 2
+; SLM-NEXT:    [[A3:%.*]] = extractelement <8 x float> [[A]], i32 3
+; SLM-NEXT:    [[A4:%.*]] = extractelement <8 x float> [[A]], i32 4
+; SLM-NEXT:    [[A5:%.*]] = extractelement <8 x float> [[A]], i32 5
+; SLM-NEXT:    [[A6:%.*]] = extractelement <8 x float> [[A]], i32 6
+; SLM-NEXT:    [[A7:%.*]] = extractelement <8 x float> [[A]], i32 7
+; SLM-NEXT:    [[B0:%.*]] = extractelement <8 x float> [[B:%.*]], i32 0
+; SLM-NEXT:    [[B1:%.*]] = extractelement <8 x float> [[B]], i32 1
+; SLM-NEXT:    [[B2:%.*]] = extractelement <8 x float> [[B]], i32 2
+; SLM-NEXT:    [[B3:%.*]] = extractelement <8 x float> [[B]], i32 3
+; SLM-NEXT:    [[B4:%.*]] = extractelement <8 x float> [[B]], i32 4
+; SLM-NEXT:    [[B5:%.*]] = extractelement <8 x float> [[B]], i32 5
+; SLM-NEXT:    [[B6:%.*]] = extractelement <8 x float> [[B]], i32 6
+; SLM-NEXT:    [[B7:%.*]] = extractelement <8 x float> [[B]], i32 7
+; SLM-NEXT:    [[AB0:%.*]] = fmul float [[A0]], [[B0]]
+; SLM-NEXT:    [[AB1:%.*]] = fdiv float [[A1]], [[B1]]
+; SLM-NEXT:    [[AB2:%.*]] = fdiv float [[A2]], [[B2]]
+; SLM-NEXT:    [[AB3:%.*]] = fmul float [[A3]], [[B3]]
+; SLM-NEXT:    [[AB4:%.*]] = fmul float [[A4]], [[B4]]
+; SLM-NEXT:    [[AB5:%.*]] = fdiv float [[A5]], [[B5]]
+; SLM-NEXT:    [[AB6:%.*]] = fdiv float [[A6]], [[B6]]
+; SLM-NEXT:    [[AB7:%.*]] = fmul float [[A7]], [[B7]]
+; SLM-NEXT:    [[R0:%.*]] = insertelement <8 x float> poison, float [[AB0]], i32 0
+; SLM-NEXT:    [[R1:%.*]] = insertelement <8 x float> [[R0]], float [[AB1]], i32 1
+; SLM-NEXT:    [[R2:%.*]] = insertelement <8 x float> [[R1]], float [[AB2]], i32 2
+; SLM-NEXT:    [[R3:%.*]] = insertelement <8 x float> [[R2]], float [[AB3]], i32 3
+; SLM-NEXT:    [[R4:%.*]] = insertelement <8 x float> [[R3]], float [[AB4]], i32 4
+; SLM-NEXT:    [[R5:%.*]] = insertelement <8 x float> [[R4]], float [[AB5]], i32 5
+; SLM-NEXT:    [[R6:%.*]] = insertelement <8 x float> [[R5]], float [[AB6]], i32 6
+; SLM-NEXT:    [[R7:%.*]] = insertelement <8 x float> [[R6]], float [[AB7]], i32 7
+; SLM-NEXT:    ret <8 x float> [[R7]]
+;
+; AVX-LABEL: @fmul_fdiv_v8f32(
+; AVX-NEXT:    [[TMP1:%.*]] = fmul <8 x float> [[A:%.*]], [[B:%.*]]
+; AVX-NEXT:    [[TMP2:%.*]] = fdiv <8 x float> [[A]], [[B]]
+; AVX-NEXT:    [[R7:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> <i32 0, i32 9, i32 10, i32 3, i32 4, i32 13, i32 14, i32 7>
+; AVX-NEXT:    ret <8 x float> [[R7]]
+;
+; AVX512-LABEL: @fmul_fdiv_v8f32(
+; AVX512-NEXT:    [[TMP1:%.*]] = fmul <8 x float> [[A:%.*]], [[B:%.*]]
+; AVX512-NEXT:    [[TMP2:%.*]] = fdiv <8 x float> [[A]], [[B]]
+; AVX512-NEXT:    [[R7:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> <i32 0, i32 9, i32 10, i32 3, i32 4, i32 13, i32 14, i32 7>
+; AVX512-NEXT:    ret <8 x float> [[R7]]
+;
+  %a0 = extractelement <8 x float> %a, i32 0
+  %a1 = extractelement <8 x float> %a, i32 1
+  %a2 = extractelement <8 x float> %a, i32 2
+  %a3 = extractelement <8 x float> %a, i32 3
+  %a4 = extractelement <8 x float> %a, i32 4
+  %a5 = extractelement <8 x float> %a, i32 5
+  %a6 = extractelement <8 x float> %a, i32 6
+  %a7 = extractelement <8 x float> %a, i32 7
+  %b0 = extractelement <8 x float> %b, i32 0
+  %b1 = extractelement <8 x float> %b, i32 1
+  %b2 = extractelement <8 x float> %b, i32 2
+  %b3 = extractelement <8 x float> %b, i32 3
+  %b4 = extractelement <8 x float> %b, i32 4
+  %b5 = extractelement <8 x float> %b, i32 5
+  %b6 = extractelement <8 x float> %b, i32 6
+  %b7 = extractelement <8 x float> %b, i32 7
+  %ab0 = fmul float %a0, %b0
+  %ab1 = fdiv float %a1, %b1
+  %ab2 = fdiv float %a2, %b2
+  %ab3 = fmul float %a3, %b3
+  %ab4 = fmul float %a4, %b4
+  %ab5 = fdiv float %a5, %b5
+  %ab6 = fdiv float %a6, %b6
+  %ab7 = fmul float %a7, %b7
+  %r0 = insertelement <8 x float> poison, float %ab0, i32 0
+  %r1 = insertelement <8 x float>   %r0, float %ab1, i32 1
+  %r2 = insertelement <8 x float>   %r1, float %ab2, i32 2
+  %r3 = insertelement <8 x float>   %r2, float %ab3, i32 3
+  %r4 = insertelement <8 x float>   %r3, float %ab4, i32 4
+  %r5 = insertelement <8 x float>   %r4, float %ab5, i32 5
+  %r6 = insertelement <8 x float>   %r5, float %ab6, i32 6
+  %r7 = insertelement <8 x float>   %r6, float %ab7, i32 7
+  ret <8 x float> %r7
+}
+
+define <4 x float> @fmul_fdiv_v4f32_const(<4 x float> %a) {
+; SSE-LABEL: @fmul_fdiv_v4f32_const(
+; SSE-NEXT:    [[TMP1:%.*]] = fmul <4 x float> [[A:%.*]], <float 2.000000e+00, float 1.000000e+00, float 1.000000e+00, float 2.000000e+00>
+; SSE-NEXT:    ret <4 x float> [[TMP1]]
+;
+; SLM-LABEL: @fmul_fdiv_v4f32_const(
+; SLM-NEXT:    [[A0:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0
+; SLM-NEXT:    [[A1:%.*]] = extractelement <4 x float> [[A]], i32 1
+; SLM-NEXT:    [[A2:%.*]] = extractelement <4 x float> [[A]], i32 2
+; SLM-NEXT:    [[A3:%.*]] = extractelement <4 x float> [[A]], i32 3
+; SLM-NEXT:    [[AB0:%.*]] = fmul float [[A0]], 2.000000e+00
+; SLM-NEXT:    [[AB3:%.*]] = fmul float [[A3]], 2.000000e+00
+; SLM-NEXT:    [[R0:%.*]] = insertelement <4 x float> poison, float [[AB0]], i32 0
+; SLM-NEXT:    [[R1:%.*]] = insertelement <4 x float> [[R0]], float [[A1]], i32 1
+; SLM-NEXT:    [[R2:%.*]] = insertelement <4 x float> [[R1]], float [[A2]], i32 2
+; SLM-NEXT:    [[R3:%.*]] = insertelement <4 x float> [[R2]], float [[AB3]], i32 3
+; SLM-NEXT:    ret <4 x float> [[R3]]
+;
+; AVX-LABEL: @fmul_fdiv_v4f32_const(
+; AVX-NEXT:    [[TMP1:%.*]] = fmul <4 x float> [[A:%.*]], <float 2.000000e+00, float 1.000000e+00, float 1.000000e+00, float 2.000000e+00>
+; AVX-NEXT:    ret <4 x float> [[TMP1]]
+;
+; AVX512-LABEL: @fmul_fdiv_v4f32_const(
+; AVX512-NEXT:    [[TMP1:%.*]] = fmul <4 x float> [[A:%.*]], <float 2.000000e+00, float 1.000000e+00, float 1.000000e+00, float 2.000000e+00>
+; AVX512-NEXT:    ret <4 x float> [[TMP1]]
+;
+  %a0 = extractelement <4 x float> %a, i32 0
+  %a1 = extractelement <4 x float> %a, i32 1
+  %a2 = extractelement <4 x float> %a, i32 2
+  %a3 = extractelement <4 x float> %a, i32 3
+  %ab0 = fmul float %a0, 2.0
+  %ab1 = fmul float %a1, 1.0
+  %ab2 = fdiv float %a2, 1.0
+  %ab3 = fdiv float %a3, 0.5
+  %r0 = insertelement <4 x float> poison, float %ab0, i32 0
+  %r1 = insertelement <4 x float>   %r0, float %ab1, i32 1
+  %r2 = insertelement <4 x float>   %r1, float %ab2, i32 2
+  %r3 = insertelement <4 x float>   %r2, float %ab3, i32 3
+  ret <4 x float> %r3
+}
--- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-int-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-int-inseltpoison.ll
@ -0,0 +1,497 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -mtriple=x86_64-unknown -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefixes=CHECK,SSE
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefixes=CHECK,SSE
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX1
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX2
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX512
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX512
+
+define <8 x i32> @add_sub_v8i32(<8 x i32> %a, <8 x i32> %b) {
+; CHECK-LABEL: @add_sub_v8i32(
+; CHECK-NEXT:    [[TMP1:%.*]] = add <8 x i32> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT:    [[TMP2:%.*]] = sub <8 x i32> [[A]], [[B]]
+; CHECK-NEXT:    [[R7:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:    ret <8 x i32> [[R7]]
+;
+  %a0 = extractelement <8 x i32> %a, i32 0
+  %a1 = extractelement <8 x i32> %a, i32 1
+  %a2 = extractelement <8 x i32> %a, i32 2
+  %a3 = extractelement <8 x i32> %a, i32 3
+  %a4 = extractelement <8 x i32> %a, i32 4
+  %a5 = extractelement <8 x i32> %a, i32 5
+  %a6 = extractelement <8 x i32> %a, i32 6
+  %a7 = extractelement <8 x i32> %a, i32 7
+  %b0 = extractelement <8 x i32> %b, i32 0
+  %b1 = extractelement <8 x i32> %b, i32 1
+  %b2 = extractelement <8 x i32> %b, i32 2
+  %b3 = extractelement <8 x i32> %b, i32 3
+  %b4 = extractelement <8 x i32> %b, i32 4
+  %b5 = extractelement <8 x i32> %b, i32 5
+  %b6 = extractelement <8 x i32> %b, i32 6
+  %b7 = extractelement <8 x i32> %b, i32 7
+  %ab0 = add i32 %a0, %b0
+  %ab1 = add i32 %a1, %b1
+  %ab2 = add i32 %a2, %b2
+  %ab3 = add i32 %a3, %b3
+  %ab4 = sub i32 %a4, %b4
+  %ab5 = sub i32 %a5, %b5
+  %ab6 = sub i32 %a6, %b6
+  %ab7 = sub i32 %a7, %b7
+  %r0 = insertelement <8 x i32> poison, i32 %ab0, i32 0
+  %r1 = insertelement <8 x i32>   %r0, i32 %ab1, i32 1
+  %r2 = insertelement <8 x i32>   %r1, i32 %ab2, i32 2
+  %r3 = insertelement <8 x i32>   %r2, i32 %ab3, i32 3
+  %r4 = insertelement <8 x i32>   %r3, i32 %ab4, i32 4
+  %r5 = insertelement <8 x i32>   %r4, i32 %ab5, i32 5
+  %r6 = insertelement <8 x i32>   %r5, i32 %ab6, i32 6
+  %r7 = insertelement <8 x i32>   %r6, i32 %ab7, i32 7
+  ret <8 x i32> %r7
+}
+
+define <4 x i32> @add_and_v4i32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: @add_and_v4i32(
+; CHECK-NEXT:    [[TMP1:%.*]] = add <4 x i32> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT:    [[TMP2:%.*]] = and <4 x i32> [[A]], [[B]]
+; CHECK-NEXT:    [[R3:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 6, i32 7>
+; CHECK-NEXT:    ret <4 x i32> [[R3]]
+;
+  %a0 = extractelement <4 x i32> %a, i32 0
+  %a1 = extractelement <4 x i32> %a, i32 1
+  %a2 = extractelement <4 x i32> %a, i32 2
+  %a3 = extractelement <4 x i32> %a, i32 3
+  %b0 = extractelement <4 x i32> %b, i32 0
+  %b1 = extractelement <4 x i32> %b, i32 1
+  %b2 = extractelement <4 x i32> %b, i32 2
+  %b3 = extractelement <4 x i32> %b, i32 3
+  %ab0 = add i32 %a0, %b0
+  %ab1 = add i32 %a1, %b1
+  %ab2 = and i32 %a2, %b2
+  %ab3 = and i32 %a3, %b3
+  %r0 = insertelement <4 x i32> poison, i32 %ab0, i32 0
+  %r1 = insertelement <4 x i32>   %r0, i32 %ab1, i32 1
+  %r2 = insertelement <4 x i32>   %r1, i32 %ab2, i32 2
+  %r3 = insertelement <4 x i32>   %r2, i32 %ab3, i32 3
+  ret <4 x i32> %r3
+}
+
+define <4 x i32> @add_mul_v4i32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: @add_mul_v4i32(
+; CHECK-NEXT:    [[TMP1:%.*]] = mul <4 x i32> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT:    [[TMP2:%.*]] = add <4 x i32> [[A]], [[B]]
+; CHECK-NEXT:    [[R3:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> <i32 0, i32 5, i32 6, i32 3>
+; CHECK-NEXT:    ret <4 x i32> [[R3]]
+;
+  %a0 = extractelement <4 x i32> %a, i32 0
+  %a1 = extractelement <4 x i32> %a, i32 1
+  %a2 = extractelement <4 x i32> %a, i32 2
+  %a3 = extractelement <4 x i32> %a, i32 3
+  %b0 = extractelement <4 x i32> %b, i32 0
+  %b1 = extractelement <4 x i32> %b, i32 1
+  %b2 = extractelement <4 x i32> %b, i32 2
+  %b3 = extractelement <4 x i32> %b, i32 3
+  %ab0 = mul i32 %a0, %b0
+  %ab1 = add i32 %a1, %b1
+  %ab2 = add i32 %a2, %b2
+  %ab3 = mul i32 %a3, %b3
+  %r0 = insertelement <4 x i32> poison, i32 %ab0, i32 0
+  %r1 = insertelement <4 x i32>   %r0, i32 %ab1, i32 1
+  %r2 = insertelement <4 x i32>   %r1, i32 %ab2, i32 2
+  %r3 = insertelement <4 x i32>   %r2, i32 %ab3, i32 3
+  ret <4 x i32> %r3
+}
+
+define <8 x i32> @ashr_shl_v8i32(<8 x i32> %a, <8 x i32> %b) {
+; SSE-LABEL: @ashr_shl_v8i32(
+; SSE-NEXT:    [[TMP1:%.*]] = ashr <8 x i32> [[A:%.*]], [[B:%.*]]
+; SSE-NEXT:    [[TMP2:%.*]] = shl <8 x i32> [[A]], [[B]]
+; SSE-NEXT:    [[R7:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; SSE-NEXT:    ret <8 x i32> [[R7]]
+;
+; AVX1-LABEL: @ashr_shl_v8i32(
+; AVX1-NEXT:    [[A0:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 0
+; AVX1-NEXT:    [[A1:%.*]] = extractelement <8 x i32> [[A]], i32 1
+; AVX1-NEXT:    [[A2:%.*]] = extractelement <8 x i32> [[A]], i32 2
+; AVX1-NEXT:    [[A3:%.*]] = extractelement <8 x i32> [[A]], i32 3
+; AVX1-NEXT:    [[B0:%.*]] = extractelement <8 x i32> [[B:%.*]], i32 0
+; AVX1-NEXT:    [[B1:%.*]] = extractelement <8 x i32> [[B]], i32 1
+; AVX1-NEXT:    [[B2:%.*]] = extractelement <8 x i32> [[B]], i32 2
+; AVX1-NEXT:    [[B3:%.*]] = extractelement <8 x i32> [[B]], i32 3
+; AVX1-NEXT:    [[AB0:%.*]] = ashr i32 [[A0]], [[B0]]
+; AVX1-NEXT:    [[AB1:%.*]] = ashr i32 [[A1]], [[B1]]
+; AVX1-NEXT:    [[AB2:%.*]] = ashr i32 [[A2]], [[B2]]
+; AVX1-NEXT:    [[AB3:%.*]] = ashr i32 [[A3]], [[B3]]
+; AVX1-NEXT:    [[TMP1:%.*]] = shl <8 x i32> [[A]], [[B]]
+; AVX1-NEXT:    [[R0:%.*]] = insertelement <8 x i32> poison, i32 [[AB0]], i32 0
+; AVX1-NEXT:    [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[AB1]], i32 1
+; AVX1-NEXT:    [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[AB2]], i32 2
+; AVX1-NEXT:    [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[AB3]], i32 3
+; AVX1-NEXT:    [[R7:%.*]] = shufflevector <8 x i32> [[R3]], <8 x i32> [[TMP1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; AVX1-NEXT:    ret <8 x i32> [[R7]]
+;
+; AVX2-LABEL: @ashr_shl_v8i32(
+; AVX2-NEXT:    [[TMP1:%.*]] = ashr <8 x i32> [[A:%.*]], [[B:%.*]]
+; AVX2-NEXT:    [[TMP2:%.*]] = shl <8 x i32> [[A]], [[B]]
+; AVX2-NEXT:    [[R7:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; AVX2-NEXT:    ret <8 x i32> [[R7]]
+;
+; AVX512-LABEL: @ashr_shl_v8i32(
+; AVX512-NEXT:    [[TMP1:%.*]] = ashr <8 x i32> [[A:%.*]], [[B:%.*]]
+; AVX512-NEXT:    [[TMP2:%.*]] = shl <8 x i32> [[A]], [[B]]
+; AVX512-NEXT:    [[R7:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; AVX512-NEXT:    ret <8 x i32> [[R7]]
+;
+  %a0 = extractelement <8 x i32> %a, i32 0
+  %a1 = extractelement <8 x i32> %a, i32 1
+  %a2 = extractelement <8 x i32> %a, i32 2
+  %a3 = extractelement <8 x i32> %a, i32 3
+  %a4 = extractelement <8 x i32> %a, i32 4
+  %a5 = extractelement <8 x i32> %a, i32 5
+  %a6 = extractelement <8 x i32> %a, i32 6
+  %a7 = extractelement <8 x i32> %a, i32 7
+  %b0 = extractelement <8 x i32> %b, i32 0
+  %b1 = extractelement <8 x i32> %b, i32 1
+  %b2 = extractelement <8 x i32> %b, i32 2
+  %b3 = extractelement <8 x i32> %b, i32 3
+  %b4 = extractelement <8 x i32> %b, i32 4
+  %b5 = extractelement <8 x i32> %b, i32 5
+  %b6 = extractelement <8 x i32> %b, i32 6
+  %b7 = extractelement <8 x i32> %b, i32 7
+  %ab0 = ashr i32 %a0, %b0
+  %ab1 = ashr i32 %a1, %b1
+  %ab2 = ashr i32 %a2, %b2
+  %ab3 = ashr i32 %a3, %b3
+  %ab4 = shl  i32 %a4, %b4
+  %ab5 = shl  i32 %a5, %b5
+  %ab6 = shl  i32 %a6, %b6
+  %ab7 = shl  i32 %a7, %b7
+  %r0 = insertelement <8 x i32> poison, i32 %ab0, i32 0
+  %r1 = insertelement <8 x i32>   %r0, i32 %ab1, i32 1
+  %r2 = insertelement <8 x i32>   %r1, i32 %ab2, i32 2
+  %r3 = insertelement <8 x i32>   %r2, i32 %ab3, i32 3
+  %r4 = insertelement <8 x i32>   %r3, i32 %ab4, i32 4
+  %r5 = insertelement <8 x i32>   %r4, i32 %ab5, i32 5
+  %r6 = insertelement <8 x i32>   %r5, i32 %ab6, i32 6
+  %r7 = insertelement <8 x i32>   %r6, i32 %ab7, i32 7
+  ret <8 x i32> %r7
+}
+
+define <8 x i32> @ashr_shl_v8i32_const(<8 x i32> %a) {
+; SSE-LABEL: @ashr_shl_v8i32_const(
+; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SSE-NEXT:    [[TMP2:%.*]] = ashr <4 x i32> [[TMP1]], <i32 2, i32 2, i32 2, i32 2>
+; SSE-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT:    [[TMP4:%.*]] = shl <4 x i32> [[TMP3]], <i32 3, i32 3, i32 3, i32 3>
+; SSE-NEXT:    [[R7:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT:    ret <8 x i32> [[R7]]
+;
+; AVX1-LABEL: @ashr_shl_v8i32_const(
+; AVX1-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX1-NEXT:    [[TMP2:%.*]] = ashr <4 x i32> [[TMP1]], <i32 2, i32 2, i32 2, i32 2>
+; AVX1-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; AVX1-NEXT:    [[TMP4:%.*]] = shl <4 x i32> [[TMP3]], <i32 3, i32 3, i32 3, i32 3>
+; AVX1-NEXT:    [[R7:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX1-NEXT:    ret <8 x i32> [[R7]]
+;
+; AVX2-LABEL: @ashr_shl_v8i32_const(
+; AVX2-NEXT:    [[TMP1:%.*]] = ashr <8 x i32> [[A:%.*]], <i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3>
+; AVX2-NEXT:    [[TMP2:%.*]] = shl <8 x i32> [[A]], <i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3>
+; AVX2-NEXT:    [[R7:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; AVX2-NEXT:    ret <8 x i32> [[R7]]
+;
+; AVX512-LABEL: @ashr_shl_v8i32_const(
+; AVX512-NEXT:    [[TMP1:%.*]] = ashr <8 x i32> [[A:%.*]], <i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3>
+; AVX512-NEXT:    [[TMP2:%.*]] = shl <8 x i32> [[A]], <i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3>
+; AVX512-NEXT:    [[R7:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; AVX512-NEXT:    ret <8 x i32> [[R7]]
+;
+  %a0 = extractelement <8 x i32> %a, i32 0
+  %a1 = extractelement <8 x i32> %a, i32 1
+  %a2 = extractelement <8 x i32> %a, i32 2
+  %a3 = extractelement <8 x i32> %a, i32 3
+  %a4 = extractelement <8 x i32> %a, i32 4
+  %a5 = extractelement <8 x i32> %a, i32 5
+  %a6 = extractelement <8 x i32> %a, i32 6
+  %a7 = extractelement <8 x i32> %a, i32 7
+  %ab0 = ashr i32 %a0, 2
+  %ab1 = ashr i32 %a1, 2
+  %ab2 = ashr i32 %a2, 2
+  %ab3 = ashr i32 %a3, 2
+  %ab4 = shl  i32 %a4, 3
+  %ab5 = shl  i32 %a5, 3
+  %ab6 = shl  i32 %a6, 3
+  %ab7 = shl  i32 %a7, 3
+  %r0 = insertelement <8 x i32> poison, i32 %ab0, i32 0
+  %r1 = insertelement <8 x i32>   %r0, i32 %ab1, i32 1
+  %r2 = insertelement <8 x i32>   %r1, i32 %ab2, i32 2
+  %r3 = insertelement <8 x i32>   %r2, i32 %ab3, i32 3
+  %r4 = insertelement <8 x i32>   %r3, i32 %ab4, i32 4
+  %r5 = insertelement <8 x i32>   %r4, i32 %ab5, i32 5
+  %r6 = insertelement <8 x i32>   %r5, i32 %ab6, i32 6
+  %r7 = insertelement <8 x i32>   %r6, i32 %ab7, i32 7
+  ret <8 x i32> %r7
+}
+
+define <8 x i32> @ashr_lshr_shl_v8i32(<8 x i32> %a, <8 x i32> %b) {
+; SSE-LABEL: @ashr_lshr_shl_v8i32(
+; SSE-NEXT:    [[A0:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 0
+; SSE-NEXT:    [[A1:%.*]] = extractelement <8 x i32> [[A]], i32 1
+; SSE-NEXT:    [[A6:%.*]] = extractelement <8 x i32> [[A]], i32 6
+; SSE-NEXT:    [[A7:%.*]] = extractelement <8 x i32> [[A]], i32 7
+; SSE-NEXT:    [[B0:%.*]] = extractelement <8 x i32> [[B:%.*]], i32 0
+; SSE-NEXT:    [[B1:%.*]] = extractelement <8 x i32> [[B]], i32 1
+; SSE-NEXT:    [[B6:%.*]] = extractelement <8 x i32> [[B]], i32 6
+; SSE-NEXT:    [[B7:%.*]] = extractelement <8 x i32> [[B]], i32 7
+; SSE-NEXT:    [[AB0:%.*]] = ashr i32 [[A0]], [[B0]]
+; SSE-NEXT:    [[AB1:%.*]] = ashr i32 [[A1]], [[B1]]
+; SSE-NEXT:    [[TMP1:%.*]] = lshr <8 x i32> [[A]], [[B]]
+; SSE-NEXT:    [[AB6:%.*]] = shl i32 [[A6]], [[B6]]
+; SSE-NEXT:    [[AB7:%.*]] = shl i32 [[A7]], [[B7]]
+; SSE-NEXT:    [[R0:%.*]] = insertelement <8 x i32> poison, i32 [[AB0]], i32 0
+; SSE-NEXT:    [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[AB1]], i32 1
+; SSE-NEXT:    [[TMP2:%.*]] = extractelement <8 x i32> [[TMP1]], i32 2
+; SSE-NEXT:    [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[TMP2]], i32 2
+; SSE-NEXT:    [[TMP3:%.*]] = extractelement <8 x i32> [[TMP1]], i32 3
+; SSE-NEXT:    [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[TMP3]], i32 3
+; SSE-NEXT:    [[TMP4:%.*]] = extractelement <8 x i32> [[TMP1]], i32 4
+; SSE-NEXT:    [[R4:%.*]] = insertelement <8 x i32> [[R3]], i32 [[TMP4]], i32 4
+; SSE-NEXT:    [[TMP5:%.*]] = extractelement <8 x i32> [[TMP1]], i32 5
+; SSE-NEXT:    [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[TMP5]], i32 5
+; SSE-NEXT:    [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6
+; SSE-NEXT:    [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7
+; SSE-NEXT:    ret <8 x i32> [[R7]]
+;
+; AVX1-LABEL: @ashr_lshr_shl_v8i32(
+; AVX1-NEXT:    [[A0:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 0
+; AVX1-NEXT:    [[A1:%.*]] = extractelement <8 x i32> [[A]], i32 1
+; AVX1-NEXT:    [[A6:%.*]] = extractelement <8 x i32> [[A]], i32 6
+; AVX1-NEXT:    [[A7:%.*]] = extractelement <8 x i32> [[A]], i32 7
+; AVX1-NEXT:    [[B0:%.*]] = extractelement <8 x i32> [[B:%.*]], i32 0
+; AVX1-NEXT:    [[B1:%.*]] = extractelement <8 x i32> [[B]], i32 1
+; AVX1-NEXT:    [[B6:%.*]] = extractelement <8 x i32> [[B]], i32 6
+; AVX1-NEXT:    [[B7:%.*]] = extractelement <8 x i32> [[B]], i32 7
+; AVX1-NEXT:    [[AB0:%.*]] = ashr i32 [[A0]], [[B0]]
+; AVX1-NEXT:    [[AB1:%.*]] = ashr i32 [[A1]], [[B1]]
+; AVX1-NEXT:    [[TMP1:%.*]] = lshr <8 x i32> [[A]], [[B]]
+; AVX1-NEXT:    [[AB6:%.*]] = shl i32 [[A6]], [[B6]]
+; AVX1-NEXT:    [[AB7:%.*]] = shl i32 [[A7]], [[B7]]
+; AVX1-NEXT:    [[R0:%.*]] = insertelement <8 x i32> poison, i32 [[AB0]], i32 0
+; AVX1-NEXT:    [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[AB1]], i32 1
+; AVX1-NEXT:    [[TMP2:%.*]] = extractelement <8 x i32> [[TMP1]], i32 2
+; AVX1-NEXT:    [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[TMP2]], i32 2
+; AVX1-NEXT:    [[TMP3:%.*]] = extractelement <8 x i32> [[TMP1]], i32 3
+; AVX1-NEXT:    [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[TMP3]], i32 3
+; AVX1-NEXT:    [[TMP4:%.*]] = extractelement <8 x i32> [[TMP1]], i32 4
+; AVX1-NEXT:    [[R4:%.*]] = insertelement <8 x i32> [[R3]], i32 [[TMP4]], i32 4
+; AVX1-NEXT:    [[TMP5:%.*]] = extractelement <8 x i32> [[TMP1]], i32 5
+; AVX1-NEXT:    [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[TMP5]], i32 5
+; AVX1-NEXT:    [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6
+; AVX1-NEXT:    [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7
+; AVX1-NEXT:    ret <8 x i32> [[R7]]
+;
+; AVX2-LABEL: @ashr_lshr_shl_v8i32(
+; AVX2-NEXT:    [[A6:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 6
+; AVX2-NEXT:    [[A7:%.*]] = extractelement <8 x i32> [[A]], i32 7
+; AVX2-NEXT:    [[B6:%.*]] = extractelement <8 x i32> [[B:%.*]], i32 6
+; AVX2-NEXT:    [[B7:%.*]] = extractelement <8 x i32> [[B]], i32 7
+; AVX2-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX2-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX2-NEXT:    [[TMP3:%.*]] = ashr <4 x i32> [[TMP1]], [[TMP2]]
+; AVX2-NEXT:    [[TMP4:%.*]] = lshr <4 x i32> [[TMP1]], [[TMP2]]
+; AVX2-NEXT:    [[TMP5:%.*]] = lshr <8 x i32> [[A]], [[B]]
+; AVX2-NEXT:    [[AB6:%.*]] = shl i32 [[A6]], [[B6]]
+; AVX2-NEXT:    [[AB7:%.*]] = shl i32 [[A7]], [[B7]]
+; AVX2-NEXT:    [[TMP6:%.*]] = extractelement <4 x i32> [[TMP3]], i32 0
+; AVX2-NEXT:    [[R0:%.*]] = insertelement <8 x i32> poison, i32 [[TMP6]], i32 0
+; AVX2-NEXT:    [[TMP7:%.*]] = extractelement <4 x i32> [[TMP3]], i32 1
+; AVX2-NEXT:    [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[TMP7]], i32 1
+; AVX2-NEXT:    [[TMP8:%.*]] = extractelement <4 x i32> [[TMP4]], i32 2
+; AVX2-NEXT:    [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[TMP8]], i32 2
+; AVX2-NEXT:    [[TMP9:%.*]] = extractelement <4 x i32> [[TMP4]], i32 3
+; AVX2-NEXT:    [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[TMP9]], i32 3
+; AVX2-NEXT:    [[TMP10:%.*]] = extractelement <8 x i32> [[TMP5]], i32 4
+; AVX2-NEXT:    [[R4:%.*]] = insertelement <8 x i32> [[R3]], i32 [[TMP10]], i32 4
+; AVX2-NEXT:    [[TMP11:%.*]] = extractelement <8 x i32> [[TMP5]], i32 5
+; AVX2-NEXT:    [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[TMP11]], i32 5
+; AVX2-NEXT:    [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6
+; AVX2-NEXT:    [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7
+; AVX2-NEXT:    ret <8 x i32> [[R7]]
+;
+; AVX512-LABEL: @ashr_lshr_shl_v8i32(
+; AVX512-NEXT:    [[A6:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 6
+; AVX512-NEXT:    [[A7:%.*]] = extractelement <8 x i32> [[A]], i32 7
+; AVX512-NEXT:    [[B6:%.*]] = extractelement <8 x i32> [[B:%.*]], i32 6
+; AVX512-NEXT:    [[B7:%.*]] = extractelement <8 x i32> [[B]], i32 7
+; AVX512-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX512-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX512-NEXT:    [[TMP3:%.*]] = ashr <4 x i32> [[TMP1]], [[TMP2]]
+; AVX512-NEXT:    [[TMP4:%.*]] = lshr <4 x i32> [[TMP1]], [[TMP2]]
+; AVX512-NEXT:    [[TMP5:%.*]] = lshr <8 x i32> [[A]], [[B]]
+; AVX512-NEXT:    [[AB6:%.*]] = shl i32 [[A6]], [[B6]]
+; AVX512-NEXT:    [[AB7:%.*]] = shl i32 [[A7]], [[B7]]
+; AVX512-NEXT:    [[TMP6:%.*]] = extractelement <4 x i32> [[TMP3]], i32 0
+; AVX512-NEXT:    [[R0:%.*]] = insertelement <8 x i32> poison, i32 [[TMP6]], i32 0
+; AVX512-NEXT:    [[TMP7:%.*]] = extractelement <4 x i32> [[TMP3]], i32 1
+; AVX512-NEXT:    [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[TMP7]], i32 1
+; AVX512-NEXT:    [[TMP8:%.*]] = extractelement <4 x i32> [[TMP4]], i32 2
+; AVX512-NEXT:    [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[TMP8]], i32 2
+; AVX512-NEXT:    [[TMP9:%.*]] = extractelement <4 x i32> [[TMP4]], i32 3
+; AVX512-NEXT:    [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[TMP9]], i32 3
+; AVX512-NEXT:    [[TMP10:%.*]] = extractelement <8 x i32> [[TMP5]], i32 4
+; AVX512-NEXT:    [[R4:%.*]] = insertelement <8 x i32> [[R3]], i32 [[TMP10]], i32 4
+; AVX512-NEXT:    [[TMP11:%.*]] = extractelement <8 x i32> [[TMP5]], i32 5
+; AVX512-NEXT:    [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[TMP11]], i32 5
+; AVX512-NEXT:    [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6
+; AVX512-NEXT:    [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7
+; AVX512-NEXT:    ret <8 x i32> [[R7]]
+;
+  %a0 = extractelement <8 x i32> %a, i32 0
+  %a1 = extractelement <8 x i32> %a, i32 1
+  %a2 = extractelement <8 x i32> %a, i32 2
+  %a3 = extractelement <8 x i32> %a, i32 3
+  %a4 = extractelement <8 x i32> %a, i32 4
+  %a5 = extractelement <8 x i32> %a, i32 5
+  %a6 = extractelement <8 x i32> %a, i32 6
+  %a7 = extractelement <8 x i32> %a, i32 7
+  %b0 = extractelement <8 x i32> %b, i32 0
+  %b1 = extractelement <8 x i32> %b, i32 1
+  %b2 = extractelement <8 x i32> %b, i32 2
+  %b3 = extractelement <8 x i32> %b, i32 3
+  %b4 = extractelement <8 x i32> %b, i32 4
+  %b5 = extractelement <8 x i32> %b, i32 5
+  %b6 = extractelement <8 x i32> %b, i32 6
+  %b7 = extractelement <8 x i32> %b, i32 7
+  %ab0 = ashr i32 %a0, %b0
+  %ab1 = ashr i32 %a1, %b1
+  %ab2 = lshr i32 %a2, %b2
+  %ab3 = lshr i32 %a3, %b3
+  %ab4 = lshr i32 %a4, %b4
+  %ab5 = lshr i32 %a5, %b5
+  %ab6 = shl  i32 %a6, %b6
+  %ab7 = shl  i32 %a7, %b7
+  %r0 = insertelement <8 x i32> poison, i32 %ab0, i32 0
+  %r1 = insertelement <8 x i32>   %r0, i32 %ab1, i32 1
+  %r2 = insertelement <8 x i32>   %r1, i32 %ab2, i32 2
+  %r3 = insertelement <8 x i32>   %r2, i32 %ab3, i32 3
+  %r4 = insertelement <8 x i32>   %r3, i32 %ab4, i32 4
+  %r5 = insertelement <8 x i32>   %r4, i32 %ab5, i32 5
+  %r6 = insertelement <8 x i32>   %r5, i32 %ab6, i32 6
+  %r7 = insertelement <8 x i32>   %r6, i32 %ab7, i32 7
+  ret <8 x i32> %r7
+}
+
+define <8 x i32> @add_v8i32_undefs(<8 x i32> %a) {
+; CHECK-LABEL: @add_v8i32_undefs(
+; CHECK-NEXT:    [[TMP1:%.*]] = add <8 x i32> [[A:%.*]], <i32 undef, i32 4, i32 8, i32 16, i32 undef, i32 4, i32 8, i32 16>
+; CHECK-NEXT:    ret <8 x i32> [[TMP1]]
+;
+  %a0 = extractelement <8 x i32> %a, i32 0
+  %a1 = extractelement <8 x i32> %a, i32 1
+  %a2 = extractelement <8 x i32> %a, i32 2
+  %a3 = extractelement <8 x i32> %a, i32 3
+  %a4 = extractelement <8 x i32> %a, i32 4
+  %a5 = extractelement <8 x i32> %a, i32 5
+  %a6 = extractelement <8 x i32> %a, i32 6
+  %a7 = extractelement <8 x i32> %a, i32 7
+  %ab0 = add i32 %a0, undef
+  %ab1 = add i32 %a1, 4
+  %ab2 = add i32 %a2, 8
+  %ab3 = add i32 %a3, 16
+  %ab4 = add i32 %a4, undef
+  %ab5 = add i32 %a5, 4
+  %ab6 = add i32 %a6, 8
+  %ab7 = add i32 %a7, 16
+  %r0 = insertelement <8 x i32> poison, i32 %ab0, i32 0
+  %r1 = insertelement <8 x i32>   %r0, i32 %ab1, i32 1
+  %r2 = insertelement <8 x i32>   %r1, i32 %ab2, i32 2
+  %r3 = insertelement <8 x i32>   %r2, i32 %ab3, i32 3
+  %r4 = insertelement <8 x i32>   %r3, i32 %ab4, i32 4
+  %r5 = insertelement <8 x i32>   %r4, i32 %ab5, i32 5
+  %r6 = insertelement <8 x i32>   %r5, i32 %ab6, i32 6
+  %r7 = insertelement <8 x i32>   %r6, i32 %ab7, i32 7
+  ret <8 x i32> %r7
+}
+
+define <8 x i32> @sdiv_v8i32_undefs(<8 x i32> %a) {
+; CHECK-LABEL: @sdiv_v8i32_undefs(
+; CHECK-NEXT:    [[A1:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 1
+; CHECK-NEXT:    [[A2:%.*]] = extractelement <8 x i32> [[A]], i32 2
+; CHECK-NEXT:    [[A3:%.*]] = extractelement <8 x i32> [[A]], i32 3
+; CHECK-NEXT:    [[A5:%.*]] = extractelement <8 x i32> [[A]], i32 5
+; CHECK-NEXT:    [[A6:%.*]] = extractelement <8 x i32> [[A]], i32 6
+; CHECK-NEXT:    [[A7:%.*]] = extractelement <8 x i32> [[A]], i32 7
+; CHECK-NEXT:    [[AB1:%.*]] = sdiv i32 [[A1]], 4
+; CHECK-NEXT:    [[AB2:%.*]] = sdiv i32 [[A2]], 8
+; CHECK-NEXT:    [[AB3:%.*]] = sdiv i32 [[A3]], 16
+; CHECK-NEXT:    [[AB5:%.*]] = sdiv i32 [[A5]], 4
+; CHECK-NEXT:    [[AB6:%.*]] = sdiv i32 [[A6]], 8
+; CHECK-NEXT:    [[AB7:%.*]] = sdiv i32 [[A7]], 16
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <8 x i32> undef, i32 [[AB1]], i32 1
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <8 x i32> [[TMP1]], i32 [[AB2]], i32 2
+; CHECK-NEXT:    [[R4:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[AB3]], i32 3
+; CHECK-NEXT:    [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[AB5]], i32 5
+; CHECK-NEXT:    [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6
+; CHECK-NEXT:    [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7
+; CHECK-NEXT:    ret <8 x i32> [[R7]]
+;
+  %a0 = extractelement <8 x i32> %a, i32 0
+  %a1 = extractelement <8 x i32> %a, i32 1
+  %a2 = extractelement <8 x i32> %a, i32 2
+  %a3 = extractelement <8 x i32> %a, i32 3
+  %a4 = extractelement <8 x i32> %a, i32 4
+  %a5 = extractelement <8 x i32> %a, i32 5
+  %a6 = extractelement <8 x i32> %a, i32 6
+  %a7 = extractelement <8 x i32> %a, i32 7
+  %ab0 = sdiv i32 %a0, undef
+  %ab1 = sdiv i32 %a1, 4
+  %ab2 = sdiv i32 %a2, 8
+  %ab3 = sdiv i32 %a3, 16
+  %ab4 = sdiv i32 %a4, undef
+  %ab5 = sdiv i32 %a5, 4
+  %ab6 = sdiv i32 %a6, 8
+  %ab7 = sdiv i32 %a7, 16
+  %r0 = insertelement <8 x i32> poison, i32 %ab0, i32 0
+  %r1 = insertelement <8 x i32>   %r0, i32 %ab1, i32 1
+  %r2 = insertelement <8 x i32>   %r1, i32 %ab2, i32 2
+  %r3 = insertelement <8 x i32>   %r2, i32 %ab3, i32 3
+  %r4 = insertelement <8 x i32>   %r3, i32 %ab4, i32 4
+  %r5 = insertelement <8 x i32>   %r4, i32 %ab5, i32 5
+  %r6 = insertelement <8 x i32>   %r5, i32 %ab6, i32 6
+  %r7 = insertelement <8 x i32>   %r6, i32 %ab7, i32 7
+  ret <8 x i32> %r7
+}
+
+define <8 x i32> @add_sub_v8i32_splat(<8 x i32> %a, i32 %b) {
+; CHECK-LABEL: @add_sub_v8i32_splat(
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <8 x i32> undef, i32 [[B:%.*]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> undef, <8 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP3:%.*]] = add <8 x i32> [[TMP2]], [[A:%.*]]
+; CHECK-NEXT:    [[TMP4:%.*]] = sub <8 x i32> [[TMP2]], [[A]]
+; CHECK-NEXT:    [[R7:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:    ret <8 x i32> [[R7]]
+;
+  %a0 = extractelement <8 x i32> %a, i32 0
+  %a1 = extractelement <8 x i32> %a, i32 1
+  %a2 = extractelement <8 x i32> %a, i32 2
+  %a3 = extractelement <8 x i32> %a, i32 3
+  %a4 = extractelement <8 x i32> %a, i32 4
+  %a5 = extractelement <8 x i32> %a, i32 5
+  %a6 = extractelement <8 x i32> %a, i32 6
+  %a7 = extractelement <8 x i32> %a, i32 7
+  %ab0 = add i32 %a0, %b
+  %ab1 = add i32 %b, %a1
+  %ab2 = add i32 %a2, %b
+  %ab3 = add i32 %b, %a3
+  %ab4 = sub i32 %b, %a4
+  %ab5 = sub i32 %b, %a5
+  %ab6 = sub i32 %b, %a6
+  %ab7 = sub i32 %b, %a7
+  %r0 = insertelement <8 x i32> poison, i32 %ab0, i32 0
+  %r1 = insertelement <8 x i32>   %r0, i32 %ab1, i32 1
+  %r2 = insertelement <8 x i32>   %r1, i32 %ab2, i32 2
+  %r3 = insertelement <8 x i32>   %r2, i32 %ab3, i32 3
+  %r4 = insertelement <8 x i32>   %r3, i32 %ab4, i32 4
+  %r5 = insertelement <8 x i32>   %r4, i32 %ab5, i32 5
+  %r6 = insertelement <8 x i32>   %r5, i32 %ab6, i32 6
+  %r7 = insertelement <8 x i32>   %r6, i32 %ab7, i32 7
+  ret <8 x i32> %r7
+}
--- a/llvm/test/Transforms/SLPVectorizer/X86/arith-fp-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-fp-inseltpoison.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/blending-shuffle-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/blending-shuffle-inseltpoison.ll
@ -0,0 +1,200 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -slp-vectorizer -S -o - -mtriple=x86_64-unknown-linux -mcpu=bdver2 -instcombine | FileCheck %s
+
+define <2 x i8> @g(<2 x i8> %x, <2 x i8> %y) {
+; CHECK-LABEL: @g(
+; CHECK-NEXT:    [[X0:%.*]] = extractelement <2 x i8> [[X:%.*]], i32 0
+; CHECK-NEXT:    [[Y1:%.*]] = extractelement <2 x i8> [[Y:%.*]], i32 1
+; CHECK-NEXT:    [[X0X0:%.*]] = mul i8 [[X0]], [[X0]]
+; CHECK-NEXT:    [[Y1Y1:%.*]] = mul i8 [[Y1]], [[Y1]]
+; CHECK-NEXT:    [[INS1:%.*]] = insertelement <2 x i8> poison, i8 [[X0X0]], i32 0
+; CHECK-NEXT:    [[INS2:%.*]] = insertelement <2 x i8> [[INS1]], i8 [[Y1Y1]], i32 1
+; CHECK-NEXT:    ret <2 x i8> [[INS2]]
+;
+  %x0 = extractelement <2 x i8> %x, i32 0
+  %y1 = extractelement <2 x i8> %y, i32 1
+  %x0x0 = mul i8 %x0, %x0
+  %y1y1 = mul i8 %y1, %y1
+  %ins1 = insertelement <2 x i8> poison, i8 %x0x0, i32 0
+  %ins2 = insertelement <2 x i8> %ins1, i8 %y1y1, i32 1
+  ret <2 x i8> %ins2
+}
+
+define <4 x i8> @h(<4 x i8> %x, <4 x i8> %y) {
+; CHECK-LABEL: @h(
+; CHECK-NEXT:    [[X0:%.*]] = extractelement <4 x i8> [[X:%.*]], i32 0
+; CHECK-NEXT:    [[X3:%.*]] = extractelement <4 x i8> [[X]], i32 3
+; CHECK-NEXT:    [[Y1:%.*]] = extractelement <4 x i8> [[Y:%.*]], i32 1
+; CHECK-NEXT:    [[Y2:%.*]] = extractelement <4 x i8> [[Y]], i32 2
+; CHECK-NEXT:    [[X0X0:%.*]] = mul i8 [[X0]], [[X0]]
+; CHECK-NEXT:    [[X3X3:%.*]] = mul i8 [[X3]], [[X3]]
+; CHECK-NEXT:    [[Y1Y1:%.*]] = mul i8 [[Y1]], [[Y1]]
+; CHECK-NEXT:    [[Y2Y2:%.*]] = mul i8 [[Y2]], [[Y2]]
+; CHECK-NEXT:    [[INS1:%.*]] = insertelement <4 x i8> poison, i8 [[X0X0]], i32 0
+; CHECK-NEXT:    [[INS2:%.*]] = insertelement <4 x i8> [[INS1]], i8 [[X3X3]], i32 1
+; CHECK-NEXT:    [[INS3:%.*]] = insertelement <4 x i8> [[INS2]], i8 [[Y1Y1]], i32 2
+; CHECK-NEXT:    [[INS4:%.*]] = insertelement <4 x i8> [[INS3]], i8 [[Y2Y2]], i32 3
+; CHECK-NEXT:    ret <4 x i8> [[INS4]]
+;
+  %x0 = extractelement <4 x i8> %x, i32 0
+  %x3 = extractelement <4 x i8> %x, i32 3
+  %y1 = extractelement <4 x i8> %y, i32 1
+  %y2 = extractelement <4 x i8> %y, i32 2
+  %x0x0 = mul i8 %x0, %x0
+  %x3x3 = mul i8 %x3, %x3
+  %y1y1 = mul i8 %y1, %y1
+  %y2y2 = mul i8 %y2, %y2
+  %ins1 = insertelement <4 x i8> poison, i8 %x0x0, i32 0
+  %ins2 = insertelement <4 x i8> %ins1, i8 %x3x3, i32 1
+  %ins3 = insertelement <4 x i8> %ins2, i8 %y1y1, i32 2
+  %ins4 = insertelement <4 x i8> %ins3, i8 %y2y2, i32 3
+  ret <4 x i8> %ins4
+}
+
+define <4 x i8> @h_undef(<4 x i8> %x, <4 x i8> %y) {
+; CHECK-LABEL: @h_undef(
+; CHECK-NEXT:    [[X3:%.*]] = extractelement <4 x i8> [[X:%.*]], i32 3
+; CHECK-NEXT:    [[Y1:%.*]] = extractelement <4 x i8> [[Y:%.*]], i32 1
+; CHECK-NEXT:    [[Y2:%.*]] = extractelement <4 x i8> [[Y]], i32 2
+; CHECK-NEXT:    [[X3X3:%.*]] = mul i8 [[X3]], [[X3]]
+; CHECK-NEXT:    [[Y1Y1:%.*]] = mul i8 [[Y1]], [[Y1]]
+; CHECK-NEXT:    [[Y2Y2:%.*]] = mul i8 [[Y2]], [[Y2]]
+; CHECK-NEXT:    [[INS2:%.*]] = insertelement <4 x i8> undef, i8 [[X3X3]], i32 1
+; CHECK-NEXT:    [[INS3:%.*]] = insertelement <4 x i8> [[INS2]], i8 [[Y1Y1]], i32 2
+; CHECK-NEXT:    [[INS4:%.*]] = insertelement <4 x i8> [[INS3]], i8 [[Y2Y2]], i32 3
+; CHECK-NEXT:    ret <4 x i8> [[INS4]]
+;
+  %x0 = extractelement <4 x i8> undef, i32 0
+  %x3 = extractelement <4 x i8> %x, i32 3
+  %y1 = extractelement <4 x i8> %y, i32 1
+  %y2 = extractelement <4 x i8> %y, i32 2
+  %x0x0 = mul i8 %x0, %x0
+  %x3x3 = mul i8 %x3, %x3
+  %y1y1 = mul i8 %y1, %y1
+  %y2y2 = mul i8 %y2, %y2
+  %ins1 = insertelement <4 x i8> poison, i8 %x0x0, i32 0
+  %ins2 = insertelement <4 x i8> %ins1, i8 %x3x3, i32 1
+  %ins3 = insertelement <4 x i8> %ins2, i8 %y1y1, i32 2
+  %ins4 = insertelement <4 x i8> %ins3, i8 %y2y2, i32 3
+  ret <4 x i8> %ins4
+}
+
+define i8 @i(<4 x i8> %x, <4 x i8> %y) {
+; CHECK-LABEL: @i(
+; CHECK-NEXT:    [[X0:%.*]] = extractelement <4 x i8> [[X:%.*]], i32 0
+; CHECK-NEXT:    [[X3:%.*]] = extractelement <4 x i8> [[X]], i32 3
+; CHECK-NEXT:    [[Y1:%.*]] = extractelement <4 x i8> [[Y:%.*]], i32 1
+; CHECK-NEXT:    [[Y2:%.*]] = extractelement <4 x i8> [[Y]], i32 2
+; CHECK-NEXT:    [[X0X0:%.*]] = mul i8 [[X0]], [[X0]]
+; CHECK-NEXT:    [[X3X3:%.*]] = mul i8 [[X3]], [[X3]]
+; CHECK-NEXT:    [[Y1Y1:%.*]] = mul i8 [[Y1]], [[Y1]]
+; CHECK-NEXT:    [[Y2Y2:%.*]] = mul i8 [[Y2]], [[Y2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = add i8 [[X0X0]], [[X3X3]]
+; CHECK-NEXT:    [[TMP2:%.*]] = add i8 [[Y1Y1]], [[Y2Y2]]
+; CHECK-NEXT:    [[TMP3:%.*]] = add i8 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret i8 [[TMP3]]
+;
+  %x0 = extractelement <4 x i8> %x, i32 0
+  %x3 = extractelement <4 x i8> %x, i32 3
+  %y1 = extractelement <4 x i8> %y, i32 1
+  %y2 = extractelement <4 x i8> %y, i32 2
+  %x0x0 = mul i8 %x0, %x0
+  %x3x3 = mul i8 %x3, %x3
+  %y1y1 = mul i8 %y1, %y1
+  %y2y2 = mul i8 %y2, %y2
+  %1 = add i8 %x0x0, %x3x3
+  %2 = add i8 %y1y1, %y2y2
+  %3 = add i8 %1, %2
+  ret i8 %3
+}
+
+define i8 @j(<4 x i8> %x, <4 x i8> %y) {
+; CHECK-LABEL: @j(
+; CHECK-NEXT:    [[X0:%.*]] = extractelement <4 x i8> [[X:%.*]], i32 0
+; CHECK-NEXT:    [[X3:%.*]] = extractelement <4 x i8> [[X]], i32 3
+; CHECK-NEXT:    [[Y1:%.*]] = extractelement <4 x i8> [[Y:%.*]], i32 1
+; CHECK-NEXT:    [[Y2:%.*]] = extractelement <4 x i8> [[Y]], i32 2
+; CHECK-NEXT:    [[X0X0:%.*]] = mul i8 [[X0]], [[X0]]
+; CHECK-NEXT:    [[X3X3:%.*]] = mul i8 [[X3]], [[X3]]
+; CHECK-NEXT:    [[Y1Y1:%.*]] = mul i8 [[Y1]], [[Y1]]
+; CHECK-NEXT:    [[Y2Y2:%.*]] = mul i8 [[Y2]], [[Y2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = add i8 [[X0X0]], [[X3X3]]
+; CHECK-NEXT:    [[TMP2:%.*]] = add i8 [[Y1Y1]], [[Y2Y2]]
+; CHECK-NEXT:    [[TMP3:%.*]] = sdiv i8 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret i8 [[TMP3]]
+;
+  %x0 = extractelement <4 x i8> %x, i32 0
+  %x3 = extractelement <4 x i8> %x, i32 3
+  %y1 = extractelement <4 x i8> %y, i32 1
+  %y2 = extractelement <4 x i8> %y, i32 2
+  %x0x0 = mul i8 %x0, %x0
+  %x3x3 = mul i8 %x3, %x3
+  %y1y1 = mul i8 %y1, %y1
+  %y2y2 = mul i8 %y2, %y2
+  %1 = add i8 %x0x0, %x3x3
+  %2 = add i8 %y1y1, %y2y2
+  %3 = sdiv i8 %1, %2
+  ret i8 %3
+}
+
+define i8 @k(<4 x i8> %x) {
+; CHECK-LABEL: @k(
+; CHECK-NEXT:    [[X0:%.*]] = extractelement <4 x i8> [[X:%.*]], i32 0
+; CHECK-NEXT:    [[X3:%.*]] = extractelement <4 x i8> [[X]], i32 3
+; CHECK-NEXT:    [[X1:%.*]] = extractelement <4 x i8> [[X]], i32 1
+; CHECK-NEXT:    [[X2:%.*]] = extractelement <4 x i8> [[X]], i32 2
+; CHECK-NEXT:    [[X0X0:%.*]] = mul i8 [[X0]], [[X0]]
+; CHECK-NEXT:    [[X3X3:%.*]] = mul i8 [[X3]], [[X3]]
+; CHECK-NEXT:    [[X1X1:%.*]] = mul i8 [[X1]], [[X1]]
+; CHECK-NEXT:    [[X2X2:%.*]] = mul i8 [[X2]], [[X2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = add i8 [[X0X0]], [[X3X3]]
+; CHECK-NEXT:    [[TMP2:%.*]] = add i8 [[X1X1]], [[X2X2]]
+; CHECK-NEXT:    [[TMP3:%.*]] = sdiv i8 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret i8 [[TMP3]]
+;
+  %x0 = extractelement <4 x i8> %x, i32 0
+  %x3 = extractelement <4 x i8> %x, i32 3
+  %x1 = extractelement <4 x i8> %x, i32 1
+  %x2 = extractelement <4 x i8> %x, i32 2
+  %x0x0 = mul i8 %x0, %x0
+  %x3x3 = mul i8 %x3, %x3
+  %x1x1 = mul i8 %x1, %x1
+  %x2x2 = mul i8 %x2, %x2
+  %1 = add i8 %x0x0, %x3x3
+  %2 = add i8 %x1x1, %x2x2
+  %3 = sdiv i8 %1, %2
+  ret i8 %3
+}
+
+define i8 @k_bb(<4 x i8> %x) {
+; CHECK-LABEL: @k_bb(
+; CHECK-NEXT:    [[X0:%.*]] = extractelement <4 x i8> [[X:%.*]], i32 0
+; CHECK-NEXT:    br label [[BB1:%.*]]
+; CHECK:       bb1:
+; CHECK-NEXT:    [[X3:%.*]] = extractelement <4 x i8> [[X]], i32 3
+; CHECK-NEXT:    [[X1:%.*]] = extractelement <4 x i8> [[X]], i32 1
+; CHECK-NEXT:    [[X2:%.*]] = extractelement <4 x i8> [[X]], i32 2
+; CHECK-NEXT:    [[X0X0:%.*]] = mul i8 [[X0]], [[X0]]
+; CHECK-NEXT:    [[X3X3:%.*]] = mul i8 [[X3]], [[X3]]
+; CHECK-NEXT:    [[X1X1:%.*]] = mul i8 [[X1]], [[X1]]
+; CHECK-NEXT:    [[X2X2:%.*]] = mul i8 [[X2]], [[X2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = add i8 [[X0X0]], [[X3X3]]
+; CHECK-NEXT:    [[TMP2:%.*]] = add i8 [[X1X1]], [[X2X2]]
+; CHECK-NEXT:    [[TMP3:%.*]] = sdiv i8 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret i8 [[TMP3]]
+;
+  %x0 = extractelement <4 x i8> %x, i32 0
+  br label %bb1
+bb1:
+  %x3 = extractelement <4 x i8> %x, i32 3
+  %x1 = extractelement <4 x i8> %x, i32 1
+  %x2 = extractelement <4 x i8> %x, i32 2
+  %x0x0 = mul i8 %x0, %x0
+  %x3x3 = mul i8 %x3, %x3
+  %x1x1 = mul i8 %x1, %x1
+  %x2x2 = mul i8 %x2, %x2
+  %1 = add i8 %x0x0, %x3x3
+  %2 = add i8 %x1x1, %x2x2
+  %3 = sdiv i8 %1, %2
+  ret i8 %3
+}
--- a/llvm/test/Transforms/SLPVectorizer/X86/cmp_commute-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/cmp_commute-inseltpoison.ll
@ -0,0 +1,283 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64--- -mattr=+sse2 | FileCheck %s
+; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64--- -mattr=+avx  | FileCheck %s
+
+;
+; Check that we can commute operands based on the predicate.
+;
+
+define <4 x i32> @icmp_eq_v4i32(<4 x i32> %a, i32* %b) {
+; CHECK-LABEL: @icmp_eq_v4i32(
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[B:%.*]] to <4 x i32>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp eq <4 x i32> [[TMP2]], [[A:%.*]]
+; CHECK-NEXT:    [[R:%.*]] = sext <4 x i1> [[TMP3]] to <4 x i32>
+; CHECK-NEXT:    ret <4 x i32> [[R]]
+;
+  %a0 = extractelement <4 x i32> %a, i32 0
+  %a1 = extractelement <4 x i32> %a, i32 1
+  %a2 = extractelement <4 x i32> %a, i32 2
+  %a3 = extractelement <4 x i32> %a, i32 3
+  %p0 = getelementptr inbounds i32, i32* %b, i32 0
+  %p1 = getelementptr inbounds i32, i32* %b, i32 1
+  %p2 = getelementptr inbounds i32, i32* %b, i32 2
+  %p3 = getelementptr inbounds i32, i32* %b, i32 3
+  %b0 = load i32, i32* %p0, align 4
+  %b1 = load i32, i32* %p1, align 4
+  %b2 = load i32, i32* %p2, align 4
+  %b3 = load i32, i32* %p3, align 4
+  %c0 = icmp eq i32 %a0, %b0
+  %c1 = icmp eq i32 %b1, %a1
+  %c2 = icmp eq i32 %b2, %a2
+  %c3 = icmp eq i32 %a3, %b3
+  %d0 = insertelement <4 x i1> poison, i1 %c0, i32 0
+  %d1 = insertelement <4 x i1>   %d0, i1 %c1, i32 1
+  %d2 = insertelement <4 x i1>   %d1, i1 %c2, i32 2
+  %d3 = insertelement <4 x i1>   %d2, i1 %c3, i32 3
+  %r = sext <4 x i1> %d3 to <4 x i32>
+  ret <4 x i32> %r
+}
+
+define <4 x i32> @icmp_ne_v4i32(<4 x i32> %a, i32* %b) {
+; CHECK-LABEL: @icmp_ne_v4i32(
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[B:%.*]] to <4 x i32>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp ne <4 x i32> [[TMP2]], [[A:%.*]]
+; CHECK-NEXT:    [[R:%.*]] = sext <4 x i1> [[TMP3]] to <4 x i32>
+; CHECK-NEXT:    ret <4 x i32> [[R]]
+;
+  %a0 = extractelement <4 x i32> %a, i32 0
+  %a1 = extractelement <4 x i32> %a, i32 1
+  %a2 = extractelement <4 x i32> %a, i32 2
+  %a3 = extractelement <4 x i32> %a, i32 3
+  %p0 = getelementptr inbounds i32, i32* %b, i32 0
+  %p1 = getelementptr inbounds i32, i32* %b, i32 1
+  %p2 = getelementptr inbounds i32, i32* %b, i32 2
+  %p3 = getelementptr inbounds i32, i32* %b, i32 3
+  %b0 = load i32, i32* %p0, align 4
+  %b1 = load i32, i32* %p1, align 4
+  %b2 = load i32, i32* %p2, align 4
+  %b3 = load i32, i32* %p3, align 4
+  %c0 = icmp ne i32 %a0, %b0
+  %c1 = icmp ne i32 %b1, %a1
+  %c2 = icmp ne i32 %b2, %a2
+  %c3 = icmp ne i32 %a3, %b3
+  %d0 = insertelement <4 x i1> poison, i1 %c0, i32 0
+  %d1 = insertelement <4 x i1>   %d0, i1 %c1, i32 1
+  %d2 = insertelement <4 x i1>   %d1, i1 %c2, i32 2
+  %d3 = insertelement <4 x i1>   %d2, i1 %c3, i32 3
+  %r = sext <4 x i1> %d3 to <4 x i32>
+  ret <4 x i32> %r
+}
+
+define <4 x i32> @fcmp_oeq_v4i32(<4 x float> %a, float* %b) {
+; CHECK-LABEL: @fcmp_oeq_v4i32(
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast float* [[B:%.*]] to <4 x float>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = fcmp oeq <4 x float> [[TMP2]], [[A:%.*]]
+; CHECK-NEXT:    [[R:%.*]] = sext <4 x i1> [[TMP3]] to <4 x i32>
+; CHECK-NEXT:    ret <4 x i32> [[R]]
+;
+  %a0 = extractelement <4 x float> %a, i32 0
+  %a1 = extractelement <4 x float> %a, i32 1
+  %a2 = extractelement <4 x float> %a, i32 2
+  %a3 = extractelement <4 x float> %a, i32 3
+  %p0 = getelementptr inbounds float, float* %b, i32 0
+  %p1 = getelementptr inbounds float, float* %b, i32 1
+  %p2 = getelementptr inbounds float, float* %b, i32 2
+  %p3 = getelementptr inbounds float, float* %b, i32 3
+  %b0 = load float, float* %p0, align 4
+  %b1 = load float, float* %p1, align 4
+  %b2 = load float, float* %p2, align 4
+  %b3 = load float, float* %p3, align 4
+  %c0 = fcmp oeq float %a0, %b0
+  %c1 = fcmp oeq float %b1, %a1
+  %c2 = fcmp oeq float %b2, %a2
+  %c3 = fcmp oeq float %a3, %b3
+  %d0 = insertelement <4 x i1> poison, i1 %c0, i32 0
+  %d1 = insertelement <4 x i1>   %d0, i1 %c1, i32 1
+  %d2 = insertelement <4 x i1>   %d1, i1 %c2, i32 2
+  %d3 = insertelement <4 x i1>   %d2, i1 %c3, i32 3
+  %r = sext <4 x i1> %d3 to <4 x i32>
+  ret <4 x i32> %r
+}
+
+define <4 x i32> @fcmp_uno_v4i32(<4 x float> %a, float* %b) {
+; CHECK-LABEL: @fcmp_uno_v4i32(
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast float* [[B:%.*]] to <4 x float>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = fcmp uno <4 x float> [[TMP2]], [[A:%.*]]
+; CHECK-NEXT:    [[R:%.*]] = sext <4 x i1> [[TMP3]] to <4 x i32>
+; CHECK-NEXT:    ret <4 x i32> [[R]]
+;
+  %a0 = extractelement <4 x float> %a, i32 0
+  %a1 = extractelement <4 x float> %a, i32 1
+  %a2 = extractelement <4 x float> %a, i32 2
+  %a3 = extractelement <4 x float> %a, i32 3
+  %p0 = getelementptr inbounds float, float* %b, i32 0
+  %p1 = getelementptr inbounds float, float* %b, i32 1
+  %p2 = getelementptr inbounds float, float* %b, i32 2
+  %p3 = getelementptr inbounds float, float* %b, i32 3
+  %b0 = load float, float* %p0, align 4
+  %b1 = load float, float* %p1, align 4
+  %b2 = load float, float* %p2, align 4
+  %b3 = load float, float* %p3, align 4
+  %c0 = fcmp uno float %a0, %b0
+  %c1 = fcmp uno float %b1, %a1
+  %c2 = fcmp uno float %b2, %a2
+  %c3 = fcmp uno float %a3, %b3
+  %d0 = insertelement <4 x i1> poison, i1 %c0, i32 0
+  %d1 = insertelement <4 x i1>   %d0, i1 %c1, i32 1
+  %d2 = insertelement <4 x i1>   %d1, i1 %c2, i32 2
+  %d3 = insertelement <4 x i1>   %d2, i1 %c3, i32 3
+  %r = sext <4 x i1> %d3 to <4 x i32>
+  ret <4 x i32> %r
+}
+
+;
+; Check that we can commute operands by swapping the predicate.
+;
+
+define <4 x i32> @icmp_sgt_slt_v4i32(<4 x i32> %a, i32* %b) {
+; CHECK-LABEL: @icmp_sgt_slt_v4i32(
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[B:%.*]] to <4 x i32>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp slt <4 x i32> [[TMP2]], [[A:%.*]]
+; CHECK-NEXT:    [[R:%.*]] = sext <4 x i1> [[TMP3]] to <4 x i32>
+; CHECK-NEXT:    ret <4 x i32> [[R]]
+;
+  %a0 = extractelement <4 x i32> %a, i32 0
+  %a1 = extractelement <4 x i32> %a, i32 1
+  %a2 = extractelement <4 x i32> %a, i32 2
+  %a3 = extractelement <4 x i32> %a, i32 3
+  %p0 = getelementptr inbounds i32, i32* %b, i32 0
+  %p1 = getelementptr inbounds i32, i32* %b, i32 1
+  %p2 = getelementptr inbounds i32, i32* %b, i32 2
+  %p3 = getelementptr inbounds i32, i32* %b, i32 3
+  %b0 = load i32, i32* %p0, align 4
+  %b1 = load i32, i32* %p1, align 4
+  %b2 = load i32, i32* %p2, align 4
+  %b3 = load i32, i32* %p3, align 4
+  %c0 = icmp sgt i32 %a0, %b0
+  %c1 = icmp slt i32 %b1, %a1
+  %c2 = icmp slt i32 %b2, %a2
+  %c3 = icmp sgt i32 %a3, %b3
+  %d0 = insertelement <4 x i1> poison, i1 %c0, i32 0
+  %d1 = insertelement <4 x i1>   %d0, i1 %c1, i32 1
+  %d2 = insertelement <4 x i1>   %d1, i1 %c2, i32 2
+  %d3 = insertelement <4 x i1>   %d2, i1 %c3, i32 3
+  %r = sext <4 x i1> %d3 to <4 x i32>
+  ret <4 x i32> %r
+}
+
+define <4 x i32> @icmp_uge_ule_v4i32(<4 x i32> %a, i32* %b) {
+; CHECK-LABEL: @icmp_uge_ule_v4i32(
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[B:%.*]] to <4 x i32>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp ule <4 x i32> [[TMP2]], [[A:%.*]]
+; CHECK-NEXT:    [[R:%.*]] = sext <4 x i1> [[TMP3]] to <4 x i32>
+; CHECK-NEXT:    ret <4 x i32> [[R]]
+;
+  %a0 = extractelement <4 x i32> %a, i32 0
+  %a1 = extractelement <4 x i32> %a, i32 1
+  %a2 = extractelement <4 x i32> %a, i32 2
+  %a3 = extractelement <4 x i32> %a, i32 3
+  %p0 = getelementptr inbounds i32, i32* %b, i32 0
+  %p1 = getelementptr inbounds i32, i32* %b, i32 1
+  %p2 = getelementptr inbounds i32, i32* %b, i32 2
+  %p3 = getelementptr inbounds i32, i32* %b, i32 3
+  %b0 = load i32, i32* %p0, align 4
+  %b1 = load i32, i32* %p1, align 4
+  %b2 = load i32, i32* %p2, align 4
+  %b3 = load i32, i32* %p3, align 4
+  %c0 = icmp uge i32 %a0, %b0
+  %c1 = icmp ule i32 %b1, %a1
+  %c2 = icmp ule i32 %b2, %a2
+  %c3 = icmp uge i32 %a3, %b3
+  %d0 = insertelement <4 x i1> poison, i1 %c0, i32 0
+  %d1 = insertelement <4 x i1>   %d0, i1 %c1, i32 1
+  %d2 = insertelement <4 x i1>   %d1, i1 %c2, i32 2
+  %d3 = insertelement <4 x i1>   %d2, i1 %c3, i32 3
+  %r = sext <4 x i1> %d3 to <4 x i32>
+  ret <4 x i32> %r
+}
+
+define <4 x i32> @fcmp_ogt_olt_v4i32(<4 x float> %a, float* %b) {
+; CHECK-LABEL: @fcmp_ogt_olt_v4i32(
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast float* [[B:%.*]] to <4 x float>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = fcmp olt <4 x float> [[TMP2]], [[A:%.*]]
+; CHECK-NEXT:    [[R:%.*]] = sext <4 x i1> [[TMP3]] to <4 x i32>
+; CHECK-NEXT:    ret <4 x i32> [[R]]
+;
+  %a0 = extractelement <4 x float> %a, i32 0
+  %a1 = extractelement <4 x float> %a, i32 1
+  %a2 = extractelement <4 x float> %a, i32 2
+  %a3 = extractelement <4 x float> %a, i32 3
+  %p0 = getelementptr inbounds float, float* %b, i32 0
+  %p1 = getelementptr inbounds float, float* %b, i32 1
+  %p2 = getelementptr inbounds float, float* %b, i32 2
+  %p3 = getelementptr inbounds float, float* %b, i32 3
+  %b0 = load float, float* %p0, align 4
+  %b1 = load float, float* %p1, align 4
+  %b2 = load float, float* %p2, align 4
+  %b3 = load float, float* %p3, align 4
+  %c0 = fcmp ogt float %a0, %b0
+  %c1 = fcmp olt float %b1, %a1
+  %c2 = fcmp olt float %b2, %a2
+  %c3 = fcmp ogt float %a3, %b3
+  %d0 = insertelement <4 x i1> poison, i1 %c0, i32 0
+  %d1 = insertelement <4 x i1>   %d0, i1 %c1, i32 1
+  %d2 = insertelement <4 x i1>   %d1, i1 %c2, i32 2
+  %d3 = insertelement <4 x i1>   %d2, i1 %c3, i32 3
+  %r = sext <4 x i1> %d3 to <4 x i32>
+  ret <4 x i32> %r
+}
+
+define <4 x i32> @fcmp_ord_uno_v4i32(<4 x float> %a, float* %b) {
+; CHECK-LABEL: @fcmp_ord_uno_v4i32(
+; CHECK-NEXT:    [[A0:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0
+; CHECK-NEXT:    [[A1:%.*]] = extractelement <4 x float> [[A]], i32 1
+; CHECK-NEXT:    [[A2:%.*]] = extractelement <4 x float> [[A]], i32 2
+; CHECK-NEXT:    [[A3:%.*]] = extractelement <4 x float> [[A]], i32 3
+; CHECK-NEXT:    [[P1:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 1
+; CHECK-NEXT:    [[P2:%.*]] = getelementptr inbounds float, float* [[B]], i64 2
+; CHECK-NEXT:    [[P3:%.*]] = getelementptr inbounds float, float* [[B]], i64 3
+; CHECK-NEXT:    [[B0:%.*]] = load float, float* [[B]], align 4
+; CHECK-NEXT:    [[B1:%.*]] = load float, float* [[P1]], align 4
+; CHECK-NEXT:    [[B2:%.*]] = load float, float* [[P2]], align 4
+; CHECK-NEXT:    [[B3:%.*]] = load float, float* [[P3]], align 4
+; CHECK-NEXT:    [[C0:%.*]] = fcmp ord float [[A0]], [[B0]]
+; CHECK-NEXT:    [[C1:%.*]] = fcmp uno float [[B1]], [[A1]]
+; CHECK-NEXT:    [[C2:%.*]] = fcmp uno float [[B2]], [[A2]]
+; CHECK-NEXT:    [[C3:%.*]] = fcmp ord float [[A3]], [[B3]]
+; CHECK-NEXT:    [[D0:%.*]] = insertelement <4 x i1> poison, i1 [[C0]], i32 0
+; CHECK-NEXT:    [[D1:%.*]] = insertelement <4 x i1> [[D0]], i1 [[C1]], i32 1
+; CHECK-NEXT:    [[D2:%.*]] = insertelement <4 x i1> [[D1]], i1 [[C2]], i32 2
+; CHECK-NEXT:    [[D3:%.*]] = insertelement <4 x i1> [[D2]], i1 [[C3]], i32 3
+; CHECK-NEXT:    [[R:%.*]] = sext <4 x i1> [[D3]] to <4 x i32>
+; CHECK-NEXT:    ret <4 x i32> [[R]]
+;
+  %a0 = extractelement <4 x float> %a, i32 0
+  %a1 = extractelement <4 x float> %a, i32 1
+  %a2 = extractelement <4 x float> %a, i32 2
+  %a3 = extractelement <4 x float> %a, i32 3
+  %p0 = getelementptr inbounds float, float* %b, i32 0
+  %p1 = getelementptr inbounds float, float* %b, i32 1
+  %p2 = getelementptr inbounds float, float* %b, i32 2
+  %p3 = getelementptr inbounds float, float* %b, i32 3
+  %b0 = load float, float* %p0, align 4
+  %b1 = load float, float* %p1, align 4
+  %b2 = load float, float* %p2, align 4
+  %b3 = load float, float* %p3, align 4
+  %c0 = fcmp ord float %a0, %b0
+  %c1 = fcmp uno float %b1, %a1
+  %c2 = fcmp uno float %b2, %a2
+  %c3 = fcmp ord float %a3, %b3
+  %d0 = insertelement <4 x i1> poison, i1 %c0, i32 0
+  %d1 = insertelement <4 x i1>   %d0, i1 %c1, i32 1
+  %d2 = insertelement <4 x i1>   %d1, i1 %c2, i32 2
+  %d3 = insertelement <4 x i1>   %d2, i1 %c3, i32 3
+  %r = sext <4 x i1> %d3 to <4 x i32>
+  ret <4 x i32> %r
+}
--- a/llvm/test/Transforms/SLPVectorizer/X86/crash_scheduling-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/crash_scheduling-inseltpoison.ll
@ -0,0 +1,81 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -basic-aa -slp-vectorizer -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7 | FileCheck %s
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-darwin13.3.0"
+
+define void @_foo(double %p1, double %p2, double %p3) #0 {
+; CHECK-LABEL: @_foo(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TAB1:%.*]] = alloca [256 x i32], align 16
+; CHECK-NEXT:    [[TAB2:%.*]] = alloca [256 x i32], align 16
+; CHECK-NEXT:    br label [[BB1:%.*]]
+; CHECK:       bb1:
+; CHECK-NEXT:    [[MUL19:%.*]] = fmul double [[P1:%.*]], 1.638400e+04
+; CHECK-NEXT:    [[MUL20:%.*]] = fmul double [[P3:%.*]], 1.638400e+04
+; CHECK-NEXT:    [[ADD:%.*]] = fadd double [[MUL20]], 8.192000e+03
+; CHECK-NEXT:    [[MUL21:%.*]] = fmul double [[P2:%.*]], 1.638400e+04
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[INDVARS_IV266:%.*]] = phi i64 [ 0, [[BB1]] ], [ [[INDVARS_IV_NEXT267:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[T_0259:%.*]] = phi double [ 0.000000e+00, [[BB1]] ], [ [[ADD27:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[P3_ADDR_0258:%.*]] = phi double [ [[ADD]], [[BB1]] ], [ [[ADD28:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[VECINIT_I_I237:%.*]] = insertelement <2 x double> poison, double [[T_0259]], i32 0
+; CHECK-NEXT:    [[X13:%.*]] = tail call i32 @_xfn(<2 x double> [[VECINIT_I_I237]])
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [256 x i32], [256 x i32]* [[TAB1]], i64 0, i64 [[INDVARS_IV266]]
+; CHECK-NEXT:    store i32 [[X13]], i32* [[ARRAYIDX]], align 4, [[TBAA0:!tbaa !.*]]
+; CHECK-NEXT:    [[VECINIT_I_I:%.*]] = insertelement <2 x double> poison, double [[P3_ADDR_0258]], i32 0
+; CHECK-NEXT:    [[X14:%.*]] = tail call i32 @_xfn(<2 x double> [[VECINIT_I_I]])
+; CHECK-NEXT:    [[ARRAYIDX26:%.*]] = getelementptr inbounds [256 x i32], [256 x i32]* [[TAB2]], i64 0, i64 [[INDVARS_IV266]]
+; CHECK-NEXT:    store i32 [[X14]], i32* [[ARRAYIDX26]], align 4, [[TBAA0]]
+; CHECK-NEXT:    [[ADD27]] = fadd double [[MUL19]], [[T_0259]]
+; CHECK-NEXT:    [[ADD28]] = fadd double [[MUL21]], [[P3_ADDR_0258]]
+; CHECK-NEXT:    [[INDVARS_IV_NEXT267]] = add nuw nsw i64 [[INDVARS_IV266]], 1
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT267]], 256
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[RETURN:%.*]], label [[FOR_BODY]]
+; CHECK:       return:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %tab1 = alloca [256 x i32], align 16
+  %tab2 = alloca [256 x i32], align 16
+  br label %bb1
+
+
+bb1:
+  %mul19 = fmul double %p1, 1.638400e+04
+  %mul20 = fmul double %p3, 1.638400e+04
+  %add = fadd double %mul20, 8.192000e+03
+  %mul21 = fmul double %p2, 1.638400e+04
+  ; The SLPVectorizer crashed when scheduling this block after it inserted an
+  ; insertelement instruction (during vectorizing the for.body block) at this position.
+  br label %for.body
+
+for.body:
+  %indvars.iv266 = phi i64 [ 0, %bb1 ], [ %indvars.iv.next267, %for.body ]
+  %t.0259 = phi double [ 0.000000e+00, %bb1 ], [ %add27, %for.body ]
+  %p3.addr.0258 = phi double [ %add, %bb1 ], [ %add28, %for.body ]
+  %vecinit.i.i237 = insertelement <2 x double> poison, double %t.0259, i32 0
+  %x13 = tail call i32 @_xfn(<2 x double> %vecinit.i.i237) #2
+  %arrayidx = getelementptr inbounds [256 x i32], [256 x i32]* %tab1, i64 0, i64 %indvars.iv266
+  store i32 %x13, i32* %arrayidx, align 4, !tbaa !4
+  %vecinit.i.i = insertelement <2 x double> poison, double %p3.addr.0258, i32 0
+  %x14 = tail call i32 @_xfn(<2 x double> %vecinit.i.i) #2
+  %arrayidx26 = getelementptr inbounds [256 x i32], [256 x i32]* %tab2, i64 0, i64 %indvars.iv266
+  store i32 %x14, i32* %arrayidx26, align 4, !tbaa !4
+  %add27 = fadd double %mul19, %t.0259
+  %add28 = fadd double %mul21, %p3.addr.0258
+  %indvars.iv.next267 = add nuw nsw i64 %indvars.iv266, 1
+  %exitcond = icmp eq i64 %indvars.iv.next267, 256
+  br i1 %exitcond, label %return, label %for.body
+
+return:
+  ret void
+}
+
+declare i32 @_xfn(<2 x double>) #4
+
+!3 = !{!"int", !5, i64 0}
+!4 = !{!3, !3, i64 0}
+!5 = !{!"omnipotent char", !6, i64 0}
+!6 = !{!"Simple C/C++ TBAA"}
--- a/llvm/test/Transforms/SLPVectorizer/X86/external_user_jumbled_load-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/external_user_jumbled_load-inseltpoison.ll
@ -0,0 +1,43 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -S -mtriple=x86_64-unknown -mattr=+avx -slp-vectorizer | FileCheck %s
+
+@array = external global [20 x [13 x i32]]
+
+define void @hoge(i64 %idx, <4 x i32>* %sink) {
+; CHECK-LABEL: @hoge(
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [20 x [13 x i32]], [20 x [13 x i32]]* @array, i64 0, i64 [[IDX:%.*]], i64 5
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [20 x [13 x i32]], [20 x [13 x i32]]* @array, i64 0, i64 [[IDX]], i64 6
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [20 x [13 x i32]], [20 x [13 x i32]]* @array, i64 0, i64 [[IDX]], i64 7
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [20 x [13 x i32]], [20 x [13 x i32]]* @array, i64 0, i64 [[IDX]], i64 8
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>*
+; CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4
+; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> undef, <4 x i32> <i32 1, i32 2, i32 3, i32 0>
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x i32> [[SHUFFLE]], i32 0
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <4 x i32> poison, i32 [[TMP6]], i32 0
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <4 x i32> [[SHUFFLE]], i32 1
+; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[TMP8]], i32 1
+; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <4 x i32> [[SHUFFLE]], i32 2
+; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <4 x i32> [[TMP9]], i32 [[TMP10]], i32 2
+; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <4 x i32> [[SHUFFLE]], i32 3
+; CHECK-NEXT:    [[TMP13:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP12]], i32 3
+; CHECK-NEXT:    store <4 x i32> [[TMP13]], <4 x i32>* [[SINK:%.*]], align 16
+; CHECK-NEXT:    ret void
+;
+bb:
+  %0 = getelementptr inbounds [20 x [13 x i32]], [20 x [13 x i32]]* @array, i64 0, i64 %idx, i64 5
+  %1 = getelementptr inbounds [20 x [13 x i32]], [20 x [13 x i32]]* @array, i64 0, i64 %idx, i64 6
+  %2 = getelementptr inbounds [20 x [13 x i32]], [20 x [13 x i32]]* @array, i64 0, i64 %idx, i64 7
+  %3 = getelementptr inbounds [20 x [13 x i32]], [20 x [13 x i32]]* @array, i64 0, i64 %idx, i64 8
+  %4 = load i32, i32* %1, align 4
+  %5 = insertelement <4 x i32> poison, i32 %4, i32 0
+  %6 = load i32, i32* %2, align 4
+  %7 = insertelement <4 x i32> %5, i32 %6, i32 1
+  %8 = load i32, i32* %3, align 4
+  %9 = insertelement <4 x i32> %7, i32 %8, i32 2
+  %10 = load i32, i32* %0, align 4
+  %11 = insertelement <4 x i32> %9, i32 %10, i32 3
+  store <4 x i32> %11, <4 x i32>* %sink
+  ret void
+}
+
--- a/llvm/test/Transforms/SLPVectorizer/X86/extract-shuffle-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/extract-shuffle-inseltpoison.ll
@ -0,0 +1,22 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -slp-vectorizer -S -o - -mtriple=x86_64-unknown-linux -mcpu=bdver2 -slp-schedule-budget=1 | FileCheck %s
+
+define <2 x i8> @g(<2 x i8> %x, <2 x i8> %y) {
+; CHECK-LABEL: @g(
+; CHECK-NEXT:    [[X0:%.*]] = extractelement <2 x i8> [[X:%.*]], i32 0
+; CHECK-NEXT:    [[Y1:%.*]] = extractelement <2 x i8> [[Y:%.*]], i32 1
+; CHECK-NEXT:    [[X0X0:%.*]] = mul i8 [[X0]], [[X0]]
+; CHECK-NEXT:    [[Y1Y1:%.*]] = mul i8 [[Y1]], [[Y1]]
+; CHECK-NEXT:    [[INS1:%.*]] = insertelement <2 x i8> poison, i8 [[X0X0]], i32 0
+; CHECK-NEXT:    [[INS2:%.*]] = insertelement <2 x i8> [[INS1]], i8 [[Y1Y1]], i32 1
+; CHECK-NEXT:    ret <2 x i8> [[INS2]]
+;
+  %x0 = extractelement <2 x i8> %x, i32 0
+  %y1 = extractelement <2 x i8> %y, i32 1
+  %x0x0 = mul i8 %x0, %x0
+  %y1y1 = mul i8 %y1, %y1
+  %ins1 = insertelement <2 x i8> poison, i8 %x0x0, i32 0
+  %ins2 = insertelement <2 x i8> %ins1, i8 %y1y1, i32 1
+  ret <2 x i8> %ins2
+}
+
--- a/llvm/test/Transforms/SLPVectorizer/X86/fptosi-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/fptosi-inseltpoison.ll
@ -0,0 +1,534 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -mtriple=x86_64-unknown -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=SSE
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX256NODQ
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=bdver1 -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX256NODQ
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX256NODQ
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skylake-avx512 -mattr=-prefer-256-bit -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX512
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skylake-avx512 -mattr=+prefer-256-bit -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX256DQ
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+@src64 = common global [8 x double] zeroinitializer, align 64
+@src32 = common global [16 x float] zeroinitializer, align 64
+@dst64 = common global [8 x i64] zeroinitializer, align 64
+@dst32 = common global [16 x i32] zeroinitializer, align 64
+@dst16 = common global [32 x i16] zeroinitializer, align 64
+@dst8 = common global [64 x i8] zeroinitializer, align 64
+
+;
+; FPTOSI vXf64
+;
+
+define void @fptosi_8f64_8i64() #0 {
+; SSE-LABEL: @fptosi_8f64_8i64(
+; SSE-NEXT:    [[A0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8
+; SSE-NEXT:    [[A1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8
+; SSE-NEXT:    [[A2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8
+; SSE-NEXT:    [[A3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8
+; SSE-NEXT:    [[A4:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4), align 8
+; SSE-NEXT:    [[A5:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 5), align 8
+; SSE-NEXT:    [[A6:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6), align 8
+; SSE-NEXT:    [[A7:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 7), align 8
+; SSE-NEXT:    [[CVT0:%.*]] = fptosi double [[A0]] to i64
+; SSE-NEXT:    [[CVT1:%.*]] = fptosi double [[A1]] to i64
+; SSE-NEXT:    [[CVT2:%.*]] = fptosi double [[A2]] to i64
+; SSE-NEXT:    [[CVT3:%.*]] = fptosi double [[A3]] to i64
+; SSE-NEXT:    [[CVT4:%.*]] = fptosi double [[A4]] to i64
+; SSE-NEXT:    [[CVT5:%.*]] = fptosi double [[A5]] to i64
+; SSE-NEXT:    [[CVT6:%.*]] = fptosi double [[A6]] to i64
+; SSE-NEXT:    [[CVT7:%.*]] = fptosi double [[A7]] to i64
+; SSE-NEXT:    store i64 [[CVT0]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 0), align 8
+; SSE-NEXT:    store i64 [[CVT1]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 1), align 8
+; SSE-NEXT:    store i64 [[CVT2]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 2), align 8
+; SSE-NEXT:    store i64 [[CVT3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 3), align 8
+; SSE-NEXT:    store i64 [[CVT4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 4), align 8
+; SSE-NEXT:    store i64 [[CVT5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 5), align 8
+; SSE-NEXT:    store i64 [[CVT6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 6), align 8
+; SSE-NEXT:    store i64 [[CVT7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 7), align 8
+; SSE-NEXT:    ret void
+;
+; AVX256NODQ-LABEL: @fptosi_8f64_8i64(
+; AVX256NODQ-NEXT:    [[A0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8
+; AVX256NODQ-NEXT:    [[A1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8
+; AVX256NODQ-NEXT:    [[A2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8
+; AVX256NODQ-NEXT:    [[A3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8
+; AVX256NODQ-NEXT:    [[A4:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4), align 8
+; AVX256NODQ-NEXT:    [[A5:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 5), align 8
+; AVX256NODQ-NEXT:    [[A6:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6), align 8
+; AVX256NODQ-NEXT:    [[A7:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 7), align 8
+; AVX256NODQ-NEXT:    [[CVT0:%.*]] = fptosi double [[A0]] to i64
+; AVX256NODQ-NEXT:    [[CVT1:%.*]] = fptosi double [[A1]] to i64
+; AVX256NODQ-NEXT:    [[CVT2:%.*]] = fptosi double [[A2]] to i64
+; AVX256NODQ-NEXT:    [[CVT3:%.*]] = fptosi double [[A3]] to i64
+; AVX256NODQ-NEXT:    [[CVT4:%.*]] = fptosi double [[A4]] to i64
+; AVX256NODQ-NEXT:    [[CVT5:%.*]] = fptosi double [[A5]] to i64
+; AVX256NODQ-NEXT:    [[CVT6:%.*]] = fptosi double [[A6]] to i64
+; AVX256NODQ-NEXT:    [[CVT7:%.*]] = fptosi double [[A7]] to i64
+; AVX256NODQ-NEXT:    store i64 [[CVT0]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 0), align 8
+; AVX256NODQ-NEXT:    store i64 [[CVT1]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 1), align 8
+; AVX256NODQ-NEXT:    store i64 [[CVT2]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 2), align 8
+; AVX256NODQ-NEXT:    store i64 [[CVT3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 3), align 8
+; AVX256NODQ-NEXT:    store i64 [[CVT4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 4), align 8
+; AVX256NODQ-NEXT:    store i64 [[CVT5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 5), align 8
+; AVX256NODQ-NEXT:    store i64 [[CVT6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 6), align 8
+; AVX256NODQ-NEXT:    store i64 [[CVT7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 7), align 8
+; AVX256NODQ-NEXT:    ret void
+;
+; AVX512-LABEL: @fptosi_8f64_8i64(
+; AVX512-NEXT:    [[TMP1:%.*]] = load <8 x double>, <8 x double>* bitcast ([8 x double]* @src64 to <8 x double>*), align 8
+; AVX512-NEXT:    [[TMP2:%.*]] = fptosi <8 x double> [[TMP1]] to <8 x i64>
+; AVX512-NEXT:    store <8 x i64> [[TMP2]], <8 x i64>* bitcast ([8 x i64]* @dst64 to <8 x i64>*), align 8
+; AVX512-NEXT:    ret void
+;
+; AVX256DQ-LABEL: @fptosi_8f64_8i64(
+; AVX256DQ-NEXT:    [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @src64 to <4 x double>*), align 8
+; AVX256DQ-NEXT:    [[TMP2:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <4 x double>*), align 8
+; AVX256DQ-NEXT:    [[TMP3:%.*]] = fptosi <4 x double> [[TMP1]] to <4 x i64>
+; AVX256DQ-NEXT:    [[TMP4:%.*]] = fptosi <4 x double> [[TMP2]] to <4 x i64>
+; AVX256DQ-NEXT:    store <4 x i64> [[TMP3]], <4 x i64>* bitcast ([8 x i64]* @dst64 to <4 x i64>*), align 8
+; AVX256DQ-NEXT:    store <4 x i64> [[TMP4]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 4) to <4 x i64>*), align 8
+; AVX256DQ-NEXT:    ret void
+;
+  %a0 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8
+  %a1 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8
+  %a2 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8
+  %a3 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8
+  %a4 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4), align 8
+  %a5 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 5), align 8
+  %a6 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6), align 8
+  %a7 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 7), align 8
+  %cvt0 = fptosi double %a0 to i64
+  %cvt1 = fptosi double %a1 to i64
+  %cvt2 = fptosi double %a2 to i64
+  %cvt3 = fptosi double %a3 to i64
+  %cvt4 = fptosi double %a4 to i64
+  %cvt5 = fptosi double %a5 to i64
+  %cvt6 = fptosi double %a6 to i64
+  %cvt7 = fptosi double %a7 to i64
+  store i64 %cvt0, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 0), align 8
+  store i64 %cvt1, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 1), align 8
+  store i64 %cvt2, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 2), align 8
+  store i64 %cvt3, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 3), align 8
+  store i64 %cvt4, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 4), align 8
+  store i64 %cvt5, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 5), align 8
+  store i64 %cvt6, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 6), align 8
+  store i64 %cvt7, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 7), align 8
+  ret void
+}
+
+define void @fptosi_8f64_8i32() #0 {
+; SSE-LABEL: @fptosi_8f64_8i32(
+; SSE-NEXT:    [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @src64 to <4 x double>*), align 8
+; SSE-NEXT:    [[TMP2:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <4 x double>*), align 8
+; SSE-NEXT:    [[TMP3:%.*]] = fptosi <4 x double> [[TMP1]] to <4 x i32>
+; SSE-NEXT:    [[TMP4:%.*]] = fptosi <4 x double> [[TMP2]] to <4 x i32>
+; SSE-NEXT:    store <4 x i32> [[TMP3]], <4 x i32>* bitcast ([16 x i32]* @dst32 to <4 x i32>*), align 4
+; SSE-NEXT:    store <4 x i32> [[TMP4]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 4) to <4 x i32>*), align 4
+; SSE-NEXT:    ret void
+;
+; AVX-LABEL: @fptosi_8f64_8i32(
+; AVX-NEXT:    [[TMP1:%.*]] = load <8 x double>, <8 x double>* bitcast ([8 x double]* @src64 to <8 x double>*), align 8
+; AVX-NEXT:    [[TMP2:%.*]] = fptosi <8 x double> [[TMP1]] to <8 x i32>
+; AVX-NEXT:    store <8 x i32> [[TMP2]], <8 x i32>* bitcast ([16 x i32]* @dst32 to <8 x i32>*), align 4
+; AVX-NEXT:    ret void
+;
+  %a0 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8
+  %a1 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8
+  %a2 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8
+  %a3 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8
+  %a4 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4), align 8
+  %a5 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 5), align 8
+  %a6 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6), align 8
+  %a7 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 7), align 8
+  %cvt0 = fptosi double %a0 to i32
+  %cvt1 = fptosi double %a1 to i32
+  %cvt2 = fptosi double %a2 to i32
+  %cvt3 = fptosi double %a3 to i32
+  %cvt4 = fptosi double %a4 to i32
+  %cvt5 = fptosi double %a5 to i32
+  %cvt6 = fptosi double %a6 to i32
+  %cvt7 = fptosi double %a7 to i32
+  store i32 %cvt0, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 0), align 4
+  store i32 %cvt1, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 1), align 4
+  store i32 %cvt2, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 2), align 4
+  store i32 %cvt3, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 3), align 4
+  store i32 %cvt4, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 4), align 4
+  store i32 %cvt5, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 5), align 4
+  store i32 %cvt6, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 6), align 4
+  store i32 %cvt7, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 7), align 4
+  ret void
+}
+
+define void @fptosi_8f64_8i16() #0 {
+; CHECK-LABEL: @fptosi_8f64_8i16(
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x double>, <8 x double>* bitcast ([8 x double]* @src64 to <8 x double>*), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = fptosi <8 x double> [[TMP1]] to <8 x i16>
+; CHECK-NEXT:    store <8 x i16> [[TMP2]], <8 x i16>* bitcast ([32 x i16]* @dst16 to <8 x i16>*), align 2
+; CHECK-NEXT:    ret void
+;
+  %a0 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8
+  %a1 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8
+  %a2 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8
+  %a3 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8
+  %a4 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4), align 8
+  %a5 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 5), align 8
+  %a6 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6), align 8
+  %a7 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 7), align 8
+  %cvt0 = fptosi double %a0 to i16
+  %cvt1 = fptosi double %a1 to i16
+  %cvt2 = fptosi double %a2 to i16
+  %cvt3 = fptosi double %a3 to i16
+  %cvt4 = fptosi double %a4 to i16
+  %cvt5 = fptosi double %a5 to i16
+  %cvt6 = fptosi double %a6 to i16
+  %cvt7 = fptosi double %a7 to i16
+  store i16 %cvt0, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 0), align 2
+  store i16 %cvt1, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 1), align 2
+  store i16 %cvt2, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 2), align 2
+  store i16 %cvt3, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 3), align 2
+  store i16 %cvt4, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 4), align 2
+  store i16 %cvt5, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 5), align 2
+  store i16 %cvt6, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 6), align 2
+  store i16 %cvt7, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 7), align 2
+  ret void
+}
+
+define void @fptosi_8f64_8i8() #0 {
+; CHECK-LABEL: @fptosi_8f64_8i8(
+; CHECK-NEXT:    [[A0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8
+; CHECK-NEXT:    [[A1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8
+; CHECK-NEXT:    [[A2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8
+; CHECK-NEXT:    [[A3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8
+; CHECK-NEXT:    [[A4:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4), align 8
+; CHECK-NEXT:    [[A5:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 5), align 8
+; CHECK-NEXT:    [[A6:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6), align 8
+; CHECK-NEXT:    [[A7:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 7), align 8
+; CHECK-NEXT:    [[CVT0:%.*]] = fptosi double [[A0]] to i8
+; CHECK-NEXT:    [[CVT1:%.*]] = fptosi double [[A1]] to i8
+; CHECK-NEXT:    [[CVT2:%.*]] = fptosi double [[A2]] to i8
+; CHECK-NEXT:    [[CVT3:%.*]] = fptosi double [[A3]] to i8
+; CHECK-NEXT:    [[CVT4:%.*]] = fptosi double [[A4]] to i8
+; CHECK-NEXT:    [[CVT5:%.*]] = fptosi double [[A5]] to i8
+; CHECK-NEXT:    [[CVT6:%.*]] = fptosi double [[A6]] to i8
+; CHECK-NEXT:    [[CVT7:%.*]] = fptosi double [[A7]] to i8
+; CHECK-NEXT:    store i8 [[CVT0]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 0), align 1
+; CHECK-NEXT:    store i8 [[CVT1]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 1), align 1
+; CHECK-NEXT:    store i8 [[CVT2]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 2), align 1
+; CHECK-NEXT:    store i8 [[CVT3]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 3), align 1
+; CHECK-NEXT:    store i8 [[CVT4]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 4), align 1
+; CHECK-NEXT:    store i8 [[CVT5]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 5), align 1
+; CHECK-NEXT:    store i8 [[CVT6]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 6), align 1
+; CHECK-NEXT:    store i8 [[CVT7]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 7), align 1
+; CHECK-NEXT:    ret void
+;
+  %a0 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8
+  %a1 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8
+  %a2 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8
+  %a3 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8
+  %a4 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4), align 8
+  %a5 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 5), align 8
+  %a6 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6), align 8
+  %a7 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 7), align 8
+  %cvt0 = fptosi double %a0 to i8
+  %cvt1 = fptosi double %a1 to i8
+  %cvt2 = fptosi double %a2 to i8
+  %cvt3 = fptosi double %a3 to i8
+  %cvt4 = fptosi double %a4 to i8
+  %cvt5 = fptosi double %a5 to i8
+  %cvt6 = fptosi double %a6 to i8
+  %cvt7 = fptosi double %a7 to i8
+  store i8 %cvt0, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 0), align 1
+  store i8 %cvt1, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 1), align 1
+  store i8 %cvt2, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 2), align 1
+  store i8 %cvt3, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 3), align 1
+  store i8 %cvt4, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 4), align 1
+  store i8 %cvt5, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 5), align 1
+  store i8 %cvt6, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 6), align 1
+  store i8 %cvt7, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 7), align 1
+  ret void
+}
+
+;
+; FPTOSI vXf32
+;
+
+define void @fptosi_8f32_8i64() #0 {
+; SSE-LABEL: @fptosi_8f32_8i64(
+; SSE-NEXT:    [[A0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4
+; SSE-NEXT:    [[A1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4
+; SSE-NEXT:    [[A2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4
+; SSE-NEXT:    [[A3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4
+; SSE-NEXT:    [[A4:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4), align 4
+; SSE-NEXT:    [[A5:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5), align 4
+; SSE-NEXT:    [[A6:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6), align 4
+; SSE-NEXT:    [[A7:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 7), align 4
+; SSE-NEXT:    [[CVT0:%.*]] = fptosi float [[A0]] to i64
+; SSE-NEXT:    [[CVT1:%.*]] = fptosi float [[A1]] to i64
+; SSE-NEXT:    [[CVT2:%.*]] = fptosi float [[A2]] to i64
+; SSE-NEXT:    [[CVT3:%.*]] = fptosi float [[A3]] to i64
+; SSE-NEXT:    [[CVT4:%.*]] = fptosi float [[A4]] to i64
+; SSE-NEXT:    [[CVT5:%.*]] = fptosi float [[A5]] to i64
+; SSE-NEXT:    [[CVT6:%.*]] = fptosi float [[A6]] to i64
+; SSE-NEXT:    [[CVT7:%.*]] = fptosi float [[A7]] to i64
+; SSE-NEXT:    store i64 [[CVT0]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 0), align 8
+; SSE-NEXT:    store i64 [[CVT1]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 1), align 8
+; SSE-NEXT:    store i64 [[CVT2]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 2), align 8
+; SSE-NEXT:    store i64 [[CVT3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 3), align 8
+; SSE-NEXT:    store i64 [[CVT4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 4), align 8
+; SSE-NEXT:    store i64 [[CVT5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 5), align 8
+; SSE-NEXT:    store i64 [[CVT6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 6), align 8
+; SSE-NEXT:    store i64 [[CVT7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 7), align 8
+; SSE-NEXT:    ret void
+;
+; AVX256NODQ-LABEL: @fptosi_8f32_8i64(
+; AVX256NODQ-NEXT:    [[A0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4
+; AVX256NODQ-NEXT:    [[A1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4
+; AVX256NODQ-NEXT:    [[A2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4
+; AVX256NODQ-NEXT:    [[A3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4
+; AVX256NODQ-NEXT:    [[A4:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4), align 4
+; AVX256NODQ-NEXT:    [[A5:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5), align 4
+; AVX256NODQ-NEXT:    [[A6:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6), align 4
+; AVX256NODQ-NEXT:    [[A7:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 7), align 4
+; AVX256NODQ-NEXT:    [[CVT0:%.*]] = fptosi float [[A0]] to i64
+; AVX256NODQ-NEXT:    [[CVT1:%.*]] = fptosi float [[A1]] to i64
+; AVX256NODQ-NEXT:    [[CVT2:%.*]] = fptosi float [[A2]] to i64
+; AVX256NODQ-NEXT:    [[CVT3:%.*]] = fptosi float [[A3]] to i64
+; AVX256NODQ-NEXT:    [[CVT4:%.*]] = fptosi float [[A4]] to i64
+; AVX256NODQ-NEXT:    [[CVT5:%.*]] = fptosi float [[A5]] to i64
+; AVX256NODQ-NEXT:    [[CVT6:%.*]] = fptosi float [[A6]] to i64
+; AVX256NODQ-NEXT:    [[CVT7:%.*]] = fptosi float [[A7]] to i64
+; AVX256NODQ-NEXT:    store i64 [[CVT0]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 0), align 8
+; AVX256NODQ-NEXT:    store i64 [[CVT1]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 1), align 8
+; AVX256NODQ-NEXT:    store i64 [[CVT2]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 2), align 8
+; AVX256NODQ-NEXT:    store i64 [[CVT3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 3), align 8
+; AVX256NODQ-NEXT:    store i64 [[CVT4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 4), align 8
+; AVX256NODQ-NEXT:    store i64 [[CVT5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 5), align 8
+; AVX256NODQ-NEXT:    store i64 [[CVT6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 6), align 8
+; AVX256NODQ-NEXT:    store i64 [[CVT7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 7), align 8
+; AVX256NODQ-NEXT:    ret void
+;
+; AVX512-LABEL: @fptosi_8f32_8i64(
+; AVX512-NEXT:    [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @src32 to <8 x float>*), align 4
+; AVX512-NEXT:    [[TMP2:%.*]] = fptosi <8 x float> [[TMP1]] to <8 x i64>
+; AVX512-NEXT:    store <8 x i64> [[TMP2]], <8 x i64>* bitcast ([8 x i64]* @dst64 to <8 x i64>*), align 8
+; AVX512-NEXT:    ret void
+;
+; AVX256DQ-LABEL: @fptosi_8f32_8i64(
+; AVX256DQ-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4
+; AVX256DQ-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4
+; AVX256DQ-NEXT:    [[TMP3:%.*]] = fptosi <4 x float> [[TMP1]] to <4 x i64>
+; AVX256DQ-NEXT:    [[TMP4:%.*]] = fptosi <4 x float> [[TMP2]] to <4 x i64>
+; AVX256DQ-NEXT:    store <4 x i64> [[TMP3]], <4 x i64>* bitcast ([8 x i64]* @dst64 to <4 x i64>*), align 8
+; AVX256DQ-NEXT:    store <4 x i64> [[TMP4]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 4) to <4 x i64>*), align 8
+; AVX256DQ-NEXT:    ret void
+;
+  %a0 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4
+  %a1 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4
+  %a2 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4
+  %a3 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4
+  %a4 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4), align 4
+  %a5 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5), align 4
+  %a6 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6), align 4
+  %a7 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 7), align 4
+  %cvt0 = fptosi float %a0 to i64
+  %cvt1 = fptosi float %a1 to i64
+  %cvt2 = fptosi float %a2 to i64
+  %cvt3 = fptosi float %a3 to i64
+  %cvt4 = fptosi float %a4 to i64
+  %cvt5 = fptosi float %a5 to i64
+  %cvt6 = fptosi float %a6 to i64
+  %cvt7 = fptosi float %a7 to i64
+  store i64 %cvt0, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 0), align 8
+  store i64 %cvt1, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 1), align 8
+  store i64 %cvt2, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 2), align 8
+  store i64 %cvt3, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 3), align 8
+  store i64 %cvt4, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 4), align 8
+  store i64 %cvt5, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 5), align 8
+  store i64 %cvt6, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 6), align 8
+  store i64 %cvt7, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 7), align 8
+  ret void
+}
+
+define void @fptosi_8f32_8i32() #0 {
+; SSE-LABEL: @fptosi_8f32_8i32(
+; SSE-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4
+; SSE-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4
+; SSE-NEXT:    [[TMP3:%.*]] = fptosi <4 x float> [[TMP1]] to <4 x i32>
+; SSE-NEXT:    [[TMP4:%.*]] = fptosi <4 x float> [[TMP2]] to <4 x i32>
+; SSE-NEXT:    store <4 x i32> [[TMP3]], <4 x i32>* bitcast ([16 x i32]* @dst32 to <4 x i32>*), align 4
+; SSE-NEXT:    store <4 x i32> [[TMP4]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 4) to <4 x i32>*), align 4
+; SSE-NEXT:    ret void
+;
+; AVX-LABEL: @fptosi_8f32_8i32(
+; AVX-NEXT:    [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @src32 to <8 x float>*), align 4
+; AVX-NEXT:    [[TMP2:%.*]] = fptosi <8 x float> [[TMP1]] to <8 x i32>
+; AVX-NEXT:    store <8 x i32> [[TMP2]], <8 x i32>* bitcast ([16 x i32]* @dst32 to <8 x i32>*), align 4
+; AVX-NEXT:    ret void
+;
+  %a0 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4
+  %a1 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4
+  %a2 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4
+  %a3 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4
+  %a4 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4), align 4
+  %a5 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5), align 4
+  %a6 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6), align 4
+  %a7 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 7), align 4
+  %cvt0 = fptosi float %a0 to i32
+  %cvt1 = fptosi float %a1 to i32
+  %cvt2 = fptosi float %a2 to i32
+  %cvt3 = fptosi float %a3 to i32
+  %cvt4 = fptosi float %a4 to i32
+  %cvt5 = fptosi float %a5 to i32
+  %cvt6 = fptosi float %a6 to i32
+  %cvt7 = fptosi float %a7 to i32
+  store i32 %cvt0, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 0), align 4
+  store i32 %cvt1, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 1), align 4
+  store i32 %cvt2, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 2), align 4
+  store i32 %cvt3, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 3), align 4
+  store i32 %cvt4, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 4), align 4
+  store i32 %cvt5, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 5), align 4
+  store i32 %cvt6, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 6), align 4
+  store i32 %cvt7, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 7), align 4
+  ret void
+}
+
+define void @fptosi_8f32_8i16() #0 {
+; CHECK-LABEL: @fptosi_8f32_8i16(
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @src32 to <8 x float>*), align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = fptosi <8 x float> [[TMP1]] to <8 x i16>
+; CHECK-NEXT:    store <8 x i16> [[TMP2]], <8 x i16>* bitcast ([32 x i16]* @dst16 to <8 x i16>*), align 2
+; CHECK-NEXT:    ret void
+;
+  %a0 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4
+  %a1 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4
+  %a2 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4
+  %a3 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4
+  %a4 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4), align 4
+  %a5 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5), align 4
+  %a6 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6), align 4
+  %a7 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 7), align 4
+  %cvt0 = fptosi float %a0 to i16
+  %cvt1 = fptosi float %a1 to i16
+  %cvt2 = fptosi float %a2 to i16
+  %cvt3 = fptosi float %a3 to i16
+  %cvt4 = fptosi float %a4 to i16
+  %cvt5 = fptosi float %a5 to i16
+  %cvt6 = fptosi float %a6 to i16
+  %cvt7 = fptosi float %a7 to i16
+  store i16 %cvt0, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 0), align 2
+  store i16 %cvt1, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 1), align 2
+  store i16 %cvt2, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 2), align 2
+  store i16 %cvt3, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 3), align 2
+  store i16 %cvt4, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 4), align 2
+  store i16 %cvt5, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 5), align 2
+  store i16 %cvt6, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 6), align 2
+  store i16 %cvt7, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 7), align 2
+  ret void
+}
+
+define void @fptosi_8f32_8i8() #0 {
+; CHECK-LABEL: @fptosi_8f32_8i8(
+; CHECK-NEXT:    [[A0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4
+; CHECK-NEXT:    [[A1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4
+; CHECK-NEXT:    [[A2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4
+; CHECK-NEXT:    [[A3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4
+; CHECK-NEXT:    [[A4:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4), align 4
+; CHECK-NEXT:    [[A5:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5), align 4
+; CHECK-NEXT:    [[A6:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6), align 4
+; CHECK-NEXT:    [[A7:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 7), align 4
+; CHECK-NEXT:    [[CVT0:%.*]] = fptosi float [[A0]] to i8
+; CHECK-NEXT:    [[CVT1:%.*]] = fptosi float [[A1]] to i8
+; CHECK-NEXT:    [[CVT2:%.*]] = fptosi float [[A2]] to i8
+; CHECK-NEXT:    [[CVT3:%.*]] = fptosi float [[A3]] to i8
+; CHECK-NEXT:    [[CVT4:%.*]] = fptosi float [[A4]] to i8
+; CHECK-NEXT:    [[CVT5:%.*]] = fptosi float [[A5]] to i8
+; CHECK-NEXT:    [[CVT6:%.*]] = fptosi float [[A6]] to i8
+; CHECK-NEXT:    [[CVT7:%.*]] = fptosi float [[A7]] to i8
+; CHECK-NEXT:    store i8 [[CVT0]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 0), align 1
+; CHECK-NEXT:    store i8 [[CVT1]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 1), align 1
+; CHECK-NEXT:    store i8 [[CVT2]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 2), align 1
+; CHECK-NEXT:    store i8 [[CVT3]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 3), align 1
+; CHECK-NEXT:    store i8 [[CVT4]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 4), align 1
+; CHECK-NEXT:    store i8 [[CVT5]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 5), align 1
+; CHECK-NEXT:    store i8 [[CVT6]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 6), align 1
+; CHECK-NEXT:    store i8 [[CVT7]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 7), align 1
+; CHECK-NEXT:    ret void
+;
+  %a0 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4
+  %a1 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4
+  %a2 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4
+  %a3 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4
+  %a4 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4), align 4
+  %a5 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5), align 4
+  %a6 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6), align 4
+  %a7 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 7), align 4
+  %cvt0 = fptosi float %a0 to i8
+  %cvt1 = fptosi float %a1 to i8
+  %cvt2 = fptosi float %a2 to i8
+  %cvt3 = fptosi float %a3 to i8
+  %cvt4 = fptosi float %a4 to i8
+  %cvt5 = fptosi float %a5 to i8
+  %cvt6 = fptosi float %a6 to i8
+  %cvt7 = fptosi float %a7 to i8
+  store i8 %cvt0, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 0), align 1
+  store i8 %cvt1, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 1), align 1
+  store i8 %cvt2, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 2), align 1
+  store i8 %cvt3, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 3), align 1
+  store i8 %cvt4, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 4), align 1
+  store i8 %cvt5, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 5), align 1
+  store i8 %cvt6, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 6), align 1
+  store i8 %cvt7, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 7), align 1
+  ret void
+}
+
+;
+; FPTOSI BUILDVECTOR
+;
+
+define <4 x i32> @fptosi_4xf64_4i32(double %a0, double %a1, double %a2, double %a3) #0 {
+; CHECK-LABEL: @fptosi_4xf64_4i32(
+; CHECK-NEXT:    [[CVT0:%.*]] = fptosi double [[A0:%.*]] to i32
+; CHECK-NEXT:    [[CVT1:%.*]] = fptosi double [[A1:%.*]] to i32
+; CHECK-NEXT:    [[CVT2:%.*]] = fptosi double [[A2:%.*]] to i32
+; CHECK-NEXT:    [[CVT3:%.*]] = fptosi double [[A3:%.*]] to i32
+; CHECK-NEXT:    [[RES0:%.*]] = insertelement <4 x i32> poison, i32 [[CVT0]], i32 0
+; CHECK-NEXT:    [[RES1:%.*]] = insertelement <4 x i32> [[RES0]], i32 [[CVT1]], i32 1
+; CHECK-NEXT:    [[RES2:%.*]] = insertelement <4 x i32> [[RES1]], i32 [[CVT2]], i32 2
+; CHECK-NEXT:    [[RES3:%.*]] = insertelement <4 x i32> [[RES2]], i32 [[CVT3]], i32 3
+; CHECK-NEXT:    ret <4 x i32> [[RES3]]
+;
+  %cvt0 = fptosi double %a0 to i32
+  %cvt1 = fptosi double %a1 to i32
+  %cvt2 = fptosi double %a2 to i32
+  %cvt3 = fptosi double %a3 to i32
+  %res0 = insertelement <4 x i32> poison, i32 %cvt0, i32 0
+  %res1 = insertelement <4 x i32> %res0, i32 %cvt1, i32 1
+  %res2 = insertelement <4 x i32> %res1, i32 %cvt2, i32 2
+  %res3 = insertelement <4 x i32> %res2, i32 %cvt3, i32 3
+  ret <4 x i32> %res3
+}
+
+define <4 x i32> @fptosi_4xf32_4i32(float %a0, float %a1, float %a2, float %a3) #0 {
+; CHECK-LABEL: @fptosi_4xf32_4i32(
+; CHECK-NEXT:    [[CVT0:%.*]] = fptosi float [[A0:%.*]] to i32
+; CHECK-NEXT:    [[CVT1:%.*]] = fptosi float [[A1:%.*]] to i32
+; CHECK-NEXT:    [[CVT2:%.*]] = fptosi float [[A2:%.*]] to i32
+; CHECK-NEXT:    [[CVT3:%.*]] = fptosi float [[A3:%.*]] to i32
+; CHECK-NEXT:    [[RES0:%.*]] = insertelement <4 x i32> poison, i32 [[CVT0]], i32 0
+; CHECK-NEXT:    [[RES1:%.*]] = insertelement <4 x i32> [[RES0]], i32 [[CVT1]], i32 1
+; CHECK-NEXT:    [[RES2:%.*]] = insertelement <4 x i32> [[RES1]], i32 [[CVT2]], i32 2
+; CHECK-NEXT:    [[RES3:%.*]] = insertelement <4 x i32> [[RES2]], i32 [[CVT3]], i32 3
+; CHECK-NEXT:    ret <4 x i32> [[RES3]]
+;
+  %cvt0 = fptosi float %a0 to i32
+  %cvt1 = fptosi float %a1 to i32
+  %cvt2 = fptosi float %a2 to i32
+  %cvt3 = fptosi float %a3 to i32
+  %res0 = insertelement <4 x i32> poison, i32 %cvt0, i32 0
+  %res1 = insertelement <4 x i32> %res0, i32 %cvt1, i32 1
+  %res2 = insertelement <4 x i32> %res1, i32 %cvt2, i32 2
+  %res3 = insertelement <4 x i32> %res2, i32 %cvt3, i32 3
+  ret <4 x i32> %res3
+}
+
+attributes #0 = { nounwind }
--- a/llvm/test/Transforms/SLPVectorizer/X86/hadd-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/hadd-inseltpoison.ll
@ -0,0 +1,433 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -mtriple=x86_64-unknown -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=SSE
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=SLM
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX
+
+;
+; 128-bit vectors
+;
+
+define <2 x double> @test_v2f64(<2 x double> %a, <2 x double> %b) {
+; SSE-LABEL: @test_v2f64(
+; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x i32> <i32 0, i32 2>
+; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <2 x double> [[A]], <2 x double> [[B]], <2 x i32> <i32 1, i32 3>
+; SSE-NEXT:    [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]]
+; SSE-NEXT:    ret <2 x double> [[TMP3]]
+;
+; SLM-LABEL: @test_v2f64(
+; SLM-NEXT:    [[A0:%.*]] = extractelement <2 x double> [[A:%.*]], i32 0
+; SLM-NEXT:    [[A1:%.*]] = extractelement <2 x double> [[A]], i32 1
+; SLM-NEXT:    [[B0:%.*]] = extractelement <2 x double> [[B:%.*]], i32 0
+; SLM-NEXT:    [[B1:%.*]] = extractelement <2 x double> [[B]], i32 1
+; SLM-NEXT:    [[R0:%.*]] = fadd double [[A0]], [[A1]]
+; SLM-NEXT:    [[R1:%.*]] = fadd double [[B0]], [[B1]]
+; SLM-NEXT:    [[R00:%.*]] = insertelement <2 x double> poison, double [[R0]], i32 0
+; SLM-NEXT:    [[R01:%.*]] = insertelement <2 x double> [[R00]], double [[R1]], i32 1
+; SLM-NEXT:    ret <2 x double> [[R01]]
+;
+; AVX-LABEL: @test_v2f64(
+; AVX-NEXT:    [[TMP1:%.*]] = shufflevector <2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x i32> <i32 0, i32 2>
+; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <2 x double> [[A]], <2 x double> [[B]], <2 x i32> <i32 1, i32 3>
+; AVX-NEXT:    [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]]
+; AVX-NEXT:    ret <2 x double> [[TMP3]]
+;
+  %a0 = extractelement <2 x double> %a, i32 0
+  %a1 = extractelement <2 x double> %a, i32 1
+  %b0 = extractelement <2 x double> %b, i32 0
+  %b1 = extractelement <2 x double> %b, i32 1
+  %r0 = fadd double %a0, %a1
+  %r1 = fadd double %b0, %b1
+  %r00 = insertelement <2 x double> poison, double %r0, i32 0
+  %r01 = insertelement <2 x double>  %r00, double %r1, i32 1
+  ret <2 x double> %r01
+}
+
+define <4 x float> @test_v4f32(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: @test_v4f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; CHECK-NEXT:    [[TMP3:%.*]] = fadd <4 x float> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret <4 x float> [[TMP3]]
+;
+  %a0 = extractelement <4 x float> %a, i32 0
+  %a1 = extractelement <4 x float> %a, i32 1
+  %a2 = extractelement <4 x float> %a, i32 2
+  %a3 = extractelement <4 x float> %a, i32 3
+  %b0 = extractelement <4 x float> %b, i32 0
+  %b1 = extractelement <4 x float> %b, i32 1
+  %b2 = extractelement <4 x float> %b, i32 2
+  %b3 = extractelement <4 x float> %b, i32 3
+  %r0 = fadd float %a0, %a1
+  %r1 = fadd float %a2, %a3
+  %r2 = fadd float %b0, %b1
+  %r3 = fadd float %b2, %b3
+  %r00 = insertelement <4 x float> poison, float %r0, i32 0
+  %r01 = insertelement <4 x float>  %r00, float %r1, i32 1
+  %r02 = insertelement <4 x float>  %r01, float %r2, i32 2
+  %r03 = insertelement <4 x float>  %r02, float %r3, i32 3
+  ret <4 x float> %r03
+}
+
+define <2 x i64> @test_v2i64(<2 x i64> %a, <2 x i64> %b) {
+; CHECK-LABEL: @test_v2i64(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x i64> [[A]], <2 x i64> [[B]], <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT:    [[TMP3:%.*]] = add <2 x i64> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret <2 x i64> [[TMP3]]
+;
+  %a0 = extractelement <2 x i64> %a, i32 0
+  %a1 = extractelement <2 x i64> %a, i32 1
+  %b0 = extractelement <2 x i64> %b, i32 0
+  %b1 = extractelement <2 x i64> %b, i32 1
+  %r0 = add i64 %a0, %a1
+  %r1 = add i64 %b0, %b1
+  %r00 = insertelement <2 x i64> poison, i64 %r0, i32 0
+  %r01 = insertelement <2 x i64>  %r00, i64 %r1, i32 1
+  ret <2 x i64> %r01
+}
+
+define <4 x i32> @test_v4i32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: @test_v4i32(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; CHECK-NEXT:    [[TMP3:%.*]] = add <4 x i32> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret <4 x i32> [[TMP3]]
+;
+  %a0 = extractelement <4 x i32> %a, i32 0
+  %a1 = extractelement <4 x i32> %a, i32 1
+  %a2 = extractelement <4 x i32> %a, i32 2
+  %a3 = extractelement <4 x i32> %a, i32 3
+  %b0 = extractelement <4 x i32> %b, i32 0
+  %b1 = extractelement <4 x i32> %b, i32 1
+  %b2 = extractelement <4 x i32> %b, i32 2
+  %b3 = extractelement <4 x i32> %b, i32 3
+  %r0 = add i32 %a0, %a1
+  %r1 = add i32 %a2, %a3
+  %r2 = add i32 %b0, %b1
+  %r3 = add i32 %b2, %b3
+  %r00 = insertelement <4 x i32> poison, i32 %r0, i32 0
+  %r01 = insertelement <4 x i32>  %r00, i32 %r1, i32 1
+  %r02 = insertelement <4 x i32>  %r01, i32 %r2, i32 2
+  %r03 = insertelement <4 x i32>  %r02, i32 %r3, i32 3
+  ret <4 x i32> %r03
+}
+
+define <8 x i16> @test_v8i16(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-LABEL: @test_v8i16(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+; CHECK-NEXT:    [[TMP3:%.*]] = add <8 x i16> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret <8 x i16> [[TMP3]]
+;
+  %a0 = extractelement <8 x i16> %a, i32 0
+  %a1 = extractelement <8 x i16> %a, i32 1
+  %a2 = extractelement <8 x i16> %a, i32 2
+  %a3 = extractelement <8 x i16> %a, i32 3
+  %a4 = extractelement <8 x i16> %a, i32 4
+  %a5 = extractelement <8 x i16> %a, i32 5
+  %a6 = extractelement <8 x i16> %a, i32 6
+  %a7 = extractelement <8 x i16> %a, i32 7
+  %b0 = extractelement <8 x i16> %b, i32 0
+  %b1 = extractelement <8 x i16> %b, i32 1
+  %b2 = extractelement <8 x i16> %b, i32 2
+  %b3 = extractelement <8 x i16> %b, i32 3
+  %b4 = extractelement <8 x i16> %b, i32 4
+  %b5 = extractelement <8 x i16> %b, i32 5
+  %b6 = extractelement <8 x i16> %b, i32 6
+  %b7 = extractelement <8 x i16> %b, i32 7
+  %r0 = add i16 %a0, %a1
+  %r1 = add i16 %a2, %a3
+  %r2 = add i16 %a4, %a5
+  %r3 = add i16 %a6, %a7
+  %r4 = add i16 %b0, %b1
+  %r5 = add i16 %b2, %b3
+  %r6 = add i16 %b4, %b5
+  %r7 = add i16 %b6, %b7
+  %r00 = insertelement <8 x i16> poison, i16 %r0, i32 0
+  %r01 = insertelement <8 x i16>  %r00, i16 %r1, i32 1
+  %r02 = insertelement <8 x i16>  %r01, i16 %r2, i32 2
+  %r03 = insertelement <8 x i16>  %r02, i16 %r3, i32 3
+  %r04 = insertelement <8 x i16>  %r03, i16 %r4, i32 4
+  %r05 = insertelement <8 x i16>  %r04, i16 %r5, i32 5
+  %r06 = insertelement <8 x i16>  %r05, i16 %r6, i32 6
+  %r07 = insertelement <8 x i16>  %r06, i16 %r7, i32 7
+  ret <8 x i16> %r07
+}
+
+;
+; 256-bit vectors
+;
+
+define <4 x double> @test_v4f64(<4 x double> %a, <4 x double> %b) {
+; SSE-LABEL: @test_v4f64(
+; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <2 x i32> <i32 0, i32 4>
+; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 1, i32 5>
+; SSE-NEXT:    [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]]
+; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 2, i32 6>
+; SSE-NEXT:    [[TMP5:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 3, i32 7>
+; SSE-NEXT:    [[TMP6:%.*]] = fadd <2 x double> [[TMP4]], [[TMP5]]
+; SSE-NEXT:    [[R03:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SSE-NEXT:    ret <4 x double> [[R03]]
+;
+; SLM-LABEL: @test_v4f64(
+; SLM-NEXT:    [[A0:%.*]] = extractelement <4 x double> [[A:%.*]], i32 0
+; SLM-NEXT:    [[A1:%.*]] = extractelement <4 x double> [[A]], i32 1
+; SLM-NEXT:    [[A2:%.*]] = extractelement <4 x double> [[A]], i32 2
+; SLM-NEXT:    [[A3:%.*]] = extractelement <4 x double> [[A]], i32 3
+; SLM-NEXT:    [[B0:%.*]] = extractelement <4 x double> [[B:%.*]], i32 0
+; SLM-NEXT:    [[B1:%.*]] = extractelement <4 x double> [[B]], i32 1
+; SLM-NEXT:    [[B2:%.*]] = extractelement <4 x double> [[B]], i32 2
+; SLM-NEXT:    [[B3:%.*]] = extractelement <4 x double> [[B]], i32 3
+; SLM-NEXT:    [[R0:%.*]] = fadd double [[A0]], [[A1]]
+; SLM-NEXT:    [[R1:%.*]] = fadd double [[B0]], [[B1]]
+; SLM-NEXT:    [[R2:%.*]] = fadd double [[A2]], [[A3]]
+; SLM-NEXT:    [[R3:%.*]] = fadd double [[B2]], [[B3]]
+; SLM-NEXT:    [[R00:%.*]] = insertelement <4 x double> poison, double [[R0]], i32 0
+; SLM-NEXT:    [[R01:%.*]] = insertelement <4 x double> [[R00]], double [[R1]], i32 1
+; SLM-NEXT:    [[R02:%.*]] = insertelement <4 x double> [[R01]], double [[R2]], i32 2
+; SLM-NEXT:    [[R03:%.*]] = insertelement <4 x double> [[R02]], double [[R3]], i32 3
+; SLM-NEXT:    ret <4 x double> [[R03]]
+;
+; AVX-LABEL: @test_v4f64(
+; AVX-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+; AVX-NEXT:    [[TMP3:%.*]] = fadd <4 x double> [[TMP1]], [[TMP2]]
+; AVX-NEXT:    ret <4 x double> [[TMP3]]
+;
+  %a0 = extractelement <4 x double> %a, i32 0
+  %a1 = extractelement <4 x double> %a, i32 1
+  %a2 = extractelement <4 x double> %a, i32 2
+  %a3 = extractelement <4 x double> %a, i32 3
+  %b0 = extractelement <4 x double> %b, i32 0
+  %b1 = extractelement <4 x double> %b, i32 1
+  %b2 = extractelement <4 x double> %b, i32 2
+  %b3 = extractelement <4 x double> %b, i32 3
+  %r0 = fadd double %a0, %a1
+  %r1 = fadd double %b0, %b1
+  %r2 = fadd double %a2, %a3
+  %r3 = fadd double %b2, %b3
+  %r00 = insertelement <4 x double> poison, double %r0, i32 0
+  %r01 = insertelement <4 x double>  %r00, double %r1, i32 1
+  %r02 = insertelement <4 x double>  %r01, double %r2, i32 2
+  %r03 = insertelement <4 x double>  %r02, double %r3, i32 3
+  ret <4 x double> %r03
+}
+
+define <8 x float> @test_v8f32(<8 x float> %a, <8 x float> %b) {
+; SSE-LABEL: @test_v8f32(
+; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 6, i32 12, i32 14>
+; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 7, i32 13, i32 15>
+; SSE-NEXT:    [[TMP3:%.*]] = fadd <8 x float> [[TMP1]], [[TMP2]]
+; SSE-NEXT:    ret <8 x float> [[TMP3]]
+;
+; SLM-LABEL: @test_v8f32(
+; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 8, i32 10>
+; SLM-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> <i32 1, i32 3, i32 9, i32 11>
+; SLM-NEXT:    [[TMP3:%.*]] = fadd <4 x float> [[TMP1]], [[TMP2]]
+; SLM-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> <i32 4, i32 6, i32 12, i32 14>
+; SLM-NEXT:    [[TMP5:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> <i32 5, i32 7, i32 13, i32 15>
+; SLM-NEXT:    [[TMP6:%.*]] = fadd <4 x float> [[TMP4]], [[TMP5]]
+; SLM-NEXT:    [[R07:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SLM-NEXT:    ret <8 x float> [[R07]]
+;
+; AVX-LABEL: @test_v8f32(
+; AVX-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 6, i32 12, i32 14>
+; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 7, i32 13, i32 15>
+; AVX-NEXT:    [[TMP3:%.*]] = fadd <8 x float> [[TMP1]], [[TMP2]]
+; AVX-NEXT:    ret <8 x float> [[TMP3]]
+;
+  %a0 = extractelement <8 x float> %a, i32 0
+  %a1 = extractelement <8 x float> %a, i32 1
+  %a2 = extractelement <8 x float> %a, i32 2
+  %a3 = extractelement <8 x float> %a, i32 3
+  %a4 = extractelement <8 x float> %a, i32 4
+  %a5 = extractelement <8 x float> %a, i32 5
+  %a6 = extractelement <8 x float> %a, i32 6
+  %a7 = extractelement <8 x float> %a, i32 7
+  %b0 = extractelement <8 x float> %b, i32 0
+  %b1 = extractelement <8 x float> %b, i32 1
+  %b2 = extractelement <8 x float> %b, i32 2
+  %b3 = extractelement <8 x float> %b, i32 3
+  %b4 = extractelement <8 x float> %b, i32 4
+  %b5 = extractelement <8 x float> %b, i32 5
+  %b6 = extractelement <8 x float> %b, i32 6
+  %b7 = extractelement <8 x float> %b, i32 7
+  %r0 = fadd float %a0, %a1
+  %r1 = fadd float %a2, %a3
+  %r2 = fadd float %b0, %b1
+  %r3 = fadd float %b2, %b3
+  %r4 = fadd float %a4, %a5
+  %r5 = fadd float %a6, %a7
+  %r6 = fadd float %b4, %b5
+  %r7 = fadd float %b6, %b7
+  %r00 = insertelement <8 x float> poison, float %r0, i32 0
+  %r01 = insertelement <8 x float>  %r00, float %r1, i32 1
+  %r02 = insertelement <8 x float>  %r01, float %r2, i32 2
+  %r03 = insertelement <8 x float>  %r02, float %r3, i32 3
+  %r04 = insertelement <8 x float>  %r03, float %r4, i32 4
+  %r05 = insertelement <8 x float>  %r04, float %r5, i32 5
+  %r06 = insertelement <8 x float>  %r05, float %r6, i32 6
+  %r07 = insertelement <8 x float>  %r06, float %r7, i32 7
+  ret <8 x float> %r07
+}
+
+define <4 x i64> @test_v4i64(<4 x i64> %a, <4 x i64> %b) {
+; CHECK-LABEL: @test_v4i64(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+; CHECK-NEXT:    [[TMP3:%.*]] = add <4 x i64> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret <4 x i64> [[TMP3]]
+;
+  %a0 = extractelement <4 x i64> %a, i32 0
+  %a1 = extractelement <4 x i64> %a, i32 1
+  %a2 = extractelement <4 x i64> %a, i32 2
+  %a3 = extractelement <4 x i64> %a, i32 3
+  %b0 = extractelement <4 x i64> %b, i32 0
+  %b1 = extractelement <4 x i64> %b, i32 1
+  %b2 = extractelement <4 x i64> %b, i32 2
+  %b3 = extractelement <4 x i64> %b, i32 3
+  %r0 = add i64 %a0, %a1
+  %r1 = add i64 %b0, %b1
+  %r2 = add i64 %a2, %a3
+  %r3 = add i64 %b2, %b3
+  %r00 = insertelement <4 x i64> poison, i64 %r0, i32 0
+  %r01 = insertelement <4 x i64>  %r00, i64 %r1, i32 1
+  %r02 = insertelement <4 x i64>  %r01, i64 %r2, i32 2
+  %r03 = insertelement <4 x i64>  %r02, i64 %r3, i32 3
+  ret <4 x i64> %r03
+}
+
+define <8 x i32> @test_v8i32(<8 x i32> %a, <8 x i32> %b) {
+; CHECK-LABEL: @test_v8i32(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 6, i32 12, i32 14>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 7, i32 13, i32 15>
+; CHECK-NEXT:    [[TMP3:%.*]] = add <8 x i32> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret <8 x i32> [[TMP3]]
+;
+  %a0 = extractelement <8 x i32> %a, i32 0
+  %a1 = extractelement <8 x i32> %a, i32 1
+  %a2 = extractelement <8 x i32> %a, i32 2
+  %a3 = extractelement <8 x i32> %a, i32 3
+  %a4 = extractelement <8 x i32> %a, i32 4
+  %a5 = extractelement <8 x i32> %a, i32 5
+  %a6 = extractelement <8 x i32> %a, i32 6
+  %a7 = extractelement <8 x i32> %a, i32 7
+  %b0 = extractelement <8 x i32> %b, i32 0
+  %b1 = extractelement <8 x i32> %b, i32 1
+  %b2 = extractelement <8 x i32> %b, i32 2
+  %b3 = extractelement <8 x i32> %b, i32 3
+  %b4 = extractelement <8 x i32> %b, i32 4
+  %b5 = extractelement <8 x i32> %b, i32 5
+  %b6 = extractelement <8 x i32> %b, i32 6
+  %b7 = extractelement <8 x i32> %b, i32 7
+  %r0 = add i32 %a0, %a1
+  %r1 = add i32 %a2, %a3
+  %r2 = add i32 %b0, %b1
+  %r3 = add i32 %b2, %b3
+  %r4 = add i32 %a4, %a5
+  %r5 = add i32 %a6, %a7
+  %r6 = add i32 %b4, %b5
+  %r7 = add i32 %b6, %b7
+  %r00 = insertelement <8 x i32> poison, i32 %r0, i32 0
+  %r01 = insertelement <8 x i32>  %r00, i32 %r1, i32 1
+  %r02 = insertelement <8 x i32>  %r01, i32 %r2, i32 2
+  %r03 = insertelement <8 x i32>  %r02, i32 %r3, i32 3
+  %r04 = insertelement <8 x i32>  %r03, i32 %r4, i32 4
+  %r05 = insertelement <8 x i32>  %r04, i32 %r5, i32 5
+  %r06 = insertelement <8 x i32>  %r05, i32 %r6, i32 6
+  %r07 = insertelement <8 x i32>  %r06, i32 %r7, i32 7
+  ret <8 x i32> %r07
+}
+
+define <16 x i16> @test_v16i16(<16 x i16> %a, <16 x i16> %b) {
+; SSE-LABEL: @test_v16i16(
+; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22>
+; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23>
+; SSE-NEXT:    [[TMP3:%.*]] = add <8 x i16> [[TMP1]], [[TMP2]]
+; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> <i32 8, i32 10, i32 12, i32 14, i32 24, i32 26, i32 28, i32 30>
+; SSE-NEXT:    [[TMP5:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> <i32 9, i32 11, i32 13, i32 15, i32 25, i32 27, i32 29, i32 31>
+; SSE-NEXT:    [[TMP6:%.*]] = add <8 x i16> [[TMP4]], [[TMP5]]
+; SSE-NEXT:    [[RV15:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> [[TMP6]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE-NEXT:    ret <16 x i16> [[RV15]]
+;
+; SLM-LABEL: @test_v16i16(
+; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22, i32 8, i32 10, i32 12, i32 14, i32 24, i32 26, i32 28, i32 30>
+; SLM-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23, i32 9, i32 11, i32 13, i32 15, i32 25, i32 27, i32 29, i32 31>
+; SLM-NEXT:    [[TMP3:%.*]] = add <16 x i16> [[TMP1]], [[TMP2]]
+; SLM-NEXT:    ret <16 x i16> [[TMP3]]
+;
+; AVX-LABEL: @test_v16i16(
+; AVX-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22, i32 8, i32 10, i32 12, i32 14, i32 24, i32 26, i32 28, i32 30>
+; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23, i32 9, i32 11, i32 13, i32 15, i32 25, i32 27, i32 29, i32 31>
+; AVX-NEXT:    [[TMP3:%.*]] = add <16 x i16> [[TMP1]], [[TMP2]]
+; AVX-NEXT:    ret <16 x i16> [[TMP3]]
+;
+  %a0  = extractelement <16 x i16> %a, i32 0
+  %a1  = extractelement <16 x i16> %a, i32 1
+  %a2  = extractelement <16 x i16> %a, i32 2
+  %a3  = extractelement <16 x i16> %a, i32 3
+  %a4  = extractelement <16 x i16> %a, i32 4
+  %a5  = extractelement <16 x i16> %a, i32 5
+  %a6  = extractelement <16 x i16> %a, i32 6
+  %a7  = extractelement <16 x i16> %a, i32 7
+  %a8  = extractelement <16 x i16> %a, i32 8
+  %a9  = extractelement <16 x i16> %a, i32 9
+  %a10 = extractelement <16 x i16> %a, i32 10
+  %a11 = extractelement <16 x i16> %a, i32 11
+  %a12 = extractelement <16 x i16> %a, i32 12
+  %a13 = extractelement <16 x i16> %a, i32 13
+  %a14 = extractelement <16 x i16> %a, i32 14
+  %a15 = extractelement <16 x i16> %a, i32 15
+  %b0  = extractelement <16 x i16> %b, i32 0
+  %b1  = extractelement <16 x i16> %b, i32 1
+  %b2  = extractelement <16 x i16> %b, i32 2
+  %b3  = extractelement <16 x i16> %b, i32 3
+  %b4  = extractelement <16 x i16> %b, i32 4
+  %b5  = extractelement <16 x i16> %b, i32 5
+  %b6  = extractelement <16 x i16> %b, i32 6
+  %b7  = extractelement <16 x i16> %b, i32 7
+  %b8  = extractelement <16 x i16> %b, i32 8
+  %b9  = extractelement <16 x i16> %b, i32 9
+  %b10 = extractelement <16 x i16> %b, i32 10
+  %b11 = extractelement <16 x i16> %b, i32 11
+  %b12 = extractelement <16 x i16> %b, i32 12
+  %b13 = extractelement <16 x i16> %b, i32 13
+  %b14 = extractelement <16 x i16> %b, i32 14
+  %b15 = extractelement <16 x i16> %b, i32 15
+  %r0  = add i16 %a0 , %a1
+  %r1  = add i16 %a2 , %a3
+  %r2  = add i16 %a4 , %a5
+  %r3  = add i16 %a6 , %a7
+  %r4  = add i16 %b0 , %b1
+  %r5  = add i16 %b2 , %b3
+  %r6  = add i16 %b4 , %b5
+  %r7  = add i16 %b6 , %b7
+  %r8  = add i16 %a8 , %a9
+  %r9  = add i16 %a10, %a11
+  %r10 = add i16 %a12, %a13
+  %r11 = add i16 %a14, %a15
+  %r12 = add i16 %b8 , %b9
+  %r13 = add i16 %b10, %b11
+  %r14 = add i16 %b12, %b13
+  %r15 = add i16 %b14, %b15
+  %rv0  = insertelement <16 x i16> poison, i16 %r0 , i32 0
+  %rv1  = insertelement <16 x i16> %rv0 , i16 %r1 , i32 1
+  %rv2  = insertelement <16 x i16> %rv1 , i16 %r2 , i32 2
+  %rv3  = insertelement <16 x i16> %rv2 , i16 %r3 , i32 3
+  %rv4  = insertelement <16 x i16> %rv3 , i16 %r4 , i32 4
+  %rv5  = insertelement <16 x i16> %rv4 , i16 %r5 , i32 5
+  %rv6  = insertelement <16 x i16> %rv5 , i16 %r6 , i32 6
+  %rv7  = insertelement <16 x i16> %rv6 , i16 %r7 , i32 7
+  %rv8  = insertelement <16 x i16> %rv7 , i16 %r8 , i32 8
+  %rv9  = insertelement <16 x i16> %rv8 , i16 %r9 , i32 9
+  %rv10 = insertelement <16 x i16> %rv9 , i16 %r10, i32 10
+  %rv11 = insertelement <16 x i16> %rv10, i16 %r11, i32 11
+  %rv12 = insertelement <16 x i16> %rv11, i16 %r12, i32 12
+  %rv13 = insertelement <16 x i16> %rv12, i16 %r13, i32 13
+  %rv14 = insertelement <16 x i16> %rv13, i16 %r14, i32 14
+  %rv15 = insertelement <16 x i16> %rv14, i16 %r15, i32 15
+  ret <16 x i16> %rv15
+}
--- a/llvm/test/Transforms/SLPVectorizer/X86/hsub-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/hsub-inseltpoison.ll
@ -0,0 +1,433 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -mtriple=x86_64-unknown -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=SSE
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=SLM
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX
+
+;
+; 128-bit vectors
+;
+
+define <2 x double> @test_v2f64(<2 x double> %a, <2 x double> %b) {
+; SSE-LABEL: @test_v2f64(
+; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x i32> <i32 0, i32 2>
+; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <2 x double> [[A]], <2 x double> [[B]], <2 x i32> <i32 1, i32 3>
+; SSE-NEXT:    [[TMP3:%.*]] = fsub <2 x double> [[TMP1]], [[TMP2]]
+; SSE-NEXT:    ret <2 x double> [[TMP3]]
+;
+; SLM-LABEL: @test_v2f64(
+; SLM-NEXT:    [[A0:%.*]] = extractelement <2 x double> [[A:%.*]], i32 0
+; SLM-NEXT:    [[A1:%.*]] = extractelement <2 x double> [[A]], i32 1
+; SLM-NEXT:    [[B0:%.*]] = extractelement <2 x double> [[B:%.*]], i32 0
+; SLM-NEXT:    [[B1:%.*]] = extractelement <2 x double> [[B]], i32 1
+; SLM-NEXT:    [[R0:%.*]] = fsub double [[A0]], [[A1]]
+; SLM-NEXT:    [[R1:%.*]] = fsub double [[B0]], [[B1]]
+; SLM-NEXT:    [[R00:%.*]] = insertelement <2 x double> poison, double [[R0]], i32 0
+; SLM-NEXT:    [[R01:%.*]] = insertelement <2 x double> [[R00]], double [[R1]], i32 1
+; SLM-NEXT:    ret <2 x double> [[R01]]
+;
+; AVX-LABEL: @test_v2f64(
+; AVX-NEXT:    [[TMP1:%.*]] = shufflevector <2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x i32> <i32 0, i32 2>
+; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <2 x double> [[A]], <2 x double> [[B]], <2 x i32> <i32 1, i32 3>
+; AVX-NEXT:    [[TMP3:%.*]] = fsub <2 x double> [[TMP1]], [[TMP2]]
+; AVX-NEXT:    ret <2 x double> [[TMP3]]
+;
+  %a0 = extractelement <2 x double> %a, i32 0
+  %a1 = extractelement <2 x double> %a, i32 1
+  %b0 = extractelement <2 x double> %b, i32 0
+  %b1 = extractelement <2 x double> %b, i32 1
+  %r0 = fsub double %a0, %a1
+  %r1 = fsub double %b0, %b1
+  %r00 = insertelement <2 x double> poison, double %r0, i32 0
+  %r01 = insertelement <2 x double>  %r00, double %r1, i32 1
+  ret <2 x double> %r01
+}
+
+define <4 x float> @test_v4f32(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: @test_v4f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; CHECK-NEXT:    [[TMP3:%.*]] = fsub <4 x float> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret <4 x float> [[TMP3]]
+;
+  %a0 = extractelement <4 x float> %a, i32 0
+  %a1 = extractelement <4 x float> %a, i32 1
+  %a2 = extractelement <4 x float> %a, i32 2
+  %a3 = extractelement <4 x float> %a, i32 3
+  %b0 = extractelement <4 x float> %b, i32 0
+  %b1 = extractelement <4 x float> %b, i32 1
+  %b2 = extractelement <4 x float> %b, i32 2
+  %b3 = extractelement <4 x float> %b, i32 3
+  %r0 = fsub float %a0, %a1
+  %r1 = fsub float %a2, %a3
+  %r2 = fsub float %b0, %b1
+  %r3 = fsub float %b2, %b3
+  %r00 = insertelement <4 x float> poison, float %r0, i32 0
+  %r01 = insertelement <4 x float>  %r00, float %r1, i32 1
+  %r02 = insertelement <4 x float>  %r01, float %r2, i32 2
+  %r03 = insertelement <4 x float>  %r02, float %r3, i32 3
+  ret <4 x float> %r03
+}
+
+define <2 x i64> @test_v2i64(<2 x i64> %a, <2 x i64> %b) {
+; CHECK-LABEL: @test_v2i64(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x i64> [[A]], <2 x i64> [[B]], <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT:    [[TMP3:%.*]] = sub <2 x i64> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret <2 x i64> [[TMP3]]
+;
+  %a0 = extractelement <2 x i64> %a, i32 0
+  %a1 = extractelement <2 x i64> %a, i32 1
+  %b0 = extractelement <2 x i64> %b, i32 0
+  %b1 = extractelement <2 x i64> %b, i32 1
+  %r0 = sub i64 %a0, %a1
+  %r1 = sub i64 %b0, %b1
+  %r00 = insertelement <2 x i64> poison, i64 %r0, i32 0
+  %r01 = insertelement <2 x i64>  %r00, i64 %r1, i32 1
+  ret <2 x i64> %r01
+}
+
+define <4 x i32> @test_v4i32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: @test_v4i32(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; CHECK-NEXT:    [[TMP3:%.*]] = sub <4 x i32> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret <4 x i32> [[TMP3]]
+;
+  %a0 = extractelement <4 x i32> %a, i32 0
+  %a1 = extractelement <4 x i32> %a, i32 1
+  %a2 = extractelement <4 x i32> %a, i32 2
+  %a3 = extractelement <4 x i32> %a, i32 3
+  %b0 = extractelement <4 x i32> %b, i32 0
+  %b1 = extractelement <4 x i32> %b, i32 1
+  %b2 = extractelement <4 x i32> %b, i32 2
+  %b3 = extractelement <4 x i32> %b, i32 3
+  %r0 = sub i32 %a0, %a1
+  %r1 = sub i32 %a2, %a3
+  %r2 = sub i32 %b0, %b1
+  %r3 = sub i32 %b2, %b3
+  %r00 = insertelement <4 x i32> poison, i32 %r0, i32 0
+  %r01 = insertelement <4 x i32>  %r00, i32 %r1, i32 1
+  %r02 = insertelement <4 x i32>  %r01, i32 %r2, i32 2
+  %r03 = insertelement <4 x i32>  %r02, i32 %r3, i32 3
+  ret <4 x i32> %r03
+}
+
+define <8 x i16> @test_v8i16(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-LABEL: @test_v8i16(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+; CHECK-NEXT:    [[TMP3:%.*]] = sub <8 x i16> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret <8 x i16> [[TMP3]]
+;
+  %a0 = extractelement <8 x i16> %a, i32 0
+  %a1 = extractelement <8 x i16> %a, i32 1
+  %a2 = extractelement <8 x i16> %a, i32 2
+  %a3 = extractelement <8 x i16> %a, i32 3
+  %a4 = extractelement <8 x i16> %a, i32 4
+  %a5 = extractelement <8 x i16> %a, i32 5
+  %a6 = extractelement <8 x i16> %a, i32 6
+  %a7 = extractelement <8 x i16> %a, i32 7
+  %b0 = extractelement <8 x i16> %b, i32 0
+  %b1 = extractelement <8 x i16> %b, i32 1
+  %b2 = extractelement <8 x i16> %b, i32 2
+  %b3 = extractelement <8 x i16> %b, i32 3
+  %b4 = extractelement <8 x i16> %b, i32 4
+  %b5 = extractelement <8 x i16> %b, i32 5
+  %b6 = extractelement <8 x i16> %b, i32 6
+  %b7 = extractelement <8 x i16> %b, i32 7
+  %r0 = sub i16 %a0, %a1
+  %r1 = sub i16 %a2, %a3
+  %r2 = sub i16 %a4, %a5
+  %r3 = sub i16 %a6, %a7
+  %r4 = sub i16 %b0, %b1
+  %r5 = sub i16 %b2, %b3
+  %r6 = sub i16 %b4, %b5
+  %r7 = sub i16 %b6, %b7
+  %r00 = insertelement <8 x i16> poison, i16 %r0, i32 0
+  %r01 = insertelement <8 x i16>  %r00, i16 %r1, i32 1
+  %r02 = insertelement <8 x i16>  %r01, i16 %r2, i32 2
+  %r03 = insertelement <8 x i16>  %r02, i16 %r3, i32 3
+  %r04 = insertelement <8 x i16>  %r03, i16 %r4, i32 4
+  %r05 = insertelement <8 x i16>  %r04, i16 %r5, i32 5
+  %r06 = insertelement <8 x i16>  %r05, i16 %r6, i32 6
+  %r07 = insertelement <8 x i16>  %r06, i16 %r7, i32 7
+  ret <8 x i16> %r07
+}
+
+;
+; 256-bit vectors
+;
+
+define <4 x double> @test_v4f64(<4 x double> %a, <4 x double> %b) {
+; SSE-LABEL: @test_v4f64(
+; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <2 x i32> <i32 0, i32 4>
+; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 1, i32 5>
+; SSE-NEXT:    [[TMP3:%.*]] = fsub <2 x double> [[TMP1]], [[TMP2]]
+; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 2, i32 6>
+; SSE-NEXT:    [[TMP5:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 3, i32 7>
+; SSE-NEXT:    [[TMP6:%.*]] = fsub <2 x double> [[TMP4]], [[TMP5]]
+; SSE-NEXT:    [[R03:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SSE-NEXT:    ret <4 x double> [[R03]]
+;
+; SLM-LABEL: @test_v4f64(
+; SLM-NEXT:    [[A0:%.*]] = extractelement <4 x double> [[A:%.*]], i32 0
+; SLM-NEXT:    [[A1:%.*]] = extractelement <4 x double> [[A]], i32 1
+; SLM-NEXT:    [[A2:%.*]] = extractelement <4 x double> [[A]], i32 2
+; SLM-NEXT:    [[A3:%.*]] = extractelement <4 x double> [[A]], i32 3
+; SLM-NEXT:    [[B0:%.*]] = extractelement <4 x double> [[B:%.*]], i32 0
+; SLM-NEXT:    [[B1:%.*]] = extractelement <4 x double> [[B]], i32 1
+; SLM-NEXT:    [[B2:%.*]] = extractelement <4 x double> [[B]], i32 2
+; SLM-NEXT:    [[B3:%.*]] = extractelement <4 x double> [[B]], i32 3
+; SLM-NEXT:    [[R0:%.*]] = fsub double [[A0]], [[A1]]
+; SLM-NEXT:    [[R1:%.*]] = fsub double [[B0]], [[B1]]
+; SLM-NEXT:    [[R2:%.*]] = fsub double [[A2]], [[A3]]
+; SLM-NEXT:    [[R3:%.*]] = fsub double [[B2]], [[B3]]
+; SLM-NEXT:    [[R00:%.*]] = insertelement <4 x double> poison, double [[R0]], i32 0
+; SLM-NEXT:    [[R01:%.*]] = insertelement <4 x double> [[R00]], double [[R1]], i32 1
+; SLM-NEXT:    [[R02:%.*]] = insertelement <4 x double> [[R01]], double [[R2]], i32 2
+; SLM-NEXT:    [[R03:%.*]] = insertelement <4 x double> [[R02]], double [[R3]], i32 3
+; SLM-NEXT:    ret <4 x double> [[R03]]
+;
+; AVX-LABEL: @test_v4f64(
+; AVX-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+; AVX-NEXT:    [[TMP3:%.*]] = fsub <4 x double> [[TMP1]], [[TMP2]]
+; AVX-NEXT:    ret <4 x double> [[TMP3]]
+;
+  %a0 = extractelement <4 x double> %a, i32 0
+  %a1 = extractelement <4 x double> %a, i32 1
+  %a2 = extractelement <4 x double> %a, i32 2
+  %a3 = extractelement <4 x double> %a, i32 3
+  %b0 = extractelement <4 x double> %b, i32 0
+  %b1 = extractelement <4 x double> %b, i32 1
+  %b2 = extractelement <4 x double> %b, i32 2
+  %b3 = extractelement <4 x double> %b, i32 3
+  %r0 = fsub double %a0, %a1
+  %r1 = fsub double %b0, %b1
+  %r2 = fsub double %a2, %a3
+  %r3 = fsub double %b2, %b3
+  %r00 = insertelement <4 x double> poison, double %r0, i32 0
+  %r01 = insertelement <4 x double>  %r00, double %r1, i32 1
+  %r02 = insertelement <4 x double>  %r01, double %r2, i32 2
+  %r03 = insertelement <4 x double>  %r02, double %r3, i32 3
+  ret <4 x double> %r03
+}
+
+define <8 x float> @test_v8f32(<8 x float> %a, <8 x float> %b) {
+; SSE-LABEL: @test_v8f32(
+; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 6, i32 12, i32 14>
+; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 7, i32 13, i32 15>
+; SSE-NEXT:    [[TMP3:%.*]] = fsub <8 x float> [[TMP1]], [[TMP2]]
+; SSE-NEXT:    ret <8 x float> [[TMP3]]
+;
+; SLM-LABEL: @test_v8f32(
+; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 8, i32 10>
+; SLM-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> <i32 1, i32 3, i32 9, i32 11>
+; SLM-NEXT:    [[TMP3:%.*]] = fsub <4 x float> [[TMP1]], [[TMP2]]
+; SLM-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> <i32 4, i32 6, i32 12, i32 14>
+; SLM-NEXT:    [[TMP5:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> <i32 5, i32 7, i32 13, i32 15>
+; SLM-NEXT:    [[TMP6:%.*]] = fsub <4 x float> [[TMP4]], [[TMP5]]
+; SLM-NEXT:    [[R07:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SLM-NEXT:    ret <8 x float> [[R07]]
+;
+; AVX-LABEL: @test_v8f32(
+; AVX-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 6, i32 12, i32 14>
+; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 7, i32 13, i32 15>
+; AVX-NEXT:    [[TMP3:%.*]] = fsub <8 x float> [[TMP1]], [[TMP2]]
+; AVX-NEXT:    ret <8 x float> [[TMP3]]
+;
+  %a0 = extractelement <8 x float> %a, i32 0
+  %a1 = extractelement <8 x float> %a, i32 1
+  %a2 = extractelement <8 x float> %a, i32 2
+  %a3 = extractelement <8 x float> %a, i32 3
+  %a4 = extractelement <8 x float> %a, i32 4
+  %a5 = extractelement <8 x float> %a, i32 5
+  %a6 = extractelement <8 x float> %a, i32 6
+  %a7 = extractelement <8 x float> %a, i32 7
+  %b0 = extractelement <8 x float> %b, i32 0
+  %b1 = extractelement <8 x float> %b, i32 1
+  %b2 = extractelement <8 x float> %b, i32 2
+  %b3 = extractelement <8 x float> %b, i32 3
+  %b4 = extractelement <8 x float> %b, i32 4
+  %b5 = extractelement <8 x float> %b, i32 5
+  %b6 = extractelement <8 x float> %b, i32 6
+  %b7 = extractelement <8 x float> %b, i32 7
+  %r0 = fsub float %a0, %a1
+  %r1 = fsub float %a2, %a3
+  %r2 = fsub float %b0, %b1
+  %r3 = fsub float %b2, %b3
+  %r4 = fsub float %a4, %a5
+  %r5 = fsub float %a6, %a7
+  %r6 = fsub float %b4, %b5
+  %r7 = fsub float %b6, %b7
+  %r00 = insertelement <8 x float> poison, float %r0, i32 0
+  %r01 = insertelement <8 x float>  %r00, float %r1, i32 1
+  %r02 = insertelement <8 x float>  %r01, float %r2, i32 2
+  %r03 = insertelement <8 x float>  %r02, float %r3, i32 3
+  %r04 = insertelement <8 x float>  %r03, float %r4, i32 4
+  %r05 = insertelement <8 x float>  %r04, float %r5, i32 5
+  %r06 = insertelement <8 x float>  %r05, float %r6, i32 6
+  %r07 = insertelement <8 x float>  %r06, float %r7, i32 7
+  ret <8 x float> %r07
+}
+
+define <4 x i64> @test_v4i64(<4 x i64> %a, <4 x i64> %b) {
+; CHECK-LABEL: @test_v4i64(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+; CHECK-NEXT:    [[TMP3:%.*]] = sub <4 x i64> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret <4 x i64> [[TMP3]]
+;
+  %a0 = extractelement <4 x i64> %a, i32 0
+  %a1 = extractelement <4 x i64> %a, i32 1
+  %a2 = extractelement <4 x i64> %a, i32 2
+  %a3 = extractelement <4 x i64> %a, i32 3
+  %b0 = extractelement <4 x i64> %b, i32 0
+  %b1 = extractelement <4 x i64> %b, i32 1
+  %b2 = extractelement <4 x i64> %b, i32 2
+  %b3 = extractelement <4 x i64> %b, i32 3
+  %r0 = sub i64 %a0, %a1
+  %r1 = sub i64 %b0, %b1
+  %r2 = sub i64 %a2, %a3
+  %r3 = sub i64 %b2, %b3
+  %r00 = insertelement <4 x i64> poison, i64 %r0, i32 0
+  %r01 = insertelement <4 x i64>  %r00, i64 %r1, i32 1
+  %r02 = insertelement <4 x i64>  %r01, i64 %r2, i32 2
+  %r03 = insertelement <4 x i64>  %r02, i64 %r3, i32 3
+  ret <4 x i64> %r03
+}
+
+define <8 x i32> @test_v8i32(<8 x i32> %a, <8 x i32> %b) {
+; CHECK-LABEL: @test_v8i32(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 6, i32 12, i32 14>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 7, i32 13, i32 15>
+; CHECK-NEXT:    [[TMP3:%.*]] = sub <8 x i32> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret <8 x i32> [[TMP3]]
+;
+  %a0 = extractelement <8 x i32> %a, i32 0
+  %a1 = extractelement <8 x i32> %a, i32 1
+  %a2 = extractelement <8 x i32> %a, i32 2
+  %a3 = extractelement <8 x i32> %a, i32 3
+  %a4 = extractelement <8 x i32> %a, i32 4
+  %a5 = extractelement <8 x i32> %a, i32 5
+  %a6 = extractelement <8 x i32> %a, i32 6
+  %a7 = extractelement <8 x i32> %a, i32 7
+  %b0 = extractelement <8 x i32> %b, i32 0
+  %b1 = extractelement <8 x i32> %b, i32 1
+  %b2 = extractelement <8 x i32> %b, i32 2
+  %b3 = extractelement <8 x i32> %b, i32 3
+  %b4 = extractelement <8 x i32> %b, i32 4
+  %b5 = extractelement <8 x i32> %b, i32 5
+  %b6 = extractelement <8 x i32> %b, i32 6
+  %b7 = extractelement <8 x i32> %b, i32 7
+  %r0 = sub i32 %a0, %a1
+  %r1 = sub i32 %a2, %a3
+  %r2 = sub i32 %b0, %b1
+  %r3 = sub i32 %b2, %b3
+  %r4 = sub i32 %a4, %a5
+  %r5 = sub i32 %a6, %a7
+  %r6 = sub i32 %b4, %b5
+  %r7 = sub i32 %b6, %b7
+  %r00 = insertelement <8 x i32> poison, i32 %r0, i32 0
+  %r01 = insertelement <8 x i32>  %r00, i32 %r1, i32 1
+  %r02 = insertelement <8 x i32>  %r01, i32 %r2, i32 2
+  %r03 = insertelement <8 x i32>  %r02, i32 %r3, i32 3
+  %r04 = insertelement <8 x i32>  %r03, i32 %r4, i32 4
+  %r05 = insertelement <8 x i32>  %r04, i32 %r5, i32 5
+  %r06 = insertelement <8 x i32>  %r05, i32 %r6, i32 6
+  %r07 = insertelement <8 x i32>  %r06, i32 %r7, i32 7
+  ret <8 x i32> %r07
+}
+
+define <16 x i16> @test_v16i16(<16 x i16> %a, <16 x i16> %b) {
+; SSE-LABEL: @test_v16i16(
+; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22>
+; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23>
+; SSE-NEXT:    [[TMP3:%.*]] = sub <8 x i16> [[TMP1]], [[TMP2]]
+; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> <i32 8, i32 10, i32 12, i32 14, i32 24, i32 26, i32 28, i32 30>
+; SSE-NEXT:    [[TMP5:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> <i32 9, i32 11, i32 13, i32 15, i32 25, i32 27, i32 29, i32 31>
+; SSE-NEXT:    [[TMP6:%.*]] = sub <8 x i16> [[TMP4]], [[TMP5]]
+; SSE-NEXT:    [[RV15:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> [[TMP6]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE-NEXT:    ret <16 x i16> [[RV15]]
+;
+; SLM-LABEL: @test_v16i16(
+; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22, i32 8, i32 10, i32 12, i32 14, i32 24, i32 26, i32 28, i32 30>
+; SLM-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23, i32 9, i32 11, i32 13, i32 15, i32 25, i32 27, i32 29, i32 31>
+; SLM-NEXT:    [[TMP3:%.*]] = sub <16 x i16> [[TMP1]], [[TMP2]]
+; SLM-NEXT:    ret <16 x i16> [[TMP3]]
+;
+; AVX-LABEL: @test_v16i16(
+; AVX-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22, i32 8, i32 10, i32 12, i32 14, i32 24, i32 26, i32 28, i32 30>
+; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23, i32 9, i32 11, i32 13, i32 15, i32 25, i32 27, i32 29, i32 31>
+; AVX-NEXT:    [[TMP3:%.*]] = sub <16 x i16> [[TMP1]], [[TMP2]]
+; AVX-NEXT:    ret <16 x i16> [[TMP3]]
+;
+  %a0  = extractelement <16 x i16> %a, i32 0
+  %a1  = extractelement <16 x i16> %a, i32 1
+  %a2  = extractelement <16 x i16> %a, i32 2
+  %a3  = extractelement <16 x i16> %a, i32 3
+  %a4  = extractelement <16 x i16> %a, i32 4
+  %a5  = extractelement <16 x i16> %a, i32 5
+  %a6  = extractelement <16 x i16> %a, i32 6
+  %a7  = extractelement <16 x i16> %a, i32 7
+  %a8  = extractelement <16 x i16> %a, i32 8
+  %a9  = extractelement <16 x i16> %a, i32 9
+  %a10 = extractelement <16 x i16> %a, i32 10
+  %a11 = extractelement <16 x i16> %a, i32 11
+  %a12 = extractelement <16 x i16> %a, i32 12
+  %a13 = extractelement <16 x i16> %a, i32 13
+  %a14 = extractelement <16 x i16> %a, i32 14
+  %a15 = extractelement <16 x i16> %a, i32 15
+  %b0  = extractelement <16 x i16> %b, i32 0
+  %b1  = extractelement <16 x i16> %b, i32 1
+  %b2  = extractelement <16 x i16> %b, i32 2
+  %b3  = extractelement <16 x i16> %b, i32 3
+  %b4  = extractelement <16 x i16> %b, i32 4
+  %b5  = extractelement <16 x i16> %b, i32 5
+  %b6  = extractelement <16 x i16> %b, i32 6
+  %b7  = extractelement <16 x i16> %b, i32 7
+  %b8  = extractelement <16 x i16> %b, i32 8
+  %b9  = extractelement <16 x i16> %b, i32 9
+  %b10 = extractelement <16 x i16> %b, i32 10
+  %b11 = extractelement <16 x i16> %b, i32 11
+  %b12 = extractelement <16 x i16> %b, i32 12
+  %b13 = extractelement <16 x i16> %b, i32 13
+  %b14 = extractelement <16 x i16> %b, i32 14
+  %b15 = extractelement <16 x i16> %b, i32 15
+  %r0  = sub i16 %a0 , %a1
+  %r1  = sub i16 %a2 , %a3
+  %r2  = sub i16 %a4 , %a5
+  %r3  = sub i16 %a6 , %a7
+  %r4  = sub i16 %b0 , %b1
+  %r5  = sub i16 %b2 , %b3
+  %r6  = sub i16 %b4 , %b5
+  %r7  = sub i16 %b6 , %b7
+  %r8  = sub i16 %a8 , %a9
+  %r9  = sub i16 %a10, %a11
+  %r10 = sub i16 %a12, %a13
+  %r11 = sub i16 %a14, %a15
+  %r12 = sub i16 %b8 , %b9
+  %r13 = sub i16 %b10, %b11
+  %r14 = sub i16 %b12, %b13
+  %r15 = sub i16 %b14, %b15
+  %rv0  = insertelement <16 x i16> poison, i16 %r0 , i32 0
+  %rv1  = insertelement <16 x i16> %rv0 , i16 %r1 , i32 1
+  %rv2  = insertelement <16 x i16> %rv1 , i16 %r2 , i32 2
+  %rv3  = insertelement <16 x i16> %rv2 , i16 %r3 , i32 3
+  %rv4  = insertelement <16 x i16> %rv3 , i16 %r4 , i32 4
+  %rv5  = insertelement <16 x i16> %rv4 , i16 %r5 , i32 5
+  %rv6  = insertelement <16 x i16> %rv5 , i16 %r6 , i32 6
+  %rv7  = insertelement <16 x i16> %rv6 , i16 %r7 , i32 7
+  %rv8  = insertelement <16 x i16> %rv7 , i16 %r8 , i32 8
+  %rv9  = insertelement <16 x i16> %rv8 , i16 %r9 , i32 9
+  %rv10 = insertelement <16 x i16> %rv9 , i16 %r10, i32 10
+  %rv11 = insertelement <16 x i16> %rv10, i16 %r11, i32 11
+  %rv12 = insertelement <16 x i16> %rv11, i16 %r12, i32 12
+  %rv13 = insertelement <16 x i16> %rv12, i16 %r13, i32 13
+  %rv14 = insertelement <16 x i16> %rv13, i16 %r14, i32 14
+  %rv15 = insertelement <16 x i16> %rv14, i16 %r15, i32 15
+  ret <16 x i16> %rv15
+}
--- a/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector-inseltpoison.ll
@ -0,0 +1,540 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -slp-vectorizer -slp-threshold=-10000 < %s | FileCheck %s
+; RUN: opt -S -slp-vectorizer -slp-threshold=0 < %s | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+define <4 x float> @simple_select(<4 x float> %a, <4 x float> %b, <4 x i32> %c) #0 {
+; CHECK-LABEL: @simple_select(
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ne <4 x i32> [[C:%.*]], zeroinitializer
+; CHECK-NEXT:    [[TMP2:%.*]] = select <4 x i1> [[TMP1]], <4 x float> [[A:%.*]], <4 x float> [[B:%.*]]
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[TMP2]], i32 0
+; CHECK-NEXT:    [[RA:%.*]] = insertelement <4 x float> poison, float [[TMP3]], i32 0
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[TMP2]], i32 1
+; CHECK-NEXT:    [[RB:%.*]] = insertelement <4 x float> [[RA]], float [[TMP4]], i32 1
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x float> [[TMP2]], i32 2
+; CHECK-NEXT:    [[RC:%.*]] = insertelement <4 x float> [[RB]], float [[TMP5]], i32 2
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x float> [[TMP2]], i32 3
+; CHECK-NEXT:    [[RD:%.*]] = insertelement <4 x float> [[RC]], float [[TMP6]], i32 3
+; CHECK-NEXT:    ret <4 x float> [[RD]]
+;
+  %c0 = extractelement <4 x i32> %c, i32 0
+  %c1 = extractelement <4 x i32> %c, i32 1
+  %c2 = extractelement <4 x i32> %c, i32 2
+  %c3 = extractelement <4 x i32> %c, i32 3
+  %a0 = extractelement <4 x float> %a, i32 0
+  %a1 = extractelement <4 x float> %a, i32 1
+  %a2 = extractelement <4 x float> %a, i32 2
+  %a3 = extractelement <4 x float> %a, i32 3
+  %b0 = extractelement <4 x float> %b, i32 0
+  %b1 = extractelement <4 x float> %b, i32 1
+  %b2 = extractelement <4 x float> %b, i32 2
+  %b3 = extractelement <4 x float> %b, i32 3
+  %cmp0 = icmp ne i32 %c0, 0
+  %cmp1 = icmp ne i32 %c1, 0
+  %cmp2 = icmp ne i32 %c2, 0
+  %cmp3 = icmp ne i32 %c3, 0
+  %s0 = select i1 %cmp0, float %a0, float %b0
+  %s1 = select i1 %cmp1, float %a1, float %b1
+  %s2 = select i1 %cmp2, float %a2, float %b2
+  %s3 = select i1 %cmp3, float %a3, float %b3
+  %ra = insertelement <4 x float> poison, float %s0, i32 0
+  %rb = insertelement <4 x float> %ra, float %s1, i32 1
+  %rc = insertelement <4 x float> %rb, float %s2, i32 2
+  %rd = insertelement <4 x float> %rc, float %s3, i32 3
+  ret <4 x float> %rd
+}
+
+declare void @llvm.assume(i1) nounwind
+
+; This entire tree is ephemeral, don't vectorize any of it.
+define <4 x float> @simple_select_eph(<4 x float> %a, <4 x float> %b, <4 x i32> %c) #0 {
+; CHECK-LABEL: @simple_select_eph(
+; CHECK-NEXT:    [[C0:%.*]] = extractelement <4 x i32> [[C:%.*]], i32 0
+; CHECK-NEXT:    [[C1:%.*]] = extractelement <4 x i32> [[C]], i32 1
+; CHECK-NEXT:    [[C2:%.*]] = extractelement <4 x i32> [[C]], i32 2
+; CHECK-NEXT:    [[C3:%.*]] = extractelement <4 x i32> [[C]], i32 3
+; CHECK-NEXT:    [[A0:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0
+; CHECK-NEXT:    [[A1:%.*]] = extractelement <4 x float> [[A]], i32 1
+; CHECK-NEXT:    [[A2:%.*]] = extractelement <4 x float> [[A]], i32 2
+; CHECK-NEXT:    [[A3:%.*]] = extractelement <4 x float> [[A]], i32 3
+; CHECK-NEXT:    [[B0:%.*]] = extractelement <4 x float> [[B:%.*]], i32 0
+; CHECK-NEXT:    [[B1:%.*]] = extractelement <4 x float> [[B]], i32 1
+; CHECK-NEXT:    [[B2:%.*]] = extractelement <4 x float> [[B]], i32 2
+; CHECK-NEXT:    [[B3:%.*]] = extractelement <4 x float> [[B]], i32 3
+; CHECK-NEXT:    [[CMP0:%.*]] = icmp ne i32 [[C0]], 0
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp ne i32 [[C1]], 0
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp ne i32 [[C2]], 0
+; CHECK-NEXT:    [[CMP3:%.*]] = icmp ne i32 [[C3]], 0
+; CHECK-NEXT:    [[S0:%.*]] = select i1 [[CMP0]], float [[A0]], float [[B0]]
+; CHECK-NEXT:    [[S1:%.*]] = select i1 [[CMP1]], float [[A1]], float [[B1]]
+; CHECK-NEXT:    [[S2:%.*]] = select i1 [[CMP2]], float [[A2]], float [[B2]]
+; CHECK-NEXT:    [[S3:%.*]] = select i1 [[CMP3]], float [[A3]], float [[B3]]
+; CHECK-NEXT:    [[RA:%.*]] = insertelement <4 x float> poison, float [[S0]], i32 0
+; CHECK-NEXT:    [[RB:%.*]] = insertelement <4 x float> [[RA]], float [[S1]], i32 1
+; CHECK-NEXT:    [[RC:%.*]] = insertelement <4 x float> [[RB]], float [[S2]], i32 2
+; CHECK-NEXT:    [[RD:%.*]] = insertelement <4 x float> [[RC]], float [[S3]], i32 3
+; CHECK-NEXT:    [[Q0:%.*]] = extractelement <4 x float> [[RD]], i32 0
+; CHECK-NEXT:    [[Q1:%.*]] = extractelement <4 x float> [[RD]], i32 1
+; CHECK-NEXT:    [[Q2:%.*]] = extractelement <4 x float> [[RD]], i32 2
+; CHECK-NEXT:    [[Q3:%.*]] = extractelement <4 x float> [[RD]], i32 3
+; CHECK-NEXT:    [[Q4:%.*]] = fadd float [[Q0]], [[Q1]]
+; CHECK-NEXT:    [[Q5:%.*]] = fadd float [[Q2]], [[Q3]]
+; CHECK-NEXT:    [[Q6:%.*]] = fadd float [[Q4]], [[Q5]]
+; CHECK-NEXT:    [[QI:%.*]] = fcmp olt float [[Q6]], [[Q5]]
+; CHECK-NEXT:    call void @llvm.assume(i1 [[QI]])
+; CHECK-NEXT:    ret <4 x float> undef
+;
+  %c0 = extractelement <4 x i32> %c, i32 0
+  %c1 = extractelement <4 x i32> %c, i32 1
+  %c2 = extractelement <4 x i32> %c, i32 2
+  %c3 = extractelement <4 x i32> %c, i32 3
+  %a0 = extractelement <4 x float> %a, i32 0
+  %a1 = extractelement <4 x float> %a, i32 1
+  %a2 = extractelement <4 x float> %a, i32 2
+  %a3 = extractelement <4 x float> %a, i32 3
+  %b0 = extractelement <4 x float> %b, i32 0
+  %b1 = extractelement <4 x float> %b, i32 1
+  %b2 = extractelement <4 x float> %b, i32 2
+  %b3 = extractelement <4 x float> %b, i32 3
+  %cmp0 = icmp ne i32 %c0, 0
+  %cmp1 = icmp ne i32 %c1, 0
+  %cmp2 = icmp ne i32 %c2, 0
+  %cmp3 = icmp ne i32 %c3, 0
+  %s0 = select i1 %cmp0, float %a0, float %b0
+  %s1 = select i1 %cmp1, float %a1, float %b1
+  %s2 = select i1 %cmp2, float %a2, float %b2
+  %s3 = select i1 %cmp3, float %a3, float %b3
+  %ra = insertelement <4 x float> poison, float %s0, i32 0
+  %rb = insertelement <4 x float> %ra, float %s1, i32 1
+  %rc = insertelement <4 x float> %rb, float %s2, i32 2
+  %rd = insertelement <4 x float> %rc, float %s3, i32 3
+  %q0 = extractelement <4 x float> %rd, i32 0
+  %q1 = extractelement <4 x float> %rd, i32 1
+  %q2 = extractelement <4 x float> %rd, i32 2
+  %q3 = extractelement <4 x float> %rd, i32 3
+  %q4 = fadd float %q0, %q1
+  %q5 = fadd float %q2, %q3
+  %q6 = fadd float %q4, %q5
+  %qi = fcmp olt float %q6, %q5
+  call void @llvm.assume(i1 %qi)
+  ret <4 x float> undef
+}
+
+; Insert in an order different from the vector indices to make sure it
+; doesn't matter
+define <4 x float> @simple_select_insert_out_of_order(<4 x float> %a, <4 x float> %b, <4 x i32> %c) #0 {
+; CHECK-LABEL: @simple_select_insert_out_of_order(
+; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[C:%.*]], <4 x i32> undef, <4 x i32> <i32 2, i32 1, i32 0, i32 3>
+; CHECK-NEXT:    [[SHUFFLE1:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> undef, <4 x i32> <i32 2, i32 1, i32 0, i32 3>
+; CHECK-NEXT:    [[SHUFFLE2:%.*]] = shufflevector <4 x float> [[B:%.*]], <4 x float> undef, <4 x i32> <i32 2, i32 1, i32 0, i32 3>
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ne <4 x i32> [[SHUFFLE]], zeroinitializer
+; CHECK-NEXT:    [[TMP2:%.*]] = select <4 x i1> [[TMP1]], <4 x float> [[SHUFFLE1]], <4 x float> [[SHUFFLE2]]
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[TMP2]], i32 2
+; CHECK-NEXT:    [[RA:%.*]] = insertelement <4 x float> poison, float [[TMP3]], i32 2
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[TMP2]], i32 1
+; CHECK-NEXT:    [[RB:%.*]] = insertelement <4 x float> [[RA]], float [[TMP4]], i32 1
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x float> [[TMP2]], i32 0
+; CHECK-NEXT:    [[RC:%.*]] = insertelement <4 x float> [[RB]], float [[TMP5]], i32 0
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x float> [[TMP2]], i32 3
+; CHECK-NEXT:    [[RD:%.*]] = insertelement <4 x float> [[RC]], float [[TMP6]], i32 3
+; CHECK-NEXT:    ret <4 x float> [[RD]]
+;
+  %c0 = extractelement <4 x i32> %c, i32 0
+  %c1 = extractelement <4 x i32> %c, i32 1
+  %c2 = extractelement <4 x i32> %c, i32 2
+  %c3 = extractelement <4 x i32> %c, i32 3
+  %a0 = extractelement <4 x float> %a, i32 0
+  %a1 = extractelement <4 x float> %a, i32 1
+  %a2 = extractelement <4 x float> %a, i32 2
+  %a3 = extractelement <4 x float> %a, i32 3
+  %b0 = extractelement <4 x float> %b, i32 0
+  %b1 = extractelement <4 x float> %b, i32 1
+  %b2 = extractelement <4 x float> %b, i32 2
+  %b3 = extractelement <4 x float> %b, i32 3
+  %cmp0 = icmp ne i32 %c0, 0
+  %cmp1 = icmp ne i32 %c1, 0
+  %cmp2 = icmp ne i32 %c2, 0
+  %cmp3 = icmp ne i32 %c3, 0
+  %s0 = select i1 %cmp0, float %a0, float %b0
+  %s1 = select i1 %cmp1, float %a1, float %b1
+  %s2 = select i1 %cmp2, float %a2, float %b2
+  %s3 = select i1 %cmp3, float %a3, float %b3
+  %ra = insertelement <4 x float> poison, float %s0, i32 2
+  %rb = insertelement <4 x float> %ra, float %s1, i32 1
+  %rc = insertelement <4 x float> %rb, float %s2, i32 0
+  %rd = insertelement <4 x float> %rc, float %s3, i32 3
+  ret <4 x float> %rd
+}
+
+declare void @v4f32_user(<4 x float>) #0
+declare void @f32_user(float) #0
+
+; Multiple users of the final constructed vector
+define <4 x float> @simple_select_users(<4 x float> %a, <4 x float> %b, <4 x i32> %c) #0 {
+; CHECK-LABEL: @simple_select_users(
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ne <4 x i32> [[C:%.*]], zeroinitializer
+; CHECK-NEXT:    [[TMP2:%.*]] = select <4 x i1> [[TMP1]], <4 x float> [[A:%.*]], <4 x float> [[B:%.*]]
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[TMP2]], i32 0
+; CHECK-NEXT:    [[RA:%.*]] = insertelement <4 x float> poison, float [[TMP3]], i32 0
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[TMP2]], i32 1
+; CHECK-NEXT:    [[RB:%.*]] = insertelement <4 x float> [[RA]], float [[TMP4]], i32 1
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x float> [[TMP2]], i32 2
+; CHECK-NEXT:    [[RC:%.*]] = insertelement <4 x float> [[RB]], float [[TMP5]], i32 2
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x float> [[TMP2]], i32 3
+; CHECK-NEXT:    [[RD:%.*]] = insertelement <4 x float> [[RC]], float [[TMP6]], i32 3
+; CHECK-NEXT:    call void @v4f32_user(<4 x float> [[RD]]) [[ATTR0:#.*]]
+; CHECK-NEXT:    ret <4 x float> [[RD]]
+;
+  %c0 = extractelement <4 x i32> %c, i32 0
+  %c1 = extractelement <4 x i32> %c, i32 1
+  %c2 = extractelement <4 x i32> %c, i32 2
+  %c3 = extractelement <4 x i32> %c, i32 3
+  %a0 = extractelement <4 x float> %a, i32 0
+  %a1 = extractelement <4 x float> %a, i32 1
+  %a2 = extractelement <4 x float> %a, i32 2
+  %a3 = extractelement <4 x float> %a, i32 3
+  %b0 = extractelement <4 x float> %b, i32 0
+  %b1 = extractelement <4 x float> %b, i32 1
+  %b2 = extractelement <4 x float> %b, i32 2
+  %b3 = extractelement <4 x float> %b, i32 3
+  %cmp0 = icmp ne i32 %c0, 0
+  %cmp1 = icmp ne i32 %c1, 0
+  %cmp2 = icmp ne i32 %c2, 0
+  %cmp3 = icmp ne i32 %c3, 0
+  %s0 = select i1 %cmp0, float %a0, float %b0
+  %s1 = select i1 %cmp1, float %a1, float %b1
+  %s2 = select i1 %cmp2, float %a2, float %b2
+  %s3 = select i1 %cmp3, float %a3, float %b3
+  %ra = insertelement <4 x float> poison, float %s0, i32 0
+  %rb = insertelement <4 x float> %ra, float %s1, i32 1
+  %rc = insertelement <4 x float> %rb, float %s2, i32 2
+  %rd = insertelement <4 x float> %rc, float %s3, i32 3
+  call void @v4f32_user(<4 x float> %rd) #0
+  ret <4 x float> %rd
+}
+
+; Unused insertelement
+define <4 x float> @simple_select_no_users(<4 x float> %a, <4 x float> %b, <4 x i32> %c) #0 {
+; CHECK-LABEL: @simple_select_no_users(
+; CHECK-NEXT:    [[C0:%.*]] = extractelement <4 x i32> [[C:%.*]], i32 0
+; CHECK-NEXT:    [[C1:%.*]] = extractelement <4 x i32> [[C]], i32 1
+; CHECK-NEXT:    [[C2:%.*]] = extractelement <4 x i32> [[C]], i32 2
+; CHECK-NEXT:    [[C3:%.*]] = extractelement <4 x i32> [[C]], i32 3
+; CHECK-NEXT:    [[A0:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0
+; CHECK-NEXT:    [[A1:%.*]] = extractelement <4 x float> [[A]], i32 1
+; CHECK-NEXT:    [[A2:%.*]] = extractelement <4 x float> [[A]], i32 2
+; CHECK-NEXT:    [[A3:%.*]] = extractelement <4 x float> [[A]], i32 3
+; CHECK-NEXT:    [[B0:%.*]] = extractelement <4 x float> [[B:%.*]], i32 0
+; CHECK-NEXT:    [[B1:%.*]] = extractelement <4 x float> [[B]], i32 1
+; CHECK-NEXT:    [[B2:%.*]] = extractelement <4 x float> [[B]], i32 2
+; CHECK-NEXT:    [[B3:%.*]] = extractelement <4 x float> [[B]], i32 3
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x i32> undef, i32 [[C0]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[C1]], i32 1
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp ne <2 x i32> [[TMP2]], zeroinitializer
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x i32> undef, i32 [[C2]], i32 0
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x i32> [[TMP4]], i32 [[C3]], i32 1
+; CHECK-NEXT:    [[TMP6:%.*]] = icmp ne <2 x i32> [[TMP5]], zeroinitializer
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <2 x float> undef, float [[A0]], i32 0
+; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <2 x float> [[TMP7]], float [[A1]], i32 1
+; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <2 x float> undef, float [[B0]], i32 0
+; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <2 x float> [[TMP9]], float [[B1]], i32 1
+; CHECK-NEXT:    [[TMP11:%.*]] = select <2 x i1> [[TMP3]], <2 x float> [[TMP8]], <2 x float> [[TMP10]]
+; CHECK-NEXT:    [[TMP12:%.*]] = insertelement <2 x float> undef, float [[A2]], i32 0
+; CHECK-NEXT:    [[TMP13:%.*]] = insertelement <2 x float> [[TMP12]], float [[A3]], i32 1
+; CHECK-NEXT:    [[TMP14:%.*]] = insertelement <2 x float> undef, float [[B2]], i32 0
+; CHECK-NEXT:    [[TMP15:%.*]] = insertelement <2 x float> [[TMP14]], float [[B3]], i32 1
+; CHECK-NEXT:    [[TMP16:%.*]] = select <2 x i1> [[TMP6]], <2 x float> [[TMP13]], <2 x float> [[TMP15]]
+; CHECK-NEXT:    [[TMP17:%.*]] = extractelement <2 x float> [[TMP11]], i32 0
+; CHECK-NEXT:    [[RA:%.*]] = insertelement <4 x float> poison, float [[TMP17]], i32 0
+; CHECK-NEXT:    [[TMP18:%.*]] = extractelement <2 x float> [[TMP11]], i32 1
+; CHECK-NEXT:    [[RB:%.*]] = insertelement <4 x float> [[RA]], float [[TMP18]], i32 1
+; CHECK-NEXT:    [[TMP19:%.*]] = extractelement <2 x float> [[TMP16]], i32 0
+; CHECK-NEXT:    [[RC:%.*]] = insertelement <4 x float> poison, float [[TMP19]], i32 2
+; CHECK-NEXT:    [[TMP20:%.*]] = extractelement <2 x float> [[TMP16]], i32 1
+; CHECK-NEXT:    [[RD:%.*]] = insertelement <4 x float> [[RC]], float [[TMP20]], i32 3
+; CHECK-NEXT:    ret <4 x float> [[RD]]
+;
+  %c0 = extractelement <4 x i32> %c, i32 0
+  %c1 = extractelement <4 x i32> %c, i32 1
+  %c2 = extractelement <4 x i32> %c, i32 2
+  %c3 = extractelement <4 x i32> %c, i32 3
+  %a0 = extractelement <4 x float> %a, i32 0
+  %a1 = extractelement <4 x float> %a, i32 1
+  %a2 = extractelement <4 x float> %a, i32 2
+  %a3 = extractelement <4 x float> %a, i32 3
+  %b0 = extractelement <4 x float> %b, i32 0
+  %b1 = extractelement <4 x float> %b, i32 1
+  %b2 = extractelement <4 x float> %b, i32 2
+  %b3 = extractelement <4 x float> %b, i32 3
+  %cmp0 = icmp ne i32 %c0, 0
+  %cmp1 = icmp ne i32 %c1, 0
+  %cmp2 = icmp ne i32 %c2, 0
+  %cmp3 = icmp ne i32 %c3, 0
+  %s0 = select i1 %cmp0, float %a0, float %b0
+  %s1 = select i1 %cmp1, float %a1, float %b1
+  %s2 = select i1 %cmp2, float %a2, float %b2
+  %s3 = select i1 %cmp3, float %a3, float %b3
+  %ra = insertelement <4 x float> poison, float %s0, i32 0
+  %rb = insertelement <4 x float> %ra, float %s1, i32 1
+  %rc = insertelement <4 x float> poison, float %s2, i32 2
+  %rd = insertelement <4 x float> %rc, float %s3, i32 3
+  ret <4 x float> %rd
+}
+
+; Make sure infinite loop doesn't happen which I ran into when trying
+; to do this backwards this backwards
+define <4 x i32> @reconstruct(<4 x i32> %c) #0 {
+; CHECK-LABEL: @reconstruct(
+; CHECK-NEXT:    [[C0:%.*]] = extractelement <4 x i32> [[C:%.*]], i32 0
+; CHECK-NEXT:    [[C1:%.*]] = extractelement <4 x i32> [[C]], i32 1
+; CHECK-NEXT:    [[C2:%.*]] = extractelement <4 x i32> [[C]], i32 2
+; CHECK-NEXT:    [[C3:%.*]] = extractelement <4 x i32> [[C]], i32 3
+; CHECK-NEXT:    [[RA:%.*]] = insertelement <4 x i32> poison, i32 [[C0]], i32 0
+; CHECK-NEXT:    [[RB:%.*]] = insertelement <4 x i32> [[RA]], i32 [[C1]], i32 1
+; CHECK-NEXT:    [[RC:%.*]] = insertelement <4 x i32> [[RB]], i32 [[C2]], i32 2
+; CHECK-NEXT:    [[RD:%.*]] = insertelement <4 x i32> [[RC]], i32 [[C3]], i32 3
+; CHECK-NEXT:    ret <4 x i32> [[RD]]
+;
+  %c0 = extractelement <4 x i32> %c, i32 0
+  %c1 = extractelement <4 x i32> %c, i32 1
+  %c2 = extractelement <4 x i32> %c, i32 2
+  %c3 = extractelement <4 x i32> %c, i32 3
+  %ra = insertelement <4 x i32> poison, i32 %c0, i32 0
+  %rb = insertelement <4 x i32> %ra, i32 %c1, i32 1
+  %rc = insertelement <4 x i32> %rb, i32 %c2, i32 2
+  %rd = insertelement <4 x i32> %rc, i32 %c3, i32 3
+  ret <4 x i32> %rd
+}
+
+define <2 x float> @simple_select_v2(<2 x float> %a, <2 x float> %b, <2 x i32> %c) #0 {
+; CHECK-LABEL: @simple_select_v2(
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ne <2 x i32> [[C:%.*]], zeroinitializer
+; CHECK-NEXT:    [[TMP2:%.*]] = select <2 x i1> [[TMP1]], <2 x float> [[A:%.*]], <2 x float> [[B:%.*]]
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 0
+; CHECK-NEXT:    [[RA:%.*]] = insertelement <2 x float> poison, float [[TMP3]], i32 0
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x float> [[TMP2]], i32 1
+; CHECK-NEXT:    [[RB:%.*]] = insertelement <2 x float> [[RA]], float [[TMP4]], i32 1
+; CHECK-NEXT:    ret <2 x float> [[RB]]
+;
+  %c0 = extractelement <2 x i32> %c, i32 0
+  %c1 = extractelement <2 x i32> %c, i32 1
+  %a0 = extractelement <2 x float> %a, i32 0
+  %a1 = extractelement <2 x float> %a, i32 1
+  %b0 = extractelement <2 x float> %b, i32 0
+  %b1 = extractelement <2 x float> %b, i32 1
+  %cmp0 = icmp ne i32 %c0, 0
+  %cmp1 = icmp ne i32 %c1, 0
+  %s0 = select i1 %cmp0, float %a0, float %b0
+  %s1 = select i1 %cmp1, float %a1, float %b1
+  %ra = insertelement <2 x float> poison, float %s0, i32 0
+  %rb = insertelement <2 x float> %ra, float %s1, i32 1
+  ret <2 x float> %rb
+}
+
+; Make sure when we construct partial vectors, we don't keep
+; re-visiting the insertelement chains starting with undef
+; (low cost threshold needed to force this to happen)
+define <4 x float> @simple_select_partial_vector(<4 x float> %a, <4 x float> %b, <4 x i32> %c) #0 {
+; CHECK-LABEL: @simple_select_partial_vector(
+; CHECK-NEXT:    [[C0:%.*]] = extractelement <4 x i32> [[C:%.*]], i32 0
+; CHECK-NEXT:    [[C1:%.*]] = extractelement <4 x i32> [[C]], i32 1
+; CHECK-NEXT:    [[A0:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0
+; CHECK-NEXT:    [[A1:%.*]] = extractelement <4 x float> [[A]], i32 1
+; CHECK-NEXT:    [[B0:%.*]] = extractelement <4 x float> [[B:%.*]], i32 0
+; CHECK-NEXT:    [[B1:%.*]] = extractelement <4 x float> [[B]], i32 1
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[C0]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[C1]], i32 1
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp ne <2 x i32> [[TMP2]], zeroinitializer
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x float> poison, float [[A0]], i32 0
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x float> [[TMP4]], float [[A1]], i32 1
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x float> poison, float [[B0]], i32 0
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <2 x float> [[TMP6]], float [[B1]], i32 1
+; CHECK-NEXT:    [[TMP8:%.*]] = select <2 x i1> [[TMP3]], <2 x float> [[TMP5]], <2 x float> [[TMP7]]
+; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <2 x float> [[TMP8]], i32 0
+; CHECK-NEXT:    [[RA:%.*]] = insertelement <4 x float> poison, float [[TMP9]], i32 0
+; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <2 x float> [[TMP8]], i32 1
+; CHECK-NEXT:    [[RB:%.*]] = insertelement <4 x float> [[RA]], float [[TMP10]], i32 1
+; CHECK-NEXT:    ret <4 x float> [[RB]]
+;
+  %c0 = extractelement <4 x i32> %c, i32 0
+  %c1 = extractelement <4 x i32> %c, i32 1
+  %a0 = extractelement <4 x float> %a, i32 0
+  %a1 = extractelement <4 x float> %a, i32 1
+  %b0 = extractelement <4 x float> %b, i32 0
+  %b1 = extractelement <4 x float> %b, i32 1
+  %1 = insertelement <2 x i32> poison, i32 %c0, i32 0
+  %2 = insertelement <2 x i32> %1, i32 %c1, i32 1
+  %3 = icmp ne <2 x i32> %2, zeroinitializer
+  %4 = insertelement <2 x float> poison, float %a0, i32 0
+  %5 = insertelement <2 x float> %4, float %a1, i32 1
+  %6 = insertelement <2 x float> poison, float %b0, i32 0
+  %7 = insertelement <2 x float> %6, float %b1, i32 1
+  %8 = select <2 x i1> %3, <2 x float> %5, <2 x float> %7
+  %9 = extractelement <2 x float> %8, i32 0
+  %ra = insertelement <4 x float> poison, float %9, i32 0
+  %10 = extractelement <2 x float> %8, i32 1
+  %rb = insertelement <4 x float> %ra, float %10, i32 1
+  ret <4 x float> %rb
+}
+
+; Make sure that vectorization happens even if insertelements operations
+; must be rescheduled. The case here is from compiling Julia.
+define <4 x float> @reschedule_extract(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: @reschedule_extract(
+; CHECK-NEXT:    [[TMP1:%.*]] = fadd <4 x float> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
+; CHECK-NEXT:    [[V0:%.*]] = insertelement <4 x float> poison, float [[TMP2]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1
+; CHECK-NEXT:    [[V1:%.*]] = insertelement <4 x float> [[V0]], float [[TMP3]], i32 1
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2
+; CHECK-NEXT:    [[V2:%.*]] = insertelement <4 x float> [[V1]], float [[TMP4]], i32 2
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
+; CHECK-NEXT:    [[V3:%.*]] = insertelement <4 x float> [[V2]], float [[TMP5]], i32 3
+; CHECK-NEXT:    ret <4 x float> [[V3]]
+;
+  %a0 = extractelement <4 x float> %a, i32 0
+  %b0 = extractelement <4 x float> %b, i32 0
+  %c0 = fadd float %a0, %b0
+  %v0 = insertelement <4 x float> poison, float %c0, i32 0
+  %a1 = extractelement <4 x float> %a, i32 1
+  %b1 = extractelement <4 x float> %b, i32 1
+  %c1 = fadd float %a1, %b1
+  %v1 = insertelement <4 x float> %v0, float %c1, i32 1
+  %a2 = extractelement <4 x float> %a, i32 2
+  %b2 = extractelement <4 x float> %b, i32 2
+  %c2 = fadd float %a2, %b2
+  %v2 = insertelement <4 x float> %v1, float %c2, i32 2
+  %a3 = extractelement <4 x float> %a, i32 3
+  %b3 = extractelement <4 x float> %b, i32 3
+  %c3 = fadd float %a3, %b3
+  %v3 = insertelement <4 x float> %v2, float %c3, i32 3
+  ret <4 x float> %v3
+}
+
+; Check that cost model for vectorization takes credit for
+; instructions that are erased.
+define <4 x float> @take_credit(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: @take_credit(
+; CHECK-NEXT:    [[TMP1:%.*]] = fadd <4 x float> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
+; CHECK-NEXT:    [[V0:%.*]] = insertelement <4 x float> poison, float [[TMP2]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1
+; CHECK-NEXT:    [[V1:%.*]] = insertelement <4 x float> [[V0]], float [[TMP3]], i32 1
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2
+; CHECK-NEXT:    [[V2:%.*]] = insertelement <4 x float> [[V1]], float [[TMP4]], i32 2
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
+; CHECK-NEXT:    [[V3:%.*]] = insertelement <4 x float> [[V2]], float [[TMP5]], i32 3
+; CHECK-NEXT:    ret <4 x float> [[V3]]
+;
+  %a0 = extractelement <4 x float> %a, i32 0
+  %b0 = extractelement <4 x float> %b, i32 0
+  %c0 = fadd float %a0, %b0
+  %a1 = extractelement <4 x float> %a, i32 1
+  %b1 = extractelement <4 x float> %b, i32 1
+  %c1 = fadd float %a1, %b1
+  %a2 = extractelement <4 x float> %a, i32 2
+  %b2 = extractelement <4 x float> %b, i32 2
+  %c2 = fadd float %a2, %b2
+  %a3 = extractelement <4 x float> %a, i32 3
+  %b3 = extractelement <4 x float> %b, i32 3
+  %c3 = fadd float %a3, %b3
+  %v0 = insertelement <4 x float> poison, float %c0, i32 0
+  %v1 = insertelement <4 x float> %v0, float %c1, i32 1
+  %v2 = insertelement <4 x float> %v1, float %c2, i32 2
+  %v3 = insertelement <4 x float> %v2, float %c3, i32 3
+  ret <4 x float> %v3
+}
+
+; Make sure we handle multiple trees that feed one build vector correctly.
+define <4 x double> @multi_tree(double %w, double %x, double %y, double %z) {
+; CHECK-LABEL: @multi_tree(
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x double> undef, double [[Z:%.*]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x double> [[TMP1]], double [[Y:%.*]], i32 1
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x double> [[TMP2]], double [[X:%.*]], i32 2
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x double> [[TMP3]], double [[W:%.*]], i32 3
+; CHECK-NEXT:    [[TMP5:%.*]] = fadd <4 x double> [[TMP4]], <double 3.000000e+00, double 2.000000e+00, double 1.000000e+00, double 0.000000e+00>
+; CHECK-NEXT:    [[TMP6:%.*]] = fmul <4 x double> [[TMP5]], <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00>
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <4 x double> [[TMP6]], i32 3
+; CHECK-NEXT:    [[I1:%.*]] = insertelement <4 x double> poison, double [[TMP7]], i32 3
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <4 x double> [[TMP6]], i32 2
+; CHECK-NEXT:    [[I2:%.*]] = insertelement <4 x double> [[I1]], double [[TMP8]], i32 2
+; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <4 x double> [[TMP6]], i32 1
+; CHECK-NEXT:    [[I3:%.*]] = insertelement <4 x double> [[I2]], double [[TMP9]], i32 1
+; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <4 x double> [[TMP6]], i32 0
+; CHECK-NEXT:    [[I4:%.*]] = insertelement <4 x double> [[I3]], double [[TMP10]], i32 0
+; CHECK-NEXT:    ret <4 x double> [[I4]]
+;
+  %t0 = fadd double %w , 0.000000e+00
+  %t1 = fadd double %x , 1.000000e+00
+  %t2 = fadd double %y , 2.000000e+00
+  %t3 = fadd double %z , 3.000000e+00
+  %t4 = fmul double %t0, 1.000000e+00
+  %i1 = insertelement <4 x double> poison, double %t4, i32 3
+  %t5 = fmul double %t1, 1.000000e+00
+  %i2 = insertelement <4 x double> %i1, double %t5, i32 2
+  %t6 = fmul double %t2, 1.000000e+00
+  %i3 = insertelement <4 x double> %i2, double %t6, i32 1
+  %t7 = fmul double %t3, 1.000000e+00
+  %i4 = insertelement <4 x double> %i3, double %t7, i32 0
+  ret <4 x double> %i4
+}
+
+define <8 x float> @_vadd256(<8 x float> %a, <8 x float> %b) local_unnamed_addr #0 {
+; CHECK-LABEL: @_vadd256(
+; CHECK-NEXT:    [[TMP1:%.*]] = fadd <8 x float> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <8 x float> [[TMP1]], i32 0
+; CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <8 x float> poison, float [[TMP2]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <8 x float> [[TMP1]], i32 1
+; CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <8 x float> [[VECINIT_I]], float [[TMP3]], i32 1
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <8 x float> [[TMP1]], i32 2
+; CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <8 x float> [[VECINIT1_I]], float [[TMP4]], i32 2
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <8 x float> [[TMP1]], i32 3
+; CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <8 x float> [[VECINIT2_I]], float [[TMP5]], i32 3
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <8 x float> [[TMP1]], i32 4
+; CHECK-NEXT:    [[VECINIT4_I:%.*]] = insertelement <8 x float> [[VECINIT3_I]], float [[TMP6]], i32 4
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <8 x float> [[TMP1]], i32 5
+; CHECK-NEXT:    [[VECINIT5_I:%.*]] = insertelement <8 x float> [[VECINIT4_I]], float [[TMP7]], i32 5
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <8 x float> [[TMP1]], i32 6
+; CHECK-NEXT:    [[VECINIT6_I:%.*]] = insertelement <8 x float> [[VECINIT5_I]], float [[TMP8]], i32 6
+; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <8 x float> [[TMP1]], i32 7
+; CHECK-NEXT:    [[VECINIT7_I:%.*]] = insertelement <8 x float> [[VECINIT6_I]], float [[TMP9]], i32 7
+; CHECK-NEXT:    ret <8 x float> [[VECINIT7_I]]
+;
+  %vecext = extractelement <8 x float> %a, i32 0
+  %vecext1 = extractelement <8 x float> %b, i32 0
+  %add = fadd float %vecext, %vecext1
+  %vecext2 = extractelement <8 x float> %a, i32 1
+  %vecext3 = extractelement <8 x float> %b, i32 1
+  %add4 = fadd float %vecext2, %vecext3
+  %vecext5 = extractelement <8 x float> %a, i32 2
+  %vecext6 = extractelement <8 x float> %b, i32 2
+  %add7 = fadd float %vecext5, %vecext6
+  %vecext8 = extractelement <8 x float> %a, i32 3
+  %vecext9 = extractelement <8 x float> %b, i32 3
+  %add10 = fadd float %vecext8, %vecext9
+  %vecext11 = extractelement <8 x float> %a, i32 4
+  %vecext12 = extractelement <8 x float> %b, i32 4
+  %add13 = fadd float %vecext11, %vecext12
+  %vecext14 = extractelement <8 x float> %a, i32 5
+  %vecext15 = extractelement <8 x float> %b, i32 5
+  %add16 = fadd float %vecext14, %vecext15
+  %vecext17 = extractelement <8 x float> %a, i32 6
+  %vecext18 = extractelement <8 x float> %b, i32 6
+  %add19 = fadd float %vecext17, %vecext18
+  %vecext20 = extractelement <8 x float> %a, i32 7
+  %vecext21 = extractelement <8 x float> %b, i32 7
+  %add22 = fadd float %vecext20, %vecext21
+  %vecinit.i = insertelement <8 x float> poison, float %add, i32 0
+  %vecinit1.i = insertelement <8 x float> %vecinit.i, float %add4, i32 1
+  %vecinit2.i = insertelement <8 x float> %vecinit1.i, float %add7, i32 2
+  %vecinit3.i = insertelement <8 x float> %vecinit2.i, float %add10, i32 3
+  %vecinit4.i = insertelement <8 x float> %vecinit3.i, float %add13, i32 4
+  %vecinit5.i = insertelement <8 x float> %vecinit4.i, float %add16, i32 5
+  %vecinit6.i = insertelement <8 x float> %vecinit5.i, float %add19, i32 6
+  %vecinit7.i = insertelement <8 x float> %vecinit6.i, float %add22, i32 7
+  ret <8 x float> %vecinit7.i
+}
+
+attributes #0 = { nounwind ssp uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
--- a/llvm/test/Transforms/SLPVectorizer/X86/load-merge-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/load-merge-inseltpoison.ll
@ -0,0 +1,208 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -slp-vectorizer -slp-vectorize-hor -slp-vectorize-hor-store -S < %s -mtriple=x86_64-apple-macosx -mcpu=haswell | FileCheck %s
+
+;unsigned load_le32(unsigned char *data) {
+;    unsigned le32 = (data[0]<<0) | (data[1]<<8) | (data[2]<<16) | (data[3]<<24);
+;    return le32;
+;}
+
+define i32 @_Z9load_le32Ph(i8* nocapture readonly %data) {
+; CHECK-LABEL: @_Z9load_le32Ph(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i8, i8* [[DATA:%.*]], align 1
+; CHECK-NEXT:    [[CONV:%.*]] = zext i8 [[TMP0]] to i32
+; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, i8* [[DATA]], i64 1
+; CHECK-NEXT:    [[TMP1:%.*]] = load i8, i8* [[ARRAYIDX1]], align 1
+; CHECK-NEXT:    [[CONV2:%.*]] = zext i8 [[TMP1]] to i32
+; CHECK-NEXT:    [[SHL3:%.*]] = shl nuw nsw i32 [[CONV2]], 8
+; CHECK-NEXT:    [[OR:%.*]] = or i32 [[SHL3]], [[CONV]]
+; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, i8* [[DATA]], i64 2
+; CHECK-NEXT:    [[TMP2:%.*]] = load i8, i8* [[ARRAYIDX4]], align 1
+; CHECK-NEXT:    [[CONV5:%.*]] = zext i8 [[TMP2]] to i32
+; CHECK-NEXT:    [[SHL6:%.*]] = shl nuw nsw i32 [[CONV5]], 16
+; CHECK-NEXT:    [[OR7:%.*]] = or i32 [[OR]], [[SHL6]]
+; CHECK-NEXT:    [[ARRAYIDX8:%.*]] = getelementptr inbounds i8, i8* [[DATA]], i64 3
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, i8* [[ARRAYIDX8]], align 1
+; CHECK-NEXT:    [[CONV9:%.*]] = zext i8 [[TMP3]] to i32
+; CHECK-NEXT:    [[SHL10:%.*]] = shl nuw i32 [[CONV9]], 24
+; CHECK-NEXT:    [[OR11:%.*]] = or i32 [[OR7]], [[SHL10]]
+; CHECK-NEXT:    ret i32 [[OR11]]
+;
+entry:
+  %0 = load i8, i8* %data, align 1
+  %conv = zext i8 %0 to i32
+  %arrayidx1 = getelementptr inbounds i8, i8* %data, i64 1
+  %1 = load i8, i8* %arrayidx1, align 1
+  %conv2 = zext i8 %1 to i32
+  %shl3 = shl nuw nsw i32 %conv2, 8
+  %or = or i32 %shl3, %conv
+  %arrayidx4 = getelementptr inbounds i8, i8* %data, i64 2
+  %2 = load i8, i8* %arrayidx4, align 1
+  %conv5 = zext i8 %2 to i32
+  %shl6 = shl nuw nsw i32 %conv5, 16
+  %or7 = or i32 %or, %shl6
+  %arrayidx8 = getelementptr inbounds i8, i8* %data, i64 3
+  %3 = load i8, i8* %arrayidx8, align 1
+  %conv9 = zext i8 %3 to i32
+  %shl10 = shl nuw i32 %conv9, 24
+  %or11 = or i32 %or7, %shl10
+  ret i32 %or11
+}
+
+define <4 x float> @PR16739_byref(<4 x float>* nocapture readonly dereferenceable(16) %x) {
+; CHECK-LABEL: @PR16739_byref(
+; CHECK-NEXT:    [[GEP0:%.*]] = getelementptr inbounds <4 x float>, <4 x float>* [[X:%.*]], i64 0, i64 0
+; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr inbounds <4 x float>, <4 x float>* [[X]], i64 0, i64 1
+; CHECK-NEXT:    [[GEP2:%.*]] = getelementptr inbounds <4 x float>, <4 x float>* [[X]], i64 0, i64 2
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast float* [[GEP0]] to <2 x float>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x float>, <2 x float>* [[TMP1]], align 4
+; CHECK-NEXT:    [[X2:%.*]] = load float, float* [[GEP2]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 0
+; CHECK-NEXT:    [[I0:%.*]] = insertelement <4 x float> poison, float [[TMP3]], i32 0
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x float> [[TMP2]], i32 1
+; CHECK-NEXT:    [[I1:%.*]] = insertelement <4 x float> [[I0]], float [[TMP4]], i32 1
+; CHECK-NEXT:    [[I2:%.*]] = insertelement <4 x float> [[I1]], float [[X2]], i32 2
+; CHECK-NEXT:    [[I3:%.*]] = insertelement <4 x float> [[I2]], float [[X2]], i32 3
+; CHECK-NEXT:    ret <4 x float> [[I3]]
+;
+  %gep0 = getelementptr inbounds <4 x float>, <4 x float>* %x, i64 0, i64 0
+  %gep1 = getelementptr inbounds <4 x float>, <4 x float>* %x, i64 0, i64 1
+  %gep2 = getelementptr inbounds <4 x float>, <4 x float>* %x, i64 0, i64 2
+  %x0 = load float, float* %gep0
+  %x1 = load float, float* %gep1
+  %x2 = load float, float* %gep2
+  %i0 = insertelement <4 x float> poison, float %x0, i32 0
+  %i1 = insertelement <4 x float> %i0, float %x1, i32 1
+  %i2 = insertelement <4 x float> %i1, float %x2, i32 2
+  %i3 = insertelement <4 x float> %i2, float %x2, i32 3
+  ret <4 x float> %i3
+}
+
+define <4 x float> @PR16739_byref_alt(<4 x float>* nocapture readonly dereferenceable(16) %x) {
+; CHECK-LABEL: @PR16739_byref_alt(
+; CHECK-NEXT:    [[GEP0:%.*]] = getelementptr inbounds <4 x float>, <4 x float>* [[X:%.*]], i64 0, i64 0
+; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr inbounds <4 x float>, <4 x float>* [[X]], i64 0, i64 1
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast float* [[GEP0]] to <2 x float>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x float>, <2 x float>* [[TMP1]], align 4
+; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> undef, <4 x i32> <i32 0, i32 0, i32 1, i32 1>
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[SHUFFLE]], i32 0
+; CHECK-NEXT:    [[I0:%.*]] = insertelement <4 x float> poison, float [[TMP3]], i32 0
+; CHECK-NEXT:    [[I1:%.*]] = insertelement <4 x float> [[I0]], float [[TMP3]], i32 1
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[SHUFFLE]], i32 2
+; CHECK-NEXT:    [[I2:%.*]] = insertelement <4 x float> [[I1]], float [[TMP4]], i32 2
+; CHECK-NEXT:    [[I3:%.*]] = insertelement <4 x float> [[I2]], float [[TMP4]], i32 3
+; CHECK-NEXT:    ret <4 x float> [[I3]]
+;
+  %gep0 = getelementptr inbounds <4 x float>, <4 x float>* %x, i64 0, i64 0
+  %gep1 = getelementptr inbounds <4 x float>, <4 x float>* %x, i64 0, i64 1
+  %x0 = load float, float* %gep0
+  %x1 = load float, float* %gep1
+  %i0 = insertelement <4 x float> poison, float %x0, i32 0
+  %i1 = insertelement <4 x float> %i0, float %x0, i32 1
+  %i2 = insertelement <4 x float> %i1, float %x1, i32 2
+  %i3 = insertelement <4 x float> %i2, float %x1, i32 3
+  ret <4 x float> %i3
+}
+
+define <4 x float> @PR16739_byval(<4 x float>* nocapture readonly dereferenceable(16) %x) {
+; CHECK-LABEL: @PR16739_byval(
+; CHECK-NEXT:    [[T0:%.*]] = bitcast <4 x float>* [[X:%.*]] to i64*
+; CHECK-NEXT:    [[T1:%.*]] = load i64, i64* [[T0]], align 16
+; CHECK-NEXT:    [[T2:%.*]] = getelementptr inbounds <4 x float>, <4 x float>* [[X]], i64 0, i64 2
+; CHECK-NEXT:    [[T3:%.*]] = bitcast float* [[T2]] to i64*
+; CHECK-NEXT:    [[T4:%.*]] = load i64, i64* [[T3]], align 8
+; CHECK-NEXT:    [[T5:%.*]] = trunc i64 [[T1]] to i32
+; CHECK-NEXT:    [[T6:%.*]] = bitcast i32 [[T5]] to float
+; CHECK-NEXT:    [[T7:%.*]] = insertelement <4 x float> poison, float [[T6]], i32 0
+; CHECK-NEXT:    [[T8:%.*]] = lshr i64 [[T1]], 32
+; CHECK-NEXT:    [[T9:%.*]] = trunc i64 [[T8]] to i32
+; CHECK-NEXT:    [[T10:%.*]] = bitcast i32 [[T9]] to float
+; CHECK-NEXT:    [[T11:%.*]] = insertelement <4 x float> [[T7]], float [[T10]], i32 1
+; CHECK-NEXT:    [[T12:%.*]] = trunc i64 [[T4]] to i32
+; CHECK-NEXT:    [[T13:%.*]] = bitcast i32 [[T12]] to float
+; CHECK-NEXT:    [[T14:%.*]] = insertelement <4 x float> [[T11]], float [[T13]], i32 2
+; CHECK-NEXT:    [[T15:%.*]] = insertelement <4 x float> [[T14]], float [[T13]], i32 3
+; CHECK-NEXT:    ret <4 x float> [[T15]]
+;
+  %t0 = bitcast <4 x float>* %x to i64*
+  %t1 = load i64, i64* %t0, align 16
+  %t2 = getelementptr inbounds <4 x float>, <4 x float>* %x, i64 0, i64 2
+  %t3 = bitcast float* %t2 to i64*
+  %t4 = load i64, i64* %t3, align 8
+  %t5 = trunc i64 %t1 to i32
+  %t6 = bitcast i32 %t5 to float
+  %t7 = insertelement <4 x float> poison, float %t6, i32 0
+  %t8 = lshr i64 %t1, 32
+  %t9 = trunc i64 %t8 to i32
+  %t10 = bitcast i32 %t9 to float
+  %t11 = insertelement <4 x float> %t7, float %t10, i32 1
+  %t12 = trunc i64 %t4 to i32
+  %t13 = bitcast i32 %t12 to float
+  %t14 = insertelement <4 x float> %t11, float %t13, i32 2
+  %t15 = insertelement <4 x float> %t14, float %t13, i32 3
+  ret <4 x float> %t15
+}
+
+define void @PR43578_prefer128(i32* %r, i64* %p, i64* %q) #0 {
+; CHECK-LABEL: @PR43578_prefer128(
+; CHECK-NEXT:    [[P0:%.*]] = getelementptr inbounds i64, i64* [[P:%.*]], i64 0
+; CHECK-NEXT:    [[P1:%.*]] = getelementptr inbounds i64, i64* [[P]], i64 1
+; CHECK-NEXT:    [[P2:%.*]] = getelementptr inbounds i64, i64* [[P]], i64 2
+; CHECK-NEXT:    [[P3:%.*]] = getelementptr inbounds i64, i64* [[P]], i64 3
+; CHECK-NEXT:    [[Q0:%.*]] = getelementptr inbounds i64, i64* [[Q:%.*]], i64 0
+; CHECK-NEXT:    [[Q1:%.*]] = getelementptr inbounds i64, i64* [[Q]], i64 1
+; CHECK-NEXT:    [[Q2:%.*]] = getelementptr inbounds i64, i64* [[Q]], i64 2
+; CHECK-NEXT:    [[Q3:%.*]] = getelementptr inbounds i64, i64* [[Q]], i64 3
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i64* [[P0]] to <2 x i64>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* [[TMP1]], align 2
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i64* [[P2]] to <2 x i64>*
+; CHECK-NEXT:    [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* [[TMP3]], align 2
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i64* [[Q0]] to <2 x i64>*
+; CHECK-NEXT:    [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* [[TMP5]], align 2
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i64* [[Q2]] to <2 x i64>*
+; CHECK-NEXT:    [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* [[TMP7]], align 2
+; CHECK-NEXT:    [[TMP9:%.*]] = sub nsw <2 x i64> [[TMP2]], [[TMP6]]
+; CHECK-NEXT:    [[TMP10:%.*]] = sub nsw <2 x i64> [[TMP4]], [[TMP8]]
+; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <2 x i64> [[TMP9]], i32 0
+; CHECK-NEXT:    [[G0:%.*]] = getelementptr inbounds i32, i32* [[R:%.*]], i64 [[TMP11]]
+; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <2 x i64> [[TMP9]], i32 1
+; CHECK-NEXT:    [[G1:%.*]] = getelementptr inbounds i32, i32* [[R]], i64 [[TMP12]]
+; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <2 x i64> [[TMP10]], i32 0
+; CHECK-NEXT:    [[G2:%.*]] = getelementptr inbounds i32, i32* [[R]], i64 [[TMP13]]
+; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <2 x i64> [[TMP10]], i32 1
+; CHECK-NEXT:    [[G3:%.*]] = getelementptr inbounds i32, i32* [[R]], i64 [[TMP14]]
+; CHECK-NEXT:    ret void
+;
+  %p0 = getelementptr inbounds i64, i64* %p, i64 0
+  %p1 = getelementptr inbounds i64, i64* %p, i64 1
+  %p2 = getelementptr inbounds i64, i64* %p, i64 2
+  %p3 = getelementptr inbounds i64, i64* %p, i64 3
+
+  %q0 = getelementptr inbounds i64, i64* %q, i64 0
+  %q1 = getelementptr inbounds i64, i64* %q, i64 1
+  %q2 = getelementptr inbounds i64, i64* %q, i64 2
+  %q3 = getelementptr inbounds i64, i64* %q, i64 3
+
+  %x0 = load i64, i64* %p0, align 2
+  %x1 = load i64, i64* %p1, align 2
+  %x2 = load i64, i64* %p2, align 2
+  %x3 = load i64, i64* %p3, align 2
+
+  %y0 = load i64, i64* %q0, align 2
+  %y1 = load i64, i64* %q1, align 2
+  %y2 = load i64, i64* %q2, align 2
+  %y3 = load i64, i64* %q3, align 2
+
+  %sub0 = sub nsw i64 %x0, %y0
+  %sub1 = sub nsw i64 %x1, %y1
+  %sub2 = sub nsw i64 %x2, %y2
+  %sub3 = sub nsw i64 %x3, %y3
+
+  %g0 = getelementptr inbounds i32, i32* %r, i64 %sub0
+  %g1 = getelementptr inbounds i32, i32* %r, i64 %sub1
+  %g2 = getelementptr inbounds i32, i32* %r, i64 %sub2
+  %g3 = getelementptr inbounds i32, i32* %r, i64 %sub3
+  ret void
+}
+
+attributes #0 = { "prefer-vector-width"="128" }
--- a/llvm/test/Transforms/SLPVectorizer/X86/pr31599-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/pr31599-inseltpoison.ll
@ -0,0 +1,30 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -slp-vectorizer -S -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s
+
+define <2 x float> @foo() {
+; CHECK-LABEL: @foo(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[SOURCE:%.*]] = insertelement <2 x float> poison, float undef, i32 0
+; CHECK-NEXT:    [[TMP0:%.*]] = fsub <2 x float> [[SOURCE]], [[SOURCE]]
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x float> [[TMP0]], i32 0
+; CHECK-NEXT:    [[RES1:%.*]] = insertelement <2 x float> poison, float [[TMP1]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x float> [[TMP0]], i32 1
+; CHECK-NEXT:    [[RES2:%.*]] = insertelement <2 x float> [[RES1]], float [[TMP2]], i32 1
+; CHECK-NEXT:    ret <2 x float> [[RES2]]
+;
+entry:
+  %source = insertelement <2 x float> poison, float undef, i32 0
+  %e0 = extractelement <2 x float> %source, i32 0
+  %e0.dup = extractelement <2 x float> %source, i32 0
+  %sub1 = fsub float %e0, %e0.dup
+  %e1 = extractelement <2 x float> %source, i32 1
+  %e1.dup = extractelement <2 x float> %source, i32 1
+  %sub2 = fsub float %e1, %e1.dup
+  %res1 = insertelement <2 x float> poison, float %sub1, i32 0
+  %res2 = insertelement <2 x float> %res1, float %sub2, i32 1
+  ret <2 x float> %res2
+}
+
+!llvm.ident = !{!0, !0}
+
+!0 = !{!"clang version 4.0.0 "}
--- a/llvm/test/Transforms/SLPVectorizer/X86/pr42022-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/pr42022-inseltpoison.ll
@ -0,0 +1,278 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -slp-vectorizer -S -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s
+
+; See https://reviews.llvm.org/D70068 and https://reviews.llvm.org/D70587 for context
+
+; Checks that vector insertvalues into the struct become SLP seeds.
+define { <2 x float>, <2 x float> } @StructOfVectors(float *%Ptr) {
+; CHECK-LABEL: @StructOfVectors(
+; CHECK-NEXT:    [[GEP0:%.*]] = getelementptr inbounds float, float* [[PTR:%.*]], i64 0
+; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr inbounds float, float* [[PTR]], i64 1
+; CHECK-NEXT:    [[GEP2:%.*]] = getelementptr inbounds float, float* [[PTR]], i64 2
+; CHECK-NEXT:    [[GEP3:%.*]] = getelementptr inbounds float, float* [[PTR]], i64 3
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast float* [[GEP0]] to <4 x float>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = fadd fast <4 x float> [[TMP2]], <float 1.100000e+01, float 1.200000e+01, float 1.300000e+01, float 1.400000e+01>
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[TMP3]], i32 0
+; CHECK-NEXT:    [[VECIN0:%.*]] = insertelement <2 x float> poison, float [[TMP4]], i64 0
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x float> [[TMP3]], i32 1
+; CHECK-NEXT:    [[VECIN1:%.*]] = insertelement <2 x float> [[VECIN0]], float [[TMP5]], i64 1
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x float> [[TMP3]], i32 2
+; CHECK-NEXT:    [[VECIN2:%.*]] = insertelement <2 x float> poison, float [[TMP6]], i64 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <4 x float> [[TMP3]], i32 3
+; CHECK-NEXT:    [[VECIN3:%.*]] = insertelement <2 x float> [[VECIN2]], float [[TMP7]], i64 1
+; CHECK-NEXT:    [[RET0:%.*]] = insertvalue { <2 x float>, <2 x float> } undef, <2 x float> [[VECIN1]], 0
+; CHECK-NEXT:    [[RET1:%.*]] = insertvalue { <2 x float>, <2 x float> } [[RET0]], <2 x float> [[VECIN3]], 1
+; CHECK-NEXT:    ret { <2 x float>, <2 x float> } [[RET1]]
+;
+  %GEP0 = getelementptr inbounds float, float* %Ptr, i64 0
+  %L0 = load float, float * %GEP0
+  %GEP1 = getelementptr inbounds float, float* %Ptr, i64 1
+  %L1 = load float, float * %GEP1
+  %GEP2 = getelementptr inbounds float, float* %Ptr, i64 2
+  %L2 = load float, float * %GEP2
+  %GEP3 = getelementptr inbounds float, float* %Ptr, i64 3
+  %L3 = load float, float * %GEP3
+
+  %Fadd0 = fadd fast float %L0, 1.1e+01
+  %Fadd1 = fadd fast float %L1, 1.2e+01
+  %Fadd2 = fadd fast float %L2, 1.3e+01
+  %Fadd3 = fadd fast float %L3, 1.4e+01
+
+  %VecIn0 = insertelement <2 x float> poison, float %Fadd0, i64 0
+  %VecIn1 = insertelement <2 x float> %VecIn0, float %Fadd1, i64 1
+
+  %VecIn2 = insertelement <2 x float> poison, float %Fadd2, i64 0
+  %VecIn3 = insertelement <2 x float> %VecIn2, float %Fadd3, i64 1
+
+  %Ret0 = insertvalue {<2 x float>, <2 x float>} undef, <2 x float> %VecIn1, 0
+  %Ret1 = insertvalue {<2 x float>, <2 x float>} %Ret0, <2 x float> %VecIn3, 1
+  ret {<2 x float>, <2 x float>} %Ret1
+}
+
+%StructTy = type { float, float}
+
+define [2 x %StructTy] @ArrayOfStruct(float *%Ptr) {
+; CHECK-LABEL: @ArrayOfStruct(
+; CHECK-NEXT:    [[GEP0:%.*]] = getelementptr inbounds float, float* [[PTR:%.*]], i64 0
+; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr inbounds float, float* [[PTR]], i64 1
+; CHECK-NEXT:    [[GEP2:%.*]] = getelementptr inbounds float, float* [[PTR]], i64 2
+; CHECK-NEXT:    [[GEP3:%.*]] = getelementptr inbounds float, float* [[PTR]], i64 3
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast float* [[GEP0]] to <4 x float>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = fadd fast <4 x float> [[TMP2]], <float 1.100000e+01, float 1.200000e+01, float 1.300000e+01, float 1.400000e+01>
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[TMP3]], i32 0
+; CHECK-NEXT:    [[STRUCTIN0:%.*]] = insertvalue [[STRUCTTY:%.*]] undef, float [[TMP4]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x float> [[TMP3]], i32 1
+; CHECK-NEXT:    [[STRUCTIN1:%.*]] = insertvalue [[STRUCTTY]] [[STRUCTIN0]], float [[TMP5]], 1
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x float> [[TMP3]], i32 2
+; CHECK-NEXT:    [[STRUCTIN2:%.*]] = insertvalue [[STRUCTTY]] undef, float [[TMP6]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <4 x float> [[TMP3]], i32 3
+; CHECK-NEXT:    [[STRUCTIN3:%.*]] = insertvalue [[STRUCTTY]] [[STRUCTIN2]], float [[TMP7]], 1
+; CHECK-NEXT:    [[RET0:%.*]] = insertvalue [2 x %StructTy] undef, [[STRUCTTY]] [[STRUCTIN1]], 0
+; CHECK-NEXT:    [[RET1:%.*]] = insertvalue [2 x %StructTy] [[RET0]], [[STRUCTTY]] [[STRUCTIN3]], 1
+; CHECK-NEXT:    ret [2 x %StructTy] [[RET1]]
+;
+  %GEP0 = getelementptr inbounds float, float* %Ptr, i64 0
+  %L0 = load float, float * %GEP0
+  %GEP1 = getelementptr inbounds float, float* %Ptr, i64 1
+  %L1 = load float, float * %GEP1
+  %GEP2 = getelementptr inbounds float, float* %Ptr, i64 2
+  %L2 = load float, float * %GEP2
+  %GEP3 = getelementptr inbounds float, float* %Ptr, i64 3
+  %L3 = load float, float * %GEP3
+
+  %Fadd0 = fadd fast float %L0, 1.1e+01
+  %Fadd1 = fadd fast float %L1, 1.2e+01
+  %Fadd2 = fadd fast float %L2, 1.3e+01
+  %Fadd3 = fadd fast float %L3, 1.4e+01
+
+  %StructIn0 = insertvalue %StructTy undef, float %Fadd0, 0
+  %StructIn1 = insertvalue %StructTy %StructIn0, float %Fadd1, 1
+
+  %StructIn2 = insertvalue %StructTy undef, float %Fadd2, 0
+  %StructIn3 = insertvalue %StructTy %StructIn2, float %Fadd3, 1
+
+  %Ret0 = insertvalue [2 x %StructTy] undef, %StructTy %StructIn1, 0
+  %Ret1 = insertvalue [2 x %StructTy] %Ret0, %StructTy %StructIn3, 1
+  ret [2 x %StructTy] %Ret1
+}
+
+define {%StructTy, %StructTy} @StructOfStruct(float *%Ptr) {
+; CHECK-LABEL: @StructOfStruct(
+; CHECK-NEXT:    [[GEP0:%.*]] = getelementptr inbounds float, float* [[PTR:%.*]], i64 0
+; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr inbounds float, float* [[PTR]], i64 1
+; CHECK-NEXT:    [[GEP2:%.*]] = getelementptr inbounds float, float* [[PTR]], i64 2
+; CHECK-NEXT:    [[GEP3:%.*]] = getelementptr inbounds float, float* [[PTR]], i64 3
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast float* [[GEP0]] to <4 x float>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = fadd fast <4 x float> [[TMP2]], <float 1.100000e+01, float 1.200000e+01, float 1.300000e+01, float 1.400000e+01>
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[TMP3]], i32 0
+; CHECK-NEXT:    [[STRUCTIN0:%.*]] = insertvalue [[STRUCTTY:%.*]] undef, float [[TMP4]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x float> [[TMP3]], i32 1
+; CHECK-NEXT:    [[STRUCTIN1:%.*]] = insertvalue [[STRUCTTY]] [[STRUCTIN0]], float [[TMP5]], 1
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x float> [[TMP3]], i32 2
+; CHECK-NEXT:    [[STRUCTIN2:%.*]] = insertvalue [[STRUCTTY]] undef, float [[TMP6]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <4 x float> [[TMP3]], i32 3
+; CHECK-NEXT:    [[STRUCTIN3:%.*]] = insertvalue [[STRUCTTY]] [[STRUCTIN2]], float [[TMP7]], 1
+; CHECK-NEXT:    [[RET0:%.*]] = insertvalue { [[STRUCTTY]], [[STRUCTTY]] } undef, [[STRUCTTY]] [[STRUCTIN1]], 0
+; CHECK-NEXT:    [[RET1:%.*]] = insertvalue { [[STRUCTTY]], [[STRUCTTY]] } [[RET0]], [[STRUCTTY]] [[STRUCTIN3]], 1
+; CHECK-NEXT:    ret { [[STRUCTTY]], [[STRUCTTY]] } [[RET1]]
+;
+  %GEP0 = getelementptr inbounds float, float* %Ptr, i64 0
+  %L0 = load float, float * %GEP0
+  %GEP1 = getelementptr inbounds float, float* %Ptr, i64 1
+  %L1 = load float, float * %GEP1
+  %GEP2 = getelementptr inbounds float, float* %Ptr, i64 2
+  %L2 = load float, float * %GEP2
+  %GEP3 = getelementptr inbounds float, float* %Ptr, i64 3
+  %L3 = load float, float * %GEP3
+
+  %Fadd0 = fadd fast float %L0, 1.1e+01
+  %Fadd1 = fadd fast float %L1, 1.2e+01
+  %Fadd2 = fadd fast float %L2, 1.3e+01
+  %Fadd3 = fadd fast float %L3, 1.4e+01
+
+  %StructIn0 = insertvalue %StructTy undef, float %Fadd0, 0
+  %StructIn1 = insertvalue %StructTy %StructIn0, float %Fadd1, 1
+
+  %StructIn2 = insertvalue %StructTy undef, float %Fadd2, 0
+  %StructIn3 = insertvalue %StructTy %StructIn2, float %Fadd3, 1
+
+  %Ret0 = insertvalue {%StructTy, %StructTy} undef, %StructTy %StructIn1, 0
+  %Ret1 = insertvalue {%StructTy, %StructTy} %Ret0, %StructTy %StructIn3, 1
+  ret {%StructTy, %StructTy} %Ret1
+}
+
+define {%StructTy, float, float} @NonHomogeneousStruct(float *%Ptr) {
+; CHECK-LABEL: @NonHomogeneousStruct(
+; CHECK-NEXT:    [[GEP0:%.*]] = getelementptr inbounds float, float* [[PTR:%.*]], i64 0
+; CHECK-NEXT:    [[L0:%.*]] = load float, float* [[GEP0]], align 4
+; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr inbounds float, float* [[PTR]], i64 1
+; CHECK-NEXT:    [[L1:%.*]] = load float, float* [[GEP1]], align 4
+; CHECK-NEXT:    [[GEP2:%.*]] = getelementptr inbounds float, float* [[PTR]], i64 2
+; CHECK-NEXT:    [[L2:%.*]] = load float, float* [[GEP2]], align 4
+; CHECK-NEXT:    [[GEP3:%.*]] = getelementptr inbounds float, float* [[PTR]], i64 3
+; CHECK-NEXT:    [[L3:%.*]] = load float, float* [[GEP3]], align 4
+; CHECK-NEXT:    [[FADD0:%.*]] = fadd fast float [[L0]], 1.100000e+01
+; CHECK-NEXT:    [[FADD1:%.*]] = fadd fast float [[L1]], 1.200000e+01
+; CHECK-NEXT:    [[FADD2:%.*]] = fadd fast float [[L2]], 1.300000e+01
+; CHECK-NEXT:    [[FADD3:%.*]] = fadd fast float [[L3]], 1.400000e+01
+; CHECK-NEXT:    [[STRUCTIN0:%.*]] = insertvalue [[STRUCTTY:%.*]] undef, float [[FADD0]], 0
+; CHECK-NEXT:    [[STRUCTIN1:%.*]] = insertvalue [[STRUCTTY]] [[STRUCTIN0]], float [[FADD1]], 1
+; CHECK-NEXT:    [[RET0:%.*]] = insertvalue { [[STRUCTTY]], float, float } undef, [[STRUCTTY]] [[STRUCTIN1]], 0
+; CHECK-NEXT:    [[RET1:%.*]] = insertvalue { [[STRUCTTY]], float, float } [[RET0]], float [[FADD2]], 1
+; CHECK-NEXT:    [[RET2:%.*]] = insertvalue { [[STRUCTTY]], float, float } [[RET1]], float [[FADD3]], 2
+; CHECK-NEXT:    ret { [[STRUCTTY]], float, float } [[RET2]]
+;
+  %GEP0 = getelementptr inbounds float, float* %Ptr, i64 0
+  %L0 = load float, float * %GEP0
+  %GEP1 = getelementptr inbounds float, float* %Ptr, i64 1
+  %L1 = load float, float * %GEP1
+  %GEP2 = getelementptr inbounds float, float* %Ptr, i64 2
+  %L2 = load float, float * %GEP2
+  %GEP3 = getelementptr inbounds float, float* %Ptr, i64 3
+  %L3 = load float, float * %GEP3
+
+  %Fadd0 = fadd fast float %L0, 1.1e+01
+  %Fadd1 = fadd fast float %L1, 1.2e+01
+  %Fadd2 = fadd fast float %L2, 1.3e+01
+  %Fadd3 = fadd fast float %L3, 1.4e+01
+
+  %StructIn0 = insertvalue %StructTy undef, float %Fadd0, 0
+  %StructIn1 = insertvalue %StructTy %StructIn0, float %Fadd1, 1
+
+  %Ret0 = insertvalue {%StructTy, float, float} undef, %StructTy %StructIn1, 0
+  %Ret1 = insertvalue {%StructTy, float, float} %Ret0, float %Fadd2, 1
+  %Ret2 = insertvalue {%StructTy, float, float} %Ret1, float %Fadd3, 2
+  ret {%StructTy, float, float} %Ret2
+}
+
+%Struct1Ty = type { i16, i16 }
+%Struct2Ty = type { %Struct1Ty, %Struct1Ty}
+
+define {%Struct2Ty, %Struct2Ty} @StructOfStructOfStruct(i16 *%Ptr) {
+; CHECK-LABEL: @StructOfStructOfStruct(
+; CHECK-NEXT:    [[GEP0:%.*]] = getelementptr inbounds i16, i16* [[PTR:%.*]], i64 0
+; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr inbounds i16, i16* [[PTR]], i64 1
+; CHECK-NEXT:    [[GEP2:%.*]] = getelementptr inbounds i16, i16* [[PTR]], i64 2
+; CHECK-NEXT:    [[GEP3:%.*]] = getelementptr inbounds i16, i16* [[PTR]], i64 3
+; CHECK-NEXT:    [[GEP4:%.*]] = getelementptr inbounds i16, i16* [[PTR]], i64 4
+; CHECK-NEXT:    [[GEP5:%.*]] = getelementptr inbounds i16, i16* [[PTR]], i64 5
+; CHECK-NEXT:    [[GEP6:%.*]] = getelementptr inbounds i16, i16* [[PTR]], i64 6
+; CHECK-NEXT:    [[GEP7:%.*]] = getelementptr inbounds i16, i16* [[PTR]], i64 7
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i16* [[GEP0]] to <8 x i16>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* [[TMP1]], align 2
+; CHECK-NEXT:    [[TMP3:%.*]] = add <8 x i16> [[TMP2]], <i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8>
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <8 x i16> [[TMP3]], i32 0
+; CHECK-NEXT:    [[STRUCTIN0:%.*]] = insertvalue [[STRUCT1TY:%.*]] undef, i16 [[TMP4]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <8 x i16> [[TMP3]], i32 1
+; CHECK-NEXT:    [[STRUCTIN1:%.*]] = insertvalue [[STRUCT1TY]] [[STRUCTIN0]], i16 [[TMP5]], 1
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <8 x i16> [[TMP3]], i32 2
+; CHECK-NEXT:    [[STRUCTIN2:%.*]] = insertvalue [[STRUCT1TY]] undef, i16 [[TMP6]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <8 x i16> [[TMP3]], i32 3
+; CHECK-NEXT:    [[STRUCTIN3:%.*]] = insertvalue [[STRUCT1TY]] [[STRUCTIN2]], i16 [[TMP7]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <8 x i16> [[TMP3]], i32 4
+; CHECK-NEXT:    [[STRUCTIN4:%.*]] = insertvalue [[STRUCT1TY]] undef, i16 [[TMP8]], 0
+; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <8 x i16> [[TMP3]], i32 5
+; CHECK-NEXT:    [[STRUCTIN5:%.*]] = insertvalue [[STRUCT1TY]] [[STRUCTIN4]], i16 [[TMP9]], 1
+; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <8 x i16> [[TMP3]], i32 6
+; CHECK-NEXT:    [[STRUCTIN6:%.*]] = insertvalue [[STRUCT1TY]] undef, i16 [[TMP10]], 0
+; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <8 x i16> [[TMP3]], i32 7
+; CHECK-NEXT:    [[STRUCTIN7:%.*]] = insertvalue [[STRUCT1TY]] [[STRUCTIN6]], i16 [[TMP11]], 1
+; CHECK-NEXT:    [[STRUCT2IN0:%.*]] = insertvalue [[STRUCT2TY:%.*]] undef, [[STRUCT1TY]] [[STRUCTIN1]], 0
+; CHECK-NEXT:    [[STRUCT2IN1:%.*]] = insertvalue [[STRUCT2TY]] [[STRUCT2IN0]], [[STRUCT1TY]] [[STRUCTIN3]], 1
+; CHECK-NEXT:    [[STRUCT2IN2:%.*]] = insertvalue [[STRUCT2TY]] undef, [[STRUCT1TY]] [[STRUCTIN5]], 0
+; CHECK-NEXT:    [[STRUCT2IN3:%.*]] = insertvalue [[STRUCT2TY]] [[STRUCT2IN2]], [[STRUCT1TY]] [[STRUCTIN7]], 1
+; CHECK-NEXT:    [[RET0:%.*]] = insertvalue { [[STRUCT2TY]], [[STRUCT2TY]] } undef, [[STRUCT2TY]] [[STRUCT2IN1]], 0
+; CHECK-NEXT:    [[RET1:%.*]] = insertvalue { [[STRUCT2TY]], [[STRUCT2TY]] } [[RET0]], [[STRUCT2TY]] [[STRUCT2IN3]], 1
+; CHECK-NEXT:    ret { [[STRUCT2TY]], [[STRUCT2TY]] } [[RET1]]
+;
+  %GEP0 = getelementptr inbounds i16, i16* %Ptr, i64 0
+  %L0 = load i16, i16 * %GEP0
+  %GEP1 = getelementptr inbounds i16, i16* %Ptr, i64 1
+  %L1 = load i16, i16 * %GEP1
+  %GEP2 = getelementptr inbounds i16, i16* %Ptr, i64 2
+  %L2 = load i16, i16 * %GEP2
+  %GEP3 = getelementptr inbounds i16, i16* %Ptr, i64 3
+  %L3 = load i16, i16 * %GEP3
+  %GEP4 = getelementptr inbounds i16, i16* %Ptr, i64 4
+  %L4 = load i16, i16 * %GEP4
+  %GEP5 = getelementptr inbounds i16, i16* %Ptr, i64 5
+  %L5 = load i16, i16 * %GEP5
+  %GEP6 = getelementptr inbounds i16, i16* %Ptr, i64 6
+  %L6 = load i16, i16 * %GEP6
+  %GEP7 = getelementptr inbounds i16, i16* %Ptr, i64 7
+  %L7 = load i16, i16 * %GEP7
+
+  %Fadd0 = add i16 %L0, 1
+  %Fadd1 = add i16 %L1, 2
+  %Fadd2 = add i16 %L2, 3
+  %Fadd3 = add i16 %L3, 4
+  %Fadd4 = add i16 %L4, 5
+  %Fadd5 = add i16 %L5, 6
+  %Fadd6 = add i16 %L6, 7
+  %Fadd7 = add i16 %L7, 8
+
+  %StructIn0 = insertvalue %Struct1Ty undef, i16 %Fadd0, 0
+  %StructIn1 = insertvalue %Struct1Ty %StructIn0, i16 %Fadd1, 1
+
+  %StructIn2 = insertvalue %Struct1Ty undef, i16 %Fadd2, 0
+  %StructIn3 = insertvalue %Struct1Ty %StructIn2, i16 %Fadd3, 1
+
+  %StructIn4 = insertvalue %Struct1Ty undef, i16 %Fadd4, 0
+  %StructIn5 = insertvalue %Struct1Ty %StructIn4, i16 %Fadd5, 1
+
+  %StructIn6 = insertvalue %Struct1Ty undef, i16 %Fadd6, 0
+  %StructIn7 = insertvalue %Struct1Ty %StructIn6, i16 %Fadd7, 1
+
+  %Struct2In0 = insertvalue %Struct2Ty undef, %Struct1Ty %StructIn1, 0
+  %Struct2In1 = insertvalue %Struct2Ty %Struct2In0, %Struct1Ty %StructIn3, 1
+
+  %Struct2In2 = insertvalue %Struct2Ty undef, %Struct1Ty %StructIn5, 0
+  %Struct2In3 = insertvalue %Struct2Ty %Struct2In2, %Struct1Ty %StructIn7, 1
+
+  %Ret0 = insertvalue {%Struct2Ty, %Struct2Ty} undef, %Struct2Ty %Struct2In1, 0
+  %Ret1 = insertvalue {%Struct2Ty, %Struct2Ty} %Ret0, %Struct2Ty %Struct2In3, 1
+  ret {%Struct2Ty, %Struct2Ty} %Ret1
+}
--- a/llvm/test/Transforms/SLPVectorizer/X86/pr44067-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/pr44067-inseltpoison.ll
@ -0,0 +1,118 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -slp-vectorizer -S -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s
+
+; See https://reviews.llvm.org/D83779
+
+define <2 x float> @foo({{float, float}}* %A) {
+; CHECK-LABEL: @foo(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast { { float, float } }* [[A:%.*]] to <2 x float>*
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x float>, <2 x float>* [[TMP0]], align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = fmul <2 x float> [[TMP1]], <float 2.000000e+00, float 2.000000e+00>
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 1
+; CHECK-NEXT:    [[INS1:%.*]] = insertelement <2 x float> poison, float [[TMP3]], i32 1
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x float> [[TMP2]], i32 0
+; CHECK-NEXT:    [[INS0:%.*]] = insertelement <2 x float> [[INS1]], float [[TMP4]], i32 0
+; CHECK-NEXT:    ret <2 x float> [[INS0]]
+;
+entry:
+  %0 = bitcast {{float, float}}* %A to <2 x float>*
+  %1 = load <2 x float>, <2 x float>* %0
+  %L0 = extractelement <2 x float> %1, i32 0
+  %L1 = extractelement <2 x float> %1, i32 1
+  %Mul0 = fmul float %L0, 2.000000e+00
+  %Mul1 = fmul float %L1, 2.000000e+00
+  %Ins1 = insertelement <2 x float> poison, float %Mul1, i32 1
+  %Ins0 = insertelement <2 x float> %Ins1, float %Mul0, i32 0
+  ret <2 x float> %Ins0
+}
+
+
+%Struct1Ty = type { i16, i16 }
+%Struct2Ty = type { %Struct1Ty, %Struct1Ty}
+
+define {%Struct2Ty, %Struct2Ty} @StructOfStructOfStruct(i16 *%Ptr) {
+; CHECK-LABEL: @StructOfStructOfStruct(
+; CHECK-NEXT:    [[GEP0:%.*]] = getelementptr inbounds i16, i16* [[PTR:%.*]], i64 0
+; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr inbounds i16, i16* [[PTR]], i64 1
+; CHECK-NEXT:    [[GEP2:%.*]] = getelementptr inbounds i16, i16* [[PTR]], i64 2
+; CHECK-NEXT:    [[GEP3:%.*]] = getelementptr inbounds i16, i16* [[PTR]], i64 3
+; CHECK-NEXT:    [[GEP4:%.*]] = getelementptr inbounds i16, i16* [[PTR]], i64 4
+; CHECK-NEXT:    [[GEP5:%.*]] = getelementptr inbounds i16, i16* [[PTR]], i64 5
+; CHECK-NEXT:    [[GEP6:%.*]] = getelementptr inbounds i16, i16* [[PTR]], i64 6
+; CHECK-NEXT:    [[GEP7:%.*]] = getelementptr inbounds i16, i16* [[PTR]], i64 7
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i16* [[GEP0]] to <8 x i16>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* [[TMP1]], align 2
+; CHECK-NEXT:    [[TMP3:%.*]] = add <8 x i16> [[TMP2]], <i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8>
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <8 x i16> [[TMP3]], i32 1
+; CHECK-NEXT:    [[STRUCTIN0:%.*]] = insertvalue [[STRUCT1TY:%.*]] undef, i16 [[TMP4]], 1
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <8 x i16> [[TMP3]], i32 0
+; CHECK-NEXT:    [[STRUCTIN1:%.*]] = insertvalue [[STRUCT1TY]] [[STRUCTIN0]], i16 [[TMP5]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <8 x i16> [[TMP3]], i32 2
+; CHECK-NEXT:    [[STRUCTIN2:%.*]] = insertvalue [[STRUCT1TY]] undef, i16 [[TMP6]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <8 x i16> [[TMP3]], i32 3
+; CHECK-NEXT:    [[STRUCTIN3:%.*]] = insertvalue [[STRUCT1TY]] [[STRUCTIN2]], i16 [[TMP7]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <8 x i16> [[TMP3]], i32 4
+; CHECK-NEXT:    [[STRUCTIN4:%.*]] = insertvalue [[STRUCT1TY]] undef, i16 [[TMP8]], 0
+; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <8 x i16> [[TMP3]], i32 5
+; CHECK-NEXT:    [[STRUCTIN5:%.*]] = insertvalue [[STRUCT1TY]] [[STRUCTIN4]], i16 [[TMP9]], 1
+; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <8 x i16> [[TMP3]], i32 7
+; CHECK-NEXT:    [[STRUCTIN6:%.*]] = insertvalue [[STRUCT1TY]] undef, i16 [[TMP10]], 1
+; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <8 x i16> [[TMP3]], i32 6
+; CHECK-NEXT:    [[STRUCTIN7:%.*]] = insertvalue [[STRUCT1TY]] [[STRUCTIN6]], i16 [[TMP11]], 0
+; CHECK-NEXT:    [[STRUCT2IN0:%.*]] = insertvalue [[STRUCT2TY:%.*]] undef, [[STRUCT1TY]] [[STRUCTIN1]], 0
+; CHECK-NEXT:    [[STRUCT2IN1:%.*]] = insertvalue [[STRUCT2TY]] [[STRUCT2IN0]], [[STRUCT1TY]] [[STRUCTIN3]], 1
+; CHECK-NEXT:    [[STRUCT2IN2:%.*]] = insertvalue [[STRUCT2TY]] undef, [[STRUCT1TY]] [[STRUCTIN5]], 0
+; CHECK-NEXT:    [[STRUCT2IN3:%.*]] = insertvalue [[STRUCT2TY]] [[STRUCT2IN2]], [[STRUCT1TY]] [[STRUCTIN7]], 1
+; CHECK-NEXT:    [[RET0:%.*]] = insertvalue { [[STRUCT2TY]], [[STRUCT2TY]] } undef, [[STRUCT2TY]] [[STRUCT2IN3]], 1
+; CHECK-NEXT:    [[RET1:%.*]] = insertvalue { [[STRUCT2TY]], [[STRUCT2TY]] } [[RET0]], [[STRUCT2TY]] [[STRUCT2IN1]], 0
+; CHECK-NEXT:    ret { [[STRUCT2TY]], [[STRUCT2TY]] } [[RET1]]
+;
+  %GEP0 = getelementptr inbounds i16, i16* %Ptr, i64 0
+  %L0 = load i16, i16 * %GEP0
+  %GEP1 = getelementptr inbounds i16, i16* %Ptr, i64 1
+  %L1 = load i16, i16 * %GEP1
+  %GEP2 = getelementptr inbounds i16, i16* %Ptr, i64 2
+  %L2 = load i16, i16 * %GEP2
+  %GEP3 = getelementptr inbounds i16, i16* %Ptr, i64 3
+  %L3 = load i16, i16 * %GEP3
+  %GEP4 = getelementptr inbounds i16, i16* %Ptr, i64 4
+  %L4 = load i16, i16 * %GEP4
+  %GEP5 = getelementptr inbounds i16, i16* %Ptr, i64 5
+  %L5 = load i16, i16 * %GEP5
+  %GEP6 = getelementptr inbounds i16, i16* %Ptr, i64 6
+  %L6 = load i16, i16 * %GEP6
+  %GEP7 = getelementptr inbounds i16, i16* %Ptr, i64 7
+  %L7 = load i16, i16 * %GEP7
+
+  %Fadd0 = add i16 %L0, 1
+  %Fadd1 = add i16 %L1, 2
+  %Fadd2 = add i16 %L2, 3
+  %Fadd3 = add i16 %L3, 4
+  %Fadd4 = add i16 %L4, 5
+  %Fadd5 = add i16 %L5, 6
+  %Fadd6 = add i16 %L6, 7
+  %Fadd7 = add i16 %L7, 8
+
+  %StructIn0 = insertvalue %Struct1Ty undef, i16 %Fadd1, 1
+  %StructIn1 = insertvalue %Struct1Ty %StructIn0, i16 %Fadd0, 0
+
+  %StructIn2 = insertvalue %Struct1Ty undef, i16 %Fadd2, 0
+  %StructIn3 = insertvalue %Struct1Ty %StructIn2, i16 %Fadd3, 1
+
+  %StructIn4 = insertvalue %Struct1Ty undef, i16 %Fadd4, 0
+  %StructIn5 = insertvalue %Struct1Ty %StructIn4, i16 %Fadd5, 1
+
+  %StructIn6 = insertvalue %Struct1Ty undef, i16 %Fadd7, 1
+  %StructIn7 = insertvalue %Struct1Ty %StructIn6, i16 %Fadd6, 0
+
+  %Struct2In0 = insertvalue %Struct2Ty undef, %Struct1Ty %StructIn1, 0
+  %Struct2In1 = insertvalue %Struct2Ty %Struct2In0, %Struct1Ty %StructIn3, 1
+
+  %Struct2In2 = insertvalue %Struct2Ty undef, %Struct1Ty %StructIn5, 0
+  %Struct2In3 = insertvalue %Struct2Ty %Struct2In2, %Struct1Ty %StructIn7, 1
+
+  %Ret0 = insertvalue {%Struct2Ty, %Struct2Ty} undef, %Struct2Ty %Struct2In3, 1
+  %Ret1 = insertvalue {%Struct2Ty, %Struct2Ty} %Ret0, %Struct2Ty %Struct2In1, 0
+  ret {%Struct2Ty, %Struct2Ty} %Ret1
+}
--- a/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll
@ -0,0 +1,664 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN:  opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux -mattr=+sse2     | FileCheck %s --check-prefixes=CHECK,SSE
+; RUN:  opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux -mattr=+avx      | FileCheck %s --check-prefixes=CHECK,AVX
+; RUN:  opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux -mattr=+avx2     | FileCheck %s --check-prefixes=CHECK,AVX2
+; RUN:  opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux -mattr=+avx512f  | FileCheck %s --check-prefixes=CHECK,AVX512
+; RUN:  opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux -mattr=+avx512vl | FileCheck %s --check-prefixes=CHECK,AVX512
+
+define void @gather_load(i32* noalias nocapture %0, i32* noalias nocapture readonly %1) {
+; CHECK-LABEL: @gather_load(
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP1:%.*]], i64 1
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, i32* [[TMP1]], align 4, [[TBAA0:!tbaa !.*]]
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 11
+; CHECK-NEXT:    [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4, [[TBAA0]]
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4
+; CHECK-NEXT:    [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4, [[TBAA0]]
+; CHECK-NEXT:    [[TMP9:%.*]] = load i32, i32* [[TMP3]], align 4, [[TBAA0]]
+; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i32 0
+; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 [[TMP6]], i32 1
+; CHECK-NEXT:    [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP8]], i32 2
+; CHECK-NEXT:    [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP9]], i32 3
+; CHECK-NEXT:    [[TMP14:%.*]] = add nsw <4 x i32> [[TMP13]], <i32 1, i32 2, i32 3, i32 4>
+; CHECK-NEXT:    [[TMP15:%.*]] = bitcast i32* [[TMP0:%.*]] to <4 x i32>*
+; CHECK-NEXT:    store <4 x i32> [[TMP14]], <4 x i32>* [[TMP15]], align 4, [[TBAA0]]
+; CHECK-NEXT:    ret void
+;
+  %3 = getelementptr inbounds i32, i32* %1, i64 1
+  %4 = load i32, i32* %1, align 4, !tbaa !2
+  %5 = getelementptr inbounds i32, i32* %0, i64 1
+  %6 = getelementptr inbounds i32, i32* %1, i64 11
+  %7 = load i32, i32* %6, align 4, !tbaa !2
+  %8 = getelementptr inbounds i32, i32* %0, i64 2
+  %9 = getelementptr inbounds i32, i32* %1, i64 4
+  %10 = load i32, i32* %9, align 4, !tbaa !2
+  %11 = getelementptr inbounds i32, i32* %0, i64 3
+  %12 = load i32, i32* %3, align 4, !tbaa !2
+  %13 = insertelement <4 x i32> poison, i32 %4, i32 0
+  %14 = insertelement <4 x i32> %13, i32 %7, i32 1
+  %15 = insertelement <4 x i32> %14, i32 %10, i32 2
+  %16 = insertelement <4 x i32> %15, i32 %12, i32 3
+  %17 = add nsw <4 x i32> %16, <i32 1, i32 2, i32 3, i32 4>
+  %18 = bitcast i32* %0 to <4 x i32>*
+  store <4 x i32> %17, <4 x i32>* %18, align 4, !tbaa !2
+  ret void
+}
+
+define void @gather_load_2(i32* noalias nocapture %0, i32* noalias nocapture readonly %1) {
+; SSE-LABEL: @gather_load_2(
+; SSE-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP1:%.*]], i64 1
+; SSE-NEXT:    [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4, [[TBAA0:!tbaa !.*]]
+; SSE-NEXT:    [[TMP5:%.*]] = add nsw i32 [[TMP4]], 1
+; SSE-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP0:%.*]], i64 1
+; SSE-NEXT:    store i32 [[TMP5]], i32* [[TMP0]], align 4, [[TBAA0]]
+; SSE-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 10
+; SSE-NEXT:    [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4, [[TBAA0]]
+; SSE-NEXT:    [[TMP9:%.*]] = add nsw i32 [[TMP8]], 2
+; SSE-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 2
+; SSE-NEXT:    store i32 [[TMP9]], i32* [[TMP6]], align 4, [[TBAA0]]
+; SSE-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 3
+; SSE-NEXT:    [[TMP12:%.*]] = load i32, i32* [[TMP11]], align 4, [[TBAA0]]
+; SSE-NEXT:    [[TMP13:%.*]] = add nsw i32 [[TMP12]], 3
+; SSE-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 3
+; SSE-NEXT:    store i32 [[TMP13]], i32* [[TMP10]], align 4, [[TBAA0]]
+; SSE-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 5
+; SSE-NEXT:    [[TMP16:%.*]] = load i32, i32* [[TMP15]], align 4, [[TBAA0]]
+; SSE-NEXT:    [[TMP17:%.*]] = add nsw i32 [[TMP16]], 4
+; SSE-NEXT:    store i32 [[TMP17]], i32* [[TMP14]], align 4, [[TBAA0]]
+; SSE-NEXT:    ret void
+;
+; AVX-LABEL: @gather_load_2(
+; AVX-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP1:%.*]], i64 1
+; AVX-NEXT:    [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4, [[TBAA0:!tbaa !.*]]
+; AVX-NEXT:    [[TMP5:%.*]] = add nsw i32 [[TMP4]], 1
+; AVX-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP0:%.*]], i64 1
+; AVX-NEXT:    store i32 [[TMP5]], i32* [[TMP0]], align 4, [[TBAA0]]
+; AVX-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 10
+; AVX-NEXT:    [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4, [[TBAA0]]
+; AVX-NEXT:    [[TMP9:%.*]] = add nsw i32 [[TMP8]], 2
+; AVX-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 2
+; AVX-NEXT:    store i32 [[TMP9]], i32* [[TMP6]], align 4, [[TBAA0]]
+; AVX-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 3
+; AVX-NEXT:    [[TMP12:%.*]] = load i32, i32* [[TMP11]], align 4, [[TBAA0]]
+; AVX-NEXT:    [[TMP13:%.*]] = add nsw i32 [[TMP12]], 3
+; AVX-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 3
+; AVX-NEXT:    store i32 [[TMP13]], i32* [[TMP10]], align 4, [[TBAA0]]
+; AVX-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 5
+; AVX-NEXT:    [[TMP16:%.*]] = load i32, i32* [[TMP15]], align 4, [[TBAA0]]
+; AVX-NEXT:    [[TMP17:%.*]] = add nsw i32 [[TMP16]], 4
+; AVX-NEXT:    store i32 [[TMP17]], i32* [[TMP14]], align 4, [[TBAA0]]
+; AVX-NEXT:    ret void
+;
+; AVX2-LABEL: @gather_load_2(
+; AVX2-NEXT:    [[TMP3:%.*]] = insertelement <4 x i32*> undef, i32* [[TMP1:%.*]], i32 0
+; AVX2-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i32*> [[TMP3]], <4 x i32*> undef, <4 x i32> zeroinitializer
+; AVX2-NEXT:    [[TMP5:%.*]] = getelementptr i32, <4 x i32*> [[TMP4]], <4 x i64> <i64 1, i64 10, i64 3, i64 5>
+; AVX2-NEXT:    [[TMP6:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP5]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef), [[TBAA0:!tbaa !.*]]
+; AVX2-NEXT:    [[TMP7:%.*]] = add nsw <4 x i32> [[TMP6]], <i32 1, i32 2, i32 3, i32 4>
+; AVX2-NEXT:    [[TMP8:%.*]] = bitcast i32* [[TMP0:%.*]] to <4 x i32>*
+; AVX2-NEXT:    store <4 x i32> [[TMP7]], <4 x i32>* [[TMP8]], align 4, [[TBAA0]]
+; AVX2-NEXT:    ret void
+;
+; AVX512-LABEL: @gather_load_2(
+; AVX512-NEXT:    [[TMP3:%.*]] = insertelement <4 x i32*> undef, i32* [[TMP1:%.*]], i32 0
+; AVX512-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i32*> [[TMP3]], <4 x i32*> undef, <4 x i32> zeroinitializer
+; AVX512-NEXT:    [[TMP5:%.*]] = getelementptr i32, <4 x i32*> [[TMP4]], <4 x i64> <i64 1, i64 10, i64 3, i64 5>
+; AVX512-NEXT:    [[TMP6:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP5]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef), [[TBAA0:!tbaa !.*]]
+; AVX512-NEXT:    [[TMP7:%.*]] = add nsw <4 x i32> [[TMP6]], <i32 1, i32 2, i32 3, i32 4>
+; AVX512-NEXT:    [[TMP8:%.*]] = bitcast i32* [[TMP0:%.*]] to <4 x i32>*
+; AVX512-NEXT:    store <4 x i32> [[TMP7]], <4 x i32>* [[TMP8]], align 4, [[TBAA0]]
+; AVX512-NEXT:    ret void
+;
+  %3 = getelementptr inbounds i32, i32* %1, i64 1
+  %4 = load i32, i32* %3, align 4, !tbaa !2
+  %5 = add nsw i32 %4, 1
+  %6 = getelementptr inbounds i32, i32* %0, i64 1
+  store i32 %5, i32* %0, align 4, !tbaa !2
+  %7 = getelementptr inbounds i32, i32* %1, i64 10
+  %8 = load i32, i32* %7, align 4, !tbaa !2
+  %9 = add nsw i32 %8, 2
+  %10 = getelementptr inbounds i32, i32* %0, i64 2
+  store i32 %9, i32* %6, align 4, !tbaa !2
+  %11 = getelementptr inbounds i32, i32* %1, i64 3
+  %12 = load i32, i32* %11, align 4, !tbaa !2
+  %13 = add nsw i32 %12, 3
+  %14 = getelementptr inbounds i32, i32* %0, i64 3
+  store i32 %13, i32* %10, align 4, !tbaa !2
+  %15 = getelementptr inbounds i32, i32* %1, i64 5
+  %16 = load i32, i32* %15, align 4, !tbaa !2
+  %17 = add nsw i32 %16, 4
+  store i32 %17, i32* %14, align 4, !tbaa !2
+  ret void
+}
+
+
+define void @gather_load_3(i32* noalias nocapture %0, i32* noalias nocapture readonly %1) {
+; SSE-LABEL: @gather_load_3(
+; SSE-NEXT:    [[TMP3:%.*]] = load i32, i32* [[TMP1:%.*]], align 4, [[TBAA0]]
+; SSE-NEXT:    [[TMP4:%.*]] = add i32 [[TMP3]], 1
+; SSE-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP0:%.*]], i64 1
+; SSE-NEXT:    store i32 [[TMP4]], i32* [[TMP0]], align 4, [[TBAA0]]
+; SSE-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 11
+; SSE-NEXT:    [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4, [[TBAA0]]
+; SSE-NEXT:    [[TMP8:%.*]] = add i32 [[TMP7]], 2
+; SSE-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 2
+; SSE-NEXT:    store i32 [[TMP8]], i32* [[TMP5]], align 4, [[TBAA0]]
+; SSE-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4
+; SSE-NEXT:    [[TMP11:%.*]] = load i32, i32* [[TMP10]], align 4, [[TBAA0]]
+; SSE-NEXT:    [[TMP12:%.*]] = add i32 [[TMP11]], 3
+; SSE-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 3
+; SSE-NEXT:    store i32 [[TMP12]], i32* [[TMP9]], align 4, [[TBAA0]]
+; SSE-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 15
+; SSE-NEXT:    [[TMP15:%.*]] = load i32, i32* [[TMP14]], align 4, [[TBAA0]]
+; SSE-NEXT:    [[TMP16:%.*]] = add i32 [[TMP15]], 4
+; SSE-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 4
+; SSE-NEXT:    store i32 [[TMP16]], i32* [[TMP13]], align 4, [[TBAA0]]
+; SSE-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 18
+; SSE-NEXT:    [[TMP19:%.*]] = load i32, i32* [[TMP18]], align 4, [[TBAA0]]
+; SSE-NEXT:    [[TMP20:%.*]] = add i32 [[TMP19]], 1
+; SSE-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 5
+; SSE-NEXT:    store i32 [[TMP20]], i32* [[TMP17]], align 4, [[TBAA0]]
+; SSE-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 9
+; SSE-NEXT:    [[TMP23:%.*]] = load i32, i32* [[TMP22]], align 4, [[TBAA0]]
+; SSE-NEXT:    [[TMP24:%.*]] = add i32 [[TMP23]], 2
+; SSE-NEXT:    [[TMP25:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 6
+; SSE-NEXT:    store i32 [[TMP24]], i32* [[TMP21]], align 4, [[TBAA0]]
+; SSE-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 6
+; SSE-NEXT:    [[TMP27:%.*]] = load i32, i32* [[TMP26]], align 4, [[TBAA0]]
+; SSE-NEXT:    [[TMP28:%.*]] = add i32 [[TMP27]], 3
+; SSE-NEXT:    [[TMP29:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 7
+; SSE-NEXT:    store i32 [[TMP28]], i32* [[TMP25]], align 4, [[TBAA0]]
+; SSE-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 21
+; SSE-NEXT:    [[TMP31:%.*]] = load i32, i32* [[TMP30]], align 4, [[TBAA0]]
+; SSE-NEXT:    [[TMP32:%.*]] = add i32 [[TMP31]], 4
+; SSE-NEXT:    store i32 [[TMP32]], i32* [[TMP29]], align 4, [[TBAA0]]
+; SSE-NEXT:    ret void
+;
+; AVX-LABEL: @gather_load_3(
+; AVX-NEXT:    [[TMP3:%.*]] = load i32, i32* [[TMP1:%.*]], align 4, [[TBAA0]]
+; AVX-NEXT:    [[TMP4:%.*]] = add i32 [[TMP3]], 1
+; AVX-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP0:%.*]], i64 1
+; AVX-NEXT:    store i32 [[TMP4]], i32* [[TMP0]], align 4, [[TBAA0]]
+; AVX-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 11
+; AVX-NEXT:    [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4, [[TBAA0]]
+; AVX-NEXT:    [[TMP8:%.*]] = add i32 [[TMP7]], 2
+; AVX-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 2
+; AVX-NEXT:    store i32 [[TMP8]], i32* [[TMP5]], align 4, [[TBAA0]]
+; AVX-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4
+; AVX-NEXT:    [[TMP11:%.*]] = load i32, i32* [[TMP10]], align 4, [[TBAA0]]
+; AVX-NEXT:    [[TMP12:%.*]] = add i32 [[TMP11]], 3
+; AVX-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 3
+; AVX-NEXT:    store i32 [[TMP12]], i32* [[TMP9]], align 4, [[TBAA0]]
+; AVX-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 15
+; AVX-NEXT:    [[TMP15:%.*]] = load i32, i32* [[TMP14]], align 4, [[TBAA0]]
+; AVX-NEXT:    [[TMP16:%.*]] = add i32 [[TMP15]], 4
+; AVX-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 4
+; AVX-NEXT:    store i32 [[TMP16]], i32* [[TMP13]], align 4, [[TBAA0]]
+; AVX-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 18
+; AVX-NEXT:    [[TMP19:%.*]] = load i32, i32* [[TMP18]], align 4, [[TBAA0]]
+; AVX-NEXT:    [[TMP20:%.*]] = add i32 [[TMP19]], 1
+; AVX-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 5
+; AVX-NEXT:    store i32 [[TMP20]], i32* [[TMP17]], align 4, [[TBAA0]]
+; AVX-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 9
+; AVX-NEXT:    [[TMP23:%.*]] = load i32, i32* [[TMP22]], align 4, [[TBAA0]]
+; AVX-NEXT:    [[TMP24:%.*]] = add i32 [[TMP23]], 2
+; AVX-NEXT:    [[TMP25:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 6
+; AVX-NEXT:    store i32 [[TMP24]], i32* [[TMP21]], align 4, [[TBAA0]]
+; AVX-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 6
+; AVX-NEXT:    [[TMP27:%.*]] = load i32, i32* [[TMP26]], align 4, [[TBAA0]]
+; AVX-NEXT:    [[TMP28:%.*]] = add i32 [[TMP27]], 3
+; AVX-NEXT:    [[TMP29:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 7
+; AVX-NEXT:    store i32 [[TMP28]], i32* [[TMP25]], align 4, [[TBAA0]]
+; AVX-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 21
+; AVX-NEXT:    [[TMP31:%.*]] = load i32, i32* [[TMP30]], align 4, [[TBAA0]]
+; AVX-NEXT:    [[TMP32:%.*]] = add i32 [[TMP31]], 4
+; AVX-NEXT:    store i32 [[TMP32]], i32* [[TMP29]], align 4, [[TBAA0]]
+; AVX-NEXT:    ret void
+;
+; AVX2-LABEL: @gather_load_3(
+; AVX2-NEXT:    [[TMP3:%.*]] = load i32, i32* [[TMP1:%.*]], align 4, [[TBAA0]]
+; AVX2-NEXT:    [[TMP4:%.*]] = add i32 [[TMP3]], 1
+; AVX2-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP0:%.*]], i64 1
+; AVX2-NEXT:    store i32 [[TMP4]], i32* [[TMP0]], align 4, [[TBAA0]]
+; AVX2-NEXT:    [[TMP6:%.*]] = insertelement <4 x i32*> undef, i32* [[TMP1]], i32 0
+; AVX2-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i32*> [[TMP6]], <4 x i32*> undef, <4 x i32> zeroinitializer
+; AVX2-NEXT:    [[TMP8:%.*]] = getelementptr i32, <4 x i32*> [[TMP7]], <4 x i64> <i64 11, i64 4, i64 15, i64 18>
+; AVX2-NEXT:    [[TMP9:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP8]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef), [[TBAA0]]
+; AVX2-NEXT:    [[TMP10:%.*]] = add <4 x i32> [[TMP9]], <i32 2, i32 3, i32 4, i32 1>
+; AVX2-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 5
+; AVX2-NEXT:    [[TMP12:%.*]] = bitcast i32* [[TMP5]] to <4 x i32>*
+; AVX2-NEXT:    store <4 x i32> [[TMP10]], <4 x i32>* [[TMP12]], align 4, [[TBAA0]]
+; AVX2-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 9
+; AVX2-NEXT:    [[TMP14:%.*]] = load i32, i32* [[TMP13]], align 4, [[TBAA0]]
+; AVX2-NEXT:    [[TMP15:%.*]] = add i32 [[TMP14]], 2
+; AVX2-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 6
+; AVX2-NEXT:    store i32 [[TMP15]], i32* [[TMP11]], align 4, [[TBAA0]]
+; AVX2-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 6
+; AVX2-NEXT:    [[TMP18:%.*]] = load i32, i32* [[TMP17]], align 4, [[TBAA0]]
+; AVX2-NEXT:    [[TMP19:%.*]] = add i32 [[TMP18]], 3
+; AVX2-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 7
+; AVX2-NEXT:    store i32 [[TMP19]], i32* [[TMP16]], align 4, [[TBAA0]]
+; AVX2-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 21
+; AVX2-NEXT:    [[TMP22:%.*]] = load i32, i32* [[TMP21]], align 4, [[TBAA0]]
+; AVX2-NEXT:    [[TMP23:%.*]] = add i32 [[TMP22]], 4
+; AVX2-NEXT:    store i32 [[TMP23]], i32* [[TMP20]], align 4, [[TBAA0]]
+; AVX2-NEXT:    ret void
+;
+; AVX512-LABEL: @gather_load_3(
+; AVX512-NEXT:    [[TMP3:%.*]] = load i32, i32* [[TMP1:%.*]], align 4, [[TBAA0]]
+; AVX512-NEXT:    [[TMP4:%.*]] = add i32 [[TMP3]], 1
+; AVX512-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP0:%.*]], i64 1
+; AVX512-NEXT:    store i32 [[TMP4]], i32* [[TMP0]], align 4, [[TBAA0]]
+; AVX512-NEXT:    [[TMP6:%.*]] = insertelement <4 x i32*> undef, i32* [[TMP1]], i32 0
+; AVX512-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i32*> [[TMP6]], <4 x i32*> undef, <4 x i32> zeroinitializer
+; AVX512-NEXT:    [[TMP8:%.*]] = getelementptr i32, <4 x i32*> [[TMP7]], <4 x i64> <i64 11, i64 4, i64 15, i64 18>
+; AVX512-NEXT:    [[TMP9:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP8]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef), [[TBAA0]]
+; AVX512-NEXT:    [[TMP10:%.*]] = add <4 x i32> [[TMP9]], <i32 2, i32 3, i32 4, i32 1>
+; AVX512-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 5
+; AVX512-NEXT:    [[TMP12:%.*]] = bitcast i32* [[TMP5]] to <4 x i32>*
+; AVX512-NEXT:    store <4 x i32> [[TMP10]], <4 x i32>* [[TMP12]], align 4, [[TBAA0]]
+; AVX512-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 9
+; AVX512-NEXT:    [[TMP14:%.*]] = load i32, i32* [[TMP13]], align 4, [[TBAA0]]
+; AVX512-NEXT:    [[TMP15:%.*]] = add i32 [[TMP14]], 2
+; AVX512-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 6
+; AVX512-NEXT:    store i32 [[TMP15]], i32* [[TMP11]], align 4, [[TBAA0]]
+; AVX512-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 6
+; AVX512-NEXT:    [[TMP18:%.*]] = load i32, i32* [[TMP17]], align 4, [[TBAA0]]
+; AVX512-NEXT:    [[TMP19:%.*]] = add i32 [[TMP18]], 3
+; AVX512-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 7
+; AVX512-NEXT:    store i32 [[TMP19]], i32* [[TMP16]], align 4, [[TBAA0]]
+; AVX512-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 21
+; AVX512-NEXT:    [[TMP22:%.*]] = load i32, i32* [[TMP21]], align 4, [[TBAA0]]
+; AVX512-NEXT:    [[TMP23:%.*]] = add i32 [[TMP22]], 4
+; AVX512-NEXT:    store i32 [[TMP23]], i32* [[TMP20]], align 4, [[TBAA0]]
+; AVX512-NEXT:    ret void
+;
+  %3 = load i32, i32* %1, align 4, !tbaa !2
+  %4 = add i32 %3, 1
+  %5 = getelementptr inbounds i32, i32* %0, i64 1
+  store i32 %4, i32* %0, align 4, !tbaa !2
+  %6 = getelementptr inbounds i32, i32* %1, i64 11
+  %7 = load i32, i32* %6, align 4, !tbaa !2
+  %8 = add i32 %7, 2
+  %9 = getelementptr inbounds i32, i32* %0, i64 2
+  store i32 %8, i32* %5, align 4, !tbaa !2
+  %10 = getelementptr inbounds i32, i32* %1, i64 4
+  %11 = load i32, i32* %10, align 4, !tbaa !2
+  %12 = add i32 %11, 3
+  %13 = getelementptr inbounds i32, i32* %0, i64 3
+  store i32 %12, i32* %9, align 4, !tbaa !2
+  %14 = getelementptr inbounds i32, i32* %1, i64 15
+  %15 = load i32, i32* %14, align 4, !tbaa !2
+  %16 = add i32 %15, 4
+  %17 = getelementptr inbounds i32, i32* %0, i64 4
+  store i32 %16, i32* %13, align 4, !tbaa !2
+  %18 = getelementptr inbounds i32, i32* %1, i64 18
+  %19 = load i32, i32* %18, align 4, !tbaa !2
+  %20 = add i32 %19, 1
+  %21 = getelementptr inbounds i32, i32* %0, i64 5
+  store i32 %20, i32* %17, align 4, !tbaa !2
+  %22 = getelementptr inbounds i32, i32* %1, i64 9
+  %23 = load i32, i32* %22, align 4, !tbaa !2
+  %24 = add i32 %23, 2
+  %25 = getelementptr inbounds i32, i32* %0, i64 6
+  store i32 %24, i32* %21, align 4, !tbaa !2
+  %26 = getelementptr inbounds i32, i32* %1, i64 6
+  %27 = load i32, i32* %26, align 4, !tbaa !2
+  %28 = add i32 %27, 3
+  %29 = getelementptr inbounds i32, i32* %0, i64 7
+  store i32 %28, i32* %25, align 4, !tbaa !2
+  %30 = getelementptr inbounds i32, i32* %1, i64 21
+  %31 = load i32, i32* %30, align 4, !tbaa !2
+  %32 = add i32 %31, 4
+  store i32 %32, i32* %29, align 4, !tbaa !2
+  ret void
+}
+
+define void @gather_load_4(i32* noalias nocapture %t0, i32* noalias nocapture readonly %t1) {
+; SSE-LABEL: @gather_load_4(
+; SSE-NEXT:    [[T5:%.*]] = getelementptr inbounds i32, i32* [[T0:%.*]], i64 1
+; SSE-NEXT:    [[T6:%.*]] = getelementptr inbounds i32, i32* [[T1:%.*]], i64 11
+; SSE-NEXT:    [[T9:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 2
+; SSE-NEXT:    [[T10:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 4
+; SSE-NEXT:    [[T13:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 3
+; SSE-NEXT:    [[T14:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 15
+; SSE-NEXT:    [[T17:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 4
+; SSE-NEXT:    [[T18:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 18
+; SSE-NEXT:    [[T21:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 5
+; SSE-NEXT:    [[T22:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 9
+; SSE-NEXT:    [[T25:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 6
+; SSE-NEXT:    [[T26:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 6
+; SSE-NEXT:    [[T29:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 7
+; SSE-NEXT:    [[T30:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 21
+; SSE-NEXT:    [[T3:%.*]] = load i32, i32* [[T1]], align 4, [[TBAA0]]
+; SSE-NEXT:    [[T7:%.*]] = load i32, i32* [[T6]], align 4, [[TBAA0]]
+; SSE-NEXT:    [[T11:%.*]] = load i32, i32* [[T10]], align 4, [[TBAA0]]
+; SSE-NEXT:    [[T15:%.*]] = load i32, i32* [[T14]], align 4, [[TBAA0]]
+; SSE-NEXT:    [[T19:%.*]] = load i32, i32* [[T18]], align 4, [[TBAA0]]
+; SSE-NEXT:    [[T23:%.*]] = load i32, i32* [[T22]], align 4, [[TBAA0]]
+; SSE-NEXT:    [[T27:%.*]] = load i32, i32* [[T26]], align 4, [[TBAA0]]
+; SSE-NEXT:    [[T31:%.*]] = load i32, i32* [[T30]], align 4, [[TBAA0]]
+; SSE-NEXT:    [[T4:%.*]] = add i32 [[T3]], 1
+; SSE-NEXT:    [[T8:%.*]] = add i32 [[T7]], 2
+; SSE-NEXT:    [[T12:%.*]] = add i32 [[T11]], 3
+; SSE-NEXT:    [[T16:%.*]] = add i32 [[T15]], 4
+; SSE-NEXT:    [[T20:%.*]] = add i32 [[T19]], 1
+; SSE-NEXT:    [[T24:%.*]] = add i32 [[T23]], 2
+; SSE-NEXT:    [[T28:%.*]] = add i32 [[T27]], 3
+; SSE-NEXT:    [[T32:%.*]] = add i32 [[T31]], 4
+; SSE-NEXT:    store i32 [[T4]], i32* [[T0]], align 4, [[TBAA0]]
+; SSE-NEXT:    store i32 [[T8]], i32* [[T5]], align 4, [[TBAA0]]
+; SSE-NEXT:    store i32 [[T12]], i32* [[T9]], align 4, [[TBAA0]]
+; SSE-NEXT:    store i32 [[T16]], i32* [[T13]], align 4, [[TBAA0]]
+; SSE-NEXT:    store i32 [[T20]], i32* [[T17]], align 4, [[TBAA0]]
+; SSE-NEXT:    store i32 [[T24]], i32* [[T21]], align 4, [[TBAA0]]
+; SSE-NEXT:    store i32 [[T28]], i32* [[T25]], align 4, [[TBAA0]]
+; SSE-NEXT:    store i32 [[T32]], i32* [[T29]], align 4, [[TBAA0]]
+; SSE-NEXT:    ret void
+;
+; AVX-LABEL: @gather_load_4(
+; AVX-NEXT:    [[T5:%.*]] = getelementptr inbounds i32, i32* [[T0:%.*]], i64 1
+; AVX-NEXT:    [[T6:%.*]] = getelementptr inbounds i32, i32* [[T1:%.*]], i64 11
+; AVX-NEXT:    [[T9:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 2
+; AVX-NEXT:    [[T10:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 4
+; AVX-NEXT:    [[T13:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 3
+; AVX-NEXT:    [[T14:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 15
+; AVX-NEXT:    [[T17:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 4
+; AVX-NEXT:    [[T18:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 18
+; AVX-NEXT:    [[T21:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 5
+; AVX-NEXT:    [[T22:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 9
+; AVX-NEXT:    [[T25:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 6
+; AVX-NEXT:    [[T26:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 6
+; AVX-NEXT:    [[T29:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 7
+; AVX-NEXT:    [[T30:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 21
+; AVX-NEXT:    [[T3:%.*]] = load i32, i32* [[T1]], align 4, [[TBAA0]]
+; AVX-NEXT:    [[T7:%.*]] = load i32, i32* [[T6]], align 4, [[TBAA0]]
+; AVX-NEXT:    [[T11:%.*]] = load i32, i32* [[T10]], align 4, [[TBAA0]]
+; AVX-NEXT:    [[T15:%.*]] = load i32, i32* [[T14]], align 4, [[TBAA0]]
+; AVX-NEXT:    [[T19:%.*]] = load i32, i32* [[T18]], align 4, [[TBAA0]]
+; AVX-NEXT:    [[T23:%.*]] = load i32, i32* [[T22]], align 4, [[TBAA0]]
+; AVX-NEXT:    [[T27:%.*]] = load i32, i32* [[T26]], align 4, [[TBAA0]]
+; AVX-NEXT:    [[T31:%.*]] = load i32, i32* [[T30]], align 4, [[TBAA0]]
+; AVX-NEXT:    [[T4:%.*]] = add i32 [[T3]], 1
+; AVX-NEXT:    [[T8:%.*]] = add i32 [[T7]], 2
+; AVX-NEXT:    [[T12:%.*]] = add i32 [[T11]], 3
+; AVX-NEXT:    [[T16:%.*]] = add i32 [[T15]], 4
+; AVX-NEXT:    [[T20:%.*]] = add i32 [[T19]], 1
+; AVX-NEXT:    [[T24:%.*]] = add i32 [[T23]], 2
+; AVX-NEXT:    [[T28:%.*]] = add i32 [[T27]], 3
+; AVX-NEXT:    [[T32:%.*]] = add i32 [[T31]], 4
+; AVX-NEXT:    store i32 [[T4]], i32* [[T0]], align 4, [[TBAA0]]
+; AVX-NEXT:    store i32 [[T8]], i32* [[T5]], align 4, [[TBAA0]]
+; AVX-NEXT:    store i32 [[T12]], i32* [[T9]], align 4, [[TBAA0]]
+; AVX-NEXT:    store i32 [[T16]], i32* [[T13]], align 4, [[TBAA0]]
+; AVX-NEXT:    store i32 [[T20]], i32* [[T17]], align 4, [[TBAA0]]
+; AVX-NEXT:    store i32 [[T24]], i32* [[T21]], align 4, [[TBAA0]]
+; AVX-NEXT:    store i32 [[T28]], i32* [[T25]], align 4, [[TBAA0]]
+; AVX-NEXT:    store i32 [[T32]], i32* [[T29]], align 4, [[TBAA0]]
+; AVX-NEXT:    ret void
+;
+; AVX2-LABEL: @gather_load_4(
+; AVX2-NEXT:    [[T5:%.*]] = getelementptr inbounds i32, i32* [[T0:%.*]], i64 1
+; AVX2-NEXT:    [[TMP1:%.*]] = insertelement <4 x i32*> undef, i32* [[T1:%.*]], i32 0
+; AVX2-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32*> [[TMP1]], <4 x i32*> undef, <4 x i32> zeroinitializer
+; AVX2-NEXT:    [[TMP3:%.*]] = getelementptr i32, <4 x i32*> [[TMP2]], <4 x i64> <i64 11, i64 4, i64 15, i64 18>
+; AVX2-NEXT:    [[T21:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 5
+; AVX2-NEXT:    [[T22:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 9
+; AVX2-NEXT:    [[T25:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 6
+; AVX2-NEXT:    [[T26:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 6
+; AVX2-NEXT:    [[T29:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 7
+; AVX2-NEXT:    [[T30:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 21
+; AVX2-NEXT:    [[T3:%.*]] = load i32, i32* [[T1]], align 4, [[TBAA0]]
+; AVX2-NEXT:    [[TMP4:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP3]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef), [[TBAA0]]
+; AVX2-NEXT:    [[T23:%.*]] = load i32, i32* [[T22]], align 4, [[TBAA0]]
+; AVX2-NEXT:    [[T27:%.*]] = load i32, i32* [[T26]], align 4, [[TBAA0]]
+; AVX2-NEXT:    [[T31:%.*]] = load i32, i32* [[T30]], align 4, [[TBAA0]]
+; AVX2-NEXT:    [[T4:%.*]] = add i32 [[T3]], 1
+; AVX2-NEXT:    [[TMP5:%.*]] = add <4 x i32> [[TMP4]], <i32 2, i32 3, i32 4, i32 1>
+; AVX2-NEXT:    [[T24:%.*]] = add i32 [[T23]], 2
+; AVX2-NEXT:    [[T28:%.*]] = add i32 [[T27]], 3
+; AVX2-NEXT:    [[T32:%.*]] = add i32 [[T31]], 4
+; AVX2-NEXT:    store i32 [[T4]], i32* [[T0]], align 4, [[TBAA0]]
+; AVX2-NEXT:    [[TMP6:%.*]] = bitcast i32* [[T5]] to <4 x i32>*
+; AVX2-NEXT:    store <4 x i32> [[TMP5]], <4 x i32>* [[TMP6]], align 4, [[TBAA0]]
+; AVX2-NEXT:    store i32 [[T24]], i32* [[T21]], align 4, [[TBAA0]]
+; AVX2-NEXT:    store i32 [[T28]], i32* [[T25]], align 4, [[TBAA0]]
+; AVX2-NEXT:    store i32 [[T32]], i32* [[T29]], align 4, [[TBAA0]]
+; AVX2-NEXT:    ret void
+;
+; AVX512-LABEL: @gather_load_4(
+; AVX512-NEXT:    [[T5:%.*]] = getelementptr inbounds i32, i32* [[T0:%.*]], i64 1
+; AVX512-NEXT:    [[TMP1:%.*]] = insertelement <4 x i32*> undef, i32* [[T1:%.*]], i32 0
+; AVX512-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32*> [[TMP1]], <4 x i32*> undef, <4 x i32> zeroinitializer
+; AVX512-NEXT:    [[TMP3:%.*]] = getelementptr i32, <4 x i32*> [[TMP2]], <4 x i64> <i64 11, i64 4, i64 15, i64 18>
+; AVX512-NEXT:    [[T21:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 5
+; AVX512-NEXT:    [[T22:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 9
+; AVX512-NEXT:    [[T25:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 6
+; AVX512-NEXT:    [[T26:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 6
+; AVX512-NEXT:    [[T29:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 7
+; AVX512-NEXT:    [[T30:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 21
+; AVX512-NEXT:    [[T3:%.*]] = load i32, i32* [[T1]], align 4, [[TBAA0]]
+; AVX512-NEXT:    [[TMP4:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP3]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef), [[TBAA0]]
+; AVX512-NEXT:    [[T23:%.*]] = load i32, i32* [[T22]], align 4, [[TBAA0]]
+; AVX512-NEXT:    [[T27:%.*]] = load i32, i32* [[T26]], align 4, [[TBAA0]]
+; AVX512-NEXT:    [[T31:%.*]] = load i32, i32* [[T30]], align 4, [[TBAA0]]
+; AVX512-NEXT:    [[T4:%.*]] = add i32 [[T3]], 1
+; AVX512-NEXT:    [[TMP5:%.*]] = add <4 x i32> [[TMP4]], <i32 2, i32 3, i32 4, i32 1>
+; AVX512-NEXT:    [[T24:%.*]] = add i32 [[T23]], 2
+; AVX512-NEXT:    [[T28:%.*]] = add i32 [[T27]], 3
+; AVX512-NEXT:    [[T32:%.*]] = add i32 [[T31]], 4
+; AVX512-NEXT:    store i32 [[T4]], i32* [[T0]], align 4, [[TBAA0]]
+; AVX512-NEXT:    [[TMP6:%.*]] = bitcast i32* [[T5]] to <4 x i32>*
+; AVX512-NEXT:    store <4 x i32> [[TMP5]], <4 x i32>* [[TMP6]], align 4, [[TBAA0]]
+; AVX512-NEXT:    store i32 [[T24]], i32* [[T21]], align 4, [[TBAA0]]
+; AVX512-NEXT:    store i32 [[T28]], i32* [[T25]], align 4, [[TBAA0]]
+; AVX512-NEXT:    store i32 [[T32]], i32* [[T29]], align 4, [[TBAA0]]
+; AVX512-NEXT:    ret void
+;
+  %t5 = getelementptr inbounds i32, i32* %t0, i64 1
+  %t6 = getelementptr inbounds i32, i32* %t1, i64 11
+  %t9 = getelementptr inbounds i32, i32* %t0, i64 2
+  %t10 = getelementptr inbounds i32, i32* %t1, i64 4
+  %t13 = getelementptr inbounds i32, i32* %t0, i64 3
+  %t14 = getelementptr inbounds i32, i32* %t1, i64 15
+  %t17 = getelementptr inbounds i32, i32* %t0, i64 4
+  %t18 = getelementptr inbounds i32, i32* %t1, i64 18
+  %t21 = getelementptr inbounds i32, i32* %t0, i64 5
+  %t22 = getelementptr inbounds i32, i32* %t1, i64 9
+  %t25 = getelementptr inbounds i32, i32* %t0, i64 6
+  %t26 = getelementptr inbounds i32, i32* %t1, i64 6
+  %t29 = getelementptr inbounds i32, i32* %t0, i64 7
+  %t30 = getelementptr inbounds i32, i32* %t1, i64 21
+
+  %t3 = load i32, i32* %t1, align 4, !tbaa !2
+  %t7 = load i32, i32* %t6, align 4, !tbaa !2
+  %t11 = load i32, i32* %t10, align 4, !tbaa !2
+  %t15 = load i32, i32* %t14, align 4, !tbaa !2
+  %t19 = load i32, i32* %t18, align 4, !tbaa !2
+  %t23 = load i32, i32* %t22, align 4, !tbaa !2
+  %t27 = load i32, i32* %t26, align 4, !tbaa !2
+  %t31 = load i32, i32* %t30, align 4, !tbaa !2
+
+  %t4 = add i32 %t3, 1
+  %t8 = add i32 %t7, 2
+  %t12 = add i32 %t11, 3
+  %t16 = add i32 %t15, 4
+  %t20 = add i32 %t19, 1
+  %t24 = add i32 %t23, 2
+  %t28 = add i32 %t27, 3
+  %t32 = add i32 %t31, 4
+
+  store i32 %t4, i32* %t0, align 4, !tbaa !2
+  store i32 %t8, i32* %t5, align 4, !tbaa !2
+  store i32 %t12, i32* %t9, align 4, !tbaa !2
+  store i32 %t16, i32* %t13, align 4, !tbaa !2
+  store i32 %t20, i32* %t17, align 4, !tbaa !2
+  store i32 %t24, i32* %t21, align 4, !tbaa !2
+  store i32 %t28, i32* %t25, align 4, !tbaa !2
+  store i32 %t32, i32* %t29, align 4, !tbaa !2
+
+  ret void
+}
+
+
+define void @gather_load_div(float* noalias nocapture %0, float* noalias nocapture readonly %1) {
+; SSE-LABEL: @gather_load_div(
+; SSE-NEXT:    [[TMP3:%.*]] = getelementptr inbounds float, float* [[TMP1:%.*]], i64 10
+; SSE-NEXT:    [[TMP4:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 3
+; SSE-NEXT:    [[TMP5:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 14
+; SSE-NEXT:    [[TMP6:%.*]] = insertelement <4 x float*> undef, float* [[TMP1]], i32 0
+; SSE-NEXT:    [[TMP7:%.*]] = insertelement <4 x float*> [[TMP6]], float* [[TMP3]], i32 1
+; SSE-NEXT:    [[TMP8:%.*]] = insertelement <4 x float*> [[TMP7]], float* [[TMP4]], i32 2
+; SSE-NEXT:    [[TMP9:%.*]] = insertelement <4 x float*> [[TMP8]], float* [[TMP5]], i32 3
+; SSE-NEXT:    [[TMP10:%.*]] = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> [[TMP9]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> undef), [[TBAA0]]
+; SSE-NEXT:    [[TMP11:%.*]] = shufflevector <4 x float*> [[TMP6]], <4 x float*> undef, <4 x i32> zeroinitializer
+; SSE-NEXT:    [[TMP12:%.*]] = getelementptr float, <4 x float*> [[TMP11]], <4 x i64> <i64 4, i64 13, i64 11, i64 44>
+; SSE-NEXT:    [[TMP13:%.*]] = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> [[TMP12]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> undef), [[TBAA0]]
+; SSE-NEXT:    [[TMP14:%.*]] = fdiv <4 x float> [[TMP10]], [[TMP13]]
+; SSE-NEXT:    [[TMP15:%.*]] = getelementptr inbounds float, float* [[TMP0:%.*]], i64 4
+; SSE-NEXT:    [[TMP16:%.*]] = bitcast float* [[TMP0]] to <4 x float>*
+; SSE-NEXT:    store <4 x float> [[TMP14]], <4 x float>* [[TMP16]], align 4, [[TBAA0]]
+; SSE-NEXT:    [[TMP17:%.*]] = getelementptr float, <4 x float*> [[TMP11]], <4 x i64> <i64 17, i64 8, i64 5, i64 20>
+; SSE-NEXT:    [[TMP18:%.*]] = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> [[TMP17]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> undef), [[TBAA0]]
+; SSE-NEXT:    [[TMP19:%.*]] = getelementptr float, <4 x float*> [[TMP11]], <4 x i64> <i64 33, i64 30, i64 27, i64 23>
+; SSE-NEXT:    [[TMP20:%.*]] = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> [[TMP19]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> undef), [[TBAA0]]
+; SSE-NEXT:    [[TMP21:%.*]] = fdiv <4 x float> [[TMP18]], [[TMP20]]
+; SSE-NEXT:    [[TMP22:%.*]] = bitcast float* [[TMP15]] to <4 x float>*
+; SSE-NEXT:    store <4 x float> [[TMP21]], <4 x float>* [[TMP22]], align 4, [[TBAA0]]
+; SSE-NEXT:    ret void
+;
+; AVX-LABEL: @gather_load_div(
+; AVX-NEXT:    [[TMP3:%.*]] = getelementptr inbounds float, float* [[TMP1:%.*]], i64 10
+; AVX-NEXT:    [[TMP4:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 3
+; AVX-NEXT:    [[TMP5:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 14
+; AVX-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 17
+; AVX-NEXT:    [[TMP7:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 8
+; AVX-NEXT:    [[TMP8:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 5
+; AVX-NEXT:    [[TMP9:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 20
+; AVX-NEXT:    [[TMP10:%.*]] = insertelement <8 x float*> undef, float* [[TMP1]], i32 0
+; AVX-NEXT:    [[TMP11:%.*]] = insertelement <8 x float*> [[TMP10]], float* [[TMP3]], i32 1
+; AVX-NEXT:    [[TMP12:%.*]] = insertelement <8 x float*> [[TMP11]], float* [[TMP4]], i32 2
+; AVX-NEXT:    [[TMP13:%.*]] = insertelement <8 x float*> [[TMP12]], float* [[TMP5]], i32 3
+; AVX-NEXT:    [[TMP14:%.*]] = insertelement <8 x float*> [[TMP13]], float* [[TMP6]], i32 4
+; AVX-NEXT:    [[TMP15:%.*]] = insertelement <8 x float*> [[TMP14]], float* [[TMP7]], i32 5
+; AVX-NEXT:    [[TMP16:%.*]] = insertelement <8 x float*> [[TMP15]], float* [[TMP8]], i32 6
+; AVX-NEXT:    [[TMP17:%.*]] = insertelement <8 x float*> [[TMP16]], float* [[TMP9]], i32 7
+; AVX-NEXT:    [[TMP18:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP17]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x float> undef), [[TBAA0]]
+; AVX-NEXT:    [[TMP19:%.*]] = shufflevector <8 x float*> [[TMP10]], <8 x float*> undef, <8 x i32> zeroinitializer
+; AVX-NEXT:    [[TMP20:%.*]] = getelementptr float, <8 x float*> [[TMP19]], <8 x i64> <i64 4, i64 13, i64 11, i64 44, i64 33, i64 30, i64 27, i64 23>
+; AVX-NEXT:    [[TMP21:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP20]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x float> undef), [[TBAA0]]
+; AVX-NEXT:    [[TMP22:%.*]] = fdiv <8 x float> [[TMP18]], [[TMP21]]
+; AVX-NEXT:    [[TMP23:%.*]] = bitcast float* [[TMP0:%.*]] to <8 x float>*
+; AVX-NEXT:    store <8 x float> [[TMP22]], <8 x float>* [[TMP23]], align 4, [[TBAA0]]
+; AVX-NEXT:    ret void
+;
+; AVX2-LABEL: @gather_load_div(
+; AVX2-NEXT:    [[TMP3:%.*]] = getelementptr inbounds float, float* [[TMP1:%.*]], i64 10
+; AVX2-NEXT:    [[TMP4:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 3
+; AVX2-NEXT:    [[TMP5:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 14
+; AVX2-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 17
+; AVX2-NEXT:    [[TMP7:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 8
+; AVX2-NEXT:    [[TMP8:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 5
+; AVX2-NEXT:    [[TMP9:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 20
+; AVX2-NEXT:    [[TMP10:%.*]] = insertelement <8 x float*> undef, float* [[TMP1]], i32 0
+; AVX2-NEXT:    [[TMP11:%.*]] = insertelement <8 x float*> [[TMP10]], float* [[TMP3]], i32 1
+; AVX2-NEXT:    [[TMP12:%.*]] = insertelement <8 x float*> [[TMP11]], float* [[TMP4]], i32 2
+; AVX2-NEXT:    [[TMP13:%.*]] = insertelement <8 x float*> [[TMP12]], float* [[TMP5]], i32 3
+; AVX2-NEXT:    [[TMP14:%.*]] = insertelement <8 x float*> [[TMP13]], float* [[TMP6]], i32 4
+; AVX2-NEXT:    [[TMP15:%.*]] = insertelement <8 x float*> [[TMP14]], float* [[TMP7]], i32 5
+; AVX2-NEXT:    [[TMP16:%.*]] = insertelement <8 x float*> [[TMP15]], float* [[TMP8]], i32 6
+; AVX2-NEXT:    [[TMP17:%.*]] = insertelement <8 x float*> [[TMP16]], float* [[TMP9]], i32 7
+; AVX2-NEXT:    [[TMP18:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP17]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x float> undef), [[TBAA0]]
+; AVX2-NEXT:    [[TMP19:%.*]] = shufflevector <8 x float*> [[TMP10]], <8 x float*> undef, <8 x i32> zeroinitializer
+; AVX2-NEXT:    [[TMP20:%.*]] = getelementptr float, <8 x float*> [[TMP19]], <8 x i64> <i64 4, i64 13, i64 11, i64 44, i64 33, i64 30, i64 27, i64 23>
+; AVX2-NEXT:    [[TMP21:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP20]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x float> undef), [[TBAA0]]
+; AVX2-NEXT:    [[TMP22:%.*]] = fdiv <8 x float> [[TMP18]], [[TMP21]]
+; AVX2-NEXT:    [[TMP23:%.*]] = bitcast float* [[TMP0:%.*]] to <8 x float>*
+; AVX2-NEXT:    store <8 x float> [[TMP22]], <8 x float>* [[TMP23]], align 4, [[TBAA0]]
+; AVX2-NEXT:    ret void
+;
+; AVX512-LABEL: @gather_load_div(
+; AVX512-NEXT:    [[TMP3:%.*]] = getelementptr inbounds float, float* [[TMP1:%.*]], i64 10
+; AVX512-NEXT:    [[TMP4:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 3
+; AVX512-NEXT:    [[TMP5:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 14
+; AVX512-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 17
+; AVX512-NEXT:    [[TMP7:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 8
+; AVX512-NEXT:    [[TMP8:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 5
+; AVX512-NEXT:    [[TMP9:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 20
+; AVX512-NEXT:    [[TMP10:%.*]] = insertelement <8 x float*> undef, float* [[TMP1]], i32 0
+; AVX512-NEXT:    [[TMP11:%.*]] = insertelement <8 x float*> [[TMP10]], float* [[TMP3]], i32 1
+; AVX512-NEXT:    [[TMP12:%.*]] = insertelement <8 x float*> [[TMP11]], float* [[TMP4]], i32 2
+; AVX512-NEXT:    [[TMP13:%.*]] = insertelement <8 x float*> [[TMP12]], float* [[TMP5]], i32 3
+; AVX512-NEXT:    [[TMP14:%.*]] = insertelement <8 x float*> [[TMP13]], float* [[TMP6]], i32 4
+; AVX512-NEXT:    [[TMP15:%.*]] = insertelement <8 x float*> [[TMP14]], float* [[TMP7]], i32 5
+; AVX512-NEXT:    [[TMP16:%.*]] = insertelement <8 x float*> [[TMP15]], float* [[TMP8]], i32 6
+; AVX512-NEXT:    [[TMP17:%.*]] = insertelement <8 x float*> [[TMP16]], float* [[TMP9]], i32 7
+; AVX512-NEXT:    [[TMP18:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP17]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x float> undef), [[TBAA0]]
+; AVX512-NEXT:    [[TMP19:%.*]] = shufflevector <8 x float*> [[TMP10]], <8 x float*> undef, <8 x i32> zeroinitializer
+; AVX512-NEXT:    [[TMP20:%.*]] = getelementptr float, <8 x float*> [[TMP19]], <8 x i64> <i64 4, i64 13, i64 11, i64 44, i64 33, i64 30, i64 27, i64 23>
+; AVX512-NEXT:    [[TMP21:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP20]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x float> undef), [[TBAA0]]
+; AVX512-NEXT:    [[TMP22:%.*]] = fdiv <8 x float> [[TMP18]], [[TMP21]]
+; AVX512-NEXT:    [[TMP23:%.*]] = bitcast float* [[TMP0:%.*]] to <8 x float>*
+; AVX512-NEXT:    store <8 x float> [[TMP22]], <8 x float>* [[TMP23]], align 4, [[TBAA0]]
+; AVX512-NEXT:    ret void
+;
+  %3 = load float, float* %1, align 4, !tbaa !2
+  %4 = getelementptr inbounds float, float* %1, i64 4
+  %5 = load float, float* %4, align 4, !tbaa !2
+  %6 = fdiv float %3, %5
+  %7 = getelementptr inbounds float, float* %0, i64 1
+  store float %6, float* %0, align 4, !tbaa !2
+  %8 = getelementptr inbounds float, float* %1, i64 10
+  %9 = load float, float* %8, align 4, !tbaa !2
+  %10 = getelementptr inbounds float, float* %1, i64 13
+  %11 = load float, float* %10, align 4, !tbaa !2
+  %12 = fdiv float %9, %11
+  %13 = getelementptr inbounds float, float* %0, i64 2
+  store float %12, float* %7, align 4, !tbaa !2
+  %14 = getelementptr inbounds float, float* %1, i64 3
+  %15 = load float, float* %14, align 4, !tbaa !2
+  %16 = getelementptr inbounds float, float* %1, i64 11
+  %17 = load float, float* %16, align 4, !tbaa !2
+  %18 = fdiv float %15, %17
+  %19 = getelementptr inbounds float, float* %0, i64 3
+  store float %18, float* %13, align 4, !tbaa !2
+  %20 = getelementptr inbounds float, float* %1, i64 14
+  %21 = load float, float* %20, align 4, !tbaa !2
+  %22 = getelementptr inbounds float, float* %1, i64 44
+  %23 = load float, float* %22, align 4, !tbaa !2
+  %24 = fdiv float %21, %23
+  %25 = getelementptr inbounds float, float* %0, i64 4
+  store float %24, float* %19, align 4, !tbaa !2
+  %26 = getelementptr inbounds float, float* %1, i64 17
+  %27 = load float, float* %26, align 4, !tbaa !2
+  %28 = getelementptr inbounds float, float* %1, i64 33
+  %29 = load float, float* %28, align 4, !tbaa !2
+  %30 = fdiv float %27, %29
+  %31 = getelementptr inbounds float, float* %0, i64 5
+  store float %30, float* %25, align 4, !tbaa !2
+  %32 = getelementptr inbounds float, float* %1, i64 8
+  %33 = load float, float* %32, align 4, !tbaa !2
+  %34 = getelementptr inbounds float, float* %1, i64 30
+  %35 = load float, float* %34, align 4, !tbaa !2
+  %36 = fdiv float %33, %35
+  %37 = getelementptr inbounds float, float* %0, i64 6
+  store float %36, float* %31, align 4, !tbaa !2
+  %38 = getelementptr inbounds float, float* %1, i64 5
+  %39 = load float, float* %38, align 4, !tbaa !2
+  %40 = getelementptr inbounds float, float* %1, i64 27
+  %41 = load float, float* %40, align 4, !tbaa !2
+  %42 = fdiv float %39, %41
+  %43 = getelementptr inbounds float, float* %0, i64 7
+  store float %42, float* %37, align 4, !tbaa !2
+  %44 = getelementptr inbounds float, float* %1, i64 20
+  %45 = load float, float* %44, align 4, !tbaa !2
+  %46 = getelementptr inbounds float, float* %1, i64 23
+  %47 = load float, float* %46, align 4, !tbaa !2
+  %48 = fdiv float %45, %47
+  store float %48, float* %43, align 4, !tbaa !2
+  ret void
+}
+
+!2 = !{!3, !3, i64 0}
+!3 = !{!"short", !4, i64 0}
+!4 = !{!"omnipotent char", !5, i64 0}
+!5 = !{!"Simple C++ TBAA"}
--- a/llvm/test/Transforms/SLPVectorizer/X86/sext-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/sext-inseltpoison.ll
--- a/Show More
+++ b/Show More