; llvm-project/llvm/test/Transforms/SLPVectorizer/X86/jumbled-load.ll

; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -S -mtriple=x86_64-unknown -mattr=+avx -slp-vectorizer | FileCheck %s


; Jumbled loads from two arrays feeding a pairwise multiply.
;
; Four i32 values are loaded from each of %in and %inn in non-consecutive
; address order, multiplied pairwise, and stored to %out[0..3]:
;   loads from %in  use element offsets 0, 3, 1, 2
;   loads from %inn use element offsets 0, 2, 3, 1
;   out[0] = in[1] * inn[0]
;   out[1] = in[3] * inn[1]
;   out[2] = in[2] * inn[3]
;   out[3] = in[0] * inn[2]
;
; The expected output below contains one <4 x i32> load per input array, a
; shufflevector applied to the %inn vector, a single vector mul, and a second
; shufflevector whose result is written with one <4 x i32> store.
define i32 @jumbled-load(i32* noalias nocapture %in, i32* noalias nocapture %inn, i32* noalias nocapture %out) {
; CHECK-LABEL: @jumbled-load(
; CHECK-NEXT:    [[IN_ADDR:%.*]] = getelementptr inbounds i32, i32* [[IN:%.*]], i64 0
; CHECK-NEXT:    [[GEP_1:%.*]] = getelementptr inbounds i32, i32* [[IN_ADDR]], i64 3
; CHECK-NEXT:    [[GEP_2:%.*]] = getelementptr inbounds i32, i32* [[IN_ADDR]], i64 1
; CHECK-NEXT:    [[GEP_3:%.*]] = getelementptr inbounds i32, i32* [[IN_ADDR]], i64 2
; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[IN_ADDR]] to <4 x i32>*
; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
; CHECK-NEXT:    [[INN_ADDR:%.*]] = getelementptr inbounds i32, i32* [[INN:%.*]], i64 0
; CHECK-NEXT:    [[GEP_4:%.*]] = getelementptr inbounds i32, i32* [[INN_ADDR]], i64 2
; CHECK-NEXT:    [[GEP_5:%.*]] = getelementptr inbounds i32, i32* [[INN_ADDR]], i64 3
; CHECK-NEXT:    [[GEP_6:%.*]] = getelementptr inbounds i32, i32* [[INN_ADDR]], i64 1
; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[INN_ADDR]] to <4 x i32>*
; CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4
; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> poison, <4 x i32> <i32 2, i32 0, i32 3, i32 1>
; CHECK-NEXT:    [[TMP5:%.*]] = mul <4 x i32> [[TMP2]], [[SHUFFLE]]
; CHECK-NEXT:    [[GEP_7:%.*]] = getelementptr inbounds i32, i32* [[OUT:%.*]], i64 0
; CHECK-NEXT:    [[GEP_8:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 1
; CHECK-NEXT:    [[GEP_9:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 2
; CHECK-NEXT:    [[GEP_10:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 3
; CHECK-NEXT:    [[SHUFFLE1:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i32* [[GEP_7]] to <4 x i32>*
; CHECK-NEXT:    store <4 x i32> [[SHUFFLE1]], <4 x i32>* [[TMP6]], align 4
; CHECK-NEXT:    ret i32 undef
;
  ; Scalar loads from %in, issued at element offsets 0, 3, 1, 2.
  %in.addr = getelementptr inbounds i32, i32* %in, i64 0
  %load.1 = load i32, i32* %in.addr, align 4
  %gep.1 = getelementptr inbounds i32, i32* %in.addr, i64 3
  %load.2 = load i32, i32* %gep.1, align 4
  %gep.2 = getelementptr inbounds i32, i32* %in.addr, i64 1
  %load.3 = load i32, i32* %gep.2, align 4
  %gep.3 = getelementptr inbounds i32, i32* %in.addr, i64 2
  %load.4 = load i32, i32* %gep.3, align 4
  ; Scalar loads from %inn, issued at element offsets 0, 2, 3, 1.
  %inn.addr = getelementptr inbounds i32, i32* %inn, i64 0
  %load.5 = load i32, i32* %inn.addr, align 4
  %gep.4 = getelementptr inbounds i32, i32* %inn.addr, i64 2
  %load.6 = load i32, i32* %gep.4, align 4
  %gep.5 = getelementptr inbounds i32, i32* %inn.addr, i64 3
  %load.7 = load i32, i32* %gep.5, align 4
  %gep.6 = getelementptr inbounds i32, i32* %inn.addr, i64 1
  %load.8 = load i32, i32* %gep.6, align 4
  ; Each product pairs one %in element with one %inn element; the pairing does
  ; not follow the order in which either group of loads was issued.
  %mul.1 = mul i32 %load.3, %load.5
  %mul.2 = mul i32 %load.2, %load.8
  %mul.3 = mul i32 %load.4, %load.7
  %mul.4 = mul i32 %load.1, %load.6
  ; The four products are stored to consecutive elements 0..3 of %out.
  %gep.7 = getelementptr inbounds i32, i32* %out, i64 0
  store i32 %mul.1, i32* %gep.7, align 4
  %gep.8 = getelementptr inbounds i32, i32* %out, i64 1
  store i32 %mul.2, i32* %gep.8, align 4
  %gep.9 = getelementptr inbounds i32, i32* %out, i64 2
  store i32 %mul.3, i32* %gep.9, align 4
  %gep.10 = getelementptr inbounds i32, i32* %out, i64 3
  store i32 %mul.4, i32* %gep.10, align 4

  ret i32 undef
}


; Jumbled loads from a single array where every loaded value has two uses.
;
; Four i32 values are loaded from %in at element offsets 0, 3, 1, 2. Both
; operands of every multiply come from these loads, so each load is used
; twice:
;   out[0] = in[1] * in[2]
;   out[1] = in[3] * in[3]
;   out[2] = in[2] * in[0]
;   out[3] = in[0] * in[1]
;
; The expected output below contains a single <4 x i32> load followed by a
; shufflevector that forms one mul operand. The other operand is rebuilt lane
; by lane with extractelement/insertelement from that shuffle. The vector
; product is shuffled once more and written with one <4 x i32> store.
define i32 @jumbled-load-multiuses(i32* noalias nocapture %in, i32* noalias nocapture %out) {
; CHECK-LABEL: @jumbled-load-multiuses(
; CHECK-NEXT:    [[IN_ADDR:%.*]] = getelementptr inbounds i32, i32* [[IN:%.*]], i64 0
; CHECK-NEXT:    [[GEP_1:%.*]] = getelementptr inbounds i32, i32* [[IN_ADDR]], i64 3
; CHECK-NEXT:    [[GEP_2:%.*]] = getelementptr inbounds i32, i32* [[IN_ADDR]], i64 1
; CHECK-NEXT:    [[GEP_3:%.*]] = getelementptr inbounds i32, i32* [[IN_ADDR]], i64 2
; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[IN_ADDR]] to <4 x i32>*
; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32> <i32 2, i32 0, i32 1, i32 3>
; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x i32> [[SHUFFLE]], i32 1
; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x i32> poison, i32 [[TMP3]], i32 0
; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x i32> [[SHUFFLE]], i32 2
; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <4 x i32> [[TMP4]], i32 [[TMP5]], i32 1
; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <4 x i32> [[SHUFFLE]], i32 0
; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[TMP7]], i32 2
; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <4 x i32> [[SHUFFLE]], i32 3
; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <4 x i32> [[TMP8]], i32 [[TMP9]], i32 3
; CHECK-NEXT:    [[TMP11:%.*]] = mul <4 x i32> [[SHUFFLE]], [[TMP10]]
; CHECK-NEXT:    [[GEP_7:%.*]] = getelementptr inbounds i32, i32* [[OUT:%.*]], i64 0
; CHECK-NEXT:    [[GEP_8:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 1
; CHECK-NEXT:    [[GEP_9:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 2
; CHECK-NEXT:    [[GEP_10:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 3
; CHECK-NEXT:    [[SHUFFLE1:%.*]] = shufflevector <4 x i32> [[TMP11]], <4 x i32> poison, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
; CHECK-NEXT:    [[TMP12:%.*]] = bitcast i32* [[GEP_7]] to <4 x i32>*
; CHECK-NEXT:    store <4 x i32> [[SHUFFLE1]], <4 x i32>* [[TMP12]], align 4
; CHECK-NEXT:    ret i32 undef
;
  ; Scalar loads from %in, issued at element offsets 0, 3, 1, 2.
  %in.addr = getelementptr inbounds i32, i32* %in, i64 0
  %load.1 = load i32, i32* %in.addr, align 4
  %gep.1 = getelementptr inbounds i32, i32* %in.addr, i64 3
  %load.2 = load i32, i32* %gep.1, align 4
  %gep.2 = getelementptr inbounds i32, i32* %in.addr, i64 1
  %load.3 = load i32, i32* %gep.2, align 4
  %gep.3 = getelementptr inbounds i32, i32* %in.addr, i64 2
  %load.4 = load i32, i32* %gep.3, align 4
  ; Every loaded value appears as a multiply operand twice; %load.2 is
  ; multiplied by itself.
  %mul.1 = mul i32 %load.3, %load.4
  %mul.2 = mul i32 %load.2, %load.2
  %mul.3 = mul i32 %load.4, %load.1
  %mul.4 = mul i32 %load.1, %load.3
  ; The four products are stored to consecutive elements 0..3 of %out.
  %gep.7 = getelementptr inbounds i32, i32* %out, i64 0
  store i32 %mul.1, i32* %gep.7, align 4
  %gep.8 = getelementptr inbounds i32, i32* %out, i64 1
  store i32 %mul.2, i32* %gep.8, align 4
  %gep.9 = getelementptr inbounds i32, i32* %out, i64 2
  store i32 %mul.3, i32* %gep.9, align 4
  %gep.10 = getelementptr inbounds i32, i32* %out, i64 3
  store i32 %mul.4, i32* %gep.10, align 4

  ret i32 undef
}
Revert "Temporarily Revert "Add basic loop fusion pass."" The reversion apparently deleted the test/Transforms directory. Will be re-reverting again. llvm-svn: 358552 2019-04-17 12:52:47 +08:00			`; NOTE: Assertions have been autogenerated by utils/update_test_checks.py`
			`; RUN: opt < %s -S -mtriple=x86_64-unknown -mattr=+avx -slp-vectorizer \| FileCheck %s`



			`define i32 @jumbled-load(i32* noalias nocapture %in, i32* noalias nocapture %inn, i32* noalias nocapture %out) {`
			`; CHECK-LABEL: @jumbled-load(`
			`; CHECK-NEXT: [[IN_ADDR:%.]] = getelementptr inbounds i32, i32 [[IN:%.*]], i64 0`
			`; CHECK-NEXT: [[GEP_1:%.]] = getelementptr inbounds i32, i32 [[IN_ADDR]], i64 3`
			`; CHECK-NEXT: [[GEP_2:%.]] = getelementptr inbounds i32, i32 [[IN_ADDR]], i64 1`
			`; CHECK-NEXT: [[GEP_3:%.]] = getelementptr inbounds i32, i32 [[IN_ADDR]], i64 2`
			`; CHECK-NEXT: [[TMP1:%.]] = bitcast i32 [[IN_ADDR]] to <4 x i32>*`
			`; CHECK-NEXT: [[TMP2:%.]] = load <4 x i32>, <4 x i32> [[TMP1]], align 4`
			`; CHECK-NEXT: [[INN_ADDR:%.]] = getelementptr inbounds i32, i32 [[INN:%.*]], i64 0`
			`; CHECK-NEXT: [[GEP_4:%.]] = getelementptr inbounds i32, i32 [[INN_ADDR]], i64 2`
			`; CHECK-NEXT: [[GEP_5:%.]] = getelementptr inbounds i32, i32 [[INN_ADDR]], i64 3`
			`; CHECK-NEXT: [[GEP_6:%.]] = getelementptr inbounds i32, i32 [[INN_ADDR]], i64 1`
Revert "[SLP]Improve graph reordering." This reverts commit e408d1dfab42b27d0aa51b221e50fa6390fb5ed1 and 2 other (4b25c113210e579a5346ca0abc0717ab1ce5d9df and c2deb2afafee991c06cc96dc5beecb6de448b9fc) related to fix the problem with the reordering shuffles. 2021-08-04 02:30:18 +08:00			`; CHECK-NEXT: [[TMP3:%.]] = bitcast i32 [[INN_ADDR]] to <4 x i32>*`
Revert "Temporarily Revert "Add basic loop fusion pass."" The reversion apparently deleted the test/Transforms directory. Will be re-reverting again. llvm-svn: 358552 2019-04-17 12:52:47 +08:00			`; CHECK-NEXT: [[TMP4:%.]] = load <4 x i32>, <4 x i32> [[TMP3]], align 4`
[SLP]Improve graph reordering. Reworked reordering algorithm. Originally, the compiler just tried to detect the most common order in the reorderable nodes (loads, stores, extractelements, extractvalues) and then fully rebuilt the graph in the best order. This was not efficient, since it required extra memory and time for building/rebuilding the tree and doubled the use of the scheduling budget, which could lead to missed vectorization due to exhausted scheduling resources. The patch provides a 2-way approach to the graph reordering problem. At first, all reordering is done in-place: it does not require tree deleting/rebuilding, it just rotates the scalars/orders/reuses masks in the graph node. The first step (top-to-bottom) rotates the whole graph, similarly to the previous implementation. The compiler counts the number of the most used orders of the graph nodes with the same vectorization factor and then rotates the subgraph with the given vectorization factor to the most used order, if it is not empty. Then it repeats the same procedure for the subgraphs with the smaller vectorization factor. We can do this because we still need to reshuffle the smaller subgraph when building operands for the graph nodes with the larger vectorization factor, so we can rotate just the subgraph, not the whole graph. The second step (bottom-to-top) scans through the leaves and tries to detect the users of the leaves which can be reordered. If the leaves can be reordered in the best fashion, they are reordered and their users too. It allows removing double shuffles to the same ordering of the operands in many cases and just reordering the user operations instead. Plus, it moves the final shuffles closer to the top of the graph and in many cases allows removing an extra shuffle, because the same procedure is repeated again and we can again merge some reordering masks and reorder user nodes instead of the operands. Also, the patch improves the cost model for gathering of loads, which improves the x264 benchmark in some cases. 
Gives about +2% on AVX512 + LTO (more expected for AVX/AVX2) for {625,525}x264, +3% for 508.namd, improves most of the other benchmarks. The compile and link times are almost the same, though in some cases they should be better (we're not doing extra instruction scheduling anymore), and we may vectorize more code for large basic blocks again because of the saved scheduling budget. Differential Revision: https://reviews.llvm.org/D105020 2021-08-04 04:20:32 +08:00			`; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> poison, <4 x i32> <i32 2, i32 0, i32 3, i32 1>`
			`; CHECK-NEXT: [[TMP5:%.*]] = mul <4 x i32> [[TMP2]], [[SHUFFLE]]`
Revert "Temporarily Revert "Add basic loop fusion pass."" The reversion apparently deleted the test/Transforms directory. Will be re-reverting again. llvm-svn: 358552 2019-04-17 12:52:47 +08:00			`; CHECK-NEXT: [[GEP_7:%.]] = getelementptr inbounds i32, i32 [[OUT:%.*]], i64 0`
			`; CHECK-NEXT: [[GEP_8:%.]] = getelementptr inbounds i32, i32 [[OUT]], i64 1`
			`; CHECK-NEXT: [[GEP_9:%.]] = getelementptr inbounds i32, i32 [[OUT]], i64 2`
			`; CHECK-NEXT: [[GEP_10:%.]] = getelementptr inbounds i32, i32 [[OUT]], i64 3`
[SLP]Improve graph reordering. Reworked reordering algorithm. Originally, the compiler just tried to detect the most common order in the reorderable nodes (loads, stores, extractelements, extractvalues) and then fully rebuilt the graph in the best order. This was not efficient, since it required extra memory and time for building/rebuilding the tree and doubled the use of the scheduling budget, which could lead to missed vectorization due to exhausted scheduling resources. The patch provides a 2-way approach to the graph reordering problem. At first, all reordering is done in-place: it does not require tree deleting/rebuilding, it just rotates the scalars/orders/reuses masks in the graph node. The first step (top-to-bottom) rotates the whole graph, similarly to the previous implementation. The compiler counts the number of the most used orders of the graph nodes with the same vectorization factor and then rotates the subgraph with the given vectorization factor to the most used order, if it is not empty. Then it repeats the same procedure for the subgraphs with the smaller vectorization factor. We can do this because we still need to reshuffle the smaller subgraph when building operands for the graph nodes with the larger vectorization factor, so we can rotate just the subgraph, not the whole graph. The second step (bottom-to-top) scans through the leaves and tries to detect the users of the leaves which can be reordered. If the leaves can be reordered in the best fashion, they are reordered and their users too. It allows removing double shuffles to the same ordering of the operands in many cases and just reordering the user operations instead. Plus, it moves the final shuffles closer to the top of the graph and in many cases allows removing an extra shuffle, because the same procedure is repeated again and we can again merge some reordering masks and reorder user nodes instead of the operands. Also, the patch improves the cost model for gathering of loads, which improves the x264 benchmark in some cases. 
Gives about +2% on AVX512 + LTO (more expected for AVX/AVX2) for {625,525}x264, +3% for 508.namd, improves most of the other benchmarks. The compile and link times are almost the same, though in some cases they should be better (we're not doing extra instruction scheduling anymore), and we may vectorize more code for large basic blocks again because of the saved scheduling budget. Differential Revision: https://reviews.llvm.org/D105020 2021-08-04 04:20:32 +08:00			`; CHECK-NEXT: [[SHUFFLE1:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <4 x i32> <i32 1, i32 3, i32 2, i32 0>`
Revert "Temporarily Revert "Add basic loop fusion pass."" The reversion apparently deleted the test/Transforms directory. Will be re-reverting again. llvm-svn: 358552 2019-04-17 12:52:47 +08:00			`; CHECK-NEXT: [[TMP6:%.]] = bitcast i32 [[GEP_7]] to <4 x i32>*`
[SLP]Improve graph reordering. Reworked reordering algorithm. Originally, the compiler just tried to detect the most common order in the reorderable nodes (loads, stores, extractelements, extractvalues) and then fully rebuilt the graph in the best order. This was not efficient, since it required extra memory and time for building/rebuilding the tree and doubled the use of the scheduling budget, which could lead to missed vectorization due to exhausted scheduling resources. The patch provides a 2-way approach to the graph reordering problem. At first, all reordering is done in-place: it does not require tree deleting/rebuilding, it just rotates the scalars/orders/reuses masks in the graph node. The first step (top-to-bottom) rotates the whole graph, similarly to the previous implementation. The compiler counts the number of the most used orders of the graph nodes with the same vectorization factor and then rotates the subgraph with the given vectorization factor to the most used order, if it is not empty. Then it repeats the same procedure for the subgraphs with the smaller vectorization factor. We can do this because we still need to reshuffle the smaller subgraph when building operands for the graph nodes with the larger vectorization factor, so we can rotate just the subgraph, not the whole graph. The second step (bottom-to-top) scans through the leaves and tries to detect the users of the leaves which can be reordered. If the leaves can be reordered in the best fashion, they are reordered and their users too. It allows removing double shuffles to the same ordering of the operands in many cases and just reordering the user operations instead. Plus, it moves the final shuffles closer to the top of the graph and in many cases allows removing an extra shuffle, because the same procedure is repeated again and we can again merge some reordering masks and reorder user nodes instead of the operands. Also, the patch improves the cost model for gathering of loads, which improves the x264 benchmark in some cases. 
Gives about +2% on AVX512 + LTO (more expected for AVX/AVX2) for {625,525}x264, +3% for 508.namd, improves most of the other benchmarks. The compile and link times are almost the same, though in some cases they should be better (we're not doing extra instruction scheduling anymore), and we may vectorize more code for large basic blocks again because of the saved scheduling budget. Differential Revision: https://reviews.llvm.org/D105020 2021-08-04 04:20:32 +08:00			`; CHECK-NEXT: store <4 x i32> [[SHUFFLE1]], <4 x i32>* [[TMP6]], align 4`
Revert "Temporarily Revert "Add basic loop fusion pass."" The reversion apparently deleted the test/Transforms directory. Will be re-reverting again. llvm-svn: 358552 2019-04-17 12:52:47 +08:00			`; CHECK-NEXT: ret i32 undef`
			`;`
			`%in.addr = getelementptr inbounds i32, i32* %in, i64 0`
			`%load.1 = load i32, i32* %in.addr, align 4`
			`%gep.1 = getelementptr inbounds i32, i32* %in.addr, i64 3`
			`%load.2 = load i32, i32* %gep.1, align 4`
			`%gep.2 = getelementptr inbounds i32, i32* %in.addr, i64 1`
			`%load.3 = load i32, i32* %gep.2, align 4`
			`%gep.3 = getelementptr inbounds i32, i32* %in.addr, i64 2`
			`%load.4 = load i32, i32* %gep.3, align 4`
			`%inn.addr = getelementptr inbounds i32, i32* %inn, i64 0`
			`%load.5 = load i32, i32* %inn.addr, align 4`
			`%gep.4 = getelementptr inbounds i32, i32* %inn.addr, i64 2`
			`%load.6 = load i32, i32* %gep.4, align 4`
			`%gep.5 = getelementptr inbounds i32, i32* %inn.addr, i64 3`
			`%load.7 = load i32, i32* %gep.5, align 4`
			`%gep.6 = getelementptr inbounds i32, i32* %inn.addr, i64 1`
			`%load.8 = load i32, i32* %gep.6, align 4`
			`%mul.1 = mul i32 %load.3, %load.5`
			`%mul.2 = mul i32 %load.2, %load.8`
			`%mul.3 = mul i32 %load.4, %load.7`
			`%mul.4 = mul i32 %load.1, %load.6`
			`%gep.7 = getelementptr inbounds i32, i32* %out, i64 0`
			`store i32 %mul.1, i32* %gep.7, align 4`
			`%gep.8 = getelementptr inbounds i32, i32* %out, i64 1`
			`store i32 %mul.2, i32* %gep.8, align 4`
			`%gep.9 = getelementptr inbounds i32, i32* %out, i64 2`
			`store i32 %mul.3, i32* %gep.9, align 4`
			`%gep.10 = getelementptr inbounds i32, i32* %out, i64 3`
			`store i32 %mul.4, i32* %gep.10, align 4`

			`ret i32 undef`
			`}`


			`define i32 @jumbled-load-multiuses(i32* noalias nocapture %in, i32* noalias nocapture %out) {`
			`; CHECK-LABEL: @jumbled-load-multiuses(`
			`; CHECK-NEXT: [[IN_ADDR:%.]] = getelementptr inbounds i32, i32 [[IN:%.*]], i64 0`
			`; CHECK-NEXT: [[GEP_1:%.]] = getelementptr inbounds i32, i32 [[IN_ADDR]], i64 3`
			`; CHECK-NEXT: [[GEP_2:%.]] = getelementptr inbounds i32, i32 [[IN_ADDR]], i64 1`
			`; CHECK-NEXT: [[GEP_3:%.]] = getelementptr inbounds i32, i32 [[IN_ADDR]], i64 2`
			`; CHECK-NEXT: [[TMP1:%.]] = bitcast i32 [[IN_ADDR]] to <4 x i32>*`
			`; CHECK-NEXT: [[TMP2:%.]] = load <4 x i32>, <4 x i32> [[TMP1]], align 4`
[SLP]Improve/fix reordering of the gathered graph nodes. Gathered loads/extractelements/extractvalue instructions should be checked if they can represent a vector reordering node too, and their order should be taken into account for better graph reordering analysis. Also, if the gather node has reused scalars, they must be reordered instead of the scalars themselves. Differential Revision: https://reviews.llvm.org/D112454 2021-10-25 22:32:35 +08:00			`; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32> <i32 2, i32 0, i32 1, i32 3>`
			`; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i32> [[SHUFFLE]], i32 1`
[SLP,LV] Use poison constant vector for shufflevector/initial insertelement This patch makes SLP and LV emit operations with initial vectors set to poison constant instead of undef. This is a part of efforts for using poison vector instead of undef to represent "doesn't care" vector. The goal is to make nice shufflevector optimizations valid that is currently incorrect due to the tricky interaction between undef and poison (see https://bugs.llvm.org/show_bug.cgi?id=44185 ). Reviewed By: fhahn Differential Revision: https://reviews.llvm.org/D94061 2021-01-05 13:11:50 +08:00			`; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> poison, i32 [[TMP3]], i32 0`
[SLP]Improve/fix reordering of the gathered graph nodes. Gathered loads/extractelements/extractvalue instructions should be checked if they can represent a vector reordering node too, and their order should be taken into account for better graph reordering analysis. Also, if the gather node has reused scalars, they must be reordered instead of the scalars themselves. Differential Revision: https://reviews.llvm.org/D112454 2021-10-25 22:32:35 +08:00			`; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[SHUFFLE]], i32 2`
Revert "Temporarily Revert "Add basic loop fusion pass."" The reversion apparently deleted the test/Transforms directory. Will be re-reverting again. llvm-svn: 358552 2019-04-17 12:52:47 +08:00			`; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> [[TMP4]], i32 [[TMP5]], i32 1`
[SLP]Improve/fix reordering of the gathered graph nodes. Gathered loads/extractelements/extractvalue instructions should be checked if they can represent a vector reordering node too, and their order should be taken into account for better graph reordering analysis. Also, if the gather node has reused scalars, they must be reordered instead of the scalars themselves. Differential Revision: https://reviews.llvm.org/D112454 2021-10-25 22:32:35 +08:00			`; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x i32> [[SHUFFLE]], i32 0`
Revert "Temporarily Revert "Add basic loop fusion pass."" The reversion apparently deleted the test/Transforms directory. Will be re-reverting again. llvm-svn: 358552 2019-04-17 12:52:47 +08:00			`; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[TMP7]], i32 2`
[SLP]Improve/fix reordering of the gathered graph nodes. Gathered loads/extractelements/extractvalue instructions should be checked if they can represent a vector reordering node too, and their order should be taken into account for better graph reordering analysis. Also, if the gather node has reused scalars, they must be reordered instead of the scalars themselves. Differential Revision: https://reviews.llvm.org/D112454 2021-10-25 22:32:35 +08:00			`; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x i32> [[SHUFFLE]], i32 3`
Revert "Temporarily Revert "Add basic loop fusion pass."" The reversion apparently deleted the test/Transforms directory. Will be re-reverting again. llvm-svn: 358552 2019-04-17 12:52:47 +08:00			`; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> [[TMP8]], i32 [[TMP9]], i32 3`
[SLP]Improve/fix reordering of the gathered graph nodes. Gathered loads/extractelements/extractvalue instructions should be checked if they can represent a vector reordering node too, and their order should be taken into account for better graph reordering analysis. Also, if the gather node has reused scalars, they must be reordered instead of the scalars themselves. Differential Revision: https://reviews.llvm.org/D112454 2021-10-25 22:32:35 +08:00			`; CHECK-NEXT: [[TMP11:%.*]] = mul <4 x i32> [[SHUFFLE]], [[TMP10]]`
Revert "Temporarily Revert "Add basic loop fusion pass."" The reversion apparently deleted the test/Transforms directory. Will be re-reverting again. llvm-svn: 358552 2019-04-17 12:52:47 +08:00			`; CHECK-NEXT: [[GEP_7:%.]] = getelementptr inbounds i32, i32 [[OUT:%.*]], i64 0`
			`; CHECK-NEXT: [[GEP_8:%.]] = getelementptr inbounds i32, i32 [[OUT]], i64 1`
			`; CHECK-NEXT: [[GEP_9:%.]] = getelementptr inbounds i32, i32 [[OUT]], i64 2`
			`; CHECK-NEXT: [[GEP_10:%.]] = getelementptr inbounds i32, i32 [[OUT]], i64 3`
[SLP]Improve/fix reordering of the gathered graph nodes. Gathered loads/extractelements/extractvalue instructions should be checked if they can represent a vector reordering node too, and their order should be taken into account for better graph reordering analysis. Also, if the gather node has reused scalars, they must be reordered instead of the scalars themselves. Differential Revision: https://reviews.llvm.org/D112454 2021-10-25 22:32:35 +08:00			`; CHECK-NEXT: [[SHUFFLE1:%.*]] = shufflevector <4 x i32> [[TMP11]], <4 x i32> poison, <4 x i32> <i32 2, i32 3, i32 0, i32 1>`
Revert "Temporarily Revert "Add basic loop fusion pass."" The reversion apparently deleted the test/Transforms directory. Will be re-reverting again. llvm-svn: 358552 2019-04-17 12:52:47 +08:00			`; CHECK-NEXT: [[TMP12:%.]] = bitcast i32 [[GEP_7]] to <4 x i32>*`
[SLP]Improve/fix reordering of the gathered graph nodes. Gathered loads/extractelements/extractvalue instructions should be checked if they can represent a vector reordering node too, and their order should be taken into account for better graph reordering analysis. Also, if the gather node has reused scalars, they must be reordered instead of the scalars themselves. Differential Revision: https://reviews.llvm.org/D112454 2021-10-25 22:32:35 +08:00			`; CHECK-NEXT: store <4 x i32> [[SHUFFLE1]], <4 x i32>* [[TMP12]], align 4`
Revert "Temporarily Revert "Add basic loop fusion pass."" The reversion apparently deleted the test/Transforms directory. Will be re-reverting again. llvm-svn: 358552 2019-04-17 12:52:47 +08:00			`; CHECK-NEXT: ret i32 undef`
			`;`
			`%in.addr = getelementptr inbounds i32, i32* %in, i64 0`
			`%load.1 = load i32, i32* %in.addr, align 4`
			`%gep.1 = getelementptr inbounds i32, i32* %in.addr, i64 3`
			`%load.2 = load i32, i32* %gep.1, align 4`
			`%gep.2 = getelementptr inbounds i32, i32* %in.addr, i64 1`
			`%load.3 = load i32, i32* %gep.2, align 4`
			`%gep.3 = getelementptr inbounds i32, i32* %in.addr, i64 2`
			`%load.4 = load i32, i32* %gep.3, align 4`
			`%mul.1 = mul i32 %load.3, %load.4`
			`%mul.2 = mul i32 %load.2, %load.2`
			`%mul.3 = mul i32 %load.4, %load.1`
			`%mul.4 = mul i32 %load.1, %load.3`
			`%gep.7 = getelementptr inbounds i32, i32* %out, i64 0`
			`store i32 %mul.1, i32* %gep.7, align 4`
			`%gep.8 = getelementptr inbounds i32, i32* %out, i64 1`
			`store i32 %mul.2, i32* %gep.8, align 4`
			`%gep.9 = getelementptr inbounds i32, i32* %out, i64 2`
			`store i32 %mul.3, i32* %gep.9, align 4`
			`%gep.10 = getelementptr inbounds i32, i32* %out, i64 3`
			`store i32 %mul.4, i32* %gep.10, align 4`

			`ret i32 undef`
			`}`