; llvm-project/llvm/test/Transforms/InstCombine/x86-insertps.ll

; RUN: opt < %s -instcombine -S | FileCheck %s

; SSE4.1 INSERTPS intrinsic: (first vector, second vector, 8-bit control byte).
; As exercised by the tests in this file, the control byte is laid out as:
;   bits [7:6] - element of the second vector that is inserted
;   bits [5:4] - lane of the first vector that receives the inserted element
;   bits [3:0] - zero mask; a set bit forces the matching result lane to 0.0
declare <4 x float> @llvm.x86.sse41.insertps(<4 x float>, <4 x float>, i8) nounwind readnone

; This should never happen, but make sure we don't crash handling a non-constant immediate byte.

define <4 x float> @insertps_non_const_imm(<4 x float> %v1, <4 x float> %v2, i8 %c) {
; The control byte %c is not a constant, so the intrinsic call is expected to
; survive untouched. The call result is captured in RES so that the operand of
; the return is verified as well, and the label pattern ends in '(' so it
; cannot match a longer function name that shares this prefix.
;
; CHECK-LABEL: @insertps_non_const_imm(
; CHECK-NEXT:  [[RES:%.*]] = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %v1, <4 x float> %v2, i8 %c)
; CHECK-NEXT:  ret <4 x float> [[RES]]
;
  %res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %v1, <4 x float> %v2, i8 %c)
  ret <4 x float> %res
}

; If all zero mask bits are set, return a zero regardless of the other control bits.

define <4 x float> @insertps_0x0f(<4 x float> %v1, <4 x float> %v2) {
; Control byte 15 (0x0f) sets all four zero-mask bits and leaves the other
; control bits clear. The expected output is a bare return of the zero vector
; with no intrinsic call left behind.
;
; CHECK-LABEL: @insertps_0x0f
; CHECK-NEXT:  ret <4 x float> zeroinitializer
;
  %all_zero = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %v1, <4 x float> %v2, i8 15)
  ret <4 x float> %all_zero
}
define <4 x float> @insertps_0xff(<4 x float> %v1, <4 x float> %v2) {
; Control byte 255 (0xff) sets every control bit, including all four zero-mask
; bits. The expected output is still a bare return of the zero vector.
;
; CHECK-LABEL: @insertps_0xff
; CHECK-NEXT:  ret <4 x float> zeroinitializer
;
  %all_zero = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %v1, <4 x float> %v2, i8 255)
  ret <4 x float> %all_zero
}

; If some zero mask bits are set that do not override the insertion, we do not change anything.

define <4 x float> @insertps_0x0c(<4 x float> %v1, <4 x float> %v2) {
; Control byte 12 (0x0c) sets zero-mask bits 2 and 3 only; the remaining
; control bits are clear. The intrinsic call is expected to be left as is.
; The call result is captured in RES so the operand of the return is verified
; too, and the label pattern ends in '(' to avoid matching a longer name.
;
; CHECK-LABEL: @insertps_0x0c(
; CHECK-NEXT:  [[RES:%.*]] = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %v1, <4 x float> %v2, i8 12)
; CHECK-NEXT:  ret <4 x float> [[RES]]
;
  %res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %v1, <4 x float> %v2, i8 12)
  ret <4 x float> %res
}

; ...unless both input vectors are the same operand.

define <4 x float> @insertps_0x15_single_input(<4 x float> %v1) {
; Control byte 21 (0x15) with %v1 used for both vector operands. The expected
; shuffle puts element 0 of %v1 into lane 1, keeps lane 3 of %v1, and takes
; 0.0 from the constant operand for lanes 0 and 2 (zero-mask bits 0 and 2).
; The shuffle result is captured in RES so the operand of the return is
; verified too, and the label pattern ends in '(' to avoid prefix matches.
;
; CHECK-LABEL: @insertps_0x15_single_input(
; CHECK-NEXT:  [[RES:%.*]] = shufflevector <4 x float> %v1, <4 x float> <float 0.000000e+00, float undef, float 0.000000e+00, float undef>, <4 x i32> <i32 4, i32 0, i32 6, i32 3>
; CHECK-NEXT:  ret <4 x float> [[RES]]
;
  %res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %v1, <4 x float> %v1, i8 21)
  ret <4 x float> %res
}

; The zero mask overrides the insertion lane.

define <4 x float> @insertps_0x1a_single_input(<4 x float> %v1) {
; Control byte 26 (0x1a) with %v1 used for both vector operands. Zero-mask
; bits 1 and 3 are set, and lane 1 is also the insertion lane, so the expected
; shuffle keeps lanes 0 and 2 of %v1 and takes 0.0 from the constant operand
; for lanes 1 and 3.
; The shuffle result is captured in RES so the operand of the return is
; verified too, and the label pattern ends in '(' to avoid prefix matches.
;
; CHECK-LABEL: @insertps_0x1a_single_input(
; CHECK-NEXT:  [[RES:%.*]] = shufflevector <4 x float> %v1, <4 x float> <float undef, float 0.000000e+00, float undef, float 0.000000e+00>, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
; CHECK-NEXT:  ret <4 x float> [[RES]]
;
  %res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %v1, <4 x float> %v1, i8 26)
  ret <4 x float> %res
}

; The zero mask overrides the insertion lane, so the second input vector is not used.

define <4 x float> @insertps_0xc1(<4 x float> %v1, <4 x float> %v2) {
; Control byte 193 (0xc1): the insertion targets lane 0, but zero-mask bit 0 is
; set as well, so %v2 does not appear in the expected output. The expected
; shuffle takes 0.0 from the constant operand for lane 0 and keeps lanes 1-3
; of %v1.
; The shuffle result is captured in RES so the operand of the return is
; verified too, and the label pattern ends in '(' to avoid prefix matches.
;
; CHECK-LABEL: @insertps_0xc1(
; CHECK-NEXT:  [[RES:%.*]] = shufflevector <4 x float> %v1, <4 x float> <float 0.000000e+00, float undef, float undef, float undef>, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
; CHECK-NEXT:  ret <4 x float> [[RES]]
;
  %res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %v1, <4 x float> %v2, i8 193)
  ret <4 x float> %res
}

; If no zero mask bits are set, convert to a shuffle.

define <4 x float> @insertps_0x00(<4 x float> %v1, <4 x float> %v2) {
; Control byte 0 (0x00): the expected shuffle places element 0 of %v2
; (shuffle index 4) into lane 0 and keeps lanes 1-3 of %v1.
; The shuffle result is captured in RES so the operand of the return is
; verified too, and the label pattern ends in '(' to avoid prefix matches.
;
; CHECK-LABEL: @insertps_0x00(
; CHECK-NEXT:  [[RES:%.*]] = shufflevector <4 x float> %v1, <4 x float> %v2, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
; CHECK-NEXT:  ret <4 x float> [[RES]]
;
  %res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %v1, <4 x float> %v2, i8 0)
  ret <4 x float> %res
}

define <4 x float> @insertps_0x10(<4 x float> %v1, <4 x float> %v2) {
; Control byte 16 (0x10): the expected shuffle places element 0 of %v2
; (shuffle index 4) into lane 1 and keeps lanes 0, 2 and 3 of %v1.
; The shuffle result is captured in RES so the operand of the return is
; verified too, and the label pattern ends in '(' to avoid prefix matches.
;
; CHECK-LABEL: @insertps_0x10(
; CHECK-NEXT:  [[RES:%.*]] = shufflevector <4 x float> %v1, <4 x float> %v2, <4 x i32> <i32 0, i32 4, i32 2, i32 3>
; CHECK-NEXT:  ret <4 x float> [[RES]]
;
  %res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %v1, <4 x float> %v2, i8 16)
  ret <4 x float> %res
}

define <4 x float> @insertps_0x20(<4 x float> %v1, <4 x float> %v2) {
; Control byte 32 (0x20): the expected shuffle places element 0 of %v2
; (shuffle index 4) into lane 2 and keeps lanes 0, 1 and 3 of %v1.
; The shuffle result is captured in RES so the operand of the return is
; verified too, and the label pattern ends in '(' to avoid prefix matches.
;
; CHECK-LABEL: @insertps_0x20(
; CHECK-NEXT:  [[RES:%.*]] = shufflevector <4 x float> %v1, <4 x float> %v2, <4 x i32> <i32 0, i32 1, i32 4, i32 3>
; CHECK-NEXT:  ret <4 x float> [[RES]]
;
  %res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %v1, <4 x float> %v2, i8 32)
  ret <4 x float> %res
}

define <4 x float> @insertps_0x30(<4 x float> %v1, <4 x float> %v2) {
; Control byte 48 (0x30): the expected shuffle places element 0 of %v2
; (shuffle index 4) into lane 3 and keeps lanes 0-2 of %v1.
; The shuffle result is captured in RES so the operand of the return is
; verified too, and the label pattern ends in '(' to avoid prefix matches.
;
; CHECK-LABEL: @insertps_0x30(
; CHECK-NEXT:  [[RES:%.*]] = shufflevector <4 x float> %v1, <4 x float> %v2, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
; CHECK-NEXT:  ret <4 x float> [[RES]]
;
  %res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %v1, <4 x float> %v2, i8 48)
  ret <4 x float> %res
}

define <4 x float> @insertps_0xc0(<4 x float> %v1, <4 x float> %v2) {
; Control byte 192 (0xc0): the expected shuffle places element 3 of %v2
; (shuffle index 7) into lane 0 and keeps lanes 1-3 of %v1.
; The shuffle result is captured in RES so the operand of the return is
; verified too, and the label pattern ends in '(' to avoid prefix matches.
;
; CHECK-LABEL: @insertps_0xc0(
; CHECK-NEXT:  [[RES:%.*]] = shufflevector <4 x float> %v1, <4 x float> %v2, <4 x i32> <i32 7, i32 1, i32 2, i32 3>
; CHECK-NEXT:  ret <4 x float> [[RES]]
;
  %res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %v1, <4 x float> %v2, i8 192)
  ret <4 x float> %res
}

define <4 x float> @insertps_0xd0(<4 x float> %v1, <4 x float> %v2) {
; Control byte 208 (0xd0): the expected shuffle places element 3 of %v2
; (shuffle index 7) into lane 1 and keeps lanes 0, 2 and 3 of %v1.
; The shuffle result is captured in RES so the operand of the return is
; verified too, and the label pattern ends in '(' to avoid prefix matches.
;
; CHECK-LABEL: @insertps_0xd0(
; CHECK-NEXT:  [[RES:%.*]] = shufflevector <4 x float> %v1, <4 x float> %v2, <4 x i32> <i32 0, i32 7, i32 2, i32 3>
; CHECK-NEXT:  ret <4 x float> [[RES]]
;
  %res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %v1, <4 x float> %v2, i8 208)
  ret <4 x float> %res
}

define <4 x float> @insertps_0xe0(<4 x float> %v1, <4 x float> %v2) {
; Control byte 224 (0xe0): the expected shuffle places element 3 of %v2
; (shuffle index 7) into lane 2 and keeps lanes 0, 1 and 3 of %v1.
; The shuffle result is captured in RES so the operand of the return is
; verified too, and the label pattern ends in '(' to avoid prefix matches.
;
; CHECK-LABEL: @insertps_0xe0(
; CHECK-NEXT:  [[RES:%.*]] = shufflevector <4 x float> %v1, <4 x float> %v2, <4 x i32> <i32 0, i32 1, i32 7, i32 3>
; CHECK-NEXT:  ret <4 x float> [[RES]]
;
  %res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %v1, <4 x float> %v2, i8 224)
  ret <4 x float> %res
}

define <4 x float> @insertps_0xf0(<4 x float> %v1, <4 x float> %v2) {
; Control byte 240 (0xf0): the expected shuffle places element 3 of %v2
; (shuffle index 7) into lane 3 and keeps lanes 0-2 of %v1.
; The shuffle result is captured in RES so the operand of the return is
; verified too, and the label pattern ends in '(' to avoid prefix matches.
;
; CHECK-LABEL: @insertps_0xf0(
; CHECK-NEXT:  [[RES:%.*]] = shufflevector <4 x float> %v1, <4 x float> %v2, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
; CHECK-NEXT:  ret <4 x float> [[RES]]
;
  %res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %v1, <4 x float> %v2, i8 240)
  ret <4 x float> %res
}
[X86, SSE] instcombine common cases of insertps intrinsics into shuffles This is very similar to D8486 / r232852 (vperm2). If we treat insertps intrinsics as shufflevectors, we can optimize them better. I've left all but the full zero case of the zero mask variants out of this patch. I don't think those can be converted into a single shuffle in all cases, but I'd be happy to be proven wrong as I was for vperm2f128. Either way, we'd need to support whatever sequence we come up with for those cases in the backend before converting them here. Differential Revision: http://reviews.llvm.org/D8833 llvm-svn: 235124 2015-04-17 01:52:13 +08:00			`; RUN: opt < %s -instcombine -S \| FileCheck %s`

			`declare <4 x float> @llvm.x86.sse41.insertps(<4 x float>, <4 x float>, i8) nounwind readnone`

			`; This should never happen, but make sure we don't crash handling a non-constant immediate byte.`

			`define <4 x float> @insertps_non_const_imm(<4 x float> %v1, <4 x float> %v2, i8 %c) {`
			`%res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %v1, <4 x float> %v2, i8 %c)`
			`ret <4 x float> %res`

			`; CHECK-LABEL: @insertps_non_const_imm`
			`; CHECK-NEXT: call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %v1, <4 x float> %v2, i8 %c)`
			`; CHECK-NEXT: ret <4 x float>`
			`}`

			`; If all zero mask bits are set, return a zero regardless of the other control bits.`

			`define <4 x float> @insertps_0x0f(<4 x float> %v1, <4 x float> %v2) {`
			`%res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %v1, <4 x float> %v2, i8 15)`
			`ret <4 x float> %res`

			`; CHECK-LABEL: @insertps_0x0f`
			`; CHECK-NEXT: ret <4 x float> zeroinitializer`
			`}`
			`define <4 x float> @insertps_0xff(<4 x float> %v1, <4 x float> %v2) {`
			`%res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %v1, <4 x float> %v2, i8 255)`
			`ret <4 x float> %res`

			`; CHECK-LABEL: @insertps_0xff`
			`; CHECK-NEXT: ret <4 x float> zeroinitializer`
			`}`

[x86] instcombine more cases of insertps into a shufflevector This is a follow-on to D8833 (insertps optimization when the zero mask is not used). In this patch, we check for the case where the zmask is used, but both input vectors to the insertps intrinsic are the same operand or the zmask overrides the destination lane. This lets us replace the 2nd shuffle input operand with the zero vector. Differential Revision: http://reviews.llvm.org/D9257 llvm-svn: 235810 2015-04-26 04:55:25 +08:00			`; If some zero mask bits are set that do not override the insertion, we do not change anything.`
[X86, SSE] instcombine common cases of insertps intrinsics into shuffles This is very similar to D8486 / r232852 (vperm2). If we treat insertps intrinsics as shufflevectors, we can optimize them better. I've left all but the full zero case of the zero mask variants out of this patch. I don't think those can be converted into a single shuffle in all cases, but I'd be happy to be proven wrong as I was for vperm2f128. Either way, we'd need to support whatever sequence we come up with for those cases in the backend before converting them here. Differential Revision: http://reviews.llvm.org/D8833 llvm-svn: 235124 2015-04-17 01:52:13 +08:00
[x86] instcombine more cases of insertps into a shufflevector This is a follow-on to D8833 (insertps optimization when the zero mask is not used). In this patch, we check for the case where the zmask is used, but both input vectors to the insertps intrinsic are the same operand or the zmask overrides the destination lane. This lets us replace the 2nd shuffle input operand with the zero vector. Differential Revision: http://reviews.llvm.org/D9257 llvm-svn: 235810 2015-04-26 04:55:25 +08:00			`define <4 x float> @insertps_0x0c(<4 x float> %v1, <4 x float> %v2) {`
			`%res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %v1, <4 x float> %v2, i8 12)`
[X86, SSE] instcombine common cases of insertps intrinsics into shuffles This is very similar to D8486 / r232852 (vperm2). If we treat insertps intrinsics as shufflevectors, we can optimize them better. I've left all but the full zero case of the zero mask variants out of this patch. I don't think those can be converted into a single shuffle in all cases, but I'd be happy to be proven wrong as I was for vperm2f128. Either way, we'd need to support whatever sequence we come up with for those cases in the backend before converting them here. Differential Revision: http://reviews.llvm.org/D8833 llvm-svn: 235124 2015-04-17 01:52:13 +08:00			`ret <4 x float> %res`

[x86] instcombine more cases of insertps into a shufflevector This is a follow-on to D8833 (insertps optimization when the zero mask is not used). In this patch, we check for the case where the zmask is used, but both input vectors to the insertps intrinsic are the same operand or the zmask overrides the destination lane. This lets us replace the 2nd shuffle input operand with the zero vector. Differential Revision: http://reviews.llvm.org/D9257 llvm-svn: 235810 2015-04-26 04:55:25 +08:00			`; CHECK-LABEL: @insertps_0x0c`
			`; CHECK-NEXT: call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %v1, <4 x float> %v2, i8 12)`
			`; CHECK-NEXT: ret <4 x float>`
			`}`

			`; ...unless both input vectors are the same operand.`

			`define <4 x float> @insertps_0x15_single_input(<4 x float> %v1) {`
			`%res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %v1, <4 x float> %v1, i8 21)`
			`ret <4 x float> %res`

			`; CHECK-LABEL: @insertps_0x15_single_input`
			`; CHECK-NEXT: shufflevector <4 x float> %v1, <4 x float> <float 0.000000e+00, float undef, float 0.000000e+00, float undef>, <4 x i32> <i32 4, i32 0, i32 6, i32 3>`
			`; CHECK-NEXT: ret <4 x float>`
			`}`

			`; The zero mask overrides the insertion lane.`

			`define <4 x float> @insertps_0x1a_single_input(<4 x float> %v1) {`
			`%res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %v1, <4 x float> %v1, i8 26)`
			`ret <4 x float> %res`

			`; CHECK-LABEL: @insertps_0x1a_single_input`
			`; CHECK-NEXT: shufflevector <4 x float> %v1, <4 x float> <float undef, float 0.000000e+00, float undef, float 0.000000e+00>, <4 x i32> <i32 0, i32 5, i32 2, i32 7>`
			`; CHECK-NEXT: ret <4 x float>`
			`}`

			`; The zero mask overrides the insertion lane, so the second input vector is not used.`

			`define <4 x float> @insertps_0xc1(<4 x float> %v1, <4 x float> %v2) {`
			`%res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %v1, <4 x float> %v2, i8 193)`
			`ret <4 x float> %res`

			`; CHECK-LABEL: @insertps_0xc1`
			`; CHECK-NEXT: shufflevector <4 x float> %v1, <4 x float> <float 0.000000e+00, float undef, float undef, float undef>, <4 x i32> <i32 4, i32 1, i32 2, i32 3>`
[X86, SSE] instcombine common cases of insertps intrinsics into shuffles This is very similar to D8486 / r232852 (vperm2). If we treat insertps intrinsics as shufflevectors, we can optimize them better. I've left all but the full zero case of the zero mask variants out of this patch. I don't think those can be converted into a single shuffle in all cases, but I'd be happy to be proven wrong as I was for vperm2f128. Either way, we'd need to support whatever sequence we come up with for those cases in the backend before converting them here. Differential Revision: http://reviews.llvm.org/D8833 llvm-svn: 235124 2015-04-17 01:52:13 +08:00			`; CHECK-NEXT: ret <4 x float>`
			`}`

			`; If no zero mask bits are set, convert to a shuffle.`

			`define <4 x float> @insertps_0x00(<4 x float> %v1, <4 x float> %v2) {`
			`%res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %v1, <4 x float> %v2, i8 0)`
			`ret <4 x float> %res`

			`; CHECK-LABEL: @insertps_0x00`
			`; CHECK-NEXT: shufflevector <4 x float> %v1, <4 x float> %v2, <4 x i32> <i32 4, i32 1, i32 2, i32 3>`
			`; CHECK-NEXT: ret <4 x float>`
			`}`

			`define <4 x float> @insertps_0x10(<4 x float> %v1, <4 x float> %v2) {`
			`%res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %v1, <4 x float> %v2, i8 16)`
			`ret <4 x float> %res`

			`; CHECK-LABEL: @insertps_0x10`
			`; CHECK-NEXT: shufflevector <4 x float> %v1, <4 x float> %v2, <4 x i32> <i32 0, i32 4, i32 2, i32 3>`
			`; CHECK-NEXT: ret <4 x float>`
			`}`

			`define <4 x float> @insertps_0x20(<4 x float> %v1, <4 x float> %v2) {`
			`%res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %v1, <4 x float> %v2, i8 32)`
			`ret <4 x float> %res`

			`; CHECK-LABEL: @insertps_0x20`
			`; CHECK-NEXT: shufflevector <4 x float> %v1, <4 x float> %v2, <4 x i32> <i32 0, i32 1, i32 4, i32 3>`
			`; CHECK-NEXT: ret <4 x float>`
			`}`

			`define <4 x float> @insertps_0x30(<4 x float> %v1, <4 x float> %v2) {`
			`%res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %v1, <4 x float> %v2, i8 48)`
			`ret <4 x float> %res`

			`; CHECK-LABEL: @insertps_0x30`
			`; CHECK-NEXT: shufflevector <4 x float> %v1, <4 x float> %v2, <4 x i32> <i32 0, i32 1, i32 2, i32 4>`
			`; CHECK-NEXT: ret <4 x float>`
			`}`

			`define <4 x float> @insertps_0xc0(<4 x float> %v1, <4 x float> %v2) {`
			`%res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %v1, <4 x float> %v2, i8 192)`
			`ret <4 x float> %res`

			`; CHECK-LABEL: @insertps_0xc0`
			`; CHECK-NEXT: shufflevector <4 x float> %v1, <4 x float> %v2, <4 x i32> <i32 7, i32 1, i32 2, i32 3>`
			`; CHECK-NEXT: ret <4 x float>`
			`}`

			`define <4 x float> @insertps_0xd0(<4 x float> %v1, <4 x float> %v2) {`
			`%res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %v1, <4 x float> %v2, i8 208)`
			`ret <4 x float> %res`

			`; CHECK-LABEL: @insertps_0xd0`
			`; CHECK-NEXT: shufflevector <4 x float> %v1, <4 x float> %v2, <4 x i32> <i32 0, i32 7, i32 2, i32 3>`
			`; CHECK-NEXT: ret <4 x float>`
			`}`

			`define <4 x float> @insertps_0xe0(<4 x float> %v1, <4 x float> %v2) {`
			`%res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %v1, <4 x float> %v2, i8 224)`
			`ret <4 x float> %res`

			`; CHECK-LABEL: @insertps_0xe0`
			`; CHECK-NEXT: shufflevector <4 x float> %v1, <4 x float> %v2, <4 x i32> <i32 0, i32 1, i32 7, i32 3>`
			`; CHECK-NEXT: ret <4 x float>`
			`}`

			`define <4 x float> @insertps_0xf0(<4 x float> %v1, <4 x float> %v2) {`
			`%res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %v1, <4 x float> %v2, i8 240)`
			`ret <4 x float> %res`

			`; CHECK-LABEL: @insertps_0xf0`
			`; CHECK-NEXT: shufflevector <4 x float> %v1, <4 x float> %v2, <4 x i32> <i32 0, i32 1, i32 2, i32 7>`
			`; CHECK-NEXT: ret <4 x float>`
			`}`