; llvm-project/llvm/test/CodeGen/X86/avx-cast.ll

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx  | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2

; Prefer a blend instruction to a vinsertf128/vinserti128 instruction because
; blends are simpler (no lane changes) and therefore will have equal or better
; performance.

; castA - widen a 128-bit float vector to 256 bits with a zeroed upper half.
; Mask indices 0-3 select %m; index 4 selects element 0 of the zero vector.
; Expected lowering is a zeroing xor followed by a single blend.
define <8 x float> @castA(<4 x float> %m) nounwind uwtable readnone ssp {
; AVX-LABEL: castA:
; AVX:       ## BB#0:
; AVX-NEXT:    ## kill: %XMM0<def> %XMM0<kill> %YMM0<def>
; AVX-NEXT:    vxorps %ymm1, %ymm1, %ymm1
; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX-NEXT:    retq
  %widened = shufflevector <4 x float> %m, <4 x float> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 4, i32 4>
  ret <8 x float> %widened
}

; castB - widen a 128-bit double vector to 256 bits with a zeroed upper half.
; Mask indices 0-1 select %m; index 2 selects element 0 of the zero vector.
; Expected lowering is a zeroing xor followed by a single blend.
define <4 x double> @castB(<2 x double> %m) nounwind uwtable readnone ssp {
; AVX-LABEL: castB:
; AVX:       ## BB#0:
; AVX-NEXT:    ## kill: %XMM0<def> %XMM0<kill> %YMM0<def>
; AVX-NEXT:    vxorpd %ymm1, %ymm1, %ymm1
; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
; AVX-NEXT:    retq
  %widened = shufflevector <2 x double> %m, <2 x double> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
  ret <4 x double> %widened
}

; AVX2 is needed for integer blends. With only AVX1 the integer case below is
; expected to use the floating-point xor/blend instead.

; castC - widen a 128-bit i64 vector to 256 bits with a zeroed upper half.
; Mask indices 0-1 select %m; index 2 selects element 0 of the zero vector.
; The two RUN configurations expect different instructions, so the checks
; below are split per prefix rather than shared.
define <4 x i64> @castC(<2 x i64> %m) nounwind uwtable readnone ssp {
; AVX1-LABEL: castC:
; AVX1:       ## BB#0:
; AVX1-NEXT:    ## kill: %XMM0<def> %XMM0<kill> %YMM0<def>
; AVX1-NEXT:    vxorpd %ymm1, %ymm1, %ymm1
; AVX1-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: castC:
; AVX2:       ## BB#0:
; AVX2-NEXT:    ## kill: %XMM0<def> %XMM0<kill> %YMM0<def>
; AVX2-NEXT:    vpxor %ymm1, %ymm1, %ymm1
; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-NEXT:    retq
  %widened = shufflevector <2 x i64> %m, <2 x i64> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
  ret <4 x i64> %widened
}

; The next three tests extract the low 128 bits and don't need any shuffling.
; The autogenerated checks pin the exact sequence (vzeroupper, then retq), so
; any shuffle instruction in the output makes the test fail.

; castD - narrow a 256-bit float vector to its low 128 bits (mask 0-3).
; No shuffle instruction is expected; the checks admit only vzeroupper and
; the return after the register-liveness comment.
define <4 x float> @castD(<8 x float> %m) nounwind uwtable readnone ssp {
; AVX-LABEL: castD:
; AVX:       ## BB#0:
; AVX-NEXT:    ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
  %low = shufflevector <8 x float> %m, <8 x float> %m, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ret <4 x float> %low
}

; castE - narrow a 256-bit i64 vector to its low 128 bits (mask 0-1).
; No shuffle instruction is expected; the checks admit only vzeroupper and
; the return after the register-liveness comment.
define <2 x i64> @castE(<4 x i64> %m) nounwind uwtable readnone ssp {
; AVX-LABEL: castE:
; AVX:       ## BB#0:
; AVX-NEXT:    ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
  %low = shufflevector <4 x i64> %m, <4 x i64> %m, <2 x i32> <i32 0, i32 1>
  ret <2 x i64> %low
}

; castF - narrow a 256-bit double vector to its low 128 bits (mask 0-1).
; No shuffle instruction is expected; the checks admit only vzeroupper and
; the return after the register-liveness comment.
define <2 x double> @castF(<4 x double> %m) nounwind uwtable readnone ssp {
; AVX-LABEL: castF:
; AVX:       ## BB#0:
; AVX-NEXT:    ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
  %low = shufflevector <4 x double> %m, <4 x double> %m, <2 x i32> <i32 0, i32 1>
  ret <2 x double> %low
}
[X86][SSE] Ensure BLENDPD/BLENDPS/PBLEND inputs are both of the correct input type llvm-svn: 256782 2016-01-05 05:41:11 +08:00			`; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py`
			`; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx \| FileCheck %s --check-prefix=AVX --check-prefix=AVX1`
			`; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx2 \| FileCheck %s --check-prefix=AVX --check-prefix=AVX2`
[X86, AVX] use blends instead of insert128 with index 0 Another case of x86-specific shuffle strength reduction: avoid generating insert*128 instructions with index 0 because they are slower than their non-lane-changing blend equivalents. Shuffle lowering already catches most of these cases, but the zero vector case and some other paths such as in the modified test in vector-shuffle-256-v32.ll were getting through. Differential Revision: http://reviews.llvm.org/D8366 llvm-svn: 232773 2015-03-20 06:29:40 +08:00
			`; Prefer a blend instruction to a vinsert128 instruction because blends`
			`; are simpler (no lane changes) and therefore will have equal or better`
			`; performance.`
Add a DAGCombine for transforming 128->256 casts into a simple vxorps + vinsertf128 pair of instructions llvm-svn: 135727 2011-07-22 08:15:00 +08:00
			`define <8 x float> @castA(<4 x float> %m) nounwind uwtable readnone ssp {`
[X86][SSE] Ensure BLENDPD/BLENDPS/PBLEND inputs are both of the correct input type llvm-svn: 256782 2016-01-05 05:41:11 +08:00			`; AVX-LABEL: castA:`
			`; AVX: ## BB#0:`
VirtRegMap: Replace some identity copies with KILL instructions. An identity COPY like this: %AL = COPY %AL, %EAX<imp-def> has no semantic effect, but encodes liveness information: Further users of %EAX only depend on this instruction even though it does not define the full register. Replace the COPY with a KILL instruction in those cases to maintain this liveness information. (This reverts a small part of r238588 but this time adds a comment explaining why a KILL instruction is useful). llvm-svn: 274952 2016-07-09 08:19:07 +08:00			`; AVX-NEXT: ## kill: %XMM0<def> %XMM0<kill> %YMM0<def>`
[X86][SSE] Ensure BLENDPD/BLENDPS/PBLEND inputs are both of the correct input type llvm-svn: 256782 2016-01-05 05:41:11 +08:00			`; AVX-NEXT: vxorps %ymm1, %ymm1, %ymm1`
			`; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]`
			`; AVX-NEXT: retq`
Add a DAGCombine for transforming 128->256 casts into a simple vxorps + vinsertf128 pair of instructions llvm-svn: 135727 2011-07-22 08:15:00 +08:00			`%shuffle.i = shufflevector <4 x float> %m, <4 x float> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 4, i32 4>`
			`ret <8 x float> %shuffle.i`
			`}`

			`define <4 x double> @castB(<2 x double> %m) nounwind uwtable readnone ssp {`
[X86][SSE] Ensure BLENDPD/BLENDPS/PBLEND inputs are both of the correct input type llvm-svn: 256782 2016-01-05 05:41:11 +08:00			`; AVX-LABEL: castB:`
			`; AVX: ## BB#0:`
VirtRegMap: Replace some identity copies with KILL instructions. An identity COPY like this: %AL = COPY %AL, %EAX<imp-def> has no semantic effect, but encodes liveness information: Further users of %EAX only depend on this instruction even though it does not define the full register. Replace the COPY with a KILL instruction in those cases to maintain this liveness information. (This reverts a small part of r238588 but this time adds a comment explaining why a KILL instruction is useful). llvm-svn: 274952 2016-07-09 08:19:07 +08:00			`; AVX-NEXT: ## kill: %XMM0<def> %XMM0<kill> %YMM0<def>`
[X86][SSE] Ensure BLENDPD/BLENDPS/PBLEND inputs are both of the correct input type llvm-svn: 256782 2016-01-05 05:41:11 +08:00			`; AVX-NEXT: vxorpd %ymm1, %ymm1, %ymm1`
			`; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]`
			`; AVX-NEXT: retq`
Add a DAGCombine for transforming 128->256 casts into a simple vxorps + vinsertf128 pair of instructions llvm-svn: 135727 2011-07-22 08:15:00 +08:00			`%shuffle.i = shufflevector <2 x double> %m, <2 x double> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 2>`
			`ret <4 x double> %shuffle.i`
			`}`

[X86, AVX] use blends instead of insert128 with index 0 Another case of x86-specific shuffle strength reduction: avoid generating insert*128 instructions with index 0 because they are slower than their non-lane-changing blend equivalents. Shuffle lowering already catches most of these cases, but the zero vector case and some other paths such as in the modified test in vector-shuffle-256-v32.ll were getting through. Differential Revision: http://reviews.llvm.org/D8366 llvm-svn: 232773 2015-03-20 06:29:40 +08:00			`; AVX2 is needed for integer types.`

Add a DAGCombine for transforming 128->256 casts into a simple vxorps + vinsertf128 pair of instructions llvm-svn: 135727 2011-07-22 08:15:00 +08:00			`define <4 x i64> @castC(<2 x i64> %m) nounwind uwtable readnone ssp {`
[X86, AVX] use blends instead of insert128 with index 0 Another case of x86-specific shuffle strength reduction: avoid generating insert*128 instructions with index 0 because they are slower than their non-lane-changing blend equivalents. Shuffle lowering already catches most of these cases, but the zero vector case and some other paths such as in the modified test in vector-shuffle-256-v32.ll were getting through. Differential Revision: http://reviews.llvm.org/D8366 llvm-svn: 232773 2015-03-20 06:29:40 +08:00			`; AVX1-LABEL: castC:`
[X86][SSE] Ensure BLENDPD/BLENDPS/PBLEND inputs are both of the correct input type llvm-svn: 256782 2016-01-05 05:41:11 +08:00			`; AVX1: ## BB#0:`
VirtRegMap: Replace some identity copies with KILL instructions. An identity COPY like this: %AL = COPY %AL, %EAX<imp-def> has no semantic effect, but encodes liveness information: Further users of %EAX only depend on this instruction even though it does not define the full register. Replace the COPY with a KILL instruction in those cases to maintain this liveness information. (This reverts a small part of r238588 but this time adds a comment explaining why a KILL instruction is useful). llvm-svn: 274952 2016-07-09 08:19:07 +08:00			`; AVX1-NEXT: ## kill: %XMM0<def> %XMM0<kill> %YMM0<def>`
[SelectionDAG] Generalised the CONCAT_VECTORS creation to support BUILD_VECTOR and UNDEF folding. llvm-svn: 258646 2016-01-24 06:27:54 +08:00			`; AVX1-NEXT: vxorpd %ymm1, %ymm1, %ymm1`
			`; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]`
[X86, AVX] use blends instead of insert128 with index 0 Another case of x86-specific shuffle strength reduction: avoid generating insert*128 instructions with index 0 because they are slower than their non-lane-changing blend equivalents. Shuffle lowering already catches most of these cases, but the zero vector case and some other paths such as in the modified test in vector-shuffle-256-v32.ll were getting through. Differential Revision: http://reviews.llvm.org/D8366 llvm-svn: 232773 2015-03-20 06:29:40 +08:00			`; AVX1-NEXT: retq`
			`;`
			`; AVX2-LABEL: castC:`
[X86][SSE] Ensure BLENDPD/BLENDPS/PBLEND inputs are both of the correct input type llvm-svn: 256782 2016-01-05 05:41:11 +08:00			`; AVX2: ## BB#0:`
VirtRegMap: Replace some identity copies with KILL instructions. An identity COPY like this: %AL = COPY %AL, %EAX<imp-def> has no semantic effect, but encodes liveness information: Further users of %EAX only depend on this instruction even though it does not define the full register. Replace the COPY with a KILL instruction in those cases to maintain this liveness information. (This reverts a small part of r238588 but this time adds a comment explaining why a KILL instruction is useful). llvm-svn: 274952 2016-07-09 08:19:07 +08:00			`; AVX2-NEXT: ## kill: %XMM0<def> %XMM0<kill> %YMM0<def>`
[X86][SSE] Ensure BLENDPD/BLENDPS/PBLEND inputs are both of the correct input type llvm-svn: 256782 2016-01-05 05:41:11 +08:00			`; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1`
[X86, AVX] use blends instead of insert128 with index 0 Another case of x86-specific shuffle strength reduction: avoid generating insert*128 instructions with index 0 because they are slower than their non-lane-changing blend equivalents. Shuffle lowering already catches most of these cases, but the zero vector case and some other paths such as in the modified test in vector-shuffle-256-v32.ll were getting through. Differential Revision: http://reviews.llvm.org/D8366 llvm-svn: 232773 2015-03-20 06:29:40 +08:00			`; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]`
			`; AVX2-NEXT: retq`
Add a DAGCombine for transforming 128->256 casts into a simple vxorps + vinsertf128 pair of instructions llvm-svn: 135727 2011-07-22 08:15:00 +08:00			`%shuffle.i = shufflevector <2 x i64> %m, <2 x i64> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 2>`
			`ret <4 x i64> %shuffle.i`
			`}`

[X86, AVX] use blends instead of insert128 with index 0 Another case of x86-specific shuffle strength reduction: avoid generating insert*128 instructions with index 0 because they are slower than their non-lane-changing blend equivalents. Shuffle lowering already catches most of these cases, but the zero vector case and some other paths such as in the modified test in vector-shuffle-256-v32.ll were getting through. Differential Revision: http://reviews.llvm.org/D8366 llvm-svn: 232773 2015-03-20 06:29:40 +08:00			`; The next three tests don't need any shuffling. There may or may not be a`
			`; vzeroupper before the return, so just check for the absence of shuffles.`

Although we already support this, add testcases for consistency llvm-svn: 135728 2011-07-22 08:15:03 +08:00			`define <4 x float> @castD(<8 x float> %m) nounwind uwtable readnone ssp {`
[X86][SSE] Ensure BLENDPD/BLENDPS/PBLEND inputs are both of the correct input type llvm-svn: 256782 2016-01-05 05:41:11 +08:00			`; AVX-LABEL: castD:`
			`; AVX: ## BB#0:`
VirtRegMap: Replace some identity copies with KILL instructions. An identity COPY like this: %AL = COPY %AL, %EAX<imp-def> has no semantic effect, but encodes liveness information: Further users of %EAX only depend on this instruction even though it does not define the full register. Replace the COPY with a KILL instruction in those cases to maintain this liveness information. (This reverts a small part of r238588 but this time adds a comment explaining why a KILL instruction is useful). llvm-svn: 274952 2016-07-09 08:19:07 +08:00			`; AVX-NEXT: ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill>`
[X86][SSE] Ensure BLENDPD/BLENDPS/PBLEND inputs are both of the correct input type llvm-svn: 256782 2016-01-05 05:41:11 +08:00			`; AVX-NEXT: vzeroupper`
			`; AVX-NEXT: retq`
Although we already support this, add testcases for consistency llvm-svn: 135728 2011-07-22 08:15:03 +08:00			`%shuffle.i = shufflevector <8 x float> %m, <8 x float> %m, <4 x i32> <i32 0, i32 1, i32 2, i32 3>`
			`ret <4 x float> %shuffle.i`
			`}`

			`define <2 x i64> @castE(<4 x i64> %m) nounwind uwtable readnone ssp {`
[X86][SSE] Ensure BLENDPD/BLENDPS/PBLEND inputs are both of the correct input type llvm-svn: 256782 2016-01-05 05:41:11 +08:00			`; AVX-LABEL: castE:`
			`; AVX: ## BB#0:`
VirtRegMap: Replace some identity copies with KILL instructions. An identity COPY like this: %AL = COPY %AL, %EAX<imp-def> has no semantic effect, but encodes liveness information: Further users of %EAX only depend on this instruction even though it does not define the full register. Replace the COPY with a KILL instruction in those cases to maintain this liveness information. (This reverts a small part of r238588 but this time adds a comment explaining why a KILL instruction is useful). llvm-svn: 274952 2016-07-09 08:19:07 +08:00			`; AVX-NEXT: ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill>`
[X86][SSE] Ensure BLENDPD/BLENDPS/PBLEND inputs are both of the correct input type llvm-svn: 256782 2016-01-05 05:41:11 +08:00			`; AVX-NEXT: vzeroupper`
			`; AVX-NEXT: retq`
Although we already support this, add testcases for consistency llvm-svn: 135728 2011-07-22 08:15:03 +08:00			`%shuffle.i = shufflevector <4 x i64> %m, <4 x i64> %m, <2 x i32> <i32 0, i32 1>`
			`ret <2 x i64> %shuffle.i`
			`}`

			`define <2 x double> @castF(<4 x double> %m) nounwind uwtable readnone ssp {`
[X86][SSE] Ensure BLENDPD/BLENDPS/PBLEND inputs are both of the correct input type llvm-svn: 256782 2016-01-05 05:41:11 +08:00			`; AVX-LABEL: castF:`
			`; AVX: ## BB#0:`
VirtRegMap: Replace some identity copies with KILL instructions. An identity COPY like this: %AL = COPY %AL, %EAX<imp-def> has no semantic effect, but encodes liveness information: Further users of %EAX only depend on this instruction even though it does not define the full register. Replace the COPY with a KILL instruction in those cases to maintain this liveness information. (This reverts a small part of r238588 but this time adds a comment explaining why a KILL instruction is useful). llvm-svn: 274952 2016-07-09 08:19:07 +08:00			`; AVX-NEXT: ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill>`
[X86][SSE] Ensure BLENDPD/BLENDPS/PBLEND inputs are both of the correct input type llvm-svn: 256782 2016-01-05 05:41:11 +08:00			`; AVX-NEXT: vzeroupper`
			`; AVX-NEXT: retq`
Although we already support this, add testcases for consistency llvm-svn: 135728 2011-07-22 08:15:03 +08:00			`%shuffle.i = shufflevector <4 x double> %m, <4 x double> %m, <2 x i32> <i32 0, i32 1>`
			`ret <2 x double> %shuffle.i`
			`}`