llvm-project/llvm/test/CodeGen/X86/avx-vbroadcast.ll

993 lines
35 KiB
LLVM

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+avx | FileCheck %s --check-prefix=X32
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx | FileCheck %s --check-prefix=X64
define <4 x i64> @A(i64* %ptr) nounwind uwtable readnone ssp {
; X32-LABEL: A:
; X32: ## %bb.0: ## %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vbroadcastsd (%eax), %ymm0
; X32-NEXT: retl
;
; X64-LABEL: A:
; X64: ## %bb.0: ## %entry
; X64-NEXT: vbroadcastsd (%rdi), %ymm0
; X64-NEXT: retq
entry:
%q = load i64, i64* %ptr, align 8
%vecinit.i = insertelement <4 x i64> undef, i64 %q, i32 0
%vecinit2.i = insertelement <4 x i64> %vecinit.i, i64 %q, i32 1
%vecinit4.i = insertelement <4 x i64> %vecinit2.i, i64 %q, i32 2
%vecinit6.i = insertelement <4 x i64> %vecinit4.i, i64 %q, i32 3
ret <4 x i64> %vecinit6.i
}
define <4 x i64> @A2(i64* %ptr, i64* %ptr2) nounwind uwtable readnone ssp {
; X32-LABEL: A2:
; X32: ## %bb.0: ## %entry
; X32-NEXT: pushl %esi
; X32-NEXT: .cfi_def_cfa_offset 8
; X32-NEXT: .cfi_offset %esi, -8
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: movl (%ecx), %edx
; X32-NEXT: movl 4(%ecx), %esi
; X32-NEXT: vbroadcastsd (%ecx), %ymm0
; X32-NEXT: movl %edx, (%eax)
; X32-NEXT: movl %esi, 4(%eax)
; X32-NEXT: popl %esi
; X32-NEXT: retl
;
; X64-LABEL: A2:
; X64: ## %bb.0: ## %entry
; X64-NEXT: movq (%rdi), %rax
; X64-NEXT: movq %rax, (%rsi)
; X64-NEXT: vmovq %rax, %xmm0
; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-NEXT: retq
entry:
%q = load i64, i64* %ptr, align 8
store i64 %q, i64* %ptr2, align 8 ; to create a chain to prevent broadcast
%vecinit.i = insertelement <4 x i64> undef, i64 %q, i32 0
%vecinit2.i = insertelement <4 x i64> %vecinit.i, i64 %q, i32 1
%vecinit4.i = insertelement <4 x i64> %vecinit2.i, i64 %q, i32 2
%vecinit6.i = insertelement <4 x i64> %vecinit4.i, i64 %q, i32 3
ret <4 x i64> %vecinit6.i
}
define <8 x i32> @B(i32* %ptr) nounwind uwtable readnone ssp {
; X32-LABEL: B:
; X32: ## %bb.0: ## %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vbroadcastss (%eax), %ymm0
; X32-NEXT: retl
;
; X64-LABEL: B:
; X64: ## %bb.0: ## %entry
; X64-NEXT: vbroadcastss (%rdi), %ymm0
; X64-NEXT: retq
entry:
%q = load i32, i32* %ptr, align 4
%vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0
%vecinit2.i = insertelement <8 x i32> %vecinit.i, i32 %q, i32 1
%vecinit4.i = insertelement <8 x i32> %vecinit2.i, i32 %q, i32 2
%vecinit6.i = insertelement <8 x i32> %vecinit4.i, i32 %q, i32 3
ret <8 x i32> %vecinit6.i
}
define <8 x i32> @B2(i32* %ptr) nounwind uwtable readnone ssp {
; X32-LABEL: B2:
; X32: ## %bb.0: ## %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vbroadcastss (%eax), %ymm0
; X32-NEXT: retl
;
; X64-LABEL: B2:
; X64: ## %bb.0: ## %entry
; X64-NEXT: vbroadcastss (%rdi), %ymm0
; X64-NEXT: retq
entry:
%q = load i32, i32* %ptr, align 4
%vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0
%vecinit2.i = insertelement <8 x i32> %vecinit.i, i32 %q, i32 1
%vecinit4.i = insertelement <8 x i32> %vecinit2.i, i32 %q, i32 2
%vecinit6.i = insertelement <8 x i32> %vecinit4.i, i32 %q, i32 3
%vecinit8.i = insertelement <8 x i32> %vecinit6.i, i32 %q, i32 4
%vecinit10.i = insertelement <8 x i32> %vecinit8.i, i32 %q, i32 5
%vecinit12.i = insertelement <8 x i32> %vecinit10.i, i32 %q, i32 6
%vecinit14.i = insertelement <8 x i32> %vecinit12.i, i32 %q, i32 7
ret <8 x i32> %vecinit14.i
}
define <8 x i32> @B3(i32* %ptr, i32* %ptr2) nounwind uwtable readnone ssp {
; X32-LABEL: B3:
; X32: ## %bb.0: ## %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: movl (%ecx), %ecx
; X32-NEXT: movl %ecx, (%eax)
; X32-NEXT: vmovd %ecx, %xmm0
; X32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X32-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: B3:
; X64: ## %bb.0: ## %entry
; X64-NEXT: movl (%rdi), %eax
; X64-NEXT: movl %eax, (%rsi)
; X64-NEXT: vmovd %eax, %xmm0
; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-NEXT: retq
entry:
%q = load i32, i32* %ptr, align 4
store i32 %q, i32* %ptr2, align 4 ; to create a chain to prevent broadcast
%vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0
%vecinit2.i = insertelement <8 x i32> %vecinit.i, i32 %q, i32 1
%vecinit4.i = insertelement <8 x i32> %vecinit2.i, i32 %q, i32 2
%vecinit6.i = insertelement <8 x i32> %vecinit4.i, i32 %q, i32 3
%vecinit8.i = insertelement <8 x i32> %vecinit6.i, i32 %q, i32 4
%vecinit10.i = insertelement <8 x i32> %vecinit8.i, i32 %q, i32 5
%vecinit12.i = insertelement <8 x i32> %vecinit10.i, i32 %q, i32 6
%vecinit14.i = insertelement <8 x i32> %vecinit12.i, i32 %q, i32 7
ret <8 x i32> %vecinit14.i
}
define <4 x double> @C(double* %ptr) nounwind uwtable readnone ssp {
; X32-LABEL: C:
; X32: ## %bb.0: ## %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vbroadcastsd (%eax), %ymm0
; X32-NEXT: retl
;
; X64-LABEL: C:
; X64: ## %bb.0: ## %entry
; X64-NEXT: vbroadcastsd (%rdi), %ymm0
; X64-NEXT: retq
entry:
%q = load double, double* %ptr, align 8
%vecinit.i = insertelement <4 x double> undef, double %q, i32 0
%vecinit2.i = insertelement <4 x double> %vecinit.i, double %q, i32 1
%vecinit4.i = insertelement <4 x double> %vecinit2.i, double %q, i32 2
%vecinit6.i = insertelement <4 x double> %vecinit4.i, double %q, i32 3
ret <4 x double> %vecinit6.i
}
define <4 x double> @C2(double* %ptr, double* %ptr2) nounwind uwtable readnone ssp {
; X32-LABEL: C2:
; X32: ## %bb.0: ## %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; X32-NEXT: vmovsd %xmm0, (%eax)
; X32-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; X32-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: C2:
; X64: ## %bb.0: ## %entry
; X64-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; X64-NEXT: vmovsd %xmm0, (%rsi)
; X64-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-NEXT: retq
entry:
%q = load double, double* %ptr, align 8
store double %q, double* %ptr2, align 8 ; to create a chain to prevent broadcast
%vecinit.i = insertelement <4 x double> undef, double %q, i32 0
%vecinit2.i = insertelement <4 x double> %vecinit.i, double %q, i32 1
%vecinit4.i = insertelement <4 x double> %vecinit2.i, double %q, i32 2
%vecinit6.i = insertelement <4 x double> %vecinit4.i, double %q, i32 3
ret <4 x double> %vecinit6.i
}
define <8 x float> @D(float* %ptr) nounwind uwtable readnone ssp {
; X32-LABEL: D:
; X32: ## %bb.0: ## %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vbroadcastss (%eax), %ymm0
; X32-NEXT: retl
;
; X64-LABEL: D:
; X64: ## %bb.0: ## %entry
; X64-NEXT: vbroadcastss (%rdi), %ymm0
; X64-NEXT: retq
entry:
%q = load float, float* %ptr, align 4
%vecinit.i = insertelement <8 x float> undef, float %q, i32 0
%vecinit2.i = insertelement <8 x float> %vecinit.i, float %q, i32 1
%vecinit4.i = insertelement <8 x float> %vecinit2.i, float %q, i32 2
%vecinit6.i = insertelement <8 x float> %vecinit4.i, float %q, i32 3
ret <8 x float> %vecinit6.i
}
define <8 x float> @D2(float* %ptr) nounwind uwtable readnone ssp {
; X32-LABEL: D2:
; X32: ## %bb.0: ## %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vbroadcastss (%eax), %ymm0
; X32-NEXT: retl
;
; X64-LABEL: D2:
; X64: ## %bb.0: ## %entry
; X64-NEXT: vbroadcastss (%rdi), %ymm0
; X64-NEXT: retq
entry:
%q = load float, float* %ptr, align 4
%vecinit.i = insertelement <8 x float> undef, float %q, i32 0
%vecinit2.i = insertelement <8 x float> %vecinit.i, float %q, i32 1
%vecinit4.i = insertelement <8 x float> %vecinit2.i, float %q, i32 2
%vecinit6.i = insertelement <8 x float> %vecinit4.i, float %q, i32 3
%vecinit8.i = insertelement <8 x float> %vecinit6.i, float %q, i32 4
%vecinit10.i = insertelement <8 x float> %vecinit8.i, float %q, i32 5
%vecinit12.i = insertelement <8 x float> %vecinit10.i, float %q, i32 6
%vecinit14.i = insertelement <8 x float> %vecinit12.i, float %q, i32 7
ret <8 x float> %vecinit14.i
}
define <8 x float> @D3(float* %ptr, float* %ptr2) nounwind uwtable readnone ssp {
; X32-LABEL: D3:
; X32: ## %bb.0: ## %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32-NEXT: vmovss %xmm0, (%eax)
; X32-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X32-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: D3:
; X64: ## %bb.0: ## %entry
; X64-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-NEXT: vmovss %xmm0, (%rsi)
; X64-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-NEXT: retq
entry:
%q = load float, float* %ptr, align 4
store float %q, float* %ptr2, align 4 ; to create a chain to prevent broadcast
%vecinit.i = insertelement <8 x float> undef, float %q, i32 0
%vecinit2.i = insertelement <8 x float> %vecinit.i, float %q, i32 1
%vecinit4.i = insertelement <8 x float> %vecinit2.i, float %q, i32 2
%vecinit6.i = insertelement <8 x float> %vecinit4.i, float %q, i32 3
%vecinit8.i = insertelement <8 x float> %vecinit6.i, float %q, i32 4
%vecinit10.i = insertelement <8 x float> %vecinit8.i, float %q, i32 5
%vecinit12.i = insertelement <8 x float> %vecinit10.i, float %q, i32 6
%vecinit14.i = insertelement <8 x float> %vecinit12.i, float %q, i32 7
ret <8 x float> %vecinit14.i
}
;;;; 128-bit versions
define <4 x float> @e(float* %ptr) nounwind uwtable readnone ssp {
; X32-LABEL: e:
; X32: ## %bb.0: ## %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vbroadcastss (%eax), %xmm0
; X32-NEXT: retl
;
; X64-LABEL: e:
; X64: ## %bb.0: ## %entry
; X64-NEXT: vbroadcastss (%rdi), %xmm0
; X64-NEXT: retq
entry:
%q = load float, float* %ptr, align 4
%vecinit.i = insertelement <4 x float> undef, float %q, i32 0
%vecinit2.i = insertelement <4 x float> %vecinit.i, float %q, i32 1
%vecinit4.i = insertelement <4 x float> %vecinit2.i, float %q, i32 2
%vecinit6.i = insertelement <4 x float> %vecinit4.i, float %q, i32 3
ret <4 x float> %vecinit6.i
}
define <4 x float> @e2(float* %ptr, float* %ptr2) nounwind uwtable readnone ssp {
; X32-LABEL: e2:
; X32: ## %bb.0: ## %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32-NEXT: vmovss %xmm0, (%eax)
; X32-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X32-NEXT: retl
;
; X64-LABEL: e2:
; X64: ## %bb.0: ## %entry
; X64-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-NEXT: vmovss %xmm0, (%rsi)
; X64-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X64-NEXT: retq
entry:
%q = load float, float* %ptr, align 4
store float %q, float* %ptr2, align 4 ; to create a chain to prevent broadcast
%vecinit.i = insertelement <4 x float> undef, float %q, i32 0
%vecinit2.i = insertelement <4 x float> %vecinit.i, float %q, i32 1
%vecinit4.i = insertelement <4 x float> %vecinit2.i, float %q, i32 2
%vecinit6.i = insertelement <4 x float> %vecinit4.i, float %q, i32 3
ret <4 x float> %vecinit6.i
}
; Don't broadcast constants on pre-AVX2 hardware.
define <4 x float> @_e2(float* %ptr) nounwind uwtable readnone ssp {
; X32-LABEL: _e2:
; X32: ## %bb.0: ## %entry
; X32-NEXT: vmovaps {{.*#+}} xmm0 = [-7.8125E-3,-7.8125E-3,-7.8125E-3,-7.8125E-3]
; X32-NEXT: retl
;
; X64-LABEL: _e2:
; X64: ## %bb.0: ## %entry
; X64-NEXT: vmovaps {{.*#+}} xmm0 = [-7.8125E-3,-7.8125E-3,-7.8125E-3,-7.8125E-3]
; X64-NEXT: retq
entry:
%vecinit.i = insertelement <4 x float> undef, float 0xbf80000000000000, i32 0
%vecinit2.i = insertelement <4 x float> %vecinit.i, float 0xbf80000000000000, i32 1
%vecinit4.i = insertelement <4 x float> %vecinit2.i, float 0xbf80000000000000, i32 2
%vecinit6.i = insertelement <4 x float> %vecinit4.i, float 0xbf80000000000000, i32 3
ret <4 x float> %vecinit6.i
}
define <4 x i32> @F(i32* %ptr) nounwind uwtable readnone ssp {
; X32-LABEL: F:
; X32: ## %bb.0: ## %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vbroadcastss (%eax), %xmm0
; X32-NEXT: retl
;
; X64-LABEL: F:
; X64: ## %bb.0: ## %entry
; X64-NEXT: vbroadcastss (%rdi), %xmm0
; X64-NEXT: retq
entry:
%q = load i32, i32* %ptr, align 4
%vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0
%vecinit2.i = insertelement <4 x i32> %vecinit.i, i32 %q, i32 1
%vecinit4.i = insertelement <4 x i32> %vecinit2.i, i32 %q, i32 2
%vecinit6.i = insertelement <4 x i32> %vecinit4.i, i32 %q, i32 3
ret <4 x i32> %vecinit6.i
}
define <4 x i32> @F2(i32* %ptr, i32* %ptr2) nounwind uwtable readnone ssp {
; X32-LABEL: F2:
; X32: ## %bb.0: ## %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: movl (%ecx), %ecx
; X32-NEXT: movl %ecx, (%eax)
; X32-NEXT: vmovd %ecx, %xmm0
; X32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X32-NEXT: retl
;
; X64-LABEL: F2:
; X64: ## %bb.0: ## %entry
; X64-NEXT: movl (%rdi), %eax
; X64-NEXT: movl %eax, (%rsi)
; X64-NEXT: vmovd %eax, %xmm0
; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X64-NEXT: retq
entry:
%q = load i32, i32* %ptr, align 4
store i32 %q, i32* %ptr2, align 4 ; to create a chain to prevent broadcast
%vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0
%vecinit2.i = insertelement <4 x i32> %vecinit.i, i32 %q, i32 1
%vecinit4.i = insertelement <4 x i32> %vecinit2.i, i32 %q, i32 2
%vecinit6.i = insertelement <4 x i32> %vecinit4.i, i32 %q, i32 3
ret <4 x i32> %vecinit6.i
}
; FIXME: Pointer adjusted broadcasts
define <4 x i32> @load_splat_4i32_4i32_1111(<4 x i32>* %ptr) nounwind uwtable readnone ssp {
; X32-LABEL: load_splat_4i32_4i32_1111:
; X32: ## %bb.0: ## %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vpermilps {{.*#+}} xmm0 = mem[1,1,1,1]
; X32-NEXT: retl
;
; X64-LABEL: load_splat_4i32_4i32_1111:
; X64: ## %bb.0: ## %entry
; X64-NEXT: vpermilps {{.*#+}} xmm0 = mem[1,1,1,1]
; X64-NEXT: retq
entry:
%ld = load <4 x i32>, <4 x i32>* %ptr
%ret = shufflevector <4 x i32> %ld, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
ret <4 x i32> %ret
}
define <8 x i32> @load_splat_8i32_4i32_33333333(<4 x i32>* %ptr) nounwind uwtable readnone ssp {
; X32-LABEL: load_splat_8i32_4i32_33333333:
; X32: ## %bb.0: ## %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vbroadcastss 12(%eax), %ymm0
; X32-NEXT: retl
;
; X64-LABEL: load_splat_8i32_4i32_33333333:
; X64: ## %bb.0: ## %entry
; X64-NEXT: vbroadcastss 12(%rdi), %ymm0
; X64-NEXT: retq
entry:
%ld = load <4 x i32>, <4 x i32>* %ptr
%ret = shufflevector <4 x i32> %ld, <4 x i32> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
ret <8 x i32> %ret
}
define <8 x i32> @load_splat_8i32_8i32_55555555(<8 x i32>* %ptr) nounwind uwtable readnone ssp {
; X32-LABEL: load_splat_8i32_8i32_55555555:
; X32: ## %bb.0: ## %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vbroadcastss 20(%eax), %ymm0
; X32-NEXT: retl
;
; X64-LABEL: load_splat_8i32_8i32_55555555:
; X64: ## %bb.0: ## %entry
; X64-NEXT: vbroadcastss 20(%rdi), %ymm0
; X64-NEXT: retq
entry:
%ld = load <8 x i32>, <8 x i32>* %ptr
%ret = shufflevector <8 x i32> %ld, <8 x i32> undef, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
ret <8 x i32> %ret
}
define <4 x float> @load_splat_4f32_4f32_1111(<4 x float>* %ptr) nounwind uwtable readnone ssp {
; X32-LABEL: load_splat_4f32_4f32_1111:
; X32: ## %bb.0: ## %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vbroadcastss 4(%eax), %xmm0
; X32-NEXT: retl
;
; X64-LABEL: load_splat_4f32_4f32_1111:
; X64: ## %bb.0: ## %entry
; X64-NEXT: vbroadcastss 4(%rdi), %xmm0
; X64-NEXT: retq
entry:
%ld = load <4 x float>, <4 x float>* %ptr
%ret = shufflevector <4 x float> %ld, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
ret <4 x float> %ret
}
define <8 x float> @load_splat_8f32_4f32_33333333(<4 x float>* %ptr) nounwind uwtable readnone ssp {
; X32-LABEL: load_splat_8f32_4f32_33333333:
; X32: ## %bb.0: ## %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vbroadcastss 12(%eax), %ymm0
; X32-NEXT: retl
;
; X64-LABEL: load_splat_8f32_4f32_33333333:
; X64: ## %bb.0: ## %entry
; X64-NEXT: vbroadcastss 12(%rdi), %ymm0
; X64-NEXT: retq
entry:
%ld = load <4 x float>, <4 x float>* %ptr
%ret = shufflevector <4 x float> %ld, <4 x float> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
ret <8 x float> %ret
}
define <8 x float> @load_splat_8f32_8f32_55555555(<8 x float>* %ptr) nounwind uwtable readnone ssp {
; X32-LABEL: load_splat_8f32_8f32_55555555:
; X32: ## %bb.0: ## %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vbroadcastss 20(%eax), %ymm0
; X32-NEXT: retl
;
; X64-LABEL: load_splat_8f32_8f32_55555555:
; X64: ## %bb.0: ## %entry
; X64-NEXT: vbroadcastss 20(%rdi), %ymm0
; X64-NEXT: retq
entry:
%ld = load <8 x float>, <8 x float>* %ptr
%ret = shufflevector <8 x float> %ld, <8 x float> undef, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
ret <8 x float> %ret
}
define <2 x i64> @load_splat_2i64_2i64_1111(<2 x i64>* %ptr) nounwind uwtable readnone ssp {
; X32-LABEL: load_splat_2i64_2i64_1111:
; X32: ## %bb.0: ## %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vpermilps {{.*#+}} xmm0 = mem[2,3,2,3]
; X32-NEXT: retl
;
; X64-LABEL: load_splat_2i64_2i64_1111:
; X64: ## %bb.0: ## %entry
; X64-NEXT: vpermilps {{.*#+}} xmm0 = mem[2,3,2,3]
; X64-NEXT: retq
entry:
%ld = load <2 x i64>, <2 x i64>* %ptr
%ret = shufflevector <2 x i64> %ld, <2 x i64> undef, <2 x i32> <i32 1, i32 1>
ret <2 x i64> %ret
}
define <4 x i64> @load_splat_4i64_2i64_1111(<2 x i64>* %ptr) nounwind uwtable readnone ssp {
; X32-LABEL: load_splat_4i64_2i64_1111:
; X32: ## %bb.0: ## %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vbroadcastsd 8(%eax), %ymm0
; X32-NEXT: retl
;
; X64-LABEL: load_splat_4i64_2i64_1111:
; X64: ## %bb.0: ## %entry
; X64-NEXT: vbroadcastsd 8(%rdi), %ymm0
; X64-NEXT: retq
entry:
%ld = load <2 x i64>, <2 x i64>* %ptr
%ret = shufflevector <2 x i64> %ld, <2 x i64> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
ret <4 x i64> %ret
}
define <4 x i64> @load_splat_4i64_4i64_2222(<4 x i64>* %ptr) nounwind uwtable readnone ssp {
; X32-LABEL: load_splat_4i64_4i64_2222:
; X32: ## %bb.0: ## %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vbroadcastsd 16(%eax), %ymm0
; X32-NEXT: retl
;
; X64-LABEL: load_splat_4i64_4i64_2222:
; X64: ## %bb.0: ## %entry
; X64-NEXT: vbroadcastsd 16(%rdi), %ymm0
; X64-NEXT: retq
entry:
%ld = load <4 x i64>, <4 x i64>* %ptr
%ret = shufflevector <4 x i64> %ld, <4 x i64> undef, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
ret <4 x i64> %ret
}
define <2 x double> @load_splat_2f64_2f64_1111(<2 x double>* %ptr) nounwind uwtable readnone ssp {
; X32-LABEL: load_splat_2f64_2f64_1111:
; X32: ## %bb.0: ## %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
; X32-NEXT: retl
;
; X64-LABEL: load_splat_2f64_2f64_1111:
; X64: ## %bb.0: ## %entry
; X64-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
; X64-NEXT: retq
entry:
%ld = load <2 x double>, <2 x double>* %ptr
%ret = shufflevector <2 x double> %ld, <2 x double> undef, <2 x i32> <i32 1, i32 1>
ret <2 x double> %ret
}
define <4 x double> @load_splat_4f64_2f64_1111(<2 x double>* %ptr) nounwind uwtable readnone ssp {
; X32-LABEL: load_splat_4f64_2f64_1111:
; X32: ## %bb.0: ## %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vbroadcastsd 8(%eax), %ymm0
; X32-NEXT: retl
;
; X64-LABEL: load_splat_4f64_2f64_1111:
; X64: ## %bb.0: ## %entry
; X64-NEXT: vbroadcastsd 8(%rdi), %ymm0
; X64-NEXT: retq
entry:
%ld = load <2 x double>, <2 x double>* %ptr
%ret = shufflevector <2 x double> %ld, <2 x double> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
ret <4 x double> %ret
}
define <4 x double> @load_splat_4f64_4f64_2222(<4 x double>* %ptr) nounwind uwtable readnone ssp {
; X32-LABEL: load_splat_4f64_4f64_2222:
; X32: ## %bb.0: ## %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vbroadcastsd 16(%eax), %ymm0
; X32-NEXT: retl
;
; X64-LABEL: load_splat_4f64_4f64_2222:
; X64: ## %bb.0: ## %entry
; X64-NEXT: vbroadcastsd 16(%rdi), %ymm0
; X64-NEXT: retq
entry:
%ld = load <4 x double>, <4 x double>* %ptr
%ret = shufflevector <4 x double> %ld, <4 x double> undef, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
ret <4 x double> %ret
}
; Unsupported vbroadcasts
define <2 x i64> @G(i64* %ptr) nounwind uwtable readnone ssp {
; X32-LABEL: G:
; X32: ## %bb.0: ## %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
; X32-NEXT: retl
;
; X64-LABEL: G:
; X64: ## %bb.0: ## %entry
; X64-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
; X64-NEXT: retq
entry:
%q = load i64, i64* %ptr, align 8
%vecinit.i = insertelement <2 x i64> undef, i64 %q, i32 0
%vecinit2.i = insertelement <2 x i64> %vecinit.i, i64 %q, i32 1
ret <2 x i64> %vecinit2.i
}
define <2 x i64> @G2(i64* %ptr, i64* %ptr2) nounwind uwtable readnone ssp {
; X32-LABEL: G2:
; X32: ## %bb.0: ## %entry
; X32-NEXT: pushl %esi
; X32-NEXT: .cfi_def_cfa_offset 8
; X32-NEXT: .cfi_offset %esi, -8
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: movl (%ecx), %edx
; X32-NEXT: movl 4(%ecx), %esi
; X32-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
; X32-NEXT: movl %edx, (%eax)
; X32-NEXT: movl %esi, 4(%eax)
; X32-NEXT: popl %esi
; X32-NEXT: retl
;
; X64-LABEL: G2:
; X64: ## %bb.0: ## %entry
; X64-NEXT: movq (%rdi), %rax
; X64-NEXT: movq %rax, (%rsi)
; X64-NEXT: vmovq %rax, %xmm0
; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; X64-NEXT: retq
entry:
%q = load i64, i64* %ptr, align 8
store i64 %q, i64* %ptr2, align 8 ; to create a chain to prevent broadcast
%vecinit.i = insertelement <2 x i64> undef, i64 %q, i32 0
%vecinit2.i = insertelement <2 x i64> %vecinit.i, i64 %q, i32 1
ret <2 x i64> %vecinit2.i
}
define <4 x i32> @H(<4 x i32> %a) {
; X32-LABEL: H:
; X32: ## %bb.0: ## %entry
; X32-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,2,3]
; X32-NEXT: retl
;
; X64-LABEL: H:
; X64: ## %bb.0: ## %entry
; X64-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,2,3]
; X64-NEXT: retq
entry:
%x = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
ret <4 x i32> %x
}
define <2 x double> @I(double* %ptr) nounwind uwtable readnone ssp {
; X32-LABEL: I:
; X32: ## %bb.0: ## %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
; X32-NEXT: retl
;
; X64-LABEL: I:
; X64: ## %bb.0: ## %entry
; X64-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
; X64-NEXT: retq
entry:
%q = load double, double* %ptr, align 4
%vecinit.i = insertelement <2 x double> undef, double %q, i32 0
%vecinit2.i = insertelement <2 x double> %vecinit.i, double %q, i32 1
ret <2 x double> %vecinit2.i
}
define <2 x double> @I2(double* %ptr, double* %ptr2) nounwind uwtable readnone ssp {
; X32-LABEL: I2:
; X32: ## %bb.0: ## %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; X32-NEXT: vmovsd %xmm0, (%eax)
; X32-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; X32-NEXT: retl
;
; X64-LABEL: I2:
; X64: ## %bb.0: ## %entry
; X64-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; X64-NEXT: vmovsd %xmm0, (%rsi)
; X64-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; X64-NEXT: retq
entry:
%q = load double, double* %ptr, align 4
store double %q, double* %ptr2, align 4 ; to create a chain to prevent broadcast
%vecinit.i = insertelement <2 x double> undef, double %q, i32 0
%vecinit2.i = insertelement <2 x double> %vecinit.i, double %q, i32 1
ret <2 x double> %vecinit2.i
}
define <4 x float> @_RR(float* %ptr, i32* %k) nounwind uwtable readnone ssp {
; X32-LABEL: _RR:
; X32: ## %bb.0: ## %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: vbroadcastss (%ecx), %xmm0
; X32-NEXT: movl (%eax), %eax
; X32-NEXT: movl %eax, (%eax)
; X32-NEXT: retl
;
; X64-LABEL: _RR:
; X64: ## %bb.0: ## %entry
; X64-NEXT: vbroadcastss (%rdi), %xmm0
; X64-NEXT: movl (%rsi), %eax
; X64-NEXT: movl %eax, (%rax)
; X64-NEXT: retq
entry:
%q = load float, float* %ptr, align 4
%vecinit.i = insertelement <4 x float> undef, float %q, i32 0
%vecinit2.i = insertelement <4 x float> %vecinit.i, float %q, i32 1
%vecinit4.i = insertelement <4 x float> %vecinit2.i, float %q, i32 2
%vecinit6.i = insertelement <4 x float> %vecinit4.i, float %q, i32 3
; force a chain
%j = load i32, i32* %k, align 4
store i32 %j, i32* undef
ret <4 x float> %vecinit6.i
}
define <4 x float> @_RR2(float* %ptr, i32* %k) nounwind uwtable readnone ssp {
; X32-LABEL: _RR2:
; X32: ## %bb.0: ## %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vbroadcastss (%eax), %xmm0
; X32-NEXT: retl
;
; X64-LABEL: _RR2:
; X64: ## %bb.0: ## %entry
; X64-NEXT: vbroadcastss (%rdi), %xmm0
; X64-NEXT: retq
entry:
%q = load float, float* %ptr, align 4
%v = insertelement <4 x float> undef, float %q, i32 0
%t = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> zeroinitializer
ret <4 x float> %t
}
; These tests check that a vbroadcast instruction is used when we have a splat
; formed from a concat_vectors (via the shufflevector) of two BUILD_VECTORs
; (via the insertelements).
define <8 x float> @splat_concat1(float* %p) {
; X32-LABEL: splat_concat1:
; X32: ## %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vbroadcastss (%eax), %ymm0
; X32-NEXT: retl
;
; X64-LABEL: splat_concat1:
; X64: ## %bb.0:
; X64-NEXT: vbroadcastss (%rdi), %ymm0
; X64-NEXT: retq
%1 = load float, float* %p, align 4
%2 = insertelement <4 x float> undef, float %1, i32 0
%3 = insertelement <4 x float> %2, float %1, i32 1
%4 = insertelement <4 x float> %3, float %1, i32 2
%5 = insertelement <4 x float> %4, float %1, i32 3
%6 = shufflevector <4 x float> %5, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
ret <8 x float> %6
}
define <8 x float> @splat_concat2(float* %p) {
; X32-LABEL: splat_concat2:
; X32: ## %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vbroadcastss (%eax), %ymm0
; X32-NEXT: retl
;
; X64-LABEL: splat_concat2:
; X64: ## %bb.0:
; X64-NEXT: vbroadcastss (%rdi), %ymm0
; X64-NEXT: retq
%1 = load float, float* %p, align 4
%2 = insertelement <4 x float> undef, float %1, i32 0
%3 = insertelement <4 x float> %2, float %1, i32 1
%4 = insertelement <4 x float> %3, float %1, i32 2
%5 = insertelement <4 x float> %4, float %1, i32 3
%6 = insertelement <4 x float> undef, float %1, i32 0
%7 = insertelement <4 x float> %6, float %1, i32 1
%8 = insertelement <4 x float> %7, float %1, i32 2
%9 = insertelement <4 x float> %8, float %1, i32 3
%10 = shufflevector <4 x float> %5, <4 x float> %9, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
ret <8 x float> %10
}
define <4 x double> @splat_concat3(double* %p) {
; X32-LABEL: splat_concat3:
; X32: ## %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vbroadcastsd (%eax), %ymm0
; X32-NEXT: retl
;
; X64-LABEL: splat_concat3:
; X64: ## %bb.0:
; X64-NEXT: vbroadcastsd (%rdi), %ymm0
; X64-NEXT: retq
%1 = load double, double* %p, align 8
%2 = insertelement <2 x double> undef, double %1, i32 0
%3 = insertelement <2 x double> %2, double %1, i32 1
%4 = shufflevector <2 x double> %3, <2 x double> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
ret <4 x double> %4
}
define <4 x double> @splat_concat4(double* %p) {
; X32-LABEL: splat_concat4:
; X32: ## %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vbroadcastsd (%eax), %ymm0
; X32-NEXT: retl
;
; X64-LABEL: splat_concat4:
; X64: ## %bb.0:
; X64-NEXT: vbroadcastsd (%rdi), %ymm0
; X64-NEXT: retq
%1 = load double, double* %p, align 8
%2 = insertelement <2 x double> undef, double %1, i32 0
%3 = insertelement <2 x double> %2, double %1, i32 1
%4 = insertelement <2 x double> undef, double %1, i32 0
%5 = insertelement <2 x double> %2, double %1, i32 1
%6 = shufflevector <2 x double> %3, <2 x double> %5, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
ret <4 x double> %6
}
; PR34041
define <4 x double> @broadcast_shuffle_1000(double* %p) {
; X32-LABEL: broadcast_shuffle_1000:
; X32: ## %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vbroadcastsd (%eax), %ymm0
; X32-NEXT: retl
;
; X64-LABEL: broadcast_shuffle_1000:
; X64: ## %bb.0:
; X64-NEXT: vbroadcastsd (%rdi), %ymm0
; X64-NEXT: retq
%1 = load double, double* %p
%2 = insertelement <2 x double> undef, double %1, i32 0
%3 = shufflevector <2 x double> %2, <2 x double> undef, <4 x i32> <i32 1, i32 0, i32 0, i32 0>
ret <4 x double> %3
}
define <4 x double> @broadcast_shuffle1032(double* %p) {
; X32-LABEL: broadcast_shuffle1032:
; X32: ## %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vbroadcastsd (%eax), %ymm0
; X32-NEXT: retl
;
; X64-LABEL: broadcast_shuffle1032:
; X64: ## %bb.0:
; X64-NEXT: vbroadcastsd (%rdi), %ymm0
; X64-NEXT: retq
%1 = load double, double* %p
%2 = insertelement <2 x double> undef, double %1, i32 1
%3 = insertelement <2 x double> undef, double %1, i32 0
%4 = shufflevector <2 x double> %2, <2 x double> %3, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
ret <4 x double> %4
}
define void @broadcast_v16i32(i32* %a, <16 x i32>* %b) {
; X32-LABEL: broadcast_v16i32:
; X32: ## %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: vbroadcastss (%ecx), %ymm0
; X32-NEXT: vmovups %ymm0, 32(%eax)
; X32-NEXT: vmovups %ymm0, (%eax)
; X32-NEXT: vzeroupper
; X32-NEXT: retl
;
; X64-LABEL: broadcast_v16i32:
; X64: ## %bb.0:
; X64-NEXT: vbroadcastss (%rdi), %ymm0
; X64-NEXT: vmovups %ymm0, 32(%rsi)
; X64-NEXT: vmovups %ymm0, (%rsi)
; X64-NEXT: vzeroupper
; X64-NEXT: retq
%1 = load i32, i32* %a, align 4
%2 = insertelement <8 x i32> undef, i32 %1, i32 0
%3 = shufflevector <8 x i32> %2, <8 x i32> undef, <8 x i32> zeroinitializer
%4 = shufflevector <8 x i32> undef, <8 x i32> %3, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
store <16 x i32> %4, <16 x i32>* %b, align 4
ret void
}
;
; Broadcast scale factor for xyz vector - slp will have vectorized xy.
; FIXME: Load as a broadcast and then use the scalar 0'th element.
;
define double @broadcast_scale_xyz(double* nocapture readonly, double* nocapture readonly) nounwind {
; X32-LABEL: broadcast_scale_xyz:
; X32: ## %bb.0:
; X32-NEXT: subl $12, %esp
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; X32-NEXT: vmovddup {{.*#+}} xmm1 = xmm0[0,0]
; X32-NEXT: vmulpd (%eax), %xmm1, %xmm1
; X32-NEXT: vmulsd 16(%eax), %xmm0, %xmm0
; X32-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; X32-NEXT: vaddsd %xmm2, %xmm1, %xmm1
; X32-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; X32-NEXT: vmovsd %xmm0, (%esp)
; X32-NEXT: fldl (%esp)
; X32-NEXT: addl $12, %esp
; X32-NEXT: retl
;
; X64-LABEL: broadcast_scale_xyz:
; X64: ## %bb.0:
; X64-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; X64-NEXT: vmovddup {{.*#+}} xmm1 = xmm0[0,0]
; X64-NEXT: vmulpd (%rsi), %xmm1, %xmm1
; X64-NEXT: vmulsd 16(%rsi), %xmm0, %xmm0
; X64-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; X64-NEXT: vaddsd %xmm2, %xmm1, %xmm1
; X64-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; X64-NEXT: retq
%3 = bitcast double* %1 to <2 x double>*
%4 = load <2 x double>, <2 x double>* %3, align 8
%5 = getelementptr inbounds double, double* %1, i64 2
%6 = load double, double* %5, align 8
%7 = load double, double* %0, align 8
%8 = insertelement <2 x double> undef, double %7, i32 0
%9 = shufflevector <2 x double> %8, <2 x double> undef, <2 x i32> zeroinitializer
%10 = fmul <2 x double> %4, %9
%11 = fmul double %6, %7
%12 = extractelement <2 x double> %10, i32 0
%13 = extractelement <2 x double> %10, i32 1
%14 = fadd double %12, %13
%15 = fadd double %11, %14
ret double %15
}
;
; When VBROADCAST replaces an existing load, ensure it still respects lifetime dependencies.
;
define float @broadcast_lifetime() nounwind {
; X32-LABEL: broadcast_lifetime:
; X32: ## %bb.0:
; X32-NEXT: pushl %esi
; X32-NEXT: subl $40, %esp
; X32-NEXT: leal {{[0-9]+}}(%esp), %esi
; X32-NEXT: movl %esi, (%esp)
; X32-NEXT: calll _gfunc
; X32-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32-NEXT: vmovss %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X32-NEXT: movl %esi, (%esp)
; X32-NEXT: calll _gfunc
; X32-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32-NEXT: vsubss {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 ## 4-byte Folded Reload
; X32-NEXT: vmovss %xmm0, {{[0-9]+}}(%esp)
; X32-NEXT: flds {{[0-9]+}}(%esp)
; X32-NEXT: addl $40, %esp
; X32-NEXT: popl %esi
; X32-NEXT: retl
;
; X64-LABEL: broadcast_lifetime:
; X64: ## %bb.0:
; X64-NEXT: subq $40, %rsp
; X64-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
; X64-NEXT: callq _gfunc
; X64-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill
; X64-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
; X64-NEXT: callq _gfunc
; X64-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-NEXT: vsubss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 ## 4-byte Folded Reload
; X64-NEXT: addq $40, %rsp
; X64-NEXT: retq
%1 = alloca <4 x float>, align 16
%2 = alloca <4 x float>, align 16
%3 = bitcast <4 x float>* %1 to i8*
%4 = bitcast <4 x float>* %2 to i8*
call void @llvm.lifetime.start.p0i8(i64 16, i8* %3)
call void @gfunc(<4 x float>* %1)
%5 = load <4 x float>, <4 x float>* %1, align 16
call void @llvm.lifetime.end.p0i8(i64 16, i8* %3)
call void @llvm.lifetime.start.p0i8(i64 16, i8* %4)
call void @gfunc(<4 x float>* %2)
%6 = load <4 x float>, <4 x float>* %2, align 16
call void @llvm.lifetime.end.p0i8(i64 16, i8* %4)
%7 = extractelement <4 x float> %5, i32 1
%8 = extractelement <4 x float> %6, i32 1
%9 = fsub float %8, %7
ret float %9
}
declare void @gfunc(<4 x float>*)
declare void @llvm.lifetime.start.p0i8(i64, i8*)
declare void @llvm.lifetime.end.p0i8(i64, i8*)