; llvm-project/llvm/test/CodeGen/X86/fold-load-binops.ll

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+sse2 < %s | FileCheck %s --check-prefix=SSE
; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+avx < %s | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+avx512f < %s | FileCheck %s --check-prefix=AVX --check-prefix=AVX512

; Verify that we're folding the load into the math instruction.
; This pattern is generated out of the simplest intrinsics usage:
;  _mm_add_ss(a, _mm_load_ss(b));

; @addss - single-precision add of lane 0 of %va with a float loaded from %pb.
; The sum is written back into lane 0; lanes 1-3 of %va pass through unchanged.
; Expected codegen per the assertions below: with +sse2 and +avx the load is
; folded into the arithmetic instruction's memory operand, while the +avx512f
; run still expects a separate vmovss followed by a register-register vaddss.
define <4 x float> @addss(<4 x float> %va, float* %pb) {
; SSE-LABEL: addss:
; SSE:       # BB#0:
; SSE-NEXT:    addss (%rdi), %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: addss:
; AVX1:       # BB#0:
; AVX1-NEXT:    vaddss (%rdi), %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX512-LABEL: addss:
; AVX512:       # BB#0:
; AVX512-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; AVX512-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    retq
    %lhs = extractelement <4 x float> %va, i32 0 ; scalar taken from lane 0
    %rhs = load float, float* %pb ; the load that should be folded
    %sum = fadd float %lhs, %rhs
    %res = insertelement <4 x float> %va, float %sum, i32 0 ; result replaces lane 0 only
    ret <4 x float> %res
}

; @addsd - double-precision add of lane 0 of %va with a double loaded from %pb.
; The sum is written back into lane 0; lane 1 of %va passes through unchanged.
; Expected codegen per the assertions below: with +sse2 and +avx the load is
; folded into the arithmetic instruction's memory operand, while the +avx512f
; run still expects a separate vmovsd followed by a register-register vaddsd.
define <2 x double> @addsd(<2 x double> %va, double* %pb) {
; SSE-LABEL: addsd:
; SSE:       # BB#0:
; SSE-NEXT:    addsd (%rdi), %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: addsd:
; AVX1:       # BB#0:
; AVX1-NEXT:    vaddsd (%rdi), %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX512-LABEL: addsd:
; AVX512:       # BB#0:
; AVX512-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
; AVX512-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    retq
    %lhs = extractelement <2 x double> %va, i32 0 ; scalar taken from lane 0
    %rhs = load double, double* %pb ; the load that should be folded
    %sum = fadd double %lhs, %rhs
    %res = insertelement <2 x double> %va, double %sum, i32 0 ; result replaces lane 0 only
    ret <2 x double> %res
}

; @subss - single-precision subtract: lane 0 of %va minus a float loaded from %pb.
; The difference is written back into lane 0; lanes 1-3 of %va are unchanged.
; Expected codegen per the assertions below: with +sse2 and +avx the load is
; folded into the arithmetic instruction's memory operand, while the +avx512f
; run still expects a separate vmovss followed by a register-register vsubss.
define <4 x float> @subss(<4 x float> %va, float* %pb) {
; SSE-LABEL: subss:
; SSE:       # BB#0:
; SSE-NEXT:    subss (%rdi), %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: subss:
; AVX1:       # BB#0:
; AVX1-NEXT:    vsubss (%rdi), %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX512-LABEL: subss:
; AVX512:       # BB#0:
; AVX512-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; AVX512-NEXT:    vsubss %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    retq
    %lane0 = extractelement <4 x float> %va, i32 0 ; minuend: lane 0 of the vector
    %mem = load float, float* %pb ; subtrahend: the load that should be folded
    %diff = fsub float %lane0, %mem
    %out = insertelement <4 x float> %va, float %diff, i32 0 ; result replaces lane 0 only
    ret <4 x float> %out
}

; @subsd - double-precision subtract: lane 0 of %va minus a double loaded from %pb.
; The difference is written back into lane 0; lane 1 of %va is unchanged.
; Expected codegen per the assertions below: with +sse2 and +avx the load is
; folded into the arithmetic instruction's memory operand, while the +avx512f
; run still expects a separate vmovsd followed by a register-register vsubsd.
define <2 x double> @subsd(<2 x double> %va, double* %pb) {
; SSE-LABEL: subsd:
; SSE:       # BB#0:
; SSE-NEXT:    subsd (%rdi), %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: subsd:
; AVX1:       # BB#0:
; AVX1-NEXT:    vsubsd (%rdi), %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX512-LABEL: subsd:
; AVX512:       # BB#0:
; AVX512-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
; AVX512-NEXT:    vsubsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    retq
    %lane0 = extractelement <2 x double> %va, i32 0 ; minuend: lane 0 of the vector
    %mem = load double, double* %pb ; subtrahend: the load that should be folded
    %diff = fsub double %lane0, %mem
    %out = insertelement <2 x double> %va, double %diff, i32 0 ; result replaces lane 0 only
    ret <2 x double> %out
}

; @mulss - single-precision multiply of lane 0 of %va by a float loaded from %pb.
; The product is written back into lane 0; lanes 1-3 of %va are unchanged.
; Expected codegen per the assertions below: with +sse2 and +avx the load is
; folded into the arithmetic instruction's memory operand, while the +avx512f
; run still expects a separate vmovss followed by a register-register vmulss.
define <4 x float> @mulss(<4 x float> %va, float* %pb) {
; SSE-LABEL: mulss:
; SSE:       # BB#0:
; SSE-NEXT:    mulss (%rdi), %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: mulss:
; AVX1:       # BB#0:
; AVX1-NEXT:    vmulss (%rdi), %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX512-LABEL: mulss:
; AVX512:       # BB#0:
; AVX512-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; AVX512-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    retq
    %elt = extractelement <4 x float> %va, i32 0 ; scalar taken from lane 0
    %factor = load float, float* %pb ; the load that should be folded
    %prod = fmul float %elt, %factor
    %vec = insertelement <4 x float> %va, float %prod, i32 0 ; result replaces lane 0 only
    ret <4 x float> %vec
}

; @mulsd - double-precision multiply of lane 0 of %va by a double loaded from %pb.
; The product is written back into lane 0; lane 1 of %va is unchanged.
; Expected codegen per the assertions below: with +sse2 and +avx the load is
; folded into the arithmetic instruction's memory operand, while the +avx512f
; run still expects a separate vmovsd followed by a register-register vmulsd.
define <2 x double> @mulsd(<2 x double> %va, double* %pb) {
; SSE-LABEL: mulsd:
; SSE:       # BB#0:
; SSE-NEXT:    mulsd (%rdi), %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: mulsd:
; AVX1:       # BB#0:
; AVX1-NEXT:    vmulsd (%rdi), %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX512-LABEL: mulsd:
; AVX512:       # BB#0:
; AVX512-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
; AVX512-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    retq
    %elt = extractelement <2 x double> %va, i32 0 ; scalar taken from lane 0
    %factor = load double, double* %pb ; the load that should be folded
    %prod = fmul double %elt, %factor
    %vec = insertelement <2 x double> %va, double %prod, i32 0 ; result replaces lane 0 only
    ret <2 x double> %vec
}

; @divss - single-precision divide: lane 0 of %va divided by a float loaded from %pb.
; The quotient is written back into lane 0; lanes 1-3 of %va are unchanged.
; Expected codegen per the assertions below: with +sse2 and +avx the load is
; folded into the arithmetic instruction's memory operand, while the +avx512f
; run still expects a separate vmovss followed by a register-register vdivss.
define <4 x float> @divss(<4 x float> %va, float* %pb) {
; SSE-LABEL: divss:
; SSE:       # BB#0:
; SSE-NEXT:    divss (%rdi), %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: divss:
; AVX1:       # BB#0:
; AVX1-NEXT:    vdivss (%rdi), %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX512-LABEL: divss:
; AVX512:       # BB#0:
; AVX512-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; AVX512-NEXT:    vdivss %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    retq
    %num = extractelement <4 x float> %va, i32 0 ; dividend: lane 0 of the vector
    %den = load float, float* %pb ; divisor: the load that should be folded
    %quot = fdiv float %num, %den
    %packed = insertelement <4 x float> %va, float %quot, i32 0 ; result replaces lane 0 only
    ret <4 x float> %packed
}

; @divsd - double-precision divide: lane 0 of %va divided by a double loaded from %pb.
; The quotient is written back into lane 0; lane 1 of %va is unchanged.
; Expected codegen per the assertions below: with +sse2 and +avx the load is
; folded into the arithmetic instruction's memory operand, while the +avx512f
; run still expects a separate vmovsd followed by a register-register vdivsd.
define <2 x double> @divsd(<2 x double> %va, double* %pb) {
; SSE-LABEL: divsd:
; SSE:       # BB#0:
; SSE-NEXT:    divsd (%rdi), %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: divsd:
; AVX1:       # BB#0:
; AVX1-NEXT:    vdivsd (%rdi), %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX512-LABEL: divsd:
; AVX512:       # BB#0:
; AVX512-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
; AVX512-NEXT:    vdivsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    retq
    %num = extractelement <2 x double> %va, i32 0 ; dividend: lane 0 of the vector
    %den = load double, double* %pb ; divisor: the load that should be folded
    %quot = fdiv double %num, %den
    %packed = insertelement <2 x double> %va, double %quot, i32 0 ; result replaces lane 0 only
    ret <2 x double> %packed
}
Make utils/update_llc_test_checks.py note that the assertions are autogenerated. Also update existing test cases which appear to be generated by it and weren't modified (other than addition of the header) by rerunning it. llvm-svn: 253917 2015-11-24 05:33:58 +08:00			`; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py`
[X86] Teach load folding to accept scalar _Int users of MOVSS/MOVSD. The _Int instructions are special, in that they operate on the full VR128 instead of FR32. The load folding then looks at MOVSS, at the user, and bails out when it sees a size mismatch. What we really know is that the rm_Int instructions don't load the higher lanes, so folding is fine. This happens for the straightforward intrinsic code, e.g.: _mm_add_ss(a, _mm_load_ss(p)); Fixes PR23349. Differential Revision: http://reviews.llvm.org/D10554 llvm-svn: 240326 2015-06-23 04:51:51 +08:00			`; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+sse2 < %s \| FileCheck %s --check-prefix=SSE`
[AVX512] Add AVX512 run lines to some tests for scalar fma/add/sub/mul/div and regenerate. Follow up commits will bring AVX512 code up to the same quality as AVX/SSE. llvm-svn: 277118 2016-07-29 14:05:58 +08:00			`; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+avx < %s \| FileCheck %s --check-prefix=AVX --check-prefix=AVX1`
			`; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+avx512f < %s \| FileCheck %s --check-prefix=AVX --check-prefix=AVX512`
[X86] Teach load folding to accept scalar _Int users of MOVSS/MOVSD. The _Int instructions are special, in that they operate on the full VR128 instead of FR32. The load folding then looks at MOVSS, at the user, and bails out when it sees a size mismatch. What we really know is that the rm_Int instructions don't load the higher lanes, so folding is fine. This happens for the straightforward intrinsic code, e.g.: _mm_add_ss(a, _mm_load_ss(p)); Fixes PR23349. Differential Revision: http://reviews.llvm.org/D10554 llvm-svn: 240326 2015-06-23 04:51:51 +08:00
			`; Verify that we're folding the load into the math instruction.`
			`; This pattern is generated out of the simplest intrinsics usage:`
			`; _mm_add_ss(a, _mm_load_ss(b));`

			`define <4 x float> @addss(<4 x float> %va, float* %pb) {`
			`; SSE-LABEL: addss:`
			`; SSE: # BB#0:`
			`; SSE-NEXT: addss (%rdi), %xmm0`
			`; SSE-NEXT: retq`
			`;`
[AVX512] Add AVX512 run lines to some tests for scalar fma/add/sub/mul/div and regenerate. Follow up commits will bring AVX512 code up to the same quality as AVX/SSE. llvm-svn: 277118 2016-07-29 14:05:58 +08:00			`; AVX1-LABEL: addss:`
			`; AVX1: # BB#0:`
			`; AVX1-NEXT: vaddss (%rdi), %xmm0, %xmm0`
			`; AVX1-NEXT: retq`
			`;`
			`; AVX512-LABEL: addss:`
			`; AVX512: # BB#0:`
			`; AVX512-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero`
			`; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0`
			`; AVX512-NEXT: retq`
[X86] Teach load folding to accept scalar _Int users of MOVSS/MOVSD. The _Int instructions are special, in that they operate on the full VR128 instead of FR32. The load folding then looks at MOVSS, at the user, and bails out when it sees a size mismatch. What we really know is that the rm_Int instructions don't load the higher lanes, so folding is fine. This happens for the straightforward intrinsic code, e.g.: _mm_add_ss(a, _mm_load_ss(p)); Fixes PR23349. Differential Revision: http://reviews.llvm.org/D10554 llvm-svn: 240326 2015-06-23 04:51:51 +08:00			`%a = extractelement <4 x float> %va, i32 0`
			`%b = load float, float* %pb`
			`%r = fadd float %a, %b`
			`%vr = insertelement <4 x float> %va, float %r, i32 0`
			`ret <4 x float> %vr`
			`}`

			`define <2 x double> @addsd(<2 x double> %va, double* %pb) {`
			`; SSE-LABEL: addsd:`
			`; SSE: # BB#0:`
			`; SSE-NEXT: addsd (%rdi), %xmm0`
			`; SSE-NEXT: retq`
			`;`
[AVX512] Add AVX512 run lines to some tests for scalar fma/add/sub/mul/div and regenerate. Follow up commits will bring AVX512 code up to the same quality as AVX/SSE. llvm-svn: 277118 2016-07-29 14:05:58 +08:00			`; AVX1-LABEL: addsd:`
			`; AVX1: # BB#0:`
			`; AVX1-NEXT: vaddsd (%rdi), %xmm0, %xmm0`
			`; AVX1-NEXT: retq`
			`;`
			`; AVX512-LABEL: addsd:`
			`; AVX512: # BB#0:`
			`; AVX512-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero`
			`; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0`
			`; AVX512-NEXT: retq`
[X86] Teach load folding to accept scalar _Int users of MOVSS/MOVSD. The _Int instructions are special, in that they operate on the full VR128 instead of FR32. The load folding then looks at MOVSS, at the user, and bails out when it sees a size mismatch. What we really know is that the rm_Int instructions don't load the higher lanes, so folding is fine. This happens for the straightforward intrinsic code, e.g.: _mm_add_ss(a, _mm_load_ss(p)); Fixes PR23349. Differential Revision: http://reviews.llvm.org/D10554 llvm-svn: 240326 2015-06-23 04:51:51 +08:00			`%a = extractelement <2 x double> %va, i32 0`
			`%b = load double, double* %pb`
			`%r = fadd double %a, %b`
			`%vr = insertelement <2 x double> %va, double %r, i32 0`
			`ret <2 x double> %vr`
			`}`

			`define <4 x float> @subss(<4 x float> %va, float* %pb) {`
			`; SSE-LABEL: subss:`
			`; SSE: # BB#0:`
			`; SSE-NEXT: subss (%rdi), %xmm0`
			`; SSE-NEXT: retq`
			`;`
[AVX512] Add AVX512 run lines to some tests for scalar fma/add/sub/mul/div and regenerate. Follow up commits will bring AVX512 code up to the same quality as AVX/SSE. llvm-svn: 277118 2016-07-29 14:05:58 +08:00			`; AVX1-LABEL: subss:`
			`; AVX1: # BB#0:`
			`; AVX1-NEXT: vsubss (%rdi), %xmm0, %xmm0`
			`; AVX1-NEXT: retq`
			`;`
			`; AVX512-LABEL: subss:`
			`; AVX512: # BB#0:`
			`; AVX512-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero`
			`; AVX512-NEXT: vsubss %xmm1, %xmm0, %xmm0`
			`; AVX512-NEXT: retq`
[X86] Teach load folding to accept scalar _Int users of MOVSS/MOVSD. The _Int instructions are special, in that they operate on the full VR128 instead of FR32. The load folding then looks at MOVSS, at the user, and bails out when it sees a size mismatch. What we really know is that the rm_Int instructions don't load the higher lanes, so folding is fine. This happens for the straightforward intrinsic code, e.g.: _mm_add_ss(a, _mm_load_ss(p)); Fixes PR23349. Differential Revision: http://reviews.llvm.org/D10554 llvm-svn: 240326 2015-06-23 04:51:51 +08:00			`%a = extractelement <4 x float> %va, i32 0`
			`%b = load float, float* %pb`
			`%r = fsub float %a, %b`
			`%vr = insertelement <4 x float> %va, float %r, i32 0`
			`ret <4 x float> %vr`
			`}`

			`define <2 x double> @subsd(<2 x double> %va, double* %pb) {`
			`; SSE-LABEL: subsd:`
			`; SSE: # BB#0:`
			`; SSE-NEXT: subsd (%rdi), %xmm0`
			`; SSE-NEXT: retq`
			`;`
[AVX512] Add AVX512 run lines to some tests for scalar fma/add/sub/mul/div and regenerate. Follow up commits will bring AVX512 code up to the same quality as AVX/SSE. llvm-svn: 277118 2016-07-29 14:05:58 +08:00			`; AVX1-LABEL: subsd:`
			`; AVX1: # BB#0:`
			`; AVX1-NEXT: vsubsd (%rdi), %xmm0, %xmm0`
			`; AVX1-NEXT: retq`
			`;`
			`; AVX512-LABEL: subsd:`
			`; AVX512: # BB#0:`
			`; AVX512-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero`
			`; AVX512-NEXT: vsubsd %xmm1, %xmm0, %xmm0`
			`; AVX512-NEXT: retq`
[X86] Teach load folding to accept scalar _Int users of MOVSS/MOVSD. The _Int instructions are special, in that they operate on the full VR128 instead of FR32. The load folding then looks at MOVSS, at the user, and bails out when it sees a size mismatch. What we really know is that the rm_Int instructions don't load the higher lanes, so folding is fine. This happens for the straightforward intrinsic code, e.g.: _mm_add_ss(a, _mm_load_ss(p)); Fixes PR23349. Differential Revision: http://reviews.llvm.org/D10554 llvm-svn: 240326 2015-06-23 04:51:51 +08:00			`%a = extractelement <2 x double> %va, i32 0`
			`%b = load double, double* %pb`
			`%r = fsub double %a, %b`
			`%vr = insertelement <2 x double> %va, double %r, i32 0`
			`ret <2 x double> %vr`
			`}`

			`define <4 x float> @mulss(<4 x float> %va, float* %pb) {`
			`; SSE-LABEL: mulss:`
			`; SSE: # BB#0:`
			`; SSE-NEXT: mulss (%rdi), %xmm0`
			`; SSE-NEXT: retq`
			`;`
[AVX512] Add AVX512 run lines to some tests for scalar fma/add/sub/mul/div and regenerate. Follow up commits will bring AVX512 code up to the same quality as AVX/SSE. llvm-svn: 277118 2016-07-29 14:05:58 +08:00			`; AVX1-LABEL: mulss:`
			`; AVX1: # BB#0:`
			`; AVX1-NEXT: vmulss (%rdi), %xmm0, %xmm0`
			`; AVX1-NEXT: retq`
			`;`
			`; AVX512-LABEL: mulss:`
			`; AVX512: # BB#0:`
			`; AVX512-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero`
			`; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0`
			`; AVX512-NEXT: retq`
[X86] Teach load folding to accept scalar _Int users of MOVSS/MOVSD. The _Int instructions are special, in that they operate on the full VR128 instead of FR32. The load folding then looks at MOVSS, at the user, and bails out when it sees a size mismatch. What we really know is that the rm_Int instructions don't load the higher lanes, so folding is fine. This happens for the straightforward intrinsic code, e.g.: _mm_add_ss(a, _mm_load_ss(p)); Fixes PR23349. Differential Revision: http://reviews.llvm.org/D10554 llvm-svn: 240326 2015-06-23 04:51:51 +08:00			`%a = extractelement <4 x float> %va, i32 0`
			`%b = load float, float* %pb`
			`%r = fmul float %a, %b`
			`%vr = insertelement <4 x float> %va, float %r, i32 0`
			`ret <4 x float> %vr`
			`}`

			`define <2 x double> @mulsd(<2 x double> %va, double* %pb) {`
			`; SSE-LABEL: mulsd:`
			`; SSE: # BB#0:`
			`; SSE-NEXT: mulsd (%rdi), %xmm0`
			`; SSE-NEXT: retq`
			`;`
[AVX512] Add AVX512 run lines to some tests for scalar fma/add/sub/mul/div and regenerate. Follow up commits will bring AVX512 code up to the same quality as AVX/SSE. llvm-svn: 277118 2016-07-29 14:05:58 +08:00			`; AVX1-LABEL: mulsd:`
			`; AVX1: # BB#0:`
			`; AVX1-NEXT: vmulsd (%rdi), %xmm0, %xmm0`
			`; AVX1-NEXT: retq`
			`;`
			`; AVX512-LABEL: mulsd:`
			`; AVX512: # BB#0:`
			`; AVX512-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero`
			`; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0`
			`; AVX512-NEXT: retq`
[X86] Teach load folding to accept scalar _Int users of MOVSS/MOVSD. The _Int instructions are special, in that they operate on the full VR128 instead of FR32. The load folding then looks at MOVSS, at the user, and bails out when it sees a size mismatch. What we really know is that the rm_Int instructions don't load the higher lanes, so folding is fine. This happens for the straightforward intrinsic code, e.g.: _mm_add_ss(a, _mm_load_ss(p)); Fixes PR23349. Differential Revision: http://reviews.llvm.org/D10554 llvm-svn: 240326 2015-06-23 04:51:51 +08:00			`%a = extractelement <2 x double> %va, i32 0`
			`%b = load double, double* %pb`
			`%r = fmul double %a, %b`
			`%vr = insertelement <2 x double> %va, double %r, i32 0`
			`ret <2 x double> %vr`
			`}`

			`define <4 x float> @divss(<4 x float> %va, float* %pb) {`
			`; SSE-LABEL: divss:`
			`; SSE: # BB#0:`
			`; SSE-NEXT: divss (%rdi), %xmm0`
			`; SSE-NEXT: retq`
			`;`
[AVX512] Add AVX512 run lines to some tests for scalar fma/add/sub/mul/div and regenerate. Follow up commits will bring AVX512 code up to the same quality as AVX/SSE. llvm-svn: 277118 2016-07-29 14:05:58 +08:00			`; AVX1-LABEL: divss:`
			`; AVX1: # BB#0:`
			`; AVX1-NEXT: vdivss (%rdi), %xmm0, %xmm0`
			`; AVX1-NEXT: retq`
			`;`
			`; AVX512-LABEL: divss:`
			`; AVX512: # BB#0:`
			`; AVX512-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero`
			`; AVX512-NEXT: vdivss %xmm1, %xmm0, %xmm0`
			`; AVX512-NEXT: retq`
[X86] Teach load folding to accept scalar _Int users of MOVSS/MOVSD. The _Int instructions are special, in that they operate on the full VR128 instead of FR32. The load folding then looks at MOVSS, at the user, and bails out when it sees a size mismatch. What we really know is that the rm_Int instructions don't load the higher lanes, so folding is fine. This happens for the straightforward intrinsic code, e.g.: _mm_add_ss(a, _mm_load_ss(p)); Fixes PR23349. Differential Revision: http://reviews.llvm.org/D10554 llvm-svn: 240326 2015-06-23 04:51:51 +08:00			`%a = extractelement <4 x float> %va, i32 0`
			`%b = load float, float* %pb`
			`%r = fdiv float %a, %b`
			`%vr = insertelement <4 x float> %va, float %r, i32 0`
			`ret <4 x float> %vr`
			`}`

			`define <2 x double> @divsd(<2 x double> %va, double* %pb) {`
			`; SSE-LABEL: divsd:`
			`; SSE: # BB#0:`
			`; SSE-NEXT: divsd (%rdi), %xmm0`
			`; SSE-NEXT: retq`
			`;`
[AVX512] Add AVX512 run lines to some tests for scalar fma/add/sub/mul/div and regenerate. Follow up commits will bring AVX512 code up to the same quality as AVX/SSE. llvm-svn: 277118 2016-07-29 14:05:58 +08:00			`; AVX1-LABEL: divsd:`
			`; AVX1: # BB#0:`
			`; AVX1-NEXT: vdivsd (%rdi), %xmm0, %xmm0`
			`; AVX1-NEXT: retq`
			`;`
			`; AVX512-LABEL: divsd:`
			`; AVX512: # BB#0:`
			`; AVX512-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero`
			`; AVX512-NEXT: vdivsd %xmm1, %xmm0, %xmm0`
			`; AVX512-NEXT: retq`
[X86] Teach load folding to accept scalar _Int users of MOVSS/MOVSD. The _Int instructions are special, in that they operate on the full VR128 instead of FR32. The load folding then looks at MOVSS, at the user, and bails out when it sees a size mismatch. What we really know is that the rm_Int instructions don't load the higher lanes, so folding is fine. This happens for the straightforward intrinsic code, e.g.: _mm_add_ss(a, _mm_load_ss(p)); Fixes PR23349. Differential Revision: http://reviews.llvm.org/D10554 llvm-svn: 240326 2015-06-23 04:51:51 +08:00			`%a = extractelement <2 x double> %va, i32 0`
			`%b = load double, double* %pb`
			`%r = fdiv double %a, %b`
			`%vr = insertelement <2 x double> %va, double %r, i32 0`
			`ret <2 x double> %vr`
			`}`