llvm-project/llvm/test/CodeGen/X86/avx512-memfold.ll

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -disable-peephole -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s

; Masked scalar float compare whose second operand is a scalar loaded from %b,
; placed in element 0 of a vector whose elements 1-3 are explicitly zero.
; The expected assembly uses the memory-operand form of vcmpunordss, i.e. the
; load is folded into the compare with the peephole pass disabled (see the RUN
; line at the top of the file).
define i8 @test_int_x86_avx512_mask_cmp_ss(<4 x float> %a, float* %b, i8 %mask) {
; CHECK-LABEL: test_int_x86_avx512_mask_cmp_ss:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovw %esi, %k1
; CHECK-NEXT:    vcmpunordss (%rdi), %xmm0, %k0 {%k1}
; CHECK-NEXT:    kmovw %k0, %eax
; CHECK-NEXT:    ## kill: def %al killed %al killed %eax
; CHECK-NEXT:    retq
  ; Build <b.val, 0.0, 0.0, 0.0>: the loaded scalar in element 0, zeros above it.
  %b.val = load float, float* %b
  %bv0 = insertelement <4 x float> undef, float %b.val, i32 0
  %bv1 = insertelement <4 x float> %bv0, float 0.000000e+00, i32 1
  %bv2 = insertelement <4 x float> %bv1, float 0.000000e+00, i32 2
  %bv  = insertelement <4 x float> %bv2, float 0.000000e+00, i32 3
  ; The i32 3 operand corresponds to the "unord" predicate seen in the expected
  ; vcmpunordss above. NOTE(review): the trailing i32 4 is presumably the
  ; rounding/SAE operand meaning "current direction" -- confirm against the
  ; intrinsic definition.
  %res2 = call i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float> %a, <4 x float> %bv, i32 3, i8 %mask, i32 4)
  ret i8 %res2
}
; Intrinsic exercised by the test above.
declare i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float>, <4 x float>, i32, i8, i32)

; Masked scalar float max whose second operand is a scalar loaded from %b,
; placed in element 0 of a vector whose elements 1-3 are explicitly zero.
; Note that, despite the "mask" in the function name, the passthru operand is
; zeroinitializer, so the expected instruction is the zero-masking form
; ({%k1} {z}) with the load folded in as a memory operand.
define <4 x float> @test_mask_max_ss(<4 x float> %a, float* %b, i8 %mask) {
; CHECK-LABEL: test_mask_max_ss:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovw %esi, %k1
; CHECK-NEXT:    vmaxss (%rdi), %xmm0, %xmm0 {%k1} {z}
; CHECK-NEXT:    retq
  ; Build <b.val, 0.0, 0.0, 0.0>: the loaded scalar in element 0, zeros above it.
  %b.val = load float, float* %b
  %bv0 = insertelement <4 x float> undef, float %b.val, i32 0
  %bv1 = insertelement <4 x float> %bv0, float 0.000000e+00, i32 1
  %bv2 = insertelement <4 x float> %bv1, float 0.000000e+00, i32 2
  %bv  = insertelement <4 x float> %bv2, float 0.000000e+00, i32 3
  ; Operands: (%a, %bv, zero passthru, %mask, i32 4).
  ; NOTE(review): i32 4 is presumably the "current direction" rounding
  ; operand -- confirm against the intrinsic definition.
  %res = call <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float>%a, <4 x float> %bv, <4 x float> zeroinitializer, i8 %mask, i32 4)
  ret <4 x float> %res
}
; Intrinsic exercised by the test above.
declare <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float>, <4 x float>, <4 x float>, i8, i32) nounwind readnone

; Zero-masked scalar float add whose second operand is a scalar loaded from %b,
; placed in element 0 of a vector whose elements 1-3 are explicitly zero.
; The expected assembly is a single vaddss with the load folded in as a memory
; operand and zero-masking ({%k1} {z}).
define <4 x float> @test_maskz_add_ss(<4 x float> %a, float* %b, i8 %mask) {
; CHECK-LABEL: test_maskz_add_ss:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovw %esi, %k1
; CHECK-NEXT:    vaddss (%rdi), %xmm0, %xmm0 {%k1} {z}
; CHECK-NEXT:    retq
  ; Build <b.val, 0.0, 0.0, 0.0>: the loaded scalar in element 0, zeros above it.
  %b.val = load float, float* %b
  %bv0 = insertelement <4 x float> undef, float %b.val, i32 0
  %bv1 = insertelement <4 x float> %bv0, float 0.000000e+00, i32 1
  %bv2 = insertelement <4 x float> %bv1, float 0.000000e+00, i32 2
  %bv  = insertelement <4 x float> %bv2, float 0.000000e+00, i32 3
  ; Operands: (%a, %bv, zero passthru, %mask, i32 4).
  ; NOTE(review): i32 4 is presumably the "current direction" rounding
  ; operand -- confirm against the intrinsic definition.
  %res = call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float>%a, <4 x float> %bv, <4 x float> zeroinitializer, i8 %mask, i32 4)
  ret <4 x float> %res
}

; Intrinsic exercised by test_maskz_add_ss and test_mask_add_ss_double_use.
declare <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float>, <4 x float>, <4 x float>, i8, i32) nounwind readnone

; Intrinsic exercised by the test below.
declare <2 x double> @llvm.x86.avx512.mask.vfmadd.sd(<2 x double>, <2 x double>, <2 x double>, i8, i32)

; Masked scalar double fused multiply-add whose third operand is a scalar
; loaded from %c, placed in element 0 of a vector whose element 1 is explicitly
; zero. The expected assembly is a single vfmadd213sd with the load folded in
; as a memory operand and merge-masking ({%k1} without {z}).
define <2 x double> @test_int_x86_avx512_mask_vfmadd_sd(<2 x double> %a, <2 x double> %b, double* %c, i8 %mask){
; CHECK-LABEL: test_int_x86_avx512_mask_vfmadd_sd:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovw %esi, %k1
; CHECK-NEXT:    vfmadd213sd (%rdi), %xmm1, %xmm0 {%k1}
; CHECK-NEXT:    retq
  ; Build <c.val, 0.0>: the loaded scalar in element 0, zero in element 1.
  %c.val = load double, double* %c
  %cv0 = insertelement <2 x double> undef, double %c.val, i32 0
  %cv = insertelement <2 x double> %cv0, double 0.000000e+00, i32 1
  ; Operands: (%a, %b, %cv, %mask, i32 4).
  ; NOTE(review): i32 4 is presumably the "current direction" rounding
  ; operand -- confirm against the intrinsic definition.
  %res = call <2 x double> @llvm.x86.avx512.mask.vfmadd.sd(<2 x double> %a, <2 x double> %b, <2 x double> %cv, i8 %mask, i32 4)
  ret <2 x double> %res
}

; Test what happens to the load when we have multiple uses of the fadds DAG node via separate vselect nodes.
; The load must not be folded twice: the expected output below loads once with vmovss and both vaddss instructions use that register.
define <4 x float> @test_mask_add_ss_double_use(<4 x float> %a, float* %b, i8 %mask, <4 x float> %c) {
; CHECK-LABEL: test_mask_add_ss_double_use:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; CHECK-NEXT:    kmovw %esi, %k1
; CHECK-NEXT:    vaddss %xmm2, %xmm0, %xmm1 {%k1}
; CHECK-NEXT:    vaddss %xmm2, %xmm0, %xmm0 {%k1} {z}
; CHECK-NEXT:    vmulps %xmm0, %xmm1, %xmm0
; CHECK-NEXT:    retq
  ; Build <b.val, 0.0, 0.0, 0.0>: the loaded scalar in element 0, zeros above it.
  %b.val = load float, float* %b
  %bv0 = insertelement <4 x float> undef, float %b.val, i32 0
  %bv1 = insertelement <4 x float> %bv0, float 0.000000e+00, i32 1
  %bv2 = insertelement <4 x float> %bv1, float 0.000000e+00, i32 2
  %bv  = insertelement <4 x float> %bv2, float 0.000000e+00, i32 3
  ; Two masked adds over the same inputs (%a, %bv) and the same mask; they
  ; differ only in the passthru operand (%c for the first, zeroinitializer for
  ; the second). In the expected assembly both take the loaded value from a
  ; register (xmm2) rather than each using a memory operand.
  %res = call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float> %a, <4 x float> %bv, <4 x float> %c, i8 %mask, i32 4)
  %res2 = call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float> %a, <4 x float> %bv, <4 x float> zeroinitializer, i8 %mask, i32 4)
  ; Multiply the two results so that both adds are live.
  %res3 = fmul <4 x float> %res, %res2
  ret <4 x float> %res3
}
[AVX-512] Add test cases showing failure to fold zero extending scalar loads in scalar intrinsics without the peephole pass. llvm-svn: 295692 2017-02-21 12:26:07 +08:00			`; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py`
			`; RUN: llc < %s -disable-peephole -mtriple=x86_64-apple-darwin -mcpu=knl \| FileCheck %s`

			`define i8 @test_int_x86_avx512_mask_cmp_ss(<4 x float> %a, float* %b, i8 %mask) {`
			`; CHECK-LABEL: test_int_x86_avx512_mask_cmp_ss:`
[CodeGen] Unify MBB reference format in both MIR and debug output As part of the unification of the debug format and the MIR format, print MBB references as '%bb.5'. The MIR printer prints the IR name of a MBB only for block definitions. * find . \( -name ".mir" -o -name ".cpp" -o -name ".h" -o -name ".ll" \) -type f -print0 \| xargs -0 sed -i '' -E 's/BB#" << ([a-zA-Z0-9_]+)->getNumber\(\)/" << printMBBReference(\1)/g' find . \( -name ".mir" -o -name ".cpp" -o -name ".h" -o -name ".ll" \) -type f -print0 \| xargs -0 sed -i '' -E 's/BB#" << ([a-zA-Z0-9_]+)\.getNumber\(\)/" << printMBBReference(\1)/g' * find . \( -name ".txt" -o -name ".s" -o -name ".mir" -o -name ".cpp" -o -name ".h" -o -name ".ll" \) -type f -print0 \| xargs -0 sed -i '' -E 's/BB#([0-9]+)/%bb.\1/g' * grep -nr 'BB#' and fix Differential Revision: https://reviews.llvm.org/D40422 llvm-svn: 319665 2017-12-05 01:18:51 +08:00			`; CHECK: ## %bb.0:`
[AVX-512] Add test cases showing failure to fold zero extending scalar loads in scalar intrinsics without the peephole pass. llvm-svn: 295692 2017-02-21 12:26:07 +08:00			`; CHECK-NEXT: kmovw %esi, %k1`
[AVX-512] Use sse_load_f32/f64 in place of scalar_to_vector and scalar load in some patterns. llvm-svn: 295693 2017-02-21 12:26:10 +08:00			`; CHECK-NEXT: vcmpunordss (%rdi), %xmm0, %k0 {%k1}`
[AVX-512] Add test cases showing failure to fold zero extending scalar loads in scalar intrinsics without the peephole pass. llvm-svn: 295692 2017-02-21 12:26:07 +08:00			`; CHECK-NEXT: kmovw %k0, %eax`
[CodeGen] Use MachineOperand::print in the MIRPrinter for MO_Register. Work towards the unification of MIR and debug output by refactoring the interfaces. For MachineOperand::print, keep a simple version that can be easily called from `dump()`, and a more complex one which will be called from both the MIRPrinter and MachineInstr::print. Add extra checks inside MachineOperand for detached operands (operands with getParent() == nullptr). https://reviews.llvm.org/D40836 * find . \( -name ".mir" -o -name ".cpp" -o -name ".h" -o -name ".ll" -o -name ".s" \) -type f -print0 \| xargs -0 sed -i '' -E 's/kill: ([^ ]+) ([^ ]+)<def> ([^ ]+)/kill: \1 def \2 \3/g' find . \( -name ".mir" -o -name ".cpp" -o -name ".h" -o -name ".ll" -o -name ".s" \) -type f -print0 \| xargs -0 sed -i '' -E 's/kill: ([^ ]+) ([^ ]+) ([^ ]+)<def>/kill: \1 \2 def \3/g' find . \( -name ".mir" -o -name ".cpp" -o -name ".h" -o -name ".ll" -o -name ".s" \) -type f -print0 \| xargs -0 sed -i '' -E 's/kill: def ([^ ]+) ([^ ]+) ([^ ]+)<def>/kill: def \1 \2 def \3/g' find . \( -name ".mir" -o -name ".cpp" -o -name ".h" -o -name ".ll" -o -name ".s" \) -type f -print0 \| xargs -0 sed -i '' -E 's/<def>//g' find . \( -name ".mir" -o -name ".cpp" -o -name ".h" -o -name ".ll" -o -name ".s" \) -type f -print0 \| xargs -0 sed -i '' -E 's/([^ ]+)<kill>/killed \1/g' find . \( -name ".mir" -o -name ".cpp" -o -name ".h" -o -name ".ll" -o -name ".s" \) -type f -print0 \| xargs -0 sed -i '' -E 's/([^ ]+)<imp-use,kill>/implicit killed \1/g' find . \( -name ".mir" -o -name ".cpp" -o -name ".h" -o -name ".ll" -o -name ".s" \) -type f -print0 \| xargs -0 sed -i '' -E 's/([^ ]+)<dead>/dead \1/g' find . \( -name ".mir" -o -name ".cpp" -o -name ".h" -o -name ".ll" -o -name ".s" \) -type f -print0 \| xargs -0 sed -i '' -E 's/([^ ]+)<def[ ],[ ]dead>/dead \1/g' find . 
\( -name ".mir" -o -name ".cpp" -o -name ".h" -o -name ".ll" -o -name ".s" \) -type f -print0 \| xargs -0 sed -i '' -E 's/([^ ]+)<imp-def[ ],[ ]dead>/implicit-def dead \1/g' find . \( -name ".mir" -o -name ".cpp" -o -name ".h" -o -name ".ll" -o -name ".s" \) -type f -print0 \| xargs -0 sed -i '' -E 's/([^ ]+)<imp-def>/implicit-def \1/g' find . \( -name ".mir" -o -name ".cpp" -o -name ".h" -o -name ".ll" -o -name ".s" \) -type f -print0 \| xargs -0 sed -i '' -E 's/([^ ]+)<imp-use>/implicit \1/g' find . \( -name ".mir" -o -name ".cpp" -o -name ".h" -o -name ".ll" -o -name ".s" \) -type f -print0 \| xargs -0 sed -i '' -E 's/([^ ]+)<internal>/internal \1/g' find . \( -name ".mir" -o -name ".cpp" -o -name ".h" -o -name ".ll" -o -name "*.s" \) -type f -print0 \| xargs -0 sed -i '' -E 's/([^ ]+)<undef>/undef \1/g' llvm-svn: 320022 2017-12-07 18:40:31 +08:00			`; CHECK-NEXT: ## kill: def %al killed %al killed %eax`
[AVX-512] Add test cases showing failure to fold zero extending scalar loads in scalar intrinsics without the peephole pass. llvm-svn: 295692 2017-02-21 12:26:07 +08:00			`; CHECK-NEXT: retq`
			`%b.val = load float, float* %b`
			`%bv0 = insertelement <4 x float> undef, float %b.val, i32 0`
			`%bv1 = insertelement <4 x float> %bv0, float 0.000000e+00, i32 1`
			`%bv2 = insertelement <4 x float> %bv1, float 0.000000e+00, i32 2`
			`%bv = insertelement <4 x float> %bv2, float 0.000000e+00, i32 3`
			`%res2 = call i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float> %a, <4 x float> %bv, i32 3, i8 %mask, i32 4)`
			`ret i8 %res2`
			`}`
			`declare i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float>, <4 x float>, i32, i8, i32)`

			`define <4 x float> @test_mask_max_ss(<4 x float> %a, float* %b, i8 %mask) {`
			`; CHECK-LABEL: test_mask_max_ss:`
[CodeGen] Unify MBB reference format in both MIR and debug output As part of the unification of the debug format and the MIR format, print MBB references as '%bb.5'. The MIR printer prints the IR name of a MBB only for block definitions. * find . \( -name ".mir" -o -name ".cpp" -o -name ".h" -o -name ".ll" \) -type f -print0 \| xargs -0 sed -i '' -E 's/BB#" << ([a-zA-Z0-9_]+)->getNumber\(\)/" << printMBBReference(\1)/g' find . \( -name ".mir" -o -name ".cpp" -o -name ".h" -o -name ".ll" \) -type f -print0 \| xargs -0 sed -i '' -E 's/BB#" << ([a-zA-Z0-9_]+)\.getNumber\(\)/" << printMBBReference(\1)/g' * find . \( -name ".txt" -o -name ".s" -o -name ".mir" -o -name ".cpp" -o -name ".h" -o -name ".ll" \) -type f -print0 \| xargs -0 sed -i '' -E 's/BB#([0-9]+)/%bb.\1/g' * grep -nr 'BB#' and fix Differential Revision: https://reviews.llvm.org/D40422 llvm-svn: 319665 2017-12-05 01:18:51 +08:00			`; CHECK: ## %bb.0:`
[AVX-512] Add test cases showing failure to fold zero extending scalar loads in scalar intrinsics without the peephole pass. llvm-svn: 295692 2017-02-21 12:26:07 +08:00			`; CHECK-NEXT: kmovw %esi, %k1`
[AVX-512] Use sse_load_f32/f64 in place of scalar_to_vector and scalar load in some patterns. llvm-svn: 295693 2017-02-21 12:26:10 +08:00			`; CHECK-NEXT: vmaxss (%rdi), %xmm0, %xmm0 {%k1} {z}`
[AVX-512] Add test cases showing failure to fold zero extending scalar loads in scalar intrinsics without the peephole pass. llvm-svn: 295692 2017-02-21 12:26:07 +08:00			`; CHECK-NEXT: retq`
			`%b.val = load float, float* %b`
			`%bv0 = insertelement <4 x float> undef, float %b.val, i32 0`
			`%bv1 = insertelement <4 x float> %bv0, float 0.000000e+00, i32 1`
			`%bv2 = insertelement <4 x float> %bv1, float 0.000000e+00, i32 2`
			`%bv = insertelement <4 x float> %bv2, float 0.000000e+00, i32 3`
			`%res = call <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float>%a, <4 x float> %bv, <4 x float> zeroinitializer, i8 %mask, i32 4)`
			`ret <4 x float> %res`
			`}`
			`declare <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float>, <4 x float>, <4 x float>, i8, i32) nounwind readnone`

			`define <4 x float> @test_maskz_add_ss(<4 x float> %a, float* %b, i8 %mask) {`
			`; CHECK-LABEL: test_maskz_add_ss:`
[CodeGen] Unify MBB reference format in both MIR and debug output As part of the unification of the debug format and the MIR format, print MBB references as '%bb.5'. The MIR printer prints the IR name of a MBB only for block definitions. * find . \( -name ".mir" -o -name ".cpp" -o -name ".h" -o -name ".ll" \) -type f -print0 \| xargs -0 sed -i '' -E 's/BB#" << ([a-zA-Z0-9_]+)->getNumber\(\)/" << printMBBReference(\1)/g' find . \( -name ".mir" -o -name ".cpp" -o -name ".h" -o -name ".ll" \) -type f -print0 \| xargs -0 sed -i '' -E 's/BB#" << ([a-zA-Z0-9_]+)\.getNumber\(\)/" << printMBBReference(\1)/g' * find . \( -name ".txt" -o -name ".s" -o -name ".mir" -o -name ".cpp" -o -name ".h" -o -name ".ll" \) -type f -print0 \| xargs -0 sed -i '' -E 's/BB#([0-9]+)/%bb.\1/g' * grep -nr 'BB#' and fix Differential Revision: https://reviews.llvm.org/D40422 llvm-svn: 319665 2017-12-05 01:18:51 +08:00			`; CHECK: ## %bb.0:`
[AVX-512] Add test cases showing failure to fold zero extending scalar loads in scalar intrinsics without the peephole pass. llvm-svn: 295692 2017-02-21 12:26:07 +08:00			`; CHECK-NEXT: kmovw %esi, %k1`
[AVX-512] Use sse_load_f32/f64 in place of scalar_to_vector and scalar load in some patterns. llvm-svn: 295693 2017-02-21 12:26:10 +08:00			`; CHECK-NEXT: vaddss (%rdi), %xmm0, %xmm0 {%k1} {z}`
[AVX-512] Add test cases showing failure to fold zero extending scalar loads in scalar intrinsics without the peephole pass. llvm-svn: 295692 2017-02-21 12:26:07 +08:00			`; CHECK-NEXT: retq`
			`%b.val = load float, float* %b`
			`%bv0 = insertelement <4 x float> undef, float %b.val, i32 0`
			`%bv1 = insertelement <4 x float> %bv0, float 0.000000e+00, i32 1`
			`%bv2 = insertelement <4 x float> %bv1, float 0.000000e+00, i32 2`
			`%bv = insertelement <4 x float> %bv2, float 0.000000e+00, i32 3`
			`%res = call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float>%a, <4 x float> %bv, <4 x float> zeroinitializer, i8 %mask, i32 4)`
			`ret <4 x float> %res`
			`}`

			`declare <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float>, <4 x float>, <4 x float>, i8, i32) nounwind readnone`

			`declare <2 x double> @llvm.x86.avx512.mask.vfmadd.sd(<2 x double>, <2 x double>, <2 x double>, i8, i32)`

			`define <2 x double> @test_int_x86_avx512_mask_vfmadd_sd(<2 x double> %a, <2 x double> %b, double* %c, i8 %mask){`
			`; CHECK-LABEL: test_int_x86_avx512_mask_vfmadd_sd:`
[CodeGen] Unify MBB reference format in both MIR and debug output As part of the unification of the debug format and the MIR format, print MBB references as '%bb.5'. The MIR printer prints the IR name of a MBB only for block definitions. * find . \( -name ".mir" -o -name ".cpp" -o -name ".h" -o -name ".ll" \) -type f -print0 \| xargs -0 sed -i '' -E 's/BB#" << ([a-zA-Z0-9_]+)->getNumber\(\)/" << printMBBReference(\1)/g' find . \( -name ".mir" -o -name ".cpp" -o -name ".h" -o -name ".ll" \) -type f -print0 \| xargs -0 sed -i '' -E 's/BB#" << ([a-zA-Z0-9_]+)\.getNumber\(\)/" << printMBBReference(\1)/g' * find . \( -name ".txt" -o -name ".s" -o -name ".mir" -o -name ".cpp" -o -name ".h" -o -name ".ll" \) -type f -print0 \| xargs -0 sed -i '' -E 's/BB#([0-9]+)/%bb.\1/g' * grep -nr 'BB#' and fix Differential Revision: https://reviews.llvm.org/D40422 llvm-svn: 319665 2017-12-05 01:18:51 +08:00			`; CHECK: ## %bb.0:`
[AVX-512] Add test cases showing failure to fold zero extending scalar loads in scalar intrinsics without the peephole pass. llvm-svn: 295692 2017-02-21 12:26:07 +08:00			`; CHECK-NEXT: kmovw %esi, %k1`
[AVX-512] Use sse_load_f32/f64 in place of scalar_to_vector and scalar load in some patterns. llvm-svn: 295693 2017-02-21 12:26:10 +08:00			`; CHECK-NEXT: vfmadd213sd (%rdi), %xmm1, %xmm0 {%k1}`
[AVX-512] Add test cases showing failure to fold zero extending scalar loads in scalar intrinsics without the peephole pass. llvm-svn: 295692 2017-02-21 12:26:07 +08:00			`; CHECK-NEXT: retq`
			`%c.val = load double, double* %c`
			`%cv0 = insertelement <2 x double> undef, double %c.val, i32 0`
			`%cv = insertelement <2 x double> %cv0, double 0.000000e+00, i32 1`
			`%res = call <2 x double> @llvm.x86.avx512.mask.vfmadd.sd(<2 x double> %a, <2 x double> %b, <2 x double> %cv, i8 %mask, i32 4)`
			`ret <2 x double> %res`
			`}`
[AVX512] Add a test to check what happens when a load is referenced by two different masked scalar intrinsics with the same op inputs, but different masking node. We're missing some single use checks in the sse_load_f32/f64 handling that cause us to replicate the load. llvm-svn: 311300 2017-08-21 03:47:00 +08:00
			`; Test what happens when the load when we have multiple uses of the fadds DAG node via separate vselect nodes.`
			`; TODO: We shouldn't fold the load twice here.`
			`define <4 x float> @test_mask_add_ss_double_use(<4 x float> %a, float* %b, i8 %mask, <4 x float> %c) {`
			`; CHECK-LABEL: test_mask_add_ss_double_use:`
[CodeGen] Unify MBB reference format in both MIR and debug output As part of the unification of the debug format and the MIR format, print MBB references as '%bb.5'. The MIR printer prints the IR name of a MBB only for block definitions. * find . \( -name ".mir" -o -name ".cpp" -o -name ".h" -o -name ".ll" \) -type f -print0 \| xargs -0 sed -i '' -E 's/BB#" << ([a-zA-Z0-9_]+)->getNumber\(\)/" << printMBBReference(\1)/g' find . \( -name ".mir" -o -name ".cpp" -o -name ".h" -o -name ".ll" \) -type f -print0 \| xargs -0 sed -i '' -E 's/BB#" << ([a-zA-Z0-9_]+)\.getNumber\(\)/" << printMBBReference(\1)/g' * find . \( -name ".txt" -o -name ".s" -o -name ".mir" -o -name ".cpp" -o -name ".h" -o -name ".ll" \) -type f -print0 \| xargs -0 sed -i '' -E 's/BB#([0-9]+)/%bb.\1/g' * grep -nr 'BB#' and fix Differential Revision: https://reviews.llvm.org/D40422 llvm-svn: 319665 2017-12-05 01:18:51 +08:00			`; CHECK: ## %bb.0:`
[X86] When selecting sse_load_f32/f64 pattern, make sure there's only one use of every node all the way back to the root of the match Summary: With masked operations, its possible for the operation node like fadd, fsub, etc. to be used by multiple different vselects. Since the pattern matching will start at the vselect, we need to make sure the operation node itself is only used once before we can fold a load. Otherwise we'll end up folding the same load into multiple instructions. Reviewers: RKSimon, spatel, zvi, igorb Subscribers: llvm-commits Differential Revision: https://reviews.llvm.org/D36938 llvm-svn: 311342 2017-08-22 00:04:04 +08:00			`; CHECK-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero`
[AVX512] Add a test to check what happens when a load is referenced by two different masked scalar intrinsics with the same op inputs, but different masking node. We're missing some single use checks in the sse_load_f32/f64 handling that cause us to replicate the load. llvm-svn: 311300 2017-08-21 03:47:00 +08:00			`; CHECK-NEXT: kmovw %esi, %k1`
[X86] When selecting sse_load_f32/f64 pattern, make sure there's only one use of every node all the way back to the root of the match Summary: With masked operations, its possible for the operation node like fadd, fsub, etc. to be used by multiple different vselects. Since the pattern matching will start at the vselect, we need to make sure the operation node itself is only used once before we can fold a load. Otherwise we'll end up folding the same load into multiple instructions. Reviewers: RKSimon, spatel, zvi, igorb Subscribers: llvm-commits Differential Revision: https://reviews.llvm.org/D36938 llvm-svn: 311342 2017-08-22 00:04:04 +08:00			`; CHECK-NEXT: vaddss %xmm2, %xmm0, %xmm1 {%k1}`
			`; CHECK-NEXT: vaddss %xmm2, %xmm0, %xmm0 {%k1} {z}`
[AVX512] Add a test to check what happens when a load is referenced by two different masked scalar intrinsics with the same op inputs, but different masking node. We're missing some single use checks in the sse_load_f32/f64 handling that cause us to replicate the load. llvm-svn: 311300 2017-08-21 03:47:00 +08:00			`; CHECK-NEXT: vmulps %xmm0, %xmm1, %xmm0`
			`; CHECK-NEXT: retq`
			`%b.val = load float, float* %b`
			`%bv0 = insertelement <4 x float> undef, float %b.val, i32 0`
			`%bv1 = insertelement <4 x float> %bv0, float 0.000000e+00, i32 1`
			`%bv2 = insertelement <4 x float> %bv1, float 0.000000e+00, i32 2`
			`%bv = insertelement <4 x float> %bv2, float 0.000000e+00, i32 3`
			`%res = call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float> %a, <4 x float> %bv, <4 x float> %c, i8 %mask, i32 4)`
			`%res2 = call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float> %a, <4 x float> %bv, <4 x float> zeroinitializer, i8 %mask, i32 4)`
			`%res3 = fmul <4 x float> %res, %res2`
			`ret <4 x float> %res3`
			`}`