llvm-project/llvm/test/CodeGen/X86/avx512-memfold.ll

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -disable-peephole -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s

; Masked scalar compare with a memory operand.
; The float at %b is loaded and placed in element 0 of a <4 x float> whose upper
; three elements are zero, then passed as the second source of
; llvm.x86.avx512.mask.cmp.ss with predicate 3 (printed as vcmpunordss in the
; expected output). The expected assembly has the load folded into the compare's
; memory operand, with llc invoked with -disable-peephole.
define i8 @test_int_x86_avx512_mask_cmp_ss(<4 x float> %a, float* %b, i8 %mask) {
; CHECK-LABEL: test_int_x86_avx512_mask_cmp_ss:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %esi, %k1
; CHECK-NEXT:    vcmpunordss (%rdi), %xmm0, %k0 {%k1}
; CHECK-NEXT:    kmovw %k0, %eax
; CHECK-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
; CHECK-NEXT:    retq
  ; Scalar load followed by a build of <b.val, 0.0, 0.0, 0.0>.
  %b.val = load float, float* %b
  %bv0 = insertelement <4 x float> undef, float %b.val, i32 0
  %bv1 = insertelement <4 x float> %bv0, float 0.000000e+00, i32 1
  %bv2 = insertelement <4 x float> %bv1, float 0.000000e+00, i32 2
  %bv  = insertelement <4 x float> %bv2, float 0.000000e+00, i32 3
  %res2 = call i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float> %a, <4 x float> %bv, i32 3, i8 %mask, i32 4)
  ret i8 %res2
}
declare i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float>, <4 x float>, i32, i8, i32)

; Masked scalar max with a memory operand.
; The float at %b is loaded into element 0 of a <4 x float> whose upper three
; elements are zero and used as the second source of
; llvm.x86.avx512.mask.max.ss.round. The pass-through operand is
; zeroinitializer, and the expected vmaxss uses zeroing masking ({%k1} {z}) with
; the load folded into its memory operand.
define <4 x float> @test_mask_max_ss(<4 x float> %a, float* %b, i8 %mask) {
; CHECK-LABEL: test_mask_max_ss:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %esi, %k1
; CHECK-NEXT:    vmaxss (%rdi), %xmm0, %xmm0 {%k1} {z}
; CHECK-NEXT:    retq
  ; Scalar load followed by a build of <b.val, 0.0, 0.0, 0.0>.
  %b.val = load float, float* %b
  %bv0 = insertelement <4 x float> undef, float %b.val, i32 0
  %bv1 = insertelement <4 x float> %bv0, float 0.000000e+00, i32 1
  %bv2 = insertelement <4 x float> %bv1, float 0.000000e+00, i32 2
  %bv  = insertelement <4 x float> %bv2, float 0.000000e+00, i32 3
  %res = call <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float>%a, <4 x float> %bv, <4 x float> zeroinitializer, i8 %mask, i32 4)
  ret <4 x float> %res
}
declare <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float>, <4 x float>, <4 x float>, i8, i32) nounwind readnone

; Zero-masked scalar add with a memory operand.
; The float at %b is loaded into element 0 of a <4 x float> whose upper three
; elements are zero and used as the second source of
; llvm.x86.avx512.mask.add.ss.round with a zeroinitializer pass-through. The
; expected vaddss uses zeroing masking ({%k1} {z}) with the load folded into its
; memory operand.
define <4 x float> @test_maskz_add_ss(<4 x float> %a, float* %b, i8 %mask) {
; CHECK-LABEL: test_maskz_add_ss:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %esi, %k1
; CHECK-NEXT:    vaddss (%rdi), %xmm0, %xmm0 {%k1} {z}
; CHECK-NEXT:    retq
  ; Scalar load followed by a build of <b.val, 0.0, 0.0, 0.0>.
  %b.val = load float, float* %b
  %bv0 = insertelement <4 x float> undef, float %b.val, i32 0
  %bv1 = insertelement <4 x float> %bv0, float 0.000000e+00, i32 1
  %bv2 = insertelement <4 x float> %bv1, float 0.000000e+00, i32 2
  %bv  = insertelement <4 x float> %bv2, float 0.000000e+00, i32 3
  %res = call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float>%a, <4 x float> %bv, <4 x float> zeroinitializer, i8 %mask, i32 4)
  ret <4 x float> %res
}

declare <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float>, <4 x float>, <4 x float>, i8, i32) nounwind readnone

declare <2 x double> @llvm.x86.avx512.mask.vfmadd.sd(<2 x double>, <2 x double>, <2 x double>, i8, i32)

; Masked scalar fused multiply-add with a memory operand.
; The double at %c is loaded into element 0 of a <2 x double> whose element 1 is
; zero and passed as the third vector operand of
; llvm.x86.avx512.mask.vfmadd.sd. The expected vfmadd213sd is merge-masked
; ({%k1}) and has the load folded into its memory operand.
define <2 x double> @test_int_x86_avx512_mask_vfmadd_sd(<2 x double> %a, <2 x double> %b, double* %c, i8 %mask){
; CHECK-LABEL: test_int_x86_avx512_mask_vfmadd_sd:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %esi, %k1
; CHECK-NEXT:    vfmadd213sd (%rdi), %xmm1, %xmm0 {%k1}
; CHECK-NEXT:    retq
  ; Scalar load followed by a build of <c.val, 0.0>.
  %c.val = load double, double* %c
  %cv0 = insertelement <2 x double> undef, double %c.val, i32 0
  %cv = insertelement <2 x double> %cv0, double 0.000000e+00, i32 1
  %res = call <2 x double> @llvm.x86.avx512.mask.vfmadd.sd(<2 x double> %a, <2 x double> %b, <2 x double> %cv, i8 %mask, i32 4)
  ret <2 x double> %res
}

; Test what happens to the load when the fadds DAG node has multiple uses via
; separate vselect nodes.
; The same loaded vector feeds two llvm.x86.avx512.mask.add.ss.round calls that
; differ only in their pass-through operand (%c versus zeroinitializer). The
; load must not be folded into both instructions: the expected output performs
; a single vmovss load into %xmm2 and both vaddss instructions take %xmm2 as a
; register operand.
define <4 x float> @test_mask_add_ss_double_use(<4 x float> %a, float* %b, i8 %mask, <4 x float> %c) {
; CHECK-LABEL: test_mask_add_ss_double_use:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; CHECK-NEXT:    kmovw %esi, %k1
; CHECK-NEXT:    vaddss %xmm2, %xmm0, %xmm1 {%k1}
; CHECK-NEXT:    vaddss %xmm2, %xmm0, %xmm0 {%k1} {z}
; CHECK-NEXT:    vmulps %xmm0, %xmm1, %xmm0
; CHECK-NEXT:    retq
  ; Scalar load followed by a build of <b.val, 0.0, 0.0, 0.0>.
  %b.val = load float, float* %b
  %bv0 = insertelement <4 x float> undef, float %b.val, i32 0
  %bv1 = insertelement <4 x float> %bv0, float 0.000000e+00, i32 1
  %bv2 = insertelement <4 x float> %bv1, float 0.000000e+00, i32 2
  %bv  = insertelement <4 x float> %bv2, float 0.000000e+00, i32 3
  ; Merge-masked use (pass-through %c) and zero-masked use of the same operands.
  %res = call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float> %a, <4 x float> %bv, <4 x float> %c, i8 %mask, i32 4)
  %res2 = call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float> %a, <4 x float> %bv, <4 x float> zeroinitializer, i8 %mask, i32 4)
  %res3 = fmul <4 x float> %res, %res2
  ret <4 x float> %res3
}
[AVX-512] Add test cases showing failure to fold zero extending scalar loads in scalar intrinsics without the peephole pass. llvm-svn: 295692 2017-02-21 12:26:07 +08:00			`; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py`
			`; RUN: llc < %s -disable-peephole -mtriple=x86_64-apple-darwin -mcpu=knl \| FileCheck %s`

			`define i8 @test_int_x86_avx512_mask_cmp_ss(<4 x float> %a, float* %b, i8 %mask) {`
			`; CHECK-LABEL: test_int_x86_avx512_mask_cmp_ss:`
			`; CHECK: ## BB#0:`
			`; CHECK-NEXT: kmovw %esi, %k1`
[AVX-512] Use sse_load_f32/f64 in place of scalar_to_vector and scalar load in some patterns. llvm-svn: 295693 2017-02-21 12:26:10 +08:00			`; CHECK-NEXT: vcmpunordss (%rdi), %xmm0, %k0 {%k1}`
[AVX-512] Add test cases showing failure to fold zero extending scalar loads in scalar intrinsics without the peephole pass. llvm-svn: 295692 2017-02-21 12:26:07 +08:00			`; CHECK-NEXT: kmovw %k0, %eax`
			`; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>`
			`; CHECK-NEXT: retq`
			`%b.val = load float, float* %b`
			`%bv0 = insertelement <4 x float> undef, float %b.val, i32 0`
			`%bv1 = insertelement <4 x float> %bv0, float 0.000000e+00, i32 1`
			`%bv2 = insertelement <4 x float> %bv1, float 0.000000e+00, i32 2`
			`%bv = insertelement <4 x float> %bv2, float 0.000000e+00, i32 3`
			`%res2 = call i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float> %a, <4 x float> %bv, i32 3, i8 %mask, i32 4)`
			`ret i8 %res2`
			`}`
			`declare i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float>, <4 x float>, i32, i8, i32)`

			`define <4 x float> @test_mask_max_ss(<4 x float> %a, float* %b, i8 %mask) {`
			`; CHECK-LABEL: test_mask_max_ss:`
			`; CHECK: ## BB#0:`
			`; CHECK-NEXT: kmovw %esi, %k1`
[AVX-512] Use sse_load_f32/f64 in place of scalar_to_vector and scalar load in some patterns. llvm-svn: 295693 2017-02-21 12:26:10 +08:00			`; CHECK-NEXT: vmaxss (%rdi), %xmm0, %xmm0 {%k1} {z}`
[AVX-512] Add test cases showing failure to fold zero extending scalar loads in scalar intrinsics without the peephole pass. llvm-svn: 295692 2017-02-21 12:26:07 +08:00			`; CHECK-NEXT: retq`
			`%b.val = load float, float* %b`
			`%bv0 = insertelement <4 x float> undef, float %b.val, i32 0`
			`%bv1 = insertelement <4 x float> %bv0, float 0.000000e+00, i32 1`
			`%bv2 = insertelement <4 x float> %bv1, float 0.000000e+00, i32 2`
			`%bv = insertelement <4 x float> %bv2, float 0.000000e+00, i32 3`
			`%res = call <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float>%a, <4 x float> %bv, <4 x float> zeroinitializer, i8 %mask, i32 4)`
			`ret <4 x float> %res`
			`}`
			`declare <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float>, <4 x float>, <4 x float>, i8, i32) nounwind readnone`

			`define <4 x float> @test_maskz_add_ss(<4 x float> %a, float* %b, i8 %mask) {`
			`; CHECK-LABEL: test_maskz_add_ss:`
			`; CHECK: ## BB#0:`
			`; CHECK-NEXT: kmovw %esi, %k1`
[AVX-512] Use sse_load_f32/f64 in place of scalar_to_vector and scalar load in some patterns. llvm-svn: 295693 2017-02-21 12:26:10 +08:00			`; CHECK-NEXT: vaddss (%rdi), %xmm0, %xmm0 {%k1} {z}`
[AVX-512] Add test cases showing failure to fold zero extending scalar loads in scalar intrinsics without the peephole pass. llvm-svn: 295692 2017-02-21 12:26:07 +08:00			`; CHECK-NEXT: retq`
			`%b.val = load float, float* %b`
			`%bv0 = insertelement <4 x float> undef, float %b.val, i32 0`
			`%bv1 = insertelement <4 x float> %bv0, float 0.000000e+00, i32 1`
			`%bv2 = insertelement <4 x float> %bv1, float 0.000000e+00, i32 2`
			`%bv = insertelement <4 x float> %bv2, float 0.000000e+00, i32 3`
			`%res = call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float>%a, <4 x float> %bv, <4 x float> zeroinitializer, i8 %mask, i32 4)`
			`ret <4 x float> %res`
			`}`

			`declare <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float>, <4 x float>, <4 x float>, i8, i32) nounwind readnone`

			`declare <2 x double> @llvm.x86.avx512.mask.vfmadd.sd(<2 x double>, <2 x double>, <2 x double>, i8, i32)`

			`define <2 x double> @test_int_x86_avx512_mask_vfmadd_sd(<2 x double> %a, <2 x double> %b, double* %c, i8 %mask){`
			`; CHECK-LABEL: test_int_x86_avx512_mask_vfmadd_sd:`
			`; CHECK: ## BB#0:`
			`; CHECK-NEXT: kmovw %esi, %k1`
[AVX-512] Use sse_load_f32/f64 in place of scalar_to_vector and scalar load in some patterns. llvm-svn: 295693 2017-02-21 12:26:10 +08:00			`; CHECK-NEXT: vfmadd213sd (%rdi), %xmm1, %xmm0 {%k1}`
[AVX-512] Add test cases showing failure to fold zero extending scalar loads in scalar intrinsics without the peephole pass. llvm-svn: 295692 2017-02-21 12:26:07 +08:00			`; CHECK-NEXT: retq`
			`%c.val = load double, double* %c`
			`%cv0 = insertelement <2 x double> undef, double %c.val, i32 0`
			`%cv = insertelement <2 x double> %cv0, double 0.000000e+00, i32 1`
			`%res = call <2 x double> @llvm.x86.avx512.mask.vfmadd.sd(<2 x double> %a, <2 x double> %b, <2 x double> %cv, i8 %mask, i32 4)`
			`ret <2 x double> %res`
			`}`
[AVX512] Add a test to check what happens when a load is referenced by two different masked scalar intrinsics with the same op inputs, but different masking node. We're missing some single use checks in the sse_load_f32/f64 handling that cause us to replicate the load. llvm-svn: 311300 2017-08-21 03:47:00 +08:00
			`; Test what happens when the load when we have multiple uses of the fadds DAG node via separate vselect nodes.`
			`; TODO: We shouldn't fold the load twice here.`
			`define <4 x float> @test_mask_add_ss_double_use(<4 x float> %a, float* %b, i8 %mask, <4 x float> %c) {`
			`; CHECK-LABEL: test_mask_add_ss_double_use:`
			`; CHECK: ## BB#0:`
[X86] When selecting sse_load_f32/f64 pattern, make sure there's only one use of every node all the way back to the root of the match Summary: With masked operations, its possible for the operation node like fadd, fsub, etc. to be used by multiple different vselects. Since the pattern matching will start at the vselect, we need to make sure the operation node itself is only used once before we can fold a load. Otherwise we'll end up folding the same load into multiple instructions. Reviewers: RKSimon, spatel, zvi, igorb Subscribers: llvm-commits Differential Revision: https://reviews.llvm.org/D36938 llvm-svn: 311342 2017-08-22 00:04:04 +08:00			`; CHECK-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero`
[AVX512] Add a test to check what happens when a load is referenced by two different masked scalar intrinsics with the same op inputs, but different masking node. We're missing some single use checks in the sse_load_f32/f64 handling that cause us to replicate the load. llvm-svn: 311300 2017-08-21 03:47:00 +08:00			`; CHECK-NEXT: kmovw %esi, %k1`
[X86] When selecting sse_load_f32/f64 pattern, make sure there's only one use of every node all the way back to the root of the match Summary: With masked operations, its possible for the operation node like fadd, fsub, etc. to be used by multiple different vselects. Since the pattern matching will start at the vselect, we need to make sure the operation node itself is only used once before we can fold a load. Otherwise we'll end up folding the same load into multiple instructions. Reviewers: RKSimon, spatel, zvi, igorb Subscribers: llvm-commits Differential Revision: https://reviews.llvm.org/D36938 llvm-svn: 311342 2017-08-22 00:04:04 +08:00			`; CHECK-NEXT: vaddss %xmm2, %xmm0, %xmm1 {%k1}`
			`; CHECK-NEXT: vaddss %xmm2, %xmm0, %xmm0 {%k1} {z}`
[AVX512] Add a test to check what happens when a load is referenced by two different masked scalar intrinsics with the same op inputs, but different masking node. We're missing some single use checks in the sse_load_f32/f64 handling that cause us to replicate the load. llvm-svn: 311300 2017-08-21 03:47:00 +08:00			`; CHECK-NEXT: vmulps %xmm0, %xmm1, %xmm0`
			`; CHECK-NEXT: retq`
			`%b.val = load float, float* %b`
			`%bv0 = insertelement <4 x float> undef, float %b.val, i32 0`
			`%bv1 = insertelement <4 x float> %bv0, float 0.000000e+00, i32 1`
			`%bv2 = insertelement <4 x float> %bv1, float 0.000000e+00, i32 2`
			`%bv = insertelement <4 x float> %bv2, float 0.000000e+00, i32 3`
			`%res = call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float> %a, <4 x float> %bv, <4 x float> %c, i8 %mask, i32 4)`
			`%res2 = call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float> %a, <4 x float> %bv, <4 x float> zeroinitializer, i8 %mask, i32 4)`
			`%res3 = fmul <4 x float> %res, %res2`
			`ret <4 x float> %res3`
			`}`