; llvm-project/llvm/test/CodeGen/X86/x86-setcc-int-to-fp-combine.ll

; RUN: llc < %s -mtriple=x86_64-apple-darwin | FileCheck %s

define <4 x float> @foo(<4 x float> %val, <4 x float> %test) nounwind {
; The sitofp of the zero-extended compare mask is expected to be folded into
; the AND constant: the constant pool holds 1.0f in every lane and the andps
; is followed directly by the return, with no conversion instruction.
; CHECK-LABEL: LCPI0_0:
; CHECK-NEXT: .long 1065353216              ## float 1.000000e+00
; CHECK-NEXT: .long 1065353216              ## float 1.000000e+00
; CHECK-NEXT: .long 1065353216              ## float 1.000000e+00
; CHECK-NEXT: .long 1065353216              ## float 1.000000e+00
; CHECK-LABEL: foo:
; CHECK: cmpeqps %xmm1, %xmm0
; CHECK-NEXT: andps LCPI0_0(%rip), %xmm0
; CHECK-NEXT: retq

  ; Lane-wise ordered-equal compare, widened to a 0/1 integer per lane.
  %is_eq = fcmp oeq <4 x float> %val, %test
  %mask = zext <4 x i1> %is_eq to <4 x i32>
  ; Same-width int->fp conversion (i32 -> float) of the 0/1 mask.
  %as_fp = sitofp <4 x i32> %mask to <4 x float>
  ret <4 x float> %as_fp
}

; Make sure the operation doesn't try to get folded when the sizes don't match,
; as that ends up crashing later when trying to form a bitcast operation for
; the folded nodes.
define void @foo1(<4 x float> %val, <4 x float> %test, <4 x double>* %p) nounwind {
; The conversion below widens each lane (i32 -> double), so the constant pool
; is expected to still hold the integer mask value 1 in every lane rather than
; a folded floating-point constant.
; CHECK-LABEL: LCPI1_0:
; CHECK-NEXT: .long 1                       ## 0x1
; CHECK-NEXT: .long 1                       ## 0x1
; CHECK-NEXT: .long 1                       ## 0x1
; CHECK-NEXT: .long 1                       ## 0x1
; CHECK-LABEL: foo1:
;   FIXME: The conversion currently gets scalarized. If/when the compiler
;          learns to make better use of [V]CVTDQ2PD, these checks will need to
;          be updated.
; CHECK: cvtsi2sdq
; CHECK: cvtsi2sdq
; CHECK: cvtsi2sdq
; CHECK: cvtsi2sdq
  ; Lane-wise ordered-equal compare, widened to a 0/1 integer per lane.
  %is_eq = fcmp oeq <4 x float> %val, %test
  %mask = zext <4 x i1> %is_eq to <4 x i32>
  ; Result element type (double) is wider than the source element type (i32).
  %as_fp = sitofp <4 x i32> %mask to <4 x double>
  store <4 x double> %as_fp, <4 x double>* %p
  ret void
}

; Also test the general purpose constant folding of int->fp.
define void @foo2(<4 x float>* noalias %result) nounwind {
; The uitofp of the constant vector <4, 5, 6, 7> is expected to be folded lane
; by lane, so the constant pool holds the floats 4.0 through 7.0 and the
; function loads them with a movaps.
; CHECK-LABEL: LCPI2_0:
; CHECK-NEXT: .long 1082130432              ## float 4.000000e+00
; CHECK-NEXT: .long 1084227584              ## float 5.000000e+00
; CHECK-NEXT: .long 1086324736              ## float 6.000000e+00
; CHECK-NEXT: .long 1088421888              ## float 7.000000e+00
; CHECK-LABEL: foo2:
; CHECK:  movaps LCPI2_0(%rip), %xmm0

  ; Non-splat constant input: every lane converts to a different float.
  %folded = uitofp <4 x i32> <i32 4, i32 5, i32 6, i32 7> to <4 x float>
  store <4 x float> %folded, <4 x float>* %result
  ret void
}

; Fold explicit AND operations when the constant isn't a splat of a single
; scalar value like what the zext creates.
define <4 x float> @foo3(<4 x float> %val, <4 x float> %test) nounwind {
; The zext yields 0 or 1 per lane, so ANDing with <255, 256, 257, 258> keeps
; bit 0 only in lanes 0 and 2. The folded floating-point mask is therefore
; expected to be <1.0, 0.0, 1.0, 0.0>.
; CHECK-LABEL: LCPI3_0:
; CHECK-NEXT: .long 1065353216              ## float 1.000000e+00
; CHECK-NEXT: .long 0                       ## float 0.000000e+00
; CHECK-NEXT: .long 1065353216              ## float 1.000000e+00
; CHECK-NEXT: .long 0                       ## float 0.000000e+00
; CHECK-LABEL: foo3:
; CHECK: cmpeqps %xmm1, %xmm0
; CHECK-NEXT: andps LCPI3_0(%rip), %xmm0
; As in @foo, pin the return right after the andps so that a conversion
; instruction left behind after the AND would make the test fail.
; CHECK-NEXT: retq
  %cmp = fcmp oeq <4 x float> %val, %test
  %ext = zext <4 x i1> %cmp to <4 x i32>
  ; Explicit, non-splat AND constant applied on top of the zext mask.
  %and = and <4 x i32> %ext, <i32 255, i32 256, i32 257, i32 258>
  %result = sitofp <4 x i32> %and to <4 x float>
  ret <4 x float> %result
}
X86: Constant fold converting vector setcc results to float. Since the result of a SETCC for X86 is 0 or -1 in each lane, we can move unary operations, in this case [su]int_to_fp through the mask operation and constant fold the operation away. Generally speaking: UNARYOP(AND(VECTOR_CMP(x,y), constant)) --> AND(VECTOR_CMP(x,y), constant2) where constant2 is UNARYOP(constant). This implements the transform where UNARYOP is [su]int_to_fp. For example, consider the simple function: define <4 x float> @foo(<4 x float> %val, <4 x float> %test) nounwind { %cmp = fcmp oeq <4 x float> %val, %test %ext = zext <4 x i1> %cmp to <4 x i32> %result = sitofp <4 x i32> %ext to <4 x float> ret <4 x float> %result } Before this change, the SSE code is generated as: LCPI0_0: .long 1 ## 0x1 .long 1 ## 0x1 .long 1 ## 0x1 .long 1 ## 0x1 .section __TEXT,__text,regular,pure_instructions .globl _foo .align 4, 0x90 _foo: ## @foo cmpeqps %xmm1, %xmm0 andps LCPI0_0(%rip), %xmm0 cvtdq2ps %xmm0, %xmm0 retq After, the code is improved to: LCPI0_0: .long 1065353216 ## float 1.000000e+00 .long 1065353216 ## float 1.000000e+00 .long 1065353216 ## float 1.000000e+00 .long 1065353216 ## float 1.000000e+00 .section __TEXT,__text,regular,pure_instructions .globl _foo .align 4, 0x90 _foo: ## @foo cmpeqps %xmm1, %xmm0 andps LCPI0_0(%rip), %xmm0 retq The cvtdq2ps has been constant folded away and the floating point 1.0f vector lanes are materialized directly via the ModRM operand of andps. llvm-svn: 213342 2014-07-18 08:40:56 +08:00			`; RUN: llc < %s -mtriple=x86_64-apple-darwin \| FileCheck %s`

			`define <4 x float> @foo(<4 x float> %val, <4 x float> %test) nounwind {`
DAG: fp->int conversion for non-splat constants. Constant fold the lanes of the input constant build_vector individually so we correctly handle when the vector elements are not all the same constant value. PR20394 llvm-svn: 213798 2014-07-24 04:41:31 +08:00			`; CHECK-LABEL: LCPI0_0:`
X86: Constant fold converting vector setcc results to float. Since the result of a SETCC for X86 is 0 or -1 in each lane, we can move unary operations, in this case [su]int_to_fp through the mask operation and constant fold the operation away. Generally speaking: UNARYOP(AND(VECTOR_CMP(x,y), constant)) --> AND(VECTOR_CMP(x,y), constant2) where constant2 is UNARYOP(constant). This implements the transform where UNARYOP is [su]int_to_fp. For example, consider the simple function: define <4 x float> @foo(<4 x float> %val, <4 x float> %test) nounwind { %cmp = fcmp oeq <4 x float> %val, %test %ext = zext <4 x i1> %cmp to <4 x i32> %result = sitofp <4 x i32> %ext to <4 x float> ret <4 x float> %result } Before this change, the SSE code is generated as: LCPI0_0: .long 1 ## 0x1 .long 1 ## 0x1 .long 1 ## 0x1 .long 1 ## 0x1 .section __TEXT,__text,regular,pure_instructions .globl _foo .align 4, 0x90 _foo: ## @foo cmpeqps %xmm1, %xmm0 andps LCPI0_0(%rip), %xmm0 cvtdq2ps %xmm0, %xmm0 retq After, the code is improved to: LCPI0_0: .long 1065353216 ## float 1.000000e+00 .long 1065353216 ## float 1.000000e+00 .long 1065353216 ## float 1.000000e+00 .long 1065353216 ## float 1.000000e+00 .section __TEXT,__text,regular,pure_instructions .globl _foo .align 4, 0x90 _foo: ## @foo cmpeqps %xmm1, %xmm0 andps LCPI0_0(%rip), %xmm0 retq The cvtdq2ps has been constant folded away and the floating point 1.0f vector lanes are materialized directly via the ModRM operand of andps. llvm-svn: 213342 2014-07-18 08:40:56 +08:00			`; CHECK-NEXT: .long 1065353216 ## float 1.000000e+00`
			`; CHECK-NEXT: .long 1065353216 ## float 1.000000e+00`
			`; CHECK-NEXT: .long 1065353216 ## float 1.000000e+00`
			`; CHECK-NEXT: .long 1065353216 ## float 1.000000e+00`
			`; CHECK-LABEL: foo:`
			`; CHECK: cmpeqps %xmm1, %xmm0`
			`; CHECK-NEXT: andps LCPI0_0(%rip), %xmm0`
			`; CHECK-NEXT: retq`

			`%cmp = fcmp oeq <4 x float> %val, %test`
			`%ext = zext <4 x i1> %cmp to <4 x i32>`
			`%result = sitofp <4 x i32> %ext to <4 x float>`
			`ret <4 x float> %result`
			`}`
DAG: fp->int conversion for non-splat constants. Constant fold the lanes of the input constant build_vector individually so we correctly handle when the vector elements are not all the same constant value. PR20394 llvm-svn: 213798 2014-07-24 04:41:31 +08:00
X86: restrict combine to when type sizes are safe. The folding of unary operations through a vector compare and mask operation is only safe if the unary operation result is of the same size as its input. For example, it's not safe for [su]itofp from v4i32 to v4f64. llvm-svn: 213799 2014-07-24 04:41:38 +08:00			`; Make sure the operation doesn't try to get folded when the sizes don't match,`
			`; as that ends up crashing later when trying to form a bitcast operation for`
			`; the folded nodes.`
			`define void @foo1(<4 x float> %val, <4 x float> %test, <4 x double>* %p) nounwind {`
DAG: fp->int conversion for non-splat constants. Constant fold the lanes of the input constant build_vector individually so we correctly handle when the vector elements are not all the same constant value. PR20394 llvm-svn: 213798 2014-07-24 04:41:31 +08:00			`; CHECK-LABEL: LCPI1_0:`
X86: restrict combine to when type sizes are safe. The folding of unary operations through a vector compare and mask operation is only safe if the unary operation result is of the same size as its input. For example, it's not safe for [su]itofp from v4i32 to v4f64. llvm-svn: 213799 2014-07-24 04:41:38 +08:00			`; CHECK-NEXT: .long 1 ## 0x1`
			`; CHECK-NEXT: .long 1 ## 0x1`
			`; CHECK-NEXT: .long 1 ## 0x1`
			`; CHECK-NEXT: .long 1 ## 0x1`
			`; CHECK-LABEL: foo1:`
			`; FIXME: The operation gets scalarized. If/when the compiler learns to better`
			`; use [V]CVTDQ2PD, this will need updated.`
			`; CHECK: cvtsi2sdq`
			`; CHECK: cvtsi2sdq`
			`; CHECK: cvtsi2sdq`
			`; CHECK: cvtsi2sdq`
			`%cmp = fcmp oeq <4 x float> %val, %test`
			`%ext = zext <4 x i1> %cmp to <4 x i32>`
			`%result = sitofp <4 x i32> %ext to <4 x double>`
			`store <4 x double> %result, <4 x double>* %p`
			`ret void`
			`}`

			`; Also test the general purpose constant folding of int->fp.`
			`define void @foo2(<4 x float>* noalias %result) nounwind {`
			`; CHECK-LABEL: LCPI2_0:`
DAG: fp->int conversion for non-splat constants. Constant fold the lanes of the input constant build_vector individually so we correctly handle when the vector elements are not all the same constant value. PR20394 llvm-svn: 213798 2014-07-24 04:41:31 +08:00			`; CHECK-NEXT: .long 1082130432 ## float 4.000000e+00`
			`; CHECK-NEXT: .long 1084227584 ## float 5.000000e+00`
			`; CHECK-NEXT: .long 1086324736 ## float 6.000000e+00`
			`; CHECK-NEXT: .long 1088421888 ## float 7.000000e+00`
X86: restrict combine to when type sizes are safe. The folding of unary operations through a vector compare and mask operation is only safe if the unary operation result is of the same size as its input. For example, it's not safe for [su]itofp from v4i32 to v4f64. llvm-svn: 213799 2014-07-24 04:41:38 +08:00			`; CHECK-LABEL: foo2:`
			`; CHECK: movaps LCPI2_0(%rip), %xmm0`
DAG: fp->int conversion for non-splat constants. Constant fold the lanes of the input constant build_vector individually so we correctly handle when the vector elements are not all the same constant value. PR20394 llvm-svn: 213798 2014-07-24 04:41:31 +08:00
			`%val = uitofp <4 x i32> <i32 4, i32 5, i32 6, i32 7> to <4 x float>`
			`store <4 x float> %val, <4 x float>* %result`
			`ret void`
			`}`
[X86,AArch64] Extend vcmp w/ unary op combine to work w/ more constants. The transform to constant fold unary operations with an AND across a vector comparison applies when the constant is not a splat of a scalar as well. llvm-svn: 213800 2014-07-24 04:41:43 +08:00
			`; Fold explicit AND operations when the constant isn't a splat of a single`
			`; scalar value like what the zext creates.`
			`define <4 x float> @foo3(<4 x float> %val, <4 x float> %test) nounwind {`
			`; CHECK-LABEL: LCPI3_0:`
			`; CHECK-NEXT: .long 1065353216 ## float 1.000000e+00`
			`; CHECK-NEXT: .long 0 ## float 0.000000e+00`
			`; CHECK-NEXT: .long 1065353216 ## float 1.000000e+00`
			`; CHECK-NEXT: .long 0 ## float 0.000000e+00`
			`; CHECK-LABEL: foo3:`
			`; CHECK: cmpeqps %xmm1, %xmm0`
			`; CHECK-NEXT: andps LCPI3_0(%rip), %xmm0`
			`%cmp = fcmp oeq <4 x float> %val, %test`
			`%ext = zext <4 x i1> %cmp to <4 x i32>`
			`%and = and <4 x i32> %ext, <i32 255, i32 256, i32 257, i32 258>`
			`%result = sitofp <4 x i32> %and to <4 x float>`
			`ret <4 x float> %result`
			`}`