llvm-project/llvm/test/CodeGen/X86/avx512dq-intrinsics-fast-is...

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512dq | FileCheck %s --check-prefixes=CHECK,X86
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq | FileCheck %s --check-prefixes=CHECK,X64

; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/avx512dq-builtins.c

; Masked 512-bit packed-double fpclass test.
; The IR calls the mask-less llvm.x86.avx512.fpclass.pd.512 intrinsic with
; immediate 4 and applies %__U afterwards with an explicit <8 x i1> 'and'.
; The expected code for both triples keeps vfpclasspd unmasked and performs the
; masking as a byte-wide AND on the GPR copy of %k0 (an %esp-relative memory
; operand on i686, %dil on x86_64).
define zeroext i8 @test_mm512_mask_fpclass_pd_mask(i8 zeroext %__U, <8 x double> %__A) {
; X86-LABEL: test_mm512_mask_fpclass_pd_mask:
; X86:       # %bb.0: # %entry
; X86-NEXT:    vfpclasspd $4, %zmm0, %k0
; X86-NEXT:    kmovw %k0, %eax
; X86-NEXT:    andb {{[0-9]+}}(%esp), %al
; X86-NEXT:    # kill: def $al killed $al killed $eax
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_fpclass_pd_mask:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vfpclasspd $4, %zmm0, %k0
; X64-NEXT:    kmovw %k0, %eax
; X64-NEXT:    andb %dil, %al
; X64-NEXT:    # kill: def $al killed $al killed $eax
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  ; Classify all eight lanes; the result is already a <8 x i1> mask.
  %0 = tail call <8 x i1> @llvm.x86.avx512.fpclass.pd.512(<8 x double> %__A, i32 4)
  ; Reinterpret the scalar mask argument as one bit per lane.
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = and <8 x i1> %0, %1
  ; Marshal the masked result back to the scalar i8 return type.
  %3 = bitcast <8 x i1> %2 to i8
  ret i8 %3
}

; Packed-double fpclass intrinsic: takes the <8 x double> vector and an i32
; immediate, and returns the <8 x i1> result directly (no mask operand).
declare <8 x i1> @llvm.x86.avx512.fpclass.pd.512(<8 x double>, i32)

; Unmasked 512-bit packed-double fpclass test.
; The <8 x i1> intrinsic result is bitcast straight to i8 and returned, so the
; expected code is just vfpclasspd + kmovw. Both triples share one set of
; assertions; only the return mnemonic differs and is matched by a regex.
define zeroext i8 @test_mm512_fpclass_pd_mask(<8 x double> %__A) {
; CHECK-LABEL: test_mm512_fpclass_pd_mask:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vfpclasspd $4, %zmm0, %k0
; CHECK-NEXT:    kmovw %k0, %eax
; CHECK-NEXT:    # kill: def $al killed $al killed $eax
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = tail call <8 x i1> @llvm.x86.avx512.fpclass.pd.512(<8 x double> %__A, i32 4)
  ; Marshal the per-lane result to the scalar i8 return type.
  %1 = bitcast <8 x i1> %0 to i8
  ret i8 %1
}

; Masked 512-bit packed-single fpclass test.
; The IR calls the mask-less llvm.x86.avx512.fpclass.ps.512 intrinsic with
; immediate 4 and applies %__U afterwards with an explicit <16 x i1> 'and'.
; The expected code keeps vfpclassps unmasked and ANDs the GPR copy of %k0:
; a 16-bit AND with an %esp-relative memory operand on i686, and a 32-bit
; register AND with %edi on x86_64.
define zeroext i16 @test_mm512_mask_fpclass_ps_mask(i16 zeroext %__U, <16 x float> %__A) {
; X86-LABEL: test_mm512_mask_fpclass_ps_mask:
; X86:       # %bb.0: # %entry
; X86-NEXT:    vfpclassps $4, %zmm0, %k0
; X86-NEXT:    kmovw %k0, %eax
; X86-NEXT:    andw {{[0-9]+}}(%esp), %ax
; X86-NEXT:    # kill: def $ax killed $ax killed $eax
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_fpclass_ps_mask:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vfpclassps $4, %zmm0, %k0
; X64-NEXT:    kmovw %k0, %eax
; X64-NEXT:    andl %edi, %eax
; X64-NEXT:    # kill: def $ax killed $ax killed $eax
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  ; Classify all sixteen lanes; the result is already a <16 x i1> mask.
  %0 = tail call <16 x i1> @llvm.x86.avx512.fpclass.ps.512(<16 x float> %__A, i32 4)
  ; Reinterpret the scalar mask argument as one bit per lane.
  %1 = bitcast i16 %__U to <16 x i1>
  %2 = and <16 x i1> %0, %1
  ; Marshal the masked result back to the scalar i16 return type.
  %3 = bitcast <16 x i1> %2 to i16
  ret i16 %3
}

; Packed-single fpclass intrinsic: takes the <16 x float> vector and an i32
; immediate, and returns the <16 x i1> result directly (no mask operand).
declare <16 x i1> @llvm.x86.avx512.fpclass.ps.512(<16 x float>, i32)

; Unmasked 512-bit packed-single fpclass test.
; The <16 x i1> intrinsic result is bitcast straight to i16 and returned, so
; the expected code is just vfpclassps + kmovw. Both triples share one set of
; assertions; only the return mnemonic differs and is matched by a regex.
define zeroext i16 @test_mm512_fpclass_ps_mask(<16 x float> %__A) {
; CHECK-LABEL: test_mm512_fpclass_ps_mask:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vfpclassps $4, %zmm0, %k0
; CHECK-NEXT:    kmovw %k0, %eax
; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = tail call <16 x i1> @llvm.x86.avx512.fpclass.ps.512(<16 x float> %__A, i32 4)
  ; Marshal the per-lane result to the scalar i16 return type.
  %1 = bitcast <16 x i1> %0 to i16
  ret i16 %1
}

; Unmasked scalar-double fpclass test.
; The argument arrives as <4 x float> and is bitcast to <2 x double> before the
; call. The intrinsic still takes a mask operand; passing i8 -1 (all ones) is
; expected to produce a vfpclasssd with no write-mask register.
define zeroext i8 @test_mm_fpclass_sd_mask(<4 x float> %__A) {
; CHECK-LABEL: test_mm_fpclass_sd_mask:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vfpclasssd $2, %xmm0, %k0
; CHECK-NEXT:    kmovw %k0, %eax
; CHECK-NEXT:    # kill: def $al killed $al killed $eax
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = bitcast <4 x float> %__A to <2 x double>
  ; Immediate 2, mask -1 (all bits set).
  %1 = tail call i8 @llvm.x86.avx512.mask.fpclass.sd(<2 x double> %0, i32 2, i8 -1)
  ret i8 %1
}

; Scalar-double fpclass intrinsic: operands are the <2 x double> source, an
; i32 immediate and an i8 mask; the result is returned as an i8.
declare i8 @llvm.x86.avx512.mask.fpclass.sd(<2 x double>, i32, i8)

; Masked scalar-double fpclass test.
; %__U is passed directly as the intrinsic's mask operand, and the expected
; code loads it into %k1 (kmovb from an %esp-relative slot on i686, kmovw from
; %edi on x86_64) and uses it as the write-mask of vfpclasssd.
define zeroext i8 @test_mm_mask_fpclass_sd_mask(i8 zeroext %__U, <4 x float> %__A) {
; X86-LABEL: test_mm_mask_fpclass_sd_mask:
; X86:       # %bb.0: # %entry
; X86-NEXT:    kmovb {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vfpclasssd $2, %xmm0, %k0 {%k1}
; X86-NEXT:    kmovw %k0, %eax
; X86-NEXT:    # kill: def $al killed $al killed $eax
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_fpclass_sd_mask:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfpclasssd $2, %xmm0, %k0 {%k1}
; X64-NEXT:    kmovw %k0, %eax
; X64-NEXT:    # kill: def $al killed $al killed $eax
; X64-NEXT:    retq
entry:
  ; The argument arrives as <4 x float>; view it as <2 x double> for the call.
  %0 = bitcast <4 x float> %__A to <2 x double>
  %1 = tail call i8 @llvm.x86.avx512.mask.fpclass.sd(<2 x double> %0, i32 2, i8 %__U)
  ret i8 %1
}

; Unmasked scalar-single fpclass test.
; The intrinsic still takes a mask operand; passing i8 -1 (all ones) is
; expected to produce a vfpclassss with no write-mask register. Both triples
; share one set of assertions.
define zeroext i8 @test_mm_fpclass_ss_mask(<4 x float> %__A) {
; CHECK-LABEL: test_mm_fpclass_ss_mask:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vfpclassss $2, %xmm0, %k0
; CHECK-NEXT:    kmovw %k0, %eax
; CHECK-NEXT:    # kill: def $al killed $al killed $eax
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  ; Immediate 2, mask -1 (all bits set).
  %0 = tail call i8 @llvm.x86.avx512.mask.fpclass.ss(<4 x float> %__A, i32 2, i8 -1)
  ret i8 %0
}

; Scalar-single fpclass intrinsic: operands are the <4 x float> source, an
; i32 immediate and an i8 mask; the result is returned as an i8.
declare i8 @llvm.x86.avx512.mask.fpclass.ss(<4 x float>, i32, i8)

; Masked scalar-single fpclass test.
; %__U is passed directly as the intrinsic's mask operand, and the expected
; code loads it into %k1 (kmovb from an %esp-relative slot on i686, kmovw from
; %edi on x86_64) and uses it as the write-mask of vfpclassss.
define zeroext i8 @test_mm_mask_fpclass_ss_mask(i8 zeroext %__U, <4 x float> %__A) {
; X86-LABEL: test_mm_mask_fpclass_ss_mask:
; X86:       # %bb.0: # %entry
; X86-NEXT:    kmovb {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vfpclassss $2, %xmm0, %k0 {%k1}
; X86-NEXT:    kmovw %k0, %eax
; X86-NEXT:    # kill: def $al killed $al killed $eax
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_fpclass_ss_mask:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfpclassss $2, %xmm0, %k0 {%k1}
; X64-NEXT:    kmovw %k0, %eax
; X64-NEXT:    # kill: def $al killed $al killed $eax
; X64-NEXT:    retq
entry:
  %0 = tail call i8 @llvm.x86.avx512.mask.fpclass.ss(<4 x float> %__A, i32 2, i8 %__U)
  ret i8 %0
}
[X86] Redefine avx512 packed fpclass intrinsics to return a vXi1 mask and implement the mask input argument using an 'and' IR instruction. This recommits r335562 and r335563 as a single commit. The frontend will surround the intrinsic with the appropriate marshalling to/from a scalar type to match the signature of the builtin that software expects. By exposing the vXi1 type directly in the llvm intrinsic we make it available to optimizers much earlier. This can enable the scalar marshalling code to be optimized away. llvm-svn: 335568 2018-06-26 09:37:02 +08:00			`; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py`
			`; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512dq \| FileCheck %s --check-prefixes=CHECK,X86`
			`; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq \| FileCheck %s --check-prefixes=CHECK,X64`

			`; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/avx512dq-builtins.c`

			`define zeroext i8 @test_mm512_mask_fpclass_pd_mask(i8 zeroext %__U, <8 x double> %__A) {`
			`; X86-LABEL: test_mm512_mask_fpclass_pd_mask:`
			`; X86: # %bb.0: # %entry`
			`; X86-NEXT: vfpclasspd $4, %zmm0, %k0`
			`; X86-NEXT: kmovw %k0, %eax`
			`; X86-NEXT: andb {{[0-9]+}}(%esp), %al`
			`; X86-NEXT: # kill: def $al killed $al killed $eax`
			`; X86-NEXT: vzeroupper`
			`; X86-NEXT: retl`
			`;`
			`; X64-LABEL: test_mm512_mask_fpclass_pd_mask:`
			`; X64: # %bb.0: # %entry`
			`; X64-NEXT: vfpclasspd $4, %zmm0, %k0`
			`; X64-NEXT: kmovw %k0, %eax`
			`; X64-NEXT: andb %dil, %al`
			`; X64-NEXT: # kill: def $al killed $al killed $eax`
			`; X64-NEXT: vzeroupper`
			`; X64-NEXT: retq`
			`entry:`
[X86] Rename the autoupgraded versions of the packed fp compare and fpclass intrinsics that don't take a mask as input to exclude '.mask.' from their names. I think the intrinsics named 'avx512.mask.' should refer to the previous behavior of taking a mask argument in the intrinsic instead of using a 'select' or 'and' instruction in IR to accomplish the masking. This is more consistent with the goal that eventually we will have no intrinsics that have masking built in. When we reach that goal, we should have no intrinsics named "avx512.mask". llvm-svn: 335744 2018-06-27 23:57:53 +08:00			`%0 = tail call <8 x i1> @llvm.x86.avx512.fpclass.pd.512(<8 x double> %__A, i32 4)`
[X86] Redefine avx512 packed fpclass intrinsics to return a vXi1 mask and implement the mask input argument using an 'and' IR instruction. This recommits r335562 and r335563 as a single commit. The frontend will surround the intrinsic with the appropriate marshalling to/from a scalar type to match the signature of the builtin that software expects. By exposing the vXi1 type directly in the llvm intrinsic we make it available to optimizers much earlier. This can enable the scalar marshalling code to be optimized away. llvm-svn: 335568 2018-06-26 09:37:02 +08:00			`%1 = bitcast i8 %__U to <8 x i1>`
			`%2 = and <8 x i1> %0, %1`
			`%3 = bitcast <8 x i1> %2 to i8`
			`ret i8 %3`
			`}`

[X86] Rename the autoupgraded versions of the packed fp compare and fpclass intrinsics that don't take a mask as input to exclude '.mask.' from their names. I think the intrinsics named 'avx512.mask.' should refer to the previous behavior of taking a mask argument in the intrinsic instead of using a 'select' or 'and' instruction in IR to accomplish the masking. This is more consistent with the goal that eventually we will have no intrinsics that have masking built in. When we reach that goal, we should have no intrinsics named "avx512.mask". llvm-svn: 335744 2018-06-27 23:57:53 +08:00			`declare <8 x i1> @llvm.x86.avx512.fpclass.pd.512(<8 x double>, i32)`
[X86] Redefine avx512 packed fpclass intrinsics to return a vXi1 mask and implement the mask input argument using an 'and' IR instruction. This recommits r335562 and r335563 as a single commit. The frontend will surround the intrinsic with the appropriate marshalling to/from a scalar type to match the signature of the builtin that software expects. By exposing the vXi1 type directly in the llvm intrinsic we make it available to optimizers much earlier. This can enable the scalar marshalling code to be optimized away. llvm-svn: 335568 2018-06-26 09:37:02 +08:00
			`define zeroext i8 @test_mm512_fpclass_pd_mask(<8 x double> %__A) {`
			`; CHECK-LABEL: test_mm512_fpclass_pd_mask:`
			`; CHECK: # %bb.0: # %entry`
			`; CHECK-NEXT: vfpclasspd $4, %zmm0, %k0`
			`; CHECK-NEXT: kmovw %k0, %eax`
			`; CHECK-NEXT: # kill: def $al killed $al killed $eax`
			`; CHECK-NEXT: vzeroupper`
			`; CHECK-NEXT: ret{{[l\|q]}}`
			`entry:`
[X86] Rename the autoupgraded versions of the packed fp compare and fpclass intrinsics that don't take a mask as input to exclude '.mask.' from their names. I think the intrinsics named 'avx512.mask.' should refer to the previous behavior of taking a mask argument in the intrinsic instead of using a 'select' or 'and' instruction in IR to accomplish the masking. This is more consistent with the goal that eventually we will have no intrinsics that have masking built in. When we reach that goal, we should have no intrinsics named "avx512.mask". llvm-svn: 335744 2018-06-27 23:57:53 +08:00			`%0 = tail call <8 x i1> @llvm.x86.avx512.fpclass.pd.512(<8 x double> %__A, i32 4)`
[X86] Redefine avx512 packed fpclass intrinsics to return a vXi1 mask and implement the mask input argument using an 'and' IR instruction. This recommits r335562 and r335563 as a single commit. The frontend will surround the intrinsic with the appropriate marshalling to/from a scalar type to match the signature of the builtin that software expects. By exposing the vXi1 type directly in the llvm intrinsic we make it available to optimizers much earlier. This can enable the scalar marshalling code to be optimized away. llvm-svn: 335568 2018-06-26 09:37:02 +08:00			`%1 = bitcast <8 x i1> %0 to i8`
			`ret i8 %1`
			`}`

			`define zeroext i16 @test_mm512_mask_fpclass_ps_mask(i16 zeroext %__U, <16 x float> %__A) {`
			`; X86-LABEL: test_mm512_mask_fpclass_ps_mask:`
			`; X86: # %bb.0: # %entry`
			`; X86-NEXT: vfpclassps $4, %zmm0, %k0`
			`; X86-NEXT: kmovw %k0, %eax`
			`; X86-NEXT: andw {{[0-9]+}}(%esp), %ax`
			`; X86-NEXT: # kill: def $ax killed $ax killed $eax`
			`; X86-NEXT: vzeroupper`
			`; X86-NEXT: retl`
			`;`
			`; X64-LABEL: test_mm512_mask_fpclass_ps_mask:`
			`; X64: # %bb.0: # %entry`
			`; X64-NEXT: vfpclassps $4, %zmm0, %k0`
			`; X64-NEXT: kmovw %k0, %eax`
			`; X64-NEXT: andl %edi, %eax`
			`; X64-NEXT: # kill: def $ax killed $ax killed $eax`
			`; X64-NEXT: vzeroupper`
			`; X64-NEXT: retq`
			`entry:`
[X86] Rename the autoupgraded versions of the packed fp compare and fpclass intrinsics that don't take a mask as input to exclude '.mask.' from their names. I think the intrinsics named 'avx512.mask.' should refer to the previous behavior of taking a mask argument in the intrinsic instead of using a 'select' or 'and' instruction in IR to accomplish the masking. This is more consistent with the goal that eventually we will have no intrinsics that have masking built in. When we reach that goal, we should have no intrinsics named "avx512.mask". llvm-svn: 335744 2018-06-27 23:57:53 +08:00			`%0 = tail call <16 x i1> @llvm.x86.avx512.fpclass.ps.512(<16 x float> %__A, i32 4)`
[X86] Redefine avx512 packed fpclass intrinsics to return a vXi1 mask and implement the mask input argument using an 'and' IR instruction. This recommits r335562 and r335563 as a single commit. The frontend will surround the intrinsic with the appropriate marshalling to/from a scalar type to match the signature of the builtin that software expects. By exposing the vXi1 type directly in the llvm intrinsic we make it available to optimizers much earlier. This can enable the scalar marshalling code to be optimized away. llvm-svn: 335568 2018-06-26 09:37:02 +08:00			`%1 = bitcast i16 %__U to <16 x i1>`
			`%2 = and <16 x i1> %0, %1`
			`%3 = bitcast <16 x i1> %2 to i16`
			`ret i16 %3`
			`}`

[X86] Rename the autoupgraded versions of the packed fp compare and fpclass intrinsics that don't take a mask as input to exclude '.mask.' from their names. I think the intrinsics named 'avx512.mask.' should refer to the previous behavior of taking a mask argument in the intrinsic instead of using a 'select' or 'and' instruction in IR to accomplish the masking. This is more consistent with the goal that eventually we will have no intrinsics that have masking built in. When we reach that goal, we should have no intrinsics named "avx512.mask". llvm-svn: 335744 2018-06-27 23:57:53 +08:00			`declare <16 x i1> @llvm.x86.avx512.fpclass.ps.512(<16 x float>, i32)`
[X86] Redefine avx512 packed fpclass intrinsics to return a vXi1 mask and implement the mask input argument using an 'and' IR instruction. This recommits r335562 and r335563 as a single commit. The frontend will surround the intrinsic with the appropriate marshalling to/from a scalar type to match the signature of the builtin that software expects. By exposing the vXi1 type directly in the llvm intrinsic we make it available to optimizers much earlier. This can enable the scalar marshalling code to be optimized away. llvm-svn: 335568 2018-06-26 09:37:02 +08:00
			`define zeroext i16 @test_mm512_fpclass_ps_mask(<16 x float> %__A) {`
			`; CHECK-LABEL: test_mm512_fpclass_ps_mask:`
			`; CHECK: # %bb.0: # %entry`
			`; CHECK-NEXT: vfpclassps $4, %zmm0, %k0`
			`; CHECK-NEXT: kmovw %k0, %eax`
			`; CHECK-NEXT: # kill: def $ax killed $ax killed $eax`
			`; CHECK-NEXT: vzeroupper`
			`; CHECK-NEXT: ret{{[l\|q]}}`
			`entry:`
[X86] Rename the autoupgraded versions of the packed fp compare and fpclass intrinsics that don't take a mask as input to exclude '.mask.' from their names. I think the intrinsics named 'avx512.mask.' should refer to the previous behavior of taking a mask argument in the intrinsic instead of using a 'select' or 'and' instruction in IR to accomplish the masking. This is more consistent with the goal that eventually we will have no intrinsics that have masking built in. When we reach that goal, we should have no intrinsics named "avx512.mask". llvm-svn: 335744 2018-06-27 23:57:53 +08:00			`%0 = tail call <16 x i1> @llvm.x86.avx512.fpclass.ps.512(<16 x float> %__A, i32 4)`
[X86] Redefine avx512 packed fpclass intrinsics to return a vXi1 mask and implement the mask input argument using an 'and' IR instruction. This recommits r335562 and r335563 as a single commit. The frontend will surround the intrinsic with the appropriate marshalling to/from a scalar type to match the signature of the builtin that software expects. By exposing the vXi1 type directly in the llvm intrinsic we make it available to optimizers much earlier. This can enable the scalar marshalling code to be optimized away. llvm-svn: 335568 2018-06-26 09:37:02 +08:00			`%1 = bitcast <16 x i1> %0 to i16`
			`ret i16 %1`
			`}`

			`define zeroext i8 @test_mm_fpclass_sd_mask(<4 x float> %__A) {`
			`; CHECK-LABEL: test_mm_fpclass_sd_mask:`
			`; CHECK: # %bb.0: # %entry`
			`; CHECK-NEXT: vfpclasssd $2, %xmm0, %k0`
			`; CHECK-NEXT: kmovw %k0, %eax`
			`; CHECK-NEXT: # kill: def $al killed $al killed $eax`
			`; CHECK-NEXT: ret{{[l\|q]}}`
			`entry:`
			`%0 = bitcast <4 x float> %__A to <2 x double>`
			`%1 = tail call i8 @llvm.x86.avx512.mask.fpclass.sd(<2 x double> %0, i32 2, i8 -1)`
			`ret i8 %1`
			`}`

			`declare i8 @llvm.x86.avx512.mask.fpclass.sd(<2 x double>, i32, i8)`

			`define zeroext i8 @test_mm_mask_fpclass_sd_mask(i8 zeroext %__U, <4 x float> %__A) {`
			`; X86-LABEL: test_mm_mask_fpclass_sd_mask:`
			`; X86: # %bb.0: # %entry`
			`; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1`
			`; X86-NEXT: vfpclasssd $2, %xmm0, %k0 {%k1}`
			`; X86-NEXT: kmovw %k0, %eax`
			`; X86-NEXT: # kill: def $al killed $al killed $eax`
			`; X86-NEXT: retl`
			`;`
			`; X64-LABEL: test_mm_mask_fpclass_sd_mask:`
			`; X64: # %bb.0: # %entry`
			`; X64-NEXT: kmovw %edi, %k1`
			`; X64-NEXT: vfpclasssd $2, %xmm0, %k0 {%k1}`
			`; X64-NEXT: kmovw %k0, %eax`
			`; X64-NEXT: # kill: def $al killed $al killed $eax`
			`; X64-NEXT: retq`
			`entry:`
			`%0 = bitcast <4 x float> %__A to <2 x double>`
			`%1 = tail call i8 @llvm.x86.avx512.mask.fpclass.sd(<2 x double> %0, i32 2, i8 %__U)`
			`ret i8 %1`
			`}`

			`define zeroext i8 @test_mm_fpclass_ss_mask(<4 x float> %__A) {`
			`; CHECK-LABEL: test_mm_fpclass_ss_mask:`
			`; CHECK: # %bb.0: # %entry`
			`; CHECK-NEXT: vfpclassss $2, %xmm0, %k0`
			`; CHECK-NEXT: kmovw %k0, %eax`
			`; CHECK-NEXT: # kill: def $al killed $al killed $eax`
			`; CHECK-NEXT: ret{{[l\|q]}}`
			`entry:`
			`%0 = tail call i8 @llvm.x86.avx512.mask.fpclass.ss(<4 x float> %__A, i32 2, i8 -1)`
			`ret i8 %0`
			`}`

			`declare i8 @llvm.x86.avx512.mask.fpclass.ss(<4 x float>, i32, i8)`

			`define zeroext i8 @test_mm_mask_fpclass_ss_mask(i8 zeroext %__U, <4 x float> %__A) {`
			`; X86-LABEL: test_mm_mask_fpclass_ss_mask:`
			`; X86: # %bb.0: # %entry`
			`; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1`
			`; X86-NEXT: vfpclassss $2, %xmm0, %k0 {%k1}`
			`; X86-NEXT: kmovw %k0, %eax`
			`; X86-NEXT: # kill: def $al killed $al killed $eax`
			`; X86-NEXT: retl`
			`;`
			`; X64-LABEL: test_mm_mask_fpclass_ss_mask:`
			`; X64: # %bb.0: # %entry`
			`; X64-NEXT: kmovw %edi, %k1`
			`; X64-NEXT: vfpclassss $2, %xmm0, %k0 {%k1}`
			`; X64-NEXT: kmovw %k0, %eax`
			`; X64-NEXT: # kill: def $al killed $al killed $eax`
			`; X64-NEXT: retq`
			`entry:`
			`%0 = tail call i8 @llvm.x86.avx512.mask.fpclass.ss(<4 x float> %__A, i32 2, i8 %__U)`
			`ret i8 %0`
			`}`