; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -fast-isel -mtriple=i386-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=X32
; RUN: llc < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=X64

; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/sse2-builtins.c
|
|
|
|
|
|
|
|
; _mm_add_epi8 pattern: both operands are viewed as <16 x i8>, combined with a
; plain IR add, and cast back to <2 x i64>. Each target is expected to select
; a single paddb followed by the return.
define <2 x i64> @test_mm_add_epi8(<2 x i64> %a0, <2 x i64> %a1) nounwind {
; X32-LABEL: test_mm_add_epi8:
; X32: # BB#0:
; X32-NEXT: paddb %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_add_epi8:
; X64: # BB#0:
; X64-NEXT: paddb %xmm1, %xmm0
; X64-NEXT: retq
  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
  %arg1 = bitcast <2 x i64> %a1 to <16 x i8>
  %res = add <16 x i8> %arg0, %arg1
  %bc = bitcast <16 x i8> %res to <2 x i64>
  ret <2 x i64> %bc
}
|
|
|
|
|
|
|
|
; _mm_add_epi16 pattern: operands are viewed as <8 x i16>, added with a plain
; IR add, and cast back. Each target is expected to select a single paddw.
define <2 x i64> @test_mm_add_epi16(<2 x i64> %a0, <2 x i64> %a1) nounwind {
; X32-LABEL: test_mm_add_epi16:
; X32: # BB#0:
; X32-NEXT: paddw %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_add_epi16:
; X64: # BB#0:
; X64-NEXT: paddw %xmm1, %xmm0
; X64-NEXT: retq
  %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
  %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
  %res = add <8 x i16> %arg0, %arg1
  %bc = bitcast <8 x i16> %res to <2 x i64>
  ret <2 x i64> %bc
}
|
|
|
|
|
|
|
|
; _mm_add_epi32 pattern: operands are viewed as <4 x i32>, added with a plain
; IR add, and cast back. Each target is expected to select a single paddd.
define <2 x i64> @test_mm_add_epi32(<2 x i64> %a0, <2 x i64> %a1) nounwind {
; X32-LABEL: test_mm_add_epi32:
; X32: # BB#0:
; X32-NEXT: paddd %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_add_epi32:
; X64: # BB#0:
; X64-NEXT: paddd %xmm1, %xmm0
; X64-NEXT: retq
  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
  %res = add <4 x i32> %arg0, %arg1
  %bc = bitcast <4 x i32> %res to <2 x i64>
  ret <2 x i64> %bc
}
|
|
|
|
|
|
|
|
; _mm_add_epi64 pattern: the <2 x i64> operands are added directly, with no
; bitcasts needed. Each target is expected to select a single paddq.
define <2 x i64> @test_mm_add_epi64(<2 x i64> %a0, <2 x i64> %a1) nounwind {
; X32-LABEL: test_mm_add_epi64:
; X32: # BB#0:
; X32-NEXT: paddq %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_add_epi64:
; X64: # BB#0:
; X64-NEXT: paddq %xmm1, %xmm0
; X64-NEXT: retq
  %res = add <2 x i64> %a0, %a1
  ret <2 x i64> %res
}
|
|
|
|
|
|
|
|
; _mm_add_pd pattern: a full-width fadd on <2 x double>. Each target is
; expected to select a single addpd.
define <2 x double> @test_mm_add_pd(<2 x double> %a0, <2 x double> %a1) nounwind {
; X32-LABEL: test_mm_add_pd:
; X32: # BB#0:
; X32-NEXT: addpd %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_add_pd:
; X64: # BB#0:
; X64-NEXT: addpd %xmm1, %xmm0
; X64-NEXT: retq
  %res = fadd <2 x double> %a0, %a1
  ret <2 x double> %res
}
|
|
|
|
|
|
|
|
; _mm_add_sd pattern: element 0 of each operand is extracted, added as a
; scalar double, and inserted back into element 0 of %a0 (element 1 of %a0 is
; left untouched). Each target is expected to fold this into a single addsd.
define <2 x double> @test_mm_add_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
; X32-LABEL: test_mm_add_sd:
; X32: # BB#0:
; X32-NEXT: addsd %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_add_sd:
; X64: # BB#0:
; X64-NEXT: addsd %xmm1, %xmm0
; X64-NEXT: retq
  %ext0 = extractelement <2 x double> %a0, i32 0
  %ext1 = extractelement <2 x double> %a1, i32 0
  %fadd = fadd double %ext0, %ext1
  %res = insertelement <2 x double> %a0, double %fadd, i32 0
  ret <2 x double> %res
}
|
|
|
|
|
|
|
|
; _mm_adds_epi8 pattern: operands are viewed as <16 x i8> and passed to the
; llvm.x86.sse2.padds.b intrinsic. Each target is expected to select a single
; paddsb.
define <2 x i64> @test_mm_adds_epi8(<2 x i64> %a0, <2 x i64> %a1) nounwind {
; X32-LABEL: test_mm_adds_epi8:
; X32: # BB#0:
; X32-NEXT: paddsb %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_adds_epi8:
; X64: # BB#0:
; X64-NEXT: paddsb %xmm1, %xmm0
; X64-NEXT: retq
  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
  %arg1 = bitcast <2 x i64> %a1 to <16 x i8>
  %res = call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> %arg0, <16 x i8> %arg1)
  %bc = bitcast <16 x i8> %res to <2 x i64>
  ret <2 x i64> %bc
}
|
|
|
|
declare <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8>, <16 x i8>) nounwind readnone
|
|
|
|
|
|
|
|
; _mm_adds_epi16 pattern: operands are viewed as <8 x i16> and passed to the
; llvm.x86.sse2.padds.w intrinsic. Each target is expected to select a single
; paddsw.
define <2 x i64> @test_mm_adds_epi16(<2 x i64> %a0, <2 x i64> %a1) nounwind {
; X32-LABEL: test_mm_adds_epi16:
; X32: # BB#0:
; X32-NEXT: paddsw %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_adds_epi16:
; X64: # BB#0:
; X64-NEXT: paddsw %xmm1, %xmm0
; X64-NEXT: retq
  %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
  %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
  %res = call <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16> %arg0, <8 x i16> %arg1)
  %bc = bitcast <8 x i16> %res to <2 x i64>
  ret <2 x i64> %bc
}
|
|
|
|
declare <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16>, <8 x i16>) nounwind readnone
|
|
|
|
|
|
|
|
; _mm_adds_epu8 pattern: operands are viewed as <16 x i8> and passed to the
; llvm.x86.sse2.paddus.b intrinsic. Each target is expected to select a single
; paddusb.
define <2 x i64> @test_mm_adds_epu8(<2 x i64> %a0, <2 x i64> %a1) nounwind {
; X32-LABEL: test_mm_adds_epu8:
; X32: # BB#0:
; X32-NEXT: paddusb %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_adds_epu8:
; X64: # BB#0:
; X64-NEXT: paddusb %xmm1, %xmm0
; X64-NEXT: retq
  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
  %arg1 = bitcast <2 x i64> %a1 to <16 x i8>
  %res = call <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8> %arg0, <16 x i8> %arg1)
  %bc = bitcast <16 x i8> %res to <2 x i64>
  ret <2 x i64> %bc
}
|
|
|
|
declare <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8>, <16 x i8>) nounwind readnone
|
|
|
|
|
|
|
|
; _mm_adds_epu16 pattern: operands are viewed as <8 x i16> and passed to the
; llvm.x86.sse2.paddus.w intrinsic. Each target is expected to select a single
; paddusw.
define <2 x i64> @test_mm_adds_epu16(<2 x i64> %a0, <2 x i64> %a1) nounwind {
; X32-LABEL: test_mm_adds_epu16:
; X32: # BB#0:
; X32-NEXT: paddusw %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_adds_epu16:
; X64: # BB#0:
; X64-NEXT: paddusw %xmm1, %xmm0
; X64-NEXT: retq
  %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
  %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
  %res = call <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16> %arg0, <8 x i16> %arg1)
  %bc = bitcast <8 x i16> %res to <2 x i64>
  ret <2 x i64> %bc
}
|
|
|
|
declare <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16>, <8 x i16>) nounwind readnone
|
|
|
|
|
|
|
|
; _mm_and_pd pattern: the double vectors are viewed as <4 x i32>, ANDed as
; integers, and cast back to <2 x double>. Each target is expected to select a
; single andps.
define <2 x double> @test_mm_and_pd(<2 x double> %a0, <2 x double> %a1) nounwind {
; X32-LABEL: test_mm_and_pd:
; X32: # BB#0:
; X32-NEXT: andps %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_and_pd:
; X64: # BB#0:
; X64-NEXT: andps %xmm1, %xmm0
; X64-NEXT: retq
  %arg0 = bitcast <2 x double> %a0 to <4 x i32>
  %arg1 = bitcast <2 x double> %a1 to <4 x i32>
  %res = and <4 x i32> %arg0, %arg1
  %bc = bitcast <4 x i32> %res to <2 x double>
  ret <2 x double> %bc
}
|
|
|
|
|
|
|
|
; _mm_and_si128 pattern: a direct integer AND on <2 x i64>. Each target is
; expected to select a single andps.
define <2 x i64> @test_mm_and_si128(<2 x i64> %a0, <2 x i64> %a1) nounwind {
; X32-LABEL: test_mm_and_si128:
; X32: # BB#0:
; X32-NEXT: andps %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_and_si128:
; X64: # BB#0:
; X64-NEXT: andps %xmm1, %xmm0
; X64-NEXT: retq
  %res = and <2 x i64> %a0, %a1
  ret <2 x i64> %res
}
|
|
|
|
|
|
|
|
; _mm_andnot_pd pattern: %a0 is viewed as <4 x i32> and inverted by XOR with
; all-ones, then ANDed with %a1. Each target is expected to fold the xor+and
; pair into a single andnps.
define <2 x double> @test_mm_andnot_pd(<2 x double> %a0, <2 x double> %a1) nounwind {
; X32-LABEL: test_mm_andnot_pd:
; X32: # BB#0:
; X32-NEXT: andnps %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_andnot_pd:
; X64: # BB#0:
; X64-NEXT: andnps %xmm1, %xmm0
; X64-NEXT: retq
  %arg0 = bitcast <2 x double> %a0 to <4 x i32>
  %arg1 = bitcast <2 x double> %a1 to <4 x i32>
  %not = xor <4 x i32> %arg0, <i32 -1, i32 -1, i32 -1, i32 -1>
  %res = and <4 x i32> %not, %arg1
  %bc = bitcast <4 x i32> %res to <2 x double>
  ret <2 x double> %bc
}
|
|
|
|
|
|
|
|
; _mm_andnot_si128 pattern: %a0 is inverted by XOR with all-ones and ANDed
; with %a1. The recorded output is a three-instruction sequence: pcmpeqd
; materialises all-ones in xmm2, then pxor and pand follow (the pair is not
; folded into one and-not instruction here).
define <2 x i64> @test_mm_andnot_si128(<2 x i64> %a0, <2 x i64> %a1) nounwind {
; X32-LABEL: test_mm_andnot_si128:
; X32: # BB#0:
; X32-NEXT: pcmpeqd %xmm2, %xmm2
; X32-NEXT: pxor %xmm2, %xmm0
; X32-NEXT: pand %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_andnot_si128:
; X64: # BB#0:
; X64-NEXT: pcmpeqd %xmm2, %xmm2
; X64-NEXT: pxor %xmm2, %xmm0
; X64-NEXT: pand %xmm1, %xmm0
; X64-NEXT: retq
  %not = xor <2 x i64> %a0, <i64 -1, i64 -1>
  %res = and <2 x i64> %not, %a1
  ret <2 x i64> %res
}
|
|
|
|
|
|
|
|
; _mm_avg_epu8 pattern written in generic IR: both <16 x i8> operands are
; zero-extended to i16 lanes, summed, incremented by 1, shifted right by 1,
; and truncated back to i8. Each target is expected to recognise the idiom and
; select a single pavgb.
define <2 x i64> @test_mm_avg_epu8(<2 x i64> %a0, <2 x i64> %a1) nounwind {
; X32-LABEL: test_mm_avg_epu8:
; X32: # BB#0:
; X32-NEXT: pavgb %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_avg_epu8:
; X64: # BB#0:
; X64-NEXT: pavgb %xmm1, %xmm0
; X64-NEXT: retq
  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
  %arg1 = bitcast <2 x i64> %a1 to <16 x i8>
  %zext0 = zext <16 x i8> %arg0 to <16 x i16>
  %zext1 = zext <16 x i8> %arg1 to <16 x i16>
  %add = add <16 x i16> %zext0, %zext1
  %add1 = add <16 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %lshr = lshr <16 x i16> %add1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %res = trunc <16 x i16> %lshr to <16 x i8>
  %bc = bitcast <16 x i8> %res to <2 x i64>
  ret <2 x i64> %bc
}
|
|
|
|
|
|
|
|
; _mm_avg_epu16 pattern written in generic IR: both <8 x i16> operands are
; zero-extended to i32 lanes, summed, incremented by 1, shifted right by 1,
; and truncated back to i16. Each target is expected to recognise the idiom
; and select a single pavgw.
define <2 x i64> @test_mm_avg_epu16(<2 x i64> %a0, <2 x i64> %a1) nounwind {
; X32-LABEL: test_mm_avg_epu16:
; X32: # BB#0:
; X32-NEXT: pavgw %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_avg_epu16:
; X64: # BB#0:
; X64-NEXT: pavgw %xmm1, %xmm0
; X64-NEXT: retq
  %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
  %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
  %zext0 = zext <8 x i16> %arg0 to <8 x i32>
  %zext1 = zext <8 x i16> %arg1 to <8 x i32>
  %add = add <8 x i32> %zext0, %zext1
  %add1 = add <8 x i32> %add, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %lshr = lshr <8 x i32> %add1, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %res = trunc <8 x i32> %lshr to <8 x i16>
  %bc = bitcast <8 x i16> %res to <2 x i64>
  ret <2 x i64> %bc
}
|
|
|
|
|
|
|
|
; _mm_bslli_si128 pattern: a byte shuffle of (zeroinitializer, %a0) with
; indices 11..26, i.e. five zero bytes followed by bytes 0..10 of the input.
; Each target is expected to select a single pslldq with that layout.
define <2 x i64> @test_mm_bslli_si128(<2 x i64> %a0) nounwind {
; X32-LABEL: test_mm_bslli_si128:
; X32: # BB#0:
; X32-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10]
; X32-NEXT: retl
;
; X64-LABEL: test_mm_bslli_si128:
; X64: # BB#0:
; X64-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10]
; X64-NEXT: retq
  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
  %res = shufflevector <16 x i8> zeroinitializer, <16 x i8> %arg0, <16 x i32> <i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26>
  %bc = bitcast <16 x i8> %res to <2 x i64>
  ret <2 x i64> %bc
}
|
|
|
|
|
|
|
|
; _mm_bsrli_si128 pattern: a byte shuffle of (%a0, zeroinitializer) with
; indices 5..20, i.e. bytes 5..15 of the input followed by five zero bytes.
; Each target is expected to select a single psrldq with that layout.
define <2 x i64> @test_mm_bsrli_si128(<2 x i64> %a0) nounwind {
; X32-LABEL: test_mm_bsrli_si128:
; X32: # BB#0:
; X32-NEXT: psrldq {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero
; X32-NEXT: retl
;
; X64-LABEL: test_mm_bsrli_si128:
; X64: # BB#0:
; X64-NEXT: psrldq {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero
; X64-NEXT: retq
  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
  %res = shufflevector <16 x i8> %arg0, <16 x i8> zeroinitializer, <16 x i32> <i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20>
  %bc = bitcast <16 x i8> %res to <2 x i64>
  ret <2 x i64> %bc
}
|
|
|
|
|
2016-05-19 18:58:54 +08:00
|
|
|
; _mm_castpd_ps pattern: a pure bitcast from <2 x double> to <4 x float>.
; No instructions are expected besides the return.
define <4 x float> @test_mm_castpd_ps(<2 x double> %a0) nounwind {
; X32-LABEL: test_mm_castpd_ps:
; X32: # BB#0:
; X32-NEXT: retl
;
; X64-LABEL: test_mm_castpd_ps:
; X64: # BB#0:
; X64-NEXT: retq
  %res = bitcast <2 x double> %a0 to <4 x float>
  ret <4 x float> %res
}
|
|
|
|
|
|
|
|
; _mm_castpd_si128 pattern: a pure bitcast from <2 x double> to <2 x i64>.
; No instructions are expected besides the return.
define <2 x i64> @test_mm_castpd_si128(<2 x double> %a0) nounwind {
; X32-LABEL: test_mm_castpd_si128:
; X32: # BB#0:
; X32-NEXT: retl
;
; X64-LABEL: test_mm_castpd_si128:
; X64: # BB#0:
; X64-NEXT: retq
  %res = bitcast <2 x double> %a0 to <2 x i64>
  ret <2 x i64> %res
}
|
|
|
|
|
|
|
|
; _mm_castps_pd pattern: a pure bitcast from <4 x float> to <2 x double>.
; No instructions are expected besides the return.
define <2 x double> @test_mm_castps_pd(<4 x float> %a0) nounwind {
; X32-LABEL: test_mm_castps_pd:
; X32: # BB#0:
; X32-NEXT: retl
;
; X64-LABEL: test_mm_castps_pd:
; X64: # BB#0:
; X64-NEXT: retq
  %res = bitcast <4 x float> %a0 to <2 x double>
  ret <2 x double> %res
}
|
|
|
|
|
|
|
|
; _mm_castps_si128 pattern: a pure bitcast from <4 x float> to <2 x i64>.
; No instructions are expected besides the return.
define <2 x i64> @test_mm_castps_si128(<4 x float> %a0) nounwind {
; X32-LABEL: test_mm_castps_si128:
; X32: # BB#0:
; X32-NEXT: retl
;
; X64-LABEL: test_mm_castps_si128:
; X64: # BB#0:
; X64-NEXT: retq
  %res = bitcast <4 x float> %a0 to <2 x i64>
  ret <2 x i64> %res
}
|
|
|
|
|
|
|
|
; _mm_castsi128_pd pattern: a pure bitcast from <2 x i64> to <2 x double>.
; No instructions are expected besides the return.
define <2 x double> @test_mm_castsi128_pd(<2 x i64> %a0) nounwind {
; X32-LABEL: test_mm_castsi128_pd:
; X32: # BB#0:
; X32-NEXT: retl
;
; X64-LABEL: test_mm_castsi128_pd:
; X64: # BB#0:
; X64-NEXT: retq
  %res = bitcast <2 x i64> %a0 to <2 x double>
  ret <2 x double> %res
}
|
|
|
|
|
|
|
|
; _mm_castsi128_ps pattern: a pure bitcast from <2 x i64> to <4 x float>.
; No instructions are expected besides the return.
define <4 x float> @test_mm_castsi128_ps(<2 x i64> %a0) nounwind {
; X32-LABEL: test_mm_castsi128_ps:
; X32: # BB#0:
; X32-NEXT: retl
;
; X64-LABEL: test_mm_castsi128_ps:
; X64: # BB#0:
; X64-NEXT: retq
  %res = bitcast <2 x i64> %a0 to <4 x float>
  ret <4 x float> %res
}
|
|
|
|
|
2016-05-19 02:00:43 +08:00
|
|
|
; _mm_clflush pattern: forwards the pointer argument to the
; llvm.x86.sse2.clflush intrinsic. On i386 the pointer is first loaded from the
; stack into eax; on x86_64 it is used directly from rdi.
define void @test_mm_clflush(i8* %a0) nounwind {
; X32-LABEL: test_mm_clflush:
; X32: # BB#0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: clflush (%eax)
; X32-NEXT: retl
;
; X64-LABEL: test_mm_clflush:
; X64: # BB#0:
; X64-NEXT: clflush (%rdi)
; X64-NEXT: retq
  call void @llvm.x86.sse2.clflush(i8* %a0)
  ret void
}
|
|
|
|
declare void @llvm.x86.sse2.clflush(i8*) nounwind readnone
|
|
|
|
|
|
|
|
; _mm_cmpeq_epi8 pattern: lane-wise icmp eq on <16 x i8>, with the i1 results
; sign-extended back to i8 lanes. Each target is expected to select a single
; pcmpeqb.
define <2 x i64> @test_mm_cmpeq_epi8(<2 x i64> %a0, <2 x i64> %a1) nounwind {
; X32-LABEL: test_mm_cmpeq_epi8:
; X32: # BB#0:
; X32-NEXT: pcmpeqb %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cmpeq_epi8:
; X64: # BB#0:
; X64-NEXT: pcmpeqb %xmm1, %xmm0
; X64-NEXT: retq
  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
  %arg1 = bitcast <2 x i64> %a1 to <16 x i8>
  %cmp = icmp eq <16 x i8> %arg0, %arg1
  %res = sext <16 x i1> %cmp to <16 x i8>
  %bc = bitcast <16 x i8> %res to <2 x i64>
  ret <2 x i64> %bc
}
|
|
|
|
|
|
|
|
; _mm_cmpeq_epi16 pattern: lane-wise icmp eq on <8 x i16>, with the i1 results
; sign-extended back to i16 lanes. Each target is expected to select a single
; pcmpeqw.
define <2 x i64> @test_mm_cmpeq_epi16(<2 x i64> %a0, <2 x i64> %a1) nounwind {
; X32-LABEL: test_mm_cmpeq_epi16:
; X32: # BB#0:
; X32-NEXT: pcmpeqw %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cmpeq_epi16:
; X64: # BB#0:
; X64-NEXT: pcmpeqw %xmm1, %xmm0
; X64-NEXT: retq
  %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
  %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
  %cmp = icmp eq <8 x i16> %arg0, %arg1
  %res = sext <8 x i1> %cmp to <8 x i16>
  %bc = bitcast <8 x i16> %res to <2 x i64>
  ret <2 x i64> %bc
}
|
|
|
|
|
|
|
|
; _mm_cmpeq_epi32 pattern: lane-wise icmp eq on <4 x i32>, with the i1 results
; sign-extended back to i32 lanes. Each target is expected to select a single
; pcmpeqd.
define <2 x i64> @test_mm_cmpeq_epi32(<2 x i64> %a0, <2 x i64> %a1) nounwind {
; X32-LABEL: test_mm_cmpeq_epi32:
; X32: # BB#0:
; X32-NEXT: pcmpeqd %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cmpeq_epi32:
; X64: # BB#0:
; X64-NEXT: pcmpeqd %xmm1, %xmm0
; X64-NEXT: retq
  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
  %cmp = icmp eq <4 x i32> %arg0, %arg1
  %res = sext <4 x i1> %cmp to <4 x i32>
  %bc = bitcast <4 x i32> %res to <2 x i64>
  ret <2 x i64> %bc
}
|
|
|
|
|
|
|
|
; _mm_cmpeq_pd pattern: fcmp oeq on <2 x double>, sign-extended to i64 lanes
; and bitcast back to <2 x double> as a mask. Each target is expected to
; select a single cmpeqpd.
define <2 x double> @test_mm_cmpeq_pd(<2 x double> %a0, <2 x double> %a1) nounwind {
; X32-LABEL: test_mm_cmpeq_pd:
; X32: # BB#0:
; X32-NEXT: cmpeqpd %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cmpeq_pd:
; X64: # BB#0:
; X64-NEXT: cmpeqpd %xmm1, %xmm0
; X64-NEXT: retq
  %fcmp = fcmp oeq <2 x double> %a0, %a1
  %sext = sext <2 x i1> %fcmp to <2 x i64>
  %res = bitcast <2 x i64> %sext to <2 x double>
  ret <2 x double> %res
}
|
|
|
|
|
|
|
|
; _mm_cmpeq_sd pattern: calls llvm.x86.sse2.cmp.sd with predicate immediate 0.
; Each target is expected to select a single cmpeqsd.
define <2 x double> @test_mm_cmpeq_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
; X32-LABEL: test_mm_cmpeq_sd:
; X32: # BB#0:
; X32-NEXT: cmpeqsd %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cmpeq_sd:
; X64: # BB#0:
; X64-NEXT: cmpeqsd %xmm1, %xmm0
; X64-NEXT: retq
  %res = call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %a0, <2 x double> %a1, i8 0)
  ret <2 x double> %res
}
|
|
|
|
declare <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double>, <2 x double>, i8) nounwind readnone
|
|
|
|
|
|
|
|
; _mm_cmpge_pd pattern: a0 >= a1 is written as fcmp ole with the operands
; swapped (%a1, %a0). The compare therefore lands in xmm1 (cmplepd) and is
; copied back to xmm0 with movapd.
define <2 x double> @test_mm_cmpge_pd(<2 x double> %a0, <2 x double> %a1) nounwind {
; X32-LABEL: test_mm_cmpge_pd:
; X32: # BB#0:
; X32-NEXT: cmplepd %xmm0, %xmm1
; X32-NEXT: movapd %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cmpge_pd:
; X64: # BB#0:
; X64-NEXT: cmplepd %xmm0, %xmm1
; X64-NEXT: movapd %xmm1, %xmm0
; X64-NEXT: retq
  %fcmp = fcmp ole <2 x double> %a1, %a0
  %sext = sext <2 x i1> %fcmp to <2 x i64>
  %res = bitcast <2 x i64> %sext to <2 x double>
  ret <2 x double> %res
}
|
|
|
|
|
|
|
|
; _mm_cmpge_sd pattern: calls llvm.x86.sse2.cmp.sd with swapped operands
; (%a1, %a0) and predicate immediate 2, then rebuilds the result from element 0
; of the compare and element 1 of %a0. Expected output is cmplesd into xmm1
; followed by a movsd that merges xmm1[0] with xmm0[1].
define <2 x double> @test_mm_cmpge_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
; X32-LABEL: test_mm_cmpge_sd:
; X32: # BB#0:
; X32-NEXT: cmplesd %xmm0, %xmm1
; X32-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cmpge_sd:
; X64: # BB#0:
; X64-NEXT: cmplesd %xmm0, %xmm1
; X64-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; X64-NEXT: retq
  %cmp = call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %a1, <2 x double> %a0, i8 2)
  %ext0 = extractelement <2 x double> %cmp, i32 0
  %ins0 = insertelement <2 x double> undef, double %ext0, i32 0
  %ext1 = extractelement <2 x double> %a0, i32 1
  %ins1 = insertelement <2 x double> %ins0, double %ext1, i32 1
  ret <2 x double> %ins1
}
|
|
|
|
|
|
|
|
; _mm_cmpgt_epi8 pattern: lane-wise icmp sgt on <16 x i8>, with the i1 results
; sign-extended back to i8 lanes. Each target is expected to select a single
; pcmpgtb.
define <2 x i64> @test_mm_cmpgt_epi8(<2 x i64> %a0, <2 x i64> %a1) nounwind {
; X32-LABEL: test_mm_cmpgt_epi8:
; X32: # BB#0:
; X32-NEXT: pcmpgtb %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cmpgt_epi8:
; X64: # BB#0:
; X64-NEXT: pcmpgtb %xmm1, %xmm0
; X64-NEXT: retq
  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
  %arg1 = bitcast <2 x i64> %a1 to <16 x i8>
  %cmp = icmp sgt <16 x i8> %arg0, %arg1
  %res = sext <16 x i1> %cmp to <16 x i8>
  %bc = bitcast <16 x i8> %res to <2 x i64>
  ret <2 x i64> %bc
}
|
|
|
|
|
|
|
|
; _mm_cmpgt_epi16 pattern: lane-wise icmp sgt on <8 x i16>, with the i1
; results sign-extended back to i16 lanes. Each target is expected to select a
; single pcmpgtw.
define <2 x i64> @test_mm_cmpgt_epi16(<2 x i64> %a0, <2 x i64> %a1) nounwind {
; X32-LABEL: test_mm_cmpgt_epi16:
; X32: # BB#0:
; X32-NEXT: pcmpgtw %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cmpgt_epi16:
; X64: # BB#0:
; X64-NEXT: pcmpgtw %xmm1, %xmm0
; X64-NEXT: retq
  %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
  %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
  %cmp = icmp sgt <8 x i16> %arg0, %arg1
  %res = sext <8 x i1> %cmp to <8 x i16>
  %bc = bitcast <8 x i16> %res to <2 x i64>
  ret <2 x i64> %bc
}
|
|
|
|
|
|
|
|
; _mm_cmpgt_epi32 pattern: lane-wise icmp sgt on <4 x i32>, with the i1
; results sign-extended back to i32 lanes. Each target is expected to select a
; single pcmpgtd.
define <2 x i64> @test_mm_cmpgt_epi32(<2 x i64> %a0, <2 x i64> %a1) nounwind {
; X32-LABEL: test_mm_cmpgt_epi32:
; X32: # BB#0:
; X32-NEXT: pcmpgtd %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cmpgt_epi32:
; X64: # BB#0:
; X64-NEXT: pcmpgtd %xmm1, %xmm0
; X64-NEXT: retq
  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
  %cmp = icmp sgt <4 x i32> %arg0, %arg1
  %res = sext <4 x i1> %cmp to <4 x i32>
  %bc = bitcast <4 x i32> %res to <2 x i64>
  ret <2 x i64> %bc
}
|
|
|
|
|
|
|
|
; _mm_cmpgt_pd pattern: a0 > a1 is written as fcmp olt with the operands
; swapped (%a1, %a0). The compare therefore lands in xmm1 (cmpltpd) and is
; copied back to xmm0 with movapd.
define <2 x double> @test_mm_cmpgt_pd(<2 x double> %a0, <2 x double> %a1) nounwind {
; X32-LABEL: test_mm_cmpgt_pd:
; X32: # BB#0:
; X32-NEXT: cmpltpd %xmm0, %xmm1
; X32-NEXT: movapd %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cmpgt_pd:
; X64: # BB#0:
; X64-NEXT: cmpltpd %xmm0, %xmm1
; X64-NEXT: movapd %xmm1, %xmm0
; X64-NEXT: retq
  %fcmp = fcmp olt <2 x double> %a1, %a0
  %sext = sext <2 x i1> %fcmp to <2 x i64>
  %res = bitcast <2 x i64> %sext to <2 x double>
  ret <2 x double> %res
}
|
|
|
|
|
|
|
|
; _mm_cmpgt_sd pattern: calls llvm.x86.sse2.cmp.sd with swapped operands
; (%a1, %a0) and predicate immediate 1, then rebuilds the result from element 0
; of the compare and element 1 of %a0. Expected output is cmpltsd into xmm1
; followed by a movsd that merges xmm1[0] with xmm0[1].
define <2 x double> @test_mm_cmpgt_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
; X32-LABEL: test_mm_cmpgt_sd:
; X32: # BB#0:
; X32-NEXT: cmpltsd %xmm0, %xmm1
; X32-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cmpgt_sd:
; X64: # BB#0:
; X64-NEXT: cmpltsd %xmm0, %xmm1
; X64-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; X64-NEXT: retq
  %cmp = call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %a1, <2 x double> %a0, i8 1)
  %ext0 = extractelement <2 x double> %cmp, i32 0
  %ins0 = insertelement <2 x double> undef, double %ext0, i32 0
  %ext1 = extractelement <2 x double> %a0, i32 1
  %ins1 = insertelement <2 x double> %ins0, double %ext1, i32 1
  ret <2 x double> %ins1
}
|
|
|
|
|
|
|
|
; _mm_cmple_pd pattern: fcmp ole on <2 x double> in natural operand order,
; sign-extended to i64 lanes and bitcast back as a mask. Each target is
; expected to select a single cmplepd.
define <2 x double> @test_mm_cmple_pd(<2 x double> %a0, <2 x double> %a1) nounwind {
; X32-LABEL: test_mm_cmple_pd:
; X32: # BB#0:
; X32-NEXT: cmplepd %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cmple_pd:
; X64: # BB#0:
; X64-NEXT: cmplepd %xmm1, %xmm0
; X64-NEXT: retq
  %fcmp = fcmp ole <2 x double> %a0, %a1
  %sext = sext <2 x i1> %fcmp to <2 x i64>
  %res = bitcast <2 x i64> %sext to <2 x double>
  ret <2 x double> %res
}
|
|
|
|
|
|
|
|
; _mm_cmple_sd pattern: calls llvm.x86.sse2.cmp.sd with predicate immediate 2.
; Each target is expected to select a single cmplesd.
define <2 x double> @test_mm_cmple_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
; X32-LABEL: test_mm_cmple_sd:
; X32: # BB#0:
; X32-NEXT: cmplesd %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cmple_sd:
; X64: # BB#0:
; X64-NEXT: cmplesd %xmm1, %xmm0
; X64-NEXT: retq
  %res = call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %a0, <2 x double> %a1, i8 2)
  ret <2 x double> %res
}
|
|
|
|
|
|
|
|
; _mm_cmplt_epi8 pattern: a0 < a1 is written as icmp sgt with the operands
; swapped (%arg1, %arg0). The compare therefore lands in xmm1 (pcmpgtb) and is
; copied back to xmm0 with movdqa.
define <2 x i64> @test_mm_cmplt_epi8(<2 x i64> %a0, <2 x i64> %a1) nounwind {
; X32-LABEL: test_mm_cmplt_epi8:
; X32: # BB#0:
; X32-NEXT: pcmpgtb %xmm0, %xmm1
; X32-NEXT: movdqa %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cmplt_epi8:
; X64: # BB#0:
; X64-NEXT: pcmpgtb %xmm0, %xmm1
; X64-NEXT: movdqa %xmm1, %xmm0
; X64-NEXT: retq
  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
  %arg1 = bitcast <2 x i64> %a1 to <16 x i8>
  %cmp = icmp sgt <16 x i8> %arg1, %arg0
  %res = sext <16 x i1> %cmp to <16 x i8>
  %bc = bitcast <16 x i8> %res to <2 x i64>
  ret <2 x i64> %bc
}
|
|
|
|
|
|
|
|
; _mm_cmplt_epi16 pattern: a0 < a1 is written as icmp sgt with the operands
; swapped (%arg1, %arg0). The compare therefore lands in xmm1 (pcmpgtw) and is
; copied back to xmm0 with movdqa.
define <2 x i64> @test_mm_cmplt_epi16(<2 x i64> %a0, <2 x i64> %a1) nounwind {
; X32-LABEL: test_mm_cmplt_epi16:
; X32: # BB#0:
; X32-NEXT: pcmpgtw %xmm0, %xmm1
; X32-NEXT: movdqa %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cmplt_epi16:
; X64: # BB#0:
; X64-NEXT: pcmpgtw %xmm0, %xmm1
; X64-NEXT: movdqa %xmm1, %xmm0
; X64-NEXT: retq
  %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
  %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
  %cmp = icmp sgt <8 x i16> %arg1, %arg0
  %res = sext <8 x i1> %cmp to <8 x i16>
  %bc = bitcast <8 x i16> %res to <2 x i64>
  ret <2 x i64> %bc
}
|
|
|
|
|
|
|
|
; _mm_cmplt_epi32 pattern: a0 < a1 is written as icmp sgt with the operands
; swapped (%arg1, %arg0). The compare therefore lands in xmm1 (pcmpgtd) and is
; copied back to xmm0 with movdqa.
define <2 x i64> @test_mm_cmplt_epi32(<2 x i64> %a0, <2 x i64> %a1) nounwind {
; X32-LABEL: test_mm_cmplt_epi32:
; X32: # BB#0:
; X32-NEXT: pcmpgtd %xmm0, %xmm1
; X32-NEXT: movdqa %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cmplt_epi32:
; X64: # BB#0:
; X64-NEXT: pcmpgtd %xmm0, %xmm1
; X64-NEXT: movdqa %xmm1, %xmm0
; X64-NEXT: retq
  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
  %cmp = icmp sgt <4 x i32> %arg1, %arg0
  %res = sext <4 x i1> %cmp to <4 x i32>
  %bc = bitcast <4 x i32> %res to <2 x i64>
  ret <2 x i64> %bc
}
|
|
|
|
|
|
|
|
; _mm_cmplt_pd pattern: fcmp olt on <2 x double> in natural operand order,
; sign-extended to i64 lanes and bitcast back as a mask. Each target is
; expected to select a single cmpltpd.
define <2 x double> @test_mm_cmplt_pd(<2 x double> %a0, <2 x double> %a1) nounwind {
; X32-LABEL: test_mm_cmplt_pd:
; X32: # BB#0:
; X32-NEXT: cmpltpd %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cmplt_pd:
; X64: # BB#0:
; X64-NEXT: cmpltpd %xmm1, %xmm0
; X64-NEXT: retq
  %fcmp = fcmp olt <2 x double> %a0, %a1
  %sext = sext <2 x i1> %fcmp to <2 x i64>
  %res = bitcast <2 x i64> %sext to <2 x double>
  ret <2 x double> %res
}
|
|
|
|
|
|
|
|
; _mm_cmplt_sd pattern: calls llvm.x86.sse2.cmp.sd with predicate immediate 1.
; Each target is expected to select a single cmpltsd.
define <2 x double> @test_mm_cmplt_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
; X32-LABEL: test_mm_cmplt_sd:
; X32: # BB#0:
; X32-NEXT: cmpltsd %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cmplt_sd:
; X64: # BB#0:
; X64-NEXT: cmpltsd %xmm1, %xmm0
; X64-NEXT: retq
  %res = call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %a0, <2 x double> %a1, i8 1)
  ret <2 x double> %res
}
|
|
|
|
|
|
|
|
; _mm_cmpneq_pd pattern: fcmp une on <2 x double>, sign-extended to i64 lanes
; and bitcast back as a mask. Each target is expected to select a single
; cmpneqpd.
define <2 x double> @test_mm_cmpneq_pd(<2 x double> %a0, <2 x double> %a1) nounwind {
; X32-LABEL: test_mm_cmpneq_pd:
; X32: # BB#0:
; X32-NEXT: cmpneqpd %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cmpneq_pd:
; X64: # BB#0:
; X64-NEXT: cmpneqpd %xmm1, %xmm0
; X64-NEXT: retq
  %fcmp = fcmp une <2 x double> %a0, %a1
  %sext = sext <2 x i1> %fcmp to <2 x i64>
  %res = bitcast <2 x i64> %sext to <2 x double>
  ret <2 x double> %res
}
|
|
|
|
|
|
|
|
; _mm_cmpneq_sd pattern: calls llvm.x86.sse2.cmp.sd with predicate immediate 4.
; Each target is expected to select a single cmpneqsd.
define <2 x double> @test_mm_cmpneq_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
; X32-LABEL: test_mm_cmpneq_sd:
; X32: # BB#0:
; X32-NEXT: cmpneqsd %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cmpneq_sd:
; X64: # BB#0:
; X64-NEXT: cmpneqsd %xmm1, %xmm0
; X64-NEXT: retq
  %res = call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %a0, <2 x double> %a1, i8 4)
  ret <2 x double> %res
}
|
|
|
|
|
|
|
|
; _mm_cmpnge_pd pattern: not(a0 >= a1) is written as fcmp ugt with the
; operands swapped (%a1, %a0). The compare therefore lands in xmm1 (cmpnlepd)
; and is copied back to xmm0 with movapd.
define <2 x double> @test_mm_cmpnge_pd(<2 x double> %a0, <2 x double> %a1) nounwind {
; X32-LABEL: test_mm_cmpnge_pd:
; X32: # BB#0:
; X32-NEXT: cmpnlepd %xmm0, %xmm1
; X32-NEXT: movapd %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cmpnge_pd:
; X64: # BB#0:
; X64-NEXT: cmpnlepd %xmm0, %xmm1
; X64-NEXT: movapd %xmm1, %xmm0
; X64-NEXT: retq
  %fcmp = fcmp ugt <2 x double> %a1, %a0
  %sext = sext <2 x i1> %fcmp to <2 x i64>
  %res = bitcast <2 x i64> %sext to <2 x double>
  ret <2 x double> %res
}
|
|
|
|
|
|
|
|
; _mm_cmpnge_sd pattern: calls llvm.x86.sse2.cmp.sd with swapped operands
; (%a1, %a0) and predicate immediate 6, then rebuilds the result from element 0
; of the compare and element 1 of %a0. Expected output is cmpnlesd into xmm1
; followed by a movsd that merges xmm1[0] with xmm0[1].
define <2 x double> @test_mm_cmpnge_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
; X32-LABEL: test_mm_cmpnge_sd:
; X32: # BB#0:
; X32-NEXT: cmpnlesd %xmm0, %xmm1
; X32-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cmpnge_sd:
; X64: # BB#0:
; X64-NEXT: cmpnlesd %xmm0, %xmm1
; X64-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; X64-NEXT: retq
  %cmp = call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %a1, <2 x double> %a0, i8 6)
  %ext0 = extractelement <2 x double> %cmp, i32 0
  %ins0 = insertelement <2 x double> undef, double %ext0, i32 0
  %ext1 = extractelement <2 x double> %a0, i32 1
  %ins1 = insertelement <2 x double> %ins0, double %ext1, i32 1
  ret <2 x double> %ins1
}
|
|
|
|
|
|
|
|
; _mm_cmpngt_pd pattern: not(a0 > a1) is written as fcmp uge with the operands
; swapped (%a1, %a0). The compare therefore lands in xmm1 (cmpnltpd) and is
; copied back to xmm0 with movapd.
define <2 x double> @test_mm_cmpngt_pd(<2 x double> %a0, <2 x double> %a1) nounwind {
; X32-LABEL: test_mm_cmpngt_pd:
; X32: # BB#0:
; X32-NEXT: cmpnltpd %xmm0, %xmm1
; X32-NEXT: movapd %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cmpngt_pd:
; X64: # BB#0:
; X64-NEXT: cmpnltpd %xmm0, %xmm1
; X64-NEXT: movapd %xmm1, %xmm0
; X64-NEXT: retq
  %fcmp = fcmp uge <2 x double> %a1, %a0
  %sext = sext <2 x i1> %fcmp to <2 x i64>
  %res = bitcast <2 x i64> %sext to <2 x double>
  ret <2 x double> %res
}
|
|
|
|
|
|
|
|
; _mm_cmpngt_sd pattern: calls llvm.x86.sse2.cmp.sd with swapped operands
; (%a1, %a0) and predicate immediate 5, then rebuilds the result from element 0
; of the compare and element 1 of %a0. Expected output is cmpnltsd into xmm1
; followed by a movsd that merges xmm1[0] with xmm0[1].
define <2 x double> @test_mm_cmpngt_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
; X32-LABEL: test_mm_cmpngt_sd:
; X32: # BB#0:
; X32-NEXT: cmpnltsd %xmm0, %xmm1
; X32-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cmpngt_sd:
; X64: # BB#0:
; X64-NEXT: cmpnltsd %xmm0, %xmm1
; X64-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; X64-NEXT: retq
  %cmp = call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %a1, <2 x double> %a0, i8 5)
  %ext0 = extractelement <2 x double> %cmp, i32 0
  %ins0 = insertelement <2 x double> undef, double %ext0, i32 0
  %ext1 = extractelement <2 x double> %a0, i32 1
  %ins1 = insertelement <2 x double> %ins0, double %ext1, i32 1
  ret <2 x double> %ins1
}
|
|
|
|
|
|
|
|
; _mm_cmpnle_pd: "fcmp ugt" on (%a0, %a1), sign-extended to a lane mask.
define <2 x double> @test_mm_cmpnle_pd(<2 x double> %a0, <2 x double> %a1) nounwind {
; X32-LABEL: test_mm_cmpnle_pd:
; X32: # BB#0:
; X32-NEXT: cmpnlepd %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cmpnle_pd:
; X64: # BB#0:
; X64-NEXT: cmpnlepd %xmm1, %xmm0
; X64-NEXT: retq
  %pred = fcmp ugt <2 x double> %a0, %a1
  %mask = sext <2 x i1> %pred to <2 x i64>
  %as_fp = bitcast <2 x i64> %mask to <2 x double>
  ret <2 x double> %as_fp
}
|
|
|
|
|
|
|
|
; _mm_cmpnle_sd: direct call to cmp.sd with predicate immediate 6.
define <2 x double> @test_mm_cmpnle_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
; X32-LABEL: test_mm_cmpnle_sd:
; X32: # BB#0:
; X32-NEXT: cmpnlesd %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cmpnle_sd:
; X64: # BB#0:
; X64-NEXT: cmpnlesd %xmm1, %xmm0
; X64-NEXT: retq
  %scalar_cmp = call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %a0, <2 x double> %a1, i8 6)
  ret <2 x double> %scalar_cmp
}
|
|
|
|
|
|
|
|
; _mm_cmpnlt_pd: "fcmp uge" on (%a0, %a1), sign-extended to a lane mask.
define <2 x double> @test_mm_cmpnlt_pd(<2 x double> %a0, <2 x double> %a1) nounwind {
; X32-LABEL: test_mm_cmpnlt_pd:
; X32: # BB#0:
; X32-NEXT: cmpnltpd %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cmpnlt_pd:
; X64: # BB#0:
; X64-NEXT: cmpnltpd %xmm1, %xmm0
; X64-NEXT: retq
  %pred = fcmp uge <2 x double> %a0, %a1
  %mask = sext <2 x i1> %pred to <2 x i64>
  %as_fp = bitcast <2 x i64> %mask to <2 x double>
  ret <2 x double> %as_fp
}
|
|
|
|
|
|
|
|
; _mm_cmpnlt_sd: direct call to cmp.sd with predicate immediate 5.
define <2 x double> @test_mm_cmpnlt_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
; X32-LABEL: test_mm_cmpnlt_sd:
; X32: # BB#0:
; X32-NEXT: cmpnltsd %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cmpnlt_sd:
; X64: # BB#0:
; X64-NEXT: cmpnltsd %xmm1, %xmm0
; X64-NEXT: retq
  %scalar_cmp = call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %a0, <2 x double> %a1, i8 5)
  ret <2 x double> %scalar_cmp
}
|
|
|
|
|
|
|
|
; _mm_cmpord_pd: "fcmp ord" on (%a0, %a1), sign-extended to a lane mask.
define <2 x double> @test_mm_cmpord_pd(<2 x double> %a0, <2 x double> %a1) nounwind {
; X32-LABEL: test_mm_cmpord_pd:
; X32: # BB#0:
; X32-NEXT: cmpordpd %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cmpord_pd:
; X64: # BB#0:
; X64-NEXT: cmpordpd %xmm1, %xmm0
; X64-NEXT: retq
  %pred = fcmp ord <2 x double> %a0, %a1
  %mask = sext <2 x i1> %pred to <2 x i64>
  %as_fp = bitcast <2 x i64> %mask to <2 x double>
  ret <2 x double> %as_fp
}
|
|
|
|
|
|
|
|
; _mm_cmpord_sd: direct call to cmp.sd with predicate immediate 7.
define <2 x double> @test_mm_cmpord_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
; X32-LABEL: test_mm_cmpord_sd:
; X32: # BB#0:
; X32-NEXT: cmpordsd %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cmpord_sd:
; X64: # BB#0:
; X64-NEXT: cmpordsd %xmm1, %xmm0
; X64-NEXT: retq
  %scalar_cmp = call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %a0, <2 x double> %a1, i8 7)
  ret <2 x double> %scalar_cmp
}
|
|
|
|
|
|
|
|
; _mm_cmpunord_pd: "fcmp uno" on (%a0, %a1), sign-extended to a lane mask.
define <2 x double> @test_mm_cmpunord_pd(<2 x double> %a0, <2 x double> %a1) nounwind {
; X32-LABEL: test_mm_cmpunord_pd:
; X32: # BB#0:
; X32-NEXT: cmpunordpd %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cmpunord_pd:
; X64: # BB#0:
; X64-NEXT: cmpunordpd %xmm1, %xmm0
; X64-NEXT: retq
  %pred = fcmp uno <2 x double> %a0, %a1
  %mask = sext <2 x i1> %pred to <2 x i64>
  %as_fp = bitcast <2 x i64> %mask to <2 x double>
  ret <2 x double> %as_fp
}
|
|
|
|
|
|
|
|
; _mm_cmpunord_sd: direct call to cmp.sd with predicate immediate 3.
define <2 x double> @test_mm_cmpunord_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
; X32-LABEL: test_mm_cmpunord_sd:
; X32: # BB#0:
; X32-NEXT: cmpunordsd %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cmpunord_sd:
; X64: # BB#0:
; X64-NEXT: cmpunordsd %xmm1, %xmm0
; X64-NEXT: retq
  %scalar_cmp = call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %a0, <2 x double> %a1, i8 3)
  ret <2 x double> %scalar_cmp
}
|
|
|
|
|
|
|
|
; _mm_comieq_sd: returns the i32 produced by the comieq.sd intrinsic.
define i32 @test_mm_comieq_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
; X32-LABEL: test_mm_comieq_sd:
; X32: # BB#0:
; X32-NEXT: comisd %xmm1, %xmm0
; X32-NEXT: setnp %al
; X32-NEXT: sete %cl
; X32-NEXT: andb %al, %cl
; X32-NEXT: movzbl %cl, %eax
; X32-NEXT: retl
;
; X64-LABEL: test_mm_comieq_sd:
; X64: # BB#0:
; X64-NEXT: comisd %xmm1, %xmm0
; X64-NEXT: setnp %al
; X64-NEXT: sete %cl
; X64-NEXT: andb %al, %cl
; X64-NEXT: movzbl %cl, %eax
; X64-NEXT: retq
  %flag = call i32 @llvm.x86.sse2.comieq.sd(<2 x double> %a0, <2 x double> %a1)
  ret i32 %flag
}
declare i32 @llvm.x86.sse2.comieq.sd(<2 x double>, <2 x double>) nounwind readnone
|
|
|
|
|
|
|
|
; _mm_comige_sd: returns the i32 produced by the comige.sd intrinsic.
define i32 @test_mm_comige_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
; X32-LABEL: test_mm_comige_sd:
; X32: # BB#0:
; X32-NEXT: xorl %eax, %eax
; X32-NEXT: comisd %xmm1, %xmm0
; X32-NEXT: setae %al
; X32-NEXT: retl
;
; X64-LABEL: test_mm_comige_sd:
; X64: # BB#0:
; X64-NEXT: xorl %eax, %eax
; X64-NEXT: comisd %xmm1, %xmm0
; X64-NEXT: setae %al
; X64-NEXT: retq
  %flag = call i32 @llvm.x86.sse2.comige.sd(<2 x double> %a0, <2 x double> %a1)
  ret i32 %flag
}
declare i32 @llvm.x86.sse2.comige.sd(<2 x double>, <2 x double>) nounwind readnone
|
|
|
|
|
|
|
|
; _mm_comigt_sd: returns the i32 produced by the comigt.sd intrinsic.
define i32 @test_mm_comigt_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
; X32-LABEL: test_mm_comigt_sd:
; X32: # BB#0:
; X32-NEXT: xorl %eax, %eax
; X32-NEXT: comisd %xmm1, %xmm0
; X32-NEXT: seta %al
; X32-NEXT: retl
;
; X64-LABEL: test_mm_comigt_sd:
; X64: # BB#0:
; X64-NEXT: xorl %eax, %eax
; X64-NEXT: comisd %xmm1, %xmm0
; X64-NEXT: seta %al
; X64-NEXT: retq
  %flag = call i32 @llvm.x86.sse2.comigt.sd(<2 x double> %a0, <2 x double> %a1)
  ret i32 %flag
}
declare i32 @llvm.x86.sse2.comigt.sd(<2 x double>, <2 x double>) nounwind readnone
|
|
|
|
|
|
|
|
; _mm_comile_sd: returns the i32 produced by the comile.sd intrinsic. The
; expected assembly compares with the register operands in the opposite order
; from the comige test and reuses setae.
define i32 @test_mm_comile_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
; X32-LABEL: test_mm_comile_sd:
; X32: # BB#0:
; X32-NEXT: xorl %eax, %eax
; X32-NEXT: comisd %xmm0, %xmm1
; X32-NEXT: setae %al
; X32-NEXT: retl
;
; X64-LABEL: test_mm_comile_sd:
; X64: # BB#0:
; X64-NEXT: xorl %eax, %eax
; X64-NEXT: comisd %xmm0, %xmm1
; X64-NEXT: setae %al
; X64-NEXT: retq
  %flag = call i32 @llvm.x86.sse2.comile.sd(<2 x double> %a0, <2 x double> %a1)
  ret i32 %flag
}
declare i32 @llvm.x86.sse2.comile.sd(<2 x double>, <2 x double>) nounwind readnone
|
|
|
|
|
|
|
|
; _mm_comilt_sd: returns the i32 produced by the comilt.sd intrinsic. The
; expected assembly compares with the register operands in the opposite order
; from the comigt test and reuses seta.
define i32 @test_mm_comilt_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
; X32-LABEL: test_mm_comilt_sd:
; X32: # BB#0:
; X32-NEXT: xorl %eax, %eax
; X32-NEXT: comisd %xmm0, %xmm1
; X32-NEXT: seta %al
; X32-NEXT: retl
;
; X64-LABEL: test_mm_comilt_sd:
; X64: # BB#0:
; X64-NEXT: xorl %eax, %eax
; X64-NEXT: comisd %xmm0, %xmm1
; X64-NEXT: seta %al
; X64-NEXT: retq
  %flag = call i32 @llvm.x86.sse2.comilt.sd(<2 x double> %a0, <2 x double> %a1)
  ret i32 %flag
}
declare i32 @llvm.x86.sse2.comilt.sd(<2 x double>, <2 x double>) nounwind readnone
|
|
|
|
|
|
|
|
; _mm_comineq_sd: returns the i32 produced by the comineq.sd intrinsic.
define i32 @test_mm_comineq_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
; X32-LABEL: test_mm_comineq_sd:
; X32: # BB#0:
; X32-NEXT: comisd %xmm1, %xmm0
; X32-NEXT: setp %al
; X32-NEXT: setne %cl
; X32-NEXT: orb %al, %cl
; X32-NEXT: movzbl %cl, %eax
; X32-NEXT: retl
;
; X64-LABEL: test_mm_comineq_sd:
; X64: # BB#0:
; X64-NEXT: comisd %xmm1, %xmm0
; X64-NEXT: setp %al
; X64-NEXT: setne %cl
; X64-NEXT: orb %al, %cl
; X64-NEXT: movzbl %cl, %eax
; X64-NEXT: retq
  %flag = call i32 @llvm.x86.sse2.comineq.sd(<2 x double> %a0, <2 x double> %a1)
  ret i32 %flag
}
declare i32 @llvm.x86.sse2.comineq.sd(<2 x double>, <2 x double>) nounwind readnone
|
|
|
|
|
|
|
|
; _mm_cvtepi32_pd: takes the two low i32 lanes of the input (via shufflevector)
; and converts them with sitofp.
define <2 x double> @test_mm_cvtepi32_pd(<2 x i64> %a0) nounwind {
; X32-LABEL: test_mm_cvtepi32_pd:
; X32: # BB#0:
; X32-NEXT: cvtdq2pd %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cvtepi32_pd:
; X64: # BB#0:
; X64-NEXT: cvtdq2pd %xmm0, %xmm0
; X64-NEXT: retq
  %dwords = bitcast <2 x i64> %a0 to <4 x i32>
  %low_pair = shufflevector <4 x i32> %dwords, <4 x i32> %dwords, <2 x i32> <i32 0, i32 1>
  %converted = sitofp <2 x i32> %low_pair to <2 x double>
  ret <2 x double> %converted
}
|
|
|
|
|
|
|
|
; _mm_cvtepi32_ps: reinterprets the input as <4 x i32> and calls cvtdq2ps.
define <4 x float> @test_mm_cvtepi32_ps(<2 x i64> %a0) nounwind {
; X32-LABEL: test_mm_cvtepi32_ps:
; X32: # BB#0:
; X32-NEXT: cvtdq2ps %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cvtepi32_ps:
; X64: # BB#0:
; X64-NEXT: cvtdq2ps %xmm0, %xmm0
; X64-NEXT: retq
  %dwords = bitcast <2 x i64> %a0 to <4 x i32>
  %converted = call <4 x float> @llvm.x86.sse2.cvtdq2ps(<4 x i32> %dwords)
  ret <4 x float> %converted
}
declare <4 x float> @llvm.x86.sse2.cvtdq2ps(<4 x i32>) nounwind readnone
|
|
|
|
|
|
|
|
; _mm_cvtpd_epi32: calls cvtpd2dq and returns the <4 x i32> result as <2 x i64>.
define <2 x i64> @test_mm_cvtpd_epi32(<2 x double> %a0) nounwind {
; X32-LABEL: test_mm_cvtpd_epi32:
; X32: # BB#0:
; X32-NEXT: cvtpd2dq %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cvtpd_epi32:
; X64: # BB#0:
; X64-NEXT: cvtpd2dq %xmm0, %xmm0
; X64-NEXT: retq
  %dwords = call <4 x i32> @llvm.x86.sse2.cvtpd2dq(<2 x double> %a0)
  %qwords = bitcast <4 x i32> %dwords to <2 x i64>
  ret <2 x i64> %qwords
}
declare <4 x i32> @llvm.x86.sse2.cvtpd2dq(<2 x double>) nounwind readnone
|
|
|
|
|
|
|
|
; _mm_cvtpd_ps: returns the result of the cvtpd2ps intrinsic.
define <4 x float> @test_mm_cvtpd_ps(<2 x double> %a0) nounwind {
; X32-LABEL: test_mm_cvtpd_ps:
; X32: # BB#0:
; X32-NEXT: cvtpd2ps %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cvtpd_ps:
; X64: # BB#0:
; X64-NEXT: cvtpd2ps %xmm0, %xmm0
; X64-NEXT: retq
  %narrowed = call <4 x float> @llvm.x86.sse2.cvtpd2ps(<2 x double> %a0)
  ret <4 x float> %narrowed
}
declare <4 x float> @llvm.x86.sse2.cvtpd2ps(<2 x double>) nounwind readnone
|
|
|
|
|
|
|
|
; _mm_cvtps_epi32: calls cvtps2dq and returns the <4 x i32> result as <2 x i64>.
define <2 x i64> @test_mm_cvtps_epi32(<4 x float> %a0) nounwind {
; X32-LABEL: test_mm_cvtps_epi32:
; X32: # BB#0:
; X32-NEXT: cvtps2dq %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cvtps_epi32:
; X64: # BB#0:
; X64-NEXT: cvtps2dq %xmm0, %xmm0
; X64-NEXT: retq
  %dwords = call <4 x i32> @llvm.x86.sse2.cvtps2dq(<4 x float> %a0)
  %qwords = bitcast <4 x i32> %dwords to <2 x i64>
  ret <2 x i64> %qwords
}
declare <4 x i32> @llvm.x86.sse2.cvtps2dq(<4 x float>) nounwind readnone
|
|
|
|
|
|
|
|
; _mm_cvtps_pd: takes the two low float lanes (via shufflevector) and widens
; them with fpext.
define <2 x double> @test_mm_cvtps_pd(<4 x float> %a0) nounwind {
; X32-LABEL: test_mm_cvtps_pd:
; X32: # BB#0:
; X32-NEXT: cvtps2pd %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cvtps_pd:
; X64: # BB#0:
; X64-NEXT: cvtps2pd %xmm0, %xmm0
; X64-NEXT: retq
  %low_pair = shufflevector <4 x float> %a0, <4 x float> %a0, <2 x i32> <i32 0, i32 1>
  %widened = fpext <2 x float> %low_pair to <2 x double>
  ret <2 x double> %widened
}
|
|
|
|
|
|
|
|
; _mm_cvtsd_f64: extracts lane 0 of the vector as a scalar double. The i386
; expectations store the lane to an aligned stack slot and reload it with fldl;
; the x86_64 expectations contain only the return.
define double @test_mm_cvtsd_f64(<2 x double> %a0) nounwind {
; X32-LABEL: test_mm_cvtsd_f64:
; X32: # BB#0:
; X32-NEXT: pushl %ebp
; X32-NEXT: movl %esp, %ebp
; X32-NEXT: andl $-8, %esp
; X32-NEXT: subl $8, %esp
; X32-NEXT: movlps %xmm0, (%esp)
; X32-NEXT: fldl (%esp)
; X32-NEXT: movl %ebp, %esp
; X32-NEXT: popl %ebp
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cvtsd_f64:
; X64: # BB#0:
; X64-NEXT: retq
  %lane0 = extractelement <2 x double> %a0, i32 0
  ret double %lane0
}
|
|
|
|
|
|
|
|
; _mm_cvtsd_si32: returns the i32 produced by the cvtsd2si intrinsic.
define i32 @test_mm_cvtsd_si32(<2 x double> %a0) nounwind {
; X32-LABEL: test_mm_cvtsd_si32:
; X32: # BB#0:
; X32-NEXT: cvtsd2si %xmm0, %eax
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cvtsd_si32:
; X64: # BB#0:
; X64-NEXT: cvtsd2si %xmm0, %eax
; X64-NEXT: retq
  %as_int = call i32 @llvm.x86.sse2.cvtsd2si(<2 x double> %a0)
  ret i32 %as_int
}
declare i32 @llvm.x86.sse2.cvtsd2si(<2 x double>) nounwind readnone
|
|
|
|
|
2016-07-19 23:07:43 +08:00
|
|
|
; _mm_cvtsd_ss (register form): returns the result of the cvtsd2ss intrinsic
; applied to both vector arguments.
define <4 x float> @test_mm_cvtsd_ss(<4 x float> %a0, <2 x double> %a1) {
; X32-LABEL: test_mm_cvtsd_ss:
; X32: # BB#0:
; X32-NEXT: cvtsd2ss %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cvtsd_ss:
; X64: # BB#0:
; X64-NEXT: cvtsd2ss %xmm1, %xmm0
; X64-NEXT: retq
  %narrowed = call <4 x float> @llvm.x86.sse2.cvtsd2ss(<4 x float> %a0, <2 x double> %a1)
  ret <4 x float> %narrowed
}
declare <4 x float> @llvm.x86.sse2.cvtsd2ss(<4 x float>, <2 x double>) nounwind readnone
|
|
|
|
|
2016-07-26 18:41:28 +08:00
|
|
|
; _mm_cvtsd_ss (memory form): the double vector is loaded from %p1 before the
; cvtsd2ss call; the expected assembly uses a memory operand for the convert.
define <4 x float> @test_mm_cvtsd_ss_load(<4 x float> %a0, <2 x double>* %p1) {
; X32-LABEL: test_mm_cvtsd_ss_load:
; X32: # BB#0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: cvtsd2ss (%eax), %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cvtsd_ss_load:
; X64: # BB#0:
; X64-NEXT: cvtsd2ss (%rdi), %xmm0
; X64-NEXT: retq
  %loaded = load <2 x double>, <2 x double>* %p1
  %narrowed = call <4 x float> @llvm.x86.sse2.cvtsd2ss(<4 x float> %a0, <2 x double> %loaded)
  ret <4 x float> %narrowed
}
|
|
|
|
|
2016-05-19 02:00:43 +08:00
|
|
|
; _mm_cvtsi128_si32: reinterprets the input as <4 x i32> and extracts lane 0.
define i32 @test_mm_cvtsi128_si32(<2 x i64> %a0) nounwind {
; X32-LABEL: test_mm_cvtsi128_si32:
; X32: # BB#0:
; X32-NEXT: movd %xmm0, %eax
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cvtsi128_si32:
; X64: # BB#0:
; X64-NEXT: movd %xmm0, %eax
; X64-NEXT: retq
  %dwords = bitcast <2 x i64> %a0 to <4 x i32>
  %lane0 = extractelement <4 x i32> %dwords, i32 0
  ret i32 %lane0
}
|
|
|
|
|
|
|
|
; _mm_cvtsi32_sd: converts %a1 with sitofp and inserts it into lane 0 of %a0.
define <2 x double> @test_mm_cvtsi32_sd(<2 x double> %a0, i32 %a1) nounwind {
; X32-LABEL: test_mm_cvtsi32_sd:
; X32: # BB#0:
; X32-NEXT: cvtsi2sdl {{[0-9]+}}(%esp), %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cvtsi32_sd:
; X64: # BB#0:
; X64-NEXT: cvtsi2sdl %edi, %xmm0
; X64-NEXT: retq
  %as_double = sitofp i32 %a1 to double
  %updated = insertelement <2 x double> %a0, double %as_double, i32 0
  ret <2 x double> %updated
}
|
|
|
|
|
|
|
|
; _mm_cvtsi32_si128: builds a <4 x i32> with %a0 in lane 0 and zero in lanes
; 1-3, then returns it as <2 x i64>.
define <2 x i64> @test_mm_cvtsi32_si128(i32 %a0) nounwind {
; X32-LABEL: test_mm_cvtsi32_si128:
; X32: # BB#0:
; X32-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cvtsi32_si128:
; X64: # BB#0:
; X64-NEXT: movd %edi, %xmm0
; X64-NEXT: retq
  %v0 = insertelement <4 x i32> undef, i32 %a0, i32 0
  %v1 = insertelement <4 x i32> %v0, i32 0, i32 1
  %v2 = insertelement <4 x i32> %v1, i32 0, i32 2
  %v3 = insertelement <4 x i32> %v2, i32 0, i32 3
  %qwords = bitcast <4 x i32> %v3 to <2 x i64>
  ret <2 x i64> %qwords
}
|
|
|
|
|
|
|
|
; _mm_cvtss_sd: widens lane 0 of %a1 with fpext and inserts it into lane 0 of
; %a0.
define <2 x double> @test_mm_cvtss_sd(<2 x double> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_cvtss_sd:
; X32: # BB#0:
; X32-NEXT: cvtss2sd %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cvtss_sd:
; X64: # BB#0:
; X64-NEXT: cvtss2sd %xmm1, %xmm0
; X64-NEXT: retq
  %lane0 = extractelement <4 x float> %a1, i32 0
  %widened = fpext float %lane0 to double
  %updated = insertelement <2 x double> %a0, double %widened, i32 0
  ret <2 x double> %updated
}
|
|
|
|
|
|
|
|
; _mm_cvttpd_epi32: calls cvttpd2dq and returns the <4 x i32> result as
; <2 x i64>.
define <2 x i64> @test_mm_cvttpd_epi32(<2 x double> %a0) nounwind {
; X32-LABEL: test_mm_cvttpd_epi32:
; X32: # BB#0:
; X32-NEXT: cvttpd2dq %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cvttpd_epi32:
; X64: # BB#0:
; X64-NEXT: cvttpd2dq %xmm0, %xmm0
; X64-NEXT: retq
  %dwords = call <4 x i32> @llvm.x86.sse2.cvttpd2dq(<2 x double> %a0)
  %qwords = bitcast <4 x i32> %dwords to <2 x i64>
  ret <2 x i64> %qwords
}
declare <4 x i32> @llvm.x86.sse2.cvttpd2dq(<2 x double>) nounwind readnone
|
|
|
|
|
|
|
|
; _mm_cvttps_epi32: calls cvttps2dq and returns the <4 x i32> result as
; <2 x i64>.
define <2 x i64> @test_mm_cvttps_epi32(<4 x float> %a0) nounwind {
; X32-LABEL: test_mm_cvttps_epi32:
; X32: # BB#0:
; X32-NEXT: cvttps2dq %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cvttps_epi32:
; X64: # BB#0:
; X64-NEXT: cvttps2dq %xmm0, %xmm0
; X64-NEXT: retq
  %dwords = call <4 x i32> @llvm.x86.sse2.cvttps2dq(<4 x float> %a0)
  %qwords = bitcast <4 x i32> %dwords to <2 x i64>
  ret <2 x i64> %qwords
}
declare <4 x i32> @llvm.x86.sse2.cvttps2dq(<4 x float>) nounwind readnone
|
2016-05-19 02:00:43 +08:00
|
|
|
|
|
|
|
; _mm_cvttsd_si32: returns the i32 produced by the cvttsd2si intrinsic.
define i32 @test_mm_cvttsd_si32(<2 x double> %a0) nounwind {
; X32-LABEL: test_mm_cvttsd_si32:
; X32: # BB#0:
; X32-NEXT: cvttsd2si %xmm0, %eax
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cvttsd_si32:
; X64: # BB#0:
; X64-NEXT: cvttsd2si %xmm0, %eax
; X64-NEXT: retq
  %as_int = call i32 @llvm.x86.sse2.cvttsd2si(<2 x double> %a0)
  ret i32 %as_int
}
declare i32 @llvm.x86.sse2.cvttsd2si(<2 x double>) nounwind readnone
|
2016-05-19 02:00:43 +08:00
|
|
|
|
|
|
|
; _mm_div_pd: plain vector fdiv of %a0 by %a1.
define <2 x double> @test_mm_div_pd(<2 x double> %a0, <2 x double> %a1) nounwind {
; X32-LABEL: test_mm_div_pd:
; X32: # BB#0:
; X32-NEXT: divpd %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_div_pd:
; X64: # BB#0:
; X64-NEXT: divpd %xmm1, %xmm0
; X64-NEXT: retq
  %quotient = fdiv <2 x double> %a0, %a1
  ret <2 x double> %quotient
}
|
|
|
|
|
|
|
|
; _mm_div_sd: divides lane 0 of %a0 by lane 0 of %a1 and writes the quotient
; back into lane 0 of %a0.
define <2 x double> @test_mm_div_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
; X32-LABEL: test_mm_div_sd:
; X32: # BB#0:
; X32-NEXT: divsd %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_div_sd:
; X64: # BB#0:
; X64-NEXT: divsd %xmm1, %xmm0
; X64-NEXT: retq
  %num = extractelement <2 x double> %a0, i32 0
  %den = extractelement <2 x double> %a1, i32 0
  %quotient = fdiv double %num, %den
  %updated = insertelement <2 x double> %a0, double %quotient, i32 0
  ret <2 x double> %updated
}
|
|
|
|
|
|
|
|
; _mm_extract_epi16: extracts i16 lane 1 and zero-extends it to i32.
define i32 @test_mm_extract_epi16(<2 x i64> %a0) nounwind {
; X32-LABEL: test_mm_extract_epi16:
; X32: # BB#0:
; X32-NEXT: pextrw $1, %xmm0, %eax
; X32-NEXT: movzwl %ax, %eax
; X32-NEXT: retl
;
; X64-LABEL: test_mm_extract_epi16:
; X64: # BB#0:
; X64-NEXT: pextrw $1, %xmm0, %eax
; X64-NEXT: movzwl %ax, %eax
; X64-NEXT: retq
  %words = bitcast <2 x i64> %a0 to <8 x i16>
  %lane1 = extractelement <8 x i16> %words, i32 1
  %widened = zext i16 %lane1 to i32
  ret i32 %widened
}
|
|
|
|
|
|
|
|
; _mm_insert_epi16: writes %a1 into i16 lane 1 of the input vector.
define <2 x i64> @test_mm_insert_epi16(<2 x i64> %a0, i16 %a1) nounwind {
; X32-LABEL: test_mm_insert_epi16:
; X32: # BB#0:
; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
; X32-NEXT: pinsrw $1, %eax, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_insert_epi16:
; X64: # BB#0:
; X64-NEXT: pinsrw $1, %edi, %xmm0
; X64-NEXT: retq
  %words = bitcast <2 x i64> %a0 to <8 x i16>
  %updated = insertelement <8 x i16> %words, i16 %a1, i32 1
  %qwords = bitcast <8 x i16> %updated to <2 x i64>
  ret <2 x i64> %qwords
}
|
|
|
|
|
|
|
|
; _mm_lfence: a bare call to the lfence intrinsic; the expectations require an
; lfence instruction before the return.
define void @test_mm_lfence() nounwind {
; X32-LABEL: test_mm_lfence:
; X32: # BB#0:
; X32-NEXT: lfence
; X32-NEXT: retl
;
; X64-LABEL: test_mm_lfence:
; X64: # BB#0:
; X64-NEXT: lfence
; X64-NEXT: retq
  call void @llvm.x86.sse2.lfence()
  ret void
}
declare void @llvm.x86.sse2.lfence() nounwind readnone
|
|
|
|
|
|
|
|
; _mm_load_pd: 16-byte-aligned vector load through a double pointer.
define <2 x double> @test_mm_load_pd(double* %a0) nounwind {
; X32-LABEL: test_mm_load_pd:
; X32: # BB#0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movaps (%eax), %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_load_pd:
; X64: # BB#0:
; X64-NEXT: movaps (%rdi), %xmm0
; X64-NEXT: retq
  %vec_ptr = bitcast double* %a0 to <2 x double>*
  %loaded = load <2 x double>, <2 x double>* %vec_ptr, align 16
  ret <2 x double> %loaded
}
|
|
|
|
|
|
|
|
; _mm_load_sd: loads one double (align 1) into lane 0 and sets lane 1 to 0.0.
define <2 x double> @test_mm_load_sd(double* %a0) nounwind {
; X32-LABEL: test_mm_load_sd:
; X32: # BB#0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; X32-NEXT: retl
;
; X64-LABEL: test_mm_load_sd:
; X64: # BB#0:
; X64-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; X64-NEXT: retq
  %scalar = load double, double* %a0, align 1
  %with_lo = insertelement <2 x double> undef, double %scalar, i32 0
  %zero_hi = insertelement <2 x double> %with_lo, double 0.0, i32 1
  ret <2 x double> %zero_hi
}
|
|
|
|
|
|
|
|
; _mm_load_si128: 16-byte-aligned load of a <2 x i64>.
define <2 x i64> @test_mm_load_si128(<2 x i64>* %a0) nounwind {
; X32-LABEL: test_mm_load_si128:
; X32: # BB#0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movaps (%eax), %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_load_si128:
; X64: # BB#0:
; X64-NEXT: movaps (%rdi), %xmm0
; X64-NEXT: retq
  %loaded = load <2 x i64>, <2 x i64>* %a0, align 16
  ret <2 x i64> %loaded
}
|
|
|
|
|
|
|
|
; _mm_load1_pd: loads one double (align 8) and places it in both lanes.
define <2 x double> @test_mm_load1_pd(double* %a0) nounwind {
; X32-LABEL: test_mm_load1_pd:
; X32: # BB#0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; X32-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0,0]
; X32-NEXT: retl
;
; X64-LABEL: test_mm_load1_pd:
; X64: # BB#0:
; X64-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; X64-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0,0]
; X64-NEXT: retq
  %scalar = load double, double* %a0, align 8
  %with_lo = insertelement <2 x double> undef, double %scalar, i32 0
  %splat = insertelement <2 x double> %with_lo, double %scalar, i32 1
  ret <2 x double> %splat
}
|
|
|
|
|
|
|
|
; _mm_loadh_pd: loads one double (align 8) into lane 1 of %a0.
define <2 x double> @test_mm_loadh_pd(<2 x double> %a0, double* %a1) nounwind {
; X32-LABEL: test_mm_loadh_pd:
; X32: # BB#0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
; X32-NEXT: retl
;
; X64-LABEL: test_mm_loadh_pd:
; X64: # BB#0:
; X64-NEXT: movhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
; X64-NEXT: retq
  %scalar = load double, double* %a1, align 8
  %updated = insertelement <2 x double> %a0, double %scalar, i32 1
  ret <2 x double> %updated
}
|
|
|
|
|
|
|
|
; _mm_loadl_epi64: loads one i64 (align 1) through %a1 into lane 0 and zeroes
; lane 1. The %a0 argument is not referenced by the body.
define <2 x i64> @test_mm_loadl_epi64(<2 x i64> %a0, <2 x i64>* %a1) nounwind {
; X32-LABEL: test_mm_loadl_epi64:
; X32: # BB#0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; X32-NEXT: retl
;
; X64-LABEL: test_mm_loadl_epi64:
; X64: # BB#0:
; X64-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; X64-NEXT: retq
  %qword_ptr = bitcast <2 x i64>* %a1 to i64*
  %qword = load i64, i64* %qword_ptr, align 1
  %with_lo = insertelement <2 x i64> undef, i64 %qword, i32 0
  %zero_hi = insertelement <2 x i64> %with_lo, i64 0, i32 1
  ret <2 x i64> %zero_hi
}
|
|
|
|
|
|
|
|
; _mm_loadl_pd: loads one double (align 8) into lane 0 of %a0.
define <2 x double> @test_mm_loadl_pd(<2 x double> %a0, double* %a1) nounwind {
; X32-LABEL: test_mm_loadl_pd:
; X32: # BB#0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movlpd {{.*#+}} xmm0 = mem[0],xmm0[1]
; X32-NEXT: retl
;
; X64-LABEL: test_mm_loadl_pd:
; X64: # BB#0:
; X64-NEXT: movlpd {{.*#+}} xmm0 = mem[0],xmm0[1]
; X64-NEXT: retq
  %scalar = load double, double* %a1, align 8
  %updated = insertelement <2 x double> %a0, double %scalar, i32 0
  ret <2 x double> %updated
}
|
|
|
|
|
|
|
|
; _mm_loadr_pd: 16-byte-aligned vector load followed by a lane swap
; (shuffle mask <1, 0>).
define <2 x double> @test_mm_loadr_pd(double* %a0) nounwind {
; X32-LABEL: test_mm_loadr_pd:
; X32: # BB#0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movapd (%eax), %xmm0
; X32-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1,0]
; X32-NEXT: retl
;
; X64-LABEL: test_mm_loadr_pd:
; X64: # BB#0:
; X64-NEXT: movapd (%rdi), %xmm0
; X64-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1,0]
; X64-NEXT: retq
  %vec_ptr = bitcast double* %a0 to <2 x double>*
  %loaded = load <2 x double>, <2 x double>* %vec_ptr, align 16
  %reversed = shufflevector <2 x double> %loaded, <2 x double> undef, <2 x i32> <i32 1, i32 0>
  ret <2 x double> %reversed
}
|
|
|
|
|
|
|
|
; _mm_loadu_pd: vector load through a double pointer with align 1.
define <2 x double> @test_mm_loadu_pd(double* %a0) nounwind {
; X32-LABEL: test_mm_loadu_pd:
; X32: # BB#0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movups (%eax), %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_loadu_pd:
; X64: # BB#0:
; X64-NEXT: movups (%rdi), %xmm0
; X64-NEXT: retq
  %vec_ptr = bitcast double* %a0 to <2 x double>*
  %loaded = load <2 x double>, <2 x double>* %vec_ptr, align 1
  ret <2 x double> %loaded
}
|
|
|
|
|
|
|
|
; _mm_loadu_si128: load of a <2 x i64> with align 1.
define <2 x i64> @test_mm_loadu_si128(<2 x i64>* %a0) nounwind {
; X32-LABEL: test_mm_loadu_si128:
; X32: # BB#0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movups (%eax), %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_loadu_si128:
; X64: # BB#0:
; X64-NEXT: movups (%rdi), %xmm0
; X64-NEXT: retq
  %loaded = load <2 x i64>, <2 x i64>* %a0, align 1
  ret <2 x i64> %loaded
}
|
|
|
|
|
|
|
|
; _mm_madd_epi16: reinterprets both inputs as <8 x i16>, calls pmadd.wd, and
; returns the <4 x i32> result as <2 x i64>.
define <2 x i64> @test_mm_madd_epi16(<2 x i64> %a0, <2 x i64> %a1) nounwind {
; X32-LABEL: test_mm_madd_epi16:
; X32: # BB#0:
; X32-NEXT: pmaddwd %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_madd_epi16:
; X64: # BB#0:
; X64-NEXT: pmaddwd %xmm1, %xmm0
; X64-NEXT: retq
  %lhs = bitcast <2 x i64> %a0 to <8 x i16>
  %rhs = bitcast <2 x i64> %a1 to <8 x i16>
  %sums = call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %lhs, <8 x i16> %rhs)
  %qwords = bitcast <4 x i32> %sums to <2 x i64>
  ret <2 x i64> %qwords
}
declare <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16>, <8 x i16>) nounwind readnone
|
|
|
|
|
|
|
|
; _mm_maskmoveu_si128: reinterprets both vectors as <16 x i8> and passes them
; with the pointer %a2 to maskmov.dqu. The i386 expectations save and restore
; %edi around loading the pointer argument into it.
define void @test_mm_maskmoveu_si128(<2 x i64> %a0, <2 x i64> %a1, i8* %a2) nounwind {
; X32-LABEL: test_mm_maskmoveu_si128:
; X32: # BB#0:
; X32-NEXT: pushl %edi
; X32-NEXT: movl {{[0-9]+}}(%esp), %edi
; X32-NEXT: maskmovdqu %xmm1, %xmm0
; X32-NEXT: popl %edi
; X32-NEXT: retl
;
; X64-LABEL: test_mm_maskmoveu_si128:
; X64: # BB#0:
; X64-NEXT: maskmovdqu %xmm1, %xmm0
; X64-NEXT: retq
  %data = bitcast <2 x i64> %a0 to <16 x i8>
  %mask = bitcast <2 x i64> %a1 to <16 x i8>
  call void @llvm.x86.sse2.maskmov.dqu(<16 x i8> %data, <16 x i8> %mask, i8* %a2)
  ret void
}
declare void @llvm.x86.sse2.maskmov.dqu(<16 x i8>, <16 x i8>, i8*) nounwind
|
|
|
|
|
|
|
|
; _mm_max_epi16: signed per-lane maximum written as "icmp sgt" + select on
; <8 x i16>.
define <2 x i64> @test_mm_max_epi16(<2 x i64> %a0, <2 x i64> %a1) nounwind {
; X32-LABEL: test_mm_max_epi16:
; X32: # BB#0:
; X32-NEXT: pmaxsw %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_max_epi16:
; X64: # BB#0:
; X64-NEXT: pmaxsw %xmm1, %xmm0
; X64-NEXT: retq
  %lhs = bitcast <2 x i64> %a0 to <8 x i16>
  %rhs = bitcast <2 x i64> %a1 to <8 x i16>
  %lhs_gt = icmp sgt <8 x i16> %lhs, %rhs
  %larger = select <8 x i1> %lhs_gt, <8 x i16> %lhs, <8 x i16> %rhs
  %qwords = bitcast <8 x i16> %larger to <2 x i64>
  ret <2 x i64> %qwords
}
|
|
|
|
|
|
|
|
; _mm_max_epu8: unsigned per-lane maximum written as "icmp ugt" + select on
; <16 x i8>.
define <2 x i64> @test_mm_max_epu8(<2 x i64> %a0, <2 x i64> %a1) nounwind {
; X32-LABEL: test_mm_max_epu8:
; X32: # BB#0:
; X32-NEXT: pmaxub %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_max_epu8:
; X64: # BB#0:
; X64-NEXT: pmaxub %xmm1, %xmm0
; X64-NEXT: retq
  %lhs = bitcast <2 x i64> %a0 to <16 x i8>
  %rhs = bitcast <2 x i64> %a1 to <16 x i8>
  %lhs_gt = icmp ugt <16 x i8> %lhs, %rhs
  %larger = select <16 x i1> %lhs_gt, <16 x i8> %lhs, <16 x i8> %rhs
  %qwords = bitcast <16 x i8> %larger to <2 x i64>
  ret <2 x i64> %qwords
}
|
|
|
|
|
|
|
|
; Packed double maximum through the llvm.x86.sse2.max.pd intrinsic; the
; expected output on both targets is one maxpd on the argument registers.
define <2 x double> @test_mm_max_pd(<2 x double> %a0, <2 x double> %a1) nounwind {
; X32-LABEL: test_mm_max_pd:
; X32: # BB#0:
; X32-NEXT: maxpd %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_max_pd:
; X64: # BB#0:
; X64-NEXT: maxpd %xmm1, %xmm0
; X64-NEXT: retq
  %max = call <2 x double> @llvm.x86.sse2.max.pd(<2 x double> %a0, <2 x double> %a1)
  ret <2 x double> %max
}
|
|
|
|
declare <2 x double> @llvm.x86.sse2.max.pd(<2 x double>, <2 x double>) nounwind readnone
|
|
|
|
|
|
|
|
; Scalar double maximum through the llvm.x86.sse2.max.sd intrinsic; the
; expected output on both targets is one maxsd on the argument registers.
define <2 x double> @test_mm_max_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
; X32-LABEL: test_mm_max_sd:
; X32: # BB#0:
; X32-NEXT: maxsd %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_max_sd:
; X64: # BB#0:
; X64-NEXT: maxsd %xmm1, %xmm0
; X64-NEXT: retq
  %max = call <2 x double> @llvm.x86.sse2.max.sd(<2 x double> %a0, <2 x double> %a1)
  ret <2 x double> %max
}
|
|
|
|
declare <2 x double> @llvm.x86.sse2.max.sd(<2 x double>, <2 x double>) nounwind readnone
|
|
|
|
|
|
|
|
; Calls the llvm.x86.sse2.mfence intrinsic with no operands; both targets are
; expected to emit exactly one mfence before returning.
define void @test_mm_mfence() nounwind {
; X32-LABEL: test_mm_mfence:
; X32: # BB#0:
; X32-NEXT: mfence
; X32-NEXT: retl
;
; X64-LABEL: test_mm_mfence:
; X64: # BB#0:
; X64-NEXT: mfence
; X64-NEXT: retq
  call void @llvm.x86.sse2.mfence()
  ret void
}
|
|
|
|
declare void @llvm.x86.sse2.mfence() nounwind readnone
|
|
|
|
|
|
|
|
; Signed 16-bit lane minimum written as generic IR (icmp slt + select) over
; <8 x i16> views of the inputs. Both targets are expected to select a single
; pminsw.
define <2 x i64> @test_mm_min_epi16(<2 x i64> %a0, <2 x i64> %a1) nounwind {
; X32-LABEL: test_mm_min_epi16:
; X32: # BB#0:
; X32-NEXT: pminsw %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_min_epi16:
; X64: # BB#0:
; X64-NEXT: pminsw %xmm1, %xmm0
; X64-NEXT: retq
  %a0.words = bitcast <2 x i64> %a0 to <8 x i16>
  %a1.words = bitcast <2 x i64> %a1 to <8 x i16>
  %a0.less = icmp slt <8 x i16> %a0.words, %a1.words
  %min = select <8 x i1> %a0.less, <8 x i16> %a0.words, <8 x i16> %a1.words
  %result = bitcast <8 x i16> %min to <2 x i64>
  ret <2 x i64> %result
}
|
|
|
|
|
|
|
|
; Unsigned 8-bit lane minimum written as generic IR (icmp ult + select) over
; <16 x i8> views of the inputs. Both targets are expected to select a single
; pminub.
define <2 x i64> @test_mm_min_epu8(<2 x i64> %a0, <2 x i64> %a1) nounwind {
; X32-LABEL: test_mm_min_epu8:
; X32: # BB#0:
; X32-NEXT: pminub %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_min_epu8:
; X64: # BB#0:
; X64-NEXT: pminub %xmm1, %xmm0
; X64-NEXT: retq
  %a0.bytes = bitcast <2 x i64> %a0 to <16 x i8>
  %a1.bytes = bitcast <2 x i64> %a1 to <16 x i8>
  %a0.below = icmp ult <16 x i8> %a0.bytes, %a1.bytes
  %min = select <16 x i1> %a0.below, <16 x i8> %a0.bytes, <16 x i8> %a1.bytes
  %result = bitcast <16 x i8> %min to <2 x i64>
  ret <2 x i64> %result
}
|
|
|
|
|
|
|
|
; Packed double minimum through the llvm.x86.sse2.min.pd intrinsic; the
; expected output on both targets is one minpd on the argument registers.
define <2 x double> @test_mm_min_pd(<2 x double> %a0, <2 x double> %a1) nounwind {
; X32-LABEL: test_mm_min_pd:
; X32: # BB#0:
; X32-NEXT: minpd %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_min_pd:
; X64: # BB#0:
; X64-NEXT: minpd %xmm1, %xmm0
; X64-NEXT: retq
  %min = call <2 x double> @llvm.x86.sse2.min.pd(<2 x double> %a0, <2 x double> %a1)
  ret <2 x double> %min
}
|
|
|
|
declare <2 x double> @llvm.x86.sse2.min.pd(<2 x double>, <2 x double>) nounwind readnone
|
|
|
|
|
|
|
|
; Scalar double minimum through the llvm.x86.sse2.min.sd intrinsic; the
; expected output on both targets is one minsd on the argument registers.
define <2 x double> @test_mm_min_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
; X32-LABEL: test_mm_min_sd:
; X32: # BB#0:
; X32-NEXT: minsd %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_min_sd:
; X64: # BB#0:
; X64-NEXT: minsd %xmm1, %xmm0
; X64-NEXT: retq
  %min = call <2 x double> @llvm.x86.sse2.min.sd(<2 x double> %a0, <2 x double> %a1)
  ret <2 x double> %min
}
|
|
|
|
declare <2 x double> @llvm.x86.sse2.min.sd(<2 x double>, <2 x double>) nounwind readnone
|
|
|
|
|
2016-05-19 19:59:57 +08:00
|
|
|
; Keeps element 0 of %a0 and takes element 1 from an all-zero vector
; (shuffle mask <0, 2>). Both targets are expected to emit a movq that
; produces xmm0[0],zero.
define <2 x i64> @test_mm_move_epi64(<2 x i64> %a0) nounwind {
; X32-LABEL: test_mm_move_epi64:
; X32: # BB#0:
; X32-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero
; X32-NEXT: retl
;
; X64-LABEL: test_mm_move_epi64:
; X64: # BB#0:
; X64-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero
; X64-NEXT: retq
  %low.zext = shufflevector <2 x i64> %a0, <2 x i64> zeroinitializer, <2 x i32> <i32 0, i32 2>
  ret <2 x i64> %low.zext
}
|
|
|
|
|
|
|
|
; Builds a vector whose element 0 comes from %a1 and whose element 1 comes
; from %a0, using extractelement/insertelement pairs. Both targets are
; expected to emit a movsd producing xmm1[0],xmm0[1].
define <2 x double> @test_mm_move_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
; X32-LABEL: test_mm_move_sd:
; X32: # BB#0:
; X32-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; X32-NEXT: retl
;
; X64-LABEL: test_mm_move_sd:
; X64: # BB#0:
; X64-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; X64-NEXT: retq
  %a1.lo = extractelement <2 x double> %a1, i32 0
  %with.lo = insertelement <2 x double> undef, double %a1.lo, i32 0
  %a0.hi = extractelement <2 x double> %a0, i32 1
  %merged = insertelement <2 x double> %with.lo, double %a0.hi, i32 1
  ret <2 x double> %merged
}
|
|
|
|
|
2016-05-19 02:00:43 +08:00
|
|
|
; Views the argument as <16 x i8> and passes it to the pmovmskb.128
; intrinsic, returning its i32 result. Both targets are expected to emit one
; pmovmskb from %xmm0 into %eax.
define i32 @test_mm_movemask_epi8(<2 x i64> %a0) nounwind {
; X32-LABEL: test_mm_movemask_epi8:
; X32: # BB#0:
; X32-NEXT: pmovmskb %xmm0, %eax
; X32-NEXT: retl
;
; X64-LABEL: test_mm_movemask_epi8:
; X64: # BB#0:
; X64-NEXT: pmovmskb %xmm0, %eax
; X64-NEXT: retq
  %a0.bytes = bitcast <2 x i64> %a0 to <16 x i8>
  %mask = call i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8> %a0.bytes)
  ret i32 %mask
}
|
|
|
|
declare i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8>) nounwind readnone
|
|
|
|
|
|
|
|
; Passes the <2 x double> argument to the movmsk.pd intrinsic and returns its
; i32 result. Both targets are expected to emit one movmskpd from %xmm0 into
; %eax.
define i32 @test_mm_movemask_pd(<2 x double> %a0) nounwind {
; X32-LABEL: test_mm_movemask_pd:
; X32: # BB#0:
; X32-NEXT: movmskpd %xmm0, %eax
; X32-NEXT: retl
;
; X64-LABEL: test_mm_movemask_pd:
; X64: # BB#0:
; X64-NEXT: movmskpd %xmm0, %eax
; X64-NEXT: retq
  %mask = call i32 @llvm.x86.sse2.movmsk.pd(<2 x double> %a0)
  ret i32 %mask
}
|
|
|
|
declare i32 @llvm.x86.sse2.movmsk.pd(<2 x double>) nounwind readnone
|
|
|
|
|
|
|
|
; Views both inputs as <4 x i32> and multiplies them through the pmulu.dq
; intrinsic, which returns <2 x i64>. Both targets are expected to emit a
; single pmuludq.
; Marked nounwind for consistency with the other tests in this file; the only
; call is to an intrinsic declared nounwind, so the expected assembly is
; unaffected.
define <2 x i64> @test_mm_mul_epu32(<2 x i64> %a0, <2 x i64> %a1) nounwind {
; X32-LABEL: test_mm_mul_epu32:
; X32: # BB#0:
; X32-NEXT: pmuludq %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_mul_epu32:
; X64: # BB#0:
; X64-NEXT: pmuludq %xmm1, %xmm0
; X64-NEXT: retq
  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
  %res = call <2 x i64> @llvm.x86.sse2.pmulu.dq(<4 x i32> %arg0, <4 x i32> %arg1)
  ret <2 x i64> %res
}
|
|
|
|
declare <2 x i64> @llvm.x86.sse2.pmulu.dq(<4 x i32>, <4 x i32>) nounwind readnone
|
|
|
|
|
|
|
|
; Packed double multiply written as a plain vector fmul; both targets are
; expected to emit a single mulpd.
define <2 x double> @test_mm_mul_pd(<2 x double> %a0, <2 x double> %a1) nounwind {
; X32-LABEL: test_mm_mul_pd:
; X32: # BB#0:
; X32-NEXT: mulpd %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_mul_pd:
; X64: # BB#0:
; X64-NEXT: mulpd %xmm1, %xmm0
; X64-NEXT: retq
  %product = fmul <2 x double> %a0, %a1
  ret <2 x double> %product
}
|
|
|
|
|
|
|
|
; Scalar double multiply: element 0 of each input is extracted, multiplied,
; and written back into element 0 of %a0 (element 1 of %a0 is kept). Both
; targets are expected to fold this into a single mulsd.
define <2 x double> @test_mm_mul_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
; X32-LABEL: test_mm_mul_sd:
; X32: # BB#0:
; X32-NEXT: mulsd %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_mul_sd:
; X64: # BB#0:
; X64-NEXT: mulsd %xmm1, %xmm0
; X64-NEXT: retq
  %a0.lo = extractelement <2 x double> %a0, i32 0
  %a1.lo = extractelement <2 x double> %a1, i32 0
  %product = fmul double %a0.lo, %a1.lo
  %merged = insertelement <2 x double> %a0, double %product, i32 0
  ret <2 x double> %merged
}
|
|
|
|
|
|
|
|
; Views both inputs as <8 x i16> and multiplies them through the pmulh.w
; intrinsic; both targets are expected to emit a single pmulhw.
; Marked nounwind for consistency with the other tests in this file; the only
; call is to an intrinsic declared nounwind, so the expected assembly is
; unaffected.
define <2 x i64> @test_mm_mulhi_epi16(<2 x i64> %a0, <2 x i64> %a1) nounwind {
; X32-LABEL: test_mm_mulhi_epi16:
; X32: # BB#0:
; X32-NEXT: pmulhw %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_mulhi_epi16:
; X64: # BB#0:
; X64-NEXT: pmulhw %xmm1, %xmm0
; X64-NEXT: retq
  %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
  %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
  %res = call <8 x i16> @llvm.x86.sse2.pmulh.w(<8 x i16> %arg0, <8 x i16> %arg1)
  %bc = bitcast <8 x i16> %res to <2 x i64>
  ret <2 x i64> %bc
}
|
|
|
|
declare <8 x i16> @llvm.x86.sse2.pmulh.w(<8 x i16>, <8 x i16>) nounwind readnone
|
|
|
|
|
|
|
|
; Views both inputs as <8 x i16> and multiplies them through the pmulhu.w
; intrinsic; both targets are expected to emit a single pmulhuw.
; Marked nounwind for consistency with the other tests in this file; the only
; call is to an intrinsic declared nounwind, so the expected assembly is
; unaffected.
define <2 x i64> @test_mm_mulhi_epu16(<2 x i64> %a0, <2 x i64> %a1) nounwind {
; X32-LABEL: test_mm_mulhi_epu16:
; X32: # BB#0:
; X32-NEXT: pmulhuw %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_mulhi_epu16:
; X64: # BB#0:
; X64-NEXT: pmulhuw %xmm1, %xmm0
; X64-NEXT: retq
  %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
  %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
  %res = call <8 x i16> @llvm.x86.sse2.pmulhu.w(<8 x i16> %arg0, <8 x i16> %arg1)
  %bc = bitcast <8 x i16> %res to <2 x i64>
  ret <2 x i64> %bc
}
|
|
|
|
declare <8 x i16> @llvm.x86.sse2.pmulhu.w(<8 x i16>, <8 x i16>) nounwind readnone
|
|
|
|
|
|
|
|
; 16-bit lane multiply written as a plain `mul` on <8 x i16> views of the
; inputs; both targets are expected to emit a single pmullw.
; Marked nounwind for consistency with the other tests in this file; the body
; contains no calls, so the expected assembly is unaffected.
define <2 x i64> @test_mm_mullo_epi16(<2 x i64> %a0, <2 x i64> %a1) nounwind {
; X32-LABEL: test_mm_mullo_epi16:
; X32: # BB#0:
; X32-NEXT: pmullw %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_mullo_epi16:
; X64: # BB#0:
; X64-NEXT: pmullw %xmm1, %xmm0
; X64-NEXT: retq
  %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
  %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
  %res = mul <8 x i16> %arg0, %arg1
  %bc = bitcast <8 x i16> %res to <2 x i64>
  ret <2 x i64> %bc
}
|
|
|
|
|
|
|
|
; Bitwise OR of two <2 x double> values, performed on <4 x i32> views and
; cast back. Both targets are expected to emit a single orps.
define <2 x double> @test_mm_or_pd(<2 x double> %a0, <2 x double> %a1) nounwind {
; X32-LABEL: test_mm_or_pd:
; X32: # BB#0:
; X32-NEXT: orps %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_or_pd:
; X64: # BB#0:
; X64-NEXT: orps %xmm1, %xmm0
; X64-NEXT: retq
  %a0.bits = bitcast <2 x double> %a0 to <4 x i32>
  %a1.bits = bitcast <2 x double> %a1 to <4 x i32>
  %or.bits = or <4 x i32> %a0.bits, %a1.bits
  %result = bitcast <4 x i32> %or.bits to <2 x double>
  ret <2 x double> %result
}
|
|
|
|
|
|
|
|
; Bitwise OR applied directly to the <2 x i64> arguments; both targets are
; expected to emit a single orps.
define <2 x i64> @test_mm_or_si128(<2 x i64> %a0, <2 x i64> %a1) nounwind {
; X32-LABEL: test_mm_or_si128:
; X32: # BB#0:
; X32-NEXT: orps %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_or_si128:
; X64: # BB#0:
; X64-NEXT: orps %xmm1, %xmm0
; X64-NEXT: retq
  %or.bits = or <2 x i64> %a0, %a1
  ret <2 x i64> %or.bits
}
|
|
|
|
|
|
|
|
; Views both inputs as <8 x i16> and packs them to <16 x i8> through the
; packsswb.128 intrinsic; both targets are expected to emit a single packsswb.
; Marked nounwind for consistency with the other tests in this file; the only
; call is to an intrinsic declared nounwind, so the expected assembly is
; unaffected.
define <2 x i64> @test_mm_packs_epi16(<2 x i64> %a0, <2 x i64> %a1) nounwind {
; X32-LABEL: test_mm_packs_epi16:
; X32: # BB#0:
; X32-NEXT: packsswb %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_packs_epi16:
; X64: # BB#0:
; X64-NEXT: packsswb %xmm1, %xmm0
; X64-NEXT: retq
  %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
  %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
  %res = call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> %arg0, <8 x i16> %arg1)
  %bc = bitcast <16 x i8> %res to <2 x i64>
  ret <2 x i64> %bc
}
|
|
|
|
declare <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16>, <8 x i16>) nounwind readnone
|
|
|
|
|
|
|
|
; Views both inputs as <4 x i32> and packs them to <8 x i16> through the
; packssdw.128 intrinsic; both targets are expected to emit a single packssdw.
; Marked nounwind for consistency with the other tests in this file; the only
; call is to an intrinsic declared nounwind, so the expected assembly is
; unaffected.
define <2 x i64> @test_mm_packs_epi32(<2 x i64> %a0, <2 x i64> %a1) nounwind {
; X32-LABEL: test_mm_packs_epi32:
; X32: # BB#0:
; X32-NEXT: packssdw %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_packs_epi32:
; X64: # BB#0:
; X64-NEXT: packssdw %xmm1, %xmm0
; X64-NEXT: retq
  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
  %res = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %arg0, <4 x i32> %arg1)
  %bc = bitcast <8 x i16> %res to <2 x i64>
  ret <2 x i64> %bc
}
|
|
|
|
declare <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32>, <4 x i32>) nounwind readnone
|
|
|
|
|
|
|
|
; Views both inputs as <8 x i16> and packs them to <16 x i8> through the
; packuswb.128 intrinsic; both targets are expected to emit a single packuswb.
; Marked nounwind for consistency with the other tests in this file; the only
; call is to an intrinsic declared nounwind, so the expected assembly is
; unaffected.
define <2 x i64> @test_mm_packus_epi16(<2 x i64> %a0, <2 x i64> %a1) nounwind {
; X32-LABEL: test_mm_packus_epi16:
; X32: # BB#0:
; X32-NEXT: packuswb %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_packus_epi16:
; X64: # BB#0:
; X64-NEXT: packuswb %xmm1, %xmm0
; X64-NEXT: retq
  %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
  %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
  %res = call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> %arg0, <8 x i16> %arg1)
  %bc = bitcast <16 x i8> %res to <2 x i64>
  ret <2 x i64> %bc
}
|
|
|
|
declare <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16>, <8 x i16>) nounwind readnone
|
|
|
|
|
|
|
|
; Calls the llvm.x86.sse2.pause intrinsic with no operands; both targets are
; expected to emit exactly one pause before returning.
define void @test_mm_pause() nounwind {
; X32-LABEL: test_mm_pause:
; X32: # BB#0:
; X32-NEXT: pause
; X32-NEXT: retl
;
; X64-LABEL: test_mm_pause:
; X64: # BB#0:
; X64-NEXT: pause
; X64-NEXT: retq
  call void @llvm.x86.sse2.pause()
  ret void
}
|
|
|
|
declare void @llvm.x86.sse2.pause() nounwind readnone
|
|
|
|
|
|
|
|
; Views both inputs as <16 x i8> and passes them to the psad.bw intrinsic,
; whose <2 x i64> result is returned directly. Both targets are expected to
; emit a single psadbw.
define <2 x i64> @test_mm_sad_epu8(<2 x i64> %a0, <2 x i64> %a1) nounwind {
; X32-LABEL: test_mm_sad_epu8:
; X32: # BB#0:
; X32-NEXT: psadbw %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_sad_epu8:
; X64: # BB#0:
; X64-NEXT: psadbw %xmm1, %xmm0
; X64-NEXT: retq
  %a0.bytes = bitcast <2 x i64> %a0 to <16 x i8>
  %a1.bytes = bitcast <2 x i64> %a1 to <16 x i8>
  %sad = call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %a0.bytes, <16 x i8> %a1.bytes)
  ret <2 x i64> %sad
}
|
|
|
|
declare <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>, <16 x i8>) nounwind readnone
|
|
|
|
|
2016-05-19 18:58:54 +08:00
|
|
|
define <2 x i64> @test_mm_set_epi8(i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4, i8 %a5, i8 %a6, i8 %a7, i8 %a8, i8 %a9, i8 %a10, i8 %a11, i8 %a12, i8 %a13, i8 %a14, i8 %a15) nounwind {
|
|
|
|
; X32-LABEL: test_mm_set_epi8:
|
|
|
|
; X32: # BB#0:
|
|
|
|
; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
|
|
|
|
; X32-NEXT: movd %eax, %xmm0
|
|
|
|
; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
|
|
|
|
; X32-NEXT: movd %eax, %xmm1
|
|
|
|
; X32-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
|
|
|
|
; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
|
|
|
|
; X32-NEXT: movd %eax, %xmm0
|
|
|
|
; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
|
|
|
|
; X32-NEXT: movd %eax, %xmm2
|
|
|
|
; X32-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
|
[X86][SSE] Change BUILD_VECTOR interleaving ordering to improve coalescing/combine opportunities
We currently generate BUILD_VECTOR as a tree of UNPCKL shuffles of the same type:
e.g. for v4f32:
Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
: unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
Step 2: unpcklps X, Y ==> <3, 2, 1, 0>
The issue is because we are not placing sequential vector elements together early enough, we fail to recognise many combinable patterns - consecutive scalar loads, extractions etc.
Instead, this patch unpacks progressively larger sequential vector elements together:
e.g. for v4f32:
Step 1: unpcklps 0, 2 ==> X: <?, ?, 1, 0>
: unpcklps 1, 3 ==> Y: <?, ?, 3, 2>
Step 2: unpcklpd X, Y ==> <3, 2, 1, 0>
This does mean that we are creating UNPCKL shuffle of different value types, but the relevant combines that benefit from this are quite capable of handling the additional BITCASTs that are now included in the shuffle tree.
Differential Revision: https://reviews.llvm.org/D33864
llvm-svn: 304688
2017-06-05 04:12:04 +08:00
|
|
|
; X32-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
|
2016-05-19 18:58:54 +08:00
|
|
|
; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
|
|
|
|
; X32-NEXT: movd %eax, %xmm0
|
|
|
|
; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
|
|
|
|
; X32-NEXT: movd %eax, %xmm3
|
|
|
|
; X32-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
|
|
|
|
; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
|
|
|
|
; X32-NEXT: movd %eax, %xmm0
|
|
|
|
; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
|
|
|
|
; X32-NEXT: movd %eax, %xmm1
|
|
|
|
; X32-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
|
[X86][SSE] Change BUILD_VECTOR interleaving ordering to improve coalescing/combine opportunities
We currently generate BUILD_VECTOR as a tree of UNPCKL shuffles of the same type:
e.g. for v4f32:
Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
: unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
Step 2: unpcklps X, Y ==> <3, 2, 1, 0>
The issue is because we are not placing sequential vector elements together early enough, we fail to recognise many combinable patterns - consecutive scalar loads, extractions etc.
Instead, this patch unpacks progressively larger sequential vector elements together:
e.g. for v4f32:
Step 1: unpcklps 0, 2 ==> X: <?, ?, 1, 0>
: unpcklps 1, 3 ==> Y: <?, ?, 3, 2>
Step 2: unpcklpd X, Y ==> <3, 2, 1, 0>
This does mean that we are creating UNPCKL shuffle of different value types, but the relevant combines that benefit from this are quite capable of handling the additional BITCASTs that are now included in the shuffle tree.
Differential Revision: https://reviews.llvm.org/D33864
llvm-svn: 304688
2017-06-05 04:12:04 +08:00
|
|
|
; X32-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
|
|
|
|
; X32-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
|
2016-05-19 18:58:54 +08:00
|
|
|
; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
|
|
|
|
; X32-NEXT: movd %eax, %xmm0
|
|
|
|
; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
|
|
|
|
; X32-NEXT: movd %eax, %xmm2
|
|
|
|
; X32-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
|
|
|
|
; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
|
|
|
|
; X32-NEXT: movd %eax, %xmm0
|
|
|
|
; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
|
|
|
|
; X32-NEXT: movd %eax, %xmm3
|
|
|
|
; X32-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
|
[X86][SSE] Change BUILD_VECTOR interleaving ordering to improve coalescing/combine opportunities
We currently generate BUILD_VECTOR as a tree of UNPCKL shuffles of the same type:
e.g. for v4f32:
Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
: unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
Step 2: unpcklps X, Y ==> <3, 2, 1, 0>
The issue is because we are not placing sequential vector elements together early enough, we fail to recognise many combinable patterns - consecutive scalar loads, extractions etc.
Instead, this patch unpacks progressively larger sequential vector elements together:
e.g. for v4f32:
Step 1: unpcklps 0, 2 ==> X: <?, ?, 1, 0>
: unpcklps 1, 3 ==> Y: <?, ?, 3, 2>
Step 2: unpcklpd X, Y ==> <3, 2, 1, 0>
This does mean that we are creating UNPCKL shuffle of different value types, but the relevant combines that benefit from this are quite capable of handling the additional BITCASTs that are now included in the shuffle tree.
Differential Revision: https://reviews.llvm.org/D33864
llvm-svn: 304688
2017-06-05 04:12:04 +08:00
|
|
|
; X32-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
|
2016-05-19 18:58:54 +08:00
|
|
|
; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
|
|
|
|
; X32-NEXT: movd %eax, %xmm0
|
|
|
|
; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
|
|
|
|
; X32-NEXT: movd %eax, %xmm2
|
|
|
|
; X32-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
|
|
|
|
; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
|
|
|
|
; X32-NEXT: movd %eax, %xmm4
|
|
|
|
; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
|
|
|
|
; X32-NEXT: movd %eax, %xmm0
|
|
|
|
; X32-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
|
[X86][SSE] Change BUILD_VECTOR interleaving ordering to improve coalescing/combine opportunities
We currently generate BUILD_VECTOR as a tree of UNPCKL shuffles of the same type:
e.g. for v4f32:
Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
: unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
Step 2: unpcklps X, Y ==> <3, 2, 1, 0>
The issue is because we are not placing sequential vector elements together early enough, we fail to recognise many combinable patterns - consecutive scalar loads, extractions etc.
Instead, this patch unpacks progressively larger sequential vector elements together:
e.g. for v4f32:
Step 1: unpcklps 0, 2 ==> X: <?, ?, 1, 0>
: unpcklps 1, 3 ==> Y: <?, ?, 3, 2>
Step 2: unpcklpd X, Y ==> <3, 2, 1, 0>
This does mean that we are creating UNPCKL shuffle of different value types, but the relevant combines that benefit from this are quite capable of handling the additional BITCASTs that are now included in the shuffle tree.
Differential Revision: https://reviews.llvm.org/D33864
llvm-svn: 304688
2017-06-05 04:12:04 +08:00
|
|
|
; X32-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
|
|
|
|
; X32-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
|
|
|
|
; X32-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
|
2016-05-19 18:58:54 +08:00
|
|
|
; X32-NEXT: retl
|
|
|
|
;
|
|
|
|
; X64-LABEL: test_mm_set_epi8:
|
|
|
|
; X64: # BB#0:
|
|
|
|
; X64-NEXT: movzbl %dil, %eax
|
|
|
|
; X64-NEXT: movd %eax, %xmm0
|
[X86][SSE] Change BUILD_VECTOR interleaving ordering to improve coalescing/combine opportunities
We currently generate BUILD_VECTOR as a tree of UNPCKL shuffles of the same type:
e.g. for v4f32:
Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
: unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
Step 2: unpcklps X, Y ==> <3, 2, 1, 0>
The issue is because we are not placing sequential vector elements together early enough, we fail to recognise many combinable patterns - consecutive scalar loads, extractions etc.
Instead, this patch unpacks progressively larger sequential vector elements together:
e.g. for v4f32:
Step 1: unpcklps 0, 2 ==> X: <?, ?, 1, 0>
: unpcklps 1, 3 ==> Y: <?, ?, 3, 2>
Step 2: unpcklpd X, Y ==> <3, 2, 1, 0>
This does mean that we are creating UNPCKL shuffle of different value types, but the relevant combines that benefit from this are quite capable of handling the additional BITCASTs that are now included in the shuffle tree.
Differential Revision: https://reviews.llvm.org/D33864
llvm-svn: 304688
2017-06-05 04:12:04 +08:00
|
|
|
; X64-NEXT: movzbl %sil, %eax
|
2016-05-19 18:58:54 +08:00
|
|
|
; X64-NEXT: movd %eax, %xmm1
|
|
|
|
; X64-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
|
[X86][SSE] Change BUILD_VECTOR interleaving ordering to improve coalescing/combine opportunities
We currently generate BUILD_VECTOR as a tree of UNPCKL shuffles of the same type:
e.g. for v4f32:
Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
: unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
Step 2: unpcklps X, Y ==> <3, 2, 1, 0>
The issue is because we are not placing sequential vector elements together early enough, we fail to recognise many combinable patterns - consecutive scalar loads, extractions etc.
Instead, this patch unpacks progressively larger sequential vector elements together:
e.g. for v4f32:
Step 1: unpcklps 0, 2 ==> X: <?, ?, 1, 0>
: unpcklps 1, 3 ==> Y: <?, ?, 3, 2>
Step 2: unpcklpd X, Y ==> <3, 2, 1, 0>
This does mean that we are creating UNPCKL shuffle of different value types, but the relevant combines that benefit from this are quite capable of handling the additional BITCASTs that are now included in the shuffle tree.
Differential Revision: https://reviews.llvm.org/D33864
llvm-svn: 304688
2017-06-05 04:12:04 +08:00
|
|
|
; X64-NEXT: movzbl %dl, %eax
|
2016-05-19 18:58:54 +08:00
|
|
|
; X64-NEXT: movd %eax, %xmm0
|
[X86][SSE] Change BUILD_VECTOR interleaving ordering to improve coalescing/combine opportunities
We currently generate BUILD_VECTOR as a tree of UNPCKL shuffles of the same type:
e.g. for v4f32:
Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
: unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
Step 2: unpcklps X, Y ==> <3, 2, 1, 0>
The issue is because we are not placing sequential vector elements together early enough, we fail to recognise many combinable patterns - consecutive scalar loads, extractions etc.
Instead, this patch unpacks progressively larger sequential vector elements together:
e.g. for v4f32:
Step 1: unpcklps 0, 2 ==> X: <?, ?, 1, 0>
: unpcklps 1, 3 ==> Y: <?, ?, 3, 2>
Step 2: unpcklpd X, Y ==> <3, 2, 1, 0>
This does mean that we are creating UNPCKL shuffle of different value types, but the relevant combines that benefit from this are quite capable of handling the additional BITCASTs that are now included in the shuffle tree.
Differential Revision: https://reviews.llvm.org/D33864
llvm-svn: 304688
2017-06-05 04:12:04 +08:00
|
|
|
; X64-NEXT: movzbl %cl, %eax
|
2016-05-19 18:58:54 +08:00
|
|
|
; X64-NEXT: movd %eax, %xmm2
|
|
|
|
; X64-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
|
[X86][SSE] Change BUILD_VECTOR interleaving ordering to improve coalescing/combine opportunities
We currently generate BUILD_VECTOR as a tree of UNPCKL shuffles of the same type:
e.g. for v4f32:
Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
: unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
Step 2: unpcklps X, Y ==> <3, 2, 1, 0>
The issue is because we are not placing sequential vector elements together early enough, we fail to recognise many combinable patterns - consecutive scalar loads, extractions etc.
Instead, this patch unpacks progressively larger sequential vector elements together:
e.g. for v4f32:
Step 1: unpcklps 0, 2 ==> X: <?, ?, 1, 0>
: unpcklps 1, 3 ==> Y: <?, ?, 3, 2>
Step 2: unpcklpd X, Y ==> <3, 2, 1, 0>
This does mean that we are creating UNPCKL shuffle of different value types, but the relevant combines that benefit from this are quite capable of handling the additional BITCASTs that are now included in the shuffle tree.
Differential Revision: https://reviews.llvm.org/D33864
llvm-svn: 304688
2017-06-05 04:12:04 +08:00
|
|
|
; X64-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
|
|
|
|
; X64-NEXT: movzbl %r8b, %eax
|
2016-05-19 18:58:54 +08:00
|
|
|
; X64-NEXT: movd %eax, %xmm0
|
[X86][SSE] Change BUILD_VECTOR interleaving ordering to improve coalescing/combine opportunities
We currently generate BUILD_VECTOR as a tree of UNPCKL shuffles of the same type:
e.g. for v4f32:
Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
: unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
Step 2: unpcklps X, Y ==> <3, 2, 1, 0>
The issue is because we are not placing sequential vector elements together early enough, we fail to recognise many combinable patterns - consecutive scalar loads, extractions etc.
Instead, this patch unpacks progressively larger sequential vector elements together:
e.g. for v4f32:
Step 1: unpcklps 0, 2 ==> X: <?, ?, 1, 0>
: unpcklps 1, 3 ==> Y: <?, ?, 3, 2>
Step 2: unpcklpd X, Y ==> <3, 2, 1, 0>
This does mean that we are creating UNPCKL shuffle of different value types, but the relevant combines that benefit from this are quite capable of handling the additional BITCASTs that are now included in the shuffle tree.
Differential Revision: https://reviews.llvm.org/D33864
llvm-svn: 304688
2017-06-05 04:12:04 +08:00
|
|
|
; X64-NEXT: movzbl %r9b, %eax
|
2016-05-19 18:58:54 +08:00
|
|
|
; X64-NEXT: movd %eax, %xmm3
|
|
|
|
; X64-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
|
|
|
|
; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
|
|
|
|
; X64-NEXT: movd %eax, %xmm0
|
|
|
|
; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
|
|
|
|
; X64-NEXT: movd %eax, %xmm1
|
|
|
|
; X64-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
|
[X86][SSE] Change BUILD_VECTOR interleaving ordering to improve coalescing/combine opportunities
We currently generate BUILD_VECTOR as a tree of UNPCKL shuffles of the same type:
e.g. for v4f32:
Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
: unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
Step 2: unpcklps X, Y ==> <3, 2, 1, 0>
The issue is because we are not placing sequential vector elements together early enough, we fail to recognise many combinable patterns - consecutive scalar loads, extractions etc.
Instead, this patch unpacks progressively larger sequential vector elements together:
e.g. for v4f32:
Step 1: unpcklps 0, 2 ==> X: <?, ?, 1, 0>
: unpcklps 1, 3 ==> Y: <?, ?, 3, 2>
Step 2: unpcklpd X, Y ==> <3, 2, 1, 0>
This does mean that we are creating UNPCKL shuffle of different value types, but the relevant combines that benefit from this are quite capable of handling the additional BITCASTs that are now included in the shuffle tree.
Differential Revision: https://reviews.llvm.org/D33864
llvm-svn: 304688
2017-06-05 04:12:04 +08:00
|
|
|
; X64-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
|
|
|
|
; X64-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
|
|
|
|
; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
|
2016-05-19 18:58:54 +08:00
|
|
|
; X64-NEXT: movd %eax, %xmm0
|
|
|
|
; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
|
|
|
|
; X64-NEXT: movd %eax, %xmm2
|
|
|
|
; X64-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
|
[X86][SSE] Change BUILD_VECTOR interleaving ordering to improve coalescing/combine opportunities
We currently generate BUILD_VECTOR as a tree of UNPCKL shuffles of the same type:
e.g. for v4f32:
Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
: unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
Step 2: unpcklps X, Y ==> <3, 2, 1, 0>
The issue is because we are not placing sequential vector elements together early enough, we fail to recognise many combinable patterns - consecutive scalar loads, extractions etc.
Instead, this patch unpacks progressively larger sequential vector elements together:
e.g. for v4f32:
Step 1: unpcklps 0, 1 ==> X: <?, ?, 1, 0>
: unpcklps 2, 3 ==> Y: <?, ?, 3, 2>
Step 2: unpcklpd X, Y ==> <3, 2, 1, 0>
This does mean that we are creating UNPCKL shuffles of different value types, but the relevant combines that benefit from this are quite capable of handling the additional BITCASTs that are now included in the shuffle tree.
Differential Revision: https://reviews.llvm.org/D33864
llvm-svn: 304688
2017-06-05 04:12:04 +08:00
|
|
|
; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
|
2016-05-19 18:58:54 +08:00
|
|
|
; X64-NEXT: movd %eax, %xmm0
|
|
|
|
; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
|
|
|
|
; X64-NEXT: movd %eax, %xmm3
|
|
|
|
; X64-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
|
[X86][SSE] Change BUILD_VECTOR interleaving ordering to improve coalescing/combine opportunities
We currently generate BUILD_VECTOR as a tree of UNPCKL shuffles of the same type:
e.g. for v4f32:
Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
: unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
Step 2: unpcklps X, Y ==> <3, 2, 1, 0>
The issue is that, because we are not placing sequential vector elements together early enough, we fail to recognise many combinable patterns - consecutive scalar loads, extractions etc.
Instead, this patch unpacks progressively larger sequential vector elements together:
e.g. for v4f32:
Step 1: unpcklps 0, 1 ==> X: <?, ?, 1, 0>
: unpcklps 2, 3 ==> Y: <?, ?, 3, 2>
Step 2: unpcklpd X, Y ==> <3, 2, 1, 0>
This does mean that we are creating UNPCKL shuffles of different value types, but the relevant combines that benefit from this are quite capable of handling the additional BITCASTs that are now included in the shuffle tree.
Differential Revision: https://reviews.llvm.org/D33864
llvm-svn: 304688
2017-06-05 04:12:04 +08:00
|
|
|
; X64-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
|
|
|
|
; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
|
2016-05-19 18:58:54 +08:00
|
|
|
; X64-NEXT: movd %eax, %xmm0
|
|
|
|
; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
|
|
|
|
; X64-NEXT: movd %eax, %xmm2
|
|
|
|
; X64-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
|
|
|
|
; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
|
|
|
|
; X64-NEXT: movd %eax, %xmm4
|
|
|
|
; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
|
|
|
|
; X64-NEXT: movd %eax, %xmm0
|
|
|
|
; X64-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
|
[X86][SSE] Change BUILD_VECTOR interleaving ordering to improve coalescing/combine opportunities
We currently generate BUILD_VECTOR as a tree of UNPCKL shuffles of the same type:
e.g. for v4f32:
Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
: unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
Step 2: unpcklps X, Y ==> <3, 2, 1, 0>
The issue is that, because we are not placing sequential vector elements together early enough, we fail to recognise many combinable patterns - consecutive scalar loads, extractions etc.
Instead, this patch unpacks progressively larger sequential vector elements together:
e.g. for v4f32:
Step 1: unpcklps 0, 1 ==> X: <?, ?, 1, 0>
: unpcklps 2, 3 ==> Y: <?, ?, 3, 2>
Step 2: unpcklpd X, Y ==> <3, 2, 1, 0>
This does mean that we are creating UNPCKL shuffles of different value types, but the relevant combines that benefit from this are quite capable of handling the additional BITCASTs that are now included in the shuffle tree.
Differential Revision: https://reviews.llvm.org/D33864
llvm-svn: 304688
2017-06-05 04:12:04 +08:00
|
|
|
; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
|
|
|
|
; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
|
|
|
|
; X64-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
|
2016-05-19 18:58:54 +08:00
|
|
|
; X64-NEXT: retq
|
|
|
|
%res0 = insertelement <16 x i8> undef, i8 %a15, i32 0
|
|
|
|
%res1 = insertelement <16 x i8> %res0, i8 %a14, i32 1
|
|
|
|
%res2 = insertelement <16 x i8> %res1, i8 %a13, i32 2
|
|
|
|
%res3 = insertelement <16 x i8> %res2, i8 %a12, i32 3
|
|
|
|
%res4 = insertelement <16 x i8> %res3, i8 %a11, i32 4
|
|
|
|
%res5 = insertelement <16 x i8> %res4, i8 %a10, i32 5
|
|
|
|
%res6 = insertelement <16 x i8> %res5, i8 %a9 , i32 6
|
|
|
|
%res7 = insertelement <16 x i8> %res6, i8 %a8 , i32 7
|
|
|
|
%res8 = insertelement <16 x i8> %res7, i8 %a7 , i32 8
|
|
|
|
%res9 = insertelement <16 x i8> %res8, i8 %a6 , i32 9
|
|
|
|
%res10 = insertelement <16 x i8> %res9, i8 %a5 , i32 10
|
|
|
|
%res11 = insertelement <16 x i8> %res10, i8 %a4 , i32 11
|
|
|
|
%res12 = insertelement <16 x i8> %res11, i8 %a3 , i32 12
|
|
|
|
%res13 = insertelement <16 x i8> %res12, i8 %a2 , i32 13
|
|
|
|
%res14 = insertelement <16 x i8> %res13, i8 %a1 , i32 14
|
|
|
|
%res15 = insertelement <16 x i8> %res14, i8 %a0 , i32 15
|
|
|
|
%res = bitcast <16 x i8> %res15 to <2 x i64>
|
|
|
|
ret <2 x i64> %res
|
|
|
|
}
|
|
|
|
|
|
|
|
; Builds an <8 x i16> by inserting the eight i16 arguments in reverse order
; (%a7 into lane 0 ... %a0 into lane 7) and returns it bitcast to <2 x i64>.
; Presumably the IR form of _mm_set_epi16 (see the NOTE at the top of the file).
; Expected codegen: one movd per element, then a punpcklwd / punpckldq /
; punpcklqdq tree; in the x86-64 run two of the arguments are loaded from the stack.
define <2 x i64> @test_mm_set_epi16(i16 %a0, i16 %a1, i16 %a2, i16 %a3, i16 %a4, i16 %a5, i16 %a6, i16 %a7) nounwind {
|
|
|
|
; X32-LABEL: test_mm_set_epi16:
|
|
|
|
; X32: # BB#0:
|
|
|
|
; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
|
|
|
|
; X32-NEXT: movd %eax, %xmm1
|
|
|
|
; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
|
|
|
|
; X32-NEXT: movd %eax, %xmm2
|
|
|
|
; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
|
|
|
|
; X32-NEXT: movd %eax, %xmm3
|
|
|
|
; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
|
|
|
|
; X32-NEXT: movd %eax, %xmm4
|
|
|
|
; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
|
|
|
|
; X32-NEXT: movd %eax, %xmm5
|
|
|
|
; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
|
|
|
|
; X32-NEXT: movd %eax, %xmm6
|
|
|
|
; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
|
|
|
|
; X32-NEXT: movd %eax, %xmm7
|
|
|
|
; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
|
|
|
|
; X32-NEXT: movd %eax, %xmm0
|
|
|
|
; X32-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
|
|
|
|
; X32-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
|
[X86][SSE] Change BUILD_VECTOR interleaving ordering to improve coalescing/combine opportunities
We currently generate BUILD_VECTOR as a tree of UNPCKL shuffles of the same type:
e.g. for v4f32:
Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
: unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
Step 2: unpcklps X, Y ==> <3, 2, 1, 0>
The issue is because we are not placing sequential vector elements together early enough, we fail to recognise many combinable patterns - consecutive scalar loads, extractions etc.
Instead, this patch unpacks progressively larger sequential vector elements together:
e.g. for v4f32:
Step 1: unpcklps 0, 2 ==> X: <?, ?, 1, 0>
: unpcklps 1, 3 ==> Y: <?, ?, 3, 2>
Step 2: unpcklpd X, Y ==> <3, 2, 1, 0>
This does mean that we are creating UNPCKL shuffle of different value types, but the relevant combines that benefit from this are quite capable of handling the additional BITCASTs that are now included in the shuffle tree.
Differential Revision: https://reviews.llvm.org/D33864
llvm-svn: 304688
2017-06-05 04:12:04 +08:00
|
|
|
; X32-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
|
2016-05-19 18:58:54 +08:00
|
|
|
; X32-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
|
|
|
|
; X32-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3]
|
[X86][SSE] Change BUILD_VECTOR interleaving ordering to improve coalescing/combine opportunities
We currently generate BUILD_VECTOR as a tree of UNPCKL shuffles of the same type:
e.g. for v4f32:
Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
: unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
Step 2: unpcklps X, Y ==> <3, 2, 1, 0>
The issue is because we are not placing sequential vector elements together early enough, we fail to recognise many combinable patterns - consecutive scalar loads, extractions etc.
Instead, this patch unpacks progressively larger sequential vector elements together:
e.g. for v4f32:
Step 1: unpcklps 0, 2 ==> X: <?, ?, 1, 0>
: unpcklps 1, 3 ==> Y: <?, ?, 3, 2>
Step 2: unpcklpd X, Y ==> <3, 2, 1, 0>
This does mean that we are creating UNPCKL shuffle of different value types, but the relevant combines that benefit from this are quite capable of handling the additional BITCASTs that are now included in the shuffle tree.
Differential Revision: https://reviews.llvm.org/D33864
llvm-svn: 304688
2017-06-05 04:12:04 +08:00
|
|
|
; X32-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1]
|
|
|
|
; X32-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0]
|
2016-05-19 18:58:54 +08:00
|
|
|
; X32-NEXT: retl
|
|
|
|
;
|
|
|
|
; X64-LABEL: test_mm_set_epi16:
|
|
|
|
; X64: # BB#0:
|
|
|
|
; X64-NEXT: movw {{[0-9]+}}(%rsp), %r10w
|
|
|
|
; X64-NEXT: movw {{[0-9]+}}(%rsp), %ax
|
|
|
|
; X64-NEXT: movd %edi, %xmm0
|
[X86][SSE] Change BUILD_VECTOR interleaving ordering to improve coalescing/combine opportunities
We currently generate BUILD_VECTOR as a tree of UNPCKL shuffles of the same type:
e.g. for v4f32:
Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
: unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
Step 2: unpcklps X, Y ==> <3, 2, 1, 0>
The issue is because we are not placing sequential vector elements together early enough, we fail to recognise many combinable patterns - consecutive scalar loads, extractions etc.
Instead, this patch unpacks progressively larger sequential vector elements together:
e.g. for v4f32:
Step 1: unpcklps 0, 2 ==> X: <?, ?, 1, 0>
: unpcklps 1, 3 ==> Y: <?, ?, 3, 2>
Step 2: unpcklpd X, Y ==> <3, 2, 1, 0>
This does mean that we are creating UNPCKL shuffle of different value types, but the relevant combines that benefit from this are quite capable of handling the additional BITCASTs that are now included in the shuffle tree.
Differential Revision: https://reviews.llvm.org/D33864
llvm-svn: 304688
2017-06-05 04:12:04 +08:00
|
|
|
; X64-NEXT: movd %esi, %xmm1
|
2016-05-19 18:58:54 +08:00
|
|
|
; X64-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
|
|
|
|
; X64-NEXT: movd %edx, %xmm0
|
[X86][SSE] Change BUILD_VECTOR interleaving ordering to improve coalescing/combine opportunities
We currently generate BUILD_VECTOR as a tree of UNPCKL shuffles of the same type:
e.g. for v4f32:
Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
: unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
Step 2: unpcklps X, Y ==> <3, 2, 1, 0>
The issue is because we are not placing sequential vector elements together early enough, we fail to recognise many combinable patterns - consecutive scalar loads, extractions etc.
Instead, this patch unpacks progressively larger sequential vector elements together:
e.g. for v4f32:
Step 1: unpcklps 0, 2 ==> X: <?, ?, 1, 0>
: unpcklps 1, 3 ==> Y: <?, ?, 3, 2>
Step 2: unpcklpd X, Y ==> <3, 2, 1, 0>
This does mean that we are creating UNPCKL shuffle of different value types, but the relevant combines that benefit from this are quite capable of handling the additional BITCASTs that are now included in the shuffle tree.
Differential Revision: https://reviews.llvm.org/D33864
llvm-svn: 304688
2017-06-05 04:12:04 +08:00
|
|
|
; X64-NEXT: movd %ecx, %xmm2
|
2016-05-19 18:58:54 +08:00
|
|
|
; X64-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
|
[X86][SSE] Change BUILD_VECTOR interleaving ordering to improve coalescing/combine opportunities
We currently generate BUILD_VECTOR as a tree of UNPCKL shuffles of the same type:
e.g. for v4f32:
Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
: unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
Step 2: unpcklps X, Y ==> <3, 2, 1, 0>
The issue is because we are not placing sequential vector elements together early enough, we fail to recognise many combinable patterns - consecutive scalar loads, extractions etc.
Instead, this patch unpacks progressively larger sequential vector elements together:
e.g. for v4f32:
Step 1: unpcklps 0, 2 ==> X: <?, ?, 1, 0>
: unpcklps 1, 3 ==> Y: <?, ?, 3, 2>
Step 2: unpcklpd X, Y ==> <3, 2, 1, 0>
This does mean that we are creating UNPCKL shuffle of different value types, but the relevant combines that benefit from this are quite capable of handling the additional BITCASTs that are now included in the shuffle tree.
Differential Revision: https://reviews.llvm.org/D33864
llvm-svn: 304688
2017-06-05 04:12:04 +08:00
|
|
|
; X64-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
|
|
|
|
; X64-NEXT: movd %r8d, %xmm0
|
2016-05-19 18:58:54 +08:00
|
|
|
; X64-NEXT: movd %r9d, %xmm1
|
|
|
|
; X64-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
|
[X86][SSE] Change BUILD_VECTOR interleaving ordering to improve coalescing/combine opportunities
We currently generate BUILD_VECTOR as a tree of UNPCKL shuffles of the same type:
e.g. for v4f32:
Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
: unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
Step 2: unpcklps X, Y ==> <3, 2, 1, 0>
The issue is because we are not placing sequential vector elements together early enough, we fail to recognise many combinable patterns - consecutive scalar loads, extractions etc.
Instead, this patch unpacks progressively larger sequential vector elements together:
e.g. for v4f32:
Step 1: unpcklps 0, 2 ==> X: <?, ?, 1, 0>
: unpcklps 1, 3 ==> Y: <?, ?, 3, 2>
Step 2: unpcklpd X, Y ==> <3, 2, 1, 0>
This does mean that we are creating UNPCKL shuffle of different value types, but the relevant combines that benefit from this are quite capable of handling the additional BITCASTs that are now included in the shuffle tree.
Differential Revision: https://reviews.llvm.org/D33864
llvm-svn: 304688
2017-06-05 04:12:04 +08:00
|
|
|
; X64-NEXT: movd %eax, %xmm3
|
2016-05-19 18:58:54 +08:00
|
|
|
; X64-NEXT: movd %r10d, %xmm0
|
|
|
|
; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
|
[X86][SSE] Change BUILD_VECTOR interleaving ordering to improve coalescing/combine opportunities
We currently generate BUILD_VECTOR as a tree of UNPCKL shuffles of the same type:
e.g. for v4f32:
Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
: unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
Step 2: unpcklps X, Y ==> <3, 2, 1, 0>
The issue is because we are not placing sequential vector elements together early enough, we fail to recognise many combinable patterns - consecutive scalar loads, extractions etc.
Instead, this patch unpacks progressively larger sequential vector elements together:
e.g. for v4f32:
Step 1: unpcklps 0, 2 ==> X: <?, ?, 1, 0>
: unpcklps 1, 3 ==> Y: <?, ?, 3, 2>
Step 2: unpcklpd X, Y ==> <3, 2, 1, 0>
This does mean that we are creating UNPCKL shuffle of different value types, but the relevant combines that benefit from this are quite capable of handling the additional BITCASTs that are now included in the shuffle tree.
Differential Revision: https://reviews.llvm.org/D33864
llvm-svn: 304688
2017-06-05 04:12:04 +08:00
|
|
|
; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
|
|
|
|
; X64-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
|
2016-05-19 18:58:54 +08:00
|
|
|
; X64-NEXT: retq
|
|
|
|
%res0 = insertelement <8 x i16> undef, i16 %a7, i32 0
|
|
|
|
%res1 = insertelement <8 x i16> %res0, i16 %a6, i32 1
|
|
|
|
%res2 = insertelement <8 x i16> %res1, i16 %a5, i32 2
|
|
|
|
%res3 = insertelement <8 x i16> %res2, i16 %a4, i32 3
|
|
|
|
%res4 = insertelement <8 x i16> %res3, i16 %a3, i32 4
|
|
|
|
%res5 = insertelement <8 x i16> %res4, i16 %a2, i32 5
|
|
|
|
%res6 = insertelement <8 x i16> %res5, i16 %a1, i32 6
|
|
|
|
%res7 = insertelement <8 x i16> %res6, i16 %a0, i32 7
|
|
|
|
%res = bitcast <8 x i16> %res7 to <2 x i64>
|
|
|
|
ret <2 x i64> %res
|
|
|
|
}
|
|
|
|
|
|
|
|
; Builds a <4 x i32> by inserting the four i32 arguments in reverse order
; (%a3 into lane 0 ... %a0 into lane 3) and returns it bitcast to <2 x i64>.
; Presumably the IR form of _mm_set_epi32 (see the NOTE at the top of the file).
; Expected codegen: four movd (from memory in the i386 run, from registers in
; the x86-64 run), two punpckldq to pair the lanes, then one punpcklqdq.
define <2 x i64> @test_mm_set_epi32(i32 %a0, i32 %a1, i32 %a2, i32 %a3) nounwind {
|
|
|
|
; X32-LABEL: test_mm_set_epi32:
|
|
|
|
; X32: # BB#0:
|
|
|
|
; X32-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
|
|
|
|
; X32-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
|
|
|
|
; X32-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
|
|
|
|
; X32-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
|
|
|
|
; X32-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
|
|
|
|
; X32-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
|
[X86][SSE] Change BUILD_VECTOR interleaving ordering to improve coalescing/combine opportunities
We currently generate BUILD_VECTOR as a tree of UNPCKL shuffles of the same type:
e.g. for v4f32:
Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
: unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
Step 2: unpcklps X, Y ==> <3, 2, 1, 0>
The issue is because we are not placing sequential vector elements together early enough, we fail to recognise many combinable patterns - consecutive scalar loads, extractions etc.
Instead, this patch unpacks progressively larger sequential vector elements together:
e.g. for v4f32:
Step 1: unpcklps 0, 2 ==> X: <?, ?, 1, 0>
: unpcklps 1, 3 ==> Y: <?, ?, 3, 2>
Step 2: unpcklpd X, Y ==> <3, 2, 1, 0>
This does mean that we are creating UNPCKL shuffle of different value types, but the relevant combines that benefit from this are quite capable of handling the additional BITCASTs that are now included in the shuffle tree.
Differential Revision: https://reviews.llvm.org/D33864
llvm-svn: 304688
2017-06-05 04:12:04 +08:00
|
|
|
; X32-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
|
2016-05-19 18:58:54 +08:00
|
|
|
; X32-NEXT: retl
|
|
|
|
;
|
|
|
|
; X64-LABEL: test_mm_set_epi32:
|
|
|
|
; X64: # BB#0:
|
|
|
|
; X64-NEXT: movd %edi, %xmm0
|
[X86][SSE] Change BUILD_VECTOR interleaving ordering to improve coalescing/combine opportunities
We currently generate BUILD_VECTOR as a tree of UNPCKL shuffles of the same type:
e.g. for v4f32:
Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
: unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
Step 2: unpcklps X, Y ==> <3, 2, 1, 0>
The issue is because we are not placing sequential vector elements together early enough, we fail to recognise many combinable patterns - consecutive scalar loads, extractions etc.
Instead, this patch unpacks progressively larger sequential vector elements together:
e.g. for v4f32:
Step 1: unpcklps 0, 2 ==> X: <?, ?, 1, 0>
: unpcklps 1, 3 ==> Y: <?, ?, 3, 2>
Step 2: unpcklpd X, Y ==> <3, 2, 1, 0>
This does mean that we are creating UNPCKL shuffle of different value types, but the relevant combines that benefit from this are quite capable of handling the additional BITCASTs that are now included in the shuffle tree.
Differential Revision: https://reviews.llvm.org/D33864
llvm-svn: 304688
2017-06-05 04:12:04 +08:00
|
|
|
; X64-NEXT: movd %esi, %xmm1
|
2016-05-19 18:58:54 +08:00
|
|
|
; X64-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
|
[X86][SSE] Change BUILD_VECTOR interleaving ordering to improve coalescing/combine opportunities
We currently generate BUILD_VECTOR as a tree of UNPCKL shuffles of the same type:
e.g. for v4f32:
Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
: unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
Step 2: unpcklps X, Y ==> <3, 2, 1, 0>
The issue is because we are not placing sequential vector elements together early enough, we fail to recognise many combinable patterns - consecutive scalar loads, extractions etc.
Instead, this patch unpacks progressively larger sequential vector elements together:
e.g. for v4f32:
Step 1: unpcklps 0, 2 ==> X: <?, ?, 1, 0>
: unpcklps 1, 3 ==> Y: <?, ?, 3, 2>
Step 2: unpcklpd X, Y ==> <3, 2, 1, 0>
This does mean that we are creating UNPCKL shuffle of different value types, but the relevant combines that benefit from this are quite capable of handling the additional BITCASTs that are now included in the shuffle tree.
Differential Revision: https://reviews.llvm.org/D33864
llvm-svn: 304688
2017-06-05 04:12:04 +08:00
|
|
|
; X64-NEXT: movd %edx, %xmm2
|
2016-05-19 18:58:54 +08:00
|
|
|
; X64-NEXT: movd %ecx, %xmm0
|
|
|
|
; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
|
[X86][SSE] Change BUILD_VECTOR interleaving ordering to improve coalescing/combine opportunities
We currently generate BUILD_VECTOR as a tree of UNPCKL shuffles of the same type:
e.g. for v4f32:
Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
: unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
Step 2: unpcklps X, Y ==> <3, 2, 1, 0>
The issue is because we are not placing sequential vector elements together early enough, we fail to recognise many combinable patterns - consecutive scalar loads, extractions etc.
Instead, this patch unpacks progressively larger sequential vector elements together:
e.g. for v4f32:
Step 1: unpcklps 0, 2 ==> X: <?, ?, 1, 0>
: unpcklps 1, 3 ==> Y: <?, ?, 3, 2>
Step 2: unpcklpd X, Y ==> <3, 2, 1, 0>
This does mean that we are creating UNPCKL shuffle of different value types, but the relevant combines that benefit from this are quite capable of handling the additional BITCASTs that are now included in the shuffle tree.
Differential Revision: https://reviews.llvm.org/D33864
llvm-svn: 304688
2017-06-05 04:12:04 +08:00
|
|
|
; X64-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
|
2016-05-19 18:58:54 +08:00
|
|
|
; X64-NEXT: retq
|
|
|
|
%res0 = insertelement <4 x i32> undef, i32 %a3, i32 0
|
|
|
|
%res1 = insertelement <4 x i32> %res0, i32 %a2, i32 1
|
|
|
|
%res2 = insertelement <4 x i32> %res1, i32 %a1, i32 2
|
|
|
|
%res3 = insertelement <4 x i32> %res2, i32 %a0, i32 3
|
|
|
|
%res = bitcast <4 x i32> %res3 to <2 x i64>
|
|
|
|
ret <2 x i64> %res
|
|
|
|
}
|
|
|
|
|
|
|
|
; TODO test_mm_set_epi64
|
|
|
|
|
|
|
|
; Builds a <2 x i64> with %a1 in lane 0 and %a0 in lane 1 and returns it directly.
; Presumably the IR form of _mm_set_epi64x (see the NOTE at the top of the file).
; Expected codegen: the i386 run assembles each i64 from two 32-bit movd loads
; joined by punpckldq; the x86-64 run uses one movq per argument. Both finish
; with a single punpcklqdq.
define <2 x i64> @test_mm_set_epi64x(i64 %a0, i64 %a1) nounwind {
|
|
|
|
; X32-LABEL: test_mm_set_epi64x:
|
|
|
|
; X32: # BB#0:
|
|
|
|
; X32-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
|
|
|
|
; X32-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
|
[X86][SSE] Change BUILD_VECTOR interleaving ordering to improve coalescing/combine opportunities
We currently generate BUILD_VECTOR as a tree of UNPCKL shuffles of the same type:
e.g. for v4f32:
Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
: unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
Step 2: unpcklps X, Y ==> <3, 2, 1, 0>
The issue is because we are not placing sequential vector elements together early enough, we fail to recognise many combinable patterns - consecutive scalar loads, extractions etc.
Instead, this patch unpacks progressively larger sequential vector elements together:
e.g. for v4f32:
Step 1: unpcklps 0, 2 ==> X: <?, ?, 1, 0>
: unpcklps 1, 3 ==> Y: <?, ?, 3, 2>
Step 2: unpcklpd X, Y ==> <3, 2, 1, 0>
This does mean that we are creating UNPCKL shuffle of different value types, but the relevant combines that benefit from this are quite capable of handling the additional BITCASTs that are now included in the shuffle tree.
Differential Revision: https://reviews.llvm.org/D33864
llvm-svn: 304688
2017-06-05 04:12:04 +08:00
|
|
|
; X32-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
|
|
|
|
; X32-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
|
2016-05-19 18:58:54 +08:00
|
|
|
; X32-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
|
|
|
|
; X32-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
|
[X86][SSE] Change BUILD_VECTOR interleaving ordering to improve coalescing/combine opportunities
We currently generate BUILD_VECTOR as a tree of UNPCKL shuffles of the same type:
e.g. for v4f32:
Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
: unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
Step 2: unpcklps X, Y ==> <3, 2, 1, 0>
The issue is because we are not placing sequential vector elements together early enough, we fail to recognise many combinable patterns - consecutive scalar loads, extractions etc.
Instead, this patch unpacks progressively larger sequential vector elements together:
e.g. for v4f32:
Step 1: unpcklps 0, 2 ==> X: <?, ?, 1, 0>
: unpcklps 1, 3 ==> Y: <?, ?, 3, 2>
Step 2: unpcklpd X, Y ==> <3, 2, 1, 0>
This does mean that we are creating UNPCKL shuffle of different value types, but the relevant combines that benefit from this are quite capable of handling the additional BITCASTs that are now included in the shuffle tree.
Differential Revision: https://reviews.llvm.org/D33864
llvm-svn: 304688
2017-06-05 04:12:04 +08:00
|
|
|
; X32-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
|
2016-05-19 18:58:54 +08:00
|
|
|
; X32-NEXT: retl
|
|
|
|
;
|
|
|
|
; X64-LABEL: test_mm_set_epi64x:
|
|
|
|
; X64: # BB#0:
|
2017-04-26 15:08:44 +08:00
|
|
|
; X64-NEXT: movq %rdi, %xmm1
|
|
|
|
; X64-NEXT: movq %rsi, %xmm0
|
2016-05-19 18:58:54 +08:00
|
|
|
; X64-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
|
|
|
|
; X64-NEXT: retq
|
|
|
|
%res0 = insertelement <2 x i64> undef, i64 %a1, i32 0
|
|
|
|
%res1 = insertelement <2 x i64> %res0, i64 %a0, i32 1
|
|
|
|
ret <2 x i64> %res1
|
|
|
|
}
|
|
|
|
|
|
|
|
; Builds a <2 x double> with %a1 in lane 0 and %a0 in lane 1.
; Presumably the IR form of _mm_set_pd (see the NOTE at the top of the file).
; Expected codegen: a single unpcklpd; the i386 run first loads both doubles
; with movsd, the x86-64 run unpacks into xmm1 and copies the result to xmm0.
define <2 x double> @test_mm_set_pd(double %a0, double %a1) nounwind {
|
|
|
|
; X32-LABEL: test_mm_set_pd:
|
|
|
|
; X32: # BB#0:
|
|
|
|
; X32-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
|
|
|
|
; X32-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
|
|
|
|
; X32-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
|
|
|
|
; X32-NEXT: retl
|
|
|
|
;
|
|
|
|
; X64-LABEL: test_mm_set_pd:
|
|
|
|
; X64: # BB#0:
|
|
|
|
; X64-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0]
|
|
|
|
; X64-NEXT: movapd %xmm1, %xmm0
|
|
|
|
; X64-NEXT: retq
|
|
|
|
%res0 = insertelement <2 x double> undef, double %a1, i32 0
|
|
|
|
%res1 = insertelement <2 x double> %res0, double %a0, i32 1
|
|
|
|
ret <2 x double> %res1
|
|
|
|
}
|
|
|
|
|
2017-04-28 18:31:42 +08:00
|
|
|
; Splats %a0 into both lanes of a <2 x double>.
; The IR body is the same as @test_mm_set1_pd in this file.
; Expected codegen: movlhps duplicating lane 0 of xmm0 (after a movsd load
; of the argument in the i386 run).
define <2 x double> @test_mm_set_pd1(double %a0) nounwind {
|
|
|
|
; X32-LABEL: test_mm_set_pd1:
|
|
|
|
; X32: # BB#0:
|
|
|
|
; X32-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
|
|
|
|
; X32-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0,0]
|
|
|
|
; X32-NEXT: retl
|
|
|
|
;
|
|
|
|
; X64-LABEL: test_mm_set_pd1:
|
|
|
|
; X64: # BB#0:
|
|
|
|
; X64-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0,0]
|
|
|
|
; X64-NEXT: retq
|
|
|
|
%res0 = insertelement <2 x double> undef, double %a0, i32 0
|
|
|
|
%res1 = insertelement <2 x double> %res0, double %a0, i32 1
|
|
|
|
ret <2 x double> %res1
|
|
|
|
}
|
|
|
|
|
2016-05-19 18:58:54 +08:00
|
|
|
; Builds a <2 x double> with %a0 in lane 0 and the constant 0.0 in lane 1.
; Presumably the IR form of _mm_set_sd (see the NOTE at the top of the file).
; Expected codegen: a movq that keeps lane 0 of xmm0 and zeroes the upper lane;
; the i386 run additionally loads the argument from memory first.
define <2 x double> @test_mm_set_sd(double %a0) nounwind {
|
|
|
|
; X32-LABEL: test_mm_set_sd:
|
|
|
|
; X32: # BB#0:
|
2016-12-16 00:05:29 +08:00
|
|
|
; X32-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
|
2016-05-19 18:58:54 +08:00
|
|
|
; X32-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero
|
|
|
|
; X32-NEXT: retl
|
|
|
|
;
|
|
|
|
; X64-LABEL: test_mm_set_sd:
|
|
|
|
; X64: # BB#0:
|
|
|
|
; X64-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero
|
|
|
|
; X64-NEXT: retq
|
|
|
|
%res0 = insertelement <2 x double> undef, double %a0, i32 0
|
|
|
|
%res1 = insertelement <2 x double> %res0, double 0.0, i32 1
|
|
|
|
ret <2 x double> %res1
|
|
|
|
}
|
|
|
|
|
|
|
|
; Splats the i8 argument %a0 into all 16 lanes of a <16 x i8> (one
; insertelement per lane) and returns the vector bitcast to <2 x i64>.
; Presumably the IR form of _mm_set1_epi8 (see the NOTE at the top of the file).
; Expected codegen: zero-extend the byte, movd it into xmm0, then widen the
; splat with punpcklbw, pshuflw and pshufd.
define <2 x i64> @test_mm_set1_epi8(i8 %a0) nounwind {
|
|
|
|
; X32-LABEL: test_mm_set1_epi8:
|
|
|
|
; X32: # BB#0:
|
|
|
|
; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
|
|
|
|
; X32-NEXT: movd %eax, %xmm0
|
|
|
|
; X32-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
|
|
|
|
; X32-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
|
|
|
|
; X32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
|
|
|
|
; X32-NEXT: retl
|
|
|
|
;
|
|
|
|
; X64-LABEL: test_mm_set1_epi8:
|
|
|
|
; X64: # BB#0:
|
|
|
|
; X64-NEXT: movzbl %dil, %eax
|
|
|
|
; X64-NEXT: movd %eax, %xmm0
|
|
|
|
; X64-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
|
|
|
|
; X64-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
|
|
|
|
; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
|
|
|
|
; X64-NEXT: retq
|
|
|
|
%res0 = insertelement <16 x i8> undef, i8 %a0, i32 0
|
|
|
|
%res1 = insertelement <16 x i8> %res0, i8 %a0, i32 1
|
|
|
|
%res2 = insertelement <16 x i8> %res1, i8 %a0, i32 2
|
|
|
|
%res3 = insertelement <16 x i8> %res2, i8 %a0, i32 3
|
|
|
|
%res4 = insertelement <16 x i8> %res3, i8 %a0, i32 4
|
|
|
|
%res5 = insertelement <16 x i8> %res4, i8 %a0, i32 5
|
|
|
|
%res6 = insertelement <16 x i8> %res5, i8 %a0, i32 6
|
|
|
|
%res7 = insertelement <16 x i8> %res6, i8 %a0, i32 7
|
|
|
|
%res8 = insertelement <16 x i8> %res7, i8 %a0, i32 8
|
|
|
|
%res9 = insertelement <16 x i8> %res8, i8 %a0, i32 9
|
|
|
|
%res10 = insertelement <16 x i8> %res9, i8 %a0, i32 10
|
|
|
|
%res11 = insertelement <16 x i8> %res10, i8 %a0, i32 11
|
|
|
|
%res12 = insertelement <16 x i8> %res11, i8 %a0, i32 12
|
|
|
|
%res13 = insertelement <16 x i8> %res12, i8 %a0, i32 13
|
|
|
|
%res14 = insertelement <16 x i8> %res13, i8 %a0, i32 14
|
|
|
|
%res15 = insertelement <16 x i8> %res14, i8 %a0, i32 15
|
|
|
|
%res = bitcast <16 x i8> %res15 to <2 x i64>
|
|
|
|
ret <2 x i64> %res
|
|
|
|
}
|
|
|
|
|
|
|
|
; Splats the i16 argument %a0 into all 8 lanes of an <8 x i16> (one
; insertelement per lane) and returns the vector bitcast to <2 x i64>.
; Presumably the IR form of _mm_set1_epi16 (see the NOTE at the top of the file).
; Expected codegen: movd the value into xmm0, then pshuflw + pshufd to
; replicate word 0 across the register.
define <2 x i64> @test_mm_set1_epi16(i16 %a0) nounwind {
|
|
|
|
; X32-LABEL: test_mm_set1_epi16:
|
|
|
|
; X32: # BB#0:
|
|
|
|
; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
|
|
|
|
; X32-NEXT: movd %eax, %xmm0
|
|
|
|
; X32-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
|
|
|
|
; X32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
|
|
|
|
; X32-NEXT: retl
|
|
|
|
;
|
|
|
|
; X64-LABEL: test_mm_set1_epi16:
|
|
|
|
; X64: # BB#0:
|
|
|
|
; X64-NEXT: movd %edi, %xmm0
|
|
|
|
; X64-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
|
|
|
|
; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
|
|
|
|
; X64-NEXT: retq
|
|
|
|
%res0 = insertelement <8 x i16> undef, i16 %a0, i32 0
|
|
|
|
%res1 = insertelement <8 x i16> %res0, i16 %a0, i32 1
|
|
|
|
%res2 = insertelement <8 x i16> %res1, i16 %a0, i32 2
|
|
|
|
%res3 = insertelement <8 x i16> %res2, i16 %a0, i32 3
|
|
|
|
%res4 = insertelement <8 x i16> %res3, i16 %a0, i32 4
|
|
|
|
%res5 = insertelement <8 x i16> %res4, i16 %a0, i32 5
|
|
|
|
%res6 = insertelement <8 x i16> %res5, i16 %a0, i32 6
|
|
|
|
%res7 = insertelement <8 x i16> %res6, i16 %a0, i32 7
|
|
|
|
%res = bitcast <8 x i16> %res7 to <2 x i64>
|
|
|
|
ret <2 x i64> %res
|
|
|
|
}
|
|
|
|
|
|
|
|
; Splats the i32 argument %a0 into all 4 lanes of a <4 x i32> and returns
; the vector bitcast to <2 x i64>.
; Presumably the IR form of _mm_set1_epi32 (see the NOTE at the top of the file).
; Expected codegen: movd the value into xmm0, then pshufd broadcasting lane 0.
define <2 x i64> @test_mm_set1_epi32(i32 %a0) nounwind {
|
|
|
|
; X32-LABEL: test_mm_set1_epi32:
|
|
|
|
; X32: # BB#0:
|
|
|
|
; X32-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
|
|
|
|
; X32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
|
|
|
|
; X32-NEXT: retl
|
|
|
|
;
|
|
|
|
; X64-LABEL: test_mm_set1_epi32:
|
|
|
|
; X64: # BB#0:
|
|
|
|
; X64-NEXT: movd %edi, %xmm0
|
|
|
|
; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
|
|
|
|
; X64-NEXT: retq
|
|
|
|
%res0 = insertelement <4 x i32> undef, i32 %a0, i32 0
|
|
|
|
%res1 = insertelement <4 x i32> %res0, i32 %a0, i32 1
|
|
|
|
%res2 = insertelement <4 x i32> %res1, i32 %a0, i32 2
|
|
|
|
%res3 = insertelement <4 x i32> %res2, i32 %a0, i32 3
|
|
|
|
%res = bitcast <4 x i32> %res3 to <2 x i64>
|
|
|
|
ret <2 x i64> %res
|
|
|
|
}
|
|
|
|
|
|
|
|
; TODO test_mm_set1_epi64
|
|
|
|
|
|
|
|
; Splats the i64 argument %a0 into both lanes of a <2 x i64>.
; Presumably the IR form of _mm_set1_epi64x (see the NOTE at the top of the file).
; Expected codegen: the i386 run assembles the i64 from two 32-bit movd loads
; joined by punpckldq, the x86-64 run uses a single movq; both then
; duplicate the low 64 bits with pshufd [0,1,0,1].
define <2 x i64> @test_mm_set1_epi64x(i64 %a0) nounwind {
|
|
|
|
; X32-LABEL: test_mm_set1_epi64x:
|
|
|
|
; X32: # BB#0:
|
|
|
|
; X32-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
|
|
|
|
; X32-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
|
|
|
|
; X32-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
|
[X86][SSE] Change BUILD_VECTOR interleaving ordering to improve coalescing/combine opportunities
We currently generate BUILD_VECTOR as a tree of UNPCKL shuffles of the same type:
e.g. for v4f32:
Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
: unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
Step 2: unpcklps X, Y ==> <3, 2, 1, 0>
The issue is because we are not placing sequential vector elements together early enough, we fail to recognise many combinable patterns - consecutive scalar loads, extractions etc.
Instead, this patch unpacks progressively larger sequential vector elements together:
e.g. for v4f32:
Step 1: unpcklps 0, 2 ==> X: <?, ?, 1, 0>
: unpcklps 1, 3 ==> Y: <?, ?, 3, 2>
Step 2: unpcklpd X, Y ==> <3, 2, 1, 0>
This does mean that we are creating UNPCKL shuffle of different value types, but the relevant combines that benefit from this are quite capable of handling the additional BITCASTs that are now included in the shuffle tree.
Differential Revision: https://reviews.llvm.org/D33864
llvm-svn: 304688
2017-06-05 04:12:04 +08:00
|
|
|
; X32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
|
2016-05-19 18:58:54 +08:00
|
|
|
; X32-NEXT: retl
|
|
|
|
;
|
|
|
|
; X64-LABEL: test_mm_set1_epi64x:
|
|
|
|
; X64: # BB#0:
|
2017-04-26 15:08:44 +08:00
|
|
|
; X64-NEXT: movq %rdi, %xmm0
|
2016-05-19 18:58:54 +08:00
|
|
|
; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
|
|
|
|
; X64-NEXT: retq
|
|
|
|
%res0 = insertelement <2 x i64> undef, i64 %a0, i32 0
|
|
|
|
%res1 = insertelement <2 x i64> %res0, i64 %a0, i32 1
|
|
|
|
ret <2 x i64> %res1
|
|
|
|
}
|
|
|
|
|
|
|
|
; Splat test named after _mm_set1_pd: the scalar double %a0 is inserted into
; lane 0 and then lane 1 of a <2 x double>, which is returned directly.
; Per the checks below, i386 first loads the argument from memory with movsd;
; both targets then duplicate the low lane into the high lane with movlhps.
; NOTE(review): the bare `|` lines inside this function look like blame-view
; residue rather than IR - confirm and strip before running llc/FileCheck.
define <2 x double> @test_mm_set1_pd(double %a0) nounwind {
|
|
|
|
; X32-LABEL: test_mm_set1_pd:
|
|
|
|
; X32: # BB#0:
|
|
|
|
; X32-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
|
|
|
|
; X32-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0,0]
|
|
|
|
; X32-NEXT: retl
|
|
|
|
;
|
|
|
|
; X64-LABEL: test_mm_set1_pd:
|
|
|
|
; X64: # BB#0:
|
|
|
|
; X64-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0,0]
|
|
|
|
; X64-NEXT: retq
|
|
|
|
%res0 = insertelement <2 x double> undef, double %a0, i32 0
|
|
|
|
%res1 = insertelement <2 x double> %res0, double %a0, i32 1
|
|
|
|
ret <2 x double> %res1
|
|
|
|
}
|
|
|
|
|
|
|
|
; Build-vector test named after _mm_setr_epi8: the sixteen i8 arguments
; %a0..%a15 are inserted into lanes 0..15 of a <16 x i8> in argument order, and
; the result is bitcast to <2 x i64> for the return.
; Per the checks below, every byte is zero-extended with movzbl and moved into
; an XMM register with movd, then the registers are merged with progressively
; wider unpacks (punpcklbw, punpcklwd, punpckldq, and a final punpcklqdq).
; On i386 all sixteen bytes are read from the stack; on x86-64 ten are read
; from the stack and six from registers (%r9b, %r8b, %cl, %dl, %sil, %dil).
; NOTE(review): the bare `|` lines, timestamps and commit-message text inside
; this function look like blame-view residue rather than IR - confirm and strip
; before feeding this file to llc/FileCheck.
define <2 x i64> @test_mm_setr_epi8(i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4, i8 %a5, i8 %a6, i8 %a7, i8 %a8, i8 %a9, i8 %a10, i8 %a11, i8 %a12, i8 %a13, i8 %a14, i8 %a15) nounwind {
|
|
|
|
; X32-LABEL: test_mm_setr_epi8:
|
|
|
|
; X32: # BB#0:
|
|
|
|
; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
|
|
|
|
; X32-NEXT: movd %eax, %xmm0
|
|
|
|
; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
|
|
|
|
; X32-NEXT: movd %eax, %xmm1
|
|
|
|
; X32-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
|
|
|
|
; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
|
|
|
|
; X32-NEXT: movd %eax, %xmm0
|
|
|
|
; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
|
|
|
|
; X32-NEXT: movd %eax, %xmm2
|
|
|
|
; X32-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
|
[X86][SSE] Change BUILD_VECTOR interleaving ordering to improve coalescing/combine opportunities
We currently generate BUILD_VECTOR as a tree of UNPCKL shuffles of the same type:
e.g. for v4f32:
Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
: unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
Step 2: unpcklps X, Y ==> <3, 2, 1, 0>
The issue is because we are not placing sequential vector elements together early enough, we fail to recognise many combinable patterns - consecutive scalar loads, extractions etc.
Instead, this patch unpacks progressively larger sequential vector elements together:
e.g. for v4f32:
Step 1: unpcklps 0, 2 ==> X: <?, ?, 1, 0>
: unpcklps 1, 3 ==> Y: <?, ?, 3, 2>
Step 2: unpcklpd X, Y ==> <3, 2, 1, 0>
This does mean that we are creating UNPCKL shuffle of different value types, but the relevant combines that benefit from this are quite capable of handling the additional BITCASTs that are now included in the shuffle tree.
Differential Revision: https://reviews.llvm.org/D33864
llvm-svn: 304688
2017-06-05 04:12:04 +08:00
|
|
|
; X32-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
|
2016-05-19 18:58:54 +08:00
|
|
|
; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
|
|
|
|
; X32-NEXT: movd %eax, %xmm0
|
|
|
|
; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
|
|
|
|
; X32-NEXT: movd %eax, %xmm3
|
|
|
|
; X32-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
|
|
|
|
; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
|
|
|
|
; X32-NEXT: movd %eax, %xmm0
|
|
|
|
; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
|
|
|
|
; X32-NEXT: movd %eax, %xmm1
|
|
|
|
; X32-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
|
[X86][SSE] Change BUILD_VECTOR interleaving ordering to improve coalescing/combine opportunities
We currently generate BUILD_VECTOR as a tree of UNPCKL shuffles of the same type:
e.g. for v4f32:
Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
: unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
Step 2: unpcklps X, Y ==> <3, 2, 1, 0>
The issue is because we are not placing sequential vector elements together early enough, we fail to recognise many combinable patterns - consecutive scalar loads, extractions etc.
Instead, this patch unpacks progressively larger sequential vector elements together:
e.g. for v4f32:
Step 1: unpcklps 0, 2 ==> X: <?, ?, 1, 0>
: unpcklps 1, 3 ==> Y: <?, ?, 3, 2>
Step 2: unpcklpd X, Y ==> <3, 2, 1, 0>
This does mean that we are creating UNPCKL shuffle of different value types, but the relevant combines that benefit from this are quite capable of handling the additional BITCASTs that are now included in the shuffle tree.
Differential Revision: https://reviews.llvm.org/D33864
llvm-svn: 304688
2017-06-05 04:12:04 +08:00
|
|
|
; X32-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
|
|
|
|
; X32-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
|
2016-05-19 18:58:54 +08:00
|
|
|
; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
|
|
|
|
; X32-NEXT: movd %eax, %xmm0
|
|
|
|
; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
|
|
|
|
; X32-NEXT: movd %eax, %xmm2
|
|
|
|
; X32-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
|
|
|
|
; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
|
|
|
|
; X32-NEXT: movd %eax, %xmm0
|
|
|
|
; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
|
|
|
|
; X32-NEXT: movd %eax, %xmm3
|
|
|
|
; X32-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
|
[X86][SSE] Change BUILD_VECTOR interleaving ordering to improve coalescing/combine opportunities
We currently generate BUILD_VECTOR as a tree of UNPCKL shuffles of the same type:
e.g. for v4f32:
Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
: unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
Step 2: unpcklps X, Y ==> <3, 2, 1, 0>
The issue is because we are not placing sequential vector elements together early enough, we fail to recognise many combinable patterns - consecutive scalar loads, extractions etc.
Instead, this patch unpacks progressively larger sequential vector elements together:
e.g. for v4f32:
Step 1: unpcklps 0, 2 ==> X: <?, ?, 1, 0>
: unpcklps 1, 3 ==> Y: <?, ?, 3, 2>
Step 2: unpcklpd X, Y ==> <3, 2, 1, 0>
This does mean that we are creating UNPCKL shuffle of different value types, but the relevant combines that benefit from this are quite capable of handling the additional BITCASTs that are now included in the shuffle tree.
Differential Revision: https://reviews.llvm.org/D33864
llvm-svn: 304688
2017-06-05 04:12:04 +08:00
|
|
|
; X32-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
|
2016-05-19 18:58:54 +08:00
|
|
|
; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
|
|
|
|
; X32-NEXT: movd %eax, %xmm0
|
|
|
|
; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
|
|
|
|
; X32-NEXT: movd %eax, %xmm2
|
|
|
|
; X32-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
|
|
|
|
; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
|
|
|
|
; X32-NEXT: movd %eax, %xmm4
|
|
|
|
; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
|
|
|
|
; X32-NEXT: movd %eax, %xmm0
|
|
|
|
; X32-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
|
[X86][SSE] Change BUILD_VECTOR interleaving ordering to improve coalescing/combine opportunities
We currently generate BUILD_VECTOR as a tree of UNPCKL shuffles of the same type:
e.g. for v4f32:
Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
: unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
Step 2: unpcklps X, Y ==> <3, 2, 1, 0>
The issue is because we are not placing sequential vector elements together early enough, we fail to recognise many combinable patterns - consecutive scalar loads, extractions etc.
Instead, this patch unpacks progressively larger sequential vector elements together:
e.g. for v4f32:
Step 1: unpcklps 0, 2 ==> X: <?, ?, 1, 0>
: unpcklps 1, 3 ==> Y: <?, ?, 3, 2>
Step 2: unpcklpd X, Y ==> <3, 2, 1, 0>
This does mean that we are creating UNPCKL shuffle of different value types, but the relevant combines that benefit from this are quite capable of handling the additional BITCASTs that are now included in the shuffle tree.
Differential Revision: https://reviews.llvm.org/D33864
llvm-svn: 304688
2017-06-05 04:12:04 +08:00
|
|
|
; X32-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
|
|
|
|
; X32-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
|
|
|
|
; X32-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
|
2016-05-19 18:58:54 +08:00
|
|
|
; X32-NEXT: retl
|
|
|
|
;
|
|
|
|
; X64-LABEL: test_mm_setr_epi8:
|
|
|
|
; X64: # BB#0:
|
|
|
|
; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
|
|
|
|
; X64-NEXT: movd %eax, %xmm0
|
|
|
|
; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
|
|
|
|
; X64-NEXT: movd %eax, %xmm1
|
|
|
|
; X64-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
|
|
|
|
; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
|
|
|
|
; X64-NEXT: movd %eax, %xmm0
|
[X86][SSE] Change BUILD_VECTOR interleaving ordering to improve coalescing/combine opportunities
We currently generate BUILD_VECTOR as a tree of UNPCKL shuffles of the same type:
e.g. for v4f32:
Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
: unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
Step 2: unpcklps X, Y ==> <3, 2, 1, 0>
The issue is because we are not placing sequential vector elements together early enough, we fail to recognise many combinable patterns - consecutive scalar loads, extractions etc.
Instead, this patch unpacks progressively larger sequential vector elements together:
e.g. for v4f32:
Step 1: unpcklps 0, 2 ==> X: <?, ?, 1, 0>
: unpcklps 1, 3 ==> Y: <?, ?, 3, 2>
Step 2: unpcklpd X, Y ==> <3, 2, 1, 0>
This does mean that we are creating UNPCKL shuffle of different value types, but the relevant combines that benefit from this are quite capable of handling the additional BITCASTs that are now included in the shuffle tree.
Differential Revision: https://reviews.llvm.org/D33864
llvm-svn: 304688
2017-06-05 04:12:04 +08:00
|
|
|
; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
|
2016-05-19 18:58:54 +08:00
|
|
|
; X64-NEXT: movd %eax, %xmm2
|
|
|
|
; X64-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
|
[X86][SSE] Change BUILD_VECTOR interleaving ordering to improve coalescing/combine opportunities
We currently generate BUILD_VECTOR as a tree of UNPCKL shuffles of the same type:
e.g. for v4f32:
Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
: unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
Step 2: unpcklps X, Y ==> <3, 2, 1, 0>
The issue is because we are not placing sequential vector elements together early enough, we fail to recognise many combinable patterns - consecutive scalar loads, extractions etc.
Instead, this patch unpacks progressively larger sequential vector elements together:
e.g. for v4f32:
Step 1: unpcklps 0, 2 ==> X: <?, ?, 1, 0>
: unpcklps 1, 3 ==> Y: <?, ?, 3, 2>
Step 2: unpcklpd X, Y ==> <3, 2, 1, 0>
This does mean that we are creating UNPCKL shuffle of different value types, but the relevant combines that benefit from this are quite capable of handling the additional BITCASTs that are now included in the shuffle tree.
Differential Revision: https://reviews.llvm.org/D33864
llvm-svn: 304688
2017-06-05 04:12:04 +08:00
|
|
|
; X64-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
|
2016-05-19 18:58:54 +08:00
|
|
|
; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
|
|
|
|
; X64-NEXT: movd %eax, %xmm0
|
[X86][SSE] Change BUILD_VECTOR interleaving ordering to improve coalescing/combine opportunities
We currently generate BUILD_VECTOR as a tree of UNPCKL shuffles of the same type:
e.g. for v4f32:
Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
: unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
Step 2: unpcklps X, Y ==> <3, 2, 1, 0>
The issue is because we are not placing sequential vector elements together early enough, we fail to recognise many combinable patterns - consecutive scalar loads, extractions etc.
Instead, this patch unpacks progressively larger sequential vector elements together:
e.g. for v4f32:
Step 1: unpcklps 0, 2 ==> X: <?, ?, 1, 0>
: unpcklps 1, 3 ==> Y: <?, ?, 3, 2>
Step 2: unpcklpd X, Y ==> <3, 2, 1, 0>
This does mean that we are creating UNPCKL shuffle of different value types, but the relevant combines that benefit from this are quite capable of handling the additional BITCASTs that are now included in the shuffle tree.
Differential Revision: https://reviews.llvm.org/D33864
llvm-svn: 304688
2017-06-05 04:12:04 +08:00
|
|
|
; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
|
2016-05-19 18:58:54 +08:00
|
|
|
; X64-NEXT: movd %eax, %xmm3
|
|
|
|
; X64-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
|
|
|
|
; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
|
|
|
|
; X64-NEXT: movd %eax, %xmm0
|
[X86][SSE] Change BUILD_VECTOR interleaving ordering to improve coalescing/combine opportunities
We currently generate BUILD_VECTOR as a tree of UNPCKL shuffles of the same type:
e.g. for v4f32:
Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
: unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
Step 2: unpcklps X, Y ==> <3, 2, 1, 0>
The issue is because we are not placing sequential vector elements together early enough, we fail to recognise many combinable patterns - consecutive scalar loads, extractions etc.
Instead, this patch unpacks progressively larger sequential vector elements together:
e.g. for v4f32:
Step 1: unpcklps 0, 2 ==> X: <?, ?, 1, 0>
: unpcklps 1, 3 ==> Y: <?, ?, 3, 2>
Step 2: unpcklpd X, Y ==> <3, 2, 1, 0>
This does mean that we are creating UNPCKL shuffle of different value types, but the relevant combines that benefit from this are quite capable of handling the additional BITCASTs that are now included in the shuffle tree.
Differential Revision: https://reviews.llvm.org/D33864
llvm-svn: 304688
2017-06-05 04:12:04 +08:00
|
|
|
; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
|
2016-05-19 18:58:54 +08:00
|
|
|
; X64-NEXT: movd %eax, %xmm1
|
|
|
|
; X64-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
|
[X86][SSE] Change BUILD_VECTOR interleaving ordering to improve coalescing/combine opportunities
We currently generate BUILD_VECTOR as a tree of UNPCKL shuffles of the same type:
e.g. for v4f32:
Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
: unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
Step 2: unpcklps X, Y ==> <3, 2, 1, 0>
The issue is because we are not placing sequential vector elements together early enough, we fail to recognise many combinable patterns - consecutive scalar loads, extractions etc.
Instead, this patch unpacks progressively larger sequential vector elements together:
e.g. for v4f32:
Step 1: unpcklps 0, 2 ==> X: <?, ?, 1, 0>
: unpcklps 1, 3 ==> Y: <?, ?, 3, 2>
Step 2: unpcklpd X, Y ==> <3, 2, 1, 0>
This does mean that we are creating UNPCKL shuffle of different value types, but the relevant combines that benefit from this are quite capable of handling the additional BITCASTs that are now included in the shuffle tree.
Differential Revision: https://reviews.llvm.org/D33864
llvm-svn: 304688
2017-06-05 04:12:04 +08:00
|
|
|
; X64-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
|
|
|
|
; X64-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
|
2016-05-19 18:58:54 +08:00
|
|
|
; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
|
|
|
|
; X64-NEXT: movd %eax, %xmm0
|
|
|
|
; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
|
|
|
|
; X64-NEXT: movd %eax, %xmm2
|
|
|
|
; X64-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
|
[X86][SSE] Change BUILD_VECTOR interleaving ordering to improve coalescing/combine opportunities
We currently generate BUILD_VECTOR as a tree of UNPCKL shuffles of the same type:
e.g. for v4f32:
Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
: unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
Step 2: unpcklps X, Y ==> <3, 2, 1, 0>
The issue is because we are not placing sequential vector elements together early enough, we fail to recognise many combinable patterns - consecutive scalar loads, extractions etc.
Instead, this patch unpacks progressively larger sequential vector elements together:
e.g. for v4f32:
Step 1: unpcklps 0, 2 ==> X: <?, ?, 1, 0>
: unpcklps 1, 3 ==> Y: <?, ?, 3, 2>
Step 2: unpcklpd X, Y ==> <3, 2, 1, 0>
This does mean that we are creating UNPCKL shuffle of different value types, but the relevant combines that benefit from this are quite capable of handling the additional BITCASTs that are now included in the shuffle tree.
Differential Revision: https://reviews.llvm.org/D33864
llvm-svn: 304688
2017-06-05 04:12:04 +08:00
|
|
|
; X64-NEXT: movzbl %r9b, %eax
|
2016-05-19 18:58:54 +08:00
|
|
|
; X64-NEXT: movd %eax, %xmm0
|
[X86][SSE] Change BUILD_VECTOR interleaving ordering to improve coalescing/combine opportunities
We currently generate BUILD_VECTOR as a tree of UNPCKL shuffles of the same type:
e.g. for v4f32:
Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
: unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
Step 2: unpcklps X, Y ==> <3, 2, 1, 0>
The issue is because we are not placing sequential vector elements together early enough, we fail to recognise many combinable patterns - consecutive scalar loads, extractions etc.
Instead, this patch unpacks progressively larger sequential vector elements together:
e.g. for v4f32:
Step 1: unpcklps 0, 2 ==> X: <?, ?, 1, 0>
: unpcklps 1, 3 ==> Y: <?, ?, 3, 2>
Step 2: unpcklpd X, Y ==> <3, 2, 1, 0>
This does mean that we are creating UNPCKL shuffle of different value types, but the relevant combines that benefit from this are quite capable of handling the additional BITCASTs that are now included in the shuffle tree.
Differential Revision: https://reviews.llvm.org/D33864
llvm-svn: 304688
2017-06-05 04:12:04 +08:00
|
|
|
; X64-NEXT: movzbl %r8b, %eax
|
2016-05-19 18:58:54 +08:00
|
|
|
; X64-NEXT: movd %eax, %xmm3
|
|
|
|
; X64-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
|
[X86][SSE] Change BUILD_VECTOR interleaving ordering to improve coalescing/combine opportunities
We currently generate BUILD_VECTOR as a tree of UNPCKL shuffles of the same type:
e.g. for v4f32:
Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
: unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
Step 2: unpcklps X, Y ==> <3, 2, 1, 0>
The issue is because we are not placing sequential vector elements together early enough, we fail to recognise many combinable patterns - consecutive scalar loads, extractions etc.
Instead, this patch unpacks progressively larger sequential vector elements together:
e.g. for v4f32:
Step 1: unpcklps 0, 2 ==> X: <?, ?, 1, 0>
: unpcklps 1, 3 ==> Y: <?, ?, 3, 2>
Step 2: unpcklpd X, Y ==> <3, 2, 1, 0>
This does mean that we are creating UNPCKL shuffle of different value types, but the relevant combines that benefit from this are quite capable of handling the additional BITCASTs that are now included in the shuffle tree.
Differential Revision: https://reviews.llvm.org/D33864
llvm-svn: 304688
2017-06-05 04:12:04 +08:00
|
|
|
; X64-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
|
|
|
|
; X64-NEXT: movzbl %cl, %eax
|
2016-05-19 18:58:54 +08:00
|
|
|
; X64-NEXT: movd %eax, %xmm0
|
[X86][SSE] Change BUILD_VECTOR interleaving ordering to improve coalescing/combine opportunities
We currently generate BUILD_VECTOR as a tree of UNPCKL shuffles of the same type:
e.g. for v4f32:
Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
: unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
Step 2: unpcklps X, Y ==> <3, 2, 1, 0>
The issue is because we are not placing sequential vector elements together early enough, we fail to recognise many combinable patterns - consecutive scalar loads, extractions etc.
Instead, this patch unpacks progressively larger sequential vector elements together:
e.g. for v4f32:
Step 1: unpcklps 0, 2 ==> X: <?, ?, 1, 0>
: unpcklps 1, 3 ==> Y: <?, ?, 3, 2>
Step 2: unpcklpd X, Y ==> <3, 2, 1, 0>
This does mean that we are creating UNPCKL shuffle of different value types, but the relevant combines that benefit from this are quite capable of handling the additional BITCASTs that are now included in the shuffle tree.
Differential Revision: https://reviews.llvm.org/D33864
llvm-svn: 304688
2017-06-05 04:12:04 +08:00
|
|
|
; X64-NEXT: movzbl %dl, %eax
|
2016-05-19 18:58:54 +08:00
|
|
|
; X64-NEXT: movd %eax, %xmm2
|
|
|
|
; X64-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
|
[X86][SSE] Change BUILD_VECTOR interleaving ordering to improve coalescing/combine opportunities
We currently generate BUILD_VECTOR as a tree of UNPCKL shuffles of the same type:
e.g. for v4f32:
Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
: unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
Step 2: unpcklps X, Y ==> <3, 2, 1, 0>
The issue is because we are not placing sequential vector elements together early enough, we fail to recognise many combinable patterns - consecutive scalar loads, extractions etc.
Instead, this patch unpacks progressively larger sequential vector elements together:
e.g. for v4f32:
Step 1: unpcklps 0, 2 ==> X: <?, ?, 1, 0>
: unpcklps 1, 3 ==> Y: <?, ?, 3, 2>
Step 2: unpcklpd X, Y ==> <3, 2, 1, 0>
This does mean that we are creating UNPCKL shuffle of different value types, but the relevant combines that benefit from this are quite capable of handling the additional BITCASTs that are now included in the shuffle tree.
Differential Revision: https://reviews.llvm.org/D33864
llvm-svn: 304688
2017-06-05 04:12:04 +08:00
|
|
|
; X64-NEXT: movzbl %sil, %eax
|
2016-05-19 18:58:54 +08:00
|
|
|
; X64-NEXT: movd %eax, %xmm4
|
|
|
|
; X64-NEXT: movzbl %dil, %eax
|
|
|
|
; X64-NEXT: movd %eax, %xmm0
|
|
|
|
; X64-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
|
[X86][SSE] Change BUILD_VECTOR interleaving ordering to improve coalescing/combine opportunities
We currently generate BUILD_VECTOR as a tree of UNPCKL shuffles of the same type:
e.g. for v4f32:
Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
: unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
Step 2: unpcklps X, Y ==> <3, 2, 1, 0>
The issue is because we are not placing sequential vector elements together early enough, we fail to recognise many combinable patterns - consecutive scalar loads, extractions etc.
Instead, this patch unpacks progressively larger sequential vector elements together:
e.g. for v4f32:
Step 1: unpcklps 0, 2 ==> X: <?, ?, 1, 0>
: unpcklps 1, 3 ==> Y: <?, ?, 3, 2>
Step 2: unpcklpd X, Y ==> <3, 2, 1, 0>
This does mean that we are creating UNPCKL shuffle of different value types, but the relevant combines that benefit from this are quite capable of handling the additional BITCASTs that are now included in the shuffle tree.
Differential Revision: https://reviews.llvm.org/D33864
llvm-svn: 304688
2017-06-05 04:12:04 +08:00
|
|
|
; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
|
|
|
|
; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
|
|
|
|
; X64-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
|
2016-05-19 18:58:54 +08:00
|
|
|
; X64-NEXT: retq
|
|
|
|
%res0 = insertelement <16 x i8> undef, i8 %a0 , i32 0
|
|
|
|
%res1 = insertelement <16 x i8> %res0, i8 %a1 , i32 1
|
|
|
|
%res2 = insertelement <16 x i8> %res1, i8 %a2 , i32 2
|
|
|
|
%res3 = insertelement <16 x i8> %res2, i8 %a3 , i32 3
|
|
|
|
%res4 = insertelement <16 x i8> %res3, i8 %a4 , i32 4
|
|
|
|
%res5 = insertelement <16 x i8> %res4, i8 %a5 , i32 5
|
|
|
|
%res6 = insertelement <16 x i8> %res5, i8 %a6 , i32 6
|
|
|
|
%res7 = insertelement <16 x i8> %res6, i8 %a7 , i32 7
|
|
|
|
%res8 = insertelement <16 x i8> %res7, i8 %a8 , i32 8
|
|
|
|
%res9 = insertelement <16 x i8> %res8, i8 %a9 , i32 9
|
|
|
|
%res10 = insertelement <16 x i8> %res9, i8 %a10, i32 10
|
|
|
|
%res11 = insertelement <16 x i8> %res10, i8 %a11, i32 11
|
|
|
|
%res12 = insertelement <16 x i8> %res11, i8 %a12, i32 12
|
|
|
|
%res13 = insertelement <16 x i8> %res12, i8 %a13, i32 13
|
|
|
|
%res14 = insertelement <16 x i8> %res13, i8 %a14, i32 14
|
|
|
|
%res15 = insertelement <16 x i8> %res14, i8 %a15, i32 15
|
|
|
|
%res = bitcast <16 x i8> %res15 to <2 x i64>
|
|
|
|
ret <2 x i64> %res
|
|
|
|
}
|
|
|
|
|
|
|
|
define <2 x i64> @test_mm_setr_epi16(i16 %a0, i16 %a1, i16 %a2, i16 %a3, i16 %a4, i16 %a5, i16 %a6, i16 %a7) nounwind {
|
|
|
|
; X32-LABEL: test_mm_setr_epi16:
|
|
|
|
; X32: # BB#0:
|
|
|
|
; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
|
|
|
|
; X32-NEXT: movd %eax, %xmm1
|
|
|
|
; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
|
|
|
|
; X32-NEXT: movd %eax, %xmm2
|
|
|
|
; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
|
|
|
|
; X32-NEXT: movd %eax, %xmm3
|
|
|
|
; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
|
|
|
|
; X32-NEXT: movd %eax, %xmm4
|
|
|
|
; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
|
|
|
|
; X32-NEXT: movd %eax, %xmm5
|
|
|
|
; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
|
|
|
|
; X32-NEXT: movd %eax, %xmm6
|
|
|
|
; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
|
|
|
|
; X32-NEXT: movd %eax, %xmm7
|
|
|
|
; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
|
|
|
|
; X32-NEXT: movd %eax, %xmm0
|
|
|
|
; X32-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
|
|
|
|
; X32-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
|
[X86][SSE] Change BUILD_VECTOR interleaving ordering to improve coalescing/combine opportunities
We currently generate BUILD_VECTOR as a tree of UNPCKL shuffles of the same type:
e.g. for v4f32:
Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
: unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
Step 2: unpcklps X, Y ==> <3, 2, 1, 0>
The issue is because we are not placing sequential vector elements together early enough, we fail to recognise many combinable patterns - consecutive scalar loads, extractions etc.
Instead, this patch unpacks progressively larger sequential vector elements together:
e.g. for v4f32:
Step 1: unpcklps 0, 1 ==> X: <?, ?, 1, 0>
: unpcklps 2, 3 ==> Y: <?, ?, 3, 2>
Step 2: unpcklpd X, Y ==> <3, 2, 1, 0>
This does mean that we are creating UNPCKL shuffle of different value types, but the relevant combines that benefit from this are quite capable of handling the additional BITCASTs that are now included in the shuffle tree.
Differential Revision: https://reviews.llvm.org/D33864
llvm-svn: 304688
2017-06-05 04:12:04 +08:00
|
|
|
; X32-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
|
2016-05-19 18:58:54 +08:00
|
|
|
; X32-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
|
|
|
|
; X32-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3]
|
[X86][SSE] Change BUILD_VECTOR interleaving ordering to improve coalescing/combine opportunities
We currently generate BUILD_VECTOR as a tree of UNPCKL shuffles of the same type:
e.g. for v4f32:
Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
: unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
Step 2: unpcklps X, Y ==> <3, 2, 1, 0>
The issue is because we are not placing sequential vector elements together early enough, we fail to recognise many combinable patterns - consecutive scalar loads, extractions etc.
Instead, this patch unpacks progressively larger sequential vector elements together:
e.g. for v4f32:
Step 1: unpcklps 0, 1 ==> X: <?, ?, 1, 0>
: unpcklps 2, 3 ==> Y: <?, ?, 3, 2>
Step 2: unpcklpd X, Y ==> <3, 2, 1, 0>
This does mean that we are creating UNPCKL shuffle of different value types, but the relevant combines that benefit from this are quite capable of handling the additional BITCASTs that are now included in the shuffle tree.
Differential Revision: https://reviews.llvm.org/D33864
llvm-svn: 304688
2017-06-05 04:12:04 +08:00
|
|
|
; X32-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1]
|
|
|
|
; X32-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0]
|
2016-05-19 18:58:54 +08:00
|
|
|
; X32-NEXT: retl
|
|
|
|
;
|
|
|
|
; X64-LABEL: test_mm_setr_epi16:
|
|
|
|
; X64: # BB#0:
|
|
|
|
; X64-NEXT: movw {{[0-9]+}}(%rsp), %ax
|
|
|
|
; X64-NEXT: movw {{[0-9]+}}(%rsp), %r10w
|
|
|
|
; X64-NEXT: movd %eax, %xmm0
|
[X86][SSE] Change BUILD_VECTOR interleaving ordering to improve coalescing/combine opportunities
We currently generate BUILD_VECTOR as a tree of UNPCKL shuffles of the same type:
e.g. for v4f32:
Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
: unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
Step 2: unpcklps X, Y ==> <3, 2, 1, 0>
The issue is because we are not placing sequential vector elements together early enough, we fail to recognise many combinable patterns - consecutive scalar loads, extractions etc.
Instead, this patch unpacks progressively larger sequential vector elements together:
e.g. for v4f32:
Step 1: unpcklps 0, 1 ==> X: <?, ?, 1, 0>
: unpcklps 2, 3 ==> Y: <?, ?, 3, 2>
Step 2: unpcklpd X, Y ==> <3, 2, 1, 0>
This does mean that we are creating UNPCKL shuffle of different value types, but the relevant combines that benefit from this are quite capable of handling the additional BITCASTs that are now included in the shuffle tree.
Differential Revision: https://reviews.llvm.org/D33864
llvm-svn: 304688
2017-06-05 04:12:04 +08:00
|
|
|
; X64-NEXT: movd %r10d, %xmm1
|
2016-05-19 18:58:54 +08:00
|
|
|
; X64-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
|
|
|
|
; X64-NEXT: movd %r9d, %xmm0
|
[X86][SSE] Change BUILD_VECTOR interleaving ordering to improve coalescing/combine opportunities
We currently generate BUILD_VECTOR as a tree of UNPCKL shuffles of the same type:
e.g. for v4f32:
Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
: unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
Step 2: unpcklps X, Y ==> <3, 2, 1, 0>
The issue is because we are not placing sequential vector elements together early enough, we fail to recognise many combinable patterns - consecutive scalar loads, extractions etc.
Instead, this patch unpacks progressively larger sequential vector elements together:
e.g. for v4f32:
Step 1: unpcklps 0, 1 ==> X: <?, ?, 1, 0>
: unpcklps 2, 3 ==> Y: <?, ?, 3, 2>
Step 2: unpcklpd X, Y ==> <3, 2, 1, 0>
This does mean that we are creating UNPCKL shuffle of different value types, but the relevant combines that benefit from this are quite capable of handling the additional BITCASTs that are now included in the shuffle tree.
Differential Revision: https://reviews.llvm.org/D33864
llvm-svn: 304688
2017-06-05 04:12:04 +08:00
|
|
|
; X64-NEXT: movd %r8d, %xmm2
|
2016-05-19 18:58:54 +08:00
|
|
|
; X64-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
|
[X86][SSE] Change BUILD_VECTOR interleaving ordering to improve coalescing/combine opportunities
We currently generate BUILD_VECTOR as a tree of UNPCKL shuffles of the same type:
e.g. for v4f32:
Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
: unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
Step 2: unpcklps X, Y ==> <3, 2, 1, 0>
The issue is that, because we are not placing sequential vector elements together early enough, we fail to recognise many combinable patterns - consecutive scalar loads, extractions etc.
Instead, this patch unpacks progressively larger sequential vector elements together:
e.g. for v4f32:
Step 1: unpcklps 0, 1 ==> X: <?, ?, 1, 0>
: unpcklps 2, 3 ==> Y: <?, ?, 3, 2>
Step 2: unpcklpd X, Y ==> <3, 2, 1, 0>
This does mean that we are creating UNPCKL shuffles of different value types, but the relevant combines that benefit from this are quite capable of handling the additional BITCASTs that are now included in the shuffle tree.
Differential Revision: https://reviews.llvm.org/D33864
llvm-svn: 304688
2017-06-05 04:12:04 +08:00
|
|
|
; X64-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
|
|
|
|
; X64-NEXT: movd %ecx, %xmm0
|
2016-05-19 18:58:54 +08:00
|
|
|
; X64-NEXT: movd %edx, %xmm1
|
|
|
|
; X64-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
|
[X86][SSE] Change BUILD_VECTOR interleaving ordering to improve coalescing/combine opportunities
We currently generate BUILD_VECTOR as a tree of UNPCKL shuffles of the same type:
e.g. for v4f32:
Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
: unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
Step 2: unpcklps X, Y ==> <3, 2, 1, 0>
The issue is that, because we are not placing sequential vector elements together early enough, we fail to recognise many combinable patterns - consecutive scalar loads, extractions etc.
Instead, this patch unpacks progressively larger sequential vector elements together:
e.g. for v4f32:
Step 1: unpcklps 0, 1 ==> X: <?, ?, 1, 0>
: unpcklps 2, 3 ==> Y: <?, ?, 3, 2>
Step 2: unpcklpd X, Y ==> <3, 2, 1, 0>
This does mean that we are creating UNPCKL shuffles of different value types, but the relevant combines that benefit from this are quite capable of handling the additional BITCASTs that are now included in the shuffle tree.
Differential Revision: https://reviews.llvm.org/D33864
llvm-svn: 304688
2017-06-05 04:12:04 +08:00
|
|
|
; X64-NEXT: movd %esi, %xmm3
|
2016-05-19 18:58:54 +08:00
|
|
|
; X64-NEXT: movd %edi, %xmm0
|
|
|
|
; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
|
[X86][SSE] Change BUILD_VECTOR interleaving ordering to improve coalescing/combine opportunities
We currently generate BUILD_VECTOR as a tree of UNPCKL shuffles of the same type:
e.g. for v4f32:
Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
: unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
Step 2: unpcklps X, Y ==> <3, 2, 1, 0>
The issue is that, because we are not placing sequential vector elements together early enough, we fail to recognise many combinable patterns - consecutive scalar loads, extractions etc.
Instead, this patch unpacks progressively larger sequential vector elements together:
e.g. for v4f32:
Step 1: unpcklps 0, 1 ==> X: <?, ?, 1, 0>
: unpcklps 2, 3 ==> Y: <?, ?, 3, 2>
Step 2: unpcklpd X, Y ==> <3, 2, 1, 0>
This does mean that we are creating UNPCKL shuffles of different value types, but the relevant combines that benefit from this are quite capable of handling the additional BITCASTs that are now included in the shuffle tree.
Differential Revision: https://reviews.llvm.org/D33864
llvm-svn: 304688
2017-06-05 04:12:04 +08:00
|
|
|
; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
|
|
|
|
; X64-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
|
2016-05-19 18:58:54 +08:00
|
|
|
; X64-NEXT: retq
|
|
|
|
%res0 = insertelement <8 x i16> undef, i16 %a0, i32 0
|
|
|
|
%res1 = insertelement <8 x i16> %res0, i16 %a1, i32 1
|
|
|
|
%res2 = insertelement <8 x i16> %res1, i16 %a2, i32 2
|
|
|
|
%res3 = insertelement <8 x i16> %res2, i16 %a3, i32 3
|
|
|
|
%res4 = insertelement <8 x i16> %res3, i16 %a4, i32 4
|
|
|
|
%res5 = insertelement <8 x i16> %res4, i16 %a5, i32 5
|
|
|
|
%res6 = insertelement <8 x i16> %res5, i16 %a6, i32 6
|
|
|
|
%res7 = insertelement <8 x i16> %res6, i16 %a7, i32 7
|
|
|
|
%res = bitcast <8 x i16> %res7 to <2 x i64>
|
|
|
|
ret <2 x i64> %res
|
|
|
|
}
|
|
|
|
|
|
|
|
define <2 x i64> @test_mm_setr_epi32(i32 %a0, i32 %a1, i32 %a2, i32 %a3) nounwind {
|
|
|
|
; X32-LABEL: test_mm_setr_epi32:
|
|
|
|
; X32: # BB#0:
|
|
|
|
; X32-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
|
|
|
|
; X32-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
|
|
|
|
; X32-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
|
|
|
|
; X32-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
|
|
|
|
; X32-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
|
|
|
|
; X32-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
|
[X86][SSE] Change BUILD_VECTOR interleaving ordering to improve coalescing/combine opportunities
We currently generate BUILD_VECTOR as a tree of UNPCKL shuffles of the same type:
e.g. for v4f32:
Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
: unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
Step 2: unpcklps X, Y ==> <3, 2, 1, 0>
The issue is that, because we are not placing sequential vector elements together early enough, we fail to recognise many combinable patterns - consecutive scalar loads, extractions etc.
Instead, this patch unpacks progressively larger sequential vector elements together:
e.g. for v4f32:
Step 1: unpcklps 0, 1 ==> X: <?, ?, 1, 0>
: unpcklps 2, 3 ==> Y: <?, ?, 3, 2>
Step 2: unpcklpd X, Y ==> <3, 2, 1, 0>
This does mean that we are creating UNPCKL shuffles of different value types, but the relevant combines that benefit from this are quite capable of handling the additional BITCASTs that are now included in the shuffle tree.
Differential Revision: https://reviews.llvm.org/D33864
llvm-svn: 304688
2017-06-05 04:12:04 +08:00
|
|
|
; X32-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
|
2016-05-19 18:58:54 +08:00
|
|
|
; X32-NEXT: retl
|
|
|
|
;
|
|
|
|
; X64-LABEL: test_mm_setr_epi32:
|
|
|
|
; X64: # BB#0:
|
|
|
|
; X64-NEXT: movd %ecx, %xmm0
|
[X86][SSE] Change BUILD_VECTOR interleaving ordering to improve coalescing/combine opportunities
We currently generate BUILD_VECTOR as a tree of UNPCKL shuffles of the same type:
e.g. for v4f32:
Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
: unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
Step 2: unpcklps X, Y ==> <3, 2, 1, 0>
The issue is that, because we are not placing sequential vector elements together early enough, we fail to recognise many combinable patterns - consecutive scalar loads, extractions etc.
Instead, this patch unpacks progressively larger sequential vector elements together:
e.g. for v4f32:
Step 1: unpcklps 0, 1 ==> X: <?, ?, 1, 0>
: unpcklps 2, 3 ==> Y: <?, ?, 3, 2>
Step 2: unpcklpd X, Y ==> <3, 2, 1, 0>
This does mean that we are creating UNPCKL shuffles of different value types, but the relevant combines that benefit from this are quite capable of handling the additional BITCASTs that are now included in the shuffle tree.
Differential Revision: https://reviews.llvm.org/D33864
llvm-svn: 304688
2017-06-05 04:12:04 +08:00
|
|
|
; X64-NEXT: movd %edx, %xmm1
|
2016-05-19 18:58:54 +08:00
|
|
|
; X64-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
|
[X86][SSE] Change BUILD_VECTOR interleaving ordering to improve coalescing/combine opportunities
We currently generate BUILD_VECTOR as a tree of UNPCKL shuffles of the same type:
e.g. for v4f32:
Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
: unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
Step 2: unpcklps X, Y ==> <3, 2, 1, 0>
The issue is that, because we are not placing sequential vector elements together early enough, we fail to recognise many combinable patterns - consecutive scalar loads, extractions etc.
Instead, this patch unpacks progressively larger sequential vector elements together:
e.g. for v4f32:
Step 1: unpcklps 0, 1 ==> X: <?, ?, 1, 0>
: unpcklps 2, 3 ==> Y: <?, ?, 3, 2>
Step 2: unpcklpd X, Y ==> <3, 2, 1, 0>
This does mean that we are creating UNPCKL shuffles of different value types, but the relevant combines that benefit from this are quite capable of handling the additional BITCASTs that are now included in the shuffle tree.
Differential Revision: https://reviews.llvm.org/D33864
llvm-svn: 304688
2017-06-05 04:12:04 +08:00
|
|
|
; X64-NEXT: movd %esi, %xmm2
|
2016-05-19 18:58:54 +08:00
|
|
|
; X64-NEXT: movd %edi, %xmm0
|
|
|
|
; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
|
[X86][SSE] Change BUILD_VECTOR interleaving ordering to improve coalescing/combine opportunities
We currently generate BUILD_VECTOR as a tree of UNPCKL shuffles of the same type:
e.g. for v4f32:
Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
: unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
Step 2: unpcklps X, Y ==> <3, 2, 1, 0>
The issue is that, because we are not placing sequential vector elements together early enough, we fail to recognise many combinable patterns - consecutive scalar loads, extractions etc.
Instead, this patch unpacks progressively larger sequential vector elements together:
e.g. for v4f32:
Step 1: unpcklps 0, 1 ==> X: <?, ?, 1, 0>
: unpcklps 2, 3 ==> Y: <?, ?, 3, 2>
Step 2: unpcklpd X, Y ==> <3, 2, 1, 0>
This does mean that we are creating UNPCKL shuffles of different value types, but the relevant combines that benefit from this are quite capable of handling the additional BITCASTs that are now included in the shuffle tree.
Differential Revision: https://reviews.llvm.org/D33864
llvm-svn: 304688
2017-06-05 04:12:04 +08:00
|
|
|
; X64-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
|
2016-05-19 18:58:54 +08:00
|
|
|
; X64-NEXT: retq
|
|
|
|
%res0 = insertelement <4 x i32> undef, i32 %a0, i32 0
|
|
|
|
%res1 = insertelement <4 x i32> %res0, i32 %a1, i32 1
|
|
|
|
%res2 = insertelement <4 x i32> %res1, i32 %a2, i32 2
|
|
|
|
%res3 = insertelement <4 x i32> %res2, i32 %a3, i32 3
|
|
|
|
%res = bitcast <4 x i32> %res3 to <2 x i64>
|
|
|
|
ret <2 x i64> %res
|
|
|
|
}
|
|
|
|
|
|
|
|
; TODO test_mm_setr_epi64
|
|
|
|
|
|
|
|
define <2 x i64> @test_mm_setr_epi64x(i64 %a0, i64 %a1) nounwind {
|
|
|
|
; X32-LABEL: test_mm_setr_epi64x:
|
|
|
|
; X32: # BB#0:
|
|
|
|
; X32-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
|
|
|
|
; X32-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
|
[X86][SSE] Change BUILD_VECTOR interleaving ordering to improve coalescing/combine opportunities
We currently generate BUILD_VECTOR as a tree of UNPCKL shuffles of the same type:
e.g. for v4f32:
Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
: unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
Step 2: unpcklps X, Y ==> <3, 2, 1, 0>
The issue is that, because we are not placing sequential vector elements together early enough, we fail to recognise many combinable patterns - consecutive scalar loads, extractions etc.
Instead, this patch unpacks progressively larger sequential vector elements together:
e.g. for v4f32:
Step 1: unpcklps 0, 1 ==> X: <?, ?, 1, 0>
: unpcklps 2, 3 ==> Y: <?, ?, 3, 2>
Step 2: unpcklpd X, Y ==> <3, 2, 1, 0>
This does mean that we are creating UNPCKL shuffles of different value types, but the relevant combines that benefit from this are quite capable of handling the additional BITCASTs that are now included in the shuffle tree.
Differential Revision: https://reviews.llvm.org/D33864
llvm-svn: 304688
2017-06-05 04:12:04 +08:00
|
|
|
; X32-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
|
|
|
|
; X32-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
|
2016-05-19 18:58:54 +08:00
|
|
|
; X32-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
|
|
|
|
; X32-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
|
[X86][SSE] Change BUILD_VECTOR interleaving ordering to improve coalescing/combine opportunities
We currently generate BUILD_VECTOR as a tree of UNPCKL shuffles of the same type:
e.g. for v4f32:
Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
: unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
Step 2: unpcklps X, Y ==> <3, 2, 1, 0>
The issue is that, because we are not placing sequential vector elements together early enough, we fail to recognise many combinable patterns - consecutive scalar loads, extractions etc.
Instead, this patch unpacks progressively larger sequential vector elements together:
e.g. for v4f32:
Step 1: unpcklps 0, 1 ==> X: <?, ?, 1, 0>
: unpcklps 2, 3 ==> Y: <?, ?, 3, 2>
Step 2: unpcklpd X, Y ==> <3, 2, 1, 0>
This does mean that we are creating UNPCKL shuffles of different value types, but the relevant combines that benefit from this are quite capable of handling the additional BITCASTs that are now included in the shuffle tree.
Differential Revision: https://reviews.llvm.org/D33864
llvm-svn: 304688
2017-06-05 04:12:04 +08:00
|
|
|
; X32-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
|
2016-05-19 18:58:54 +08:00
|
|
|
; X32-NEXT: retl
|
|
|
|
;
|
|
|
|
; X64-LABEL: test_mm_setr_epi64x:
|
|
|
|
; X64: # BB#0:
|
2017-04-26 15:08:44 +08:00
|
|
|
; X64-NEXT: movq %rsi, %xmm1
|
|
|
|
; X64-NEXT: movq %rdi, %xmm0
|
2016-05-19 18:58:54 +08:00
|
|
|
; X64-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
|
|
|
|
; X64-NEXT: retq
|
|
|
|
%res0 = insertelement <2 x i64> undef, i64 %a0, i32 0
|
|
|
|
%res1 = insertelement <2 x i64> %res0, i64 %a1, i32 1
|
|
|
|
ret <2 x i64> %res1
|
|
|
|
}
|
|
|
|
|
|
|
|
define <2 x double> @test_mm_setr_pd(double %a0, double %a1) nounwind {
|
|
|
|
; X32-LABEL: test_mm_setr_pd:
|
|
|
|
; X32: # BB#0:
|
|
|
|
; X32-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
|
|
|
|
; X32-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
|
|
|
|
; X32-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
|
|
|
|
; X32-NEXT: retl
|
|
|
|
;
|
|
|
|
; X64-LABEL: test_mm_setr_pd:
|
|
|
|
; X64: # BB#0:
|
|
|
|
; X64-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
|
|
|
|
; X64-NEXT: retq
|
|
|
|
%res0 = insertelement <2 x double> undef, double %a0, i32 0
|
|
|
|
%res1 = insertelement <2 x double> %res0, double %a1, i32 1
|
|
|
|
ret <2 x double> %res1
|
|
|
|
}
|
|
|
|
|
2016-05-19 02:00:43 +08:00
|
|
|
define <2 x double> @test_mm_setzero_pd() {
|
|
|
|
; X32-LABEL: test_mm_setzero_pd:
|
|
|
|
; X32: # BB#0:
|
|
|
|
; X32-NEXT: xorps %xmm0, %xmm0
|
|
|
|
; X32-NEXT: retl
|
|
|
|
;
|
|
|
|
; X64-LABEL: test_mm_setzero_pd:
|
|
|
|
; X64: # BB#0:
|
|
|
|
; X64-NEXT: xorps %xmm0, %xmm0
|
|
|
|
; X64-NEXT: retq
|
|
|
|
ret <2 x double> zeroinitializer
|
|
|
|
}
|
|
|
|
|
|
|
|
define <2 x i64> @test_mm_setzero_si128() {
|
|
|
|
; X32-LABEL: test_mm_setzero_si128:
|
|
|
|
; X32: # BB#0:
|
|
|
|
; X32-NEXT: xorps %xmm0, %xmm0
|
|
|
|
; X32-NEXT: retl
|
|
|
|
;
|
|
|
|
; X64-LABEL: test_mm_setzero_si128:
|
|
|
|
; X64: # BB#0:
|
|
|
|
; X64-NEXT: xorps %xmm0, %xmm0
|
|
|
|
; X64-NEXT: retq
|
|
|
|
ret <2 x i64> zeroinitializer
|
|
|
|
}
|
|
|
|
|
|
|
|
define <2 x i64> @test_mm_shuffle_epi32(<2 x i64> %a0) {
|
|
|
|
; X32-LABEL: test_mm_shuffle_epi32:
|
|
|
|
; X32: # BB#0:
|
|
|
|
; X32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
|
|
|
|
; X32-NEXT: retl
|
|
|
|
;
|
|
|
|
; X64-LABEL: test_mm_shuffle_epi32:
|
|
|
|
; X64: # BB#0:
|
|
|
|
; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
|
|
|
|
; X64-NEXT: retq
|
|
|
|
%arg0 = bitcast <2 x i64> %a0 to <4 x i32>
|
|
|
|
%res = shufflevector <4 x i32> %arg0, <4 x i32> undef, <4 x i32> zeroinitializer
|
|
|
|
%bc = bitcast <4 x i32> %res to <2 x i64>
|
|
|
|
ret <2 x i64> %bc
|
|
|
|
}
|
|
|
|
|
|
|
|
define <2 x double> @test_mm_shuffle_pd(<2 x double> %a0, <2 x double> %a1) {
|
|
|
|
; X32-LABEL: test_mm_shuffle_pd:
|
|
|
|
; X32: # BB#0:
|
|
|
|
; X32-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1],xmm1[0]
|
|
|
|
; X32-NEXT: retl
|
|
|
|
;
|
|
|
|
; X64-LABEL: test_mm_shuffle_pd:
|
|
|
|
; X64: # BB#0:
|
|
|
|
; X64-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1],xmm1[0]
|
|
|
|
; X64-NEXT: retq
|
|
|
|
%res = shufflevector <2 x double> %a0, <2 x double> %a1, <2 x i32> <i32 1, i32 2>
|
|
|
|
ret <2 x double> %res
|
|
|
|
}
|
|
|
|
|
|
|
|
define <2 x i64> @test_mm_shufflehi_epi16(<2 x i64> %a0) {
|
|
|
|
; X32-LABEL: test_mm_shufflehi_epi16:
|
|
|
|
; X32: # BB#0:
|
|
|
|
; X32-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
|
|
|
|
; X32-NEXT: retl
|
|
|
|
;
|
|
|
|
; X64-LABEL: test_mm_shufflehi_epi16:
|
|
|
|
; X64: # BB#0:
|
|
|
|
; X64-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
|
|
|
|
; X64-NEXT: retq
|
|
|
|
%arg0 = bitcast <2 x i64> %a0 to <8 x i16>
|
|
|
|
%res = shufflevector <8 x i16> %arg0, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 4, i32 4>
|
|
|
|
%bc = bitcast <8 x i16> %res to <2 x i64>
|
|
|
|
ret <2 x i64> %bc
|
|
|
|
}
|
|
|
|
|
|
|
|
define <2 x i64> @test_mm_shufflelo_epi16(<2 x i64> %a0) {
|
|
|
|
; X32-LABEL: test_mm_shufflelo_epi16:
|
|
|
|
; X32: # BB#0:
|
|
|
|
; X32-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
|
|
|
|
; X32-NEXT: retl
|
|
|
|
;
|
|
|
|
; X64-LABEL: test_mm_shufflelo_epi16:
|
|
|
|
; X64: # BB#0:
|
|
|
|
; X64-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
|
|
|
|
; X64-NEXT: retq
|
|
|
|
%arg0 = bitcast <2 x i64> %a0 to <8 x i16>
|
|
|
|
%res = shufflevector <8 x i16> %arg0, <8 x i16> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 5, i32 6, i32 7>
|
|
|
|
%bc = bitcast <8 x i16> %res to <2 x i64>
|
|
|
|
ret <2 x i64> %bc
|
|
|
|
}
|
|
|
|
|
|
|
|
define <2 x i64> @test_mm_sll_epi16(<2 x i64> %a0, <2 x i64> %a1) {
|
|
|
|
; X32-LABEL: test_mm_sll_epi16:
|
|
|
|
; X32: # BB#0:
|
|
|
|
; X32-NEXT: psllw %xmm1, %xmm0
|
|
|
|
; X32-NEXT: retl
|
|
|
|
;
|
|
|
|
; X64-LABEL: test_mm_sll_epi16:
|
|
|
|
; X64: # BB#0:
|
|
|
|
; X64-NEXT: psllw %xmm1, %xmm0
|
|
|
|
; X64-NEXT: retq
|
|
|
|
%arg0 = bitcast <2 x i64> %a0 to <8 x i16>
|
|
|
|
%arg1 = bitcast <2 x i64> %a1 to <8 x i16>
|
|
|
|
%res = call <8 x i16> @llvm.x86.sse2.psll.w(<8 x i16> %arg0, <8 x i16> %arg1)
|
|
|
|
%bc = bitcast <8 x i16> %res to <2 x i64>
|
|
|
|
ret <2 x i64> %bc
|
|
|
|
}
|
|
|
|
declare <8 x i16> @llvm.x86.sse2.psll.w(<8 x i16>, <8 x i16>) nounwind readnone
|
|
|
|
|
|
|
|
define <2 x i64> @test_mm_sll_epi32(<2 x i64> %a0, <2 x i64> %a1) {
|
|
|
|
; X32-LABEL: test_mm_sll_epi32:
|
|
|
|
; X32: # BB#0:
|
|
|
|
; X32-NEXT: pslld %xmm1, %xmm0
|
|
|
|
; X32-NEXT: retl
|
|
|
|
;
|
|
|
|
; X64-LABEL: test_mm_sll_epi32:
|
|
|
|
; X64: # BB#0:
|
|
|
|
; X64-NEXT: pslld %xmm1, %xmm0
|
|
|
|
; X64-NEXT: retq
|
|
|
|
%arg0 = bitcast <2 x i64> %a0 to <4 x i32>
|
|
|
|
%arg1 = bitcast <2 x i64> %a1 to <4 x i32>
|
|
|
|
%res = call <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32> %arg0, <4 x i32> %arg1)
|
|
|
|
%bc = bitcast <4 x i32> %res to <2 x i64>
|
|
|
|
ret <2 x i64> %bc
|
|
|
|
}
|
|
|
|
declare <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32>, <4 x i32>) nounwind readnone
|
|
|
|
|
|
|
|
define <2 x i64> @test_mm_sll_epi64(<2 x i64> %a0, <2 x i64> %a1) {
|
|
|
|
; X32-LABEL: test_mm_sll_epi64:
|
|
|
|
; X32: # BB#0:
|
|
|
|
; X32-NEXT: psllq %xmm1, %xmm0
|
|
|
|
; X32-NEXT: retl
|
|
|
|
;
|
|
|
|
; X64-LABEL: test_mm_sll_epi64:
|
|
|
|
; X64: # BB#0:
|
|
|
|
; X64-NEXT: psllq %xmm1, %xmm0
|
|
|
|
; X64-NEXT: retq
|
|
|
|
%res = call <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64> %a0, <2 x i64> %a1)
|
|
|
|
ret <2 x i64> %res
|
|
|
|
}
|
|
|
|
declare <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64>, <2 x i64>) nounwind readnone
|
|
|
|
|
|
|
|
define <2 x i64> @test_mm_slli_epi16(<2 x i64> %a0) {
|
|
|
|
; X32-LABEL: test_mm_slli_epi16:
|
|
|
|
; X32: # BB#0:
|
|
|
|
; X32-NEXT: psllw $1, %xmm0
|
|
|
|
; X32-NEXT: retl
|
|
|
|
;
|
|
|
|
; X64-LABEL: test_mm_slli_epi16:
|
|
|
|
; X64: # BB#0:
|
|
|
|
; X64-NEXT: psllw $1, %xmm0
|
|
|
|
; X64-NEXT: retq
|
|
|
|
%arg0 = bitcast <2 x i64> %a0 to <8 x i16>
|
|
|
|
%res = call <8 x i16> @llvm.x86.sse2.pslli.w(<8 x i16> %arg0, i32 1)
|
|
|
|
%bc = bitcast <8 x i16> %res to <2 x i64>
|
|
|
|
ret <2 x i64> %bc
|
|
|
|
}
|
|
|
|
declare <8 x i16> @llvm.x86.sse2.pslli.w(<8 x i16>, i32) nounwind readnone
|
|
|
|
|
|
|
|
define <2 x i64> @test_mm_slli_epi32(<2 x i64> %a0) {
|
|
|
|
; X32-LABEL: test_mm_slli_epi32:
|
|
|
|
; X32: # BB#0:
|
|
|
|
; X32-NEXT: pslld $1, %xmm0
|
|
|
|
; X32-NEXT: retl
|
|
|
|
;
|
|
|
|
; X64-LABEL: test_mm_slli_epi32:
|
|
|
|
; X64: # BB#0:
|
|
|
|
; X64-NEXT: pslld $1, %xmm0
|
|
|
|
; X64-NEXT: retq
|
|
|
|
%arg0 = bitcast <2 x i64> %a0 to <4 x i32>
|
|
|
|
%res = call <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32> %arg0, i32 1)
|
|
|
|
%bc = bitcast <4 x i32> %res to <2 x i64>
|
|
|
|
ret <2 x i64> %bc
|
|
|
|
}
|
|
|
|
declare <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32>, i32) nounwind readnone
|
|
|
|
|
|
|
|
define <2 x i64> @test_mm_slli_epi64(<2 x i64> %a0) {
|
|
|
|
; X32-LABEL: test_mm_slli_epi64:
|
|
|
|
; X32: # BB#0:
|
|
|
|
; X32-NEXT: psllq $1, %xmm0
|
|
|
|
; X32-NEXT: retl
|
|
|
|
;
|
|
|
|
; X64-LABEL: test_mm_slli_epi64:
|
|
|
|
; X64: # BB#0:
|
|
|
|
; X64-NEXT: psllq $1, %xmm0
|
|
|
|
; X64-NEXT: retq
|
|
|
|
%res = call <2 x i64> @llvm.x86.sse2.pslli.q(<2 x i64> %a0, i32 1)
|
|
|
|
ret <2 x i64> %res
|
|
|
|
}
|
|
|
|
declare <2 x i64> @llvm.x86.sse2.pslli.q(<2 x i64>, i32) nounwind readnone
|
|
|
|
|
|
|
|
define <2 x i64> @test_mm_slli_si128(<2 x i64> %a0) nounwind {
|
|
|
|
; X32-LABEL: test_mm_slli_si128:
|
|
|
|
; X32: # BB#0:
|
|
|
|
; X32-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10]
|
|
|
|
; X32-NEXT: retl
|
|
|
|
;
|
|
|
|
; X64-LABEL: test_mm_slli_si128:
|
|
|
|
; X64: # BB#0:
|
|
|
|
; X64-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10]
|
|
|
|
; X64-NEXT: retq
|
|
|
|
%arg0 = bitcast <2 x i64> %a0 to <16 x i8>
|
|
|
|
%res = shufflevector <16 x i8> zeroinitializer, <16 x i8> %arg0, <16 x i32> <i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26>
|
|
|
|
%bc = bitcast <16 x i8> %res to <2 x i64>
|
|
|
|
ret <2 x i64> %bc
|
|
|
|
}
|
|
|
|
|
|
|
|
define <2 x double> @test_mm_sqrt_pd(<2 x double> %a0) nounwind {
|
|
|
|
; X32-LABEL: test_mm_sqrt_pd:
|
|
|
|
; X32: # BB#0:
|
|
|
|
; X32-NEXT: sqrtpd %xmm0, %xmm0
|
|
|
|
; X32-NEXT: retl
|
|
|
|
;
|
|
|
|
; X64-LABEL: test_mm_sqrt_pd:
|
|
|
|
; X64: # BB#0:
|
|
|
|
; X64-NEXT: sqrtpd %xmm0, %xmm0
|
|
|
|
; X64-NEXT: retq
|
|
|
|
%res = call <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double> %a0)
|
|
|
|
ret <2 x double> %res
|
|
|
|
}
|
|
|
|
declare <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double>) nounwind readnone
|
|
|
|
|
|
|
|
define <2 x double> @test_mm_sqrt_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
|
|
|
|
; X32-LABEL: test_mm_sqrt_sd:
|
|
|
|
; X32: # BB#0:
|
|
|
|
; X32-NEXT: sqrtsd %xmm0, %xmm1
|
2017-02-26 14:45:35 +08:00
|
|
|
; X32-NEXT: movapd %xmm1, %xmm0
|
2016-05-19 02:00:43 +08:00
|
|
|
; X32-NEXT: retl
|
|
|
|
;
|
|
|
|
; X64-LABEL: test_mm_sqrt_sd:
|
|
|
|
; X64: # BB#0:
|
|
|
|
; X64-NEXT: sqrtsd %xmm0, %xmm1
|
2017-02-26 14:45:35 +08:00
|
|
|
; X64-NEXT: movapd %xmm1, %xmm0
|
2016-05-19 02:00:43 +08:00
|
|
|
; X64-NEXT: retq
|
|
|
|
%call = call <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double> %a0)
|
|
|
|
%ext0 = extractelement <2 x double> %call, i32 0
|
|
|
|
%ins0 = insertelement <2 x double> undef, double %ext0, i32 0
|
|
|
|
%ext1 = extractelement <2 x double> %a1, i32 1
|
|
|
|
%ins1 = insertelement <2 x double> %ins0, double %ext1, i32 1
|
|
|
|
ret <2 x double> %ins1
|
|
|
|
}
|
|
|
|
declare <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double>) nounwind readnone
|
|
|
|
|
|
|
|
define <2 x i64> @test_mm_sra_epi16(<2 x i64> %a0, <2 x i64> %a1) {
|
|
|
|
; X32-LABEL: test_mm_sra_epi16:
|
|
|
|
; X32: # BB#0:
|
|
|
|
; X32-NEXT: psraw %xmm1, %xmm0
|
|
|
|
; X32-NEXT: retl
|
|
|
|
;
|
|
|
|
; X64-LABEL: test_mm_sra_epi16:
|
|
|
|
; X64: # BB#0:
|
|
|
|
; X64-NEXT: psraw %xmm1, %xmm0
|
|
|
|
; X64-NEXT: retq
|
|
|
|
%arg0 = bitcast <2 x i64> %a0 to <8 x i16>
|
|
|
|
%arg1 = bitcast <2 x i64> %a1 to <8 x i16>
|
|
|
|
%res = call <8 x i16> @llvm.x86.sse2.psra.w(<8 x i16> %arg0, <8 x i16> %arg1)
|
|
|
|
%bc = bitcast <8 x i16> %res to <2 x i64>
|
|
|
|
ret <2 x i64> %bc
|
|
|
|
}
|
|
|
|
declare <8 x i16> @llvm.x86.sse2.psra.w(<8 x i16>, <8 x i16>) nounwind readnone
|
|
|
|
|
|
|
|
define <2 x i64> @test_mm_sra_epi32(<2 x i64> %a0, <2 x i64> %a1) {
|
|
|
|
; X32-LABEL: test_mm_sra_epi32:
|
|
|
|
; X32: # BB#0:
|
|
|
|
; X32-NEXT: psrad %xmm1, %xmm0
|
|
|
|
; X32-NEXT: retl
|
|
|
|
;
|
|
|
|
; X64-LABEL: test_mm_sra_epi32:
|
|
|
|
; X64: # BB#0:
|
|
|
|
; X64-NEXT: psrad %xmm1, %xmm0
|
|
|
|
; X64-NEXT: retq
|
|
|
|
%arg0 = bitcast <2 x i64> %a0 to <4 x i32>
|
|
|
|
%arg1 = bitcast <2 x i64> %a1 to <4 x i32>
|
|
|
|
%res = call <4 x i32> @llvm.x86.sse2.psra.d(<4 x i32> %arg0, <4 x i32> %arg1)
|
|
|
|
%bc = bitcast <4 x i32> %res to <2 x i64>
|
|
|
|
ret <2 x i64> %bc
|
|
|
|
}
|
|
|
|
declare <4 x i32> @llvm.x86.sse2.psra.d(<4 x i32>, <4 x i32>) nounwind readnone
|
|
|
|
|
|
|
|
define <2 x i64> @test_mm_srai_epi16(<2 x i64> %a0) {
|
|
|
|
; X32-LABEL: test_mm_srai_epi16:
|
|
|
|
; X32: # BB#0:
|
|
|
|
; X32-NEXT: psraw $1, %xmm0
|
|
|
|
; X32-NEXT: retl
|
|
|
|
;
|
|
|
|
; X64-LABEL: test_mm_srai_epi16:
|
|
|
|
; X64: # BB#0:
|
|
|
|
; X64-NEXT: psraw $1, %xmm0
|
|
|
|
; X64-NEXT: retq
|
|
|
|
%arg0 = bitcast <2 x i64> %a0 to <8 x i16>
|
|
|
|
%res = call <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16> %arg0, i32 1)
|
|
|
|
%bc = bitcast <8 x i16> %res to <2 x i64>
|
|
|
|
ret <2 x i64> %bc
|
|
|
|
}
|
|
|
|
declare <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16>, i32) nounwind readnone
|
|
|
|
|
|
|
|
define <2 x i64> @test_mm_srai_epi32(<2 x i64> %a0) {
|
|
|
|
; X32-LABEL: test_mm_srai_epi32:
|
|
|
|
; X32: # BB#0:
|
|
|
|
; X32-NEXT: psrad $1, %xmm0
|
|
|
|
; X32-NEXT: retl
|
|
|
|
;
|
|
|
|
; X64-LABEL: test_mm_srai_epi32:
|
|
|
|
; X64: # BB#0:
|
|
|
|
; X64-NEXT: psrad $1, %xmm0
|
|
|
|
; X64-NEXT: retq
|
|
|
|
%arg0 = bitcast <2 x i64> %a0 to <4 x i32>
|
|
|
|
%res = call <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32> %arg0, i32 1)
|
|
|
|
%bc = bitcast <4 x i32> %res to <2 x i64>
|
|
|
|
ret <2 x i64> %bc
|
|
|
|
}
|
|
|
|
declare <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32>, i32) nounwind readnone
|
|
|
|
|
|
|
|
define <2 x i64> @test_mm_srl_epi16(<2 x i64> %a0, <2 x i64> %a1) {
|
|
|
|
; X32-LABEL: test_mm_srl_epi16:
|
|
|
|
; X32: # BB#0:
|
|
|
|
; X32-NEXT: psrlw %xmm1, %xmm0
|
|
|
|
; X32-NEXT: retl
|
|
|
|
;
|
|
|
|
; X64-LABEL: test_mm_srl_epi16:
|
|
|
|
; X64: # BB#0:
|
|
|
|
; X64-NEXT: psrlw %xmm1, %xmm0
|
|
|
|
; X64-NEXT: retq
|
|
|
|
%arg0 = bitcast <2 x i64> %a0 to <8 x i16>
|
|
|
|
%arg1 = bitcast <2 x i64> %a1 to <8 x i16>
|
|
|
|
%res = call <8 x i16> @llvm.x86.sse2.psrl.w(<8 x i16> %arg0, <8 x i16> %arg1)
|
|
|
|
%bc = bitcast <8 x i16> %res to <2 x i64>
|
|
|
|
ret <2 x i64> %bc
|
|
|
|
}
|
|
|
|
declare <8 x i16> @llvm.x86.sse2.psrl.w(<8 x i16>, <8 x i16>) nounwind readnone
|
|
|
|
|
|
|
|
define <2 x i64> @test_mm_srl_epi32(<2 x i64> %a0, <2 x i64> %a1) {
|
|
|
|
; X32-LABEL: test_mm_srl_epi32:
|
|
|
|
; X32: # BB#0:
|
|
|
|
; X32-NEXT: psrld %xmm1, %xmm0
|
|
|
|
; X32-NEXT: retl
|
|
|
|
;
|
|
|
|
; X64-LABEL: test_mm_srl_epi32:
|
|
|
|
; X64: # BB#0:
|
|
|
|
; X64-NEXT: psrld %xmm1, %xmm0
|
|
|
|
; X64-NEXT: retq
|
|
|
|
%arg0 = bitcast <2 x i64> %a0 to <4 x i32>
|
|
|
|
%arg1 = bitcast <2 x i64> %a1 to <4 x i32>
|
|
|
|
%res = call <4 x i32> @llvm.x86.sse2.psrl.d(<4 x i32> %arg0, <4 x i32> %arg1)
|
|
|
|
%bc = bitcast <4 x i32> %res to <2 x i64>
|
|
|
|
ret <2 x i64> %bc
|
|
|
|
}
|
|
|
|
declare <4 x i32> @llvm.x86.sse2.psrl.d(<4 x i32>, <4 x i32>) nounwind readnone
|
|
|
|
|
|
|
|
define <2 x i64> @test_mm_srl_epi64(<2 x i64> %a0, <2 x i64> %a1) {
|
|
|
|
; X32-LABEL: test_mm_srl_epi64:
|
|
|
|
; X32: # BB#0:
|
|
|
|
; X32-NEXT: psrlq %xmm1, %xmm0
|
|
|
|
; X32-NEXT: retl
|
|
|
|
;
|
|
|
|
; X64-LABEL: test_mm_srl_epi64:
|
|
|
|
; X64: # BB#0:
|
|
|
|
; X64-NEXT: psrlq %xmm1, %xmm0
|
|
|
|
; X64-NEXT: retq
|
|
|
|
%res = call <2 x i64> @llvm.x86.sse2.psrl.q(<2 x i64> %a0, <2 x i64> %a1)
|
|
|
|
ret <2 x i64> %res
|
|
|
|
}
|
|
|
|
declare <2 x i64> @llvm.x86.sse2.psrl.q(<2 x i64>, <2 x i64>) nounwind readnone
|
|
|
|
|
|
|
|
define <2 x i64> @test_mm_srli_epi16(<2 x i64> %a0) {
|
|
|
|
; X32-LABEL: test_mm_srli_epi16:
|
|
|
|
; X32: # BB#0:
|
|
|
|
; X32-NEXT: psrlw $1, %xmm0
|
|
|
|
; X32-NEXT: retl
|
|
|
|
;
|
|
|
|
; X64-LABEL: test_mm_srli_epi16:
|
|
|
|
; X64: # BB#0:
|
|
|
|
; X64-NEXT: psrlw $1, %xmm0
|
|
|
|
; X64-NEXT: retq
|
|
|
|
%arg0 = bitcast <2 x i64> %a0 to <8 x i16>
|
|
|
|
%res = call <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16> %arg0, i32 1)
|
|
|
|
%bc = bitcast <8 x i16> %res to <2 x i64>
|
|
|
|
ret <2 x i64> %bc
|
|
|
|
}
|
|
|
|
declare <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16>, i32) nounwind readnone
|
|
|
|
|
|
|
|
define <2 x i64> @test_mm_srli_epi32(<2 x i64> %a0) {
|
|
|
|
; X32-LABEL: test_mm_srli_epi32:
|
|
|
|
; X32: # BB#0:
|
|
|
|
; X32-NEXT: psrld $1, %xmm0
|
|
|
|
; X32-NEXT: retl
|
|
|
|
;
|
|
|
|
; X64-LABEL: test_mm_srli_epi32:
|
|
|
|
; X64: # BB#0:
|
|
|
|
; X64-NEXT: psrld $1, %xmm0
|
|
|
|
; X64-NEXT: retq
|
|
|
|
%arg0 = bitcast <2 x i64> %a0 to <4 x i32>
|
|
|
|
%res = call <4 x i32> @llvm.x86.sse2.psrli.d(<4 x i32> %arg0, i32 1)
|
|
|
|
%bc = bitcast <4 x i32> %res to <2 x i64>
|
|
|
|
ret <2 x i64> %bc
|
|
|
|
}
|
|
|
|
declare <4 x i32> @llvm.x86.sse2.psrli.d(<4 x i32>, i32) nounwind readnone
|
|
|
|
|
|
|
|
define <2 x i64> @test_mm_srli_epi64(<2 x i64> %a0) {
|
|
|
|
; X32-LABEL: test_mm_srli_epi64:
|
|
|
|
; X32: # BB#0:
|
|
|
|
; X32-NEXT: psrlq $1, %xmm0
|
|
|
|
; X32-NEXT: retl
|
|
|
|
;
|
|
|
|
; X64-LABEL: test_mm_srli_epi64:
|
|
|
|
; X64: # BB#0:
|
|
|
|
; X64-NEXT: psrlq $1, %xmm0
|
|
|
|
; X64-NEXT: retq
|
|
|
|
%res = call <2 x i64> @llvm.x86.sse2.psrli.q(<2 x i64> %a0, i32 1)
|
|
|
|
ret <2 x i64> %res
|
|
|
|
}
|
|
|
|
declare <2 x i64> @llvm.x86.sse2.psrli.q(<2 x i64>, i32) nounwind readnone
|
|
|
|
|
|
|
|
define <2 x i64> @test_mm_srli_si128(<2 x i64> %a0) nounwind {
|
|
|
|
; X32-LABEL: test_mm_srli_si128:
|
|
|
|
; X32: # BB#0:
|
|
|
|
; X32-NEXT: psrldq {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero
|
|
|
|
; X32-NEXT: retl
|
|
|
|
;
|
|
|
|
; X64-LABEL: test_mm_srli_si128:
|
|
|
|
; X64: # BB#0:
|
|
|
|
; X64-NEXT: psrldq {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero
|
|
|
|
; X64-NEXT: retq
|
|
|
|
%arg0 = bitcast <2 x i64> %a0 to <16 x i8>
|
|
|
|
%res = shufflevector <16 x i8> %arg0, <16 x i8> zeroinitializer, <16 x i32> <i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20>
|
|
|
|
%bc = bitcast <16 x i8> %res to <2 x i64>
|
|
|
|
ret <2 x i64> %bc
|
|
|
|
}
|
|
|
|
|
|
|
|
define void @test_mm_store_pd(double *%a0, <2 x double> %a1) {
|
|
|
|
; X32-LABEL: test_mm_store_pd:
|
|
|
|
; X32: # BB#0:
|
|
|
|
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
|
|
|
|
; X32-NEXT: movaps %xmm0, (%eax)
|
|
|
|
; X32-NEXT: retl
|
|
|
|
;
|
|
|
|
; X64-LABEL: test_mm_store_pd:
|
|
|
|
; X64: # BB#0:
|
|
|
|
; X64-NEXT: movaps %xmm0, (%rdi)
|
|
|
|
; X64-NEXT: retq
|
|
|
|
%arg0 = bitcast double* %a0 to <2 x double>*
|
|
|
|
store <2 x double> %a1, <2 x double>* %arg0, align 16
|
|
|
|
ret void
|
|
|
|
}
|
|
|
|
|
2016-05-31 02:18:44 +08:00
|
|
|
define void @test_mm_store_pd1(double *%a0, <2 x double> %a1) {
|
|
|
|
; X32-LABEL: test_mm_store_pd1:
|
|
|
|
; X32: # BB#0:
|
|
|
|
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
|
|
|
|
; X32-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0,0]
|
|
|
|
; X32-NEXT: movaps %xmm0, (%eax)
|
|
|
|
; X32-NEXT: retl
|
|
|
|
;
|
|
|
|
; X64-LABEL: test_mm_store_pd1:
|
|
|
|
; X64: # BB#0:
|
|
|
|
; X64-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0,0]
|
|
|
|
; X64-NEXT: movaps %xmm0, (%rdi)
|
|
|
|
; X64-NEXT: retq
|
|
|
|
%arg0 = bitcast double * %a0 to <2 x double>*
|
|
|
|
%shuf = shufflevector <2 x double> %a1, <2 x double> undef, <2 x i32> zeroinitializer
|
|
|
|
store <2 x double> %shuf, <2 x double>* %arg0, align 16
|
|
|
|
ret void
|
|
|
|
}
|
|
|
|
|
2016-05-19 02:00:43 +08:00
|
|
|
; Extracts element 0 of %a1 and stores it as a scalar double with align 1.
; Expected code: a single movsd to memory.
define void @test_mm_store_sd(double *%a0, <2 x double> %a1) {
; X32-LABEL: test_mm_store_sd:
; X32: # BB#0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movsd %xmm0, (%eax)
; X32-NEXT: retl
;
; X64-LABEL: test_mm_store_sd:
; X64: # BB#0:
; X64-NEXT: movsd %xmm0, (%rdi)
; X64-NEXT: retq
  %ext = extractelement <2 x double> %a1, i32 0
  store double %ext, double* %a0, align 1
  ret void
}
|
|
|
|
|
|
|
|
; Aligned (align 16) store of a <2 x i64> value. Expected code: movaps.
define void @test_mm_store_si128(<2 x i64> *%a0, <2 x i64> %a1) {
; X32-LABEL: test_mm_store_si128:
; X32: # BB#0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movaps %xmm0, (%eax)
; X32-NEXT: retl
;
; X64-LABEL: test_mm_store_si128:
; X64: # BB#0:
; X64-NEXT: movaps %xmm0, (%rdi)
; X64-NEXT: retq
  store <2 x i64> %a1, <2 x i64>* %a0, align 16
  ret void
}
|
|
|
|
|
2016-05-25 17:42:29 +08:00
|
|
|
; Splats element 0 of %a1 (all-zero shuffle mask) and stores the result with
; align 16. Expected code: movlhps followed by movaps.
define void @test_mm_store1_pd(double *%a0, <2 x double> %a1) {
; X32-LABEL: test_mm_store1_pd:
; X32: # BB#0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0,0]
; X32-NEXT: movaps %xmm0, (%eax)
; X32-NEXT: retl
;
; X64-LABEL: test_mm_store1_pd:
; X64: # BB#0:
; X64-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0,0]
; X64-NEXT: movaps %xmm0, (%rdi)
; X64-NEXT: retq
  %arg0 = bitcast double * %a0 to <2 x double>*
  %shuf = shufflevector <2 x double> %a1, <2 x double> undef, <2 x i32> zeroinitializer
  store <2 x double> %shuf, <2 x double>* %arg0, align 16
  ret void
}
|
|
|
|
|
|
|
|
; Extracts element 1 (the high double) of %a1 and stores it with align 8.
; Expected code: movhlps moves the high element into the low lane, then movsd.
; NOTE(review): the name ends in _sd; the matching SSE2 intrinsic appears to be
; _mm_storeh_pd - confirm before renaming, since the LABEL checks use this name.
define void @test_mm_storeh_sd(double *%a0, <2 x double> %a1) {
; X32-LABEL: test_mm_storeh_sd:
; X32: # BB#0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
; X32-NEXT: movsd %xmm0, (%eax)
; X32-NEXT: retl
;
; X64-LABEL: test_mm_storeh_sd:
; X64: # BB#0:
; X64-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
; X64-NEXT: movsd %xmm0, (%rdi)
; X64-NEXT: retq
  %ext = extractelement <2 x double> %a1, i32 1
  store double %ext, double* %a0, align 8
  ret void
}
|
|
|
|
|
|
|
|
; Extracts element 0 of %a1 and stores it through %a0 reinterpreted as i64*
; (align 8). The i386 run expects movlps straight to memory; the x86-64 run
; expects the value to go through %rax (movq xmm -> gpr, movq gpr -> memory).
define void @test_mm_storel_epi64(<2 x i64> *%a0, <2 x i64> %a1) {
; X32-LABEL: test_mm_storel_epi64:
; X32: # BB#0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movlps %xmm0, (%eax)
; X32-NEXT: retl
;
; X64-LABEL: test_mm_storel_epi64:
; X64: # BB#0:
; X64-NEXT: movq %xmm0, %rax
; X64-NEXT: movq %rax, (%rdi)
; X64-NEXT: retq
  %ext = extractelement <2 x i64> %a1, i32 0
  %bc = bitcast <2 x i64> *%a0 to i64*
  store i64 %ext, i64* %bc, align 8
  ret void
}
|
|
|
|
|
|
|
|
; Extracts element 0 (the low double) of %a1 and stores it with align 8.
; Expected code: a single movsd to memory.
; NOTE(review): the name ends in _sd; the matching SSE2 intrinsic appears to be
; _mm_storel_pd - confirm before renaming, since the LABEL checks use this name.
define void @test_mm_storel_sd(double *%a0, <2 x double> %a1) {
; X32-LABEL: test_mm_storel_sd:
; X32: # BB#0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movsd %xmm0, (%eax)
; X32-NEXT: retl
;
; X64-LABEL: test_mm_storel_sd:
; X64: # BB#0:
; X64-NEXT: movsd %xmm0, (%rdi)
; X64-NEXT: retq
  %ext = extractelement <2 x double> %a1, i32 0
  store double %ext, double* %a0, align 8
  ret void
}
|
|
|
|
|
|
|
|
; Swaps the two doubles of %a1 (shuffle mask <1, 0>) and stores the result with
; align 16. Expected code: shufpd for the swap, then movapd.
define void @test_mm_storer_pd(double *%a0, <2 x double> %a1) {
; X32-LABEL: test_mm_storer_pd:
; X32: # BB#0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1,0]
; X32-NEXT: movapd %xmm0, (%eax)
; X32-NEXT: retl
;
; X64-LABEL: test_mm_storer_pd:
; X64: # BB#0:
; X64-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1,0]
; X64-NEXT: movapd %xmm0, (%rdi)
; X64-NEXT: retq
  %arg0 = bitcast double* %a0 to <2 x double>*
  %shuf = shufflevector <2 x double> %a1, <2 x double> undef, <2 x i32> <i32 1, i32 0>
  store <2 x double> %shuf, <2 x double>* %arg0, align 16
  ret void
}
|
|
|
|
|
|
|
|
; Unaligned (align 1) store of %a1 through %a0 reinterpreted as a
; <2 x double>*. Expected code: movups.
define void @test_mm_storeu_pd(double *%a0, <2 x double> %a1) {
; X32-LABEL: test_mm_storeu_pd:
; X32: # BB#0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movups %xmm0, (%eax)
; X32-NEXT: retl
;
; X64-LABEL: test_mm_storeu_pd:
; X64: # BB#0:
; X64-NEXT: movups %xmm0, (%rdi)
; X64-NEXT: retq
  %arg0 = bitcast double* %a0 to <2 x double>*
  store <2 x double> %a1, <2 x double>* %arg0, align 1
  ret void
}
|
|
|
|
|
|
|
|
; Unaligned (align 1) store of a <2 x i64> value. Expected code: movups.
define void @test_mm_storeu_si128(<2 x i64> *%a0, <2 x i64> %a1) {
; X32-LABEL: test_mm_storeu_si128:
; X32: # BB#0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movups %xmm0, (%eax)
; X32-NEXT: retl
;
; X64-LABEL: test_mm_storeu_si128:
; X64: # BB#0:
; X64-NEXT: movups %xmm0, (%rdi)
; X64-NEXT: retq
  store <2 x i64> %a1, <2 x i64>* %a0, align 1
  ret void
}
|
|
|
|
|
|
|
|
; Aligned (align 16) <2 x double> store tagged with !nontemporal (node !0 is
; defined at the end of the file). Expected code: movntps.
define void @test_mm_stream_pd(double *%a0, <2 x double> %a1) {
; X32-LABEL: test_mm_stream_pd:
; X32: # BB#0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movntps %xmm0, (%eax)
; X32-NEXT: retl
;
; X64-LABEL: test_mm_stream_pd:
; X64: # BB#0:
; X64-NEXT: movntps %xmm0, (%rdi)
; X64-NEXT: retq
  %arg0 = bitcast double* %a0 to <2 x double>*
  store <2 x double> %a1, <2 x double>* %arg0, align 16, !nontemporal !0
  ret void
}
|
|
|
|
|
|
|
|
; Scalar i32 store (align 1) tagged with !nontemporal (node !0 is defined at
; the end of the file). Expected code: movntil.
define void @test_mm_stream_si32(i32 *%a0, i32 %a1) {
; X32-LABEL: test_mm_stream_si32:
; X32: # BB#0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: movntil %eax, (%ecx)
; X32-NEXT: retl
;
; X64-LABEL: test_mm_stream_si32:
; X64: # BB#0:
; X64-NEXT: movntil %esi, (%rdi)
; X64-NEXT: retq
  store i32 %a1, i32* %a0, align 1, !nontemporal !0
  ret void
}
|
|
|
|
|
|
|
|
; Aligned (align 16) <2 x i64> store tagged with !nontemporal (node !0 is
; defined at the end of the file). Expected code: movntps.
define void @test_mm_stream_si128(<2 x i64> *%a0, <2 x i64> %a1) {
; X32-LABEL: test_mm_stream_si128:
; X32: # BB#0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movntps %xmm0, (%eax)
; X32-NEXT: retl
;
; X64-LABEL: test_mm_stream_si128:
; X64: # BB#0:
; X64-NEXT: movntps %xmm0, (%rdi)
; X64-NEXT: retq
  store <2 x i64> %a1, <2 x i64>* %a0, align 16, !nontemporal !0
  ret void
}
|
|
|
|
|
|
|
|
; Subtracts the operands as <16 x i8> lanes (bitcast in, sub, bitcast back to
; <2 x i64>). Expected code: psubb.
define <2 x i64> @test_mm_sub_epi8(<2 x i64> %a0, <2 x i64> %a1) nounwind {
; X32-LABEL: test_mm_sub_epi8:
; X32: # BB#0:
; X32-NEXT: psubb %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_sub_epi8:
; X64: # BB#0:
; X64-NEXT: psubb %xmm1, %xmm0
; X64-NEXT: retq
  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
  %arg1 = bitcast <2 x i64> %a1 to <16 x i8>
  %res = sub <16 x i8> %arg0, %arg1
  %bc = bitcast <16 x i8> %res to <2 x i64>
  ret <2 x i64> %bc
}
|
|
|
|
|
|
|
|
; Subtracts the operands as <8 x i16> lanes (bitcast in, sub, bitcast back to
; <2 x i64>). Expected code: psubw.
define <2 x i64> @test_mm_sub_epi16(<2 x i64> %a0, <2 x i64> %a1) nounwind {
; X32-LABEL: test_mm_sub_epi16:
; X32: # BB#0:
; X32-NEXT: psubw %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_sub_epi16:
; X64: # BB#0:
; X64-NEXT: psubw %xmm1, %xmm0
; X64-NEXT: retq
  %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
  %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
  %res = sub <8 x i16> %arg0, %arg1
  %bc = bitcast <8 x i16> %res to <2 x i64>
  ret <2 x i64> %bc
}
|
|
|
|
|
|
|
|
; Subtracts the operands as <4 x i32> lanes (bitcast in, sub, bitcast back to
; <2 x i64>). Expected code: psubd.
define <2 x i64> @test_mm_sub_epi32(<2 x i64> %a0, <2 x i64> %a1) nounwind {
; X32-LABEL: test_mm_sub_epi32:
; X32: # BB#0:
; X32-NEXT: psubd %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_sub_epi32:
; X64: # BB#0:
; X64-NEXT: psubd %xmm1, %xmm0
; X64-NEXT: retq
  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
  %res = sub <4 x i32> %arg0, %arg1
  %bc = bitcast <4 x i32> %res to <2 x i64>
  ret <2 x i64> %bc
}
|
|
|
|
|
|
|
|
; Plain <2 x i64> subtraction, no bitcasts needed. Expected code: psubq.
define <2 x i64> @test_mm_sub_epi64(<2 x i64> %a0, <2 x i64> %a1) nounwind {
; X32-LABEL: test_mm_sub_epi64:
; X32: # BB#0:
; X32-NEXT: psubq %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_sub_epi64:
; X64: # BB#0:
; X64-NEXT: psubq %xmm1, %xmm0
; X64-NEXT: retq
  %res = sub <2 x i64> %a0, %a1
  ret <2 x i64> %res
}
|
|
|
|
|
|
|
|
; Packed <2 x double> fsub. Expected code: subpd.
define <2 x double> @test_mm_sub_pd(<2 x double> %a0, <2 x double> %a1) nounwind {
; X32-LABEL: test_mm_sub_pd:
; X32: # BB#0:
; X32-NEXT: subpd %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_sub_pd:
; X64: # BB#0:
; X64-NEXT: subpd %xmm1, %xmm0
; X64-NEXT: retq
  %res = fsub <2 x double> %a0, %a1
  ret <2 x double> %res
}
|
|
|
|
|
|
|
|
; Scalar fsub on element 0 of both operands; the difference is inserted back
; into element 0 of %a0, so element 1 of %a0 passes through. Expected: subsd.
define <2 x double> @test_mm_sub_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
; X32-LABEL: test_mm_sub_sd:
; X32: # BB#0:
; X32-NEXT: subsd %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_sub_sd:
; X64: # BB#0:
; X64-NEXT: subsd %xmm1, %xmm0
; X64-NEXT: retq
  %ext0 = extractelement <2 x double> %a0, i32 0
  %ext1 = extractelement <2 x double> %a1, i32 0
  %fsub = fsub double %ext0, %ext1
  %res = insertelement <2 x double> %a0, double %fsub, i32 0
  ret <2 x double> %res
}
|
|
|
|
|
|
|
|
; Calls @llvm.x86.sse2.psubs.b on the operands viewed as <16 x i8> and bitcasts
; the result back to <2 x i64>. Expected code: psubsb.
define <2 x i64> @test_mm_subs_epi8(<2 x i64> %a0, <2 x i64> %a1) nounwind {
; X32-LABEL: test_mm_subs_epi8:
; X32: # BB#0:
; X32-NEXT: psubsb %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_subs_epi8:
; X64: # BB#0:
; X64-NEXT: psubsb %xmm1, %xmm0
; X64-NEXT: retq
  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
  %arg1 = bitcast <2 x i64> %a1 to <16 x i8>
  %res = call <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8> %arg0, <16 x i8> %arg1)
  %bc = bitcast <16 x i8> %res to <2 x i64>
  ret <2 x i64> %bc
}
declare <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8>, <16 x i8>) nounwind readnone
|
|
|
|
|
|
|
|
; Calls @llvm.x86.sse2.psubs.w on the operands viewed as <8 x i16> and bitcasts
; the result back to <2 x i64>. Expected code: psubsw.
define <2 x i64> @test_mm_subs_epi16(<2 x i64> %a0, <2 x i64> %a1) nounwind {
; X32-LABEL: test_mm_subs_epi16:
; X32: # BB#0:
; X32-NEXT: psubsw %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_subs_epi16:
; X64: # BB#0:
; X64-NEXT: psubsw %xmm1, %xmm0
; X64-NEXT: retq
  %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
  %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
  %res = call <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16> %arg0, <8 x i16> %arg1)
  %bc = bitcast <8 x i16> %res to <2 x i64>
  ret <2 x i64> %bc
}
declare <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16>, <8 x i16>) nounwind readnone
|
|
|
|
|
|
|
|
; Calls @llvm.x86.sse2.psubus.b on the operands viewed as <16 x i8> and
; bitcasts the result back to <2 x i64>. Expected code: psubusb.
define <2 x i64> @test_mm_subs_epu8(<2 x i64> %a0, <2 x i64> %a1) nounwind {
; X32-LABEL: test_mm_subs_epu8:
; X32: # BB#0:
; X32-NEXT: psubusb %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_subs_epu8:
; X64: # BB#0:
; X64-NEXT: psubusb %xmm1, %xmm0
; X64-NEXT: retq
  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
  %arg1 = bitcast <2 x i64> %a1 to <16 x i8>
  %res = call <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8> %arg0, <16 x i8> %arg1)
  %bc = bitcast <16 x i8> %res to <2 x i64>
  ret <2 x i64> %bc
}
declare <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8>, <16 x i8>) nounwind readnone
|
|
|
|
|
|
|
|
; Calls @llvm.x86.sse2.psubus.w on the operands viewed as <8 x i16> and
; bitcasts the result back to <2 x i64>. Expected code: psubusw.
define <2 x i64> @test_mm_subs_epu16(<2 x i64> %a0, <2 x i64> %a1) nounwind {
; X32-LABEL: test_mm_subs_epu16:
; X32: # BB#0:
; X32-NEXT: psubusw %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_subs_epu16:
; X64: # BB#0:
; X64-NEXT: psubusw %xmm1, %xmm0
; X64-NEXT: retq
  %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
  %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
  %res = call <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16> %arg0, <8 x i16> %arg1)
  %bc = bitcast <8 x i16> %res to <2 x i64>
  ret <2 x i64> %bc
}
declare <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16>, <8 x i16>) nounwind readnone
|
|
|
|
|
|
|
|
; Calls @llvm.x86.sse2.ucomieq.sd and returns its i32 result. Expected code:
; ucomisd, then setnp and sete combined with andb and zero-extended into %eax.
define i32 @test_mm_ucomieq_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
; X32-LABEL: test_mm_ucomieq_sd:
; X32: # BB#0:
; X32-NEXT: ucomisd %xmm1, %xmm0
; X32-NEXT: setnp %al
; X32-NEXT: sete %cl
; X32-NEXT: andb %al, %cl
; X32-NEXT: movzbl %cl, %eax
; X32-NEXT: retl
;
; X64-LABEL: test_mm_ucomieq_sd:
; X64: # BB#0:
; X64-NEXT: ucomisd %xmm1, %xmm0
; X64-NEXT: setnp %al
; X64-NEXT: sete %cl
; X64-NEXT: andb %al, %cl
; X64-NEXT: movzbl %cl, %eax
; X64-NEXT: retq
  %res = call i32 @llvm.x86.sse2.ucomieq.sd(<2 x double> %a0, <2 x double> %a1)
  ret i32 %res
}
declare i32 @llvm.x86.sse2.ucomieq.sd(<2 x double>, <2 x double>) nounwind readnone
|
|
|
|
|
|
|
|
; Calls @llvm.x86.sse2.ucomige.sd and returns its i32 result. Expected code:
; %eax is zeroed first, then ucomisd %xmm1, %xmm0 and setae %al.
define i32 @test_mm_ucomige_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
; X32-LABEL: test_mm_ucomige_sd:
; X32: # BB#0:
; X32-NEXT: xorl %eax, %eax
; X32-NEXT: ucomisd %xmm1, %xmm0
; X32-NEXT: setae %al
; X32-NEXT: retl
;
; X64-LABEL: test_mm_ucomige_sd:
; X64: # BB#0:
; X64-NEXT: xorl %eax, %eax
; X64-NEXT: ucomisd %xmm1, %xmm0
; X64-NEXT: setae %al
; X64-NEXT: retq
  %res = call i32 @llvm.x86.sse2.ucomige.sd(<2 x double> %a0, <2 x double> %a1)
  ret i32 %res
}
declare i32 @llvm.x86.sse2.ucomige.sd(<2 x double>, <2 x double>) nounwind readnone
|
|
|
|
|
|
|
|
; Calls @llvm.x86.sse2.ucomigt.sd and returns its i32 result. Expected code:
; %eax is zeroed first, then ucomisd %xmm1, %xmm0 and seta %al.
define i32 @test_mm_ucomigt_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
; X32-LABEL: test_mm_ucomigt_sd:
; X32: # BB#0:
; X32-NEXT: xorl %eax, %eax
; X32-NEXT: ucomisd %xmm1, %xmm0
; X32-NEXT: seta %al
; X32-NEXT: retl
;
; X64-LABEL: test_mm_ucomigt_sd:
; X64: # BB#0:
; X64-NEXT: xorl %eax, %eax
; X64-NEXT: ucomisd %xmm1, %xmm0
; X64-NEXT: seta %al
; X64-NEXT: retq
  %res = call i32 @llvm.x86.sse2.ucomigt.sd(<2 x double> %a0, <2 x double> %a1)
  ret i32 %res
}
declare i32 @llvm.x86.sse2.ucomigt.sd(<2 x double>, <2 x double>) nounwind readnone
|
|
|
|
|
|
|
|
; Calls @llvm.x86.sse2.ucomile.sd and returns its i32 result. Expected code
; compares with the operands swapped (ucomisd %xmm0, %xmm1) and uses setae.
define i32 @test_mm_ucomile_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
; X32-LABEL: test_mm_ucomile_sd:
; X32: # BB#0:
; X32-NEXT: xorl %eax, %eax
; X32-NEXT: ucomisd %xmm0, %xmm1
; X32-NEXT: setae %al
; X32-NEXT: retl
;
; X64-LABEL: test_mm_ucomile_sd:
; X64: # BB#0:
; X64-NEXT: xorl %eax, %eax
; X64-NEXT: ucomisd %xmm0, %xmm1
; X64-NEXT: setae %al
; X64-NEXT: retq
  %res = call i32 @llvm.x86.sse2.ucomile.sd(<2 x double> %a0, <2 x double> %a1)
  ret i32 %res
}
declare i32 @llvm.x86.sse2.ucomile.sd(<2 x double>, <2 x double>) nounwind readnone
|
|
|
|
|
|
|
|
; Calls @llvm.x86.sse2.ucomilt.sd and returns its i32 result. Expected code
; compares with the operands swapped (ucomisd %xmm0, %xmm1) and uses seta.
define i32 @test_mm_ucomilt_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
; X32-LABEL: test_mm_ucomilt_sd:
; X32: # BB#0:
; X32-NEXT: xorl %eax, %eax
; X32-NEXT: ucomisd %xmm0, %xmm1
; X32-NEXT: seta %al
; X32-NEXT: retl
;
; X64-LABEL: test_mm_ucomilt_sd:
; X64: # BB#0:
; X64-NEXT: xorl %eax, %eax
; X64-NEXT: ucomisd %xmm0, %xmm1
; X64-NEXT: seta %al
; X64-NEXT: retq
  %res = call i32 @llvm.x86.sse2.ucomilt.sd(<2 x double> %a0, <2 x double> %a1)
  ret i32 %res
}
declare i32 @llvm.x86.sse2.ucomilt.sd(<2 x double>, <2 x double>) nounwind readnone
|
|
|
|
|
|
|
|
; Calls @llvm.x86.sse2.ucomineq.sd and returns its i32 result. Expected code:
; ucomisd, then setp and setne combined with orb and zero-extended into %eax.
define i32 @test_mm_ucomineq_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
; X32-LABEL: test_mm_ucomineq_sd:
; X32: # BB#0:
; X32-NEXT: ucomisd %xmm1, %xmm0
; X32-NEXT: setp %al
; X32-NEXT: setne %cl
; X32-NEXT: orb %al, %cl
; X32-NEXT: movzbl %cl, %eax
; X32-NEXT: retl
;
; X64-LABEL: test_mm_ucomineq_sd:
; X64: # BB#0:
; X64-NEXT: ucomisd %xmm1, %xmm0
; X64-NEXT: setp %al
; X64-NEXT: setne %cl
; X64-NEXT: orb %al, %cl
; X64-NEXT: movzbl %cl, %eax
; X64-NEXT: retq
  %res = call i32 @llvm.x86.sse2.ucomineq.sd(<2 x double> %a0, <2 x double> %a1)
  ret i32 %res
}
declare i32 @llvm.x86.sse2.ucomineq.sd(<2 x double>, <2 x double>) nounwind readnone
|
|
|
|
|
|
|
|
; Returns an undef <2 x double>; the checks expect nothing but the return
; instruction on both targets.
define <2 x double> @test_mm_undefined_pd() {
; X32-LABEL: test_mm_undefined_pd:
; X32: # BB#0:
; X32-NEXT: retl
;
; X64-LABEL: test_mm_undefined_pd:
; X64: # BB#0:
; X64-NEXT: retq
  ret <2 x double> undef
}
|
|
|
|
|
|
|
|
; Returns an undef <2 x i64>; the checks expect nothing but the return
; instruction on both targets.
define <2 x i64> @test_mm_undefined_si128() {
; X32-LABEL: test_mm_undefined_si128:
; X32: # BB#0:
; X32-NEXT: retl
;
; X64-LABEL: test_mm_undefined_si128:
; X64: # BB#0:
; X64-NEXT: retq
  ret <2 x i64> undef
}
|
|
|
|
|
|
|
|
; Interleaves bytes 8..15 of %a0 with bytes 8..15 of %a1 (shuffle indices
; 8,24,9,25,...,15,31). Expected code: punpckhbw.
define <2 x i64> @test_mm_unpackhi_epi8(<2 x i64> %a0, <2 x i64> %a1) {
; X32-LABEL: test_mm_unpackhi_epi8:
; X32: # BB#0:
; X32-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
; X32-NEXT: retl
;
; X64-LABEL: test_mm_unpackhi_epi8:
; X64: # BB#0:
; X64-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
; X64-NEXT: retq
  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
  %arg1 = bitcast <2 x i64> %a1 to <16 x i8>
  %res = shufflevector <16 x i8> %arg0, <16 x i8> %arg1, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
  %bc = bitcast <16 x i8> %res to <2 x i64>
  ret <2 x i64> %bc
}
|
|
|
|
|
|
|
|
; Interleaves i16 elements 4..7 of %a0 with elements 4..7 of %a1 (shuffle
; indices 4,12,5,13,6,14,7,15). Expected code: punpckhwd.
define <2 x i64> @test_mm_unpackhi_epi16(<2 x i64> %a0, <2 x i64> %a1) {
; X32-LABEL: test_mm_unpackhi_epi16:
; X32: # BB#0:
; X32-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; X32-NEXT: retl
;
; X64-LABEL: test_mm_unpackhi_epi16:
; X64: # BB#0:
; X64-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; X64-NEXT: retq
  %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
  %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
  %res = shufflevector <8 x i16> %arg0, <8 x i16> %arg1, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
  %bc = bitcast <8 x i16> %res to <2 x i64>
  ret <2 x i64> %bc
}
|
|
|
|
|
|
|
|
; Interleaves i32 elements 2..3 of %a0 with elements 2..3 of %a1 (shuffle
; indices 2,6,3,7). Expected code: punpckhdq.
define <2 x i64> @test_mm_unpackhi_epi32(<2 x i64> %a0, <2 x i64> %a1) {
; X32-LABEL: test_mm_unpackhi_epi32:
; X32: # BB#0:
; X32-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; X32-NEXT: retl
;
; X64-LABEL: test_mm_unpackhi_epi32:
; X64: # BB#0:
; X64-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; X64-NEXT: retq
  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
  %res = shufflevector <4 x i32> %arg0,<4 x i32> %arg1, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
  %bc = bitcast <4 x i32> %res to <2 x i64>
  ret <2 x i64> %bc
}
|
|
|
|
|
|
|
|
; Builds a vector from element 1 of %a0 and element 1 of %a1 (shuffle indices
; 1,3). Expected code: punpckhqdq.
define <2 x i64> @test_mm_unpackhi_epi64(<2 x i64> %a0, <2 x i64> %a1) {
; X32-LABEL: test_mm_unpackhi_epi64:
; X32: # BB#0:
; X32-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; X32-NEXT: retl
;
; X64-LABEL: test_mm_unpackhi_epi64:
; X64: # BB#0:
; X64-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; X64-NEXT: retq
  %res = shufflevector <2 x i64> %a0, <2 x i64> %a1, <2 x i32> <i32 1, i32 3>
  ret <2 x i64> %res
}
|
|
|
|
|
|
|
|
; Builds a vector from element 1 of %a0 and element 1 of %a1 (shuffle indices
; 1,3). Expected code: unpckhpd.
define <2 x double> @test_mm_unpackhi_pd(<2 x double> %a0, <2 x double> %a1) {
; X32-LABEL: test_mm_unpackhi_pd:
; X32: # BB#0:
; X32-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; X32-NEXT: retl
;
; X64-LABEL: test_mm_unpackhi_pd:
; X64: # BB#0:
; X64-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; X64-NEXT: retq
  %res = shufflevector <2 x double> %a0, <2 x double> %a1, <2 x i32> <i32 1, i32 3>
  ret <2 x double> %res
}
|
|
|
|
|
|
|
|
; Interleaves bytes 0..7 of %a0 with bytes 0..7 of %a1 (shuffle indices
; 0,16,1,17,...,7,23). Expected code: punpcklbw.
define <2 x i64> @test_mm_unpacklo_epi8(<2 x i64> %a0, <2 x i64> %a1) {
; X32-LABEL: test_mm_unpacklo_epi8:
; X32: # BB#0:
; X32-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; X32-NEXT: retl
;
; X64-LABEL: test_mm_unpacklo_epi8:
; X64: # BB#0:
; X64-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; X64-NEXT: retq
  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
  %arg1 = bitcast <2 x i64> %a1 to <16 x i8>
  %res = shufflevector <16 x i8> %arg0, <16 x i8> %arg1, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
  %bc = bitcast <16 x i8> %res to <2 x i64>
  ret <2 x i64> %bc
}
|
|
|
|
|
|
|
|
; Interleaves i16 elements 0..3 of %a0 with elements 0..3 of %a1 (shuffle
; indices 0,8,1,9,2,10,3,11). Expected code: punpcklwd.
define <2 x i64> @test_mm_unpacklo_epi16(<2 x i64> %a0, <2 x i64> %a1) {
; X32-LABEL: test_mm_unpacklo_epi16:
; X32: # BB#0:
; X32-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; X32-NEXT: retl
;
; X64-LABEL: test_mm_unpacklo_epi16:
; X64: # BB#0:
; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; X64-NEXT: retq
  %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
  %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
  %res = shufflevector <8 x i16> %arg0, <8 x i16> %arg1, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
  %bc = bitcast <8 x i16> %res to <2 x i64>
  ret <2 x i64> %bc
}
|
|
|
|
|
|
|
|
; Interleaves i32 elements 0..1 of %a0 with elements 0..1 of %a1 (shuffle
; indices 0,4,1,5). Expected code: punpckldq.
define <2 x i64> @test_mm_unpacklo_epi32(<2 x i64> %a0, <2 x i64> %a1) {
; X32-LABEL: test_mm_unpacklo_epi32:
; X32: # BB#0:
; X32-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X32-NEXT: retl
;
; X64-LABEL: test_mm_unpacklo_epi32:
; X64: # BB#0:
; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X64-NEXT: retq
  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
  %res = shufflevector <4 x i32> %arg0,<4 x i32> %arg1, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
  %bc = bitcast <4 x i32> %res to <2 x i64>
  ret <2 x i64> %bc
}
|
|
|
|
|
|
|
|
; Builds a vector from element 0 of %a0 and element 0 of %a1 (shuffle indices
; 0,2). Expected code: punpcklqdq.
define <2 x i64> @test_mm_unpacklo_epi64(<2 x i64> %a0, <2 x i64> %a1) {
; X32-LABEL: test_mm_unpacklo_epi64:
; X32: # BB#0:
; X32-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; X32-NEXT: retl
;
; X64-LABEL: test_mm_unpacklo_epi64:
; X64: # BB#0:
; X64-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; X64-NEXT: retq
  %res = shufflevector <2 x i64> %a0, <2 x i64> %a1, <2 x i32> <i32 0, i32 2>
  ret <2 x i64> %res
}
|
|
|
|
|
|
|
|
; Builds a vector from element 0 of %a0 and element 0 of %a1 (shuffle indices
; 0,2). Expected code: unpcklpd.
define <2 x double> @test_mm_unpacklo_pd(<2 x double> %a0, <2 x double> %a1) {
; X32-LABEL: test_mm_unpacklo_pd:
; X32: # BB#0:
; X32-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; X32-NEXT: retl
;
; X64-LABEL: test_mm_unpacklo_pd:
; X64: # BB#0:
; X64-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; X64-NEXT: retq
  %res = shufflevector <2 x double> %a0, <2 x double> %a1, <2 x i32> <i32 0, i32 2>
  ret <2 x double> %res
}
|
|
|
|
|
|
|
|
; Bitwise xor of two <2 x double> values, performed on their <4 x i32> bit
; patterns and bitcast back. Expected code: xorps.
define <2 x double> @test_mm_xor_pd(<2 x double> %a0, <2 x double> %a1) nounwind {
; X32-LABEL: test_mm_xor_pd:
; X32: # BB#0:
; X32-NEXT: xorps %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_xor_pd:
; X64: # BB#0:
; X64-NEXT: xorps %xmm1, %xmm0
; X64-NEXT: retq
  %arg0 = bitcast <2 x double> %a0 to <4 x i32>
  %arg1 = bitcast <2 x double> %a1 to <4 x i32>
  %res = xor <4 x i32> %arg0, %arg1
  %bc = bitcast <4 x i32> %res to <2 x double>
  ret <2 x double> %bc
}
|
|
|
|
|
|
|
|
; Plain <2 x i64> xor. Expected code: xorps (not pxor) on both targets.
define <2 x i64> @test_mm_xor_si128(<2 x i64> %a0, <2 x i64> %a1) nounwind {
; X32-LABEL: test_mm_xor_si128:
; X32: # BB#0:
; X32-NEXT: xorps %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_xor_si128:
; X64: # BB#0:
; X64-NEXT: xorps %xmm1, %xmm0
; X64-NEXT: retq
  %res = xor <2 x i64> %a0, %a1
  ret <2 x i64> %res
}
|
|
|
|
|
|
|
|
; Metadata node attached as !nontemporal to the stores in the test_mm_stream_*
; functions of this file.
!0 = !{i32 1}
|
|
|
|
|