[X86][AVX1] Split 256-bit vector non-temporal FastISel loads to keep it non-temporal (PR32744)

Extension to D33728

llvm-svn: 304798
This commit is contained in:
Simon Pilgrim 2017-06-06 14:18:39 +00:00
parent 8cd60a5067
commit f7113fd270
2 changed files with 36 additions and 6 deletions

View File

@ -414,6 +414,8 @@ bool X86FastISel::X86FastEmitLoad(EVT VT, X86AddressMode &AM,
assert(HasAVX);
if (IsNonTemporal && Alignment >= 32 && HasAVX2)
Opc = HasVLX ? X86::VMOVNTDQAZ256rm : X86::VMOVNTDQAYrm;
else if (IsNonTemporal && Alignment >= 16)
return false; // Force split for X86::VMOVNTDQArm
else if (Alignment >= 32)
Opc = HasVLX ? X86::VMOVAPSZ256rm : X86::VMOVAPSYrm;
else
@ -424,6 +426,8 @@ bool X86FastISel::X86FastEmitLoad(EVT VT, X86AddressMode &AM,
assert(HasAVX);
if (IsNonTemporal && Alignment >= 32 && HasAVX2)
Opc = X86::VMOVNTDQAYrm;
else if (IsNonTemporal && Alignment >= 16)
return false; // Force split for X86::VMOVNTDQArm
else if (Alignment >= 32)
Opc = HasVLX ? X86::VMOVAPDZ256rm : X86::VMOVAPDYrm;
else
@ -437,6 +441,8 @@ bool X86FastISel::X86FastEmitLoad(EVT VT, X86AddressMode &AM,
assert(HasAVX);
if (IsNonTemporal && Alignment >= 32 && HasAVX2)
Opc = X86::VMOVNTDQAYrm;
else if (IsNonTemporal && Alignment >= 16)
return false; // Force split for X86::VMOVNTDQArm
else if (Alignment >= 32)
Opc = HasVLX ? X86::VMOVDQA64Z256rm : X86::VMOVDQAYrm;
else

View File

@ -545,7 +545,11 @@ define <8 x float> @test_load_nt8xfloat(<8 x float>* nocapture %ptr) {
;
; AVX1-LABEL: test_load_nt8xfloat:
; AVX1: # BB#0: # %entry
; AVX1-NEXT: vmovaps (%rdi), %ymm0
; AVX1-NEXT: vmovntdqa (%rdi), %xmm0
; AVX1-NEXT: # implicit-def: %YMM1
; AVX1-NEXT: vmovaps %xmm0, %xmm1
; AVX1-NEXT: vmovntdqa 16(%rdi), %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_load_nt8xfloat:
@ -583,7 +587,11 @@ define <4 x double> @test_load_nt4xdouble(<4 x double>* nocapture %ptr) {
;
; AVX1-LABEL: test_load_nt4xdouble:
; AVX1: # BB#0: # %entry
; AVX1-NEXT: vmovapd (%rdi), %ymm0
; AVX1-NEXT: vmovntdqa (%rdi), %xmm0
; AVX1-NEXT: # implicit-def: %YMM1
; AVX1-NEXT: vmovaps %xmm0, %xmm1
; AVX1-NEXT: vmovntdqa 16(%rdi), %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_load_nt4xdouble:
@ -621,7 +629,11 @@ define <32 x i8> @test_load_nt32xi8(<32 x i8>* nocapture %ptr) {
;
; AVX1-LABEL: test_load_nt32xi8:
; AVX1: # BB#0: # %entry
; AVX1-NEXT: vmovdqa (%rdi), %ymm0
; AVX1-NEXT: vmovntdqa (%rdi), %xmm0
; AVX1-NEXT: # implicit-def: %YMM1
; AVX1-NEXT: vmovaps %xmm0, %xmm1
; AVX1-NEXT: vmovntdqa 16(%rdi), %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_load_nt32xi8:
@ -659,7 +671,11 @@ define <16 x i16> @test_load_nt16xi16(<16 x i16>* nocapture %ptr) {
;
; AVX1-LABEL: test_load_nt16xi16:
; AVX1: # BB#0: # %entry
; AVX1-NEXT: vmovdqa (%rdi), %ymm0
; AVX1-NEXT: vmovntdqa (%rdi), %xmm0
; AVX1-NEXT: # implicit-def: %YMM1
; AVX1-NEXT: vmovaps %xmm0, %xmm1
; AVX1-NEXT: vmovntdqa 16(%rdi), %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_load_nt16xi16:
@ -697,7 +713,11 @@ define <8 x i32> @test_load_nt8xi32(<8 x i32>* nocapture %ptr) {
;
; AVX1-LABEL: test_load_nt8xi32:
; AVX1: # BB#0: # %entry
; AVX1-NEXT: vmovdqa (%rdi), %ymm0
; AVX1-NEXT: vmovntdqa (%rdi), %xmm0
; AVX1-NEXT: # implicit-def: %YMM1
; AVX1-NEXT: vmovaps %xmm0, %xmm1
; AVX1-NEXT: vmovntdqa 16(%rdi), %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_load_nt8xi32:
@ -735,7 +755,11 @@ define <4 x i64> @test_load_nt4xi64(<4 x i64>* nocapture %ptr) {
;
; AVX1-LABEL: test_load_nt4xi64:
; AVX1: # BB#0: # %entry
; AVX1-NEXT: vmovdqa (%rdi), %ymm0
; AVX1-NEXT: vmovntdqa (%rdi), %xmm0
; AVX1-NEXT: # implicit-def: %YMM1
; AVX1-NEXT: vmovaps %xmm0, %xmm1
; AVX1-NEXT: vmovntdqa 16(%rdi), %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_load_nt4xi64: