forked from OSchip/llvm-project
[X86] Require 32-byte alignment for 32-byte VMOVNTs.
We used to accept (and even test, and generate) 16-byte alignment for 32-byte nontemporal stores, but they require 32-byte alignment, per the Intel SDM. Found by inspection. Instead of hardcoding 16 in the PatFrag, check for natural alignment. Also fix the autoupgrade and the various tests. Also, use explicit -mattr instead of -mcpu: I stared at the output for several minutes wondering why I got 2x movntps for the unaligned case (which is the ideal output, but needs some work: see FIXME), until I remembered that corei7-avx implies +slow-unaligned-mem-32. llvm-svn: 246733
This commit is contained in:
parent
78425200ee
commit
b03ea02479
|
@ -370,7 +370,7 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
|
|||
"cast");
|
||||
StoreInst *SI = Builder.CreateStore(Arg1, BC);
|
||||
SI->setMetadata(M->getMDKindID("nontemporal"), Node);
|
||||
SI->setAlignment(16);
|
||||
SI->setAlignment(32);
|
||||
|
||||
// Remove intrinsic.
|
||||
CI->eraseFromParent();
|
||||
|
|
|
@ -626,12 +626,14 @@ def nontemporalstore : PatFrag<(ops node:$val, node:$ptr),
|
|||
|
||||
def alignednontemporalstore : PatFrag<(ops node:$val, node:$ptr),
|
||||
(nontemporalstore node:$val, node:$ptr), [{
|
||||
return cast<StoreSDNode>(N)->getAlignment() >= 16;
|
||||
StoreSDNode *St = cast<StoreSDNode>(N);
|
||||
return St->getAlignment() >= St->getMemoryVT().getStoreSize();
|
||||
}]>;
|
||||
|
||||
def unalignednontemporalstore : PatFrag<(ops node:$val, node:$ptr),
|
||||
(nontemporalstore node:$val, node:$ptr), [{
|
||||
return cast<StoreSDNode>(N)->getAlignment() < 16;
|
||||
StoreSDNode *St = cast<StoreSDNode>(N);
|
||||
return St->getAlignment() < St->getMemoryVT().getStoreSize();
|
||||
}]>;
|
||||
|
||||
def mgatherv4i32 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
|
||||
|
|
|
@ -4,15 +4,15 @@ define void @f(<8 x float> %A, i8* %B, <4 x double> %C, <4 x i64> %E) {
|
|||
; CHECK: vmovntps %y
|
||||
%cast = bitcast i8* %B to <8 x float>*
|
||||
%A2 = fadd <8 x float> %A, <float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x4200000000000000>
|
||||
store <8 x float> %A2, <8 x float>* %cast, align 16, !nontemporal !0
|
||||
store <8 x float> %A2, <8 x float>* %cast, align 32, !nontemporal !0
|
||||
; CHECK: vmovntdq %y
|
||||
%cast1 = bitcast i8* %B to <4 x i64>*
|
||||
%E2 = add <4 x i64> %E, <i64 1, i64 2, i64 3, i64 4>
|
||||
store <4 x i64> %E2, <4 x i64>* %cast1, align 16, !nontemporal !0
|
||||
store <4 x i64> %E2, <4 x i64>* %cast1, align 32, !nontemporal !0
|
||||
; CHECK: vmovntpd %y
|
||||
%cast2 = bitcast i8* %B to <4 x double>*
|
||||
%C2 = fadd <4 x double> %C, <double 0x0, double 0x0, double 0x0, double 0x4200000000000000>
|
||||
store <4 x double> %C2, <4 x double>* %cast2, align 16, !nontemporal !0
|
||||
store <4 x double> %C2, <4 x double>* %cast2, align 32, !nontemporal !0
|
||||
ret void
|
||||
}
|
||||
|
||||
|
|
|
@ -5,7 +5,7 @@
|
|||
|
||||
define void @test(<2 x i64>* nocapture %a, <2 x i64> %b) nounwind optsize {
|
||||
entry:
|
||||
store <2 x i64> %b, <2 x i64>* %a, align 16, !nontemporal !0
|
||||
store <2 x i64> %b, <2 x i64>* %a, align 32, !nontemporal !0
|
||||
ret void
|
||||
}
|
||||
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=corei7 | FileCheck %s -check-prefix=CHECK -check-prefix=SSE
|
||||
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=corei7-avx | FileCheck %s -check-prefix=CHECK -check-prefix=AVX
|
||||
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=core-avx2 | FileCheck %s -check-prefix=CHECK -check-prefix=AVX2
|
||||
; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s -check-prefix=CHECK -check-prefix=SSE
|
||||
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s -check-prefix=CHECK -check-prefix=AVX
|
||||
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s -check-prefix=CHECK -check-prefix=AVX2
|
||||
|
||||
; Make sure that we generate non-temporal stores for the test cases below.
|
||||
; We use xorps for zeroing, so domain information isn't available anymore.
|
||||
|
@ -300,4 +300,19 @@ define void @test_op_v32i8(<32 x i8> %a, <32 x i8> %b, <32 x i8>* %dst) {
|
|||
ret void
|
||||
}
|
||||
|
||||
; 256-bit NT stores require 256-bit alignment.
|
||||
; FIXME: For AVX, we could lower this to 2x movntps %xmm. Taken further, we
|
||||
; could even scalarize to movnti when we have 1-byte alignment: nontemporal is
|
||||
; probably always worth it, even at the cost of some 20 instructions of scalarization.
|
||||
define void @test_unaligned_v8f32(<8 x float> %a, <8 x float> %b, <8 x float>* %dst) {
|
||||
; CHECK-LABEL: test_unaligned_v8f32:
|
||||
; SSE: movntps %xmm
|
||||
; SSE: movntps %xmm
|
||||
; AVX-NOT: movnt
|
||||
; AVX: vmovups %ymm
|
||||
%r = fadd <8 x float> %a, %b
|
||||
store <8 x float> %r, <8 x float>* %dst, align 16, !nontemporal !1
|
||||
ret void
|
||||
}
|
||||
|
||||
!1 = !{i32 1}
|
||||
|
|
Loading…
Reference in New Issue