From b03ea02479c82430b4149609e0ac3e0490d5ca12 Mon Sep 17 00:00:00 2001
From: Ahmed Bougacha
Date: Wed, 2 Sep 2015 23:25:39 +0000
Subject: [PATCH] [X86] Require 32-byte alignment for 32-byte VMOVNTs.

We used to accept (and even test, and generate) 16-byte alignment
for 32-byte nontemporal stores, but they require 32-byte alignment,
per SDM. Found by inspection.

Instead of hardcoding 16 in the patfrag, check for natural alignment.
Also fix the autoupgrade and the various tests.

Also, use explicit -mattr instead of -mcpu: I stared at the output
for several minutes wondering why I got 2x movntps for the unaligned
case (which is the ideal output, but needs some work: see FIXME),
until I remembered corei7-avx implies +slow-unaligned-mem-32.

llvm-svn: 246733
---
 llvm/lib/IR/AutoUpgrade.cpp                  |  2 +-
 llvm/lib/Target/X86/X86InstrFragmentsSIMD.td |  6 ++++--
 llvm/test/CodeGen/X86/avx2-nontemporal.ll    |  6 +++---
 llvm/test/CodeGen/X86/movntdq-no-avx.ll      |  2 +-
 llvm/test/CodeGen/X86/nontemporal-2.ll       | 21 +++++++++++++++++---
 5 files changed, 27 insertions(+), 10 deletions(-)

diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp
index aeefa38f74d6..71448feb9ab3 100644
--- a/llvm/lib/IR/AutoUpgrade.cpp
+++ b/llvm/lib/IR/AutoUpgrade.cpp
@@ -370,7 +370,7 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
                                       "cast");
     StoreInst *SI = Builder.CreateStore(Arg1, BC);
     SI->setMetadata(M->getMDKindID("nontemporal"), Node);
-    SI->setAlignment(16);
+    SI->setAlignment(32);
 
     // Remove intrinsic.
     CI->eraseFromParent();
diff --git a/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td b/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td
index b8ab1feed9ea..19bf986c33c9 100644
--- a/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td
+++ b/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td
@@ -626,12 +626,14 @@ def nontemporalstore : PatFrag<(ops node:$val, node:$ptr),
 
 def alignednontemporalstore : PatFrag<(ops node:$val, node:$ptr),
                                       (nontemporalstore node:$val, node:$ptr), [{
-  return cast<StoreSDNode>(N)->getAlignment() >= 16;
+  StoreSDNode *St = cast<StoreSDNode>(N);
+  return St->getAlignment() >= St->getMemoryVT().getStoreSize();
 }]>;
 
 def unalignednontemporalstore : PatFrag<(ops node:$val, node:$ptr),
                                         (nontemporalstore node:$val, node:$ptr), [{
-  return cast<StoreSDNode>(N)->getAlignment() < 16;
+  StoreSDNode *St = cast<StoreSDNode>(N);
+  return St->getAlignment() < St->getMemoryVT().getStoreSize();
 }]>;
 
 def mgatherv4i32 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
diff --git a/llvm/test/CodeGen/X86/avx2-nontemporal.ll b/llvm/test/CodeGen/X86/avx2-nontemporal.ll
index 544c096c52df..058358f13b86 100644
--- a/llvm/test/CodeGen/X86/avx2-nontemporal.ll
+++ b/llvm/test/CodeGen/X86/avx2-nontemporal.ll
@@ -4,15 +4,15 @@ define void @f(<8 x float> %A, i8* %B, <4 x double> %C, <4 x i64> %E) {
 ; CHECK: vmovntps %y
   %cast = bitcast i8* %B to <8 x float>*
   %A2 = fadd <8 x float> %A,
-  store <8 x float> %A2, <8 x float>* %cast, align 16, !nontemporal !0
+  store <8 x float> %A2, <8 x float>* %cast, align 32, !nontemporal !0
 ; CHECK: vmovntdq %y
   %cast1 = bitcast i8* %B to <4 x i64>*
   %E2 = add <4 x i64> %E,
-  store <4 x i64> %E2, <4 x i64>* %cast1, align 16, !nontemporal !0
+  store <4 x i64> %E2, <4 x i64>* %cast1, align 32, !nontemporal !0
 ; CHECK: vmovntpd %y
   %cast2 = bitcast i8* %B to <4 x double>*
   %C2 = fadd <4 x double> %C,
-  store <4 x double> %C2, <4 x double>* %cast2, align 16, !nontemporal !0
+  store <4 x double> %C2, <4 x double>* %cast2, align 32, !nontemporal !0
   ret void
 }
 
diff --git a/llvm/test/CodeGen/X86/movntdq-no-avx.ll b/llvm/test/CodeGen/X86/movntdq-no-avx.ll
index cc35e201e6b3..2bf09dd6f581 100644
--- a/llvm/test/CodeGen/X86/movntdq-no-avx.ll
+++ b/llvm/test/CodeGen/X86/movntdq-no-avx.ll
@@ -5,7 +5,7 @@
 
 define void @test(<2 x i64>* nocapture %a, <2 x i64> %b) nounwind optsize {
 entry:
-  store <2 x i64> %b, <2 x i64>* %a, align 16, !nontemporal !0
+  store <2 x i64> %b, <2 x i64>* %a, align 32, !nontemporal !0
   ret void
 }
 
diff --git a/llvm/test/CodeGen/X86/nontemporal-2.ll b/llvm/test/CodeGen/X86/nontemporal-2.ll
index 8c08b3c163c0..c9767f88488c 100644
--- a/llvm/test/CodeGen/X86/nontemporal-2.ll
+++ b/llvm/test/CodeGen/X86/nontemporal-2.ll
@@ -1,6 +1,6 @@
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=corei7 | FileCheck %s -check-prefix=CHECK -check-prefix=SSE
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=corei7-avx | FileCheck %s -check-prefix=CHECK -check-prefix=AVX
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=core-avx2 | FileCheck %s -check-prefix=CHECK -check-prefix=AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s -check-prefix=CHECK -check-prefix=SSE
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s -check-prefix=CHECK -check-prefix=AVX
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s -check-prefix=CHECK -check-prefix=AVX2
 
 ; Make sure that we generate non-temporal stores for the test cases below.
 ; We use xorps for zeroing, so domain information isn't available anymore.
@@ -300,4 +300,19 @@ define void @test_op_v32i8(<32 x i8> %a, <32 x i8> %b, <32 x i8>* %dst) {
   ret void
 }
 
+; 256-bit NT stores require 256-bit alignment.
+; FIXME: For AVX, we could lower this to 2x movntps %xmm. Taken further, we
+; could even scalarize to movnti when we have 1-alignment: nontemporal is
+; probably always worth even some 20 instruction scalarization.
+define void @test_unaligned_v8f32(<8 x float> %a, <8 x float> %b, <8 x float>* %dst) {
+; CHECK-LABEL: test_unaligned_v8f32:
+; SSE: movntps %xmm
+; SSE: movntps %xmm
+; AVX-NOT: movnt
+; AVX: vmovups %ymm
+  %r = fadd <8 x float> %a, %b
+  store <8 x float> %r, <8 x float>* %dst, align 16, !nontemporal !1
+  ret void
+}
+
 !1 = !{i32 1}
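
As an illustration of the AutoUpgrade change above, here is a minimal IR sketch of the
before/after shape of the upgrade, assuming the legacy intrinsic signature
void @llvm.x86.avx.movnt.ps.256(i8*, <8 x float>). The sketch is not part of the patch;
function names and values are illustrative only:

    ; Legacy form: old AVX nontemporal-store intrinsic (assumed signature),
    ; rewritten by AutoUpgrade when old IR/bitcode is loaded.
    declare void @llvm.x86.avx.movnt.ps.256(i8*, <8 x float>)

    define void @before_upgrade(i8* %p, <8 x float> %v) {
      call void @llvm.x86.avx.movnt.ps.256(i8* %p, <8 x float> %v)
      ret void
    }

    ; Upgraded form: a plain store carrying !nontemporal metadata. With this
    ; patch the store is created with align 32 (natural alignment of
    ; <8 x float>) instead of align 16, so it still matches
    ; alignednontemporalstore and can select VMOVNTPS; an underaligned
    ; nontemporal store now falls through to a regular vmovups (see the
    ; FIXME in nontemporal-2.ll).
    define void @after_upgrade(i8* %p, <8 x float> %v) {
      %cast = bitcast i8* %p to <8 x float>*
      store <8 x float> %v, <8 x float>* %cast, align 32, !nontemporal !0
      ret void
    }

    !0 = !{i32 1}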