forked from OSchip/llvm-project
[X86 TTI] Implement LSV hook
Summary: The LoadStoreVectorizer (LSV) wants to know the maximum size that can be loaded into a vector register. On X86, this always matches the maximum register width. Implement this accordingly and add a test to make sure that LSV can vectorize up to the maximum permissible width on X86. Reviewers: delena, arsenm Reviewed By: arsenm Subscribers: wdng, llvm-commits Differential Revision: https://reviews.llvm.org/D31504 llvm-svn: 299589
This commit is contained in:
parent
46f1d4a12c
commit
1ec5dd85a2
|
@ -78,7 +78,7 @@ unsigned X86TTIImpl::getNumberOfRegisters(bool Vector) {
|
|||
return 8;
|
||||
}
|
||||
|
||||
unsigned X86TTIImpl::getRegisterBitWidth(bool Vector) {
|
||||
unsigned X86TTIImpl::getRegisterBitWidth(bool Vector) const {
|
||||
if (Vector) {
|
||||
if (ST->hasAVX512())
|
||||
return 512;
|
||||
|
@ -95,6 +95,10 @@ unsigned X86TTIImpl::getRegisterBitWidth(bool Vector) {
|
|||
return 32;
|
||||
}
|
||||
|
||||
/// Load/store-vectorizer hook: reports the vector register width, in bits,
/// to use for vectorized memory accesses. The unnamed parameter (declared as
/// `AS` in the header) is ignored; the result is simply whatever
/// getRegisterBitWidth(true) returns.
unsigned X86TTIImpl::getLoadStoreVecRegBitWidth(unsigned) const {
|
||||
return getRegisterBitWidth(true);
|
||||
}
|
||||
|
||||
unsigned X86TTIImpl::getMaxInterleaveFactor(unsigned VF) {
|
||||
// If the loop will not be vectorized, don't interleave the loop.
|
||||
// Let regular unroll to unroll the loop, which saves the overflow
|
||||
|
|
|
@ -51,7 +51,8 @@ public:
|
|||
/// @{
|
||||
|
||||
unsigned getNumberOfRegisters(bool Vector);
|
||||
unsigned getRegisterBitWidth(bool Vector);
|
||||
unsigned getRegisterBitWidth(bool Vector) const;
|
||||
unsigned getLoadStoreVecRegBitWidth(unsigned AS) const;
|
||||
unsigned getMaxInterleaveFactor(unsigned VF);
|
||||
int getArithmeticInstrCost(
|
||||
unsigned Opcode, Type *Ty,
|
||||
|
|
|
@ -0,0 +1,38 @@
|
|||
; RUN: opt -mtriple=x86_64-unknown-linux-gnu -load-store-vectorizer -mcpu haswell -S -o - %s | FileCheck --check-prefix=CHECK-HSW %s
|
||||
; RUN: opt -mtriple=x86_64-unknown-linux-gnu -load-store-vectorizer -mcpu knl -S -o - %s | FileCheck --check-prefix=CHECK-KNL %s
|
||||
|
||||
; Loads four consecutive <2 x double> vectors starting at %ptr, scalarizes
; them with extractelement, and rebuilds a single <8 x double> result with
; insertelement. The FileCheck directives in the body expect the four loads to
; be merged into two <4 x double> loads under the HSW prefix and into one
; <8 x double> load under the KNL prefix, with no other loads remaining.
define <8 x double> @loadwidth_insert_extract(double* %ptr) {
|
||||
%a = bitcast double* %ptr to <2 x double> *
|
||||
%b = getelementptr <2 x double>, <2 x double>* %a, i32 1
|
||||
%c = getelementptr <2 x double>, <2 x double>* %a, i32 2
|
||||
%d = getelementptr <2 x double>, <2 x double>* %a, i32 3
|
||||
; CHECK-HSW: load <4 x double>
|
||||
; CHECK-HSW: load <4 x double>
|
||||
; CHECK-HSW-NOT: load
|
||||
; CHECK-KNL: load <8 x double>
|
||||
; CHECK-KNL-NOT: load
|
||||
%la = load <2 x double>, <2 x double> *%a
|
||||
%lb = load <2 x double>, <2 x double> *%b
|
||||
%lc = load <2 x double>, <2 x double> *%c
|
||||
%ld = load <2 x double>, <2 x double> *%d
|
||||
; Scalarize everything - Explicitly not a shufflevector to test this code
|
||||
; path in the LSV
|
||||
%v1 = extractelement <2 x double> %la, i32 0
|
||||
%v2 = extractelement <2 x double> %la, i32 1
|
||||
%v3 = extractelement <2 x double> %lb, i32 0
|
||||
%v4 = extractelement <2 x double> %lb, i32 1
|
||||
%v5 = extractelement <2 x double> %lc, i32 0
|
||||
%v6 = extractelement <2 x double> %lc, i32 1
|
||||
%v7 = extractelement <2 x double> %ld, i32 0
|
||||
%v8 = extractelement <2 x double> %ld, i32 1
|
||||
; Make a vector again
|
||||
%i1 = insertelement <8 x double> undef, double %v1, i32 0
|
||||
%i2 = insertelement <8 x double> %i1, double %v2, i32 1
|
||||
%i3 = insertelement <8 x double> %i2, double %v3, i32 2
|
||||
%i4 = insertelement <8 x double> %i3, double %v4, i32 3
|
||||
%i5 = insertelement <8 x double> %i4, double %v5, i32 4
|
||||
%i6 = insertelement <8 x double> %i5, double %v6, i32 5
|
||||
%i7 = insertelement <8 x double> %i6, double %v7, i32 6
|
||||
%i8 = insertelement <8 x double> %i7, double %v8, i32 7
|
||||
ret <8 x double> %i8
|
||||
}
|
Loading…
Reference in New Issue