forked from OSchip/llvm-project
[VectorCombine] optimize alignment for load transform
Here's another minimal step suggested by D93229 / D93397. (I'm trying to be extra careful in these changes because load transforms are easy to get wrong.) We can optimistically choose the greater alignment of a load and its pointer operand. As the test diffs show, this can turn what would have been unaligned vector loads into aligned loads. When we enhance with GEP offsets, we will need to adjust the alignment calculation to include that offset. Differential Revision: https://reviews.llvm.org/D93406
This commit is contained in:
parent
e53b9f733a
commit
38ebc1a13d
|
@ -143,7 +143,8 @@ bool VectorCombine::vectorizeLoadInsert(Instruction &I) {
|
|||
return false;
|
||||
|
||||
// Original pattern: insertelt undef, load [free casts of] PtrOp, 0
|
||||
Align Alignment = Load->getAlign();
|
||||
// Use the greater of the alignment on the load or its source pointer.
|
||||
Align Alignment = std::max(SrcPtr->getPointerAlignment(DL), Load->getAlign());
|
||||
Type *LoadTy = Load->getType();
|
||||
int OldCost = TTI.getMemoryOpCost(Instruction::Load, LoadTy, Alignment, AS);
|
||||
APInt DemandedElts = APInt::getOneBitSet(MinVecNumElts, 0);
|
||||
|
|
|
@ -175,7 +175,7 @@ define double @larger_fp_scalar_256bit_vec(<8 x float>* align 32 dereferenceable
|
|||
define <4 x float> @load_f32_insert_v4f32(float* align 16 dereferenceable(16) %p) {
|
||||
; CHECK-LABEL: @load_f32_insert_v4f32(
|
||||
; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[P:%.*]] to <4 x float>*
|
||||
; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4
|
||||
; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 16
|
||||
; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
|
||||
; CHECK-NEXT: ret <4 x float> [[R]]
|
||||
;
|
||||
|
@ -201,7 +201,7 @@ define <4 x float> @casted_load_f32_insert_v4f32(<4 x float>* align 4 dereferenc
|
|||
define <4 x i32> @load_i32_insert_v4i32(i32* align 16 dereferenceable(16) %p) {
|
||||
; CHECK-LABEL: @load_i32_insert_v4i32(
|
||||
; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[P:%.*]] to <4 x i32>*
|
||||
; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
|
||||
; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 16
|
||||
; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
|
||||
; CHECK-NEXT: ret <4 x i32> [[R]]
|
||||
;
|
||||
|
@ -434,7 +434,7 @@ define <4 x float> @load_f32_insert_v4f32_deref(float* align 4 dereferenceable(1
|
|||
define <8 x i32> @load_i32_insert_v8i32(i32* align 16 dereferenceable(16) %p) {
|
||||
; CHECK-LABEL: @load_i32_insert_v8i32(
|
||||
; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[P:%.*]] to <4 x i32>*
|
||||
; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
|
||||
; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 16
|
||||
; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> undef, <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
|
||||
; CHECK-NEXT: ret <8 x i32> [[R]]
|
||||
;
|
||||
|
@ -458,7 +458,7 @@ define <8 x i32> @casted_load_i32_insert_v8i32(<4 x i32>* align 4 dereferenceabl
|
|||
define <16 x float> @load_f32_insert_v16f32(float* align 16 dereferenceable(16) %p) {
|
||||
; CHECK-LABEL: @load_f32_insert_v16f32(
|
||||
; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[P:%.*]] to <4 x float>*
|
||||
; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4
|
||||
; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 16
|
||||
; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> undef, <16 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
|
||||
; CHECK-NEXT: ret <16 x float> [[R]]
|
||||
;
|
||||
|
@ -470,7 +470,7 @@ define <16 x float> @load_f32_insert_v16f32(float* align 16 dereferenceable(16)
|
|||
define <2 x float> @load_f32_insert_v2f32(float* align 16 dereferenceable(16) %p) {
|
||||
; CHECK-LABEL: @load_f32_insert_v2f32(
|
||||
; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[P:%.*]] to <4 x float>*
|
||||
; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4
|
||||
; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 16
|
||||
; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> undef, <2 x i32> <i32 0, i32 undef>
|
||||
; CHECK-NEXT: ret <2 x float> [[R]]
|
||||
;
|
||||
|
@ -525,7 +525,7 @@ define void @PR47558_multiple_use_load(<2 x float>* nocapture nonnull %resultptr
|
|||
define <4 x float> @load_v2f32_extract_insert_v4f32(<2 x float>* align 16 dereferenceable(16) %p) {
|
||||
; CHECK-LABEL: @load_v2f32_extract_insert_v4f32(
|
||||
; CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float>* [[P:%.*]] to <4 x float>*
|
||||
; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4
|
||||
; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 16
|
||||
; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
|
||||
; CHECK-NEXT: ret <4 x float> [[R]]
|
||||
;
|
||||
|
@ -538,7 +538,7 @@ define <4 x float> @load_v2f32_extract_insert_v4f32(<2 x float>* align 16 derefe
|
|||
define <4 x float> @load_v8f32_extract_insert_v4f32(<8 x float>* align 16 dereferenceable(16) %p) {
|
||||
; CHECK-LABEL: @load_v8f32_extract_insert_v4f32(
|
||||
; CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x float>* [[P:%.*]] to <4 x float>*
|
||||
; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4
|
||||
; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 16
|
||||
; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
|
||||
; CHECK-NEXT: ret <4 x float> [[R]]
|
||||
;
|
||||
|
|
Loading…
Reference in New Issue