ARM: Use a dedicated intrinsic for vector bitwise select.

The expression based expansion too often results in IR level optimizations
splitting the intermediate values into separate basic blocks, preventing
the formation of the VBSL instruction as the code author intended. In
particular, LICM would often hoist part of the computation out of a loop.

rdar://11011471

llvm-svn: 164340
This commit is contained in:
Jim Grosbach 2012-09-21 00:18:20 +00:00
parent 14f779c4d6
commit 74b61c398c
3 changed files with 83 additions and 2 deletions

View File

@ -421,4 +421,9 @@ def int_arm_neon_vst4lane : Intrinsic<[],
LLVMMatchType<0>, llvm_i32_ty,
llvm_i32_ty], [IntrReadWriteArgMem]>;
// Vector bitwise select.
def int_arm_neon_vbsl : Intrinsic<[llvm_anyvector_ty],
[LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>],
[IntrNoMem]>;
} // end TargetPrefix

View File

@ -4488,10 +4488,23 @@ def VBSLd : N3VX<1, 0, 0b01, 0b0001, 0, 1, (outs DPR:$Vd),
"vbsl", "$Vd, $Vn, $Vm", "$src1 = $Vd",
[(set DPR:$Vd,
(v2i32 (NEONvbsl DPR:$src1, DPR:$Vn, DPR:$Vm)))]>;
def : Pat<(v8i8 (int_arm_neon_vbsl (v8i8 DPR:$src1),
(v8i8 DPR:$Vn), (v8i8 DPR:$Vm))),
(VBSLd DPR:$src1, DPR:$Vn, DPR:$Vm)>,
Requires<[HasNEON]>;
def : Pat<(v4i16 (int_arm_neon_vbsl (v4i16 DPR:$src1),
(v4i16 DPR:$Vn), (v4i16 DPR:$Vm))),
(VBSLd DPR:$src1, DPR:$Vn, DPR:$Vm)>,
Requires<[HasNEON]>;
def : Pat<(v2i32 (int_arm_neon_vbsl (v2i32 DPR:$src1),
(v2i32 DPR:$Vn), (v2i32 DPR:$Vm))),
(VBSLd DPR:$src1, DPR:$Vn, DPR:$Vm)>,
Requires<[HasNEON]>;
def : Pat<(v2i32 (or (and DPR:$Vn, DPR:$Vd),
(and DPR:$Vm, (vnotd DPR:$Vd)))),
(VBSLd DPR:$Vd, DPR:$Vn, DPR:$Vm)>;
(VBSLd DPR:$Vd, DPR:$Vn, DPR:$Vm)>,
Requires<[HasNEON]>;
def VBSLq : N3VX<1, 0, 0b01, 0b0001, 1, 1, (outs QPR:$Vd),
(ins QPR:$src1, QPR:$Vn, QPR:$Vm),
@ -4500,9 +4513,23 @@ def VBSLq : N3VX<1, 0, 0b01, 0b0001, 1, 1, (outs QPR:$Vd),
[(set QPR:$Vd,
(v4i32 (NEONvbsl QPR:$src1, QPR:$Vn, QPR:$Vm)))]>;
def : Pat<(v16i8 (int_arm_neon_vbsl (v16i8 QPR:$src1),
(v16i8 QPR:$Vn), (v16i8 QPR:$Vm))),
(VBSLq QPR:$src1, QPR:$Vn, QPR:$Vm)>,
Requires<[HasNEON]>;
def : Pat<(v8i16 (int_arm_neon_vbsl (v8i16 QPR:$src1),
(v8i16 QPR:$Vn), (v8i16 QPR:$Vm))),
(VBSLq QPR:$src1, QPR:$Vn, QPR:$Vm)>,
Requires<[HasNEON]>;
def : Pat<(v4i32 (int_arm_neon_vbsl (v4i32 QPR:$src1),
(v4i32 QPR:$Vn), (v4i32 QPR:$Vm))),
(VBSLq QPR:$src1, QPR:$Vn, QPR:$Vm)>,
Requires<[HasNEON]>;
def : Pat<(v4i32 (or (and QPR:$Vn, QPR:$Vd),
(and QPR:$Vm, (vnotq QPR:$Vd)))),
(VBSLq QPR:$Vd, QPR:$Vn, QPR:$Vm)>;
(VBSLq QPR:$Vd, QPR:$Vn, QPR:$Vm)>,
Requires<[HasNEON]>;
// VBIF : Vector Bitwise Insert if False
// like VBSL but with: "vbif $dst, $src3, $src1", "$src2 = $dst",

View File

@ -103,3 +103,52 @@ define <2 x i64> @v_bslQi64(<2 x i64>* %A, <2 x i64>* %B, <2 x i64>* %C) nounwin
%tmp7 = or <2 x i64> %tmp4, %tmp6
ret <2 x i64> %tmp7
}
define <8 x i8> @f1(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) nounwind readnone optsize ssp {
; CHECK: f1:
; CHECK: vbsl
%vbsl.i = tail call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) nounwind
ret <8 x i8> %vbsl.i
}
define <4 x i16> @f2(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c) nounwind readnone optsize ssp {
; CHECK: f2:
; CHECK: vbsl
%vbsl3.i = tail call <4 x i16> @llvm.arm.neon.vbsl.v4i16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c) nounwind
ret <4 x i16> %vbsl3.i
}
define <2 x i32> @f3(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c) nounwind readnone optsize ssp {
; CHECK: f3:
; CHECK: vbsl
%vbsl3.i = tail call <2 x i32> @llvm.arm.neon.vbsl.v2i32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c) nounwind
ret <2 x i32> %vbsl3.i
}
define <16 x i8> @g1(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) nounwind readnone optsize ssp {
; CHECK: g1:
; CHECK: vbsl
%vbsl.i = tail call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) nounwind
ret <16 x i8> %vbsl.i
}
define <8 x i16> @g2(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) nounwind readnone optsize ssp {
; CHECK: g2:
; CHECK: vbsl
%vbsl3.i = tail call <8 x i16> @llvm.arm.neon.vbsl.v8i16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) nounwind
ret <8 x i16> %vbsl3.i
}
define <4 x i32> @g3(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) nounwind readnone optsize ssp {
; CHECK: g3:
; CHECK: vbsl
%vbsl3.i = tail call <4 x i32> @llvm.arm.neon.vbsl.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) nounwind
ret <4 x i32> %vbsl3.i
}
declare <4 x i32> @llvm.arm.neon.vbsl.v4i32(<4 x i32>, <4 x i32>, <4 x i32>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vbsl.v8i16(<8 x i16>, <8 x i16>, <8 x i16>) nounwind readnone
declare <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vbsl.v2i32(<2 x i32>, <2 x i32>, <2 x i32>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vbsl.v4i16(<4 x i16>, <4 x i16>, <4 x i16>) nounwind readnone
declare <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8>, <8 x i8>, <8 x i8>) nounwind readnone