forked from OSchip/llvm-project
ARM: Use a dedicated intrinsic for vector bitwise select.
The expression-based expansion too often results in IR-level optimizations splitting the intermediate values into separate basic blocks, preventing the formation of the VBSL instruction as the code author intended. In particular, LICM would often hoist part of the computation out of a loop. rdar://11011471 llvm-svn: 164340
This commit is contained in:
parent
14f779c4d6
commit
74b61c398c
|
@ -421,4 +421,9 @@ def int_arm_neon_vst4lane : Intrinsic<[],
|
|||
LLVMMatchType<0>, llvm_i32_ty,
|
||||
llvm_i32_ty], [IntrReadWriteArgMem]>;
|
||||
|
||||
// Vector bitwise select.
|
||||
def int_arm_neon_vbsl : Intrinsic<[llvm_anyvector_ty],
|
||||
[LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>],
|
||||
[IntrNoMem]>;
|
||||
|
||||
} // end TargetPrefix
|
||||
|
|
|
@ -4488,10 +4488,23 @@ def VBSLd : N3VX<1, 0, 0b01, 0b0001, 0, 1, (outs DPR:$Vd),
|
|||
"vbsl", "$Vd, $Vn, $Vm", "$src1 = $Vd",
|
||||
[(set DPR:$Vd,
|
||||
(v2i32 (NEONvbsl DPR:$src1, DPR:$Vn, DPR:$Vm)))]>;
|
||||
def : Pat<(v8i8 (int_arm_neon_vbsl (v8i8 DPR:$src1),
|
||||
(v8i8 DPR:$Vn), (v8i8 DPR:$Vm))),
|
||||
(VBSLd DPR:$src1, DPR:$Vn, DPR:$Vm)>,
|
||||
Requires<[HasNEON]>;
|
||||
def : Pat<(v4i16 (int_arm_neon_vbsl (v4i16 DPR:$src1),
|
||||
(v4i16 DPR:$Vn), (v4i16 DPR:$Vm))),
|
||||
(VBSLd DPR:$src1, DPR:$Vn, DPR:$Vm)>,
|
||||
Requires<[HasNEON]>;
|
||||
def : Pat<(v2i32 (int_arm_neon_vbsl (v2i32 DPR:$src1),
|
||||
(v2i32 DPR:$Vn), (v2i32 DPR:$Vm))),
|
||||
(VBSLd DPR:$src1, DPR:$Vn, DPR:$Vm)>,
|
||||
Requires<[HasNEON]>;
|
||||
|
||||
def : Pat<(v2i32 (or (and DPR:$Vn, DPR:$Vd),
|
||||
(and DPR:$Vm, (vnotd DPR:$Vd)))),
|
||||
(VBSLd DPR:$Vd, DPR:$Vn, DPR:$Vm)>;
|
||||
(VBSLd DPR:$Vd, DPR:$Vn, DPR:$Vm)>,
|
||||
Requires<[HasNEON]>;
|
||||
|
||||
def VBSLq : N3VX<1, 0, 0b01, 0b0001, 1, 1, (outs QPR:$Vd),
|
||||
(ins QPR:$src1, QPR:$Vn, QPR:$Vm),
|
||||
|
@ -4500,9 +4513,23 @@ def VBSLq : N3VX<1, 0, 0b01, 0b0001, 1, 1, (outs QPR:$Vd),
|
|||
[(set QPR:$Vd,
|
||||
(v4i32 (NEONvbsl QPR:$src1, QPR:$Vn, QPR:$Vm)))]>;
|
||||
|
||||
def : Pat<(v16i8 (int_arm_neon_vbsl (v16i8 QPR:$src1),
|
||||
(v16i8 QPR:$Vn), (v16i8 QPR:$Vm))),
|
||||
(VBSLq QPR:$src1, QPR:$Vn, QPR:$Vm)>,
|
||||
Requires<[HasNEON]>;
|
||||
def : Pat<(v8i16 (int_arm_neon_vbsl (v8i16 QPR:$src1),
|
||||
(v8i16 QPR:$Vn), (v8i16 QPR:$Vm))),
|
||||
(VBSLq QPR:$src1, QPR:$Vn, QPR:$Vm)>,
|
||||
Requires<[HasNEON]>;
|
||||
def : Pat<(v4i32 (int_arm_neon_vbsl (v4i32 QPR:$src1),
|
||||
(v4i32 QPR:$Vn), (v4i32 QPR:$Vm))),
|
||||
(VBSLq QPR:$src1, QPR:$Vn, QPR:$Vm)>,
|
||||
Requires<[HasNEON]>;
|
||||
|
||||
def : Pat<(v4i32 (or (and QPR:$Vn, QPR:$Vd),
|
||||
(and QPR:$Vm, (vnotq QPR:$Vd)))),
|
||||
(VBSLq QPR:$Vd, QPR:$Vn, QPR:$Vm)>;
|
||||
(VBSLq QPR:$Vd, QPR:$Vn, QPR:$Vm)>,
|
||||
Requires<[HasNEON]>;
|
||||
|
||||
// VBIF : Vector Bitwise Insert if False
|
||||
// like VBSL but with: "vbif $dst, $src3, $src1", "$src2 = $dst",
|
||||
|
|
|
@ -103,3 +103,52 @@ define <2 x i64> @v_bslQi64(<2 x i64>* %A, <2 x i64>* %B, <2 x i64>* %C) nounwin
|
|||
%tmp7 = or <2 x i64> %tmp4, %tmp6
|
||||
ret <2 x i64> %tmp7
|
||||
}
|
||||
|
||||
define <8 x i8> @f1(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) nounwind readnone optsize ssp {
|
||||
; CHECK: f1:
|
||||
; CHECK: vbsl
|
||||
%vbsl.i = tail call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) nounwind
|
||||
ret <8 x i8> %vbsl.i
|
||||
}
|
||||
|
||||
define <4 x i16> @f2(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c) nounwind readnone optsize ssp {
|
||||
; CHECK: f2:
|
||||
; CHECK: vbsl
|
||||
%vbsl3.i = tail call <4 x i16> @llvm.arm.neon.vbsl.v4i16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c) nounwind
|
||||
ret <4 x i16> %vbsl3.i
|
||||
}
|
||||
|
||||
define <2 x i32> @f3(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c) nounwind readnone optsize ssp {
|
||||
; CHECK: f3:
|
||||
; CHECK: vbsl
|
||||
%vbsl3.i = tail call <2 x i32> @llvm.arm.neon.vbsl.v2i32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c) nounwind
|
||||
ret <2 x i32> %vbsl3.i
|
||||
}
|
||||
|
||||
define <16 x i8> @g1(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) nounwind readnone optsize ssp {
|
||||
; CHECK: g1:
|
||||
; CHECK: vbsl
|
||||
%vbsl.i = tail call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) nounwind
|
||||
ret <16 x i8> %vbsl.i
|
||||
}
|
||||
|
||||
define <8 x i16> @g2(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) nounwind readnone optsize ssp {
|
||||
; CHECK: g2:
|
||||
; CHECK: vbsl
|
||||
%vbsl3.i = tail call <8 x i16> @llvm.arm.neon.vbsl.v8i16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) nounwind
|
||||
ret <8 x i16> %vbsl3.i
|
||||
}
|
||||
|
||||
define <4 x i32> @g3(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) nounwind readnone optsize ssp {
|
||||
; CHECK: g3:
|
||||
; CHECK: vbsl
|
||||
%vbsl3.i = tail call <4 x i32> @llvm.arm.neon.vbsl.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) nounwind
|
||||
ret <4 x i32> %vbsl3.i
|
||||
}
|
||||
|
||||
declare <4 x i32> @llvm.arm.neon.vbsl.v4i32(<4 x i32>, <4 x i32>, <4 x i32>) nounwind readnone
|
||||
declare <8 x i16> @llvm.arm.neon.vbsl.v8i16(<8 x i16>, <8 x i16>, <8 x i16>) nounwind readnone
|
||||
declare <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone
|
||||
declare <2 x i32> @llvm.arm.neon.vbsl.v2i32(<2 x i32>, <2 x i32>, <2 x i32>) nounwind readnone
|
||||
declare <4 x i16> @llvm.arm.neon.vbsl.v4i16(<4 x i16>, <4 x i16>, <4 x i16>) nounwind readnone
|
||||
declare <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8>, <8 x i8>, <8 x i8>) nounwind readnone
|
||||
|
|
Loading…
Reference in New Issue