forked from OSchip/llvm-project
ARM: fix vectorized division on WoA
The Windows on ARM target uses a custom lowering for normal division because the backend needs to insert division-by-zero checks. However, that lowering is designed to handle only non-vectorized division. ARM has a separate custom lowering for vectorized division, which avoids loading registers with the individual values and invoking a division routine for each one, preferring to lower using NEON instructions. Fall back to the custom lowering for the NEON instructions if we encounter a vectorized division. Resolves PR31778. llvm-svn: 293259
This commit is contained in:
parent
c479686af2
commit
26c00e3700
|
@ -7571,11 +7571,11 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
|
|||
case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG);
|
||||
case ISD::MUL: return LowerMUL(Op, DAG);
|
||||
case ISD::SDIV:
|
||||
if (Subtarget->isTargetWindows())
|
||||
if (Subtarget->isTargetWindows() && !Op.getValueType().isVector())
|
||||
return LowerDIV_Windows(Op, DAG, /* Signed */ true);
|
||||
return LowerSDIV(Op, DAG);
|
||||
case ISD::UDIV:
|
||||
if (Subtarget->isTargetWindows())
|
||||
if (Subtarget->isTargetWindows() && !Op.getValueType().isVector())
|
||||
return LowerDIV_Windows(Op, DAG, /* Signed */ false);
|
||||
return LowerUDIV(Op, DAG);
|
||||
case ISD::ADDC:
|
||||
|
|
|
@ -1,49 +1,58 @@
|
|||
; RUN: llc -mtriple=arm-eabi -mattr=+neon -pre-RA-sched=source -disable-post-ra %s -o - \
|
||||
; RUN: | FileCheck %s
|
||||
; RUN: llc -mtriple arm-eabi -mattr=+neon -disable-post-ra -pre-RA-sched source %s -o - | FileCheck %s
|
||||
; RUN: llc -mtriple thumbv7-windows-itanium -mattr=+neon -disable-post-ra -pre-RA-sched source %s -o - | FileCheck %s
|
||||
|
||||
define <8 x i8> @sdivi8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
|
||||
;CHECK: vrecpe.f32
|
||||
;CHECK: vmovn.i32
|
||||
;CHECK: vrecpe.f32
|
||||
;CHECK: vmovn.i32
|
||||
;CHECK: vmovn.i16
|
||||
%tmp1 = load <8 x i8>, <8 x i8>* %A
|
||||
%tmp2 = load <8 x i8>, <8 x i8>* %B
|
||||
%tmp3 = sdiv <8 x i8> %tmp1, %tmp2
|
||||
ret <8 x i8> %tmp3
|
||||
%tmp1 = load <8 x i8>, <8 x i8>* %A
|
||||
%tmp2 = load <8 x i8>, <8 x i8>* %B
|
||||
%tmp3 = sdiv <8 x i8> %tmp1, %tmp2
|
||||
ret <8 x i8> %tmp3
|
||||
}
|
||||
|
||||
; CHECK-LABEL: sdivi8:
|
||||
; CHECK: vrecpe.f32
|
||||
; CHECK: vmovn.i32
|
||||
; CHECK: vrecpe.f32
|
||||
; CHECK: vmovn.i32
|
||||
; CHECK: vmovn.i16
|
||||
|
||||
define <8 x i8> @udivi8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
|
||||
;CHECK: vrecpe.f32
|
||||
;CHECK: vrecps.f32
|
||||
;CHECK: vmovn.i32
|
||||
;CHECK: vrecpe.f32
|
||||
;CHECK: vrecps.f32
|
||||
;CHECK: vmovn.i32
|
||||
;CHECK: vqmovun.s16
|
||||
%tmp1 = load <8 x i8>, <8 x i8>* %A
|
||||
%tmp2 = load <8 x i8>, <8 x i8>* %B
|
||||
%tmp3 = udiv <8 x i8> %tmp1, %tmp2
|
||||
ret <8 x i8> %tmp3
|
||||
%tmp1 = load <8 x i8>, <8 x i8>* %A
|
||||
%tmp2 = load <8 x i8>, <8 x i8>* %B
|
||||
%tmp3 = udiv <8 x i8> %tmp1, %tmp2
|
||||
ret <8 x i8> %tmp3
|
||||
}
|
||||
|
||||
; CHECK-LABEL: udivi8:
|
||||
; CHECK: vrecpe.f32
|
||||
; CHECK: vrecps.f32
|
||||
; CHECK: vmovn.i32
|
||||
; CHECK: vrecpe.f32
|
||||
; CHECK: vrecps.f32
|
||||
; CHECK: vmovn.i32
|
||||
; CHECK: vqmovun.s16
|
||||
|
||||
define <4 x i16> @sdivi16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
|
||||
;CHECK: vrecpe.f32
|
||||
;CHECK: vrecps.f32
|
||||
;CHECK: vmovn.i32
|
||||
%tmp1 = load <4 x i16>, <4 x i16>* %A
|
||||
%tmp2 = load <4 x i16>, <4 x i16>* %B
|
||||
%tmp3 = sdiv <4 x i16> %tmp1, %tmp2
|
||||
ret <4 x i16> %tmp3
|
||||
%tmp1 = load <4 x i16>, <4 x i16>* %A
|
||||
%tmp2 = load <4 x i16>, <4 x i16>* %B
|
||||
%tmp3 = sdiv <4 x i16> %tmp1, %tmp2
|
||||
ret <4 x i16> %tmp3
|
||||
}
|
||||
|
||||
; CHECK-LABEL: sdivi16:
|
||||
; CHECK: vrecpe.f32
|
||||
; CHECK: vrecps.f32
|
||||
; CHECK: vmovn.i32
|
||||
|
||||
define <4 x i16> @udivi16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
|
||||
;CHECK: vrecpe.f32
|
||||
;CHECK: vrecps.f32
|
||||
;CHECK: vrecps.f32
|
||||
;CHECK: vmovn.i32
|
||||
%tmp1 = load <4 x i16>, <4 x i16>* %A
|
||||
%tmp2 = load <4 x i16>, <4 x i16>* %B
|
||||
%tmp3 = udiv <4 x i16> %tmp1, %tmp2
|
||||
ret <4 x i16> %tmp3
|
||||
%tmp1 = load <4 x i16>, <4 x i16>* %A
|
||||
%tmp2 = load <4 x i16>, <4 x i16>* %B
|
||||
%tmp3 = udiv <4 x i16> %tmp1, %tmp2
|
||||
ret <4 x i16> %tmp3
|
||||
}
|
||||
|
||||
; CHECK-LABEL: udivi16:
|
||||
; CHECK: vrecpe.f32
|
||||
; CHECK: vrecps.f32
|
||||
; CHECK: vrecps.f32
|
||||
; CHECK: vmovn.i32
|
||||
|
||||
|
|
Loading…
Reference in New Issue