ARM: fix vectorized division on WoA

The Windows on ARM target uses custom division for normal division as
the backend needs to insert division-by-zero checks.  However, it is
designed to only handle non-vectorized division.  ARM has custom
lowering for vectorized division as that can avoid loading registers
with the values and invoke a division routine for each one, preferring
to lower using NEON instructions.  Fall back to the custom lowering for
the NEON instructions if we encounter a vectorized division.

Resolves PR31778!

llvm-svn: 293259
This commit is contained in:
Saleem Abdulrasool 2017-01-27 03:41:53 +00:00
parent c479686af2
commit 26c00e3700
2 changed files with 48 additions and 39 deletions

View File

@ -7571,11 +7571,11 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG);
case ISD::MUL: return LowerMUL(Op, DAG);
case ISD::SDIV:
if (Subtarget->isTargetWindows())
if (Subtarget->isTargetWindows() && !Op.getValueType().isVector())
return LowerDIV_Windows(Op, DAG, /* Signed */ true);
return LowerSDIV(Op, DAG);
case ISD::UDIV:
if (Subtarget->isTargetWindows())
if (Subtarget->isTargetWindows() && !Op.getValueType().isVector())
return LowerDIV_Windows(Op, DAG, /* Signed */ false);
return LowerUDIV(Op, DAG);
case ISD::ADDC:

View File

@ -1,49 +1,58 @@
; RUN: llc -mtriple=arm-eabi -mattr=+neon -pre-RA-sched=source -disable-post-ra %s -o - \
; RUN: | FileCheck %s
; RUN: llc -mtriple arm-eabi -mattr=+neon -disable-post-ra -pre-RA-sched source %s -o - | FileCheck %s
; RUN: llc -mtriple thumbv7-windows-itanium -mattr=+neon -disable-post-ra -pre-RA-sched source %s -o - | FileCheck %s
define <8 x i8> @sdivi8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK: vrecpe.f32
;CHECK: vmovn.i32
;CHECK: vrecpe.f32
;CHECK: vmovn.i32
;CHECK: vmovn.i16
%tmp1 = load <8 x i8>, <8 x i8>* %A
%tmp2 = load <8 x i8>, <8 x i8>* %B
%tmp3 = sdiv <8 x i8> %tmp1, %tmp2
ret <8 x i8> %tmp3
%tmp1 = load <8 x i8>, <8 x i8>* %A
%tmp2 = load <8 x i8>, <8 x i8>* %B
%tmp3 = sdiv <8 x i8> %tmp1, %tmp2
ret <8 x i8> %tmp3
}
; CHECK-LABEL: sdivi8:
; CHECK: vrecpe.f32
; CHECK: vmovn.i32
; CHECK: vrecpe.f32
; CHECK: vmovn.i32
; CHECK: vmovn.i16
define <8 x i8> @udivi8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK: vrecpe.f32
;CHECK: vrecps.f32
;CHECK: vmovn.i32
;CHECK: vrecpe.f32
;CHECK: vrecps.f32
;CHECK: vmovn.i32
;CHECK: vqmovun.s16
%tmp1 = load <8 x i8>, <8 x i8>* %A
%tmp2 = load <8 x i8>, <8 x i8>* %B
%tmp3 = udiv <8 x i8> %tmp1, %tmp2
ret <8 x i8> %tmp3
%tmp1 = load <8 x i8>, <8 x i8>* %A
%tmp2 = load <8 x i8>, <8 x i8>* %B
%tmp3 = udiv <8 x i8> %tmp1, %tmp2
ret <8 x i8> %tmp3
}
; CHECK-LABEL: udivi8:
; CHECK: vrecpe.f32
; CHECK: vrecps.f32
; CHECK: vmovn.i32
; CHECK: vrecpe.f32
; CHECK: vrecps.f32
; CHECK: vmovn.i32
; CHECK: vqmovun.s16
define <4 x i16> @sdivi16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK: vrecpe.f32
;CHECK: vrecps.f32
;CHECK: vmovn.i32
%tmp1 = load <4 x i16>, <4 x i16>* %A
%tmp2 = load <4 x i16>, <4 x i16>* %B
%tmp3 = sdiv <4 x i16> %tmp1, %tmp2
ret <4 x i16> %tmp3
%tmp1 = load <4 x i16>, <4 x i16>* %A
%tmp2 = load <4 x i16>, <4 x i16>* %B
%tmp3 = sdiv <4 x i16> %tmp1, %tmp2
ret <4 x i16> %tmp3
}
; CHECK-LABEL: sdivi16:
; CHECK: vrecpe.f32
; CHECK: vrecps.f32
; CHECK: vmovn.i32
define <4 x i16> @udivi16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK: vrecpe.f32
;CHECK: vrecps.f32
;CHECK: vrecps.f32
;CHECK: vmovn.i32
%tmp1 = load <4 x i16>, <4 x i16>* %A
%tmp2 = load <4 x i16>, <4 x i16>* %B
%tmp3 = udiv <4 x i16> %tmp1, %tmp2
ret <4 x i16> %tmp3
%tmp1 = load <4 x i16>, <4 x i16>* %A
%tmp2 = load <4 x i16>, <4 x i16>* %B
%tmp3 = udiv <4 x i16> %tmp1, %tmp2
ret <4 x i16> %tmp3
}
; CHECK-LABEL: udivi16:
; CHECK: vrecpe.f32
; CHECK: vrecps.f32
; CHECK: vrecps.f32
; CHECK: vmovn.i32