[AArch64] Improve add/sub/cmp isel of uxtw forms.

Don't match the UXTW extended reg forms of ADD/ADDS/SUB/SUBS if the
32-bit to 64-bit zero-extend can be done for free by taking advantage
of the 32-bit defining instruction zeroing the upper 32 bits of the X
register destination.  This enables better instruction selection in a
few cases, such as:

  sub x0, xzr, x8
  instead of:
  mov x8, xzr
  sub x0, x8, w9, uxtw

  madd x0, x1, x1, x8
  instead of:
  mul x9, x1, x1
  add x0, x9, w8, uxtw

  cmp x2, x8
  instead of:
  sub x8, x2, w8, uxtw
  cmp x8, #0

  add x0, x8, x1, lsl #3
  instead of:
  lsl x9, x1, #3
  add x0, x9, w8, uxtw

Reviewers: t.p.northover, jmolloy

Subscribers: mcrosier, aemerson, llvm-commits, rengolin

Differential Revision: https://reviews.llvm.org/D24747

llvm-svn: 282413
This commit is contained in:
Geoff Berry 2016-09-26 15:34:47 +00:00
parent e45de8a5ec
commit 256fcf975f
4 changed files with 124 additions and 14 deletions

View File

@ -586,6 +586,11 @@ bool AArch64DAGToDAGISel::SelectArithExtendedRegister(SDValue N, SDValue &Reg,
return false;
Reg = N.getOperand(0);
// Don't match if free 32-bit -> 64-bit zext can be used instead.
if (Ext == AArch64_AM::UXTW &&
Reg->getValueType(0).getSizeInBits() == 32 && isDef32(*Reg.getNode()))
return false;
}
// AArch64 mandates that the RHS of the operation must use the smallest

View File

@ -215,6 +215,21 @@ enum NodeType : unsigned {
} // end namespace AArch64ISD
/// Return true if \p N is a node whose 32-bit result is known to have been
/// produced with the upper 32 bits of the containing 64-bit register zeroed,
/// so a following 32-bit -> 64-bit zero-extend costs nothing.
///
/// Any instruction that defines a 32-bit result zeros out the high half of the
/// register. Truncate can be lowered to EXTRACT_SUBREG. CopyFromReg may
/// be copying from a truncate. But any other 32-bit operation will zero-extend
/// up to 64 bits.
///
/// This helper only inspects the opcode; callers are responsible for checking
/// that the value produced by \p N is actually 32 bits wide.
///
/// The function is `static inline`, which already gives it internal linkage,
/// so it is deliberately not wrapped in an anonymous namespace.
// FIXME: X86 also checks for CMOV here. Do we need something similar?
static inline bool isDef32(const SDNode &N) {
  unsigned Opc = N.getOpcode();
  return Opc != ISD::TRUNCATE && Opc != TargetOpcode::EXTRACT_SUBREG &&
         Opc != ISD::CopyFromReg;
}
class AArch64Subtarget;
class AArch64TargetMachine;

View File

@ -5272,15 +5272,8 @@ def SHA256SU0rr : SHATiedInstVV<0b0010, "sha256su0",int_aarch64_crypto_sha256su0
//----------------------------------------------------------------------------
// FIXME: Like for X86, these should go in their own separate .td file.
// Any instruction that defines a 32-bit result zeros out the high half of the
// register. Truncate can be lowered to EXTRACT_SUBREG. CopyFromReg may
// be copying from a truncate. But any other 32-bit operation will zero-extend
// up to 64 bits.
// FIXME: X86 also checks for CMOV here. Do we need something similar?
def def32 : PatLeaf<(i32 GPR32:$src), [{
  // Defer to the shared C++ helper so this pattern predicate and the
  // extended-register operand selection agree on what counts as a 32-bit def.
  return isDef32(*N);
}]>;
// In the case of a 32-bit def that is known to implicitly zero-extend,

View File

@ -274,19 +274,20 @@ define void @sub_i16rhs() minsize {
; N.b. we could probably check more here ("add w2, w3, w1, uxtw" for
; example), but the remaining instructions are probably not idiomatic
; in the face of "add/sub (shifted register)" so I don't intend to.
define void @addsub_i32rhs() minsize {
define void @addsub_i32rhs(i32 %in32) minsize {
; CHECK-LABEL: addsub_i32rhs:
%val32_tmp = load i32, i32* @var32
%lhs64 = load i64, i64* @var64
%val32 = add i32 %val32_tmp, 123
%rhs64_zext = zext i32 %val32 to i64
%rhs64_zext = zext i32 %in32 to i64
%res64_zext = add i64 %lhs64, %rhs64_zext
store volatile i64 %res64_zext, i64* @var64
; CHECK: add {{x[0-9]+}}, {{x[0-9]+}}, {{w[0-9]+}}, uxtw
%rhs64_zext_shift = shl i64 %rhs64_zext, 2
%rhs64_zext2 = zext i32 %val32 to i64
%rhs64_zext_shift = shl i64 %rhs64_zext2, 2
%res64_zext_shift = add i64 %lhs64, %rhs64_zext_shift
store volatile i64 %res64_zext_shift, i64* @var64
; CHECK: add {{x[0-9]+}}, {{x[0-9]+}}, {{w[0-9]+}}, uxtw #2
@ -304,19 +305,20 @@ define void @addsub_i32rhs() minsize {
ret void
}
define void @sub_i32rhs() minsize {
define void @sub_i32rhs(i32 %in32) minsize {
; CHECK-LABEL: sub_i32rhs:
%val32_tmp = load i32, i32* @var32
%lhs64 = load i64, i64* @var64
%val32 = add i32 %val32_tmp, 123
%rhs64_zext = zext i32 %val32 to i64
%rhs64_zext = zext i32 %in32 to i64
%res64_zext = sub i64 %lhs64, %rhs64_zext
store volatile i64 %res64_zext, i64* @var64
; CHECK: sub {{x[0-9]+}}, {{x[0-9]+}}, {{w[0-9]+}}, uxtw
%rhs64_zext_shift = shl i64 %rhs64_zext, 2
%rhs64_zext2 = zext i32 %val32 to i64
%rhs64_zext_shift = shl i64 %rhs64_zext2, 2
%res64_zext_shift = sub i64 %lhs64, %rhs64_zext_shift
store volatile i64 %res64_zext_shift, i64* @var64
; CHECK: sub {{x[0-9]+}}, {{x[0-9]+}}, {{w[0-9]+}}, uxtw #2
@ -333,3 +335,98 @@ define void @sub_i32rhs() minsize {
ret void
}
; Check that implicit zext from w reg write is used instead of uxtw form of add.
; The 32-bit 'and' defines a W register, and such a write zeroes the upper
; 32 bits of the corresponding X register, so the zext needs no instruction
; and the plain register form of add is expected.
define i64 @add_fold_uxtw(i32 %x, i64 %y) {
; CHECK-LABEL: add_fold_uxtw:
entry:
; CHECK: and w[[TMP:[0-9]+]], w0, #0x3
%m = and i32 %x, 3
%ext = zext i32 %m to i64
; The X view of the register written by the 'and' is used directly.
; CHECK-NEXT: add x0, x1, x[[TMP]]
%ret = add i64 %y, %ext
ret i64 %ret
}
; Check that implicit zext from w reg write is used instead of uxtw
; form of sub and that mov WZR is folded to form a neg instruction.
; The minuend is the constant 0, so once the uxtw form is avoided the
; subtraction is expected to come out as a single 'neg' of the X register.
define i64 @sub_fold_uxtw_xzr(i32 %x) {
; CHECK-LABEL: sub_fold_uxtw_xzr:
entry:
; CHECK: and w[[TMP:[0-9]+]], w0, #0x3
%m = and i32 %x, 3
%ext = zext i32 %m to i64
; No separate zero-materialising mov is expected before the negate.
; CHECK-NEXT: neg x0, x[[TMP]]
%ret = sub i64 0, %ext
ret i64 %ret
}
; Check that implicit zext from w reg write is used instead of uxtw form of subs/cmp.
; The 64-bit equality compare against the zero-extended value is expected to
; use the plain register form of cmp on the X view of the 'and' result.
define i1 @cmp_fold_uxtw(i32 %x, i64 %y) {
; CHECK-LABEL: cmp_fold_uxtw:
entry:
; CHECK: and w[[TMP:[0-9]+]], w0, #0x3
%m = and i32 %x, 3
%ext = zext i32 %m to i64
; The compare must directly follow the 'and', then a cset produces the i1.
; CHECK-NEXT: cmp x1, x[[TMP]]
; CHECK-NEXT: cset
%ret = icmp eq i64 %y, %ext
ret i1 %ret
}
; Check that implicit zext from w reg write is used instead of uxtw
; form of add, leading to madd selection.
; With the uxtw form of add no longer matched, the multiply and the add are
; expected to be combined into one madd that takes the X view of the 'and'
; result as its addend.
define i64 @madd_fold_uxtw(i32 %x, i64 %y) {
; CHECK-LABEL: madd_fold_uxtw:
entry:
; CHECK: and w[[TMP:[0-9]+]], w0, #0x3
%m = and i32 %x, 3
%ext = zext i32 %m to i64
; %y * %y + %ext in a single instruction; no separate mul is expected.
; CHECK-NEXT: madd x0, x1, x1, x[[TMP]]
%mul = mul i64 %y, %y
%ret = add i64 %mul, %ext
ret i64 %ret
}
; Check that implicit zext from w reg write is used instead of uxtw
; form of sub, leading to sub/cmp folding: the subtraction and the compare
; of its result against zero are expected to become a single register-form cmp.
define i1 @cmp_sub_fold_uxtw(i32 %x, i64 %y, i64 %z) {
; CHECK-LABEL: cmp_sub_fold_uxtw:
entry:
; CHECK: and w[[TMP:[0-9]+]], w0, #0x3
%m = and i32 %x, 3
%ext = zext i32 %m to i64
; TMP2 captures whichever X register holds the left operand without pinning
; it to a specific number (presumably the register carrying %z).
; CHECK-NEXT: cmp x[[TMP2:[0-9]+]], x[[TMP]]
; CHECK-NEXT: cset
%sub = sub i64 %z, %ext
%ret = icmp eq i64 %sub, 0
ret i1 %ret
}
; Check that implicit zext from w reg write is used instead of uxtw
; form of add and add of -1 gets selected as sub.
; The zero-extended value is the left operand here and the right operand is
; the constant -1, so an immediate-form 'sub ..., #1' is expected.
define i64 @add_imm_fold_uxtw(i32 %x) {
; CHECK-LABEL: add_imm_fold_uxtw:
entry:
; CHECK: and w[[TMP:[0-9]+]], w0, #0x3
%m = and i32 %x, 3
%ext = zext i32 %m to i64
; Adding -1 is expected to be emitted as subtracting the immediate 1.
; CHECK-NEXT: sub x0, x[[TMP]], #1
%ret = add i64 %ext, -1
ret i64 %ret
}
; Check that implicit zext from w reg write is used instead of uxtw
; form of add and add lsl form gets selected.
; Unlike the neighbouring tests, the 32-bit def here is an 'or' (orr) rather
; than an 'and'. With the uxtw form avoided, the shift of %y is expected to
; be folded into the add as a shifted-register operand.
define i64 @add_lsl_fold_uxtw(i32 %x, i64 %y) {
; CHECK-LABEL: add_lsl_fold_uxtw:
entry:
; CHECK: orr w[[TMP:[0-9]+]], w0, #0x3
%m = or i32 %x, 3
%ext = zext i32 %m to i64
%shift = shl i64 %y, 3
; No standalone lsl is expected; the shift rides on the add's second operand.
; CHECK-NEXT: add x0, x[[TMP]], x1, lsl #3
%ret = add i64 %ext, %shift
ret i64 %ret
}