[PowerPC] Implement low-order Vector Multiply, Modulus and Divide Instructions

This patch implements the low-order vector multiply, divide and modulo instructions available on Power10. It legalizes the ISD nodes UDIV, SDIV, UREM and SREM for the v2i64 and v4i32 vector types, and MUL for v2i64, in order to utilize the following instructions: Vector Multiply Low Doubleword (vmulld); Vector Modulus Word/Doubleword (vmodsw, vmoduw, vmodsd, vmodud); Vector Divide Word/Doubleword (vdivsw, vdivsd, vdivuw, vdivud). Differential Revision: https://reviews.llvm.org/D82510
2020-07-23 13:12:45 -05:00 · 2020-07-23 13:12:45 -05:00 · 1dc1a3fb0c
parent 38c71b7c85
commit 1dc1a3fb0c
5 changed files with 207 additions and 9 deletions
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@ -809,6 +809,18 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
    else
      setOperationAction(ISD::MUL, MVT::v4i32, Custom);

+    if (Subtarget.isISA3_1()) {
+      setOperationAction(ISD::MUL, MVT::v2i64, Legal);
+      setOperationAction(ISD::UDIV, MVT::v2i64, Legal);
+      setOperationAction(ISD::SDIV, MVT::v2i64, Legal);
+      setOperationAction(ISD::UDIV, MVT::v4i32, Legal);
+      setOperationAction(ISD::SDIV, MVT::v4i32, Legal);
+      setOperationAction(ISD::UREM, MVT::v2i64, Legal);
+      setOperationAction(ISD::SREM, MVT::v2i64, Legal);
+      setOperationAction(ISD::UREM, MVT::v4i32, Legal);
+      setOperationAction(ISD::SREM, MVT::v4i32, Legal);
+    }
+
    setOperationAction(ISD::MUL, MVT::v8i16, Legal);
    setOperationAction(ISD::MUL, MVT::v16i8, Custom);

--- a/llvm/lib/Target/PowerPC/PPCInstrPrefix.td
+++ b/llvm/lib/Target/PowerPC/PPCInstrPrefix.td
@ -973,7 +973,8 @@ let Predicates = [IsISA3_1] in {
                         [(set v16i8:$vD,
                               (int_ppc_altivec_vclrrb v16i8:$vA, i32:$rB))]>;
  def VMULLD : VXForm_1<457, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
-                        "vmulld $vD, $vA, $vB", IIC_VecGeneral, []>;
+                        "vmulld $vD, $vA, $vB", IIC_VecGeneral,
+                        [(set v2i64:$vD, (mul v2i64:$vA, v2i64:$vB))]>;
  def VMULHSW : VXForm_1<905, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
                         "vmulhsw $vD, $vA, $vB", IIC_VecGeneral, []>;
  def VMULHUW : VXForm_1<649, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
@ -983,21 +984,29 @@ let Predicates = [IsISA3_1] in {
  def VMULHUD : VXForm_1<713, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
                         "vmulhud $vD, $vA, $vB", IIC_VecGeneral, []>;
  def VMODSW : VXForm_1<1931, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
-                        "vmodsw $vD, $vA, $vB", IIC_VecGeneral, []>;
+                        "vmodsw $vD, $vA, $vB", IIC_VecGeneral,
+                        [(set v4i32:$vD, (srem v4i32:$vA, v4i32:$vB))]>;
  def VMODUW : VXForm_1<1675, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
-                        "vmoduw $vD, $vA, $vB", IIC_VecGeneral, []>;
+                        "vmoduw $vD, $vA, $vB", IIC_VecGeneral,
+                        [(set v4i32:$vD, (urem v4i32:$vA, v4i32:$vB))]>;
  def VMODSD : VXForm_1<1995, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
-                        "vmodsd $vD, $vA, $vB", IIC_VecGeneral, []>;
+                        "vmodsd $vD, $vA, $vB", IIC_VecGeneral,
+                        [(set v2i64:$vD, (srem v2i64:$vA, v2i64:$vB))]>;
  def VMODUD : VXForm_1<1739, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
-                        "vmodud $vD, $vA, $vB", IIC_VecGeneral, []>;
+                        "vmodud $vD, $vA, $vB", IIC_VecGeneral,
+                        [(set v2i64:$vD, (urem v2i64:$vA, v2i64:$vB))]>;
  def VDIVSW : VXForm_1<395, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
-                        "vdivsw $vD, $vA, $vB", IIC_VecGeneral, []>;
+                        "vdivsw $vD, $vA, $vB", IIC_VecGeneral,
+                        [(set v4i32:$vD, (sdiv v4i32:$vA, v4i32:$vB))]>;
  def VDIVUW : VXForm_1<139, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
-                        "vdivuw $vD, $vA, $vB", IIC_VecGeneral, []>;
+                        "vdivuw $vD, $vA, $vB", IIC_VecGeneral,
+                        [(set v4i32:$vD, (udiv v4i32:$vA, v4i32:$vB))]>;
  def VDIVSD : VXForm_1<459, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
-                        "vdivsd $vD, $vA, $vB", IIC_VecGeneral, []>;
+                        "vdivsd $vD, $vA, $vB", IIC_VecGeneral,
+                        [(set v2i64:$vD, (sdiv v2i64:$vA, v2i64:$vB))]>;
  def VDIVUD : VXForm_1<203, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
-                        "vdivud $vD, $vA, $vB", IIC_VecGeneral, []>;
+                        "vdivud $vD, $vA, $vB", IIC_VecGeneral,
+                        [(set v2i64:$vD, (udiv v2i64:$vA, v2i64:$vB))]>;
  def VDIVESW : VXForm_1<907, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
                         "vdivesw $vD, $vA, $vB", IIC_VecGeneral, []>;
  def VDIVEUW : VXForm_1<651, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
--- a/llvm/test/CodeGen/PowerPC/p10-vector-divide.ll
+++ b/llvm/test/CodeGen/PowerPC/p10-vector-divide.ll
@ -0,0 +1,51 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \
+; RUN:   -mcpu=pwr10 -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr < %s | \
+; RUN:   FileCheck %s
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \
+; RUN:   -mcpu=pwr10 -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr < %s | \
+; RUN:   FileCheck %s
+
+; This test case aims to test the vector divide instructions on Power10.
+; This includes the low-order and extended versions of vector divide,
+; which operate on signed and unsigned words and doublewords.
+
+define <2 x i64> @test_vdivud(<2 x i64> %a, <2 x i64> %b) {
+; CHECK-LABEL: test_vdivud:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vdivud v2, v2, v3
+; CHECK-NEXT:    blr
+entry:
+  %div = udiv <2 x i64> %a, %b
+  ret <2 x i64> %div
+}
+
+define <2 x i64> @test_vdivsd(<2 x i64> %a, <2 x i64> %b) {
+; CHECK-LABEL: test_vdivsd:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vdivsd v2, v2, v3
+; CHECK-NEXT:    blr
+entry:
+  %div = sdiv <2 x i64> %a, %b
+  ret <2 x i64> %div
+}
+
+define <4 x i32> @test_vdivuw(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: test_vdivuw:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vdivuw v2, v2, v3
+; CHECK-NEXT:    blr
+entry:
+  %div = udiv <4 x i32> %a, %b
+  ret <4 x i32> %div
+}
+
+define <4 x i32> @test_vdivsw(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: test_vdivsw:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vdivsw v2, v2, v3
+; CHECK-NEXT:    blr
+entry:
+  %div = sdiv <4 x i32> %a, %b
+  ret <4 x i32> %div
+}
--- a/llvm/test/CodeGen/PowerPC/p10-vector-modulo.ll
+++ b/llvm/test/CodeGen/PowerPC/p10-vector-modulo.ll
@ -0,0 +1,107 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \
+; RUN:   -mcpu=pwr10 -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr < %s | \
+; RUN:   FileCheck %s
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \
+; RUN:   -mcpu=pwr10 -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr < %s | \
+; RUN:   FileCheck %s
+
+; This test case aims to test the vector modulo instructions on Power10.
+; The vector modulo instructions operate on signed and unsigned words
+; and doublewords.
+
+define <2 x i64> @test_vmodud(<2 x i64> %a, <2 x i64> %b) {
+; CHECK-LABEL: test_vmodud:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmodud v2, v2, v3
+; CHECK-NEXT:    blr
+entry:
+  %rem = urem <2 x i64> %a, %b
+  ret <2 x i64> %rem
+}
+
+define <2 x i64> @test_vmodsd(<2 x i64> %a, <2 x i64> %b) {
+; CHECK-LABEL: test_vmodsd:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmodsd v2, v2, v3
+; CHECK-NEXT:    blr
+entry:
+  %rem = srem <2 x i64> %a, %b
+  ret <2 x i64> %rem
+}
+
+define <4 x i32> @test_vmoduw(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: test_vmoduw:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmoduw v2, v2, v3
+; CHECK-NEXT:    blr
+entry:
+  %rem = urem <4 x i32> %a, %b
+  ret <4 x i32> %rem
+}
+
+define <4 x i32> @test_vmodsw(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: test_vmodsw:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmodsw v2, v2, v3
+; CHECK-NEXT:    blr
+entry:
+  %rem = srem <4 x i32> %a, %b
+  ret <4 x i32> %rem
+}
+
+define <2 x i64> @test_vmodud_with_div(<2 x i64> %a, <2 x i64> %b) {
+; CHECK-LABEL: test_vmodud_with_div:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmodud v4, v2, v3
+; CHECK-NEXT:    vdivud v2, v2, v3
+; CHECK-NEXT:    vaddudm v2, v4, v2
+; CHECK-NEXT:    blr
+entry:
+  %rem = urem <2 x i64> %a, %b
+  %div = udiv <2 x i64> %a, %b
+  %add = add <2 x i64> %rem, %div
+  ret <2 x i64> %add
+}
+
+define <2 x i64> @test_vmodsd_with_div(<2 x i64> %a, <2 x i64> %b) {
+; CHECK-LABEL: test_vmodsd_with_div:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmodsd v4, v2, v3
+; CHECK-NEXT:    vdivsd v2, v2, v3
+; CHECK-NEXT:    vaddudm v2, v4, v2
+; CHECK-NEXT:    blr
+entry:
+  %rem = srem <2 x i64> %a, %b
+  %div = sdiv <2 x i64> %a, %b
+  %add = add <2 x i64> %rem, %div
+  ret <2 x i64> %add
+}
+
+define <4 x i32> @test_vmoduw_with_div(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: test_vmoduw_with_div:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmoduw v4, v2, v3
+; CHECK-NEXT:    vdivuw v2, v2, v3
+; CHECK-NEXT:    vadduwm v2, v4, v2
+; CHECK-NEXT:    blr
+entry:
+  %rem = urem <4 x i32> %a, %b
+  %div = udiv <4 x i32> %a, %b
+  %add = add <4 x i32> %rem, %div
+  ret <4 x i32> %add
+}
+
+define <4 x i32> @test_vmodsw_div(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: test_vmodsw_div:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmodsw v4, v2, v3
+; CHECK-NEXT:    vdivsw v2, v2, v3
+; CHECK-NEXT:    vadduwm v2, v4, v2
+; CHECK-NEXT:    blr
+entry:
+  %rem = srem <4 x i32> %a, %b
+  %div = sdiv <4 x i32> %a, %b
+  %add = add <4 x i32> %rem, %div
+  ret <4 x i32> %add
+}
--- a/llvm/test/CodeGen/PowerPC/p10-vector-multiply.ll
+++ b/llvm/test/CodeGen/PowerPC/p10-vector-multiply.ll
@ -0,0 +1,19 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \
+; RUN:   -mcpu=pwr10 -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr < %s | \
+; RUN:   FileCheck %s
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \
+; RUN:   -mcpu=pwr10 -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr < %s | \
+; RUN:   FileCheck %s
+
+; This test case aims to test the vector multiply instructions on Power10.
+
+define <2 x i64> @test_vmulld(<2 x i64> %a, <2 x i64> %b) {
+; CHECK-LABEL: test_vmulld:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmulld v2, v3, v2
+; CHECK-NEXT:    blr
+entry:
+  %mul = mul <2 x i64> %b, %a
+  ret <2 x i64> %mul
+}