forked from OSchip/llvm-project
[ARM] Remove duplicate fp16 intrinsics
These vdup and vmov float16 intrinsics were defined both in the general section and again in the fp16 section under a !aarch64 guard. The vdup_lane intrinsics were defined in both the aarch64 and !aarch64 sections, so they have been merged into a single common definition. Because these intrinsics are emitted as macros, the duplicates did not produce duplicate-definition warnings; removing them should not change the set of available intrinsics.
This commit is contained in:
parent
69d5a038b9
commit
3b09e532ee
|
@@ -530,7 +530,7 @@ def VMOV_N : WOpInst<"vmov_n", ".1",
|
|||
}
|
||||
let InstName = "" in
|
||||
def VDUP_LANE: WOpInst<"vdup_lane", ".qI",
|
||||
"UcUsUicsiPcPsfQUcQUsQUiQcQsQiQPcQPsQflUlQlQUl",
|
||||
"UcUsUicsiPcPshfQUcQUsQUiQcQsQiQPcQPsQhQflUlQlQUl",
|
||||
OP_DUP_LN>;
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
@@ -980,7 +980,7 @@ def COPYQ_LANEQ : IOpInst<"vcopy_laneq", "..I.I",
|
|||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// Set all lanes to same value
|
||||
def VDUP_LANE1: WOpInst<"vdup_lane", ".qI", "hdQhQdPlQPl", OP_DUP_LN>;
|
||||
def VDUP_LANE1: WOpInst<"vdup_lane", ".qI", "dQdPlQPl", OP_DUP_LN>;
|
||||
def VDUP_LANE2: WOpInst<"vdup_laneq", ".QI",
|
||||
"csilUcUsUiUlPcPshfdQcQsQiQlQPcQPsQUcQUsQUiQUlQhQfQdPlQPl",
|
||||
OP_DUP_LN> {
|
||||
|
@@ -1644,7 +1644,8 @@ def SCALAR_VDUP_LANE : IInst<"vdup_lane", "1.I", "ScSsSiSlSfSdSUcSUsSUiSUlSPcSPs
|
|||
def SCALAR_VDUP_LANEQ : IInst<"vdup_laneq", "1QI", "ScSsSiSlSfSdSUcSUsSUiSUlSPcSPs"> {
|
||||
let isLaneQ = 1;
|
||||
}
|
||||
}
|
||||
|
||||
} // ArchGuard = "defined(__aarch64__)"
|
||||
|
||||
// ARMv8.2-A FP16 vector intrinsics for A32/A64.
|
||||
let ArchGuard = "defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)" in {
|
||||
|
@@ -1763,15 +1764,6 @@ let ArchGuard = "defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)" in {
|
|||
def VUZPH : WInst<"vuzp", "2..", "hQh">;
|
||||
def VTRNH : WInst<"vtrn", "2..", "hQh">;
|
||||
|
||||
|
||||
let ArchGuard = "!defined(__aarch64__)" in {
|
||||
// Set all lanes to same value.
|
||||
// Already implemented prior to ARMv8.2-A.
|
||||
def VMOV_NH : WOpInst<"vmov_n", ".1", "hQh", OP_DUP>;
|
||||
def VDUP_NH : WOpInst<"vdup_n", ".1", "hQh", OP_DUP>;
|
||||
def VDUP_LANE1H : WOpInst<"vdup_lane", ".qI", "hQh", OP_DUP_LN>;
|
||||
}
|
||||
|
||||
// Vector Extract
|
||||
def VEXTH : WInst<"vext", "...I", "hQh">;
|
||||
|
||||
|
|
|
@@ -1754,15 +1754,15 @@ float16x8_t test_vmulq_n_f16(float16x8_t a, float16_t b) {
|
|||
// CHECK-LABEL: define {{[^@]+}}@test_vmulh_lane_f16
|
||||
// CHECK-SAME: (half noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] {
|
||||
// CHECK-NEXT: entry:
|
||||
// CHECK-NEXT: [[__REINT_851:%.*]] = alloca <4 x half>, align 8
|
||||
// CHECK-NEXT: [[__REINT1_851:%.*]] = alloca i16, align 2
|
||||
// CHECK-NEXT: [[__REINT_847:%.*]] = alloca <4 x half>, align 8
|
||||
// CHECK-NEXT: [[__REINT1_847:%.*]] = alloca i16, align 2
|
||||
// CHECK-NEXT: [[CONV:%.*]] = fpext half [[A]] to float
|
||||
// CHECK-NEXT: store <4 x half> [[B]], <4 x half>* [[__REINT_851]], align 8
|
||||
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half>* [[__REINT_851]] to <4 x i16>*
|
||||
// CHECK-NEXT: store <4 x half> [[B]], <4 x half>* [[__REINT_847]], align 8
|
||||
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half>* [[__REINT_847]] to <4 x i16>*
|
||||
// CHECK-NEXT: [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* [[TMP0]], align 8
|
||||
// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <4 x i16> [[TMP1]], i32 3
|
||||
// CHECK-NEXT: store i16 [[VGET_LANE]], i16* [[__REINT1_851]], align 2
|
||||
// CHECK-NEXT: [[TMP2:%.*]] = bitcast i16* [[__REINT1_851]] to half*
|
||||
// CHECK-NEXT: store i16 [[VGET_LANE]], i16* [[__REINT1_847]], align 2
|
||||
// CHECK-NEXT: [[TMP2:%.*]] = bitcast i16* [[__REINT1_847]] to half*
|
||||
// CHECK-NEXT: [[TMP3:%.*]] = load half, half* [[TMP2]], align 2
|
||||
// CHECK-NEXT: [[CONV2:%.*]] = fpext half [[TMP3]] to float
|
||||
// CHECK-NEXT: [[MUL:%.*]] = fmul float [[CONV]], [[CONV2]]
|
||||
|
@@ -1776,15 +1776,15 @@ float16_t test_vmulh_lane_f16(float16_t a, float16x4_t b) {
|
|||
// CHECK-LABEL: define {{[^@]+}}@test_vmulh_laneq_f16
|
||||
// CHECK-SAME: (half noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR1]] {
|
||||
// CHECK-NEXT: entry:
|
||||
// CHECK-NEXT: [[__REINT_854:%.*]] = alloca <8 x half>, align 16
|
||||
// CHECK-NEXT: [[__REINT1_854:%.*]] = alloca i16, align 2
|
||||
// CHECK-NEXT: [[__REINT_850:%.*]] = alloca <8 x half>, align 16
|
||||
// CHECK-NEXT: [[__REINT1_850:%.*]] = alloca i16, align 2
|
||||
// CHECK-NEXT: [[CONV:%.*]] = fpext half [[A]] to float
|
||||
// CHECK-NEXT: store <8 x half> [[B]], <8 x half>* [[__REINT_854]], align 16
|
||||
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half>* [[__REINT_854]] to <8 x i16>*
|
||||
// CHECK-NEXT: store <8 x half> [[B]], <8 x half>* [[__REINT_850]], align 16
|
||||
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half>* [[__REINT_850]] to <8 x i16>*
|
||||
// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* [[TMP0]], align 16
|
||||
// CHECK-NEXT: [[VGETQ_LANE:%.*]] = extractelement <8 x i16> [[TMP1]], i32 7
|
||||
// CHECK-NEXT: store i16 [[VGETQ_LANE]], i16* [[__REINT1_854]], align 2
|
||||
// CHECK-NEXT: [[TMP2:%.*]] = bitcast i16* [[__REINT1_854]] to half*
|
||||
// CHECK-NEXT: store i16 [[VGETQ_LANE]], i16* [[__REINT1_850]], align 2
|
||||
// CHECK-NEXT: [[TMP2:%.*]] = bitcast i16* [[__REINT1_850]] to half*
|
||||
// CHECK-NEXT: [[TMP3:%.*]] = load half, half* [[TMP2]], align 2
|
||||
// CHECK-NEXT: [[CONV2:%.*]] = fpext half [[TMP3]] to float
|
||||
// CHECK-NEXT: [[MUL:%.*]] = fmul float [[CONV]], [[CONV2]]
|
||||
|
@@ -2281,6 +2281,30 @@ float16x8_t test_vdupq_lane_f16(float16x4_t a) {
|
|||
return vdupq_lane_f16(a, 3);
|
||||
}
|
||||
|
||||
// CHECK-LABEL: define {{[^@]+}}@test_vdup_laneq_f16
|
||||
// CHECK-SAME: (<8 x half> noundef [[A:%.*]]) #[[ATTR1]] {
|
||||
// CHECK-NEXT: entry:
|
||||
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
|
||||
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x half>
|
||||
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x half> [[TMP1]], <8 x half> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
|
||||
// CHECK-NEXT: ret <4 x half> [[LANE]]
|
||||
//
|
||||
float16x4_t test_vdup_laneq_f16(float16x8_t a) {
|
||||
return vdup_laneq_f16(a, 1);
|
||||
}
|
||||
|
||||
// CHECK-LABEL: define {{[^@]+}}@test_vdupq_laneq_f16
|
||||
// CHECK-SAME: (<8 x half> noundef [[A:%.*]]) #[[ATTR1]] {
|
||||
// CHECK-NEXT: entry:
|
||||
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
|
||||
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x half>
|
||||
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x half> [[TMP1]], <8 x half> [[TMP1]], <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
|
||||
// CHECK-NEXT: ret <8 x half> [[LANE]]
|
||||
//
|
||||
float16x8_t test_vdupq_laneq_f16(float16x8_t a) {
|
||||
return vdupq_laneq_f16(a, 7);
|
||||
}
|
||||
|
||||
// CHECK-LABEL: define {{[^@]+}}@test_vext_f16
|
||||
// CHECK-SAME: (<4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] {
|
||||
// CHECK-NEXT: entry:
|
||||
|
|
Loading…
Reference in New Issue