[Clang][AArch64] Inline assembly support for the ACLE type 'data512_t'

In LLVM IR terms the ACLE type 'data512_t' is essentially an aggregate
type { [8 x i64] }. When emitting code for inline assembly operands,
clang tries to scalarize aggregate types into an integer of equivalent
width; otherwise it passes them by reference. This patch adds a target
hook that tells whether a given inline assembly operand is scalarizable,
so that clang can emit code to pass/return it by value.

Differential Revision: https://reviews.llvm.org/D94098
Author: Alexandros Lamprineas
Date:   2021-07-28 16:40:59 +01:00
Commit: 29b263a34f (parent: fb09f365ae)
5 changed files with 137 additions and 17 deletions
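For context (not part of the diff): with this patch and -target-feature +ls64, an aggregate laid out like the ACLE 'data512_t' can be passed to and returned from inline assembly by value, as exercised by the new test at the end of this commit. The sketch below is illustrative only; the typedef and the helper names ld64b_copy/st64b_copy are stand-ins introduced here, not something the patch adds.

/* Illustrative sketch, assuming -march=armv8.7-a+ls64 (or -target-feature +ls64). */
typedef struct { unsigned long long val[8]; } data512_t;   /* same layout as ACLE data512_t */

static inline data512_t ld64b_copy(void *addr)
{
    data512_t d;
    /* 512-bit "=r" output: now scalarized to an i512 asm result instead of
       being returned through memory. */
    __asm__ volatile ("ld64b %0,[%1]" : "=r" (d) : "r" (addr) : "memory");
    return d;
}

static inline void st64b_copy(const data512_t *d, void *addr)
{
    /* 512-bit "r" input: likewise passed by value as an i512. */
    __asm__ volatile ("st64b %0,[%1]" : : "r" (*d), "r" (addr) : "memory");
}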

@@ -431,7 +431,8 @@ bool AArch64TargetInfo::hasFeature(StringRef Feature) const {
           Feature == "sve2-aes" || Feature == "sve2-sha3" ||
           Feature == "sve2-sm4" || Feature == "f64mm" || Feature == "f32mm" ||
           Feature == "i8mm" || Feature == "bf16") &&
-         (FPU & SveMode));
+         (FPU & SveMode)) ||
+        (Feature == "ls64" && HasLS64);
 }
 
 bool AArch64TargetInfo::handleTargetFeatures(std::vector<std::string> &Features,
@@ -752,6 +753,9 @@ bool AArch64TargetInfo::validateConstraintModifier(
       if (Size == 64)
         return true;
 
+      if (Size == 512)
+        return HasLS64;
+
       SuggestedModifier = "w";
       return false;
     }

@@ -2097,7 +2097,8 @@ CodeGenFunction::EmitAsmInputLValue(const TargetInfo::ConstraintInfo &Info,
   } else {
     llvm::Type *Ty = ConvertType(InputType);
     uint64_t Size = CGM.getDataLayout().getTypeSizeInBits(Ty);
-    if (Size <= 64 && llvm::isPowerOf2_64(Size)) {
+    if ((Size <= 64 && llvm::isPowerOf2_64(Size)) ||
+        getTargetHooks().isScalarizableAsmOperand(*this, Ty)) {
       Ty = llvm::IntegerType::get(getLLVMContext(), Size);
       Ty = llvm::PointerType::getUnqual(Ty);
@@ -2320,23 +2321,28 @@ void CodeGenFunction::EmitAsmStmt(const AsmStmt &S) {
     // If this is a register output, then make the inline asm return it
     // by-value. If this is a memory result, return the value by-reference.
-    bool isScalarizableAggregate =
-        hasAggregateEvaluationKind(OutExpr->getType());
-    if (!Info.allowsMemory() && (hasScalarEvaluationKind(OutExpr->getType()) ||
-                                 isScalarizableAggregate)) {
+    QualType QTy = OutExpr->getType();
+    const bool IsScalarOrAggregate = hasScalarEvaluationKind(QTy) ||
+                                     hasAggregateEvaluationKind(QTy);
+    if (!Info.allowsMemory() && IsScalarOrAggregate) {
       Constraints += "=" + OutputConstraint;
-      ResultRegQualTys.push_back(OutExpr->getType());
+      ResultRegQualTys.push_back(QTy);
       ResultRegDests.push_back(Dest);
-      ResultTruncRegTypes.push_back(ConvertTypeForMem(OutExpr->getType()));
-      if (Info.allowsRegister() && isScalarizableAggregate) {
-        ResultTypeRequiresCast.push_back(true);
-        unsigned Size = getContext().getTypeSize(OutExpr->getType());
-        llvm::Type *ConvTy = llvm::IntegerType::get(getLLVMContext(), Size);
-        ResultRegTypes.push_back(ConvTy);
-      } else {
-        ResultTypeRequiresCast.push_back(false);
-        ResultRegTypes.push_back(ResultTruncRegTypes.back());
+
+      llvm::Type *Ty = ConvertTypeForMem(QTy);
+      const bool RequiresCast = Info.allowsRegister() &&
+          (getTargetHooks().isScalarizableAsmOperand(*this, Ty) ||
+           Ty->isAggregateType());
+
+      ResultTruncRegTypes.push_back(Ty);
+      ResultTypeRequiresCast.push_back(RequiresCast);
+
+      if (RequiresCast) {
+        unsigned Size = getContext().getTypeSize(QTy);
+        Ty = llvm::IntegerType::get(getLLVMContext(), Size);
       }
+      ResultRegTypes.push_back(Ty);
 
       // If this output is tied to an input, and if the input is larger, then
       // we need to set the actual result type of the inline asm node to be the
       // same as the input type.
@@ -2638,11 +2644,11 @@ void CodeGenFunction::EmitAsmStmt(const AsmStmt &S) {
   assert(ResultTypeRequiresCast.size() <= ResultRegDests.size());
   for (unsigned i = 0, e = RegResults.size(); i != e; ++i) {
     llvm::Value *Tmp = RegResults[i];
+    llvm::Type *TruncTy = ResultTruncRegTypes[i];
 
     // If the result type of the LLVM IR asm doesn't match the result type of
     // the expression, do the conversion.
     if (ResultRegTypes[i] != ResultTruncRegTypes[i]) {
-      llvm::Type *TruncTy = ResultTruncRegTypes[i];
 
       // Truncate the integer result to the right size, note that TruncTy can be
       // a pointer.
@@ -2672,6 +2678,11 @@ void CodeGenFunction::EmitAsmStmt(const AsmStmt &S) {
       unsigned Size = getContext().getTypeSize(ResultRegQualTys[i]);
       Address A = Builder.CreateBitCast(Dest.getAddress(*this),
                                         ResultRegTypes[i]->getPointerTo());
+      if (getTargetHooks().isScalarizableAsmOperand(*this, TruncTy)) {
+        Builder.CreateStore(Tmp, A);
+        continue;
+      }
+
       QualType Ty = getContext().getIntTypeForBitwidth(Size, /*Signed*/ false);
       if (Ty.isNull()) {
         const Expr *OutExpr = S.getOutputExpr(i);

@@ -5526,6 +5526,20 @@ public:
     Fn->addFnAttr("branch-target-enforcement",
                   BPI.BranchTargetEnforcement ? "true" : "false");
   }
+
+  bool isScalarizableAsmOperand(CodeGen::CodeGenFunction &CGF,
+                                llvm::Type *Ty) const override {
+    if (CGF.getTarget().hasFeature("ls64")) {
+      auto *ST = dyn_cast<llvm::StructType>(Ty);
+      if (ST && ST->getNumElements() == 1) {
+        auto *AT = dyn_cast<llvm::ArrayType>(ST->getElementType(0));
+        if (AT && AT->getNumElements() == 8 &&
+            AT->getElementType()->isIntegerTy(64))
+          return true;
+      }
+    }
+    return TargetCodeGenInfo::isScalarizableAsmOperand(CGF, Ty);
+  }
 };
 
 class WindowsAArch64TargetCodeGenInfo : public AArch64TargetCodeGenInfo {
class WindowsAArch64TargetCodeGenInfo : public AArch64TargetCodeGenInfo {

@@ -148,6 +148,13 @@ public:
     return Ty;
   }
 
+  /// Target hook to decide whether an inline asm operand can be passed
+  /// by value.
+  virtual bool isScalarizableAsmOperand(CodeGen::CodeGenFunction &CGF,
+                                        llvm::Type *Ty) const {
+    return false;
+  }
+
   /// Adds constraints and types for result registers.
   virtual void addReturnRegisterOutputs(
       CodeGen::CodeGenFunction &CGF, CodeGen::LValue ReturnValue,

@@ -0,0 +1,84 @@
// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
// RUN: %clang_cc1 -triple aarch64-eabi -target-feature +ls64 -O1 -S -emit-llvm -x c %s -o - | FileCheck %s
struct foo { unsigned long long x[8]; };
// CHECK-LABEL: @load(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = call i512 asm sideeffect "ld64b $0,[$1]", "=r,r,~{memory}"(i8* [[ADDR:%.*]]) #[[ATTR1:[0-9]+]], !srcloc !6
// CHECK-NEXT: [[TMP1:%.*]] = bitcast %struct.foo* [[OUTPUT:%.*]] to i512*
// CHECK-NEXT: store i512 [[TMP0]], i512* [[TMP1]], align 8
// CHECK-NEXT: ret void
//
void load(struct foo *output, void *addr)
{
__asm__ volatile ("ld64b %0,[%1]" : "=r" (*output) : "r" (addr) : "memory");
}
// CHECK-LABEL: @store(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast %struct.foo* [[INPUT:%.*]] to i512*
// CHECK-NEXT: [[TMP1:%.*]] = load i512, i512* [[TMP0]], align 8
// CHECK-NEXT: call void asm sideeffect "st64b $0,[$1]", "r,r,~{memory}"(i512 [[TMP1]], i8* [[ADDR:%.*]]) #[[ATTR1]], !srcloc !7
// CHECK-NEXT: ret void
//
void store(const struct foo *input, void *addr)
{
__asm__ volatile ("st64b %0,[%1]" : : "r" (*input), "r" (addr) : "memory" );
}
// CHECK-LABEL: @store2(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[IN:%.*]], align 4, !tbaa [[TBAA8:![0-9]+]]
// CHECK-NEXT: [[CONV:%.*]] = sext i32 [[TMP0]] to i64
// CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, i32* [[IN]], i64 1
// CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[ARRAYIDX1]], align 4, !tbaa [[TBAA8]]
// CHECK-NEXT: [[CONV2:%.*]] = sext i32 [[TMP1]] to i64
// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, i32* [[IN]], i64 4
// CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX4]], align 4, !tbaa [[TBAA8]]
// CHECK-NEXT: [[CONV5:%.*]] = sext i32 [[TMP2]] to i64
// CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, i32* [[IN]], i64 16
// CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[ARRAYIDX7]], align 4, !tbaa [[TBAA8]]
// CHECK-NEXT: [[CONV8:%.*]] = sext i32 [[TMP3]] to i64
// CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds i32, i32* [[IN]], i64 25
// CHECK-NEXT: [[TMP4:%.*]] = load i32, i32* [[ARRAYIDX10]], align 4, !tbaa [[TBAA8]]
// CHECK-NEXT: [[CONV11:%.*]] = sext i32 [[TMP4]] to i64
// CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds i32, i32* [[IN]], i64 36
// CHECK-NEXT: [[TMP5:%.*]] = load i32, i32* [[ARRAYIDX13]], align 4, !tbaa [[TBAA8]]
// CHECK-NEXT: [[CONV14:%.*]] = sext i32 [[TMP5]] to i64
// CHECK-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds i32, i32* [[IN]], i64 49
// CHECK-NEXT: [[TMP6:%.*]] = load i32, i32* [[ARRAYIDX16]], align 4, !tbaa [[TBAA8]]
// CHECK-NEXT: [[CONV17:%.*]] = sext i32 [[TMP6]] to i64
// CHECK-NEXT: [[ARRAYIDX19:%.*]] = getelementptr inbounds i32, i32* [[IN]], i64 64
// CHECK-NEXT: [[TMP7:%.*]] = load i32, i32* [[ARRAYIDX19]], align 4, !tbaa [[TBAA8]]
// CHECK-NEXT: [[CONV20:%.*]] = sext i32 [[TMP7]] to i64
// CHECK-NEXT: [[S_SROA_10_0_INSERT_EXT:%.*]] = zext i64 [[CONV20]] to i512
// CHECK-NEXT: [[S_SROA_10_0_INSERT_SHIFT:%.*]] = shl nuw i512 [[S_SROA_10_0_INSERT_EXT]], 448
// CHECK-NEXT: [[S_SROA_9_0_INSERT_EXT:%.*]] = zext i64 [[CONV17]] to i512
// CHECK-NEXT: [[S_SROA_9_0_INSERT_SHIFT:%.*]] = shl nuw nsw i512 [[S_SROA_9_0_INSERT_EXT]], 384
// CHECK-NEXT: [[S_SROA_9_0_INSERT_INSERT:%.*]] = or i512 [[S_SROA_10_0_INSERT_SHIFT]], [[S_SROA_9_0_INSERT_SHIFT]]
// CHECK-NEXT: [[S_SROA_8_0_INSERT_EXT:%.*]] = zext i64 [[CONV14]] to i512
// CHECK-NEXT: [[S_SROA_8_0_INSERT_SHIFT:%.*]] = shl nuw nsw i512 [[S_SROA_8_0_INSERT_EXT]], 320
// CHECK-NEXT: [[S_SROA_8_0_INSERT_INSERT:%.*]] = or i512 [[S_SROA_9_0_INSERT_INSERT]], [[S_SROA_8_0_INSERT_SHIFT]]
// CHECK-NEXT: [[S_SROA_7_0_INSERT_EXT:%.*]] = zext i64 [[CONV11]] to i512
// CHECK-NEXT: [[S_SROA_7_0_INSERT_SHIFT:%.*]] = shl nuw nsw i512 [[S_SROA_7_0_INSERT_EXT]], 256
// CHECK-NEXT: [[S_SROA_7_0_INSERT_INSERT:%.*]] = or i512 [[S_SROA_8_0_INSERT_INSERT]], [[S_SROA_7_0_INSERT_SHIFT]]
// CHECK-NEXT: [[S_SROA_6_0_INSERT_EXT:%.*]] = zext i64 [[CONV8]] to i512
// CHECK-NEXT: [[S_SROA_6_0_INSERT_SHIFT:%.*]] = shl nuw nsw i512 [[S_SROA_6_0_INSERT_EXT]], 192
// CHECK-NEXT: [[S_SROA_6_0_INSERT_INSERT:%.*]] = or i512 [[S_SROA_7_0_INSERT_INSERT]], [[S_SROA_6_0_INSERT_SHIFT]]
// CHECK-NEXT: [[S_SROA_5_0_INSERT_EXT:%.*]] = zext i64 [[CONV5]] to i512
// CHECK-NEXT: [[S_SROA_5_0_INSERT_SHIFT:%.*]] = shl nuw nsw i512 [[S_SROA_5_0_INSERT_EXT]], 128
// CHECK-NEXT: [[S_SROA_4_0_INSERT_EXT:%.*]] = zext i64 [[CONV2]] to i512
// CHECK-NEXT: [[S_SROA_4_0_INSERT_SHIFT:%.*]] = shl nuw nsw i512 [[S_SROA_4_0_INSERT_EXT]], 64
// CHECK-NEXT: [[S_SROA_4_0_INSERT_MASK:%.*]] = or i512 [[S_SROA_6_0_INSERT_INSERT]], [[S_SROA_5_0_INSERT_SHIFT]]
// CHECK-NEXT: [[S_SROA_0_0_INSERT_EXT:%.*]] = zext i64 [[CONV]] to i512
// CHECK-NEXT: [[S_SROA_0_0_INSERT_MASK:%.*]] = or i512 [[S_SROA_4_0_INSERT_MASK]], [[S_SROA_4_0_INSERT_SHIFT]]
// CHECK-NEXT: [[S_SROA_0_0_INSERT_INSERT:%.*]] = or i512 [[S_SROA_0_0_INSERT_MASK]], [[S_SROA_0_0_INSERT_EXT]]
// CHECK-NEXT: call void asm sideeffect "st64b $0,[$1]", "r,r,~{memory}"(i512 [[S_SROA_0_0_INSERT_INSERT]], i8* [[ADDR:%.*]]) #[[ATTR1]], !srcloc !12
// CHECK-NEXT: ret void
//
void store2(int *in, void *addr)
{
    struct foo s = { in[0], in[1], in[4], in[16], in[25], in[36], in[49], in[64] };
    __asm__ volatile ("st64b %0,[%1]" : : "r" (s), "r" (addr) : "memory" );
}