[Clang][AArch64] Inline assembly support for the ACLE type 'data512_t'

In LLVM IR terms the ACLE type 'data512_t' is essentially an aggregate
type { [8 x i64] }. When emitting code for inline assembly operands,
clang tries to scalarize aggregate types into an integer of equivalent
width; otherwise it passes them by reference. This patch adds a target
hook that tells whether a given inline assembly operand is scalarizable,
so that clang can emit code to pass/return it by value.

Differential Revision: https://reviews.llvm.org/D94098
Author: Alexandros Lamprineas
Date:   2021-07-28 16:40:59 +01:00
Commit: 29b263a34f (parent: fb09f365ae)
5 changed files with 137 additions and 17 deletions
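For context (not part of the diff): with this patch and -target-feature +ls64, an aggregate laid out like the ACLE 'data512_t' can be passed to and returned from inline assembly by value, as exercised by the new test at the end of this commit. The sketch below is illustrative only; the typedef and the helper names ld64b_copy/st64b_copy are stand-ins introduced here, not something the patch adds.

/* Illustrative sketch, assuming -march=armv8.7-a+ls64 (or -target-feature +ls64). */
typedef struct { unsigned long long val[8]; } data512_t;   /* same layout as ACLE data512_t */

static inline data512_t ld64b_copy(void *addr)
{
    data512_t d;
    /* 512-bit "=r" output: now scalarized to an i512 asm result instead of
       being returned through memory. */
    __asm__ volatile ("ld64b %0,[%1]" : "=r" (d) : "r" (addr) : "memory");
    return d;
}

static inline void st64b_copy(const data512_t *d, void *addr)
{
    /* 512-bit "r" input: likewise passed by value as an i512. */
    __asm__ volatile ("st64b %0,[%1]" : : "r" (*d), "r" (addr) : "memory");
}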

@@ -431,7 +431,8 @@ bool AArch64TargetInfo::hasFeature(StringRef Feature) const {
           Feature == "sve2-aes" || Feature == "sve2-sha3" ||
           Feature == "sve2-sm4" || Feature == "f64mm" || Feature == "f32mm" ||
           Feature == "i8mm" || Feature == "bf16") &&
-         (FPU & SveMode));
+         (FPU & SveMode)) ||
+        (Feature == "ls64" && HasLS64);
 }
 
 bool AArch64TargetInfo::handleTargetFeatures(std::vector<std::string> &Features,
@@ -752,6 +753,9 @@ bool AArch64TargetInfo::validateConstraintModifier(
       if (Size == 64)
         return true;
 
+      if (Size == 512)
+        return HasLS64;
+
       SuggestedModifier = "w";
       return false;
     }

@@ -2097,7 +2097,8 @@ CodeGenFunction::EmitAsmInputLValue(const TargetInfo::ConstraintInfo &Info,
   } else {
     llvm::Type *Ty = ConvertType(InputType);
     uint64_t Size = CGM.getDataLayout().getTypeSizeInBits(Ty);
-    if (Size <= 64 && llvm::isPowerOf2_64(Size)) {
+    if ((Size <= 64 && llvm::isPowerOf2_64(Size)) ||
+        getTargetHooks().isScalarizableAsmOperand(*this, Ty)) {
       Ty = llvm::IntegerType::get(getLLVMContext(), Size);
       Ty = llvm::PointerType::getUnqual(Ty);
@@ -2320,23 +2321,28 @@ void CodeGenFunction::EmitAsmStmt(const AsmStmt &S) {
     // If this is a register output, then make the inline asm return it
     // by-value. If this is a memory result, return the value by-reference.
-    bool isScalarizableAggregate =
-        hasAggregateEvaluationKind(OutExpr->getType());
-    if (!Info.allowsMemory() && (hasScalarEvaluationKind(OutExpr->getType()) ||
-                                 isScalarizableAggregate)) {
+    QualType QTy = OutExpr->getType();
+    const bool IsScalarOrAggregate = hasScalarEvaluationKind(QTy) ||
+                                     hasAggregateEvaluationKind(QTy);
+    if (!Info.allowsMemory() && IsScalarOrAggregate) {
       Constraints += "=" + OutputConstraint;
-      ResultRegQualTys.push_back(OutExpr->getType());
+      ResultRegQualTys.push_back(QTy);
       ResultRegDests.push_back(Dest);
-      ResultTruncRegTypes.push_back(ConvertTypeForMem(OutExpr->getType()));
-      if (Info.allowsRegister() && isScalarizableAggregate) {
-        ResultTypeRequiresCast.push_back(true);
-        unsigned Size = getContext().getTypeSize(OutExpr->getType());
-        llvm::Type *ConvTy = llvm::IntegerType::get(getLLVMContext(), Size);
-        ResultRegTypes.push_back(ConvTy);
-      } else {
-        ResultTypeRequiresCast.push_back(false);
-        ResultRegTypes.push_back(ResultTruncRegTypes.back());
+
+      llvm::Type *Ty = ConvertTypeForMem(QTy);
+      const bool RequiresCast = Info.allowsRegister() &&
+          (getTargetHooks().isScalarizableAsmOperand(*this, Ty) ||
+           Ty->isAggregateType());
+
+      ResultTruncRegTypes.push_back(Ty);
+      ResultTypeRequiresCast.push_back(RequiresCast);
+
+      if (RequiresCast) {
+        unsigned Size = getContext().getTypeSize(QTy);
+        Ty = llvm::IntegerType::get(getLLVMContext(), Size);
       }
+      ResultRegTypes.push_back(Ty);
 
       // If this output is tied to an input, and if the input is larger, then
       // we need to set the actual result type of the inline asm node to be the
       // same as the input type.
@@ -2638,11 +2644,11 @@ void CodeGenFunction::EmitAsmStmt(const AsmStmt &S) {
   assert(ResultTypeRequiresCast.size() <= ResultRegDests.size());
   for (unsigned i = 0, e = RegResults.size(); i != e; ++i) {
     llvm::Value *Tmp = RegResults[i];
+    llvm::Type *TruncTy = ResultTruncRegTypes[i];
 
     // If the result type of the LLVM IR asm doesn't match the result type of
     // the expression, do the conversion.
     if (ResultRegTypes[i] != ResultTruncRegTypes[i]) {
-      llvm::Type *TruncTy = ResultTruncRegTypes[i];
 
       // Truncate the integer result to the right size, note that TruncTy can be
       // a pointer.
@@ -2672,6 +2678,11 @@ void CodeGenFunction::EmitAsmStmt(const AsmStmt &S) {
       unsigned Size = getContext().getTypeSize(ResultRegQualTys[i]);
       Address A = Builder.CreateBitCast(Dest.getAddress(*this),
                                         ResultRegTypes[i]->getPointerTo());
+      if (getTargetHooks().isScalarizableAsmOperand(*this, TruncTy)) {
+        Builder.CreateStore(Tmp, A);
+        continue;
+      }
+
       QualType Ty = getContext().getIntTypeForBitwidth(Size, /*Signed*/ false);
       if (Ty.isNull()) {
         const Expr *OutExpr = S.getOutputExpr(i);

@@ -5526,6 +5526,20 @@ public:
     Fn->addFnAttr("branch-target-enforcement",
                   BPI.BranchTargetEnforcement ? "true" : "false");
   }
+
+  bool isScalarizableAsmOperand(CodeGen::CodeGenFunction &CGF,
+                                llvm::Type *Ty) const override {
+    if (CGF.getTarget().hasFeature("ls64")) {
+      auto *ST = dyn_cast<llvm::StructType>(Ty);
+      if (ST && ST->getNumElements() == 1) {
+        auto *AT = dyn_cast<llvm::ArrayType>(ST->getElementType(0));
+        if (AT && AT->getNumElements() == 8 &&
+            AT->getElementType()->isIntegerTy(64))
+          return true;
+      }
+    }
+    return TargetCodeGenInfo::isScalarizableAsmOperand(CGF, Ty);
+  }
 };
 
 class WindowsAArch64TargetCodeGenInfo : public AArch64TargetCodeGenInfo {
class WindowsAArch64TargetCodeGenInfo : public AArch64TargetCodeGenInfo {

@@ -148,6 +148,13 @@ public:
     return Ty;
   }
 
+  /// Target hook to decide whether an inline asm operand can be passed
+  /// by value.
+  virtual bool isScalarizableAsmOperand(CodeGen::CodeGenFunction &CGF,
+                                        llvm::Type *Ty) const {
+    return false;
+  }
+
   /// Adds constraints and types for result registers.
   virtual void addReturnRegisterOutputs(
       CodeGen::CodeGenFunction &CGF, CodeGen::LValue ReturnValue,

@@ -0,0 +1,84 @@
// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
// RUN: %clang_cc1 -triple aarch64-eabi -target-feature +ls64 -O1 -S -emit-llvm -x c %s -o - | FileCheck %s
struct foo { unsigned long long x[8]; };
// CHECK-LABEL: @load(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = call i512 asm sideeffect "ld64b $0,[$1]", "=r,r,~{memory}"(i8* [[ADDR:%.*]]) #[[ATTR1:[0-9]+]], !srcloc !6
// CHECK-NEXT: [[TMP1:%.*]] = bitcast %struct.foo* [[OUTPUT:%.*]] to i512*
// CHECK-NEXT: store i512 [[TMP0]], i512* [[TMP1]], align 8
// CHECK-NEXT: ret void
//
void load(struct foo *output, void *addr)
{
__asm__ volatile ("ld64b %0,[%1]" : "=r" (*output) : "r" (addr) : "memory");
}
// CHECK-LABEL: @store(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast %struct.foo* [[INPUT:%.*]] to i512*
// CHECK-NEXT: [[TMP1:%.*]] = load i512, i512* [[TMP0]], align 8
// CHECK-NEXT: call void asm sideeffect "st64b $0,[$1]", "r,r,~{memory}"(i512 [[TMP1]], i8* [[ADDR:%.*]]) #[[ATTR1]], !srcloc !7
// CHECK-NEXT: ret void
//
void store(const struct foo *input, void *addr)
{
__asm__ volatile ("st64b %0,[%1]" : : "r" (*input), "r" (addr) : "memory" );
}
// CHECK-LABEL: @store2(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[IN:%.*]], align 4, !tbaa [[TBAA8:![0-9]+]]
// CHECK-NEXT: [[CONV:%.*]] = sext i32 [[TMP0]] to i64
// CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, i32* [[IN]], i64 1
// CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[ARRAYIDX1]], align 4, !tbaa [[TBAA8]]
// CHECK-NEXT: [[CONV2:%.*]] = sext i32 [[TMP1]] to i64
// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, i32* [[IN]], i64 4
// CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX4]], align 4, !tbaa [[TBAA8]]
// CHECK-NEXT: [[CONV5:%.*]] = sext i32 [[TMP2]] to i64
// CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, i32* [[IN]], i64 16
// CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[ARRAYIDX7]], align 4, !tbaa [[TBAA8]]
// CHECK-NEXT: [[CONV8:%.*]] = sext i32 [[TMP3]] to i64
// CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds i32, i32* [[IN]], i64 25
// CHECK-NEXT: [[TMP4:%.*]] = load i32, i32* [[ARRAYIDX10]], align 4, !tbaa [[TBAA8]]
// CHECK-NEXT: [[CONV11:%.*]] = sext i32 [[TMP4]] to i64
// CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds i32, i32* [[IN]], i64 36
// CHECK-NEXT: [[TMP5:%.*]] = load i32, i32* [[ARRAYIDX13]], align 4, !tbaa [[TBAA8]]
// CHECK-NEXT: [[CONV14:%.*]] = sext i32 [[TMP5]] to i64
// CHECK-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds i32, i32* [[IN]], i64 49
// CHECK-NEXT: [[TMP6:%.*]] = load i32, i32* [[ARRAYIDX16]], align 4, !tbaa [[TBAA8]]
// CHECK-NEXT: [[CONV17:%.*]] = sext i32 [[TMP6]] to i64
// CHECK-NEXT: [[ARRAYIDX19:%.*]] = getelementptr inbounds i32, i32* [[IN]], i64 64
// CHECK-NEXT: [[TMP7:%.*]] = load i32, i32* [[ARRAYIDX19]], align 4, !tbaa [[TBAA8]]
// CHECK-NEXT: [[CONV20:%.*]] = sext i32 [[TMP7]] to i64
// CHECK-NEXT: [[S_SROA_10_0_INSERT_EXT:%.*]] = zext i64 [[CONV20]] to i512
// CHECK-NEXT: [[S_SROA_10_0_INSERT_SHIFT:%.*]] = shl nuw i512 [[S_SROA_10_0_INSERT_EXT]], 448
// CHECK-NEXT: [[S_SROA_9_0_INSERT_EXT:%.*]] = zext i64 [[CONV17]] to i512
// CHECK-NEXT: [[S_SROA_9_0_INSERT_SHIFT:%.*]] = shl nuw nsw i512 [[S_SROA_9_0_INSERT_EXT]], 384
// CHECK-NEXT: [[S_SROA_9_0_INSERT_INSERT:%.*]] = or i512 [[S_SROA_10_0_INSERT_SHIFT]], [[S_SROA_9_0_INSERT_SHIFT]]
// CHECK-NEXT: [[S_SROA_8_0_INSERT_EXT:%.*]] = zext i64 [[CONV14]] to i512
// CHECK-NEXT: [[S_SROA_8_0_INSERT_SHIFT:%.*]] = shl nuw nsw i512 [[S_SROA_8_0_INSERT_EXT]], 320
// CHECK-NEXT: [[S_SROA_8_0_INSERT_INSERT:%.*]] = or i512 [[S_SROA_9_0_INSERT_INSERT]], [[S_SROA_8_0_INSERT_SHIFT]]
// CHECK-NEXT: [[S_SROA_7_0_INSERT_EXT:%.*]] = zext i64 [[CONV11]] to i512
// CHECK-NEXT: [[S_SROA_7_0_INSERT_SHIFT:%.*]] = shl nuw nsw i512 [[S_SROA_7_0_INSERT_EXT]], 256
// CHECK-NEXT: [[S_SROA_7_0_INSERT_INSERT:%.*]] = or i512 [[S_SROA_8_0_INSERT_INSERT]], [[S_SROA_7_0_INSERT_SHIFT]]
// CHECK-NEXT: [[S_SROA_6_0_INSERT_EXT:%.*]] = zext i64 [[CONV8]] to i512
// CHECK-NEXT: [[S_SROA_6_0_INSERT_SHIFT:%.*]] = shl nuw nsw i512 [[S_SROA_6_0_INSERT_EXT]], 192
// CHECK-NEXT: [[S_SROA_6_0_INSERT_INSERT:%.*]] = or i512 [[S_SROA_7_0_INSERT_INSERT]], [[S_SROA_6_0_INSERT_SHIFT]]
// CHECK-NEXT: [[S_SROA_5_0_INSERT_EXT:%.*]] = zext i64 [[CONV5]] to i512
// CHECK-NEXT: [[S_SROA_5_0_INSERT_SHIFT:%.*]] = shl nuw nsw i512 [[S_SROA_5_0_INSERT_EXT]], 128
// CHECK-NEXT: [[S_SROA_4_0_INSERT_EXT:%.*]] = zext i64 [[CONV2]] to i512
// CHECK-NEXT: [[S_SROA_4_0_INSERT_SHIFT:%.*]] = shl nuw nsw i512 [[S_SROA_4_0_INSERT_EXT]], 64
// CHECK-NEXT: [[S_SROA_4_0_INSERT_MASK:%.*]] = or i512 [[S_SROA_6_0_INSERT_INSERT]], [[S_SROA_5_0_INSERT_SHIFT]]
// CHECK-NEXT: [[S_SROA_0_0_INSERT_EXT:%.*]] = zext i64 [[CONV]] to i512
// CHECK-NEXT: [[S_SROA_0_0_INSERT_MASK:%.*]] = or i512 [[S_SROA_4_0_INSERT_MASK]], [[S_SROA_4_0_INSERT_SHIFT]]
// CHECK-NEXT: [[S_SROA_0_0_INSERT_INSERT:%.*]] = or i512 [[S_SROA_0_0_INSERT_MASK]], [[S_SROA_0_0_INSERT_EXT]]
// CHECK-NEXT: call void asm sideeffect "st64b $0,[$1]", "r,r,~{memory}"(i512 [[S_SROA_0_0_INSERT_INSERT]], i8* [[ADDR:%.*]]) #[[ATTR1]], !srcloc !12
// CHECK-NEXT: ret void
//
void store2(int *in, void *addr)
{
    struct foo s = { in[0], in[1], in[4], in[16], in[25], in[36], in[49], in[64] };
    __asm__ volatile ("st64b %0,[%1]" : : "r" (s), "r" (addr) : "memory" );
}