From 7d11d99d2049762d3a2f769b39e85e8c217a3544 Mon Sep 17 00:00:00 2001 From: Hao Liu Date: Tue, 7 Jan 2014 10:50:43 +0000 Subject: [PATCH] [AArch64] Add support to spill/fill D tuples such as DPair/DTriple/DQuad. There are no test cases for D tuples, as the original test cases are too large. As the spill/fill of D tuples is similar to that of Q tuples, the correctness can be guaranteed. llvm-svn: 198684 --- llvm/lib/Target/AArch64/AArch64InstrInfo.cpp | 39 ++++++++++++++---- .../Target/AArch64/AArch64RegisterInfo.cpp | 5 ++- .../lib/Target/AArch64/AArch64RegisterInfo.td | 3 +- .../CodeGen/AArch64/neon-vector-list-spill.ll | 41 +++++++++++++++++++ 4 files changed, 77 insertions(+), 11 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp index f4d13932a519..b0b0a8716b60 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -477,12 +477,18 @@ AArch64InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, default: llvm_unreachable("Unknown size for regclass"); } - } else { // The spill of D tuples is implemented by Q tuples - if (RC == &AArch64::QPairRegClass) + } else { // For a super register class has more than one sub registers + if (AArch64::DPairRegClass.hasSubClassEq(RC)) + StoreOp = AArch64::ST1x2_8B; + else if (AArch64::DTripleRegClass.hasSubClassEq(RC)) + StoreOp = AArch64::ST1x3_8B; + else if (AArch64::DQuadRegClass.hasSubClassEq(RC)) + StoreOp = AArch64::ST1x4_8B; + else if (AArch64::QPairRegClass.hasSubClassEq(RC)) StoreOp = AArch64::ST1x2_16B; - else if (RC == &AArch64::QTripleRegClass) + else if (AArch64::QTripleRegClass.hasSubClassEq(RC)) StoreOp = AArch64::ST1x3_16B; - else if (RC == &AArch64::QQuadRegClass) + else if (AArch64::QQuadRegClass.hasSubClassEq(RC)) StoreOp = AArch64::ST1x4_16B; else llvm_unreachable("Unknown reg class"); @@ -537,12 +543,18 @@ AArch64InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, default: 
llvm_unreachable("Unknown size for regclass"); } - } else { // The spill of D tuples is implemented by Q tuples - if (RC == &AArch64::QPairRegClass) + } else { // For a super register class has more than one sub registers + if (AArch64::DPairRegClass.hasSubClassEq(RC)) + LoadOp = AArch64::LD1x2_8B; + else if (AArch64::DTripleRegClass.hasSubClassEq(RC)) + LoadOp = AArch64::LD1x3_8B; + else if (AArch64::DQuadRegClass.hasSubClassEq(RC)) + LoadOp = AArch64::LD1x4_8B; + else if (AArch64::QPairRegClass.hasSubClassEq(RC)) LoadOp = AArch64::LD1x2_16B; - else if (RC == &AArch64::QTripleRegClass) + else if (AArch64::QTripleRegClass.hasSubClassEq(RC)) LoadOp = AArch64::LD1x3_16B; - else if (RC == &AArch64::QQuadRegClass) + else if (AArch64::QQuadRegClass.hasSubClassEq(RC)) LoadOp = AArch64::LD1x4_16B; else llvm_unreachable("Unknown reg class"); @@ -649,6 +661,17 @@ void AArch64InstrInfo::getAddressConstraints(const MachineInstr &MI, MinOffset = -0x40 * AccessScale; MaxOffset = 0x3f * AccessScale; return; + case AArch64::LD1x2_8B: case AArch64::ST1x2_8B: + AccessScale = 16; + MinOffset = 0; + MaxOffset = 0xfff * AccessScale; + return; + case AArch64::LD1x3_8B: case AArch64::ST1x3_8B: + AccessScale = 24; + MinOffset = 0; + MaxOffset = 0xfff * AccessScale; + return; + case AArch64::LD1x4_8B: case AArch64::ST1x4_8B: case AArch64::LD1x2_16B: case AArch64::ST1x2_16B: AccessScale = 32; MinOffset = 0; diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp index 618f6fb9289b..973faf7363a5 100644 --- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp @@ -77,7 +77,10 @@ AArch64RegisterInfo::getReservedRegs(const MachineFunction &MF) const { } static bool hasFrameOffset(int opcode) { - return opcode != AArch64::LD1x2_16B && opcode != AArch64::LD1x3_16B && + return opcode != AArch64::LD1x2_8B && opcode != AArch64::LD1x3_8B && + opcode != AArch64::LD1x4_8B && opcode != 
AArch64::ST1x2_8B && + opcode != AArch64::ST1x3_8B && opcode != AArch64::ST1x4_8B && + opcode != AArch64::LD1x2_16B && opcode != AArch64::LD1x3_16B && opcode != AArch64::LD1x4_16B && opcode != AArch64::ST1x2_16B && opcode != AArch64::ST1x3_16B && opcode != AArch64::ST1x4_16B; } diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.td b/llvm/lib/Target/AArch64/AArch64RegisterInfo.td index 8b1a9cb90740..cfc0c953bd22 100644 --- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.td +++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.td @@ -30,7 +30,6 @@ def dsub_0 : SubRegIndex<64>; def dsub_1 : SubRegIndex<64, 64>; def dsub_2 : ComposedSubRegIndex; def dsub_3 : ComposedSubRegIndex; -def dsub_4 : ComposedSubRegIndex; } // Registers are identified with 5-bit ID numbers. @@ -206,7 +205,7 @@ def FlagClass : RegisterClass<"AArch64", [i32], 32, (add NZCV)> { //===----------------------------------------------------------------------===// // Consecutive vector registers //===----------------------------------------------------------------------===// -// 2 Consecutive 64-bit registers: D0_D1, D1_D2, ..., D30_D31 +// 2 Consecutive 64-bit registers: D0_D1, D1_D2, ..., D31_D0 def Tuples2D : RegisterTuples<[dsub_0, dsub_1], [(rotl FPR64, 0), (rotl FPR64, 1)]>; diff --git a/llvm/test/CodeGen/AArch64/neon-vector-list-spill.ll b/llvm/test/CodeGen/AArch64/neon-vector-list-spill.ll index 9ac2c05ebd0f..3ab69c4a02af 100644 --- a/llvm/test/CodeGen/AArch64/neon-vector-list-spill.ll +++ b/llvm/test/CodeGen/AArch64/neon-vector-list-spill.ll @@ -132,3 +132,44 @@ declare { <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld3.v4f32(i8*, declare { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld4.v16i8(i8*, i32) declare void @foo() + +; FIXME: We should not generate ld/st for such register spill/fill, because the +; test case seems very simple and the register pressure is not high. If the +; spill/fill algorithm is optimized, this test case may not be triggered. 
And +; then we can delete it. +; check the spill for Register Class QPair_with_qsub_0_in_FPR128Lo +define <8 x i16> @test_2xFPR128Lo(i64 %got, i8* %ptr, <1 x i64> %a) { + tail call void @llvm.arm.neon.vst2lane.v1i64(i8* %ptr, <1 x i64> zeroinitializer, <1 x i64> zeroinitializer, i32 0, i32 8) + tail call void @foo() + %sv = shufflevector <1 x i64> zeroinitializer, <1 x i64> %a, <2 x i32> + %1 = bitcast <2 x i64> %sv to <8 x i16> + %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> + %3 = mul <8 x i16> %2, %2 + ret <8 x i16> %3 +} + +; check the spill for Register Class QTriple_with_qsub_0_in_FPR128Lo +define <8 x i16> @test_3xFPR128Lo(i64 %got, i8* %ptr, <1 x i64> %a) { + tail call void @llvm.arm.neon.vst3lane.v1i64(i8* %ptr, <1 x i64> zeroinitializer, <1 x i64> zeroinitializer, <1 x i64> zeroinitializer, i32 0, i32 8) + tail call void @foo() + %sv = shufflevector <1 x i64> zeroinitializer, <1 x i64> %a, <2 x i32> + %1 = bitcast <2 x i64> %sv to <8 x i16> + %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> + %3 = mul <8 x i16> %2, %2 + ret <8 x i16> %3 +} + +; check the spill for Register Class QQuad_with_qsub_0_in_FPR128Lo +define <8 x i16> @test_4xFPR128Lo(i64 %got, i8* %ptr, <1 x i64> %a) { + tail call void @llvm.arm.neon.vst4lane.v1i64(i8* %ptr, <1 x i64> zeroinitializer, <1 x i64> zeroinitializer, <1 x i64> zeroinitializer, <1 x i64> zeroinitializer, i32 0, i32 8) + tail call void @foo() + %sv = shufflevector <1 x i64> zeroinitializer, <1 x i64> %a, <2 x i32> + %1 = bitcast <2 x i64> %sv to <8 x i16> + %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> + %3 = mul <8 x i16> %2, %2 + ret <8 x i16> %3 +} + +declare void @llvm.arm.neon.vst2lane.v1i64(i8*, <1 x i64>, <1 x i64>, i32, i32) +declare void @llvm.arm.neon.vst3lane.v1i64(i8*, <1 x i64>, <1 x i64>, <1 x i64>, i32, i32) +declare void @llvm.arm.neon.vst4lane.v1i64(i8*, <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64>, i32, i32) \ No newline at end of file