diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index f644a18f1f27..f02c93dd7c46 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -35,6 +35,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST,
   const LLT S16 = LLT::scalar(16);
   const LLT S32 = LLT::scalar(32);
   const LLT S64 = LLT::scalar(64);
+  const LLT S256 = LLT::scalar(256);
   const LLT S512 = LLT::scalar(512);
 
   const LLT V2S16 = LLT::vector(2, 16);
@@ -298,25 +299,85 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST,
     unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
     unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
 
+    auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
+      const LLT &Ty = Query.Types[TypeIdx];
+      if (Ty.isVector()) {
+        const LLT &EltTy = Ty.getElementType();
+        if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 64)
+          return true;
+        if (!isPowerOf2_32(EltTy.getSizeInBits()))
+          return true;
+      }
+      return false;
+    };
+
+    auto scalarize =
+      [=](const LegalityQuery &Query, unsigned TypeIdx) {
+        const LLT &Ty = Query.Types[TypeIdx];
+        return std::make_pair(TypeIdx, Ty.getElementType());
+      };
+
     getActionDefinitionsBuilder(Op)
+      // Break up vectors with weird elements into scalars
+      .fewerElementsIf(
+        [=](const LegalityQuery &Query) { return notValidElt(Query, 0); },
+        [=](const LegalityQuery &Query) { return scalarize(Query, 0); })
+      .fewerElementsIf(
+        [=](const LegalityQuery &Query) { return notValidElt(Query, 1); },
+        [=](const LegalityQuery &Query) { return scalarize(Query, 1); })
+      .clampScalar(BigTyIdx, S32, S512)
+      .widenScalarIf(
+        [=](const LegalityQuery &Query) {
+          const LLT &Ty = Query.Types[BigTyIdx];
+          return !isPowerOf2_32(Ty.getSizeInBits()) &&
+                 Ty.getSizeInBits() % 16 != 0;
+        },
+        [=](const LegalityQuery &Query) {
+          // Pick the next power of 2, or a multiple of 64 over 128.
+          // Whichever is smaller.
+          const LLT &Ty = Query.Types[BigTyIdx];
+          unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
+          if (NewSizeInBits >= 256) {
+            unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
+            if (RoundedTo < NewSizeInBits)
+              NewSizeInBits = RoundedTo;
+          }
+          return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
+        })
+      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
+      // Clamp the little scalar to s16-s256 and make it a power of 2. It's
+      // not worth considering the multiples of 64 since 2*192 and 2*384 are
+      // not valid.
+      .clampScalar(LitTyIdx, S16, S256)
+      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
       .legalIf([=](const LegalityQuery &Query) {
         const LLT &BigTy = Query.Types[BigTyIdx];
         const LLT &LitTy = Query.Types[LitTyIdx];
-        return BigTy.getSizeInBits() % 32 == 0 &&
-               LitTy.getSizeInBits() % 32 == 0 &&
+
+        if (BigTy.isVector() && BigTy.getSizeInBits() < 32)
+          return false;
+        if (LitTy.isVector() && LitTy.getSizeInBits() < 32)
+          return false;
+
+        return BigTy.getSizeInBits() % 16 == 0 &&
+               LitTy.getSizeInBits() % 16 == 0 &&
                BigTy.getSizeInBits() <= 512;
       })
       // Any vectors left are the wrong size. Scalarize them.
-      .fewerElementsIf([](const LegalityQuery &Query) { return true; },
-                       [](const LegalityQuery &Query) {
-                         return std::make_pair(
-                           0, Query.Types[0].getElementType());
-                       })
-      .fewerElementsIf([](const LegalityQuery &Query) { return true; },
-                       [](const LegalityQuery &Query) {
-                         return std::make_pair(
-                           1, Query.Types[1].getElementType());
-                       });
+      .fewerElementsIf([](const LegalityQuery &Query) {
+          return Query.Types[0].isVector();
+        },
+        [](const LegalityQuery &Query) {
+          return std::make_pair(
+            0, Query.Types[0].getElementType());
+        })
+      .fewerElementsIf([](const LegalityQuery &Query) {
+          return Query.Types[1].isVector();
+        },
+        [](const LegalityQuery &Query) {
+          return std::make_pair(
+            1, Query.Types[1].getElementType());
+        });
   }
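Note on the widenScalarIf mutation above: when BigTy's width is neither a power of 2 nor a multiple of 16, it rounds up to the next power of 2, except that from 256 bits on a smaller multiple of 64 wins. Below is a minimal standalone sketch of that rounding (hypothetical helper name, not part of the patch; llvm::Log2_32_Ceil and llvm::alignTo from llvm/Support/MathExtras.h are open-coded):

    #include <cassert>

    // Next power of 2 strictly above SizeInBits; from 256 bits on, prefer
    // the next multiple of 64 when that is smaller.
    static unsigned roundUpBigTySize(unsigned SizeInBits) {
      unsigned NewSizeInBits = 1; // 1 << Log2_32_Ceil(SizeInBits + 1)
      while (NewSizeInBits <= SizeInBits)
        NewSizeInBits <<= 1;
      if (NewSizeInBits >= 256) {
        // alignTo<64>(SizeInBits + 1)
        unsigned RoundedTo = (SizeInBits / 64 + 1) * 64;
        if (RoundedTo < NewSizeInBits)
          NewSizeInBits = RoundedTo;
      }
      return NewSizeInBits;
    }

    int main() {
      assert(roundUpBigTySize(65) == 128);  // below 256 the power of 2 wins
      assert(roundUpBigTySize(257) == 320); // 320 beats the next power of 2 (512)
    }

So an s65 merge source widens to s128, while s257 widens to s320 rather than all the way to s512.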
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-unmerge-values-xfail.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-unmerge-values-xfail.mir
new file mode 100644
index 000000000000..5ffbeab9872d
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-unmerge-values-xfail.mir
@@ -0,0 +1,14 @@
+# RUN: not llc -mtriple=amdgcn-- -O0 -run-pass=legalizer -o - %s 2>&1 | FileCheck %s
+
+# CHECK: LLVM ERROR: unable to legalize instruction: %1:_(s1), %2:_(s1) = G_UNMERGE_VALUES %0:_(<2 x s1>) (in function: test_unmerge_v2s1)
+
+---
+name: test_unmerge_v2s1
+body: |
+  bb.0:
+    %0:_(<2 x s1>) = G_IMPLICIT_DEF
+    %1:_(s1), %2:_(s1) = G_UNMERGE_VALUES %0
+    S_NOP 0, implicit %1
+    S_NOP 0, implicit %2
+...
+
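The xfail test pins the current diagnostic for a case the rules above do not yet handle: unmerging <2 x s1>. Its 2-bit vector source can never satisfy the final legalIf predicate, sketched standalone below (hypothetical isLegalMergeUnmerge helper, not from the patch):

    #include <cassert>

    // Mirrors the legalIf predicate at the end of the rule chain above.
    static bool isLegalMergeUnmerge(unsigned BigSize, bool BigIsVector,
                                    unsigned LitSize, bool LitIsVector) {
      if (BigIsVector && BigSize < 32)
        return false;
      if (LitIsVector && LitSize < 32)
        return false;
      return BigSize % 16 == 0 && LitSize % 16 == 0 && BigSize <= 512;
    }

    int main() {
      // <2 x s1> -> s1, s1: the 2-bit vector source fails immediately.
      assert(!isLegalMergeUnmerge(/*BigSize=*/2, /*BigIsVector=*/true,
                                  /*LitSize=*/1, /*LitIsVector=*/false));
    }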
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-unmerge-values.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-unmerge-values.mir
index 1e004ac478b3..adee5f321341 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-unmerge-values.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-unmerge-values.mir
@@ -2,17 +2,76 @@
 # RUN: llc -mtriple=amdgcn-- -O0 -run-pass=legalizer -o - %s | FileCheck %s
 
 ---
-name: test_unmerge_s64_s32
+name: test_unmerge_s32_s64
 body: |
   bb.0:
-    ; CHECK-LABEL: name: test_unmerge_s64_s32
+    ; CHECK-LABEL: name: test_unmerge_s32_s64
     ; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
     ; CHECK: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C]](s64)
     ; CHECK: $vgpr0 = COPY [[UV]](s32)
-    ; CHECK: $vgpr2 = COPY [[UV1]](s32)
+    ; CHECK: $vgpr1 = COPY [[UV1]](s32)
     %0:_(s64) = G_CONSTANT i64 0
     %1:_(s32), %2:_(s32) = G_UNMERGE_VALUES %0:_(s64)
     $vgpr0 = COPY %1(s32)
-    $vgpr2 = COPY %2(s32)
+    $vgpr1 = COPY %2(s32)
+...
+
+---
+name: test_unmerge_s32_v2s32
+body: |
+  bb.0:
+    liveins: $vgpr0_vgpr1
+    ; CHECK-LABEL: name: test_unmerge_s32_v2s32
+    ; CHECK: [[COPY:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr0_vgpr1
+    ; CHECK: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<2 x s32>)
+    ; CHECK: $vgpr0 = COPY [[UV]](s32)
+    ; CHECK: $vgpr1 = COPY [[UV1]](s32)
+    %0:_(<2 x s32>) = COPY $vgpr0_vgpr1
+    %1:_(s32), %2:_(s32) = G_UNMERGE_VALUES %0
+    $vgpr0 = COPY %1
+    $vgpr1 = COPY %2
+...
+
+---
+name: test_unmerge_s16_v2s16
+body: |
+  bb.0:
+    liveins: $vgpr0
+    ; CHECK-LABEL: name: test_unmerge_s16_v2s16
+    ; CHECK: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
+    ; CHECK: [[UV:%[0-9]+]]:_(s16), [[UV1:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY]](<2 x s16>)
+    ; CHECK: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[UV]](s16)
+    ; CHECK: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[UV1]](s16)
+    ; CHECK: $vgpr0 = COPY [[ANYEXT]](s32)
+    ; CHECK: $vgpr1 = COPY [[ANYEXT1]](s32)
+    %0:_(<2 x s16>) = COPY $vgpr0
+    %1:_(s16), %2:_(s16) = G_UNMERGE_VALUES %0
+    %3:_(s32) = G_ANYEXT %1
+    %4:_(s32) = G_ANYEXT %2
+    $vgpr0 = COPY %3
+    $vgpr1 = COPY %4
+...
+
+---
+name: test_unmerge_s16_v3s16
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: test_unmerge_s16_v3s16
+    ; CHECK: [[DEF:%[0-9]+]]:_(<3 x s16>) = G_IMPLICIT_DEF
+    ; CHECK: [[UV:%[0-9]+]]:_(s16), [[UV1:%[0-9]+]]:_(s16), [[UV2:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[DEF]](<3 x s16>)
+    ; CHECK: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[UV]](s16)
+    ; CHECK: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[UV1]](s16)
+    ; CHECK: [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[UV2]](s16)
+    ; CHECK: $vgpr0 = COPY [[ANYEXT]](s32)
+    ; CHECK: $vgpr1 = COPY [[ANYEXT1]](s32)
+    ; CHECK: $vgpr2 = COPY [[ANYEXT2]](s32)
+    %0:_(<3 x s16>) = G_IMPLICIT_DEF
+    %1:_(s16), %2:_(s16), %3:_(s16) = G_UNMERGE_VALUES %0
+    %4:_(s32) = G_ANYEXT %1
+    %5:_(s32) = G_ANYEXT %2
+    %6:_(s32) = G_ANYEXT %3
+    $vgpr0 = COPY %4
+    $vgpr1 = COPY %5
+    $vgpr2 = COPY %6
 ...
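The new test cases exercise the relaxed multiple-of-16 rule. In test_unmerge_s16_v3s16 the 48-bit <3 x s16> source would have failed the old multiple-of-32 check but passes the new one, so the unmerge survives legalization unchanged; the G_ANYEXTs are part of the test input, present only to copy the s16 results out through 32-bit VGPRs. A quick standalone comparison of the old and new legalIf size checks for that case (sketch only, not from the patch):

    #include <cassert>

    int main() {
      unsigned BigSize = 48; // <3 x s16> source of the unmerge
      unsigned LitSize = 16; // s16 results
      // Old rule: both sizes had to be multiples of 32.
      bool OldLegal = BigSize % 32 == 0 && LitSize % 32 == 0 && BigSize <= 512;
      // New rule: multiples of 16, with vector operands at least 32 bits wide
      // (the <3 x s16> source is a vector, so the < 32 check applies to it).
      bool NewLegal = !(BigSize < 32) &&
                      BigSize % 16 == 0 && LitSize % 16 == 0 && BigSize <= 512;
      assert(!OldLegal && NewLegal);
    }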