From 0e2eb357e0471bbac81f18097e5ad761ae1431f0 Mon Sep 17 00:00:00 2001
From: Matt Arsenault
Date: Sun, 9 Feb 2020 15:48:31 -0500
Subject: [PATCH] GlobalISel: Extend narrowing to G_ASHR

---
 .../lib/CodeGen/GlobalISel/CombinerHelper.cpp |  36 +++-
 .../AMDGPU/AMDGPUPreLegalizerCombiner.cpp     |   1 +
 .../AMDGPU/GlobalISel/combine-ashr-narrow.mir | 204 ++++++++++++++++++
 3 files changed, 234 insertions(+), 7 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/combine-ashr-narrow.mir

diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
index 42b769f9dc9b..da27a7cd5b8a 100644
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
@@ -1377,7 +1377,8 @@ bool CombinerHelper::matchCombineShiftToUnmerge(MachineInstr &MI,
                                                 unsigned TargetShiftSize,
                                                 unsigned &ShiftVal) {
   assert((MI.getOpcode() == TargetOpcode::G_SHL ||
-          MI.getOpcode() == TargetOpcode::G_LSHR) && "Expected a shift");
+          MI.getOpcode() == TargetOpcode::G_LSHR ||
+          MI.getOpcode() == TargetOpcode::G_ASHR) && "Expected a shift");
 
   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
   if (Ty.isVector()) // TODO:
@@ -1404,8 +1405,8 @@ bool CombinerHelper::applyCombineShiftToUnmerge(MachineInstr &MI,
   LLT Ty = MRI.getType(SrcReg);
   unsigned Size = Ty.getSizeInBits();
   unsigned HalfSize = Size / 2;
-  assert(ShiftVal >= HalfSize);
+
   LLT HalfTy = LLT::scalar(HalfSize);
 
   Builder.setInstr(MI);
 
@@ -1427,16 +1428,12 @@
 
     auto Zero = Builder.buildConstant(HalfTy, 0);
     Builder.buildMerge(DstReg, { Narrowed, Zero });
-  } else {
+  } else if (MI.getOpcode() == TargetOpcode::G_SHL) {
     Register Narrowed = Unmerge.getReg(0);
     // dst = G_SHL s64:x, C for C >= 32
     // =>
     // lo, hi = G_UNMERGE_VALUES x
     // dst = G_MERGE_VALUES 0, (G_SHL hi, C - 32)
-
-    // TODO: ashr
-    assert(MI.getOpcode() == TargetOpcode::G_SHL);
-
     if (NarrowShiftAmt != 0) {
       Narrowed = Builder.buildShl(HalfTy, Narrowed,
         Builder.buildConstant(HalfTy, NarrowShiftAmt)).getReg(0);
@@ -1444,6 +1441,31 @@
 
     auto Zero = Builder.buildConstant(HalfTy, 0);
     Builder.buildMerge(DstReg, { Zero, Narrowed });
+  } else {
+    assert(MI.getOpcode() == TargetOpcode::G_ASHR);
+    auto Hi = Builder.buildAShr(
+      HalfTy, Unmerge.getReg(1),
+      Builder.buildConstant(HalfTy, HalfSize - 1));
+
+    if (ShiftVal == HalfSize) {
+      // (G_ASHR i64:x, 32) ->
+      //   G_MERGE_VALUES hi_32(x), (G_ASHR hi_32(x), 31)
+      Builder.buildMerge(DstReg, { Unmerge.getReg(1), Hi });
+    } else if (ShiftVal == Size - 1) {
+      // Don't need a second shift.
+      // (G_ASHR i64:x, 63) ->
+      //   %narrowed = (G_ASHR hi_32(x), 31)
+      //   G_MERGE_VALUES %narrowed, %narrowed
+      Builder.buildMerge(DstReg, { Hi, Hi });
+    } else {
+      auto Lo = Builder.buildAShr(
+        HalfTy, Unmerge.getReg(1),
+        Builder.buildConstant(HalfTy, ShiftVal - HalfSize));
+
+      // (G_ASHR i64:x, C) -> for C >= 32
+      //   G_MERGE_VALUES (G_ASHR hi_32(x), C - 32), (G_ASHR hi_32(x), 31)
+      Builder.buildMerge(DstReg, { Lo, Hi });
+    }
   }
 
   MI.eraseFromParent();
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp
index 2d14c1fc9f9d..2757dde6f257 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp
@@ -167,6 +167,7 @@ bool AMDGPUPreLegalizerCombinerInfo::combine(GISelChangeObserver &Observer,
   switch (MI.getOpcode()) {
   case TargetOpcode::G_SHL:
   case TargetOpcode::G_LSHR:
+  case TargetOpcode::G_ASHR:
     // On some subtargets, 64-bit shift is a quarter rate instruction. In the
     // common case, splitting this into a move and a 32-bit shift is faster and
     // the same code size.
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-ashr-narrow.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-ashr-narrow.mir
new file mode 100644
index 000000000000..163eb78ae505
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-ashr-narrow.mir
@@ -0,0 +1,204 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -run-pass=amdgpu-prelegalizer-combiner -verify-machineinstrs %s -o - | FileCheck %s
+
+---
+name: narrow_ashr_s64_32_s64amt
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $vgpr0_vgpr1
+
+    ; CHECK-LABEL: name: narrow_ashr_s64_32_s64amt
+    ; CHECK: liveins: $vgpr0_vgpr1
+    ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1
+    ; CHECK: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s64)
+    ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 31
+    ; CHECK: [[ASHR:%[0-9]+]]:_(s32) = G_ASHR [[UV1]], [[C]](s32)
+    ; CHECK: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UV1]](s32), [[ASHR]](s32)
+    ; CHECK: $vgpr0_vgpr1 = COPY [[MV]](s64)
+    %0:_(s64) = COPY $vgpr0_vgpr1
+    %1:_(s64) = G_CONSTANT i64 32
+    %2:_(s64) = G_ASHR %0, %1
+    $vgpr0_vgpr1 = COPY %2
+...
+
+---
+name: narrow_ashr_s64_32
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $vgpr0_vgpr1
+
+    ; CHECK-LABEL: name: narrow_ashr_s64_32
+    ; CHECK: liveins: $vgpr0_vgpr1
+    ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1
+    ; CHECK: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s64)
+    ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 31
+    ; CHECK: [[ASHR:%[0-9]+]]:_(s32) = G_ASHR [[UV1]], [[C]](s32)
+    ; CHECK: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UV1]](s32), [[ASHR]](s32)
+    ; CHECK: $vgpr0_vgpr1 = COPY [[MV]](s64)
+    %0:_(s64) = COPY $vgpr0_vgpr1
+    %1:_(s32) = G_CONSTANT i32 32
+    %2:_(s64) = G_ASHR %0, %1
+    $vgpr0_vgpr1 = COPY %2
+...
+ +--- +name: narrow_ashr_s64_33 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0_vgpr1 + + ; CHECK-LABEL: name: narrow_ashr_s64_33 + ; CHECK: liveins: $vgpr0_vgpr1 + ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1 + ; CHECK: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s64) + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 31 + ; CHECK: [[ASHR:%[0-9]+]]:_(s32) = G_ASHR [[UV1]], [[C]](s32) + ; CHECK: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; CHECK: [[ASHR1:%[0-9]+]]:_(s32) = G_ASHR [[UV1]], [[C1]](s32) + ; CHECK: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[ASHR1]](s32), [[ASHR]](s32) + ; CHECK: $vgpr0_vgpr1 = COPY [[MV]](s64) + %0:_(s64) = COPY $vgpr0_vgpr1 + %1:_(s32) = G_CONSTANT i32 33 + %2:_(s64) = G_ASHR %0, %1 + $vgpr0_vgpr1 = COPY %2 +... + +--- +name: narrow_ashr_s64_31 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0_vgpr1 + + ; CHECK-LABEL: name: narrow_ashr_s64_31 + ; CHECK: liveins: $vgpr0_vgpr1 + ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1 + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 31 + ; CHECK: [[ASHR:%[0-9]+]]:_(s64) = G_ASHR [[COPY]], [[C]](s32) + ; CHECK: $vgpr0_vgpr1 = COPY [[ASHR]](s64) + %0:_(s64) = COPY $vgpr0_vgpr1 + %1:_(s32) = G_CONSTANT i32 31 + %2:_(s64) = G_ASHR %0, %1 + $vgpr0_vgpr1 = COPY %2 +... + +--- +name: narrow_ashr_s64_63 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0_vgpr1 + + ; CHECK-LABEL: name: narrow_ashr_s64_63 + ; CHECK: liveins: $vgpr0_vgpr1 + ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1 + ; CHECK: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s64) + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 31 + ; CHECK: [[ASHR:%[0-9]+]]:_(s32) = G_ASHR [[UV1]], [[C]](s32) + ; CHECK: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[ASHR]](s32), [[ASHR]](s32) + ; CHECK: $vgpr0_vgpr1 = COPY [[MV]](s64) + %0:_(s64) = COPY $vgpr0_vgpr1 + %1:_(s32) = G_CONSTANT i32 63 + %2:_(s64) = G_ASHR %0, %1 + $vgpr0_vgpr1 = COPY %2 +... + +--- +name: narrow_ashr_s64_64 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0_vgpr1 + + ; CHECK-LABEL: name: narrow_ashr_s64_64 + ; CHECK: liveins: $vgpr0_vgpr1 + ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1 + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 64 + ; CHECK: [[ASHR:%[0-9]+]]:_(s64) = G_ASHR [[COPY]], [[C]](s32) + ; CHECK: $vgpr0_vgpr1 = COPY [[ASHR]](s64) + %0:_(s64) = COPY $vgpr0_vgpr1 + %1:_(s32) = G_CONSTANT i32 64 + %2:_(s64) = G_ASHR %0, %1 + $vgpr0_vgpr1 = COPY %2 +... + +--- +name: narrow_ashr_s64_65 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0_vgpr1 + + ; CHECK-LABEL: name: narrow_ashr_s64_65 + ; CHECK: liveins: $vgpr0_vgpr1 + ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1 + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 65 + ; CHECK: [[ASHR:%[0-9]+]]:_(s64) = G_ASHR [[COPY]], [[C]](s32) + ; CHECK: $vgpr0_vgpr1 = COPY [[ASHR]](s64) + %0:_(s64) = COPY $vgpr0_vgpr1 + %1:_(s32) = G_CONSTANT i32 65 + %2:_(s64) = G_ASHR %0, %1 + $vgpr0_vgpr1 = COPY %2 +... + +--- +name: narrow_ashr_s32_16 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + + ; CHECK-LABEL: name: narrow_ashr_s32_16 + ; CHECK: liveins: $vgpr0 + ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CHECK: [[ASHR:%[0-9]+]]:_(s32) = G_ASHR [[COPY]], [[C]](s32) + ; CHECK: $vgpr0 = COPY [[ASHR]](s32) + %0:_(s32) = COPY $vgpr0 + %1:_(s32) = G_CONSTANT i32 16 + %2:_(s32) = G_ASHR %0, %1 + $vgpr0 = COPY %2 +... 
+ +--- +name: narrow_ashr_s32_17 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + + ; CHECK-LABEL: name: narrow_ashr_s32_17 + ; CHECK: liveins: $vgpr0 + ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 17 + ; CHECK: [[ASHR:%[0-9]+]]:_(s32) = G_ASHR [[COPY]], [[C]](s32) + ; CHECK: $vgpr0 = COPY [[ASHR]](s32) + %0:_(s32) = COPY $vgpr0 + %1:_(s32) = G_CONSTANT i32 17 + %2:_(s32) = G_ASHR %0, %1 + $vgpr0 = COPY %2 +... + +--- +name: narrow_ashr_v2s32_17 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0_vgpr1 + + ; CHECK-LABEL: name: narrow_ashr_v2s32_17 + ; CHECK: liveins: $vgpr0_vgpr1 + ; CHECK: [[COPY:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr0_vgpr1 + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 17 + ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[C]](s32), [[C]](s32) + ; CHECK: [[ASHR:%[0-9]+]]:_(<2 x s32>) = G_ASHR [[COPY]], [[BUILD_VECTOR]](<2 x s32>) + ; CHECK: $vgpr0_vgpr1 = COPY [[ASHR]](<2 x s32>) + %0:_(<2 x s32>) = COPY $vgpr0_vgpr1 + %1:_(s32) = G_CONSTANT i32 17 + %2:_(<2 x s32>) = G_BUILD_VECTOR %1, %1 + %3:_(<2 x s32>) = G_ASHR %0, %2 + $vgpr0_vgpr1 = COPY %3 +...
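
A standalone sanity check of the transform, not part of the patch: the scalar equivalent of what the new G_ASHR path emits for a 64-bit arithmetic shift right by a constant C with 32 <= C < 64, computed from the high 32-bit half only. The function name narrowAShr64 and the file name are made up for illustration, and the sketch assumes the usual arithmetic behavior of >> on negative signed integers (true on mainstream compilers, guaranteed from C++20).

  // narrow-ashr-sketch.cpp -- illustrative only; mirrors the three cases the
  // combine emits for G_ASHR in CombinerHelper::applyCombineShiftToUnmerge.
  #include <cassert>
  #include <cstdint>

  static int64_t narrowAShr64(int64_t X, unsigned C) {
    assert(C >= 32 && C < 64);
    int32_t Hi = (int32_t)((uint64_t)X >> 32); // hi_32(x)
    int32_t SignFill = Hi >> 31;               // G_ASHR hi_32(x), 31
    int32_t NarrowLo;
    if (C == 32)
      NarrowLo = Hi;               // G_MERGE_VALUES hi_32(x), SignFill
    else if (C == 63)
      NarrowLo = SignFill;         // both halves are the sign fill
    else
      NarrowLo = Hi >> (C - 32);   // G_ASHR hi_32(x), C - 32
    // G_MERGE_VALUES NarrowLo, SignFill
    return (int64_t)(((uint64_t)(uint32_t)SignFill << 32) | (uint32_t)NarrowLo);
  }

  int main() {
    const int64_t Vals[] = {0, -1, INT64_MIN, INT64_MAX, 0x123456789abcdef0};
    for (int64_t V : Vals)
      for (unsigned C = 32; C < 64; ++C)
        assert(narrowAShr64(V, C) == (V >> C)); // matches a full 64-bit ashr
    return 0;
  }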