diff --git a/llvm/include/llvm/CodeGen/GlobalISel/LegalizationArtifactCombiner.h b/llvm/include/llvm/CodeGen/GlobalISel/LegalizationArtifactCombiner.h index 6c6f82ee4861..4c89f5b68ade 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/LegalizationArtifactCombiner.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/LegalizationArtifactCombiner.h @@ -750,6 +750,8 @@ public: Changed = tryCombineMerges(MI, DeadInsts, UpdatedDefs, WrapperObserver); break; case TargetOpcode::G_MERGE_VALUES: + case TargetOpcode::G_BUILD_VECTOR: + case TargetOpcode::G_CONCAT_VECTORS: // If any of the users of this merge are an unmerge, then add them to the // artifact worklist in case there's folding that can be done looking up. for (MachineInstr &U : MRI.use_instructions(MI.getOperand(0).getReg())) { diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/artifact-combiner-build-vector.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/artifact-combiner-build-vector.mir new file mode 100644 index 000000000000..12f296573ce9 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/artifact-combiner-build-vector.mir @@ -0,0 +1,31 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -O0 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -run-pass=legalizer %s -o - | FileCheck -check-prefix=GFX9 %s + +# The G_ZEXT and G_SHL will be scalarized, introducing a +# G_UNMERGE_VALUES of G_BUILD_VECTOR. The artifact combiner should +# eliminate the pair. +--- +name: revisit_build_vector_unmerge_user +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0_vgpr1 + + ; GFX9-LABEL: name: revisit_build_vector_unmerge_user + ; GFX9: liveins: $vgpr0_vgpr1 + ; GFX9: [[COPY:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr0_vgpr1 + ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<2 x s32>) + ; GFX9: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[UV]](s32) + ; GFX9: [[ZEXT1:%[0-9]+]]:_(s64) = G_ZEXT [[UV1]](s32) + ; GFX9: [[SHL:%[0-9]+]]:_(s64) = G_SHL [[ZEXT]], [[C]](s32) + ; GFX9: [[SHL1:%[0-9]+]]:_(s64) = G_SHL [[ZEXT1]], [[C]](s32) + ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[SHL]](s64), [[SHL1]](s64) + ; GFX9: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>) + %0:_(<2 x s32>) = COPY $vgpr0_vgpr1 + %1:_(s32) = G_CONSTANT i32 2 + %2:_(<2 x s32>) = G_BUILD_VECTOR %1, %1 + %3:_(<2 x s64>) = G_ZEXT %0 + %4:_(<2 x s64>) = G_SHL %3, %2 + $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %4 +... diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/artifact-combiner-concat-vectors.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/artifact-combiner-concat-vectors.mir new file mode 100644 index 000000000000..af64874e874a --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/artifact-combiner-concat-vectors.mir @@ -0,0 +1,46 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -O0 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -run-pass=legalizer %s -o - | FileCheck -check-prefix=GFX9 %s + +# The G_SHL will be split into <2 x s16>, introducing a +# G_UNMERGE_VALUES of G_CONCAT_VECTORS. The artifact combiner should +# eliminate the pair. +--- +name: revisit_concat_vectors_unmerge_user +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0_vgpr1 + + ; GFX9-LABEL: name: revisit_concat_vectors_unmerge_user + ; GFX9: liveins: $vgpr0_vgpr1 + ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C]](s32) + ; GFX9: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C1]](s32) + ; GFX9: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 24 + ; GFX9: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C2]](s32) + ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY1]](s32), [[COPY2]](s32) + ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) + ; GFX9: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32) + ; GFX9: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY3]](s32), [[COPY4]](s32) + ; GFX9: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX9: [[COPY5:%[0-9]+]]:_(s32) = COPY [[C3]](s32) + ; GFX9: [[COPY6:%[0-9]+]]:_(s32) = COPY [[C3]](s32) + ; GFX9: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY5]](s32), [[COPY6]](s32) + ; GFX9: [[COPY7:%[0-9]+]]:_(s32) = COPY [[C3]](s32) + ; GFX9: [[BUILD_VECTOR_TRUNC3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY7]](s32), [[C3]](s32) + ; GFX9: [[SHL:%[0-9]+]]:_(<2 x s16>) = G_SHL [[BUILD_VECTOR_TRUNC]], [[BUILD_VECTOR_TRUNC2]](<2 x s16>) + ; GFX9: [[SHL1:%[0-9]+]]:_(<2 x s16>) = G_SHL [[BUILD_VECTOR_TRUNC1]], [[BUILD_VECTOR_TRUNC3]](<2 x s16>) + ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[SHL]](<2 x s16>), [[SHL1]](<2 x s16>) + ; GFX9: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>) + %0:_(s32) = COPY $vgpr0 + %1:_(<4 x s8>) = G_BITCAST %0 + %2:_(s16) = G_CONSTANT i16 2 + %3:_(<4 x s16>) = G_BUILD_VECTOR %2, %2, %2, %2 + %4:_(<4 x s16>) = G_ANYEXT %1 + %5:_(<4 x s16>) = G_SHL %4, %3 + $vgpr0_vgpr1 = COPY %5 +... diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/artifact-combiner-unmerge-values.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/artifact-combiner-unmerge-values.mir index 1c9c2727f312..137c785231b8 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/artifact-combiner-unmerge-values.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/artifact-combiner-unmerge-values.mir @@ -194,9 +194,11 @@ body: | ; CHECK: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[ANYEXT1]](s16) ; CHECK: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[ANYEXT]](s16) ; CHECK: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[ANYEXT1]](s16) - ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[ZEXT]](s32), [[ZEXT1]](s32), [[ZEXT2]](s32), [[ZEXT3]](s32) - ; CHECK: [[UV4:%[0-9]+]]:_(<2 x s16>), [[UV5:%[0-9]+]]:_(<2 x s16>), [[UV6:%[0-9]+]]:_(<2 x s16>), [[UV7:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>) - ; CHECK: S_ENDPGM 0, implicit [[UV4]](<2 x s16>), implicit [[UV5]](<2 x s16>), implicit [[UV6]](<2 x s16>), implicit [[UV7]](<2 x s16>) + ; CHECK: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[ZEXT]](s32) + ; CHECK: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[ZEXT1]](s32) + ; CHECK: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[ZEXT2]](s32) + ; CHECK: [[BITCAST3:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[ZEXT3]](s32) + ; CHECK: S_ENDPGM 0, implicit [[BITCAST]](<2 x s16>), implicit [[BITCAST1]](<2 x s16>), implicit [[BITCAST2]](<2 x s16>), implicit [[BITCAST3]](<2 x s16>) %0:_(<2 x s32>) = COPY $vgpr0_vgpr1 %1:_(<2 x s32>) = COPY $vgpr0_vgpr1 %2:_(s32), %3:_(s32) = G_UNMERGE_VALUES %0(<2 x s32>) @@ -341,9 +343,7 @@ body: | ; CHECK: [[SEXT1:%[0-9]+]]:_(s64) = G_SEXT [[UV1]](s32) ; CHECK: [[SEXT2:%[0-9]+]]:_(s64) = G_SEXT [[UV2]](s32) ; CHECK: [[SEXT3:%[0-9]+]]:_(s64) = G_SEXT [[UV3]](s32) - ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[SEXT]](s64), [[SEXT1]](s64), [[SEXT2]](s64), [[SEXT3]](s64) - ; CHECK: [[UV4:%[0-9]+]]:_(s64), [[UV5:%[0-9]+]]:_(s64), [[UV6:%[0-9]+]]:_(s64), [[UV7:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s64>) - ; CHECK: S_ENDPGM 0, implicit [[UV4]](s64), implicit [[UV5]](s64), implicit [[UV6]](s64), implicit [[UV7]](s64) + ; CHECK: S_ENDPGM 0, implicit [[SEXT]](s64), implicit [[SEXT1]](s64), implicit [[SEXT2]](s64), implicit [[SEXT3]](s64) %0:_(<2 x s32>) = COPY $vgpr0_vgpr1 %1:_(<2 x s32>) = COPY $vgpr2_vgpr3 %2:_(<4 x s32>) = G_CONCAT_VECTORS %0, %1 @@ -365,9 +365,7 @@ body: | ; CHECK: [[ZEXT1:%[0-9]+]]:_(s64) = G_ZEXT [[UV1]](s32) ; CHECK: [[ZEXT2:%[0-9]+]]:_(s64) = G_ZEXT [[UV2]](s32) ; CHECK: [[ZEXT3:%[0-9]+]]:_(s64) = G_ZEXT [[UV3]](s32) - ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[ZEXT]](s64), [[ZEXT1]](s64), [[ZEXT2]](s64), [[ZEXT3]](s64) - ; CHECK: [[UV4:%[0-9]+]]:_(s64), [[UV5:%[0-9]+]]:_(s64), [[UV6:%[0-9]+]]:_(s64), [[UV7:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s64>) - ; CHECK: S_ENDPGM 0, implicit [[UV4]](s64), implicit [[UV5]](s64), implicit [[UV6]](s64), implicit [[UV7]](s64) + ; CHECK: S_ENDPGM 0, implicit [[ZEXT]](s64), implicit [[ZEXT1]](s64), implicit [[ZEXT2]](s64), implicit [[ZEXT3]](s64) %0:_(<2 x s32>) = COPY $vgpr0_vgpr1 %1:_(<2 x s32>) = COPY $vgpr2_vgpr3 %2:_(<4 x s32>) = G_CONCAT_VECTORS %0, %1 @@ -389,9 +387,7 @@ body: | ; CHECK: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[UV1]](s32) ; CHECK: [[ANYEXT2:%[0-9]+]]:_(s64) = G_ANYEXT [[UV2]](s32) ; CHECK: [[ANYEXT3:%[0-9]+]]:_(s64) = G_ANYEXT [[UV3]](s32) - ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[ANYEXT]](s64), [[ANYEXT1]](s64), [[ANYEXT2]](s64), [[ANYEXT3]](s64) - ; CHECK: [[UV4:%[0-9]+]]:_(s64), [[UV5:%[0-9]+]]:_(s64), [[UV6:%[0-9]+]]:_(s64), [[UV7:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s64>) - ; CHECK: S_ENDPGM 0, implicit [[UV4]](s64), implicit [[UV5]](s64), implicit [[UV6]](s64), implicit [[UV7]](s64) + ; CHECK: S_ENDPGM 0, implicit [[ANYEXT]](s64), implicit [[ANYEXT1]](s64), implicit [[ANYEXT2]](s64), implicit [[ANYEXT3]](s64) %0:_(<2 x s32>) = COPY $vgpr0_vgpr1 %1:_(<2 x s32>) = COPY $vgpr2_vgpr3 %2:_(<4 x s32>) = G_CONCAT_VECTORS %0, %1 @@ -477,9 +473,11 @@ body: | ; CHECK: [[TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_TRUNC [[COPY1]](<2 x s32>) ; CHECK: [[TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_TRUNC [[COPY2]](<2 x s32>) ; CHECK: [[TRUNC3:%[0-9]+]]:_(<2 x s16>) = G_TRUNC [[COPY3]](<2 x s32>) - ; CHECK: [[CONCAT_VECTORS:%[0-9]+]]:_(<8 x s16>) = G_CONCAT_VECTORS [[TRUNC]](<2 x s16>), [[TRUNC1]](<2 x s16>), [[TRUNC2]](<2 x s16>), [[TRUNC3]](<2 x s16>) - ; CHECK: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<8 x s16>) - ; CHECK: S_ENDPGM 0, implicit [[UV]](s32), implicit [[UV1]](s32), implicit [[UV2]](s32), implicit [[UV3]](s32) + ; CHECK: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[TRUNC]](<2 x s16>) + ; CHECK: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[TRUNC1]](<2 x s16>) + ; CHECK: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[TRUNC2]](<2 x s16>) + ; CHECK: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[TRUNC3]](<2 x s16>) + ; CHECK: S_ENDPGM 0, implicit [[BITCAST]](s32), implicit [[BITCAST1]](s32), implicit [[BITCAST2]](s32), implicit [[BITCAST3]](s32) %0:_(<2 x s32>) = COPY $vgpr0_vgpr1 %1:_(<2 x s32>) = COPY $vgpr2_vgpr3 %2:_(<2 x s32>) = COPY $vgpr4_vgpr5 @@ -503,9 +501,9 @@ body: | ; CHECK: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY2]](s32), [[COPY3]](s32) ; CHECK: [[TRUNC:%[0-9]+]]:_(<2 x s16>) = G_TRUNC [[BUILD_VECTOR]](<2 x s32>) ; CHECK: [[TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_TRUNC [[BUILD_VECTOR1]](<2 x s32>) - ; CHECK: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[TRUNC]](<2 x s16>), [[TRUNC1]](<2 x s16>) - ; CHECK: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<4 x s16>) - ; CHECK: S_ENDPGM 0, implicit [[UV]](s32), implicit [[UV1]](s32) + ; CHECK: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[TRUNC]](<2 x s16>) + ; CHECK: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[TRUNC1]](<2 x s16>) + ; CHECK: S_ENDPGM 0, implicit [[BITCAST]](s32), implicit [[BITCAST1]](s32) %0:_(s32) = COPY $vgpr0 %1:_(s32) = COPY $vgpr1 %2:_(s32) = COPY $vgpr2 @@ -544,9 +542,9 @@ body: | ; CHECK: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C1]] ; CHECK: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) ; CHECK: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C1]] - ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[AND]](s32), [[AND1]](s32), [[AND2]](s32), [[AND3]](s32) - ; CHECK: [[UV:%[0-9]+]]:_(s64), [[UV1:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>) - ; CHECK: S_ENDPGM 0, implicit [[UV]](s64), implicit [[UV1]](s64) + ; CHECK: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[AND]](s32), [[AND1]](s32) + ; CHECK: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[AND2]](s32), [[AND3]](s32) + ; CHECK: S_ENDPGM 0, implicit [[MV]](s64), implicit [[MV1]](s64) %0:_(<2 x s16>) = COPY $vgpr0 %1:_(<2 x s16>) = COPY $vgpr1 %2:_(<4 x s16>) = G_CONCAT_VECTORS %0, %1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fcopysign.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fcopysign.mir index c6e73ff2ba99..c31f78963645 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fcopysign.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fcopysign.mir @@ -624,18 +624,16 @@ body: | ; SI: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](<2 x s32>) ; SI: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[UV2]](s32) ; SI: [[ZEXT1:%[0-9]+]]:_(s64) = G_ZEXT [[UV3]](s32) - ; SI: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[ZEXT]](s64), [[ZEXT1]](s64) - ; SI: [[UV4:%[0-9]+]]:_(s64), [[UV5:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<2 x s64>) ; SI: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 ; SI: [[COPY2:%[0-9]+]]:_(s32) = COPY [[C2]](s32) - ; SI: [[SHL:%[0-9]+]]:_(s64) = G_SHL [[UV4]], [[COPY2]](s32) - ; SI: [[SHL1:%[0-9]+]]:_(s64) = G_SHL [[UV5]], [[C2]](s32) + ; SI: [[SHL:%[0-9]+]]:_(s64) = G_SHL [[ZEXT]], [[COPY2]](s32) + ; SI: [[SHL1:%[0-9]+]]:_(s64) = G_SHL [[ZEXT1]], [[C2]](s32) ; SI: [[AND2:%[0-9]+]]:_(s64) = G_AND [[SHL]], [[C]] ; SI: [[AND3:%[0-9]+]]:_(s64) = G_AND [[SHL1]], [[C]] ; SI: [[OR:%[0-9]+]]:_(s64) = G_OR [[AND]], [[AND2]] ; SI: [[OR1:%[0-9]+]]:_(s64) = G_OR [[AND1]], [[AND3]] - ; SI: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[OR]](s64), [[OR1]](s64) - ; SI: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR1]](<2 x s64>) + ; SI: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[OR]](s64), [[OR1]](s64) + ; SI: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>) ; VI-LABEL: name: test_copysign_v2s64_v2s32 ; VI: [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3 ; VI: [[COPY1:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr4_vgpr5 @@ -647,18 +645,16 @@ body: | ; VI: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](<2 x s32>) ; VI: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[UV2]](s32) ; VI: [[ZEXT1:%[0-9]+]]:_(s64) = G_ZEXT [[UV3]](s32) - ; VI: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[ZEXT]](s64), [[ZEXT1]](s64) - ; VI: [[UV4:%[0-9]+]]:_(s64), [[UV5:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<2 x s64>) ; VI: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 ; VI: [[COPY2:%[0-9]+]]:_(s32) = COPY [[C2]](s32) - ; VI: [[SHL:%[0-9]+]]:_(s64) = G_SHL [[UV4]], [[COPY2]](s32) - ; VI: [[SHL1:%[0-9]+]]:_(s64) = G_SHL [[UV5]], [[C2]](s32) + ; VI: [[SHL:%[0-9]+]]:_(s64) = G_SHL [[ZEXT]], [[COPY2]](s32) + ; VI: [[SHL1:%[0-9]+]]:_(s64) = G_SHL [[ZEXT1]], [[C2]](s32) ; VI: [[AND2:%[0-9]+]]:_(s64) = G_AND [[SHL]], [[C]] ; VI: [[AND3:%[0-9]+]]:_(s64) = G_AND [[SHL1]], [[C]] ; VI: [[OR:%[0-9]+]]:_(s64) = G_OR [[AND]], [[AND2]] ; VI: [[OR1:%[0-9]+]]:_(s64) = G_OR [[AND1]], [[AND3]] - ; VI: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[OR]](s64), [[OR1]](s64) - ; VI: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR1]](<2 x s64>) + ; VI: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[OR]](s64), [[OR1]](s64) + ; VI: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>) ; GFX9-LABEL: name: test_copysign_v2s64_v2s32 ; GFX9: [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX9: [[COPY1:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr4_vgpr5 @@ -670,18 +666,16 @@ body: | ; GFX9: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](<2 x s32>) ; GFX9: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[UV2]](s32) ; GFX9: [[ZEXT1:%[0-9]+]]:_(s64) = G_ZEXT [[UV3]](s32) - ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[ZEXT]](s64), [[ZEXT1]](s64) - ; GFX9: [[UV4:%[0-9]+]]:_(s64), [[UV5:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<2 x s64>) ; GFX9: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY [[C2]](s32) - ; GFX9: [[SHL:%[0-9]+]]:_(s64) = G_SHL [[UV4]], [[COPY2]](s32) - ; GFX9: [[SHL1:%[0-9]+]]:_(s64) = G_SHL [[UV5]], [[C2]](s32) + ; GFX9: [[SHL:%[0-9]+]]:_(s64) = G_SHL [[ZEXT]], [[COPY2]](s32) + ; GFX9: [[SHL1:%[0-9]+]]:_(s64) = G_SHL [[ZEXT1]], [[C2]](s32) ; GFX9: [[AND2:%[0-9]+]]:_(s64) = G_AND [[SHL]], [[C]] ; GFX9: [[AND3:%[0-9]+]]:_(s64) = G_AND [[SHL1]], [[C]] ; GFX9: [[OR:%[0-9]+]]:_(s64) = G_OR [[AND]], [[AND2]] ; GFX9: [[OR1:%[0-9]+]]:_(s64) = G_OR [[AND1]], [[AND3]] - ; GFX9: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[OR]](s64), [[OR1]](s64) - ; GFX9: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR1]](<2 x s64>) + ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[OR]](s64), [[OR1]](s64) + ; GFX9: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>) %0:_(<2 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3 %1:_(<2 x s32>) = COPY $vgpr4_vgpr5 %2:_(<2 x s64>) = G_FCOPYSIGN %0, %1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll index bbed8675c64f..8c6f7cb717f5 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll @@ -906,28 +906,28 @@ define amdgpu_ps i32 @s_saddsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) { ; GFX9-NEXT: s_lshr_b32 s3, s0, 16 ; GFX9-NEXT: s_lshr_b32 s4, s0, 24 ; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s2 -; GFX9-NEXT: s_lshr_b32 s5, s1, 8 ; GFX9-NEXT: s_pack_ll_b32_b16 s2, s3, s4 +; GFX9-NEXT: s_lshr_b32 s4, s0, 16 +; GFX9-NEXT: s_mov_b32 s3, 0x80008 +; GFX9-NEXT: s_lshr_b32 s5, s1, 8 +; GFX9-NEXT: s_lshl_b32 s0, s0, s3 +; GFX9-NEXT: s_lshl_b32 s4, s4, 8 +; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s4 +; GFX9-NEXT: s_lshr_b32 s4, s2, 16 ; GFX9-NEXT: s_lshr_b32 s6, s1, 16 ; GFX9-NEXT: s_lshr_b32 s7, s1, 24 ; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s5 -; GFX9-NEXT: s_lshr_b32 s5, s0, 16 -; GFX9-NEXT: s_mov_b32 s4, 0x80008 -; GFX9-NEXT: s_lshl_b32 s0, s0, s4 -; GFX9-NEXT: s_lshl_b32 s5, s5, 8 -; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s5 -; GFX9-NEXT: s_lshr_b32 s5, s2, 16 -; GFX9-NEXT: s_lshl_b32 s2, s2, s4 -; GFX9-NEXT: s_lshl_b32 s5, s5, 8 -; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s5 ; GFX9-NEXT: s_lshr_b32 s5, s1, 16 -; GFX9-NEXT: s_pack_ll_b32_b16 s3, s6, s7 -; GFX9-NEXT: s_lshl_b32 s1, s1, s4 +; GFX9-NEXT: s_lshl_b32 s2, s2, s3 +; GFX9-NEXT: s_lshl_b32 s4, s4, 8 +; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s4 +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s6, s7 +; GFX9-NEXT: s_lshl_b32 s1, s1, s3 ; GFX9-NEXT: s_lshl_b32 s5, s5, 8 ; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s5 -; GFX9-NEXT: s_lshr_b32 s5, s3, 16 +; GFX9-NEXT: s_lshr_b32 s5, s4, 16 ; GFX9-NEXT: v_mov_b32_e32 v0, s1 -; GFX9-NEXT: s_lshl_b32 s3, s3, s4 +; GFX9-NEXT: s_lshl_b32 s3, s4, s3 ; GFX9-NEXT: s_lshl_b32 s4, s5, 8 ; GFX9-NEXT: v_pk_add_i16 v0, s0, v0 clamp ; GFX9-NEXT: s_pack_ll_b32_b16 s3, s3, s4 @@ -956,25 +956,25 @@ define amdgpu_ps i32 @s_saddsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) { ; GFX10-NEXT: s_lshr_b32 s6, s1, 16 ; GFX10-NEXT: s_lshr_b32 s7, s1, 24 ; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s5 -; GFX10-NEXT: s_pack_ll_b32_b16 s2, s3, s4 -; GFX10-NEXT: s_lshr_b32 s4, s0, 16 -; GFX10-NEXT: s_mov_b32 s3, 0x80008 -; GFX10-NEXT: s_pack_ll_b32_b16 s5, s6, s7 -; GFX10-NEXT: s_lshr_b32 s6, s1, 16 -; GFX10-NEXT: s_lshl_b32 s0, s0, s3 -; GFX10-NEXT: s_lshl_b32 s4, s4, 8 -; GFX10-NEXT: s_lshl_b32 s1, s1, s3 -; GFX10-NEXT: s_lshl_b32 s6, s6, 8 -; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s4 -; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s6 -; GFX10-NEXT: s_lshr_b32 s4, s2, 16 -; GFX10-NEXT: s_lshr_b32 s6, s5, 16 +; GFX10-NEXT: s_lshr_b32 s8, s0, 16 +; GFX10-NEXT: s_lshr_b32 s5, s1, 16 +; GFX10-NEXT: s_mov_b32 s2, 0x80008 +; GFX10-NEXT: s_lshl_b32 s8, s8, 8 +; GFX10-NEXT: s_lshl_b32 s0, s0, s2 +; GFX10-NEXT: s_lshl_b32 s1, s1, s2 +; GFX10-NEXT: s_lshl_b32 s5, s5, 8 +; GFX10-NEXT: s_pack_ll_b32_b16 s3, s3, s4 +; GFX10-NEXT: s_pack_ll_b32_b16 s6, s6, s7 +; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s5 +; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s8 +; GFX10-NEXT: s_lshr_b32 s4, s3, 16 +; GFX10-NEXT: s_lshr_b32 s5, s6, 16 ; GFX10-NEXT: v_pk_add_i16 v0, s0, s1 clamp -; GFX10-NEXT: s_lshl_b32 s2, s2, s3 +; GFX10-NEXT: s_lshl_b32 s3, s3, s2 ; GFX10-NEXT: s_lshl_b32 s4, s4, 8 -; GFX10-NEXT: s_lshl_b32 s0, s5, s3 -; GFX10-NEXT: s_lshl_b32 s1, s6, 8 -; GFX10-NEXT: s_pack_ll_b32_b16 s2, s2, s4 +; GFX10-NEXT: s_lshl_b32 s0, s6, s2 +; GFX10-NEXT: s_lshl_b32 s1, s5, 8 +; GFX10-NEXT: s_pack_ll_b32_b16 s2, s3, s4 ; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s1 ; GFX10-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_add_i16 v1, s2, s0 clamp diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll index 2d82db0a0f99..3cad20cb141b 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll @@ -906,28 +906,28 @@ define amdgpu_ps i32 @s_ssubsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) { ; GFX9-NEXT: s_lshr_b32 s3, s0, 16 ; GFX9-NEXT: s_lshr_b32 s4, s0, 24 ; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s2 -; GFX9-NEXT: s_lshr_b32 s5, s1, 8 ; GFX9-NEXT: s_pack_ll_b32_b16 s2, s3, s4 +; GFX9-NEXT: s_lshr_b32 s4, s0, 16 +; GFX9-NEXT: s_mov_b32 s3, 0x80008 +; GFX9-NEXT: s_lshr_b32 s5, s1, 8 +; GFX9-NEXT: s_lshl_b32 s0, s0, s3 +; GFX9-NEXT: s_lshl_b32 s4, s4, 8 +; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s4 +; GFX9-NEXT: s_lshr_b32 s4, s2, 16 ; GFX9-NEXT: s_lshr_b32 s6, s1, 16 ; GFX9-NEXT: s_lshr_b32 s7, s1, 24 ; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s5 -; GFX9-NEXT: s_lshr_b32 s5, s0, 16 -; GFX9-NEXT: s_mov_b32 s4, 0x80008 -; GFX9-NEXT: s_lshl_b32 s0, s0, s4 -; GFX9-NEXT: s_lshl_b32 s5, s5, 8 -; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s5 -; GFX9-NEXT: s_lshr_b32 s5, s2, 16 -; GFX9-NEXT: s_lshl_b32 s2, s2, s4 -; GFX9-NEXT: s_lshl_b32 s5, s5, 8 -; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s5 ; GFX9-NEXT: s_lshr_b32 s5, s1, 16 -; GFX9-NEXT: s_pack_ll_b32_b16 s3, s6, s7 -; GFX9-NEXT: s_lshl_b32 s1, s1, s4 +; GFX9-NEXT: s_lshl_b32 s2, s2, s3 +; GFX9-NEXT: s_lshl_b32 s4, s4, 8 +; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s4 +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s6, s7 +; GFX9-NEXT: s_lshl_b32 s1, s1, s3 ; GFX9-NEXT: s_lshl_b32 s5, s5, 8 ; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s5 -; GFX9-NEXT: s_lshr_b32 s5, s3, 16 +; GFX9-NEXT: s_lshr_b32 s5, s4, 16 ; GFX9-NEXT: v_mov_b32_e32 v0, s1 -; GFX9-NEXT: s_lshl_b32 s3, s3, s4 +; GFX9-NEXT: s_lshl_b32 s3, s4, s3 ; GFX9-NEXT: s_lshl_b32 s4, s5, 8 ; GFX9-NEXT: v_pk_sub_i16 v0, s0, v0 clamp ; GFX9-NEXT: s_pack_ll_b32_b16 s3, s3, s4 @@ -956,25 +956,25 @@ define amdgpu_ps i32 @s_ssubsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) { ; GFX10-NEXT: s_lshr_b32 s6, s1, 16 ; GFX10-NEXT: s_lshr_b32 s7, s1, 24 ; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s5 -; GFX10-NEXT: s_pack_ll_b32_b16 s2, s3, s4 -; GFX10-NEXT: s_lshr_b32 s4, s0, 16 -; GFX10-NEXT: s_mov_b32 s3, 0x80008 -; GFX10-NEXT: s_pack_ll_b32_b16 s5, s6, s7 -; GFX10-NEXT: s_lshr_b32 s6, s1, 16 -; GFX10-NEXT: s_lshl_b32 s0, s0, s3 -; GFX10-NEXT: s_lshl_b32 s4, s4, 8 -; GFX10-NEXT: s_lshl_b32 s1, s1, s3 -; GFX10-NEXT: s_lshl_b32 s6, s6, 8 -; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s4 -; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s6 -; GFX10-NEXT: s_lshr_b32 s4, s2, 16 -; GFX10-NEXT: s_lshr_b32 s6, s5, 16 +; GFX10-NEXT: s_lshr_b32 s8, s0, 16 +; GFX10-NEXT: s_lshr_b32 s5, s1, 16 +; GFX10-NEXT: s_mov_b32 s2, 0x80008 +; GFX10-NEXT: s_lshl_b32 s8, s8, 8 +; GFX10-NEXT: s_lshl_b32 s0, s0, s2 +; GFX10-NEXT: s_lshl_b32 s1, s1, s2 +; GFX10-NEXT: s_lshl_b32 s5, s5, 8 +; GFX10-NEXT: s_pack_ll_b32_b16 s3, s3, s4 +; GFX10-NEXT: s_pack_ll_b32_b16 s6, s6, s7 +; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s5 +; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s8 +; GFX10-NEXT: s_lshr_b32 s4, s3, 16 +; GFX10-NEXT: s_lshr_b32 s5, s6, 16 ; GFX10-NEXT: v_pk_sub_i16 v0, s0, s1 clamp -; GFX10-NEXT: s_lshl_b32 s2, s2, s3 +; GFX10-NEXT: s_lshl_b32 s3, s3, s2 ; GFX10-NEXT: s_lshl_b32 s4, s4, 8 -; GFX10-NEXT: s_lshl_b32 s0, s5, s3 -; GFX10-NEXT: s_lshl_b32 s1, s6, 8 -; GFX10-NEXT: s_pack_ll_b32_b16 s2, s2, s4 +; GFX10-NEXT: s_lshl_b32 s0, s6, s2 +; GFX10-NEXT: s_lshl_b32 s1, s5, 8 +; GFX10-NEXT: s_pack_ll_b32_b16 s2, s3, s4 ; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s1 ; GFX10-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_sub_i16 v1, s2, s0 clamp diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll index 7df5b42b5c1b..b62592eb1c47 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll @@ -622,28 +622,28 @@ define amdgpu_ps i32 @s_uaddsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) { ; GFX9-NEXT: s_lshr_b32 s3, s0, 16 ; GFX9-NEXT: s_lshr_b32 s4, s0, 24 ; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s2 -; GFX9-NEXT: s_lshr_b32 s5, s1, 8 ; GFX9-NEXT: s_pack_ll_b32_b16 s2, s3, s4 +; GFX9-NEXT: s_lshr_b32 s4, s0, 16 +; GFX9-NEXT: s_mov_b32 s3, 0x80008 +; GFX9-NEXT: s_lshr_b32 s5, s1, 8 +; GFX9-NEXT: s_lshl_b32 s0, s0, s3 +; GFX9-NEXT: s_lshl_b32 s4, s4, 8 +; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s4 +; GFX9-NEXT: s_lshr_b32 s4, s2, 16 ; GFX9-NEXT: s_lshr_b32 s6, s1, 16 ; GFX9-NEXT: s_lshr_b32 s7, s1, 24 ; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s5 -; GFX9-NEXT: s_lshr_b32 s5, s0, 16 -; GFX9-NEXT: s_mov_b32 s4, 0x80008 -; GFX9-NEXT: s_lshl_b32 s0, s0, s4 -; GFX9-NEXT: s_lshl_b32 s5, s5, 8 -; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s5 -; GFX9-NEXT: s_lshr_b32 s5, s2, 16 -; GFX9-NEXT: s_lshl_b32 s2, s2, s4 -; GFX9-NEXT: s_lshl_b32 s5, s5, 8 -; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s5 ; GFX9-NEXT: s_lshr_b32 s5, s1, 16 -; GFX9-NEXT: s_pack_ll_b32_b16 s3, s6, s7 -; GFX9-NEXT: s_lshl_b32 s1, s1, s4 +; GFX9-NEXT: s_lshl_b32 s2, s2, s3 +; GFX9-NEXT: s_lshl_b32 s4, s4, 8 +; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s4 +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s6, s7 +; GFX9-NEXT: s_lshl_b32 s1, s1, s3 ; GFX9-NEXT: s_lshl_b32 s5, s5, 8 ; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s5 -; GFX9-NEXT: s_lshr_b32 s5, s3, 16 +; GFX9-NEXT: s_lshr_b32 s5, s4, 16 ; GFX9-NEXT: v_mov_b32_e32 v0, s1 -; GFX9-NEXT: s_lshl_b32 s3, s3, s4 +; GFX9-NEXT: s_lshl_b32 s3, s4, s3 ; GFX9-NEXT: s_lshl_b32 s4, s5, 8 ; GFX9-NEXT: v_pk_add_u16 v0, s0, v0 clamp ; GFX9-NEXT: s_pack_ll_b32_b16 s3, s3, s4 @@ -672,25 +672,25 @@ define amdgpu_ps i32 @s_uaddsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) { ; GFX10-NEXT: s_lshr_b32 s6, s1, 16 ; GFX10-NEXT: s_lshr_b32 s7, s1, 24 ; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s5 -; GFX10-NEXT: s_pack_ll_b32_b16 s2, s3, s4 -; GFX10-NEXT: s_lshr_b32 s4, s0, 16 -; GFX10-NEXT: s_mov_b32 s3, 0x80008 -; GFX10-NEXT: s_pack_ll_b32_b16 s5, s6, s7 -; GFX10-NEXT: s_lshr_b32 s6, s1, 16 -; GFX10-NEXT: s_lshl_b32 s0, s0, s3 -; GFX10-NEXT: s_lshl_b32 s4, s4, 8 -; GFX10-NEXT: s_lshl_b32 s1, s1, s3 -; GFX10-NEXT: s_lshl_b32 s6, s6, 8 -; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s4 -; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s6 -; GFX10-NEXT: s_lshr_b32 s4, s2, 16 -; GFX10-NEXT: s_lshr_b32 s6, s5, 16 +; GFX10-NEXT: s_lshr_b32 s8, s0, 16 +; GFX10-NEXT: s_lshr_b32 s5, s1, 16 +; GFX10-NEXT: s_mov_b32 s2, 0x80008 +; GFX10-NEXT: s_lshl_b32 s8, s8, 8 +; GFX10-NEXT: s_lshl_b32 s0, s0, s2 +; GFX10-NEXT: s_lshl_b32 s1, s1, s2 +; GFX10-NEXT: s_lshl_b32 s5, s5, 8 +; GFX10-NEXT: s_pack_ll_b32_b16 s3, s3, s4 +; GFX10-NEXT: s_pack_ll_b32_b16 s6, s6, s7 +; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s5 +; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s8 +; GFX10-NEXT: s_lshr_b32 s4, s3, 16 +; GFX10-NEXT: s_lshr_b32 s5, s6, 16 ; GFX10-NEXT: v_pk_add_u16 v0, s0, s1 clamp -; GFX10-NEXT: s_lshl_b32 s2, s2, s3 +; GFX10-NEXT: s_lshl_b32 s3, s3, s2 ; GFX10-NEXT: s_lshl_b32 s4, s4, 8 -; GFX10-NEXT: s_lshl_b32 s0, s5, s3 -; GFX10-NEXT: s_lshl_b32 s1, s6, 8 -; GFX10-NEXT: s_pack_ll_b32_b16 s2, s2, s4 +; GFX10-NEXT: s_lshl_b32 s0, s6, s2 +; GFX10-NEXT: s_lshl_b32 s1, s5, 8 +; GFX10-NEXT: s_pack_ll_b32_b16 s2, s3, s4 ; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s1 ; GFX10-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_add_u16 v1, s2, s0 clamp diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll index 2a73640506b5..b1a1a313f7e6 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll @@ -606,28 +606,28 @@ define amdgpu_ps i32 @s_usubsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) { ; GFX9-NEXT: s_lshr_b32 s3, s0, 16 ; GFX9-NEXT: s_lshr_b32 s4, s0, 24 ; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s2 -; GFX9-NEXT: s_lshr_b32 s5, s1, 8 ; GFX9-NEXT: s_pack_ll_b32_b16 s2, s3, s4 +; GFX9-NEXT: s_lshr_b32 s4, s0, 16 +; GFX9-NEXT: s_mov_b32 s3, 0x80008 +; GFX9-NEXT: s_lshr_b32 s5, s1, 8 +; GFX9-NEXT: s_lshl_b32 s0, s0, s3 +; GFX9-NEXT: s_lshl_b32 s4, s4, 8 +; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s4 +; GFX9-NEXT: s_lshr_b32 s4, s2, 16 ; GFX9-NEXT: s_lshr_b32 s6, s1, 16 ; GFX9-NEXT: s_lshr_b32 s7, s1, 24 ; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s5 -; GFX9-NEXT: s_lshr_b32 s5, s0, 16 -; GFX9-NEXT: s_mov_b32 s4, 0x80008 -; GFX9-NEXT: s_lshl_b32 s0, s0, s4 -; GFX9-NEXT: s_lshl_b32 s5, s5, 8 -; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s5 -; GFX9-NEXT: s_lshr_b32 s5, s2, 16 -; GFX9-NEXT: s_lshl_b32 s2, s2, s4 -; GFX9-NEXT: s_lshl_b32 s5, s5, 8 -; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s5 ; GFX9-NEXT: s_lshr_b32 s5, s1, 16 -; GFX9-NEXT: s_pack_ll_b32_b16 s3, s6, s7 -; GFX9-NEXT: s_lshl_b32 s1, s1, s4 +; GFX9-NEXT: s_lshl_b32 s2, s2, s3 +; GFX9-NEXT: s_lshl_b32 s4, s4, 8 +; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s4 +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s6, s7 +; GFX9-NEXT: s_lshl_b32 s1, s1, s3 ; GFX9-NEXT: s_lshl_b32 s5, s5, 8 ; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s5 -; GFX9-NEXT: s_lshr_b32 s5, s3, 16 +; GFX9-NEXT: s_lshr_b32 s5, s4, 16 ; GFX9-NEXT: v_mov_b32_e32 v0, s1 -; GFX9-NEXT: s_lshl_b32 s3, s3, s4 +; GFX9-NEXT: s_lshl_b32 s3, s4, s3 ; GFX9-NEXT: s_lshl_b32 s4, s5, 8 ; GFX9-NEXT: v_pk_sub_u16 v0, s0, v0 clamp ; GFX9-NEXT: s_pack_ll_b32_b16 s3, s3, s4 @@ -656,25 +656,25 @@ define amdgpu_ps i32 @s_usubsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) { ; GFX10-NEXT: s_lshr_b32 s6, s1, 16 ; GFX10-NEXT: s_lshr_b32 s7, s1, 24 ; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s5 -; GFX10-NEXT: s_pack_ll_b32_b16 s2, s3, s4 -; GFX10-NEXT: s_lshr_b32 s4, s0, 16 -; GFX10-NEXT: s_mov_b32 s3, 0x80008 -; GFX10-NEXT: s_pack_ll_b32_b16 s5, s6, s7 -; GFX10-NEXT: s_lshr_b32 s6, s1, 16 -; GFX10-NEXT: s_lshl_b32 s0, s0, s3 -; GFX10-NEXT: s_lshl_b32 s4, s4, 8 -; GFX10-NEXT: s_lshl_b32 s1, s1, s3 -; GFX10-NEXT: s_lshl_b32 s6, s6, 8 -; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s4 -; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s6 -; GFX10-NEXT: s_lshr_b32 s4, s2, 16 -; GFX10-NEXT: s_lshr_b32 s6, s5, 16 +; GFX10-NEXT: s_lshr_b32 s8, s0, 16 +; GFX10-NEXT: s_lshr_b32 s5, s1, 16 +; GFX10-NEXT: s_mov_b32 s2, 0x80008 +; GFX10-NEXT: s_lshl_b32 s8, s8, 8 +; GFX10-NEXT: s_lshl_b32 s0, s0, s2 +; GFX10-NEXT: s_lshl_b32 s1, s1, s2 +; GFX10-NEXT: s_lshl_b32 s5, s5, 8 +; GFX10-NEXT: s_pack_ll_b32_b16 s3, s3, s4 +; GFX10-NEXT: s_pack_ll_b32_b16 s6, s6, s7 +; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s5 +; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s8 +; GFX10-NEXT: s_lshr_b32 s4, s3, 16 +; GFX10-NEXT: s_lshr_b32 s5, s6, 16 ; GFX10-NEXT: v_pk_sub_u16 v0, s0, s1 clamp -; GFX10-NEXT: s_lshl_b32 s2, s2, s3 +; GFX10-NEXT: s_lshl_b32 s3, s3, s2 ; GFX10-NEXT: s_lshl_b32 s4, s4, 8 -; GFX10-NEXT: s_lshl_b32 s0, s5, s3 -; GFX10-NEXT: s_lshl_b32 s1, s6, 8 -; GFX10-NEXT: s_pack_ll_b32_b16 s2, s2, s4 +; GFX10-NEXT: s_lshl_b32 s0, s6, s2 +; GFX10-NEXT: s_lshl_b32 s1, s5, 8 +; GFX10-NEXT: s_pack_ll_b32_b16 s2, s3, s4 ; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s1 ; GFX10-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_sub_u16 v1, s2, s0 clamp