GlobalISel: Revisit users of other merge opcodes in artifact combiner

The artifact combiner searches for the uses of G_MERGE_VALUES for
unmerge/trunc that need further combining. This also needs to handle
the vector merge opcodes the same way. This fixes leaving behind some
pairs I expected to be removed, which were only removed if the
legalizer was run a second time.
This commit is contained in:
Matt Arsenault 2020-08-16 10:04:12 -04:00
parent e0ec7a0206
commit fe171908e9
9 changed files with 234 additions and 163 deletions

View File

@ -750,6 +750,8 @@ public:
Changed = tryCombineMerges(MI, DeadInsts, UpdatedDefs, WrapperObserver);
break;
case TargetOpcode::G_MERGE_VALUES:
case TargetOpcode::G_BUILD_VECTOR:
case TargetOpcode::G_CONCAT_VECTORS:
// If any of the users of this merge are an unmerge, then add them to the
// artifact worklist in case there's folding that can be done looking up.
for (MachineInstr &U : MRI.use_instructions(MI.getOperand(0).getReg())) {

View File

@ -0,0 +1,31 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -O0 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -run-pass=legalizer %s -o - | FileCheck -check-prefix=GFX9 %s
# The G_ZEXT and G_SHL will be scalarized, introducing a
# G_UNMERGE_VALUES of G_BUILD_VECTOR. The artifact combiner should
# eliminate the pair.
---
name: revisit_build_vector_unmerge_user
tracksRegLiveness: true
body: |
bb.0:
liveins: $vgpr0_vgpr1
; GFX9-LABEL: name: revisit_build_vector_unmerge_user
; GFX9: liveins: $vgpr0_vgpr1
; GFX9: [[COPY:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr0_vgpr1
; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<2 x s32>)
; GFX9: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[UV]](s32)
; GFX9: [[ZEXT1:%[0-9]+]]:_(s64) = G_ZEXT [[UV1]](s32)
; GFX9: [[SHL:%[0-9]+]]:_(s64) = G_SHL [[ZEXT]], [[C]](s32)
; GFX9: [[SHL1:%[0-9]+]]:_(s64) = G_SHL [[ZEXT1]], [[C]](s32)
; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[SHL]](s64), [[SHL1]](s64)
; GFX9: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>)
%0:_(<2 x s32>) = COPY $vgpr0_vgpr1
%1:_(s32) = G_CONSTANT i32 2
%2:_(<2 x s32>) = G_BUILD_VECTOR %1, %1
%3:_(<2 x s64>) = G_ZEXT %0
%4:_(<2 x s64>) = G_SHL %3, %2
$vgpr0_vgpr1_vgpr2_vgpr3 = COPY %4
...

View File

@ -0,0 +1,46 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -O0 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -run-pass=legalizer %s -o - | FileCheck -check-prefix=GFX9 %s
# The G_SHL will be split into <2 x s16>, introducing a
# G_UNMERGE_VALUES of G_CONCAT_VECTORS. The artifact combiner should
# eliminate the pair.
---
name: revisit_concat_vectors_unmerge_user
tracksRegLiveness: true
body: |
bb.0:
liveins: $vgpr0_vgpr1
; GFX9-LABEL: name: revisit_concat_vectors_unmerge_user
; GFX9: liveins: $vgpr0_vgpr1
; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C]](s32)
; GFX9: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
; GFX9: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C1]](s32)
; GFX9: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 24
; GFX9: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C2]](s32)
; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY1]](s32), [[COPY2]](s32)
; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32)
; GFX9: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32)
; GFX9: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY3]](s32), [[COPY4]](s32)
; GFX9: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
; GFX9: [[COPY5:%[0-9]+]]:_(s32) = COPY [[C3]](s32)
; GFX9: [[COPY6:%[0-9]+]]:_(s32) = COPY [[C3]](s32)
; GFX9: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY5]](s32), [[COPY6]](s32)
; GFX9: [[COPY7:%[0-9]+]]:_(s32) = COPY [[C3]](s32)
; GFX9: [[BUILD_VECTOR_TRUNC3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY7]](s32), [[C3]](s32)
; GFX9: [[SHL:%[0-9]+]]:_(<2 x s16>) = G_SHL [[BUILD_VECTOR_TRUNC]], [[BUILD_VECTOR_TRUNC2]](<2 x s16>)
; GFX9: [[SHL1:%[0-9]+]]:_(<2 x s16>) = G_SHL [[BUILD_VECTOR_TRUNC1]], [[BUILD_VECTOR_TRUNC3]](<2 x s16>)
; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[SHL]](<2 x s16>), [[SHL1]](<2 x s16>)
; GFX9: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>)
%0:_(s32) = COPY $vgpr0
%1:_(<4 x s8>) = G_BITCAST %0
%2:_(s16) = G_CONSTANT i16 2
%3:_(<4 x s16>) = G_BUILD_VECTOR %2, %2, %2, %2
%4:_(<4 x s16>) = G_ANYEXT %1
%5:_(<4 x s16>) = G_SHL %4, %3
$vgpr0_vgpr1 = COPY %5
...

View File

@ -194,9 +194,11 @@ body: |
; CHECK: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[ANYEXT1]](s16)
; CHECK: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[ANYEXT]](s16)
; CHECK: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[ANYEXT1]](s16)
; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[ZEXT]](s32), [[ZEXT1]](s32), [[ZEXT2]](s32), [[ZEXT3]](s32)
; CHECK: [[UV4:%[0-9]+]]:_(<2 x s16>), [[UV5:%[0-9]+]]:_(<2 x s16>), [[UV6:%[0-9]+]]:_(<2 x s16>), [[UV7:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
; CHECK: S_ENDPGM 0, implicit [[UV4]](<2 x s16>), implicit [[UV5]](<2 x s16>), implicit [[UV6]](<2 x s16>), implicit [[UV7]](<2 x s16>)
; CHECK: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[ZEXT]](s32)
; CHECK: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[ZEXT1]](s32)
; CHECK: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[ZEXT2]](s32)
; CHECK: [[BITCAST3:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[ZEXT3]](s32)
; CHECK: S_ENDPGM 0, implicit [[BITCAST]](<2 x s16>), implicit [[BITCAST1]](<2 x s16>), implicit [[BITCAST2]](<2 x s16>), implicit [[BITCAST3]](<2 x s16>)
%0:_(<2 x s32>) = COPY $vgpr0_vgpr1
%1:_(<2 x s32>) = COPY $vgpr0_vgpr1
%2:_(s32), %3:_(s32) = G_UNMERGE_VALUES %0(<2 x s32>)
@ -341,9 +343,7 @@ body: |
; CHECK: [[SEXT1:%[0-9]+]]:_(s64) = G_SEXT [[UV1]](s32)
; CHECK: [[SEXT2:%[0-9]+]]:_(s64) = G_SEXT [[UV2]](s32)
; CHECK: [[SEXT3:%[0-9]+]]:_(s64) = G_SEXT [[UV3]](s32)
; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[SEXT]](s64), [[SEXT1]](s64), [[SEXT2]](s64), [[SEXT3]](s64)
; CHECK: [[UV4:%[0-9]+]]:_(s64), [[UV5:%[0-9]+]]:_(s64), [[UV6:%[0-9]+]]:_(s64), [[UV7:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s64>)
; CHECK: S_ENDPGM 0, implicit [[UV4]](s64), implicit [[UV5]](s64), implicit [[UV6]](s64), implicit [[UV7]](s64)
; CHECK: S_ENDPGM 0, implicit [[SEXT]](s64), implicit [[SEXT1]](s64), implicit [[SEXT2]](s64), implicit [[SEXT3]](s64)
%0:_(<2 x s32>) = COPY $vgpr0_vgpr1
%1:_(<2 x s32>) = COPY $vgpr2_vgpr3
%2:_(<4 x s32>) = G_CONCAT_VECTORS %0, %1
@ -365,9 +365,7 @@ body: |
; CHECK: [[ZEXT1:%[0-9]+]]:_(s64) = G_ZEXT [[UV1]](s32)
; CHECK: [[ZEXT2:%[0-9]+]]:_(s64) = G_ZEXT [[UV2]](s32)
; CHECK: [[ZEXT3:%[0-9]+]]:_(s64) = G_ZEXT [[UV3]](s32)
; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[ZEXT]](s64), [[ZEXT1]](s64), [[ZEXT2]](s64), [[ZEXT3]](s64)
; CHECK: [[UV4:%[0-9]+]]:_(s64), [[UV5:%[0-9]+]]:_(s64), [[UV6:%[0-9]+]]:_(s64), [[UV7:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s64>)
; CHECK: S_ENDPGM 0, implicit [[UV4]](s64), implicit [[UV5]](s64), implicit [[UV6]](s64), implicit [[UV7]](s64)
; CHECK: S_ENDPGM 0, implicit [[ZEXT]](s64), implicit [[ZEXT1]](s64), implicit [[ZEXT2]](s64), implicit [[ZEXT3]](s64)
%0:_(<2 x s32>) = COPY $vgpr0_vgpr1
%1:_(<2 x s32>) = COPY $vgpr2_vgpr3
%2:_(<4 x s32>) = G_CONCAT_VECTORS %0, %1
@ -389,9 +387,7 @@ body: |
; CHECK: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[UV1]](s32)
; CHECK: [[ANYEXT2:%[0-9]+]]:_(s64) = G_ANYEXT [[UV2]](s32)
; CHECK: [[ANYEXT3:%[0-9]+]]:_(s64) = G_ANYEXT [[UV3]](s32)
; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[ANYEXT]](s64), [[ANYEXT1]](s64), [[ANYEXT2]](s64), [[ANYEXT3]](s64)
; CHECK: [[UV4:%[0-9]+]]:_(s64), [[UV5:%[0-9]+]]:_(s64), [[UV6:%[0-9]+]]:_(s64), [[UV7:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s64>)
; CHECK: S_ENDPGM 0, implicit [[UV4]](s64), implicit [[UV5]](s64), implicit [[UV6]](s64), implicit [[UV7]](s64)
; CHECK: S_ENDPGM 0, implicit [[ANYEXT]](s64), implicit [[ANYEXT1]](s64), implicit [[ANYEXT2]](s64), implicit [[ANYEXT3]](s64)
%0:_(<2 x s32>) = COPY $vgpr0_vgpr1
%1:_(<2 x s32>) = COPY $vgpr2_vgpr3
%2:_(<4 x s32>) = G_CONCAT_VECTORS %0, %1
@ -477,9 +473,11 @@ body: |
; CHECK: [[TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_TRUNC [[COPY1]](<2 x s32>)
; CHECK: [[TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_TRUNC [[COPY2]](<2 x s32>)
; CHECK: [[TRUNC3:%[0-9]+]]:_(<2 x s16>) = G_TRUNC [[COPY3]](<2 x s32>)
; CHECK: [[CONCAT_VECTORS:%[0-9]+]]:_(<8 x s16>) = G_CONCAT_VECTORS [[TRUNC]](<2 x s16>), [[TRUNC1]](<2 x s16>), [[TRUNC2]](<2 x s16>), [[TRUNC3]](<2 x s16>)
; CHECK: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<8 x s16>)
; CHECK: S_ENDPGM 0, implicit [[UV]](s32), implicit [[UV1]](s32), implicit [[UV2]](s32), implicit [[UV3]](s32)
; CHECK: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[TRUNC]](<2 x s16>)
; CHECK: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[TRUNC1]](<2 x s16>)
; CHECK: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[TRUNC2]](<2 x s16>)
; CHECK: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[TRUNC3]](<2 x s16>)
; CHECK: S_ENDPGM 0, implicit [[BITCAST]](s32), implicit [[BITCAST1]](s32), implicit [[BITCAST2]](s32), implicit [[BITCAST3]](s32)
%0:_(<2 x s32>) = COPY $vgpr0_vgpr1
%1:_(<2 x s32>) = COPY $vgpr2_vgpr3
%2:_(<2 x s32>) = COPY $vgpr4_vgpr5
@ -503,9 +501,9 @@ body: |
; CHECK: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY2]](s32), [[COPY3]](s32)
; CHECK: [[TRUNC:%[0-9]+]]:_(<2 x s16>) = G_TRUNC [[BUILD_VECTOR]](<2 x s32>)
; CHECK: [[TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_TRUNC [[BUILD_VECTOR1]](<2 x s32>)
; CHECK: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[TRUNC]](<2 x s16>), [[TRUNC1]](<2 x s16>)
; CHECK: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<4 x s16>)
; CHECK: S_ENDPGM 0, implicit [[UV]](s32), implicit [[UV1]](s32)
; CHECK: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[TRUNC]](<2 x s16>)
; CHECK: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[TRUNC1]](<2 x s16>)
; CHECK: S_ENDPGM 0, implicit [[BITCAST]](s32), implicit [[BITCAST1]](s32)
%0:_(s32) = COPY $vgpr0
%1:_(s32) = COPY $vgpr1
%2:_(s32) = COPY $vgpr2
@ -544,9 +542,9 @@ body: |
; CHECK: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C1]]
; CHECK: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32)
; CHECK: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C1]]
; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[AND]](s32), [[AND1]](s32), [[AND2]](s32), [[AND3]](s32)
; CHECK: [[UV:%[0-9]+]]:_(s64), [[UV1:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
; CHECK: S_ENDPGM 0, implicit [[UV]](s64), implicit [[UV1]](s64)
; CHECK: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[AND]](s32), [[AND1]](s32)
; CHECK: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[AND2]](s32), [[AND3]](s32)
; CHECK: S_ENDPGM 0, implicit [[MV]](s64), implicit [[MV1]](s64)
%0:_(<2 x s16>) = COPY $vgpr0
%1:_(<2 x s16>) = COPY $vgpr1
%2:_(<4 x s16>) = G_CONCAT_VECTORS %0, %1

View File

@ -624,18 +624,16 @@ body: |
; SI: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](<2 x s32>)
; SI: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[UV2]](s32)
; SI: [[ZEXT1:%[0-9]+]]:_(s64) = G_ZEXT [[UV3]](s32)
; SI: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[ZEXT]](s64), [[ZEXT1]](s64)
; SI: [[UV4:%[0-9]+]]:_(s64), [[UV5:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<2 x s64>)
; SI: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 32
; SI: [[COPY2:%[0-9]+]]:_(s32) = COPY [[C2]](s32)
; SI: [[SHL:%[0-9]+]]:_(s64) = G_SHL [[UV4]], [[COPY2]](s32)
; SI: [[SHL1:%[0-9]+]]:_(s64) = G_SHL [[UV5]], [[C2]](s32)
; SI: [[SHL:%[0-9]+]]:_(s64) = G_SHL [[ZEXT]], [[COPY2]](s32)
; SI: [[SHL1:%[0-9]+]]:_(s64) = G_SHL [[ZEXT1]], [[C2]](s32)
; SI: [[AND2:%[0-9]+]]:_(s64) = G_AND [[SHL]], [[C]]
; SI: [[AND3:%[0-9]+]]:_(s64) = G_AND [[SHL1]], [[C]]
; SI: [[OR:%[0-9]+]]:_(s64) = G_OR [[AND]], [[AND2]]
; SI: [[OR1:%[0-9]+]]:_(s64) = G_OR [[AND1]], [[AND3]]
; SI: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[OR]](s64), [[OR1]](s64)
; SI: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR1]](<2 x s64>)
; SI: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[OR]](s64), [[OR1]](s64)
; SI: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>)
; VI-LABEL: name: test_copysign_v2s64_v2s32
; VI: [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3
; VI: [[COPY1:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr4_vgpr5
@ -647,18 +645,16 @@ body: |
; VI: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](<2 x s32>)
; VI: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[UV2]](s32)
; VI: [[ZEXT1:%[0-9]+]]:_(s64) = G_ZEXT [[UV3]](s32)
; VI: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[ZEXT]](s64), [[ZEXT1]](s64)
; VI: [[UV4:%[0-9]+]]:_(s64), [[UV5:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<2 x s64>)
; VI: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 32
; VI: [[COPY2:%[0-9]+]]:_(s32) = COPY [[C2]](s32)
; VI: [[SHL:%[0-9]+]]:_(s64) = G_SHL [[UV4]], [[COPY2]](s32)
; VI: [[SHL1:%[0-9]+]]:_(s64) = G_SHL [[UV5]], [[C2]](s32)
; VI: [[SHL:%[0-9]+]]:_(s64) = G_SHL [[ZEXT]], [[COPY2]](s32)
; VI: [[SHL1:%[0-9]+]]:_(s64) = G_SHL [[ZEXT1]], [[C2]](s32)
; VI: [[AND2:%[0-9]+]]:_(s64) = G_AND [[SHL]], [[C]]
; VI: [[AND3:%[0-9]+]]:_(s64) = G_AND [[SHL1]], [[C]]
; VI: [[OR:%[0-9]+]]:_(s64) = G_OR [[AND]], [[AND2]]
; VI: [[OR1:%[0-9]+]]:_(s64) = G_OR [[AND1]], [[AND3]]
; VI: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[OR]](s64), [[OR1]](s64)
; VI: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR1]](<2 x s64>)
; VI: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[OR]](s64), [[OR1]](s64)
; VI: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>)
; GFX9-LABEL: name: test_copysign_v2s64_v2s32
; GFX9: [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3
; GFX9: [[COPY1:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr4_vgpr5
@ -670,18 +666,16 @@ body: |
; GFX9: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](<2 x s32>)
; GFX9: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[UV2]](s32)
; GFX9: [[ZEXT1:%[0-9]+]]:_(s64) = G_ZEXT [[UV3]](s32)
; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[ZEXT]](s64), [[ZEXT1]](s64)
; GFX9: [[UV4:%[0-9]+]]:_(s64), [[UV5:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<2 x s64>)
; GFX9: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 32
; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY [[C2]](s32)
; GFX9: [[SHL:%[0-9]+]]:_(s64) = G_SHL [[UV4]], [[COPY2]](s32)
; GFX9: [[SHL1:%[0-9]+]]:_(s64) = G_SHL [[UV5]], [[C2]](s32)
; GFX9: [[SHL:%[0-9]+]]:_(s64) = G_SHL [[ZEXT]], [[COPY2]](s32)
; GFX9: [[SHL1:%[0-9]+]]:_(s64) = G_SHL [[ZEXT1]], [[C2]](s32)
; GFX9: [[AND2:%[0-9]+]]:_(s64) = G_AND [[SHL]], [[C]]
; GFX9: [[AND3:%[0-9]+]]:_(s64) = G_AND [[SHL1]], [[C]]
; GFX9: [[OR:%[0-9]+]]:_(s64) = G_OR [[AND]], [[AND2]]
; GFX9: [[OR1:%[0-9]+]]:_(s64) = G_OR [[AND1]], [[AND3]]
; GFX9: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[OR]](s64), [[OR1]](s64)
; GFX9: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR1]](<2 x s64>)
; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[OR]](s64), [[OR1]](s64)
; GFX9: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>)
%0:_(<2 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3
%1:_(<2 x s32>) = COPY $vgpr4_vgpr5
%2:_(<2 x s64>) = G_FCOPYSIGN %0, %1

View File

@ -906,28 +906,28 @@ define amdgpu_ps i32 @s_saddsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) {
; GFX9-NEXT: s_lshr_b32 s3, s0, 16
; GFX9-NEXT: s_lshr_b32 s4, s0, 24
; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s2
; GFX9-NEXT: s_lshr_b32 s5, s1, 8
; GFX9-NEXT: s_pack_ll_b32_b16 s2, s3, s4
; GFX9-NEXT: s_lshr_b32 s4, s0, 16
; GFX9-NEXT: s_mov_b32 s3, 0x80008
; GFX9-NEXT: s_lshr_b32 s5, s1, 8
; GFX9-NEXT: s_lshl_b32 s0, s0, s3
; GFX9-NEXT: s_lshl_b32 s4, s4, 8
; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s4
; GFX9-NEXT: s_lshr_b32 s4, s2, 16
; GFX9-NEXT: s_lshr_b32 s6, s1, 16
; GFX9-NEXT: s_lshr_b32 s7, s1, 24
; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s5
; GFX9-NEXT: s_lshr_b32 s5, s0, 16
; GFX9-NEXT: s_mov_b32 s4, 0x80008
; GFX9-NEXT: s_lshl_b32 s0, s0, s4
; GFX9-NEXT: s_lshl_b32 s5, s5, 8
; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s5
; GFX9-NEXT: s_lshr_b32 s5, s2, 16
; GFX9-NEXT: s_lshl_b32 s2, s2, s4
; GFX9-NEXT: s_lshl_b32 s5, s5, 8
; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s5
; GFX9-NEXT: s_lshr_b32 s5, s1, 16
; GFX9-NEXT: s_pack_ll_b32_b16 s3, s6, s7
; GFX9-NEXT: s_lshl_b32 s1, s1, s4
; GFX9-NEXT: s_lshl_b32 s2, s2, s3
; GFX9-NEXT: s_lshl_b32 s4, s4, 8
; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s4
; GFX9-NEXT: s_pack_ll_b32_b16 s4, s6, s7
; GFX9-NEXT: s_lshl_b32 s1, s1, s3
; GFX9-NEXT: s_lshl_b32 s5, s5, 8
; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s5
; GFX9-NEXT: s_lshr_b32 s5, s3, 16
; GFX9-NEXT: s_lshr_b32 s5, s4, 16
; GFX9-NEXT: v_mov_b32_e32 v0, s1
; GFX9-NEXT: s_lshl_b32 s3, s3, s4
; GFX9-NEXT: s_lshl_b32 s3, s4, s3
; GFX9-NEXT: s_lshl_b32 s4, s5, 8
; GFX9-NEXT: v_pk_add_i16 v0, s0, v0 clamp
; GFX9-NEXT: s_pack_ll_b32_b16 s3, s3, s4
@ -956,25 +956,25 @@ define amdgpu_ps i32 @s_saddsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) {
; GFX10-NEXT: s_lshr_b32 s6, s1, 16
; GFX10-NEXT: s_lshr_b32 s7, s1, 24
; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s5
; GFX10-NEXT: s_pack_ll_b32_b16 s2, s3, s4
; GFX10-NEXT: s_lshr_b32 s4, s0, 16
; GFX10-NEXT: s_mov_b32 s3, 0x80008
; GFX10-NEXT: s_pack_ll_b32_b16 s5, s6, s7
; GFX10-NEXT: s_lshr_b32 s6, s1, 16
; GFX10-NEXT: s_lshl_b32 s0, s0, s3
; GFX10-NEXT: s_lshl_b32 s4, s4, 8
; GFX10-NEXT: s_lshl_b32 s1, s1, s3
; GFX10-NEXT: s_lshl_b32 s6, s6, 8
; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s4
; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s6
; GFX10-NEXT: s_lshr_b32 s4, s2, 16
; GFX10-NEXT: s_lshr_b32 s6, s5, 16
; GFX10-NEXT: s_lshr_b32 s8, s0, 16
; GFX10-NEXT: s_lshr_b32 s5, s1, 16
; GFX10-NEXT: s_mov_b32 s2, 0x80008
; GFX10-NEXT: s_lshl_b32 s8, s8, 8
; GFX10-NEXT: s_lshl_b32 s0, s0, s2
; GFX10-NEXT: s_lshl_b32 s1, s1, s2
; GFX10-NEXT: s_lshl_b32 s5, s5, 8
; GFX10-NEXT: s_pack_ll_b32_b16 s3, s3, s4
; GFX10-NEXT: s_pack_ll_b32_b16 s6, s6, s7
; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s5
; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s8
; GFX10-NEXT: s_lshr_b32 s4, s3, 16
; GFX10-NEXT: s_lshr_b32 s5, s6, 16
; GFX10-NEXT: v_pk_add_i16 v0, s0, s1 clamp
; GFX10-NEXT: s_lshl_b32 s2, s2, s3
; GFX10-NEXT: s_lshl_b32 s3, s3, s2
; GFX10-NEXT: s_lshl_b32 s4, s4, 8
; GFX10-NEXT: s_lshl_b32 s0, s5, s3
; GFX10-NEXT: s_lshl_b32 s1, s6, 8
; GFX10-NEXT: s_pack_ll_b32_b16 s2, s2, s4
; GFX10-NEXT: s_lshl_b32 s0, s6, s2
; GFX10-NEXT: s_lshl_b32 s1, s5, 8
; GFX10-NEXT: s_pack_ll_b32_b16 s2, s3, s4
; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s1
; GFX10-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1]
; GFX10-NEXT: v_pk_add_i16 v1, s2, s0 clamp

View File

@ -906,28 +906,28 @@ define amdgpu_ps i32 @s_ssubsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) {
; GFX9-NEXT: s_lshr_b32 s3, s0, 16
; GFX9-NEXT: s_lshr_b32 s4, s0, 24
; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s2
; GFX9-NEXT: s_lshr_b32 s5, s1, 8
; GFX9-NEXT: s_pack_ll_b32_b16 s2, s3, s4
; GFX9-NEXT: s_lshr_b32 s4, s0, 16
; GFX9-NEXT: s_mov_b32 s3, 0x80008
; GFX9-NEXT: s_lshr_b32 s5, s1, 8
; GFX9-NEXT: s_lshl_b32 s0, s0, s3
; GFX9-NEXT: s_lshl_b32 s4, s4, 8
; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s4
; GFX9-NEXT: s_lshr_b32 s4, s2, 16
; GFX9-NEXT: s_lshr_b32 s6, s1, 16
; GFX9-NEXT: s_lshr_b32 s7, s1, 24
; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s5
; GFX9-NEXT: s_lshr_b32 s5, s0, 16
; GFX9-NEXT: s_mov_b32 s4, 0x80008
; GFX9-NEXT: s_lshl_b32 s0, s0, s4
; GFX9-NEXT: s_lshl_b32 s5, s5, 8
; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s5
; GFX9-NEXT: s_lshr_b32 s5, s2, 16
; GFX9-NEXT: s_lshl_b32 s2, s2, s4
; GFX9-NEXT: s_lshl_b32 s5, s5, 8
; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s5
; GFX9-NEXT: s_lshr_b32 s5, s1, 16
; GFX9-NEXT: s_pack_ll_b32_b16 s3, s6, s7
; GFX9-NEXT: s_lshl_b32 s1, s1, s4
; GFX9-NEXT: s_lshl_b32 s2, s2, s3
; GFX9-NEXT: s_lshl_b32 s4, s4, 8
; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s4
; GFX9-NEXT: s_pack_ll_b32_b16 s4, s6, s7
; GFX9-NEXT: s_lshl_b32 s1, s1, s3
; GFX9-NEXT: s_lshl_b32 s5, s5, 8
; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s5
; GFX9-NEXT: s_lshr_b32 s5, s3, 16
; GFX9-NEXT: s_lshr_b32 s5, s4, 16
; GFX9-NEXT: v_mov_b32_e32 v0, s1
; GFX9-NEXT: s_lshl_b32 s3, s3, s4
; GFX9-NEXT: s_lshl_b32 s3, s4, s3
; GFX9-NEXT: s_lshl_b32 s4, s5, 8
; GFX9-NEXT: v_pk_sub_i16 v0, s0, v0 clamp
; GFX9-NEXT: s_pack_ll_b32_b16 s3, s3, s4
@ -956,25 +956,25 @@ define amdgpu_ps i32 @s_ssubsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) {
; GFX10-NEXT: s_lshr_b32 s6, s1, 16
; GFX10-NEXT: s_lshr_b32 s7, s1, 24
; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s5
; GFX10-NEXT: s_pack_ll_b32_b16 s2, s3, s4
; GFX10-NEXT: s_lshr_b32 s4, s0, 16
; GFX10-NEXT: s_mov_b32 s3, 0x80008
; GFX10-NEXT: s_pack_ll_b32_b16 s5, s6, s7
; GFX10-NEXT: s_lshr_b32 s6, s1, 16
; GFX10-NEXT: s_lshl_b32 s0, s0, s3
; GFX10-NEXT: s_lshl_b32 s4, s4, 8
; GFX10-NEXT: s_lshl_b32 s1, s1, s3
; GFX10-NEXT: s_lshl_b32 s6, s6, 8
; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s4
; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s6
; GFX10-NEXT: s_lshr_b32 s4, s2, 16
; GFX10-NEXT: s_lshr_b32 s6, s5, 16
; GFX10-NEXT: s_lshr_b32 s8, s0, 16
; GFX10-NEXT: s_lshr_b32 s5, s1, 16
; GFX10-NEXT: s_mov_b32 s2, 0x80008
; GFX10-NEXT: s_lshl_b32 s8, s8, 8
; GFX10-NEXT: s_lshl_b32 s0, s0, s2
; GFX10-NEXT: s_lshl_b32 s1, s1, s2
; GFX10-NEXT: s_lshl_b32 s5, s5, 8
; GFX10-NEXT: s_pack_ll_b32_b16 s3, s3, s4
; GFX10-NEXT: s_pack_ll_b32_b16 s6, s6, s7
; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s5
; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s8
; GFX10-NEXT: s_lshr_b32 s4, s3, 16
; GFX10-NEXT: s_lshr_b32 s5, s6, 16
; GFX10-NEXT: v_pk_sub_i16 v0, s0, s1 clamp
; GFX10-NEXT: s_lshl_b32 s2, s2, s3
; GFX10-NEXT: s_lshl_b32 s3, s3, s2
; GFX10-NEXT: s_lshl_b32 s4, s4, 8
; GFX10-NEXT: s_lshl_b32 s0, s5, s3
; GFX10-NEXT: s_lshl_b32 s1, s6, 8
; GFX10-NEXT: s_pack_ll_b32_b16 s2, s2, s4
; GFX10-NEXT: s_lshl_b32 s0, s6, s2
; GFX10-NEXT: s_lshl_b32 s1, s5, 8
; GFX10-NEXT: s_pack_ll_b32_b16 s2, s3, s4
; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s1
; GFX10-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1]
; GFX10-NEXT: v_pk_sub_i16 v1, s2, s0 clamp

View File

@ -622,28 +622,28 @@ define amdgpu_ps i32 @s_uaddsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) {
; GFX9-NEXT: s_lshr_b32 s3, s0, 16
; GFX9-NEXT: s_lshr_b32 s4, s0, 24
; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s2
; GFX9-NEXT: s_lshr_b32 s5, s1, 8
; GFX9-NEXT: s_pack_ll_b32_b16 s2, s3, s4
; GFX9-NEXT: s_lshr_b32 s4, s0, 16
; GFX9-NEXT: s_mov_b32 s3, 0x80008
; GFX9-NEXT: s_lshr_b32 s5, s1, 8
; GFX9-NEXT: s_lshl_b32 s0, s0, s3
; GFX9-NEXT: s_lshl_b32 s4, s4, 8
; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s4
; GFX9-NEXT: s_lshr_b32 s4, s2, 16
; GFX9-NEXT: s_lshr_b32 s6, s1, 16
; GFX9-NEXT: s_lshr_b32 s7, s1, 24
; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s5
; GFX9-NEXT: s_lshr_b32 s5, s0, 16
; GFX9-NEXT: s_mov_b32 s4, 0x80008
; GFX9-NEXT: s_lshl_b32 s0, s0, s4
; GFX9-NEXT: s_lshl_b32 s5, s5, 8
; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s5
; GFX9-NEXT: s_lshr_b32 s5, s2, 16
; GFX9-NEXT: s_lshl_b32 s2, s2, s4
; GFX9-NEXT: s_lshl_b32 s5, s5, 8
; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s5
; GFX9-NEXT: s_lshr_b32 s5, s1, 16
; GFX9-NEXT: s_pack_ll_b32_b16 s3, s6, s7
; GFX9-NEXT: s_lshl_b32 s1, s1, s4
; GFX9-NEXT: s_lshl_b32 s2, s2, s3
; GFX9-NEXT: s_lshl_b32 s4, s4, 8
; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s4
; GFX9-NEXT: s_pack_ll_b32_b16 s4, s6, s7
; GFX9-NEXT: s_lshl_b32 s1, s1, s3
; GFX9-NEXT: s_lshl_b32 s5, s5, 8
; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s5
; GFX9-NEXT: s_lshr_b32 s5, s3, 16
; GFX9-NEXT: s_lshr_b32 s5, s4, 16
; GFX9-NEXT: v_mov_b32_e32 v0, s1
; GFX9-NEXT: s_lshl_b32 s3, s3, s4
; GFX9-NEXT: s_lshl_b32 s3, s4, s3
; GFX9-NEXT: s_lshl_b32 s4, s5, 8
; GFX9-NEXT: v_pk_add_u16 v0, s0, v0 clamp
; GFX9-NEXT: s_pack_ll_b32_b16 s3, s3, s4
@ -672,25 +672,25 @@ define amdgpu_ps i32 @s_uaddsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) {
; GFX10-NEXT: s_lshr_b32 s6, s1, 16
; GFX10-NEXT: s_lshr_b32 s7, s1, 24
; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s5
; GFX10-NEXT: s_pack_ll_b32_b16 s2, s3, s4
; GFX10-NEXT: s_lshr_b32 s4, s0, 16
; GFX10-NEXT: s_mov_b32 s3, 0x80008
; GFX10-NEXT: s_pack_ll_b32_b16 s5, s6, s7
; GFX10-NEXT: s_lshr_b32 s6, s1, 16
; GFX10-NEXT: s_lshl_b32 s0, s0, s3
; GFX10-NEXT: s_lshl_b32 s4, s4, 8
; GFX10-NEXT: s_lshl_b32 s1, s1, s3
; GFX10-NEXT: s_lshl_b32 s6, s6, 8
; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s4
; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s6
; GFX10-NEXT: s_lshr_b32 s4, s2, 16
; GFX10-NEXT: s_lshr_b32 s6, s5, 16
; GFX10-NEXT: s_lshr_b32 s8, s0, 16
; GFX10-NEXT: s_lshr_b32 s5, s1, 16
; GFX10-NEXT: s_mov_b32 s2, 0x80008
; GFX10-NEXT: s_lshl_b32 s8, s8, 8
; GFX10-NEXT: s_lshl_b32 s0, s0, s2
; GFX10-NEXT: s_lshl_b32 s1, s1, s2
; GFX10-NEXT: s_lshl_b32 s5, s5, 8
; GFX10-NEXT: s_pack_ll_b32_b16 s3, s3, s4
; GFX10-NEXT: s_pack_ll_b32_b16 s6, s6, s7
; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s5
; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s8
; GFX10-NEXT: s_lshr_b32 s4, s3, 16
; GFX10-NEXT: s_lshr_b32 s5, s6, 16
; GFX10-NEXT: v_pk_add_u16 v0, s0, s1 clamp
; GFX10-NEXT: s_lshl_b32 s2, s2, s3
; GFX10-NEXT: s_lshl_b32 s3, s3, s2
; GFX10-NEXT: s_lshl_b32 s4, s4, 8
; GFX10-NEXT: s_lshl_b32 s0, s5, s3
; GFX10-NEXT: s_lshl_b32 s1, s6, 8
; GFX10-NEXT: s_pack_ll_b32_b16 s2, s2, s4
; GFX10-NEXT: s_lshl_b32 s0, s6, s2
; GFX10-NEXT: s_lshl_b32 s1, s5, 8
; GFX10-NEXT: s_pack_ll_b32_b16 s2, s3, s4
; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s1
; GFX10-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
; GFX10-NEXT: v_pk_add_u16 v1, s2, s0 clamp

View File

@ -606,28 +606,28 @@ define amdgpu_ps i32 @s_usubsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) {
; GFX9-NEXT: s_lshr_b32 s3, s0, 16
; GFX9-NEXT: s_lshr_b32 s4, s0, 24
; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s2
; GFX9-NEXT: s_lshr_b32 s5, s1, 8
; GFX9-NEXT: s_pack_ll_b32_b16 s2, s3, s4
; GFX9-NEXT: s_lshr_b32 s4, s0, 16
; GFX9-NEXT: s_mov_b32 s3, 0x80008
; GFX9-NEXT: s_lshr_b32 s5, s1, 8
; GFX9-NEXT: s_lshl_b32 s0, s0, s3
; GFX9-NEXT: s_lshl_b32 s4, s4, 8
; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s4
; GFX9-NEXT: s_lshr_b32 s4, s2, 16
; GFX9-NEXT: s_lshr_b32 s6, s1, 16
; GFX9-NEXT: s_lshr_b32 s7, s1, 24
; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s5
; GFX9-NEXT: s_lshr_b32 s5, s0, 16
; GFX9-NEXT: s_mov_b32 s4, 0x80008
; GFX9-NEXT: s_lshl_b32 s0, s0, s4
; GFX9-NEXT: s_lshl_b32 s5, s5, 8
; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s5
; GFX9-NEXT: s_lshr_b32 s5, s2, 16
; GFX9-NEXT: s_lshl_b32 s2, s2, s4
; GFX9-NEXT: s_lshl_b32 s5, s5, 8
; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s5
; GFX9-NEXT: s_lshr_b32 s5, s1, 16
; GFX9-NEXT: s_pack_ll_b32_b16 s3, s6, s7
; GFX9-NEXT: s_lshl_b32 s1, s1, s4
; GFX9-NEXT: s_lshl_b32 s2, s2, s3
; GFX9-NEXT: s_lshl_b32 s4, s4, 8
; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s4
; GFX9-NEXT: s_pack_ll_b32_b16 s4, s6, s7
; GFX9-NEXT: s_lshl_b32 s1, s1, s3
; GFX9-NEXT: s_lshl_b32 s5, s5, 8
; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s5
; GFX9-NEXT: s_lshr_b32 s5, s3, 16
; GFX9-NEXT: s_lshr_b32 s5, s4, 16
; GFX9-NEXT: v_mov_b32_e32 v0, s1
; GFX9-NEXT: s_lshl_b32 s3, s3, s4
; GFX9-NEXT: s_lshl_b32 s3, s4, s3
; GFX9-NEXT: s_lshl_b32 s4, s5, 8
; GFX9-NEXT: v_pk_sub_u16 v0, s0, v0 clamp
; GFX9-NEXT: s_pack_ll_b32_b16 s3, s3, s4
@ -656,25 +656,25 @@ define amdgpu_ps i32 @s_usubsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) {
; GFX10-NEXT: s_lshr_b32 s6, s1, 16
; GFX10-NEXT: s_lshr_b32 s7, s1, 24
; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s5
; GFX10-NEXT: s_pack_ll_b32_b16 s2, s3, s4
; GFX10-NEXT: s_lshr_b32 s4, s0, 16
; GFX10-NEXT: s_mov_b32 s3, 0x80008
; GFX10-NEXT: s_pack_ll_b32_b16 s5, s6, s7
; GFX10-NEXT: s_lshr_b32 s6, s1, 16
; GFX10-NEXT: s_lshl_b32 s0, s0, s3
; GFX10-NEXT: s_lshl_b32 s4, s4, 8
; GFX10-NEXT: s_lshl_b32 s1, s1, s3
; GFX10-NEXT: s_lshl_b32 s6, s6, 8
; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s4
; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s6
; GFX10-NEXT: s_lshr_b32 s4, s2, 16
; GFX10-NEXT: s_lshr_b32 s6, s5, 16
; GFX10-NEXT: s_lshr_b32 s8, s0, 16
; GFX10-NEXT: s_lshr_b32 s5, s1, 16
; GFX10-NEXT: s_mov_b32 s2, 0x80008
; GFX10-NEXT: s_lshl_b32 s8, s8, 8
; GFX10-NEXT: s_lshl_b32 s0, s0, s2
; GFX10-NEXT: s_lshl_b32 s1, s1, s2
; GFX10-NEXT: s_lshl_b32 s5, s5, 8
; GFX10-NEXT: s_pack_ll_b32_b16 s3, s3, s4
; GFX10-NEXT: s_pack_ll_b32_b16 s6, s6, s7
; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s5
; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s8
; GFX10-NEXT: s_lshr_b32 s4, s3, 16
; GFX10-NEXT: s_lshr_b32 s5, s6, 16
; GFX10-NEXT: v_pk_sub_u16 v0, s0, s1 clamp
; GFX10-NEXT: s_lshl_b32 s2, s2, s3
; GFX10-NEXT: s_lshl_b32 s3, s3, s2
; GFX10-NEXT: s_lshl_b32 s4, s4, 8
; GFX10-NEXT: s_lshl_b32 s0, s5, s3
; GFX10-NEXT: s_lshl_b32 s1, s6, 8
; GFX10-NEXT: s_pack_ll_b32_b16 s2, s2, s4
; GFX10-NEXT: s_lshl_b32 s0, s6, s2
; GFX10-NEXT: s_lshl_b32 s1, s5, 8
; GFX10-NEXT: s_pack_ll_b32_b16 s2, s3, s4
; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s1
; GFX10-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
; GFX10-NEXT: v_pk_sub_u16 v1, s2, s0 clamp