GlobalISel: Implement bitcast action for G_INSERT_VECTOR_ELT

This mirrors the existing support for the equivalent extracts
(G_EXTRACT_VECTOR_ELT). It also creates a huge mess of bit operations in the
legalized output, which would be greatly improved if we had any bit operation
combines.
This commit is contained in:
Matt Arsenault 2020-06-15 21:35:15 -04:00
parent b1600d8b89
commit e2f1b48f86
6 changed files with 9539 additions and 10 deletions

View File

@ -318,6 +318,10 @@ public:
LegalizeResult bitcastExtractVectorElt(MachineInstr &MI, unsigned TypeIdx,
LLT CastTy);
/// Perform Bitcast legalize action on G_INSERT_VECTOR_ELT.
///
/// Rewrites the insert so that it operates on the source vector bitcast to
/// \p CastTy. Only \p TypeIdx 0 is handled, and only casts to a type with
/// fewer, wider elements whose size is a power-of-two multiple of the original
/// element size; every other request returns UnableToLegalize.
LegalizeResult bitcastInsertVectorElt(MachineInstr &MI, unsigned TypeIdx,
LLT CastTy);
LegalizeResult lowerBitcast(MachineInstr &MI);
LegalizeResult lowerLoad(MachineInstr &MI);
LegalizeResult lowerStore(MachineInstr &MI);

View File

@ -2369,6 +2369,28 @@ LegalizerHelper::lowerBitcast(MachineInstr &MI) {
return UnableToLegalize;
}
/// Compute the bit position of a narrow element inside the wide element that
/// contains it. This is only meaningful when a vector is being re-indexed with
/// larger elements.
///
/// \p Idx counts elements of \p OldEltSize bits. Its low
/// Log2(NewEltSize / OldEltSize) bits select the position within one
/// \p NewEltSize bit element, and scaling that by the old element size gives a
/// bit offset:
///
///   %offset_idx  = G_AND %idx, ~(-1 << Log2(NewEltSize / OldEltSize))
///   %offset_bits = G_SHL %offset_idx, Log2(OldEltSize)
///
/// \returns the register holding the bit offset, in the same type as \p Idx.
static Register getBitcastWiderVectorElementOffset(MachineIRBuilder &B,
                                                   Register Idx,
                                                   unsigned NewEltSize,
                                                   unsigned OldEltSize) {
  LLT IndexTy = B.getMRI()->getType(Idx);
  const unsigned RatioLog2 = Log2_32(NewEltSize / OldEltSize);

  // Keep only the index bits that address a position inside one wide element.
  APInt SubIdxBits =
      ~(APInt::getAllOnesValue(IndexTy.getSizeInBits()) << RatioLog2);
  auto SubIdxMask = B.buildConstant(IndexTy, SubIdxBits);
  auto SubIdx = B.buildAnd(IndexTy, Idx, SubIdxMask);

  // Scale the sub-index by the narrow element width to get a bit count. The
  // scaling is a left shift by Log2_32(OldEltSize).
  auto EltSizeLog2 = B.buildConstant(IndexTy, Log2_32(OldEltSize));
  auto OffsetBits = B.buildShl(IndexTy, SubIdx, EltSizeLog2);
  return OffsetBits.getReg(0);
}
/// Perform a G_EXTRACT_VECTOR_ELT in a different sized vector element. If this
/// is casting to a vector with a smaller element size, perform multiple element
/// extracts and merge the results. If this is coercing to a vector with larger
@ -2467,13 +2489,9 @@ LegalizerHelper::bitcastExtractVectorElt(MachineInstr &MI, unsigned TypeIdx,
ScaledIdx).getReg(0);
}
// Now figure out the amount we need to shift to get the target bits.
auto OffsetMask = MIRBuilder.buildConstant(
IdxTy, ~(APInt::getAllOnesValue(IdxTy.getSizeInBits()) << Log2EltRatio));
auto OffsetIdx = MIRBuilder.buildAnd(IdxTy, Idx, OffsetMask);
auto OffsetBits = MIRBuilder.buildShl(
IdxTy, OffsetIdx,
MIRBuilder.buildConstant(IdxTy, Log2_32(OldEltSize)));
// Compute the bit offset into the register of the target element.
Register OffsetBits = getBitcastWiderVectorElementOffset(
MIRBuilder, Idx, NewEltSize, OldEltSize);
// Shift the wide element to get the target element.
auto ExtractedBits = MIRBuilder.buildLShr(NewEltTy, WideElt, OffsetBits);
@ -2485,6 +2503,104 @@ LegalizerHelper::bitcastExtractVectorElt(MachineInstr &MI, unsigned TypeIdx,
return UnableToLegalize;
}
/// Emit code that overwrites a bit field of \p TargetReg with \p InsertReg.
///
/// The field begins \p OffsetBits bits above the least significant bit of
/// \p TargetReg and is as wide as the type of \p InsertReg. All other bits of
/// \p TargetReg are preserved. With Mask being the low InsertReg-width bits:
///
///   (TargetReg & ~(Mask << OffsetBits)) | (zext(InsertReg) << OffsetBits)
///
/// \returns the register holding the merged value, in the type of
/// \p TargetReg.
static Register buildBitFieldInsert(MachineIRBuilder &B,
                                    Register TargetReg, Register InsertReg,
                                    Register OffsetBits) {
  const auto *RegInfo = B.getMRI();
  const LLT WideTy = RegInfo->getType(TargetReg);
  const LLT FieldTy = RegInfo->getType(InsertReg);

  // Widen the new value and move it to the field position. Zero extension
  // guarantees no bits are set outside the field.
  auto WideVal = B.buildZExt(WideTy, InsertReg);
  auto PositionedVal = B.buildShl(WideTy, WideVal, OffsetBits);

  // Build a mask that covers the field: as many low bits as the inserted type
  // has, shifted to the same position as the value.
  const APInt LowFieldBits =
      APInt::getLowBitsSet(WideTy.getSizeInBits(), FieldTy.getSizeInBits());
  auto FieldMask = B.buildConstant(WideTy, LowFieldBits);
  auto PositionedMask = B.buildShl(WideTy, FieldMask, OffsetBits);

  // Invert the mask and use it to zero the old contents of the field.
  auto KeepMask = B.buildNot(WideTy, PositionedMask);
  auto ClearedTarget = B.buildAnd(WideTy, TargetReg, KeepMask);

  // The field is now zero in the target and the new value is zero everywhere
  // else, so a plain OR merges the two.
  auto Merged = B.buildOr(WideTy, ClearedTarget, PositionedVal);
  return Merged.getReg(0);
}
/// Perform a G_INSERT_VECTOR_ELT in a different sized vector element. If this
/// is increasing the element size, perform the indexing in the target element
/// type, and use bit operations to insert at the element position. This is
/// intended for architectures that can dynamically index the register file and
/// want to force indexing in the native register size.
///
/// Only \p TypeIdx 0 (the vector type) is supported, and \p CastTy must have
/// fewer elements than the original vector, with an element size that is a
/// power-of-two multiple of the original element size. \p CastTy may be a
/// scalar, in which case the whole vector is treated as one wide element.
///
/// \returns Legalized after replacing \p MI, or UnableToLegalize for any
/// unsupported combination. No instructions are emitted in the latter case.
LegalizerHelper::LegalizeResult
LegalizerHelper::bitcastInsertVectorElt(MachineInstr &MI, unsigned TypeIdx,
                                        LLT CastTy) {
  if (TypeIdx != 0)
    return UnableToLegalize;

  Register Dst = MI.getOperand(0).getReg();
  Register SrcVec = MI.getOperand(1).getReg();
  Register Val = MI.getOperand(2).getReg();
  Register Idx = MI.getOperand(3).getReg();

  LLT VecTy = MRI.getType(Dst);
  LLT IdxTy = MRI.getType(Idx);

  LLT VecEltTy = VecTy.getElementType();
  LLT NewEltTy = CastTy.isVector() ? CastTy.getElementType() : CastTy;
  const unsigned NewEltSize = NewEltTy.getSizeInBits();
  const unsigned OldEltSize = VecEltTy.getSizeInBits();

  unsigned NewNumElts = CastTy.isVector() ? CastTy.getNumElements() : 1;
  unsigned OldNumElts = VecTy.getNumElements();

  // Only the cast to fewer (and therefore wider) elements is implemented.
  if (NewNumElts >= OldNumElts)
    return UnableToLegalize;

  if (NewEltSize % OldEltSize != 0)
    return UnableToLegalize;

  // This only depends on powers of 2 because we use bit tricks to figure out
  // the bit offset we need to shift to get the target element. A general
  // expansion could emit division/multiply.
  if (!isPowerOf2_32(NewEltSize / OldEltSize))
    return UnableToLegalize;

  // All checks have passed. Start emitting code only now, so that a bail-out
  // above never leaves a stray G_BITCAST behind in the function.
  Register CastVec = MIRBuilder.buildBitcast(CastTy, SrcVec).getReg(0);

  const unsigned Log2EltRatio = Log2_32(NewEltSize / OldEltSize);
  auto Log2Ratio = MIRBuilder.buildConstant(IdxTy, Log2EltRatio);

  // Divide to get the index in the wider element type.
  auto ScaledIdx = MIRBuilder.buildLShr(IdxTy, Idx, Log2Ratio);

  // Fetch the wide element that contains the target element. For a scalar
  // cast type the bitcast value itself is that element.
  Register ExtractedElt = CastVec;
  if (CastTy.isVector()) {
    ExtractedElt = MIRBuilder.buildExtractVectorElement(NewEltTy, CastVec,
                                                        ScaledIdx).getReg(0);
  }

  // Compute the bit offset into the register of the target element.
  Register OffsetBits = getBitcastWiderVectorElementOffset(
    MIRBuilder, Idx, NewEltSize, OldEltSize);

  // Replace the target element's bits inside the wide element.
  Register InsertedElt = buildBitFieldInsert(MIRBuilder, ExtractedElt,
                                             Val, OffsetBits);

  // Put the updated wide element back at the position it was read from.
  if (CastTy.isVector()) {
    InsertedElt = MIRBuilder.buildInsertVectorElement(
      CastTy, CastVec, InsertedElt, ScaledIdx).getReg(0);
  }

  // Convert back to the original vector type for the users of Dst.
  MIRBuilder.buildBitcast(Dst, InsertedElt);
  MI.eraseFromParent();
  return Legalized;
}
LegalizerHelper::LegalizeResult
LegalizerHelper::lowerLoad(MachineInstr &MI) {
// Lower to a memory-width G_LOAD and a G_SEXT/G_ZEXT/G_ANYEXT
@ -2674,6 +2790,8 @@ LegalizerHelper::bitcast(MachineInstr &MI, unsigned TypeIdx, LLT CastTy) {
}
case TargetOpcode::G_EXTRACT_VECTOR_ELT:
return bitcastExtractVectorElt(MI, TypeIdx, CastTy);
case TargetOpcode::G_INSERT_VECTOR_ELT:
return bitcastInsertVectorElt(MI, TypeIdx, CastTy);
default:
return UnableToLegalize;
}

View File

@ -1338,11 +1338,11 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
VecTy.getSizeInBits() <= MaxRegisterSize &&
IdxTy.getSizeInBits() == 32;
})
.bitcastIf(all(sizeIsMultipleOf32(1), scalarOrEltNarrowerThan(1, 32)),
bitcastToVectorElement32(1))
.bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltNarrowerThan(VecTypeIdx, 32)),
bitcastToVectorElement32(VecTypeIdx))
//.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
.bitcastIf(
all(sizeIsMultipleOf32(1), scalarOrEltWiderThan(1, 64)),
all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltWiderThan(VecTypeIdx, 64)),
[=](const LegalityQuery &Query) {
// For > 64-bit element types, try to turn this into a 64-bit
// element vector since we may be able to do better indexing

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -1738,3 +1738,195 @@ body: |
%5:_(p1) = COPY $vgpr0_vgpr1
G_STORE %4, %5 :: (store 256, align 4, addrspace 1)
...
---
# Variable-index insert of an s8 element into a <4 x s8> vector that is
# bitcast from and back to an s32. The checked sequence performs the insert
# with scalar bit operations on the 32-bit value: the index is masked with 3
# and shifted left by 3 to form a bit offset, then the value is merged in with
# shift/xor/and/or.
name: insert_vector_elt_varidx_v4s8
body: |
bb.0:
liveins: $vgpr0, $vgpr1, $vgpr2
; CHECK-LABEL: name: insert_vector_elt_varidx_v4s8
; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
; CHECK: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C]](s32)
; CHECK: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
; CHECK: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C1]](s32)
; CHECK: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 24
; CHECK: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C2]](s32)
; CHECK: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 255
; CHECK: [[COPY3:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
; CHECK: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C3]]
; CHECK: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
; CHECK: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C3]]
; CHECK: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C]](s32)
; CHECK: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]]
; CHECK: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32)
; CHECK: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C3]]
; CHECK: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND2]], [[C1]](s32)
; CHECK: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]]
; CHECK: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32)
; CHECK: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C3]]
; CHECK: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C2]](s32)
; CHECK: [[OR2:%[0-9]+]]:_(s32) = G_OR [[OR1]], [[SHL2]]
; CHECK: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 3
; CHECK: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C4]]
; CHECK: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND4]], [[C4]](s32)
; CHECK: [[COPY7:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32)
; CHECK: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C3]]
; CHECK: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[SHL3]](s32)
; CHECK: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[C3]], [[SHL3]](s32)
; CHECK: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
; CHECK: [[XOR:%[0-9]+]]:_(s32) = G_XOR [[SHL5]], [[C5]]
; CHECK: [[AND6:%[0-9]+]]:_(s32) = G_AND [[OR2]], [[XOR]]
; CHECK: [[OR3:%[0-9]+]]:_(s32) = G_OR [[AND6]], [[SHL4]]
; CHECK: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[OR3]], [[C]](s32)
; CHECK: [[LSHR4:%[0-9]+]]:_(s32) = G_LSHR [[OR3]], [[C1]](s32)
; CHECK: [[LSHR5:%[0-9]+]]:_(s32) = G_LSHR [[OR3]], [[C2]](s32)
; CHECK: [[COPY8:%[0-9]+]]:_(s32) = COPY [[OR3]](s32)
; CHECK: [[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY8]], [[C3]]
; CHECK: [[COPY9:%[0-9]+]]:_(s32) = COPY [[LSHR3]](s32)
; CHECK: [[AND8:%[0-9]+]]:_(s32) = G_AND [[COPY9]], [[C3]]
; CHECK: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[AND8]], [[C]](s32)
; CHECK: [[OR4:%[0-9]+]]:_(s32) = G_OR [[AND7]], [[SHL6]]
; CHECK: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LSHR4]](s32)
; CHECK: [[AND9:%[0-9]+]]:_(s32) = G_AND [[COPY10]], [[C3]]
; CHECK: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[C1]](s32)
; CHECK: [[OR5:%[0-9]+]]:_(s32) = G_OR [[OR4]], [[SHL7]]
; CHECK: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LSHR5]](s32)
; CHECK: [[AND10:%[0-9]+]]:_(s32) = G_AND [[COPY11]], [[C3]]
; CHECK: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[AND10]], [[C2]](s32)
; CHECK: [[OR6:%[0-9]+]]:_(s32) = G_OR [[OR5]], [[SHL8]]
; CHECK: $vgpr0 = COPY [[OR6]](s32)
%0:_(s32) = COPY $vgpr0
%1:_(s32) = COPY $vgpr1
%2:_(s32) = COPY $vgpr2
%3:_(<4 x s8>) = G_BITCAST %0
%4:_(s8) = G_TRUNC %1
%5:_(<4 x s8>) = G_INSERT_VECTOR_ELT %3, %4, %2
%6:_(s32) = G_BITCAST %5
$vgpr0 = COPY %6
...
---
# Variable-index insert of an s8 element into an <8 x s8> vector that is
# bitcast from and back to an s64. The checked sequence re-indexes the vector
# as <2 x s32>: index >> 2 selects the 32-bit element for a
# G_EXTRACT_VECTOR_ELT / G_INSERT_VECTOR_ELT pair, and (index & 3) << 3 is the
# bit offset at which the value is merged into that element.
name: insert_vector_elt_varidx_v8s8
body: |
bb.0:
liveins: $vgpr0_vgpr1, $vgpr2, $vgpr3
; CHECK-LABEL: name: insert_vector_elt_varidx_v8s8
; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1
; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr2
; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr3
; CHECK: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s64)
; CHECK: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[UV]](s32)
; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
; CHECK: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[UV]], [[C]](s32)
; CHECK: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
; CHECK: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[UV1]](s32)
; CHECK: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[UV1]], [[C]](s32)
; CHECK: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32)
; CHECK: [[C1:%[0-9]+]]:_(s16) = G_CONSTANT i16 8
; CHECK: [[LSHR2:%[0-9]+]]:_(s16) = G_LSHR [[TRUNC]], [[C1]](s16)
; CHECK: [[LSHR3:%[0-9]+]]:_(s16) = G_LSHR [[TRUNC1]], [[C1]](s16)
; CHECK: [[LSHR4:%[0-9]+]]:_(s16) = G_LSHR [[TRUNC2]], [[C1]](s16)
; CHECK: [[LSHR5:%[0-9]+]]:_(s16) = G_LSHR [[TRUNC3]], [[C1]](s16)
; CHECK: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 255
; CHECK: [[COPY3:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
; CHECK: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C2]]
; CHECK: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[LSHR2]](s16)
; CHECK: [[AND1:%[0-9]+]]:_(s32) = G_AND [[ANYEXT]], [[C2]]
; CHECK: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
; CHECK: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C3]](s32)
; CHECK: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]]
; CHECK: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
; CHECK: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C2]]
; CHECK: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND2]], [[C]](s32)
; CHECK: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]]
; CHECK: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[LSHR3]](s16)
; CHECK: [[AND3:%[0-9]+]]:_(s32) = G_AND [[ANYEXT1]], [[C2]]
; CHECK: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 24
; CHECK: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C4]](s32)
; CHECK: [[OR2:%[0-9]+]]:_(s32) = G_OR [[OR1]], [[SHL2]]
; CHECK: [[COPY5:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
; CHECK: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C2]]
; CHECK: [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[LSHR4]](s16)
; CHECK: [[AND5:%[0-9]+]]:_(s32) = G_AND [[ANYEXT2]], [[C2]]
; CHECK: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[C3]](s32)
; CHECK: [[OR3:%[0-9]+]]:_(s32) = G_OR [[AND4]], [[SHL3]]
; CHECK: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32)
; CHECK: [[AND6:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C2]]
; CHECK: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND6]], [[C]](s32)
; CHECK: [[OR4:%[0-9]+]]:_(s32) = G_OR [[OR3]], [[SHL4]]
; CHECK: [[ANYEXT3:%[0-9]+]]:_(s32) = G_ANYEXT [[LSHR5]](s16)
; CHECK: [[AND7:%[0-9]+]]:_(s32) = G_AND [[ANYEXT3]], [[C2]]
; CHECK: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[C4]](s32)
; CHECK: [[OR5:%[0-9]+]]:_(s32) = G_OR [[OR4]], [[SHL5]]
; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32)
; CHECK: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
; CHECK: [[LSHR6:%[0-9]+]]:_(s32) = G_LSHR [[COPY2]], [[C5]](s32)
; CHECK: [[EVEC:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[BUILD_VECTOR]](<2 x s32>), [[LSHR6]](s32)
; CHECK: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 3
; CHECK: [[AND8:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C6]]
; CHECK: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[AND8]], [[C6]](s32)
; CHECK: [[COPY7:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32)
; CHECK: [[AND9:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C2]]
; CHECK: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[SHL6]](s32)
; CHECK: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[C2]], [[SHL6]](s32)
; CHECK: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
; CHECK: [[XOR:%[0-9]+]]:_(s32) = G_XOR [[SHL8]], [[C7]]
; CHECK: [[AND10:%[0-9]+]]:_(s32) = G_AND [[EVEC]], [[XOR]]
; CHECK: [[OR6:%[0-9]+]]:_(s32) = G_OR [[AND10]], [[SHL7]]
; CHECK: [[IVEC:%[0-9]+]]:_(<2 x s32>) = G_INSERT_VECTOR_ELT [[BUILD_VECTOR]], [[OR6]](s32), [[LSHR6]](s32)
; CHECK: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[IVEC]](<2 x s32>)
; CHECK: [[LSHR7:%[0-9]+]]:_(s32) = G_LSHR [[UV2]], [[C3]](s32)
; CHECK: [[LSHR8:%[0-9]+]]:_(s32) = G_LSHR [[UV2]], [[C]](s32)
; CHECK: [[LSHR9:%[0-9]+]]:_(s32) = G_LSHR [[UV2]], [[C4]](s32)
; CHECK: [[LSHR10:%[0-9]+]]:_(s32) = G_LSHR [[UV3]], [[C3]](s32)
; CHECK: [[LSHR11:%[0-9]+]]:_(s32) = G_LSHR [[UV3]], [[C]](s32)
; CHECK: [[LSHR12:%[0-9]+]]:_(s32) = G_LSHR [[UV3]], [[C4]](s32)
; CHECK: [[C8:%[0-9]+]]:_(s16) = G_CONSTANT i16 255
; CHECK: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[UV2]](s32)
; CHECK: [[AND11:%[0-9]+]]:_(s16) = G_AND [[TRUNC4]], [[C8]]
; CHECK: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR7]](s32)
; CHECK: [[AND12:%[0-9]+]]:_(s16) = G_AND [[TRUNC5]], [[C8]]
; CHECK: [[SHL9:%[0-9]+]]:_(s16) = G_SHL [[AND12]], [[C1]](s16)
; CHECK: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND11]], [[SHL9]]
; CHECK: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR8]](s32)
; CHECK: [[AND13:%[0-9]+]]:_(s16) = G_AND [[TRUNC6]], [[C8]]
; CHECK: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR9]](s32)
; CHECK: [[AND14:%[0-9]+]]:_(s16) = G_AND [[TRUNC7]], [[C8]]
; CHECK: [[SHL10:%[0-9]+]]:_(s16) = G_SHL [[AND14]], [[C1]](s16)
; CHECK: [[OR8:%[0-9]+]]:_(s16) = G_OR [[AND13]], [[SHL10]]
; CHECK: [[TRUNC8:%[0-9]+]]:_(s16) = G_TRUNC [[UV3]](s32)
; CHECK: [[AND15:%[0-9]+]]:_(s16) = G_AND [[TRUNC8]], [[C8]]
; CHECK: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR10]](s32)
; CHECK: [[AND16:%[0-9]+]]:_(s16) = G_AND [[TRUNC9]], [[C8]]
; CHECK: [[SHL11:%[0-9]+]]:_(s16) = G_SHL [[AND16]], [[C1]](s16)
; CHECK: [[OR9:%[0-9]+]]:_(s16) = G_OR [[AND15]], [[SHL11]]
; CHECK: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR11]](s32)
; CHECK: [[AND17:%[0-9]+]]:_(s16) = G_AND [[TRUNC10]], [[C8]]
; CHECK: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR12]](s32)
; CHECK: [[AND18:%[0-9]+]]:_(s16) = G_AND [[TRUNC11]], [[C8]]
; CHECK: [[SHL12:%[0-9]+]]:_(s16) = G_SHL [[AND18]], [[C1]](s16)
; CHECK: [[OR10:%[0-9]+]]:_(s16) = G_OR [[AND17]], [[SHL12]]
; CHECK: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR7]](s16)
; CHECK: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR8]](s16)
; CHECK: [[SHL13:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C]](s32)
; CHECK: [[OR11:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL13]]
; CHECK: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR9]](s16)
; CHECK: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR10]](s16)
; CHECK: [[SHL14:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C]](s32)
; CHECK: [[OR12:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL14]]
; CHECK: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR11]](s32), [[OR12]](s32)
; CHECK: $vgpr0_vgpr1 = COPY [[MV]](s64)
%0:_(s64) = COPY $vgpr0_vgpr1
%1:_(s32) = COPY $vgpr2
%2:_(s32) = COPY $vgpr3
%3:_(<8 x s8>) = G_BITCAST %0
%4:_(s8) = G_TRUNC %1
%5:_(<8 x s8>) = G_INSERT_VECTOR_ELT %3, %4, %2
%6:_(s64) = G_BITCAST %5
$vgpr0_vgpr1 = COPY %6
...