[x86][CGP] try to hoist funnel shift above select-of-splats

This is basically the same patch as D63233, but converted to
funnel shifts rather than regular shifts. I did not see a
way to effectively share code for these 2 cases though.

This follows D79718 and D79827 to re-fix PR37426, because the rotate
pattern from that bug gets canonicalized to funnel shift intrinsics in IR.

I did draft an alternative patch as an enhancement to
"shouldSinkOperands()", but that was awkward: the transform
has to be keyed from the select, yet it must then look at
both the select's users and the select's operands.
This commit is contained in:
Sanjay Patel 2020-05-15 15:22:30 -04:00
parent 72f1fb2edf
commit 5be37cb124
3 changed files with 181 additions and 71 deletions

View File

@ -391,6 +391,7 @@ class TypePromotionTransaction;
bool optimizeExtUses(Instruction *I);
bool optimizeLoadExt(LoadInst *Load);
bool optimizeShiftInst(BinaryOperator *BO);
bool optimizeFunnelShift(IntrinsicInst *Fsh);
bool optimizeSelectInst(SelectInst *SI);
bool optimizeShuffleVectorInst(ShuffleVectorInst *SVI);
bool optimizeSwitchInst(SwitchInst *SI);
@ -2061,6 +2062,9 @@ bool CodeGenPrepare::optimizeCallInst(CallInst *CI, bool &ModifiedDT) {
case Intrinsic::ctlz:
// If counting zeros is expensive, try to avoid it.
return despeculateCountZeros(II, TLI, DL, ModifiedDT);
case Intrinsic::fshl:
case Intrinsic::fshr:
return optimizeFunnelShift(II);
case Intrinsic::dbg_value:
return fixupDbgValue(II);
case Intrinsic::vscale: {
@ -6240,6 +6244,41 @@ bool CodeGenPrepare::optimizeShiftInst(BinaryOperator *Shift) {
return true;
}
/// Try to hoist a vector funnel shift above a select-of-splats shift amount.
///
/// Rewrites
///   fsh Op0, Op1, (select Cond, TVal, FVal)
/// into
///   select Cond, (fsh Op0, Op1, TVal), (fsh Op0, Op1, FVal)
/// when \p Fsh produces a vector, the target reports that vector shifts by a
/// scalar amount are cheap for that type, and both select arms are splats.
///
/// \param Fsh the fshl/fshr intrinsic call to examine; it is erased when the
///            rewrite fires.
/// \returns true if the IR was changed, false otherwise.
bool CodeGenPrepare::optimizeFunnelShift(IntrinsicInst *Fsh) {
  const Intrinsic::ID FshID = Fsh->getIntrinsicID();
  assert((FshID == Intrinsic::fshl || FshID == Intrinsic::fshr) &&
         "Expected a funnel shift");

  // This intentionally inverts a generic IR transform: it only pays off when
  // one general vector shift costs more than two shift-by-scalars. It is done
  // here rather than in SDAG because SDAG may not be able to tell, from within
  // a single basic block, that the select operands are splats.
  Type *VecTy = Fsh->getType();
  if (!VecTy->isVectorTy())
    return false;
  if (!TLI->isVectorShiftByScalarCheap(VecTy))
    return false;

  // The shift amount has to be a select with no users other than this call.
  Value *ShiftAmt = Fsh->getOperand(2);
  Value *Cond;
  Value *TVal;
  Value *FVal;
  const bool IsOneUseSelect = match(
      ShiftAmt,
      m_OneUse(m_Select(m_Value(Cond), m_Value(TVal), m_Value(FVal))));
  if (!IsOneUseSelect)
    return false;

  // Both arms must be splats, otherwise the new shifts are not by-scalar.
  if (!(isSplatValue(TVal) && isSplatValue(FVal)))
    return false;

  // Emit one funnel shift per select arm immediately before the original
  // call, then pick between the two results with the original condition.
  IRBuilder<> Builder(Fsh);
  Value *Op0 = Fsh->getOperand(0);
  Value *Op1 = Fsh->getOperand(1);
  Value *ShiftByTVal = Builder.CreateIntrinsic(FshID, VecTy, {Op0, Op1, TVal});
  Value *ShiftByFVal = Builder.CreateIntrinsic(FshID, VecTy, {Op0, Op1, FVal});
  Value *HoistedSel = Builder.CreateSelect(Cond, ShiftByTVal, ShiftByFVal);

  Fsh->replaceAllUsesWith(HoistedSel);
  Fsh->eraseFromParent();
  return true;
}
/// If we have a SelectInst that will likely profit from branch prediction,
/// turn it into a branch.
bool CodeGenPrepare::optimizeSelectInst(SelectInst *SI) {

View File

@ -1695,58 +1695,48 @@ define <32 x i8> @splatvar_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> %
define void @fancierRotate2(i32* %arr, i8* %control, i32 %rot0, i32 %rot1) {
; AVX1-LABEL: fancierRotate2:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vmovd %edx, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm9
; AVX1-NEXT: vmovd %ecx, %xmm1
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm10
; AVX1-NEXT: vmovd %edx, %xmm1
; AVX1-NEXT: vmovd %ecx, %xmm3
; AVX1-NEXT: movq $-1024, %rax # imm = 0xFC00
; AVX1-NEXT: vpxor %xmm8, %xmm8, %xmm8
; AVX1-NEXT: vextractf128 $1, %ymm9, %xmm11
; AVX1-NEXT: vextractf128 $1, %ymm10, %xmm4
; AVX1-NEXT: vmovaps {{.*#+}} xmm12 = [31,31,31,31]
; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [1065353216,1065353216,1065353216,1065353216]
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [31,31,31,31]
; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm2
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm9 = xmm2[0],zero,xmm2[1],zero
; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [32,32,32,32]
; AVX1-NEXT: vpsubd %xmm2, %xmm5, %xmm2
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm10 = xmm2[0],zero,xmm2[1],zero
; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm4
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm11 = xmm4[0],zero,xmm4[1],zero
; AVX1-NEXT: vpsubd %xmm4, %xmm5, %xmm4
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
; AVX1-NEXT: .p2align 4, 0x90
; AVX1-NEXT: .LBB8_1: # %loop
; AVX1-NEXT: # =>This Inner Loop Header: Depth=1
; AVX1-NEXT: vmovq {{.*#+}} xmm7 = mem[0],zero
; AVX1-NEXT: vpcmpeqb %xmm7, %xmm8, %xmm7
; AVX1-NEXT: vpmovsxbd %xmm7, %xmm2
; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[1,1,2,3]
; AVX1-NEXT: vpmovsxbd %xmm7, %xmm7
; AVX1-NEXT: vblendvps %xmm7, %xmm11, %xmm4, %xmm7
; AVX1-NEXT: vmovdqu 4096(%rdi,%rax,4), %xmm0
; AVX1-NEXT: vmovdqu 4112(%rdi,%rax,4), %xmm1
; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
; AVX1-NEXT: vandps %xmm7, %xmm12, %xmm7
; AVX1-NEXT: vpslld $23, %xmm7, %xmm7
; AVX1-NEXT: vpaddd %xmm6, %xmm7, %xmm7
; AVX1-NEXT: vcvttps2dq %xmm7, %xmm7
; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
; AVX1-NEXT: vpmuludq %xmm5, %xmm3, %xmm3
; AVX1-NEXT: vpmuludq %xmm7, %xmm1, %xmm1
; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[1,1,3,3]
; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm3[2,3],xmm5[4,5],xmm3[6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,0,2,2]
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
; AVX1-NEXT: vpor %xmm5, %xmm1, %xmm1
; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; AVX1-NEXT: vblendvps %xmm2, %xmm9, %xmm10, %xmm2
; AVX1-NEXT: vandps %xmm2, %xmm12, %xmm2
; AVX1-NEXT: vpslld $23, %xmm2, %xmm2
; AVX1-NEXT: vpaddd %xmm6, %xmm2, %xmm2
; AVX1-NEXT: vcvttps2dq %xmm2, %xmm2
; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[1,1,3,3]
; AVX1-NEXT: vpmuludq %xmm5, %xmm3, %xmm3
; AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,0,2,2]
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7]
; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vmovdqu %xmm0, 4096(%rdi,%rax,4)
; AVX1-NEXT: vmovdqu %xmm1, 4112(%rdi,%rax,4)
; AVX1-NEXT: vmovq {{.*#+}} xmm5 = mem[0],zero
; AVX1-NEXT: vpcmpeqb %xmm5, %xmm8, %xmm5
; AVX1-NEXT: vpmovsxbd %xmm5, %xmm6
; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,1,2,3]
; AVX1-NEXT: vpmovsxbd %xmm5, %xmm5
; AVX1-NEXT: vmovdqu 4096(%rdi,%rax,4), %xmm7
; AVX1-NEXT: vmovdqu 4112(%rdi,%rax,4), %xmm0
; AVX1-NEXT: vpslld %xmm9, %xmm7, %xmm1
; AVX1-NEXT: vpsrld %xmm10, %xmm7, %xmm2
; AVX1-NEXT: vpor %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpslld %xmm9, %xmm0, %xmm2
; AVX1-NEXT: vpsrld %xmm10, %xmm0, %xmm3
; AVX1-NEXT: vpor %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpslld %xmm11, %xmm7, %xmm3
; AVX1-NEXT: vpsrld %xmm4, %xmm7, %xmm7
; AVX1-NEXT: vpor %xmm7, %xmm3, %xmm3
; AVX1-NEXT: vblendvps %xmm6, %xmm1, %xmm3, %xmm1
; AVX1-NEXT: vpslld %xmm11, %xmm0, %xmm3
; AVX1-NEXT: vpsrld %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpor %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vblendvps %xmm5, %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vmovups %xmm1, 4096(%rdi,%rax,4)
; AVX1-NEXT: vmovups %xmm0, 4112(%rdi,%rax,4)
; AVX1-NEXT: addq $8, %rax
; AVX1-NEXT: jne .LBB8_1
; AVX1-NEXT: # %bb.2: # %exit

View File

@ -265,32 +265,113 @@ exit:
}
; PR37426 - https://bugs.llvm.org/show_bug.cgi?id=37426
; If we don't have real vector shift instructions (AVX1), convert the funnel
; shift into 2 funnel shifts and sink the splat shuffles into the loop.
define void @fancierRotate2(i32* %arr, i8* %control, i32 %rot0, i32 %rot1) {
; ALL-LABEL: @fancierRotate2(
; ALL-NEXT: entry:
; ALL-NEXT: [[I0:%.*]] = insertelement <8 x i32> undef, i32 [[ROT0:%.*]], i32 0
; ALL-NEXT: [[S0:%.*]] = shufflevector <8 x i32> [[I0]], <8 x i32> undef, <8 x i32> zeroinitializer
; ALL-NEXT: [[I1:%.*]] = insertelement <8 x i32> undef, i32 [[ROT1:%.*]], i32 0
; ALL-NEXT: [[S1:%.*]] = shufflevector <8 x i32> [[I1]], <8 x i32> undef, <8 x i32> zeroinitializer
; ALL-NEXT: br label [[LOOP:%.*]]
; ALL: loop:
; ALL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[LOOP]] ]
; ALL-NEXT: [[T0:%.*]] = getelementptr inbounds i8, i8* [[CONTROL:%.*]], i64 [[INDEX]]
; ALL-NEXT: [[T1:%.*]] = bitcast i8* [[T0]] to <8 x i8>*
; ALL-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i8>, <8 x i8>* [[T1]], align 1
; ALL-NEXT: [[T2:%.*]] = icmp eq <8 x i8> [[WIDE_LOAD]], zeroinitializer
; ALL-NEXT: [[SHAMT:%.*]] = select <8 x i1> [[T2]], <8 x i32> [[S0]], <8 x i32> [[S1]]
; ALL-NEXT: [[T4:%.*]] = getelementptr inbounds i32, i32* [[ARR:%.*]], i64 [[INDEX]]
; ALL-NEXT: [[T5:%.*]] = bitcast i32* [[T4]] to <8 x i32>*
; ALL-NEXT: [[WIDE_LOAD21:%.*]] = load <8 x i32>, <8 x i32>* [[T5]], align 4
; ALL-NEXT: [[ROT:%.*]] = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> [[WIDE_LOAD21]], <8 x i32> [[WIDE_LOAD21]], <8 x i32> [[SHAMT]])
; ALL-NEXT: store <8 x i32> [[ROT]], <8 x i32>* [[T5]], align 4
; ALL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 8
; ALL-NEXT: [[T7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; ALL-NEXT: br i1 [[T7]], label [[EXIT:%.*]], label [[LOOP]]
; ALL: exit:
; ALL-NEXT: ret void
; AVX1-LABEL: @fancierRotate2(
; AVX1-NEXT: entry:
; AVX1-NEXT: [[I0:%.*]] = insertelement <8 x i32> undef, i32 [[ROT0:%.*]], i32 0
; AVX1-NEXT: [[S0:%.*]] = shufflevector <8 x i32> [[I0]], <8 x i32> undef, <8 x i32> zeroinitializer
; AVX1-NEXT: [[I1:%.*]] = insertelement <8 x i32> undef, i32 [[ROT1:%.*]], i32 0
; AVX1-NEXT: [[S1:%.*]] = shufflevector <8 x i32> [[I1]], <8 x i32> undef, <8 x i32> zeroinitializer
; AVX1-NEXT: br label [[LOOP:%.*]]
; AVX1: loop:
; AVX1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[LOOP]] ]
; AVX1-NEXT: [[T0:%.*]] = getelementptr inbounds i8, i8* [[CONTROL:%.*]], i64 [[INDEX]]
; AVX1-NEXT: [[T1:%.*]] = bitcast i8* [[T0]] to <8 x i8>*
; AVX1-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i8>, <8 x i8>* [[T1]], align 1
; AVX1-NEXT: [[T2:%.*]] = icmp eq <8 x i8> [[WIDE_LOAD]], zeroinitializer
; AVX1-NEXT: [[SHAMT:%.*]] = select <8 x i1> [[T2]], <8 x i32> [[S0]], <8 x i32> [[S1]]
; AVX1-NEXT: [[T4:%.*]] = getelementptr inbounds i32, i32* [[ARR:%.*]], i64 [[INDEX]]
; AVX1-NEXT: [[T5:%.*]] = bitcast i32* [[T4]] to <8 x i32>*
; AVX1-NEXT: [[WIDE_LOAD21:%.*]] = load <8 x i32>, <8 x i32>* [[T5]], align 4
; AVX1-NEXT: [[TMP0:%.*]] = shufflevector <8 x i32> [[I0]], <8 x i32> undef, <8 x i32> zeroinitializer
; AVX1-NEXT: [[TMP1:%.*]] = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> [[WIDE_LOAD21]], <8 x i32> [[WIDE_LOAD21]], <8 x i32> [[TMP0]])
; AVX1-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[I1]], <8 x i32> undef, <8 x i32> zeroinitializer
; AVX1-NEXT: [[TMP3:%.*]] = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> [[WIDE_LOAD21]], <8 x i32> [[WIDE_LOAD21]], <8 x i32> [[TMP2]])
; AVX1-NEXT: [[TMP4:%.*]] = select <8 x i1> [[T2]], <8 x i32> [[TMP1]], <8 x i32> [[TMP3]]
; AVX1-NEXT: store <8 x i32> [[TMP4]], <8 x i32>* [[T5]], align 4
; AVX1-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 8
; AVX1-NEXT: [[T7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; AVX1-NEXT: br i1 [[T7]], label [[EXIT:%.*]], label [[LOOP]]
; AVX1: exit:
; AVX1-NEXT: ret void
;
; AVX2-LABEL: @fancierRotate2(
; AVX2-NEXT: entry:
; AVX2-NEXT: [[I0:%.*]] = insertelement <8 x i32> undef, i32 [[ROT0:%.*]], i32 0
; AVX2-NEXT: [[S0:%.*]] = shufflevector <8 x i32> [[I0]], <8 x i32> undef, <8 x i32> zeroinitializer
; AVX2-NEXT: [[I1:%.*]] = insertelement <8 x i32> undef, i32 [[ROT1:%.*]], i32 0
; AVX2-NEXT: [[S1:%.*]] = shufflevector <8 x i32> [[I1]], <8 x i32> undef, <8 x i32> zeroinitializer
; AVX2-NEXT: br label [[LOOP:%.*]]
; AVX2: loop:
; AVX2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[LOOP]] ]
; AVX2-NEXT: [[T0:%.*]] = getelementptr inbounds i8, i8* [[CONTROL:%.*]], i64 [[INDEX]]
; AVX2-NEXT: [[T1:%.*]] = bitcast i8* [[T0]] to <8 x i8>*
; AVX2-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i8>, <8 x i8>* [[T1]], align 1
; AVX2-NEXT: [[T2:%.*]] = icmp eq <8 x i8> [[WIDE_LOAD]], zeroinitializer
; AVX2-NEXT: [[SHAMT:%.*]] = select <8 x i1> [[T2]], <8 x i32> [[S0]], <8 x i32> [[S1]]
; AVX2-NEXT: [[T4:%.*]] = getelementptr inbounds i32, i32* [[ARR:%.*]], i64 [[INDEX]]
; AVX2-NEXT: [[T5:%.*]] = bitcast i32* [[T4]] to <8 x i32>*
; AVX2-NEXT: [[WIDE_LOAD21:%.*]] = load <8 x i32>, <8 x i32>* [[T5]], align 4
; AVX2-NEXT: [[ROT:%.*]] = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> [[WIDE_LOAD21]], <8 x i32> [[WIDE_LOAD21]], <8 x i32> [[SHAMT]])
; AVX2-NEXT: store <8 x i32> [[ROT]], <8 x i32>* [[T5]], align 4
; AVX2-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 8
; AVX2-NEXT: [[T7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; AVX2-NEXT: br i1 [[T7]], label [[EXIT:%.*]], label [[LOOP]]
; AVX2: exit:
; AVX2-NEXT: ret void
;
; AVX512BW-LABEL: @fancierRotate2(
; AVX512BW-NEXT: entry:
; AVX512BW-NEXT: [[I0:%.*]] = insertelement <8 x i32> undef, i32 [[ROT0:%.*]], i32 0
; AVX512BW-NEXT: [[S0:%.*]] = shufflevector <8 x i32> [[I0]], <8 x i32> undef, <8 x i32> zeroinitializer
; AVX512BW-NEXT: [[I1:%.*]] = insertelement <8 x i32> undef, i32 [[ROT1:%.*]], i32 0
; AVX512BW-NEXT: [[S1:%.*]] = shufflevector <8 x i32> [[I1]], <8 x i32> undef, <8 x i32> zeroinitializer
; AVX512BW-NEXT: br label [[LOOP:%.*]]
; AVX512BW: loop:
; AVX512BW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[LOOP]] ]
; AVX512BW-NEXT: [[T0:%.*]] = getelementptr inbounds i8, i8* [[CONTROL:%.*]], i64 [[INDEX]]
; AVX512BW-NEXT: [[T1:%.*]] = bitcast i8* [[T0]] to <8 x i8>*
; AVX512BW-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i8>, <8 x i8>* [[T1]], align 1
; AVX512BW-NEXT: [[T2:%.*]] = icmp eq <8 x i8> [[WIDE_LOAD]], zeroinitializer
; AVX512BW-NEXT: [[SHAMT:%.*]] = select <8 x i1> [[T2]], <8 x i32> [[S0]], <8 x i32> [[S1]]
; AVX512BW-NEXT: [[T4:%.*]] = getelementptr inbounds i32, i32* [[ARR:%.*]], i64 [[INDEX]]
; AVX512BW-NEXT: [[T5:%.*]] = bitcast i32* [[T4]] to <8 x i32>*
; AVX512BW-NEXT: [[WIDE_LOAD21:%.*]] = load <8 x i32>, <8 x i32>* [[T5]], align 4
; AVX512BW-NEXT: [[ROT:%.*]] = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> [[WIDE_LOAD21]], <8 x i32> [[WIDE_LOAD21]], <8 x i32> [[SHAMT]])
; AVX512BW-NEXT: store <8 x i32> [[ROT]], <8 x i32>* [[T5]], align 4
; AVX512BW-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 8
; AVX512BW-NEXT: [[T7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; AVX512BW-NEXT: br i1 [[T7]], label [[EXIT:%.*]], label [[LOOP]]
; AVX512BW: exit:
; AVX512BW-NEXT: ret void
;
; XOP-LABEL: @fancierRotate2(
; XOP-NEXT: entry:
; XOP-NEXT: [[I0:%.*]] = insertelement <8 x i32> undef, i32 [[ROT0:%.*]], i32 0
; XOP-NEXT: [[S0:%.*]] = shufflevector <8 x i32> [[I0]], <8 x i32> undef, <8 x i32> zeroinitializer
; XOP-NEXT: [[I1:%.*]] = insertelement <8 x i32> undef, i32 [[ROT1:%.*]], i32 0
; XOP-NEXT: [[S1:%.*]] = shufflevector <8 x i32> [[I1]], <8 x i32> undef, <8 x i32> zeroinitializer
; XOP-NEXT: br label [[LOOP:%.*]]
; XOP: loop:
; XOP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[LOOP]] ]
; XOP-NEXT: [[T0:%.*]] = getelementptr inbounds i8, i8* [[CONTROL:%.*]], i64 [[INDEX]]
; XOP-NEXT: [[T1:%.*]] = bitcast i8* [[T0]] to <8 x i8>*
; XOP-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i8>, <8 x i8>* [[T1]], align 1
; XOP-NEXT: [[T2:%.*]] = icmp eq <8 x i8> [[WIDE_LOAD]], zeroinitializer
; XOP-NEXT: [[SHAMT:%.*]] = select <8 x i1> [[T2]], <8 x i32> [[S0]], <8 x i32> [[S1]]
; XOP-NEXT: [[T4:%.*]] = getelementptr inbounds i32, i32* [[ARR:%.*]], i64 [[INDEX]]
; XOP-NEXT: [[T5:%.*]] = bitcast i32* [[T4]] to <8 x i32>*
; XOP-NEXT: [[WIDE_LOAD21:%.*]] = load <8 x i32>, <8 x i32>* [[T5]], align 4
; XOP-NEXT: [[ROT:%.*]] = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> [[WIDE_LOAD21]], <8 x i32> [[WIDE_LOAD21]], <8 x i32> [[SHAMT]])
; XOP-NEXT: store <8 x i32> [[ROT]], <8 x i32>* [[T5]], align 4
; XOP-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 8
; XOP-NEXT: [[T7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; XOP-NEXT: br i1 [[T7]], label [[EXIT:%.*]], label [[LOOP]]
; XOP: exit:
; XOP-NEXT: ret void
;
entry:
%i0 = insertelement <8 x i32> undef, i32 %rot0, i32 0