[CodeGen] Replace the reused stores' chain for extractelt expansion.

This fixes a subtle issue that was introduced in r205153.

When reusing a store for the extractelement expansion (to load directly
from it, instead of going through the stack), later stores to the
same location might have overwritten the data we were expecting to
extract from.

To fix that, we need to explicitly replace the chain going out of the
reused store, so that later stores also have an explicit dependency on
the generated element-extracting loads, and can't clobber them.

rdar://20066785
Differential Revision: http://reviews.llvm.org/D8180

llvm-svn: 231721
This commit is contained in:
Ahmed Bougacha 2015-03-09 22:51:05 +00:00
parent 540469d8a2
commit c809761dc0
3 changed files with 205 additions and 128 deletions

View File

@ -1442,13 +1442,27 @@ SDValue SelectionDAGLegalize::ExpandExtractFromVectorThroughStack(SDValue Op) {
Idx = DAG.getZExtOrTrunc(Idx, dl, TLI.getPointerTy());
StackPtr = DAG.getNode(ISD::ADD, dl, Idx.getValueType(), Idx, StackPtr);
SDValue NewLoad;
if (Op.getValueType().isVector())
return DAG.getLoad(Op.getValueType(), dl, Ch, StackPtr,MachinePointerInfo(),
false, false, false, 0);
return DAG.getExtLoad(ISD::EXTLOAD, dl, Op.getValueType(), Ch, StackPtr,
MachinePointerInfo(),
Vec.getValueType().getVectorElementType(),
false, false, false, 0);
NewLoad = DAG.getLoad(Op.getValueType(), dl, Ch, StackPtr,
MachinePointerInfo(), false, false, false, 0);
else
NewLoad = DAG.getExtLoad(
ISD::EXTLOAD, dl, Op.getValueType(), Ch, StackPtr, MachinePointerInfo(),
Vec.getValueType().getVectorElementType(), false, false, false, 0);
// Replace the chain going out of the store, by the one out of the load.
DAG.ReplaceAllUsesOfValueWith(Ch, SDValue(NewLoad.getNode(), 1));
// We introduced a cycle though, so update the loads operands, making sure
// to use the original store's chain as an incoming chain.
SmallVector<SDValue, 6> NewLoadOperands(NewLoad->op_begin(),
NewLoad->op_end());
NewLoadOperands[0] = Ch;
NewLoad =
SDValue(DAG.UpdateNodeOperands(NewLoad.getNode(), NewLoadOperands), 0);
return NewLoad;
}
SDValue SelectionDAGLegalize::ExpandInsertToVectorThroughStack(SDValue Op) {

View File

@ -0,0 +1,57 @@
; RUN: llc < %s -mtriple i386-apple-darwin -mcpu=yonah | FileCheck %s
target datalayout = "e-m:o-p:32:32-f64:32:64-f80:128-n8:16:32-S128"
; Make sure we don't break load/store ordering when turning an extractelement
; into loads, off the stack or a previous store.
; Be very explicit about the ordering/stack offsets.
; CHECK-LABEL: test_extractelement_legalization_storereuse:
; CHECK: # BB#0
; CHECK-NEXT: pushl %ebx
; CHECK-NEXT: pushl %edi
; CHECK-NEXT: pushl %esi
; CHECK-NEXT: movl 16(%esp), %eax
; CHECK-NEXT: movl 24(%esp), %ecx
; CHECK-NEXT: movl 20(%esp), %edx
; CHECK-NEXT: paddd (%edx), %xmm0
; CHECK-NEXT: movdqa %xmm0, (%edx)
; CHECK-NEXT: shll $4, %ecx
; CHECK-NEXT: movl (%ecx,%edx), %esi
; CHECK-NEXT: movl 12(%ecx,%edx), %edi
; CHECK-NEXT: movl 8(%ecx,%edx), %ebx
; CHECK-NEXT: movl 4(%ecx,%edx), %edx
; CHECK-NEXT: movl %esi, 12(%eax,%ecx)
; CHECK-NEXT: movl %edx, (%eax,%ecx)
; CHECK-NEXT: movl %ebx, 8(%eax,%ecx)
; CHECK-NEXT: movl %edi, 4(%eax,%ecx)
; CHECK-NEXT: popl %esi
; CHECK-NEXT: popl %edi
; CHECK-NEXT: popl %ebx
; CHECK-NEXT: retl
; Regression input: a vector is stored to memory and then has four elements
; extracted at variable indices, each of which is stored through %x.
;   %a - vector operand added to the value loaded from %y
;   %x - destination for the four extracted scalars
;   %y - reinterpreted as a <4 x i32>*; loaded from, then overwritten with the sum
;   %i - scaled by 4 to form the base of the variable element indices
define void @test_extractelement_legalization_storereuse(<4 x i32> %a, i32* nocapture %x, i32* nocapture readonly %y, i32 %i) #0 {
entry:
%0 = bitcast i32* %y to <4 x i32>*
%1 = load <4 x i32>, <4 x i32>* %0, align 16
%am = add <4 x i32> %a, %1
; The sum is written back to the location it was loaded from.
; NOTE(review): presumably this is the store that legalization reuses as the
; source for the element loads below - confirm against the legalizer.
store <4 x i32> %am, <4 x i32>* %0, align 16
; Indices are not constants: ip0 = i << 2, and ip1..ip3 = ip0 | 1..3.
%ip0 = shl nsw i32 %i, 2
%ip1 = or i32 %ip0, 1
%ip2 = or i32 %ip0, 2
%ip3 = or i32 %ip0, 3
; Each extracted element goes to a different slot of %x than the index it
; was read from: am[ip0] -> x[ip3], am[ip1] -> x[ip0], am[ip2] -> x[ip2],
; am[ip3] -> x[ip1].
%vecext = extractelement <4 x i32> %am, i32 %ip0
%arrayidx = getelementptr inbounds i32, i32* %x, i32 %ip3
store i32 %vecext, i32* %arrayidx, align 4
%vecext5 = extractelement <4 x i32> %am, i32 %ip1
%arrayidx8 = getelementptr inbounds i32, i32* %x, i32 %ip0
store i32 %vecext5, i32* %arrayidx8, align 4
%vecext11 = extractelement <4 x i32> %am, i32 %ip2
%arrayidx14 = getelementptr inbounds i32, i32* %x, i32 %ip2
store i32 %vecext11, i32* %arrayidx14, align 4
%vecext17 = extractelement <4 x i32> %am, i32 %ip3
%arrayidx20 = getelementptr inbounds i32, i32* %x, i32 %ip1
store i32 %vecext17, i32* %arrayidx20, align 4
ret void
}
attributes #0 = { nounwind }

View File

@ -460,6 +460,9 @@ define <16 x i8> @test7(<16 x i8> %a) #0 {
;
; SSE-LABEL: test7:
; SSE: # BB#0:
; SSE-NEXT: pushq %rbp
; SSE-NEXT: pushq %r14
; SSE-NEXT: pushq %rbx
; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSE-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax
; SSE-NEXT: imull $-109, %eax, %ecx
@ -471,156 +474,156 @@ define <16 x i8> @test7(<16 x i8> %a) #0 {
; SSE-NEXT: addb %al, %cl
; SSE-NEXT: movzbl %cl, %eax
; SSE-NEXT: movd %eax, %xmm0
; SSE-NEXT: movsbl -{{[0-9]+}}(%rsp), %r14d
; SSE-NEXT: movsbl -{{[0-9]+}}(%rsp), %edx
; SSE-NEXT: movsbl -{{[0-9]+}}(%rsp), %r9d
; SSE-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax
; SSE-NEXT: imull $-109, %eax, %ecx
; SSE-NEXT: shrl $8, %ecx
; SSE-NEXT: addb %al, %cl
; SSE-NEXT: movb %cl, %al
; SSE-NEXT: shrb $7, %al
; SSE-NEXT: sarb $2, %cl
; SSE-NEXT: addb %al, %cl
; SSE-NEXT: movzbl %cl, %eax
; SSE-NEXT: movd %eax, %xmm1
; SSE-NEXT: movsbl -{{[0-9]+}}(%rsp), %r11d
; SSE-NEXT: movsbl -{{[0-9]+}}(%rsp), %ecx
; SSE-NEXT: movsbl -{{[0-9]+}}(%rsp), %r8d
; SSE-NEXT: movsbl -{{[0-9]+}}(%rsp), %esi
; SSE-NEXT: imull $-109, %esi, %edi
; SSE-NEXT: shrl $8, %edi
; SSE-NEXT: addb %sil, %dil
; SSE-NEXT: movb %dil, %bl
; SSE-NEXT: shrb $7, %bl
; SSE-NEXT: sarb $2, %dil
; SSE-NEXT: addb %bl, %dil
; SSE-NEXT: movzbl %dil, %esi
; SSE-NEXT: movd %esi, %xmm1
; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSE-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax
; SSE-NEXT: imull $-109, %eax, %ecx
; SSE-NEXT: shrl $8, %ecx
; SSE-NEXT: addb %al, %cl
; SSE-NEXT: movb %cl, %al
; SSE-NEXT: imull $-109, %eax, %esi
; SSE-NEXT: shrl $8, %esi
; SSE-NEXT: addb %al, %sil
; SSE-NEXT: movb %sil, %al
; SSE-NEXT: shrb $7, %al
; SSE-NEXT: sarb $2, %cl
; SSE-NEXT: addb %al, %cl
; SSE-NEXT: movzbl %cl, %eax
; SSE-NEXT: sarb $2, %sil
; SSE-NEXT: addb %al, %sil
; SSE-NEXT: movzbl %sil, %eax
; SSE-NEXT: movd %eax, %xmm2
; SSE-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax
; SSE-NEXT: imull $-109, %eax, %ecx
; SSE-NEXT: shrl $8, %ecx
; SSE-NEXT: addb %al, %cl
; SSE-NEXT: movb %cl, %al
; SSE-NEXT: movsbl -{{[0-9]+}}(%rsp), %ebp
; SSE-NEXT: movsbl -{{[0-9]+}}(%rsp), %esi
; SSE-NEXT: movsbl -{{[0-9]+}}(%rsp), %r10d
; SSE-NEXT: movsbl -{{[0-9]+}}(%rsp), %edi
; SSE-NEXT: imull $-109, %edi, %ebx
; SSE-NEXT: shrl $8, %ebx
; SSE-NEXT: addb %dil, %bl
; SSE-NEXT: movb %bl, %al
; SSE-NEXT: shrb $7, %al
; SSE-NEXT: sarb $2, %cl
; SSE-NEXT: addb %al, %cl
; SSE-NEXT: movzbl %cl, %eax
; SSE-NEXT: sarb $2, %bl
; SSE-NEXT: addb %al, %bl
; SSE-NEXT: movzbl %bl, %eax
; SSE-NEXT: movd %eax, %xmm0
; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax
; SSE-NEXT: imull $-109, %eax, %ecx
; SSE-NEXT: shrl $8, %ecx
; SSE-NEXT: addb %al, %cl
; SSE-NEXT: movb %cl, %al
; SSE-NEXT: shrb $7, %al
; SSE-NEXT: sarb $2, %cl
; SSE-NEXT: addb %al, %cl
; SSE-NEXT: movzbl %cl, %eax
; SSE-NEXT: imull $-109, %edx, %eax
; SSE-NEXT: shrl $8, %eax
; SSE-NEXT: addb %dl, %al
; SSE-NEXT: movb %al, %dl
; SSE-NEXT: shrb $7, %dl
; SSE-NEXT: sarb $2, %al
; SSE-NEXT: addb %dl, %al
; SSE-NEXT: movzbl %al, %eax
; SSE-NEXT: movd %eax, %xmm1
; SSE-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax
; SSE-NEXT: imull $-109, %eax, %ecx
; SSE-NEXT: shrl $8, %ecx
; SSE-NEXT: addb %al, %cl
; SSE-NEXT: movb %cl, %al
; SSE-NEXT: shrb $7, %al
; SSE-NEXT: sarb $2, %cl
; SSE-NEXT: addb %al, %cl
; SSE-NEXT: movzbl %cl, %eax
; SSE-NEXT: imull $-109, %esi, %eax
; SSE-NEXT: shrl $8, %eax
; SSE-NEXT: addb %sil, %al
; SSE-NEXT: movb %al, %dl
; SSE-NEXT: shrb $7, %dl
; SSE-NEXT: sarb $2, %al
; SSE-NEXT: addb %dl, %al
; SSE-NEXT: movzbl %al, %eax
; SSE-NEXT: movd %eax, %xmm2
; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
; SSE-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax
; SSE-NEXT: imull $-109, %eax, %ecx
; SSE-NEXT: shrl $8, %ecx
; SSE-NEXT: addb %al, %cl
; SSE-NEXT: movb %cl, %al
; SSE-NEXT: shrb $7, %al
; SSE-NEXT: sarb $2, %cl
; SSE-NEXT: addb %al, %cl
; SSE-NEXT: movzbl %cl, %eax
; SSE-NEXT: imull $-109, %ecx, %eax
; SSE-NEXT: shrl $8, %eax
; SSE-NEXT: addb %cl, %al
; SSE-NEXT: movb %al, %cl
; SSE-NEXT: shrb $7, %cl
; SSE-NEXT: sarb $2, %al
; SSE-NEXT: addb %cl, %al
; SSE-NEXT: movzbl %al, %eax
; SSE-NEXT: movd %eax, %xmm3
; SSE-NEXT: movsbl -{{[0-9]+}}(%rsp), %ecx
; SSE-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax
; SSE-NEXT: imull $-109, %eax, %ecx
; SSE-NEXT: shrl $8, %ecx
; SSE-NEXT: addb %al, %cl
; SSE-NEXT: movb %cl, %al
; SSE-NEXT: imull $-109, %eax, %edx
; SSE-NEXT: shrl $8, %edx
; SSE-NEXT: addb %al, %dl
; SSE-NEXT: movb %dl, %al
; SSE-NEXT: shrb $7, %al
; SSE-NEXT: sarb $2, %cl
; SSE-NEXT: addb %al, %cl
; SSE-NEXT: movzbl %cl, %eax
; SSE-NEXT: sarb $2, %dl
; SSE-NEXT: addb %al, %dl
; SSE-NEXT: movzbl %dl, %eax
; SSE-NEXT: movd %eax, %xmm1
; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSE-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax
; SSE-NEXT: imull $-109, %eax, %ecx
; SSE-NEXT: shrl $8, %ecx
; SSE-NEXT: addb %al, %cl
; SSE-NEXT: movb %cl, %al
; SSE-NEXT: shrb $7, %al
; SSE-NEXT: sarb $2, %cl
; SSE-NEXT: addb %al, %cl
; SSE-NEXT: movzbl %cl, %eax
; SSE-NEXT: imull $-109, %r14d, %eax
; SSE-NEXT: shrl $8, %eax
; SSE-NEXT: addb %r14b, %al
; SSE-NEXT: movb %al, %dl
; SSE-NEXT: shrb $7, %dl
; SSE-NEXT: sarb $2, %al
; SSE-NEXT: addb %dl, %al
; SSE-NEXT: movzbl %al, %eax
; SSE-NEXT: movd %eax, %xmm2
; SSE-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax
; SSE-NEXT: imull $-109, %eax, %ecx
; SSE-NEXT: shrl $8, %ecx
; SSE-NEXT: addb %al, %cl
; SSE-NEXT: movb %cl, %al
; SSE-NEXT: shrb $7, %al
; SSE-NEXT: sarb $2, %cl
; SSE-NEXT: addb %al, %cl
; SSE-NEXT: movzbl %cl, %eax
; SSE-NEXT: imull $-109, %ebp, %eax
; SSE-NEXT: shrl $8, %eax
; SSE-NEXT: addb %bpl, %al
; SSE-NEXT: movb %al, %dl
; SSE-NEXT: shrb $7, %dl
; SSE-NEXT: sarb $2, %al
; SSE-NEXT: addb %dl, %al
; SSE-NEXT: movzbl %al, %eax
; SSE-NEXT: movd %eax, %xmm0
; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
; SSE-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax
; SSE-NEXT: imull $-109, %eax, %ecx
; SSE-NEXT: shrl $8, %ecx
; SSE-NEXT: addb %al, %cl
; SSE-NEXT: movb %cl, %al
; SSE-NEXT: shrb $7, %al
; SSE-NEXT: sarb $2, %cl
; SSE-NEXT: addb %al, %cl
; SSE-NEXT: movzbl %cl, %eax
; SSE-NEXT: imull $-109, %r11d, %eax
; SSE-NEXT: shrl $8, %eax
; SSE-NEXT: addb %r11b, %al
; SSE-NEXT: movb %al, %dl
; SSE-NEXT: shrb $7, %dl
; SSE-NEXT: sarb $2, %al
; SSE-NEXT: addb %dl, %al
; SSE-NEXT: movzbl %al, %eax
; SSE-NEXT: movd %eax, %xmm3
; SSE-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax
; SSE-NEXT: imull $-109, %eax, %ecx
; SSE-NEXT: shrl $8, %ecx
; SSE-NEXT: addb %al, %cl
; SSE-NEXT: movb %cl, %al
; SSE-NEXT: shrb $7, %al
; SSE-NEXT: sarb $2, %cl
; SSE-NEXT: addb %al, %cl
; SSE-NEXT: movzbl %cl, %eax
; SSE-NEXT: imull $-109, %ecx, %eax
; SSE-NEXT: shrl $8, %eax
; SSE-NEXT: addb %cl, %al
; SSE-NEXT: movb %al, %cl
; SSE-NEXT: shrb $7, %cl
; SSE-NEXT: sarb $2, %al
; SSE-NEXT: addb %cl, %al
; SSE-NEXT: movzbl %al, %eax
; SSE-NEXT: movd %eax, %xmm2
; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
; SSE-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax
; SSE-NEXT: imull $-109, %eax, %ecx
; SSE-NEXT: shrl $8, %ecx
; SSE-NEXT: addb %al, %cl
; SSE-NEXT: movb %cl, %al
; SSE-NEXT: shrb $7, %al
; SSE-NEXT: sarb $2, %cl
; SSE-NEXT: addb %al, %cl
; SSE-NEXT: movzbl %cl, %eax
; SSE-NEXT: imull $-109, %r9d, %eax
; SSE-NEXT: shrl $8, %eax
; SSE-NEXT: addb %r9b, %al
; SSE-NEXT: movb %al, %cl
; SSE-NEXT: shrb $7, %cl
; SSE-NEXT: sarb $2, %al
; SSE-NEXT: addb %cl, %al
; SSE-NEXT: movzbl %al, %eax
; SSE-NEXT: movd %eax, %xmm0
; SSE-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax
; SSE-NEXT: imull $-109, %eax, %ecx
; SSE-NEXT: shrl $8, %ecx
; SSE-NEXT: addb %al, %cl
; SSE-NEXT: movb %cl, %al
; SSE-NEXT: shrb $7, %al
; SSE-NEXT: sarb $2, %cl
; SSE-NEXT: addb %al, %cl
; SSE-NEXT: movzbl %cl, %eax
; SSE-NEXT: imull $-109, %r10d, %eax
; SSE-NEXT: shrl $8, %eax
; SSE-NEXT: addb %r10b, %al
; SSE-NEXT: movb %al, %cl
; SSE-NEXT: shrb $7, %cl
; SSE-NEXT: sarb $2, %al
; SSE-NEXT: addb %cl, %al
; SSE-NEXT: movzbl %al, %eax
; SSE-NEXT: movd %eax, %xmm3
; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
; SSE-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax
; SSE-NEXT: imull $-109, %eax, %ecx
; SSE-NEXT: shrl $8, %ecx
; SSE-NEXT: addb %al, %cl
; SSE-NEXT: movb %cl, %al
; SSE-NEXT: shrb $7, %al
; SSE-NEXT: sarb $2, %cl
; SSE-NEXT: addb %al, %cl
; SSE-NEXT: movzbl %cl, %eax
; SSE-NEXT: imull $-109, %r8d, %eax
; SSE-NEXT: shrl $8, %eax
; SSE-NEXT: addb %r8b, %al
; SSE-NEXT: movb %al, %cl
; SSE-NEXT: shrb $7, %cl
; SSE-NEXT: sarb $2, %al
; SSE-NEXT: addb %cl, %al
; SSE-NEXT: movzbl %al, %eax
; SSE-NEXT: movd %eax, %xmm4
; SSE-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax
; SSE-NEXT: imull $-109, %eax, %ecx
@ -636,6 +639,9 @@ define <16 x i8> @test7(<16 x i8> %a) #0 {
; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE-NEXT: popq %rbx
; SSE-NEXT: popq %r14
; SSE-NEXT: popq %rbp
; SSE-NEXT: retq
;
; AVX-LABEL: test7: