diff --git a/llvm/include/llvm/Target/TargetLowering.h b/llvm/include/llvm/Target/TargetLowering.h index fb43ef19a645..a68bd9cdf066 100644 --- a/llvm/include/llvm/Target/TargetLowering.h +++ b/llvm/include/llvm/Target/TargetLowering.h @@ -3118,6 +3118,13 @@ public: EVT DataVT, SelectionDAG &DAG, bool IsCompressedMemory) const; + /// Get a pointer to vector element \p Idx located in memory for a vector of + /// type \p VecVT starting at a base address of \p VecPtr. If \p Idx is out of + /// bounds the returned pointer is unspecified, but will be within the vector + /// bounds. + SDValue getVectorElementPointer(SelectionDAG &DAG, SDValue VecPtr, EVT VecVT, + SDValue Idx) const; + //===--------------------------------------------------------------------===// // Instruction Emitting Hooks // diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp index 3485e35e6f5d..b0028252836a 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp @@ -330,8 +330,6 @@ SDValue SelectionDAGLegalize::PerformInsertVectorEltInMemory(SDValue Vec, // supported by the target. EVT VT = Tmp1.getValueType(); EVT EltVT = VT.getVectorElementType(); - EVT IdxVT = Tmp3.getValueType(); - EVT PtrVT = TLI.getPointerTy(DAG.getDataLayout()); SDValue StackPtr = DAG.CreateStackTemporary(VT); int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex(); @@ -341,13 +339,8 @@ SDValue SelectionDAGLegalize::PerformInsertVectorEltInMemory(SDValue Vec, DAG.getEntryNode(), dl, Tmp1, StackPtr, MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI)); - // Truncate or zero extend offset to target pointer type. - Tmp3 = DAG.getZExtOrTrunc(Tmp3, dl, PtrVT); - // Add the offset to the index. - unsigned EltSize = EltVT.getSizeInBits()/8; - Tmp3 = DAG.getNode(ISD::MUL, dl, IdxVT, Tmp3, - DAG.getConstant(EltSize, dl, IdxVT)); - SDValue StackPtr2 = DAG.getNode(ISD::ADD, dl, IdxVT, Tmp3, StackPtr); + SDValue StackPtr2 = TLI.getVectorElementPointer(DAG, StackPtr, VT, Tmp3); + // Store the scalar value. Ch = DAG.getTruncStore(Ch, dl, Tmp2, StackPtr2, MachinePointerInfo(), EltVT); // Load the updated vector. @@ -1209,20 +1202,16 @@ SDValue SelectionDAGLegalize::ExpandExtractFromVectorThroughStack(SDValue Op) { } } + EVT VecVT = Vec.getValueType(); + if (!Ch.getNode()) { // Store the value to a temporary stack slot, then LOAD the returned part. - StackPtr = DAG.CreateStackTemporary(Vec.getValueType()); + StackPtr = DAG.CreateStackTemporary(VecVT); Ch = DAG.getStore(DAG.getEntryNode(), dl, Vec, StackPtr, MachinePointerInfo()); } - // Add the offset to the index. - unsigned EltSize = Vec.getScalarValueSizeInBits() / 8; - Idx = DAG.getNode(ISD::MUL, dl, Idx.getValueType(), Idx, - DAG.getConstant(EltSize, SDLoc(Vec), Idx.getValueType())); - - Idx = DAG.getZExtOrTrunc(Idx, dl, TLI.getPointerTy(DAG.getDataLayout())); - StackPtr = DAG.getNode(ISD::ADD, dl, Idx.getValueType(), Idx, StackPtr); + StackPtr = TLI.getVectorElementPointer(DAG, StackPtr, VecVT, Idx); SDValue NewLoad; @@ -1232,7 +1221,7 @@ SDValue SelectionDAGLegalize::ExpandExtractFromVectorThroughStack(SDValue Op) { else NewLoad = DAG.getExtLoad(ISD::EXTLOAD, dl, Op.getValueType(), Ch, StackPtr, MachinePointerInfo(), - Vec.getValueType().getVectorElementType()); + VecVT.getVectorElementType()); // Replace the chain going out of the store, by the one out of the load.
DAG.ReplaceAllUsesOfValueWith(Ch, SDValue(NewLoad.getNode(), 1)); @@ -1256,8 +1245,8 @@ SDValue SelectionDAGLegalize::ExpandInsertToVectorThroughStack(SDValue Op) { SDLoc dl(Op); // Store the value to a temporary stack slot, then LOAD the returned part. - - SDValue StackPtr = DAG.CreateStackTemporary(Vec.getValueType()); + EVT VecVT = Vec.getValueType(); + SDValue StackPtr = DAG.CreateStackTemporary(VecVT); int FI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex(); MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI); @@ -1266,16 +1255,7 @@ SDValue SelectionDAGLegalize::ExpandInsertToVectorThroughStack(SDValue Op) { SDValue Ch = DAG.getStore(DAG.getEntryNode(), dl, Vec, StackPtr, PtrInfo); // Then store the inserted part. - - // Add the offset to the index. - unsigned EltSize = Vec.getScalarValueSizeInBits() / 8; - - Idx = DAG.getNode(ISD::MUL, dl, Idx.getValueType(), Idx, - DAG.getConstant(EltSize, SDLoc(Vec), Idx.getValueType())); - Idx = DAG.getZExtOrTrunc(Idx, dl, TLI.getPointerTy(DAG.getDataLayout())); - - SDValue SubStackPtr = DAG.getNode(ISD::ADD, dl, Idx.getValueType(), Idx, - StackPtr); + SDValue SubStackPtr = TLI.getVectorElementPointer(DAG, StackPtr, VecVT, Idx); // Store the subvector. Ch = DAG.getStore(Ch, dl, Part, SubStackPtr, MachinePointerInfo()); diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp index 693f5e2120a7..cf19d75676cd 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp @@ -1021,22 +1021,6 @@ void DAGTypeLegalizer::GetPairElements(SDValue Pair, DAG.getIntPtrConstant(1, dl)); } -SDValue DAGTypeLegalizer::GetVectorElementPointer(SDValue VecPtr, EVT EltVT, - SDValue Index) { - SDLoc dl(Index); - // Make sure the index type is big enough to compute in. - Index = DAG.getZExtOrTrunc(Index, dl, TLI.getPointerTy(DAG.getDataLayout())); - - // Calculate the element offset and add it to the pointer. - unsigned EltSize = EltVT.getSizeInBits() / 8; // FIXME: should be ABI size. - assert(EltSize * 8 == EltVT.getSizeInBits() && - "Converting bits to bytes lost precision"); - - Index = DAG.getNode(ISD::MUL, dl, Index.getValueType(), Index, - DAG.getConstant(EltSize, dl, Index.getValueType())); - return DAG.getNode(ISD::ADD, dl, Index.getValueType(), Index, VecPtr); -} - /// Build an integer with low bits Lo and high bits Hi. SDValue DAGTypeLegalizer::JoinIntegers(SDValue Lo, SDValue Hi) { // Arbitrarily use dlHi for result SDLoc diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h index d1022af69477..bf09f4696f94 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h @@ -173,7 +173,6 @@ private: /// input operand is returned.
SDValue DisintegrateMERGE_VALUES(SDNode *N, unsigned ResNo); - SDValue GetVectorElementPointer(SDValue VecPtr, EVT EltVT, SDValue Index); SDValue JoinIntegers(SDValue Lo, SDValue Hi); SDValue LibCallify(RTLIB::Libcall LC, SDNode *N, bool isSigned); diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp index 57c179ac15b8..f85daf2ea9d5 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp @@ -846,7 +846,6 @@ void DAGTypeLegalizer::SplitVecRes_INSERT_SUBVECTOR(SDNode *N, SDValue &Lo, GetSplitVector(Vec, Lo, Hi); EVT VecVT = Vec.getValueType(); - EVT VecElemVT = VecVT.getVectorElementType(); unsigned VecElems = VecVT.getVectorNumElements(); unsigned SubElems = SubVec.getValueType().getVectorNumElements(); @@ -872,7 +871,7 @@ void DAGTypeLegalizer::SplitVecRes_INSERT_SUBVECTOR(SDNode *N, SDValue &Lo, DAG.getStore(DAG.getEntryNode(), dl, Vec, StackPtr, MachinePointerInfo()); // Store the new subvector into the specified index. - SDValue SubVecPtr = GetVectorElementPointer(StackPtr, VecElemVT, Idx); + SDValue SubVecPtr = TLI.getVectorElementPointer(DAG, StackPtr, VecVT, Idx); Type *VecType = VecVT.getTypeForEVT(*DAG.getContext()); unsigned Alignment = DAG.getDataLayout().getPrefTypeAlignment(VecType); Store = DAG.getStore(Store, dl, SubVec, SubVecPtr, MachinePointerInfo()); @@ -1003,7 +1002,7 @@ void DAGTypeLegalizer::SplitVecRes_INSERT_VECTOR_ELT(SDNode *N, SDValue &Lo, // Store the new element. This may be larger than the vector element type, // so use a truncating store. - SDValue EltPtr = GetVectorElementPointer(StackPtr, EltVT, Idx); + SDValue EltPtr = TLI.getVectorElementPointer(DAG, StackPtr, VecVT, Idx); Type *VecType = VecVT.getTypeForEVT(*DAG.getContext()); unsigned Alignment = DAG.getDataLayout().getPrefTypeAlignment(VecType); Store = @@ -1650,7 +1649,7 @@ SDValue DAGTypeLegalizer::SplitVecOp_EXTRACT_VECTOR_ELT(SDNode *N) { DAG.getStore(DAG.getEntryNode(), dl, Vec, StackPtr, MachinePointerInfo()); // Load back the required element. 
- StackPtr = GetVectorElementPointer(StackPtr, EltVT, Idx); + StackPtr = TLI.getVectorElementPointer(DAG, StackPtr, VecVT, Idx); return DAG.getExtLoad(ISD::EXTLOAD, dl, N->getValueType(0), Store, StackPtr, MachinePointerInfo(), EltVT); } diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index 591a37d600cc..690f0d2c8082 100644 --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -3706,7 +3706,7 @@ SDValue TargetLowering::expandUnalignedStore(StoreSDNode *ST, return Result; } -SDValue +SDValue TargetLowering::IncrementMemoryAddress(SDValue Addr, SDValue Mask, const SDLoc &DL, EVT DataVT, SelectionDAG &DAG, @@ -3738,6 +3738,49 @@ TargetLowering::IncrementMemoryAddress(SDValue Addr, SDValue Mask, return DAG.getNode(ISD::ADD, DL, AddrVT, Addr, Increment); } +static SDValue clampDynamicVectorIndex(SelectionDAG &DAG, + SDValue Idx, + EVT VecVT, + const SDLoc &dl) { + if (isa<ConstantSDNode>(Idx)) + return Idx; + + EVT IdxVT = Idx.getValueType(); + unsigned NElts = VecVT.getVectorNumElements(); + if (isPowerOf2_32(NElts)) { + APInt Imm = APInt::getLowBitsSet(IdxVT.getSizeInBits(), + Log2_32(NElts)); + return DAG.getNode(ISD::AND, dl, IdxVT, Idx, + DAG.getConstant(Imm, dl, IdxVT)); + } + + return DAG.getNode(ISD::UMIN, dl, IdxVT, Idx, + DAG.getConstant(NElts - 1, dl, IdxVT)); +} + +SDValue TargetLowering::getVectorElementPointer(SelectionDAG &DAG, + SDValue VecPtr, EVT VecVT, + SDValue Index) const { + SDLoc dl(Index); + // Make sure the index type is big enough to compute in. + Index = DAG.getZExtOrTrunc(Index, dl, getPointerTy(DAG.getDataLayout())); + + EVT EltVT = VecVT.getVectorElementType(); + + // Calculate the element offset and add it to the pointer. + unsigned EltSize = EltVT.getSizeInBits() / 8; // FIXME: should be ABI size.
+ assert(EltSize * 8 == EltVT.getSizeInBits() && + "Converting bits to bytes lost precision"); + + Index = clampDynamicVectorIndex(DAG, Index, VecVT, dl); + + EVT IdxVT = Index.getValueType(); + + Index = DAG.getNode(ISD::MUL, dl, IdxVT, Index, + DAG.getConstant(EltSize, dl, IdxVT)); + return DAG.getNode(ISD::ADD, dl, IdxVT, Index, VecPtr); +} + //===----------------------------------------------------------------------===// // Implementation of Emulated TLS Model //===----------------------------------------------------------------------===// diff --git a/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll b/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll index e91a1a42c233..8d9a8c06aa3c 100644 --- a/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll +++ b/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll @@ -904,8 +904,9 @@ define <8 x i8> @getl(<16 x i8> %x) #0 { ; CHECK-LABEL: test_extracts_inserts_varidx_extract: ; CHECK: str q0 -; CHECK: add x[[PTR:[0-9]+]], {{.*}}, w0, sxtw #1 -; CHECK-DAG: ld1 { v[[R:[0-9]+]].h }[0], [x[[PTR]]] +; CHECK-DAG: and [[MASKED_IDX:x[0-9]+]], x0, #0x7 +; CHECK: bfi [[PTR:x[0-9]+]], [[MASKED_IDX]], #1, #3 +; CHECK-DAG: ld1 { v[[R:[0-9]+]].h }[0], {{\[}}[[PTR]]{{\]}} ; CHECK-DAG: ins v[[R]].h[1], v0.h[1] ; CHECK-DAG: ins v[[R]].h[2], v0.h[2] ; CHECK-DAG: ins v[[R]].h[3], v0.h[3] @@ -922,7 +923,9 @@ define <4 x i16> @test_extracts_inserts_varidx_extract(<8 x i16> %x, i32 %idx) { } ; CHECK-LABEL: test_extracts_inserts_varidx_insert: -; CHECK: str h0, [{{.*}}, w0, sxtw #1] +; CHECK: and [[MASKED_IDX:x[0-9]+]], x0, #0x3 +; CHECK: bfi x9, [[MASKED_IDX]], #1, #2 +; CHECK: st1 { v0.h }[0], [x9] ; CHECK-DAG: ldr d[[R:[0-9]+]] ; CHECK-DAG: ins v[[R]].h[1], v0.h[1] ; CHECK-DAG: ins v[[R]].h[2], v0.h[2] diff --git a/llvm/test/CodeGen/AArch64/arm64-nvcast.ll b/llvm/test/CodeGen/AArch64/arm64-nvcast.ll index c3a1640ab012..ba2512718c4e 100644 --- a/llvm/test/CodeGen/AArch64/arm64-nvcast.ll +++ b/llvm/test/CodeGen/AArch64/arm64-nvcast.ll @@ -1,10 +1,12 @@ ; RUN: llc < %s -mtriple=arm64-apple-ios | FileCheck %s ; CHECK-LABEL: _test: -; CHECK: fmov.2d v0, #2.00000000 -; CHECK: str q0, [sp, #-16]! -; CHECK: mov x8, sp -; CHECK: ldr s0, [x8, w1, sxtw #2] +; CHECK-DAG: fmov.2d v0, #2.00000000 +; CHECK-DAG: and [[MASK_IDX:x[0-9]+]], x1, #0x3 +; CHECK-DAG: mov x9, sp +; CHECK-DAG: str q0, [sp], #16 +; CHECK-DAG: bfi [[PTR:x[0-9]+]], [[MASK_IDX]], #2, #2 +; CHECK: ldr s0, {{\[}}[[PTR]]{{\]}} ; CHECK: str s0, [x0] define void @test(float * %p1, i32 %v1) { @@ -16,9 +18,11 @@ entry: ; CHECK-LABEL: _test2 ; CHECK: movi.16b v0, #63 -; CHECK: str q0, [sp, #-16]! 
-; CHECK: mov x8, sp -; CHECK: ldr s0, [x8, w1, sxtw #2] +; CHECK-DAG: and [[MASK_IDX:x[0-9]+]], x1, #0x3 +; CHECK-DAG: str q0, [sp], #16 +; CHECK-DAG: mov x9, sp +; CHECK-DAG: bfi [[PTR:x[0-9]+]], [[MASK_IDX]], #2, #2 +; CHECK: ldr s0, {{\[}}[[PTR]]{{\]}} ; CHECK: str s0, [x0] define void @test2(float * %p1, i32 %v1) { diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll index 7351665f06e4..2c538b16e743 100644 --- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll @@ -207,11 +207,15 @@ define void @dynamic_insertelement_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> ; GCN: buffer_load_ushort v{{[0-9]+}}, off ; GCN: buffer_load_ushort v{{[0-9]+}}, off +; GCN-DAG: v_mov_b32_e32 [[BASE_FI:v[0-9]+]], 0{{$}} +; GCN-DAG: s_and_b32 [[MASK_IDX:s[0-9]+]], s{{[0-9]+}}, 3{{$}} +; GCN-DAG: v_or_b32_e32 [[IDX:v[0-9]+]], [[MASK_IDX]], [[BASE_FI]]{{$}} + ; GCN-DAG: buffer_store_short v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:6 ; GCN-DAG: buffer_store_short v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:4 ; GCN-DAG: buffer_store_short v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:2 -; GCN-DAG: buffer_store_short v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}{{$}} -; GCN: buffer_store_short v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen{{$}} +; GCN-DAG: buffer_store_short v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+$}} +; GCN: buffer_store_short v{{[0-9]+}}, [[IDX]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen{{$}} ; GCN: s_waitcnt diff --git a/llvm/test/CodeGen/AMDGPU/local-stack-slot-bug.ll b/llvm/test/CodeGen/AMDGPU/local-stack-slot-bug.ll index d49fa2bf48a7..2ef045dbb8eb 100644 --- a/llvm/test/CodeGen/AMDGPU/local-stack-slot-bug.ll +++ b/llvm/test/CodeGen/AMDGPU/local-stack-slot-bug.ll @@ -7,11 +7,14 @@ ; ; CHECK-LABEL: {{^}}main: +; CHECK-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x200 +; CHECK-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}} ; CHECK-DAG: v_lshlrev_b32_e32 [[BYTES:v[0-9]+]], 2, v0 -; CHECK-DAG: v_add_i32_e32 [[HI_OFF:v[0-9]+]], vcc, 0x200, [[BYTES]] +; CHECK-DAG: v_and_b32_e32 [[CLAMP_IDX:v[0-9]+]], 0x1fc, [[BYTES]] ; TODO: add 0? 
-; CHECK-DAG: v_add_i32_e32 [[LO_OFF:v[0-9]+]], vcc, 0, [[BYTES]] +; CHECK-DAG: v_or_b32_e32 [[LO_OFF:v[0-9]+]], [[CLAMP_IDX]], [[ZERO]] +; CHECK-DAG: v_or_b32_e32 [[HI_OFF:v[0-9]+]], [[CLAMP_IDX]], [[K]] ; CHECK: buffer_load_dword {{v[0-9]+}}, [[LO_OFF]], {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen ; CHECK: buffer_load_dword {{v[0-9]+}}, [[HI_OFF]], {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen diff --git a/llvm/test/CodeGen/ARM/fp16-promote.ll b/llvm/test/CodeGen/ARM/fp16-promote.ll index ebc5934df022..824123687287 100644 --- a/llvm/test/CodeGen/ARM/fp16-promote.ll +++ b/llvm/test/CodeGen/ARM/fp16-promote.ll @@ -825,7 +825,7 @@ define void @test_fmuladd(half* %p, half* %q, half* %r) #0 { ; CHECK-ALL: strh ; CHECK-ALL: mov ; CHECK-ALL-DAG: ldrh -; CHECK-ALL-DAG: add +; CHECK-ALL-DAG: orr ; CHECK-ALL: strh ; CHECK-ALL: ldrh ; CHECK-ALL: strh @@ -855,7 +855,7 @@ define void @test_insertelement(half* %p, <4 x half>* %q, i32 %i) #0 { ; CHECK-VFP: orr ; CHECK-VFP: str ; CHECK-VFP: mov -; CHECK-VFP: add +; CHECK-VFP: orr ; CHECK-VFP: ldrh ; CHECK-VFP: strh ; CHECK-VFP: add sp, sp, #8 diff --git a/llvm/test/CodeGen/ARM/vdup.ll b/llvm/test/CodeGen/ARM/vdup.ll index 25c4807d9862..b7693c797635 100644 --- a/llvm/test/CodeGen/ARM/vdup.ll +++ b/llvm/test/CodeGen/ARM/vdup.ll @@ -373,7 +373,8 @@ define <8 x i8> @check_i8_varidx(<16 x i8> %v, i32 %idx) { ; CHECK: mov r[[FP:[0-9]+]], sp ; CHECK: ldr r[[IDX:[0-9]+]], [r[[FP]], #4] ; CHECK: mov r[[SPCOPY:[0-9]+]], sp -; CHECK: vst1.64 {d{{.*}}, d{{.*}}}, [r[[SPCOPY]]:128], r[[IDX]] +; CHECK: and r[[MASKED_IDX:[0-9]+]], r[[IDX]], #15 +; CHECK: vst1.64 {d{{.*}}, d{{.*}}}, [r[[SPCOPY]]:128], r[[MASKED_IDX]] ; CHECK: vld1.8 {d{{.*}}[]}, [r[[SPCOPY]]] %x = extractelement <16 x i8> %v, i32 %idx %1 = insertelement <8 x i8> undef, i8 %x, i32 0 diff --git a/llvm/test/CodeGen/Mips/llvm-ir/extractelement.ll b/llvm/test/CodeGen/Mips/llvm-ir/extractelement.ll index 1e1b02df99a2..3c7df4a5e99f 100644 --- a/llvm/test/CodeGen/Mips/llvm-ir/extractelement.ll +++ b/llvm/test/CodeGen/Mips/llvm-ir/extractelement.ll @@ -14,6 +14,7 @@ define i1 @via_stack_bug(i8 signext %idx) { ; ALL-DAG: addiu [[ONE:\$[0-9]+]], $zero, 1 ; ALL-DAG: sb [[ONE]], 7($sp) ; ALL-DAG: sb $zero, 6($sp) +; ALL-DAG: andi [[MASKED_IDX:\$[0-9]+]], $4, 1 ; ALL-DAG: addiu [[VPTR:\$[0-9]+]], $sp, 6 -; ALL-DAG: addu [[EPTR:\$[0-9]+]], $4, [[VPTR]] +; ALL-DAG: or [[EPTR:\$[0-9]+]], [[MASKED_IDX]], [[VPTR]] ; ALL: lbu $2, 0([[EPTR]]) diff --git a/llvm/test/CodeGen/PowerPC/variable_elem_vec_extracts.ll b/llvm/test/CodeGen/PowerPC/variable_elem_vec_extracts.ll index b61acab7f7cb..98862cd049a5 100644 --- a/llvm/test/CodeGen/PowerPC/variable_elem_vec_extracts.ll +++ b/llvm/test/CodeGen/PowerPC/variable_elem_vec_extracts.ll @@ -23,7 +23,7 @@ entry: ; CHECK: mfvsrd [[TOGPR:[0-9]+]], ; CHECK: srd [[RSHREG:[0-9]+]], [[TOGPR]], [[SHAMREG]] ; CHECK: extsw 3, [[RSHREG]] -; CHECK-P7-DAG: sldi [[ELEMOFFREG:[0-9]+]], 5, 2 +; CHECK-P7-DAG: rlwinm [[ELEMOFFREG:[0-9]+]], 5, 2, 28, 29 ; CHECK-P7-DAG: stxvw4x 34, ; CHECK-P7: lwax 3, [[ELEMOFFREG]], ; CHECK-BE-DAG: andi. [[ANDREG:[0-9]+]], 5, 2 @@ -52,7 +52,7 @@ entry: ; CHECK-DAG: lvsl [[SHMSKREG:[0-9]+]], 0, [[SHIFTREG]] ; CHECK-DAG: vperm [[PERMVEC:[0-9]+]], 2, 2, [[SHMSKREG]] ; CHECK: mfvsrd 3, -; CHECK-P7-DAG: sldi [[ELEMOFFREG:[0-9]+]], 5, 3 +; CHECK-P7-DAG: rlwinm [[ELEMOFFREG:[0-9]+]], 5, 3, 28, 28 ; CHECK-P7-DAG: stxvd2x 34, ; CHECK-P7: ldx 3, [[ELEMOFFREG]], ; CHECK-BE-DAG: andi. 
[[ANDREG:[0-9]+]], 5, 1 @@ -75,7 +75,7 @@ entry: ; CHECK: lvsl [[SHMSKREG:[0-9]+]], 0, [[TRUNCREG]] ; CHECK: vperm {{[0-9]+}}, 2, 2, [[SHMSKREG]] ; CHECK: xscvspdpn 1, -; CHECK-P7-DAG: sldi [[ELEMOFFREG:[0-9]+]], 5, 2 +; CHECK-P7-DAG: rlwinm [[ELEMOFFREG:[0-9]+]], 5, 2, 28, 29 ; CHECK-P7-DAG: stxvw4x 34, ; CHECK-P7: lfsx 1, [[ELEMOFFREG]], ; CHECK-BE: sldi [[ELNOREG:[0-9]+]], 5, 2 diff --git a/llvm/test/CodeGen/X86/extractelement-index.ll b/llvm/test/CodeGen/X86/extractelement-index.ll index 13448a13ab4c..8c12e7148aa7 100644 --- a/llvm/test/CodeGen/X86/extractelement-index.ll +++ b/llvm/test/CodeGen/X86/extractelement-index.ll @@ -404,6 +404,7 @@ define i64 @extractelement_v4i64_3(<4 x i64> %a, i256 %i) nounwind { define i8 @extractelement_v16i8_var(<16 x i8> %a, i256 %i) nounwind { ; SSE-LABEL: extractelement_v16i8_var: ; SSE: # BB#0: +; SSE-NEXT: andl $15, %edi ; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; SSE-NEXT: leaq -{{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movb (%rdi,%rax), %al @@ -411,6 +412,7 @@ define i8 @extractelement_v16i8_var(<16 x i8> %a, i256 %i) nounwind { ; ; AVX-LABEL: extractelement_v16i8_var: ; AVX: # BB#0: +; AVX-NEXT: andl $15, %edi ; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) ; AVX-NEXT: leaq -{{[0-9]+}}(%rsp), %rax ; AVX-NEXT: movb (%rdi,%rax), %al @@ -426,6 +428,7 @@ define i8 @extractelement_v32i8_var(<32 x i8> %a, i256 %i) nounwind { ; SSE-NEXT: movq %rsp, %rbp ; SSE-NEXT: andq $-32, %rsp ; SSE-NEXT: subq $64, %rsp +; SSE-NEXT: andl $31, %edi ; SSE-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp) ; SSE-NEXT: movaps %xmm0, (%rsp) ; SSE-NEXT: movq %rsp, %rax @@ -440,6 +443,7 @@ define i8 @extractelement_v32i8_var(<32 x i8> %a, i256 %i) nounwind { ; AVX-NEXT: movq %rsp, %rbp ; AVX-NEXT: andq $-32, %rsp ; AVX-NEXT: subq $64, %rsp +; AVX-NEXT: andl $31, %edi ; AVX-NEXT: vmovaps %ymm0, (%rsp) ; AVX-NEXT: movq %rsp, %rax ; AVX-NEXT: movb (%rdi,%rax), %al @@ -454,12 +458,14 @@ define i8 @extractelement_v32i8_var(<32 x i8> %a, i256 %i) nounwind { define i16 @extractelement_v8i16_var(<8 x i16> %a, i256 %i) nounwind { ; SSE-LABEL: extractelement_v8i16_var: ; SSE: # BB#0: +; SSE-NEXT: andl $7, %edi ; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; SSE-NEXT: movzwl -24(%rsp,%rdi,2), %eax ; SSE-NEXT: retq ; ; AVX-LABEL: extractelement_v8i16_var: ; AVX: # BB#0: +; AVX-NEXT: andl $7, %edi ; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) ; AVX-NEXT: movzwl -24(%rsp,%rdi,2), %eax ; AVX-NEXT: retq @@ -474,6 +480,7 @@ define i16 @extractelement_v16i16_var(<16 x i16> %a, i256 %i) nounwind { ; SSE-NEXT: movq %rsp, %rbp ; SSE-NEXT: andq $-32, %rsp ; SSE-NEXT: subq $64, %rsp +; SSE-NEXT: andl $15, %edi ; SSE-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp) ; SSE-NEXT: movaps %xmm0, (%rsp) ; SSE-NEXT: movzwl (%rsp,%rdi,2), %eax @@ -487,6 +494,7 @@ define i16 @extractelement_v16i16_var(<16 x i16> %a, i256 %i) nounwind { ; AVX-NEXT: movq %rsp, %rbp ; AVX-NEXT: andq $-32, %rsp ; AVX-NEXT: subq $64, %rsp +; AVX-NEXT: andl $15, %edi ; AVX-NEXT: vmovaps %ymm0, (%rsp) ; AVX-NEXT: movzwl (%rsp,%rdi,2), %eax ; AVX-NEXT: movq %rbp, %rsp @@ -500,12 +508,14 @@ define i16 @extractelement_v16i16_var(<16 x i16> %a, i256 %i) nounwind { define i32 @extractelement_v4i32_var(<4 x i32> %a, i256 %i) nounwind { ; SSE-LABEL: extractelement_v4i32_var: ; SSE: # BB#0: +; SSE-NEXT: andl $3, %edi ; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; SSE-NEXT: movl -24(%rsp,%rdi,4), %eax ; SSE-NEXT: retq ; ; AVX-LABEL: extractelement_v4i32_var: ; AVX: # BB#0: +; AVX-NEXT: andl $3, %edi ; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) ; AVX-NEXT: movl 
-24(%rsp,%rdi,4), %eax ; AVX-NEXT: retq @@ -520,6 +530,7 @@ define i32 @extractelement_v8i32_var(<8 x i32> %a, i256 %i) nounwind { ; SSE-NEXT: movq %rsp, %rbp ; SSE-NEXT: andq $-32, %rsp ; SSE-NEXT: subq $64, %rsp +; SSE-NEXT: andl $7, %edi ; SSE-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp) ; SSE-NEXT: movaps %xmm0, (%rsp) ; SSE-NEXT: movl (%rsp,%rdi,4), %eax @@ -533,6 +544,7 @@ define i32 @extractelement_v8i32_var(<8 x i32> %a, i256 %i) nounwind { ; AVX1-NEXT: movq %rsp, %rbp ; AVX1-NEXT: andq $-32, %rsp ; AVX1-NEXT: subq $64, %rsp +; AVX1-NEXT: andl $7, %edi ; AVX1-NEXT: vmovaps %ymm0, (%rsp) ; AVX1-NEXT: movl (%rsp,%rdi,4), %eax ; AVX1-NEXT: movq %rbp, %rsp @@ -554,12 +566,14 @@ define i32 @extractelement_v8i32_var(<8 x i32> %a, i256 %i) nounwind { define i64 @extractelement_v2i64_var(<2 x i64> %a, i256 %i) nounwind { ; SSE-LABEL: extractelement_v2i64_var: ; SSE: # BB#0: +; SSE-NEXT: andl $1, %edi ; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; SSE-NEXT: movq -24(%rsp,%rdi,8), %rax ; SSE-NEXT: retq ; ; AVX-LABEL: extractelement_v2i64_var: ; AVX: # BB#0: +; AVX-NEXT: andl $1, %edi ; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) ; AVX-NEXT: movq -24(%rsp,%rdi,8), %rax ; AVX-NEXT: retq @@ -574,6 +588,7 @@ define i64 @extractelement_v4i64_var(<4 x i64> %a, i256 %i) nounwind { ; SSE-NEXT: movq %rsp, %rbp ; SSE-NEXT: andq $-32, %rsp ; SSE-NEXT: subq $64, %rsp +; SSE-NEXT: andl $3, %edi ; SSE-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp) ; SSE-NEXT: movaps %xmm0, (%rsp) ; SSE-NEXT: movq (%rsp,%rdi,8), %rax @@ -587,6 +602,7 @@ define i64 @extractelement_v4i64_var(<4 x i64> %a, i256 %i) nounwind { ; AVX-NEXT: movq %rsp, %rbp ; AVX-NEXT: andq $-32, %rsp ; AVX-NEXT: subq $64, %rsp +; AVX-NEXT: andl $3, %edi ; AVX-NEXT: vmovaps %ymm0, (%rsp) ; AVX-NEXT: movq (%rsp,%rdi,8), %rax ; AVX-NEXT: movq %rbp, %rsp diff --git a/llvm/test/CodeGen/X86/extractelement-legalization-store-ordering.ll b/llvm/test/CodeGen/X86/extractelement-legalization-store-ordering.ll index 946516c8a46d..c418e67ecb67 100644 --- a/llvm/test/CodeGen/X86/extractelement-legalization-store-ordering.ll +++ b/llvm/test/CodeGen/X86/extractelement-legalization-store-ordering.ll @@ -16,11 +16,11 @@ target datalayout = "e-m:o-p:32:32-f64:32:64-f80:128-n8:16:32-S128" ; CHECK-NEXT: movl 20(%esp), %edx ; CHECK-NEXT: paddd (%edx), %xmm0 ; CHECK-NEXT: movdqa %xmm0, (%edx) -; CHECK-NEXT: shll $4, %ecx -; CHECK-NEXT: movl (%ecx,%edx), %esi -; CHECK-NEXT: movl 12(%ecx,%edx), %edi -; CHECK-NEXT: movl 8(%ecx,%edx), %ebx -; CHECK-NEXT: movl 4(%ecx,%edx), %edx +; CHECK-NEXT: movl (%edx), %esi +; CHECK-NEXT: movl 12(%edx), %edi +; CHECK-NEXT: movl 8(%edx), %ebx +; CHECK-NEXT: movl 4(%edx), %edx +; CHECK-NEXT: shll $4, %ecx ; CHECK-NEXT: movl %esi, 12(%eax,%ecx) ; CHECK-NEXT: movl %edx, (%eax,%ecx) ; CHECK-NEXT: movl %ebx, 8(%eax,%ecx) diff --git a/llvm/test/CodeGen/X86/i64-mem-copy.ll b/llvm/test/CodeGen/X86/i64-mem-copy.ll index 1fa752774251..7b1926da245c 100644 --- a/llvm/test/CodeGen/X86/i64-mem-copy.ll +++ b/llvm/test/CodeGen/X86/i64-mem-copy.ll @@ -68,9 +68,10 @@ define void @store_i64_from_vector256(<16 x i16> %x, <16 x i16> %y, i64* %i) { define void @PR23476(<5 x i64> %in, i64* %out, i32 %index) { ; X32-LABEL: PR23476: +; X32: andl $7, %eax ; X32: movsd {{.*#+}} xmm0 = mem[0],zero ; X32: movsd {{.*#+}} xmm0 = mem[0],zero -; X32-NEXT: movsd %xmm0, (%eax) +; X32-NEXT: movsd %xmm0, (%ecx) %ext = extractelement <5 x i64> %in, i32 %index store i64 %ext, i64* %out, align 8 ret void diff --git a/llvm/test/CodeGen/X86/vec_ins_extract-1.ll 
b/llvm/test/CodeGen/X86/vec_ins_extract-1.ll index 85c7875d923b..1dc8b7abd207 100644 --- a/llvm/test/CodeGen/X86/vec_ins_extract-1.ll +++ b/llvm/test/CodeGen/X86/vec_ins_extract-1.ll @@ -12,6 +12,7 @@ define i32 @t0(i32 inreg %t7, <4 x i32> inreg %t8) nounwind { ; X32-NEXT: movl %esp, %ebp ; X32-NEXT: andl $-16, %esp ; X32-NEXT: subl $32, %esp +; X32-NEXT: andl $3, %eax ; X32-NEXT: movaps %xmm0, (%esp) ; X32-NEXT: movl $76, (%esp,%eax,4) ; X32-NEXT: movl (%esp), %eax @@ -21,9 +22,10 @@ define i32 @t0(i32 inreg %t7, <4 x i32> inreg %t8) nounwind { ; ; X64-LABEL: t0: ; X64: # BB#0: +; X64-NEXT: # kill: %EDI %EDI %RDI ; X64-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; X64-NEXT: movslq %edi, %rax -; X64-NEXT: movl $76, -24(%rsp,%rax,4) +; X64-NEXT: andl $3, %edi +; X64-NEXT: movl $76, -24(%rsp,%rdi,4) ; X64-NEXT: movl -{{[0-9]+}}(%rsp), %eax ; X64-NEXT: retq %t13 = insertelement <4 x i32> %t8, i32 76, i32 %t7 @@ -38,6 +40,7 @@ define i32 @t1(i32 inreg %t7, <4 x i32> inreg %t8) nounwind { ; X32-NEXT: movl %esp, %ebp ; X32-NEXT: andl $-16, %esp ; X32-NEXT: subl $32, %esp +; X32-NEXT: andl $3, %eax ; X32-NEXT: movl $76, %ecx ; X32-NEXT: pinsrd $0, %ecx, %xmm0 ; X32-NEXT: movdqa %xmm0, (%esp) @@ -48,11 +51,12 @@ define i32 @t1(i32 inreg %t7, <4 x i32> inreg %t8) nounwind { ; ; X64-LABEL: t1: ; X64: # BB#0: +; X64-NEXT: # kill: %EDI %EDI %RDI ; X64-NEXT: movl $76, %eax ; X64-NEXT: pinsrd $0, %eax, %xmm0 ; X64-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) -; X64-NEXT: movslq %edi, %rax -; X64-NEXT: movl -24(%rsp,%rax,4), %eax +; X64-NEXT: andl $3, %edi +; X64-NEXT: movl -24(%rsp,%rdi,4), %eax ; X64-NEXT: retq %t13 = insertelement <4 x i32> %t8, i32 76, i32 0 %t9 = extractelement <4 x i32> %t13, i32 %t7 @@ -66,6 +70,7 @@ define <4 x i32> @t2(i32 inreg %t7, <4 x i32> inreg %t8) nounwind { ; X32-NEXT: movl %esp, %ebp ; X32-NEXT: andl $-16, %esp ; X32-NEXT: subl $32, %esp +; X32-NEXT: andl $3, %eax ; X32-NEXT: movdqa %xmm0, (%esp) ; X32-NEXT: pinsrd $0, (%esp,%eax,4), %xmm0 ; X32-NEXT: movl %ebp, %esp @@ -74,9 +79,10 @@ define <4 x i32> @t2(i32 inreg %t7, <4 x i32> inreg %t8) nounwind { ; ; X64-LABEL: t2: ; X64: # BB#0: +; X64-NEXT: # kill: %EDI %EDI %RDI ; X64-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) -; X64-NEXT: movslq %edi, %rax -; X64-NEXT: pinsrd $0, -24(%rsp,%rax,4), %xmm0 +; X64-NEXT: andl $3, %edi +; X64-NEXT: pinsrd $0, -24(%rsp,%rdi,4), %xmm0 ; X64-NEXT: retq %t9 = extractelement <4 x i32> %t8, i32 %t7 %t13 = insertelement <4 x i32> %t8, i32 %t9, i32 0 @@ -90,6 +96,7 @@ define <4 x i32> @t3(i32 inreg %t7, <4 x i32> inreg %t8) nounwind { ; X32-NEXT: movl %esp, %ebp ; X32-NEXT: andl $-16, %esp ; X32-NEXT: subl $32, %esp +; X32-NEXT: andl $3, %eax ; X32-NEXT: movaps %xmm0, (%esp) ; X32-NEXT: movss %xmm0, (%esp,%eax,4) ; X32-NEXT: movaps (%esp), %xmm0 @@ -99,9 +106,10 @@ define <4 x i32> @t3(i32 inreg %t7, <4 x i32> inreg %t8) nounwind { ; ; X64-LABEL: t3: ; X64: # BB#0: +; X64-NEXT: # kill: %EDI %EDI %RDI ; X64-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; X64-NEXT: movslq %edi, %rax -; X64-NEXT: movss %xmm0, -24(%rsp,%rax,4) +; X64-NEXT: andl $3, %edi +; X64-NEXT: movss %xmm0, -24(%rsp,%rdi,4) ; X64-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0 ; X64-NEXT: retq %t9 = extractelement <4 x i32> %t8, i32 0 diff --git a/llvm/test/CodeGen/X86/vec_insert-4.ll b/llvm/test/CodeGen/X86/vec_insert-4.ll index c847ac983003..82627c54e663 100644 --- a/llvm/test/CodeGen/X86/vec_insert-4.ll +++ b/llvm/test/CodeGen/X86/vec_insert-4.ll @@ -10,6 +10,7 @@ define <8 x float> @f(<8 x float> %a, i32 %b) nounwind { ; X32-NEXT: andl $-32, %esp ; 
X32-NEXT: subl $64, %esp ; X32-NEXT: movl 8(%ebp), %eax +; X32-NEXT: andl $7, %eax ; X32-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; X32-NEXT: movaps %xmm0, (%esp) ; X32-NEXT: movl $1084227584, (%esp,%eax,4) ## imm = 0x40A00000 @@ -25,10 +26,11 @@ define <8 x float> @f(<8 x float> %a, i32 %b) nounwind { ; X64-NEXT: movq %rsp, %rbp ; X64-NEXT: andq $-32, %rsp ; X64-NEXT: subq $64, %rsp +; X64-NEXT: ## kill: %EDI %EDI %RDI ; X64-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp) ; X64-NEXT: movaps %xmm0, (%rsp) -; X64-NEXT: movslq %edi, %rax -; X64-NEXT: movl $1084227584, (%rsp,%rax,4) ## imm = 0x40A00000 +; X64-NEXT: andl $7, %edi +; X64-NEXT: movl $1084227584, (%rsp,%rdi,4) ## imm = 0x40A00000 ; X64-NEXT: movaps (%rsp), %xmm0 ; X64-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1 ; X64-NEXT: movq %rbp, %rsp diff --git a/llvm/test/CodeGen/X86/vec_insert-8.ll b/llvm/test/CodeGen/X86/vec_insert-8.ll index d612e7eb10d3..4074b6d32353 100644 --- a/llvm/test/CodeGen/X86/vec_insert-8.ll +++ b/llvm/test/CodeGen/X86/vec_insert-8.ll @@ -11,10 +11,11 @@ define <4 x i32> @var_insert(<4 x i32> %x, i32 %val, i32 %idx) nounwind { ; X32-NEXT: movl %esp, %ebp ; X32-NEXT: andl $-16, %esp ; X32-NEXT: subl $32, %esp -; X32-NEXT: movl 8(%ebp), %eax -; X32-NEXT: movl 12(%ebp), %ecx +; X32-NEXT: movl 12(%ebp), %eax +; X32-NEXT: andl $3, %eax +; X32-NEXT: movl 8(%ebp), %ecx ; X32-NEXT: movaps %xmm0, (%esp) -; X32-NEXT: movl %eax, (%esp,%ecx,4) +; X32-NEXT: movl %ecx, (%esp,%eax,4) ; X32-NEXT: movaps (%esp), %xmm0 ; X32-NEXT: movl %ebp, %esp ; X32-NEXT: popl %ebp @@ -22,9 +23,10 @@ define <4 x i32> @var_insert(<4 x i32> %x, i32 %val, i32 %idx) nounwind { ; ; X64-LABEL: var_insert: ; X64: # BB#0: # %entry +; X64-NEXT: # kill: %ESI %ESI %RSI ; X64-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; X64-NEXT: movslq %esi, %rax -; X64-NEXT: movl %edi, -24(%rsp,%rax,4) +; X64-NEXT: andl $3, %esi +; X64-NEXT: movl %edi, -24(%rsp,%rsi,4) ; X64-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0 ; X64-NEXT: retq entry: @@ -40,6 +42,7 @@ define i32 @var_extract(<4 x i32> %x, i32 %idx) nounwind { ; X32-NEXT: andl $-16, %esp ; X32-NEXT: subl $32, %esp ; X32-NEXT: movl 8(%ebp), %eax +; X32-NEXT: andl $3, %eax ; X32-NEXT: movaps %xmm0, (%esp) ; X32-NEXT: movl (%esp,%eax,4), %eax ; X32-NEXT: movl %ebp, %esp @@ -48,9 +51,10 @@ define i32 @var_extract(<4 x i32> %x, i32 %idx) nounwind { ; ; X64-LABEL: var_extract: ; X64: # BB#0: # %entry +; X64-NEXT: # kill: %EDI %EDI %RDI ; X64-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; X64-NEXT: movslq %edi, %rax -; X64-NEXT: movl -24(%rsp,%rax,4), %eax +; X64-NEXT: andl $3, %edi +; X64-NEXT: movl -24(%rsp,%rdi,4), %eax ; X64-NEXT: retq entry: %tmp3 = extractelement <4 x i32> %x, i32 %idx diff --git a/llvm/test/CodeGen/X86/vector-shuffle-variable-128.ll b/llvm/test/CodeGen/X86/vector-shuffle-variable-128.ll index d130e7ff00b2..70b7fb16fc25 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-variable-128.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-variable-128.ll @@ -12,6 +12,8 @@ define <2 x double> @var_shuffle_v2f64_v2f64_xx_i64(<2 x double> %x, i64 %i0, i64 %i1) nounwind { ; SSE-LABEL: var_shuffle_v2f64_v2f64_xx_i64: ; SSE: # BB#0: +; SSE-NEXT: andl $1, %esi +; SSE-NEXT: andl $1, %edi ; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; SSE-NEXT: movhpd {{.*#+}} xmm0 = xmm0[0],mem[0] @@ -19,6 +21,8 @@ define <2 x double> @var_shuffle_v2f64_v2f64_xx_i64(<2 x double> %x, i64 %i0, i6 ; ; AVX-LABEL: var_shuffle_v2f64_v2f64_xx_i64: ; AVX: # BB#0: +; AVX-NEXT: andl $1, %esi +; AVX-NEXT: andl $1, %edi ; AVX-NEXT: 
vmovaps %xmm0, -{{[0-9]+}}(%rsp) ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; AVX-NEXT: vmovhpd {{.*#+}} xmm0 = xmm0[0],mem[0] @@ -33,9 +37,11 @@ define <2 x double> @var_shuffle_v2f64_v2f64_xx_i64(<2 x double> %x, i64 %i0, i6 define <2 x i64> @var_shuffle_v2i64_v2i64_xx_i64(<2 x i64> %x, i32 %i0, i32 %i1) nounwind { ; SSE-LABEL: var_shuffle_v2i64_v2i64_xx_i64: ; SSE: # BB#0: -; SSE-NEXT: movslq %edi, %rax +; SSE-NEXT: # kill: %ESI %ESI %RSI +; SSE-NEXT: # kill: %EDI %EDI %RDI +; SSE-NEXT: andl $1, %edi ; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movslq %esi, %rcx +; SSE-NEXT: andl $1, %esi ; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero ; SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero ; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] @@ -43,9 +49,11 @@ define <2 x i64> @var_shuffle_v2i64_v2i64_xx_i64(<2 x i64> %x, i32 %i0, i32 %i1) ; ; AVX-LABEL: var_shuffle_v2i64_v2i64_xx_i64: ; AVX: # BB#0: -; AVX-NEXT: movslq %edi, %rax +; AVX-NEXT: # kill: %ESI %ESI %RSI +; AVX-NEXT: # kill: %EDI %EDI %RDI +; AVX-NEXT: andl $1, %edi ; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX-NEXT: movslq %esi, %rcx +; AVX-NEXT: andl $1, %esi ; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero ; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] @@ -60,11 +68,15 @@ define <2 x i64> @var_shuffle_v2i64_v2i64_xx_i64(<2 x i64> %x, i32 %i0, i32 %i1) define <4 x float> @var_shuffle_v4f32_v4f32_xxxx_i32(<4 x float> %x, i32 %i0, i32 %i1, i32 %i2, i32 %i3) nounwind { ; SSE2-LABEL: var_shuffle_v4f32_v4f32_xxxx_i32: ; SSE2: # BB#0: -; SSE2-NEXT: movslq %edi, %rax -; SSE2-NEXT: movslq %esi, %rsi -; SSE2-NEXT: movslq %edx, %rdx +; SSE2-NEXT: # kill: %ECX %ECX %RCX +; SSE2-NEXT: # kill: %EDX %EDX %RDX +; SSE2-NEXT: # kill: %ESI %ESI %RSI +; SSE2-NEXT: # kill: %EDI %EDI %RDI +; SSE2-NEXT: andl $3, %edi +; SSE2-NEXT: andl $3, %esi +; SSE2-NEXT: andl $3, %edx ; SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE2-NEXT: movslq %ecx, %rcx +; SSE2-NEXT: andl $3, %ecx ; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero @@ -76,11 +88,15 @@ define <4 x float> @var_shuffle_v4f32_v4f32_xxxx_i32(<4 x float> %x, i32 %i0, i3 ; ; SSSE3-LABEL: var_shuffle_v4f32_v4f32_xxxx_i32: ; SSSE3: # BB#0: -; SSSE3-NEXT: movslq %edi, %rax -; SSSE3-NEXT: movslq %esi, %rsi -; SSSE3-NEXT: movslq %edx, %rdx +; SSSE3-NEXT: # kill: %ECX %ECX %RCX +; SSSE3-NEXT: # kill: %EDX %EDX %RDX +; SSSE3-NEXT: # kill: %ESI %ESI %RSI +; SSSE3-NEXT: # kill: %EDI %EDI %RDI +; SSSE3-NEXT: andl $3, %edi +; SSSE3-NEXT: andl $3, %esi +; SSSE3-NEXT: andl $3, %edx ; SSSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSSE3-NEXT: movslq %ecx, %rcx +; SSSE3-NEXT: andl $3, %ecx ; SSSE3-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; SSSE3-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; SSSE3-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero @@ -92,11 +108,15 @@ define <4 x float> @var_shuffle_v4f32_v4f32_xxxx_i32(<4 x float> %x, i32 %i0, i3 ; ; SSE41-LABEL: var_shuffle_v4f32_v4f32_xxxx_i32: ; SSE41: # BB#0: -; SSE41-NEXT: movslq %edi, %rax -; SSE41-NEXT: movslq %esi, %rsi -; SSE41-NEXT: movslq %edx, %rdx +; SSE41-NEXT: # kill: %ECX %ECX %RCX +; SSE41-NEXT: # kill: %EDX %EDX %RDX +; SSE41-NEXT: # kill: %ESI %ESI %RSI +; SSE41-NEXT: # kill: %EDI %EDI %RDI +; SSE41-NEXT: andl $3, %edi +; SSE41-NEXT: andl $3, %esi +; SSE41-NEXT: andl $3, %edx ; SSE41-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE41-NEXT: movslq 
%ecx, %rcx +; SSE41-NEXT: andl $3, %ecx ; SSE41-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3] ; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3] @@ -105,11 +125,15 @@ define <4 x float> @var_shuffle_v4f32_v4f32_xxxx_i32(<4 x float> %x, i32 %i0, i3 ; ; AVX-LABEL: var_shuffle_v4f32_v4f32_xxxx_i32: ; AVX: # BB#0: -; AVX-NEXT: movslq %edi, %rax -; AVX-NEXT: movslq %esi, %rsi -; AVX-NEXT: movslq %edx, %rdx +; AVX-NEXT: # kill: %ECX %ECX %RCX +; AVX-NEXT: # kill: %EDX %EDX %RDX +; AVX-NEXT: # kill: %ESI %ESI %RSI +; AVX-NEXT: # kill: %EDI %EDI %RDI +; AVX-NEXT: andl $3, %edi +; AVX-NEXT: andl $3, %esi +; AVX-NEXT: andl $3, %edx ; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX-NEXT: movslq %ecx, %rcx +; AVX-NEXT: andl $3, %ecx ; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3] ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3] @@ -129,11 +153,15 @@ define <4 x float> @var_shuffle_v4f32_v4f32_xxxx_i32(<4 x float> %x, i32 %i0, i3 define <4 x i32> @var_shuffle_v4i32_v4i32_xxxx_i32(<4 x i32> %x, i32 %i0, i32 %i1, i32 %i2, i32 %i3) nounwind { ; SSE2-LABEL: var_shuffle_v4i32_v4i32_xxxx_i32: ; SSE2: # BB#0: -; SSE2-NEXT: movslq %edi, %rax -; SSE2-NEXT: movslq %esi, %rsi -; SSE2-NEXT: movslq %edx, %rdx +; SSE2-NEXT: # kill: %ECX %ECX %RCX +; SSE2-NEXT: # kill: %EDX %EDX %RDX +; SSE2-NEXT: # kill: %ESI %ESI %RSI +; SSE2-NEXT: # kill: %EDI %EDI %RDI +; SSE2-NEXT: andl $3, %edi +; SSE2-NEXT: andl $3, %esi +; SSE2-NEXT: andl $3, %edx ; SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE2-NEXT: movslq %ecx, %rcx +; SSE2-NEXT: andl $3, %ecx ; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero ; SSE2-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero @@ -145,11 +173,15 @@ define <4 x i32> @var_shuffle_v4i32_v4i32_xxxx_i32(<4 x i32> %x, i32 %i0, i32 %i ; ; SSSE3-LABEL: var_shuffle_v4i32_v4i32_xxxx_i32: ; SSSE3: # BB#0: -; SSSE3-NEXT: movslq %edi, %rax -; SSSE3-NEXT: movslq %esi, %rsi -; SSSE3-NEXT: movslq %edx, %rdx +; SSSE3-NEXT: # kill: %ECX %ECX %RCX +; SSSE3-NEXT: # kill: %EDX %EDX %RDX +; SSSE3-NEXT: # kill: %ESI %ESI %RSI +; SSSE3-NEXT: # kill: %EDI %EDI %RDI +; SSSE3-NEXT: andl $3, %edi +; SSSE3-NEXT: andl $3, %esi +; SSSE3-NEXT: andl $3, %edx ; SSSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSSE3-NEXT: movslq %ecx, %rcx +; SSSE3-NEXT: andl $3, %ecx ; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero ; SSSE3-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero @@ -161,11 +193,15 @@ define <4 x i32> @var_shuffle_v4i32_v4i32_xxxx_i32(<4 x i32> %x, i32 %i0, i32 %i ; ; SSE41-LABEL: var_shuffle_v4i32_v4i32_xxxx_i32: ; SSE41: # BB#0: -; SSE41-NEXT: movslq %edi, %rax -; SSE41-NEXT: movslq %esi, %rsi -; SSE41-NEXT: movslq %edx, %rdx +; SSE41-NEXT: # kill: %ECX %ECX %RCX +; SSE41-NEXT: # kill: %EDX %EDX %RDX +; SSE41-NEXT: # kill: %ESI %ESI %RSI +; SSE41-NEXT: # kill: %EDI %EDI %RDI +; SSE41-NEXT: andl $3, %edi +; SSE41-NEXT: andl $3, %esi +; SSE41-NEXT: andl $3, %edx ; SSE41-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE41-NEXT: movslq %ecx, %rcx +; SSE41-NEXT: andl $3, %ecx ; SSE41-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; SSE41-NEXT: pinsrd $1, -24(%rsp,%rsi,4), %xmm0 ; SSE41-NEXT: pinsrd $2, -24(%rsp,%rdx,4), %xmm0 @@ -174,11 +210,15 @@ define <4 x i32> @var_shuffle_v4i32_v4i32_xxxx_i32(<4 x i32> %x, i32 %i0, i32 %i ; ; AVX-LABEL: 
var_shuffle_v4i32_v4i32_xxxx_i32: ; AVX: # BB#0: -; AVX-NEXT: movslq %edi, %rax -; AVX-NEXT: movslq %esi, %rsi -; AVX-NEXT: movslq %edx, %rdx +; AVX-NEXT: # kill: %ECX %ECX %RCX +; AVX-NEXT: # kill: %EDX %EDX %RDX +; AVX-NEXT: # kill: %ESI %ESI %RSI +; AVX-NEXT: # kill: %EDI %EDI %RDI +; AVX-NEXT: andl $3, %edi +; AVX-NEXT: andl $3, %esi +; AVX-NEXT: andl $3, %edx ; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX-NEXT: movslq %ecx, %rcx +; AVX-NEXT: andl $3, %ecx ; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; AVX-NEXT: vpinsrd $1, -24(%rsp,%rsi,4), %xmm0, %xmm0 ; AVX-NEXT: vpinsrd $2, -24(%rsp,%rdx,4), %xmm0, %xmm0 @@ -204,34 +244,36 @@ define <8 x i16> @var_shuffle_v8i16_v8i16_xxxxxxxx_i16(<8 x i16> %x, i16 %i0, i1 ; SSE2-NEXT: # kill: %EDX %EDX %RDX ; SSE2-NEXT: # kill: %ESI %ESI %RSI ; SSE2-NEXT: # kill: %EDI %EDI %RDI -; SSE2-NEXT: movswq %di, %rax -; SSE2-NEXT: movswq %si, %rsi -; SSE2-NEXT: movswq %dx, %rdx -; SSE2-NEXT: movswq %cx, %r10 -; SSE2-NEXT: movswq %r8w, %r11 +; SSE2-NEXT: andl $7, %edi +; SSE2-NEXT: andl $7, %esi +; SSE2-NEXT: andl $7, %edx +; SSE2-NEXT: andl $7, %ecx +; SSE2-NEXT: andl $7, %r8d ; SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE2-NEXT: movswq %r9w, %r8 -; SSE2-NEXT: movswq {{[0-9]+}}(%rsp), %rcx -; SSE2-NEXT: movswq {{[0-9]+}}(%rsp), %rdi -; SSE2-NEXT: movzwl -24(%rsp,%rcx,2), %ecx -; SSE2-NEXT: movzwl -24(%rsp,%rdi,2), %edi +; SSE2-NEXT: andl $7, %r9d +; SSE2-NEXT: movzwl {{[0-9]+}}(%rsp), %r10d +; SSE2-NEXT: andl $7, %r10d +; SSE2-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; SSE2-NEXT: andl $7, %eax +; SSE2-NEXT: movzwl -24(%rsp,%r10,2), %r10d ; SSE2-NEXT: movzwl -24(%rsp,%rax,2), %eax +; SSE2-NEXT: movzwl -24(%rsp,%rdi,2), %edi ; SSE2-NEXT: movzwl -24(%rsp,%rsi,2), %esi -; SSE2-NEXT: movd %ecx, %xmm0 -; SSE2-NEXT: movzwl -24(%rsp,%rdx,2), %ecx -; SSE2-NEXT: movd %ecx, %xmm1 +; SSE2-NEXT: movd %r10d, %xmm0 +; SSE2-NEXT: movzwl -24(%rsp,%rdx,2), %edx +; SSE2-NEXT: movd %edx, %xmm1 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE2-NEXT: movzwl -24(%rsp,%r10,2), %ecx -; SSE2-NEXT: movd %eax, %xmm0 -; SSE2-NEXT: movzwl -24(%rsp,%r11,2), %eax -; SSE2-NEXT: movd %eax, %xmm2 +; SSE2-NEXT: movzwl -24(%rsp,%rcx,2), %ecx +; SSE2-NEXT: movd %edi, %xmm0 +; SSE2-NEXT: movzwl -24(%rsp,%r8,2), %edx +; SSE2-NEXT: movd %edx, %xmm2 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE2-NEXT: movd %edi, %xmm1 +; SSE2-NEXT: movd %eax, %xmm1 ; SSE2-NEXT: movd %ecx, %xmm2 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; SSE2-NEXT: movd %esi, %xmm1 -; SSE2-NEXT: movzwl -24(%rsp,%r8,2), %eax +; SSE2-NEXT: movzwl -24(%rsp,%r9,2), %eax ; SSE2-NEXT: movd %eax, %xmm3 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] ; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] @@ -246,34 +288,36 @@ define <8 x i16> @var_shuffle_v8i16_v8i16_xxxxxxxx_i16(<8 x i16> %x, i16 %i0, i1 ; SSSE3-NEXT: # kill: %EDX %EDX %RDX ; SSSE3-NEXT: # kill: %ESI %ESI %RSI ; SSSE3-NEXT: # kill: %EDI %EDI %RDI -; SSSE3-NEXT: movswq %di, %rax -; SSSE3-NEXT: movswq %si, %rsi -; SSSE3-NEXT: movswq %dx, %rdx -; SSSE3-NEXT: movswq %cx, %r10 -; SSSE3-NEXT: movswq %r8w, %r11 +; SSSE3-NEXT: andl $7, %edi +; SSSE3-NEXT: andl $7, %esi +; 
SSSE3-NEXT: andl $7, %edx +; SSSE3-NEXT: andl $7, %ecx +; SSSE3-NEXT: andl $7, %r8d ; SSSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSSE3-NEXT: movswq %r9w, %r8 -; SSSE3-NEXT: movswq {{[0-9]+}}(%rsp), %rcx -; SSSE3-NEXT: movswq {{[0-9]+}}(%rsp), %rdi -; SSSE3-NEXT: movzwl -24(%rsp,%rcx,2), %ecx -; SSSE3-NEXT: movzwl -24(%rsp,%rdi,2), %edi +; SSSE3-NEXT: andl $7, %r9d +; SSSE3-NEXT: movzwl {{[0-9]+}}(%rsp), %r10d +; SSSE3-NEXT: andl $7, %r10d +; SSSE3-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; SSSE3-NEXT: andl $7, %eax +; SSSE3-NEXT: movzwl -24(%rsp,%r10,2), %r10d ; SSSE3-NEXT: movzwl -24(%rsp,%rax,2), %eax +; SSSE3-NEXT: movzwl -24(%rsp,%rdi,2), %edi ; SSSE3-NEXT: movzwl -24(%rsp,%rsi,2), %esi -; SSSE3-NEXT: movd %ecx, %xmm0 -; SSSE3-NEXT: movzwl -24(%rsp,%rdx,2), %ecx -; SSSE3-NEXT: movd %ecx, %xmm1 +; SSSE3-NEXT: movd %r10d, %xmm0 +; SSSE3-NEXT: movzwl -24(%rsp,%rdx,2), %edx +; SSSE3-NEXT: movd %edx, %xmm1 ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSSE3-NEXT: movzwl -24(%rsp,%r10,2), %ecx -; SSSE3-NEXT: movd %eax, %xmm0 -; SSSE3-NEXT: movzwl -24(%rsp,%r11,2), %eax -; SSSE3-NEXT: movd %eax, %xmm2 +; SSSE3-NEXT: movzwl -24(%rsp,%rcx,2), %ecx +; SSSE3-NEXT: movd %edi, %xmm0 +; SSSE3-NEXT: movzwl -24(%rsp,%r8,2), %edx +; SSSE3-NEXT: movd %edx, %xmm2 ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSSE3-NEXT: movd %edi, %xmm1 +; SSSE3-NEXT: movd %eax, %xmm1 ; SSSE3-NEXT: movd %ecx, %xmm2 ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; SSSE3-NEXT: movd %esi, %xmm1 -; SSSE3-NEXT: movzwl -24(%rsp,%r8,2), %eax +; SSSE3-NEXT: movzwl -24(%rsp,%r9,2), %eax ; SSSE3-NEXT: movd %eax, %xmm3 ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] @@ -282,68 +326,66 @@ define <8 x i16> @var_shuffle_v8i16_v8i16_xxxxxxxx_i16(<8 x i16> %x, i16 %i0, i1 ; ; SSE41-LABEL: var_shuffle_v8i16_v8i16_xxxxxxxx_i16: ; SSE41: # BB#0: -; SSE41-NEXT: pushq %rbx ; SSE41-NEXT: # kill: %R9D %R9D %R9 ; SSE41-NEXT: # kill: %R8D %R8D %R8 ; SSE41-NEXT: # kill: %ECX %ECX %RCX ; SSE41-NEXT: # kill: %EDX %EDX %RDX ; SSE41-NEXT: # kill: %ESI %ESI %RSI ; SSE41-NEXT: # kill: %EDI %EDI %RDI -; SSE41-NEXT: movswq %di, %rax -; SSE41-NEXT: movswq %si, %rbx -; SSE41-NEXT: movswq %dx, %r11 -; SSE41-NEXT: movswq %cx, %r10 -; SSE41-NEXT: movswq %r8w, %rdi +; SSE41-NEXT: andl $7, %edi +; SSE41-NEXT: andl $7, %esi +; SSE41-NEXT: andl $7, %edx +; SSE41-NEXT: andl $7, %ecx +; SSE41-NEXT: andl $7, %r8d ; SSE41-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE41-NEXT: movswq %r9w, %rcx -; SSE41-NEXT: movswq {{[0-9]+}}(%rsp), %rdx -; SSE41-NEXT: movswq {{[0-9]+}}(%rsp), %rsi -; SSE41-NEXT: movzwl -16(%rsp,%rdx,2), %edx -; SSE41-NEXT: movzwl -16(%rsp,%rsi,2), %esi -; SSE41-NEXT: movzwl -16(%rsp,%rax,2), %eax -; SSE41-NEXT: movd %eax, %xmm0 -; SSE41-NEXT: pinsrw $1, -16(%rsp,%rbx,2), %xmm0 -; SSE41-NEXT: pinsrw $2, -16(%rsp,%r11,2), %xmm0 -; SSE41-NEXT: pinsrw $3, -16(%rsp,%r10,2), %xmm0 -; SSE41-NEXT: pinsrw $4, -16(%rsp,%rdi,2), %xmm0 -; SSE41-NEXT: pinsrw $5, -16(%rsp,%rcx,2), %xmm0 -; SSE41-NEXT: pinsrw $6, %edx, %xmm0 -; SSE41-NEXT: pinsrw $7, %esi, %xmm0 -; SSE41-NEXT: popq %rbx +; SSE41-NEXT: andl $7, %r9d +; SSE41-NEXT: 
movzwl {{[0-9]+}}(%rsp), %r10d +; SSE41-NEXT: andl $7, %r10d +; SSE41-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; SSE41-NEXT: andl $7, %eax +; SSE41-NEXT: movzwl -24(%rsp,%r10,2), %r10d +; SSE41-NEXT: movzwl -24(%rsp,%rax,2), %eax +; SSE41-NEXT: movzwl -24(%rsp,%rdi,2), %edi +; SSE41-NEXT: movd %edi, %xmm0 +; SSE41-NEXT: pinsrw $1, -24(%rsp,%rsi,2), %xmm0 +; SSE41-NEXT: pinsrw $2, -24(%rsp,%rdx,2), %xmm0 +; SSE41-NEXT: pinsrw $3, -24(%rsp,%rcx,2), %xmm0 +; SSE41-NEXT: pinsrw $4, -24(%rsp,%r8,2), %xmm0 +; SSE41-NEXT: pinsrw $5, -24(%rsp,%r9,2), %xmm0 +; SSE41-NEXT: pinsrw $6, %r10d, %xmm0 +; SSE41-NEXT: pinsrw $7, %eax, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: var_shuffle_v8i16_v8i16_xxxxxxxx_i16: ; AVX: # BB#0: -; AVX-NEXT: pushq %r14 -; AVX-NEXT: pushq %rbx ; AVX-NEXT: # kill: %R9D %R9D %R9 ; AVX-NEXT: # kill: %R8D %R8D %R8 ; AVX-NEXT: # kill: %ECX %ECX %RCX ; AVX-NEXT: # kill: %EDX %EDX %RDX ; AVX-NEXT: # kill: %ESI %ESI %RSI ; AVX-NEXT: # kill: %EDI %EDI %RDI -; AVX-NEXT: movswq %di, %r10 -; AVX-NEXT: movswq %si, %r11 -; AVX-NEXT: movswq %dx, %r14 -; AVX-NEXT: movswq %cx, %rcx -; AVX-NEXT: movswq %r8w, %rdi +; AVX-NEXT: andl $7, %edi +; AVX-NEXT: andl $7, %esi +; AVX-NEXT: andl $7, %edx +; AVX-NEXT: andl $7, %ecx +; AVX-NEXT: andl $7, %r8d ; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX-NEXT: movswq %r9w, %rax -; AVX-NEXT: movswq {{[0-9]+}}(%rsp), %rsi -; AVX-NEXT: movswq {{[0-9]+}}(%rsp), %rdx -; AVX-NEXT: movzwl -24(%rsp,%rsi,2), %esi -; AVX-NEXT: movzwl -24(%rsp,%rdx,2), %edx -; AVX-NEXT: movzwl -24(%rsp,%r10,2), %ebx -; AVX-NEXT: vmovd %ebx, %xmm0 -; AVX-NEXT: vpinsrw $1, -24(%rsp,%r11,2), %xmm0, %xmm0 -; AVX-NEXT: vpinsrw $2, -24(%rsp,%r14,2), %xmm0, %xmm0 +; AVX-NEXT: andl $7, %r9d +; AVX-NEXT: movzwl {{[0-9]+}}(%rsp), %r10d +; AVX-NEXT: andl $7, %r10d +; AVX-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; AVX-NEXT: andl $7, %eax +; AVX-NEXT: movzwl -24(%rsp,%r10,2), %r10d +; AVX-NEXT: movzwl -24(%rsp,%rax,2), %eax +; AVX-NEXT: movzwl -24(%rsp,%rdi,2), %edi +; AVX-NEXT: vmovd %edi, %xmm0 +; AVX-NEXT: vpinsrw $1, -24(%rsp,%rsi,2), %xmm0, %xmm0 +; AVX-NEXT: vpinsrw $2, -24(%rsp,%rdx,2), %xmm0, %xmm0 ; AVX-NEXT: vpinsrw $3, -24(%rsp,%rcx,2), %xmm0, %xmm0 -; AVX-NEXT: vpinsrw $4, -24(%rsp,%rdi,2), %xmm0, %xmm0 -; AVX-NEXT: vpinsrw $5, -24(%rsp,%rax,2), %xmm0, %xmm0 -; AVX-NEXT: vpinsrw $6, %esi, %xmm0, %xmm0 -; AVX-NEXT: vpinsrw $7, %edx, %xmm0, %xmm0 -; AVX-NEXT: popq %rbx -; AVX-NEXT: popq %r14 +; AVX-NEXT: vpinsrw $4, -24(%rsp,%r8,2), %xmm0, %xmm0 +; AVX-NEXT: vpinsrw $5, -24(%rsp,%r9,2), %xmm0, %xmm0 +; AVX-NEXT: vpinsrw $6, %r10d, %xmm0, %xmm0 +; AVX-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 ; AVX-NEXT: retq %x0 = extractelement <8 x i16> %x, i16 %i0 %x1 = extractelement <8 x i16> %x, i16 %i1 @@ -374,54 +416,64 @@ define <16 x i8> @var_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8(<16 x i8> %x, i8 % ; SSE2-NEXT: # kill: %ESI %ESI %RSI ; SSE2-NEXT: # kill: %EDI %EDI %RDI ; SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE2-NEXT: movsbq {{[0-9]+}}(%rsp), %r10 +; SSE2-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d +; SSE2-NEXT: andl $15, %r10d ; SSE2-NEXT: leaq -{{[0-9]+}}(%rsp), %r11 ; SSE2-NEXT: movzbl (%r10,%r11), %eax ; SSE2-NEXT: movd %eax, %xmm15 -; SSE2-NEXT: movsbq {{[0-9]+}}(%rsp), %rax +; SSE2-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; SSE2-NEXT: andl $15, %eax ; SSE2-NEXT: movzbl (%rax,%r11), %eax ; SSE2-NEXT: movd %eax, %xmm8 -; SSE2-NEXT: movsbq {{[0-9]+}}(%rsp), %rax +; SSE2-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; SSE2-NEXT: andl $15, %eax ; SSE2-NEXT: movzbl (%rax,%r11), %eax ; SSE2-NEXT: movd %eax, 
%xmm9 -; SSE2-NEXT: movsbq %dl, %rax -; SSE2-NEXT: movzbl (%rax,%r11), %eax +; SSE2-NEXT: andl $15, %edx +; SSE2-NEXT: movzbl (%rdx,%r11), %eax ; SSE2-NEXT: movd %eax, %xmm3 -; SSE2-NEXT: movsbq {{[0-9]+}}(%rsp), %rax +; SSE2-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; SSE2-NEXT: andl $15, %eax ; SSE2-NEXT: movzbl (%rax,%r11), %eax ; SSE2-NEXT: movd %eax, %xmm10 -; SSE2-NEXT: movsbq %dil, %rax -; SSE2-NEXT: movzbl (%rax,%r11), %eax +; SSE2-NEXT: andl $15, %edi +; SSE2-NEXT: movzbl (%rdi,%r11), %eax ; SSE2-NEXT: movd %eax, %xmm0 -; SSE2-NEXT: movsbq {{[0-9]+}}(%rsp), %rax +; SSE2-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; SSE2-NEXT: andl $15, %eax ; SSE2-NEXT: movzbl (%rax,%r11), %eax ; SSE2-NEXT: movd %eax, %xmm11 -; SSE2-NEXT: movsbq %r8b, %rax -; SSE2-NEXT: movzbl (%rax,%r11), %eax +; SSE2-NEXT: andl $15, %r8d +; SSE2-NEXT: movzbl (%r8,%r11), %eax ; SSE2-NEXT: movd %eax, %xmm7 -; SSE2-NEXT: movsbq {{[0-9]+}}(%rsp), %rax +; SSE2-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; SSE2-NEXT: andl $15, %eax ; SSE2-NEXT: movzbl (%rax,%r11), %eax ; SSE2-NEXT: movd %eax, %xmm2 -; SSE2-NEXT: movsbq {{[0-9]+}}(%rsp), %rax +; SSE2-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; SSE2-NEXT: andl $15, %eax ; SSE2-NEXT: movzbl (%rax,%r11), %eax ; SSE2-NEXT: movd %eax, %xmm12 -; SSE2-NEXT: movsbq {{[0-9]+}}(%rsp), %rax +; SSE2-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; SSE2-NEXT: andl $15, %eax ; SSE2-NEXT: movzbl (%rax,%r11), %eax ; SSE2-NEXT: movd %eax, %xmm13 -; SSE2-NEXT: movsbq %cl, %rax -; SSE2-NEXT: movzbl (%rax,%r11), %eax +; SSE2-NEXT: andl $15, %ecx +; SSE2-NEXT: movzbl (%rcx,%r11), %eax ; SSE2-NEXT: movd %eax, %xmm6 -; SSE2-NEXT: movsbq {{[0-9]+}}(%rsp), %rax +; SSE2-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; SSE2-NEXT: andl $15, %eax ; SSE2-NEXT: movzbl (%rax,%r11), %eax ; SSE2-NEXT: movd %eax, %xmm14 -; SSE2-NEXT: movsbq %sil, %rax -; SSE2-NEXT: movzbl (%rax,%r11), %eax +; SSE2-NEXT: andl $15, %esi +; SSE2-NEXT: movzbl (%rsi,%r11), %eax ; SSE2-NEXT: movd %eax, %xmm5 -; SSE2-NEXT: movsbq {{[0-9]+}}(%rsp), %rax +; SSE2-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; SSE2-NEXT: andl $15, %eax ; SSE2-NEXT: movzbl (%rax,%r11), %eax ; SSE2-NEXT: movd %eax, %xmm4 -; SSE2-NEXT: movsbq %r9b, %rax -; SSE2-NEXT: movzbl (%rax,%r11), %eax +; SSE2-NEXT: andl $15, %r9d +; SSE2-NEXT: movzbl (%r9,%r11), %eax ; SSE2-NEXT: movd %eax, %xmm1 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3],xmm15[4],xmm8[4],xmm15[5],xmm8[5],xmm15[6],xmm8[6],xmm15[7],xmm8[7] ; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7] @@ -449,54 +501,64 @@ define <16 x i8> @var_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8(<16 x i8> %x, i8 % ; SSSE3-NEXT: # kill: %ESI %ESI %RSI ; SSSE3-NEXT: # kill: %EDI %EDI %RDI ; SSSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSSE3-NEXT: movsbq {{[0-9]+}}(%rsp), %r10 +; SSSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d +; SSSE3-NEXT: andl $15, %r10d ; SSSE3-NEXT: leaq -{{[0-9]+}}(%rsp), %r11 ; SSSE3-NEXT: movzbl (%r10,%r11), %eax ; SSSE3-NEXT: movd %eax, %xmm15 -; SSSE3-NEXT: movsbq {{[0-9]+}}(%rsp), %rax +; SSSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; SSSE3-NEXT: andl $15, %eax ; SSSE3-NEXT: movzbl (%rax,%r11), %eax ; SSSE3-NEXT: movd %eax, %xmm8 -; SSSE3-NEXT: movsbq {{[0-9]+}}(%rsp), %rax +; SSSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; SSSE3-NEXT: andl $15, %eax ; SSSE3-NEXT: movzbl (%rax,%r11), %eax ; SSSE3-NEXT: movd %eax, %xmm9 -; SSSE3-NEXT: movsbq %dl, %rax -; SSSE3-NEXT: movzbl 
(%rax,%r11), %eax +; SSSE3-NEXT: andl $15, %edx +; SSSE3-NEXT: movzbl (%rdx,%r11), %eax ; SSSE3-NEXT: movd %eax, %xmm3 -; SSSE3-NEXT: movsbq {{[0-9]+}}(%rsp), %rax +; SSSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; SSSE3-NEXT: andl $15, %eax ; SSSE3-NEXT: movzbl (%rax,%r11), %eax ; SSSE3-NEXT: movd %eax, %xmm10 -; SSSE3-NEXT: movsbq %dil, %rax -; SSSE3-NEXT: movzbl (%rax,%r11), %eax +; SSSE3-NEXT: andl $15, %edi +; SSSE3-NEXT: movzbl (%rdi,%r11), %eax ; SSSE3-NEXT: movd %eax, %xmm0 -; SSSE3-NEXT: movsbq {{[0-9]+}}(%rsp), %rax +; SSSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; SSSE3-NEXT: andl $15, %eax ; SSSE3-NEXT: movzbl (%rax,%r11), %eax ; SSSE3-NEXT: movd %eax, %xmm11 -; SSSE3-NEXT: movsbq %r8b, %rax -; SSSE3-NEXT: movzbl (%rax,%r11), %eax +; SSSE3-NEXT: andl $15, %r8d +; SSSE3-NEXT: movzbl (%r8,%r11), %eax ; SSSE3-NEXT: movd %eax, %xmm7 -; SSSE3-NEXT: movsbq {{[0-9]+}}(%rsp), %rax +; SSSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; SSSE3-NEXT: andl $15, %eax ; SSSE3-NEXT: movzbl (%rax,%r11), %eax ; SSSE3-NEXT: movd %eax, %xmm2 -; SSSE3-NEXT: movsbq {{[0-9]+}}(%rsp), %rax +; SSSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; SSSE3-NEXT: andl $15, %eax ; SSSE3-NEXT: movzbl (%rax,%r11), %eax ; SSSE3-NEXT: movd %eax, %xmm12 -; SSSE3-NEXT: movsbq {{[0-9]+}}(%rsp), %rax +; SSSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; SSSE3-NEXT: andl $15, %eax ; SSSE3-NEXT: movzbl (%rax,%r11), %eax ; SSSE3-NEXT: movd %eax, %xmm13 -; SSSE3-NEXT: movsbq %cl, %rax -; SSSE3-NEXT: movzbl (%rax,%r11), %eax +; SSSE3-NEXT: andl $15, %ecx +; SSSE3-NEXT: movzbl (%rcx,%r11), %eax ; SSSE3-NEXT: movd %eax, %xmm6 -; SSSE3-NEXT: movsbq {{[0-9]+}}(%rsp), %rax +; SSSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; SSSE3-NEXT: andl $15, %eax ; SSSE3-NEXT: movzbl (%rax,%r11), %eax ; SSSE3-NEXT: movd %eax, %xmm14 -; SSSE3-NEXT: movsbq %sil, %rax -; SSSE3-NEXT: movzbl (%rax,%r11), %eax +; SSSE3-NEXT: andl $15, %esi +; SSSE3-NEXT: movzbl (%rsi,%r11), %eax ; SSSE3-NEXT: movd %eax, %xmm5 -; SSSE3-NEXT: movsbq {{[0-9]+}}(%rsp), %rax +; SSSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; SSSE3-NEXT: andl $15, %eax ; SSSE3-NEXT: movzbl (%rax,%r11), %eax ; SSSE3-NEXT: movd %eax, %xmm4 -; SSSE3-NEXT: movsbq %r9b, %rax -; SSSE3-NEXT: movzbl (%rax,%r11), %eax +; SSSE3-NEXT: andl $15, %r9d +; SSSE3-NEXT: movzbl (%r9,%r11), %eax ; SSSE3-NEXT: movd %eax, %xmm1 ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3],xmm15[4],xmm8[4],xmm15[5],xmm8[5],xmm15[6],xmm8[6],xmm15[7],xmm8[7] ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7] @@ -520,7 +582,6 @@ define <16 x i8> @var_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8(<16 x i8> %x, i8 % ; SSE41-NEXT: pushq %rbp ; SSE41-NEXT: pushq %r15 ; SSE41-NEXT: pushq %r14 -; SSE41-NEXT: pushq %r13 ; SSE41-NEXT: pushq %r12 ; SSE41-NEXT: pushq %rbx ; SSE41-NEXT: # kill: %R9D %R9D %R9 @@ -529,54 +590,63 @@ define <16 x i8> @var_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8(<16 x i8> %x, i8 % ; SSE41-NEXT: # kill: %EDX %EDX %RDX ; SSE41-NEXT: # kill: %ESI %ESI %RSI ; SSE41-NEXT: # kill: %EDI %EDI %RDI -; SSE41-NEXT: movsbq %dil, %r15 -; SSE41-NEXT: movsbq %sil, %r14 -; SSE41-NEXT: movsbq %dl, %r11 -; SSE41-NEXT: movsbq %cl, %r10 -; SSE41-NEXT: movsbq %r8b, %r8 +; SSE41-NEXT: andl $15, %edi +; SSE41-NEXT: andl $15, %esi +; SSE41-NEXT: andl $15, %edx +; SSE41-NEXT: andl $15, %ecx +; SSE41-NEXT: andl $15, %r8d ; SSE41-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE41-NEXT: movsbq %r9b, 
%r9 -; SSE41-NEXT: movsbq {{[0-9]+}}(%rsp), %r12 -; SSE41-NEXT: movsbq {{[0-9]+}}(%rsp), %r13 -; SSE41-NEXT: movsbq {{[0-9]+}}(%rsp), %rbp -; SSE41-NEXT: movsbq {{[0-9]+}}(%rsp), %rbx +; SSE41-NEXT: andl $15, %r9d +; SSE41-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d +; SSE41-NEXT: andl $15, %r10d +; SSE41-NEXT: movzbl {{[0-9]+}}(%rsp), %r11d +; SSE41-NEXT: andl $15, %r11d +; SSE41-NEXT: movzbl {{[0-9]+}}(%rsp), %r14d +; SSE41-NEXT: andl $15, %r14d +; SSE41-NEXT: movzbl {{[0-9]+}}(%rsp), %r15d +; SSE41-NEXT: andl $15, %r15d ; SSE41-NEXT: leaq -{{[0-9]+}}(%rsp), %rax -; SSE41-NEXT: movzbl (%r15,%rax), %ecx -; SSE41-NEXT: movd %ecx, %xmm0 -; SSE41-NEXT: movsbq {{[0-9]+}}(%rsp), %r15 -; SSE41-NEXT: pinsrb $1, (%r14,%rax), %xmm0 -; SSE41-NEXT: movsbq {{[0-9]+}}(%rsp), %r14 -; SSE41-NEXT: pinsrb $2, (%r11,%rax), %xmm0 -; SSE41-NEXT: movsbq {{[0-9]+}}(%rsp), %r11 -; SSE41-NEXT: pinsrb $3, (%r10,%rax), %xmm0 -; SSE41-NEXT: movsbq {{[0-9]+}}(%rsp), %r10 +; SSE41-NEXT: movzbl (%rdi,%rax), %edi +; SSE41-NEXT: movd %edi, %xmm0 +; SSE41-NEXT: movzbl {{[0-9]+}}(%rsp), %r12d +; SSE41-NEXT: andl $15, %r12d +; SSE41-NEXT: pinsrb $1, (%rsi,%rax), %xmm0 +; SSE41-NEXT: movzbl {{[0-9]+}}(%rsp), %esi +; SSE41-NEXT: andl $15, %esi +; SSE41-NEXT: pinsrb $2, (%rdx,%rax), %xmm0 +; SSE41-NEXT: movzbl {{[0-9]+}}(%rsp), %edx +; SSE41-NEXT: andl $15, %edx +; SSE41-NEXT: pinsrb $3, (%rcx,%rax), %xmm0 +; SSE41-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx +; SSE41-NEXT: andl $15, %ecx ; SSE41-NEXT: pinsrb $4, (%r8,%rax), %xmm0 -; SSE41-NEXT: movsbq {{[0-9]+}}(%rsp), %rcx +; SSE41-NEXT: movzbl {{[0-9]+}}(%rsp), %ebx +; SSE41-NEXT: andl $15, %ebx ; SSE41-NEXT: pinsrb $5, (%r9,%rax), %xmm0 -; SSE41-NEXT: movsbq {{[0-9]+}}(%rsp), %rdx -; SSE41-NEXT: movzbl (%r12,%rax), %esi -; SSE41-NEXT: movzbl (%r13,%rax), %edi -; SSE41-NEXT: movzbl (%rbp,%rax), %ebp -; SSE41-NEXT: movzbl (%rbx,%rax), %ebx -; SSE41-NEXT: movzbl (%r15,%rax), %r8d -; SSE41-NEXT: movzbl (%r14,%rax), %r9d -; SSE41-NEXT: movzbl (%r11,%rax), %r11d -; SSE41-NEXT: movzbl (%r10,%rax), %r10d +; SSE41-NEXT: movzbl {{[0-9]+}}(%rsp), %edi +; SSE41-NEXT: andl $15, %edi +; SSE41-NEXT: movzbl (%r10,%rax), %r8d +; SSE41-NEXT: movzbl (%r11,%rax), %r9d +; SSE41-NEXT: movzbl (%r14,%rax), %r10d +; SSE41-NEXT: movzbl (%r15,%rax), %r11d +; SSE41-NEXT: movzbl (%r12,%rax), %ebp +; SSE41-NEXT: movzbl (%rsi,%rax), %esi +; SSE41-NEXT: movzbl (%rdx,%rax), %edx ; SSE41-NEXT: movzbl (%rcx,%rax), %ecx -; SSE41-NEXT: movzbl (%rdx,%rax), %eax -; SSE41-NEXT: pinsrb $6, %esi, %xmm0 -; SSE41-NEXT: pinsrb $7, %edi, %xmm0 -; SSE41-NEXT: pinsrb $8, %ebp, %xmm0 -; SSE41-NEXT: pinsrb $9, %ebx, %xmm0 -; SSE41-NEXT: pinsrb $10, %r8d, %xmm0 -; SSE41-NEXT: pinsrb $11, %r9d, %xmm0 -; SSE41-NEXT: pinsrb $12, %r11d, %xmm0 -; SSE41-NEXT: pinsrb $13, %r10d, %xmm0 -; SSE41-NEXT: pinsrb $14, %ecx, %xmm0 +; SSE41-NEXT: movzbl (%rbx,%rax), %ebx +; SSE41-NEXT: movzbl (%rdi,%rax), %eax +; SSE41-NEXT: pinsrb $6, %r8d, %xmm0 +; SSE41-NEXT: pinsrb $7, %r9d, %xmm0 +; SSE41-NEXT: pinsrb $8, %r10d, %xmm0 +; SSE41-NEXT: pinsrb $9, %r11d, %xmm0 +; SSE41-NEXT: pinsrb $10, %ebp, %xmm0 +; SSE41-NEXT: pinsrb $11, %esi, %xmm0 +; SSE41-NEXT: pinsrb $12, %edx, %xmm0 +; SSE41-NEXT: pinsrb $13, %ecx, %xmm0 +; SSE41-NEXT: pinsrb $14, %ebx, %xmm0 ; SSE41-NEXT: pinsrb $15, %eax, %xmm0 ; SSE41-NEXT: popq %rbx ; SSE41-NEXT: popq %r12 -; SSE41-NEXT: popq %r13 ; SSE41-NEXT: popq %r14 ; SSE41-NEXT: popq %r15 ; SSE41-NEXT: popq %rbp @@ -587,7 +657,6 @@ define <16 x i8> @var_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8(<16 x i8> %x, i8 % ; AVX-NEXT: pushq %rbp 
; AVX-NEXT: pushq %r15 ; AVX-NEXT: pushq %r14 -; AVX-NEXT: pushq %r13 ; AVX-NEXT: pushq %r12 ; AVX-NEXT: pushq %rbx ; AVX-NEXT: # kill: %R9D %R9D %R9 @@ -596,54 +665,63 @@ define <16 x i8> @var_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8(<16 x i8> %x, i8 % ; AVX-NEXT: # kill: %EDX %EDX %RDX ; AVX-NEXT: # kill: %ESI %ESI %RSI ; AVX-NEXT: # kill: %EDI %EDI %RDI -; AVX-NEXT: movsbq %dil, %r10 -; AVX-NEXT: movsbq %sil, %r11 -; AVX-NEXT: movsbq %dl, %r14 -; AVX-NEXT: movsbq %cl, %r15 -; AVX-NEXT: movsbq %r8b, %r8 +; AVX-NEXT: andl $15, %edi +; AVX-NEXT: andl $15, %esi +; AVX-NEXT: andl $15, %edx +; AVX-NEXT: andl $15, %ecx +; AVX-NEXT: andl $15, %r8d ; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX-NEXT: movsbq %r9b, %r9 -; AVX-NEXT: movsbq {{[0-9]+}}(%rsp), %r12 -; AVX-NEXT: movsbq {{[0-9]+}}(%rsp), %r13 -; AVX-NEXT: movsbq {{[0-9]+}}(%rsp), %rbp -; AVX-NEXT: movsbq {{[0-9]+}}(%rsp), %rcx -; AVX-NEXT: leaq -{{[0-9]+}}(%rsp), %rdi -; AVX-NEXT: movzbl (%r10,%rdi), %eax -; AVX-NEXT: vmovd %eax, %xmm0 -; AVX-NEXT: movsbq {{[0-9]+}}(%rsp), %r10 -; AVX-NEXT: vpinsrb $1, (%r11,%rdi), %xmm0, %xmm0 -; AVX-NEXT: movsbq {{[0-9]+}}(%rsp), %r11 -; AVX-NEXT: vpinsrb $2, (%r14,%rdi), %xmm0, %xmm0 -; AVX-NEXT: movsbq {{[0-9]+}}(%rsp), %r14 -; AVX-NEXT: vpinsrb $3, (%r15,%rdi), %xmm0, %xmm0 -; AVX-NEXT: movsbq {{[0-9]+}}(%rsp), %r15 -; AVX-NEXT: vpinsrb $4, (%r8,%rdi), %xmm0, %xmm0 -; AVX-NEXT: movsbq {{[0-9]+}}(%rsp), %r8 -; AVX-NEXT: vpinsrb $5, (%r9,%rdi), %xmm0, %xmm0 -; AVX-NEXT: movsbq {{[0-9]+}}(%rsp), %rsi -; AVX-NEXT: movzbl (%r12,%rdi), %edx -; AVX-NEXT: movzbl (%r13,%rdi), %ebx -; AVX-NEXT: movzbl (%rbp,%rdi), %ebp -; AVX-NEXT: movzbl (%rcx,%rdi), %ecx -; AVX-NEXT: movzbl (%r10,%rdi), %eax -; AVX-NEXT: movzbl (%r11,%rdi), %r9d -; AVX-NEXT: movzbl (%r14,%rdi), %r10d -; AVX-NEXT: movzbl (%r15,%rdi), %r11d -; AVX-NEXT: movzbl (%r8,%rdi), %r8d -; AVX-NEXT: movzbl (%rsi,%rdi), %esi -; AVX-NEXT: vpinsrb $6, %edx, %xmm0, %xmm0 -; AVX-NEXT: vpinsrb $7, %ebx, %xmm0, %xmm0 -; AVX-NEXT: vpinsrb $8, %ebp, %xmm0, %xmm0 -; AVX-NEXT: vpinsrb $9, %ecx, %xmm0, %xmm0 -; AVX-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 -; AVX-NEXT: vpinsrb $11, %r9d, %xmm0, %xmm0 -; AVX-NEXT: vpinsrb $12, %r10d, %xmm0, %xmm0 -; AVX-NEXT: vpinsrb $13, %r11d, %xmm0, %xmm0 -; AVX-NEXT: vpinsrb $14, %r8d, %xmm0, %xmm0 -; AVX-NEXT: vpinsrb $15, %esi, %xmm0, %xmm0 +; AVX-NEXT: andl $15, %r9d +; AVX-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d +; AVX-NEXT: andl $15, %r10d +; AVX-NEXT: movzbl {{[0-9]+}}(%rsp), %r11d +; AVX-NEXT: andl $15, %r11d +; AVX-NEXT: movzbl {{[0-9]+}}(%rsp), %r14d +; AVX-NEXT: andl $15, %r14d +; AVX-NEXT: movzbl {{[0-9]+}}(%rsp), %r15d +; AVX-NEXT: andl $15, %r15d +; AVX-NEXT: leaq -{{[0-9]+}}(%rsp), %rax +; AVX-NEXT: movzbl (%rdi,%rax), %edi +; AVX-NEXT: vmovd %edi, %xmm0 +; AVX-NEXT: movzbl {{[0-9]+}}(%rsp), %r12d +; AVX-NEXT: andl $15, %r12d +; AVX-NEXT: vpinsrb $1, (%rsi,%rax), %xmm0, %xmm0 +; AVX-NEXT: movzbl {{[0-9]+}}(%rsp), %esi +; AVX-NEXT: andl $15, %esi +; AVX-NEXT: vpinsrb $2, (%rdx,%rax), %xmm0, %xmm0 +; AVX-NEXT: movzbl {{[0-9]+}}(%rsp), %edx +; AVX-NEXT: andl $15, %edx +; AVX-NEXT: vpinsrb $3, (%rcx,%rax), %xmm0, %xmm0 +; AVX-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx +; AVX-NEXT: andl $15, %ecx +; AVX-NEXT: vpinsrb $4, (%r8,%rax), %xmm0, %xmm0 +; AVX-NEXT: movzbl {{[0-9]+}}(%rsp), %ebx +; AVX-NEXT: andl $15, %ebx +; AVX-NEXT: vpinsrb $5, (%r9,%rax), %xmm0, %xmm0 +; AVX-NEXT: movzbl {{[0-9]+}}(%rsp), %edi +; AVX-NEXT: andl $15, %edi +; AVX-NEXT: movzbl (%r10,%rax), %r8d +; AVX-NEXT: movzbl (%r11,%rax), %r9d +; AVX-NEXT: movzbl 
(%r14,%rax), %r10d +; AVX-NEXT: movzbl (%r15,%rax), %r11d +; AVX-NEXT: movzbl (%r12,%rax), %ebp +; AVX-NEXT: movzbl (%rsi,%rax), %esi +; AVX-NEXT: movzbl (%rdx,%rax), %edx +; AVX-NEXT: movzbl (%rcx,%rax), %ecx +; AVX-NEXT: movzbl (%rbx,%rax), %ebx +; AVX-NEXT: movzbl (%rdi,%rax), %eax +; AVX-NEXT: vpinsrb $6, %r8d, %xmm0, %xmm0 +; AVX-NEXT: vpinsrb $7, %r9d, %xmm0, %xmm0 +; AVX-NEXT: vpinsrb $8, %r10d, %xmm0, %xmm0 +; AVX-NEXT: vpinsrb $9, %r11d, %xmm0, %xmm0 +; AVX-NEXT: vpinsrb $10, %ebp, %xmm0, %xmm0 +; AVX-NEXT: vpinsrb $11, %esi, %xmm0, %xmm0 +; AVX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; AVX-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0 +; AVX-NEXT: vpinsrb $14, %ebx, %xmm0, %xmm0 +; AVX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 ; AVX-NEXT: popq %rbx ; AVX-NEXT: popq %r12 -; AVX-NEXT: popq %r13 ; AVX-NEXT: popq %r14 ; AVX-NEXT: popq %r15 ; AVX-NEXT: popq %rbp @@ -690,11 +768,15 @@ define <16 x i8> @var_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8(<16 x i8> %x, i8 % define <4 x i32> @mem_shuffle_v4i32_v4i32_xxxx_i32(<4 x i32> %x, i32* %i) nounwind { ; SSE2-LABEL: mem_shuffle_v4i32_v4i32_xxxx_i32: ; SSE2: # BB#0: -; SSE2-NEXT: movslq (%rdi), %rax +; SSE2-NEXT: movl (%rdi), %eax +; SSE2-NEXT: movl 4(%rdi), %ecx +; SSE2-NEXT: andl $3, %eax ; SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE2-NEXT: movslq 4(%rdi), %rcx -; SSE2-NEXT: movslq 8(%rdi), %rdx -; SSE2-NEXT: movslq 12(%rdi), %rsi +; SSE2-NEXT: andl $3, %ecx +; SSE2-NEXT: movl 8(%rdi), %edx +; SSE2-NEXT: andl $3, %edx +; SSE2-NEXT: movl 12(%rdi), %esi +; SSE2-NEXT: andl $3, %esi ; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero ; SSE2-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero @@ -706,11 +788,15 @@ define <4 x i32> @mem_shuffle_v4i32_v4i32_xxxx_i32(<4 x i32> %x, i32* %i) nounwi ; ; SSSE3-LABEL: mem_shuffle_v4i32_v4i32_xxxx_i32: ; SSSE3: # BB#0: -; SSSE3-NEXT: movslq (%rdi), %rax +; SSSE3-NEXT: movl (%rdi), %eax +; SSSE3-NEXT: movl 4(%rdi), %ecx +; SSSE3-NEXT: andl $3, %eax ; SSSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSSE3-NEXT: movslq 4(%rdi), %rcx -; SSSE3-NEXT: movslq 8(%rdi), %rdx -; SSSE3-NEXT: movslq 12(%rdi), %rsi +; SSSE3-NEXT: andl $3, %ecx +; SSSE3-NEXT: movl 8(%rdi), %edx +; SSSE3-NEXT: andl $3, %edx +; SSSE3-NEXT: movl 12(%rdi), %esi +; SSSE3-NEXT: andl $3, %esi ; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero ; SSSE3-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero @@ -722,11 +808,15 @@ define <4 x i32> @mem_shuffle_v4i32_v4i32_xxxx_i32(<4 x i32> %x, i32* %i) nounwi ; ; SSE41-LABEL: mem_shuffle_v4i32_v4i32_xxxx_i32: ; SSE41: # BB#0: -; SSE41-NEXT: movslq (%rdi), %rax +; SSE41-NEXT: movl (%rdi), %eax +; SSE41-NEXT: movl 4(%rdi), %ecx +; SSE41-NEXT: andl $3, %eax ; SSE41-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE41-NEXT: movslq 4(%rdi), %rcx -; SSE41-NEXT: movslq 8(%rdi), %rdx -; SSE41-NEXT: movslq 12(%rdi), %rsi +; SSE41-NEXT: andl $3, %ecx +; SSE41-NEXT: movl 8(%rdi), %edx +; SSE41-NEXT: andl $3, %edx +; SSE41-NEXT: movl 12(%rdi), %esi +; SSE41-NEXT: andl $3, %esi ; SSE41-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; SSE41-NEXT: pinsrd $1, -24(%rsp,%rcx,4), %xmm0 ; SSE41-NEXT: pinsrd $2, -24(%rsp,%rdx,4), %xmm0 @@ -735,11 +825,15 @@ define <4 x i32> @mem_shuffle_v4i32_v4i32_xxxx_i32(<4 x i32> %x, i32* %i) nounwi ; ; AVX-LABEL: mem_shuffle_v4i32_v4i32_xxxx_i32: ; AVX: # BB#0: -; AVX-NEXT: movslq (%rdi), %rax +; AVX-NEXT: movl (%rdi), %eax +; AVX-NEXT: movl 4(%rdi), %ecx +; AVX-NEXT: andl $3, 
%eax ; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX-NEXT: movslq 4(%rdi), %rcx -; AVX-NEXT: movslq 8(%rdi), %rdx -; AVX-NEXT: movslq 12(%rdi), %rsi +; AVX-NEXT: andl $3, %ecx +; AVX-NEXT: movl 8(%rdi), %edx +; AVX-NEXT: andl $3, %edx +; AVX-NEXT: movl 12(%rdi), %esi +; AVX-NEXT: andl $3, %esi ; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; AVX-NEXT: vpinsrd $1, -24(%rsp,%rcx,4), %xmm0, %xmm0 ; AVX-NEXT: vpinsrd $2, -24(%rsp,%rdx,4), %xmm0, %xmm0 @@ -767,55 +861,71 @@ define <4 x i32> @mem_shuffle_v4i32_v4i32_xxxx_i32(<4 x i32> %x, i32* %i) nounwi define <16 x i8> @mem_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8(<16 x i8> %x, i8* %i) nounwind { ; SSE2-LABEL: mem_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8: ; SSE2: # BB#0: -; SSE2-NEXT: movsbq (%rdi), %rcx +; SSE2-NEXT: movzbl (%rdi), %eax +; SSE2-NEXT: andl $15, %eax ; SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE2-NEXT: leaq -{{[0-9]+}}(%rsp), %rax -; SSE2-NEXT: movzbl (%rcx,%rax), %ecx -; SSE2-NEXT: movd %ecx, %xmm0 -; SSE2-NEXT: movsbq 8(%rdi), %rcx -; SSE2-NEXT: movzbl (%rcx,%rax), %ecx -; SSE2-NEXT: movd %ecx, %xmm8 -; SSE2-NEXT: movsbq 12(%rdi), %rcx -; SSE2-NEXT: movzbl (%rcx,%rax), %ecx -; SSE2-NEXT: movd %ecx, %xmm9 -; SSE2-NEXT: movsbq 4(%rdi), %rcx -; SSE2-NEXT: movzbl (%rcx,%rax), %ecx -; SSE2-NEXT: movd %ecx, %xmm3 -; SSE2-NEXT: movsbq 14(%rdi), %rcx -; SSE2-NEXT: movzbl (%rcx,%rax), %ecx -; SSE2-NEXT: movd %ecx, %xmm10 -; SSE2-NEXT: movsbq 6(%rdi), %rcx -; SSE2-NEXT: movzbl (%rcx,%rax), %ecx -; SSE2-NEXT: movd %ecx, %xmm5 -; SSE2-NEXT: movsbq 10(%rdi), %rcx -; SSE2-NEXT: movzbl (%rcx,%rax), %ecx -; SSE2-NEXT: movd %ecx, %xmm11 -; SSE2-NEXT: movsbq 2(%rdi), %rcx -; SSE2-NEXT: movzbl (%rcx,%rax), %ecx -; SSE2-NEXT: movd %ecx, %xmm7 -; SSE2-NEXT: movsbq 15(%rdi), %rcx -; SSE2-NEXT: movzbl (%rcx,%rax), %ecx -; SSE2-NEXT: movd %ecx, %xmm12 -; SSE2-NEXT: movsbq 7(%rdi), %rcx -; SSE2-NEXT: movzbl (%rcx,%rax), %ecx -; SSE2-NEXT: movd %ecx, %xmm2 -; SSE2-NEXT: movsbq 11(%rdi), %rcx -; SSE2-NEXT: movzbl (%rcx,%rax), %ecx -; SSE2-NEXT: movd %ecx, %xmm13 -; SSE2-NEXT: movsbq 3(%rdi), %rcx -; SSE2-NEXT: movzbl (%rcx,%rax), %ecx -; SSE2-NEXT: movd %ecx, %xmm6 -; SSE2-NEXT: movsbq 13(%rdi), %rcx -; SSE2-NEXT: movzbl (%rcx,%rax), %ecx -; SSE2-NEXT: movd %ecx, %xmm14 -; SSE2-NEXT: movsbq 5(%rdi), %rcx -; SSE2-NEXT: movzbl (%rcx,%rax), %ecx -; SSE2-NEXT: movd %ecx, %xmm4 -; SSE2-NEXT: movsbq 9(%rdi), %rcx -; SSE2-NEXT: movzbl (%rcx,%rax), %ecx -; SSE2-NEXT: movd %ecx, %xmm15 -; SSE2-NEXT: movsbq 1(%rdi), %rcx -; SSE2-NEXT: movzbl (%rcx,%rax), %eax +; SSE2-NEXT: leaq -{{[0-9]+}}(%rsp), %rcx +; SSE2-NEXT: movzbl (%rax,%rcx), %eax +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: movzbl 8(%rdi), %eax +; SSE2-NEXT: andl $15, %eax +; SSE2-NEXT: movzbl (%rax,%rcx), %eax +; SSE2-NEXT: movd %eax, %xmm8 +; SSE2-NEXT: movzbl 12(%rdi), %eax +; SSE2-NEXT: andl $15, %eax +; SSE2-NEXT: movzbl (%rax,%rcx), %eax +; SSE2-NEXT: movd %eax, %xmm9 +; SSE2-NEXT: movzbl 4(%rdi), %eax +; SSE2-NEXT: andl $15, %eax +; SSE2-NEXT: movzbl (%rax,%rcx), %eax +; SSE2-NEXT: movd %eax, %xmm3 +; SSE2-NEXT: movzbl 14(%rdi), %eax +; SSE2-NEXT: andl $15, %eax +; SSE2-NEXT: movzbl (%rax,%rcx), %eax +; SSE2-NEXT: movd %eax, %xmm10 +; SSE2-NEXT: movzbl 6(%rdi), %eax +; SSE2-NEXT: andl $15, %eax +; SSE2-NEXT: movzbl (%rax,%rcx), %eax +; SSE2-NEXT: movd %eax, %xmm5 +; SSE2-NEXT: movzbl 10(%rdi), %eax +; SSE2-NEXT: andl $15, %eax +; SSE2-NEXT: movzbl (%rax,%rcx), %eax +; SSE2-NEXT: movd %eax, %xmm11 +; SSE2-NEXT: movzbl 2(%rdi), %eax +; SSE2-NEXT: andl $15, %eax +; SSE2-NEXT: 
movzbl (%rax,%rcx), %eax +; SSE2-NEXT: movd %eax, %xmm7 +; SSE2-NEXT: movzbl 15(%rdi), %eax +; SSE2-NEXT: andl $15, %eax +; SSE2-NEXT: movzbl (%rax,%rcx), %eax +; SSE2-NEXT: movd %eax, %xmm12 +; SSE2-NEXT: movzbl 7(%rdi), %eax +; SSE2-NEXT: andl $15, %eax +; SSE2-NEXT: movzbl (%rax,%rcx), %eax +; SSE2-NEXT: movd %eax, %xmm2 +; SSE2-NEXT: movzbl 11(%rdi), %eax +; SSE2-NEXT: andl $15, %eax +; SSE2-NEXT: movzbl (%rax,%rcx), %eax +; SSE2-NEXT: movd %eax, %xmm13 +; SSE2-NEXT: movzbl 3(%rdi), %eax +; SSE2-NEXT: andl $15, %eax +; SSE2-NEXT: movzbl (%rax,%rcx), %eax +; SSE2-NEXT: movd %eax, %xmm6 +; SSE2-NEXT: movzbl 13(%rdi), %eax +; SSE2-NEXT: andl $15, %eax +; SSE2-NEXT: movzbl (%rax,%rcx), %eax +; SSE2-NEXT: movd %eax, %xmm14 +; SSE2-NEXT: movzbl 5(%rdi), %eax +; SSE2-NEXT: andl $15, %eax +; SSE2-NEXT: movzbl (%rax,%rcx), %eax +; SSE2-NEXT: movd %eax, %xmm4 +; SSE2-NEXT: movzbl 9(%rdi), %eax +; SSE2-NEXT: andl $15, %eax +; SSE2-NEXT: movzbl (%rax,%rcx), %eax +; SSE2-NEXT: movd %eax, %xmm15 +; SSE2-NEXT: movzbl 1(%rdi), %eax +; SSE2-NEXT: andl $15, %eax +; SSE2-NEXT: movzbl (%rax,%rcx), %eax ; SSE2-NEXT: movd %eax, %xmm1 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3],xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7] ; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7] @@ -836,55 +946,71 @@ define <16 x i8> @mem_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8(<16 x i8> %x, i8* ; ; SSSE3-LABEL: mem_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8: ; SSSE3: # BB#0: -; SSSE3-NEXT: movsbq (%rdi), %rcx +; SSSE3-NEXT: movzbl (%rdi), %eax +; SSSE3-NEXT: andl $15, %eax ; SSSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSSE3-NEXT: leaq -{{[0-9]+}}(%rsp), %rax -; SSSE3-NEXT: movzbl (%rcx,%rax), %ecx -; SSSE3-NEXT: movd %ecx, %xmm0 -; SSSE3-NEXT: movsbq 8(%rdi), %rcx -; SSSE3-NEXT: movzbl (%rcx,%rax), %ecx -; SSSE3-NEXT: movd %ecx, %xmm8 -; SSSE3-NEXT: movsbq 12(%rdi), %rcx -; SSSE3-NEXT: movzbl (%rcx,%rax), %ecx -; SSSE3-NEXT: movd %ecx, %xmm9 -; SSSE3-NEXT: movsbq 4(%rdi), %rcx -; SSSE3-NEXT: movzbl (%rcx,%rax), %ecx -; SSSE3-NEXT: movd %ecx, %xmm3 -; SSSE3-NEXT: movsbq 14(%rdi), %rcx -; SSSE3-NEXT: movzbl (%rcx,%rax), %ecx -; SSSE3-NEXT: movd %ecx, %xmm10 -; SSSE3-NEXT: movsbq 6(%rdi), %rcx -; SSSE3-NEXT: movzbl (%rcx,%rax), %ecx -; SSSE3-NEXT: movd %ecx, %xmm5 -; SSSE3-NEXT: movsbq 10(%rdi), %rcx -; SSSE3-NEXT: movzbl (%rcx,%rax), %ecx -; SSSE3-NEXT: movd %ecx, %xmm11 -; SSSE3-NEXT: movsbq 2(%rdi), %rcx -; SSSE3-NEXT: movzbl (%rcx,%rax), %ecx -; SSSE3-NEXT: movd %ecx, %xmm7 -; SSSE3-NEXT: movsbq 15(%rdi), %rcx -; SSSE3-NEXT: movzbl (%rcx,%rax), %ecx -; SSSE3-NEXT: movd %ecx, %xmm12 -; SSSE3-NEXT: movsbq 7(%rdi), %rcx -; SSSE3-NEXT: movzbl (%rcx,%rax), %ecx -; SSSE3-NEXT: movd %ecx, %xmm2 -; SSSE3-NEXT: movsbq 11(%rdi), %rcx -; SSSE3-NEXT: movzbl (%rcx,%rax), %ecx -; SSSE3-NEXT: movd %ecx, %xmm13 -; SSSE3-NEXT: movsbq 3(%rdi), %rcx -; SSSE3-NEXT: movzbl (%rcx,%rax), %ecx -; SSSE3-NEXT: movd %ecx, %xmm6 -; SSSE3-NEXT: movsbq 13(%rdi), %rcx -; SSSE3-NEXT: movzbl (%rcx,%rax), %ecx -; SSSE3-NEXT: movd %ecx, %xmm14 -; SSSE3-NEXT: movsbq 5(%rdi), %rcx -; SSSE3-NEXT: movzbl (%rcx,%rax), %ecx -; SSSE3-NEXT: movd %ecx, %xmm4 -; SSSE3-NEXT: movsbq 9(%rdi), %rcx -; SSSE3-NEXT: movzbl (%rcx,%rax), %ecx -; SSSE3-NEXT: movd %ecx, %xmm15 -; SSSE3-NEXT: movsbq 1(%rdi), %rcx -; SSSE3-NEXT: movzbl (%rcx,%rax), %eax +; SSSE3-NEXT: leaq -{{[0-9]+}}(%rsp), %rcx 
+; SSSE3-NEXT: movzbl (%rax,%rcx), %eax +; SSSE3-NEXT: movd %eax, %xmm0 +; SSSE3-NEXT: movzbl 8(%rdi), %eax +; SSSE3-NEXT: andl $15, %eax +; SSSE3-NEXT: movzbl (%rax,%rcx), %eax +; SSSE3-NEXT: movd %eax, %xmm8 +; SSSE3-NEXT: movzbl 12(%rdi), %eax +; SSSE3-NEXT: andl $15, %eax +; SSSE3-NEXT: movzbl (%rax,%rcx), %eax +; SSSE3-NEXT: movd %eax, %xmm9 +; SSSE3-NEXT: movzbl 4(%rdi), %eax +; SSSE3-NEXT: andl $15, %eax +; SSSE3-NEXT: movzbl (%rax,%rcx), %eax +; SSSE3-NEXT: movd %eax, %xmm3 +; SSSE3-NEXT: movzbl 14(%rdi), %eax +; SSSE3-NEXT: andl $15, %eax +; SSSE3-NEXT: movzbl (%rax,%rcx), %eax +; SSSE3-NEXT: movd %eax, %xmm10 +; SSSE3-NEXT: movzbl 6(%rdi), %eax +; SSSE3-NEXT: andl $15, %eax +; SSSE3-NEXT: movzbl (%rax,%rcx), %eax +; SSSE3-NEXT: movd %eax, %xmm5 +; SSSE3-NEXT: movzbl 10(%rdi), %eax +; SSSE3-NEXT: andl $15, %eax +; SSSE3-NEXT: movzbl (%rax,%rcx), %eax +; SSSE3-NEXT: movd %eax, %xmm11 +; SSSE3-NEXT: movzbl 2(%rdi), %eax +; SSSE3-NEXT: andl $15, %eax +; SSSE3-NEXT: movzbl (%rax,%rcx), %eax +; SSSE3-NEXT: movd %eax, %xmm7 +; SSSE3-NEXT: movzbl 15(%rdi), %eax +; SSSE3-NEXT: andl $15, %eax +; SSSE3-NEXT: movzbl (%rax,%rcx), %eax +; SSSE3-NEXT: movd %eax, %xmm12 +; SSSE3-NEXT: movzbl 7(%rdi), %eax +; SSSE3-NEXT: andl $15, %eax +; SSSE3-NEXT: movzbl (%rax,%rcx), %eax +; SSSE3-NEXT: movd %eax, %xmm2 +; SSSE3-NEXT: movzbl 11(%rdi), %eax +; SSSE3-NEXT: andl $15, %eax +; SSSE3-NEXT: movzbl (%rax,%rcx), %eax +; SSSE3-NEXT: movd %eax, %xmm13 +; SSSE3-NEXT: movzbl 3(%rdi), %eax +; SSSE3-NEXT: andl $15, %eax +; SSSE3-NEXT: movzbl (%rax,%rcx), %eax +; SSSE3-NEXT: movd %eax, %xmm6 +; SSSE3-NEXT: movzbl 13(%rdi), %eax +; SSSE3-NEXT: andl $15, %eax +; SSSE3-NEXT: movzbl (%rax,%rcx), %eax +; SSSE3-NEXT: movd %eax, %xmm14 +; SSSE3-NEXT: movzbl 5(%rdi), %eax +; SSSE3-NEXT: andl $15, %eax +; SSSE3-NEXT: movzbl (%rax,%rcx), %eax +; SSSE3-NEXT: movd %eax, %xmm4 +; SSSE3-NEXT: movzbl 9(%rdi), %eax +; SSSE3-NEXT: andl $15, %eax +; SSSE3-NEXT: movzbl (%rax,%rcx), %eax +; SSSE3-NEXT: movd %eax, %xmm15 +; SSSE3-NEXT: movzbl 1(%rdi), %eax +; SSSE3-NEXT: andl $15, %eax +; SSSE3-NEXT: movzbl (%rax,%rcx), %eax ; SSSE3-NEXT: movd %eax, %xmm1 ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3],xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7] ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7] @@ -911,55 +1037,75 @@ define <16 x i8> @mem_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8(<16 x i8> %x, i8* ; SSE41-NEXT: pushq %r13 ; SSE41-NEXT: pushq %r12 ; SSE41-NEXT: pushq %rbx -; SSE41-NEXT: movsbq (%rdi), %rax +; SSE41-NEXT: movzbl (%rdi), %r11d +; SSE41-NEXT: andl $15, %r11d ; SSE41-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE41-NEXT: movsbq 1(%rdi), %r15 -; SSE41-NEXT: movsbq 2(%rdi), %r8 -; SSE41-NEXT: movsbq 3(%rdi), %r9 -; SSE41-NEXT: movsbq 4(%rdi), %r10 -; SSE41-NEXT: movsbq 5(%rdi), %r11 -; SSE41-NEXT: movsbq 6(%rdi), %r14 -; SSE41-NEXT: movsbq 7(%rdi), %r12 -; SSE41-NEXT: movsbq 8(%rdi), %r13 -; SSE41-NEXT: movsbq 9(%rdi), %rdx -; SSE41-NEXT: movsbq 10(%rdi), %rcx -; SSE41-NEXT: movsbq 11(%rdi), %rsi -; SSE41-NEXT: movsbq 12(%rdi), %rbx +; SSE41-NEXT: movzbl 1(%rdi), %r9d +; SSE41-NEXT: andl $15, %r9d +; SSE41-NEXT: movzbl 2(%rdi), %eax +; SSE41-NEXT: andl $15, %eax +; SSE41-NEXT: movq %rax, -{{[0-9]+}}(%rsp) # 8-byte Spill +; SSE41-NEXT: movzbl 3(%rdi), %eax +; SSE41-NEXT: andl $15, %eax +; SSE41-NEXT: movq %rax, -{{[0-9]+}}(%rsp) # 
8-byte Spill +; SSE41-NEXT: movzbl 4(%rdi), %r14d +; SSE41-NEXT: andl $15, %r14d +; SSE41-NEXT: movzbl 5(%rdi), %r15d +; SSE41-NEXT: andl $15, %r15d +; SSE41-NEXT: movzbl 6(%rdi), %r12d +; SSE41-NEXT: andl $15, %r12d +; SSE41-NEXT: movzbl 7(%rdi), %r13d +; SSE41-NEXT: andl $15, %r13d +; SSE41-NEXT: movzbl 8(%rdi), %r8d +; SSE41-NEXT: andl $15, %r8d +; SSE41-NEXT: movzbl 9(%rdi), %eax +; SSE41-NEXT: andl $15, %eax +; SSE41-NEXT: movzbl 10(%rdi), %ecx +; SSE41-NEXT: andl $15, %ecx +; SSE41-NEXT: movzbl 11(%rdi), %edx +; SSE41-NEXT: andl $15, %edx +; SSE41-NEXT: movzbl 12(%rdi), %esi +; SSE41-NEXT: andl $15, %esi ; SSE41-NEXT: leaq -{{[0-9]+}}(%rsp), %rbp -; SSE41-NEXT: movzbl (%rax,%rbp), %eax -; SSE41-NEXT: movd %eax, %xmm0 -; SSE41-NEXT: movsbq 13(%rdi), %rax -; SSE41-NEXT: pinsrb $1, (%r15,%rbp), %xmm0 -; SSE41-NEXT: movsbq 14(%rdi), %r15 -; SSE41-NEXT: movsbq 15(%rdi), %rdi -; SSE41-NEXT: movzbl (%rdi,%rbp), %edi -; SSE41-NEXT: movzbl (%r15,%rbp), %r15d -; SSE41-NEXT: movzbl (%rax,%rbp), %eax -; SSE41-NEXT: movzbl (%rbx,%rbp), %ebx +; SSE41-NEXT: movzbl (%r11,%rbp), %ebx +; SSE41-NEXT: movd %ebx, %xmm0 +; SSE41-NEXT: movzbl 13(%rdi), %r11d +; SSE41-NEXT: andl $15, %r11d +; SSE41-NEXT: pinsrb $1, (%r9,%rbp), %xmm0 +; SSE41-NEXT: movzbl 14(%rdi), %ebx +; SSE41-NEXT: andl $15, %ebx +; SSE41-NEXT: movzbl 15(%rdi), %edi +; SSE41-NEXT: andl $15, %edi +; SSE41-NEXT: movzbl (%rdi,%rbp), %r10d +; SSE41-NEXT: movzbl (%rbx,%rbp), %r9d +; SSE41-NEXT: movzbl (%r11,%rbp), %r11d ; SSE41-NEXT: movzbl (%rsi,%rbp), %esi -; SSE41-NEXT: movzbl (%rcx,%rbp), %ecx ; SSE41-NEXT: movzbl (%rdx,%rbp), %edx +; SSE41-NEXT: movzbl (%rcx,%rbp), %ecx +; SSE41-NEXT: movzbl (%rax,%rbp), %eax +; SSE41-NEXT: movzbl (%r8,%rbp), %r8d ; SSE41-NEXT: movzbl (%r13,%rbp), %r13d ; SSE41-NEXT: movzbl (%r12,%rbp), %r12d +; SSE41-NEXT: movzbl (%r15,%rbp), %r15d ; SSE41-NEXT: movzbl (%r14,%rbp), %r14d -; SSE41-NEXT: movzbl (%r11,%rbp), %r11d -; SSE41-NEXT: movzbl (%r10,%rbp), %r10d -; SSE41-NEXT: movzbl (%r9,%rbp), %r9d -; SSE41-NEXT: movzbl (%r8,%rbp), %ebp +; SSE41-NEXT: movq -{{[0-9]+}}(%rsp), %rdi # 8-byte Reload +; SSE41-NEXT: movzbl (%rdi,%rbp), %edi +; SSE41-NEXT: movq -{{[0-9]+}}(%rsp), %rbx # 8-byte Reload +; SSE41-NEXT: movzbl (%rbx,%rbp), %ebp ; SSE41-NEXT: pinsrb $2, %ebp, %xmm0 -; SSE41-NEXT: pinsrb $3, %r9d, %xmm0 -; SSE41-NEXT: pinsrb $4, %r10d, %xmm0 -; SSE41-NEXT: pinsrb $5, %r11d, %xmm0 -; SSE41-NEXT: pinsrb $6, %r14d, %xmm0 -; SSE41-NEXT: pinsrb $7, %r12d, %xmm0 -; SSE41-NEXT: pinsrb $8, %r13d, %xmm0 -; SSE41-NEXT: pinsrb $9, %edx, %xmm0 +; SSE41-NEXT: pinsrb $3, %edi, %xmm0 +; SSE41-NEXT: pinsrb $4, %r14d, %xmm0 +; SSE41-NEXT: pinsrb $5, %r15d, %xmm0 +; SSE41-NEXT: pinsrb $6, %r12d, %xmm0 +; SSE41-NEXT: pinsrb $7, %r13d, %xmm0 +; SSE41-NEXT: pinsrb $8, %r8d, %xmm0 +; SSE41-NEXT: pinsrb $9, %eax, %xmm0 ; SSE41-NEXT: pinsrb $10, %ecx, %xmm0 -; SSE41-NEXT: pinsrb $11, %esi, %xmm0 -; SSE41-NEXT: pinsrb $12, %ebx, %xmm0 -; SSE41-NEXT: pinsrb $13, %eax, %xmm0 -; SSE41-NEXT: pinsrb $14, %r15d, %xmm0 -; SSE41-NEXT: pinsrb $15, %edi, %xmm0 +; SSE41-NEXT: pinsrb $11, %edx, %xmm0 +; SSE41-NEXT: pinsrb $12, %esi, %xmm0 +; SSE41-NEXT: pinsrb $13, %r11d, %xmm0 +; SSE41-NEXT: pinsrb $14, %r9d, %xmm0 +; SSE41-NEXT: pinsrb $15, %r10d, %xmm0 ; SSE41-NEXT: popq %rbx ; SSE41-NEXT: popq %r12 ; SSE41-NEXT: popq %r13 @@ -976,55 +1122,75 @@ define <16 x i8> @mem_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8(<16 x i8> %x, i8* ; AVX-NEXT: pushq %r13 ; AVX-NEXT: pushq %r12 ; AVX-NEXT: pushq %rbx -; AVX-NEXT: movsbq (%rdi), %rsi +; AVX-NEXT: movzbl 
(%rdi), %r11d +; AVX-NEXT: andl $15, %r11d ; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX-NEXT: movsbq 1(%rdi), %r15 -; AVX-NEXT: movsbq 2(%rdi), %r8 -; AVX-NEXT: movsbq 3(%rdi), %r9 -; AVX-NEXT: movsbq 4(%rdi), %r10 -; AVX-NEXT: movsbq 5(%rdi), %r11 -; AVX-NEXT: movsbq 6(%rdi), %r14 -; AVX-NEXT: movsbq 7(%rdi), %r12 -; AVX-NEXT: movsbq 8(%rdi), %r13 -; AVX-NEXT: movsbq 9(%rdi), %rdx -; AVX-NEXT: movsbq 10(%rdi), %rax -; AVX-NEXT: movsbq 11(%rdi), %rcx -; AVX-NEXT: movsbq 12(%rdi), %rbx +; AVX-NEXT: movzbl 1(%rdi), %r9d +; AVX-NEXT: andl $15, %r9d +; AVX-NEXT: movzbl 2(%rdi), %eax +; AVX-NEXT: andl $15, %eax +; AVX-NEXT: movq %rax, -{{[0-9]+}}(%rsp) # 8-byte Spill +; AVX-NEXT: movzbl 3(%rdi), %eax +; AVX-NEXT: andl $15, %eax +; AVX-NEXT: movq %rax, -{{[0-9]+}}(%rsp) # 8-byte Spill +; AVX-NEXT: movzbl 4(%rdi), %r14d +; AVX-NEXT: andl $15, %r14d +; AVX-NEXT: movzbl 5(%rdi), %r15d +; AVX-NEXT: andl $15, %r15d +; AVX-NEXT: movzbl 6(%rdi), %r12d +; AVX-NEXT: andl $15, %r12d +; AVX-NEXT: movzbl 7(%rdi), %r13d +; AVX-NEXT: andl $15, %r13d +; AVX-NEXT: movzbl 8(%rdi), %r8d +; AVX-NEXT: andl $15, %r8d +; AVX-NEXT: movzbl 9(%rdi), %eax +; AVX-NEXT: andl $15, %eax +; AVX-NEXT: movzbl 10(%rdi), %ecx +; AVX-NEXT: andl $15, %ecx +; AVX-NEXT: movzbl 11(%rdi), %edx +; AVX-NEXT: andl $15, %edx +; AVX-NEXT: movzbl 12(%rdi), %esi +; AVX-NEXT: andl $15, %esi ; AVX-NEXT: leaq -{{[0-9]+}}(%rsp), %rbp +; AVX-NEXT: movzbl (%r11,%rbp), %ebx +; AVX-NEXT: vmovd %ebx, %xmm0 +; AVX-NEXT: movzbl 13(%rdi), %r11d +; AVX-NEXT: andl $15, %r11d +; AVX-NEXT: vpinsrb $1, (%r9,%rbp), %xmm0, %xmm0 +; AVX-NEXT: movzbl 14(%rdi), %ebx +; AVX-NEXT: andl $15, %ebx +; AVX-NEXT: movzbl 15(%rdi), %edi +; AVX-NEXT: andl $15, %edi +; AVX-NEXT: movzbl (%rdi,%rbp), %r10d +; AVX-NEXT: movzbl (%rbx,%rbp), %r9d +; AVX-NEXT: movzbl (%r11,%rbp), %r11d ; AVX-NEXT: movzbl (%rsi,%rbp), %esi -; AVX-NEXT: vmovd %esi, %xmm0 -; AVX-NEXT: movsbq 13(%rdi), %rsi -; AVX-NEXT: vpinsrb $1, (%r15,%rbp), %xmm0, %xmm0 -; AVX-NEXT: movsbq 14(%rdi), %r15 -; AVX-NEXT: movsbq 15(%rdi), %rdi -; AVX-NEXT: movzbl (%rdi,%rbp), %edi -; AVX-NEXT: movzbl (%r15,%rbp), %r15d -; AVX-NEXT: movzbl (%rsi,%rbp), %esi -; AVX-NEXT: movzbl (%rbx,%rbp), %ebx +; AVX-NEXT: movzbl (%rdx,%rbp), %edx ; AVX-NEXT: movzbl (%rcx,%rbp), %ecx ; AVX-NEXT: movzbl (%rax,%rbp), %eax -; AVX-NEXT: movzbl (%rdx,%rbp), %edx +; AVX-NEXT: movzbl (%r8,%rbp), %r8d ; AVX-NEXT: movzbl (%r13,%rbp), %r13d ; AVX-NEXT: movzbl (%r12,%rbp), %r12d +; AVX-NEXT: movzbl (%r15,%rbp), %r15d ; AVX-NEXT: movzbl (%r14,%rbp), %r14d -; AVX-NEXT: movzbl (%r11,%rbp), %r11d -; AVX-NEXT: movzbl (%r10,%rbp), %r10d -; AVX-NEXT: movzbl (%r9,%rbp), %r9d -; AVX-NEXT: movzbl (%r8,%rbp), %ebp +; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rdi # 8-byte Reload +; AVX-NEXT: movzbl (%rdi,%rbp), %edi +; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rbx # 8-byte Reload +; AVX-NEXT: movzbl (%rbx,%rbp), %ebp ; AVX-NEXT: vpinsrb $2, %ebp, %xmm0, %xmm0 -; AVX-NEXT: vpinsrb $3, %r9d, %xmm0, %xmm0 -; AVX-NEXT: vpinsrb $4, %r10d, %xmm0, %xmm0 -; AVX-NEXT: vpinsrb $5, %r11d, %xmm0, %xmm0 -; AVX-NEXT: vpinsrb $6, %r14d, %xmm0, %xmm0 -; AVX-NEXT: vpinsrb $7, %r12d, %xmm0, %xmm0 -; AVX-NEXT: vpinsrb $8, %r13d, %xmm0, %xmm0 -; AVX-NEXT: vpinsrb $9, %edx, %xmm0, %xmm0 -; AVX-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 -; AVX-NEXT: vpinsrb $11, %ecx, %xmm0, %xmm0 -; AVX-NEXT: vpinsrb $12, %ebx, %xmm0, %xmm0 -; AVX-NEXT: vpinsrb $13, %esi, %xmm0, %xmm0 -; AVX-NEXT: vpinsrb $14, %r15d, %xmm0, %xmm0 -; AVX-NEXT: vpinsrb $15, %edi, %xmm0, %xmm0 +; AVX-NEXT: vpinsrb $3, %edi, 
%xmm0, %xmm0 +; AVX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; AVX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; AVX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; AVX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; AVX-NEXT: vpinsrb $8, %r8d, %xmm0, %xmm0 +; AVX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 +; AVX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 +; AVX-NEXT: vpinsrb $11, %edx, %xmm0, %xmm0 +; AVX-NEXT: vpinsrb $12, %esi, %xmm0, %xmm0 +; AVX-NEXT: vpinsrb $13, %r11d, %xmm0, %xmm0 +; AVX-NEXT: vpinsrb $14, %r9d, %xmm0, %xmm0 +; AVX-NEXT: vpinsrb $15, %r10d, %xmm0, %xmm0 ; AVX-NEXT: popq %rbx ; AVX-NEXT: popq %r12 ; AVX-NEXT: popq %r13 @@ -1106,11 +1272,14 @@ define <16 x i8> @mem_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8(<16 x i8> %x, i8* define <4 x float> @var_shuffle_v4f32_v4f32_x0yx_i32(<4 x float> %x, <4 x float> %y, i32 %i0, i32 %i1, i32 %i2, i32 %i3) nounwind { ; SSE-LABEL: var_shuffle_v4f32_v4f32_x0yx_i32: ; SSE: # BB#0: -; SSE-NEXT: movslq %edi, %rax +; SSE-NEXT: # kill: %ECX %ECX %RCX +; SSE-NEXT: # kill: %EDX %EDX %RDX +; SSE-NEXT: # kill: %EDI %EDI %RDI +; SSE-NEXT: andl $3, %edi ; SSE-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movslq %edx, %rdx +; SSE-NEXT: andl $3, %edx ; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movslq %ecx, %rcx +; SSE-NEXT: andl $3, %ecx ; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; SSE-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero @@ -1120,11 +1289,14 @@ define <4 x float> @var_shuffle_v4f32_v4f32_x0yx_i32(<4 x float> %x, <4 x float> ; ; AVX-LABEL: var_shuffle_v4f32_v4f32_x0yx_i32: ; AVX: # BB#0: -; AVX-NEXT: movslq %edi, %rax +; AVX-NEXT: # kill: %ECX %ECX %RCX +; AVX-NEXT: # kill: %EDX %EDX %RDX +; AVX-NEXT: # kill: %EDI %EDI %RDI +; AVX-NEXT: andl $3, %edi ; AVX-NEXT: vmovaps %xmm1, -{{[0-9]+}}(%rsp) -; AVX-NEXT: movslq %edx, %rdx +; AVX-NEXT: andl $3, %edx ; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX-NEXT: movslq %ecx, %rcx +; AVX-NEXT: andl $3, %ecx ; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; AVX-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero @@ -1151,31 +1323,31 @@ define <8 x i16> @var_shuffle_v8i16_v8i16_xyxyxy00_i16(<8 x i16> %x, <8 x i16> % ; SSE2-NEXT: # kill: %EDX %EDX %RDX ; SSE2-NEXT: # kill: %ESI %ESI %RSI ; SSE2-NEXT: # kill: %EDI %EDI %RDI -; SSE2-NEXT: movswq %di, %r10 -; SSE2-NEXT: movswq %si, %rsi -; SSE2-NEXT: movswq %dx, %r11 -; SSE2-NEXT: movswq %cx, %rcx +; SSE2-NEXT: andl $7, %edi +; SSE2-NEXT: andl $7, %esi +; SSE2-NEXT: andl $7, %edx +; SSE2-NEXT: andl $7, %ecx ; SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE2-NEXT: movswq %r8w, %rdi +; SSE2-NEXT: andl $7, %r8d ; SSE2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) -; SSE2-NEXT: movswq %r9w, %rax -; SSE2-NEXT: movzwl -24(%rsp,%rsi,2), %esi -; SSE2-NEXT: xorl %edx, %edx -; SSE2-NEXT: movd %edx, %xmm0 +; SSE2-NEXT: andl $7, %r9d +; SSE2-NEXT: movzwl -24(%rsp,%rsi,2), %eax +; SSE2-NEXT: xorl %esi, %esi +; SSE2-NEXT: movd %esi, %xmm0 ; SSE2-NEXT: movzwl -24(%rsp,%rcx,2), %ecx ; SSE2-NEXT: movd %ecx, %xmm1 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE2-NEXT: movd %esi, %xmm2 -; SSE2-NEXT: movzwl -24(%rsp,%rax,2), %eax +; SSE2-NEXT: movd %eax, %xmm2 +; SSE2-NEXT: movzwl -24(%rsp,%r9,2), %eax ; SSE2-NEXT: movd %eax, %xmm3 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = 
xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; SSE2-NEXT: movzwl -40(%rsp,%r10,2), %eax -; SSE2-NEXT: movzwl -40(%rsp,%r11,2), %ecx +; SSE2-NEXT: movzwl -40(%rsp,%rdi,2), %eax +; SSE2-NEXT: movzwl -40(%rsp,%rdx,2), %ecx ; SSE2-NEXT: movd %ecx, %xmm1 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSE2-NEXT: movd %eax, %xmm0 -; SSE2-NEXT: movzwl -40(%rsp,%rdi,2), %eax +; SSE2-NEXT: movzwl -40(%rsp,%r8,2), %eax ; SSE2-NEXT: movd %eax, %xmm3 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] @@ -1190,31 +1362,31 @@ define <8 x i16> @var_shuffle_v8i16_v8i16_xyxyxy00_i16(<8 x i16> %x, <8 x i16> % ; SSSE3-NEXT: # kill: %EDX %EDX %RDX ; SSSE3-NEXT: # kill: %ESI %ESI %RSI ; SSSE3-NEXT: # kill: %EDI %EDI %RDI -; SSSE3-NEXT: movswq %di, %r10 -; SSSE3-NEXT: movswq %si, %rsi -; SSSE3-NEXT: movswq %dx, %r11 -; SSSE3-NEXT: movswq %cx, %rcx +; SSSE3-NEXT: andl $7, %edi +; SSSE3-NEXT: andl $7, %esi +; SSSE3-NEXT: andl $7, %edx +; SSSE3-NEXT: andl $7, %ecx ; SSSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSSE3-NEXT: movswq %r8w, %rdi +; SSSE3-NEXT: andl $7, %r8d ; SSSE3-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) -; SSSE3-NEXT: movswq %r9w, %rax -; SSSE3-NEXT: movzwl -24(%rsp,%rsi,2), %esi -; SSSE3-NEXT: xorl %edx, %edx -; SSSE3-NEXT: movd %edx, %xmm0 +; SSSE3-NEXT: andl $7, %r9d +; SSSE3-NEXT: movzwl -24(%rsp,%rsi,2), %eax +; SSSE3-NEXT: xorl %esi, %esi +; SSSE3-NEXT: movd %esi, %xmm0 ; SSSE3-NEXT: movzwl -24(%rsp,%rcx,2), %ecx ; SSSE3-NEXT: movd %ecx, %xmm1 ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSSE3-NEXT: movd %esi, %xmm2 -; SSSE3-NEXT: movzwl -24(%rsp,%rax,2), %eax +; SSSE3-NEXT: movd %eax, %xmm2 +; SSSE3-NEXT: movzwl -24(%rsp,%r9,2), %eax ; SSSE3-NEXT: movd %eax, %xmm3 ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; SSSE3-NEXT: movzwl -40(%rsp,%r10,2), %eax -; SSSE3-NEXT: movzwl -40(%rsp,%r11,2), %ecx +; SSSE3-NEXT: movzwl -40(%rsp,%rdi,2), %eax +; SSSE3-NEXT: movzwl -40(%rsp,%rdx,2), %ecx ; SSSE3-NEXT: movd %ecx, %xmm1 ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSSE3-NEXT: movd %eax, %xmm0 -; SSSE3-NEXT: movzwl -40(%rsp,%rdi,2), %eax +; SSSE3-NEXT: movzwl -40(%rsp,%r8,2), %eax ; SSSE3-NEXT: movd %eax, %xmm3 ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] @@ -1229,21 +1401,21 @@ define <8 x i16> @var_shuffle_v8i16_v8i16_xyxyxy00_i16(<8 x i16> %x, <8 x i16> % ; SSE41-NEXT: # kill: %EDX %EDX %RDX ; SSE41-NEXT: # kill: %ESI %ESI %RSI ; SSE41-NEXT: # kill: %EDI %EDI %RDI -; SSE41-NEXT: movswq %di, %rax -; SSE41-NEXT: movswq %si, %rsi -; SSE41-NEXT: movswq %dx, %rdx -; SSE41-NEXT: movswq %cx, %r10 +; SSE41-NEXT: andl $7, %edi +; SSE41-NEXT: andl $7, %esi +; SSE41-NEXT: andl $7, %edx +; SSE41-NEXT: andl $7, %ecx ; SSE41-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) -; SSE41-NEXT: movswq %r8w, %rdi +; SSE41-NEXT: andl $7, %r8d ; SSE41-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) -; SSE41-NEXT: movswq %r9w, %rcx -; SSE41-NEXT: movzwl -40(%rsp,%rax,2), %eax 
+; SSE41-NEXT: andl $7, %r9d +; SSE41-NEXT: movzwl -40(%rsp,%rdi,2), %eax ; SSE41-NEXT: movd %eax, %xmm1 ; SSE41-NEXT: pinsrw $1, -24(%rsp,%rsi,2), %xmm1 ; SSE41-NEXT: pinsrw $2, -40(%rsp,%rdx,2), %xmm1 -; SSE41-NEXT: pinsrw $3, -24(%rsp,%r10,2), %xmm1 -; SSE41-NEXT: pinsrw $4, -40(%rsp,%rdi,2), %xmm1 -; SSE41-NEXT: pinsrw $5, -24(%rsp,%rcx,2), %xmm1 +; SSE41-NEXT: pinsrw $3, -24(%rsp,%rcx,2), %xmm1 +; SSE41-NEXT: pinsrw $4, -40(%rsp,%r8,2), %xmm1 +; SSE41-NEXT: pinsrw $5, -24(%rsp,%r9,2), %xmm1 ; SSE41-NEXT: pxor %xmm0, %xmm0 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7] ; SSE41-NEXT: retq @@ -1256,21 +1428,21 @@ define <8 x i16> @var_shuffle_v8i16_v8i16_xyxyxy00_i16(<8 x i16> %x, <8 x i16> % ; AVX1-NEXT: # kill: %EDX %EDX %RDX ; AVX1-NEXT: # kill: %ESI %ESI %RSI ; AVX1-NEXT: # kill: %EDI %EDI %RDI -; AVX1-NEXT: movswq %di, %r10 -; AVX1-NEXT: movswq %si, %r11 -; AVX1-NEXT: movswq %dx, %rdx -; AVX1-NEXT: movswq %cx, %rcx +; AVX1-NEXT: andl $7, %edi +; AVX1-NEXT: andl $7, %esi +; AVX1-NEXT: andl $7, %edx +; AVX1-NEXT: andl $7, %ecx ; AVX1-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: movswq %r8w, %rdi +; AVX1-NEXT: andl $7, %r8d ; AVX1-NEXT: vmovdqa %xmm1, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: movswq %r9w, %rax -; AVX1-NEXT: movzwl -40(%rsp,%r10,2), %esi -; AVX1-NEXT: vmovd %esi, %xmm0 -; AVX1-NEXT: vpinsrw $1, -24(%rsp,%r11,2), %xmm0, %xmm0 +; AVX1-NEXT: andl $7, %r9d +; AVX1-NEXT: movzwl -40(%rsp,%rdi,2), %eax +; AVX1-NEXT: vmovd %eax, %xmm0 +; AVX1-NEXT: vpinsrw $1, -24(%rsp,%rsi,2), %xmm0, %xmm0 ; AVX1-NEXT: vpinsrw $2, -40(%rsp,%rdx,2), %xmm0, %xmm0 ; AVX1-NEXT: vpinsrw $3, -24(%rsp,%rcx,2), %xmm0, %xmm0 -; AVX1-NEXT: vpinsrw $4, -40(%rsp,%rdi,2), %xmm0, %xmm0 -; AVX1-NEXT: vpinsrw $5, -24(%rsp,%rax,2), %xmm0, %xmm0 +; AVX1-NEXT: vpinsrw $4, -40(%rsp,%r8,2), %xmm0, %xmm0 +; AVX1-NEXT: vpinsrw $5, -24(%rsp,%r9,2), %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7] ; AVX1-NEXT: retq @@ -1283,21 +1455,21 @@ define <8 x i16> @var_shuffle_v8i16_v8i16_xyxyxy00_i16(<8 x i16> %x, <8 x i16> % ; AVX2-NEXT: # kill: %EDX %EDX %RDX ; AVX2-NEXT: # kill: %ESI %ESI %RSI ; AVX2-NEXT: # kill: %EDI %EDI %RDI -; AVX2-NEXT: movswq %di, %r10 -; AVX2-NEXT: movswq %si, %r11 -; AVX2-NEXT: movswq %dx, %rdx -; AVX2-NEXT: movswq %cx, %rcx +; AVX2-NEXT: andl $7, %edi +; AVX2-NEXT: andl $7, %esi +; AVX2-NEXT: andl $7, %edx +; AVX2-NEXT: andl $7, %ecx ; AVX2-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: movswq %r8w, %rdi +; AVX2-NEXT: andl $7, %r8d ; AVX2-NEXT: vmovdqa %xmm1, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: movswq %r9w, %rax -; AVX2-NEXT: movzwl -40(%rsp,%r10,2), %esi -; AVX2-NEXT: vmovd %esi, %xmm0 -; AVX2-NEXT: vpinsrw $1, -24(%rsp,%r11,2), %xmm0, %xmm0 +; AVX2-NEXT: andl $7, %r9d +; AVX2-NEXT: movzwl -40(%rsp,%rdi,2), %eax +; AVX2-NEXT: vmovd %eax, %xmm0 +; AVX2-NEXT: vpinsrw $1, -24(%rsp,%rsi,2), %xmm0, %xmm0 ; AVX2-NEXT: vpinsrw $2, -40(%rsp,%rdx,2), %xmm0, %xmm0 ; AVX2-NEXT: vpinsrw $3, -24(%rsp,%rcx,2), %xmm0, %xmm0 -; AVX2-NEXT: vpinsrw $4, -40(%rsp,%rdi,2), %xmm0, %xmm0 -; AVX2-NEXT: vpinsrw $5, -24(%rsp,%rax,2), %xmm0, %xmm0 +; AVX2-NEXT: vpinsrw $4, -40(%rsp,%r8,2), %xmm0, %xmm0 +; AVX2-NEXT: vpinsrw $5, -24(%rsp,%r9,2), %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3] ; AVX2-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-shuffle-variable-256.ll b/llvm/test/CodeGen/X86/vector-shuffle-variable-256.ll index b43ec058ed91..42b3c11d3d6b 100644 --- 
a/llvm/test/CodeGen/X86/vector-shuffle-variable-256.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-variable-256.ll @@ -13,6 +13,10 @@ define <4 x double> @var_shuffle_v4f64_v4f64_xxxx_i64(<4 x double> %x, i64 %i0, ; ALL-NEXT: movq %rsp, %rbp ; ALL-NEXT: andq $-32, %rsp ; ALL-NEXT: subq $64, %rsp +; ALL-NEXT: andl $3, %ecx +; ALL-NEXT: andl $3, %edx +; ALL-NEXT: andl $3, %esi +; ALL-NEXT: andl $3, %edi ; ALL-NEXT: vmovaps %ymm0, (%rsp) ; ALL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; ALL-NEXT: vmovhpd {{.*#+}} xmm0 = xmm0[0],mem[0] @@ -40,6 +44,8 @@ define <4 x double> @var_shuffle_v4f64_v4f64_uxx0_i64(<4 x double> %x, i64 %i0, ; ALL-NEXT: movq %rsp, %rbp ; ALL-NEXT: andq $-32, %rsp ; ALL-NEXT: subq $64, %rsp +; ALL-NEXT: andl $3, %edx +; ALL-NEXT: andl $3, %esi ; ALL-NEXT: vmovaps %ymm0, (%rsp) ; ALL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; ALL-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] @@ -62,6 +68,10 @@ define <4 x double> @var_shuffle_v4f64_v4f64_uxx0_i64(<4 x double> %x, i64 %i0, define <4 x double> @var_shuffle_v4f64_v2f64_xxxx_i64(<2 x double> %x, i64 %i0, i64 %i1, i64 %i2, i64 %i3) nounwind { ; ALL-LABEL: var_shuffle_v4f64_v2f64_xxxx_i64: ; ALL: # BB#0: +; ALL-NEXT: andl $1, %ecx +; ALL-NEXT: andl $1, %edx +; ALL-NEXT: andl $1, %esi +; ALL-NEXT: andl $1, %edi ; ALL-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) ; ALL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; ALL-NEXT: vmovhpd {{.*#+}} xmm0 = xmm0[0],mem[0] @@ -87,6 +97,10 @@ define <4 x i64> @var_shuffle_v4i64_v4i64_xxxx_i64(<4 x i64> %x, i64 %i0, i64 %i ; AVX1-NEXT: movq %rsp, %rbp ; AVX1-NEXT: andq $-32, %rsp ; AVX1-NEXT: subq $64, %rsp +; AVX1-NEXT: andl $3, %ecx +; AVX1-NEXT: andl $3, %edx +; AVX1-NEXT: andl $3, %esi +; AVX1-NEXT: andl $3, %edi ; AVX1-NEXT: vmovaps %ymm0, (%rsp) ; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; AVX1-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero @@ -105,6 +119,10 @@ define <4 x i64> @var_shuffle_v4i64_v4i64_xxxx_i64(<4 x i64> %x, i64 %i0, i64 %i ; AVX2-NEXT: movq %rsp, %rbp ; AVX2-NEXT: andq $-32, %rsp ; AVX2-NEXT: subq $64, %rsp +; AVX2-NEXT: andl $3, %ecx +; AVX2-NEXT: andl $3, %edx +; AVX2-NEXT: andl $3, %esi +; AVX2-NEXT: andl $3, %edi ; AVX2-NEXT: vmovaps %ymm0, (%rsp) ; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero @@ -134,6 +152,8 @@ define <4 x i64> @var_shuffle_v4i64_v4i64_xx00_i64(<4 x i64> %x, i64 %i0, i64 %i ; AVX1-NEXT: movq %rsp, %rbp ; AVX1-NEXT: andq $-32, %rsp ; AVX1-NEXT: subq $64, %rsp +; AVX1-NEXT: andl $3, %esi +; AVX1-NEXT: andl $3, %edi ; AVX1-NEXT: vmovaps %ymm0, (%rsp) ; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; AVX1-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero @@ -150,6 +170,8 @@ define <4 x i64> @var_shuffle_v4i64_v4i64_xx00_i64(<4 x i64> %x, i64 %i0, i64 %i ; AVX2-NEXT: movq %rsp, %rbp ; AVX2-NEXT: andq $-32, %rsp ; AVX2-NEXT: subq $64, %rsp +; AVX2-NEXT: andl $3, %esi +; AVX2-NEXT: andl $3, %edi ; AVX2-NEXT: vmovaps %ymm0, (%rsp) ; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero @@ -173,6 +195,10 @@ define <4 x i64> @var_shuffle_v4i64_v4i64_xx00_i64(<4 x i64> %x, i64 %i0, i64 %i define <4 x i64> @var_shuffle_v4i64_v2i64_xxxx_i64(<2 x i64> %x, i64 %i0, i64 %i1, i64 %i2, i64 %i3) nounwind { ; AVX1-LABEL: var_shuffle_v4i64_v2i64_xxxx_i64: ; AVX1: # BB#0: +; AVX1-NEXT: andl $1, %ecx +; AVX1-NEXT: andl $1, %edx +; AVX1-NEXT: andl $1, %esi +; AVX1-NEXT: andl $1, %edi ; AVX1-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) ; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; AVX1-NEXT: vmovq {{.*#+}} xmm1 = 
mem[0],zero @@ -185,6 +211,10 @@ define <4 x i64> @var_shuffle_v4i64_v2i64_xxxx_i64(<2 x i64> %x, i64 %i0, i64 %i ; ; AVX2-LABEL: var_shuffle_v4i64_v2i64_xxxx_i64: ; AVX2: # BB#0: +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: andl $1, %edx +; AVX2-NEXT: andl $1, %esi +; AVX2-NEXT: andl $1, %edi ; AVX2-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) ; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero @@ -212,15 +242,23 @@ define <8 x float> @var_shuffle_v8f32_v8f32_xxxxxxxx_i32(<8 x float> %x, i32 %i0 ; AVX1-NEXT: movq %rsp, %rbp ; AVX1-NEXT: andq $-32, %rsp ; AVX1-NEXT: subq $64, %rsp -; AVX1-NEXT: movslq %edi, %rax -; AVX1-NEXT: movslq %esi, %rsi -; AVX1-NEXT: movslq %edx, %rdx -; AVX1-NEXT: movslq %ecx, %r11 -; AVX1-NEXT: movslq %r8d, %r10 +; AVX1-NEXT: # kill: %R9D %R9D %R9 +; AVX1-NEXT: # kill: %R8D %R8D %R8 +; AVX1-NEXT: # kill: %ECX %ECX %RCX +; AVX1-NEXT: # kill: %EDX %EDX %RDX +; AVX1-NEXT: # kill: %ESI %ESI %RSI +; AVX1-NEXT: # kill: %EDI %EDI %RDI +; AVX1-NEXT: andl $7, %edi +; AVX1-NEXT: andl $7, %esi +; AVX1-NEXT: andl $7, %edx +; AVX1-NEXT: andl $7, %ecx +; AVX1-NEXT: andl $7, %r8d ; AVX1-NEXT: vmovaps %ymm0, (%rsp) -; AVX1-NEXT: movslq %r9d, %r8 -; AVX1-NEXT: movslq 16(%rbp), %rdi -; AVX1-NEXT: movslq 24(%rbp), %rcx +; AVX1-NEXT: andl $7, %r9d +; AVX1-NEXT: movl 16(%rbp), %r10d +; AVX1-NEXT: andl $7, %r10d +; AVX1-NEXT: movl 24(%rbp), %eax +; AVX1-NEXT: andl $7, %eax ; AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; AVX1-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; AVX1-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero @@ -284,15 +322,23 @@ define <8 x float> @var_shuffle_v8f32_v8f32_xxxxxxxx_i32(<8 x float> %x, i32 %i0 define <8 x float> @var_shuffle_v8f32_v4f32_xxxxxxxx_i32(<4 x float> %x, i32 %i0, i32 %i1, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6, i32 %i7) nounwind { ; ALL-LABEL: var_shuffle_v8f32_v4f32_xxxxxxxx_i32: ; ALL: # BB#0: -; ALL-NEXT: movslq %edi, %rax -; ALL-NEXT: movslq %esi, %rsi -; ALL-NEXT: movslq %edx, %rdx -; ALL-NEXT: movslq %ecx, %r11 -; ALL-NEXT: movslq %r8d, %r10 +; ALL-NEXT: # kill: %R9D %R9D %R9 +; ALL-NEXT: # kill: %R8D %R8D %R8 +; ALL-NEXT: # kill: %ECX %ECX %RCX +; ALL-NEXT: # kill: %EDX %EDX %RDX +; ALL-NEXT: # kill: %ESI %ESI %RSI +; ALL-NEXT: # kill: %EDI %EDI %RDI +; ALL-NEXT: andl $3, %edi +; ALL-NEXT: andl $3, %esi +; ALL-NEXT: andl $3, %edx +; ALL-NEXT: andl $3, %ecx +; ALL-NEXT: andl $3, %r8d ; ALL-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; ALL-NEXT: movslq %r9d, %r8 -; ALL-NEXT: movslq {{[0-9]+}}(%rsp), %rdi -; ALL-NEXT: movslq {{[0-9]+}}(%rsp), %rcx +; ALL-NEXT: andl $3, %r9d +; ALL-NEXT: movl {{[0-9]+}}(%rsp), %r10d +; ALL-NEXT: andl $3, %r10d +; ALL-NEXT: movl {{[0-9]+}}(%rsp), %eax +; ALL-NEXT: andl $3, %eax ; ALL-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; ALL-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; ALL-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero @@ -331,48 +377,64 @@ define <16 x i16> @var_shuffle_v16i16_v16i16_xxxxxxxxxxxxxxxx_i16(<16 x i16> %x, ; AVX1-NEXT: movq %rsp, %rbp ; AVX1-NEXT: andq $-32, %rsp ; AVX1-NEXT: subq $64, %rsp +; AVX1-NEXT: # kill: %R9D %R9D %R9 +; AVX1-NEXT: # kill: %R8D %R8D %R8 +; AVX1-NEXT: # kill: %ECX %ECX %RCX +; AVX1-NEXT: # kill: %EDX %EDX %RDX +; AVX1-NEXT: # kill: %ESI %ESI %RSI +; AVX1-NEXT: # kill: %EDI %EDI %RDI ; AVX1-NEXT: vmovaps %ymm0, (%rsp) -; AVX1-NEXT: movslq 32(%rbp), %rax +; AVX1-NEXT: movl 32(%rbp), %eax +; AVX1-NEXT: andl $15, %eax ; AVX1-NEXT: movzwl (%rsp,%rax,2), %eax ; AVX1-NEXT: vmovd %eax, %xmm0 -; 
AVX1-NEXT: movslq 40(%rbp), %rax +; AVX1-NEXT: movl 40(%rbp), %eax +; AVX1-NEXT: andl $15, %eax ; AVX1-NEXT: movzwl (%rsp,%rax,2), %eax ; AVX1-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 -; AVX1-NEXT: movslq 48(%rbp), %rax +; AVX1-NEXT: movl 48(%rbp), %eax +; AVX1-NEXT: andl $15, %eax ; AVX1-NEXT: movzwl (%rsp,%rax,2), %eax ; AVX1-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 -; AVX1-NEXT: movslq 56(%rbp), %rax +; AVX1-NEXT: movl 56(%rbp), %eax +; AVX1-NEXT: andl $15, %eax ; AVX1-NEXT: movzwl (%rsp,%rax,2), %eax ; AVX1-NEXT: vpinsrw $3, %eax, %xmm0, %xmm0 -; AVX1-NEXT: movslq 64(%rbp), %rax +; AVX1-NEXT: movl 64(%rbp), %eax +; AVX1-NEXT: andl $15, %eax ; AVX1-NEXT: movzwl (%rsp,%rax,2), %eax ; AVX1-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 -; AVX1-NEXT: movslq 72(%rbp), %rax +; AVX1-NEXT: movl 72(%rbp), %eax +; AVX1-NEXT: andl $15, %eax ; AVX1-NEXT: movzwl (%rsp,%rax,2), %eax ; AVX1-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0 -; AVX1-NEXT: movslq 80(%rbp), %rax +; AVX1-NEXT: movl 80(%rbp), %eax +; AVX1-NEXT: andl $15, %eax ; AVX1-NEXT: movzwl (%rsp,%rax,2), %eax ; AVX1-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 -; AVX1-NEXT: movslq 88(%rbp), %rax +; AVX1-NEXT: movl 88(%rbp), %eax +; AVX1-NEXT: andl $15, %eax ; AVX1-NEXT: movzwl (%rsp,%rax,2), %eax ; AVX1-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 -; AVX1-NEXT: movslq %edi, %rax -; AVX1-NEXT: movzwl (%rsp,%rax,2), %eax +; AVX1-NEXT: andl $15, %edi +; AVX1-NEXT: movzwl (%rsp,%rdi,2), %eax ; AVX1-NEXT: vmovd %eax, %xmm1 -; AVX1-NEXT: movslq %esi, %rax -; AVX1-NEXT: vpinsrw $1, (%rsp,%rax,2), %xmm1, %xmm1 -; AVX1-NEXT: movslq %edx, %rax -; AVX1-NEXT: vpinsrw $2, (%rsp,%rax,2), %xmm1, %xmm1 -; AVX1-NEXT: movslq %ecx, %rax -; AVX1-NEXT: vpinsrw $3, (%rsp,%rax,2), %xmm1, %xmm1 -; AVX1-NEXT: movslq %r8d, %rax -; AVX1-NEXT: vpinsrw $4, (%rsp,%rax,2), %xmm1, %xmm1 -; AVX1-NEXT: movslq %r9d, %rax -; AVX1-NEXT: vpinsrw $5, (%rsp,%rax,2), %xmm1, %xmm1 -; AVX1-NEXT: movslq 16(%rbp), %rax +; AVX1-NEXT: andl $15, %esi +; AVX1-NEXT: vpinsrw $1, (%rsp,%rsi,2), %xmm1, %xmm1 +; AVX1-NEXT: andl $15, %edx +; AVX1-NEXT: vpinsrw $2, (%rsp,%rdx,2), %xmm1, %xmm1 +; AVX1-NEXT: andl $15, %ecx +; AVX1-NEXT: vpinsrw $3, (%rsp,%rcx,2), %xmm1, %xmm1 +; AVX1-NEXT: andl $15, %r8d +; AVX1-NEXT: vpinsrw $4, (%rsp,%r8,2), %xmm1, %xmm1 +; AVX1-NEXT: andl $15, %r9d +; AVX1-NEXT: vpinsrw $5, (%rsp,%r9,2), %xmm1, %xmm1 +; AVX1-NEXT: movl 16(%rbp), %eax +; AVX1-NEXT: andl $15, %eax ; AVX1-NEXT: movzwl (%rsp,%rax,2), %eax ; AVX1-NEXT: vpinsrw $6, %eax, %xmm1, %xmm1 -; AVX1-NEXT: movslq 24(%rbp), %rax +; AVX1-NEXT: movl 24(%rbp), %eax +; AVX1-NEXT: andl $15, %eax ; AVX1-NEXT: movzwl (%rsp,%rax,2), %eax ; AVX1-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -386,48 +448,64 @@ define <16 x i16> @var_shuffle_v16i16_v16i16_xxxxxxxxxxxxxxxx_i16(<16 x i16> %x, ; AVX2-NEXT: movq %rsp, %rbp ; AVX2-NEXT: andq $-32, %rsp ; AVX2-NEXT: subq $64, %rsp +; AVX2-NEXT: # kill: %R9D %R9D %R9 +; AVX2-NEXT: # kill: %R8D %R8D %R8 +; AVX2-NEXT: # kill: %ECX %ECX %RCX +; AVX2-NEXT: # kill: %EDX %EDX %RDX +; AVX2-NEXT: # kill: %ESI %ESI %RSI +; AVX2-NEXT: # kill: %EDI %EDI %RDI ; AVX2-NEXT: vmovaps %ymm0, (%rsp) -; AVX2-NEXT: movslq 32(%rbp), %rax +; AVX2-NEXT: movl 32(%rbp), %eax +; AVX2-NEXT: andl $15, %eax ; AVX2-NEXT: movzwl (%rsp,%rax,2), %eax ; AVX2-NEXT: vmovd %eax, %xmm0 -; AVX2-NEXT: movslq 40(%rbp), %rax +; AVX2-NEXT: movl 40(%rbp), %eax +; AVX2-NEXT: andl $15, %eax ; AVX2-NEXT: movzwl (%rsp,%rax,2), %eax ; AVX2-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 -; AVX2-NEXT: movslq 48(%rbp), %rax +; 
AVX2-NEXT: movl 48(%rbp), %eax +; AVX2-NEXT: andl $15, %eax ; AVX2-NEXT: movzwl (%rsp,%rax,2), %eax ; AVX2-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 -; AVX2-NEXT: movslq 56(%rbp), %rax +; AVX2-NEXT: movl 56(%rbp), %eax +; AVX2-NEXT: andl $15, %eax ; AVX2-NEXT: movzwl (%rsp,%rax,2), %eax ; AVX2-NEXT: vpinsrw $3, %eax, %xmm0, %xmm0 -; AVX2-NEXT: movslq 64(%rbp), %rax +; AVX2-NEXT: movl 64(%rbp), %eax +; AVX2-NEXT: andl $15, %eax ; AVX2-NEXT: movzwl (%rsp,%rax,2), %eax ; AVX2-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 -; AVX2-NEXT: movslq 72(%rbp), %rax +; AVX2-NEXT: movl 72(%rbp), %eax +; AVX2-NEXT: andl $15, %eax ; AVX2-NEXT: movzwl (%rsp,%rax,2), %eax ; AVX2-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0 -; AVX2-NEXT: movslq 80(%rbp), %rax +; AVX2-NEXT: movl 80(%rbp), %eax +; AVX2-NEXT: andl $15, %eax ; AVX2-NEXT: movzwl (%rsp,%rax,2), %eax ; AVX2-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 -; AVX2-NEXT: movslq 88(%rbp), %rax +; AVX2-NEXT: movl 88(%rbp), %eax +; AVX2-NEXT: andl $15, %eax ; AVX2-NEXT: movzwl (%rsp,%rax,2), %eax ; AVX2-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 -; AVX2-NEXT: movslq %edi, %rax -; AVX2-NEXT: movzwl (%rsp,%rax,2), %eax +; AVX2-NEXT: andl $15, %edi +; AVX2-NEXT: movzwl (%rsp,%rdi,2), %eax ; AVX2-NEXT: vmovd %eax, %xmm1 -; AVX2-NEXT: movslq %esi, %rax -; AVX2-NEXT: vpinsrw $1, (%rsp,%rax,2), %xmm1, %xmm1 -; AVX2-NEXT: movslq %edx, %rax -; AVX2-NEXT: vpinsrw $2, (%rsp,%rax,2), %xmm1, %xmm1 -; AVX2-NEXT: movslq %ecx, %rax -; AVX2-NEXT: vpinsrw $3, (%rsp,%rax,2), %xmm1, %xmm1 -; AVX2-NEXT: movslq %r8d, %rax -; AVX2-NEXT: vpinsrw $4, (%rsp,%rax,2), %xmm1, %xmm1 -; AVX2-NEXT: movslq %r9d, %rax -; AVX2-NEXT: vpinsrw $5, (%rsp,%rax,2), %xmm1, %xmm1 -; AVX2-NEXT: movslq 16(%rbp), %rax +; AVX2-NEXT: andl $15, %esi +; AVX2-NEXT: vpinsrw $1, (%rsp,%rsi,2), %xmm1, %xmm1 +; AVX2-NEXT: andl $15, %edx +; AVX2-NEXT: vpinsrw $2, (%rsp,%rdx,2), %xmm1, %xmm1 +; AVX2-NEXT: andl $15, %ecx +; AVX2-NEXT: vpinsrw $3, (%rsp,%rcx,2), %xmm1, %xmm1 +; AVX2-NEXT: andl $15, %r8d +; AVX2-NEXT: vpinsrw $4, (%rsp,%r8,2), %xmm1, %xmm1 +; AVX2-NEXT: andl $15, %r9d +; AVX2-NEXT: vpinsrw $5, (%rsp,%r9,2), %xmm1, %xmm1 +; AVX2-NEXT: movl 16(%rbp), %eax +; AVX2-NEXT: andl $15, %eax ; AVX2-NEXT: movzwl (%rsp,%rax,2), %eax ; AVX2-NEXT: vpinsrw $6, %eax, %xmm1, %xmm1 -; AVX2-NEXT: movslq 24(%rbp), %rax +; AVX2-NEXT: movl 24(%rbp), %eax +; AVX2-NEXT: andl $15, %eax ; AVX2-NEXT: movzwl (%rsp,%rax,2), %eax ; AVX2-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 @@ -472,48 +550,64 @@ define <16 x i16> @var_shuffle_v16i16_v16i16_xxxxxxxxxxxxxxxx_i16(<16 x i16> %x, define <16 x i16> @var_shuffle_v16i16_v8i16_xxxxxxxxxxxxxxxx_i16(<8 x i16> %x, i32 %i0, i32 %i1, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6, i32 %i7, i32 %i8, i32 %i9, i32 %i10, i32 %i11, i32 %i12, i32 %i13, i32 %i14, i32 %i15) nounwind { ; AVX1-LABEL: var_shuffle_v16i16_v8i16_xxxxxxxxxxxxxxxx_i16: ; AVX1: # BB#0: +; AVX1-NEXT: # kill: %R9D %R9D %R9 +; AVX1-NEXT: # kill: %R8D %R8D %R8 +; AVX1-NEXT: # kill: %ECX %ECX %RCX +; AVX1-NEXT: # kill: %EDX %EDX %RDX +; AVX1-NEXT: # kill: %ESI %ESI %RSI +; AVX1-NEXT: # kill: %EDI %EDI %RDI ; AVX1-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: movslq {{[0-9]+}}(%rsp), %rax +; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax +; AVX1-NEXT: andl $7, %eax ; AVX1-NEXT: movzwl -24(%rsp,%rax,2), %eax ; AVX1-NEXT: vmovd %eax, %xmm0 -; AVX1-NEXT: movslq {{[0-9]+}}(%rsp), %rax +; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax +; AVX1-NEXT: andl $7, %eax ; AVX1-NEXT: movzwl -24(%rsp,%rax,2), %eax ; AVX1-NEXT: vpinsrw $1, %eax, 
%xmm0, %xmm0 -; AVX1-NEXT: movslq {{[0-9]+}}(%rsp), %rax +; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax +; AVX1-NEXT: andl $7, %eax ; AVX1-NEXT: movzwl -24(%rsp,%rax,2), %eax ; AVX1-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 -; AVX1-NEXT: movslq {{[0-9]+}}(%rsp), %rax +; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax +; AVX1-NEXT: andl $7, %eax ; AVX1-NEXT: movzwl -24(%rsp,%rax,2), %eax ; AVX1-NEXT: vpinsrw $3, %eax, %xmm0, %xmm0 -; AVX1-NEXT: movslq {{[0-9]+}}(%rsp), %rax +; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax +; AVX1-NEXT: andl $7, %eax ; AVX1-NEXT: movzwl -24(%rsp,%rax,2), %eax ; AVX1-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 -; AVX1-NEXT: movslq {{[0-9]+}}(%rsp), %rax +; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax +; AVX1-NEXT: andl $7, %eax ; AVX1-NEXT: movzwl -24(%rsp,%rax,2), %eax ; AVX1-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0 -; AVX1-NEXT: movslq {{[0-9]+}}(%rsp), %rax +; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax +; AVX1-NEXT: andl $7, %eax ; AVX1-NEXT: movzwl -24(%rsp,%rax,2), %eax ; AVX1-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 -; AVX1-NEXT: movslq {{[0-9]+}}(%rsp), %rax +; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax +; AVX1-NEXT: andl $7, %eax ; AVX1-NEXT: movzwl -24(%rsp,%rax,2), %eax ; AVX1-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 -; AVX1-NEXT: movslq %edi, %rax -; AVX1-NEXT: movzwl -24(%rsp,%rax,2), %eax +; AVX1-NEXT: andl $7, %edi +; AVX1-NEXT: movzwl -24(%rsp,%rdi,2), %eax ; AVX1-NEXT: vmovd %eax, %xmm1 -; AVX1-NEXT: movslq %esi, %rax -; AVX1-NEXT: vpinsrw $1, -24(%rsp,%rax,2), %xmm1, %xmm1 -; AVX1-NEXT: movslq %edx, %rax -; AVX1-NEXT: vpinsrw $2, -24(%rsp,%rax,2), %xmm1, %xmm1 -; AVX1-NEXT: movslq %ecx, %rax -; AVX1-NEXT: vpinsrw $3, -24(%rsp,%rax,2), %xmm1, %xmm1 -; AVX1-NEXT: movslq %r8d, %rax -; AVX1-NEXT: vpinsrw $4, -24(%rsp,%rax,2), %xmm1, %xmm1 -; AVX1-NEXT: movslq %r9d, %rax -; AVX1-NEXT: vpinsrw $5, -24(%rsp,%rax,2), %xmm1, %xmm1 -; AVX1-NEXT: movslq {{[0-9]+}}(%rsp), %rax +; AVX1-NEXT: andl $7, %esi +; AVX1-NEXT: vpinsrw $1, -24(%rsp,%rsi,2), %xmm1, %xmm1 +; AVX1-NEXT: andl $7, %edx +; AVX1-NEXT: vpinsrw $2, -24(%rsp,%rdx,2), %xmm1, %xmm1 +; AVX1-NEXT: andl $7, %ecx +; AVX1-NEXT: vpinsrw $3, -24(%rsp,%rcx,2), %xmm1, %xmm1 +; AVX1-NEXT: andl $7, %r8d +; AVX1-NEXT: vpinsrw $4, -24(%rsp,%r8,2), %xmm1, %xmm1 +; AVX1-NEXT: andl $7, %r9d +; AVX1-NEXT: vpinsrw $5, -24(%rsp,%r9,2), %xmm1, %xmm1 +; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax +; AVX1-NEXT: andl $7, %eax ; AVX1-NEXT: movzwl -24(%rsp,%rax,2), %eax ; AVX1-NEXT: vpinsrw $6, %eax, %xmm1, %xmm1 -; AVX1-NEXT: movslq {{[0-9]+}}(%rsp), %rax +; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax +; AVX1-NEXT: andl $7, %eax ; AVX1-NEXT: movzwl -24(%rsp,%rax,2), %eax ; AVX1-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -521,48 +615,64 @@ define <16 x i16> @var_shuffle_v16i16_v8i16_xxxxxxxxxxxxxxxx_i16(<8 x i16> %x, i ; ; AVX2-LABEL: var_shuffle_v16i16_v8i16_xxxxxxxxxxxxxxxx_i16: ; AVX2: # BB#0: +; AVX2-NEXT: # kill: %R9D %R9D %R9 +; AVX2-NEXT: # kill: %R8D %R8D %R8 +; AVX2-NEXT: # kill: %ECX %ECX %RCX +; AVX2-NEXT: # kill: %EDX %EDX %RDX +; AVX2-NEXT: # kill: %ESI %ESI %RSI +; AVX2-NEXT: # kill: %EDI %EDI %RDI ; AVX2-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: movslq {{[0-9]+}}(%rsp), %rax +; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax +; AVX2-NEXT: andl $7, %eax ; AVX2-NEXT: movzwl -24(%rsp,%rax,2), %eax ; AVX2-NEXT: vmovd %eax, %xmm0 -; AVX2-NEXT: movslq {{[0-9]+}}(%rsp), %rax +; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax +; AVX2-NEXT: andl $7, %eax ; AVX2-NEXT: movzwl -24(%rsp,%rax,2), %eax ; AVX2-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 
-; AVX2-NEXT: movslq {{[0-9]+}}(%rsp), %rax +; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax +; AVX2-NEXT: andl $7, %eax ; AVX2-NEXT: movzwl -24(%rsp,%rax,2), %eax ; AVX2-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 -; AVX2-NEXT: movslq {{[0-9]+}}(%rsp), %rax +; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax +; AVX2-NEXT: andl $7, %eax ; AVX2-NEXT: movzwl -24(%rsp,%rax,2), %eax ; AVX2-NEXT: vpinsrw $3, %eax, %xmm0, %xmm0 -; AVX2-NEXT: movslq {{[0-9]+}}(%rsp), %rax +; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax +; AVX2-NEXT: andl $7, %eax ; AVX2-NEXT: movzwl -24(%rsp,%rax,2), %eax ; AVX2-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 -; AVX2-NEXT: movslq {{[0-9]+}}(%rsp), %rax +; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax +; AVX2-NEXT: andl $7, %eax ; AVX2-NEXT: movzwl -24(%rsp,%rax,2), %eax ; AVX2-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0 -; AVX2-NEXT: movslq {{[0-9]+}}(%rsp), %rax +; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax +; AVX2-NEXT: andl $7, %eax ; AVX2-NEXT: movzwl -24(%rsp,%rax,2), %eax ; AVX2-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 -; AVX2-NEXT: movslq {{[0-9]+}}(%rsp), %rax +; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax +; AVX2-NEXT: andl $7, %eax ; AVX2-NEXT: movzwl -24(%rsp,%rax,2), %eax ; AVX2-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 -; AVX2-NEXT: movslq %edi, %rax -; AVX2-NEXT: movzwl -24(%rsp,%rax,2), %eax +; AVX2-NEXT: andl $7, %edi +; AVX2-NEXT: movzwl -24(%rsp,%rdi,2), %eax ; AVX2-NEXT: vmovd %eax, %xmm1 -; AVX2-NEXT: movslq %esi, %rax -; AVX2-NEXT: vpinsrw $1, -24(%rsp,%rax,2), %xmm1, %xmm1 -; AVX2-NEXT: movslq %edx, %rax -; AVX2-NEXT: vpinsrw $2, -24(%rsp,%rax,2), %xmm1, %xmm1 -; AVX2-NEXT: movslq %ecx, %rax -; AVX2-NEXT: vpinsrw $3, -24(%rsp,%rax,2), %xmm1, %xmm1 -; AVX2-NEXT: movslq %r8d, %rax -; AVX2-NEXT: vpinsrw $4, -24(%rsp,%rax,2), %xmm1, %xmm1 -; AVX2-NEXT: movslq %r9d, %rax -; AVX2-NEXT: vpinsrw $5, -24(%rsp,%rax,2), %xmm1, %xmm1 -; AVX2-NEXT: movslq {{[0-9]+}}(%rsp), %rax +; AVX2-NEXT: andl $7, %esi +; AVX2-NEXT: vpinsrw $1, -24(%rsp,%rsi,2), %xmm1, %xmm1 +; AVX2-NEXT: andl $7, %edx +; AVX2-NEXT: vpinsrw $2, -24(%rsp,%rdx,2), %xmm1, %xmm1 +; AVX2-NEXT: andl $7, %ecx +; AVX2-NEXT: vpinsrw $3, -24(%rsp,%rcx,2), %xmm1, %xmm1 +; AVX2-NEXT: andl $7, %r8d +; AVX2-NEXT: vpinsrw $4, -24(%rsp,%r8,2), %xmm1, %xmm1 +; AVX2-NEXT: andl $7, %r9d +; AVX2-NEXT: vpinsrw $5, -24(%rsp,%r9,2), %xmm1, %xmm1 +; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax +; AVX2-NEXT: andl $7, %eax ; AVX2-NEXT: movzwl -24(%rsp,%rax,2), %eax ; AVX2-NEXT: vpinsrw $6, %eax, %xmm1, %xmm1 -; AVX2-NEXT: movslq {{[0-9]+}}(%rsp), %rax +; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax +; AVX2-NEXT: andl $7, %eax ; AVX2-NEXT: movzwl -24(%rsp,%rax,2), %eax ; AVX2-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 @@ -615,8 +725,12 @@ define <4 x i64> @mem_shuffle_v4i64_v4i64_xxxx_i64(<4 x i64> %x, i64* %i) nounwi ; AVX1-NEXT: subq $64, %rsp ; AVX1-NEXT: movq (%rdi), %rax ; AVX1-NEXT: movq 8(%rdi), %rcx +; AVX1-NEXT: andl $3, %eax +; AVX1-NEXT: andl $3, %ecx ; AVX1-NEXT: movq 16(%rdi), %rdx +; AVX1-NEXT: andl $3, %edx ; AVX1-NEXT: movq 24(%rdi), %rsi +; AVX1-NEXT: andl $3, %esi ; AVX1-NEXT: vmovaps %ymm0, (%rsp) ; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; AVX1-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero @@ -637,8 +751,12 @@ define <4 x i64> @mem_shuffle_v4i64_v4i64_xxxx_i64(<4 x i64> %x, i64* %i) nounwi ; AVX2-NEXT: subq $64, %rsp ; AVX2-NEXT: movq (%rdi), %rax ; AVX2-NEXT: movq 8(%rdi), %rcx +; AVX2-NEXT: andl $3, %eax +; AVX2-NEXT: andl $3, %ecx ; AVX2-NEXT: movq 16(%rdi), %rdx +; AVX2-NEXT: andl $3, %edx ; AVX2-NEXT: movq 24(%rdi), %rsi +; 
AVX2-NEXT: andl $3, %esi ; AVX2-NEXT: vmovaps %ymm0, (%rsp) ; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero @@ -674,8 +792,12 @@ define <4 x i64> @mem_shuffle_v4i64_v2i64_xxxx_i64(<2 x i64> %x, i64* %i) nounwi ; AVX1: # BB#0: ; AVX1-NEXT: movq (%rdi), %rax ; AVX1-NEXT: movq 8(%rdi), %rcx +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: andl $1, %ecx ; AVX1-NEXT: movq 16(%rdi), %rdx +; AVX1-NEXT: andl $1, %edx ; AVX1-NEXT: movq 24(%rdi), %rsi +; AVX1-NEXT: andl $1, %esi ; AVX1-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) ; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; AVX1-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero @@ -690,8 +812,12 @@ define <4 x i64> @mem_shuffle_v4i64_v2i64_xxxx_i64(<2 x i64> %x, i64* %i) nounwi ; AVX2: # BB#0: ; AVX2-NEXT: movq (%rdi), %rax ; AVX2-NEXT: movq 8(%rdi), %rcx +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: andl $1, %ecx ; AVX2-NEXT: movq 16(%rdi), %rdx +; AVX2-NEXT: andl $1, %edx ; AVX2-NEXT: movq 24(%rdi), %rsi +; AVX2-NEXT: andl $1, %esi ; AVX2-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) ; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero