diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index accda2588c88..0cd15de4d641 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -7296,34 +7296,31 @@ SDValue TargetLowering::expandBITREVERSE(SDNode *N, SelectionDAG &DAG) const {
   // TODO: We can easily support i4/i2 legal types if any target ever does.
   if (Sz >= 8 && isPowerOf2_32(Sz)) {
     // Create the masks - repeating the pattern every byte.
-    APInt MaskHi4 = APInt::getSplat(Sz, APInt(8, 0xF0));
-    APInt MaskHi2 = APInt::getSplat(Sz, APInt(8, 0xCC));
-    APInt MaskHi1 = APInt::getSplat(Sz, APInt(8, 0xAA));
-    APInt MaskLo4 = APInt::getSplat(Sz, APInt(8, 0x0F));
-    APInt MaskLo2 = APInt::getSplat(Sz, APInt(8, 0x33));
-    APInt MaskLo1 = APInt::getSplat(Sz, APInt(8, 0x55));
+    APInt Mask4 = APInt::getSplat(Sz, APInt(8, 0x0F));
+    APInt Mask2 = APInt::getSplat(Sz, APInt(8, 0x33));
+    APInt Mask1 = APInt::getSplat(Sz, APInt(8, 0x55));

     // BSWAP if the type is wider than a single byte.
     Tmp = (Sz > 8 ? DAG.getNode(ISD::BSWAP, dl, VT, Op) : Op);

-    // swap i4: ((V & 0xF0) >> 4) | ((V & 0x0F) << 4)
-    Tmp2 = DAG.getNode(ISD::AND, dl, VT, Tmp, DAG.getConstant(MaskHi4, dl, VT));
-    Tmp3 = DAG.getNode(ISD::AND, dl, VT, Tmp, DAG.getConstant(MaskLo4, dl, VT));
-    Tmp2 = DAG.getNode(ISD::SRL, dl, VT, Tmp2, DAG.getConstant(4, dl, SHVT));
+    // swap i4: ((V >> 4) & 0x0F) | ((V & 0x0F) << 4)
+    Tmp2 = DAG.getNode(ISD::SRL, dl, VT, Tmp, DAG.getConstant(4, dl, SHVT));
+    Tmp2 = DAG.getNode(ISD::AND, dl, VT, Tmp2, DAG.getConstant(Mask4, dl, VT));
+    Tmp3 = DAG.getNode(ISD::AND, dl, VT, Tmp, DAG.getConstant(Mask4, dl, VT));
     Tmp3 = DAG.getNode(ISD::SHL, dl, VT, Tmp3, DAG.getConstant(4, dl, SHVT));
     Tmp = DAG.getNode(ISD::OR, dl, VT, Tmp2, Tmp3);

-    // swap i2: ((V & 0xCC) >> 2) | ((V & 0x33) << 2)
-    Tmp2 = DAG.getNode(ISD::AND, dl, VT, Tmp, DAG.getConstant(MaskHi2, dl, VT));
-    Tmp3 = DAG.getNode(ISD::AND, dl, VT, Tmp, DAG.getConstant(MaskLo2, dl, VT));
-    Tmp2 = DAG.getNode(ISD::SRL, dl, VT, Tmp2, DAG.getConstant(2, dl, SHVT));
+    // swap i2: ((V >> 2) & 0x33) | ((V & 0x33) << 2)
+    Tmp2 = DAG.getNode(ISD::SRL, dl, VT, Tmp, DAG.getConstant(2, dl, SHVT));
+    Tmp2 = DAG.getNode(ISD::AND, dl, VT, Tmp2, DAG.getConstant(Mask2, dl, VT));
+    Tmp3 = DAG.getNode(ISD::AND, dl, VT, Tmp, DAG.getConstant(Mask2, dl, VT));
     Tmp3 = DAG.getNode(ISD::SHL, dl, VT, Tmp3, DAG.getConstant(2, dl, SHVT));
     Tmp = DAG.getNode(ISD::OR, dl, VT, Tmp2, Tmp3);

-    // swap i1: ((V & 0xAA) >> 1) | ((V & 0x55) << 1)
-    Tmp2 = DAG.getNode(ISD::AND, dl, VT, Tmp, DAG.getConstant(MaskHi1, dl, VT));
-    Tmp3 = DAG.getNode(ISD::AND, dl, VT, Tmp, DAG.getConstant(MaskLo1, dl, VT));
-    Tmp2 = DAG.getNode(ISD::SRL, dl, VT, Tmp2, DAG.getConstant(1, dl, SHVT));
+    // swap i1: ((V >> 1) & 0x55) | ((V & 0x55) << 1)
+    Tmp2 = DAG.getNode(ISD::SRL, dl, VT, Tmp, DAG.getConstant(1, dl, SHVT));
+    Tmp2 = DAG.getNode(ISD::AND, dl, VT, Tmp2, DAG.getConstant(Mask1, dl, VT));
+    Tmp3 = DAG.getNode(ISD::AND, dl, VT, Tmp, DAG.getConstant(Mask1, dl, VT));
     Tmp3 = DAG.getNode(ISD::SHL, dl, VT, Tmp3, DAG.getConstant(1, dl, SHVT));
     Tmp = DAG.getNode(ISD::OR, dl, VT, Tmp2, Tmp3);
     return Tmp;
diff --git a/llvm/test/CodeGen/RISCV/rv32zbp.ll b/llvm/test/CodeGen/RISCV/rv32zbp.ll
index cfad9fb9110a..1717526a608c 100644
--- a/llvm/test/CodeGen/RISCV/rv32zbp.ll
+++ b/llvm/test/CodeGen/RISCV/rv32zbp.ll
@@ -2453,13 +2453,13 @@ define zeroext i8 @bitreverse_i8(i8 zeroext %a) nounwind {
 ; RV32I-NEXT:    or a0, a1, a0
 ; RV32I-NEXT:    andi a1,
a0, 51 ; RV32I-NEXT: slli a1, a1, 2 -; RV32I-NEXT: andi a0, a0, 204 ; RV32I-NEXT: srli a0, a0, 2 +; RV32I-NEXT: andi a0, a0, 51 ; RV32I-NEXT: or a0, a0, a1 ; RV32I-NEXT: andi a1, a0, 85 ; RV32I-NEXT: slli a1, a1, 1 -; RV32I-NEXT: andi a0, a0, 170 ; RV32I-NEXT: srli a0, a0, 1 +; RV32I-NEXT: andi a0, a0, 85 ; RV32I-NEXT: or a0, a0, a1 ; RV32I-NEXT: ret ; @@ -2484,33 +2484,27 @@ define zeroext i16 @bitreverse_i16(i16 zeroext %a) nounwind { ; RV32I-NEXT: srli a1, a0, 8 ; RV32I-NEXT: slli a0, a0, 8 ; RV32I-NEXT: or a0, a0, a1 -; RV32I-NEXT: lui a1, 1 -; RV32I-NEXT: addi a1, a1, -241 -; RV32I-NEXT: and a1, a0, a1 -; RV32I-NEXT: slli a1, a1, 4 -; RV32I-NEXT: lui a2, 15 -; RV32I-NEXT: addi a2, a2, 240 +; RV32I-NEXT: srli a1, a0, 4 +; RV32I-NEXT: lui a2, 1 +; RV32I-NEXT: addi a2, a2, -241 +; RV32I-NEXT: and a1, a1, a2 ; RV32I-NEXT: and a0, a0, a2 -; RV32I-NEXT: srli a0, a0, 4 -; RV32I-NEXT: or a0, a0, a1 -; RV32I-NEXT: lui a1, 3 -; RV32I-NEXT: addi a1, a1, 819 -; RV32I-NEXT: and a1, a0, a1 -; RV32I-NEXT: slli a1, a1, 2 -; RV32I-NEXT: lui a2, 13 -; RV32I-NEXT: addi a2, a2, -820 +; RV32I-NEXT: slli a0, a0, 4 +; RV32I-NEXT: or a0, a1, a0 +; RV32I-NEXT: srli a1, a0, 2 +; RV32I-NEXT: lui a2, 3 +; RV32I-NEXT: addi a2, a2, 819 +; RV32I-NEXT: and a1, a1, a2 ; RV32I-NEXT: and a0, a0, a2 -; RV32I-NEXT: srli a0, a0, 2 -; RV32I-NEXT: or a0, a0, a1 -; RV32I-NEXT: lui a1, 5 -; RV32I-NEXT: addi a1, a1, 1365 -; RV32I-NEXT: and a1, a0, a1 -; RV32I-NEXT: slli a1, a1, 1 -; RV32I-NEXT: lui a2, 11 -; RV32I-NEXT: addi a2, a2, -1366 +; RV32I-NEXT: slli a0, a0, 2 +; RV32I-NEXT: or a0, a1, a0 +; RV32I-NEXT: srli a1, a0, 1 +; RV32I-NEXT: lui a2, 5 +; RV32I-NEXT: addi a2, a2, 1365 +; RV32I-NEXT: and a1, a1, a2 ; RV32I-NEXT: and a0, a0, a2 -; RV32I-NEXT: srli a0, a0, 1 -; RV32I-NEXT: or a0, a0, a1 +; RV32I-NEXT: slli a0, a0, 1 +; RV32I-NEXT: or a0, a1, a0 ; RV32I-NEXT: ret ; ; RV32B-LABEL: bitreverse_i16: @@ -2543,33 +2537,27 @@ define i32 @bitreverse_i32(i32 %a) nounwind { ; RV32I-NEXT: slli a0, a0, 24 ; RV32I-NEXT: or a0, a0, a2 ; RV32I-NEXT: or a0, a0, a1 -; RV32I-NEXT: lui a1, 61681 -; RV32I-NEXT: addi a1, a1, -241 -; RV32I-NEXT: and a1, a0, a1 -; RV32I-NEXT: slli a1, a1, 4 -; RV32I-NEXT: lui a2, 986895 -; RV32I-NEXT: addi a2, a2, 240 +; RV32I-NEXT: srli a1, a0, 4 +; RV32I-NEXT: lui a2, 61681 +; RV32I-NEXT: addi a2, a2, -241 +; RV32I-NEXT: and a1, a1, a2 ; RV32I-NEXT: and a0, a0, a2 -; RV32I-NEXT: srli a0, a0, 4 -; RV32I-NEXT: or a0, a0, a1 -; RV32I-NEXT: lui a1, 209715 -; RV32I-NEXT: addi a1, a1, 819 -; RV32I-NEXT: and a1, a0, a1 -; RV32I-NEXT: slli a1, a1, 2 -; RV32I-NEXT: lui a2, 838861 -; RV32I-NEXT: addi a2, a2, -820 +; RV32I-NEXT: slli a0, a0, 4 +; RV32I-NEXT: or a0, a1, a0 +; RV32I-NEXT: srli a1, a0, 2 +; RV32I-NEXT: lui a2, 209715 +; RV32I-NEXT: addi a2, a2, 819 +; RV32I-NEXT: and a1, a1, a2 ; RV32I-NEXT: and a0, a0, a2 -; RV32I-NEXT: srli a0, a0, 2 -; RV32I-NEXT: or a0, a0, a1 -; RV32I-NEXT: lui a1, 349525 -; RV32I-NEXT: addi a1, a1, 1365 -; RV32I-NEXT: and a1, a0, a1 -; RV32I-NEXT: slli a1, a1, 1 -; RV32I-NEXT: lui a2, 699051 -; RV32I-NEXT: addi a2, a2, -1366 +; RV32I-NEXT: slli a0, a0, 2 +; RV32I-NEXT: or a0, a1, a0 +; RV32I-NEXT: srli a1, a0, 1 +; RV32I-NEXT: lui a2, 349525 +; RV32I-NEXT: addi a2, a2, 1365 +; RV32I-NEXT: and a1, a1, a2 ; RV32I-NEXT: and a0, a0, a2 -; RV32I-NEXT: srli a0, a0, 1 -; RV32I-NEXT: or a0, a0, a1 +; RV32I-NEXT: slli a0, a0, 1 +; RV32I-NEXT: or a0, a1, a0 ; RV32I-NEXT: ret ; ; RV32B-LABEL: bitreverse_i32: @@ -2602,58 +2590,52 @@ define i64 @bitreverse_i64(i64 %a) nounwind { ; 
RV32I-NEXT: slli a1, a1, 24 ; RV32I-NEXT: or a1, a1, a4 ; RV32I-NEXT: or a1, a1, a2 -; RV32I-NEXT: lui a2, 61681 -; RV32I-NEXT: addi t0, a2, -241 -; RV32I-NEXT: and a2, a1, t0 -; RV32I-NEXT: slli a2, a2, 4 -; RV32I-NEXT: lui a3, 986895 -; RV32I-NEXT: addi t1, a3, 240 -; RV32I-NEXT: and a1, a1, t1 -; RV32I-NEXT: srli a1, a1, 4 -; RV32I-NEXT: or a1, a1, a2 -; RV32I-NEXT: lui a2, 209715 -; RV32I-NEXT: addi t2, a2, 819 -; RV32I-NEXT: and a2, a1, t2 -; RV32I-NEXT: slli a2, a2, 2 -; RV32I-NEXT: lui a4, 838861 -; RV32I-NEXT: addi t3, a4, -820 -; RV32I-NEXT: and a1, a1, t3 -; RV32I-NEXT: srli a1, a1, 2 -; RV32I-NEXT: or a1, a1, a2 -; RV32I-NEXT: lui a2, 349525 -; RV32I-NEXT: addi a3, a2, 1365 -; RV32I-NEXT: and a2, a1, a3 -; RV32I-NEXT: slli a2, a2, 1 -; RV32I-NEXT: lui a5, 699051 -; RV32I-NEXT: addi a5, a5, -1366 +; RV32I-NEXT: srli a2, a1, 4 +; RV32I-NEXT: lui a4, 61681 +; RV32I-NEXT: addi a4, a4, -241 +; RV32I-NEXT: and a2, a2, a4 +; RV32I-NEXT: and a1, a1, a4 +; RV32I-NEXT: slli a1, a1, 4 +; RV32I-NEXT: or a1, a2, a1 +; RV32I-NEXT: srli a2, a1, 2 +; RV32I-NEXT: lui a3, 209715 +; RV32I-NEXT: addi a3, a3, 819 +; RV32I-NEXT: and a2, a2, a3 +; RV32I-NEXT: and a1, a1, a3 +; RV32I-NEXT: slli a1, a1, 2 +; RV32I-NEXT: or a1, a2, a1 +; RV32I-NEXT: srli a2, a1, 1 +; RV32I-NEXT: lui a5, 349525 +; RV32I-NEXT: addi a5, a5, 1365 +; RV32I-NEXT: and a2, a2, a5 ; RV32I-NEXT: and a1, a1, a5 -; RV32I-NEXT: srli a1, a1, 1 -; RV32I-NEXT: or a2, a1, a2 +; RV32I-NEXT: slli a1, a1, 1 +; RV32I-NEXT: or t0, a2, a1 ; RV32I-NEXT: srli a1, a0, 8 ; RV32I-NEXT: and a1, a1, a6 -; RV32I-NEXT: srli a4, a0, 24 -; RV32I-NEXT: or a1, a1, a4 -; RV32I-NEXT: slli a4, a0, 8 -; RV32I-NEXT: and a4, a4, a7 +; RV32I-NEXT: srli a2, a0, 24 +; RV32I-NEXT: or a1, a1, a2 +; RV32I-NEXT: slli a2, a0, 8 +; RV32I-NEXT: and a2, a2, a7 ; RV32I-NEXT: slli a0, a0, 24 -; RV32I-NEXT: or a0, a0, a4 +; RV32I-NEXT: or a0, a0, a2 ; RV32I-NEXT: or a0, a0, a1 -; RV32I-NEXT: and a1, a0, t0 -; RV32I-NEXT: slli a1, a1, 4 -; RV32I-NEXT: and a0, a0, t1 -; RV32I-NEXT: srli a0, a0, 4 -; RV32I-NEXT: or a0, a0, a1 -; RV32I-NEXT: and a1, a0, t2 -; RV32I-NEXT: slli a1, a1, 2 -; RV32I-NEXT: and a0, a0, t3 -; RV32I-NEXT: srli a0, a0, 2 -; RV32I-NEXT: or a0, a0, a1 -; RV32I-NEXT: and a1, a0, a3 -; RV32I-NEXT: slli a1, a1, 1 +; RV32I-NEXT: srli a1, a0, 4 +; RV32I-NEXT: and a1, a1, a4 +; RV32I-NEXT: and a0, a0, a4 +; RV32I-NEXT: slli a0, a0, 4 +; RV32I-NEXT: or a0, a1, a0 +; RV32I-NEXT: srli a1, a0, 2 +; RV32I-NEXT: and a1, a1, a3 +; RV32I-NEXT: and a0, a0, a3 +; RV32I-NEXT: slli a0, a0, 2 +; RV32I-NEXT: or a0, a1, a0 +; RV32I-NEXT: srli a1, a0, 1 +; RV32I-NEXT: and a1, a1, a5 ; RV32I-NEXT: and a0, a0, a5 -; RV32I-NEXT: srli a0, a0, 1 -; RV32I-NEXT: or a1, a0, a1 -; RV32I-NEXT: mv a0, a2 +; RV32I-NEXT: slli a0, a0, 1 +; RV32I-NEXT: or a1, a1, a0 +; RV32I-NEXT: mv a0, t0 ; RV32I-NEXT: ret ; ; RV32B-LABEL: bitreverse_i64: @@ -2756,33 +2738,27 @@ define i32 @bitreverse_bswap_i32(i32 %a) { ; RV32I-NEXT: slli a0, a0, 24 ; RV32I-NEXT: or a0, a0, a3 ; RV32I-NEXT: or a0, a0, a1 -; RV32I-NEXT: lui a1, 61681 -; RV32I-NEXT: addi a1, a1, -241 -; RV32I-NEXT: and a1, a0, a1 -; RV32I-NEXT: slli a1, a1, 4 -; RV32I-NEXT: lui a3, 986895 -; RV32I-NEXT: addi a3, a3, 240 +; RV32I-NEXT: srli a1, a0, 4 +; RV32I-NEXT: lui a3, 61681 +; RV32I-NEXT: addi a3, a3, -241 +; RV32I-NEXT: and a1, a1, a3 ; RV32I-NEXT: and a0, a0, a3 -; RV32I-NEXT: srli a0, a0, 4 -; RV32I-NEXT: or a0, a0, a1 -; RV32I-NEXT: lui a1, 209715 -; RV32I-NEXT: addi a1, a1, 819 -; RV32I-NEXT: and a1, a0, a1 -; RV32I-NEXT: slli a1, a1, 
2 -; RV32I-NEXT: lui a3, 838861 -; RV32I-NEXT: addi a3, a3, -820 +; RV32I-NEXT: slli a0, a0, 4 +; RV32I-NEXT: or a0, a1, a0 +; RV32I-NEXT: srli a1, a0, 2 +; RV32I-NEXT: lui a3, 209715 +; RV32I-NEXT: addi a3, a3, 819 +; RV32I-NEXT: and a1, a1, a3 ; RV32I-NEXT: and a0, a0, a3 -; RV32I-NEXT: srli a0, a0, 2 -; RV32I-NEXT: or a0, a0, a1 -; RV32I-NEXT: lui a1, 349525 -; RV32I-NEXT: addi a1, a1, 1365 -; RV32I-NEXT: and a1, a0, a1 -; RV32I-NEXT: slli a1, a1, 1 -; RV32I-NEXT: lui a3, 699051 -; RV32I-NEXT: addi a3, a3, -1366 +; RV32I-NEXT: slli a0, a0, 2 +; RV32I-NEXT: or a0, a1, a0 +; RV32I-NEXT: srli a1, a0, 1 +; RV32I-NEXT: lui a3, 349525 +; RV32I-NEXT: addi a3, a3, 1365 +; RV32I-NEXT: and a1, a1, a3 ; RV32I-NEXT: and a0, a0, a3 -; RV32I-NEXT: srli a0, a0, 1 -; RV32I-NEXT: or a0, a0, a1 +; RV32I-NEXT: slli a0, a0, 1 +; RV32I-NEXT: or a0, a1, a0 ; RV32I-NEXT: srli a1, a0, 8 ; RV32I-NEXT: and a1, a1, a2 ; RV32I-NEXT: srli a2, a0, 24 @@ -2813,82 +2789,76 @@ define i64 @bitreverse_bswap_i64(i64 %a) { ; RV32I: # %bb.0: ; RV32I-NEXT: srli a3, a1, 8 ; RV32I-NEXT: lui a2, 16 -; RV32I-NEXT: addi t0, a2, -256 -; RV32I-NEXT: and a3, a3, t0 +; RV32I-NEXT: addi a6, a2, -256 +; RV32I-NEXT: and a3, a3, a6 ; RV32I-NEXT: srli a4, a1, 24 -; RV32I-NEXT: or a4, a3, a4 -; RV32I-NEXT: slli a5, a1, 8 -; RV32I-NEXT: lui t1, 4080 -; RV32I-NEXT: and a5, a5, t1 +; RV32I-NEXT: or a3, a3, a4 +; RV32I-NEXT: slli a4, a1, 8 +; RV32I-NEXT: lui a7, 4080 +; RV32I-NEXT: and a4, a4, a7 ; RV32I-NEXT: slli a1, a1, 24 -; RV32I-NEXT: or a1, a1, a5 ; RV32I-NEXT: or a1, a1, a4 -; RV32I-NEXT: lui a4, 61681 -; RV32I-NEXT: addi a6, a4, -241 -; RV32I-NEXT: and a5, a1, a6 -; RV32I-NEXT: slli a5, a5, 4 -; RV32I-NEXT: lui a4, 986895 -; RV32I-NEXT: addi a7, a4, 240 -; RV32I-NEXT: and a1, a1, a7 -; RV32I-NEXT: srli a1, a1, 4 -; RV32I-NEXT: or a1, a1, a5 -; RV32I-NEXT: lui a5, 209715 -; RV32I-NEXT: addi t2, a5, 819 -; RV32I-NEXT: and a4, a1, t2 -; RV32I-NEXT: slli a4, a4, 2 -; RV32I-NEXT: lui a2, 838861 -; RV32I-NEXT: addi t3, a2, -820 -; RV32I-NEXT: and a1, a1, t3 -; RV32I-NEXT: srli a1, a1, 2 -; RV32I-NEXT: or a1, a1, a4 -; RV32I-NEXT: lui a4, 349525 -; RV32I-NEXT: addi a4, a4, 1365 -; RV32I-NEXT: and a3, a1, a4 -; RV32I-NEXT: slli a3, a3, 1 -; RV32I-NEXT: lui a5, 699051 -; RV32I-NEXT: addi a5, a5, -1366 -; RV32I-NEXT: and a1, a1, a5 -; RV32I-NEXT: srli a1, a1, 1 ; RV32I-NEXT: or a1, a1, a3 -; RV32I-NEXT: srli a3, a0, 8 +; RV32I-NEXT: srli a3, a1, 4 +; RV32I-NEXT: lui a4, 61681 +; RV32I-NEXT: addi t0, a4, -241 ; RV32I-NEXT: and a3, a3, t0 -; RV32I-NEXT: srli a2, a0, 24 -; RV32I-NEXT: or a2, a3, a2 -; RV32I-NEXT: slli a3, a0, 8 -; RV32I-NEXT: and a3, a3, t1 +; RV32I-NEXT: and a1, a1, t0 +; RV32I-NEXT: slli a1, a1, 4 +; RV32I-NEXT: or a1, a3, a1 +; RV32I-NEXT: srli a3, a1, 2 +; RV32I-NEXT: lui a2, 209715 +; RV32I-NEXT: addi a2, a2, 819 +; RV32I-NEXT: and a3, a3, a2 +; RV32I-NEXT: and a1, a1, a2 +; RV32I-NEXT: slli a1, a1, 2 +; RV32I-NEXT: or a1, a3, a1 +; RV32I-NEXT: srli a3, a1, 1 +; RV32I-NEXT: lui a5, 349525 +; RV32I-NEXT: addi a5, a5, 1365 +; RV32I-NEXT: and a3, a3, a5 +; RV32I-NEXT: and a1, a1, a5 +; RV32I-NEXT: slli a1, a1, 1 +; RV32I-NEXT: or a1, a3, a1 +; RV32I-NEXT: srli a3, a0, 8 +; RV32I-NEXT: and a3, a3, a6 +; RV32I-NEXT: srli a4, a0, 24 +; RV32I-NEXT: or a3, a3, a4 +; RV32I-NEXT: slli a4, a0, 8 +; RV32I-NEXT: and a4, a4, a7 ; RV32I-NEXT: slli a0, a0, 24 +; RV32I-NEXT: or a0, a0, a4 ; RV32I-NEXT: or a0, a0, a3 -; RV32I-NEXT: or a0, a0, a2 -; RV32I-NEXT: and a2, a0, a6 -; RV32I-NEXT: slli a2, a2, 4 -; RV32I-NEXT: and a0, a0, a7 -; 
RV32I-NEXT: srli a0, a0, 4 -; RV32I-NEXT: or a0, a0, a2 -; RV32I-NEXT: and a2, a0, t2 -; RV32I-NEXT: slli a2, a2, 2 -; RV32I-NEXT: and a0, a0, t3 -; RV32I-NEXT: srli a0, a0, 2 -; RV32I-NEXT: or a0, a0, a2 -; RV32I-NEXT: and a2, a0, a4 -; RV32I-NEXT: slli a2, a2, 1 +; RV32I-NEXT: srli a3, a0, 4 +; RV32I-NEXT: and a3, a3, t0 +; RV32I-NEXT: and a0, a0, t0 +; RV32I-NEXT: slli a0, a0, 4 +; RV32I-NEXT: or a0, a3, a0 +; RV32I-NEXT: srli a3, a0, 2 +; RV32I-NEXT: and a3, a3, a2 +; RV32I-NEXT: and a0, a0, a2 +; RV32I-NEXT: slli a0, a0, 2 +; RV32I-NEXT: or a0, a3, a0 +; RV32I-NEXT: srli a2, a0, 1 +; RV32I-NEXT: and a2, a2, a5 ; RV32I-NEXT: and a0, a0, a5 -; RV32I-NEXT: srli a0, a0, 1 -; RV32I-NEXT: or a0, a0, a2 +; RV32I-NEXT: slli a0, a0, 1 +; RV32I-NEXT: or a0, a2, a0 ; RV32I-NEXT: srli a2, a0, 8 -; RV32I-NEXT: and a2, a2, t0 +; RV32I-NEXT: and a2, a2, a6 ; RV32I-NEXT: srli a3, a0, 24 ; RV32I-NEXT: or a2, a2, a3 ; RV32I-NEXT: slli a3, a0, 8 -; RV32I-NEXT: and a3, a3, t1 +; RV32I-NEXT: and a3, a3, a7 ; RV32I-NEXT: slli a0, a0, 24 ; RV32I-NEXT: or a0, a0, a3 ; RV32I-NEXT: or a0, a0, a2 ; RV32I-NEXT: srli a2, a1, 8 -; RV32I-NEXT: and a2, a2, t0 +; RV32I-NEXT: and a2, a2, a6 ; RV32I-NEXT: srli a3, a1, 24 ; RV32I-NEXT: or a2, a2, a3 ; RV32I-NEXT: slli a3, a1, 8 -; RV32I-NEXT: and a3, a3, t1 +; RV32I-NEXT: and a3, a3, a7 ; RV32I-NEXT: slli a1, a1, 24 ; RV32I-NEXT: or a1, a1, a3 ; RV32I-NEXT: or a1, a1, a2 diff --git a/llvm/test/CodeGen/RISCV/rv64zbp.ll b/llvm/test/CodeGen/RISCV/rv64zbp.ll index c83698b70b76..4c35a53e61db 100644 --- a/llvm/test/CodeGen/RISCV/rv64zbp.ll +++ b/llvm/test/CodeGen/RISCV/rv64zbp.ll @@ -2816,13 +2816,13 @@ define zeroext i8 @bitreverse_i8(i8 zeroext %a) nounwind { ; RV64I-NEXT: or a0, a1, a0 ; RV64I-NEXT: andi a1, a0, 51 ; RV64I-NEXT: slli a1, a1, 2 -; RV64I-NEXT: andi a0, a0, 204 ; RV64I-NEXT: srli a0, a0, 2 +; RV64I-NEXT: andi a0, a0, 51 ; RV64I-NEXT: or a0, a0, a1 ; RV64I-NEXT: andi a1, a0, 85 ; RV64I-NEXT: slli a1, a1, 1 -; RV64I-NEXT: andi a0, a0, 170 ; RV64I-NEXT: srli a0, a0, 1 +; RV64I-NEXT: andi a0, a0, 85 ; RV64I-NEXT: or a0, a0, a1 ; RV64I-NEXT: ret ; @@ -2847,33 +2847,27 @@ define zeroext i16 @bitreverse_i16(i16 zeroext %a) nounwind { ; RV64I-NEXT: srli a1, a0, 8 ; RV64I-NEXT: slli a0, a0, 8 ; RV64I-NEXT: or a0, a0, a1 -; RV64I-NEXT: lui a1, 1 -; RV64I-NEXT: addiw a1, a1, -241 -; RV64I-NEXT: and a1, a0, a1 -; RV64I-NEXT: slli a1, a1, 4 -; RV64I-NEXT: lui a2, 15 -; RV64I-NEXT: addiw a2, a2, 240 +; RV64I-NEXT: srli a1, a0, 4 +; RV64I-NEXT: lui a2, 1 +; RV64I-NEXT: addiw a2, a2, -241 +; RV64I-NEXT: and a1, a1, a2 ; RV64I-NEXT: and a0, a0, a2 -; RV64I-NEXT: srli a0, a0, 4 -; RV64I-NEXT: or a0, a0, a1 -; RV64I-NEXT: lui a1, 3 -; RV64I-NEXT: addiw a1, a1, 819 -; RV64I-NEXT: and a1, a0, a1 -; RV64I-NEXT: slli a1, a1, 2 -; RV64I-NEXT: lui a2, 13 -; RV64I-NEXT: addiw a2, a2, -820 +; RV64I-NEXT: slli a0, a0, 4 +; RV64I-NEXT: or a0, a1, a0 +; RV64I-NEXT: srli a1, a0, 2 +; RV64I-NEXT: lui a2, 3 +; RV64I-NEXT: addiw a2, a2, 819 +; RV64I-NEXT: and a1, a1, a2 ; RV64I-NEXT: and a0, a0, a2 -; RV64I-NEXT: srli a0, a0, 2 -; RV64I-NEXT: or a0, a0, a1 -; RV64I-NEXT: lui a1, 5 -; RV64I-NEXT: addiw a1, a1, 1365 -; RV64I-NEXT: and a1, a0, a1 -; RV64I-NEXT: slli a1, a1, 1 -; RV64I-NEXT: lui a2, 11 -; RV64I-NEXT: addiw a2, a2, -1366 +; RV64I-NEXT: slli a0, a0, 2 +; RV64I-NEXT: or a0, a1, a0 +; RV64I-NEXT: srli a1, a0, 1 +; RV64I-NEXT: lui a2, 5 +; RV64I-NEXT: addiw a2, a2, 1365 +; RV64I-NEXT: and a1, a1, a2 ; RV64I-NEXT: and a0, a0, a2 -; RV64I-NEXT: srli a0, a0, 1 -; RV64I-NEXT: or a0, a0, 
a1 +; RV64I-NEXT: slli a0, a0, 1 +; RV64I-NEXT: or a0, a1, a0 ; RV64I-NEXT: ret ; ; RV64B-LABEL: bitreverse_i16: @@ -2906,35 +2900,27 @@ define signext i32 @bitreverse_i32(i32 signext %a) nounwind { ; RV64I-NEXT: slli a0, a0, 24 ; RV64I-NEXT: or a0, a0, a2 ; RV64I-NEXT: or a0, a0, a1 -; RV64I-NEXT: lui a1, 61681 -; RV64I-NEXT: addiw a1, a1, -241 -; RV64I-NEXT: and a1, a0, a1 -; RV64I-NEXT: slli a1, a1, 4 -; RV64I-NEXT: lui a2, 241 +; RV64I-NEXT: srli a1, a0, 4 +; RV64I-NEXT: lui a2, 61681 ; RV64I-NEXT: addiw a2, a2, -241 -; RV64I-NEXT: slli a2, a2, 12 -; RV64I-NEXT: addi a2, a2, 240 +; RV64I-NEXT: and a1, a1, a2 ; RV64I-NEXT: and a0, a0, a2 -; RV64I-NEXT: srli a0, a0, 4 -; RV64I-NEXT: or a0, a0, a1 -; RV64I-NEXT: lui a1, 209715 -; RV64I-NEXT: addiw a1, a1, 819 -; RV64I-NEXT: and a1, a0, a1 -; RV64I-NEXT: slli a1, a1, 2 -; RV64I-NEXT: lui a2, 838861 -; RV64I-NEXT: addiw a2, a2, -820 +; RV64I-NEXT: slli a0, a0, 4 +; RV64I-NEXT: or a0, a1, a0 +; RV64I-NEXT: srli a1, a0, 2 +; RV64I-NEXT: lui a2, 209715 +; RV64I-NEXT: addiw a2, a2, 819 +; RV64I-NEXT: and a1, a1, a2 ; RV64I-NEXT: and a0, a0, a2 -; RV64I-NEXT: srli a0, a0, 2 -; RV64I-NEXT: or a0, a0, a1 -; RV64I-NEXT: lui a1, 349525 -; RV64I-NEXT: addiw a1, a1, 1365 -; RV64I-NEXT: and a1, a0, a1 -; RV64I-NEXT: slli a1, a1, 1 -; RV64I-NEXT: lui a2, 699051 -; RV64I-NEXT: addiw a2, a2, -1366 +; RV64I-NEXT: slli a0, a0, 2 +; RV64I-NEXT: or a0, a1, a0 +; RV64I-NEXT: srli a1, a0, 1 +; RV64I-NEXT: lui a2, 349525 +; RV64I-NEXT: addiw a2, a2, 1365 +; RV64I-NEXT: and a1, a1, a2 ; RV64I-NEXT: and a0, a0, a2 -; RV64I-NEXT: srli a0, a0, 1 -; RV64I-NEXT: or a0, a0, a1 +; RV64I-NEXT: slli a0, a0, 1 +; RV64I-NEXT: or a0, a1, a0 ; RV64I-NEXT: sext.w a0, a0 ; RV64I-NEXT: ret ; @@ -2967,35 +2953,27 @@ define void @bitreverse_i32_nosext(i32 signext %a, i32* %x) nounwind { ; RV64I-NEXT: slli a0, a0, 24 ; RV64I-NEXT: or a0, a0, a3 ; RV64I-NEXT: or a0, a0, a2 -; RV64I-NEXT: lui a2, 61681 -; RV64I-NEXT: addiw a2, a2, -241 -; RV64I-NEXT: and a2, a0, a2 -; RV64I-NEXT: slli a2, a2, 4 -; RV64I-NEXT: lui a3, 241 +; RV64I-NEXT: srli a2, a0, 4 +; RV64I-NEXT: lui a3, 61681 ; RV64I-NEXT: addiw a3, a3, -241 -; RV64I-NEXT: slli a3, a3, 12 -; RV64I-NEXT: addi a3, a3, 240 +; RV64I-NEXT: and a2, a2, a3 ; RV64I-NEXT: and a0, a0, a3 -; RV64I-NEXT: srli a0, a0, 4 -; RV64I-NEXT: or a0, a0, a2 -; RV64I-NEXT: lui a2, 209715 -; RV64I-NEXT: addiw a2, a2, 819 -; RV64I-NEXT: and a2, a0, a2 -; RV64I-NEXT: slli a2, a2, 2 -; RV64I-NEXT: lui a3, 838861 -; RV64I-NEXT: addiw a3, a3, -820 +; RV64I-NEXT: slli a0, a0, 4 +; RV64I-NEXT: or a0, a2, a0 +; RV64I-NEXT: srli a2, a0, 2 +; RV64I-NEXT: lui a3, 209715 +; RV64I-NEXT: addiw a3, a3, 819 +; RV64I-NEXT: and a2, a2, a3 ; RV64I-NEXT: and a0, a0, a3 -; RV64I-NEXT: srli a0, a0, 2 -; RV64I-NEXT: or a0, a0, a2 -; RV64I-NEXT: lui a2, 349525 -; RV64I-NEXT: addiw a2, a2, 1365 -; RV64I-NEXT: and a2, a0, a2 -; RV64I-NEXT: slli a2, a2, 1 -; RV64I-NEXT: lui a3, 699051 -; RV64I-NEXT: addiw a3, a3, -1366 +; RV64I-NEXT: slli a0, a0, 2 +; RV64I-NEXT: or a0, a2, a0 +; RV64I-NEXT: srli a2, a0, 1 +; RV64I-NEXT: lui a3, 349525 +; RV64I-NEXT: addiw a3, a3, 1365 +; RV64I-NEXT: and a2, a2, a3 ; RV64I-NEXT: and a0, a0, a3 -; RV64I-NEXT: srli a0, a0, 1 -; RV64I-NEXT: or a0, a0, a2 +; RV64I-NEXT: slli a0, a0, 1 +; RV64I-NEXT: or a0, a2, a0 ; RV64I-NEXT: sw a0, 0(a1) ; RV64I-NEXT: ret ; @@ -3049,69 +3027,45 @@ define i64 @bitreverse_i64(i64 %a) nounwind { ; RV64I-NEXT: or a0, a0, a3 ; RV64I-NEXT: or a0, a0, a2 ; RV64I-NEXT: or a0, a0, a1 -; RV64I-NEXT: lui a1, 3855 -; 
RV64I-NEXT: addiw a1, a1, 241 -; RV64I-NEXT: slli a1, a1, 12 -; RV64I-NEXT: addi a1, a1, -241 -; RV64I-NEXT: slli a1, a1, 12 -; RV64I-NEXT: addi a1, a1, 241 -; RV64I-NEXT: slli a1, a1, 12 -; RV64I-NEXT: addi a1, a1, -241 -; RV64I-NEXT: and a1, a0, a1 -; RV64I-NEXT: slli a1, a1, 4 -; RV64I-NEXT: lui a2, 1044721 -; RV64I-NEXT: addiw a2, a2, -241 +; RV64I-NEXT: srli a1, a0, 4 +; RV64I-NEXT: lui a2, 3855 +; RV64I-NEXT: addiw a2, a2, 241 +; RV64I-NEXT: slli a2, a2, 12 +; RV64I-NEXT: addi a2, a2, -241 ; RV64I-NEXT: slli a2, a2, 12 ; RV64I-NEXT: addi a2, a2, 241 ; RV64I-NEXT: slli a2, a2, 12 ; RV64I-NEXT: addi a2, a2, -241 -; RV64I-NEXT: slli a2, a2, 12 -; RV64I-NEXT: addi a2, a2, 240 +; RV64I-NEXT: and a1, a1, a2 ; RV64I-NEXT: and a0, a0, a2 -; RV64I-NEXT: srli a0, a0, 4 -; RV64I-NEXT: or a0, a0, a1 -; RV64I-NEXT: lui a1, 13107 -; RV64I-NEXT: addiw a1, a1, 819 -; RV64I-NEXT: slli a1, a1, 12 -; RV64I-NEXT: addi a1, a1, 819 -; RV64I-NEXT: slli a1, a1, 12 -; RV64I-NEXT: addi a1, a1, 819 -; RV64I-NEXT: slli a1, a1, 12 -; RV64I-NEXT: addi a1, a1, 819 -; RV64I-NEXT: and a1, a0, a1 -; RV64I-NEXT: slli a1, a1, 2 -; RV64I-NEXT: lui a2, 1035469 -; RV64I-NEXT: addiw a2, a2, -819 +; RV64I-NEXT: slli a0, a0, 4 +; RV64I-NEXT: or a0, a1, a0 +; RV64I-NEXT: srli a1, a0, 2 +; RV64I-NEXT: lui a2, 13107 +; RV64I-NEXT: addiw a2, a2, 819 ; RV64I-NEXT: slli a2, a2, 12 -; RV64I-NEXT: addi a2, a2, -819 +; RV64I-NEXT: addi a2, a2, 819 ; RV64I-NEXT: slli a2, a2, 12 -; RV64I-NEXT: addi a2, a2, -819 +; RV64I-NEXT: addi a2, a2, 819 ; RV64I-NEXT: slli a2, a2, 12 -; RV64I-NEXT: addi a2, a2, -820 +; RV64I-NEXT: addi a2, a2, 819 +; RV64I-NEXT: and a1, a1, a2 ; RV64I-NEXT: and a0, a0, a2 -; RV64I-NEXT: srli a0, a0, 2 -; RV64I-NEXT: or a0, a0, a1 -; RV64I-NEXT: lui a1, 21845 -; RV64I-NEXT: addiw a1, a1, 1365 -; RV64I-NEXT: slli a1, a1, 12 -; RV64I-NEXT: addi a1, a1, 1365 -; RV64I-NEXT: slli a1, a1, 12 -; RV64I-NEXT: addi a1, a1, 1365 -; RV64I-NEXT: slli a1, a1, 12 -; RV64I-NEXT: addi a1, a1, 1365 -; RV64I-NEXT: and a1, a0, a1 -; RV64I-NEXT: slli a1, a1, 1 -; RV64I-NEXT: lui a2, 1026731 -; RV64I-NEXT: addiw a2, a2, -1365 +; RV64I-NEXT: slli a0, a0, 2 +; RV64I-NEXT: or a0, a1, a0 +; RV64I-NEXT: srli a1, a0, 1 +; RV64I-NEXT: lui a2, 21845 +; RV64I-NEXT: addiw a2, a2, 1365 ; RV64I-NEXT: slli a2, a2, 12 -; RV64I-NEXT: addi a2, a2, -1365 +; RV64I-NEXT: addi a2, a2, 1365 ; RV64I-NEXT: slli a2, a2, 12 -; RV64I-NEXT: addi a2, a2, -1365 +; RV64I-NEXT: addi a2, a2, 1365 ; RV64I-NEXT: slli a2, a2, 12 -; RV64I-NEXT: addi a2, a2, -1366 +; RV64I-NEXT: addi a2, a2, 1365 +; RV64I-NEXT: and a1, a1, a2 ; RV64I-NEXT: and a0, a0, a2 -; RV64I-NEXT: srli a0, a0, 1 -; RV64I-NEXT: or a0, a0, a1 +; RV64I-NEXT: slli a0, a0, 1 +; RV64I-NEXT: or a0, a1, a0 ; RV64I-NEXT: ret ; ; RV64B-LABEL: bitreverse_i64: @@ -3210,35 +3164,27 @@ define i32 @bitreverse_bswap_i32(i32 %a) { ; RV64I-NEXT: slli a0, a0, 24 ; RV64I-NEXT: or a0, a0, a3 ; RV64I-NEXT: or a0, a0, a1 -; RV64I-NEXT: lui a1, 61681 -; RV64I-NEXT: addiw a1, a1, -241 -; RV64I-NEXT: and a1, a0, a1 -; RV64I-NEXT: slli a1, a1, 4 -; RV64I-NEXT: lui a3, 241 +; RV64I-NEXT: srli a1, a0, 4 +; RV64I-NEXT: lui a3, 61681 ; RV64I-NEXT: addiw a3, a3, -241 -; RV64I-NEXT: slli a3, a3, 12 -; RV64I-NEXT: addi a3, a3, 240 +; RV64I-NEXT: and a1, a1, a3 ; RV64I-NEXT: and a0, a0, a3 -; RV64I-NEXT: srli a0, a0, 4 -; RV64I-NEXT: or a0, a0, a1 -; RV64I-NEXT: lui a1, 209715 -; RV64I-NEXT: addiw a1, a1, 819 -; RV64I-NEXT: and a1, a0, a1 -; RV64I-NEXT: slli a1, a1, 2 -; RV64I-NEXT: lui a3, 838861 -; RV64I-NEXT: addiw a3, a3, -820 +; 
RV64I-NEXT: slli a0, a0, 4 +; RV64I-NEXT: or a0, a1, a0 +; RV64I-NEXT: srli a1, a0, 2 +; RV64I-NEXT: lui a3, 209715 +; RV64I-NEXT: addiw a3, a3, 819 +; RV64I-NEXT: and a1, a1, a3 ; RV64I-NEXT: and a0, a0, a3 -; RV64I-NEXT: srli a0, a0, 2 -; RV64I-NEXT: or a0, a0, a1 -; RV64I-NEXT: lui a1, 349525 -; RV64I-NEXT: addiw a1, a1, 1365 -; RV64I-NEXT: and a1, a0, a1 -; RV64I-NEXT: slli a1, a1, 1 -; RV64I-NEXT: lui a3, 699051 -; RV64I-NEXT: addiw a3, a3, -1366 +; RV64I-NEXT: slli a0, a0, 2 +; RV64I-NEXT: or a0, a1, a0 +; RV64I-NEXT: srli a1, a0, 1 +; RV64I-NEXT: lui a3, 349525 +; RV64I-NEXT: addiw a3, a3, 1365 +; RV64I-NEXT: and a1, a1, a3 ; RV64I-NEXT: and a0, a0, a3 -; RV64I-NEXT: srli a0, a0, 1 -; RV64I-NEXT: or a0, a0, a1 +; RV64I-NEXT: slli a0, a0, 1 +; RV64I-NEXT: or a0, a1, a0 ; RV64I-NEXT: srli a1, a0, 8 ; RV64I-NEXT: and a1, a1, a2 ; RV64I-NEXT: srli a2, a0, 24 @@ -3267,14 +3213,14 @@ define i32 @bitreverse_bswap_i32(i32 %a) { define i64 @bitreverse_bswap_i64(i64 %a) { ; RV64I-LABEL: bitreverse_bswap_i64: ; RV64I: # %bb.0: -; RV64I-NEXT: srli a1, a0, 24 +; RV64I-NEXT: srli a2, a0, 24 ; RV64I-NEXT: lui a6, 4080 -; RV64I-NEXT: and a1, a1, a6 -; RV64I-NEXT: srli a3, a0, 8 +; RV64I-NEXT: and a3, a2, a6 +; RV64I-NEXT: srli a4, a0, 8 ; RV64I-NEXT: addi a5, zero, 255 ; RV64I-NEXT: slli a7, a5, 24 -; RV64I-NEXT: and a3, a3, a7 -; RV64I-NEXT: or a3, a3, a1 +; RV64I-NEXT: and a4, a4, a7 +; RV64I-NEXT: or a3, a4, a3 ; RV64I-NEXT: srli a4, a0, 40 ; RV64I-NEXT: lui a1, 16 ; RV64I-NEXT: addiw a1, a1, -256 @@ -3282,9 +3228,9 @@ define i64 @bitreverse_bswap_i64(i64 %a) { ; RV64I-NEXT: srli a2, a0, 56 ; RV64I-NEXT: or a2, a4, a2 ; RV64I-NEXT: or a2, a3, a2 -; RV64I-NEXT: slli a4, a0, 8 +; RV64I-NEXT: slli a3, a0, 8 ; RV64I-NEXT: slli t0, a5, 32 -; RV64I-NEXT: and a3, a4, t0 +; RV64I-NEXT: and a3, a3, t0 ; RV64I-NEXT: slli a4, a0, 24 ; RV64I-NEXT: slli t1, a5, 40 ; RV64I-NEXT: and a4, a4, t1 @@ -3296,69 +3242,45 @@ define i64 @bitreverse_bswap_i64(i64 %a) { ; RV64I-NEXT: or a0, a0, a4 ; RV64I-NEXT: or a0, a0, a3 ; RV64I-NEXT: or a0, a0, a2 -; RV64I-NEXT: lui a2, 3855 -; RV64I-NEXT: addiw a2, a2, 241 -; RV64I-NEXT: slli a2, a2, 12 -; RV64I-NEXT: addi a2, a2, -241 -; RV64I-NEXT: slli a2, a2, 12 -; RV64I-NEXT: addi a2, a2, 241 -; RV64I-NEXT: slli a2, a2, 12 -; RV64I-NEXT: addi a2, a2, -241 -; RV64I-NEXT: and a2, a0, a2 -; RV64I-NEXT: slli a2, a2, 4 -; RV64I-NEXT: lui a3, 1044721 -; RV64I-NEXT: addiw a3, a3, -241 +; RV64I-NEXT: srli a2, a0, 4 +; RV64I-NEXT: lui a3, 3855 +; RV64I-NEXT: addiw a3, a3, 241 +; RV64I-NEXT: slli a3, a3, 12 +; RV64I-NEXT: addi a3, a3, -241 ; RV64I-NEXT: slli a3, a3, 12 ; RV64I-NEXT: addi a3, a3, 241 ; RV64I-NEXT: slli a3, a3, 12 ; RV64I-NEXT: addi a3, a3, -241 -; RV64I-NEXT: slli a3, a3, 12 -; RV64I-NEXT: addi a3, a3, 240 +; RV64I-NEXT: and a2, a2, a3 ; RV64I-NEXT: and a0, a0, a3 -; RV64I-NEXT: srli a0, a0, 4 -; RV64I-NEXT: or a0, a0, a2 -; RV64I-NEXT: lui a2, 13107 -; RV64I-NEXT: addiw a2, a2, 819 -; RV64I-NEXT: slli a2, a2, 12 -; RV64I-NEXT: addi a2, a2, 819 -; RV64I-NEXT: slli a2, a2, 12 -; RV64I-NEXT: addi a2, a2, 819 -; RV64I-NEXT: slli a2, a2, 12 -; RV64I-NEXT: addi a2, a2, 819 -; RV64I-NEXT: and a2, a0, a2 -; RV64I-NEXT: slli a2, a2, 2 -; RV64I-NEXT: lui a3, 1035469 -; RV64I-NEXT: addiw a3, a3, -819 +; RV64I-NEXT: slli a0, a0, 4 +; RV64I-NEXT: or a0, a2, a0 +; RV64I-NEXT: srli a2, a0, 2 +; RV64I-NEXT: lui a3, 13107 +; RV64I-NEXT: addiw a3, a3, 819 ; RV64I-NEXT: slli a3, a3, 12 -; RV64I-NEXT: addi a3, a3, -819 +; RV64I-NEXT: addi a3, a3, 819 ; RV64I-NEXT: slli a3, a3, 12 -; 
RV64I-NEXT: addi a3, a3, -819 +; RV64I-NEXT: addi a3, a3, 819 ; RV64I-NEXT: slli a3, a3, 12 -; RV64I-NEXT: addi a3, a3, -820 +; RV64I-NEXT: addi a3, a3, 819 +; RV64I-NEXT: and a2, a2, a3 ; RV64I-NEXT: and a0, a0, a3 -; RV64I-NEXT: srli a0, a0, 2 -; RV64I-NEXT: or a0, a0, a2 -; RV64I-NEXT: lui a2, 21845 -; RV64I-NEXT: addiw a2, a2, 1365 -; RV64I-NEXT: slli a2, a2, 12 -; RV64I-NEXT: addi a2, a2, 1365 -; RV64I-NEXT: slli a2, a2, 12 -; RV64I-NEXT: addi a2, a2, 1365 -; RV64I-NEXT: slli a2, a2, 12 -; RV64I-NEXT: addi a2, a2, 1365 -; RV64I-NEXT: and a2, a0, a2 -; RV64I-NEXT: slli a2, a2, 1 -; RV64I-NEXT: lui a3, 1026731 -; RV64I-NEXT: addiw a3, a3, -1365 +; RV64I-NEXT: slli a0, a0, 2 +; RV64I-NEXT: or a0, a2, a0 +; RV64I-NEXT: srli a2, a0, 1 +; RV64I-NEXT: lui a3, 21845 +; RV64I-NEXT: addiw a3, a3, 1365 ; RV64I-NEXT: slli a3, a3, 12 -; RV64I-NEXT: addi a3, a3, -1365 +; RV64I-NEXT: addi a3, a3, 1365 ; RV64I-NEXT: slli a3, a3, 12 -; RV64I-NEXT: addi a3, a3, -1365 +; RV64I-NEXT: addi a3, a3, 1365 ; RV64I-NEXT: slli a3, a3, 12 -; RV64I-NEXT: addi a3, a3, -1366 +; RV64I-NEXT: addi a3, a3, 1365 +; RV64I-NEXT: and a2, a2, a3 ; RV64I-NEXT: and a0, a0, a3 -; RV64I-NEXT: srli a0, a0, 1 -; RV64I-NEXT: or a0, a0, a2 +; RV64I-NEXT: slli a0, a0, 1 +; RV64I-NEXT: or a0, a2, a0 ; RV64I-NEXT: srli a2, a0, 40 ; RV64I-NEXT: and a1, a2, a1 ; RV64I-NEXT: srli a2, a0, 56 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitreverse.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitreverse.ll index c1a9fe20aa93..a3180f0b4e31 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitreverse.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitreverse.ll @@ -12,33 +12,27 @@ define void @bitreverse_v8i16(<8 x i16>* %x, <8 x i16>* %y) { ; LMULMAX2-RV32-NEXT: vsrl.vi v26, v25, 8 ; LMULMAX2-RV32-NEXT: vsll.vi v25, v25, 8 ; LMULMAX2-RV32-NEXT: vor.vv v25, v25, v26 +; LMULMAX2-RV32-NEXT: vsrl.vi v26, v25, 4 ; LMULMAX2-RV32-NEXT: lui a1, 1 ; LMULMAX2-RV32-NEXT: addi a1, a1, -241 -; LMULMAX2-RV32-NEXT: vand.vx v26, v25, a1 -; LMULMAX2-RV32-NEXT: vsll.vi v26, v26, 4 -; LMULMAX2-RV32-NEXT: lui a1, 15 -; LMULMAX2-RV32-NEXT: addi a1, a1, 240 +; LMULMAX2-RV32-NEXT: vand.vx v26, v26, a1 ; LMULMAX2-RV32-NEXT: vand.vx v25, v25, a1 -; LMULMAX2-RV32-NEXT: vsrl.vi v25, v25, 4 -; LMULMAX2-RV32-NEXT: vor.vv v25, v25, v26 +; LMULMAX2-RV32-NEXT: vsll.vi v25, v25, 4 +; LMULMAX2-RV32-NEXT: vor.vv v25, v26, v25 +; LMULMAX2-RV32-NEXT: vsrl.vi v26, v25, 2 ; LMULMAX2-RV32-NEXT: lui a1, 3 ; LMULMAX2-RV32-NEXT: addi a1, a1, 819 -; LMULMAX2-RV32-NEXT: vand.vx v26, v25, a1 -; LMULMAX2-RV32-NEXT: vsll.vi v26, v26, 2 -; LMULMAX2-RV32-NEXT: lui a1, 13 -; LMULMAX2-RV32-NEXT: addi a1, a1, -820 +; LMULMAX2-RV32-NEXT: vand.vx v26, v26, a1 ; LMULMAX2-RV32-NEXT: vand.vx v25, v25, a1 -; LMULMAX2-RV32-NEXT: vsrl.vi v25, v25, 2 -; LMULMAX2-RV32-NEXT: vor.vv v25, v25, v26 +; LMULMAX2-RV32-NEXT: vsll.vi v25, v25, 2 +; LMULMAX2-RV32-NEXT: vor.vv v25, v26, v25 +; LMULMAX2-RV32-NEXT: vsrl.vi v26, v25, 1 ; LMULMAX2-RV32-NEXT: lui a1, 5 ; LMULMAX2-RV32-NEXT: addi a1, a1, 1365 -; LMULMAX2-RV32-NEXT: vand.vx v26, v25, a1 -; LMULMAX2-RV32-NEXT: vadd.vv v26, v26, v26 -; LMULMAX2-RV32-NEXT: lui a1, 11 -; LMULMAX2-RV32-NEXT: addi a1, a1, -1366 +; LMULMAX2-RV32-NEXT: vand.vx v26, v26, a1 ; LMULMAX2-RV32-NEXT: vand.vx v25, v25, a1 -; LMULMAX2-RV32-NEXT: vsrl.vi v25, v25, 1 -; LMULMAX2-RV32-NEXT: vor.vv v25, v25, v26 +; LMULMAX2-RV32-NEXT: vadd.vv v25, v25, v25 +; LMULMAX2-RV32-NEXT: vor.vv v25, v26, v25 ; LMULMAX2-RV32-NEXT: vse16.v v25, (a0) ; LMULMAX2-RV32-NEXT: ret ; @@ 
-49,33 +43,27 @@ define void @bitreverse_v8i16(<8 x i16>* %x, <8 x i16>* %y) { ; LMULMAX2-RV64-NEXT: vsrl.vi v26, v25, 8 ; LMULMAX2-RV64-NEXT: vsll.vi v25, v25, 8 ; LMULMAX2-RV64-NEXT: vor.vv v25, v25, v26 +; LMULMAX2-RV64-NEXT: vsrl.vi v26, v25, 4 ; LMULMAX2-RV64-NEXT: lui a1, 1 ; LMULMAX2-RV64-NEXT: addiw a1, a1, -241 -; LMULMAX2-RV64-NEXT: vand.vx v26, v25, a1 -; LMULMAX2-RV64-NEXT: vsll.vi v26, v26, 4 -; LMULMAX2-RV64-NEXT: lui a1, 15 -; LMULMAX2-RV64-NEXT: addiw a1, a1, 240 +; LMULMAX2-RV64-NEXT: vand.vx v26, v26, a1 ; LMULMAX2-RV64-NEXT: vand.vx v25, v25, a1 -; LMULMAX2-RV64-NEXT: vsrl.vi v25, v25, 4 -; LMULMAX2-RV64-NEXT: vor.vv v25, v25, v26 +; LMULMAX2-RV64-NEXT: vsll.vi v25, v25, 4 +; LMULMAX2-RV64-NEXT: vor.vv v25, v26, v25 +; LMULMAX2-RV64-NEXT: vsrl.vi v26, v25, 2 ; LMULMAX2-RV64-NEXT: lui a1, 3 ; LMULMAX2-RV64-NEXT: addiw a1, a1, 819 -; LMULMAX2-RV64-NEXT: vand.vx v26, v25, a1 -; LMULMAX2-RV64-NEXT: vsll.vi v26, v26, 2 -; LMULMAX2-RV64-NEXT: lui a1, 13 -; LMULMAX2-RV64-NEXT: addiw a1, a1, -820 +; LMULMAX2-RV64-NEXT: vand.vx v26, v26, a1 ; LMULMAX2-RV64-NEXT: vand.vx v25, v25, a1 -; LMULMAX2-RV64-NEXT: vsrl.vi v25, v25, 2 -; LMULMAX2-RV64-NEXT: vor.vv v25, v25, v26 +; LMULMAX2-RV64-NEXT: vsll.vi v25, v25, 2 +; LMULMAX2-RV64-NEXT: vor.vv v25, v26, v25 +; LMULMAX2-RV64-NEXT: vsrl.vi v26, v25, 1 ; LMULMAX2-RV64-NEXT: lui a1, 5 ; LMULMAX2-RV64-NEXT: addiw a1, a1, 1365 -; LMULMAX2-RV64-NEXT: vand.vx v26, v25, a1 -; LMULMAX2-RV64-NEXT: vadd.vv v26, v26, v26 -; LMULMAX2-RV64-NEXT: lui a1, 11 -; LMULMAX2-RV64-NEXT: addiw a1, a1, -1366 +; LMULMAX2-RV64-NEXT: vand.vx v26, v26, a1 ; LMULMAX2-RV64-NEXT: vand.vx v25, v25, a1 -; LMULMAX2-RV64-NEXT: vsrl.vi v25, v25, 1 -; LMULMAX2-RV64-NEXT: vor.vv v25, v25, v26 +; LMULMAX2-RV64-NEXT: vadd.vv v25, v25, v25 +; LMULMAX2-RV64-NEXT: vor.vv v25, v26, v25 ; LMULMAX2-RV64-NEXT: vse16.v v25, (a0) ; LMULMAX2-RV64-NEXT: ret ; @@ -86,33 +74,27 @@ define void @bitreverse_v8i16(<8 x i16>* %x, <8 x i16>* %y) { ; LMULMAX1-RV32-NEXT: vsrl.vi v26, v25, 8 ; LMULMAX1-RV32-NEXT: vsll.vi v25, v25, 8 ; LMULMAX1-RV32-NEXT: vor.vv v25, v25, v26 +; LMULMAX1-RV32-NEXT: vsrl.vi v26, v25, 4 ; LMULMAX1-RV32-NEXT: lui a1, 1 ; LMULMAX1-RV32-NEXT: addi a1, a1, -241 -; LMULMAX1-RV32-NEXT: vand.vx v26, v25, a1 -; LMULMAX1-RV32-NEXT: vsll.vi v26, v26, 4 -; LMULMAX1-RV32-NEXT: lui a1, 15 -; LMULMAX1-RV32-NEXT: addi a1, a1, 240 +; LMULMAX1-RV32-NEXT: vand.vx v26, v26, a1 ; LMULMAX1-RV32-NEXT: vand.vx v25, v25, a1 -; LMULMAX1-RV32-NEXT: vsrl.vi v25, v25, 4 -; LMULMAX1-RV32-NEXT: vor.vv v25, v25, v26 +; LMULMAX1-RV32-NEXT: vsll.vi v25, v25, 4 +; LMULMAX1-RV32-NEXT: vor.vv v25, v26, v25 +; LMULMAX1-RV32-NEXT: vsrl.vi v26, v25, 2 ; LMULMAX1-RV32-NEXT: lui a1, 3 ; LMULMAX1-RV32-NEXT: addi a1, a1, 819 -; LMULMAX1-RV32-NEXT: vand.vx v26, v25, a1 -; LMULMAX1-RV32-NEXT: vsll.vi v26, v26, 2 -; LMULMAX1-RV32-NEXT: lui a1, 13 -; LMULMAX1-RV32-NEXT: addi a1, a1, -820 +; LMULMAX1-RV32-NEXT: vand.vx v26, v26, a1 ; LMULMAX1-RV32-NEXT: vand.vx v25, v25, a1 -; LMULMAX1-RV32-NEXT: vsrl.vi v25, v25, 2 -; LMULMAX1-RV32-NEXT: vor.vv v25, v25, v26 +; LMULMAX1-RV32-NEXT: vsll.vi v25, v25, 2 +; LMULMAX1-RV32-NEXT: vor.vv v25, v26, v25 +; LMULMAX1-RV32-NEXT: vsrl.vi v26, v25, 1 ; LMULMAX1-RV32-NEXT: lui a1, 5 ; LMULMAX1-RV32-NEXT: addi a1, a1, 1365 -; LMULMAX1-RV32-NEXT: vand.vx v26, v25, a1 -; LMULMAX1-RV32-NEXT: vadd.vv v26, v26, v26 -; LMULMAX1-RV32-NEXT: lui a1, 11 -; LMULMAX1-RV32-NEXT: addi a1, a1, -1366 +; LMULMAX1-RV32-NEXT: vand.vx v26, v26, a1 ; LMULMAX1-RV32-NEXT: vand.vx v25, v25, a1 -; 
LMULMAX1-RV32-NEXT: vsrl.vi v25, v25, 1 -; LMULMAX1-RV32-NEXT: vor.vv v25, v25, v26 +; LMULMAX1-RV32-NEXT: vadd.vv v25, v25, v25 +; LMULMAX1-RV32-NEXT: vor.vv v25, v26, v25 ; LMULMAX1-RV32-NEXT: vse16.v v25, (a0) ; LMULMAX1-RV32-NEXT: ret ; @@ -123,33 +105,27 @@ define void @bitreverse_v8i16(<8 x i16>* %x, <8 x i16>* %y) { ; LMULMAX1-RV64-NEXT: vsrl.vi v26, v25, 8 ; LMULMAX1-RV64-NEXT: vsll.vi v25, v25, 8 ; LMULMAX1-RV64-NEXT: vor.vv v25, v25, v26 +; LMULMAX1-RV64-NEXT: vsrl.vi v26, v25, 4 ; LMULMAX1-RV64-NEXT: lui a1, 1 ; LMULMAX1-RV64-NEXT: addiw a1, a1, -241 -; LMULMAX1-RV64-NEXT: vand.vx v26, v25, a1 -; LMULMAX1-RV64-NEXT: vsll.vi v26, v26, 4 -; LMULMAX1-RV64-NEXT: lui a1, 15 -; LMULMAX1-RV64-NEXT: addiw a1, a1, 240 +; LMULMAX1-RV64-NEXT: vand.vx v26, v26, a1 ; LMULMAX1-RV64-NEXT: vand.vx v25, v25, a1 -; LMULMAX1-RV64-NEXT: vsrl.vi v25, v25, 4 -; LMULMAX1-RV64-NEXT: vor.vv v25, v25, v26 +; LMULMAX1-RV64-NEXT: vsll.vi v25, v25, 4 +; LMULMAX1-RV64-NEXT: vor.vv v25, v26, v25 +; LMULMAX1-RV64-NEXT: vsrl.vi v26, v25, 2 ; LMULMAX1-RV64-NEXT: lui a1, 3 ; LMULMAX1-RV64-NEXT: addiw a1, a1, 819 -; LMULMAX1-RV64-NEXT: vand.vx v26, v25, a1 -; LMULMAX1-RV64-NEXT: vsll.vi v26, v26, 2 -; LMULMAX1-RV64-NEXT: lui a1, 13 -; LMULMAX1-RV64-NEXT: addiw a1, a1, -820 +; LMULMAX1-RV64-NEXT: vand.vx v26, v26, a1 ; LMULMAX1-RV64-NEXT: vand.vx v25, v25, a1 -; LMULMAX1-RV64-NEXT: vsrl.vi v25, v25, 2 -; LMULMAX1-RV64-NEXT: vor.vv v25, v25, v26 +; LMULMAX1-RV64-NEXT: vsll.vi v25, v25, 2 +; LMULMAX1-RV64-NEXT: vor.vv v25, v26, v25 +; LMULMAX1-RV64-NEXT: vsrl.vi v26, v25, 1 ; LMULMAX1-RV64-NEXT: lui a1, 5 ; LMULMAX1-RV64-NEXT: addiw a1, a1, 1365 -; LMULMAX1-RV64-NEXT: vand.vx v26, v25, a1 -; LMULMAX1-RV64-NEXT: vadd.vv v26, v26, v26 -; LMULMAX1-RV64-NEXT: lui a1, 11 -; LMULMAX1-RV64-NEXT: addiw a1, a1, -1366 +; LMULMAX1-RV64-NEXT: vand.vx v26, v26, a1 ; LMULMAX1-RV64-NEXT: vand.vx v25, v25, a1 -; LMULMAX1-RV64-NEXT: vsrl.vi v25, v25, 1 -; LMULMAX1-RV64-NEXT: vor.vv v25, v25, v26 +; LMULMAX1-RV64-NEXT: vadd.vv v25, v25, v25 +; LMULMAX1-RV64-NEXT: vor.vv v25, v26, v25 ; LMULMAX1-RV64-NEXT: vse16.v v25, (a0) ; LMULMAX1-RV64-NEXT: ret %a = load <8 x i16>, <8 x i16>* %x @@ -177,33 +153,27 @@ define void @bitreverse_v4i32(<4 x i32>* %x, <4 x i32>* %y) { ; LMULMAX2-RV32-NEXT: vsll.vi v25, v25, 24 ; LMULMAX2-RV32-NEXT: vor.vv v25, v25, v27 ; LMULMAX2-RV32-NEXT: vor.vv v25, v25, v26 +; LMULMAX2-RV32-NEXT: vsrl.vi v26, v25, 4 ; LMULMAX2-RV32-NEXT: lui a1, 61681 ; LMULMAX2-RV32-NEXT: addi a1, a1, -241 -; LMULMAX2-RV32-NEXT: vand.vx v26, v25, a1 -; LMULMAX2-RV32-NEXT: vsll.vi v26, v26, 4 -; LMULMAX2-RV32-NEXT: lui a1, 986895 -; LMULMAX2-RV32-NEXT: addi a1, a1, 240 +; LMULMAX2-RV32-NEXT: vand.vx v26, v26, a1 ; LMULMAX2-RV32-NEXT: vand.vx v25, v25, a1 -; LMULMAX2-RV32-NEXT: vsrl.vi v25, v25, 4 -; LMULMAX2-RV32-NEXT: vor.vv v25, v25, v26 +; LMULMAX2-RV32-NEXT: vsll.vi v25, v25, 4 +; LMULMAX2-RV32-NEXT: vor.vv v25, v26, v25 +; LMULMAX2-RV32-NEXT: vsrl.vi v26, v25, 2 ; LMULMAX2-RV32-NEXT: lui a1, 209715 ; LMULMAX2-RV32-NEXT: addi a1, a1, 819 -; LMULMAX2-RV32-NEXT: vand.vx v26, v25, a1 -; LMULMAX2-RV32-NEXT: vsll.vi v26, v26, 2 -; LMULMAX2-RV32-NEXT: lui a1, 838861 -; LMULMAX2-RV32-NEXT: addi a1, a1, -820 +; LMULMAX2-RV32-NEXT: vand.vx v26, v26, a1 ; LMULMAX2-RV32-NEXT: vand.vx v25, v25, a1 -; LMULMAX2-RV32-NEXT: vsrl.vi v25, v25, 2 -; LMULMAX2-RV32-NEXT: vor.vv v25, v25, v26 +; LMULMAX2-RV32-NEXT: vsll.vi v25, v25, 2 +; LMULMAX2-RV32-NEXT: vor.vv v25, v26, v25 +; LMULMAX2-RV32-NEXT: vsrl.vi v26, v25, 1 ; LMULMAX2-RV32-NEXT: lui a1, 
349525 ; LMULMAX2-RV32-NEXT: addi a1, a1, 1365 -; LMULMAX2-RV32-NEXT: vand.vx v26, v25, a1 -; LMULMAX2-RV32-NEXT: vadd.vv v26, v26, v26 -; LMULMAX2-RV32-NEXT: lui a1, 699051 -; LMULMAX2-RV32-NEXT: addi a1, a1, -1366 +; LMULMAX2-RV32-NEXT: vand.vx v26, v26, a1 ; LMULMAX2-RV32-NEXT: vand.vx v25, v25, a1 -; LMULMAX2-RV32-NEXT: vsrl.vi v25, v25, 1 -; LMULMAX2-RV32-NEXT: vor.vv v25, v25, v26 +; LMULMAX2-RV32-NEXT: vadd.vv v25, v25, v25 +; LMULMAX2-RV32-NEXT: vor.vv v25, v26, v25 ; LMULMAX2-RV32-NEXT: vse32.v v25, (a0) ; LMULMAX2-RV32-NEXT: ret ; @@ -223,39 +193,27 @@ define void @bitreverse_v4i32(<4 x i32>* %x, <4 x i32>* %y) { ; LMULMAX2-RV64-NEXT: vsll.vi v25, v25, 24 ; LMULMAX2-RV64-NEXT: vor.vv v25, v25, v27 ; LMULMAX2-RV64-NEXT: vor.vv v25, v25, v26 +; LMULMAX2-RV64-NEXT: vsrl.vi v26, v25, 4 ; LMULMAX2-RV64-NEXT: lui a1, 61681 ; LMULMAX2-RV64-NEXT: addiw a1, a1, -241 -; LMULMAX2-RV64-NEXT: vand.vx v26, v25, a1 -; LMULMAX2-RV64-NEXT: vsll.vi v26, v26, 4 -; LMULMAX2-RV64-NEXT: lui a1, 241 -; LMULMAX2-RV64-NEXT: addiw a1, a1, -241 -; LMULMAX2-RV64-NEXT: slli a1, a1, 12 -; LMULMAX2-RV64-NEXT: addi a1, a1, 240 +; LMULMAX2-RV64-NEXT: vand.vx v26, v26, a1 ; LMULMAX2-RV64-NEXT: vand.vx v25, v25, a1 -; LMULMAX2-RV64-NEXT: vsrl.vi v25, v25, 4 -; LMULMAX2-RV64-NEXT: vor.vv v25, v25, v26 +; LMULMAX2-RV64-NEXT: vsll.vi v25, v25, 4 +; LMULMAX2-RV64-NEXT: vor.vv v25, v26, v25 +; LMULMAX2-RV64-NEXT: vsrl.vi v26, v25, 2 ; LMULMAX2-RV64-NEXT: lui a1, 209715 ; LMULMAX2-RV64-NEXT: addiw a1, a1, 819 -; LMULMAX2-RV64-NEXT: vand.vx v26, v25, a1 -; LMULMAX2-RV64-NEXT: vsll.vi v26, v26, 2 -; LMULMAX2-RV64-NEXT: lui a1, 205 -; LMULMAX2-RV64-NEXT: addiw a1, a1, -819 -; LMULMAX2-RV64-NEXT: slli a1, a1, 12 -; LMULMAX2-RV64-NEXT: addi a1, a1, -820 +; LMULMAX2-RV64-NEXT: vand.vx v26, v26, a1 ; LMULMAX2-RV64-NEXT: vand.vx v25, v25, a1 -; LMULMAX2-RV64-NEXT: vsrl.vi v25, v25, 2 -; LMULMAX2-RV64-NEXT: vor.vv v25, v25, v26 +; LMULMAX2-RV64-NEXT: vsll.vi v25, v25, 2 +; LMULMAX2-RV64-NEXT: vor.vv v25, v26, v25 +; LMULMAX2-RV64-NEXT: vsrl.vi v26, v25, 1 ; LMULMAX2-RV64-NEXT: lui a1, 349525 ; LMULMAX2-RV64-NEXT: addiw a1, a1, 1365 -; LMULMAX2-RV64-NEXT: vand.vx v26, v25, a1 -; LMULMAX2-RV64-NEXT: vadd.vv v26, v26, v26 -; LMULMAX2-RV64-NEXT: lui a1, 171 -; LMULMAX2-RV64-NEXT: addiw a1, a1, -1365 -; LMULMAX2-RV64-NEXT: slli a1, a1, 12 -; LMULMAX2-RV64-NEXT: addi a1, a1, -1366 +; LMULMAX2-RV64-NEXT: vand.vx v26, v26, a1 ; LMULMAX2-RV64-NEXT: vand.vx v25, v25, a1 -; LMULMAX2-RV64-NEXT: vsrl.vi v25, v25, 1 -; LMULMAX2-RV64-NEXT: vor.vv v25, v25, v26 +; LMULMAX2-RV64-NEXT: vadd.vv v25, v25, v25 +; LMULMAX2-RV64-NEXT: vor.vv v25, v26, v25 ; LMULMAX2-RV64-NEXT: vse32.v v25, (a0) ; LMULMAX2-RV64-NEXT: ret ; @@ -275,33 +233,27 @@ define void @bitreverse_v4i32(<4 x i32>* %x, <4 x i32>* %y) { ; LMULMAX1-RV32-NEXT: vsll.vi v25, v25, 24 ; LMULMAX1-RV32-NEXT: vor.vv v25, v25, v27 ; LMULMAX1-RV32-NEXT: vor.vv v25, v25, v26 +; LMULMAX1-RV32-NEXT: vsrl.vi v26, v25, 4 ; LMULMAX1-RV32-NEXT: lui a1, 61681 ; LMULMAX1-RV32-NEXT: addi a1, a1, -241 -; LMULMAX1-RV32-NEXT: vand.vx v26, v25, a1 -; LMULMAX1-RV32-NEXT: vsll.vi v26, v26, 4 -; LMULMAX1-RV32-NEXT: lui a1, 986895 -; LMULMAX1-RV32-NEXT: addi a1, a1, 240 +; LMULMAX1-RV32-NEXT: vand.vx v26, v26, a1 ; LMULMAX1-RV32-NEXT: vand.vx v25, v25, a1 -; LMULMAX1-RV32-NEXT: vsrl.vi v25, v25, 4 -; LMULMAX1-RV32-NEXT: vor.vv v25, v25, v26 +; LMULMAX1-RV32-NEXT: vsll.vi v25, v25, 4 +; LMULMAX1-RV32-NEXT: vor.vv v25, v26, v25 +; LMULMAX1-RV32-NEXT: vsrl.vi v26, v25, 2 ; LMULMAX1-RV32-NEXT: lui a1, 209715 ; 
LMULMAX1-RV32-NEXT: addi a1, a1, 819 -; LMULMAX1-RV32-NEXT: vand.vx v26, v25, a1 -; LMULMAX1-RV32-NEXT: vsll.vi v26, v26, 2 -; LMULMAX1-RV32-NEXT: lui a1, 838861 -; LMULMAX1-RV32-NEXT: addi a1, a1, -820 +; LMULMAX1-RV32-NEXT: vand.vx v26, v26, a1 ; LMULMAX1-RV32-NEXT: vand.vx v25, v25, a1 -; LMULMAX1-RV32-NEXT: vsrl.vi v25, v25, 2 -; LMULMAX1-RV32-NEXT: vor.vv v25, v25, v26 +; LMULMAX1-RV32-NEXT: vsll.vi v25, v25, 2 +; LMULMAX1-RV32-NEXT: vor.vv v25, v26, v25 +; LMULMAX1-RV32-NEXT: vsrl.vi v26, v25, 1 ; LMULMAX1-RV32-NEXT: lui a1, 349525 ; LMULMAX1-RV32-NEXT: addi a1, a1, 1365 -; LMULMAX1-RV32-NEXT: vand.vx v26, v25, a1 -; LMULMAX1-RV32-NEXT: vadd.vv v26, v26, v26 -; LMULMAX1-RV32-NEXT: lui a1, 699051 -; LMULMAX1-RV32-NEXT: addi a1, a1, -1366 +; LMULMAX1-RV32-NEXT: vand.vx v26, v26, a1 ; LMULMAX1-RV32-NEXT: vand.vx v25, v25, a1 -; LMULMAX1-RV32-NEXT: vsrl.vi v25, v25, 1 -; LMULMAX1-RV32-NEXT: vor.vv v25, v25, v26 +; LMULMAX1-RV32-NEXT: vadd.vv v25, v25, v25 +; LMULMAX1-RV32-NEXT: vor.vv v25, v26, v25 ; LMULMAX1-RV32-NEXT: vse32.v v25, (a0) ; LMULMAX1-RV32-NEXT: ret ; @@ -321,39 +273,27 @@ define void @bitreverse_v4i32(<4 x i32>* %x, <4 x i32>* %y) { ; LMULMAX1-RV64-NEXT: vsll.vi v25, v25, 24 ; LMULMAX1-RV64-NEXT: vor.vv v25, v25, v27 ; LMULMAX1-RV64-NEXT: vor.vv v25, v25, v26 +; LMULMAX1-RV64-NEXT: vsrl.vi v26, v25, 4 ; LMULMAX1-RV64-NEXT: lui a1, 61681 ; LMULMAX1-RV64-NEXT: addiw a1, a1, -241 -; LMULMAX1-RV64-NEXT: vand.vx v26, v25, a1 -; LMULMAX1-RV64-NEXT: vsll.vi v26, v26, 4 -; LMULMAX1-RV64-NEXT: lui a1, 241 -; LMULMAX1-RV64-NEXT: addiw a1, a1, -241 -; LMULMAX1-RV64-NEXT: slli a1, a1, 12 -; LMULMAX1-RV64-NEXT: addi a1, a1, 240 +; LMULMAX1-RV64-NEXT: vand.vx v26, v26, a1 ; LMULMAX1-RV64-NEXT: vand.vx v25, v25, a1 -; LMULMAX1-RV64-NEXT: vsrl.vi v25, v25, 4 -; LMULMAX1-RV64-NEXT: vor.vv v25, v25, v26 +; LMULMAX1-RV64-NEXT: vsll.vi v25, v25, 4 +; LMULMAX1-RV64-NEXT: vor.vv v25, v26, v25 +; LMULMAX1-RV64-NEXT: vsrl.vi v26, v25, 2 ; LMULMAX1-RV64-NEXT: lui a1, 209715 ; LMULMAX1-RV64-NEXT: addiw a1, a1, 819 -; LMULMAX1-RV64-NEXT: vand.vx v26, v25, a1 -; LMULMAX1-RV64-NEXT: vsll.vi v26, v26, 2 -; LMULMAX1-RV64-NEXT: lui a1, 205 -; LMULMAX1-RV64-NEXT: addiw a1, a1, -819 -; LMULMAX1-RV64-NEXT: slli a1, a1, 12 -; LMULMAX1-RV64-NEXT: addi a1, a1, -820 +; LMULMAX1-RV64-NEXT: vand.vx v26, v26, a1 ; LMULMAX1-RV64-NEXT: vand.vx v25, v25, a1 -; LMULMAX1-RV64-NEXT: vsrl.vi v25, v25, 2 -; LMULMAX1-RV64-NEXT: vor.vv v25, v25, v26 +; LMULMAX1-RV64-NEXT: vsll.vi v25, v25, 2 +; LMULMAX1-RV64-NEXT: vor.vv v25, v26, v25 +; LMULMAX1-RV64-NEXT: vsrl.vi v26, v25, 1 ; LMULMAX1-RV64-NEXT: lui a1, 349525 ; LMULMAX1-RV64-NEXT: addiw a1, a1, 1365 -; LMULMAX1-RV64-NEXT: vand.vx v26, v25, a1 -; LMULMAX1-RV64-NEXT: vadd.vv v26, v26, v26 -; LMULMAX1-RV64-NEXT: lui a1, 171 -; LMULMAX1-RV64-NEXT: addiw a1, a1, -1365 -; LMULMAX1-RV64-NEXT: slli a1, a1, 12 -; LMULMAX1-RV64-NEXT: addi a1, a1, -1366 +; LMULMAX1-RV64-NEXT: vand.vx v26, v26, a1 ; LMULMAX1-RV64-NEXT: vand.vx v25, v25, a1 -; LMULMAX1-RV64-NEXT: vsrl.vi v25, v25, 1 -; LMULMAX1-RV64-NEXT: vor.vv v25, v25, v26 +; LMULMAX1-RV64-NEXT: vadd.vv v25, v25, v25 +; LMULMAX1-RV64-NEXT: vor.vv v25, v26, v25 ; LMULMAX1-RV64-NEXT: vse32.v v25, (a0) ; LMULMAX1-RV64-NEXT: ret %a = load <4 x i32>, <4 x i32>* %x @@ -416,51 +356,36 @@ define void @bitreverse_v2i64(<2 x i64>* %x, <2 x i64>* %y) { ; LMULMAX2-RV32-NEXT: vor.vv v25, v25, v28 ; LMULMAX2-RV32-NEXT: vor.vv v25, v25, v27 ; LMULMAX2-RV32-NEXT: vor.vv v25, v25, v26 +; LMULMAX2-RV32-NEXT: vsrl.vi v26, v25, 4 ; 
LMULMAX2-RV32-NEXT: lui a1, 61681 ; LMULMAX2-RV32-NEXT: addi a1, a1, -241 ; LMULMAX2-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, mu -; LMULMAX2-RV32-NEXT: vmv.v.x v26, a1 -; LMULMAX2-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, mu -; LMULMAX2-RV32-NEXT: vand.vv v26, v25, v26 -; LMULMAX2-RV32-NEXT: vsll.vi v26, v26, 4 -; LMULMAX2-RV32-NEXT: lui a1, 986895 -; LMULMAX2-RV32-NEXT: addi a1, a1, 240 -; LMULMAX2-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, mu ; LMULMAX2-RV32-NEXT: vmv.v.x v27, a1 ; LMULMAX2-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, mu +; LMULMAX2-RV32-NEXT: vand.vv v26, v26, v27 ; LMULMAX2-RV32-NEXT: vand.vv v25, v25, v27 -; LMULMAX2-RV32-NEXT: vsrl.vi v25, v25, 4 -; LMULMAX2-RV32-NEXT: vor.vv v25, v25, v26 +; LMULMAX2-RV32-NEXT: vsll.vi v25, v25, 4 +; LMULMAX2-RV32-NEXT: vor.vv v25, v26, v25 +; LMULMAX2-RV32-NEXT: vsrl.vi v26, v25, 2 ; LMULMAX2-RV32-NEXT: lui a1, 209715 ; LMULMAX2-RV32-NEXT: addi a1, a1, 819 ; LMULMAX2-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, mu -; LMULMAX2-RV32-NEXT: vmv.v.x v26, a1 -; LMULMAX2-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, mu -; LMULMAX2-RV32-NEXT: vand.vv v26, v25, v26 -; LMULMAX2-RV32-NEXT: vsll.vi v26, v26, 2 -; LMULMAX2-RV32-NEXT: lui a1, 838861 -; LMULMAX2-RV32-NEXT: addi a1, a1, -820 -; LMULMAX2-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, mu ; LMULMAX2-RV32-NEXT: vmv.v.x v27, a1 ; LMULMAX2-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, mu +; LMULMAX2-RV32-NEXT: vand.vv v26, v26, v27 ; LMULMAX2-RV32-NEXT: vand.vv v25, v25, v27 -; LMULMAX2-RV32-NEXT: vsrl.vi v25, v25, 2 -; LMULMAX2-RV32-NEXT: vor.vv v25, v25, v26 +; LMULMAX2-RV32-NEXT: vsll.vi v25, v25, 2 +; LMULMAX2-RV32-NEXT: vor.vv v25, v26, v25 +; LMULMAX2-RV32-NEXT: vsrl.vi v26, v25, 1 ; LMULMAX2-RV32-NEXT: lui a1, 349525 ; LMULMAX2-RV32-NEXT: addi a1, a1, 1365 ; LMULMAX2-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, mu -; LMULMAX2-RV32-NEXT: vmv.v.x v26, a1 -; LMULMAX2-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, mu -; LMULMAX2-RV32-NEXT: vand.vv v26, v25, v26 -; LMULMAX2-RV32-NEXT: vadd.vv v26, v26, v26 -; LMULMAX2-RV32-NEXT: lui a1, 699051 -; LMULMAX2-RV32-NEXT: addi a1, a1, -1366 -; LMULMAX2-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, mu ; LMULMAX2-RV32-NEXT: vmv.v.x v27, a1 ; LMULMAX2-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, mu +; LMULMAX2-RV32-NEXT: vand.vv v26, v26, v27 ; LMULMAX2-RV32-NEXT: vand.vv v25, v25, v27 -; LMULMAX2-RV32-NEXT: vsrl.vi v25, v25, 1 -; LMULMAX2-RV32-NEXT: vor.vv v25, v25, v26 +; LMULMAX2-RV32-NEXT: vadd.vv v25, v25, v25 +; LMULMAX2-RV32-NEXT: vor.vv v25, v26, v25 ; LMULMAX2-RV32-NEXT: vse64.v v25, (a0) ; LMULMAX2-RV32-NEXT: ret ; @@ -499,6 +424,7 @@ define void @bitreverse_v2i64(<2 x i64>* %x, <2 x i64>* %y) { ; LMULMAX2-RV64-NEXT: vor.vv v25, v28, v25 ; LMULMAX2-RV64-NEXT: vor.vv v25, v25, v27 ; LMULMAX2-RV64-NEXT: vor.vv v25, v25, v26 +; LMULMAX2-RV64-NEXT: vsrl.vi v26, v25, 4 ; LMULMAX2-RV64-NEXT: lui a1, 3855 ; LMULMAX2-RV64-NEXT: addiw a1, a1, 241 ; LMULMAX2-RV64-NEXT: slli a1, a1, 12 @@ -507,19 +433,11 @@ define void @bitreverse_v2i64(<2 x i64>* %x, <2 x i64>* %y) { ; LMULMAX2-RV64-NEXT: addi a1, a1, 241 ; LMULMAX2-RV64-NEXT: slli a1, a1, 12 ; LMULMAX2-RV64-NEXT: addi a1, a1, -241 -; LMULMAX2-RV64-NEXT: vand.vx v26, v25, a1 -; LMULMAX2-RV64-NEXT: vsll.vi v26, v26, 4 -; LMULMAX2-RV64-NEXT: lui a1, 1044721 -; LMULMAX2-RV64-NEXT: addiw a1, a1, -241 -; LMULMAX2-RV64-NEXT: slli a1, a1, 12 -; LMULMAX2-RV64-NEXT: addi a1, a1, 241 -; LMULMAX2-RV64-NEXT: slli a1, a1, 12 -; LMULMAX2-RV64-NEXT: addi a1, a1, -241 -; LMULMAX2-RV64-NEXT: slli a1, a1, 12 -; LMULMAX2-RV64-NEXT: addi a1, a1, 
240 +; LMULMAX2-RV64-NEXT: vand.vx v26, v26, a1 ; LMULMAX2-RV64-NEXT: vand.vx v25, v25, a1 -; LMULMAX2-RV64-NEXT: vsrl.vi v25, v25, 4 -; LMULMAX2-RV64-NEXT: vor.vv v25, v25, v26 +; LMULMAX2-RV64-NEXT: vsll.vi v25, v25, 4 +; LMULMAX2-RV64-NEXT: vor.vv v25, v26, v25 +; LMULMAX2-RV64-NEXT: vsrl.vi v26, v25, 2 ; LMULMAX2-RV64-NEXT: lui a1, 13107 ; LMULMAX2-RV64-NEXT: addiw a1, a1, 819 ; LMULMAX2-RV64-NEXT: slli a1, a1, 12 @@ -528,19 +446,11 @@ define void @bitreverse_v2i64(<2 x i64>* %x, <2 x i64>* %y) { ; LMULMAX2-RV64-NEXT: addi a1, a1, 819 ; LMULMAX2-RV64-NEXT: slli a1, a1, 12 ; LMULMAX2-RV64-NEXT: addi a1, a1, 819 -; LMULMAX2-RV64-NEXT: vand.vx v26, v25, a1 -; LMULMAX2-RV64-NEXT: vsll.vi v26, v26, 2 -; LMULMAX2-RV64-NEXT: lui a1, 1035469 -; LMULMAX2-RV64-NEXT: addiw a1, a1, -819 -; LMULMAX2-RV64-NEXT: slli a1, a1, 12 -; LMULMAX2-RV64-NEXT: addi a1, a1, -819 -; LMULMAX2-RV64-NEXT: slli a1, a1, 12 -; LMULMAX2-RV64-NEXT: addi a1, a1, -819 -; LMULMAX2-RV64-NEXT: slli a1, a1, 12 -; LMULMAX2-RV64-NEXT: addi a1, a1, -820 +; LMULMAX2-RV64-NEXT: vand.vx v26, v26, a1 ; LMULMAX2-RV64-NEXT: vand.vx v25, v25, a1 -; LMULMAX2-RV64-NEXT: vsrl.vi v25, v25, 2 -; LMULMAX2-RV64-NEXT: vor.vv v25, v25, v26 +; LMULMAX2-RV64-NEXT: vsll.vi v25, v25, 2 +; LMULMAX2-RV64-NEXT: vor.vv v25, v26, v25 +; LMULMAX2-RV64-NEXT: vsrl.vi v26, v25, 1 ; LMULMAX2-RV64-NEXT: lui a1, 21845 ; LMULMAX2-RV64-NEXT: addiw a1, a1, 1365 ; LMULMAX2-RV64-NEXT: slli a1, a1, 12 @@ -549,19 +459,10 @@ define void @bitreverse_v2i64(<2 x i64>* %x, <2 x i64>* %y) { ; LMULMAX2-RV64-NEXT: addi a1, a1, 1365 ; LMULMAX2-RV64-NEXT: slli a1, a1, 12 ; LMULMAX2-RV64-NEXT: addi a1, a1, 1365 -; LMULMAX2-RV64-NEXT: vand.vx v26, v25, a1 -; LMULMAX2-RV64-NEXT: vadd.vv v26, v26, v26 -; LMULMAX2-RV64-NEXT: lui a1, 1026731 -; LMULMAX2-RV64-NEXT: addiw a1, a1, -1365 -; LMULMAX2-RV64-NEXT: slli a1, a1, 12 -; LMULMAX2-RV64-NEXT: addi a1, a1, -1365 -; LMULMAX2-RV64-NEXT: slli a1, a1, 12 -; LMULMAX2-RV64-NEXT: addi a1, a1, -1365 -; LMULMAX2-RV64-NEXT: slli a1, a1, 12 -; LMULMAX2-RV64-NEXT: addi a1, a1, -1366 +; LMULMAX2-RV64-NEXT: vand.vx v26, v26, a1 ; LMULMAX2-RV64-NEXT: vand.vx v25, v25, a1 -; LMULMAX2-RV64-NEXT: vsrl.vi v25, v25, 1 -; LMULMAX2-RV64-NEXT: vor.vv v25, v25, v26 +; LMULMAX2-RV64-NEXT: vadd.vv v25, v25, v25 +; LMULMAX2-RV64-NEXT: vor.vv v25, v26, v25 ; LMULMAX2-RV64-NEXT: vse64.v v25, (a0) ; LMULMAX2-RV64-NEXT: ret ; @@ -616,51 +517,36 @@ define void @bitreverse_v2i64(<2 x i64>* %x, <2 x i64>* %y) { ; LMULMAX1-RV32-NEXT: vor.vv v25, v25, v28 ; LMULMAX1-RV32-NEXT: vor.vv v25, v25, v27 ; LMULMAX1-RV32-NEXT: vor.vv v25, v25, v26 +; LMULMAX1-RV32-NEXT: vsrl.vi v26, v25, 4 ; LMULMAX1-RV32-NEXT: lui a1, 61681 ; LMULMAX1-RV32-NEXT: addi a1, a1, -241 ; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, mu -; LMULMAX1-RV32-NEXT: vmv.v.x v26, a1 -; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, mu -; LMULMAX1-RV32-NEXT: vand.vv v26, v25, v26 -; LMULMAX1-RV32-NEXT: vsll.vi v26, v26, 4 -; LMULMAX1-RV32-NEXT: lui a1, 986895 -; LMULMAX1-RV32-NEXT: addi a1, a1, 240 -; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, mu ; LMULMAX1-RV32-NEXT: vmv.v.x v27, a1 ; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, mu +; LMULMAX1-RV32-NEXT: vand.vv v26, v26, v27 ; LMULMAX1-RV32-NEXT: vand.vv v25, v25, v27 -; LMULMAX1-RV32-NEXT: vsrl.vi v25, v25, 4 -; LMULMAX1-RV32-NEXT: vor.vv v25, v25, v26 +; LMULMAX1-RV32-NEXT: vsll.vi v25, v25, 4 +; LMULMAX1-RV32-NEXT: vor.vv v25, v26, v25 +; LMULMAX1-RV32-NEXT: vsrl.vi v26, v25, 2 ; LMULMAX1-RV32-NEXT: lui a1, 209715 ; 
LMULMAX1-RV32-NEXT: addi a1, a1, 819 ; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, mu -; LMULMAX1-RV32-NEXT: vmv.v.x v26, a1 -; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, mu -; LMULMAX1-RV32-NEXT: vand.vv v26, v25, v26 -; LMULMAX1-RV32-NEXT: vsll.vi v26, v26, 2 -; LMULMAX1-RV32-NEXT: lui a1, 838861 -; LMULMAX1-RV32-NEXT: addi a1, a1, -820 -; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, mu ; LMULMAX1-RV32-NEXT: vmv.v.x v27, a1 ; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, mu +; LMULMAX1-RV32-NEXT: vand.vv v26, v26, v27 ; LMULMAX1-RV32-NEXT: vand.vv v25, v25, v27 -; LMULMAX1-RV32-NEXT: vsrl.vi v25, v25, 2 -; LMULMAX1-RV32-NEXT: vor.vv v25, v25, v26 +; LMULMAX1-RV32-NEXT: vsll.vi v25, v25, 2 +; LMULMAX1-RV32-NEXT: vor.vv v25, v26, v25 +; LMULMAX1-RV32-NEXT: vsrl.vi v26, v25, 1 ; LMULMAX1-RV32-NEXT: lui a1, 349525 ; LMULMAX1-RV32-NEXT: addi a1, a1, 1365 ; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, mu -; LMULMAX1-RV32-NEXT: vmv.v.x v26, a1 -; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, mu -; LMULMAX1-RV32-NEXT: vand.vv v26, v25, v26 -; LMULMAX1-RV32-NEXT: vadd.vv v26, v26, v26 -; LMULMAX1-RV32-NEXT: lui a1, 699051 -; LMULMAX1-RV32-NEXT: addi a1, a1, -1366 -; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, mu ; LMULMAX1-RV32-NEXT: vmv.v.x v27, a1 ; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, mu +; LMULMAX1-RV32-NEXT: vand.vv v26, v26, v27 ; LMULMAX1-RV32-NEXT: vand.vv v25, v25, v27 -; LMULMAX1-RV32-NEXT: vsrl.vi v25, v25, 1 -; LMULMAX1-RV32-NEXT: vor.vv v25, v25, v26 +; LMULMAX1-RV32-NEXT: vadd.vv v25, v25, v25 +; LMULMAX1-RV32-NEXT: vor.vv v25, v26, v25 ; LMULMAX1-RV32-NEXT: vse64.v v25, (a0) ; LMULMAX1-RV32-NEXT: ret ; @@ -699,6 +585,7 @@ define void @bitreverse_v2i64(<2 x i64>* %x, <2 x i64>* %y) { ; LMULMAX1-RV64-NEXT: vor.vv v25, v28, v25 ; LMULMAX1-RV64-NEXT: vor.vv v25, v25, v27 ; LMULMAX1-RV64-NEXT: vor.vv v25, v25, v26 +; LMULMAX1-RV64-NEXT: vsrl.vi v26, v25, 4 ; LMULMAX1-RV64-NEXT: lui a1, 3855 ; LMULMAX1-RV64-NEXT: addiw a1, a1, 241 ; LMULMAX1-RV64-NEXT: slli a1, a1, 12 @@ -707,19 +594,11 @@ define void @bitreverse_v2i64(<2 x i64>* %x, <2 x i64>* %y) { ; LMULMAX1-RV64-NEXT: addi a1, a1, 241 ; LMULMAX1-RV64-NEXT: slli a1, a1, 12 ; LMULMAX1-RV64-NEXT: addi a1, a1, -241 -; LMULMAX1-RV64-NEXT: vand.vx v26, v25, a1 -; LMULMAX1-RV64-NEXT: vsll.vi v26, v26, 4 -; LMULMAX1-RV64-NEXT: lui a1, 1044721 -; LMULMAX1-RV64-NEXT: addiw a1, a1, -241 -; LMULMAX1-RV64-NEXT: slli a1, a1, 12 -; LMULMAX1-RV64-NEXT: addi a1, a1, 241 -; LMULMAX1-RV64-NEXT: slli a1, a1, 12 -; LMULMAX1-RV64-NEXT: addi a1, a1, -241 -; LMULMAX1-RV64-NEXT: slli a1, a1, 12 -; LMULMAX1-RV64-NEXT: addi a1, a1, 240 +; LMULMAX1-RV64-NEXT: vand.vx v26, v26, a1 ; LMULMAX1-RV64-NEXT: vand.vx v25, v25, a1 -; LMULMAX1-RV64-NEXT: vsrl.vi v25, v25, 4 -; LMULMAX1-RV64-NEXT: vor.vv v25, v25, v26 +; LMULMAX1-RV64-NEXT: vsll.vi v25, v25, 4 +; LMULMAX1-RV64-NEXT: vor.vv v25, v26, v25 +; LMULMAX1-RV64-NEXT: vsrl.vi v26, v25, 2 ; LMULMAX1-RV64-NEXT: lui a1, 13107 ; LMULMAX1-RV64-NEXT: addiw a1, a1, 819 ; LMULMAX1-RV64-NEXT: slli a1, a1, 12 @@ -728,19 +607,11 @@ define void @bitreverse_v2i64(<2 x i64>* %x, <2 x i64>* %y) { ; LMULMAX1-RV64-NEXT: addi a1, a1, 819 ; LMULMAX1-RV64-NEXT: slli a1, a1, 12 ; LMULMAX1-RV64-NEXT: addi a1, a1, 819 -; LMULMAX1-RV64-NEXT: vand.vx v26, v25, a1 -; LMULMAX1-RV64-NEXT: vsll.vi v26, v26, 2 -; LMULMAX1-RV64-NEXT: lui a1, 1035469 -; LMULMAX1-RV64-NEXT: addiw a1, a1, -819 -; LMULMAX1-RV64-NEXT: slli a1, a1, 12 -; LMULMAX1-RV64-NEXT: addi a1, a1, -819 -; 
LMULMAX1-RV64-NEXT: slli a1, a1, 12 -; LMULMAX1-RV64-NEXT: addi a1, a1, -819 -; LMULMAX1-RV64-NEXT: slli a1, a1, 12 -; LMULMAX1-RV64-NEXT: addi a1, a1, -820 +; LMULMAX1-RV64-NEXT: vand.vx v26, v26, a1 ; LMULMAX1-RV64-NEXT: vand.vx v25, v25, a1 -; LMULMAX1-RV64-NEXT: vsrl.vi v25, v25, 2 -; LMULMAX1-RV64-NEXT: vor.vv v25, v25, v26 +; LMULMAX1-RV64-NEXT: vsll.vi v25, v25, 2 +; LMULMAX1-RV64-NEXT: vor.vv v25, v26, v25 +; LMULMAX1-RV64-NEXT: vsrl.vi v26, v25, 1 ; LMULMAX1-RV64-NEXT: lui a1, 21845 ; LMULMAX1-RV64-NEXT: addiw a1, a1, 1365 ; LMULMAX1-RV64-NEXT: slli a1, a1, 12 @@ -749,19 +620,10 @@ define void @bitreverse_v2i64(<2 x i64>* %x, <2 x i64>* %y) { ; LMULMAX1-RV64-NEXT: addi a1, a1, 1365 ; LMULMAX1-RV64-NEXT: slli a1, a1, 12 ; LMULMAX1-RV64-NEXT: addi a1, a1, 1365 -; LMULMAX1-RV64-NEXT: vand.vx v26, v25, a1 -; LMULMAX1-RV64-NEXT: vadd.vv v26, v26, v26 -; LMULMAX1-RV64-NEXT: lui a1, 1026731 -; LMULMAX1-RV64-NEXT: addiw a1, a1, -1365 -; LMULMAX1-RV64-NEXT: slli a1, a1, 12 -; LMULMAX1-RV64-NEXT: addi a1, a1, -1365 -; LMULMAX1-RV64-NEXT: slli a1, a1, 12 -; LMULMAX1-RV64-NEXT: addi a1, a1, -1365 -; LMULMAX1-RV64-NEXT: slli a1, a1, 12 -; LMULMAX1-RV64-NEXT: addi a1, a1, -1366 +; LMULMAX1-RV64-NEXT: vand.vx v26, v26, a1 ; LMULMAX1-RV64-NEXT: vand.vx v25, v25, a1 -; LMULMAX1-RV64-NEXT: vsrl.vi v25, v25, 1 -; LMULMAX1-RV64-NEXT: vor.vv v25, v25, v26 +; LMULMAX1-RV64-NEXT: vadd.vv v25, v25, v25 +; LMULMAX1-RV64-NEXT: vor.vv v25, v26, v25 ; LMULMAX1-RV64-NEXT: vse64.v v25, (a0) ; LMULMAX1-RV64-NEXT: ret %a = load <2 x i64>, <2 x i64>* %x @@ -780,33 +642,27 @@ define void @bitreverse_v16i16(<16 x i16>* %x, <16 x i16>* %y) { ; LMULMAX2-RV32-NEXT: vsrl.vi v28, v26, 8 ; LMULMAX2-RV32-NEXT: vsll.vi v26, v26, 8 ; LMULMAX2-RV32-NEXT: vor.vv v26, v26, v28 +; LMULMAX2-RV32-NEXT: vsrl.vi v28, v26, 4 ; LMULMAX2-RV32-NEXT: lui a1, 1 ; LMULMAX2-RV32-NEXT: addi a1, a1, -241 -; LMULMAX2-RV32-NEXT: vand.vx v28, v26, a1 -; LMULMAX2-RV32-NEXT: vsll.vi v28, v28, 4 -; LMULMAX2-RV32-NEXT: lui a1, 15 -; LMULMAX2-RV32-NEXT: addi a1, a1, 240 +; LMULMAX2-RV32-NEXT: vand.vx v28, v28, a1 ; LMULMAX2-RV32-NEXT: vand.vx v26, v26, a1 -; LMULMAX2-RV32-NEXT: vsrl.vi v26, v26, 4 -; LMULMAX2-RV32-NEXT: vor.vv v26, v26, v28 +; LMULMAX2-RV32-NEXT: vsll.vi v26, v26, 4 +; LMULMAX2-RV32-NEXT: vor.vv v26, v28, v26 +; LMULMAX2-RV32-NEXT: vsrl.vi v28, v26, 2 ; LMULMAX2-RV32-NEXT: lui a1, 3 ; LMULMAX2-RV32-NEXT: addi a1, a1, 819 -; LMULMAX2-RV32-NEXT: vand.vx v28, v26, a1 -; LMULMAX2-RV32-NEXT: vsll.vi v28, v28, 2 -; LMULMAX2-RV32-NEXT: lui a1, 13 -; LMULMAX2-RV32-NEXT: addi a1, a1, -820 +; LMULMAX2-RV32-NEXT: vand.vx v28, v28, a1 ; LMULMAX2-RV32-NEXT: vand.vx v26, v26, a1 -; LMULMAX2-RV32-NEXT: vsrl.vi v26, v26, 2 -; LMULMAX2-RV32-NEXT: vor.vv v26, v26, v28 +; LMULMAX2-RV32-NEXT: vsll.vi v26, v26, 2 +; LMULMAX2-RV32-NEXT: vor.vv v26, v28, v26 +; LMULMAX2-RV32-NEXT: vsrl.vi v28, v26, 1 ; LMULMAX2-RV32-NEXT: lui a1, 5 ; LMULMAX2-RV32-NEXT: addi a1, a1, 1365 -; LMULMAX2-RV32-NEXT: vand.vx v28, v26, a1 -; LMULMAX2-RV32-NEXT: vadd.vv v28, v28, v28 -; LMULMAX2-RV32-NEXT: lui a1, 11 -; LMULMAX2-RV32-NEXT: addi a1, a1, -1366 +; LMULMAX2-RV32-NEXT: vand.vx v28, v28, a1 ; LMULMAX2-RV32-NEXT: vand.vx v26, v26, a1 -; LMULMAX2-RV32-NEXT: vsrl.vi v26, v26, 1 -; LMULMAX2-RV32-NEXT: vor.vv v26, v26, v28 +; LMULMAX2-RV32-NEXT: vadd.vv v26, v26, v26 +; LMULMAX2-RV32-NEXT: vor.vv v26, v28, v26 ; LMULMAX2-RV32-NEXT: vse16.v v26, (a0) ; LMULMAX2-RV32-NEXT: ret ; @@ -817,150 +673,132 @@ define void @bitreverse_v16i16(<16 x i16>* %x, <16 x i16>* %y) { ; 
LMULMAX2-RV64-NEXT: vsrl.vi v28, v26, 8 ; LMULMAX2-RV64-NEXT: vsll.vi v26, v26, 8 ; LMULMAX2-RV64-NEXT: vor.vv v26, v26, v28 +; LMULMAX2-RV64-NEXT: vsrl.vi v28, v26, 4 ; LMULMAX2-RV64-NEXT: lui a1, 1 ; LMULMAX2-RV64-NEXT: addiw a1, a1, -241 -; LMULMAX2-RV64-NEXT: vand.vx v28, v26, a1 -; LMULMAX2-RV64-NEXT: vsll.vi v28, v28, 4 -; LMULMAX2-RV64-NEXT: lui a1, 15 -; LMULMAX2-RV64-NEXT: addiw a1, a1, 240 +; LMULMAX2-RV64-NEXT: vand.vx v28, v28, a1 ; LMULMAX2-RV64-NEXT: vand.vx v26, v26, a1 -; LMULMAX2-RV64-NEXT: vsrl.vi v26, v26, 4 -; LMULMAX2-RV64-NEXT: vor.vv v26, v26, v28 +; LMULMAX2-RV64-NEXT: vsll.vi v26, v26, 4 +; LMULMAX2-RV64-NEXT: vor.vv v26, v28, v26 +; LMULMAX2-RV64-NEXT: vsrl.vi v28, v26, 2 ; LMULMAX2-RV64-NEXT: lui a1, 3 ; LMULMAX2-RV64-NEXT: addiw a1, a1, 819 -; LMULMAX2-RV64-NEXT: vand.vx v28, v26, a1 -; LMULMAX2-RV64-NEXT: vsll.vi v28, v28, 2 -; LMULMAX2-RV64-NEXT: lui a1, 13 -; LMULMAX2-RV64-NEXT: addiw a1, a1, -820 +; LMULMAX2-RV64-NEXT: vand.vx v28, v28, a1 ; LMULMAX2-RV64-NEXT: vand.vx v26, v26, a1 -; LMULMAX2-RV64-NEXT: vsrl.vi v26, v26, 2 -; LMULMAX2-RV64-NEXT: vor.vv v26, v26, v28 +; LMULMAX2-RV64-NEXT: vsll.vi v26, v26, 2 +; LMULMAX2-RV64-NEXT: vor.vv v26, v28, v26 +; LMULMAX2-RV64-NEXT: vsrl.vi v28, v26, 1 ; LMULMAX2-RV64-NEXT: lui a1, 5 ; LMULMAX2-RV64-NEXT: addiw a1, a1, 1365 -; LMULMAX2-RV64-NEXT: vand.vx v28, v26, a1 -; LMULMAX2-RV64-NEXT: vadd.vv v28, v28, v28 -; LMULMAX2-RV64-NEXT: lui a1, 11 -; LMULMAX2-RV64-NEXT: addiw a1, a1, -1366 +; LMULMAX2-RV64-NEXT: vand.vx v28, v28, a1 ; LMULMAX2-RV64-NEXT: vand.vx v26, v26, a1 -; LMULMAX2-RV64-NEXT: vsrl.vi v26, v26, 1 -; LMULMAX2-RV64-NEXT: vor.vv v26, v26, v28 +; LMULMAX2-RV64-NEXT: vadd.vv v26, v26, v26 +; LMULMAX2-RV64-NEXT: vor.vv v26, v28, v26 ; LMULMAX2-RV64-NEXT: vse16.v v26, (a0) ; LMULMAX2-RV64-NEXT: ret ; ; LMULMAX1-RV32-LABEL: bitreverse_v16i16: ; LMULMAX1-RV32: # %bb.0: ; LMULMAX1-RV32-NEXT: vsetivli zero, 8, e16, m1, ta, mu -; LMULMAX1-RV32-NEXT: addi a6, a0, 16 -; LMULMAX1-RV32-NEXT: vle16.v v25, (a6) +; LMULMAX1-RV32-NEXT: addi a1, a0, 16 +; LMULMAX1-RV32-NEXT: vle16.v v25, (a1) ; LMULMAX1-RV32-NEXT: vle16.v v26, (a0) ; LMULMAX1-RV32-NEXT: vsrl.vi v27, v25, 8 ; LMULMAX1-RV32-NEXT: vsll.vi v25, v25, 8 ; LMULMAX1-RV32-NEXT: vor.vv v25, v25, v27 +; LMULMAX1-RV32-NEXT: vsrl.vi v27, v25, 4 ; LMULMAX1-RV32-NEXT: lui a2, 1 -; LMULMAX1-RV32-NEXT: addi a7, a2, -241 -; LMULMAX1-RV32-NEXT: vand.vx v27, v25, a7 -; LMULMAX1-RV32-NEXT: vsll.vi v27, v27, 4 -; LMULMAX1-RV32-NEXT: lui a3, 15 -; LMULMAX1-RV32-NEXT: addi a3, a3, 240 -; LMULMAX1-RV32-NEXT: vand.vx v25, v25, a3 -; LMULMAX1-RV32-NEXT: vsrl.vi v25, v25, 4 -; LMULMAX1-RV32-NEXT: vor.vv v25, v25, v27 -; LMULMAX1-RV32-NEXT: lui a4, 3 -; LMULMAX1-RV32-NEXT: addi a4, a4, 819 -; LMULMAX1-RV32-NEXT: vand.vx v27, v25, a4 -; LMULMAX1-RV32-NEXT: vsll.vi v27, v27, 2 -; LMULMAX1-RV32-NEXT: lui a5, 13 -; LMULMAX1-RV32-NEXT: addi a5, a5, -820 -; LMULMAX1-RV32-NEXT: vand.vx v25, v25, a5 -; LMULMAX1-RV32-NEXT: vsrl.vi v25, v25, 2 -; LMULMAX1-RV32-NEXT: vor.vv v25, v25, v27 -; LMULMAX1-RV32-NEXT: lui a1, 5 -; LMULMAX1-RV32-NEXT: addi a1, a1, 1365 -; LMULMAX1-RV32-NEXT: vand.vx v27, v25, a1 -; LMULMAX1-RV32-NEXT: vadd.vv v27, v27, v27 -; LMULMAX1-RV32-NEXT: lui a2, 11 -; LMULMAX1-RV32-NEXT: addi a2, a2, -1366 +; LMULMAX1-RV32-NEXT: addi a2, a2, -241 +; LMULMAX1-RV32-NEXT: vand.vx v27, v27, a2 ; LMULMAX1-RV32-NEXT: vand.vx v25, v25, a2 -; LMULMAX1-RV32-NEXT: vsrl.vi v25, v25, 1 -; LMULMAX1-RV32-NEXT: vor.vv v25, v25, v27 +; LMULMAX1-RV32-NEXT: vsll.vi v25, v25, 4 +; 
LMULMAX1-RV32-NEXT: vor.vv v25, v27, v25 +; LMULMAX1-RV32-NEXT: vsrl.vi v27, v25, 2 +; LMULMAX1-RV32-NEXT: lui a3, 3 +; LMULMAX1-RV32-NEXT: addi a3, a3, 819 +; LMULMAX1-RV32-NEXT: vand.vx v27, v27, a3 +; LMULMAX1-RV32-NEXT: vand.vx v25, v25, a3 +; LMULMAX1-RV32-NEXT: vsll.vi v25, v25, 2 +; LMULMAX1-RV32-NEXT: vor.vv v25, v27, v25 +; LMULMAX1-RV32-NEXT: vsrl.vi v27, v25, 1 +; LMULMAX1-RV32-NEXT: lui a4, 5 +; LMULMAX1-RV32-NEXT: addi a4, a4, 1365 +; LMULMAX1-RV32-NEXT: vand.vx v27, v27, a4 +; LMULMAX1-RV32-NEXT: vand.vx v25, v25, a4 +; LMULMAX1-RV32-NEXT: vadd.vv v25, v25, v25 +; LMULMAX1-RV32-NEXT: vor.vv v25, v27, v25 ; LMULMAX1-RV32-NEXT: vsrl.vi v27, v26, 8 ; LMULMAX1-RV32-NEXT: vsll.vi v26, v26, 8 ; LMULMAX1-RV32-NEXT: vor.vv v26, v26, v27 -; LMULMAX1-RV32-NEXT: vand.vx v27, v26, a7 -; LMULMAX1-RV32-NEXT: vsll.vi v27, v27, 4 -; LMULMAX1-RV32-NEXT: vand.vx v26, v26, a3 -; LMULMAX1-RV32-NEXT: vsrl.vi v26, v26, 4 -; LMULMAX1-RV32-NEXT: vor.vv v26, v26, v27 -; LMULMAX1-RV32-NEXT: vand.vx v27, v26, a4 -; LMULMAX1-RV32-NEXT: vsll.vi v27, v27, 2 -; LMULMAX1-RV32-NEXT: vand.vx v26, v26, a5 -; LMULMAX1-RV32-NEXT: vsrl.vi v26, v26, 2 -; LMULMAX1-RV32-NEXT: vor.vv v26, v26, v27 -; LMULMAX1-RV32-NEXT: vand.vx v27, v26, a1 -; LMULMAX1-RV32-NEXT: vadd.vv v27, v27, v27 +; LMULMAX1-RV32-NEXT: vsrl.vi v27, v26, 4 +; LMULMAX1-RV32-NEXT: vand.vx v27, v27, a2 ; LMULMAX1-RV32-NEXT: vand.vx v26, v26, a2 -; LMULMAX1-RV32-NEXT: vsrl.vi v26, v26, 1 -; LMULMAX1-RV32-NEXT: vor.vv v26, v26, v27 +; LMULMAX1-RV32-NEXT: vsll.vi v26, v26, 4 +; LMULMAX1-RV32-NEXT: vor.vv v26, v27, v26 +; LMULMAX1-RV32-NEXT: vsrl.vi v27, v26, 2 +; LMULMAX1-RV32-NEXT: vand.vx v27, v27, a3 +; LMULMAX1-RV32-NEXT: vand.vx v26, v26, a3 +; LMULMAX1-RV32-NEXT: vsll.vi v26, v26, 2 +; LMULMAX1-RV32-NEXT: vor.vv v26, v27, v26 +; LMULMAX1-RV32-NEXT: vsrl.vi v27, v26, 1 +; LMULMAX1-RV32-NEXT: vand.vx v27, v27, a4 +; LMULMAX1-RV32-NEXT: vand.vx v26, v26, a4 +; LMULMAX1-RV32-NEXT: vadd.vv v26, v26, v26 +; LMULMAX1-RV32-NEXT: vor.vv v26, v27, v26 ; LMULMAX1-RV32-NEXT: vse16.v v26, (a0) -; LMULMAX1-RV32-NEXT: vse16.v v25, (a6) +; LMULMAX1-RV32-NEXT: vse16.v v25, (a1) ; LMULMAX1-RV32-NEXT: ret ; ; LMULMAX1-RV64-LABEL: bitreverse_v16i16: ; LMULMAX1-RV64: # %bb.0: ; LMULMAX1-RV64-NEXT: vsetivli zero, 8, e16, m1, ta, mu -; LMULMAX1-RV64-NEXT: addi a6, a0, 16 -; LMULMAX1-RV64-NEXT: vle16.v v25, (a6) +; LMULMAX1-RV64-NEXT: addi a1, a0, 16 +; LMULMAX1-RV64-NEXT: vle16.v v25, (a1) ; LMULMAX1-RV64-NEXT: vle16.v v26, (a0) ; LMULMAX1-RV64-NEXT: vsrl.vi v27, v25, 8 ; LMULMAX1-RV64-NEXT: vsll.vi v25, v25, 8 ; LMULMAX1-RV64-NEXT: vor.vv v25, v25, v27 +; LMULMAX1-RV64-NEXT: vsrl.vi v27, v25, 4 ; LMULMAX1-RV64-NEXT: lui a2, 1 -; LMULMAX1-RV64-NEXT: addiw a7, a2, -241 -; LMULMAX1-RV64-NEXT: vand.vx v27, v25, a7 -; LMULMAX1-RV64-NEXT: vsll.vi v27, v27, 4 -; LMULMAX1-RV64-NEXT: lui a3, 15 -; LMULMAX1-RV64-NEXT: addiw a3, a3, 240 -; LMULMAX1-RV64-NEXT: vand.vx v25, v25, a3 -; LMULMAX1-RV64-NEXT: vsrl.vi v25, v25, 4 -; LMULMAX1-RV64-NEXT: vor.vv v25, v25, v27 -; LMULMAX1-RV64-NEXT: lui a4, 3 -; LMULMAX1-RV64-NEXT: addiw a4, a4, 819 -; LMULMAX1-RV64-NEXT: vand.vx v27, v25, a4 -; LMULMAX1-RV64-NEXT: vsll.vi v27, v27, 2 -; LMULMAX1-RV64-NEXT: lui a5, 13 -; LMULMAX1-RV64-NEXT: addiw a5, a5, -820 -; LMULMAX1-RV64-NEXT: vand.vx v25, v25, a5 -; LMULMAX1-RV64-NEXT: vsrl.vi v25, v25, 2 -; LMULMAX1-RV64-NEXT: vor.vv v25, v25, v27 -; LMULMAX1-RV64-NEXT: lui a1, 5 -; LMULMAX1-RV64-NEXT: addiw a1, a1, 1365 -; LMULMAX1-RV64-NEXT: vand.vx v27, v25, a1 -; LMULMAX1-RV64-NEXT: vadd.vv v27, 
v27, v27 -; LMULMAX1-RV64-NEXT: lui a2, 11 -; LMULMAX1-RV64-NEXT: addiw a2, a2, -1366 +; LMULMAX1-RV64-NEXT: addiw a2, a2, -241 +; LMULMAX1-RV64-NEXT: vand.vx v27, v27, a2 ; LMULMAX1-RV64-NEXT: vand.vx v25, v25, a2 -; LMULMAX1-RV64-NEXT: vsrl.vi v25, v25, 1 -; LMULMAX1-RV64-NEXT: vor.vv v25, v25, v27 +; LMULMAX1-RV64-NEXT: vsll.vi v25, v25, 4 +; LMULMAX1-RV64-NEXT: vor.vv v25, v27, v25 +; LMULMAX1-RV64-NEXT: vsrl.vi v27, v25, 2 +; LMULMAX1-RV64-NEXT: lui a3, 3 +; LMULMAX1-RV64-NEXT: addiw a3, a3, 819 +; LMULMAX1-RV64-NEXT: vand.vx v27, v27, a3 +; LMULMAX1-RV64-NEXT: vand.vx v25, v25, a3 +; LMULMAX1-RV64-NEXT: vsll.vi v25, v25, 2 +; LMULMAX1-RV64-NEXT: vor.vv v25, v27, v25 +; LMULMAX1-RV64-NEXT: vsrl.vi v27, v25, 1 +; LMULMAX1-RV64-NEXT: lui a4, 5 +; LMULMAX1-RV64-NEXT: addiw a4, a4, 1365 +; LMULMAX1-RV64-NEXT: vand.vx v27, v27, a4 +; LMULMAX1-RV64-NEXT: vand.vx v25, v25, a4 +; LMULMAX1-RV64-NEXT: vadd.vv v25, v25, v25 +; LMULMAX1-RV64-NEXT: vor.vv v25, v27, v25 ; LMULMAX1-RV64-NEXT: vsrl.vi v27, v26, 8 ; LMULMAX1-RV64-NEXT: vsll.vi v26, v26, 8 ; LMULMAX1-RV64-NEXT: vor.vv v26, v26, v27 -; LMULMAX1-RV64-NEXT: vand.vx v27, v26, a7 -; LMULMAX1-RV64-NEXT: vsll.vi v27, v27, 4 -; LMULMAX1-RV64-NEXT: vand.vx v26, v26, a3 -; LMULMAX1-RV64-NEXT: vsrl.vi v26, v26, 4 -; LMULMAX1-RV64-NEXT: vor.vv v26, v26, v27 -; LMULMAX1-RV64-NEXT: vand.vx v27, v26, a4 -; LMULMAX1-RV64-NEXT: vsll.vi v27, v27, 2 -; LMULMAX1-RV64-NEXT: vand.vx v26, v26, a5 -; LMULMAX1-RV64-NEXT: vsrl.vi v26, v26, 2 -; LMULMAX1-RV64-NEXT: vor.vv v26, v26, v27 -; LMULMAX1-RV64-NEXT: vand.vx v27, v26, a1 -; LMULMAX1-RV64-NEXT: vadd.vv v27, v27, v27 +; LMULMAX1-RV64-NEXT: vsrl.vi v27, v26, 4 +; LMULMAX1-RV64-NEXT: vand.vx v27, v27, a2 ; LMULMAX1-RV64-NEXT: vand.vx v26, v26, a2 -; LMULMAX1-RV64-NEXT: vsrl.vi v26, v26, 1 -; LMULMAX1-RV64-NEXT: vor.vv v26, v26, v27 +; LMULMAX1-RV64-NEXT: vsll.vi v26, v26, 4 +; LMULMAX1-RV64-NEXT: vor.vv v26, v27, v26 +; LMULMAX1-RV64-NEXT: vsrl.vi v27, v26, 2 +; LMULMAX1-RV64-NEXT: vand.vx v27, v27, a3 +; LMULMAX1-RV64-NEXT: vand.vx v26, v26, a3 +; LMULMAX1-RV64-NEXT: vsll.vi v26, v26, 2 +; LMULMAX1-RV64-NEXT: vor.vv v26, v27, v26 +; LMULMAX1-RV64-NEXT: vsrl.vi v27, v26, 1 +; LMULMAX1-RV64-NEXT: vand.vx v27, v27, a4 +; LMULMAX1-RV64-NEXT: vand.vx v26, v26, a4 +; LMULMAX1-RV64-NEXT: vadd.vv v26, v26, v26 +; LMULMAX1-RV64-NEXT: vor.vv v26, v27, v26 ; LMULMAX1-RV64-NEXT: vse16.v v26, (a0) -; LMULMAX1-RV64-NEXT: vse16.v v25, (a6) +; LMULMAX1-RV64-NEXT: vse16.v v25, (a1) ; LMULMAX1-RV64-NEXT: ret %a = load <16 x i16>, <16 x i16>* %x %b = load <16 x i16>, <16 x i16>* %y @@ -987,33 +825,27 @@ define void @bitreverse_v8i32(<8 x i32>* %x, <8 x i32>* %y) { ; LMULMAX2-RV32-NEXT: vsll.vi v26, v26, 24 ; LMULMAX2-RV32-NEXT: vor.vv v26, v26, v30 ; LMULMAX2-RV32-NEXT: vor.vv v26, v26, v28 +; LMULMAX2-RV32-NEXT: vsrl.vi v28, v26, 4 ; LMULMAX2-RV32-NEXT: lui a1, 61681 ; LMULMAX2-RV32-NEXT: addi a1, a1, -241 -; LMULMAX2-RV32-NEXT: vand.vx v28, v26, a1 -; LMULMAX2-RV32-NEXT: vsll.vi v28, v28, 4 -; LMULMAX2-RV32-NEXT: lui a1, 986895 -; LMULMAX2-RV32-NEXT: addi a1, a1, 240 +; LMULMAX2-RV32-NEXT: vand.vx v28, v28, a1 ; LMULMAX2-RV32-NEXT: vand.vx v26, v26, a1 -; LMULMAX2-RV32-NEXT: vsrl.vi v26, v26, 4 -; LMULMAX2-RV32-NEXT: vor.vv v26, v26, v28 +; LMULMAX2-RV32-NEXT: vsll.vi v26, v26, 4 +; LMULMAX2-RV32-NEXT: vor.vv v26, v28, v26 +; LMULMAX2-RV32-NEXT: vsrl.vi v28, v26, 2 ; LMULMAX2-RV32-NEXT: lui a1, 209715 ; LMULMAX2-RV32-NEXT: addi a1, a1, 819 -; LMULMAX2-RV32-NEXT: vand.vx v28, v26, a1 -; LMULMAX2-RV32-NEXT: vsll.vi v28, v28, 
2 -; LMULMAX2-RV32-NEXT: lui a1, 838861 -; LMULMAX2-RV32-NEXT: addi a1, a1, -820 +; LMULMAX2-RV32-NEXT: vand.vx v28, v28, a1 ; LMULMAX2-RV32-NEXT: vand.vx v26, v26, a1 -; LMULMAX2-RV32-NEXT: vsrl.vi v26, v26, 2 -; LMULMAX2-RV32-NEXT: vor.vv v26, v26, v28 +; LMULMAX2-RV32-NEXT: vsll.vi v26, v26, 2 +; LMULMAX2-RV32-NEXT: vor.vv v26, v28, v26 +; LMULMAX2-RV32-NEXT: vsrl.vi v28, v26, 1 ; LMULMAX2-RV32-NEXT: lui a1, 349525 ; LMULMAX2-RV32-NEXT: addi a1, a1, 1365 -; LMULMAX2-RV32-NEXT: vand.vx v28, v26, a1 -; LMULMAX2-RV32-NEXT: vadd.vv v28, v28, v28 -; LMULMAX2-RV32-NEXT: lui a1, 699051 -; LMULMAX2-RV32-NEXT: addi a1, a1, -1366 +; LMULMAX2-RV32-NEXT: vand.vx v28, v28, a1 ; LMULMAX2-RV32-NEXT: vand.vx v26, v26, a1 -; LMULMAX2-RV32-NEXT: vsrl.vi v26, v26, 1 -; LMULMAX2-RV32-NEXT: vor.vv v26, v26, v28 +; LMULMAX2-RV32-NEXT: vadd.vv v26, v26, v26 +; LMULMAX2-RV32-NEXT: vor.vv v26, v28, v26 ; LMULMAX2-RV32-NEXT: vse32.v v26, (a0) ; LMULMAX2-RV32-NEXT: ret ; @@ -1033,39 +865,27 @@ define void @bitreverse_v8i32(<8 x i32>* %x, <8 x i32>* %y) { ; LMULMAX2-RV64-NEXT: vsll.vi v26, v26, 24 ; LMULMAX2-RV64-NEXT: vor.vv v26, v26, v30 ; LMULMAX2-RV64-NEXT: vor.vv v26, v26, v28 +; LMULMAX2-RV64-NEXT: vsrl.vi v28, v26, 4 ; LMULMAX2-RV64-NEXT: lui a1, 61681 ; LMULMAX2-RV64-NEXT: addiw a1, a1, -241 -; LMULMAX2-RV64-NEXT: vand.vx v28, v26, a1 -; LMULMAX2-RV64-NEXT: vsll.vi v28, v28, 4 -; LMULMAX2-RV64-NEXT: lui a1, 241 -; LMULMAX2-RV64-NEXT: addiw a1, a1, -241 -; LMULMAX2-RV64-NEXT: slli a1, a1, 12 -; LMULMAX2-RV64-NEXT: addi a1, a1, 240 +; LMULMAX2-RV64-NEXT: vand.vx v28, v28, a1 ; LMULMAX2-RV64-NEXT: vand.vx v26, v26, a1 -; LMULMAX2-RV64-NEXT: vsrl.vi v26, v26, 4 -; LMULMAX2-RV64-NEXT: vor.vv v26, v26, v28 +; LMULMAX2-RV64-NEXT: vsll.vi v26, v26, 4 +; LMULMAX2-RV64-NEXT: vor.vv v26, v28, v26 +; LMULMAX2-RV64-NEXT: vsrl.vi v28, v26, 2 ; LMULMAX2-RV64-NEXT: lui a1, 209715 ; LMULMAX2-RV64-NEXT: addiw a1, a1, 819 -; LMULMAX2-RV64-NEXT: vand.vx v28, v26, a1 -; LMULMAX2-RV64-NEXT: vsll.vi v28, v28, 2 -; LMULMAX2-RV64-NEXT: lui a1, 205 -; LMULMAX2-RV64-NEXT: addiw a1, a1, -819 -; LMULMAX2-RV64-NEXT: slli a1, a1, 12 -; LMULMAX2-RV64-NEXT: addi a1, a1, -820 +; LMULMAX2-RV64-NEXT: vand.vx v28, v28, a1 ; LMULMAX2-RV64-NEXT: vand.vx v26, v26, a1 -; LMULMAX2-RV64-NEXT: vsrl.vi v26, v26, 2 -; LMULMAX2-RV64-NEXT: vor.vv v26, v26, v28 +; LMULMAX2-RV64-NEXT: vsll.vi v26, v26, 2 +; LMULMAX2-RV64-NEXT: vor.vv v26, v28, v26 +; LMULMAX2-RV64-NEXT: vsrl.vi v28, v26, 1 ; LMULMAX2-RV64-NEXT: lui a1, 349525 ; LMULMAX2-RV64-NEXT: addiw a1, a1, 1365 -; LMULMAX2-RV64-NEXT: vand.vx v28, v26, a1 -; LMULMAX2-RV64-NEXT: vadd.vv v28, v28, v28 -; LMULMAX2-RV64-NEXT: lui a1, 171 -; LMULMAX2-RV64-NEXT: addiw a1, a1, -1365 -; LMULMAX2-RV64-NEXT: slli a1, a1, 12 -; LMULMAX2-RV64-NEXT: addi a1, a1, -1366 +; LMULMAX2-RV64-NEXT: vand.vx v28, v28, a1 ; LMULMAX2-RV64-NEXT: vand.vx v26, v26, a1 -; LMULMAX2-RV64-NEXT: vsrl.vi v26, v26, 1 -; LMULMAX2-RV64-NEXT: vor.vv v26, v26, v28 +; LMULMAX2-RV64-NEXT: vadd.vv v26, v26, v26 +; LMULMAX2-RV64-NEXT: vor.vv v26, v28, v26 ; LMULMAX2-RV64-NEXT: vse32.v v26, (a0) ; LMULMAX2-RV64-NEXT: ret ; @@ -1077,67 +897,61 @@ define void @bitreverse_v8i32(<8 x i32>* %x, <8 x i32>* %y) { ; LMULMAX1-RV32-NEXT: vle32.v v26, (a0) ; LMULMAX1-RV32-NEXT: vsrl.vi v27, v25, 8 ; LMULMAX1-RV32-NEXT: lui a2, 16 -; LMULMAX1-RV32-NEXT: addi a7, a2, -256 -; LMULMAX1-RV32-NEXT: vand.vx v27, v27, a7 +; LMULMAX1-RV32-NEXT: addi a2, a2, -256 +; LMULMAX1-RV32-NEXT: vand.vx v27, v27, a2 ; LMULMAX1-RV32-NEXT: vsrl.vi v28, v25, 24 ; 
LMULMAX1-RV32-NEXT: vor.vv v27, v27, v28 ; LMULMAX1-RV32-NEXT: vsll.vi v28, v25, 8 -; LMULMAX1-RV32-NEXT: lui t0, 4080 -; LMULMAX1-RV32-NEXT: vand.vx v28, v28, t0 +; LMULMAX1-RV32-NEXT: lui a3, 4080 +; LMULMAX1-RV32-NEXT: vand.vx v28, v28, a3 ; LMULMAX1-RV32-NEXT: vsll.vi v25, v25, 24 ; LMULMAX1-RV32-NEXT: vor.vv v25, v25, v28 ; LMULMAX1-RV32-NEXT: vor.vv v25, v25, v27 +; LMULMAX1-RV32-NEXT: vsrl.vi v27, v25, 4 ; LMULMAX1-RV32-NEXT: lui a4, 61681 -; LMULMAX1-RV32-NEXT: addi t1, a4, -241 -; LMULMAX1-RV32-NEXT: vand.vx v27, v25, t1 -; LMULMAX1-RV32-NEXT: vsll.vi v27, v27, 4 -; LMULMAX1-RV32-NEXT: lui a5, 986895 -; LMULMAX1-RV32-NEXT: addi a5, a5, 240 -; LMULMAX1-RV32-NEXT: vand.vx v25, v25, a5 -; LMULMAX1-RV32-NEXT: vsrl.vi v25, v25, 4 -; LMULMAX1-RV32-NEXT: vor.vv v25, v25, v27 -; LMULMAX1-RV32-NEXT: lui a1, 209715 -; LMULMAX1-RV32-NEXT: addi a1, a1, 819 -; LMULMAX1-RV32-NEXT: vand.vx v27, v25, a1 -; LMULMAX1-RV32-NEXT: vsll.vi v27, v27, 2 -; LMULMAX1-RV32-NEXT: lui a2, 838861 -; LMULMAX1-RV32-NEXT: addi a2, a2, -820 -; LMULMAX1-RV32-NEXT: vand.vx v25, v25, a2 -; LMULMAX1-RV32-NEXT: vsrl.vi v25, v25, 2 -; LMULMAX1-RV32-NEXT: vor.vv v25, v25, v27 -; LMULMAX1-RV32-NEXT: lui a3, 349525 -; LMULMAX1-RV32-NEXT: addi a3, a3, 1365 -; LMULMAX1-RV32-NEXT: vand.vx v27, v25, a3 -; LMULMAX1-RV32-NEXT: vadd.vv v27, v27, v27 -; LMULMAX1-RV32-NEXT: lui a4, 699051 -; LMULMAX1-RV32-NEXT: addi a4, a4, -1366 +; LMULMAX1-RV32-NEXT: addi a4, a4, -241 +; LMULMAX1-RV32-NEXT: vand.vx v27, v27, a4 ; LMULMAX1-RV32-NEXT: vand.vx v25, v25, a4 -; LMULMAX1-RV32-NEXT: vsrl.vi v25, v25, 1 -; LMULMAX1-RV32-NEXT: vor.vv v25, v25, v27 +; LMULMAX1-RV32-NEXT: vsll.vi v25, v25, 4 +; LMULMAX1-RV32-NEXT: vor.vv v25, v27, v25 +; LMULMAX1-RV32-NEXT: vsrl.vi v27, v25, 2 +; LMULMAX1-RV32-NEXT: lui a5, 209715 +; LMULMAX1-RV32-NEXT: addi a5, a5, 819 +; LMULMAX1-RV32-NEXT: vand.vx v27, v27, a5 +; LMULMAX1-RV32-NEXT: vand.vx v25, v25, a5 +; LMULMAX1-RV32-NEXT: vsll.vi v25, v25, 2 +; LMULMAX1-RV32-NEXT: vor.vv v25, v27, v25 +; LMULMAX1-RV32-NEXT: vsrl.vi v27, v25, 1 +; LMULMAX1-RV32-NEXT: lui a1, 349525 +; LMULMAX1-RV32-NEXT: addi a1, a1, 1365 +; LMULMAX1-RV32-NEXT: vand.vx v27, v27, a1 +; LMULMAX1-RV32-NEXT: vand.vx v25, v25, a1 +; LMULMAX1-RV32-NEXT: vadd.vv v25, v25, v25 +; LMULMAX1-RV32-NEXT: vor.vv v25, v27, v25 ; LMULMAX1-RV32-NEXT: vsrl.vi v27, v26, 8 -; LMULMAX1-RV32-NEXT: vand.vx v27, v27, a7 +; LMULMAX1-RV32-NEXT: vand.vx v27, v27, a2 ; LMULMAX1-RV32-NEXT: vsrl.vi v28, v26, 24 ; LMULMAX1-RV32-NEXT: vor.vv v27, v27, v28 ; LMULMAX1-RV32-NEXT: vsll.vi v28, v26, 8 -; LMULMAX1-RV32-NEXT: vand.vx v28, v28, t0 +; LMULMAX1-RV32-NEXT: vand.vx v28, v28, a3 ; LMULMAX1-RV32-NEXT: vsll.vi v26, v26, 24 ; LMULMAX1-RV32-NEXT: vor.vv v26, v26, v28 ; LMULMAX1-RV32-NEXT: vor.vv v26, v26, v27 -; LMULMAX1-RV32-NEXT: vand.vx v27, v26, t1 -; LMULMAX1-RV32-NEXT: vsll.vi v27, v27, 4 -; LMULMAX1-RV32-NEXT: vand.vx v26, v26, a5 -; LMULMAX1-RV32-NEXT: vsrl.vi v26, v26, 4 -; LMULMAX1-RV32-NEXT: vor.vv v26, v26, v27 -; LMULMAX1-RV32-NEXT: vand.vx v27, v26, a1 -; LMULMAX1-RV32-NEXT: vsll.vi v27, v27, 2 -; LMULMAX1-RV32-NEXT: vand.vx v26, v26, a2 -; LMULMAX1-RV32-NEXT: vsrl.vi v26, v26, 2 -; LMULMAX1-RV32-NEXT: vor.vv v26, v26, v27 -; LMULMAX1-RV32-NEXT: vand.vx v27, v26, a3 -; LMULMAX1-RV32-NEXT: vadd.vv v27, v27, v27 +; LMULMAX1-RV32-NEXT: vsrl.vi v27, v26, 4 +; LMULMAX1-RV32-NEXT: vand.vx v27, v27, a4 ; LMULMAX1-RV32-NEXT: vand.vx v26, v26, a4 -; LMULMAX1-RV32-NEXT: vsrl.vi v26, v26, 1 -; LMULMAX1-RV32-NEXT: vor.vv v26, v26, v27 +; LMULMAX1-RV32-NEXT: 
vsll.vi v26, v26, 4 +; LMULMAX1-RV32-NEXT: vor.vv v26, v27, v26 +; LMULMAX1-RV32-NEXT: vsrl.vi v27, v26, 2 +; LMULMAX1-RV32-NEXT: vand.vx v27, v27, a5 +; LMULMAX1-RV32-NEXT: vand.vx v26, v26, a5 +; LMULMAX1-RV32-NEXT: vsll.vi v26, v26, 2 +; LMULMAX1-RV32-NEXT: vor.vv v26, v27, v26 +; LMULMAX1-RV32-NEXT: vsrl.vi v27, v26, 1 +; LMULMAX1-RV32-NEXT: vand.vx v27, v27, a1 +; LMULMAX1-RV32-NEXT: vand.vx v26, v26, a1 +; LMULMAX1-RV32-NEXT: vadd.vv v26, v26, v26 +; LMULMAX1-RV32-NEXT: vor.vv v26, v27, v26 ; LMULMAX1-RV32-NEXT: vse32.v v26, (a0) ; LMULMAX1-RV32-NEXT: vse32.v v25, (a6) ; LMULMAX1-RV32-NEXT: ret @@ -1155,68 +969,56 @@ define void @bitreverse_v8i32(<8 x i32>* %x, <8 x i32>* %y) { ; LMULMAX1-RV64-NEXT: vsrl.vi v28, v25, 24 ; LMULMAX1-RV64-NEXT: vor.vv v27, v27, v28 ; LMULMAX1-RV64-NEXT: vsll.vi v28, v25, 8 -; LMULMAX1-RV64-NEXT: lui a7, 4080 -; LMULMAX1-RV64-NEXT: vand.vx v28, v28, a7 +; LMULMAX1-RV64-NEXT: lui a3, 4080 +; LMULMAX1-RV64-NEXT: vand.vx v28, v28, a3 ; LMULMAX1-RV64-NEXT: vsll.vi v25, v25, 24 ; LMULMAX1-RV64-NEXT: vor.vv v25, v25, v28 ; LMULMAX1-RV64-NEXT: vor.vv v25, v25, v27 +; LMULMAX1-RV64-NEXT: vsrl.vi v27, v25, 4 ; LMULMAX1-RV64-NEXT: lui a4, 61681 ; LMULMAX1-RV64-NEXT: addiw a4, a4, -241 -; LMULMAX1-RV64-NEXT: vand.vx v27, v25, a4 -; LMULMAX1-RV64-NEXT: vsll.vi v27, v27, 4 -; LMULMAX1-RV64-NEXT: lui a5, 241 -; LMULMAX1-RV64-NEXT: addiw a5, a5, -241 -; LMULMAX1-RV64-NEXT: slli a5, a5, 12 -; LMULMAX1-RV64-NEXT: addi t0, a5, 240 -; LMULMAX1-RV64-NEXT: vand.vx v25, v25, t0 -; LMULMAX1-RV64-NEXT: vsrl.vi v25, v25, 4 -; LMULMAX1-RV64-NEXT: vor.vv v25, v25, v27 -; LMULMAX1-RV64-NEXT: lui a1, 209715 -; LMULMAX1-RV64-NEXT: addiw a1, a1, 819 -; LMULMAX1-RV64-NEXT: vand.vx v27, v25, a1 -; LMULMAX1-RV64-NEXT: vsll.vi v27, v27, 2 -; LMULMAX1-RV64-NEXT: lui a3, 205 -; LMULMAX1-RV64-NEXT: addiw a3, a3, -819 -; LMULMAX1-RV64-NEXT: slli a3, a3, 12 -; LMULMAX1-RV64-NEXT: addi t1, a3, -820 -; LMULMAX1-RV64-NEXT: vand.vx v25, v25, t1 -; LMULMAX1-RV64-NEXT: vsrl.vi v25, v25, 2 -; LMULMAX1-RV64-NEXT: vor.vv v25, v25, v27 -; LMULMAX1-RV64-NEXT: lui a5, 349525 -; LMULMAX1-RV64-NEXT: addiw a5, a5, 1365 -; LMULMAX1-RV64-NEXT: vand.vx v27, v25, a5 -; LMULMAX1-RV64-NEXT: vadd.vv v27, v27, v27 -; LMULMAX1-RV64-NEXT: lui a3, 171 -; LMULMAX1-RV64-NEXT: addiw a3, a3, -1365 -; LMULMAX1-RV64-NEXT: slli a3, a3, 12 -; LMULMAX1-RV64-NEXT: addi a3, a3, -1366 -; LMULMAX1-RV64-NEXT: vand.vx v25, v25, a3 -; LMULMAX1-RV64-NEXT: vsrl.vi v25, v25, 1 -; LMULMAX1-RV64-NEXT: vor.vv v25, v25, v27 +; LMULMAX1-RV64-NEXT: vand.vx v27, v27, a4 +; LMULMAX1-RV64-NEXT: vand.vx v25, v25, a4 +; LMULMAX1-RV64-NEXT: vsll.vi v25, v25, 4 +; LMULMAX1-RV64-NEXT: vor.vv v25, v27, v25 +; LMULMAX1-RV64-NEXT: vsrl.vi v27, v25, 2 +; LMULMAX1-RV64-NEXT: lui a5, 209715 +; LMULMAX1-RV64-NEXT: addiw a5, a5, 819 +; LMULMAX1-RV64-NEXT: vand.vx v27, v27, a5 +; LMULMAX1-RV64-NEXT: vand.vx v25, v25, a5 +; LMULMAX1-RV64-NEXT: vsll.vi v25, v25, 2 +; LMULMAX1-RV64-NEXT: vor.vv v25, v27, v25 +; LMULMAX1-RV64-NEXT: vsrl.vi v27, v25, 1 +; LMULMAX1-RV64-NEXT: lui a1, 349525 +; LMULMAX1-RV64-NEXT: addiw a1, a1, 1365 +; LMULMAX1-RV64-NEXT: vand.vx v27, v27, a1 +; LMULMAX1-RV64-NEXT: vand.vx v25, v25, a1 +; LMULMAX1-RV64-NEXT: vadd.vv v25, v25, v25 +; LMULMAX1-RV64-NEXT: vor.vv v25, v27, v25 ; LMULMAX1-RV64-NEXT: vsrl.vi v27, v26, 8 ; LMULMAX1-RV64-NEXT: vand.vx v27, v27, a2 ; LMULMAX1-RV64-NEXT: vsrl.vi v28, v26, 24 ; LMULMAX1-RV64-NEXT: vor.vv v27, v27, v28 ; LMULMAX1-RV64-NEXT: vsll.vi v28, v26, 8 -; LMULMAX1-RV64-NEXT: vand.vx v28, v28, a7 +; 
LMULMAX1-RV64-NEXT: vand.vx v28, v28, a3 ; LMULMAX1-RV64-NEXT: vsll.vi v26, v26, 24 ; LMULMAX1-RV64-NEXT: vor.vv v26, v26, v28 ; LMULMAX1-RV64-NEXT: vor.vv v26, v26, v27 -; LMULMAX1-RV64-NEXT: vand.vx v27, v26, a4 -; LMULMAX1-RV64-NEXT: vsll.vi v27, v27, 4 -; LMULMAX1-RV64-NEXT: vand.vx v26, v26, t0 -; LMULMAX1-RV64-NEXT: vsrl.vi v26, v26, 4 -; LMULMAX1-RV64-NEXT: vor.vv v26, v26, v27 -; LMULMAX1-RV64-NEXT: vand.vx v27, v26, a1 -; LMULMAX1-RV64-NEXT: vsll.vi v27, v27, 2 -; LMULMAX1-RV64-NEXT: vand.vx v26, v26, t1 -; LMULMAX1-RV64-NEXT: vsrl.vi v26, v26, 2 -; LMULMAX1-RV64-NEXT: vor.vv v26, v26, v27 -; LMULMAX1-RV64-NEXT: vand.vx v27, v26, a5 -; LMULMAX1-RV64-NEXT: vadd.vv v27, v27, v27 -; LMULMAX1-RV64-NEXT: vand.vx v26, v26, a3 -; LMULMAX1-RV64-NEXT: vsrl.vi v26, v26, 1 -; LMULMAX1-RV64-NEXT: vor.vv v26, v26, v27 +; LMULMAX1-RV64-NEXT: vsrl.vi v27, v26, 4 +; LMULMAX1-RV64-NEXT: vand.vx v27, v27, a4 +; LMULMAX1-RV64-NEXT: vand.vx v26, v26, a4 +; LMULMAX1-RV64-NEXT: vsll.vi v26, v26, 4 +; LMULMAX1-RV64-NEXT: vor.vv v26, v27, v26 +; LMULMAX1-RV64-NEXT: vsrl.vi v27, v26, 2 +; LMULMAX1-RV64-NEXT: vand.vx v27, v27, a5 +; LMULMAX1-RV64-NEXT: vand.vx v26, v26, a5 +; LMULMAX1-RV64-NEXT: vsll.vi v26, v26, 2 +; LMULMAX1-RV64-NEXT: vor.vv v26, v27, v26 +; LMULMAX1-RV64-NEXT: vsrl.vi v27, v26, 1 +; LMULMAX1-RV64-NEXT: vand.vx v27, v27, a1 +; LMULMAX1-RV64-NEXT: vand.vx v26, v26, a1 +; LMULMAX1-RV64-NEXT: vadd.vv v26, v26, v26 +; LMULMAX1-RV64-NEXT: vor.vv v26, v27, v26 ; LMULMAX1-RV64-NEXT: vse32.v v26, (a0) ; LMULMAX1-RV64-NEXT: vse32.v v25, (a6) ; LMULMAX1-RV64-NEXT: ret @@ -1280,51 +1082,36 @@ define void @bitreverse_v4i64(<4 x i64>* %x, <4 x i64>* %y) { ; LMULMAX2-RV32-NEXT: vor.vv v26, v26, v8 ; LMULMAX2-RV32-NEXT: vor.vv v26, v26, v30 ; LMULMAX2-RV32-NEXT: vor.vv v26, v26, v28 +; LMULMAX2-RV32-NEXT: vsrl.vi v28, v26, 4 ; LMULMAX2-RV32-NEXT: lui a1, 61681 ; LMULMAX2-RV32-NEXT: addi a1, a1, -241 ; LMULMAX2-RV32-NEXT: vsetivli zero, 8, e32, m2, ta, mu -; LMULMAX2-RV32-NEXT: vmv.v.x v28, a1 -; LMULMAX2-RV32-NEXT: vsetivli zero, 4, e64, m2, ta, mu -; LMULMAX2-RV32-NEXT: vand.vv v28, v26, v28 -; LMULMAX2-RV32-NEXT: vsll.vi v28, v28, 4 -; LMULMAX2-RV32-NEXT: lui a1, 986895 -; LMULMAX2-RV32-NEXT: addi a1, a1, 240 -; LMULMAX2-RV32-NEXT: vsetivli zero, 8, e32, m2, ta, mu ; LMULMAX2-RV32-NEXT: vmv.v.x v30, a1 ; LMULMAX2-RV32-NEXT: vsetivli zero, 4, e64, m2, ta, mu +; LMULMAX2-RV32-NEXT: vand.vv v28, v28, v30 ; LMULMAX2-RV32-NEXT: vand.vv v26, v26, v30 -; LMULMAX2-RV32-NEXT: vsrl.vi v26, v26, 4 -; LMULMAX2-RV32-NEXT: vor.vv v26, v26, v28 +; LMULMAX2-RV32-NEXT: vsll.vi v26, v26, 4 +; LMULMAX2-RV32-NEXT: vor.vv v26, v28, v26 +; LMULMAX2-RV32-NEXT: vsrl.vi v28, v26, 2 ; LMULMAX2-RV32-NEXT: lui a1, 209715 ; LMULMAX2-RV32-NEXT: addi a1, a1, 819 ; LMULMAX2-RV32-NEXT: vsetivli zero, 8, e32, m2, ta, mu -; LMULMAX2-RV32-NEXT: vmv.v.x v28, a1 -; LMULMAX2-RV32-NEXT: vsetivli zero, 4, e64, m2, ta, mu -; LMULMAX2-RV32-NEXT: vand.vv v28, v26, v28 -; LMULMAX2-RV32-NEXT: vsll.vi v28, v28, 2 -; LMULMAX2-RV32-NEXT: lui a1, 838861 -; LMULMAX2-RV32-NEXT: addi a1, a1, -820 -; LMULMAX2-RV32-NEXT: vsetivli zero, 8, e32, m2, ta, mu ; LMULMAX2-RV32-NEXT: vmv.v.x v30, a1 ; LMULMAX2-RV32-NEXT: vsetivli zero, 4, e64, m2, ta, mu +; LMULMAX2-RV32-NEXT: vand.vv v28, v28, v30 ; LMULMAX2-RV32-NEXT: vand.vv v26, v26, v30 -; LMULMAX2-RV32-NEXT: vsrl.vi v26, v26, 2 -; LMULMAX2-RV32-NEXT: vor.vv v26, v26, v28 +; LMULMAX2-RV32-NEXT: vsll.vi v26, v26, 2 +; LMULMAX2-RV32-NEXT: vor.vv v26, v28, v26 +; LMULMAX2-RV32-NEXT: vsrl.vi v28, v26, 1 ; 
LMULMAX2-RV32-NEXT: lui a1, 349525 ; LMULMAX2-RV32-NEXT: addi a1, a1, 1365 ; LMULMAX2-RV32-NEXT: vsetivli zero, 8, e32, m2, ta, mu -; LMULMAX2-RV32-NEXT: vmv.v.x v28, a1 -; LMULMAX2-RV32-NEXT: vsetivli zero, 4, e64, m2, ta, mu -; LMULMAX2-RV32-NEXT: vand.vv v28, v26, v28 -; LMULMAX2-RV32-NEXT: vadd.vv v28, v28, v28 -; LMULMAX2-RV32-NEXT: lui a1, 699051 -; LMULMAX2-RV32-NEXT: addi a1, a1, -1366 -; LMULMAX2-RV32-NEXT: vsetivli zero, 8, e32, m2, ta, mu ; LMULMAX2-RV32-NEXT: vmv.v.x v30, a1 ; LMULMAX2-RV32-NEXT: vsetivli zero, 4, e64, m2, ta, mu +; LMULMAX2-RV32-NEXT: vand.vv v28, v28, v30 ; LMULMAX2-RV32-NEXT: vand.vv v26, v26, v30 -; LMULMAX2-RV32-NEXT: vsrl.vi v26, v26, 1 -; LMULMAX2-RV32-NEXT: vor.vv v26, v26, v28 +; LMULMAX2-RV32-NEXT: vadd.vv v26, v26, v26 +; LMULMAX2-RV32-NEXT: vor.vv v26, v28, v26 ; LMULMAX2-RV32-NEXT: vse64.v v26, (a0) ; LMULMAX2-RV32-NEXT: ret ; @@ -1363,6 +1150,7 @@ define void @bitreverse_v4i64(<4 x i64>* %x, <4 x i64>* %y) { ; LMULMAX2-RV64-NEXT: vor.vv v26, v8, v26 ; LMULMAX2-RV64-NEXT: vor.vv v26, v26, v30 ; LMULMAX2-RV64-NEXT: vor.vv v26, v26, v28 +; LMULMAX2-RV64-NEXT: vsrl.vi v28, v26, 4 ; LMULMAX2-RV64-NEXT: lui a1, 3855 ; LMULMAX2-RV64-NEXT: addiw a1, a1, 241 ; LMULMAX2-RV64-NEXT: slli a1, a1, 12 @@ -1371,19 +1159,11 @@ define void @bitreverse_v4i64(<4 x i64>* %x, <4 x i64>* %y) { ; LMULMAX2-RV64-NEXT: addi a1, a1, 241 ; LMULMAX2-RV64-NEXT: slli a1, a1, 12 ; LMULMAX2-RV64-NEXT: addi a1, a1, -241 -; LMULMAX2-RV64-NEXT: vand.vx v28, v26, a1 -; LMULMAX2-RV64-NEXT: vsll.vi v28, v28, 4 -; LMULMAX2-RV64-NEXT: lui a1, 1044721 -; LMULMAX2-RV64-NEXT: addiw a1, a1, -241 -; LMULMAX2-RV64-NEXT: slli a1, a1, 12 -; LMULMAX2-RV64-NEXT: addi a1, a1, 241 -; LMULMAX2-RV64-NEXT: slli a1, a1, 12 -; LMULMAX2-RV64-NEXT: addi a1, a1, -241 -; LMULMAX2-RV64-NEXT: slli a1, a1, 12 -; LMULMAX2-RV64-NEXT: addi a1, a1, 240 +; LMULMAX2-RV64-NEXT: vand.vx v28, v28, a1 ; LMULMAX2-RV64-NEXT: vand.vx v26, v26, a1 -; LMULMAX2-RV64-NEXT: vsrl.vi v26, v26, 4 -; LMULMAX2-RV64-NEXT: vor.vv v26, v26, v28 +; LMULMAX2-RV64-NEXT: vsll.vi v26, v26, 4 +; LMULMAX2-RV64-NEXT: vor.vv v26, v28, v26 +; LMULMAX2-RV64-NEXT: vsrl.vi v28, v26, 2 ; LMULMAX2-RV64-NEXT: lui a1, 13107 ; LMULMAX2-RV64-NEXT: addiw a1, a1, 819 ; LMULMAX2-RV64-NEXT: slli a1, a1, 12 @@ -1392,19 +1172,11 @@ define void @bitreverse_v4i64(<4 x i64>* %x, <4 x i64>* %y) { ; LMULMAX2-RV64-NEXT: addi a1, a1, 819 ; LMULMAX2-RV64-NEXT: slli a1, a1, 12 ; LMULMAX2-RV64-NEXT: addi a1, a1, 819 -; LMULMAX2-RV64-NEXT: vand.vx v28, v26, a1 -; LMULMAX2-RV64-NEXT: vsll.vi v28, v28, 2 -; LMULMAX2-RV64-NEXT: lui a1, 1035469 -; LMULMAX2-RV64-NEXT: addiw a1, a1, -819 -; LMULMAX2-RV64-NEXT: slli a1, a1, 12 -; LMULMAX2-RV64-NEXT: addi a1, a1, -819 -; LMULMAX2-RV64-NEXT: slli a1, a1, 12 -; LMULMAX2-RV64-NEXT: addi a1, a1, -819 -; LMULMAX2-RV64-NEXT: slli a1, a1, 12 -; LMULMAX2-RV64-NEXT: addi a1, a1, -820 +; LMULMAX2-RV64-NEXT: vand.vx v28, v28, a1 ; LMULMAX2-RV64-NEXT: vand.vx v26, v26, a1 -; LMULMAX2-RV64-NEXT: vsrl.vi v26, v26, 2 -; LMULMAX2-RV64-NEXT: vor.vv v26, v26, v28 +; LMULMAX2-RV64-NEXT: vsll.vi v26, v26, 2 +; LMULMAX2-RV64-NEXT: vor.vv v26, v28, v26 +; LMULMAX2-RV64-NEXT: vsrl.vi v28, v26, 1 ; LMULMAX2-RV64-NEXT: lui a1, 21845 ; LMULMAX2-RV64-NEXT: addiw a1, a1, 1365 ; LMULMAX2-RV64-NEXT: slli a1, a1, 12 @@ -1413,19 +1185,10 @@ define void @bitreverse_v4i64(<4 x i64>* %x, <4 x i64>* %y) { ; LMULMAX2-RV64-NEXT: addi a1, a1, 1365 ; LMULMAX2-RV64-NEXT: slli a1, a1, 12 ; LMULMAX2-RV64-NEXT: addi a1, a1, 1365 -; LMULMAX2-RV64-NEXT: vand.vx v28, v26, a1 
-; LMULMAX2-RV64-NEXT: vadd.vv v28, v28, v28 -; LMULMAX2-RV64-NEXT: lui a1, 1026731 -; LMULMAX2-RV64-NEXT: addiw a1, a1, -1365 -; LMULMAX2-RV64-NEXT: slli a1, a1, 12 -; LMULMAX2-RV64-NEXT: addi a1, a1, -1365 -; LMULMAX2-RV64-NEXT: slli a1, a1, 12 -; LMULMAX2-RV64-NEXT: addi a1, a1, -1365 -; LMULMAX2-RV64-NEXT: slli a1, a1, 12 -; LMULMAX2-RV64-NEXT: addi a1, a1, -1366 +; LMULMAX2-RV64-NEXT: vand.vx v28, v28, a1 ; LMULMAX2-RV64-NEXT: vand.vx v26, v26, a1 -; LMULMAX2-RV64-NEXT: vsrl.vi v26, v26, 1 -; LMULMAX2-RV64-NEXT: vor.vv v26, v26, v28 +; LMULMAX2-RV64-NEXT: vadd.vv v26, v26, v26 +; LMULMAX2-RV64-NEXT: vor.vv v26, v28, v26 ; LMULMAX2-RV64-NEXT: vse64.v v26, (a0) ; LMULMAX2-RV64-NEXT: ret ; @@ -1433,17 +1196,17 @@ define void @bitreverse_v4i64(<4 x i64>* %x, <4 x i64>* %y) { ; LMULMAX1-RV32: # %bb.0: ; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, mu ; LMULMAX1-RV32-NEXT: addi a6, a0, 16 -; LMULMAX1-RV32-NEXT: vle64.v v30, (a6) +; LMULMAX1-RV32-NEXT: vle64.v v29, (a6) ; LMULMAX1-RV32-NEXT: vle64.v v25, (a0) ; LMULMAX1-RV32-NEXT: addi a2, zero, 56 -; LMULMAX1-RV32-NEXT: vsrl.vx v26, v30, a2 +; LMULMAX1-RV32-NEXT: vsrl.vx v26, v29, a2 ; LMULMAX1-RV32-NEXT: addi a3, zero, 40 -; LMULMAX1-RV32-NEXT: vsrl.vx v27, v30, a3 +; LMULMAX1-RV32-NEXT: vsrl.vx v27, v29, a3 ; LMULMAX1-RV32-NEXT: lui a4, 16 ; LMULMAX1-RV32-NEXT: addi a4, a4, -256 ; LMULMAX1-RV32-NEXT: vand.vx v27, v27, a4 ; LMULMAX1-RV32-NEXT: vor.vv v27, v27, v26 -; LMULMAX1-RV32-NEXT: vsrl.vi v26, v30, 24 +; LMULMAX1-RV32-NEXT: vsrl.vi v26, v29, 24 ; LMULMAX1-RV32-NEXT: lui a5, 4080 ; LMULMAX1-RV32-NEXT: vand.vx v28, v26, a5 ; LMULMAX1-RV32-NEXT: addi a1, zero, 5 @@ -1454,125 +1217,106 @@ define void @bitreverse_v4i64(<4 x i64>* %x, <4 x i64>* %y) { ; LMULMAX1-RV32-NEXT: lui a1, 1044480 ; LMULMAX1-RV32-NEXT: vmerge.vxm v26, v26, a1, v0 ; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, mu -; LMULMAX1-RV32-NEXT: vsrl.vi v29, v30, 8 -; LMULMAX1-RV32-NEXT: vand.vv v29, v29, v26 -; LMULMAX1-RV32-NEXT: vor.vv v28, v29, v28 -; LMULMAX1-RV32-NEXT: vor.vv v31, v28, v27 +; LMULMAX1-RV32-NEXT: vsrl.vi v30, v29, 8 +; LMULMAX1-RV32-NEXT: vand.vv v30, v30, v26 +; LMULMAX1-RV32-NEXT: vor.vv v28, v30, v28 +; LMULMAX1-RV32-NEXT: vor.vv v30, v28, v27 ; LMULMAX1-RV32-NEXT: addi a1, zero, 255 ; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, mu ; LMULMAX1-RV32-NEXT: vmv.v.x v27, a1 ; LMULMAX1-RV32-NEXT: vmerge.vim v27, v27, 0, v0 ; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, mu -; LMULMAX1-RV32-NEXT: vsll.vi v28, v30, 8 -; LMULMAX1-RV32-NEXT: vand.vv v29, v28, v27 +; LMULMAX1-RV32-NEXT: vsll.vi v28, v29, 8 +; LMULMAX1-RV32-NEXT: vand.vv v31, v28, v27 ; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, mu ; LMULMAX1-RV32-NEXT: vmv.v.x v28, a4 ; LMULMAX1-RV32-NEXT: vmerge.vim v28, v28, 0, v0 ; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, mu -; LMULMAX1-RV32-NEXT: vsll.vi v8, v30, 24 +; LMULMAX1-RV32-NEXT: vsll.vi v8, v29, 24 ; LMULMAX1-RV32-NEXT: vand.vv v8, v8, v28 -; LMULMAX1-RV32-NEXT: vor.vv v8, v8, v29 -; LMULMAX1-RV32-NEXT: vsll.vx v9, v30, a3 +; LMULMAX1-RV32-NEXT: vor.vv v31, v8, v31 +; LMULMAX1-RV32-NEXT: vsll.vx v8, v29, a3 ; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, mu -; LMULMAX1-RV32-NEXT: vmv.v.x v29, a5 -; LMULMAX1-RV32-NEXT: vmerge.vim v29, v29, 0, v0 +; LMULMAX1-RV32-NEXT: vmv.v.x v9, a5 +; LMULMAX1-RV32-NEXT: vmerge.vim v9, v9, 0, v0 ; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, mu -; LMULMAX1-RV32-NEXT: vand.vv v9, v9, v29 -; LMULMAX1-RV32-NEXT: vsll.vx v30, v30, a2 -; LMULMAX1-RV32-NEXT: vor.vv 
v30, v30, v9 -; LMULMAX1-RV32-NEXT: vor.vv v30, v30, v8 -; LMULMAX1-RV32-NEXT: vor.vv v31, v30, v31 +; LMULMAX1-RV32-NEXT: vand.vv v8, v8, v9 +; LMULMAX1-RV32-NEXT: vsll.vx v29, v29, a2 +; LMULMAX1-RV32-NEXT: vor.vv v29, v29, v8 +; LMULMAX1-RV32-NEXT: vor.vv v29, v29, v31 +; LMULMAX1-RV32-NEXT: vor.vv v29, v29, v30 +; LMULMAX1-RV32-NEXT: vsrl.vi v30, v29, 4 ; LMULMAX1-RV32-NEXT: lui a1, 61681 ; LMULMAX1-RV32-NEXT: addi a1, a1, -241 ; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, mu -; LMULMAX1-RV32-NEXT: vmv.v.x v30, a1 +; LMULMAX1-RV32-NEXT: vmv.v.x v31, a1 ; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, mu -; LMULMAX1-RV32-NEXT: vand.vv v8, v31, v30 -; LMULMAX1-RV32-NEXT: vsll.vi v8, v8, 4 -; LMULMAX1-RV32-NEXT: lui a1, 986895 -; LMULMAX1-RV32-NEXT: addi a1, a1, 240 -; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, mu -; LMULMAX1-RV32-NEXT: vmv.v.x v9, a1 -; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, mu -; LMULMAX1-RV32-NEXT: vand.vv v31, v31, v9 -; LMULMAX1-RV32-NEXT: vsrl.vi v31, v31, 4 -; LMULMAX1-RV32-NEXT: vor.vv v31, v31, v8 +; LMULMAX1-RV32-NEXT: vand.vv v30, v30, v31 +; LMULMAX1-RV32-NEXT: vand.vv v29, v29, v31 +; LMULMAX1-RV32-NEXT: vsll.vi v29, v29, 4 +; LMULMAX1-RV32-NEXT: vor.vv v29, v30, v29 +; LMULMAX1-RV32-NEXT: vsrl.vi v30, v29, 2 ; LMULMAX1-RV32-NEXT: lui a1, 209715 ; LMULMAX1-RV32-NEXT: addi a1, a1, 819 ; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, mu ; LMULMAX1-RV32-NEXT: vmv.v.x v8, a1 ; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, mu -; LMULMAX1-RV32-NEXT: vand.vv v10, v31, v8 -; LMULMAX1-RV32-NEXT: vsll.vi v10, v10, 2 -; LMULMAX1-RV32-NEXT: lui a1, 838861 -; LMULMAX1-RV32-NEXT: addi a1, a1, -820 -; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, mu -; LMULMAX1-RV32-NEXT: vmv.v.x v11, a1 -; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, mu -; LMULMAX1-RV32-NEXT: vand.vv v31, v31, v11 -; LMULMAX1-RV32-NEXT: vsrl.vi v31, v31, 2 -; LMULMAX1-RV32-NEXT: vor.vv v31, v31, v10 +; LMULMAX1-RV32-NEXT: vand.vv v30, v30, v8 +; LMULMAX1-RV32-NEXT: vand.vv v29, v29, v8 +; LMULMAX1-RV32-NEXT: vsll.vi v29, v29, 2 +; LMULMAX1-RV32-NEXT: vor.vv v29, v30, v29 +; LMULMAX1-RV32-NEXT: vsrl.vi v30, v29, 1 ; LMULMAX1-RV32-NEXT: lui a1, 349525 ; LMULMAX1-RV32-NEXT: addi a1, a1, 1365 ; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, mu ; LMULMAX1-RV32-NEXT: vmv.v.x v10, a1 ; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, mu -; LMULMAX1-RV32-NEXT: vand.vv v12, v31, v10 -; LMULMAX1-RV32-NEXT: vadd.vv v12, v12, v12 -; LMULMAX1-RV32-NEXT: lui a1, 699051 -; LMULMAX1-RV32-NEXT: addi a1, a1, -1366 -; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, mu -; LMULMAX1-RV32-NEXT: vmv.v.x v13, a1 -; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, mu -; LMULMAX1-RV32-NEXT: vand.vv v31, v31, v13 -; LMULMAX1-RV32-NEXT: vsrl.vi v31, v31, 1 -; LMULMAX1-RV32-NEXT: vor.vv v31, v31, v12 -; LMULMAX1-RV32-NEXT: vsrl.vx v12, v25, a2 -; LMULMAX1-RV32-NEXT: vsrl.vx v14, v25, a3 -; LMULMAX1-RV32-NEXT: vand.vx v14, v14, a4 -; LMULMAX1-RV32-NEXT: vor.vv v12, v14, v12 -; LMULMAX1-RV32-NEXT: vsrl.vi v14, v25, 24 -; LMULMAX1-RV32-NEXT: vand.vx v14, v14, a5 -; LMULMAX1-RV32-NEXT: vsrl.vi v15, v25, 8 -; LMULMAX1-RV32-NEXT: vand.vv v26, v15, v26 -; LMULMAX1-RV32-NEXT: vor.vv v26, v26, v14 -; LMULMAX1-RV32-NEXT: vor.vv v26, v26, v12 -; LMULMAX1-RV32-NEXT: vsll.vi v12, v25, 8 -; LMULMAX1-RV32-NEXT: vand.vv v27, v12, v27 -; LMULMAX1-RV32-NEXT: vsll.vi v12, v25, 24 -; LMULMAX1-RV32-NEXT: vand.vv v28, v12, v28 +; LMULMAX1-RV32-NEXT: vand.vv v30, v30, v10 +; LMULMAX1-RV32-NEXT: 
vand.vv v29, v29, v10 +; LMULMAX1-RV32-NEXT: vadd.vv v29, v29, v29 +; LMULMAX1-RV32-NEXT: vor.vv v29, v30, v29 +; LMULMAX1-RV32-NEXT: vsrl.vx v30, v25, a2 +; LMULMAX1-RV32-NEXT: vsrl.vx v11, v25, a3 +; LMULMAX1-RV32-NEXT: vand.vx v11, v11, a4 +; LMULMAX1-RV32-NEXT: vor.vv v30, v11, v30 +; LMULMAX1-RV32-NEXT: vsrl.vi v11, v25, 24 +; LMULMAX1-RV32-NEXT: vand.vx v11, v11, a5 +; LMULMAX1-RV32-NEXT: vsrl.vi v12, v25, 8 +; LMULMAX1-RV32-NEXT: vand.vv v26, v12, v26 +; LMULMAX1-RV32-NEXT: vor.vv v26, v26, v11 +; LMULMAX1-RV32-NEXT: vor.vv v26, v26, v30 +; LMULMAX1-RV32-NEXT: vsll.vi v30, v25, 8 +; LMULMAX1-RV32-NEXT: vand.vv v27, v30, v27 +; LMULMAX1-RV32-NEXT: vsll.vi v30, v25, 24 +; LMULMAX1-RV32-NEXT: vand.vv v28, v30, v28 ; LMULMAX1-RV32-NEXT: vor.vv v27, v28, v27 ; LMULMAX1-RV32-NEXT: vsll.vx v28, v25, a3 -; LMULMAX1-RV32-NEXT: vand.vv v28, v28, v29 +; LMULMAX1-RV32-NEXT: vand.vv v28, v28, v9 ; LMULMAX1-RV32-NEXT: vsll.vx v25, v25, a2 ; LMULMAX1-RV32-NEXT: vor.vv v25, v25, v28 ; LMULMAX1-RV32-NEXT: vor.vv v25, v25, v27 ; LMULMAX1-RV32-NEXT: vor.vv v25, v25, v26 -; LMULMAX1-RV32-NEXT: vand.vv v26, v25, v30 -; LMULMAX1-RV32-NEXT: vsll.vi v26, v26, 4 -; LMULMAX1-RV32-NEXT: vand.vv v25, v25, v9 -; LMULMAX1-RV32-NEXT: vsrl.vi v25, v25, 4 -; LMULMAX1-RV32-NEXT: vor.vv v25, v25, v26 -; LMULMAX1-RV32-NEXT: vand.vv v26, v25, v8 -; LMULMAX1-RV32-NEXT: vsll.vi v26, v26, 2 -; LMULMAX1-RV32-NEXT: vand.vv v25, v25, v11 -; LMULMAX1-RV32-NEXT: vsrl.vi v25, v25, 2 -; LMULMAX1-RV32-NEXT: vor.vv v25, v25, v26 -; LMULMAX1-RV32-NEXT: vand.vv v26, v25, v10 -; LMULMAX1-RV32-NEXT: vadd.vv v26, v26, v26 -; LMULMAX1-RV32-NEXT: vand.vv v25, v25, v13 -; LMULMAX1-RV32-NEXT: vsrl.vi v25, v25, 1 -; LMULMAX1-RV32-NEXT: vor.vv v25, v25, v26 +; LMULMAX1-RV32-NEXT: vsrl.vi v26, v25, 4 +; LMULMAX1-RV32-NEXT: vand.vv v26, v26, v31 +; LMULMAX1-RV32-NEXT: vand.vv v25, v25, v31 +; LMULMAX1-RV32-NEXT: vsll.vi v25, v25, 4 +; LMULMAX1-RV32-NEXT: vor.vv v25, v26, v25 +; LMULMAX1-RV32-NEXT: vsrl.vi v26, v25, 2 +; LMULMAX1-RV32-NEXT: vand.vv v26, v26, v8 +; LMULMAX1-RV32-NEXT: vand.vv v25, v25, v8 +; LMULMAX1-RV32-NEXT: vsll.vi v25, v25, 2 +; LMULMAX1-RV32-NEXT: vor.vv v25, v26, v25 +; LMULMAX1-RV32-NEXT: vsrl.vi v26, v25, 1 +; LMULMAX1-RV32-NEXT: vand.vv v26, v26, v10 +; LMULMAX1-RV32-NEXT: vand.vv v25, v25, v10 +; LMULMAX1-RV32-NEXT: vadd.vv v25, v25, v25 +; LMULMAX1-RV32-NEXT: vor.vv v25, v26, v25 ; LMULMAX1-RV32-NEXT: vse64.v v25, (a0) -; LMULMAX1-RV32-NEXT: vse64.v v31, (a6) +; LMULMAX1-RV32-NEXT: vse64.v v29, (a6) ; LMULMAX1-RV32-NEXT: ret ; ; LMULMAX1-RV64-LABEL: bitreverse_v4i64: ; LMULMAX1-RV64: # %bb.0: -; LMULMAX1-RV64-NEXT: addi sp, sp, -16 -; LMULMAX1-RV64-NEXT: .cfi_def_cfa_offset 16 -; LMULMAX1-RV64-NEXT: sd s0, 8(sp) # 8-byte Folded Spill -; LMULMAX1-RV64-NEXT: .cfi_offset s0, -8 ; LMULMAX1-RV64-NEXT: vsetivli zero, 2, e64, m1, ta, mu ; LMULMAX1-RV64-NEXT: addi a6, a0, 16 ; LMULMAX1-RV64-NEXT: vle64.v v26, (a6) @@ -1581,33 +1325,34 @@ define void @bitreverse_v4i64(<4 x i64>* %x, <4 x i64>* %y) { ; LMULMAX1-RV64-NEXT: vsrl.vx v27, v26, t0 ; LMULMAX1-RV64-NEXT: addi t1, zero, 40 ; LMULMAX1-RV64-NEXT: vsrl.vx v28, v26, t1 -; LMULMAX1-RV64-NEXT: lui a1, 16 -; LMULMAX1-RV64-NEXT: addiw t4, a1, -256 -; LMULMAX1-RV64-NEXT: vand.vx v28, v28, t4 +; LMULMAX1-RV64-NEXT: lui a4, 16 +; LMULMAX1-RV64-NEXT: addiw t2, a4, -256 +; LMULMAX1-RV64-NEXT: vand.vx v28, v28, t2 ; LMULMAX1-RV64-NEXT: vor.vv v27, v28, v27 ; LMULMAX1-RV64-NEXT: vsrl.vi v28, v26, 24 ; LMULMAX1-RV64-NEXT: lui a7, 4080 ; LMULMAX1-RV64-NEXT: vand.vx v28, v28, a7 ; 
LMULMAX1-RV64-NEXT: vsrl.vi v29, v26, 8 -; LMULMAX1-RV64-NEXT: addi a3, zero, 255 -; LMULMAX1-RV64-NEXT: slli a1, a3, 24 -; LMULMAX1-RV64-NEXT: vand.vx v29, v29, a1 +; LMULMAX1-RV64-NEXT: addi a1, zero, 255 +; LMULMAX1-RV64-NEXT: slli t4, a1, 24 +; LMULMAX1-RV64-NEXT: vand.vx v29, v29, t4 ; LMULMAX1-RV64-NEXT: vor.vv v28, v29, v28 ; LMULMAX1-RV64-NEXT: vor.vv v27, v28, v27 ; LMULMAX1-RV64-NEXT: vsll.vi v28, v26, 8 -; LMULMAX1-RV64-NEXT: slli a5, a3, 32 -; LMULMAX1-RV64-NEXT: vand.vx v28, v28, a5 +; LMULMAX1-RV64-NEXT: slli a2, a1, 32 +; LMULMAX1-RV64-NEXT: vand.vx v28, v28, a2 ; LMULMAX1-RV64-NEXT: vsll.vi v29, v26, 24 -; LMULMAX1-RV64-NEXT: slli a2, a3, 40 -; LMULMAX1-RV64-NEXT: vand.vx v29, v29, a2 +; LMULMAX1-RV64-NEXT: slli a3, a1, 40 +; LMULMAX1-RV64-NEXT: vand.vx v29, v29, a3 ; LMULMAX1-RV64-NEXT: vor.vv v28, v29, v28 ; LMULMAX1-RV64-NEXT: vsll.vx v29, v26, t0 ; LMULMAX1-RV64-NEXT: vsll.vx v26, v26, t1 -; LMULMAX1-RV64-NEXT: slli a3, a3, 48 -; LMULMAX1-RV64-NEXT: vand.vx v26, v26, a3 +; LMULMAX1-RV64-NEXT: slli a1, a1, 48 +; LMULMAX1-RV64-NEXT: vand.vx v26, v26, a1 ; LMULMAX1-RV64-NEXT: vor.vv v26, v29, v26 ; LMULMAX1-RV64-NEXT: vor.vv v26, v26, v28 ; LMULMAX1-RV64-NEXT: vor.vv v26, v26, v27 +; LMULMAX1-RV64-NEXT: vsrl.vi v27, v26, 4 ; LMULMAX1-RV64-NEXT: lui a4, 3855 ; LMULMAX1-RV64-NEXT: addiw a4, a4, 241 ; LMULMAX1-RV64-NEXT: slli a4, a4, 12 @@ -1615,20 +1360,12 @@ define void @bitreverse_v4i64(<4 x i64>* %x, <4 x i64>* %y) { ; LMULMAX1-RV64-NEXT: slli a4, a4, 12 ; LMULMAX1-RV64-NEXT: addi a4, a4, 241 ; LMULMAX1-RV64-NEXT: slli a4, a4, 12 -; LMULMAX1-RV64-NEXT: addi t2, a4, -241 -; LMULMAX1-RV64-NEXT: vand.vx v27, v26, t2 -; LMULMAX1-RV64-NEXT: vsll.vi v27, v27, 4 -; LMULMAX1-RV64-NEXT: lui a4, 1044721 -; LMULMAX1-RV64-NEXT: addiw a4, a4, -241 -; LMULMAX1-RV64-NEXT: slli a4, a4, 12 -; LMULMAX1-RV64-NEXT: addi a4, a4, 241 -; LMULMAX1-RV64-NEXT: slli a4, a4, 12 -; LMULMAX1-RV64-NEXT: addi a4, a4, -241 -; LMULMAX1-RV64-NEXT: slli a4, a4, 12 -; LMULMAX1-RV64-NEXT: addi t3, a4, 240 +; LMULMAX1-RV64-NEXT: addi t3, a4, -241 +; LMULMAX1-RV64-NEXT: vand.vx v27, v27, t3 ; LMULMAX1-RV64-NEXT: vand.vx v26, v26, t3 -; LMULMAX1-RV64-NEXT: vsrl.vi v26, v26, 4 -; LMULMAX1-RV64-NEXT: vor.vv v26, v26, v27 +; LMULMAX1-RV64-NEXT: vsll.vi v26, v26, 4 +; LMULMAX1-RV64-NEXT: vor.vv v26, v27, v26 +; LMULMAX1-RV64-NEXT: vsrl.vi v27, v26, 2 ; LMULMAX1-RV64-NEXT: lui a4, 13107 ; LMULMAX1-RV64-NEXT: addiw a4, a4, 819 ; LMULMAX1-RV64-NEXT: slli a4, a4, 12 @@ -1636,81 +1373,62 @@ define void @bitreverse_v4i64(<4 x i64>* %x, <4 x i64>* %y) { ; LMULMAX1-RV64-NEXT: slli a4, a4, 12 ; LMULMAX1-RV64-NEXT: addi a4, a4, 819 ; LMULMAX1-RV64-NEXT: slli a4, a4, 12 -; LMULMAX1-RV64-NEXT: addi t5, a4, 819 -; LMULMAX1-RV64-NEXT: vand.vx v27, v26, t5 -; LMULMAX1-RV64-NEXT: vsll.vi v27, v27, 2 -; LMULMAX1-RV64-NEXT: lui a4, 1035469 -; LMULMAX1-RV64-NEXT: addiw a4, a4, -819 -; LMULMAX1-RV64-NEXT: slli a4, a4, 12 -; LMULMAX1-RV64-NEXT: addi a4, a4, -819 -; LMULMAX1-RV64-NEXT: slli a4, a4, 12 -; LMULMAX1-RV64-NEXT: addi a4, a4, -819 -; LMULMAX1-RV64-NEXT: slli a4, a4, 12 -; LMULMAX1-RV64-NEXT: addi t6, a4, -820 -; LMULMAX1-RV64-NEXT: vand.vx v26, v26, t6 -; LMULMAX1-RV64-NEXT: vsrl.vi v26, v26, 2 -; LMULMAX1-RV64-NEXT: vor.vv v26, v26, v27 -; LMULMAX1-RV64-NEXT: lui a4, 21845 -; LMULMAX1-RV64-NEXT: addiw a4, a4, 1365 -; LMULMAX1-RV64-NEXT: slli a4, a4, 12 -; LMULMAX1-RV64-NEXT: addi a4, a4, 1365 -; LMULMAX1-RV64-NEXT: slli a4, a4, 12 -; LMULMAX1-RV64-NEXT: addi a4, a4, 1365 -; LMULMAX1-RV64-NEXT: slli a4, a4, 12 -; 
LMULMAX1-RV64-NEXT: addi a4, a4, 1365 -; LMULMAX1-RV64-NEXT: vand.vx v27, v26, a4 -; LMULMAX1-RV64-NEXT: vadd.vv v27, v27, v27 -; LMULMAX1-RV64-NEXT: lui s0, 1026731 -; LMULMAX1-RV64-NEXT: addiw s0, s0, -1365 -; LMULMAX1-RV64-NEXT: slli s0, s0, 12 -; LMULMAX1-RV64-NEXT: addi s0, s0, -1365 -; LMULMAX1-RV64-NEXT: slli s0, s0, 12 -; LMULMAX1-RV64-NEXT: addi s0, s0, -1365 -; LMULMAX1-RV64-NEXT: slli s0, s0, 12 -; LMULMAX1-RV64-NEXT: addi s0, s0, -1366 -; LMULMAX1-RV64-NEXT: vand.vx v26, v26, s0 -; LMULMAX1-RV64-NEXT: vsrl.vi v26, v26, 1 -; LMULMAX1-RV64-NEXT: vor.vv v26, v26, v27 +; LMULMAX1-RV64-NEXT: addi a4, a4, 819 +; LMULMAX1-RV64-NEXT: vand.vx v27, v27, a4 +; LMULMAX1-RV64-NEXT: vand.vx v26, v26, a4 +; LMULMAX1-RV64-NEXT: vsll.vi v26, v26, 2 +; LMULMAX1-RV64-NEXT: vor.vv v26, v27, v26 +; LMULMAX1-RV64-NEXT: vsrl.vi v27, v26, 1 +; LMULMAX1-RV64-NEXT: lui a5, 21845 +; LMULMAX1-RV64-NEXT: addiw a5, a5, 1365 +; LMULMAX1-RV64-NEXT: slli a5, a5, 12 +; LMULMAX1-RV64-NEXT: addi a5, a5, 1365 +; LMULMAX1-RV64-NEXT: slli a5, a5, 12 +; LMULMAX1-RV64-NEXT: addi a5, a5, 1365 +; LMULMAX1-RV64-NEXT: slli a5, a5, 12 +; LMULMAX1-RV64-NEXT: addi a5, a5, 1365 +; LMULMAX1-RV64-NEXT: vand.vx v27, v27, a5 +; LMULMAX1-RV64-NEXT: vand.vx v26, v26, a5 +; LMULMAX1-RV64-NEXT: vadd.vv v26, v26, v26 +; LMULMAX1-RV64-NEXT: vor.vv v26, v27, v26 ; LMULMAX1-RV64-NEXT: vsrl.vx v27, v25, t0 ; LMULMAX1-RV64-NEXT: vsrl.vx v28, v25, t1 -; LMULMAX1-RV64-NEXT: vand.vx v28, v28, t4 +; LMULMAX1-RV64-NEXT: vand.vx v28, v28, t2 ; LMULMAX1-RV64-NEXT: vor.vv v27, v28, v27 ; LMULMAX1-RV64-NEXT: vsrl.vi v28, v25, 24 ; LMULMAX1-RV64-NEXT: vand.vx v28, v28, a7 ; LMULMAX1-RV64-NEXT: vsrl.vi v29, v25, 8 -; LMULMAX1-RV64-NEXT: vand.vx v29, v29, a1 +; LMULMAX1-RV64-NEXT: vand.vx v29, v29, t4 ; LMULMAX1-RV64-NEXT: vor.vv v28, v29, v28 ; LMULMAX1-RV64-NEXT: vor.vv v27, v28, v27 ; LMULMAX1-RV64-NEXT: vsll.vi v28, v25, 8 -; LMULMAX1-RV64-NEXT: vand.vx v28, v28, a5 +; LMULMAX1-RV64-NEXT: vand.vx v28, v28, a2 ; LMULMAX1-RV64-NEXT: vsll.vi v29, v25, 24 -; LMULMAX1-RV64-NEXT: vand.vx v29, v29, a2 +; LMULMAX1-RV64-NEXT: vand.vx v29, v29, a3 ; LMULMAX1-RV64-NEXT: vor.vv v28, v29, v28 ; LMULMAX1-RV64-NEXT: vsll.vx v29, v25, t0 ; LMULMAX1-RV64-NEXT: vsll.vx v25, v25, t1 -; LMULMAX1-RV64-NEXT: vand.vx v25, v25, a3 +; LMULMAX1-RV64-NEXT: vand.vx v25, v25, a1 ; LMULMAX1-RV64-NEXT: vor.vv v25, v29, v25 ; LMULMAX1-RV64-NEXT: vor.vv v25, v25, v28 ; LMULMAX1-RV64-NEXT: vor.vv v25, v25, v27 -; LMULMAX1-RV64-NEXT: vand.vx v27, v25, t2 -; LMULMAX1-RV64-NEXT: vsll.vi v27, v27, 4 +; LMULMAX1-RV64-NEXT: vsrl.vi v27, v25, 4 +; LMULMAX1-RV64-NEXT: vand.vx v27, v27, t3 ; LMULMAX1-RV64-NEXT: vand.vx v25, v25, t3 -; LMULMAX1-RV64-NEXT: vsrl.vi v25, v25, 4 -; LMULMAX1-RV64-NEXT: vor.vv v25, v25, v27 -; LMULMAX1-RV64-NEXT: vand.vx v27, v25, t5 -; LMULMAX1-RV64-NEXT: vsll.vi v27, v27, 2 -; LMULMAX1-RV64-NEXT: vand.vx v25, v25, t6 -; LMULMAX1-RV64-NEXT: vsrl.vi v25, v25, 2 -; LMULMAX1-RV64-NEXT: vor.vv v25, v25, v27 -; LMULMAX1-RV64-NEXT: vand.vx v27, v25, a4 -; LMULMAX1-RV64-NEXT: vadd.vv v27, v27, v27 -; LMULMAX1-RV64-NEXT: vand.vx v25, v25, s0 -; LMULMAX1-RV64-NEXT: vsrl.vi v25, v25, 1 -; LMULMAX1-RV64-NEXT: vor.vv v25, v25, v27 +; LMULMAX1-RV64-NEXT: vsll.vi v25, v25, 4 +; LMULMAX1-RV64-NEXT: vor.vv v25, v27, v25 +; LMULMAX1-RV64-NEXT: vsrl.vi v27, v25, 2 +; LMULMAX1-RV64-NEXT: vand.vx v27, v27, a4 +; LMULMAX1-RV64-NEXT: vand.vx v25, v25, a4 +; LMULMAX1-RV64-NEXT: vsll.vi v25, v25, 2 +; LMULMAX1-RV64-NEXT: vor.vv v25, v27, v25 +; LMULMAX1-RV64-NEXT: vsrl.vi v27, v25, 1 
+; LMULMAX1-RV64-NEXT: vand.vx v27, v27, a5 +; LMULMAX1-RV64-NEXT: vand.vx v25, v25, a5 +; LMULMAX1-RV64-NEXT: vadd.vv v25, v25, v25 +; LMULMAX1-RV64-NEXT: vor.vv v25, v27, v25 ; LMULMAX1-RV64-NEXT: vse64.v v25, (a0) ; LMULMAX1-RV64-NEXT: vse64.v v26, (a6) -; LMULMAX1-RV64-NEXT: ld s0, 8(sp) # 8-byte Folded Reload -; LMULMAX1-RV64-NEXT: addi sp, sp, 16 ; LMULMAX1-RV64-NEXT: ret %a = load <4 x i64>, <4 x i64>* %x %b = load <4 x i64>, <4 x i64>* %y diff --git a/llvm/test/CodeGen/X86/bitreverse.ll b/llvm/test/CodeGen/X86/bitreverse.ll index b1c4cbead6c1..cfdbbce7f1f5 100644 --- a/llvm/test/CodeGen/X86/bitreverse.ll +++ b/llvm/test/CodeGen/X86/bitreverse.ll @@ -17,35 +17,35 @@ define <2 x i16> @test_bitreverse_v2i16(<2 x i16> %a) nounwind { ; X86-NEXT: movl %eax, %edx ; X86-NEXT: andl $3855, %edx # imm = 0xF0F ; X86-NEXT: shll $4, %edx -; X86-NEXT: andl $61680, %eax # imm = 0xF0F0 ; X86-NEXT: shrl $4, %eax +; X86-NEXT: andl $3855, %eax # imm = 0xF0F ; X86-NEXT: orl %edx, %eax ; X86-NEXT: movl %eax, %edx ; X86-NEXT: andl $13107, %edx # imm = 0x3333 -; X86-NEXT: andl $52428, %eax # imm = 0xCCCC ; X86-NEXT: shrl $2, %eax +; X86-NEXT: andl $13107, %eax # imm = 0x3333 ; X86-NEXT: leal (%eax,%edx,4), %eax ; X86-NEXT: movl %eax, %edx ; X86-NEXT: andl $21845, %edx # imm = 0x5555 -; X86-NEXT: andl $43690, %eax # imm = 0xAAAA ; X86-NEXT: shrl %eax +; X86-NEXT: andl $21845, %eax # imm = 0x5555 ; X86-NEXT: leal (%eax,%edx,2), %eax ; X86-NEXT: rolw $8, %cx ; X86-NEXT: movl %ecx, %edx ; X86-NEXT: andl $3855, %edx # imm = 0xF0F ; X86-NEXT: shll $4, %edx -; X86-NEXT: andl $61680, %ecx # imm = 0xF0F0 ; X86-NEXT: shrl $4, %ecx +; X86-NEXT: andl $3855, %ecx # imm = 0xF0F ; X86-NEXT: orl %edx, %ecx ; X86-NEXT: movl %ecx, %edx ; X86-NEXT: andl $13107, %edx # imm = 0x3333 -; X86-NEXT: andl $52428, %ecx # imm = 0xCCCC ; X86-NEXT: shrl $2, %ecx +; X86-NEXT: andl $13107, %ecx # imm = 0x3333 ; X86-NEXT: leal (%ecx,%edx,4), %ecx ; X86-NEXT: movl %ecx, %edx ; X86-NEXT: andl $21845, %edx # imm = 0x5555 -; X86-NEXT: andl $43690, %ecx # imm = 0xAAAA ; X86-NEXT: shrl %ecx +; X86-NEXT: andl $21845, %ecx # imm = 0x5555 ; X86-NEXT: leal (%ecx,%edx,2), %edx ; X86-NEXT: # kill: def $ax killed $ax killed $eax ; X86-NEXT: # kill: def $dx killed $dx killed $edx @@ -63,16 +63,18 @@ define <2 x i16> @test_bitreverse_v2i16(<2 x i16> %a) nounwind { ; X64-NEXT: psrlw $4, %xmm0 ; X64-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; X64-NEXT: por %xmm1, %xmm0 -; X64-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] -; X64-NEXT: pand %xmm0, %xmm1 -; X64-NEXT: psllw $2, %xmm1 -; X64-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; X64-NEXT: psrlw $2, %xmm0 +; X64-NEXT: movdqa %xmm0, %xmm1 +; X64-NEXT: psrlw $2, %xmm1 +; X64-NEXT: movdqa {{.*#+}} xmm2 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] +; X64-NEXT: pand %xmm2, %xmm1 +; X64-NEXT: pand %xmm2, %xmm0 +; X64-NEXT: psllw $2, %xmm0 ; X64-NEXT: por %xmm1, %xmm0 -; X64-NEXT: movdqa {{.*#+}} xmm1 = [170,170,170,170,170,170,170,170,170,170,170,170,170,170,170,170] -; X64-NEXT: pand %xmm0, %xmm1 +; X64-NEXT: movdqa %xmm0, %xmm1 ; X64-NEXT: psrlw $1, %xmm1 -; X64-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; X64-NEXT: movdqa {{.*#+}} xmm2 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85] +; X64-NEXT: pand %xmm2, %xmm1 +; X64-NEXT: pand %xmm2, %xmm0 ; X64-NEXT: paddb %xmm0, %xmm0 ; X64-NEXT: por %xmm1, %xmm0 ; X64-NEXT: retq @@ -96,60 +98,60 @@ define i64 @test_bitreverse_i64(i64 %a) nounwind { ; X86-NEXT: movl %eax, %edx ; X86-NEXT: andl 
$252645135, %edx # imm = 0xF0F0F0F ; X86-NEXT: shll $4, %edx -; X86-NEXT: andl $-252645136, %eax # imm = 0xF0F0F0F0 ; X86-NEXT: shrl $4, %eax +; X86-NEXT: andl $252645135, %eax # imm = 0xF0F0F0F ; X86-NEXT: orl %edx, %eax ; X86-NEXT: movl %eax, %edx ; X86-NEXT: andl $858993459, %edx # imm = 0x33333333 -; X86-NEXT: andl $-858993460, %eax # imm = 0xCCCCCCCC ; X86-NEXT: shrl $2, %eax +; X86-NEXT: andl $858993459, %eax # imm = 0x33333333 ; X86-NEXT: leal (%eax,%edx,4), %eax ; X86-NEXT: movl %eax, %edx ; X86-NEXT: andl $1431655765, %edx # imm = 0x55555555 -; X86-NEXT: andl $-1431655766, %eax # imm = 0xAAAAAAAA ; X86-NEXT: shrl %eax +; X86-NEXT: andl $1431655765, %eax # imm = 0x55555555 ; X86-NEXT: leal (%eax,%edx,2), %eax ; X86-NEXT: bswapl %ecx ; X86-NEXT: movl %ecx, %edx ; X86-NEXT: andl $252645135, %edx # imm = 0xF0F0F0F ; X86-NEXT: shll $4, %edx -; X86-NEXT: andl $-252645136, %ecx # imm = 0xF0F0F0F0 ; X86-NEXT: shrl $4, %ecx +; X86-NEXT: andl $252645135, %ecx # imm = 0xF0F0F0F ; X86-NEXT: orl %edx, %ecx ; X86-NEXT: movl %ecx, %edx ; X86-NEXT: andl $858993459, %edx # imm = 0x33333333 -; X86-NEXT: andl $-858993460, %ecx # imm = 0xCCCCCCCC ; X86-NEXT: shrl $2, %ecx +; X86-NEXT: andl $858993459, %ecx # imm = 0x33333333 ; X86-NEXT: leal (%ecx,%edx,4), %ecx ; X86-NEXT: movl %ecx, %edx ; X86-NEXT: andl $1431655765, %edx # imm = 0x55555555 -; X86-NEXT: andl $-1431655766, %ecx # imm = 0xAAAAAAAA ; X86-NEXT: shrl %ecx +; X86-NEXT: andl $1431655765, %ecx # imm = 0x55555555 ; X86-NEXT: leal (%ecx,%edx,2), %edx ; X86-NEXT: retl ; ; X64-LABEL: test_bitreverse_i64: ; X64: # %bb.0: ; X64-NEXT: bswapq %rdi -; X64-NEXT: movabsq $1085102592571150095, %rax # imm = 0xF0F0F0F0F0F0F0F -; X64-NEXT: andq %rdi, %rax -; X64-NEXT: shlq $4, %rax -; X64-NEXT: movabsq $-1085102592571150096, %rcx # imm = 0xF0F0F0F0F0F0F0F0 -; X64-NEXT: andq %rdi, %rcx -; X64-NEXT: shrq $4, %rcx -; X64-NEXT: orq %rax, %rcx -; X64-NEXT: movabsq $3689348814741910323, %rax # imm = 0x3333333333333333 +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: shrq $4, %rax +; X64-NEXT: movabsq $1085102592571150095, %rcx # imm = 0xF0F0F0F0F0F0F0F ; X64-NEXT: andq %rcx, %rax -; X64-NEXT: movabsq $-3689348814741910324, %rdx # imm = 0xCCCCCCCCCCCCCCCC -; X64-NEXT: andq %rcx, %rdx -; X64-NEXT: shrq $2, %rdx -; X64-NEXT: leaq (%rdx,%rax,4), %rax -; X64-NEXT: movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555 +; X64-NEXT: andq %rcx, %rdi +; X64-NEXT: shlq $4, %rdi +; X64-NEXT: orq %rax, %rdi +; X64-NEXT: movabsq $3689348814741910323, %rax # imm = 0x3333333333333333 +; X64-NEXT: movq %rdi, %rcx ; X64-NEXT: andq %rax, %rcx -; X64-NEXT: movabsq $-6148914691236517206, %rdx # imm = 0xAAAAAAAAAAAAAAAA -; X64-NEXT: andq %rax, %rdx -; X64-NEXT: shrq %rdx -; X64-NEXT: leaq (%rdx,%rcx,2), %rax +; X64-NEXT: shrq $2, %rdi +; X64-NEXT: andq %rax, %rdi +; X64-NEXT: leaq (%rdi,%rcx,4), %rax +; X64-NEXT: movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555 +; X64-NEXT: movq %rax, %rdx +; X64-NEXT: andq %rcx, %rdx +; X64-NEXT: shrq %rax +; X64-NEXT: andq %rcx, %rax +; X64-NEXT: leaq (%rax,%rdx,2), %rax ; X64-NEXT: retq ; ; X86XOP-LABEL: test_bitreverse_i64: @@ -173,18 +175,18 @@ define i32 @test_bitreverse_i32(i32 %a) nounwind { ; X86-NEXT: movl %eax, %ecx ; X86-NEXT: andl $252645135, %ecx # imm = 0xF0F0F0F ; X86-NEXT: shll $4, %ecx -; X86-NEXT: andl $-252645136, %eax # imm = 0xF0F0F0F0 ; X86-NEXT: shrl $4, %eax +; X86-NEXT: andl $252645135, %eax # imm = 0xF0F0F0F ; X86-NEXT: orl %ecx, %eax ; X86-NEXT: movl %eax, %ecx ; X86-NEXT: andl $858993459, %ecx # imm = 
0x33333333 -; X86-NEXT: andl $-858993460, %eax # imm = 0xCCCCCCCC ; X86-NEXT: shrl $2, %eax +; X86-NEXT: andl $858993459, %eax # imm = 0x33333333 ; X86-NEXT: leal (%eax,%ecx,4), %eax ; X86-NEXT: movl %eax, %ecx ; X86-NEXT: andl $1431655765, %ecx # imm = 0x55555555 -; X86-NEXT: andl $-1431655766, %eax # imm = 0xAAAAAAAA ; X86-NEXT: shrl %eax +; X86-NEXT: andl $1431655765, %eax # imm = 0x55555555 ; X86-NEXT: leal (%eax,%ecx,2), %eax ; X86-NEXT: retl ; @@ -195,18 +197,18 @@ define i32 @test_bitreverse_i32(i32 %a) nounwind { ; X64-NEXT: movl %edi, %eax ; X64-NEXT: andl $252645135, %eax # imm = 0xF0F0F0F ; X64-NEXT: shll $4, %eax -; X64-NEXT: andl $-252645136, %edi # imm = 0xF0F0F0F0 ; X64-NEXT: shrl $4, %edi +; X64-NEXT: andl $252645135, %edi # imm = 0xF0F0F0F ; X64-NEXT: orl %eax, %edi ; X64-NEXT: movl %edi, %eax ; X64-NEXT: andl $858993459, %eax # imm = 0x33333333 -; X64-NEXT: andl $-858993460, %edi # imm = 0xCCCCCCCC ; X64-NEXT: shrl $2, %edi +; X64-NEXT: andl $858993459, %edi # imm = 0x33333333 ; X64-NEXT: leal (%rdi,%rax,4), %eax ; X64-NEXT: movl %eax, %ecx ; X64-NEXT: andl $1431655765, %ecx # imm = 0x55555555 -; X64-NEXT: andl $-1431655766, %eax # imm = 0xAAAAAAAA ; X64-NEXT: shrl %eax +; X64-NEXT: andl $1431655765, %eax # imm = 0x55555555 ; X64-NEXT: leal (%rax,%rcx,2), %eax ; X64-NEXT: retq ; @@ -230,18 +232,18 @@ define i24 @test_bitreverse_i24(i24 %a) nounwind { ; X86-NEXT: movl %eax, %ecx ; X86-NEXT: andl $252645135, %ecx # imm = 0xF0F0F0F ; X86-NEXT: shll $4, %ecx -; X86-NEXT: andl $-252645136, %eax # imm = 0xF0F0F0F0 ; X86-NEXT: shrl $4, %eax +; X86-NEXT: andl $252645135, %eax # imm = 0xF0F0F0F ; X86-NEXT: orl %ecx, %eax ; X86-NEXT: movl %eax, %ecx ; X86-NEXT: andl $858993459, %ecx # imm = 0x33333333 -; X86-NEXT: andl $-858993460, %eax # imm = 0xCCCCCCCC ; X86-NEXT: shrl $2, %eax +; X86-NEXT: andl $858993459, %eax # imm = 0x33333333 ; X86-NEXT: leal (%eax,%ecx,4), %eax ; X86-NEXT: movl %eax, %ecx ; X86-NEXT: andl $1431655680, %ecx # imm = 0x55555500 -; X86-NEXT: andl $-1431655936, %eax # imm = 0xAAAAAA00 ; X86-NEXT: shrl %eax +; X86-NEXT: andl $1431655680, %eax # imm = 0x55555500 ; X86-NEXT: leal (%eax,%ecx,2), %eax ; X86-NEXT: shrl $8, %eax ; X86-NEXT: retl @@ -253,18 +255,18 @@ define i24 @test_bitreverse_i24(i24 %a) nounwind { ; X64-NEXT: movl %edi, %eax ; X64-NEXT: andl $252645135, %eax # imm = 0xF0F0F0F ; X64-NEXT: shll $4, %eax -; X64-NEXT: andl $-252645136, %edi # imm = 0xF0F0F0F0 ; X64-NEXT: shrl $4, %edi +; X64-NEXT: andl $252645135, %edi # imm = 0xF0F0F0F ; X64-NEXT: orl %eax, %edi ; X64-NEXT: movl %edi, %eax ; X64-NEXT: andl $858993459, %eax # imm = 0x33333333 -; X64-NEXT: andl $-858993460, %edi # imm = 0xCCCCCCCC ; X64-NEXT: shrl $2, %edi +; X64-NEXT: andl $858993459, %edi # imm = 0x33333333 ; X64-NEXT: leal (%rdi,%rax,4), %eax ; X64-NEXT: movl %eax, %ecx ; X64-NEXT: andl $1431655680, %ecx # imm = 0x55555500 -; X64-NEXT: andl $-1431655936, %eax # imm = 0xAAAAAA00 ; X64-NEXT: shrl %eax +; X64-NEXT: andl $1431655680, %eax # imm = 0x55555500 ; X64-NEXT: leal (%rax,%rcx,2), %eax ; X64-NEXT: shrl $8, %eax ; X64-NEXT: retq @@ -290,18 +292,18 @@ define i16 @test_bitreverse_i16(i16 %a) nounwind { ; X86-NEXT: movl %eax, %ecx ; X86-NEXT: andl $3855, %ecx # imm = 0xF0F ; X86-NEXT: shll $4, %ecx -; X86-NEXT: andl $61680, %eax # imm = 0xF0F0 ; X86-NEXT: shrl $4, %eax +; X86-NEXT: andl $3855, %eax # imm = 0xF0F ; X86-NEXT: orl %ecx, %eax ; X86-NEXT: movl %eax, %ecx ; X86-NEXT: andl $13107, %ecx # imm = 0x3333 -; X86-NEXT: andl $52428, %eax # imm = 0xCCCC ; X86-NEXT: shrl $2, %eax +; 
X86-NEXT: andl $13107, %eax # imm = 0x3333 ; X86-NEXT: leal (%eax,%ecx,4), %eax ; X86-NEXT: movl %eax, %ecx ; X86-NEXT: andl $21845, %ecx # imm = 0x5555 -; X86-NEXT: andl $43690, %eax # imm = 0xAAAA ; X86-NEXT: shrl %eax +; X86-NEXT: andl $21845, %eax # imm = 0x5555 ; X86-NEXT: leal (%eax,%ecx,2), %eax ; X86-NEXT: # kill: def $ax killed $ax killed $eax ; X86-NEXT: retl @@ -313,18 +315,18 @@ define i16 @test_bitreverse_i16(i16 %a) nounwind { ; X64-NEXT: movl %edi, %eax ; X64-NEXT: andl $3855, %eax # imm = 0xF0F ; X64-NEXT: shll $4, %eax -; X64-NEXT: andl $61680, %edi # imm = 0xF0F0 ; X64-NEXT: shrl $4, %edi +; X64-NEXT: andl $3855, %edi # imm = 0xF0F ; X64-NEXT: orl %eax, %edi ; X64-NEXT: movl %edi, %eax ; X64-NEXT: andl $13107, %eax # imm = 0x3333 -; X64-NEXT: andl $52428, %edi # imm = 0xCCCC ; X64-NEXT: shrl $2, %edi +; X64-NEXT: andl $13107, %edi # imm = 0x3333 ; X64-NEXT: leal (%rdi,%rax,4), %eax ; X64-NEXT: movl %eax, %ecx ; X64-NEXT: andl $21845, %ecx # imm = 0x5555 -; X64-NEXT: andl $43690, %eax # imm = 0xAAAA ; X64-NEXT: shrl %eax +; X64-NEXT: andl $21845, %eax # imm = 0x5555 ; X64-NEXT: leal (%rax,%rcx,2), %eax ; X64-NEXT: # kill: def $ax killed $ax killed $eax ; X64-NEXT: retq @@ -350,14 +352,14 @@ define i8 @test_bitreverse_i8(i8 %a) { ; X86-NEXT: movl %eax, %ecx ; X86-NEXT: andb $51, %cl ; X86-NEXT: shlb $2, %cl -; X86-NEXT: andb $-52, %al ; X86-NEXT: shrb $2, %al +; X86-NEXT: andb $51, %al ; X86-NEXT: orb %cl, %al ; X86-NEXT: movl %eax, %ecx ; X86-NEXT: andb $85, %cl ; X86-NEXT: addb %cl, %cl -; X86-NEXT: andb $-86, %al ; X86-NEXT: shrb %al +; X86-NEXT: andb $85, %al ; X86-NEXT: orb %cl, %al ; X86-NEXT: retl ; @@ -368,14 +370,14 @@ define i8 @test_bitreverse_i8(i8 %a) { ; X64-NEXT: movl %edi, %eax ; X64-NEXT: andb $51, %al ; X64-NEXT: shlb $2, %al -; X64-NEXT: andb $-52, %dil ; X64-NEXT: shrb $2, %dil +; X64-NEXT: andb $51, %dil ; X64-NEXT: orb %al, %dil ; X64-NEXT: movl %edi, %eax ; X64-NEXT: andb $85, %al ; X64-NEXT: addb %al, %al -; X64-NEXT: andb $-86, %dil ; X64-NEXT: shrb %dil +; X64-NEXT: andb $85, %dil ; X64-NEXT: addl %edi, %eax ; X64-NEXT: # kill: def $al killed $al killed $eax ; X64-NEXT: retq @@ -401,14 +403,14 @@ define i4 @test_bitreverse_i4(i4 %a) { ; X86-NEXT: movl %eax, %ecx ; X86-NEXT: andb $51, %cl ; X86-NEXT: shlb $2, %cl -; X86-NEXT: andb $-52, %al ; X86-NEXT: shrb $2, %al +; X86-NEXT: andb $51, %al ; X86-NEXT: orb %cl, %al ; X86-NEXT: movl %eax, %ecx ; X86-NEXT: andb $80, %cl ; X86-NEXT: addb %cl, %cl -; X86-NEXT: andb $-96, %al ; X86-NEXT: shrb %al +; X86-NEXT: andb $80, %al ; X86-NEXT: orb %cl, %al ; X86-NEXT: shrb $4, %al ; X86-NEXT: retl @@ -420,14 +422,14 @@ define i4 @test_bitreverse_i4(i4 %a) { ; X64-NEXT: movl %edi, %eax ; X64-NEXT: andb $51, %al ; X64-NEXT: shlb $2, %al -; X64-NEXT: andb $-52, %dil ; X64-NEXT: shrb $2, %dil +; X64-NEXT: andb $51, %dil ; X64-NEXT: orb %al, %dil ; X64-NEXT: movl %edi, %eax ; X64-NEXT: andb $80, %al ; X64-NEXT: addb %al, %al -; X64-NEXT: andb $-96, %dil ; X64-NEXT: shrb %dil +; X64-NEXT: andb $80, %dil ; X64-NEXT: addl %edi, %eax ; X64-NEXT: shrb $4, %al ; X64-NEXT: # kill: def $al killed $al killed $eax @@ -621,107 +623,107 @@ define i528 @large_promotion(i528 %A) nounwind { ; X86-NEXT: movl %ebx, %ebp ; X86-NEXT: andl $252645135, %ebp # imm = 0xF0F0F0F ; X86-NEXT: shll $4, %ebp -; X86-NEXT: andl $-252645136, %ebx # imm = 0xF0F0F0F0 ; X86-NEXT: shrl $4, %ebx +; X86-NEXT: andl $252645135, %ebx # imm = 0xF0F0F0F ; X86-NEXT: orl %ebp, %ebx ; X86-NEXT: movl %ebx, %ebp ; X86-NEXT: andl $858993459, %ebp # imm = 0x33333333 
-; X86-NEXT: andl $-858993460, %ebx # imm = 0xCCCCCCCC ; X86-NEXT: shrl $2, %ebx +; X86-NEXT: andl $858993459, %ebx # imm = 0x33333333 ; X86-NEXT: leal (%ebx,%ebp,4), %ebx ; X86-NEXT: movl %ebx, %ebp ; X86-NEXT: andl $1431633920, %ebp # imm = 0x55550000 -; X86-NEXT: andl $-1431699456, %ebx # imm = 0xAAAA0000 ; X86-NEXT: shrl %ebx +; X86-NEXT: andl $1431633920, %ebx # imm = 0x55550000 ; X86-NEXT: leal (%ebx,%ebp,2), %ebx ; X86-NEXT: movl %ebx, (%esp) # 4-byte Spill ; X86-NEXT: bswapl %edi ; X86-NEXT: movl %edi, %ebx ; X86-NEXT: andl $252645135, %ebx # imm = 0xF0F0F0F ; X86-NEXT: shll $4, %ebx -; X86-NEXT: andl $-252645136, %edi # imm = 0xF0F0F0F0 ; X86-NEXT: shrl $4, %edi +; X86-NEXT: andl $252645135, %edi # imm = 0xF0F0F0F ; X86-NEXT: orl %ebx, %edi ; X86-NEXT: movl %edi, %ebx ; X86-NEXT: andl $858993459, %ebx # imm = 0x33333333 -; X86-NEXT: andl $-858993460, %edi # imm = 0xCCCCCCCC ; X86-NEXT: shrl $2, %edi +; X86-NEXT: andl $858993459, %edi # imm = 0x33333333 ; X86-NEXT: leal (%edi,%ebx,4), %edi ; X86-NEXT: movl %edi, %ebx ; X86-NEXT: andl $1431655765, %ebx # imm = 0x55555555 -; X86-NEXT: andl $-1431655766, %edi # imm = 0xAAAAAAAA ; X86-NEXT: shrl %edi +; X86-NEXT: andl $1431655765, %edi # imm = 0x55555555 ; X86-NEXT: leal (%edi,%ebx,2), %edi ; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: bswapl %esi ; X86-NEXT: movl %esi, %edi ; X86-NEXT: andl $252645135, %edi # imm = 0xF0F0F0F ; X86-NEXT: shll $4, %edi -; X86-NEXT: andl $-252645136, %esi # imm = 0xF0F0F0F0 ; X86-NEXT: shrl $4, %esi +; X86-NEXT: andl $252645135, %esi # imm = 0xF0F0F0F ; X86-NEXT: orl %edi, %esi ; X86-NEXT: movl %esi, %edi ; X86-NEXT: andl $858993459, %edi # imm = 0x33333333 -; X86-NEXT: andl $-858993460, %esi # imm = 0xCCCCCCCC ; X86-NEXT: shrl $2, %esi +; X86-NEXT: andl $858993459, %esi # imm = 0x33333333 ; X86-NEXT: leal (%esi,%edi,4), %esi ; X86-NEXT: movl %esi, %edi ; X86-NEXT: andl $1431655765, %edi # imm = 0x55555555 -; X86-NEXT: andl $-1431655766, %esi # imm = 0xAAAAAAAA ; X86-NEXT: shrl %esi +; X86-NEXT: andl $1431655765, %esi # imm = 0x55555555 ; X86-NEXT: leal (%esi,%edi,2), %ebx ; X86-NEXT: bswapl %edx ; X86-NEXT: movl %edx, %esi ; X86-NEXT: andl $252645135, %esi # imm = 0xF0F0F0F ; X86-NEXT: shll $4, %esi -; X86-NEXT: andl $-252645136, %edx # imm = 0xF0F0F0F0 ; X86-NEXT: shrl $4, %edx +; X86-NEXT: andl $252645135, %edx # imm = 0xF0F0F0F ; X86-NEXT: orl %esi, %edx ; X86-NEXT: movl %edx, %esi ; X86-NEXT: andl $858993459, %esi # imm = 0x33333333 -; X86-NEXT: andl $-858993460, %edx # imm = 0xCCCCCCCC ; X86-NEXT: shrl $2, %edx +; X86-NEXT: andl $858993459, %edx # imm = 0x33333333 ; X86-NEXT: leal (%edx,%esi,4), %edx ; X86-NEXT: movl %edx, %esi ; X86-NEXT: andl $1431655765, %esi # imm = 0x55555555 -; X86-NEXT: andl $-1431655766, %edx # imm = 0xAAAAAAAA ; X86-NEXT: shrl %edx +; X86-NEXT: andl $1431655765, %edx # imm = 0x55555555 ; X86-NEXT: leal (%edx,%esi,2), %edx ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: bswapl %ecx ; X86-NEXT: movl %ecx, %edx ; X86-NEXT: andl $252645135, %edx # imm = 0xF0F0F0F ; X86-NEXT: shll $4, %edx -; X86-NEXT: andl $-252645136, %ecx # imm = 0xF0F0F0F0 ; X86-NEXT: shrl $4, %ecx +; X86-NEXT: andl $252645135, %ecx # imm = 0xF0F0F0F ; X86-NEXT: orl %edx, %ecx ; X86-NEXT: movl %ecx, %edx ; X86-NEXT: andl $858993459, %edx # imm = 0x33333333 -; X86-NEXT: andl $-858993460, %ecx # imm = 0xCCCCCCCC ; X86-NEXT: shrl $2, %ecx +; X86-NEXT: andl $858993459, %ecx # imm = 0x33333333 ; X86-NEXT: leal (%ecx,%edx,4), %ecx ; X86-NEXT: movl %ecx, %edx 
; X86-NEXT: andl $1431655765, %edx # imm = 0x55555555 -; X86-NEXT: andl $-1431655766, %ecx # imm = 0xAAAAAAAA ; X86-NEXT: shrl %ecx +; X86-NEXT: andl $1431655765, %ecx # imm = 0x55555555 ; X86-NEXT: leal (%ecx,%edx,2), %ecx ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: bswapl %eax ; X86-NEXT: movl %eax, %ecx ; X86-NEXT: andl $252645135, %ecx # imm = 0xF0F0F0F ; X86-NEXT: shll $4, %ecx -; X86-NEXT: andl $-252645136, %eax # imm = 0xF0F0F0F0 ; X86-NEXT: shrl $4, %eax +; X86-NEXT: andl $252645135, %eax # imm = 0xF0F0F0F ; X86-NEXT: orl %ecx, %eax ; X86-NEXT: movl %eax, %ecx ; X86-NEXT: andl $858993459, %ecx # imm = 0x33333333 -; X86-NEXT: andl $-858993460, %eax # imm = 0xCCCCCCCC ; X86-NEXT: shrl $2, %eax +; X86-NEXT: andl $858993459, %eax # imm = 0x33333333 ; X86-NEXT: leal (%eax,%ecx,4), %eax ; X86-NEXT: movl %eax, %ecx ; X86-NEXT: andl $1431655765, %ecx # imm = 0x55555555 -; X86-NEXT: andl $-1431655766, %eax # imm = 0xAAAAAAAA ; X86-NEXT: shrl %eax +; X86-NEXT: andl $1431655765, %eax # imm = 0x55555555 ; X86-NEXT: leal (%eax,%ecx,2), %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax @@ -729,18 +731,18 @@ define i528 @large_promotion(i528 %A) nounwind { ; X86-NEXT: movl %eax, %ecx ; X86-NEXT: andl $252645135, %ecx # imm = 0xF0F0F0F ; X86-NEXT: shll $4, %ecx -; X86-NEXT: andl $-252645136, %eax # imm = 0xF0F0F0F0 ; X86-NEXT: shrl $4, %eax +; X86-NEXT: andl $252645135, %eax # imm = 0xF0F0F0F ; X86-NEXT: orl %ecx, %eax ; X86-NEXT: movl %eax, %ecx ; X86-NEXT: andl $858993459, %ecx # imm = 0x33333333 -; X86-NEXT: andl $-858993460, %eax # imm = 0xCCCCCCCC ; X86-NEXT: shrl $2, %eax +; X86-NEXT: andl $858993459, %eax # imm = 0x33333333 ; X86-NEXT: leal (%eax,%ecx,4), %eax ; X86-NEXT: movl %eax, %ecx ; X86-NEXT: andl $1431655765, %ecx # imm = 0x55555555 -; X86-NEXT: andl $-1431655766, %eax # imm = 0xAAAAAAAA ; X86-NEXT: shrl %eax +; X86-NEXT: andl $1431655765, %eax # imm = 0x55555555 ; X86-NEXT: leal (%eax,%ecx,2), %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax @@ -748,18 +750,18 @@ define i528 @large_promotion(i528 %A) nounwind { ; X86-NEXT: movl %eax, %ecx ; X86-NEXT: andl $252645135, %ecx # imm = 0xF0F0F0F ; X86-NEXT: shll $4, %ecx -; X86-NEXT: andl $-252645136, %eax # imm = 0xF0F0F0F0 ; X86-NEXT: shrl $4, %eax +; X86-NEXT: andl $252645135, %eax # imm = 0xF0F0F0F ; X86-NEXT: orl %ecx, %eax ; X86-NEXT: movl %eax, %ecx ; X86-NEXT: andl $858993459, %ecx # imm = 0x33333333 -; X86-NEXT: andl $-858993460, %eax # imm = 0xCCCCCCCC ; X86-NEXT: shrl $2, %eax +; X86-NEXT: andl $858993459, %eax # imm = 0x33333333 ; X86-NEXT: leal (%eax,%ecx,4), %eax ; X86-NEXT: movl %eax, %ecx ; X86-NEXT: andl $1431655765, %ecx # imm = 0x55555555 -; X86-NEXT: andl $-1431655766, %eax # imm = 0xAAAAAAAA ; X86-NEXT: shrl %eax +; X86-NEXT: andl $1431655765, %eax # imm = 0x55555555 ; X86-NEXT: leal (%eax,%ecx,2), %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax @@ -767,18 +769,18 @@ define i528 @large_promotion(i528 %A) nounwind { ; X86-NEXT: movl %eax, %ecx ; X86-NEXT: andl $252645135, %ecx # imm = 0xF0F0F0F ; X86-NEXT: shll $4, %ecx -; X86-NEXT: andl $-252645136, %eax # imm = 0xF0F0F0F0 ; X86-NEXT: shrl $4, %eax +; X86-NEXT: andl $252645135, %eax # imm = 0xF0F0F0F ; X86-NEXT: orl %ecx, %eax ; X86-NEXT: movl %eax, %ecx ; X86-NEXT: andl $858993459, %ecx # imm = 0x33333333 -; X86-NEXT: andl $-858993460, %eax # imm = 0xCCCCCCCC ; 
X86-NEXT: shrl $2, %eax +; X86-NEXT: andl $858993459, %eax # imm = 0x33333333 ; X86-NEXT: leal (%eax,%ecx,4), %eax ; X86-NEXT: movl %eax, %ecx ; X86-NEXT: andl $1431655765, %ecx # imm = 0x55555555 -; X86-NEXT: andl $-1431655766, %eax # imm = 0xAAAAAAAA ; X86-NEXT: shrl %eax +; X86-NEXT: andl $1431655765, %eax # imm = 0x55555555 ; X86-NEXT: leal (%eax,%ecx,2), %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax @@ -786,18 +788,18 @@ define i528 @large_promotion(i528 %A) nounwind { ; X86-NEXT: movl %eax, %ecx ; X86-NEXT: andl $252645135, %ecx # imm = 0xF0F0F0F ; X86-NEXT: shll $4, %ecx -; X86-NEXT: andl $-252645136, %eax # imm = 0xF0F0F0F0 ; X86-NEXT: shrl $4, %eax +; X86-NEXT: andl $252645135, %eax # imm = 0xF0F0F0F ; X86-NEXT: orl %ecx, %eax ; X86-NEXT: movl %eax, %ecx ; X86-NEXT: andl $858993459, %ecx # imm = 0x33333333 -; X86-NEXT: andl $-858993460, %eax # imm = 0xCCCCCCCC ; X86-NEXT: shrl $2, %eax +; X86-NEXT: andl $858993459, %eax # imm = 0x33333333 ; X86-NEXT: leal (%eax,%ecx,4), %eax ; X86-NEXT: movl %eax, %ecx ; X86-NEXT: andl $1431655765, %ecx # imm = 0x55555555 -; X86-NEXT: andl $-1431655766, %eax # imm = 0xAAAAAAAA ; X86-NEXT: shrl %eax +; X86-NEXT: andl $1431655765, %eax # imm = 0x55555555 ; X86-NEXT: leal (%eax,%ecx,2), %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax @@ -805,18 +807,18 @@ define i528 @large_promotion(i528 %A) nounwind { ; X86-NEXT: movl %eax, %ecx ; X86-NEXT: andl $252645135, %ecx # imm = 0xF0F0F0F ; X86-NEXT: shll $4, %ecx -; X86-NEXT: andl $-252645136, %eax # imm = 0xF0F0F0F0 ; X86-NEXT: shrl $4, %eax +; X86-NEXT: andl $252645135, %eax # imm = 0xF0F0F0F ; X86-NEXT: orl %ecx, %eax ; X86-NEXT: movl %eax, %ecx ; X86-NEXT: andl $858993459, %ecx # imm = 0x33333333 -; X86-NEXT: andl $-858993460, %eax # imm = 0xCCCCCCCC ; X86-NEXT: shrl $2, %eax +; X86-NEXT: andl $858993459, %eax # imm = 0x33333333 ; X86-NEXT: leal (%eax,%ecx,4), %eax ; X86-NEXT: movl %eax, %ecx ; X86-NEXT: andl $1431655765, %ecx # imm = 0x55555555 -; X86-NEXT: andl $-1431655766, %eax # imm = 0xAAAAAAAA ; X86-NEXT: shrl %eax +; X86-NEXT: andl $1431655765, %eax # imm = 0x55555555 ; X86-NEXT: leal (%eax,%ecx,2), %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax @@ -824,18 +826,18 @@ define i528 @large_promotion(i528 %A) nounwind { ; X86-NEXT: movl %eax, %ecx ; X86-NEXT: andl $252645135, %ecx # imm = 0xF0F0F0F ; X86-NEXT: shll $4, %ecx -; X86-NEXT: andl $-252645136, %eax # imm = 0xF0F0F0F0 ; X86-NEXT: shrl $4, %eax +; X86-NEXT: andl $252645135, %eax # imm = 0xF0F0F0F ; X86-NEXT: orl %ecx, %eax ; X86-NEXT: movl %eax, %ecx ; X86-NEXT: andl $858993459, %ecx # imm = 0x33333333 -; X86-NEXT: andl $-858993460, %eax # imm = 0xCCCCCCCC ; X86-NEXT: shrl $2, %eax +; X86-NEXT: andl $858993459, %eax # imm = 0x33333333 ; X86-NEXT: leal (%eax,%ecx,4), %eax ; X86-NEXT: movl %eax, %ecx ; X86-NEXT: andl $1431655765, %ecx # imm = 0x55555555 -; X86-NEXT: andl $-1431655766, %eax # imm = 0xAAAAAAAA ; X86-NEXT: shrl %eax +; X86-NEXT: andl $1431655765, %eax # imm = 0x55555555 ; X86-NEXT: leal (%eax,%ecx,2), %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax @@ -843,18 +845,18 @@ define i528 @large_promotion(i528 %A) nounwind { ; X86-NEXT: movl %eax, %ecx ; X86-NEXT: andl $252645135, %ecx # imm = 0xF0F0F0F ; X86-NEXT: shll $4, %ecx -; X86-NEXT: andl $-252645136, %eax # imm = 0xF0F0F0F0 ; X86-NEXT: 
shrl $4, %eax +; X86-NEXT: andl $252645135, %eax # imm = 0xF0F0F0F ; X86-NEXT: orl %ecx, %eax ; X86-NEXT: movl %eax, %ecx ; X86-NEXT: andl $858993459, %ecx # imm = 0x33333333 -; X86-NEXT: andl $-858993460, %eax # imm = 0xCCCCCCCC ; X86-NEXT: shrl $2, %eax +; X86-NEXT: andl $858993459, %eax # imm = 0x33333333 ; X86-NEXT: leal (%eax,%ecx,4), %eax ; X86-NEXT: movl %eax, %ecx ; X86-NEXT: andl $1431655765, %ecx # imm = 0x55555555 -; X86-NEXT: andl $-1431655766, %eax # imm = 0xAAAAAAAA ; X86-NEXT: shrl %eax +; X86-NEXT: andl $1431655765, %eax # imm = 0x55555555 ; X86-NEXT: leal (%eax,%ecx,2), %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax @@ -862,18 +864,18 @@ define i528 @large_promotion(i528 %A) nounwind { ; X86-NEXT: movl %eax, %ecx ; X86-NEXT: andl $252645135, %ecx # imm = 0xF0F0F0F ; X86-NEXT: shll $4, %ecx -; X86-NEXT: andl $-252645136, %eax # imm = 0xF0F0F0F0 ; X86-NEXT: shrl $4, %eax +; X86-NEXT: andl $252645135, %eax # imm = 0xF0F0F0F ; X86-NEXT: orl %ecx, %eax ; X86-NEXT: movl %eax, %ecx ; X86-NEXT: andl $858993459, %ecx # imm = 0x33333333 -; X86-NEXT: andl $-858993460, %eax # imm = 0xCCCCCCCC ; X86-NEXT: shrl $2, %eax +; X86-NEXT: andl $858993459, %eax # imm = 0x33333333 ; X86-NEXT: leal (%eax,%ecx,4), %eax ; X86-NEXT: movl %eax, %ecx ; X86-NEXT: andl $1431655765, %ecx # imm = 0x55555555 -; X86-NEXT: andl $-1431655766, %eax # imm = 0xAAAAAAAA ; X86-NEXT: shrl %eax +; X86-NEXT: andl $1431655765, %eax # imm = 0x55555555 ; X86-NEXT: leal (%eax,%ecx,2), %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax @@ -881,18 +883,18 @@ define i528 @large_promotion(i528 %A) nounwind { ; X86-NEXT: movl %eax, %ecx ; X86-NEXT: andl $252645135, %ecx # imm = 0xF0F0F0F ; X86-NEXT: shll $4, %ecx -; X86-NEXT: andl $-252645136, %eax # imm = 0xF0F0F0F0 ; X86-NEXT: shrl $4, %eax +; X86-NEXT: andl $252645135, %eax # imm = 0xF0F0F0F ; X86-NEXT: orl %ecx, %eax ; X86-NEXT: movl %eax, %ecx ; X86-NEXT: andl $858993459, %ecx # imm = 0x33333333 -; X86-NEXT: andl $-858993460, %eax # imm = 0xCCCCCCCC ; X86-NEXT: shrl $2, %eax +; X86-NEXT: andl $858993459, %eax # imm = 0x33333333 ; X86-NEXT: leal (%eax,%ecx,4), %eax ; X86-NEXT: movl %eax, %ecx ; X86-NEXT: andl $1431655765, %ecx # imm = 0x55555555 -; X86-NEXT: andl $-1431655766, %eax # imm = 0xAAAAAAAA ; X86-NEXT: shrl %eax +; X86-NEXT: andl $1431655765, %eax # imm = 0x55555555 ; X86-NEXT: leal (%eax,%ecx,2), %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax @@ -900,36 +902,36 @@ define i528 @large_promotion(i528 %A) nounwind { ; X86-NEXT: movl %eax, %ecx ; X86-NEXT: andl $252645135, %ecx # imm = 0xF0F0F0F ; X86-NEXT: shll $4, %ecx -; X86-NEXT: andl $-252645136, %eax # imm = 0xF0F0F0F0 ; X86-NEXT: shrl $4, %eax +; X86-NEXT: andl $252645135, %eax # imm = 0xF0F0F0F ; X86-NEXT: orl %ecx, %eax ; X86-NEXT: movl %eax, %ecx ; X86-NEXT: andl $858993459, %ecx # imm = 0x33333333 -; X86-NEXT: andl $-858993460, %eax # imm = 0xCCCCCCCC ; X86-NEXT: shrl $2, %eax +; X86-NEXT: andl $858993459, %eax # imm = 0x33333333 ; X86-NEXT: leal (%eax,%ecx,4), %eax ; X86-NEXT: movl %eax, %ecx ; X86-NEXT: andl $1431655765, %ecx # imm = 0x55555555 -; X86-NEXT: andl $-1431655766, %eax # imm = 0xAAAAAAAA ; X86-NEXT: shrl %eax +; X86-NEXT: andl $1431655765, %eax # imm = 0x55555555 ; X86-NEXT: leal (%eax,%ecx,2), %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: bswapl %eax ; X86-NEXT: movl %eax, %ecx ; X86-NEXT: andl 
$252645135, %ecx # imm = 0xF0F0F0F ; X86-NEXT: shll $4, %ecx -; X86-NEXT: andl $-252645136, %eax # imm = 0xF0F0F0F0 ; X86-NEXT: shrl $4, %eax +; X86-NEXT: andl $252645135, %eax # imm = 0xF0F0F0F ; X86-NEXT: orl %ecx, %eax ; X86-NEXT: movl %eax, %ecx ; X86-NEXT: andl $858993459, %ecx # imm = 0x33333333 -; X86-NEXT: andl $-858993460, %eax # imm = 0xCCCCCCCC ; X86-NEXT: shrl $2, %eax +; X86-NEXT: andl $858993459, %eax # imm = 0x33333333 ; X86-NEXT: leal (%eax,%ecx,4), %eax ; X86-NEXT: movl %eax, %ecx ; X86-NEXT: andl $1431655765, %ecx # imm = 0x55555555 -; X86-NEXT: andl $-1431655766, %eax # imm = 0xAAAAAAAA ; X86-NEXT: shrl %eax +; X86-NEXT: andl $1431655765, %eax # imm = 0x55555555 ; X86-NEXT: leal (%eax,%ecx,2), %edx ; X86-NEXT: movl (%esp), %esi # 4-byte Reload ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload @@ -1018,194 +1020,186 @@ define i528 @large_promotion(i528 %A) nounwind { ; X64-NEXT: pushq %r13 ; X64-NEXT: pushq %r12 ; X64-NEXT: pushq %rbx -; X64-NEXT: movq %rdi, %r12 +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: movq {{[0-9]+}}(%rsp), %r12 +; X64-NEXT: movq {{[0-9]+}}(%rsp), %r15 ; X64-NEXT: movq {{[0-9]+}}(%rsp), %rbp -; X64-NEXT: movq {{[0-9]+}}(%rsp), %rbx -; X64-NEXT: bswapq %rbx -; X64-NEXT: movabsq $1085102592571150095, %r13 # imm = 0xF0F0F0F0F0F0F0F -; X64-NEXT: movq %rbx, %r10 -; X64-NEXT: andq %r13, %r10 -; X64-NEXT: shlq $4, %r10 -; X64-NEXT: movabsq $-1085102592571150096, %rax # imm = 0xF0F0F0F0F0F0F0F0 -; X64-NEXT: andq %rax, %rbx +; X64-NEXT: movq {{[0-9]+}}(%rsp), %rdi +; X64-NEXT: bswapq %rdi +; X64-NEXT: movq %rdi, %rbx ; X64-NEXT: shrq $4, %rbx -; X64-NEXT: orq %r10, %rbx -; X64-NEXT: movabsq $3689348814741910323, %r11 # imm = 0x3333333333333333 -; X64-NEXT: movq %rbx, %r10 -; X64-NEXT: andq %r11, %r10 -; X64-NEXT: movabsq $-3689348814741910324, %r14 # imm = 0xCCCCCCCCCCCCCCCC -; X64-NEXT: andq %r14, %rbx -; X64-NEXT: shrq $2, %rbx -; X64-NEXT: leaq (%rbx,%r10,4), %r10 -; X64-NEXT: movabsq $6148820866244280320, %rbx # imm = 0x5555000000000000 -; X64-NEXT: andq %r10, %rbx -; X64-NEXT: movabsq $-6149102341220990976, %rdi # imm = 0xAAAA000000000000 -; X64-NEXT: andq %r10, %rdi -; X64-NEXT: shrq %rdi -; X64-NEXT: leaq (%rdi,%rbx,2), %rdi -; X64-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: bswapq %rbp -; X64-NEXT: movq %rbp, %rdi +; X64-NEXT: movabsq $1085102592571150095, %r13 # imm = 0xF0F0F0F0F0F0F0F +; X64-NEXT: andq %r13, %rbx ; X64-NEXT: andq %r13, %rdi ; X64-NEXT: shlq $4, %rdi -; X64-NEXT: andq %rax, %rbp -; X64-NEXT: shrq $4, %rbp +; X64-NEXT: orq %rbx, %rdi +; X64-NEXT: movabsq $3689348814741910323, %r11 # imm = 0x3333333333333333 +; X64-NEXT: movq %rdi, %rbx +; X64-NEXT: andq %r11, %rbx +; X64-NEXT: shrq $2, %rdi +; X64-NEXT: andq %r11, %rdi +; X64-NEXT: leaq (%rdi,%rbx,4), %rdi +; X64-NEXT: movabsq $6148820866244280320, %r10 # imm = 0x5555000000000000 +; X64-NEXT: movq %rdi, %rbx +; X64-NEXT: andq %r10, %rbx +; X64-NEXT: shrq %rdi +; X64-NEXT: andq %r10, %rdi +; X64-NEXT: leaq (%rdi,%rbx,2), %r10 +; X64-NEXT: bswapq %rbp +; X64-NEXT: movq %rbp, %rdi +; X64-NEXT: shrq $4, %rdi +; X64-NEXT: andq %r13, %rdi +; X64-NEXT: andq %r13, %rbp +; X64-NEXT: shlq $4, %rbp ; X64-NEXT: orq %rdi, %rbp ; X64-NEXT: movq %rbp, %rdi ; X64-NEXT: andq %r11, %rdi -; X64-NEXT: andq %r14, %rbp ; X64-NEXT: shrq $2, %rbp -; X64-NEXT: leaq (%rbp,%rdi,4), %rbp -; X64-NEXT: movabsq $6148914691236517205, %rbx # imm = 0x5555555555555555 -; X64-NEXT: movq %rbp, %r10 -; X64-NEXT: andq %rbx, %r10 -; X64-NEXT: movabsq $-6148914691236517206, %rdi # imm = 
0xAAAAAAAAAAAAAAAA -; X64-NEXT: andq %rdi, %rbp -; X64-NEXT: shrq %rbp -; X64-NEXT: leaq (%rbp,%r10,2), %rbp -; X64-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq {{[0-9]+}}(%rsp), %rbp -; X64-NEXT: bswapq %rbp -; X64-NEXT: movq %rbp, %r10 -; X64-NEXT: andq %r13, %r10 -; X64-NEXT: shlq $4, %r10 -; X64-NEXT: andq %rax, %rbp -; X64-NEXT: movq %rax, %r15 -; X64-NEXT: shrq $4, %rbp -; X64-NEXT: orq %r10, %rbp -; X64-NEXT: movq %rbp, %r10 -; X64-NEXT: andq %r11, %r10 -; X64-NEXT: andq %r14, %rbp -; X64-NEXT: shrq $2, %rbp -; X64-NEXT: leaq (%rbp,%r10,4), %rbp -; X64-NEXT: movq %rbp, %r10 -; X64-NEXT: andq %rbx, %r10 -; X64-NEXT: andq %rdi, %rbp -; X64-NEXT: shrq %rbp -; X64-NEXT: leaq (%rbp,%r10,2), %rbp -; X64-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; X64-NEXT: bswapq %r10 -; X64-NEXT: movq %r10, %rax -; X64-NEXT: andq %r13, %rax -; X64-NEXT: shlq $4, %rax +; X64-NEXT: andq %r11, %rbp +; X64-NEXT: leaq (%rbp,%rdi,4), %rdi +; X64-NEXT: movabsq $6148914691236517205, %rbp # imm = 0x5555555555555555 +; X64-NEXT: movq %rdi, %rbx +; X64-NEXT: andq %rbp, %rbx +; X64-NEXT: shrq %rdi +; X64-NEXT: andq %rbp, %rdi +; X64-NEXT: leaq (%rdi,%rbx,2), %r14 +; X64-NEXT: shrdq $48, %r14, %r10 +; X64-NEXT: bswapq %r15 ; X64-NEXT: movq %r15, %rdi -; X64-NEXT: andq %r15, %r10 -; X64-NEXT: shrq $4, %r10 -; X64-NEXT: orq %rax, %r10 -; X64-NEXT: movq %r10, %rax -; X64-NEXT: andq %r11, %rax -; X64-NEXT: andq %r14, %r10 -; X64-NEXT: shrq $2, %r10 -; X64-NEXT: leaq (%r10,%rax,4), %rax -; X64-NEXT: movq %rax, %r10 -; X64-NEXT: andq %rbx, %r10 -; X64-NEXT: movabsq $-6148914691236517206, %r15 # imm = 0xAAAAAAAAAAAAAAAA -; X64-NEXT: andq %r15, %rax -; X64-NEXT: shrq %rax -; X64-NEXT: leaq (%rax,%r10,2), %r10 +; X64-NEXT: shrq $4, %rdi +; X64-NEXT: andq %r13, %rdi +; X64-NEXT: andq %r13, %r15 +; X64-NEXT: shlq $4, %r15 +; X64-NEXT: orq %rdi, %r15 +; X64-NEXT: movq %r15, %rdi +; X64-NEXT: andq %r11, %rdi +; X64-NEXT: shrq $2, %r15 +; X64-NEXT: andq %r11, %r15 +; X64-NEXT: leaq (%r15,%rdi,4), %rdi +; X64-NEXT: movq %rdi, %rbx +; X64-NEXT: andq %rbp, %rbx +; X64-NEXT: shrq %rdi +; X64-NEXT: andq %rbp, %rdi +; X64-NEXT: leaq (%rdi,%rbx,2), %r15 +; X64-NEXT: shrdq $48, %r15, %r14 +; X64-NEXT: bswapq %r12 +; X64-NEXT: movq %r12, %rdi +; X64-NEXT: shrq $4, %rdi +; X64-NEXT: andq %r13, %rdi +; X64-NEXT: andq %r13, %r12 +; X64-NEXT: shlq $4, %r12 +; X64-NEXT: orq %rdi, %r12 +; X64-NEXT: movq %r12, %rdi +; X64-NEXT: andq %r11, %rdi +; X64-NEXT: shrq $2, %r12 +; X64-NEXT: andq %r11, %r12 +; X64-NEXT: leaq (%r12,%rdi,4), %rdi +; X64-NEXT: movq %rdi, %rbx +; X64-NEXT: andq %rbp, %rbx +; X64-NEXT: shrq %rdi +; X64-NEXT: andq %rbp, %rdi +; X64-NEXT: leaq (%rdi,%rbx,2), %r12 +; X64-NEXT: shrdq $48, %r12, %r15 ; X64-NEXT: bswapq %r9 -; X64-NEXT: movq %r9, %rax -; X64-NEXT: andq %r13, %rax -; X64-NEXT: shlq $4, %rax -; X64-NEXT: andq %rdi, %r9 -; X64-NEXT: shrq $4, %r9 -; X64-NEXT: orq %rax, %r9 -; X64-NEXT: movq %r9, %rax -; X64-NEXT: andq %r11, %rax -; X64-NEXT: andq %r14, %r9 +; X64-NEXT: movq %r9, %rdi +; X64-NEXT: shrq $4, %rdi +; X64-NEXT: andq %r13, %rdi +; X64-NEXT: andq %r13, %r9 +; X64-NEXT: shlq $4, %r9 +; X64-NEXT: orq %rdi, %r9 +; X64-NEXT: movq %r9, %rdi +; X64-NEXT: andq %r11, %rdi ; X64-NEXT: shrq $2, %r9 -; X64-NEXT: leaq (%r9,%rax,4), %rax -; X64-NEXT: movq %rax, %r9 -; X64-NEXT: andq %rbx, %r9 -; X64-NEXT: andq %r15, %rax -; X64-NEXT: shrq %rax -; X64-NEXT: leaq (%rax,%r9,2), %r9 +; X64-NEXT: andq %r11, %r9 +; X64-NEXT: leaq (%r9,%rdi,4), %rdi +; X64-NEXT: movq %rdi, %rbx +; X64-NEXT: andq %rbp, %rbx 
+; X64-NEXT: shrq %rdi +; X64-NEXT: andq %rbp, %rdi +; X64-NEXT: leaq (%rdi,%rbx,2), %r9 +; X64-NEXT: shrdq $48, %r9, %r12 ; X64-NEXT: bswapq %r8 -; X64-NEXT: movq %r8, %rax -; X64-NEXT: andq %r13, %rax -; X64-NEXT: shlq $4, %rax -; X64-NEXT: andq %rdi, %r8 -; X64-NEXT: shrq $4, %r8 -; X64-NEXT: orq %rax, %r8 -; X64-NEXT: movq %r8, %rax -; X64-NEXT: andq %r11, %rax -; X64-NEXT: andq %r14, %r8 +; X64-NEXT: movq %r8, %rdi +; X64-NEXT: shrq $4, %rdi +; X64-NEXT: andq %r13, %rdi +; X64-NEXT: andq %r13, %r8 +; X64-NEXT: shlq $4, %r8 +; X64-NEXT: orq %rdi, %r8 +; X64-NEXT: movq %r8, %rdi +; X64-NEXT: andq %r11, %rdi ; X64-NEXT: shrq $2, %r8 -; X64-NEXT: leaq (%r8,%rax,4), %rax -; X64-NEXT: movq %rax, %r8 -; X64-NEXT: andq %rbx, %r8 -; X64-NEXT: andq %r15, %rax -; X64-NEXT: shrq %rax -; X64-NEXT: leaq (%rax,%r8,2), %r8 +; X64-NEXT: andq %r11, %r8 +; X64-NEXT: leaq (%r8,%rdi,4), %rdi +; X64-NEXT: movq %rdi, %rbx +; X64-NEXT: andq %rbp, %rbx +; X64-NEXT: shrq %rdi +; X64-NEXT: andq %rbp, %rdi +; X64-NEXT: leaq (%rdi,%rbx,2), %rdi +; X64-NEXT: shrdq $48, %rdi, %r9 ; X64-NEXT: bswapq %rcx -; X64-NEXT: movq %rcx, %rax -; X64-NEXT: andq %r13, %rax -; X64-NEXT: shlq $4, %rax -; X64-NEXT: andq %rdi, %rcx -; X64-NEXT: shrq $4, %rcx -; X64-NEXT: orq %rax, %rcx -; X64-NEXT: movq %rcx, %rax -; X64-NEXT: andq %r11, %rax -; X64-NEXT: andq %r14, %rcx +; X64-NEXT: movq %rcx, %rbx +; X64-NEXT: shrq $4, %rbx +; X64-NEXT: andq %r13, %rbx +; X64-NEXT: andq %r13, %rcx +; X64-NEXT: shlq $4, %rcx +; X64-NEXT: orq %rbx, %rcx +; X64-NEXT: movq %rcx, %rbx +; X64-NEXT: andq %r11, %rbx ; X64-NEXT: shrq $2, %rcx -; X64-NEXT: leaq (%rcx,%rax,4), %rax -; X64-NEXT: movq %rax, %rcx -; X64-NEXT: andq %rbx, %rcx -; X64-NEXT: andq %r15, %rax -; X64-NEXT: shrq %rax -; X64-NEXT: leaq (%rax,%rcx,2), %rcx +; X64-NEXT: andq %r11, %rcx +; X64-NEXT: leaq (%rcx,%rbx,4), %rcx +; X64-NEXT: movq %rcx, %rbx +; X64-NEXT: andq %rbp, %rbx +; X64-NEXT: shrq %rcx +; X64-NEXT: andq %rbp, %rcx +; X64-NEXT: leaq (%rcx,%rbx,2), %rcx +; X64-NEXT: shrdq $48, %rcx, %rdi ; X64-NEXT: bswapq %rdx -; X64-NEXT: movq %rdx, %rax -; X64-NEXT: andq %r13, %rax -; X64-NEXT: shlq $4, %rax -; X64-NEXT: andq %rdi, %rdx -; X64-NEXT: shrq $4, %rdx -; X64-NEXT: orq %rax, %rdx -; X64-NEXT: movq %rdx, %rax -; X64-NEXT: andq %r11, %rax -; X64-NEXT: andq %r14, %rdx +; X64-NEXT: movq %rdx, %rbx +; X64-NEXT: shrq $4, %rbx +; X64-NEXT: andq %r13, %rbx +; X64-NEXT: andq %r13, %rdx +; X64-NEXT: shlq $4, %rdx +; X64-NEXT: orq %rbx, %rdx +; X64-NEXT: movq %rdx, %rbx +; X64-NEXT: andq %r11, %rbx ; X64-NEXT: shrq $2, %rdx -; X64-NEXT: leaq (%rdx,%rax,4), %rax -; X64-NEXT: movq %rax, %rdx -; X64-NEXT: andq %rbx, %rdx -; X64-NEXT: andq %r15, %rax -; X64-NEXT: shrq %rax -; X64-NEXT: leaq (%rax,%rdx,2), %rax -; X64-NEXT: bswapq %rsi -; X64-NEXT: andq %rsi, %r13 -; X64-NEXT: andq %rdi, %rsi -; X64-NEXT: shlq $4, %r13 -; X64-NEXT: shrq $4, %rsi -; X64-NEXT: orq %r13, %rsi -; X64-NEXT: andq %rsi, %r11 -; X64-NEXT: andq %r14, %rsi -; X64-NEXT: shrq $2, %rsi -; X64-NEXT: leaq (%rsi,%r11,4), %rdx -; X64-NEXT: andq %rdx, %rbx -; X64-NEXT: andq %r15, %rdx +; X64-NEXT: andq %r11, %rdx +; X64-NEXT: leaq (%rdx,%rbx,4), %rdx +; X64-NEXT: movq %rdx, %rbx +; X64-NEXT: andq %rbp, %rbx ; X64-NEXT: shrq %rdx +; X64-NEXT: andq %rbp, %rdx ; X64-NEXT: leaq (%rdx,%rbx,2), %rdx -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload -; X64-NEXT: shrdq $48, %rdi, %rsi -; X64-NEXT: shrdq $48, %rbp, %rdi -; X64-NEXT: shrdq $48, %r10, %rbp 
-; X64-NEXT: shrdq $48, %r9, %r10 -; X64-NEXT: shrdq $48, %r8, %r9 -; X64-NEXT: shrdq $48, %rcx, %r8 -; X64-NEXT: shrdq $48, %rax, %rcx -; X64-NEXT: shrdq $48, %rdx, %rax -; X64-NEXT: movq %rax, 56(%r12) -; X64-NEXT: movq %rcx, 48(%r12) -; X64-NEXT: movq %r8, 40(%r12) -; X64-NEXT: movq %r9, 32(%r12) -; X64-NEXT: movq %r10, 24(%r12) -; X64-NEXT: movq %rbp, 16(%r12) -; X64-NEXT: movq %rdi, 8(%r12) -; X64-NEXT: movq %rsi, (%r12) -; X64-NEXT: shrq $48, %rdx -; X64-NEXT: movw %dx, 64(%r12) -; X64-NEXT: movq %r12, %rax +; X64-NEXT: shrdq $48, %rdx, %rcx +; X64-NEXT: bswapq %rsi +; X64-NEXT: movq %rsi, %rbx +; X64-NEXT: shrq $4, %rbx +; X64-NEXT: andq %r13, %rbx +; X64-NEXT: andq %r13, %rsi +; X64-NEXT: shlq $4, %rsi +; X64-NEXT: orq %rbx, %rsi +; X64-NEXT: movq %rsi, %rbx +; X64-NEXT: andq %r11, %rbx +; X64-NEXT: shrq $2, %rsi +; X64-NEXT: andq %r11, %rsi +; X64-NEXT: leaq (%rsi,%rbx,4), %rsi +; X64-NEXT: movq %rsi, %rbx +; X64-NEXT: andq %rbp, %rbx +; X64-NEXT: shrq %rsi +; X64-NEXT: andq %rbp, %rsi +; X64-NEXT: leaq (%rsi,%rbx,2), %rsi +; X64-NEXT: shrdq $48, %rsi, %rdx +; X64-NEXT: shrq $48, %rsi +; X64-NEXT: movq %rdx, 56(%rax) +; X64-NEXT: movq %rcx, 48(%rax) +; X64-NEXT: movq %rdi, 40(%rax) +; X64-NEXT: movq %r9, 32(%rax) +; X64-NEXT: movq %r12, 24(%rax) +; X64-NEXT: movq %r15, 16(%rax) +; X64-NEXT: movq %r14, 8(%rax) +; X64-NEXT: movq %r10, (%rax) +; X64-NEXT: movw %si, 64(%rax) ; X64-NEXT: popq %rbx ; X64-NEXT: popq %r12 ; X64-NEXT: popq %r13 diff --git a/llvm/test/CodeGen/X86/combine-bitreverse.ll b/llvm/test/CodeGen/X86/combine-bitreverse.ll index 8c41f533fd6b..4a50f7c879ad 100644 --- a/llvm/test/CodeGen/X86/combine-bitreverse.ll +++ b/llvm/test/CodeGen/X86/combine-bitreverse.ll @@ -55,16 +55,18 @@ define <4 x i32> @test_demandedbits_bitreverse(<4 x i32> %a0) nounwind { ; X86-NEXT: psrlw $4, %xmm0 ; X86-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 ; X86-NEXT: por %xmm1, %xmm0 -; X86-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] -; X86-NEXT: pand %xmm0, %xmm1 -; X86-NEXT: psllw $2, %xmm1 -; X86-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 -; X86-NEXT: psrlw $2, %xmm0 +; X86-NEXT: movdqa %xmm0, %xmm1 +; X86-NEXT: psrlw $2, %xmm1 +; X86-NEXT: movdqa {{.*#+}} xmm2 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] +; X86-NEXT: pand %xmm2, %xmm1 +; X86-NEXT: pand %xmm2, %xmm0 +; X86-NEXT: psllw $2, %xmm0 ; X86-NEXT: por %xmm1, %xmm0 -; X86-NEXT: movdqa {{.*#+}} xmm1 = [170,170,170,170,170,170,170,170,170,170,170,170,170,170,170,170] -; X86-NEXT: pand %xmm0, %xmm1 +; X86-NEXT: movdqa %xmm0, %xmm1 ; X86-NEXT: psrlw $1, %xmm1 -; X86-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-NEXT: movdqa {{.*#+}} xmm2 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85] +; X86-NEXT: pand %xmm2, %xmm1 +; X86-NEXT: pand %xmm2, %xmm0 ; X86-NEXT: paddb %xmm0, %xmm0 ; X86-NEXT: por %xmm1, %xmm0 ; X86-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 diff --git a/llvm/test/CodeGen/X86/pr43820.ll b/llvm/test/CodeGen/X86/pr43820.ll index 5bdf7872d61a..2cbced7053e8 100644 --- a/llvm/test/CodeGen/X86/pr43820.ll +++ b/llvm/test/CodeGen/X86/pr43820.ll @@ -10,363 +10,362 @@ define i1000 @square(i1000 %A) nounwind { ; CHECK-NEXT: pushq %r13 ; CHECK-NEXT: pushq %r12 ; CHECK-NEXT: pushq %rbx +; CHECK-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; CHECK-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %r14 +; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %r15 ; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rbx +; 
CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rbp +; CHECK-NEXT: bswapq %rbp +; CHECK-NEXT: movq %rbp, %r11 +; CHECK-NEXT: shrq $4, %r11 +; CHECK-NEXT: movabsq $1085102592571150095, %rsi # imm = 0xF0F0F0F0F0F0F0F +; CHECK-NEXT: andq %rsi, %r11 +; CHECK-NEXT: andq %rsi, %rbp +; CHECK-NEXT: shlq $4, %rbp +; CHECK-NEXT: orq %r11, %rbp +; CHECK-NEXT: movabsq $3689348814741910323, %rdi # imm = 0x3333333333333333 +; CHECK-NEXT: movq %rbp, %r12 +; CHECK-NEXT: andq %rdi, %r12 +; CHECK-NEXT: shrq $2, %rbp +; CHECK-NEXT: andq %rdi, %rbp +; CHECK-NEXT: leaq (%rbp,%r12,4), %rbp +; CHECK-NEXT: movabsq $6148914691230924800, %r12 # imm = 0x5555555555000000 +; CHECK-NEXT: movq %rbp, %r13 +; CHECK-NEXT: andq %r12, %r13 +; CHECK-NEXT: shrq %rbp +; CHECK-NEXT: andq %r12, %rbp +; CHECK-NEXT: leaq (%rbp,%r13,2), %rax +; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; CHECK-NEXT: bswapq %rbx -; CHECK-NEXT: movabsq $1085102592571150095, %rdi # imm = 0xF0F0F0F0F0F0F0F +; CHECK-NEXT: movq %rbx, %rbp +; CHECK-NEXT: shrq $4, %rbp +; CHECK-NEXT: andq %rsi, %rbp +; CHECK-NEXT: andq %rsi, %rbx +; CHECK-NEXT: shlq $4, %rbx +; CHECK-NEXT: orq %rbp, %rbx ; CHECK-NEXT: movq %rbx, %rbp ; CHECK-NEXT: andq %rdi, %rbp -; CHECK-NEXT: shlq $4, %rbp -; CHECK-NEXT: movabsq $-1085102592571150096, %r11 # imm = 0xF0F0F0F0F0F0F0F0 -; CHECK-NEXT: andq %r11, %rbx -; CHECK-NEXT: movq %r11, %rax -; CHECK-NEXT: shrq $4, %rbx -; CHECK-NEXT: orq %rbp, %rbx -; CHECK-NEXT: movabsq $3689348814741910323, %r11 # imm = 0x3333333333333333 -; CHECK-NEXT: movq %rbx, %r14 -; CHECK-NEXT: andq %r11, %r14 -; CHECK-NEXT: movabsq $-3689348814741910324, %rbp # imm = 0xCCCCCCCCCCCCCCCC -; CHECK-NEXT: andq %rbp, %rbx -; CHECK-NEXT: movq %rbp, %r15 ; CHECK-NEXT: shrq $2, %rbx -; CHECK-NEXT: leaq (%rbx,%r14,4), %r14 -; CHECK-NEXT: movabsq $6148914691230924800, %rbx # imm = 0x5555555555000000 -; CHECK-NEXT: andq %r14, %rbx -; CHECK-NEXT: movabsq $-6148914691247702016, %rbp # imm = 0xAAAAAAAAAA000000 -; CHECK-NEXT: andq %r14, %rbp -; CHECK-NEXT: shrq %rbp -; CHECK-NEXT: leaq (%rbp,%rbx,2), %rbx -; CHECK-NEXT: movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; CHECK-NEXT: bswapq %r10 -; CHECK-NEXT: movq %r10, %rbx ; CHECK-NEXT: andq %rdi, %rbx -; CHECK-NEXT: shlq $4, %rbx -; CHECK-NEXT: andq %rax, %r10 -; CHECK-NEXT: shrq $4, %r10 -; CHECK-NEXT: orq %rbx, %r10 -; CHECK-NEXT: movq %r10, %rbx -; CHECK-NEXT: andq %r11, %rbx -; CHECK-NEXT: andq %r15, %r10 -; CHECK-NEXT: shrq $2, %r10 -; CHECK-NEXT: leaq (%r10,%rbx,4), %rbp +; CHECK-NEXT: leaq (%rbx,%rbp,4), %rbp ; CHECK-NEXT: movabsq $6148914691236517205, %rbx # imm = 0x5555555555555555 -; CHECK-NEXT: movq %rbp, %r10 -; CHECK-NEXT: andq %rbx, %r10 -; CHECK-NEXT: movabsq $-6148914691236517206, %r13 # imm = 0xAAAAAAAAAAAAAAAA -; CHECK-NEXT: andq %r13, %rbp +; CHECK-NEXT: movq %rbp, %r12 +; CHECK-NEXT: andq %rbx, %r12 ; CHECK-NEXT: shrq %rbp -; CHECK-NEXT: leaq (%rbp,%r10,2), %rbp -; CHECK-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rbp -; CHECK-NEXT: bswapq %rbp -; CHECK-NEXT: movq %rbp, %r10 -; CHECK-NEXT: andq %rdi, %r10 -; CHECK-NEXT: shlq $4, %r10 -; CHECK-NEXT: andq %rax, %rbp +; CHECK-NEXT: andq %rbx, %rbp +; CHECK-NEXT: leaq (%rbp,%r12,2), %rax +; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: bswapq %r15 +; CHECK-NEXT: movq %r15, %rbp ; CHECK-NEXT: shrq $4, %rbp -; CHECK-NEXT: orq %r10, %rbp -; CHECK-NEXT: movq %rbp, %r10 -; CHECK-NEXT: andq %r11, %r10 -; CHECK-NEXT: andq %r15, %rbp -; CHECK-NEXT: shrq $2, %rbp -; 
CHECK-NEXT: leaq (%rbp,%r10,4), %rbp -; CHECK-NEXT: movq %rbp, %r10 -; CHECK-NEXT: andq %rbx, %r10 -; CHECK-NEXT: andq %r13, %rbp +; CHECK-NEXT: andq %rsi, %rbp +; CHECK-NEXT: andq %rsi, %r15 +; CHECK-NEXT: shlq $4, %r15 +; CHECK-NEXT: orq %rbp, %r15 +; CHECK-NEXT: movq %r15, %rbp +; CHECK-NEXT: andq %rdi, %rbp +; CHECK-NEXT: shrq $2, %r15 +; CHECK-NEXT: andq %rdi, %r15 +; CHECK-NEXT: leaq (%r15,%rbp,4), %rbp +; CHECK-NEXT: movq %rbp, %r15 +; CHECK-NEXT: andq %rbx, %r15 ; CHECK-NEXT: shrq %rbp -; CHECK-NEXT: leaq (%rbp,%r10,2), %rbp -; CHECK-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rbp -; CHECK-NEXT: bswapq %rbp -; CHECK-NEXT: movq %rbp, %r10 -; CHECK-NEXT: andq %rdi, %r10 -; CHECK-NEXT: shlq $4, %r10 -; CHECK-NEXT: andq %rax, %rbp +; CHECK-NEXT: andq %rbx, %rbp +; CHECK-NEXT: leaq (%rbp,%r15,2), %rax +; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: bswapq %r14 +; CHECK-NEXT: movq %r14, %rbp ; CHECK-NEXT: shrq $4, %rbp -; CHECK-NEXT: orq %r10, %rbp -; CHECK-NEXT: movq %rbp, %r10 -; CHECK-NEXT: andq %r11, %r10 -; CHECK-NEXT: andq %r15, %rbp -; CHECK-NEXT: shrq $2, %rbp -; CHECK-NEXT: leaq (%rbp,%r10,4), %rbp -; CHECK-NEXT: movq %rbp, %r10 -; CHECK-NEXT: andq %rbx, %r10 -; CHECK-NEXT: andq %r13, %rbp +; CHECK-NEXT: andq %rsi, %rbp +; CHECK-NEXT: andq %rsi, %r14 +; CHECK-NEXT: shlq $4, %r14 +; CHECK-NEXT: orq %rbp, %r14 +; CHECK-NEXT: movq %r14, %rbp +; CHECK-NEXT: andq %rdi, %rbp +; CHECK-NEXT: shrq $2, %r14 +; CHECK-NEXT: andq %rdi, %r14 +; CHECK-NEXT: leaq (%r14,%rbp,4), %rbp +; CHECK-NEXT: movq %rbp, %r14 +; CHECK-NEXT: andq %rbx, %r14 ; CHECK-NEXT: shrq %rbp -; CHECK-NEXT: leaq (%rbp,%r10,2), %rbp -; CHECK-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rbp -; CHECK-NEXT: bswapq %rbp -; CHECK-NEXT: movq %rbp, %r10 -; CHECK-NEXT: andq %rdi, %r10 -; CHECK-NEXT: shlq $4, %r10 -; CHECK-NEXT: andq %rax, %rbp +; CHECK-NEXT: andq %rbx, %rbp +; CHECK-NEXT: leaq (%rbp,%r14,2), %rax +; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: bswapq %r10 +; CHECK-NEXT: movq %r10, %rbp ; CHECK-NEXT: shrq $4, %rbp -; CHECK-NEXT: orq %r10, %rbp -; CHECK-NEXT: movq %rbp, %r10 -; CHECK-NEXT: andq %r11, %r10 -; CHECK-NEXT: andq %r15, %rbp -; CHECK-NEXT: shrq $2, %rbp -; CHECK-NEXT: leaq (%rbp,%r10,4), %rbp -; CHECK-NEXT: movq %rbp, %r10 -; CHECK-NEXT: andq %rbx, %r10 -; CHECK-NEXT: andq %r13, %rbp -; CHECK-NEXT: shrq %rbp -; CHECK-NEXT: leaq (%rbp,%r10,2), %rbp -; CHECK-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rbp -; CHECK-NEXT: bswapq %rbp -; CHECK-NEXT: movq %rbp, %r10 -; CHECK-NEXT: andq %rdi, %r10 +; CHECK-NEXT: andq %rsi, %rbp +; CHECK-NEXT: andq %rsi, %r10 ; CHECK-NEXT: shlq $4, %r10 -; CHECK-NEXT: andq %rax, %rbp -; CHECK-NEXT: movq %rax, %r14 -; CHECK-NEXT: shrq $4, %rbp -; CHECK-NEXT: orq %r10, %rbp -; CHECK-NEXT: movq %rbp, %r10 -; CHECK-NEXT: andq %r11, %r10 -; CHECK-NEXT: andq %r15, %rbp -; CHECK-NEXT: shrq $2, %rbp -; CHECK-NEXT: leaq (%rbp,%r10,4), %rbp +; CHECK-NEXT: orq %rbp, %r10 +; CHECK-NEXT: movq %r10, %rbp +; CHECK-NEXT: andq %rdi, %rbp +; CHECK-NEXT: shrq $2, %r10 +; CHECK-NEXT: andq %rdi, %r10 +; CHECK-NEXT: leaq (%r10,%rbp,4), %rbp ; CHECK-NEXT: movq %rbp, %r10 ; CHECK-NEXT: andq %rbx, %r10 -; CHECK-NEXT: andq %r13, %rbp ; CHECK-NEXT: shrq %rbp +; CHECK-NEXT: andq %rbx, %rbp ; CHECK-NEXT: leaq (%rbp,%r10,2), %rax ; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte 
Spill ; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rbp ; CHECK-NEXT: bswapq %rbp ; CHECK-NEXT: movq %rbp, %r10 -; CHECK-NEXT: andq %rdi, %r10 -; CHECK-NEXT: shlq $4, %r10 -; CHECK-NEXT: andq %r14, %rbp -; CHECK-NEXT: shrq $4, %rbp +; CHECK-NEXT: shrq $4, %r10 +; CHECK-NEXT: andq %rsi, %r10 +; CHECK-NEXT: andq %rsi, %rbp +; CHECK-NEXT: shlq $4, %rbp ; CHECK-NEXT: orq %r10, %rbp ; CHECK-NEXT: movq %rbp, %r10 -; CHECK-NEXT: andq %r11, %r10 -; CHECK-NEXT: andq %r15, %rbp +; CHECK-NEXT: andq %rdi, %r10 ; CHECK-NEXT: shrq $2, %rbp +; CHECK-NEXT: andq %rdi, %rbp ; CHECK-NEXT: leaq (%rbp,%r10,4), %rbp ; CHECK-NEXT: movq %rbp, %r10 ; CHECK-NEXT: andq %rbx, %r10 -; CHECK-NEXT: andq %r13, %rbp ; CHECK-NEXT: shrq %rbp -; CHECK-NEXT: leaq (%rbp,%r10,2), %rbp -; CHECK-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rbp -; CHECK-NEXT: bswapq %rbp -; CHECK-NEXT: movq %rbp, %r10 -; CHECK-NEXT: andq %rdi, %r10 +; CHECK-NEXT: andq %rbx, %rbp +; CHECK-NEXT: leaq (%rbp,%r10,2), %rax +; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; CHECK-NEXT: bswapq %r10 +; CHECK-NEXT: movq %r10, %r14 +; CHECK-NEXT: shrq $4, %r14 +; CHECK-NEXT: andq %rsi, %r14 +; CHECK-NEXT: andq %rsi, %r10 ; CHECK-NEXT: shlq $4, %r10 -; CHECK-NEXT: andq %r14, %rbp -; CHECK-NEXT: shrq $4, %rbp -; CHECK-NEXT: orq %r10, %rbp -; CHECK-NEXT: movq %rbp, %r10 -; CHECK-NEXT: andq %r11, %r10 -; CHECK-NEXT: andq %r15, %rbp -; CHECK-NEXT: shrq $2, %rbp -; CHECK-NEXT: leaq (%rbp,%r10,4), %rbp -; CHECK-NEXT: movq %rbp, %r10 -; CHECK-NEXT: andq %rbx, %r10 -; CHECK-NEXT: andq %r13, %rbp -; CHECK-NEXT: shrq %rbp -; CHECK-NEXT: leaq (%rbp,%r10,2), %rbp -; CHECK-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rbp -; CHECK-NEXT: bswapq %rbp -; CHECK-NEXT: movq %rbp, %r10 +; CHECK-NEXT: orq %r14, %r10 +; CHECK-NEXT: movq %r10, %r14 +; CHECK-NEXT: andq %rdi, %r14 +; CHECK-NEXT: shrq $2, %r10 ; CHECK-NEXT: andq %rdi, %r10 -; CHECK-NEXT: shlq $4, %r10 -; CHECK-NEXT: andq %r14, %rbp -; CHECK-NEXT: shrq $4, %rbp -; CHECK-NEXT: orq %r10, %rbp -; CHECK-NEXT: movq %rbp, %r10 -; CHECK-NEXT: andq %r11, %r10 -; CHECK-NEXT: andq %r15, %rbp -; CHECK-NEXT: shrq $2, %rbp -; CHECK-NEXT: leaq (%rbp,%r10,4), %rbp -; CHECK-NEXT: movq %rbp, %r10 +; CHECK-NEXT: movq %rdi, %rbp +; CHECK-NEXT: leaq (%r10,%r14,4), %r10 +; CHECK-NEXT: movq %r10, %r14 +; CHECK-NEXT: andq %rbx, %r14 +; CHECK-NEXT: shrq %r10 ; CHECK-NEXT: andq %rbx, %r10 -; CHECK-NEXT: andq %r13, %rbp -; CHECK-NEXT: shrq %rbp -; CHECK-NEXT: leaq (%rbp,%r10,2), %rbp -; CHECK-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rbp -; CHECK-NEXT: bswapq %rbp -; CHECK-NEXT: movq %rbp, %r10 +; CHECK-NEXT: leaq (%r10,%r14,2), %rax +; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; CHECK-NEXT: bswapq %r10 +; CHECK-NEXT: movq %r10, %r14 +; CHECK-NEXT: shrq $4, %r14 +; CHECK-NEXT: andq %rsi, %r14 +; CHECK-NEXT: andq %rsi, %r10 +; CHECK-NEXT: shlq $4, %r10 +; CHECK-NEXT: orq %r14, %r10 +; CHECK-NEXT: movq %r10, %r14 +; CHECK-NEXT: andq %rdi, %r14 +; CHECK-NEXT: shrq $2, %r10 ; CHECK-NEXT: andq %rdi, %r10 -; CHECK-NEXT: shlq $4, %r10 -; CHECK-NEXT: andq %r14, %rbp -; CHECK-NEXT: shrq $4, %rbp -; CHECK-NEXT: orq %r10, %rbp -; CHECK-NEXT: movq %rbp, %r10 -; CHECK-NEXT: andq %r11, %r10 -; CHECK-NEXT: andq %r15, %rbp -; CHECK-NEXT: shrq $2, %rbp -; CHECK-NEXT: leaq (%rbp,%r10,4), %rbp 
-; CHECK-NEXT: movq %rbp, %r10 +; CHECK-NEXT: leaq (%r10,%r14,4), %r10 +; CHECK-NEXT: movq %r10, %r14 +; CHECK-NEXT: andq %rbx, %r14 +; CHECK-NEXT: shrq %r10 ; CHECK-NEXT: andq %rbx, %r10 -; CHECK-NEXT: andq %r13, %rbp -; CHECK-NEXT: shrq %rbp -; CHECK-NEXT: leaq (%rbp,%r10,2), %rbp -; CHECK-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rbp -; CHECK-NEXT: bswapq %rbp -; CHECK-NEXT: movq %rbp, %r10 +; CHECK-NEXT: leaq (%r10,%r14,2), %rax +; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; CHECK-NEXT: bswapq %r10 +; CHECK-NEXT: movq %r10, %r14 +; CHECK-NEXT: shrq $4, %r14 +; CHECK-NEXT: andq %rsi, %r14 +; CHECK-NEXT: andq %rsi, %r10 +; CHECK-NEXT: shlq $4, %r10 +; CHECK-NEXT: orq %r14, %r10 +; CHECK-NEXT: movq %r10, %r14 +; CHECK-NEXT: andq %rdi, %r14 +; CHECK-NEXT: shrq $2, %r10 ; CHECK-NEXT: andq %rdi, %r10 -; CHECK-NEXT: shlq $4, %r10 -; CHECK-NEXT: andq %r14, %rbp -; CHECK-NEXT: shrq $4, %rbp -; CHECK-NEXT: orq %r10, %rbp -; CHECK-NEXT: movq %rbp, %r10 -; CHECK-NEXT: andq %r11, %r10 -; CHECK-NEXT: andq %r15, %rbp -; CHECK-NEXT: shrq $2, %rbp -; CHECK-NEXT: leaq (%rbp,%r10,4), %rbp -; CHECK-NEXT: movq %rbp, %r10 +; CHECK-NEXT: leaq (%r10,%r14,4), %r10 +; CHECK-NEXT: movq %r10, %r14 +; CHECK-NEXT: andq %rbx, %r14 +; CHECK-NEXT: shrq %r10 ; CHECK-NEXT: andq %rbx, %r10 -; CHECK-NEXT: andq %r13, %rbp -; CHECK-NEXT: shrq %rbp -; CHECK-NEXT: leaq (%rbp,%r10,2), %rbp -; CHECK-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: leaq (%r10,%r14,2), %rax +; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; CHECK-NEXT: bswapq %r10 +; CHECK-NEXT: movq %r10, %r14 +; CHECK-NEXT: shrq $4, %r14 +; CHECK-NEXT: andq %rsi, %r14 +; CHECK-NEXT: andq %rsi, %r10 +; CHECK-NEXT: shlq $4, %r10 +; CHECK-NEXT: orq %r14, %r10 +; CHECK-NEXT: movq %r10, %r14 +; CHECK-NEXT: andq %rdi, %r14 +; CHECK-NEXT: shrq $2, %r10 +; CHECK-NEXT: andq %rdi, %r10 +; CHECK-NEXT: leaq (%r10,%r14,4), %r10 +; CHECK-NEXT: movq %r10, %r14 +; CHECK-NEXT: andq %rbx, %r14 +; CHECK-NEXT: shrq %r10 +; CHECK-NEXT: andq %rbx, %r10 +; CHECK-NEXT: leaq (%r10,%r14,2), %rax +; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; CHECK-NEXT: bswapq %r10 +; CHECK-NEXT: movq %r10, %rax +; CHECK-NEXT: shrq $4, %rax +; CHECK-NEXT: andq %rsi, %rax +; CHECK-NEXT: andq %rsi, %r10 +; CHECK-NEXT: shlq $4, %r10 +; CHECK-NEXT: orq %rax, %r10 +; CHECK-NEXT: movq %r10, %rax +; CHECK-NEXT: andq %rdi, %rax +; CHECK-NEXT: shrq $2, %r10 +; CHECK-NEXT: andq %rdi, %r10 +; CHECK-NEXT: leaq (%r10,%rax,4), %rax +; CHECK-NEXT: movq %rax, %r10 +; CHECK-NEXT: andq %rbx, %r10 +; CHECK-NEXT: shrq %rax +; CHECK-NEXT: andq %rbx, %rax +; CHECK-NEXT: leaq (%rax,%r10,2), %rax +; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; CHECK-NEXT: bswapq %r9 -; CHECK-NEXT: movq %r9, %rbp -; CHECK-NEXT: andq %rdi, %rbp -; CHECK-NEXT: shlq $4, %rbp -; CHECK-NEXT: andq %r14, %r9 -; CHECK-NEXT: shrq $4, %r9 -; CHECK-NEXT: orq %rbp, %r9 -; CHECK-NEXT: movq %r9, %rbp -; CHECK-NEXT: andq %r11, %rbp -; CHECK-NEXT: andq %r15, %r9 +; CHECK-NEXT: movq %r9, %rax +; CHECK-NEXT: shrq $4, %rax +; CHECK-NEXT: andq %rsi, %rax +; CHECK-NEXT: andq %rsi, %r9 +; CHECK-NEXT: shlq $4, %r9 +; CHECK-NEXT: orq %rax, %r9 +; CHECK-NEXT: movq %r9, %rax +; CHECK-NEXT: andq %rdi, %rax ; CHECK-NEXT: shrq $2, %r9 -; CHECK-NEXT: leaq (%r9,%rbp,4), %rbp -; CHECK-NEXT: 
movq %rbp, %r9 +; CHECK-NEXT: andq %rdi, %r9 +; CHECK-NEXT: leaq (%r9,%rax,4), %rax +; CHECK-NEXT: movq %rax, %r9 ; CHECK-NEXT: andq %rbx, %r9 -; CHECK-NEXT: andq %r13, %rbp -; CHECK-NEXT: shrq %rbp -; CHECK-NEXT: leaq (%rbp,%r9,2), %rbp -; CHECK-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: shrq %rax +; CHECK-NEXT: andq %rbx, %rax +; CHECK-NEXT: leaq (%rax,%r9,2), %rax +; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; CHECK-NEXT: bswapq %r8 -; CHECK-NEXT: movq %r8, %rbp -; CHECK-NEXT: andq %rdi, %rbp -; CHECK-NEXT: shlq $4, %rbp -; CHECK-NEXT: andq %r14, %r8 -; CHECK-NEXT: shrq $4, %r8 -; CHECK-NEXT: orq %rbp, %r8 -; CHECK-NEXT: movq %r8, %rbp -; CHECK-NEXT: andq %r11, %rbp -; CHECK-NEXT: andq %r15, %r8 -; CHECK-NEXT: movq %r15, %r9 +; CHECK-NEXT: movq %r8, %rax +; CHECK-NEXT: shrq $4, %rax +; CHECK-NEXT: andq %rsi, %rax +; CHECK-NEXT: andq %rsi, %r8 +; CHECK-NEXT: shlq $4, %r8 +; CHECK-NEXT: orq %rax, %r8 +; CHECK-NEXT: movq %r8, %rax +; CHECK-NEXT: andq %rdi, %rax ; CHECK-NEXT: shrq $2, %r8 -; CHECK-NEXT: leaq (%r8,%rbp,4), %rbp -; CHECK-NEXT: movq %rbp, %r8 +; CHECK-NEXT: andq %rdi, %r8 +; CHECK-NEXT: leaq (%r8,%rax,4), %rax +; CHECK-NEXT: movq %rax, %r8 ; CHECK-NEXT: andq %rbx, %r8 -; CHECK-NEXT: andq %r13, %rbp -; CHECK-NEXT: shrq %rbp -; CHECK-NEXT: leaq (%rbp,%r8,2), %rbp -; CHECK-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: shrq %rax +; CHECK-NEXT: andq %rbx, %rax +; CHECK-NEXT: leaq (%rax,%r8,2), %rax +; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; CHECK-NEXT: bswapq %rcx -; CHECK-NEXT: movq %rcx, %rbp -; CHECK-NEXT: andq %rdi, %rbp -; CHECK-NEXT: shlq $4, %rbp -; CHECK-NEXT: andq %r14, %rcx -; CHECK-NEXT: shrq $4, %rcx -; CHECK-NEXT: orq %rbp, %rcx -; CHECK-NEXT: movq %rcx, %rbp -; CHECK-NEXT: andq %r11, %rbp -; CHECK-NEXT: andq %r15, %rcx +; CHECK-NEXT: movq %rcx, %rax +; CHECK-NEXT: shrq $4, %rax +; CHECK-NEXT: andq %rsi, %rax +; CHECK-NEXT: andq %rsi, %rcx +; CHECK-NEXT: shlq $4, %rcx +; CHECK-NEXT: orq %rax, %rcx +; CHECK-NEXT: movq %rcx, %rax +; CHECK-NEXT: andq %rdi, %rax ; CHECK-NEXT: shrq $2, %rcx -; CHECK-NEXT: leaq (%rcx,%rbp,4), %rcx -; CHECK-NEXT: movq %rcx, %rbp -; CHECK-NEXT: andq %rbx, %rbp -; CHECK-NEXT: andq %r13, %rcx -; CHECK-NEXT: shrq %rcx -; CHECK-NEXT: leaq (%rcx,%rbp,2), %r15 +; CHECK-NEXT: andq %rdi, %rcx +; CHECK-NEXT: leaq (%rcx,%rax,4), %rax +; CHECK-NEXT: movq %rax, %rcx +; CHECK-NEXT: andq %rbx, %rcx +; CHECK-NEXT: shrq %rax +; CHECK-NEXT: andq %rbx, %rax +; CHECK-NEXT: leaq (%rax,%rcx,2), %r12 ; CHECK-NEXT: bswapq %rdx -; CHECK-NEXT: movq %rdx, %rbp -; CHECK-NEXT: andq %rdi, %rbp -; CHECK-NEXT: shlq $4, %rbp -; CHECK-NEXT: andq %r14, %rdx -; CHECK-NEXT: shrq $4, %rdx -; CHECK-NEXT: orq %rbp, %rdx -; CHECK-NEXT: movq %rdx, %rbp -; CHECK-NEXT: andq %r11, %rbp -; CHECK-NEXT: andq %r9, %rdx +; CHECK-NEXT: movq %rdx, %rax +; CHECK-NEXT: shrq $4, %rax +; CHECK-NEXT: andq %rsi, %rax +; CHECK-NEXT: andq %rsi, %rdx +; CHECK-NEXT: shlq $4, %rdx +; CHECK-NEXT: orq %rax, %rdx +; CHECK-NEXT: movq %rdx, %rax +; CHECK-NEXT: andq %rdi, %rax ; CHECK-NEXT: shrq $2, %rdx -; CHECK-NEXT: leaq (%rdx,%rbp,4), %rdx -; CHECK-NEXT: movq %rdx, %rbp -; CHECK-NEXT: andq %rbx, %rbp -; CHECK-NEXT: andq %r13, %rdx -; CHECK-NEXT: shrq %rdx -; CHECK-NEXT: leaq (%rdx,%rbp,2), %rdx -; CHECK-NEXT: bswapq %rsi -; CHECK-NEXT: andq %rsi, %rdi -; CHECK-NEXT: andq %r14, %rsi -; CHECK-NEXT: shlq $4, %rdi -; CHECK-NEXT: shrq $4, %rsi -; CHECK-NEXT: orq %rdi, %rsi -; CHECK-NEXT: andq %rsi, %r11 -; 
CHECK-NEXT: andq %r9, %rsi -; CHECK-NEXT: shrq $2, %rsi -; CHECK-NEXT: leaq (%rsi,%r11,4), %rsi -; CHECK-NEXT: andq %rsi, %rbx -; CHECK-NEXT: andq %r13, %rsi -; CHECK-NEXT: shrq %rsi -; CHECK-NEXT: leaq (%rsi,%rbx,2), %r13 -; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload +; CHECK-NEXT: andq %rdi, %rdx +; CHECK-NEXT: leaq (%rdx,%rax,4), %rax +; CHECK-NEXT: movq %rax, %rdx +; CHECK-NEXT: andq %rbx, %rdx +; CHECK-NEXT: shrq %rax +; CHECK-NEXT: andq %rbx, %rax +; CHECK-NEXT: leaq (%rax,%rdx,2), %rdi ; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; CHECK-NEXT: shrdq $24, %rax, %r11 +; CHECK-NEXT: bswapq %rax +; CHECK-NEXT: movq %rax, %rcx +; CHECK-NEXT: shrq $4, %rcx +; CHECK-NEXT: andq %rsi, %rcx +; CHECK-NEXT: andq %rsi, %rax +; CHECK-NEXT: shlq $4, %rax +; CHECK-NEXT: orq %rcx, %rax +; CHECK-NEXT: movq %rax, %rcx +; CHECK-NEXT: andq %rbp, %rcx +; CHECK-NEXT: shrq $2, %rax +; CHECK-NEXT: andq %rbp, %rax +; CHECK-NEXT: leaq (%rax,%rcx,4), %rax +; CHECK-NEXT: movq %rax, %rsi +; CHECK-NEXT: andq %rbx, %rsi +; CHECK-NEXT: shrq %rax +; CHECK-NEXT: andq %rbx, %rax +; CHECK-NEXT: leaq (%rax,%rsi,2), %rsi +; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload +; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; CHECK-NEXT: shrdq $24, %rax, %rdx ; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload ; CHECK-NEXT: shrdq $24, %rcx, %rax ; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload ; CHECK-NEXT: shrdq $24, %rbp, %rcx ; CHECK-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload -; CHECK-NEXT: shrdq $24, %r12, %rbp +; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload +; CHECK-NEXT: shrdq $24, %r13, %rbp +; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload +; CHECK-NEXT: shrdq $24, %r15, %r13 ; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload -; CHECK-NEXT: shrdq $24, %r14, %r12 +; CHECK-NEXT: shrdq $24, %r14, %r15 ; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload ; CHECK-NEXT: shrdq $24, %rbx, %r14 +; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload +; CHECK-NEXT: shrdq $24, %r11, %rbx ; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload -; CHECK-NEXT: shrdq $24, %r10, %rbx +; CHECK-NEXT: shrdq $24, %r10, %r11 ; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload ; CHECK-NEXT: shrdq $24, %r9, %r10 ; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload ; CHECK-NEXT: shrdq $24, %r8, %r9 -; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload -; CHECK-NEXT: shrdq $24, %rdi, %r8 -; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload +; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; CHECK-NEXT: shrdq $24, %rax, %r8 +; CHECK-NEXT: shrdq $24, %r12, %rax +; CHECK-NEXT: movq %rax, %rcx +; CHECK-NEXT: shrdq $24, %rdi, %r12 ; CHECK-NEXT: shrdq $24, %rsi, %rdi ; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; CHECK-NEXT: shrdq $24, %rax, %rsi -; CHECK-NEXT: shrdq $24, %r15, %rax -; CHECK-NEXT: movq %rax, %rcx -; CHECK-NEXT: shrdq $24, %rdx, %r15 -; CHECK-NEXT: shrdq $24, %r13, %rdx -; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; CHECK-NEXT: movq %rdx, 112(%rax) -; CHECK-NEXT: movq %r15, 104(%rax) +; CHECK-NEXT: movq %rdi, 112(%rax) +; CHECK-NEXT: movq %r12, 104(%rax) ; CHECK-NEXT: movq %rcx, 
96(%rax) -; CHECK-NEXT: movq %rsi, 88(%rax) -; CHECK-NEXT: movq %rdi, 80(%rax) -; CHECK-NEXT: movq %r8, 72(%rax) -; CHECK-NEXT: movq %r9, 64(%rax) -; CHECK-NEXT: movq %r10, 56(%rax) -; CHECK-NEXT: movq %rbx, 48(%rax) -; CHECK-NEXT: movq %r14, 40(%rax) -; CHECK-NEXT: movq %r12, 32(%rax) +; CHECK-NEXT: movq %r8, 88(%rax) +; CHECK-NEXT: movq %r9, 80(%rax) +; CHECK-NEXT: movq %r10, 72(%rax) +; CHECK-NEXT: movq %r11, 64(%rax) +; CHECK-NEXT: movq %rbx, 56(%rax) +; CHECK-NEXT: movq %r14, 48(%rax) +; CHECK-NEXT: movq %r15, 40(%rax) +; CHECK-NEXT: movq %r13, 32(%rax) ; CHECK-NEXT: movq %rbp, 24(%rax) ; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload ; CHECK-NEXT: movq %rcx, 16(%rax) ; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload ; CHECK-NEXT: movq %rcx, 8(%rax) -; CHECK-NEXT: movq %r11, (%rax) -; CHECK-NEXT: movq %r13, %rcx -; CHECK-NEXT: shrq $56, %r13 -; CHECK-NEXT: movb %r13b, 124(%rax) +; CHECK-NEXT: movq %rdx, (%rax) +; CHECK-NEXT: movq %rsi, %rcx +; CHECK-NEXT: shrq $56, %rsi +; CHECK-NEXT: movb %sil, 124(%rax) ; CHECK-NEXT: shrq $24, %rcx ; CHECK-NEXT: movl %ecx, 120(%rax) ; CHECK-NEXT: popq %rbx diff --git a/llvm/test/CodeGen/X86/vector-bitreverse.ll b/llvm/test/CodeGen/X86/vector-bitreverse.ll index 651418d271be..3555312b18a1 100644 --- a/llvm/test/CodeGen/X86/vector-bitreverse.ll +++ b/llvm/test/CodeGen/X86/vector-bitreverse.ll @@ -24,14 +24,14 @@ define i8 @test_bitreverse_i8(i8 %a) nounwind { ; SSE-NEXT: movl %edi, %eax ; SSE-NEXT: andb $51, %al ; SSE-NEXT: shlb $2, %al -; SSE-NEXT: andb $-52, %dil ; SSE-NEXT: shrb $2, %dil +; SSE-NEXT: andb $51, %dil ; SSE-NEXT: orb %al, %dil ; SSE-NEXT: movl %edi, %eax ; SSE-NEXT: andb $85, %al ; SSE-NEXT: addb %al, %al -; SSE-NEXT: andb $-86, %dil ; SSE-NEXT: shrb %dil +; SSE-NEXT: andb $85, %dil ; SSE-NEXT: addl %edi, %eax ; SSE-NEXT: # kill: def $al killed $al killed $eax ; SSE-NEXT: retq @@ -43,14 +43,14 @@ define i8 @test_bitreverse_i8(i8 %a) nounwind { ; AVX-NEXT: movl %edi, %eax ; AVX-NEXT: andb $51, %al ; AVX-NEXT: shlb $2, %al -; AVX-NEXT: andb $-52, %dil ; AVX-NEXT: shrb $2, %dil +; AVX-NEXT: andb $51, %dil ; AVX-NEXT: orb %al, %dil ; AVX-NEXT: movl %edi, %eax ; AVX-NEXT: andb $85, %al ; AVX-NEXT: addb %al, %al -; AVX-NEXT: andb $-86, %dil ; AVX-NEXT: shrb %dil +; AVX-NEXT: andb $85, %dil ; AVX-NEXT: addl %edi, %eax ; AVX-NEXT: # kill: def $al killed $al killed $eax ; AVX-NEXT: retq @@ -70,14 +70,14 @@ define i8 @test_bitreverse_i8(i8 %a) nounwind { ; GFNISSE-NEXT: movl %edi, %eax ; GFNISSE-NEXT: andb $51, %al ; GFNISSE-NEXT: shlb $2, %al -; GFNISSE-NEXT: andb $-52, %dil ; GFNISSE-NEXT: shrb $2, %dil +; GFNISSE-NEXT: andb $51, %dil ; GFNISSE-NEXT: orb %al, %dil ; GFNISSE-NEXT: movl %edi, %eax ; GFNISSE-NEXT: andb $85, %al ; GFNISSE-NEXT: addb %al, %al -; GFNISSE-NEXT: andb $-86, %dil ; GFNISSE-NEXT: shrb %dil +; GFNISSE-NEXT: andb $85, %dil ; GFNISSE-NEXT: addl %edi, %eax ; GFNISSE-NEXT: # kill: def $al killed $al killed $eax ; GFNISSE-NEXT: retq @@ -89,14 +89,14 @@ define i8 @test_bitreverse_i8(i8 %a) nounwind { ; GFNIAVX-NEXT: movl %edi, %eax ; GFNIAVX-NEXT: andb $51, %al ; GFNIAVX-NEXT: shlb $2, %al -; GFNIAVX-NEXT: andb $-52, %dil ; GFNIAVX-NEXT: shrb $2, %dil +; GFNIAVX-NEXT: andb $51, %dil ; GFNIAVX-NEXT: orb %al, %dil ; GFNIAVX-NEXT: movl %edi, %eax ; GFNIAVX-NEXT: andb $85, %al ; GFNIAVX-NEXT: addb %al, %al -; GFNIAVX-NEXT: andb $-86, %dil ; GFNIAVX-NEXT: shrb %dil +; GFNIAVX-NEXT: andb $85, %dil ; GFNIAVX-NEXT: addl %edi, %eax ; GFNIAVX-NEXT: # kill: def $al killed $al killed $eax ; GFNIAVX-NEXT: 
retq @@ -108,14 +108,14 @@ define i8 @test_bitreverse_i8(i8 %a) nounwind { ; GFNIAVX2-NEXT: movl %edi, %eax ; GFNIAVX2-NEXT: andb $51, %al ; GFNIAVX2-NEXT: shlb $2, %al -; GFNIAVX2-NEXT: andb $-52, %dil ; GFNIAVX2-NEXT: shrb $2, %dil +; GFNIAVX2-NEXT: andb $51, %dil ; GFNIAVX2-NEXT: orb %al, %dil ; GFNIAVX2-NEXT: movl %edi, %eax ; GFNIAVX2-NEXT: andb $85, %al ; GFNIAVX2-NEXT: addb %al, %al -; GFNIAVX2-NEXT: andb $-86, %dil ; GFNIAVX2-NEXT: shrb %dil +; GFNIAVX2-NEXT: andb $85, %dil ; GFNIAVX2-NEXT: addl %edi, %eax ; GFNIAVX2-NEXT: # kill: def $al killed $al killed $eax ; GFNIAVX2-NEXT: retq @@ -127,14 +127,14 @@ define i8 @test_bitreverse_i8(i8 %a) nounwind { ; GFNIAVX512F-NEXT: movl %edi, %eax ; GFNIAVX512F-NEXT: andb $51, %al ; GFNIAVX512F-NEXT: shlb $2, %al -; GFNIAVX512F-NEXT: andb $-52, %dil ; GFNIAVX512F-NEXT: shrb $2, %dil +; GFNIAVX512F-NEXT: andb $51, %dil ; GFNIAVX512F-NEXT: orb %al, %dil ; GFNIAVX512F-NEXT: movl %edi, %eax ; GFNIAVX512F-NEXT: andb $85, %al ; GFNIAVX512F-NEXT: addb %al, %al -; GFNIAVX512F-NEXT: andb $-86, %dil ; GFNIAVX512F-NEXT: shrb %dil +; GFNIAVX512F-NEXT: andb $85, %dil ; GFNIAVX512F-NEXT: addl %edi, %eax ; GFNIAVX512F-NEXT: # kill: def $al killed $al killed $eax ; GFNIAVX512F-NEXT: retq @@ -146,14 +146,14 @@ define i8 @test_bitreverse_i8(i8 %a) nounwind { ; GFNIAVX512BW-NEXT: movl %edi, %eax ; GFNIAVX512BW-NEXT: andb $51, %al ; GFNIAVX512BW-NEXT: shlb $2, %al -; GFNIAVX512BW-NEXT: andb $-52, %dil ; GFNIAVX512BW-NEXT: shrb $2, %dil +; GFNIAVX512BW-NEXT: andb $51, %dil ; GFNIAVX512BW-NEXT: orb %al, %dil ; GFNIAVX512BW-NEXT: movl %edi, %eax ; GFNIAVX512BW-NEXT: andb $85, %al ; GFNIAVX512BW-NEXT: addb %al, %al -; GFNIAVX512BW-NEXT: andb $-86, %dil ; GFNIAVX512BW-NEXT: shrb %dil +; GFNIAVX512BW-NEXT: andb $85, %dil ; GFNIAVX512BW-NEXT: addl %edi, %eax ; GFNIAVX512BW-NEXT: # kill: def $al killed $al killed $eax ; GFNIAVX512BW-NEXT: retq @@ -169,18 +169,18 @@ define i16 @test_bitreverse_i16(i16 %a) nounwind { ; SSE-NEXT: movl %edi, %eax ; SSE-NEXT: andl $3855, %eax # imm = 0xF0F ; SSE-NEXT: shll $4, %eax -; SSE-NEXT: andl $61680, %edi # imm = 0xF0F0 ; SSE-NEXT: shrl $4, %edi +; SSE-NEXT: andl $3855, %edi # imm = 0xF0F ; SSE-NEXT: orl %eax, %edi ; SSE-NEXT: movl %edi, %eax ; SSE-NEXT: andl $13107, %eax # imm = 0x3333 -; SSE-NEXT: andl $52428, %edi # imm = 0xCCCC ; SSE-NEXT: shrl $2, %edi +; SSE-NEXT: andl $13107, %edi # imm = 0x3333 ; SSE-NEXT: leal (%rdi,%rax,4), %eax ; SSE-NEXT: movl %eax, %ecx ; SSE-NEXT: andl $21845, %ecx # imm = 0x5555 -; SSE-NEXT: andl $43690, %eax # imm = 0xAAAA ; SSE-NEXT: shrl %eax +; SSE-NEXT: andl $21845, %eax # imm = 0x5555 ; SSE-NEXT: leal (%rax,%rcx,2), %eax ; SSE-NEXT: # kill: def $ax killed $ax killed $eax ; SSE-NEXT: retq @@ -192,18 +192,18 @@ define i16 @test_bitreverse_i16(i16 %a) nounwind { ; AVX-NEXT: movl %edi, %eax ; AVX-NEXT: andl $3855, %eax # imm = 0xF0F ; AVX-NEXT: shll $4, %eax -; AVX-NEXT: andl $61680, %edi # imm = 0xF0F0 ; AVX-NEXT: shrl $4, %edi +; AVX-NEXT: andl $3855, %edi # imm = 0xF0F ; AVX-NEXT: orl %eax, %edi ; AVX-NEXT: movl %edi, %eax ; AVX-NEXT: andl $13107, %eax # imm = 0x3333 -; AVX-NEXT: andl $52428, %edi # imm = 0xCCCC ; AVX-NEXT: shrl $2, %edi +; AVX-NEXT: andl $13107, %edi # imm = 0x3333 ; AVX-NEXT: leal (%rdi,%rax,4), %eax ; AVX-NEXT: movl %eax, %ecx ; AVX-NEXT: andl $21845, %ecx # imm = 0x5555 -; AVX-NEXT: andl $43690, %eax # imm = 0xAAAA ; AVX-NEXT: shrl %eax +; AVX-NEXT: andl $21845, %eax # imm = 0x5555 ; AVX-NEXT: leal (%rax,%rcx,2), %eax ; AVX-NEXT: # kill: def $ax killed $ax killed $eax ; 
AVX-NEXT: retq @@ -223,18 +223,18 @@ define i16 @test_bitreverse_i16(i16 %a) nounwind { ; GFNISSE-NEXT: movl %edi, %eax ; GFNISSE-NEXT: andl $3855, %eax # imm = 0xF0F ; GFNISSE-NEXT: shll $4, %eax -; GFNISSE-NEXT: andl $61680, %edi # imm = 0xF0F0 ; GFNISSE-NEXT: shrl $4, %edi +; GFNISSE-NEXT: andl $3855, %edi # imm = 0xF0F ; GFNISSE-NEXT: orl %eax, %edi ; GFNISSE-NEXT: movl %edi, %eax ; GFNISSE-NEXT: andl $13107, %eax # imm = 0x3333 -; GFNISSE-NEXT: andl $52428, %edi # imm = 0xCCCC ; GFNISSE-NEXT: shrl $2, %edi +; GFNISSE-NEXT: andl $13107, %edi # imm = 0x3333 ; GFNISSE-NEXT: leal (%rdi,%rax,4), %eax ; GFNISSE-NEXT: movl %eax, %ecx ; GFNISSE-NEXT: andl $21845, %ecx # imm = 0x5555 -; GFNISSE-NEXT: andl $43690, %eax # imm = 0xAAAA ; GFNISSE-NEXT: shrl %eax +; GFNISSE-NEXT: andl $21845, %eax # imm = 0x5555 ; GFNISSE-NEXT: leal (%rax,%rcx,2), %eax ; GFNISSE-NEXT: # kill: def $ax killed $ax killed $eax ; GFNISSE-NEXT: retq @@ -246,18 +246,18 @@ define i16 @test_bitreverse_i16(i16 %a) nounwind { ; GFNIAVX-NEXT: movl %edi, %eax ; GFNIAVX-NEXT: andl $3855, %eax # imm = 0xF0F ; GFNIAVX-NEXT: shll $4, %eax -; GFNIAVX-NEXT: andl $61680, %edi # imm = 0xF0F0 ; GFNIAVX-NEXT: shrl $4, %edi +; GFNIAVX-NEXT: andl $3855, %edi # imm = 0xF0F ; GFNIAVX-NEXT: orl %eax, %edi ; GFNIAVX-NEXT: movl %edi, %eax ; GFNIAVX-NEXT: andl $13107, %eax # imm = 0x3333 -; GFNIAVX-NEXT: andl $52428, %edi # imm = 0xCCCC ; GFNIAVX-NEXT: shrl $2, %edi +; GFNIAVX-NEXT: andl $13107, %edi # imm = 0x3333 ; GFNIAVX-NEXT: leal (%rdi,%rax,4), %eax ; GFNIAVX-NEXT: movl %eax, %ecx ; GFNIAVX-NEXT: andl $21845, %ecx # imm = 0x5555 -; GFNIAVX-NEXT: andl $43690, %eax # imm = 0xAAAA ; GFNIAVX-NEXT: shrl %eax +; GFNIAVX-NEXT: andl $21845, %eax # imm = 0x5555 ; GFNIAVX-NEXT: leal (%rax,%rcx,2), %eax ; GFNIAVX-NEXT: # kill: def $ax killed $ax killed $eax ; GFNIAVX-NEXT: retq @@ -269,18 +269,18 @@ define i16 @test_bitreverse_i16(i16 %a) nounwind { ; GFNIAVX2-NEXT: movl %edi, %eax ; GFNIAVX2-NEXT: andl $3855, %eax # imm = 0xF0F ; GFNIAVX2-NEXT: shll $4, %eax -; GFNIAVX2-NEXT: andl $61680, %edi # imm = 0xF0F0 ; GFNIAVX2-NEXT: shrl $4, %edi +; GFNIAVX2-NEXT: andl $3855, %edi # imm = 0xF0F ; GFNIAVX2-NEXT: orl %eax, %edi ; GFNIAVX2-NEXT: movl %edi, %eax ; GFNIAVX2-NEXT: andl $13107, %eax # imm = 0x3333 -; GFNIAVX2-NEXT: andl $52428, %edi # imm = 0xCCCC ; GFNIAVX2-NEXT: shrl $2, %edi +; GFNIAVX2-NEXT: andl $13107, %edi # imm = 0x3333 ; GFNIAVX2-NEXT: leal (%rdi,%rax,4), %eax ; GFNIAVX2-NEXT: movl %eax, %ecx ; GFNIAVX2-NEXT: andl $21845, %ecx # imm = 0x5555 -; GFNIAVX2-NEXT: andl $43690, %eax # imm = 0xAAAA ; GFNIAVX2-NEXT: shrl %eax +; GFNIAVX2-NEXT: andl $21845, %eax # imm = 0x5555 ; GFNIAVX2-NEXT: leal (%rax,%rcx,2), %eax ; GFNIAVX2-NEXT: # kill: def $ax killed $ax killed $eax ; GFNIAVX2-NEXT: retq @@ -292,18 +292,18 @@ define i16 @test_bitreverse_i16(i16 %a) nounwind { ; GFNIAVX512F-NEXT: movl %edi, %eax ; GFNIAVX512F-NEXT: andl $3855, %eax # imm = 0xF0F ; GFNIAVX512F-NEXT: shll $4, %eax -; GFNIAVX512F-NEXT: andl $61680, %edi # imm = 0xF0F0 ; GFNIAVX512F-NEXT: shrl $4, %edi +; GFNIAVX512F-NEXT: andl $3855, %edi # imm = 0xF0F ; GFNIAVX512F-NEXT: orl %eax, %edi ; GFNIAVX512F-NEXT: movl %edi, %eax ; GFNIAVX512F-NEXT: andl $13107, %eax # imm = 0x3333 -; GFNIAVX512F-NEXT: andl $52428, %edi # imm = 0xCCCC ; GFNIAVX512F-NEXT: shrl $2, %edi +; GFNIAVX512F-NEXT: andl $13107, %edi # imm = 0x3333 ; GFNIAVX512F-NEXT: leal (%rdi,%rax,4), %eax ; GFNIAVX512F-NEXT: movl %eax, %ecx ; GFNIAVX512F-NEXT: andl $21845, %ecx # imm = 0x5555 -; GFNIAVX512F-NEXT: andl $43690, 
%eax # imm = 0xAAAA ; GFNIAVX512F-NEXT: shrl %eax +; GFNIAVX512F-NEXT: andl $21845, %eax # imm = 0x5555 ; GFNIAVX512F-NEXT: leal (%rax,%rcx,2), %eax ; GFNIAVX512F-NEXT: # kill: def $ax killed $ax killed $eax ; GFNIAVX512F-NEXT: retq @@ -315,18 +315,18 @@ define i16 @test_bitreverse_i16(i16 %a) nounwind { ; GFNIAVX512BW-NEXT: movl %edi, %eax ; GFNIAVX512BW-NEXT: andl $3855, %eax # imm = 0xF0F ; GFNIAVX512BW-NEXT: shll $4, %eax -; GFNIAVX512BW-NEXT: andl $61680, %edi # imm = 0xF0F0 ; GFNIAVX512BW-NEXT: shrl $4, %edi +; GFNIAVX512BW-NEXT: andl $3855, %edi # imm = 0xF0F ; GFNIAVX512BW-NEXT: orl %eax, %edi ; GFNIAVX512BW-NEXT: movl %edi, %eax ; GFNIAVX512BW-NEXT: andl $13107, %eax # imm = 0x3333 -; GFNIAVX512BW-NEXT: andl $52428, %edi # imm = 0xCCCC ; GFNIAVX512BW-NEXT: shrl $2, %edi +; GFNIAVX512BW-NEXT: andl $13107, %edi # imm = 0x3333 ; GFNIAVX512BW-NEXT: leal (%rdi,%rax,4), %eax ; GFNIAVX512BW-NEXT: movl %eax, %ecx ; GFNIAVX512BW-NEXT: andl $21845, %ecx # imm = 0x5555 -; GFNIAVX512BW-NEXT: andl $43690, %eax # imm = 0xAAAA ; GFNIAVX512BW-NEXT: shrl %eax +; GFNIAVX512BW-NEXT: andl $21845, %eax # imm = 0x5555 ; GFNIAVX512BW-NEXT: leal (%rax,%rcx,2), %eax ; GFNIAVX512BW-NEXT: # kill: def $ax killed $ax killed $eax ; GFNIAVX512BW-NEXT: retq @@ -342,18 +342,18 @@ define i32 @test_bitreverse_i32(i32 %a) nounwind { ; SSE-NEXT: movl %edi, %eax ; SSE-NEXT: andl $252645135, %eax # imm = 0xF0F0F0F ; SSE-NEXT: shll $4, %eax -; SSE-NEXT: andl $-252645136, %edi # imm = 0xF0F0F0F0 ; SSE-NEXT: shrl $4, %edi +; SSE-NEXT: andl $252645135, %edi # imm = 0xF0F0F0F ; SSE-NEXT: orl %eax, %edi ; SSE-NEXT: movl %edi, %eax ; SSE-NEXT: andl $858993459, %eax # imm = 0x33333333 -; SSE-NEXT: andl $-858993460, %edi # imm = 0xCCCCCCCC ; SSE-NEXT: shrl $2, %edi +; SSE-NEXT: andl $858993459, %edi # imm = 0x33333333 ; SSE-NEXT: leal (%rdi,%rax,4), %eax ; SSE-NEXT: movl %eax, %ecx ; SSE-NEXT: andl $1431655765, %ecx # imm = 0x55555555 -; SSE-NEXT: andl $-1431655766, %eax # imm = 0xAAAAAAAA ; SSE-NEXT: shrl %eax +; SSE-NEXT: andl $1431655765, %eax # imm = 0x55555555 ; SSE-NEXT: leal (%rax,%rcx,2), %eax ; SSE-NEXT: retq ; @@ -364,18 +364,18 @@ define i32 @test_bitreverse_i32(i32 %a) nounwind { ; AVX-NEXT: movl %edi, %eax ; AVX-NEXT: andl $252645135, %eax # imm = 0xF0F0F0F ; AVX-NEXT: shll $4, %eax -; AVX-NEXT: andl $-252645136, %edi # imm = 0xF0F0F0F0 ; AVX-NEXT: shrl $4, %edi +; AVX-NEXT: andl $252645135, %edi # imm = 0xF0F0F0F ; AVX-NEXT: orl %eax, %edi ; AVX-NEXT: movl %edi, %eax ; AVX-NEXT: andl $858993459, %eax # imm = 0x33333333 -; AVX-NEXT: andl $-858993460, %edi # imm = 0xCCCCCCCC ; AVX-NEXT: shrl $2, %edi +; AVX-NEXT: andl $858993459, %edi # imm = 0x33333333 ; AVX-NEXT: leal (%rdi,%rax,4), %eax ; AVX-NEXT: movl %eax, %ecx ; AVX-NEXT: andl $1431655765, %ecx # imm = 0x55555555 -; AVX-NEXT: andl $-1431655766, %eax # imm = 0xAAAAAAAA ; AVX-NEXT: shrl %eax +; AVX-NEXT: andl $1431655765, %eax # imm = 0x55555555 ; AVX-NEXT: leal (%rax,%rcx,2), %eax ; AVX-NEXT: retq ; @@ -393,18 +393,18 @@ define i32 @test_bitreverse_i32(i32 %a) nounwind { ; GFNISSE-NEXT: movl %edi, %eax ; GFNISSE-NEXT: andl $252645135, %eax # imm = 0xF0F0F0F ; GFNISSE-NEXT: shll $4, %eax -; GFNISSE-NEXT: andl $-252645136, %edi # imm = 0xF0F0F0F0 ; GFNISSE-NEXT: shrl $4, %edi +; GFNISSE-NEXT: andl $252645135, %edi # imm = 0xF0F0F0F ; GFNISSE-NEXT: orl %eax, %edi ; GFNISSE-NEXT: movl %edi, %eax ; GFNISSE-NEXT: andl $858993459, %eax # imm = 0x33333333 -; GFNISSE-NEXT: andl $-858993460, %edi # imm = 0xCCCCCCCC ; GFNISSE-NEXT: shrl $2, %edi +; GFNISSE-NEXT: andl 
$858993459, %edi # imm = 0x33333333 ; GFNISSE-NEXT: leal (%rdi,%rax,4), %eax ; GFNISSE-NEXT: movl %eax, %ecx ; GFNISSE-NEXT: andl $1431655765, %ecx # imm = 0x55555555 -; GFNISSE-NEXT: andl $-1431655766, %eax # imm = 0xAAAAAAAA ; GFNISSE-NEXT: shrl %eax +; GFNISSE-NEXT: andl $1431655765, %eax # imm = 0x55555555 ; GFNISSE-NEXT: leal (%rax,%rcx,2), %eax ; GFNISSE-NEXT: retq ; @@ -415,18 +415,18 @@ define i32 @test_bitreverse_i32(i32 %a) nounwind { ; GFNIAVX-NEXT: movl %edi, %eax ; GFNIAVX-NEXT: andl $252645135, %eax # imm = 0xF0F0F0F ; GFNIAVX-NEXT: shll $4, %eax -; GFNIAVX-NEXT: andl $-252645136, %edi # imm = 0xF0F0F0F0 ; GFNIAVX-NEXT: shrl $4, %edi +; GFNIAVX-NEXT: andl $252645135, %edi # imm = 0xF0F0F0F ; GFNIAVX-NEXT: orl %eax, %edi ; GFNIAVX-NEXT: movl %edi, %eax ; GFNIAVX-NEXT: andl $858993459, %eax # imm = 0x33333333 -; GFNIAVX-NEXT: andl $-858993460, %edi # imm = 0xCCCCCCCC ; GFNIAVX-NEXT: shrl $2, %edi +; GFNIAVX-NEXT: andl $858993459, %edi # imm = 0x33333333 ; GFNIAVX-NEXT: leal (%rdi,%rax,4), %eax ; GFNIAVX-NEXT: movl %eax, %ecx ; GFNIAVX-NEXT: andl $1431655765, %ecx # imm = 0x55555555 -; GFNIAVX-NEXT: andl $-1431655766, %eax # imm = 0xAAAAAAAA ; GFNIAVX-NEXT: shrl %eax +; GFNIAVX-NEXT: andl $1431655765, %eax # imm = 0x55555555 ; GFNIAVX-NEXT: leal (%rax,%rcx,2), %eax ; GFNIAVX-NEXT: retq ; @@ -437,18 +437,18 @@ define i32 @test_bitreverse_i32(i32 %a) nounwind { ; GFNIAVX2-NEXT: movl %edi, %eax ; GFNIAVX2-NEXT: andl $252645135, %eax # imm = 0xF0F0F0F ; GFNIAVX2-NEXT: shll $4, %eax -; GFNIAVX2-NEXT: andl $-252645136, %edi # imm = 0xF0F0F0F0 ; GFNIAVX2-NEXT: shrl $4, %edi +; GFNIAVX2-NEXT: andl $252645135, %edi # imm = 0xF0F0F0F ; GFNIAVX2-NEXT: orl %eax, %edi ; GFNIAVX2-NEXT: movl %edi, %eax ; GFNIAVX2-NEXT: andl $858993459, %eax # imm = 0x33333333 -; GFNIAVX2-NEXT: andl $-858993460, %edi # imm = 0xCCCCCCCC ; GFNIAVX2-NEXT: shrl $2, %edi +; GFNIAVX2-NEXT: andl $858993459, %edi # imm = 0x33333333 ; GFNIAVX2-NEXT: leal (%rdi,%rax,4), %eax ; GFNIAVX2-NEXT: movl %eax, %ecx ; GFNIAVX2-NEXT: andl $1431655765, %ecx # imm = 0x55555555 -; GFNIAVX2-NEXT: andl $-1431655766, %eax # imm = 0xAAAAAAAA ; GFNIAVX2-NEXT: shrl %eax +; GFNIAVX2-NEXT: andl $1431655765, %eax # imm = 0x55555555 ; GFNIAVX2-NEXT: leal (%rax,%rcx,2), %eax ; GFNIAVX2-NEXT: retq ; @@ -459,18 +459,18 @@ define i32 @test_bitreverse_i32(i32 %a) nounwind { ; GFNIAVX512F-NEXT: movl %edi, %eax ; GFNIAVX512F-NEXT: andl $252645135, %eax # imm = 0xF0F0F0F ; GFNIAVX512F-NEXT: shll $4, %eax -; GFNIAVX512F-NEXT: andl $-252645136, %edi # imm = 0xF0F0F0F0 ; GFNIAVX512F-NEXT: shrl $4, %edi +; GFNIAVX512F-NEXT: andl $252645135, %edi # imm = 0xF0F0F0F ; GFNIAVX512F-NEXT: orl %eax, %edi ; GFNIAVX512F-NEXT: movl %edi, %eax ; GFNIAVX512F-NEXT: andl $858993459, %eax # imm = 0x33333333 -; GFNIAVX512F-NEXT: andl $-858993460, %edi # imm = 0xCCCCCCCC ; GFNIAVX512F-NEXT: shrl $2, %edi +; GFNIAVX512F-NEXT: andl $858993459, %edi # imm = 0x33333333 ; GFNIAVX512F-NEXT: leal (%rdi,%rax,4), %eax ; GFNIAVX512F-NEXT: movl %eax, %ecx ; GFNIAVX512F-NEXT: andl $1431655765, %ecx # imm = 0x55555555 -; GFNIAVX512F-NEXT: andl $-1431655766, %eax # imm = 0xAAAAAAAA ; GFNIAVX512F-NEXT: shrl %eax +; GFNIAVX512F-NEXT: andl $1431655765, %eax # imm = 0x55555555 ; GFNIAVX512F-NEXT: leal (%rax,%rcx,2), %eax ; GFNIAVX512F-NEXT: retq ; @@ -481,18 +481,18 @@ define i32 @test_bitreverse_i32(i32 %a) nounwind { ; GFNIAVX512BW-NEXT: movl %edi, %eax ; GFNIAVX512BW-NEXT: andl $252645135, %eax # imm = 0xF0F0F0F ; GFNIAVX512BW-NEXT: shll $4, %eax -; GFNIAVX512BW-NEXT: andl $-252645136, 
%edi # imm = 0xF0F0F0F0 ; GFNIAVX512BW-NEXT: shrl $4, %edi +; GFNIAVX512BW-NEXT: andl $252645135, %edi # imm = 0xF0F0F0F ; GFNIAVX512BW-NEXT: orl %eax, %edi ; GFNIAVX512BW-NEXT: movl %edi, %eax ; GFNIAVX512BW-NEXT: andl $858993459, %eax # imm = 0x33333333 -; GFNIAVX512BW-NEXT: andl $-858993460, %edi # imm = 0xCCCCCCCC ; GFNIAVX512BW-NEXT: shrl $2, %edi +; GFNIAVX512BW-NEXT: andl $858993459, %edi # imm = 0x33333333 ; GFNIAVX512BW-NEXT: leal (%rdi,%rax,4), %eax ; GFNIAVX512BW-NEXT: movl %eax, %ecx ; GFNIAVX512BW-NEXT: andl $1431655765, %ecx # imm = 0x55555555 -; GFNIAVX512BW-NEXT: andl $-1431655766, %eax # imm = 0xAAAAAAAA ; GFNIAVX512BW-NEXT: shrl %eax +; GFNIAVX512BW-NEXT: andl $1431655765, %eax # imm = 0x55555555 ; GFNIAVX512BW-NEXT: leal (%rax,%rcx,2), %eax ; GFNIAVX512BW-NEXT: retq %b = call i32 @llvm.bitreverse.i32(i32 %a) @@ -503,49 +503,49 @@ define i64 @test_bitreverse_i64(i64 %a) nounwind { ; SSE-LABEL: test_bitreverse_i64: ; SSE: # %bb.0: ; SSE-NEXT: bswapq %rdi -; SSE-NEXT: movabsq $1085102592571150095, %rax # imm = 0xF0F0F0F0F0F0F0F -; SSE-NEXT: andq %rdi, %rax -; SSE-NEXT: shlq $4, %rax -; SSE-NEXT: movabsq $-1085102592571150096, %rcx # imm = 0xF0F0F0F0F0F0F0F0 -; SSE-NEXT: andq %rdi, %rcx -; SSE-NEXT: shrq $4, %rcx -; SSE-NEXT: orq %rax, %rcx -; SSE-NEXT: movabsq $3689348814741910323, %rax # imm = 0x3333333333333333 +; SSE-NEXT: movq %rdi, %rax +; SSE-NEXT: shrq $4, %rax +; SSE-NEXT: movabsq $1085102592571150095, %rcx # imm = 0xF0F0F0F0F0F0F0F ; SSE-NEXT: andq %rcx, %rax -; SSE-NEXT: movabsq $-3689348814741910324, %rdx # imm = 0xCCCCCCCCCCCCCCCC -; SSE-NEXT: andq %rcx, %rdx -; SSE-NEXT: shrq $2, %rdx -; SSE-NEXT: leaq (%rdx,%rax,4), %rax -; SSE-NEXT: movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555 +; SSE-NEXT: andq %rcx, %rdi +; SSE-NEXT: shlq $4, %rdi +; SSE-NEXT: orq %rax, %rdi +; SSE-NEXT: movabsq $3689348814741910323, %rax # imm = 0x3333333333333333 +; SSE-NEXT: movq %rdi, %rcx ; SSE-NEXT: andq %rax, %rcx -; SSE-NEXT: movabsq $-6148914691236517206, %rdx # imm = 0xAAAAAAAAAAAAAAAA -; SSE-NEXT: andq %rax, %rdx -; SSE-NEXT: shrq %rdx -; SSE-NEXT: leaq (%rdx,%rcx,2), %rax +; SSE-NEXT: shrq $2, %rdi +; SSE-NEXT: andq %rax, %rdi +; SSE-NEXT: leaq (%rdi,%rcx,4), %rax +; SSE-NEXT: movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555 +; SSE-NEXT: movq %rax, %rdx +; SSE-NEXT: andq %rcx, %rdx +; SSE-NEXT: shrq %rax +; SSE-NEXT: andq %rcx, %rax +; SSE-NEXT: leaq (%rax,%rdx,2), %rax ; SSE-NEXT: retq ; ; AVX-LABEL: test_bitreverse_i64: ; AVX: # %bb.0: ; AVX-NEXT: bswapq %rdi -; AVX-NEXT: movabsq $1085102592571150095, %rax # imm = 0xF0F0F0F0F0F0F0F -; AVX-NEXT: andq %rdi, %rax -; AVX-NEXT: shlq $4, %rax -; AVX-NEXT: movabsq $-1085102592571150096, %rcx # imm = 0xF0F0F0F0F0F0F0F0 -; AVX-NEXT: andq %rdi, %rcx -; AVX-NEXT: shrq $4, %rcx -; AVX-NEXT: orq %rax, %rcx -; AVX-NEXT: movabsq $3689348814741910323, %rax # imm = 0x3333333333333333 +; AVX-NEXT: movq %rdi, %rax +; AVX-NEXT: shrq $4, %rax +; AVX-NEXT: movabsq $1085102592571150095, %rcx # imm = 0xF0F0F0F0F0F0F0F ; AVX-NEXT: andq %rcx, %rax -; AVX-NEXT: movabsq $-3689348814741910324, %rdx # imm = 0xCCCCCCCCCCCCCCCC -; AVX-NEXT: andq %rcx, %rdx -; AVX-NEXT: shrq $2, %rdx -; AVX-NEXT: leaq (%rdx,%rax,4), %rax -; AVX-NEXT: movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555 +; AVX-NEXT: andq %rcx, %rdi +; AVX-NEXT: shlq $4, %rdi +; AVX-NEXT: orq %rax, %rdi +; AVX-NEXT: movabsq $3689348814741910323, %rax # imm = 0x3333333333333333 +; AVX-NEXT: movq %rdi, %rcx ; AVX-NEXT: andq %rax, %rcx -; AVX-NEXT: movabsq 
$-6148914691236517206, %rdx # imm = 0xAAAAAAAAAAAAAAAA -; AVX-NEXT: andq %rax, %rdx -; AVX-NEXT: shrq %rdx -; AVX-NEXT: leaq (%rdx,%rcx,2), %rax +; AVX-NEXT: shrq $2, %rdi +; AVX-NEXT: andq %rax, %rdi +; AVX-NEXT: leaq (%rdi,%rcx,4), %rax +; AVX-NEXT: movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555 +; AVX-NEXT: movq %rax, %rdx +; AVX-NEXT: andq %rcx, %rdx +; AVX-NEXT: shrq %rax +; AVX-NEXT: andq %rcx, %rax +; AVX-NEXT: leaq (%rax,%rdx,2), %rax ; AVX-NEXT: retq ; ; XOP-LABEL: test_bitreverse_i64: @@ -558,121 +558,121 @@ define i64 @test_bitreverse_i64(i64 %a) nounwind { ; GFNISSE-LABEL: test_bitreverse_i64: ; GFNISSE: # %bb.0: ; GFNISSE-NEXT: bswapq %rdi -; GFNISSE-NEXT: movabsq $1085102592571150095, %rax # imm = 0xF0F0F0F0F0F0F0F -; GFNISSE-NEXT: andq %rdi, %rax -; GFNISSE-NEXT: shlq $4, %rax -; GFNISSE-NEXT: movabsq $-1085102592571150096, %rcx # imm = 0xF0F0F0F0F0F0F0F0 -; GFNISSE-NEXT: andq %rdi, %rcx -; GFNISSE-NEXT: shrq $4, %rcx -; GFNISSE-NEXT: orq %rax, %rcx -; GFNISSE-NEXT: movabsq $3689348814741910323, %rax # imm = 0x3333333333333333 +; GFNISSE-NEXT: movq %rdi, %rax +; GFNISSE-NEXT: shrq $4, %rax +; GFNISSE-NEXT: movabsq $1085102592571150095, %rcx # imm = 0xF0F0F0F0F0F0F0F ; GFNISSE-NEXT: andq %rcx, %rax -; GFNISSE-NEXT: movabsq $-3689348814741910324, %rdx # imm = 0xCCCCCCCCCCCCCCCC -; GFNISSE-NEXT: andq %rcx, %rdx -; GFNISSE-NEXT: shrq $2, %rdx -; GFNISSE-NEXT: leaq (%rdx,%rax,4), %rax -; GFNISSE-NEXT: movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555 +; GFNISSE-NEXT: andq %rcx, %rdi +; GFNISSE-NEXT: shlq $4, %rdi +; GFNISSE-NEXT: orq %rax, %rdi +; GFNISSE-NEXT: movabsq $3689348814741910323, %rax # imm = 0x3333333333333333 +; GFNISSE-NEXT: movq %rdi, %rcx ; GFNISSE-NEXT: andq %rax, %rcx -; GFNISSE-NEXT: movabsq $-6148914691236517206, %rdx # imm = 0xAAAAAAAAAAAAAAAA -; GFNISSE-NEXT: andq %rax, %rdx -; GFNISSE-NEXT: shrq %rdx -; GFNISSE-NEXT: leaq (%rdx,%rcx,2), %rax +; GFNISSE-NEXT: shrq $2, %rdi +; GFNISSE-NEXT: andq %rax, %rdi +; GFNISSE-NEXT: leaq (%rdi,%rcx,4), %rax +; GFNISSE-NEXT: movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555 +; GFNISSE-NEXT: movq %rax, %rdx +; GFNISSE-NEXT: andq %rcx, %rdx +; GFNISSE-NEXT: shrq %rax +; GFNISSE-NEXT: andq %rcx, %rax +; GFNISSE-NEXT: leaq (%rax,%rdx,2), %rax ; GFNISSE-NEXT: retq ; ; GFNIAVX-LABEL: test_bitreverse_i64: ; GFNIAVX: # %bb.0: ; GFNIAVX-NEXT: bswapq %rdi -; GFNIAVX-NEXT: movabsq $1085102592571150095, %rax # imm = 0xF0F0F0F0F0F0F0F -; GFNIAVX-NEXT: andq %rdi, %rax -; GFNIAVX-NEXT: shlq $4, %rax -; GFNIAVX-NEXT: movabsq $-1085102592571150096, %rcx # imm = 0xF0F0F0F0F0F0F0F0 -; GFNIAVX-NEXT: andq %rdi, %rcx -; GFNIAVX-NEXT: shrq $4, %rcx -; GFNIAVX-NEXT: orq %rax, %rcx -; GFNIAVX-NEXT: movabsq $3689348814741910323, %rax # imm = 0x3333333333333333 +; GFNIAVX-NEXT: movq %rdi, %rax +; GFNIAVX-NEXT: shrq $4, %rax +; GFNIAVX-NEXT: movabsq $1085102592571150095, %rcx # imm = 0xF0F0F0F0F0F0F0F ; GFNIAVX-NEXT: andq %rcx, %rax -; GFNIAVX-NEXT: movabsq $-3689348814741910324, %rdx # imm = 0xCCCCCCCCCCCCCCCC -; GFNIAVX-NEXT: andq %rcx, %rdx -; GFNIAVX-NEXT: shrq $2, %rdx -; GFNIAVX-NEXT: leaq (%rdx,%rax,4), %rax -; GFNIAVX-NEXT: movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555 +; GFNIAVX-NEXT: andq %rcx, %rdi +; GFNIAVX-NEXT: shlq $4, %rdi +; GFNIAVX-NEXT: orq %rax, %rdi +; GFNIAVX-NEXT: movabsq $3689348814741910323, %rax # imm = 0x3333333333333333 +; GFNIAVX-NEXT: movq %rdi, %rcx ; GFNIAVX-NEXT: andq %rax, %rcx -; GFNIAVX-NEXT: movabsq $-6148914691236517206, %rdx # imm = 0xAAAAAAAAAAAAAAAA -; 
GFNIAVX-NEXT: andq %rax, %rdx -; GFNIAVX-NEXT: shrq %rdx -; GFNIAVX-NEXT: leaq (%rdx,%rcx,2), %rax +; GFNIAVX-NEXT: shrq $2, %rdi +; GFNIAVX-NEXT: andq %rax, %rdi +; GFNIAVX-NEXT: leaq (%rdi,%rcx,4), %rax +; GFNIAVX-NEXT: movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555 +; GFNIAVX-NEXT: movq %rax, %rdx +; GFNIAVX-NEXT: andq %rcx, %rdx +; GFNIAVX-NEXT: shrq %rax +; GFNIAVX-NEXT: andq %rcx, %rax +; GFNIAVX-NEXT: leaq (%rax,%rdx,2), %rax ; GFNIAVX-NEXT: retq ; ; GFNIAVX2-LABEL: test_bitreverse_i64: ; GFNIAVX2: # %bb.0: ; GFNIAVX2-NEXT: bswapq %rdi -; GFNIAVX2-NEXT: movabsq $1085102592571150095, %rax # imm = 0xF0F0F0F0F0F0F0F -; GFNIAVX2-NEXT: andq %rdi, %rax -; GFNIAVX2-NEXT: shlq $4, %rax -; GFNIAVX2-NEXT: movabsq $-1085102592571150096, %rcx # imm = 0xF0F0F0F0F0F0F0F0 -; GFNIAVX2-NEXT: andq %rdi, %rcx -; GFNIAVX2-NEXT: shrq $4, %rcx -; GFNIAVX2-NEXT: orq %rax, %rcx -; GFNIAVX2-NEXT: movabsq $3689348814741910323, %rax # imm = 0x3333333333333333 +; GFNIAVX2-NEXT: movq %rdi, %rax +; GFNIAVX2-NEXT: shrq $4, %rax +; GFNIAVX2-NEXT: movabsq $1085102592571150095, %rcx # imm = 0xF0F0F0F0F0F0F0F ; GFNIAVX2-NEXT: andq %rcx, %rax -; GFNIAVX2-NEXT: movabsq $-3689348814741910324, %rdx # imm = 0xCCCCCCCCCCCCCCCC -; GFNIAVX2-NEXT: andq %rcx, %rdx -; GFNIAVX2-NEXT: shrq $2, %rdx -; GFNIAVX2-NEXT: leaq (%rdx,%rax,4), %rax -; GFNIAVX2-NEXT: movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555 +; GFNIAVX2-NEXT: andq %rcx, %rdi +; GFNIAVX2-NEXT: shlq $4, %rdi +; GFNIAVX2-NEXT: orq %rax, %rdi +; GFNIAVX2-NEXT: movabsq $3689348814741910323, %rax # imm = 0x3333333333333333 +; GFNIAVX2-NEXT: movq %rdi, %rcx ; GFNIAVX2-NEXT: andq %rax, %rcx -; GFNIAVX2-NEXT: movabsq $-6148914691236517206, %rdx # imm = 0xAAAAAAAAAAAAAAAA -; GFNIAVX2-NEXT: andq %rax, %rdx -; GFNIAVX2-NEXT: shrq %rdx -; GFNIAVX2-NEXT: leaq (%rdx,%rcx,2), %rax +; GFNIAVX2-NEXT: shrq $2, %rdi +; GFNIAVX2-NEXT: andq %rax, %rdi +; GFNIAVX2-NEXT: leaq (%rdi,%rcx,4), %rax +; GFNIAVX2-NEXT: movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555 +; GFNIAVX2-NEXT: movq %rax, %rdx +; GFNIAVX2-NEXT: andq %rcx, %rdx +; GFNIAVX2-NEXT: shrq %rax +; GFNIAVX2-NEXT: andq %rcx, %rax +; GFNIAVX2-NEXT: leaq (%rax,%rdx,2), %rax ; GFNIAVX2-NEXT: retq ; ; GFNIAVX512F-LABEL: test_bitreverse_i64: ; GFNIAVX512F: # %bb.0: ; GFNIAVX512F-NEXT: bswapq %rdi -; GFNIAVX512F-NEXT: movabsq $1085102592571150095, %rax # imm = 0xF0F0F0F0F0F0F0F -; GFNIAVX512F-NEXT: andq %rdi, %rax -; GFNIAVX512F-NEXT: shlq $4, %rax -; GFNIAVX512F-NEXT: movabsq $-1085102592571150096, %rcx # imm = 0xF0F0F0F0F0F0F0F0 -; GFNIAVX512F-NEXT: andq %rdi, %rcx -; GFNIAVX512F-NEXT: shrq $4, %rcx -; GFNIAVX512F-NEXT: orq %rax, %rcx -; GFNIAVX512F-NEXT: movabsq $3689348814741910323, %rax # imm = 0x3333333333333333 +; GFNIAVX512F-NEXT: movq %rdi, %rax +; GFNIAVX512F-NEXT: shrq $4, %rax +; GFNIAVX512F-NEXT: movabsq $1085102592571150095, %rcx # imm = 0xF0F0F0F0F0F0F0F ; GFNIAVX512F-NEXT: andq %rcx, %rax -; GFNIAVX512F-NEXT: movabsq $-3689348814741910324, %rdx # imm = 0xCCCCCCCCCCCCCCCC -; GFNIAVX512F-NEXT: andq %rcx, %rdx -; GFNIAVX512F-NEXT: shrq $2, %rdx -; GFNIAVX512F-NEXT: leaq (%rdx,%rax,4), %rax -; GFNIAVX512F-NEXT: movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555 +; GFNIAVX512F-NEXT: andq %rcx, %rdi +; GFNIAVX512F-NEXT: shlq $4, %rdi +; GFNIAVX512F-NEXT: orq %rax, %rdi +; GFNIAVX512F-NEXT: movabsq $3689348814741910323, %rax # imm = 0x3333333333333333 +; GFNIAVX512F-NEXT: movq %rdi, %rcx ; GFNIAVX512F-NEXT: andq %rax, %rcx -; GFNIAVX512F-NEXT: movabsq $-6148914691236517206, 
%rdx # imm = 0xAAAAAAAAAAAAAAAA -; GFNIAVX512F-NEXT: andq %rax, %rdx -; GFNIAVX512F-NEXT: shrq %rdx -; GFNIAVX512F-NEXT: leaq (%rdx,%rcx,2), %rax +; GFNIAVX512F-NEXT: shrq $2, %rdi +; GFNIAVX512F-NEXT: andq %rax, %rdi +; GFNIAVX512F-NEXT: leaq (%rdi,%rcx,4), %rax +; GFNIAVX512F-NEXT: movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555 +; GFNIAVX512F-NEXT: movq %rax, %rdx +; GFNIAVX512F-NEXT: andq %rcx, %rdx +; GFNIAVX512F-NEXT: shrq %rax +; GFNIAVX512F-NEXT: andq %rcx, %rax +; GFNIAVX512F-NEXT: leaq (%rax,%rdx,2), %rax ; GFNIAVX512F-NEXT: retq ; ; GFNIAVX512BW-LABEL: test_bitreverse_i64: ; GFNIAVX512BW: # %bb.0: ; GFNIAVX512BW-NEXT: bswapq %rdi -; GFNIAVX512BW-NEXT: movabsq $1085102592571150095, %rax # imm = 0xF0F0F0F0F0F0F0F -; GFNIAVX512BW-NEXT: andq %rdi, %rax -; GFNIAVX512BW-NEXT: shlq $4, %rax -; GFNIAVX512BW-NEXT: movabsq $-1085102592571150096, %rcx # imm = 0xF0F0F0F0F0F0F0F0 -; GFNIAVX512BW-NEXT: andq %rdi, %rcx -; GFNIAVX512BW-NEXT: shrq $4, %rcx -; GFNIAVX512BW-NEXT: orq %rax, %rcx -; GFNIAVX512BW-NEXT: movabsq $3689348814741910323, %rax # imm = 0x3333333333333333 +; GFNIAVX512BW-NEXT: movq %rdi, %rax +; GFNIAVX512BW-NEXT: shrq $4, %rax +; GFNIAVX512BW-NEXT: movabsq $1085102592571150095, %rcx # imm = 0xF0F0F0F0F0F0F0F ; GFNIAVX512BW-NEXT: andq %rcx, %rax -; GFNIAVX512BW-NEXT: movabsq $-3689348814741910324, %rdx # imm = 0xCCCCCCCCCCCCCCCC -; GFNIAVX512BW-NEXT: andq %rcx, %rdx -; GFNIAVX512BW-NEXT: shrq $2, %rdx -; GFNIAVX512BW-NEXT: leaq (%rdx,%rax,4), %rax -; GFNIAVX512BW-NEXT: movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555 +; GFNIAVX512BW-NEXT: andq %rcx, %rdi +; GFNIAVX512BW-NEXT: shlq $4, %rdi +; GFNIAVX512BW-NEXT: orq %rax, %rdi +; GFNIAVX512BW-NEXT: movabsq $3689348814741910323, %rax # imm = 0x3333333333333333 +; GFNIAVX512BW-NEXT: movq %rdi, %rcx ; GFNIAVX512BW-NEXT: andq %rax, %rcx -; GFNIAVX512BW-NEXT: movabsq $-6148914691236517206, %rdx # imm = 0xAAAAAAAAAAAAAAAA -; GFNIAVX512BW-NEXT: andq %rax, %rdx -; GFNIAVX512BW-NEXT: shrq %rdx -; GFNIAVX512BW-NEXT: leaq (%rdx,%rcx,2), %rax +; GFNIAVX512BW-NEXT: shrq $2, %rdi +; GFNIAVX512BW-NEXT: andq %rax, %rdi +; GFNIAVX512BW-NEXT: leaq (%rdi,%rcx,4), %rax +; GFNIAVX512BW-NEXT: movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555 +; GFNIAVX512BW-NEXT: movq %rax, %rdx +; GFNIAVX512BW-NEXT: andq %rcx, %rdx +; GFNIAVX512BW-NEXT: shrq %rax +; GFNIAVX512BW-NEXT: andq %rcx, %rax +; GFNIAVX512BW-NEXT: leaq (%rax,%rdx,2), %rax ; GFNIAVX512BW-NEXT: retq %b = call i64 @llvm.bitreverse.i64(i64 %a) ret i64 %b @@ -687,16 +687,18 @@ define <16 x i8> @test_bitreverse_v16i8(<16 x i8> %a) nounwind { ; SSE2-NEXT: psrlw $4, %xmm0 ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE2-NEXT: por %xmm1, %xmm0 -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] -; SSE2-NEXT: pand %xmm0, %xmm1 -; SSE2-NEXT: psllw $2, %xmm1 -; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE2-NEXT: psrlw $2, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrlw $2, %xmm1 +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] +; SSE2-NEXT: pand %xmm2, %xmm1 +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: psllw $2, %xmm0 ; SSE2-NEXT: por %xmm1, %xmm0 -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [170,170,170,170,170,170,170,170,170,170,170,170,170,170,170,170] -; SSE2-NEXT: pand %xmm0, %xmm1 +; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: psrlw $1, %xmm1 -; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = 
[85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85] +; SSE2-NEXT: pand %xmm2, %xmm1 +; SSE2-NEXT: pand %xmm2, %xmm0 ; SSE2-NEXT: paddb %xmm0, %xmm0 ; SSE2-NEXT: por %xmm1, %xmm0 ; SSE2-NEXT: retq @@ -775,16 +777,18 @@ define <8 x i16> @test_bitreverse_v8i16(<8 x i16> %a) nounwind { ; SSE2-NEXT: psrlw $4, %xmm0 ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE2-NEXT: por %xmm1, %xmm0 -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] -; SSE2-NEXT: pand %xmm0, %xmm1 -; SSE2-NEXT: psllw $2, %xmm1 -; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE2-NEXT: psrlw $2, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrlw $2, %xmm1 +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] +; SSE2-NEXT: pand %xmm2, %xmm1 +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: psllw $2, %xmm0 ; SSE2-NEXT: por %xmm1, %xmm0 -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [170,170,170,170,170,170,170,170,170,170,170,170,170,170,170,170] -; SSE2-NEXT: pand %xmm0, %xmm1 +; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: psrlw $1, %xmm1 -; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85] +; SSE2-NEXT: pand %xmm2, %xmm1 +; SSE2-NEXT: pand %xmm2, %xmm0 ; SSE2-NEXT: paddb %xmm0, %xmm0 ; SSE2-NEXT: por %xmm1, %xmm0 ; SSE2-NEXT: retq @@ -875,16 +879,18 @@ define <4 x i32> @test_bitreverse_v4i32(<4 x i32> %a) nounwind { ; SSE2-NEXT: psrlw $4, %xmm0 ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE2-NEXT: por %xmm1, %xmm0 -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] -; SSE2-NEXT: pand %xmm0, %xmm1 -; SSE2-NEXT: psllw $2, %xmm1 -; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE2-NEXT: psrlw $2, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrlw $2, %xmm1 +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] +; SSE2-NEXT: pand %xmm2, %xmm1 +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: psllw $2, %xmm0 ; SSE2-NEXT: por %xmm1, %xmm0 -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [170,170,170,170,170,170,170,170,170,170,170,170,170,170,170,170] -; SSE2-NEXT: pand %xmm0, %xmm1 +; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: psrlw $1, %xmm1 -; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85] +; SSE2-NEXT: pand %xmm2, %xmm1 +; SSE2-NEXT: pand %xmm2, %xmm0 ; SSE2-NEXT: paddb %xmm0, %xmm0 ; SSE2-NEXT: por %xmm1, %xmm0 ; SSE2-NEXT: retq @@ -977,16 +983,18 @@ define <2 x i64> @test_bitreverse_v2i64(<2 x i64> %a) nounwind { ; SSE2-NEXT: psrlw $4, %xmm0 ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE2-NEXT: por %xmm1, %xmm0 -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] -; SSE2-NEXT: pand %xmm0, %xmm1 -; SSE2-NEXT: psllw $2, %xmm1 -; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE2-NEXT: psrlw $2, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrlw $2, %xmm1 +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] +; SSE2-NEXT: pand %xmm2, %xmm1 +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: psllw $2, %xmm0 ; SSE2-NEXT: por %xmm1, %xmm0 -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [170,170,170,170,170,170,170,170,170,170,170,170,170,170,170,170] -; SSE2-NEXT: pand %xmm0, %xmm1 +; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: psrlw $1, %xmm1 -; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), 
%xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85] +; SSE2-NEXT: pand %xmm2, %xmm1 +; SSE2-NEXT: pand %xmm2, %xmm0 ; SSE2-NEXT: paddb %xmm0, %xmm0 ; SSE2-NEXT: por %xmm1, %xmm0 ; SSE2-NEXT: retq @@ -1071,38 +1079,38 @@ define <32 x i8> @test_bitreverse_v32i8(<32 x i8> %a) nounwind { ; SSE2-NEXT: psrlw $4, %xmm0 ; SSE2-NEXT: pand %xmm1, %xmm0 ; SSE2-NEXT: por %xmm4, %xmm0 -; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] ; SSE2-NEXT: movdqa %xmm0, %xmm4 +; SSE2-NEXT: psrlw $2, %xmm4 +; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] ; SSE2-NEXT: pand %xmm3, %xmm4 -; SSE2-NEXT: psllw $2, %xmm4 -; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [204,204,204,204,204,204,204,204,204,204,204,204,204,204,204,204] -; SSE2-NEXT: pand %xmm5, %xmm0 -; SSE2-NEXT: psrlw $2, %xmm0 +; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: psllw $2, %xmm0 ; SSE2-NEXT: por %xmm4, %xmm0 -; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [170,170,170,170,170,170,170,170,170,170,170,170,170,170,170,170] -; SSE2-NEXT: movdqa %xmm0, %xmm6 -; SSE2-NEXT: pand %xmm4, %xmm6 -; SSE2-NEXT: psrlw $1, %xmm6 -; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85] -; SSE2-NEXT: pand %xmm7, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm5 +; SSE2-NEXT: psrlw $1, %xmm5 +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85] +; SSE2-NEXT: pand %xmm4, %xmm5 +; SSE2-NEXT: pand %xmm4, %xmm0 ; SSE2-NEXT: paddb %xmm0, %xmm0 -; SSE2-NEXT: por %xmm6, %xmm0 -; SSE2-NEXT: movdqa %xmm2, %xmm6 -; SSE2-NEXT: psllw $4, %xmm6 +; SSE2-NEXT: por %xmm5, %xmm0 +; SSE2-NEXT: movdqa %xmm2, %xmm5 +; SSE2-NEXT: psllw $4, %xmm5 ; SSE2-NEXT: psrlw $4, %xmm2 ; SSE2-NEXT: pand %xmm1, %xmm2 -; SSE2-NEXT: pandn %xmm6, %xmm1 +; SSE2-NEXT: pandn %xmm5, %xmm1 ; SSE2-NEXT: por %xmm2, %xmm1 -; SSE2-NEXT: pand %xmm1, %xmm3 -; SSE2-NEXT: psllw $2, %xmm3 -; SSE2-NEXT: pand %xmm5, %xmm1 -; SSE2-NEXT: psrlw $2, %xmm1 -; SSE2-NEXT: por %xmm3, %xmm1 -; SSE2-NEXT: pand %xmm1, %xmm4 -; SSE2-NEXT: psrlw $1, %xmm4 -; SSE2-NEXT: pand %xmm7, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: psrlw $2, %xmm2 +; SSE2-NEXT: pand %xmm3, %xmm2 +; SSE2-NEXT: pand %xmm3, %xmm1 +; SSE2-NEXT: psllw $2, %xmm1 +; SSE2-NEXT: por %xmm2, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: psrlw $1, %xmm2 +; SSE2-NEXT: pand %xmm4, %xmm2 +; SSE2-NEXT: pand %xmm4, %xmm1 ; SSE2-NEXT: paddb %xmm1, %xmm1 -; SSE2-NEXT: por %xmm4, %xmm1 +; SSE2-NEXT: por %xmm2, %xmm1 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: test_bitreverse_v32i8: @@ -1248,42 +1256,42 @@ define <16 x i16> @test_bitreverse_v16i16(<16 x i16> %a) nounwind { ; SSE2-NEXT: psrlw $4, %xmm0 ; SSE2-NEXT: pand %xmm1, %xmm0 ; SSE2-NEXT: por %xmm4, %xmm0 -; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] ; SSE2-NEXT: movdqa %xmm0, %xmm4 +; SSE2-NEXT: psrlw $2, %xmm4 +; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] ; SSE2-NEXT: pand %xmm3, %xmm4 -; SSE2-NEXT: psllw $2, %xmm4 -; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [204,204,204,204,204,204,204,204,204,204,204,204,204,204,204,204] -; SSE2-NEXT: pand %xmm5, %xmm0 -; SSE2-NEXT: psrlw $2, %xmm0 +; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: psllw $2, %xmm0 ; SSE2-NEXT: por %xmm4, %xmm0 -; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [170,170,170,170,170,170,170,170,170,170,170,170,170,170,170,170] -; SSE2-NEXT: movdqa %xmm0, %xmm7 -; SSE2-NEXT: pand %xmm4, %xmm7 -; SSE2-NEXT: psrlw $1, %xmm7 -; SSE2-NEXT: 
movdqa {{.*#+}} xmm6 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85] -; SSE2-NEXT: pand %xmm6, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm5 +; SSE2-NEXT: psrlw $1, %xmm5 +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85] +; SSE2-NEXT: pand %xmm4, %xmm5 +; SSE2-NEXT: pand %xmm4, %xmm0 ; SSE2-NEXT: paddb %xmm0, %xmm0 -; SSE2-NEXT: por %xmm7, %xmm0 -; SSE2-NEXT: movdqa %xmm2, %xmm7 -; SSE2-NEXT: psrlw $8, %xmm7 +; SSE2-NEXT: por %xmm5, %xmm0 +; SSE2-NEXT: movdqa %xmm2, %xmm5 +; SSE2-NEXT: psrlw $8, %xmm5 ; SSE2-NEXT: psllw $8, %xmm2 -; SSE2-NEXT: por %xmm7, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm7 -; SSE2-NEXT: psllw $4, %xmm7 +; SSE2-NEXT: por %xmm5, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm5 +; SSE2-NEXT: psllw $4, %xmm5 ; SSE2-NEXT: psrlw $4, %xmm2 ; SSE2-NEXT: pand %xmm1, %xmm2 -; SSE2-NEXT: pandn %xmm7, %xmm1 +; SSE2-NEXT: pandn %xmm5, %xmm1 ; SSE2-NEXT: por %xmm2, %xmm1 -; SSE2-NEXT: pand %xmm1, %xmm3 -; SSE2-NEXT: psllw $2, %xmm3 -; SSE2-NEXT: pand %xmm5, %xmm1 -; SSE2-NEXT: psrlw $2, %xmm1 -; SSE2-NEXT: por %xmm3, %xmm1 -; SSE2-NEXT: pand %xmm1, %xmm4 -; SSE2-NEXT: psrlw $1, %xmm4 -; SSE2-NEXT: pand %xmm6, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: psrlw $2, %xmm2 +; SSE2-NEXT: pand %xmm3, %xmm2 +; SSE2-NEXT: pand %xmm3, %xmm1 +; SSE2-NEXT: psllw $2, %xmm1 +; SSE2-NEXT: por %xmm2, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: psrlw $1, %xmm2 +; SSE2-NEXT: pand %xmm4, %xmm2 +; SSE2-NEXT: pand %xmm4, %xmm1 ; SSE2-NEXT: paddb %xmm1, %xmm1 -; SSE2-NEXT: por %xmm4, %xmm1 +; SSE2-NEXT: por %xmm2, %xmm1 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: test_bitreverse_v16i16: @@ -1434,63 +1442,63 @@ define <8 x i32> @test_bitreverse_v8i32(<8 x i32> %a) nounwind { ; SSE2-LABEL: test_bitreverse_v8i32: ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: pxor %xmm4, %xmm4 +; SSE2-NEXT: pxor %xmm3, %xmm3 ; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm4[8],xmm1[9],xmm4[9],xmm1[10],xmm4[10],xmm1[11],xmm4[11],xmm1[12],xmm4[12],xmm1[13],xmm4[13],xmm1[14],xmm4[14],xmm1[15],xmm4[15] +; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm3[8],xmm1[9],xmm3[9],xmm1[10],xmm3[10],xmm1[11],xmm3[11],xmm1[12],xmm3[12],xmm1[13],xmm3[13],xmm1[14],xmm3[14],xmm1[15],xmm3[15] ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7] ; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7] ; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4] ; SSE2-NEXT: packuswb %xmm1, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: psllw $4, %xmm3 +; SSE2-NEXT: movdqa %xmm0, %xmm4 +; SSE2-NEXT: psllw $4, %xmm4 ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; SSE2-NEXT: movdqa %xmm1, %xmm5 -; SSE2-NEXT: pandn %xmm3, %xmm5 +; SSE2-NEXT: pandn %xmm4, %xmm5 ; SSE2-NEXT: psrlw $4, %xmm0 ; SSE2-NEXT: pand %xmm1, %xmm0 ; SSE2-NEXT: por %xmm5, %xmm0 -; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] ; SSE2-NEXT: movdqa %xmm0, %xmm5 -; SSE2-NEXT: pand %xmm3, %xmm5 -; SSE2-NEXT: psllw $2, %xmm5 -; SSE2-NEXT: movdqa {{.*#+}} xmm8 = 
[204,204,204,204,204,204,204,204,204,204,204,204,204,204,204,204] -; SSE2-NEXT: pand %xmm8, %xmm0 -; SSE2-NEXT: psrlw $2, %xmm0 +; SSE2-NEXT: psrlw $2, %xmm5 +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] +; SSE2-NEXT: pand %xmm4, %xmm5 +; SSE2-NEXT: pand %xmm4, %xmm0 +; SSE2-NEXT: psllw $2, %xmm0 ; SSE2-NEXT: por %xmm5, %xmm0 -; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [170,170,170,170,170,170,170,170,170,170,170,170,170,170,170,170] ; SSE2-NEXT: movdqa %xmm0, %xmm6 -; SSE2-NEXT: pand %xmm5, %xmm6 ; SSE2-NEXT: psrlw $1, %xmm6 -; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85] -; SSE2-NEXT: pand %xmm7, %xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85] +; SSE2-NEXT: pand %xmm5, %xmm6 +; SSE2-NEXT: pand %xmm5, %xmm0 ; SSE2-NEXT: paddb %xmm0, %xmm0 ; SSE2-NEXT: por %xmm6, %xmm0 ; SSE2-NEXT: movdqa %xmm2, %xmm6 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm4[8],xmm6[9],xmm4[9],xmm6[10],xmm4[10],xmm6[11],xmm4[11],xmm6[12],xmm4[12],xmm6[13],xmm4[13],xmm6[14],xmm4[14],xmm6[15],xmm4[15] +; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm3[8],xmm6[9],xmm3[9],xmm6[10],xmm3[10],xmm6[11],xmm3[11],xmm6[12],xmm3[12],xmm6[13],xmm3[13],xmm6[14],xmm3[14],xmm6[15],xmm3[15] ; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[3,2,1,0,4,5,6,7] ; SSE2-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,6,5,4] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] ; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7] ; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4] ; SSE2-NEXT: packuswb %xmm6, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm4 -; SSE2-NEXT: psllw $4, %xmm4 +; SSE2-NEXT: movdqa %xmm2, %xmm3 +; SSE2-NEXT: psllw $4, %xmm3 ; SSE2-NEXT: psrlw $4, %xmm2 ; SSE2-NEXT: pand %xmm1, %xmm2 -; SSE2-NEXT: pandn %xmm4, %xmm1 +; SSE2-NEXT: pandn %xmm3, %xmm1 ; SSE2-NEXT: por %xmm2, %xmm1 -; SSE2-NEXT: pand %xmm1, %xmm3 -; SSE2-NEXT: psllw $2, %xmm3 -; SSE2-NEXT: pand %xmm8, %xmm1 -; SSE2-NEXT: psrlw $2, %xmm1 -; SSE2-NEXT: por %xmm3, %xmm1 -; SSE2-NEXT: pand %xmm1, %xmm5 -; SSE2-NEXT: psrlw $1, %xmm5 -; SSE2-NEXT: pand %xmm7, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: psrlw $2, %xmm2 +; SSE2-NEXT: pand %xmm4, %xmm2 +; SSE2-NEXT: pand %xmm4, %xmm1 +; SSE2-NEXT: psllw $2, %xmm1 +; SSE2-NEXT: por %xmm2, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: psrlw $1, %xmm2 +; SSE2-NEXT: pand %xmm5, %xmm2 +; SSE2-NEXT: pand %xmm5, %xmm1 ; SSE2-NEXT: paddb %xmm1, %xmm1 -; SSE2-NEXT: por %xmm5, %xmm1 +; SSE2-NEXT: por %xmm2, %xmm1 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: test_bitreverse_v8i32: @@ -1641,67 +1649,67 @@ define <4 x i64> @test_bitreverse_v4i64(<4 x i64> %a) nounwind { ; SSE2-LABEL: test_bitreverse_v4i64: ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: pxor %xmm4, %xmm4 +; SSE2-NEXT: pxor %xmm3, %xmm3 ; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm4[8],xmm1[9],xmm4[9],xmm1[10],xmm4[10],xmm1[11],xmm4[11],xmm1[12],xmm4[12],xmm1[13],xmm4[13],xmm1[14],xmm4[14],xmm1[15],xmm4[15] +; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm3[8],xmm1[9],xmm3[9],xmm1[10],xmm3[10],xmm1[11],xmm3[11],xmm1[12],xmm3[12],xmm1[13],xmm3[13],xmm1[14],xmm3[14],xmm1[15],xmm3[15] ; SSE2-NEXT: pshufd 
{{.*#+}} xmm1 = xmm1[2,3,0,1] ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7] ; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7] ; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4] ; SSE2-NEXT: packuswb %xmm1, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: psllw $4, %xmm3 +; SSE2-NEXT: movdqa %xmm0, %xmm4 +; SSE2-NEXT: psllw $4, %xmm4 ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; SSE2-NEXT: movdqa %xmm1, %xmm5 -; SSE2-NEXT: pandn %xmm3, %xmm5 +; SSE2-NEXT: pandn %xmm4, %xmm5 ; SSE2-NEXT: psrlw $4, %xmm0 ; SSE2-NEXT: pand %xmm1, %xmm0 ; SSE2-NEXT: por %xmm5, %xmm0 -; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] ; SSE2-NEXT: movdqa %xmm0, %xmm5 -; SSE2-NEXT: pand %xmm3, %xmm5 -; SSE2-NEXT: psllw $2, %xmm5 -; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [204,204,204,204,204,204,204,204,204,204,204,204,204,204,204,204] -; SSE2-NEXT: pand %xmm8, %xmm0 -; SSE2-NEXT: psrlw $2, %xmm0 +; SSE2-NEXT: psrlw $2, %xmm5 +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] +; SSE2-NEXT: pand %xmm4, %xmm5 +; SSE2-NEXT: pand %xmm4, %xmm0 +; SSE2-NEXT: psllw $2, %xmm0 ; SSE2-NEXT: por %xmm5, %xmm0 -; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [170,170,170,170,170,170,170,170,170,170,170,170,170,170,170,170] ; SSE2-NEXT: movdqa %xmm0, %xmm6 -; SSE2-NEXT: pand %xmm5, %xmm6 ; SSE2-NEXT: psrlw $1, %xmm6 -; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85] -; SSE2-NEXT: pand %xmm7, %xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85] +; SSE2-NEXT: pand %xmm5, %xmm6 +; SSE2-NEXT: pand %xmm5, %xmm0 ; SSE2-NEXT: paddb %xmm0, %xmm0 ; SSE2-NEXT: por %xmm6, %xmm0 ; SSE2-NEXT: movdqa %xmm2, %xmm6 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm4[8],xmm6[9],xmm4[9],xmm6[10],xmm4[10],xmm6[11],xmm4[11],xmm6[12],xmm4[12],xmm6[13],xmm4[13],xmm6[14],xmm4[14],xmm6[15],xmm4[15] +; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm3[8],xmm6[9],xmm3[9],xmm6[10],xmm3[10],xmm6[11],xmm3[11],xmm6[12],xmm3[12],xmm6[13],xmm3[13],xmm6[14],xmm3[14],xmm6[15],xmm3[15] ; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,3,0,1] ; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[3,2,1,0,4,5,6,7] ; SSE2-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,6,5,4] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] ; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7] ; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4] ; SSE2-NEXT: packuswb %xmm6, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm4 -; SSE2-NEXT: psllw $4, %xmm4 +; SSE2-NEXT: movdqa %xmm2, %xmm3 +; SSE2-NEXT: psllw $4, %xmm3 ; SSE2-NEXT: psrlw $4, %xmm2 ; SSE2-NEXT: pand %xmm1, %xmm2 -; SSE2-NEXT: pandn %xmm4, %xmm1 +; SSE2-NEXT: pandn %xmm3, 
%xmm1 ; SSE2-NEXT: por %xmm2, %xmm1 -; SSE2-NEXT: pand %xmm1, %xmm3 -; SSE2-NEXT: psllw $2, %xmm3 -; SSE2-NEXT: pand %xmm8, %xmm1 -; SSE2-NEXT: psrlw $2, %xmm1 -; SSE2-NEXT: por %xmm3, %xmm1 -; SSE2-NEXT: pand %xmm1, %xmm5 -; SSE2-NEXT: psrlw $1, %xmm5 -; SSE2-NEXT: pand %xmm7, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: psrlw $2, %xmm2 +; SSE2-NEXT: pand %xmm4, %xmm2 +; SSE2-NEXT: pand %xmm4, %xmm1 +; SSE2-NEXT: psllw $2, %xmm1 +; SSE2-NEXT: por %xmm2, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: psrlw $1, %xmm2 +; SSE2-NEXT: pand %xmm5, %xmm2 +; SSE2-NEXT: pand %xmm5, %xmm1 ; SSE2-NEXT: paddb %xmm1, %xmm1 -; SSE2-NEXT: por %xmm5, %xmm1 +; SSE2-NEXT: por %xmm2, %xmm1 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: test_bitreverse_v4i64: @@ -1851,7 +1859,7 @@ define <4 x i64> @test_bitreverse_v4i64(<4 x i64> %a) nounwind { define <64 x i8> @test_bitreverse_v64i8(<64 x i8> %a) nounwind { ; SSE2-LABEL: test_bitreverse_v64i8: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm3, %xmm10 +; SSE2-NEXT: movdqa %xmm3, %xmm4 ; SSE2-NEXT: movdqa %xmm0, %xmm5 ; SSE2-NEXT: psllw $4, %xmm5 ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] @@ -1860,76 +1868,76 @@ define <64 x i8> @test_bitreverse_v64i8(<64 x i8> %a) nounwind { ; SSE2-NEXT: psrlw $4, %xmm0 ; SSE2-NEXT: pand %xmm3, %xmm0 ; SSE2-NEXT: por %xmm6, %xmm0 -; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] ; SSE2-NEXT: movdqa %xmm0, %xmm6 -; SSE2-NEXT: pand %xmm5, %xmm6 -; SSE2-NEXT: psllw $2, %xmm6 -; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [204,204,204,204,204,204,204,204,204,204,204,204,204,204,204,204] +; SSE2-NEXT: psrlw $2, %xmm6 +; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] +; SSE2-NEXT: pand %xmm8, %xmm6 ; SSE2-NEXT: pand %xmm8, %xmm0 -; SSE2-NEXT: psrlw $2, %xmm0 +; SSE2-NEXT: psllw $2, %xmm0 ; SSE2-NEXT: por %xmm6, %xmm0 -; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [170,170,170,170,170,170,170,170,170,170,170,170,170,170,170,170] ; SSE2-NEXT: movdqa %xmm0, %xmm7 -; SSE2-NEXT: pand %xmm6, %xmm7 ; SSE2-NEXT: psrlw $1, %xmm7 -; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85] -; SSE2-NEXT: pand %xmm9, %xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85] +; SSE2-NEXT: pand %xmm6, %xmm7 +; SSE2-NEXT: pand %xmm6, %xmm0 ; SSE2-NEXT: paddb %xmm0, %xmm0 ; SSE2-NEXT: por %xmm7, %xmm0 ; SSE2-NEXT: movdqa %xmm1, %xmm7 ; SSE2-NEXT: psllw $4, %xmm7 -; SSE2-NEXT: movdqa %xmm3, %xmm4 -; SSE2-NEXT: pandn %xmm7, %xmm4 +; SSE2-NEXT: movdqa %xmm3, %xmm5 +; SSE2-NEXT: pandn %xmm7, %xmm5 ; SSE2-NEXT: psrlw $4, %xmm1 ; SSE2-NEXT: pand %xmm3, %xmm1 -; SSE2-NEXT: por %xmm4, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm4 -; SSE2-NEXT: pand %xmm5, %xmm4 -; SSE2-NEXT: psllw $2, %xmm4 +; SSE2-NEXT: por %xmm5, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm5 +; SSE2-NEXT: psrlw $2, %xmm5 +; SSE2-NEXT: pand %xmm8, %xmm5 ; SSE2-NEXT: pand %xmm8, %xmm1 -; SSE2-NEXT: psrlw $2, %xmm1 -; SSE2-NEXT: por %xmm4, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm4 -; SSE2-NEXT: pand %xmm6, %xmm4 -; SSE2-NEXT: psrlw $1, %xmm4 -; SSE2-NEXT: pand %xmm9, %xmm1 +; SSE2-NEXT: psllw $2, %xmm1 +; SSE2-NEXT: por %xmm5, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm5 +; SSE2-NEXT: psrlw $1, %xmm5 +; SSE2-NEXT: pand %xmm6, %xmm5 +; SSE2-NEXT: pand %xmm6, %xmm1 ; SSE2-NEXT: paddb %xmm1, %xmm1 -; SSE2-NEXT: por %xmm4, %xmm1 -; SSE2-NEXT: movdqa %xmm2, %xmm4 -; SSE2-NEXT: psllw $4, %xmm4 +; SSE2-NEXT: por %xmm5, %xmm1 +; SSE2-NEXT: movdqa %xmm2, %xmm5 +; 
SSE2-NEXT: psllw $4, %xmm5 ; SSE2-NEXT: movdqa %xmm3, %xmm7 -; SSE2-NEXT: pandn %xmm4, %xmm7 +; SSE2-NEXT: pandn %xmm5, %xmm7 ; SSE2-NEXT: psrlw $4, %xmm2 ; SSE2-NEXT: pand %xmm3, %xmm2 ; SSE2-NEXT: por %xmm7, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm4 -; SSE2-NEXT: pand %xmm5, %xmm4 -; SSE2-NEXT: psllw $2, %xmm4 +; SSE2-NEXT: movdqa %xmm2, %xmm5 +; SSE2-NEXT: psrlw $2, %xmm5 +; SSE2-NEXT: pand %xmm8, %xmm5 ; SSE2-NEXT: pand %xmm8, %xmm2 -; SSE2-NEXT: psrlw $2, %xmm2 -; SSE2-NEXT: por %xmm4, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm4 -; SSE2-NEXT: pand %xmm6, %xmm4 -; SSE2-NEXT: psrlw $1, %xmm4 -; SSE2-NEXT: pand %xmm9, %xmm2 +; SSE2-NEXT: psllw $2, %xmm2 +; SSE2-NEXT: por %xmm5, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm5 +; SSE2-NEXT: psrlw $1, %xmm5 +; SSE2-NEXT: pand %xmm6, %xmm5 +; SSE2-NEXT: pand %xmm6, %xmm2 ; SSE2-NEXT: paddb %xmm2, %xmm2 -; SSE2-NEXT: por %xmm4, %xmm2 -; SSE2-NEXT: movdqa %xmm10, %xmm4 -; SSE2-NEXT: psllw $4, %xmm4 -; SSE2-NEXT: psrlw $4, %xmm10 -; SSE2-NEXT: pand %xmm3, %xmm10 -; SSE2-NEXT: pandn %xmm4, %xmm3 -; SSE2-NEXT: por %xmm10, %xmm3 -; SSE2-NEXT: pand %xmm3, %xmm5 -; SSE2-NEXT: psllw $2, %xmm5 +; SSE2-NEXT: por %xmm5, %xmm2 +; SSE2-NEXT: movdqa %xmm4, %xmm5 +; SSE2-NEXT: psllw $4, %xmm5 +; SSE2-NEXT: psrlw $4, %xmm4 +; SSE2-NEXT: pand %xmm3, %xmm4 +; SSE2-NEXT: pandn %xmm5, %xmm3 +; SSE2-NEXT: por %xmm4, %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: psrlw $2, %xmm4 +; SSE2-NEXT: pand %xmm8, %xmm4 ; SSE2-NEXT: pand %xmm8, %xmm3 -; SSE2-NEXT: psrlw $2, %xmm3 -; SSE2-NEXT: por %xmm5, %xmm3 -; SSE2-NEXT: pand %xmm3, %xmm6 -; SSE2-NEXT: psrlw $1, %xmm6 -; SSE2-NEXT: pand %xmm9, %xmm3 +; SSE2-NEXT: psllw $2, %xmm3 +; SSE2-NEXT: por %xmm4, %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: psrlw $1, %xmm4 +; SSE2-NEXT: pand %xmm6, %xmm4 +; SSE2-NEXT: pand %xmm6, %xmm3 ; SSE2-NEXT: paddb %xmm3, %xmm3 -; SSE2-NEXT: por %xmm6, %xmm3 +; SSE2-NEXT: por %xmm4, %xmm3 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: test_bitreverse_v64i8: @@ -2152,20 +2160,18 @@ define <32 x i16> @test_bitreverse_v32i16(<32 x i16> %a) nounwind { ; SSE2-NEXT: psrlw $4, %xmm0 ; SSE2-NEXT: pand %xmm3, %xmm0 ; SSE2-NEXT: por %xmm6, %xmm0 -; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] ; SSE2-NEXT: movdqa %xmm0, %xmm6 -; SSE2-NEXT: pand %xmm10, %xmm6 -; SSE2-NEXT: psllw $2, %xmm6 -; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [204,204,204,204,204,204,204,204,204,204,204,204,204,204,204,204] +; SSE2-NEXT: psrlw $2, %xmm6 +; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] +; SSE2-NEXT: pand %xmm8, %xmm6 ; SSE2-NEXT: pand %xmm8, %xmm0 -; SSE2-NEXT: psrlw $2, %xmm0 +; SSE2-NEXT: psllw $2, %xmm0 ; SSE2-NEXT: por %xmm6, %xmm0 -; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [170,170,170,170,170,170,170,170,170,170,170,170,170,170,170,170] ; SSE2-NEXT: movdqa %xmm0, %xmm7 -; SSE2-NEXT: pand %xmm6, %xmm7 ; SSE2-NEXT: psrlw $1, %xmm7 -; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85] -; SSE2-NEXT: pand %xmm9, %xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85] +; SSE2-NEXT: pand %xmm6, %xmm7 +; SSE2-NEXT: pand %xmm6, %xmm0 ; SSE2-NEXT: paddb %xmm0, %xmm0 ; SSE2-NEXT: por %xmm7, %xmm0 ; SSE2-NEXT: movdqa %xmm1, %xmm7 @@ -2180,15 +2186,15 @@ define <32 x i16> @test_bitreverse_v32i16(<32 x i16> %a) nounwind { ; SSE2-NEXT: pand %xmm3, %xmm1 ; SSE2-NEXT: por %xmm5, %xmm1 ; SSE2-NEXT: movdqa %xmm1, %xmm5 -; SSE2-NEXT: pand %xmm10, %xmm5 -; SSE2-NEXT: psllw $2, %xmm5 +; SSE2-NEXT: psrlw $2, 
%xmm5 +; SSE2-NEXT: pand %xmm8, %xmm5 ; SSE2-NEXT: pand %xmm8, %xmm1 -; SSE2-NEXT: psrlw $2, %xmm1 +; SSE2-NEXT: psllw $2, %xmm1 ; SSE2-NEXT: por %xmm5, %xmm1 ; SSE2-NEXT: movdqa %xmm1, %xmm5 -; SSE2-NEXT: pand %xmm6, %xmm5 ; SSE2-NEXT: psrlw $1, %xmm5 -; SSE2-NEXT: pand %xmm9, %xmm1 +; SSE2-NEXT: pand %xmm6, %xmm5 +; SSE2-NEXT: pand %xmm6, %xmm1 ; SSE2-NEXT: paddb %xmm1, %xmm1 ; SSE2-NEXT: por %xmm5, %xmm1 ; SSE2-NEXT: movdqa %xmm2, %xmm5 @@ -2203,15 +2209,15 @@ define <32 x i16> @test_bitreverse_v32i16(<32 x i16> %a) nounwind { ; SSE2-NEXT: pand %xmm3, %xmm2 ; SSE2-NEXT: por %xmm7, %xmm2 ; SSE2-NEXT: movdqa %xmm2, %xmm5 -; SSE2-NEXT: pand %xmm10, %xmm5 -; SSE2-NEXT: psllw $2, %xmm5 +; SSE2-NEXT: psrlw $2, %xmm5 +; SSE2-NEXT: pand %xmm8, %xmm5 ; SSE2-NEXT: pand %xmm8, %xmm2 -; SSE2-NEXT: psrlw $2, %xmm2 +; SSE2-NEXT: psllw $2, %xmm2 ; SSE2-NEXT: por %xmm5, %xmm2 ; SSE2-NEXT: movdqa %xmm2, %xmm5 -; SSE2-NEXT: pand %xmm6, %xmm5 ; SSE2-NEXT: psrlw $1, %xmm5 -; SSE2-NEXT: pand %xmm9, %xmm2 +; SSE2-NEXT: pand %xmm6, %xmm5 +; SSE2-NEXT: pand %xmm6, %xmm2 ; SSE2-NEXT: paddb %xmm2, %xmm2 ; SSE2-NEXT: por %xmm5, %xmm2 ; SSE2-NEXT: movdqa %xmm4, %xmm5 @@ -2224,16 +2230,18 @@ define <32 x i16> @test_bitreverse_v32i16(<32 x i16> %a) nounwind { ; SSE2-NEXT: pand %xmm3, %xmm4 ; SSE2-NEXT: pandn %xmm5, %xmm3 ; SSE2-NEXT: por %xmm4, %xmm3 -; SSE2-NEXT: pand %xmm3, %xmm10 -; SSE2-NEXT: psllw $2, %xmm10 +; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: psrlw $2, %xmm4 +; SSE2-NEXT: pand %xmm8, %xmm4 ; SSE2-NEXT: pand %xmm8, %xmm3 -; SSE2-NEXT: psrlw $2, %xmm3 -; SSE2-NEXT: por %xmm10, %xmm3 -; SSE2-NEXT: pand %xmm3, %xmm6 -; SSE2-NEXT: psrlw $1, %xmm6 -; SSE2-NEXT: pand %xmm9, %xmm3 +; SSE2-NEXT: psllw $2, %xmm3 +; SSE2-NEXT: por %xmm4, %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: psrlw $1, %xmm4 +; SSE2-NEXT: pand %xmm6, %xmm4 +; SSE2-NEXT: pand %xmm6, %xmm3 ; SSE2-NEXT: paddb %xmm3, %xmm3 -; SSE2-NEXT: por %xmm6, %xmm3 +; SSE2-NEXT: por %xmm4, %xmm3 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: test_bitreverse_v32i16: @@ -2478,118 +2486,118 @@ define <32 x i16> @test_bitreverse_v32i16(<32 x i16> %a) nounwind { define <16 x i32> @test_bitreverse_v16i32(<16 x i32> %a) nounwind { ; SSE2-LABEL: test_bitreverse_v16i32: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm3, %xmm11 -; SSE2-NEXT: pxor %xmm10, %xmm10 +; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: pxor %xmm8, %xmm8 ; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm10[8],xmm3[9],xmm10[9],xmm3[10],xmm10[10],xmm3[11],xmm10[11],xmm3[12],xmm10[12],xmm3[13],xmm10[13],xmm3[14],xmm10[14],xmm3[15],xmm10[15] +; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm8[8],xmm3[9],xmm8[9],xmm3[10],xmm8[10],xmm3[11],xmm8[11],xmm3[12],xmm8[12],xmm3[13],xmm8[13],xmm3[14],xmm8[14],xmm3[15],xmm8[15] ; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,2,1,0,4,5,6,7] ; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,6,5,4] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3],xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3],xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7] ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7] ; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4] ; SSE2-NEXT: packuswb %xmm3, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm5 -; SSE2-NEXT: psllw $4, %xmm5 +; SSE2-NEXT: movdqa %xmm0, %xmm6 +; SSE2-NEXT: psllw $4, %xmm6 ; SSE2-NEXT: movdqa 
{{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; SSE2-NEXT: movdqa %xmm3, %xmm7 -; SSE2-NEXT: pandn %xmm5, %xmm7 +; SSE2-NEXT: pandn %xmm6, %xmm7 ; SSE2-NEXT: psrlw $4, %xmm0 ; SSE2-NEXT: pand %xmm3, %xmm0 ; SSE2-NEXT: por %xmm7, %xmm0 -; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] ; SSE2-NEXT: movdqa %xmm0, %xmm7 -; SSE2-NEXT: pand %xmm5, %xmm7 -; SSE2-NEXT: psllw $2, %xmm7 -; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [204,204,204,204,204,204,204,204,204,204,204,204,204,204,204,204] -; SSE2-NEXT: pand %xmm8, %xmm0 -; SSE2-NEXT: psrlw $2, %xmm0 -; SSE2-NEXT: por %xmm7, %xmm0 -; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [170,170,170,170,170,170,170,170,170,170,170,170,170,170,170,170] -; SSE2-NEXT: movdqa %xmm0, %xmm6 -; SSE2-NEXT: pand %xmm7, %xmm6 -; SSE2-NEXT: psrlw $1, %xmm6 -; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85] +; SSE2-NEXT: psrlw $2, %xmm7 +; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] +; SSE2-NEXT: pand %xmm9, %xmm7 ; SSE2-NEXT: pand %xmm9, %xmm0 +; SSE2-NEXT: psllw $2, %xmm0 +; SSE2-NEXT: por %xmm7, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm5 +; SSE2-NEXT: psrlw $1, %xmm5 +; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85] +; SSE2-NEXT: pand %xmm7, %xmm5 +; SSE2-NEXT: pand %xmm7, %xmm0 ; SSE2-NEXT: paddb %xmm0, %xmm0 -; SSE2-NEXT: por %xmm6, %xmm0 -; SSE2-NEXT: movdqa %xmm1, %xmm6 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm10[8],xmm6[9],xmm10[9],xmm6[10],xmm10[10],xmm6[11],xmm10[11],xmm6[12],xmm10[12],xmm6[13],xmm10[13],xmm6[14],xmm10[14],xmm6[15],xmm10[15] -; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[3,2,1,0,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,6,5,4] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1],xmm1[2],xmm10[2],xmm1[3],xmm10[3],xmm1[4],xmm10[4],xmm1[5],xmm10[5],xmm1[6],xmm10[6],xmm1[7],xmm10[7] +; SSE2-NEXT: por %xmm5, %xmm0 +; SSE2-NEXT: movdqa %xmm1, %xmm5 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm8[8],xmm5[9],xmm8[9],xmm5[10],xmm8[10],xmm5[11],xmm8[11],xmm5[12],xmm8[12],xmm5[13],xmm8[13],xmm5[14],xmm8[14],xmm5[15],xmm8[15] +; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[3,2,1,0,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,6,5,4] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3],xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7] ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7] ; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4] -; SSE2-NEXT: packuswb %xmm6, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm6 -; SSE2-NEXT: psllw $4, %xmm6 -; SSE2-NEXT: movdqa %xmm3, %xmm4 -; SSE2-NEXT: pandn %xmm6, %xmm4 +; SSE2-NEXT: packuswb %xmm5, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm5 +; SSE2-NEXT: psllw $4, %xmm5 +; SSE2-NEXT: movdqa %xmm3, %xmm6 +; SSE2-NEXT: pandn %xmm5, %xmm6 ; SSE2-NEXT: psrlw $4, %xmm1 ; SSE2-NEXT: pand %xmm3, %xmm1 -; SSE2-NEXT: por %xmm4, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm4 -; SSE2-NEXT: pand %xmm5, %xmm4 -; SSE2-NEXT: psllw $2, %xmm4 -; SSE2-NEXT: pand %xmm8, %xmm1 -; SSE2-NEXT: psrlw $2, %xmm1 -; SSE2-NEXT: por %xmm4, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm4 -; SSE2-NEXT: pand %xmm7, %xmm4 -; SSE2-NEXT: psrlw $1, %xmm4 +; SSE2-NEXT: por %xmm6, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm5 +; SSE2-NEXT: psrlw $2, %xmm5 +; SSE2-NEXT: pand %xmm9, %xmm5 ; SSE2-NEXT: pand %xmm9, %xmm1 +; SSE2-NEXT: psllw $2, %xmm1 +; SSE2-NEXT: por %xmm5, %xmm1 +; SSE2-NEXT: 
movdqa %xmm1, %xmm5
+; SSE2-NEXT: psrlw $1, %xmm5
+; SSE2-NEXT: pand %xmm7, %xmm5
+; SSE2-NEXT: pand %xmm7, %xmm1
 ; SSE2-NEXT: paddb %xmm1, %xmm1
-; SSE2-NEXT: por %xmm4, %xmm1
-; SSE2-NEXT: movdqa %xmm2, %xmm4
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm10[8],xmm4[9],xmm10[9],xmm4[10],xmm10[10],xmm4[11],xmm10[11],xmm4[12],xmm10[12],xmm4[13],xmm10[13],xmm4[14],xmm10[14],xmm4[15],xmm10[15]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,2,1,0,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,6,5,4]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1],xmm2[2],xmm10[2],xmm2[3],xmm10[3],xmm2[4],xmm10[4],xmm2[5],xmm10[5],xmm2[6],xmm10[6],xmm2[7],xmm10[7]
+; SSE2-NEXT: por %xmm5, %xmm1
+; SSE2-NEXT: movdqa %xmm2, %xmm5
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm8[8],xmm5[9],xmm8[9],xmm5[10],xmm8[10],xmm5[11],xmm8[11],xmm5[12],xmm8[12],xmm5[13],xmm8[13],xmm5[14],xmm8[14],xmm5[15],xmm8[15]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[3,2,1,0,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,6,5,4]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3],xmm2[4],xmm8[4],xmm2[5],xmm8[5],xmm2[6],xmm8[6],xmm2[7],xmm8[7]
 ; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7]
 ; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
-; SSE2-NEXT: packuswb %xmm4, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm4
-; SSE2-NEXT: psllw $4, %xmm4
+; SSE2-NEXT: packuswb %xmm5, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm5
+; SSE2-NEXT: psllw $4, %xmm5
 ; SSE2-NEXT: movdqa %xmm3, %xmm6
-; SSE2-NEXT: pandn %xmm4, %xmm6
+; SSE2-NEXT: pandn %xmm5, %xmm6
 ; SSE2-NEXT: psrlw $4, %xmm2
 ; SSE2-NEXT: pand %xmm3, %xmm2
 ; SSE2-NEXT: por %xmm6, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm4
-; SSE2-NEXT: pand %xmm5, %xmm4
-; SSE2-NEXT: psllw $2, %xmm4
-; SSE2-NEXT: pand %xmm8, %xmm2
-; SSE2-NEXT: psrlw $2, %xmm2
-; SSE2-NEXT: por %xmm4, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm4
-; SSE2-NEXT: pand %xmm7, %xmm4
-; SSE2-NEXT: psrlw $1, %xmm4
+; SSE2-NEXT: movdqa %xmm2, %xmm5
+; SSE2-NEXT: psrlw $2, %xmm5
+; SSE2-NEXT: pand %xmm9, %xmm5
 ; SSE2-NEXT: pand %xmm9, %xmm2
+; SSE2-NEXT: psllw $2, %xmm2
+; SSE2-NEXT: por %xmm5, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm5
+; SSE2-NEXT: psrlw $1, %xmm5
+; SSE2-NEXT: pand %xmm7, %xmm5
+; SSE2-NEXT: pand %xmm7, %xmm2
 ; SSE2-NEXT: paddb %xmm2, %xmm2
-; SSE2-NEXT: por %xmm4, %xmm2
-; SSE2-NEXT: movdqa %xmm11, %xmm4
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm10[8],xmm4[9],xmm10[9],xmm4[10],xmm10[10],xmm4[11],xmm10[11],xmm4[12],xmm10[12],xmm4[13],xmm10[13],xmm4[14],xmm10[14],xmm4[15],xmm10[15]
+; SSE2-NEXT: por %xmm5, %xmm2
+; SSE2-NEXT: movdqa %xmm4, %xmm5
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm8[8],xmm5[9],xmm8[9],xmm5[10],xmm8[10],xmm5[11],xmm8[11],xmm5[12],xmm8[12],xmm5[13],xmm8[13],xmm5[14],xmm8[14],xmm5[15],xmm8[15]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[3,2,1,0,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,6,5,4]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1],xmm4[2],xmm8[2],xmm4[3],xmm8[3],xmm4[4],xmm8[4],xmm4[5],xmm8[5],xmm4[6],xmm8[6],xmm4[7],xmm8[7]
 ; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,2,1,0,4,5,6,7]
 ; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,6,5,4]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3],xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm11[3,2,1,0,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,6,5,4]
-; SSE2-NEXT: packuswb %xmm4, %xmm6
-; SSE2-NEXT: movdqa %xmm6, %xmm4
-; SSE2-NEXT: psllw $4, %xmm4
-; SSE2-NEXT: psrlw $4, %xmm6
-; SSE2-NEXT: pand %xmm3, %xmm6
-; SSE2-NEXT: pandn %xmm4, %xmm3
-; SSE2-NEXT: por %xmm6, %xmm3
-; SSE2-NEXT: pand %xmm3, %xmm5
-; SSE2-NEXT: psllw $2, %xmm5
-; SSE2-NEXT: pand %xmm8, %xmm3
-; SSE2-NEXT: psrlw $2, %xmm3
-; SSE2-NEXT: por %xmm5, %xmm3
-; SSE2-NEXT: pand %xmm3, %xmm7
-; SSE2-NEXT: psrlw $1, %xmm7
+; SSE2-NEXT: packuswb %xmm5, %xmm4
+; SSE2-NEXT: movdqa %xmm4, %xmm5
+; SSE2-NEXT: psllw $4, %xmm5
+; SSE2-NEXT: psrlw $4, %xmm4
+; SSE2-NEXT: pand %xmm3, %xmm4
+; SSE2-NEXT: pandn %xmm5, %xmm3
+; SSE2-NEXT: por %xmm4, %xmm3
+; SSE2-NEXT: movdqa %xmm3, %xmm4
+; SSE2-NEXT: psrlw $2, %xmm4
+; SSE2-NEXT: pand %xmm9, %xmm4
 ; SSE2-NEXT: pand %xmm9, %xmm3
+; SSE2-NEXT: psllw $2, %xmm3
+; SSE2-NEXT: por %xmm4, %xmm3
+; SSE2-NEXT: movdqa %xmm3, %xmm4
+; SSE2-NEXT: psrlw $1, %xmm4
+; SSE2-NEXT: pand %xmm7, %xmm4
+; SSE2-NEXT: pand %xmm7, %xmm3
 ; SSE2-NEXT: paddb %xmm3, %xmm3
-; SSE2-NEXT: por %xmm7, %xmm3
+; SSE2-NEXT: por %xmm4, %xmm3
 ; SSE2-NEXT: retq
 ;
 ; SSSE3-LABEL: test_bitreverse_v16i32:
@@ -2834,126 +2842,126 @@ define <16 x i32> @test_bitreverse_v16i32(<16 x i32> %a) nounwind {
 define <8 x i64> @test_bitreverse_v8i64(<8 x i64> %a) nounwind {
 ; SSE2-LABEL: test_bitreverse_v8i64:
 ; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa %xmm3, %xmm11
-; SSE2-NEXT: pxor %xmm10, %xmm10
+; SSE2-NEXT: movdqa %xmm3, %xmm4
+; SSE2-NEXT: pxor %xmm8, %xmm8
 ; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm10[8],xmm3[9],xmm10[9],xmm3[10],xmm10[10],xmm3[11],xmm10[11],xmm3[12],xmm10[12],xmm3[13],xmm10[13],xmm3[14],xmm10[14],xmm3[15],xmm10[15]
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm8[8],xmm3[9],xmm8[9],xmm3[10],xmm8[10],xmm3[11],xmm8[11],xmm3[12],xmm8[12],xmm3[13],xmm8[13],xmm3[14],xmm8[14],xmm3[15],xmm8[15]
 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
 ; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,2,1,0,4,5,6,7]
 ; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,6,5,4]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3],xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3],xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7]
 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
 ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
 ; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
 ; SSE2-NEXT: packuswb %xmm3, %xmm0
-; SSE2-NEXT: movdqa %xmm0, %xmm5
-; SSE2-NEXT: psllw $4, %xmm5
+; SSE2-NEXT: movdqa %xmm0, %xmm6
+; SSE2-NEXT: psllw $4, %xmm6
 ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; SSE2-NEXT: movdqa %xmm3, %xmm7
-; SSE2-NEXT: pandn %xmm5, %xmm7
+; SSE2-NEXT: pandn %xmm6, %xmm7
 ; SSE2-NEXT: psrlw $4, %xmm0
 ; SSE2-NEXT: pand %xmm3, %xmm0
 ; SSE2-NEXT: por %xmm7, %xmm0
-; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
 ; SSE2-NEXT: movdqa %xmm0, %xmm7
-; SSE2-NEXT: pand %xmm5, %xmm7
-; SSE2-NEXT: psllw $2, %xmm7
-; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [204,204,204,204,204,204,204,204,204,204,204,204,204,204,204,204]
-; SSE2-NEXT: pand %xmm8, %xmm0
-; SSE2-NEXT: psrlw $2, %xmm0
-; SSE2-NEXT: por %xmm7, %xmm0
-; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [170,170,170,170,170,170,170,170,170,170,170,170,170,170,170,170]
-; SSE2-NEXT: movdqa %xmm0, %xmm6
-; SSE2-NEXT: pand %xmm7, %xmm6
-; SSE2-NEXT: psrlw $1, %xmm6
-; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
+; SSE2-NEXT: psrlw $2, %xmm7
+; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
+; SSE2-NEXT: pand %xmm9, %xmm7
 ; SSE2-NEXT: pand %xmm9, %xmm0
+; SSE2-NEXT: psllw $2, %xmm0
+; SSE2-NEXT: por %xmm7, %xmm0
+; SSE2-NEXT: movdqa %xmm0, %xmm5
+; SSE2-NEXT: psrlw $1, %xmm5
+; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
+; SSE2-NEXT: pand %xmm7, %xmm5
+; SSE2-NEXT: pand %xmm7, %xmm0
 ; SSE2-NEXT: paddb %xmm0, %xmm0
-; SSE2-NEXT: por %xmm6, %xmm0
-; SSE2-NEXT: movdqa %xmm1, %xmm6
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm10[8],xmm6[9],xmm10[9],xmm6[10],xmm10[10],xmm6[11],xmm10[11],xmm6[12],xmm10[12],xmm6[13],xmm10[13],xmm6[14],xmm10[14],xmm6[15],xmm10[15]
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,3,0,1]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[3,2,1,0,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,6,5,4]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1],xmm1[2],xmm10[2],xmm1[3],xmm10[3],xmm1[4],xmm10[4],xmm1[5],xmm10[5],xmm1[6],xmm10[6],xmm1[7],xmm10[7]
+; SSE2-NEXT: por %xmm5, %xmm0
+; SSE2-NEXT: movdqa %xmm1, %xmm5
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm8[8],xmm5[9],xmm8[9],xmm5[10],xmm8[10],xmm5[11],xmm8[11],xmm5[12],xmm8[12],xmm5[13],xmm8[13],xmm5[14],xmm8[14],xmm5[15],xmm8[15]
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,3,0,1]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[3,2,1,0,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,6,5,4]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3],xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7]
 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
 ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7]
 ; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4]
-; SSE2-NEXT: packuswb %xmm6, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm6
-; SSE2-NEXT: psllw $4, %xmm6
-; SSE2-NEXT: movdqa %xmm3, %xmm4
-; SSE2-NEXT: pandn %xmm6, %xmm4
+; SSE2-NEXT: packuswb %xmm5, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm5
+; SSE2-NEXT: psllw $4, %xmm5
+; SSE2-NEXT: movdqa %xmm3, %xmm6
+; SSE2-NEXT: pandn %xmm5, %xmm6
 ; SSE2-NEXT: psrlw $4, %xmm1
 ; SSE2-NEXT: pand %xmm3, %xmm1
-; SSE2-NEXT: por %xmm4, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm4
-; SSE2-NEXT: pand %xmm5, %xmm4
-; SSE2-NEXT: psllw $2, %xmm4
-; SSE2-NEXT: pand %xmm8, %xmm1
-; SSE2-NEXT: psrlw $2, %xmm1
-; SSE2-NEXT: por %xmm4, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm4
-; SSE2-NEXT: pand %xmm7, %xmm4
-; SSE2-NEXT: psrlw $1, %xmm4
+; SSE2-NEXT: por %xmm6, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm5
+; SSE2-NEXT: psrlw $2, %xmm5
+; SSE2-NEXT: pand %xmm9, %xmm5
 ; SSE2-NEXT: pand %xmm9, %xmm1
+; SSE2-NEXT: psllw $2, %xmm1
+; SSE2-NEXT: por %xmm5, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm5
+; SSE2-NEXT: psrlw $1, %xmm5
+; SSE2-NEXT: pand %xmm7, %xmm5
+; SSE2-NEXT: pand %xmm7, %xmm1
 ; SSE2-NEXT: paddb %xmm1, %xmm1
-; SSE2-NEXT: por %xmm4, %xmm1
-; SSE2-NEXT: movdqa %xmm2, %xmm4
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm10[8],xmm4[9],xmm10[9],xmm4[10],xmm10[10],xmm4[11],xmm10[11],xmm4[12],xmm10[12],xmm4[13],xmm10[13],xmm4[14],xmm10[14],xmm4[15],xmm10[15]
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,0,1]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,2,1,0,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,6,5,4]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1],xmm2[2],xmm10[2],xmm2[3],xmm10[3],xmm2[4],xmm10[4],xmm2[5],xmm10[5],xmm2[6],xmm10[6],xmm2[7],xmm10[7]
+; SSE2-NEXT: por %xmm5, %xmm1
+; SSE2-NEXT: movdqa %xmm2, %xmm5
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm8[8],xmm5[9],xmm8[9],xmm5[10],xmm8[10],xmm5[11],xmm8[11],xmm5[12],xmm8[12],xmm5[13],xmm8[13],xmm5[14],xmm8[14],xmm5[15],xmm8[15]
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,3,0,1]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[3,2,1,0,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,6,5,4]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3],xmm2[4],xmm8[4],xmm2[5],xmm8[5],xmm2[6],xmm8[6],xmm2[7],xmm8[7]
 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
 ; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7]
 ; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
-; SSE2-NEXT: packuswb %xmm4, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm4
-; SSE2-NEXT: psllw $4, %xmm4
+; SSE2-NEXT: packuswb %xmm5, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm5
+; SSE2-NEXT: psllw $4, %xmm5
 ; SSE2-NEXT: movdqa %xmm3, %xmm6
-; SSE2-NEXT: pandn %xmm4, %xmm6
+; SSE2-NEXT: pandn %xmm5, %xmm6
 ; SSE2-NEXT: psrlw $4, %xmm2
 ; SSE2-NEXT: pand %xmm3, %xmm2
 ; SSE2-NEXT: por %xmm6, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm4
-; SSE2-NEXT: pand %xmm5, %xmm4
-; SSE2-NEXT: psllw $2, %xmm4
-; SSE2-NEXT: pand %xmm8, %xmm2
-; SSE2-NEXT: psrlw $2, %xmm2
-; SSE2-NEXT: por %xmm4, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm4
-; SSE2-NEXT: pand %xmm7, %xmm4
-; SSE2-NEXT: psrlw $1, %xmm4
+; SSE2-NEXT: movdqa %xmm2, %xmm5
+; SSE2-NEXT: psrlw $2, %xmm5
+; SSE2-NEXT: pand %xmm9, %xmm5
 ; SSE2-NEXT: pand %xmm9, %xmm2
+; SSE2-NEXT: psllw $2, %xmm2
+; SSE2-NEXT: por %xmm5, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm5
+; SSE2-NEXT: psrlw $1, %xmm5
+; SSE2-NEXT: pand %xmm7, %xmm5
+; SSE2-NEXT: pand %xmm7, %xmm2
 ; SSE2-NEXT: paddb %xmm2, %xmm2
-; SSE2-NEXT: por %xmm4, %xmm2
-; SSE2-NEXT: movdqa %xmm11, %xmm4
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm10[8],xmm4[9],xmm10[9],xmm4[10],xmm10[10],xmm4[11],xmm10[11],xmm4[12],xmm10[12],xmm4[13],xmm10[13],xmm4[14],xmm10[14],xmm4[15],xmm10[15]
+; SSE2-NEXT: por %xmm5, %xmm2
+; SSE2-NEXT: movdqa %xmm4, %xmm5
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm8[8],xmm5[9],xmm8[9],xmm5[10],xmm8[10],xmm5[11],xmm8[11],xmm5[12],xmm8[12],xmm5[13],xmm8[13],xmm5[14],xmm8[14],xmm5[15],xmm8[15]
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,3,0,1]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[3,2,1,0,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,6,5,4]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1],xmm4[2],xmm8[2],xmm4[3],xmm8[3],xmm4[4],xmm8[4],xmm4[5],xmm8[5],xmm4[6],xmm8[6],xmm4[7],xmm8[7]
 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,0,1]
 ; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,2,1,0,4,5,6,7]
 ; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,6,5,4]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3],xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm11[2,3,0,1]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[3,2,1,0,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,6,5,4]
-; SSE2-NEXT: packuswb %xmm4, %xmm6
-; SSE2-NEXT: movdqa %xmm6, %xmm4
-; SSE2-NEXT: psllw $4, %xmm4
-; SSE2-NEXT: psrlw $4, %xmm6
-; SSE2-NEXT: pand %xmm3, %xmm6
-; SSE2-NEXT: pandn %xmm4, %xmm3
-; SSE2-NEXT: por %xmm6, %xmm3
-; SSE2-NEXT: pand %xmm3, %xmm5
-; SSE2-NEXT: psllw $2, %xmm5
-; SSE2-NEXT: pand %xmm8, %xmm3
-; SSE2-NEXT: psrlw $2, %xmm3
-; SSE2-NEXT: por %xmm5, %xmm3
-; SSE2-NEXT: pand %xmm3, %xmm7
-; SSE2-NEXT: psrlw $1, %xmm7
+; SSE2-NEXT: packuswb %xmm5, %xmm4
+; SSE2-NEXT: movdqa %xmm4, %xmm5
+; SSE2-NEXT: psllw $4, %xmm5
+; SSE2-NEXT: psrlw $4, %xmm4
+; SSE2-NEXT: pand %xmm3, %xmm4
+; SSE2-NEXT: pandn %xmm5, %xmm3
+; SSE2-NEXT: por %xmm4, %xmm3
+; SSE2-NEXT: movdqa %xmm3, %xmm4
+; SSE2-NEXT: psrlw $2, %xmm4
+; SSE2-NEXT: pand %xmm9, %xmm4
 ; SSE2-NEXT: pand %xmm9, %xmm3
+; SSE2-NEXT: psllw $2, %xmm3
+; SSE2-NEXT: por %xmm4, %xmm3
+; SSE2-NEXT: movdqa %xmm3, %xmm4
+; SSE2-NEXT: psrlw $1, %xmm4
+; SSE2-NEXT: pand %xmm7, %xmm4
+; SSE2-NEXT: pand %xmm7, %xmm3
 ; SSE2-NEXT: paddb %xmm3, %xmm3
-; SSE2-NEXT: por %xmm7, %xmm3
+; SSE2-NEXT: por %xmm4, %xmm3
 ; SSE2-NEXT: retq
 ;
 ; SSSE3-LABEL: test_bitreverse_v8i64: