[SelectionDAG] Optimize bitreverse expansion to minimize the number of mask constants.

We can halve the number of mask constants by applying the same mask
before the shl and after the srl, so each swap stage needs only the
low mask instead of a high/low pair.

This can reduce the number of mov immediate or constant
materializations, and it can reduce the number of constant pool loads
for X86 vectors.
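
As a scalar illustration of the rewrite (a minimal standalone sketch,
not the SelectionDAG code; the function names are hypothetical and
__builtin_bswap32 is a GCC/Clang builtin):

#include <cstdint>

// Old expansion: every swap stage uses a high mask and a low mask.
uint32_t bitreverse32_old(uint32_t v) {
  v = __builtin_bswap32(v);
  v = ((v & 0xF0F0F0F0u) >> 4) | ((v & 0x0F0F0F0Fu) << 4);
  v = ((v & 0xCCCCCCCCu) >> 2) | ((v & 0x33333333u) << 2);
  v = ((v & 0xAAAAAAAAu) >> 1) | ((v & 0x55555555u) << 1);
  return v;
}

// New expansion: masking after the srl lets both sides share the low mask.
uint32_t bitreverse32_new(uint32_t v) {
  v = __builtin_bswap32(v);
  v = ((v >> 4) & 0x0F0F0F0Fu) | ((v & 0x0F0F0F0Fu) << 4);
  v = ((v >> 2) & 0x33333333u) | ((v & 0x33333333u) << 2);
  v = ((v >> 1) & 0x55555555u) | ((v & 0x55555555u) << 1);
  return v;
}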

I think we might be able to do something similar for bswap. I'll
look at it next.
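
What the same trick might look like for a bswap expansion (a hedged
sketch of the idea, not committed code; the function name is
hypothetical):

#include <cstdint>

// i32 bswap as swap-bytes-within-halves, then rotate the halves.
// The byte-swap stage needs only the single 0x00FF00FF mask.
uint32_t bswap32_shared_mask(uint32_t v) {
  v = ((v >> 8) & 0x00FF00FFu) | ((v & 0x00FF00FFu) << 8);
  return (v >> 16) | (v << 16);
}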

Differential Revision: https://reviews.llvm.org/D108738
Craig Topper 2021-08-25 15:28:06 -07:00
parent 70f3ccb6a2
commit 8bb24289f3
8 changed files with 1916 additions and 2306 deletions


@@ -7296,34 +7296,31 @@ SDValue TargetLowering::expandBITREVERSE(SDNode *N, SelectionDAG &DAG) const {
   // TODO: We can easily support i4/i2 legal types if any target ever does.
   if (Sz >= 8 && isPowerOf2_32(Sz)) {
     // Create the masks - repeating the pattern every byte.
-    APInt MaskHi4 = APInt::getSplat(Sz, APInt(8, 0xF0));
-    APInt MaskHi2 = APInt::getSplat(Sz, APInt(8, 0xCC));
-    APInt MaskHi1 = APInt::getSplat(Sz, APInt(8, 0xAA));
-    APInt MaskLo4 = APInt::getSplat(Sz, APInt(8, 0x0F));
-    APInt MaskLo2 = APInt::getSplat(Sz, APInt(8, 0x33));
-    APInt MaskLo1 = APInt::getSplat(Sz, APInt(8, 0x55));
+    APInt Mask4 = APInt::getSplat(Sz, APInt(8, 0x0F));
+    APInt Mask2 = APInt::getSplat(Sz, APInt(8, 0x33));
+    APInt Mask1 = APInt::getSplat(Sz, APInt(8, 0x55));

     // BSWAP if the type is wider than a single byte.
     Tmp = (Sz > 8 ? DAG.getNode(ISD::BSWAP, dl, VT, Op) : Op);

-    // swap i4: ((V & 0xF0) >> 4) | ((V & 0x0F) << 4)
-    Tmp2 = DAG.getNode(ISD::AND, dl, VT, Tmp, DAG.getConstant(MaskHi4, dl, VT));
-    Tmp3 = DAG.getNode(ISD::AND, dl, VT, Tmp, DAG.getConstant(MaskLo4, dl, VT));
-    Tmp2 = DAG.getNode(ISD::SRL, dl, VT, Tmp2, DAG.getConstant(4, dl, SHVT));
+    // swap i4: ((V >> 4) & 0x0F) | ((V & 0x0F) << 4)
+    Tmp2 = DAG.getNode(ISD::SRL, dl, VT, Tmp, DAG.getConstant(4, dl, SHVT));
+    Tmp2 = DAG.getNode(ISD::AND, dl, VT, Tmp2, DAG.getConstant(Mask4, dl, VT));
+    Tmp3 = DAG.getNode(ISD::AND, dl, VT, Tmp, DAG.getConstant(Mask4, dl, VT));
     Tmp3 = DAG.getNode(ISD::SHL, dl, VT, Tmp3, DAG.getConstant(4, dl, SHVT));
     Tmp = DAG.getNode(ISD::OR, dl, VT, Tmp2, Tmp3);

-    // swap i2: ((V & 0xCC) >> 2) | ((V & 0x33) << 2)
-    Tmp2 = DAG.getNode(ISD::AND, dl, VT, Tmp, DAG.getConstant(MaskHi2, dl, VT));
-    Tmp3 = DAG.getNode(ISD::AND, dl, VT, Tmp, DAG.getConstant(MaskLo2, dl, VT));
-    Tmp2 = DAG.getNode(ISD::SRL, dl, VT, Tmp2, DAG.getConstant(2, dl, SHVT));
+    // swap i2: ((V >> 2) & 0x33) | ((V & 0x33) << 2)
+    Tmp2 = DAG.getNode(ISD::SRL, dl, VT, Tmp, DAG.getConstant(2, dl, SHVT));
+    Tmp2 = DAG.getNode(ISD::AND, dl, VT, Tmp2, DAG.getConstant(Mask2, dl, VT));
+    Tmp3 = DAG.getNode(ISD::AND, dl, VT, Tmp, DAG.getConstant(Mask2, dl, VT));
     Tmp3 = DAG.getNode(ISD::SHL, dl, VT, Tmp3, DAG.getConstant(2, dl, SHVT));
     Tmp = DAG.getNode(ISD::OR, dl, VT, Tmp2, Tmp3);

-    // swap i1: ((V & 0xAA) >> 1) | ((V & 0x55) << 1)
-    Tmp2 = DAG.getNode(ISD::AND, dl, VT, Tmp, DAG.getConstant(MaskHi1, dl, VT));
-    Tmp3 = DAG.getNode(ISD::AND, dl, VT, Tmp, DAG.getConstant(MaskLo1, dl, VT));
-    Tmp2 = DAG.getNode(ISD::SRL, dl, VT, Tmp2, DAG.getConstant(1, dl, SHVT));
+    // swap i1: ((V >> 1) & 0x55) | ((V & 0x55) << 1)
+    Tmp2 = DAG.getNode(ISD::SRL, dl, VT, Tmp, DAG.getConstant(1, dl, SHVT));
+    Tmp2 = DAG.getNode(ISD::AND, dl, VT, Tmp2, DAG.getConstant(Mask1, dl, VT));
+    Tmp3 = DAG.getNode(ISD::AND, dl, VT, Tmp, DAG.getConstant(Mask1, dl, VT));
     Tmp3 = DAG.getNode(ISD::SHL, dl, VT, Tmp3, DAG.getConstant(1, dl, SHVT));
     Tmp = DAG.getNode(ISD::OR, dl, VT, Tmp2, Tmp3);
     return Tmp;
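
The two forms agree because each high mask is just the low mask shifted
left, so (V & (M << k)) >> k == (V >> k) & M. A quick exhaustive check
over i8 (a standalone sketch, not part of the patch):

#include <cassert>
#include <cstdint>

int main() {
  for (unsigned v = 0; v < 256; ++v) {
    // Stage k = 4 with M = 0x0F; the identity covers k = 2 and 1 too.
    uint8_t a = ((v & 0xF0) >> 4) | ((v & 0x0F) << 4); // old: two masks
    uint8_t b = ((v >> 4) & 0x0F) | ((v & 0x0F) << 4); // new: one mask
    assert(a == b);
  }
  return 0;
}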


@@ -2453,13 +2453,13 @@ define zeroext i8 @bitreverse_i8(i8 zeroext %a) nounwind {
; RV32I-NEXT: or a0, a1, a0
; RV32I-NEXT: andi a1, a0, 51
; RV32I-NEXT: slli a1, a1, 2
; RV32I-NEXT: andi a0, a0, 204
; RV32I-NEXT: srli a0, a0, 2
; RV32I-NEXT: andi a0, a0, 51
; RV32I-NEXT: or a0, a0, a1
; RV32I-NEXT: andi a1, a0, 85
; RV32I-NEXT: slli a1, a1, 1
; RV32I-NEXT: andi a0, a0, 170
; RV32I-NEXT: srli a0, a0, 1
; RV32I-NEXT: andi a0, a0, 85
; RV32I-NEXT: or a0, a0, a1
; RV32I-NEXT: ret
;
@@ -2484,33 +2484,27 @@ define zeroext i16 @bitreverse_i16(i16 zeroext %a) nounwind {
; RV32I-NEXT: srli a1, a0, 8
; RV32I-NEXT: slli a0, a0, 8
; RV32I-NEXT: or a0, a0, a1
; RV32I-NEXT: lui a1, 1
; RV32I-NEXT: addi a1, a1, -241
; RV32I-NEXT: and a1, a0, a1
; RV32I-NEXT: slli a1, a1, 4
; RV32I-NEXT: lui a2, 15
; RV32I-NEXT: addi a2, a2, 240
; RV32I-NEXT: srli a1, a0, 4
; RV32I-NEXT: lui a2, 1
; RV32I-NEXT: addi a2, a2, -241
; RV32I-NEXT: and a1, a1, a2
; RV32I-NEXT: and a0, a0, a2
; RV32I-NEXT: srli a0, a0, 4
; RV32I-NEXT: or a0, a0, a1
; RV32I-NEXT: lui a1, 3
; RV32I-NEXT: addi a1, a1, 819
; RV32I-NEXT: and a1, a0, a1
; RV32I-NEXT: slli a1, a1, 2
; RV32I-NEXT: lui a2, 13
; RV32I-NEXT: addi a2, a2, -820
; RV32I-NEXT: slli a0, a0, 4
; RV32I-NEXT: or a0, a1, a0
; RV32I-NEXT: srli a1, a0, 2
; RV32I-NEXT: lui a2, 3
; RV32I-NEXT: addi a2, a2, 819
; RV32I-NEXT: and a1, a1, a2
; RV32I-NEXT: and a0, a0, a2
; RV32I-NEXT: srli a0, a0, 2
; RV32I-NEXT: or a0, a0, a1
; RV32I-NEXT: lui a1, 5
; RV32I-NEXT: addi a1, a1, 1365
; RV32I-NEXT: and a1, a0, a1
; RV32I-NEXT: slli a1, a1, 1
; RV32I-NEXT: lui a2, 11
; RV32I-NEXT: addi a2, a2, -1366
; RV32I-NEXT: slli a0, a0, 2
; RV32I-NEXT: or a0, a1, a0
; RV32I-NEXT: srli a1, a0, 1
; RV32I-NEXT: lui a2, 5
; RV32I-NEXT: addi a2, a2, 1365
; RV32I-NEXT: and a1, a1, a2
; RV32I-NEXT: and a0, a0, a2
; RV32I-NEXT: srli a0, a0, 1
; RV32I-NEXT: or a0, a0, a1
; RV32I-NEXT: slli a0, a0, 1
; RV32I-NEXT: or a0, a1, a0
; RV32I-NEXT: ret
;
; RV32B-LABEL: bitreverse_i16:
@@ -2543,33 +2537,27 @@ define i32 @bitreverse_i32(i32 %a) nounwind {
; RV32I-NEXT: slli a0, a0, 24
; RV32I-NEXT: or a0, a0, a2
; RV32I-NEXT: or a0, a0, a1
; RV32I-NEXT: lui a1, 61681
; RV32I-NEXT: addi a1, a1, -241
; RV32I-NEXT: and a1, a0, a1
; RV32I-NEXT: slli a1, a1, 4
; RV32I-NEXT: lui a2, 986895
; RV32I-NEXT: addi a2, a2, 240
; RV32I-NEXT: srli a1, a0, 4
; RV32I-NEXT: lui a2, 61681
; RV32I-NEXT: addi a2, a2, -241
; RV32I-NEXT: and a1, a1, a2
; RV32I-NEXT: and a0, a0, a2
; RV32I-NEXT: srli a0, a0, 4
; RV32I-NEXT: or a0, a0, a1
; RV32I-NEXT: lui a1, 209715
; RV32I-NEXT: addi a1, a1, 819
; RV32I-NEXT: and a1, a0, a1
; RV32I-NEXT: slli a1, a1, 2
; RV32I-NEXT: lui a2, 838861
; RV32I-NEXT: addi a2, a2, -820
; RV32I-NEXT: slli a0, a0, 4
; RV32I-NEXT: or a0, a1, a0
; RV32I-NEXT: srli a1, a0, 2
; RV32I-NEXT: lui a2, 209715
; RV32I-NEXT: addi a2, a2, 819
; RV32I-NEXT: and a1, a1, a2
; RV32I-NEXT: and a0, a0, a2
; RV32I-NEXT: srli a0, a0, 2
; RV32I-NEXT: or a0, a0, a1
; RV32I-NEXT: lui a1, 349525
; RV32I-NEXT: addi a1, a1, 1365
; RV32I-NEXT: and a1, a0, a1
; RV32I-NEXT: slli a1, a1, 1
; RV32I-NEXT: lui a2, 699051
; RV32I-NEXT: addi a2, a2, -1366
; RV32I-NEXT: slli a0, a0, 2
; RV32I-NEXT: or a0, a1, a0
; RV32I-NEXT: srli a1, a0, 1
; RV32I-NEXT: lui a2, 349525
; RV32I-NEXT: addi a2, a2, 1365
; RV32I-NEXT: and a1, a1, a2
; RV32I-NEXT: and a0, a0, a2
; RV32I-NEXT: srli a0, a0, 1
; RV32I-NEXT: or a0, a0, a1
; RV32I-NEXT: slli a0, a0, 1
; RV32I-NEXT: or a0, a1, a0
; RV32I-NEXT: ret
;
; RV32B-LABEL: bitreverse_i32:
@@ -2602,58 +2590,52 @@ define i64 @bitreverse_i64(i64 %a) nounwind {
; RV32I-NEXT: slli a1, a1, 24
; RV32I-NEXT: or a1, a1, a4
; RV32I-NEXT: or a1, a1, a2
; RV32I-NEXT: lui a2, 61681
; RV32I-NEXT: addi t0, a2, -241
; RV32I-NEXT: and a2, a1, t0
; RV32I-NEXT: slli a2, a2, 4
; RV32I-NEXT: lui a3, 986895
; RV32I-NEXT: addi t1, a3, 240
; RV32I-NEXT: and a1, a1, t1
; RV32I-NEXT: srli a1, a1, 4
; RV32I-NEXT: or a1, a1, a2
; RV32I-NEXT: lui a2, 209715
; RV32I-NEXT: addi t2, a2, 819
; RV32I-NEXT: and a2, a1, t2
; RV32I-NEXT: slli a2, a2, 2
; RV32I-NEXT: lui a4, 838861
; RV32I-NEXT: addi t3, a4, -820
; RV32I-NEXT: and a1, a1, t3
; RV32I-NEXT: srli a1, a1, 2
; RV32I-NEXT: or a1, a1, a2
; RV32I-NEXT: lui a2, 349525
; RV32I-NEXT: addi a3, a2, 1365
; RV32I-NEXT: and a2, a1, a3
; RV32I-NEXT: slli a2, a2, 1
; RV32I-NEXT: lui a5, 699051
; RV32I-NEXT: addi a5, a5, -1366
; RV32I-NEXT: srli a2, a1, 4
; RV32I-NEXT: lui a4, 61681
; RV32I-NEXT: addi a4, a4, -241
; RV32I-NEXT: and a2, a2, a4
; RV32I-NEXT: and a1, a1, a4
; RV32I-NEXT: slli a1, a1, 4
; RV32I-NEXT: or a1, a2, a1
; RV32I-NEXT: srli a2, a1, 2
; RV32I-NEXT: lui a3, 209715
; RV32I-NEXT: addi a3, a3, 819
; RV32I-NEXT: and a2, a2, a3
; RV32I-NEXT: and a1, a1, a3
; RV32I-NEXT: slli a1, a1, 2
; RV32I-NEXT: or a1, a2, a1
; RV32I-NEXT: srli a2, a1, 1
; RV32I-NEXT: lui a5, 349525
; RV32I-NEXT: addi a5, a5, 1365
; RV32I-NEXT: and a2, a2, a5
; RV32I-NEXT: and a1, a1, a5
; RV32I-NEXT: srli a1, a1, 1
; RV32I-NEXT: or a2, a1, a2
; RV32I-NEXT: slli a1, a1, 1
; RV32I-NEXT: or t0, a2, a1
; RV32I-NEXT: srli a1, a0, 8
; RV32I-NEXT: and a1, a1, a6
; RV32I-NEXT: srli a4, a0, 24
; RV32I-NEXT: or a1, a1, a4
; RV32I-NEXT: slli a4, a0, 8
; RV32I-NEXT: and a4, a4, a7
; RV32I-NEXT: srli a2, a0, 24
; RV32I-NEXT: or a1, a1, a2
; RV32I-NEXT: slli a2, a0, 8
; RV32I-NEXT: and a2, a2, a7
; RV32I-NEXT: slli a0, a0, 24
; RV32I-NEXT: or a0, a0, a4
; RV32I-NEXT: or a0, a0, a2
; RV32I-NEXT: or a0, a0, a1
; RV32I-NEXT: and a1, a0, t0
; RV32I-NEXT: slli a1, a1, 4
; RV32I-NEXT: and a0, a0, t1
; RV32I-NEXT: srli a0, a0, 4
; RV32I-NEXT: or a0, a0, a1
; RV32I-NEXT: and a1, a0, t2
; RV32I-NEXT: slli a1, a1, 2
; RV32I-NEXT: and a0, a0, t3
; RV32I-NEXT: srli a0, a0, 2
; RV32I-NEXT: or a0, a0, a1
; RV32I-NEXT: and a1, a0, a3
; RV32I-NEXT: slli a1, a1, 1
; RV32I-NEXT: srli a1, a0, 4
; RV32I-NEXT: and a1, a1, a4
; RV32I-NEXT: and a0, a0, a4
; RV32I-NEXT: slli a0, a0, 4
; RV32I-NEXT: or a0, a1, a0
; RV32I-NEXT: srli a1, a0, 2
; RV32I-NEXT: and a1, a1, a3
; RV32I-NEXT: and a0, a0, a3
; RV32I-NEXT: slli a0, a0, 2
; RV32I-NEXT: or a0, a1, a0
; RV32I-NEXT: srli a1, a0, 1
; RV32I-NEXT: and a1, a1, a5
; RV32I-NEXT: and a0, a0, a5
; RV32I-NEXT: srli a0, a0, 1
; RV32I-NEXT: or a1, a0, a1
; RV32I-NEXT: mv a0, a2
; RV32I-NEXT: slli a0, a0, 1
; RV32I-NEXT: or a1, a1, a0
; RV32I-NEXT: mv a0, t0
; RV32I-NEXT: ret
;
; RV32B-LABEL: bitreverse_i64:
@@ -2756,33 +2738,27 @@ define i32 @bitreverse_bswap_i32(i32 %a) {
; RV32I-NEXT: slli a0, a0, 24
; RV32I-NEXT: or a0, a0, a3
; RV32I-NEXT: or a0, a0, a1
; RV32I-NEXT: lui a1, 61681
; RV32I-NEXT: addi a1, a1, -241
; RV32I-NEXT: and a1, a0, a1
; RV32I-NEXT: slli a1, a1, 4
; RV32I-NEXT: lui a3, 986895
; RV32I-NEXT: addi a3, a3, 240
; RV32I-NEXT: srli a1, a0, 4
; RV32I-NEXT: lui a3, 61681
; RV32I-NEXT: addi a3, a3, -241
; RV32I-NEXT: and a1, a1, a3
; RV32I-NEXT: and a0, a0, a3
; RV32I-NEXT: srli a0, a0, 4
; RV32I-NEXT: or a0, a0, a1
; RV32I-NEXT: lui a1, 209715
; RV32I-NEXT: addi a1, a1, 819
; RV32I-NEXT: and a1, a0, a1
; RV32I-NEXT: slli a1, a1, 2
; RV32I-NEXT: lui a3, 838861
; RV32I-NEXT: addi a3, a3, -820
; RV32I-NEXT: slli a0, a0, 4
; RV32I-NEXT: or a0, a1, a0
; RV32I-NEXT: srli a1, a0, 2
; RV32I-NEXT: lui a3, 209715
; RV32I-NEXT: addi a3, a3, 819
; RV32I-NEXT: and a1, a1, a3
; RV32I-NEXT: and a0, a0, a3
; RV32I-NEXT: srli a0, a0, 2
; RV32I-NEXT: or a0, a0, a1
; RV32I-NEXT: lui a1, 349525
; RV32I-NEXT: addi a1, a1, 1365
; RV32I-NEXT: and a1, a0, a1
; RV32I-NEXT: slli a1, a1, 1
; RV32I-NEXT: lui a3, 699051
; RV32I-NEXT: addi a3, a3, -1366
; RV32I-NEXT: slli a0, a0, 2
; RV32I-NEXT: or a0, a1, a0
; RV32I-NEXT: srli a1, a0, 1
; RV32I-NEXT: lui a3, 349525
; RV32I-NEXT: addi a3, a3, 1365
; RV32I-NEXT: and a1, a1, a3
; RV32I-NEXT: and a0, a0, a3
; RV32I-NEXT: srli a0, a0, 1
; RV32I-NEXT: or a0, a0, a1
; RV32I-NEXT: slli a0, a0, 1
; RV32I-NEXT: or a0, a1, a0
; RV32I-NEXT: srli a1, a0, 8
; RV32I-NEXT: and a1, a1, a2
; RV32I-NEXT: srli a2, a0, 24
@@ -2813,82 +2789,76 @@ define i64 @bitreverse_bswap_i64(i64 %a) {
; RV32I: # %bb.0:
; RV32I-NEXT: srli a3, a1, 8
; RV32I-NEXT: lui a2, 16
; RV32I-NEXT: addi t0, a2, -256
; RV32I-NEXT: and a3, a3, t0
; RV32I-NEXT: addi a6, a2, -256
; RV32I-NEXT: and a3, a3, a6
; RV32I-NEXT: srli a4, a1, 24
; RV32I-NEXT: or a4, a3, a4
; RV32I-NEXT: slli a5, a1, 8
; RV32I-NEXT: lui t1, 4080
; RV32I-NEXT: and a5, a5, t1
; RV32I-NEXT: or a3, a3, a4
; RV32I-NEXT: slli a4, a1, 8
; RV32I-NEXT: lui a7, 4080
; RV32I-NEXT: and a4, a4, a7
; RV32I-NEXT: slli a1, a1, 24
; RV32I-NEXT: or a1, a1, a5
; RV32I-NEXT: or a1, a1, a4
; RV32I-NEXT: lui a4, 61681
; RV32I-NEXT: addi a6, a4, -241
; RV32I-NEXT: and a5, a1, a6
; RV32I-NEXT: slli a5, a5, 4
; RV32I-NEXT: lui a4, 986895
; RV32I-NEXT: addi a7, a4, 240
; RV32I-NEXT: and a1, a1, a7
; RV32I-NEXT: srli a1, a1, 4
; RV32I-NEXT: or a1, a1, a5
; RV32I-NEXT: lui a5, 209715
; RV32I-NEXT: addi t2, a5, 819
; RV32I-NEXT: and a4, a1, t2
; RV32I-NEXT: slli a4, a4, 2
; RV32I-NEXT: lui a2, 838861
; RV32I-NEXT: addi t3, a2, -820
; RV32I-NEXT: and a1, a1, t3
; RV32I-NEXT: srli a1, a1, 2
; RV32I-NEXT: or a1, a1, a4
; RV32I-NEXT: lui a4, 349525
; RV32I-NEXT: addi a4, a4, 1365
; RV32I-NEXT: and a3, a1, a4
; RV32I-NEXT: slli a3, a3, 1
; RV32I-NEXT: lui a5, 699051
; RV32I-NEXT: addi a5, a5, -1366
; RV32I-NEXT: and a1, a1, a5
; RV32I-NEXT: srli a1, a1, 1
; RV32I-NEXT: or a1, a1, a3
; RV32I-NEXT: srli a3, a0, 8
; RV32I-NEXT: srli a3, a1, 4
; RV32I-NEXT: lui a4, 61681
; RV32I-NEXT: addi t0, a4, -241
; RV32I-NEXT: and a3, a3, t0
; RV32I-NEXT: srli a2, a0, 24
; RV32I-NEXT: or a2, a3, a2
; RV32I-NEXT: slli a3, a0, 8
; RV32I-NEXT: and a3, a3, t1
; RV32I-NEXT: and a1, a1, t0
; RV32I-NEXT: slli a1, a1, 4
; RV32I-NEXT: or a1, a3, a1
; RV32I-NEXT: srli a3, a1, 2
; RV32I-NEXT: lui a2, 209715
; RV32I-NEXT: addi a2, a2, 819
; RV32I-NEXT: and a3, a3, a2
; RV32I-NEXT: and a1, a1, a2
; RV32I-NEXT: slli a1, a1, 2
; RV32I-NEXT: or a1, a3, a1
; RV32I-NEXT: srli a3, a1, 1
; RV32I-NEXT: lui a5, 349525
; RV32I-NEXT: addi a5, a5, 1365
; RV32I-NEXT: and a3, a3, a5
; RV32I-NEXT: and a1, a1, a5
; RV32I-NEXT: slli a1, a1, 1
; RV32I-NEXT: or a1, a3, a1
; RV32I-NEXT: srli a3, a0, 8
; RV32I-NEXT: and a3, a3, a6
; RV32I-NEXT: srli a4, a0, 24
; RV32I-NEXT: or a3, a3, a4
; RV32I-NEXT: slli a4, a0, 8
; RV32I-NEXT: and a4, a4, a7
; RV32I-NEXT: slli a0, a0, 24
; RV32I-NEXT: or a0, a0, a4
; RV32I-NEXT: or a0, a0, a3
; RV32I-NEXT: or a0, a0, a2
; RV32I-NEXT: and a2, a0, a6
; RV32I-NEXT: slli a2, a2, 4
; RV32I-NEXT: and a0, a0, a7
; RV32I-NEXT: srli a0, a0, 4
; RV32I-NEXT: or a0, a0, a2
; RV32I-NEXT: and a2, a0, t2
; RV32I-NEXT: slli a2, a2, 2
; RV32I-NEXT: and a0, a0, t3
; RV32I-NEXT: srli a0, a0, 2
; RV32I-NEXT: or a0, a0, a2
; RV32I-NEXT: and a2, a0, a4
; RV32I-NEXT: slli a2, a2, 1
; RV32I-NEXT: srli a3, a0, 4
; RV32I-NEXT: and a3, a3, t0
; RV32I-NEXT: and a0, a0, t0
; RV32I-NEXT: slli a0, a0, 4
; RV32I-NEXT: or a0, a3, a0
; RV32I-NEXT: srli a3, a0, 2
; RV32I-NEXT: and a3, a3, a2
; RV32I-NEXT: and a0, a0, a2
; RV32I-NEXT: slli a0, a0, 2
; RV32I-NEXT: or a0, a3, a0
; RV32I-NEXT: srli a2, a0, 1
; RV32I-NEXT: and a2, a2, a5
; RV32I-NEXT: and a0, a0, a5
; RV32I-NEXT: srli a0, a0, 1
; RV32I-NEXT: or a0, a0, a2
; RV32I-NEXT: slli a0, a0, 1
; RV32I-NEXT: or a0, a2, a0
; RV32I-NEXT: srli a2, a0, 8
; RV32I-NEXT: and a2, a2, t0
; RV32I-NEXT: and a2, a2, a6
; RV32I-NEXT: srli a3, a0, 24
; RV32I-NEXT: or a2, a2, a3
; RV32I-NEXT: slli a3, a0, 8
; RV32I-NEXT: and a3, a3, t1
; RV32I-NEXT: and a3, a3, a7
; RV32I-NEXT: slli a0, a0, 24
; RV32I-NEXT: or a0, a0, a3
; RV32I-NEXT: or a0, a0, a2
; RV32I-NEXT: srli a2, a1, 8
; RV32I-NEXT: and a2, a2, t0
; RV32I-NEXT: and a2, a2, a6
; RV32I-NEXT: srli a3, a1, 24
; RV32I-NEXT: or a2, a2, a3
; RV32I-NEXT: slli a3, a1, 8
; RV32I-NEXT: and a3, a3, t1
; RV32I-NEXT: and a3, a3, a7
; RV32I-NEXT: slli a1, a1, 24
; RV32I-NEXT: or a1, a1, a3
; RV32I-NEXT: or a1, a1, a2

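The RV32I savings above come from constant materialization: each 32-bit
mask costs a lui+addi pair, so dropping the high mask of every stage
saves two instructions per stage. A sketch mirroring the arithmetic of
the check lines above (hypothetical standalone code):

#include <cassert>
#include <cstdint>

int main() {
  // Kept low mask:     lui 61681;  addi -241  ->  0x0F0F0F0F
  uint32_t lo4 = (61681u << 12) - 241;
  // Dropped high mask: lui 986895; addi 240   ->  0xF0F0F0F0
  uint32_t hi4 = (986895u << 12) + 240;
  assert(lo4 == 0x0F0F0F0Fu && hi4 == 0xF0F0F0F0u);
  return 0;
}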

@@ -2816,13 +2816,13 @@ define zeroext i8 @bitreverse_i8(i8 zeroext %a) nounwind {
; RV64I-NEXT: or a0, a1, a0
; RV64I-NEXT: andi a1, a0, 51
; RV64I-NEXT: slli a1, a1, 2
; RV64I-NEXT: andi a0, a0, 204
; RV64I-NEXT: srli a0, a0, 2
; RV64I-NEXT: andi a0, a0, 51
; RV64I-NEXT: or a0, a0, a1
; RV64I-NEXT: andi a1, a0, 85
; RV64I-NEXT: slli a1, a1, 1
; RV64I-NEXT: andi a0, a0, 170
; RV64I-NEXT: srli a0, a0, 1
; RV64I-NEXT: andi a0, a0, 85
; RV64I-NEXT: or a0, a0, a1
; RV64I-NEXT: ret
;
@@ -2847,33 +2847,27 @@ define zeroext i16 @bitreverse_i16(i16 zeroext %a) nounwind {
; RV64I-NEXT: srli a1, a0, 8
; RV64I-NEXT: slli a0, a0, 8
; RV64I-NEXT: or a0, a0, a1
; RV64I-NEXT: lui a1, 1
; RV64I-NEXT: addiw a1, a1, -241
; RV64I-NEXT: and a1, a0, a1
; RV64I-NEXT: slli a1, a1, 4
; RV64I-NEXT: lui a2, 15
; RV64I-NEXT: addiw a2, a2, 240
; RV64I-NEXT: srli a1, a0, 4
; RV64I-NEXT: lui a2, 1
; RV64I-NEXT: addiw a2, a2, -241
; RV64I-NEXT: and a1, a1, a2
; RV64I-NEXT: and a0, a0, a2
; RV64I-NEXT: srli a0, a0, 4
; RV64I-NEXT: or a0, a0, a1
; RV64I-NEXT: lui a1, 3
; RV64I-NEXT: addiw a1, a1, 819
; RV64I-NEXT: and a1, a0, a1
; RV64I-NEXT: slli a1, a1, 2
; RV64I-NEXT: lui a2, 13
; RV64I-NEXT: addiw a2, a2, -820
; RV64I-NEXT: slli a0, a0, 4
; RV64I-NEXT: or a0, a1, a0
; RV64I-NEXT: srli a1, a0, 2
; RV64I-NEXT: lui a2, 3
; RV64I-NEXT: addiw a2, a2, 819
; RV64I-NEXT: and a1, a1, a2
; RV64I-NEXT: and a0, a0, a2
; RV64I-NEXT: srli a0, a0, 2
; RV64I-NEXT: or a0, a0, a1
; RV64I-NEXT: lui a1, 5
; RV64I-NEXT: addiw a1, a1, 1365
; RV64I-NEXT: and a1, a0, a1
; RV64I-NEXT: slli a1, a1, 1
; RV64I-NEXT: lui a2, 11
; RV64I-NEXT: addiw a2, a2, -1366
; RV64I-NEXT: slli a0, a0, 2
; RV64I-NEXT: or a0, a1, a0
; RV64I-NEXT: srli a1, a0, 1
; RV64I-NEXT: lui a2, 5
; RV64I-NEXT: addiw a2, a2, 1365
; RV64I-NEXT: and a1, a1, a2
; RV64I-NEXT: and a0, a0, a2
; RV64I-NEXT: srli a0, a0, 1
; RV64I-NEXT: or a0, a0, a1
; RV64I-NEXT: slli a0, a0, 1
; RV64I-NEXT: or a0, a1, a0
; RV64I-NEXT: ret
;
; RV64B-LABEL: bitreverse_i16:
@@ -2906,35 +2900,27 @@ define signext i32 @bitreverse_i32(i32 signext %a) nounwind {
; RV64I-NEXT: slli a0, a0, 24
; RV64I-NEXT: or a0, a0, a2
; RV64I-NEXT: or a0, a0, a1
; RV64I-NEXT: lui a1, 61681
; RV64I-NEXT: addiw a1, a1, -241
; RV64I-NEXT: and a1, a0, a1
; RV64I-NEXT: slli a1, a1, 4
; RV64I-NEXT: lui a2, 241
; RV64I-NEXT: srli a1, a0, 4
; RV64I-NEXT: lui a2, 61681
; RV64I-NEXT: addiw a2, a2, -241
; RV64I-NEXT: slli a2, a2, 12
; RV64I-NEXT: addi a2, a2, 240
; RV64I-NEXT: and a1, a1, a2
; RV64I-NEXT: and a0, a0, a2
; RV64I-NEXT: srli a0, a0, 4
; RV64I-NEXT: or a0, a0, a1
; RV64I-NEXT: lui a1, 209715
; RV64I-NEXT: addiw a1, a1, 819
; RV64I-NEXT: and a1, a0, a1
; RV64I-NEXT: slli a1, a1, 2
; RV64I-NEXT: lui a2, 838861
; RV64I-NEXT: addiw a2, a2, -820
; RV64I-NEXT: slli a0, a0, 4
; RV64I-NEXT: or a0, a1, a0
; RV64I-NEXT: srli a1, a0, 2
; RV64I-NEXT: lui a2, 209715
; RV64I-NEXT: addiw a2, a2, 819
; RV64I-NEXT: and a1, a1, a2
; RV64I-NEXT: and a0, a0, a2
; RV64I-NEXT: srli a0, a0, 2
; RV64I-NEXT: or a0, a0, a1
; RV64I-NEXT: lui a1, 349525
; RV64I-NEXT: addiw a1, a1, 1365
; RV64I-NEXT: and a1, a0, a1
; RV64I-NEXT: slli a1, a1, 1
; RV64I-NEXT: lui a2, 699051
; RV64I-NEXT: addiw a2, a2, -1366
; RV64I-NEXT: slli a0, a0, 2
; RV64I-NEXT: or a0, a1, a0
; RV64I-NEXT: srli a1, a0, 1
; RV64I-NEXT: lui a2, 349525
; RV64I-NEXT: addiw a2, a2, 1365
; RV64I-NEXT: and a1, a1, a2
; RV64I-NEXT: and a0, a0, a2
; RV64I-NEXT: srli a0, a0, 1
; RV64I-NEXT: or a0, a0, a1
; RV64I-NEXT: slli a0, a0, 1
; RV64I-NEXT: or a0, a1, a0
; RV64I-NEXT: sext.w a0, a0
; RV64I-NEXT: ret
;
@@ -2967,35 +2953,27 @@ define void @bitreverse_i32_nosext(i32 signext %a, i32* %x) nounwind {
; RV64I-NEXT: slli a0, a0, 24
; RV64I-NEXT: or a0, a0, a3
; RV64I-NEXT: or a0, a0, a2
; RV64I-NEXT: lui a2, 61681
; RV64I-NEXT: addiw a2, a2, -241
; RV64I-NEXT: and a2, a0, a2
; RV64I-NEXT: slli a2, a2, 4
; RV64I-NEXT: lui a3, 241
; RV64I-NEXT: srli a2, a0, 4
; RV64I-NEXT: lui a3, 61681
; RV64I-NEXT: addiw a3, a3, -241
; RV64I-NEXT: slli a3, a3, 12
; RV64I-NEXT: addi a3, a3, 240
; RV64I-NEXT: and a2, a2, a3
; RV64I-NEXT: and a0, a0, a3
; RV64I-NEXT: srli a0, a0, 4
; RV64I-NEXT: or a0, a0, a2
; RV64I-NEXT: lui a2, 209715
; RV64I-NEXT: addiw a2, a2, 819
; RV64I-NEXT: and a2, a0, a2
; RV64I-NEXT: slli a2, a2, 2
; RV64I-NEXT: lui a3, 838861
; RV64I-NEXT: addiw a3, a3, -820
; RV64I-NEXT: slli a0, a0, 4
; RV64I-NEXT: or a0, a2, a0
; RV64I-NEXT: srli a2, a0, 2
; RV64I-NEXT: lui a3, 209715
; RV64I-NEXT: addiw a3, a3, 819
; RV64I-NEXT: and a2, a2, a3
; RV64I-NEXT: and a0, a0, a3
; RV64I-NEXT: srli a0, a0, 2
; RV64I-NEXT: or a0, a0, a2
; RV64I-NEXT: lui a2, 349525
; RV64I-NEXT: addiw a2, a2, 1365
; RV64I-NEXT: and a2, a0, a2
; RV64I-NEXT: slli a2, a2, 1
; RV64I-NEXT: lui a3, 699051
; RV64I-NEXT: addiw a3, a3, -1366
; RV64I-NEXT: slli a0, a0, 2
; RV64I-NEXT: or a0, a2, a0
; RV64I-NEXT: srli a2, a0, 1
; RV64I-NEXT: lui a3, 349525
; RV64I-NEXT: addiw a3, a3, 1365
; RV64I-NEXT: and a2, a2, a3
; RV64I-NEXT: and a0, a0, a3
; RV64I-NEXT: srli a0, a0, 1
; RV64I-NEXT: or a0, a0, a2
; RV64I-NEXT: slli a0, a0, 1
; RV64I-NEXT: or a0, a2, a0
; RV64I-NEXT: sw a0, 0(a1)
; RV64I-NEXT: ret
;
@@ -3049,69 +3027,45 @@ define i64 @bitreverse_i64(i64 %a) nounwind {
; RV64I-NEXT: or a0, a0, a3
; RV64I-NEXT: or a0, a0, a2
; RV64I-NEXT: or a0, a0, a1
; RV64I-NEXT: lui a1, 3855
; RV64I-NEXT: addiw a1, a1, 241
; RV64I-NEXT: slli a1, a1, 12
; RV64I-NEXT: addi a1, a1, -241
; RV64I-NEXT: slli a1, a1, 12
; RV64I-NEXT: addi a1, a1, 241
; RV64I-NEXT: slli a1, a1, 12
; RV64I-NEXT: addi a1, a1, -241
; RV64I-NEXT: and a1, a0, a1
; RV64I-NEXT: slli a1, a1, 4
; RV64I-NEXT: lui a2, 1044721
; RV64I-NEXT: addiw a2, a2, -241
; RV64I-NEXT: srli a1, a0, 4
; RV64I-NEXT: lui a2, 3855
; RV64I-NEXT: addiw a2, a2, 241
; RV64I-NEXT: slli a2, a2, 12
; RV64I-NEXT: addi a2, a2, -241
; RV64I-NEXT: slli a2, a2, 12
; RV64I-NEXT: addi a2, a2, 241
; RV64I-NEXT: slli a2, a2, 12
; RV64I-NEXT: addi a2, a2, -241
; RV64I-NEXT: slli a2, a2, 12
; RV64I-NEXT: addi a2, a2, 240
; RV64I-NEXT: and a1, a1, a2
; RV64I-NEXT: and a0, a0, a2
; RV64I-NEXT: srli a0, a0, 4
; RV64I-NEXT: or a0, a0, a1
; RV64I-NEXT: lui a1, 13107
; RV64I-NEXT: addiw a1, a1, 819
; RV64I-NEXT: slli a1, a1, 12
; RV64I-NEXT: addi a1, a1, 819
; RV64I-NEXT: slli a1, a1, 12
; RV64I-NEXT: addi a1, a1, 819
; RV64I-NEXT: slli a1, a1, 12
; RV64I-NEXT: addi a1, a1, 819
; RV64I-NEXT: and a1, a0, a1
; RV64I-NEXT: slli a1, a1, 2
; RV64I-NEXT: lui a2, 1035469
; RV64I-NEXT: addiw a2, a2, -819
; RV64I-NEXT: slli a0, a0, 4
; RV64I-NEXT: or a0, a1, a0
; RV64I-NEXT: srli a1, a0, 2
; RV64I-NEXT: lui a2, 13107
; RV64I-NEXT: addiw a2, a2, 819
; RV64I-NEXT: slli a2, a2, 12
; RV64I-NEXT: addi a2, a2, -819
; RV64I-NEXT: addi a2, a2, 819
; RV64I-NEXT: slli a2, a2, 12
; RV64I-NEXT: addi a2, a2, -819
; RV64I-NEXT: addi a2, a2, 819
; RV64I-NEXT: slli a2, a2, 12
; RV64I-NEXT: addi a2, a2, -820
; RV64I-NEXT: addi a2, a2, 819
; RV64I-NEXT: and a1, a1, a2
; RV64I-NEXT: and a0, a0, a2
; RV64I-NEXT: srli a0, a0, 2
; RV64I-NEXT: or a0, a0, a1
; RV64I-NEXT: lui a1, 21845
; RV64I-NEXT: addiw a1, a1, 1365
; RV64I-NEXT: slli a1, a1, 12
; RV64I-NEXT: addi a1, a1, 1365
; RV64I-NEXT: slli a1, a1, 12
; RV64I-NEXT: addi a1, a1, 1365
; RV64I-NEXT: slli a1, a1, 12
; RV64I-NEXT: addi a1, a1, 1365
; RV64I-NEXT: and a1, a0, a1
; RV64I-NEXT: slli a1, a1, 1
; RV64I-NEXT: lui a2, 1026731
; RV64I-NEXT: addiw a2, a2, -1365
; RV64I-NEXT: slli a0, a0, 2
; RV64I-NEXT: or a0, a1, a0
; RV64I-NEXT: srli a1, a0, 1
; RV64I-NEXT: lui a2, 21845
; RV64I-NEXT: addiw a2, a2, 1365
; RV64I-NEXT: slli a2, a2, 12
; RV64I-NEXT: addi a2, a2, -1365
; RV64I-NEXT: addi a2, a2, 1365
; RV64I-NEXT: slli a2, a2, 12
; RV64I-NEXT: addi a2, a2, -1365
; RV64I-NEXT: addi a2, a2, 1365
; RV64I-NEXT: slli a2, a2, 12
; RV64I-NEXT: addi a2, a2, -1366
; RV64I-NEXT: addi a2, a2, 1365
; RV64I-NEXT: and a1, a1, a2
; RV64I-NEXT: and a0, a0, a2
; RV64I-NEXT: srli a0, a0, 1
; RV64I-NEXT: or a0, a0, a1
; RV64I-NEXT: slli a0, a0, 1
; RV64I-NEXT: or a0, a1, a0
; RV64I-NEXT: ret
;
; RV64B-LABEL: bitreverse_i64:
@@ -3210,35 +3164,27 @@ define i32 @bitreverse_bswap_i32(i32 %a) {
; RV64I-NEXT: slli a0, a0, 24
; RV64I-NEXT: or a0, a0, a3
; RV64I-NEXT: or a0, a0, a1
; RV64I-NEXT: lui a1, 61681
; RV64I-NEXT: addiw a1, a1, -241
; RV64I-NEXT: and a1, a0, a1
; RV64I-NEXT: slli a1, a1, 4
; RV64I-NEXT: lui a3, 241
; RV64I-NEXT: srli a1, a0, 4
; RV64I-NEXT: lui a3, 61681
; RV64I-NEXT: addiw a3, a3, -241
; RV64I-NEXT: slli a3, a3, 12
; RV64I-NEXT: addi a3, a3, 240
; RV64I-NEXT: and a1, a1, a3
; RV64I-NEXT: and a0, a0, a3
; RV64I-NEXT: srli a0, a0, 4
; RV64I-NEXT: or a0, a0, a1
; RV64I-NEXT: lui a1, 209715
; RV64I-NEXT: addiw a1, a1, 819
; RV64I-NEXT: and a1, a0, a1
; RV64I-NEXT: slli a1, a1, 2
; RV64I-NEXT: lui a3, 838861
; RV64I-NEXT: addiw a3, a3, -820
; RV64I-NEXT: slli a0, a0, 4
; RV64I-NEXT: or a0, a1, a0
; RV64I-NEXT: srli a1, a0, 2
; RV64I-NEXT: lui a3, 209715
; RV64I-NEXT: addiw a3, a3, 819
; RV64I-NEXT: and a1, a1, a3
; RV64I-NEXT: and a0, a0, a3
; RV64I-NEXT: srli a0, a0, 2
; RV64I-NEXT: or a0, a0, a1
; RV64I-NEXT: lui a1, 349525
; RV64I-NEXT: addiw a1, a1, 1365
; RV64I-NEXT: and a1, a0, a1
; RV64I-NEXT: slli a1, a1, 1
; RV64I-NEXT: lui a3, 699051
; RV64I-NEXT: addiw a3, a3, -1366
; RV64I-NEXT: slli a0, a0, 2
; RV64I-NEXT: or a0, a1, a0
; RV64I-NEXT: srli a1, a0, 1
; RV64I-NEXT: lui a3, 349525
; RV64I-NEXT: addiw a3, a3, 1365
; RV64I-NEXT: and a1, a1, a3
; RV64I-NEXT: and a0, a0, a3
; RV64I-NEXT: srli a0, a0, 1
; RV64I-NEXT: or a0, a0, a1
; RV64I-NEXT: slli a0, a0, 1
; RV64I-NEXT: or a0, a1, a0
; RV64I-NEXT: srli a1, a0, 8
; RV64I-NEXT: and a1, a1, a2
; RV64I-NEXT: srli a2, a0, 24
@@ -3267,14 +3213,14 @@ define i32 @bitreverse_bswap_i32(i32 %a) {
define i64 @bitreverse_bswap_i64(i64 %a) {
; RV64I-LABEL: bitreverse_bswap_i64:
; RV64I: # %bb.0:
; RV64I-NEXT: srli a1, a0, 24
; RV64I-NEXT: srli a2, a0, 24
; RV64I-NEXT: lui a6, 4080
; RV64I-NEXT: and a1, a1, a6
; RV64I-NEXT: srli a3, a0, 8
; RV64I-NEXT: and a3, a2, a6
; RV64I-NEXT: srli a4, a0, 8
; RV64I-NEXT: addi a5, zero, 255
; RV64I-NEXT: slli a7, a5, 24
; RV64I-NEXT: and a3, a3, a7
; RV64I-NEXT: or a3, a3, a1
; RV64I-NEXT: and a4, a4, a7
; RV64I-NEXT: or a3, a4, a3
; RV64I-NEXT: srli a4, a0, 40
; RV64I-NEXT: lui a1, 16
; RV64I-NEXT: addiw a1, a1, -256
@@ -3282,9 +3228,9 @@ define i64 @bitreverse_bswap_i64(i64 %a) {
; RV64I-NEXT: srli a2, a0, 56
; RV64I-NEXT: or a2, a4, a2
; RV64I-NEXT: or a2, a3, a2
; RV64I-NEXT: slli a4, a0, 8
; RV64I-NEXT: slli a3, a0, 8
; RV64I-NEXT: slli t0, a5, 32
; RV64I-NEXT: and a3, a4, t0
; RV64I-NEXT: and a3, a3, t0
; RV64I-NEXT: slli a4, a0, 24
; RV64I-NEXT: slli t1, a5, 40
; RV64I-NEXT: and a4, a4, t1
@@ -3296,69 +3242,45 @@ define i64 @bitreverse_bswap_i64(i64 %a) {
; RV64I-NEXT: or a0, a0, a4
; RV64I-NEXT: or a0, a0, a3
; RV64I-NEXT: or a0, a0, a2
; RV64I-NEXT: lui a2, 3855
; RV64I-NEXT: addiw a2, a2, 241
; RV64I-NEXT: slli a2, a2, 12
; RV64I-NEXT: addi a2, a2, -241
; RV64I-NEXT: slli a2, a2, 12
; RV64I-NEXT: addi a2, a2, 241
; RV64I-NEXT: slli a2, a2, 12
; RV64I-NEXT: addi a2, a2, -241
; RV64I-NEXT: and a2, a0, a2
; RV64I-NEXT: slli a2, a2, 4
; RV64I-NEXT: lui a3, 1044721
; RV64I-NEXT: addiw a3, a3, -241
; RV64I-NEXT: srli a2, a0, 4
; RV64I-NEXT: lui a3, 3855
; RV64I-NEXT: addiw a3, a3, 241
; RV64I-NEXT: slli a3, a3, 12
; RV64I-NEXT: addi a3, a3, -241
; RV64I-NEXT: slli a3, a3, 12
; RV64I-NEXT: addi a3, a3, 241
; RV64I-NEXT: slli a3, a3, 12
; RV64I-NEXT: addi a3, a3, -241
; RV64I-NEXT: slli a3, a3, 12
; RV64I-NEXT: addi a3, a3, 240
; RV64I-NEXT: and a2, a2, a3
; RV64I-NEXT: and a0, a0, a3
; RV64I-NEXT: srli a0, a0, 4
; RV64I-NEXT: or a0, a0, a2
; RV64I-NEXT: lui a2, 13107
; RV64I-NEXT: addiw a2, a2, 819
; RV64I-NEXT: slli a2, a2, 12
; RV64I-NEXT: addi a2, a2, 819
; RV64I-NEXT: slli a2, a2, 12
; RV64I-NEXT: addi a2, a2, 819
; RV64I-NEXT: slli a2, a2, 12
; RV64I-NEXT: addi a2, a2, 819
; RV64I-NEXT: and a2, a0, a2
; RV64I-NEXT: slli a2, a2, 2
; RV64I-NEXT: lui a3, 1035469
; RV64I-NEXT: addiw a3, a3, -819
; RV64I-NEXT: slli a0, a0, 4
; RV64I-NEXT: or a0, a2, a0
; RV64I-NEXT: srli a2, a0, 2
; RV64I-NEXT: lui a3, 13107
; RV64I-NEXT: addiw a3, a3, 819
; RV64I-NEXT: slli a3, a3, 12
; RV64I-NEXT: addi a3, a3, -819
; RV64I-NEXT: addi a3, a3, 819
; RV64I-NEXT: slli a3, a3, 12
; RV64I-NEXT: addi a3, a3, -819
; RV64I-NEXT: addi a3, a3, 819
; RV64I-NEXT: slli a3, a3, 12
; RV64I-NEXT: addi a3, a3, -820
; RV64I-NEXT: addi a3, a3, 819
; RV64I-NEXT: and a2, a2, a3
; RV64I-NEXT: and a0, a0, a3
; RV64I-NEXT: srli a0, a0, 2
; RV64I-NEXT: or a0, a0, a2
; RV64I-NEXT: lui a2, 21845
; RV64I-NEXT: addiw a2, a2, 1365
; RV64I-NEXT: slli a2, a2, 12
; RV64I-NEXT: addi a2, a2, 1365
; RV64I-NEXT: slli a2, a2, 12
; RV64I-NEXT: addi a2, a2, 1365
; RV64I-NEXT: slli a2, a2, 12
; RV64I-NEXT: addi a2, a2, 1365
; RV64I-NEXT: and a2, a0, a2
; RV64I-NEXT: slli a2, a2, 1
; RV64I-NEXT: lui a3, 1026731
; RV64I-NEXT: addiw a3, a3, -1365
; RV64I-NEXT: slli a0, a0, 2
; RV64I-NEXT: or a0, a2, a0
; RV64I-NEXT: srli a2, a0, 1
; RV64I-NEXT: lui a3, 21845
; RV64I-NEXT: addiw a3, a3, 1365
; RV64I-NEXT: slli a3, a3, 12
; RV64I-NEXT: addi a3, a3, -1365
; RV64I-NEXT: addi a3, a3, 1365
; RV64I-NEXT: slli a3, a3, 12
; RV64I-NEXT: addi a3, a3, -1365
; RV64I-NEXT: addi a3, a3, 1365
; RV64I-NEXT: slli a3, a3, 12
; RV64I-NEXT: addi a3, a3, -1366
; RV64I-NEXT: addi a3, a3, 1365
; RV64I-NEXT: and a2, a2, a3
; RV64I-NEXT: and a0, a0, a3
; RV64I-NEXT: srli a0, a0, 1
; RV64I-NEXT: or a0, a0, a2
; RV64I-NEXT: slli a0, a0, 1
; RV64I-NEXT: or a0, a2, a0
; RV64I-NEXT: srli a2, a0, 40
; RV64I-NEXT: and a1, a2, a1
; RV64I-NEXT: srli a2, a0, 56

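On RV64 the effect is larger: a 64-bit splat mask takes an
eight-instruction lui/addiw/slli/addi chain (see bitreverse_i64 above),
and the old expansion built two such chains per stage. A sketch
reproducing the chain for the 0x0F mask (hypothetical standalone code):

#include <cassert>
#include <cstdint>

int main() {
  // lui 3855; addiw 241; then three rounds of slli 12 + addi.
  int64_t m = (int64_t)3855 << 12;
  m += 241;
  m = (m << 12) - 241;
  m = (m << 12) + 241;
  m = (m << 12) - 241;
  assert(m == 0x0F0F0F0F0F0F0F0FLL);
  return 0;
}
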
File diff suppressed because it is too large

File diff suppressed because it is too large


@@ -55,16 +55,18 @@ define <4 x i32> @test_demandedbits_bitreverse(<4 x i32> %a0) nounwind {
; X86-NEXT: psrlw $4, %xmm0
; X86-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-NEXT: por %xmm1, %xmm0
; X86-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; X86-NEXT: pand %xmm0, %xmm1
; X86-NEXT: psllw $2, %xmm1
; X86-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-NEXT: psrlw $2, %xmm0
; X86-NEXT: movdqa %xmm0, %xmm1
; X86-NEXT: psrlw $2, %xmm1
; X86-NEXT: movdqa {{.*#+}} xmm2 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; X86-NEXT: pand %xmm2, %xmm1
; X86-NEXT: pand %xmm2, %xmm0
; X86-NEXT: psllw $2, %xmm0
; X86-NEXT: por %xmm1, %xmm0
; X86-NEXT: movdqa {{.*#+}} xmm1 = [170,170,170,170,170,170,170,170,170,170,170,170,170,170,170,170]
; X86-NEXT: pand %xmm0, %xmm1
; X86-NEXT: movdqa %xmm0, %xmm1
; X86-NEXT: psrlw $1, %xmm1
; X86-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-NEXT: movdqa {{.*#+}} xmm2 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
; X86-NEXT: pand %xmm2, %xmm1
; X86-NEXT: pand %xmm2, %xmm0
; X86-NEXT: paddb %xmm0, %xmm0
; X86-NEXT: por %xmm1, %xmm0
; X86-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0

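For the X86 vector expansion, each distinct splat mask is a 16-byte
constant-pool load, so sharing one mask per stage drops a load and lets
a single movdqa-copied register feed both pands. Roughly what one
rewritten stage does, in SSE2 intrinsics (a sketch, not the generated
code; the function name is hypothetical):

#include <emmintrin.h>

// One "swap i2" stage of the new form: one mask constant, two pands.
__m128i swap2(__m128i v) {
  const __m128i m = _mm_set1_epi8(0x33);                // one pool load
  __m128i hi = _mm_and_si128(_mm_srli_epi16(v, 2), m);  // psrlw + pand
  __m128i lo = _mm_and_si128(v, m);                     // pand (reuses m)
  return _mm_or_si128(hi, _mm_slli_epi16(lo, 2));       // psllw + por
}

Because psrlw shifts whole 16-bit lanes, the shared mask also clears
the bits dragged across byte boundaries.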

@@ -10,363 +10,362 @@ define i1000 @square(i1000 %A) nounwind {
; CHECK-NEXT: pushq %r13
; CHECK-NEXT: pushq %r12
; CHECK-NEXT: pushq %rbx
; CHECK-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %r10
; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %r14
; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %r15
; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rbx
; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rbp
; CHECK-NEXT: bswapq %rbp
; CHECK-NEXT: movq %rbp, %r11
; CHECK-NEXT: shrq $4, %r11
; CHECK-NEXT: movabsq $1085102592571150095, %rsi # imm = 0xF0F0F0F0F0F0F0F
; CHECK-NEXT: andq %rsi, %r11
; CHECK-NEXT: andq %rsi, %rbp
; CHECK-NEXT: shlq $4, %rbp
; CHECK-NEXT: orq %r11, %rbp
; CHECK-NEXT: movabsq $3689348814741910323, %rdi # imm = 0x3333333333333333
; CHECK-NEXT: movq %rbp, %r12
; CHECK-NEXT: andq %rdi, %r12
; CHECK-NEXT: shrq $2, %rbp
; CHECK-NEXT: andq %rdi, %rbp
; CHECK-NEXT: leaq (%rbp,%r12,4), %rbp
; CHECK-NEXT: movabsq $6148914691230924800, %r12 # imm = 0x5555555555000000
; CHECK-NEXT: movq %rbp, %r13
; CHECK-NEXT: andq %r12, %r13
; CHECK-NEXT: shrq %rbp
; CHECK-NEXT: andq %r12, %rbp
; CHECK-NEXT: leaq (%rbp,%r13,2), %rax
; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-NEXT: bswapq %rbx
; CHECK-NEXT: movabsq $1085102592571150095, %rdi # imm = 0xF0F0F0F0F0F0F0F
; CHECK-NEXT: movq %rbx, %rbp
; CHECK-NEXT: shrq $4, %rbp
; CHECK-NEXT: andq %rsi, %rbp
; CHECK-NEXT: andq %rsi, %rbx
; CHECK-NEXT: shlq $4, %rbx
; CHECK-NEXT: orq %rbp, %rbx
; CHECK-NEXT: movq %rbx, %rbp
; CHECK-NEXT: andq %rdi, %rbp
; CHECK-NEXT: shlq $4, %rbp
; CHECK-NEXT: movabsq $-1085102592571150096, %r11 # imm = 0xF0F0F0F0F0F0F0F0
; CHECK-NEXT: andq %r11, %rbx
; CHECK-NEXT: movq %r11, %rax
; CHECK-NEXT: shrq $4, %rbx
; CHECK-NEXT: orq %rbp, %rbx
; CHECK-NEXT: movabsq $3689348814741910323, %r11 # imm = 0x3333333333333333
; CHECK-NEXT: movq %rbx, %r14
; CHECK-NEXT: andq %r11, %r14
; CHECK-NEXT: movabsq $-3689348814741910324, %rbp # imm = 0xCCCCCCCCCCCCCCCC
; CHECK-NEXT: andq %rbp, %rbx
; CHECK-NEXT: movq %rbp, %r15
; CHECK-NEXT: shrq $2, %rbx
; CHECK-NEXT: leaq (%rbx,%r14,4), %r14
; CHECK-NEXT: movabsq $6148914691230924800, %rbx # imm = 0x5555555555000000
; CHECK-NEXT: andq %r14, %rbx
; CHECK-NEXT: movabsq $-6148914691247702016, %rbp # imm = 0xAAAAAAAAAA000000
; CHECK-NEXT: andq %r14, %rbp
; CHECK-NEXT: shrq %rbp
; CHECK-NEXT: leaq (%rbp,%rbx,2), %rbx
; CHECK-NEXT: movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-NEXT: bswapq %r10
; CHECK-NEXT: movq %r10, %rbx
; CHECK-NEXT: andq %rdi, %rbx
; CHECK-NEXT: shlq $4, %rbx
; CHECK-NEXT: andq %rax, %r10
; CHECK-NEXT: shrq $4, %r10
; CHECK-NEXT: orq %rbx, %r10
; CHECK-NEXT: movq %r10, %rbx
; CHECK-NEXT: andq %r11, %rbx
; CHECK-NEXT: andq %r15, %r10
; CHECK-NEXT: shrq $2, %r10
; CHECK-NEXT: leaq (%r10,%rbx,4), %rbp
; CHECK-NEXT: leaq (%rbx,%rbp,4), %rbp
; CHECK-NEXT: movabsq $6148914691236517205, %rbx # imm = 0x5555555555555555
; CHECK-NEXT: movq %rbp, %r10
; CHECK-NEXT: andq %rbx, %r10
; CHECK-NEXT: movabsq $-6148914691236517206, %r13 # imm = 0xAAAAAAAAAAAAAAAA
; CHECK-NEXT: andq %r13, %rbp
; CHECK-NEXT: movq %rbp, %r12
; CHECK-NEXT: andq %rbx, %r12
; CHECK-NEXT: shrq %rbp
; CHECK-NEXT: leaq (%rbp,%r10,2), %rbp
; CHECK-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rbp
; CHECK-NEXT: bswapq %rbp
; CHECK-NEXT: movq %rbp, %r10
; CHECK-NEXT: andq %rdi, %r10
; CHECK-NEXT: shlq $4, %r10
; CHECK-NEXT: andq %rax, %rbp
; CHECK-NEXT: andq %rbx, %rbp
; CHECK-NEXT: leaq (%rbp,%r12,2), %rax
; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-NEXT: bswapq %r15
; CHECK-NEXT: movq %r15, %rbp
; CHECK-NEXT: shrq $4, %rbp
; CHECK-NEXT: orq %r10, %rbp
; CHECK-NEXT: movq %rbp, %r10
; CHECK-NEXT: andq %r11, %r10
; CHECK-NEXT: andq %r15, %rbp
; CHECK-NEXT: shrq $2, %rbp
; CHECK-NEXT: leaq (%rbp,%r10,4), %rbp
; CHECK-NEXT: movq %rbp, %r10
; CHECK-NEXT: andq %rbx, %r10
; CHECK-NEXT: andq %r13, %rbp
; CHECK-NEXT: andq %rsi, %rbp
; CHECK-NEXT: andq %rsi, %r15
; CHECK-NEXT: shlq $4, %r15
; CHECK-NEXT: orq %rbp, %r15
; CHECK-NEXT: movq %r15, %rbp
; CHECK-NEXT: andq %rdi, %rbp
; CHECK-NEXT: shrq $2, %r15
; CHECK-NEXT: andq %rdi, %r15
; CHECK-NEXT: leaq (%r15,%rbp,4), %rbp
; CHECK-NEXT: movq %rbp, %r15
; CHECK-NEXT: andq %rbx, %r15
; CHECK-NEXT: shrq %rbp
; CHECK-NEXT: leaq (%rbp,%r10,2), %rbp
; CHECK-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rbp
; CHECK-NEXT: bswapq %rbp
; CHECK-NEXT: movq %rbp, %r10
; CHECK-NEXT: andq %rdi, %r10
; CHECK-NEXT: shlq $4, %r10
; CHECK-NEXT: andq %rax, %rbp
; CHECK-NEXT: andq %rbx, %rbp
; CHECK-NEXT: leaq (%rbp,%r15,2), %rax
; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-NEXT: bswapq %r14
; CHECK-NEXT: movq %r14, %rbp
; CHECK-NEXT: shrq $4, %rbp
; CHECK-NEXT: orq %r10, %rbp
; CHECK-NEXT: movq %rbp, %r10
; CHECK-NEXT: andq %r11, %r10
; CHECK-NEXT: andq %r15, %rbp
; CHECK-NEXT: shrq $2, %rbp
; CHECK-NEXT: leaq (%rbp,%r10,4), %rbp
; CHECK-NEXT: movq %rbp, %r10
; CHECK-NEXT: andq %rbx, %r10
; CHECK-NEXT: andq %r13, %rbp
; CHECK-NEXT: andq %rsi, %rbp
; CHECK-NEXT: andq %rsi, %r14
; CHECK-NEXT: shlq $4, %r14
; CHECK-NEXT: orq %rbp, %r14
; CHECK-NEXT: movq %r14, %rbp
; CHECK-NEXT: andq %rdi, %rbp
; CHECK-NEXT: shrq $2, %r14
; CHECK-NEXT: andq %rdi, %r14
; CHECK-NEXT: leaq (%r14,%rbp,4), %rbp
; CHECK-NEXT: movq %rbp, %r14
; CHECK-NEXT: andq %rbx, %r14
; CHECK-NEXT: shrq %rbp
; CHECK-NEXT: leaq (%rbp,%r10,2), %rbp
; CHECK-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rbp
; CHECK-NEXT: bswapq %rbp
; CHECK-NEXT: movq %rbp, %r10
; CHECK-NEXT: andq %rdi, %r10
; CHECK-NEXT: shlq $4, %r10
; CHECK-NEXT: andq %rax, %rbp
; CHECK-NEXT: andq %rbx, %rbp
; CHECK-NEXT: leaq (%rbp,%r14,2), %rax
; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-NEXT: bswapq %r10
; CHECK-NEXT: movq %r10, %rbp
; CHECK-NEXT: shrq $4, %rbp
; CHECK-NEXT: orq %r10, %rbp
; CHECK-NEXT: movq %rbp, %r10
; CHECK-NEXT: andq %r11, %r10
; CHECK-NEXT: andq %r15, %rbp
; CHECK-NEXT: shrq $2, %rbp
; CHECK-NEXT: leaq (%rbp,%r10,4), %rbp
; CHECK-NEXT: movq %rbp, %r10
; CHECK-NEXT: andq %rbx, %r10
; CHECK-NEXT: andq %r13, %rbp
; CHECK-NEXT: shrq %rbp
; CHECK-NEXT: leaq (%rbp,%r10,2), %rbp
; CHECK-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rbp
; CHECK-NEXT: bswapq %rbp
; CHECK-NEXT: movq %rbp, %r10
; CHECK-NEXT: andq %rdi, %r10
; CHECK-NEXT: andq %rsi, %rbp
; CHECK-NEXT: andq %rsi, %r10
; CHECK-NEXT: shlq $4, %r10
; CHECK-NEXT: andq %rax, %rbp
; CHECK-NEXT: movq %rax, %r14
; CHECK-NEXT: shrq $4, %rbp
; CHECK-NEXT: orq %r10, %rbp
; CHECK-NEXT: movq %rbp, %r10
; CHECK-NEXT: andq %r11, %r10
; CHECK-NEXT: andq %r15, %rbp
; CHECK-NEXT: shrq $2, %rbp
; CHECK-NEXT: leaq (%rbp,%r10,4), %rbp
; CHECK-NEXT: orq %rbp, %r10
; CHECK-NEXT: movq %r10, %rbp
; CHECK-NEXT: andq %rdi, %rbp
; CHECK-NEXT: shrq $2, %r10
; CHECK-NEXT: andq %rdi, %r10
; CHECK-NEXT: leaq (%r10,%rbp,4), %rbp
; CHECK-NEXT: movq %rbp, %r10
; CHECK-NEXT: andq %rbx, %r10
; CHECK-NEXT: andq %r13, %rbp
; CHECK-NEXT: shrq %rbp
; CHECK-NEXT: andq %rbx, %rbp
; CHECK-NEXT: leaq (%rbp,%r10,2), %rax
; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rbp
; CHECK-NEXT: bswapq %rbp
; CHECK-NEXT: movq %rbp, %r10
; CHECK-NEXT: andq %rdi, %r10
; CHECK-NEXT: shlq $4, %r10
; CHECK-NEXT: andq %r14, %rbp
; CHECK-NEXT: shrq $4, %rbp
; CHECK-NEXT: shrq $4, %r10
; CHECK-NEXT: andq %rsi, %r10
; CHECK-NEXT: andq %rsi, %rbp
; CHECK-NEXT: shlq $4, %rbp
; CHECK-NEXT: orq %r10, %rbp
; CHECK-NEXT: movq %rbp, %r10
; CHECK-NEXT: andq %r11, %r10
; CHECK-NEXT: andq %r15, %rbp
; CHECK-NEXT: andq %rdi, %r10
; CHECK-NEXT: shrq $2, %rbp
; CHECK-NEXT: andq %rdi, %rbp
; CHECK-NEXT: leaq (%rbp,%r10,4), %rbp
; CHECK-NEXT: movq %rbp, %r10
; CHECK-NEXT: andq %rbx, %r10
; CHECK-NEXT: andq %r13, %rbp
; CHECK-NEXT: shrq %rbp
; CHECK-NEXT: leaq (%rbp,%r10,2), %rbp
; CHECK-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rbp
; CHECK-NEXT: bswapq %rbp
; CHECK-NEXT: movq %rbp, %r10
; CHECK-NEXT: andq %rdi, %r10
; CHECK-NEXT: andq %rbx, %rbp
; CHECK-NEXT: leaq (%rbp,%r10,2), %rax
; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %r10
; CHECK-NEXT: bswapq %r10
; CHECK-NEXT: movq %r10, %r14
; CHECK-NEXT: shrq $4, %r14
; CHECK-NEXT: andq %rsi, %r14
; CHECK-NEXT: andq %rsi, %r10
; CHECK-NEXT: shlq $4, %r10
; CHECK-NEXT: andq %r14, %rbp
; CHECK-NEXT: shrq $4, %rbp
; CHECK-NEXT: orq %r10, %rbp
; CHECK-NEXT: movq %rbp, %r10
; CHECK-NEXT: andq %r11, %r10
; CHECK-NEXT: andq %r15, %rbp
; CHECK-NEXT: shrq $2, %rbp
; CHECK-NEXT: leaq (%rbp,%r10,4), %rbp
; CHECK-NEXT: movq %rbp, %r10
; CHECK-NEXT: andq %rbx, %r10
; CHECK-NEXT: andq %r13, %rbp
; CHECK-NEXT: shrq %rbp
; CHECK-NEXT: leaq (%rbp,%r10,2), %rbp
; CHECK-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rbp
; CHECK-NEXT: bswapq %rbp
; CHECK-NEXT: movq %rbp, %r10
; CHECK-NEXT: orq %r14, %r10
; CHECK-NEXT: movq %r10, %r14
; CHECK-NEXT: andq %rdi, %r14
; CHECK-NEXT: shrq $2, %r10
; CHECK-NEXT: andq %rdi, %r10
; CHECK-NEXT: shlq $4, %r10
; CHECK-NEXT: andq %r14, %rbp
; CHECK-NEXT: shrq $4, %rbp
; CHECK-NEXT: orq %r10, %rbp
; CHECK-NEXT: movq %rbp, %r10
; CHECK-NEXT: andq %r11, %r10
; CHECK-NEXT: andq %r15, %rbp
; CHECK-NEXT: shrq $2, %rbp
; CHECK-NEXT: leaq (%rbp,%r10,4), %rbp
; CHECK-NEXT: movq %rbp, %r10
; CHECK-NEXT: movq %rdi, %rbp
; CHECK-NEXT: leaq (%r10,%r14,4), %r10
; CHECK-NEXT: movq %r10, %r14
; CHECK-NEXT: andq %rbx, %r14
; CHECK-NEXT: shrq %r10
; CHECK-NEXT: andq %rbx, %r10
; CHECK-NEXT: andq %r13, %rbp
; CHECK-NEXT: shrq %rbp
; CHECK-NEXT: leaq (%rbp,%r10,2), %rbp
; CHECK-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rbp
; CHECK-NEXT: bswapq %rbp
; CHECK-NEXT: movq %rbp, %r10
; CHECK-NEXT: leaq (%r10,%r14,2), %rax
; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %r10
; CHECK-NEXT: bswapq %r10
; CHECK-NEXT: movq %r10, %r14
; CHECK-NEXT: shrq $4, %r14
; CHECK-NEXT: andq %rsi, %r14
; CHECK-NEXT: andq %rsi, %r10
; CHECK-NEXT: shlq $4, %r10
; CHECK-NEXT: orq %r14, %r10
; CHECK-NEXT: movq %r10, %r14
; CHECK-NEXT: andq %rdi, %r14
; CHECK-NEXT: shrq $2, %r10
; CHECK-NEXT: andq %rdi, %r10
; CHECK-NEXT: shlq $4, %r10
; CHECK-NEXT: andq %r14, %rbp
; CHECK-NEXT: shrq $4, %rbp
; CHECK-NEXT: orq %r10, %rbp
; CHECK-NEXT: movq %rbp, %r10
; CHECK-NEXT: andq %r11, %r10
; CHECK-NEXT: andq %r15, %rbp
; CHECK-NEXT: shrq $2, %rbp
; CHECK-NEXT: leaq (%rbp,%r10,4), %rbp
; CHECK-NEXT: movq %rbp, %r10
; CHECK-NEXT: leaq (%r10,%r14,4), %r10
; CHECK-NEXT: movq %r10, %r14
; CHECK-NEXT: andq %rbx, %r14
; CHECK-NEXT: shrq %r10
; CHECK-NEXT: andq %rbx, %r10
; CHECK-NEXT: andq %r13, %rbp
; CHECK-NEXT: shrq %rbp
; CHECK-NEXT: leaq (%rbp,%r10,2), %rbp
; CHECK-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rbp
; CHECK-NEXT: bswapq %rbp
; CHECK-NEXT: movq %rbp, %r10
; CHECK-NEXT: leaq (%r10,%r14,2), %rax
; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %r10
; CHECK-NEXT: bswapq %r10
; CHECK-NEXT: movq %r10, %r14
; CHECK-NEXT: shrq $4, %r14
; CHECK-NEXT: andq %rsi, %r14
; CHECK-NEXT: andq %rsi, %r10
; CHECK-NEXT: shlq $4, %r10
; CHECK-NEXT: orq %r14, %r10
; CHECK-NEXT: movq %r10, %r14
; CHECK-NEXT: andq %rdi, %r14
; CHECK-NEXT: shrq $2, %r10
; CHECK-NEXT: andq %rdi, %r10
; CHECK-NEXT: shlq $4, %r10
; CHECK-NEXT: andq %r14, %rbp
; CHECK-NEXT: shrq $4, %rbp
; CHECK-NEXT: orq %r10, %rbp
; CHECK-NEXT: movq %rbp, %r10
; CHECK-NEXT: andq %r11, %r10
; CHECK-NEXT: andq %r15, %rbp
; CHECK-NEXT: shrq $2, %rbp
; CHECK-NEXT: leaq (%rbp,%r10,4), %rbp
; CHECK-NEXT: movq %rbp, %r10
; CHECK-NEXT: leaq (%r10,%r14,4), %r10
; CHECK-NEXT: movq %r10, %r14
; CHECK-NEXT: andq %rbx, %r14
; CHECK-NEXT: shrq %r10
; CHECK-NEXT: andq %rbx, %r10
; CHECK-NEXT: andq %r13, %rbp
; CHECK-NEXT: shrq %rbp
; CHECK-NEXT: leaq (%rbp,%r10,2), %rbp
; CHECK-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-NEXT: leaq (%r10,%r14,2), %rax
; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %r10
; CHECK-NEXT: bswapq %r10
; CHECK-NEXT: movq %r10, %r14
; CHECK-NEXT: shrq $4, %r14
; CHECK-NEXT: andq %rsi, %r14
; CHECK-NEXT: andq %rsi, %r10
; CHECK-NEXT: shlq $4, %r10
; CHECK-NEXT: orq %r14, %r10
; CHECK-NEXT: movq %r10, %r14
; CHECK-NEXT: andq %rdi, %r14
; CHECK-NEXT: shrq $2, %r10
; CHECK-NEXT: andq %rdi, %r10
; CHECK-NEXT: leaq (%r10,%r14,4), %r10
; CHECK-NEXT: movq %r10, %r14
; CHECK-NEXT: andq %rbx, %r14
; CHECK-NEXT: shrq %r10
; CHECK-NEXT: andq %rbx, %r10
; CHECK-NEXT: leaq (%r10,%r14,2), %rax
; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %r10
; CHECK-NEXT: bswapq %r10
; CHECK-NEXT: movq %r10, %rax
; CHECK-NEXT: shrq $4, %rax
; CHECK-NEXT: andq %rsi, %rax
; CHECK-NEXT: andq %rsi, %r10
; CHECK-NEXT: shlq $4, %r10
; CHECK-NEXT: orq %rax, %r10
; CHECK-NEXT: movq %r10, %rax
; CHECK-NEXT: andq %rdi, %rax
; CHECK-NEXT: shrq $2, %r10
; CHECK-NEXT: andq %rdi, %r10
; CHECK-NEXT: leaq (%r10,%rax,4), %rax
; CHECK-NEXT: movq %rax, %r10
; CHECK-NEXT: andq %rbx, %r10
; CHECK-NEXT: shrq %rax
; CHECK-NEXT: andq %rbx, %rax
; CHECK-NEXT: leaq (%rax,%r10,2), %rax
; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-NEXT: bswapq %r9
; CHECK-NEXT: movq %r9, %rbp
; CHECK-NEXT: andq %rdi, %rbp
; CHECK-NEXT: shlq $4, %rbp
; CHECK-NEXT: andq %r14, %r9
; CHECK-NEXT: shrq $4, %r9
; CHECK-NEXT: orq %rbp, %r9
; CHECK-NEXT: movq %r9, %rbp
; CHECK-NEXT: andq %r11, %rbp
; CHECK-NEXT: andq %r15, %r9
; CHECK-NEXT: movq %r9, %rax
; CHECK-NEXT: shrq $4, %rax
; CHECK-NEXT: andq %rsi, %rax
; CHECK-NEXT: andq %rsi, %r9
; CHECK-NEXT: shlq $4, %r9
; CHECK-NEXT: orq %rax, %r9
; CHECK-NEXT: movq %r9, %rax
; CHECK-NEXT: andq %rdi, %rax
; CHECK-NEXT: shrq $2, %r9
; CHECK-NEXT: leaq (%r9,%rbp,4), %rbp
; CHECK-NEXT: movq %rbp, %r9
; CHECK-NEXT: andq %rdi, %r9
; CHECK-NEXT: leaq (%r9,%rax,4), %rax
; CHECK-NEXT: movq %rax, %r9
; CHECK-NEXT: andq %rbx, %r9
; CHECK-NEXT: andq %r13, %rbp
; CHECK-NEXT: shrq %rbp
; CHECK-NEXT: leaq (%rbp,%r9,2), %rbp
; CHECK-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-NEXT: shrq %rax
; CHECK-NEXT: andq %rbx, %rax
; CHECK-NEXT: leaq (%rax,%r9,2), %rax
; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-NEXT: bswapq %r8
; CHECK-NEXT: movq %r8, %rbp
; CHECK-NEXT: andq %rdi, %rbp
; CHECK-NEXT: shlq $4, %rbp
; CHECK-NEXT: andq %r14, %r8
; CHECK-NEXT: shrq $4, %r8
; CHECK-NEXT: orq %rbp, %r8
; CHECK-NEXT: movq %r8, %rbp
; CHECK-NEXT: andq %r11, %rbp
; CHECK-NEXT: andq %r15, %r8
; CHECK-NEXT: movq %r15, %r9
; CHECK-NEXT: movq %r8, %rax
; CHECK-NEXT: shrq $4, %rax
; CHECK-NEXT: andq %rsi, %rax
; CHECK-NEXT: andq %rsi, %r8
; CHECK-NEXT: shlq $4, %r8
; CHECK-NEXT: orq %rax, %r8
; CHECK-NEXT: movq %r8, %rax
; CHECK-NEXT: andq %rdi, %rax
; CHECK-NEXT: shrq $2, %r8
; CHECK-NEXT: leaq (%r8,%rbp,4), %rbp
; CHECK-NEXT: movq %rbp, %r8
; CHECK-NEXT: andq %rdi, %r8
; CHECK-NEXT: leaq (%r8,%rax,4), %rax
; CHECK-NEXT: movq %rax, %r8
; CHECK-NEXT: andq %rbx, %r8
; CHECK-NEXT: andq %r13, %rbp
; CHECK-NEXT: shrq %rbp
; CHECK-NEXT: leaq (%rbp,%r8,2), %rbp
; CHECK-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-NEXT: shrq %rax
; CHECK-NEXT: andq %rbx, %rax
; CHECK-NEXT: leaq (%rax,%r8,2), %rax
; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-NEXT: bswapq %rcx
; CHECK-NEXT: movq %rcx, %rbp
; CHECK-NEXT: andq %rdi, %rbp
; CHECK-NEXT: shlq $4, %rbp
; CHECK-NEXT: andq %r14, %rcx
; CHECK-NEXT: shrq $4, %rcx
; CHECK-NEXT: orq %rbp, %rcx
; CHECK-NEXT: movq %rcx, %rbp
; CHECK-NEXT: andq %r11, %rbp
; CHECK-NEXT: andq %r15, %rcx
; CHECK-NEXT: movq %rcx, %rax
; CHECK-NEXT: shrq $4, %rax
; CHECK-NEXT: andq %rsi, %rax
; CHECK-NEXT: andq %rsi, %rcx
; CHECK-NEXT: shlq $4, %rcx
; CHECK-NEXT: orq %rax, %rcx
; CHECK-NEXT: movq %rcx, %rax
; CHECK-NEXT: andq %rdi, %rax
; CHECK-NEXT: shrq $2, %rcx
; CHECK-NEXT: leaq (%rcx,%rbp,4), %rcx
; CHECK-NEXT: movq %rcx, %rbp
; CHECK-NEXT: andq %rbx, %rbp
; CHECK-NEXT: andq %r13, %rcx
; CHECK-NEXT: shrq %rcx
; CHECK-NEXT: leaq (%rcx,%rbp,2), %r15
; CHECK-NEXT: andq %rdi, %rcx
; CHECK-NEXT: leaq (%rcx,%rax,4), %rax
; CHECK-NEXT: movq %rax, %rcx
; CHECK-NEXT: andq %rbx, %rcx
; CHECK-NEXT: shrq %rax
; CHECK-NEXT: andq %rbx, %rax
; CHECK-NEXT: leaq (%rax,%rcx,2), %r12
; CHECK-NEXT: bswapq %rdx
; CHECK-NEXT: movq %rdx, %rbp
; CHECK-NEXT: andq %rdi, %rbp
; CHECK-NEXT: shlq $4, %rbp
; CHECK-NEXT: andq %r14, %rdx
; CHECK-NEXT: shrq $4, %rdx
; CHECK-NEXT: orq %rbp, %rdx
; CHECK-NEXT: movq %rdx, %rbp
; CHECK-NEXT: andq %r11, %rbp
; CHECK-NEXT: andq %r9, %rdx
; CHECK-NEXT: movq %rdx, %rax
; CHECK-NEXT: shrq $4, %rax
; CHECK-NEXT: andq %rsi, %rax
; CHECK-NEXT: andq %rsi, %rdx
; CHECK-NEXT: shlq $4, %rdx
; CHECK-NEXT: orq %rax, %rdx
; CHECK-NEXT: movq %rdx, %rax
; CHECK-NEXT: andq %rdi, %rax
; CHECK-NEXT: shrq $2, %rdx
; CHECK-NEXT: leaq (%rdx,%rbp,4), %rdx
; CHECK-NEXT: movq %rdx, %rbp
; CHECK-NEXT: andq %rbx, %rbp
; CHECK-NEXT: andq %r13, %rdx
; CHECK-NEXT: shrq %rdx
; CHECK-NEXT: leaq (%rdx,%rbp,2), %rdx
; CHECK-NEXT: bswapq %rsi
; CHECK-NEXT: andq %rsi, %rdi
; CHECK-NEXT: andq %r14, %rsi
; CHECK-NEXT: shlq $4, %rdi
; CHECK-NEXT: shrq $4, %rsi
; CHECK-NEXT: orq %rdi, %rsi
; CHECK-NEXT: andq %rsi, %r11
; CHECK-NEXT: andq %r9, %rsi
; CHECK-NEXT: shrq $2, %rsi
; CHECK-NEXT: leaq (%rsi,%r11,4), %rsi
; CHECK-NEXT: andq %rsi, %rbx
; CHECK-NEXT: andq %r13, %rsi
; CHECK-NEXT: shrq %rsi
; CHECK-NEXT: leaq (%rsi,%rbx,2), %r13
; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
; CHECK-NEXT: andq %rdi, %rdx
; CHECK-NEXT: leaq (%rdx,%rax,4), %rax
; CHECK-NEXT: movq %rax, %rdx
; CHECK-NEXT: andq %rbx, %rdx
; CHECK-NEXT: shrq %rax
; CHECK-NEXT: andq %rbx, %rax
; CHECK-NEXT: leaq (%rax,%rdx,2), %rdi
; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
; CHECK-NEXT: shrdq $24, %rax, %r11
; CHECK-NEXT: bswapq %rax
; CHECK-NEXT: movq %rax, %rcx
; CHECK-NEXT: shrq $4, %rcx
; CHECK-NEXT: andq %rsi, %rcx
; CHECK-NEXT: andq %rsi, %rax
; CHECK-NEXT: shlq $4, %rax
; CHECK-NEXT: orq %rcx, %rax
; CHECK-NEXT: movq %rax, %rcx
; CHECK-NEXT: andq %rbp, %rcx
; CHECK-NEXT: shrq $2, %rax
; CHECK-NEXT: andq %rbp, %rax
; CHECK-NEXT: leaq (%rax,%rcx,4), %rax
; CHECK-NEXT: movq %rax, %rsi
; CHECK-NEXT: andq %rbx, %rsi
; CHECK-NEXT: shrq %rax
; CHECK-NEXT: andq %rbx, %rax
; CHECK-NEXT: leaq (%rax,%rsi,2), %rsi
; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
; CHECK-NEXT: shrdq $24, %rax, %rdx
; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
; CHECK-NEXT: shrdq $24, %rcx, %rax
; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload
; CHECK-NEXT: shrdq $24, %rbp, %rcx
; CHECK-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
; CHECK-NEXT: shrdq $24, %r12, %rbp
; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
; CHECK-NEXT: shrdq $24, %r13, %rbp
; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
; CHECK-NEXT: shrdq $24, %r15, %r13
; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
; CHECK-NEXT: shrdq $24, %r14, %r12
; CHECK-NEXT: shrdq $24, %r14, %r15
; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
; CHECK-NEXT: shrdq $24, %rbx, %r14
; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
; CHECK-NEXT: shrdq $24, %r11, %rbx
; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
; CHECK-NEXT: shrdq $24, %r10, %rbx
; CHECK-NEXT: shrdq $24, %r10, %r11
; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
; CHECK-NEXT: shrdq $24, %r9, %r10
; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
; CHECK-NEXT: shrdq $24, %r8, %r9
; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
; CHECK-NEXT: shrdq $24, %rdi, %r8
; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
; CHECK-NEXT: shrdq $24, %rax, %r8
; CHECK-NEXT: shrdq $24, %r12, %rax
; CHECK-NEXT: movq %rax, %rcx
; CHECK-NEXT: shrdq $24, %rdi, %r12
; CHECK-NEXT: shrdq $24, %rsi, %rdi
; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
; CHECK-NEXT: shrdq $24, %rax, %rsi
; CHECK-NEXT: shrdq $24, %r15, %rax
; CHECK-NEXT: movq %rax, %rcx
; CHECK-NEXT: shrdq $24, %rdx, %r15
; CHECK-NEXT: shrdq $24, %r13, %rdx
; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
; CHECK-NEXT: movq %rdx, 112(%rax)
; CHECK-NEXT: movq %r15, 104(%rax)
; CHECK-NEXT: movq %rdi, 112(%rax)
; CHECK-NEXT: movq %r12, 104(%rax)
; CHECK-NEXT: movq %rcx, 96(%rax)
; CHECK-NEXT: movq %rsi, 88(%rax)
; CHECK-NEXT: movq %rdi, 80(%rax)
; CHECK-NEXT: movq %r8, 72(%rax)
; CHECK-NEXT: movq %r9, 64(%rax)
; CHECK-NEXT: movq %r10, 56(%rax)
; CHECK-NEXT: movq %rbx, 48(%rax)
; CHECK-NEXT: movq %r14, 40(%rax)
; CHECK-NEXT: movq %r12, 32(%rax)
; CHECK-NEXT: movq %r8, 88(%rax)
; CHECK-NEXT: movq %r9, 80(%rax)
; CHECK-NEXT: movq %r10, 72(%rax)
; CHECK-NEXT: movq %r11, 64(%rax)
; CHECK-NEXT: movq %rbx, 56(%rax)
; CHECK-NEXT: movq %r14, 48(%rax)
; CHECK-NEXT: movq %r15, 40(%rax)
; CHECK-NEXT: movq %r13, 32(%rax)
; CHECK-NEXT: movq %rbp, 24(%rax)
; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
; CHECK-NEXT: movq %rcx, 16(%rax)
; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
; CHECK-NEXT: movq %rcx, 8(%rax)
; CHECK-NEXT: movq %r11, (%rax)
; CHECK-NEXT: movq %r13, %rcx
; CHECK-NEXT: shrq $56, %r13
; CHECK-NEXT: movb %r13b, 124(%rax)
; CHECK-NEXT: movq %rdx, (%rax)
; CHECK-NEXT: movq %rsi, %rcx
; CHECK-NEXT: shrq $56, %rsi
; CHECK-NEXT: movb %sil, 124(%rax)
; CHECK-NEXT: shrq $24, %rcx
; CHECK-NEXT: movl %ecx, 120(%rax)
; CHECK-NEXT: popq %rbx

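One detail in the i1000 output above: after masking, the shifted and
unshifted halves have disjoint bits, so the OR that merges each stage
can be emitted as an add, which X86 folds into leaq with a *2 or *4
scale. The identity, as a standalone sketch (hypothetical name):

#include <cassert>
#include <cstdint>

uint64_t swap1(uint64_t v) {
  const uint64_t M = 0x5555555555555555ull; // one movabsq constant
  uint64_t hi = (v >> 1) & M;               // shrq + andq
  uint64_t lo = v & M;                      // andq
  assert((hi & (lo << 1)) == 0);            // disjoint, so OR == ADD
  return hi + (lo << 1);                    // leaq (hi, lo, 2)
}
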
File diff suppressed because it is too large