[DAGCombine] Basic combines for AVG nodes.

This adds very basic combines for AVG nodes, mostly for constant folding
and handling degenerate (zero) cases. The code performs mostly the same
transforms as visitMULHS, adjusted for AVG nodes.

Constant folding extends the operands to a higher bitwidth, adds them, and
drops the lowest bit of the result. For undef operands, `avg undef, x` is
folded to `x`. There is also a transform turning `avgfloor x, 0` into
`shr x, 1`.

Differential Revision: https://reviews.llvm.org/D119559
Author: David Green
Date:   2022-02-14 11:18:35 +00:00
Parent: a87d3ba61c
Commit: 03380c70ed

4 changed files with 86 additions and 37 deletions
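
For intuition, the scalar semantics the four AVG nodes implement can be
sketched in plain C++ (illustrative reference code, not part of the patch;
the function names are made up):

#include <cstdint>

// AVGFLOORU / AVGFLOORS: (a + b) >> 1, computed one bit wider so the
// addition cannot wrap.
uint8_t avgflooru8(uint8_t a, uint8_t b) {
  return uint8_t((uint16_t(a) + uint16_t(b)) >> 1);
}
int8_t avgfloors8(int8_t a, int8_t b) {
  // Arithmetic right shift on a negative value (guaranteed since C++20).
  return int8_t((int16_t(a) + int16_t(b)) >> 1);
}

// AVGCEILU / AVGCEILS: (a + b + 1) >> 1, rounding up instead of down.
uint8_t avgceilu8(uint8_t a, uint8_t b) {
  return uint8_t((uint16_t(a) + uint16_t(b) + 1) >> 1);
}
int8_t avgceils8(int8_t a, int8_t b) {
  return int8_t((int16_t(a) + int16_t(b) + 1) >> 1);
}

This makes the folds below easy to check: avgfloor(x, 0) is (x + 0) >> 1,
a plain shift (logical for the unsigned node, arithmetic for the signed
one), while avgceil(x, 0) is (x + 1) >> 1 and gets no such fold.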

llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp

@@ -426,6 +426,7 @@ namespace {
     SDValue visitREM(SDNode *N);
     SDValue visitMULHU(SDNode *N);
     SDValue visitMULHS(SDNode *N);
+    SDValue visitAVG(SDNode *N);
     SDValue visitSMUL_LOHI(SDNode *N);
     SDValue visitUMUL_LOHI(SDNode *N);
     SDValue visitMULO(SDNode *N);
@@ -1635,6 +1636,10 @@ SDValue DAGCombiner::visit(SDNode *N) {
   case ISD::UREM:               return visitREM(N);
   case ISD::MULHU:              return visitMULHU(N);
   case ISD::MULHS:              return visitMULHS(N);
+  case ISD::AVGFLOORS:
+  case ISD::AVGFLOORU:
+  case ISD::AVGCEILS:
+  case ISD::AVGCEILU:           return visitAVG(N);
   case ISD::SMUL_LOHI:          return visitSMUL_LOHI(N);
   case ISD::UMUL_LOHI:          return visitUMUL_LOHI(N);
   case ISD::SMULO:
@@ -4654,6 +4659,46 @@ SDValue DAGCombiner::visitMULHU(SDNode *N) {
   return SDValue();
 }
 
+SDValue DAGCombiner::visitAVG(SDNode *N) {
+  unsigned Opcode = N->getOpcode();
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+  EVT VT = N->getValueType(0);
+  SDLoc DL(N);
+
+  // fold (avg c1, c2)
+  if (SDValue C = DAG.FoldConstantArithmetic(Opcode, DL, VT, {N0, N1}))
+    return C;
+
+  // canonicalize constant to RHS.
+  if (DAG.isConstantIntBuildVectorOrConstantInt(N0) &&
+      !DAG.isConstantIntBuildVectorOrConstantInt(N1))
+    return DAG.getNode(Opcode, DL, N->getVTList(), N1, N0);
+
+  if (VT.isVector()) {
+    if (SDValue FoldedVOp = SimplifyVBinOp(N, DL))
+      return FoldedVOp;
+
+    // fold (avgfloor x, 0) -> x >> 1
+    if (ISD::isConstantSplatVectorAllZeros(N1.getNode())) {
+      if (Opcode == ISD::AVGFLOORS)
+        return DAG.getNode(ISD::SRA, DL, VT, N0, DAG.getConstant(1, DL, VT));
+      if (Opcode == ISD::AVGFLOORU)
+        return DAG.getNode(ISD::SRL, DL, VT, N0, DAG.getConstant(1, DL, VT));
+    }
+  }
+
+  // fold (avg x, undef) -> x
+  if (N0.isUndef())
+    return N1;
+  if (N1.isUndef())
+    return N0;
+
+  // TODO If we use avg for scalars anywhere, we can add (avgfl x, 0) -> x >> 1
+
+  return SDValue();
+}
+
 /// Perform optimizations common to nodes that compute two values. LoOp and HiOp
 /// give the opcodes for the two computations that are being performed. Return
 /// true if a simplification was made.
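
A throwaway exhaustive check of the shift fold in the vector branch above,
and of why SRA rather than SRL is the right opcode for the signed case (a
sketch assuming the usual arithmetic right shift on negative values, which
C++20 guarantees):

#include <cassert>
#include <cstdint>

int main() {
  for (int v = -128; v <= 127; ++v) {
    int8_t x = int8_t(v);
    // avgfloors(x, 0) computed the wide way: (sext(x) + 0) >> 1.
    int8_t ref = int8_t(int16_t(x) >> 1);
    int8_t sra = int8_t(x >> 1);             // what the AVGFLOORS fold emits
    assert(ref == sra);
    uint8_t srl = uint8_t(uint8_t(x) >> 1);  // what the AVGFLOORU fold emits
    if (x < 0)
      assert(uint8_t(sra) != srl);  // a logical shift is wrong when signed
  }
  return 0;
}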

llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp

@@ -5274,6 +5274,30 @@ static llvm::Optional<APInt> FoldValue(unsigned Opcode, const APInt &C1,
     APInt C2Ext = C2.zext(FullWidth);
     return (C1Ext * C2Ext).extractBits(C1.getBitWidth(), C1.getBitWidth());
   }
+  case ISD::AVGFLOORS: {
+    unsigned FullWidth = C1.getBitWidth() + 1;
+    APInt C1Ext = C1.sext(FullWidth);
+    APInt C2Ext = C2.sext(FullWidth);
+    return (C1Ext + C2Ext).extractBits(C1.getBitWidth(), 1);
+  }
+  case ISD::AVGFLOORU: {
+    unsigned FullWidth = C1.getBitWidth() + 1;
+    APInt C1Ext = C1.zext(FullWidth);
+    APInt C2Ext = C2.zext(FullWidth);
+    return (C1Ext + C2Ext).extractBits(C1.getBitWidth(), 1);
+  }
+  case ISD::AVGCEILS: {
+    unsigned FullWidth = C1.getBitWidth() + 1;
+    APInt C1Ext = C1.sext(FullWidth);
+    APInt C2Ext = C2.sext(FullWidth);
+    return (C1Ext + C2Ext + 1).extractBits(C1.getBitWidth(), 1);
+  }
+  case ISD::AVGCEILU: {
+    unsigned FullWidth = C1.getBitWidth() + 1;
+    APInt C1Ext = C1.zext(FullWidth);
+    APInt C2Ext = C2.zext(FullWidth);
+    return (C1Ext + C2Ext + 1).extractBits(C1.getBitWidth(), 1);
+  }
   }
   return llvm::None;
 }
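
As a worked restatement of the constant-folding trick above (a sketch
against llvm::APInt; avgCeilU is a hypothetical helper, not an LLVM API):

#include "llvm/ADT/APInt.h"
using llvm::APInt;

// One extra bit means the addition can never lose a carry, and
// extractBits(BitWidth, 1) takes BitWidth bits starting at bit 1, i.e. a
// shift right by one combined with truncation back to the original width.
APInt avgCeilU(const APInt &C1, const APInt &C2) {
  unsigned FullWidth = C1.getBitWidth() + 1;
  APInt C1Ext = C1.zext(FullWidth);
  APInt C2Ext = C2.zext(FullWidth);
  return (C1Ext + C2Ext + 1).extractBits(C1.getBitWidth(), 1);
}

For example, with i16 splats of 65534 and 65535 this gives
(65534 + 65535 + 1) >> 1 = 65535, which is why rhaddu_i_const_bothhigh
below now materializes an all-ones vector directly.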

(AArch64 codegen test; file path not shown in this view)

@@ -135,8 +135,7 @@ define <8 x i16> @haddu_i_const_lhs(<8 x i16> %src1) {
 define <8 x i16> @haddu_i_const_zero(<8 x i16> %src1) {
 ; CHECK-LABEL: haddu_i_const_zero:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: movi v1.2d, #0000000000000000
-; CHECK-NEXT: uhadd v0.8h, v0.8h, v1.8h
+; CHECK-NEXT: ushr v0.8h, v0.8h, #1
 ; CHECK-NEXT: ret
   %result = call <8 x i16> @llvm.aarch64.neon.uhadd.v8i16(<8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, <8 x i16> %src1)
   ret <8 x i16> %result
@@ -145,9 +144,7 @@ define <8 x i16> @haddu_i_const_zero(<8 x i16> %src1) {
 define <8 x i16> @haddu_i_const_both() {
 ; CHECK-LABEL: haddu_i_const_both:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: movi v0.8h, #1
-; CHECK-NEXT: movi v1.8h, #3
-; CHECK-NEXT: uhadd v0.8h, v1.8h, v0.8h
+; CHECK-NEXT: movi v0.8h, #2
 ; CHECK-NEXT: ret
   %result = call <8 x i16> @llvm.aarch64.neon.uhadd.v8i16(<8 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>, <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>)
   ret <8 x i16> %result
@@ -156,18 +153,16 @@ define <8 x i16> @haddu_i_const_both() {
 define <8 x i16> @haddu_i_const_bothhigh() {
 ; CHECK-LABEL: haddu_i_const_bothhigh:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: movi v0.2d, #0xffffffffffffffff
-; CHECK-NEXT: mvni v1.8h, #1
-; CHECK-NEXT: uhadd v0.8h, v1.8h, v0.8h
+; CHECK-NEXT: mvni v0.8h, #1
 ; CHECK-NEXT: ret
   %result = call <8 x i16> @llvm.aarch64.neon.uhadd.v8i16(<8 x i16> <i16 65534, i16 65534, i16 65534, i16 65534, i16 65534, i16 65534, i16 65534, i16 65534>, <8 x i16> <i16 65535, i16 65535, i16 65535, i16 65535, i16 65535, i16 65535, i16 65535, i16 65535>)
   ret <8 x i16> %result
 }
 
-define <8 x i16> @haddu_i_undef(<8 x i16> %src1) {
+define <8 x i16> @haddu_i_undef(<8 x i16> %t, <8 x i16> %src1) {
 ; CHECK-LABEL: haddu_i_undef:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: uhadd v0.8h, v0.8h, v0.8h
+; CHECK-NEXT: mov v0.16b, v1.16b
 ; CHECK-NEXT: ret
   %result = call <8 x i16> @llvm.aarch64.neon.uhadd.v8i16(<8 x i16> undef, <8 x i16> %src1)
   ret <8 x i16> %result
@@ -312,8 +307,7 @@ define <8 x i16> @hadds_i_const_lhs(<8 x i16> %src1) {
 define <8 x i16> @hadds_i_const_zero(<8 x i16> %src1) {
 ; CHECK-LABEL: hadds_i_const_zero:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: movi v1.2d, #0000000000000000
-; CHECK-NEXT: shadd v0.8h, v0.8h, v1.8h
+; CHECK-NEXT: sshr v0.8h, v0.8h, #1
 ; CHECK-NEXT: ret
   %result = call <8 x i16> @llvm.aarch64.neon.shadd.v8i16(<8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, <8 x i16> %src1)
   ret <8 x i16> %result
@@ -322,9 +316,7 @@ define <8 x i16> @hadds_i_const_zero(<8 x i16> %src1) {
 define <8 x i16> @hadds_i_const_both() {
 ; CHECK-LABEL: hadds_i_const_both:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: movi v0.8h, #1
-; CHECK-NEXT: movi v1.8h, #3
-; CHECK-NEXT: shadd v0.8h, v1.8h, v0.8h
+; CHECK-NEXT: movi v0.8h, #2
 ; CHECK-NEXT: ret
   %result = call <8 x i16> @llvm.aarch64.neon.shadd.v8i16(<8 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>, <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>)
   ret <8 x i16> %result
@@ -334,18 +326,16 @@ define <8 x i16> @hadds_i_const_bothhigh() {
 ; CHECK-LABEL: hadds_i_const_bothhigh:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: mov w8, #32766
-; CHECK-NEXT: mvni v0.8h, #128, lsl #8
-; CHECK-NEXT: dup v1.8h, w8
-; CHECK-NEXT: shadd v0.8h, v1.8h, v0.8h
+; CHECK-NEXT: dup v0.8h, w8
 ; CHECK-NEXT: ret
   %result = call <8 x i16> @llvm.aarch64.neon.shadd.v8i16(<8 x i16> <i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766>, <8 x i16> <i16 32767, i16 32767, i16 32767, i16 32767, i16 32767, i16 32767, i16 32767, i16 32767>)
   ret <8 x i16> %result
 }
 
-define <8 x i16> @hadds_i_undef(<8 x i16> %src1) {
+define <8 x i16> @hadds_i_undef(<8 x i16> %t, <8 x i16> %src1) {
 ; CHECK-LABEL: hadds_i_undef:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: shadd v0.8h, v0.8h, v0.8h
+; CHECK-NEXT: mov v0.16b, v1.16b
 ; CHECK-NEXT: ret
   %result = call <8 x i16> @llvm.aarch64.neon.shadd.v8i16(<8 x i16> undef, <8 x i16> %src1)
   ret <8 x i16> %result
@@ -508,9 +498,7 @@ define <8 x i16> @rhaddu_i_const_zero(<8 x i16> %src1) {
 define <8 x i16> @rhaddu_i_const_both() {
 ; CHECK-LABEL: rhaddu_i_const_both:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: movi v0.8h, #1
-; CHECK-NEXT: movi v1.8h, #3
-; CHECK-NEXT: urhadd v0.8h, v1.8h, v0.8h
+; CHECK-NEXT: movi v0.8h, #2
 ; CHECK-NEXT: ret
   %result = call <8 x i16> @llvm.aarch64.neon.urhadd.v8i16(<8 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>, <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>)
   ret <8 x i16> %result
@@ -520,17 +508,15 @@ define <8 x i16> @rhaddu_i_const_bothhigh() {
 ; CHECK-LABEL: rhaddu_i_const_bothhigh:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: movi v0.2d, #0xffffffffffffffff
-; CHECK-NEXT: mvni v1.8h, #1
-; CHECK-NEXT: urhadd v0.8h, v1.8h, v0.8h
 ; CHECK-NEXT: ret
   %result = call <8 x i16> @llvm.aarch64.neon.urhadd.v8i16(<8 x i16> <i16 65534, i16 65534, i16 65534, i16 65534, i16 65534, i16 65534, i16 65534, i16 65534>, <8 x i16> <i16 65535, i16 65535, i16 65535, i16 65535, i16 65535, i16 65535, i16 65535, i16 65535>)
   ret <8 x i16> %result
 }
 
-define <8 x i16> @rhaddu_i_undef(<8 x i16> %src1) {
+define <8 x i16> @rhaddu_i_undef(<8 x i16> %t, <8 x i16> %src1) {
 ; CHECK-LABEL: rhaddu_i_undef:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: urhadd v0.8h, v0.8h, v0.8h
+; CHECK-NEXT: mov v0.16b, v1.16b
 ; CHECK-NEXT: ret
   %result = call <8 x i16> @llvm.aarch64.neon.urhadd.v8i16(<8 x i16> undef, <8 x i16> %src1)
   ret <8 x i16> %result
@@ -693,9 +679,7 @@ define <8 x i16> @rhadds_i_const_zero(<8 x i16> %src1) {
 define <8 x i16> @rhadds_i_const_both() {
 ; CHECK-LABEL: rhadds_i_const_both:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: movi v0.8h, #1
-; CHECK-NEXT: movi v1.8h, #3
-; CHECK-NEXT: srhadd v0.8h, v1.8h, v0.8h
+; CHECK-NEXT: movi v0.8h, #2
 ; CHECK-NEXT: ret
   %result = call <8 x i16> @llvm.aarch64.neon.srhadd.v8i16(<8 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>, <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>)
   ret <8 x i16> %result
@@ -704,19 +688,16 @@ define <8 x i16> @rhadds_i_const_both() {
 define <8 x i16> @rhadds_i_const_bothhigh() {
 ; CHECK-LABEL: rhadds_i_const_bothhigh:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #32766
 ; CHECK-NEXT: mvni v0.8h, #128, lsl #8
-; CHECK-NEXT: dup v1.8h, w8
-; CHECK-NEXT: srhadd v0.8h, v1.8h, v0.8h
 ; CHECK-NEXT: ret
   %result = call <8 x i16> @llvm.aarch64.neon.srhadd.v8i16(<8 x i16> <i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766>, <8 x i16> <i16 32767, i16 32767, i16 32767, i16 32767, i16 32767, i16 32767, i16 32767, i16 32767>)
   ret <8 x i16> %result
 }
 
-define <8 x i16> @rhadds_i_undef(<8 x i16> %src1) {
+define <8 x i16> @rhadds_i_undef(<8 x i16> %t, <8 x i16> %src1) {
 ; CHECK-LABEL: rhadds_i_undef:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: srhadd v0.8h, v0.8h, v0.8h
+; CHECK-NEXT: mov v0.16b, v1.16b
 ; CHECK-NEXT: ret
   %result = call <8 x i16> @llvm.aarch64.neon.srhadd.v8i16(<8 x i16> undef, <8 x i16> %src1)
   ret <8 x i16> %result

(X86 codegen test; file path not shown in this view)

@@ -7,10 +7,9 @@ define void @f() nounwind {
 ; CHECK-NEXT: calll L0$pb
 ; CHECK-NEXT: L0$pb:
 ; CHECK-NEXT: popl %eax
-; CHECK-NEXT: pxor %xmm0, %xmm0
 ; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [1,1,1,1,1,1,1,1]
 ; CHECK-NEXT: psllw {{\.?LCPI[0-9]+_[0-9]+}}-L0$pb(%eax), %xmm1
-; CHECK-NEXT: pavgw {{\.?LCPI[0-9]+_[0-9]+}}-L0$pb(%eax), %xmm0
+; CHECK-NEXT: movdqa {{.*#+}} xmm0 = [21183,21183,21183,21183,21183,21183,21183,21183]
 ; CHECK-NEXT: paddsw %xmm0, %xmm0
 ; CHECK-NEXT: paddw %xmm1, %xmm0
 ; CHECK-NEXT: .p2align 4, 0x90