[SVE][CodeGen] Improve codegen for some FP insert_subvector cases

When inserting an unpacked FP subvector into a packed vector we
can simply cast the unpacked value into a packed value, since
both types are legal for SVE. We can then use this as the input
for the UZP instruction. This avoids us expanding the operation
by going through the stack.

Differential Revision: https://reviews.llvm.org/D113270
This commit is contained in:
David Sherwood 2021-11-05 11:42:51 +00:00
parent 438437cbb6
commit 8d38c24fb6
2 changed files with 15 additions and 12 deletions

View File

@ -10912,7 +10912,7 @@ SDValue AArch64TargetLowering::LowerINSERT_SUBVECTOR(SDValue Op,
SDLoc DL(Op);
EVT VT = Op.getValueType();
if (!isTypeLegal(VT) || !VT.isInteger())
if (!isTypeLegal(VT))
return SDValue();
SDValue Vec0 = Op.getOperand(0);
@ -10922,9 +10922,19 @@ SDValue AArch64TargetLowering::LowerINSERT_SUBVECTOR(SDValue Op,
if (VT.getVectorElementCount() != (InVT.getVectorElementCount() * 2))
return SDValue();
// Extend elements of smaller vector...
EVT WideVT = InVT.widenIntegerVectorElementType(*(DAG.getContext()));
SDValue ExtVec = DAG.getNode(ISD::ANY_EXTEND, DL, WideVT, Vec1);
EVT WideVT;
SDValue ExtVec;
if (VT.isFloatingPoint()) {
// The InVT type should be legal. We can safely cast the unpacked
// subvector from InVT -> VT.
WideVT = VT;
ExtVec = getSVESafeBitCast(VT, Vec1, DAG);
} else {
// Extend elements of smaller vector...
WideVT = InVT.widenIntegerVectorElementType(*(DAG.getContext()));
ExtVec = DAG.getNode(ISD::ANY_EXTEND, DL, WideVT, Vec1);
}
if (Idx == 0) {
SDValue HiVec0 = DAG.getNode(AArch64ISD::UUNPKHI, DL, WideVT, Vec0);

View File

@ -424,14 +424,7 @@ define <vscale x 3 x i32> @insert_nxv3i32_nxv2i32_2(<vscale x 3 x i32> %sv0, <vs
define <vscale x 3 x float> @insert_nxv3f32_nxv2f32(<vscale x 2 x float> %sv0) nounwind {
; CHECK-LABEL: insert_nxv3f32_nxv2f32:
; CHECK: // %bb.0:
; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT: addvl sp, sp, #-1
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: st1w { z0.d }, p0, [sp]
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: ld1w { z0.s }, p0/z, [sp]
; CHECK-NEXT: addvl sp, sp, #1
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
; CHECK-NEXT: ret
%v0 = call <vscale x 3 x float> @llvm.experimental.vector.insert.nxv3f32.nxv2f32(<vscale x 3 x float> undef, <vscale x 2 x float> %sv0, i64 0)
ret <vscale x 3 x float> %v0