[SVE] Fix incorrect code generation for bitcasts of unpacked vector types.

Bitcasting between unpacked scalable vector types of different
element counts is not a NOP because the live elements are laid out
differently.
               01234567
e.g. nxv2i32 = XX??XX??
     nxv4f16 = X?X?X?X?

Differential Revision: https://reviews.llvm.org/D126957
This commit is contained in:
Paul Walker 2022-05-31 10:59:05 +01:00
parent 471bfb7016
commit a1121c31d8
3 changed files with 123 additions and 33 deletions

View File

@@ -1731,16 +1731,14 @@ SDValue SelectionDAGLegalize::EmitStackConvert(SDValue SrcOp, EVT SlotVT,
SDValue SelectionDAGLegalize::EmitStackConvert(SDValue SrcOp, EVT SlotVT,
EVT DestVT, const SDLoc &dl,
SDValue Chain) {
unsigned SrcSize = SrcOp.getValueSizeInBits();
unsigned SlotSize = SlotVT.getSizeInBits();
unsigned DestSize = DestVT.getSizeInBits();
EVT SrcVT = SrcOp.getValueType();
Type *DestType = DestVT.getTypeForEVT(*DAG.getContext());
Align DestAlign = DAG.getDataLayout().getPrefTypeAlign(DestType);
// Don't convert with stack if the load/store is expensive.
if ((SrcSize > SlotSize &&
if ((SrcVT.bitsGT(SlotVT) &&
!TLI.isTruncStoreLegalOrCustom(SrcOp.getValueType(), SlotVT)) ||
(SlotSize < DestSize &&
(SlotVT.bitsLT(DestVT) &&
!TLI.isLoadExtLegalOrCustom(ISD::EXTLOAD, DestVT, SlotVT)))
return SDValue();
@@ -1758,20 +1756,19 @@ SDValue SelectionDAGLegalize::EmitStackConvert(SDValue SrcOp, EVT SlotVT,
// later than DestVT.
SDValue Store;
if (SrcSize > SlotSize)
if (SrcVT.bitsGT(SlotVT))
Store = DAG.getTruncStore(Chain, dl, SrcOp, FIPtr, PtrInfo,
SlotVT, SrcAlign);
else {
assert(SrcSize == SlotSize && "Invalid store");
Store =
DAG.getStore(Chain, dl, SrcOp, FIPtr, PtrInfo, SrcAlign);
assert(SrcVT.bitsEq(SlotVT) && "Invalid store");
Store = DAG.getStore(Chain, dl, SrcOp, FIPtr, PtrInfo, SrcAlign);
}
// Result is a load from the stack slot.
if (SlotSize == DestSize)
if (SlotVT.bitsEq(DestVT))
return DAG.getLoad(DestVT, dl, Store, FIPtr, PtrInfo, DestAlign);
assert(SlotSize < DestSize && "Unknown extension!");
assert(SlotVT.bitsLT(DestVT) && "Unknown extension!");
return DAG.getExtLoad(ISD::EXTLOAD, dl, DestVT, Store, FIPtr, PtrInfo, SlotVT,
DestAlign);
}

View File

@@ -3817,6 +3817,14 @@ SDValue AArch64TargetLowering::LowerBITCAST(SDValue Op,
return LowerFixedLengthBitcastToSVE(Op, DAG);
if (OpVT.isScalableVector()) {
// Bitcasting between unpacked vector types of different element counts is
// not a NOP because the live elements are laid out differently.
// 01234567
// e.g. nxv2i32 = XX??XX??
// nxv4f16 = X?X?X?X?
if (OpVT.getVectorElementCount() != ArgVT.getVectorElementCount())
return SDValue();
if (isTypeLegal(OpVT) && !isTypeLegal(ArgVT)) {
assert(OpVT.isFloatingPoint() && !ArgVT.isFloatingPoint() &&
"Expected int->fp bitcast!");
@@ -19282,6 +19290,15 @@ void AArch64TargetLowering::ReplaceBITCASTResults(
if (VT.isScalableVector() && !isTypeLegal(VT) && isTypeLegal(SrcVT)) {
assert(!VT.isFloatingPoint() && SrcVT.isFloatingPoint() &&
"Expected fp->int bitcast!");
// Bitcasting between unpacked vector types of different element counts is
// not a NOP because the live elements are laid out differently.
// 01234567
// e.g. nxv2i32 = XX??XX??
// nxv4f16 = X?X?X?X?
if (VT.getVectorElementCount() != SrcVT.getVectorElementCount())
return;
SDValue CastResult = getSVESafeBitCast(getSVEContainerType(VT), Op, DAG);
Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, CastResult));
return;
@@ -21137,6 +21154,17 @@ SDValue AArch64TargetLowering::getSVESafeBitCast(EVT VT, SDValue Op,
EVT PackedVT = getPackedSVEVectorVT(VT.getVectorElementType());
EVT PackedInVT = getPackedSVEVectorVT(InVT.getVectorElementType());
// Safe bitcasting between unpacked vector types of different element counts
// is currently unsupported because the following is missing the necessary
// work to ensure the result's elements live where they're supposed to within
// an SVE register.
// 01234567
// e.g. nxv2i32 = XX??XX??
// nxv4f16 = X?X?X?X?
assert((VT.getVectorElementCount() == InVT.getVectorElementCount() ||
VT == PackedVT || InVT == PackedInVT) &&
"Unexpected bitcast!");
// Pack input if required.
if (InVT != PackedInVT)
Op = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, PackedInVT, Op);

View File

@@ -610,11 +610,17 @@ define <vscale x 4 x i16> @bitcast_nxv4f16_to_nxv4i16(<vscale x 4 x half> %v) #0
ret <vscale x 4 x i16> %bc
}
; TODO: Invalid code generation because the bitcast must change the in-register
; layout when casting between unpacked scalable vector types.
define <vscale x 4 x i16> @bitcast_nxv2f32_to_nxv4i16(<vscale x 2 x float> %v) #0 {
; CHECK-LABEL: bitcast_nxv2f32_to_nxv4i16:
; CHECK: // %bb.0:
; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT: addvl sp, sp, #-1
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: st1w { z0.d }, p0, [sp, #1, mul vl]
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: ld1h { z0.s }, p0/z, [sp, #1, mul vl]
; CHECK-NEXT: addvl sp, sp, #1
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
%bc = bitcast <vscale x 2 x float> %v to <vscale x 4 x i16>
ret <vscale x 4 x i16> %bc
@@ -664,11 +670,17 @@ define <vscale x 2 x i32> @bitcast_nxv4i16_to_nxv2i32(<vscale x 4 x i16> %v) #0
ret <vscale x 2 x i32> %bc
}
; TODO: Invalid code generation because the bitcast must change the in-register
; layout when casting between unpacked scalable vector types.
define <vscale x 2 x i32> @bitcast_nxv4f16_to_nxv2i32(<vscale x 4 x half> %v) #0 {
; CHECK-LABEL: bitcast_nxv4f16_to_nxv2i32:
; CHECK: // %bb.0:
; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT: addvl sp, sp, #-1
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: st1h { z0.s }, p0, [sp, #1, mul vl]
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: ld1w { z0.d }, p0/z, [sp, #1, mul vl]
; CHECK-NEXT: addvl sp, sp, #1
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
%bc = bitcast <vscale x 4 x half> %v to <vscale x 2 x i32>
ret <vscale x 2 x i32> %bc
@@ -682,11 +694,17 @@ define <vscale x 2 x i32> @bitcast_nxv2f32_to_nxv2i32(<vscale x 2 x float> %v) #
ret <vscale x 2 x i32> %bc
}
; TODO: Invalid code generation because the bitcast must change the in-register
; layout when casting between unpacked scalable vector types.
define <vscale x 2 x i32> @bitcast_nxv4bf16_to_nxv2i32(<vscale x 4 x bfloat> %v) #0 {
; CHECK-LABEL: bitcast_nxv4bf16_to_nxv2i32:
; CHECK: // %bb.0:
; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT: addvl sp, sp, #-1
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: st1h { z0.s }, p0, [sp, #1, mul vl]
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: ld1w { z0.d }, p0/z, [sp, #1, mul vl]
; CHECK-NEXT: addvl sp, sp, #1
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
%bc = bitcast <vscale x 4 x bfloat> %v to <vscale x 2 x i32>
ret <vscale x 2 x i32> %bc
@@ -720,21 +738,33 @@ define <vscale x 4 x half> @bitcast_nxv4i16_to_nxv4f16(<vscale x 4 x i16> %v) #0
ret <vscale x 4 x half> %bc
}
; TODO: Invalid code generation because the bitcast must change the in-register
; layout when casting between unpacked scalable vector types.
define <vscale x 4 x half> @bitcast_nxv2i32_to_nxv4f16(<vscale x 2 x i32> %v) #0 {
; CHECK-LABEL: bitcast_nxv2i32_to_nxv4f16:
; CHECK: // %bb.0:
; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT: addvl sp, sp, #-1
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: st1w { z0.d }, p0, [sp, #1, mul vl]
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: ld1h { z0.s }, p0/z, [sp, #1, mul vl]
; CHECK-NEXT: addvl sp, sp, #1
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
%bc = bitcast <vscale x 2 x i32> %v to <vscale x 4 x half>
ret <vscale x 4 x half> %bc
}
; TODO: Invalid code generation because the bitcast must change the in-register
; layout when casting between unpacked scalable vector types.
define <vscale x 4 x half> @bitcast_nxv2f32_to_nxv4f16(<vscale x 2 x float> %v) #0 {
; CHECK-LABEL: bitcast_nxv2f32_to_nxv4f16:
; CHECK: // %bb.0:
; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT: addvl sp, sp, #-1
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: st1w { z0.d }, p0, [sp, #1, mul vl]
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: ld1h { z0.s }, p0/z, [sp, #1, mul vl]
; CHECK-NEXT: addvl sp, sp, #1
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
%bc = bitcast <vscale x 2 x float> %v to <vscale x 4 x half>
ret <vscale x 4 x half> %bc
@@ -768,11 +798,17 @@ define <vscale x 2 x float> @bitcast_nxv8i8_to_nxv2f32(<vscale x 8 x i8> %v) #0
ret <vscale x 2 x float> %bc
}
; TODO: Invalid code generation because the bitcast must change the in-register
; layout when casting between unpacked scalable vector types.
define <vscale x 2 x float> @bitcast_nxv4i16_to_nxv2f32(<vscale x 4 x i16> %v) #0 {
; CHECK-LABEL: bitcast_nxv4i16_to_nxv2f32:
; CHECK: // %bb.0:
; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT: addvl sp, sp, #-1
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: st1h { z0.s }, p0, [sp, #1, mul vl]
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: ld1w { z0.d }, p0/z, [sp, #1, mul vl]
; CHECK-NEXT: addvl sp, sp, #1
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
%bc = bitcast <vscale x 4 x i16> %v to <vscale x 2 x float>
ret <vscale x 2 x float> %bc
@@ -786,21 +822,33 @@ define <vscale x 2 x float> @bitcast_nxv2i32_to_nxv2f32(<vscale x 2 x i32> %v) #
ret <vscale x 2 x float> %bc
}
; TODO: Invalid code generation because the bitcast must change the in-register
; layout when casting between unpacked scalable vector types.
define <vscale x 2 x float> @bitcast_nxv4f16_to_nxv2f32(<vscale x 4 x half> %v) #0 {
; CHECK-LABEL: bitcast_nxv4f16_to_nxv2f32:
; CHECK: // %bb.0:
; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT: addvl sp, sp, #-1
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: st1h { z0.s }, p0, [sp, #1, mul vl]
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: ld1w { z0.d }, p0/z, [sp, #1, mul vl]
; CHECK-NEXT: addvl sp, sp, #1
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
%bc = bitcast <vscale x 4 x half> %v to <vscale x 2 x float>
ret <vscale x 2 x float> %bc
}
; TODO: Invalid code generation because the bitcast must change the in-register
; layout when casting between unpacked scalable vector types.
define <vscale x 2 x float> @bitcast_nxv4bf16_to_nxv2f32(<vscale x 4 x bfloat> %v) #0 {
; CHECK-LABEL: bitcast_nxv4bf16_to_nxv2f32:
; CHECK: // %bb.0:
; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT: addvl sp, sp, #-1
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: st1h { z0.s }, p0, [sp, #1, mul vl]
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: ld1w { z0.d }, p0/z, [sp, #1, mul vl]
; CHECK-NEXT: addvl sp, sp, #1
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
%bc = bitcast <vscale x 4 x bfloat> %v to <vscale x 2 x float>
ret <vscale x 2 x float> %bc
@@ -834,11 +882,17 @@ define <vscale x 4 x bfloat> @bitcast_nxv4i16_to_nxv4bf16(<vscale x 4 x i16> %v)
ret <vscale x 4 x bfloat> %bc
}
; TODO: Invalid code generation because the bitcast must change the in-register
; layout when casting between unpacked scalable vector types.
define <vscale x 4 x bfloat> @bitcast_nxv2i32_to_nxv4bf16(<vscale x 2 x i32> %v) #0 {
; CHECK-LABEL: bitcast_nxv2i32_to_nxv4bf16:
; CHECK: // %bb.0:
; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT: addvl sp, sp, #-1
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: st1w { z0.d }, p0, [sp, #1, mul vl]
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: ld1h { z0.s }, p0/z, [sp, #1, mul vl]
; CHECK-NEXT: addvl sp, sp, #1
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
%bc = bitcast <vscale x 2 x i32> %v to <vscale x 4 x bfloat>
ret <vscale x 4 x bfloat> %bc
@@ -852,11 +906,17 @@ define <vscale x 4 x bfloat> @bitcast_nxv4f16_to_nxv4bf16(<vscale x 4 x half> %v
ret <vscale x 4 x bfloat> %bc
}
; TODO: Invalid code generation because the bitcast must change the in-register
; layout when casting between unpacked scalable vector types.
define <vscale x 4 x bfloat> @bitcast_nxv2f32_to_nxv4bf16(<vscale x 2 x float> %v) #0 {
; CHECK-LABEL: bitcast_nxv2f32_to_nxv4bf16:
; CHECK: // %bb.0:
; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT: addvl sp, sp, #-1
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: st1w { z0.d }, p0, [sp, #1, mul vl]
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: ld1h { z0.s }, p0/z, [sp, #1, mul vl]
; CHECK-NEXT: addvl sp, sp, #1
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
%bc = bitcast <vscale x 2 x float> %v to <vscale x 4 x bfloat>
ret <vscale x 4 x bfloat> %bc
@@ -1049,13 +1109,18 @@ define <vscale x 2 x double> @bitcast_short_i32_to_float(<vscale x 2 x i64> %v)
ret <vscale x 2 x double> %extended
}
; TODO: Invalid code generation because the bitcast must change the in-register
; layout when casting between unpacked scalable vector types.
define <vscale x 2 x float> @bitcast_short_half_to_float(<vscale x 4 x half> %v) #0 {
; CHECK-LABEL: bitcast_short_half_to_float:
; CHECK: // %bb.0:
; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT: addvl sp, sp, #-1
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: fadd z0.h, p0/m, z0.h, z0.h
; CHECK-NEXT: st1h { z0.s }, p0, [sp, #1, mul vl]
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: ld1w { z0.d }, p0/z, [sp, #1, mul vl]
; CHECK-NEXT: addvl sp, sp, #1
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
%add = fadd <vscale x 4 x half> %v, %v
%bitcast = bitcast <vscale x 4 x half> %add to <vscale x 2 x float>