forked from OSchip/llvm-project
[SVE] Fix incorrect code generation for bitcasts of unpacked vector types.
Bitcasting between unpacked scalable vector types of different element counts is not a NOP, because the live elements are laid out differently within the SVE register. For example, across lane positions 0-7, an nxv2i32 occupies lanes as XX??XX?? whereas an nxv4f16 occupies lanes as X?X?X?X? (X = live element, ? = unused). Differential Revision: https://reviews.llvm.org/D126957
This commit is contained in:
parent
471bfb7016
commit
a1121c31d8
|
@ -1731,16 +1731,14 @@ SDValue SelectionDAGLegalize::EmitStackConvert(SDValue SrcOp, EVT SlotVT,
|
|||
SDValue SelectionDAGLegalize::EmitStackConvert(SDValue SrcOp, EVT SlotVT,
|
||||
EVT DestVT, const SDLoc &dl,
|
||||
SDValue Chain) {
|
||||
unsigned SrcSize = SrcOp.getValueSizeInBits();
|
||||
unsigned SlotSize = SlotVT.getSizeInBits();
|
||||
unsigned DestSize = DestVT.getSizeInBits();
|
||||
EVT SrcVT = SrcOp.getValueType();
|
||||
Type *DestType = DestVT.getTypeForEVT(*DAG.getContext());
|
||||
Align DestAlign = DAG.getDataLayout().getPrefTypeAlign(DestType);
|
||||
|
||||
// Don't convert with stack if the load/store is expensive.
|
||||
if ((SrcSize > SlotSize &&
|
||||
if ((SrcVT.bitsGT(SlotVT) &&
|
||||
!TLI.isTruncStoreLegalOrCustom(SrcOp.getValueType(), SlotVT)) ||
|
||||
(SlotSize < DestSize &&
|
||||
(SlotVT.bitsLT(DestVT) &&
|
||||
!TLI.isLoadExtLegalOrCustom(ISD::EXTLOAD, DestVT, SlotVT)))
|
||||
return SDValue();
|
||||
|
||||
|
@ -1758,20 +1756,19 @@ SDValue SelectionDAGLegalize::EmitStackConvert(SDValue SrcOp, EVT SlotVT,
|
|||
// later than DestVT.
|
||||
SDValue Store;
|
||||
|
||||
if (SrcSize > SlotSize)
|
||||
if (SrcVT.bitsGT(SlotVT))
|
||||
Store = DAG.getTruncStore(Chain, dl, SrcOp, FIPtr, PtrInfo,
|
||||
SlotVT, SrcAlign);
|
||||
else {
|
||||
assert(SrcSize == SlotSize && "Invalid store");
|
||||
Store =
|
||||
DAG.getStore(Chain, dl, SrcOp, FIPtr, PtrInfo, SrcAlign);
|
||||
assert(SrcVT.bitsEq(SlotVT) && "Invalid store");
|
||||
Store = DAG.getStore(Chain, dl, SrcOp, FIPtr, PtrInfo, SrcAlign);
|
||||
}
|
||||
|
||||
// Result is a load from the stack slot.
|
||||
if (SlotSize == DestSize)
|
||||
if (SlotVT.bitsEq(DestVT))
|
||||
return DAG.getLoad(DestVT, dl, Store, FIPtr, PtrInfo, DestAlign);
|
||||
|
||||
assert(SlotSize < DestSize && "Unknown extension!");
|
||||
assert(SlotVT.bitsLT(DestVT) && "Unknown extension!");
|
||||
return DAG.getExtLoad(ISD::EXTLOAD, dl, DestVT, Store, FIPtr, PtrInfo, SlotVT,
|
||||
DestAlign);
|
||||
}
|
||||
|
|
|
@ -3817,6 +3817,14 @@ SDValue AArch64TargetLowering::LowerBITCAST(SDValue Op,
|
|||
return LowerFixedLengthBitcastToSVE(Op, DAG);
|
||||
|
||||
if (OpVT.isScalableVector()) {
|
||||
// Bitcasting between unpacked vector types of different element counts is
|
||||
// not a NOP because the live elements are laid out differently.
|
||||
// 01234567
|
||||
// e.g. nxv2i32 = XX??XX??
|
||||
// nxv4f16 = X?X?X?X?
|
||||
if (OpVT.getVectorElementCount() != ArgVT.getVectorElementCount())
|
||||
return SDValue();
|
||||
|
||||
if (isTypeLegal(OpVT) && !isTypeLegal(ArgVT)) {
|
||||
assert(OpVT.isFloatingPoint() && !ArgVT.isFloatingPoint() &&
|
||||
"Expected int->fp bitcast!");
|
||||
|
@ -19282,6 +19290,15 @@ void AArch64TargetLowering::ReplaceBITCASTResults(
|
|||
if (VT.isScalableVector() && !isTypeLegal(VT) && isTypeLegal(SrcVT)) {
|
||||
assert(!VT.isFloatingPoint() && SrcVT.isFloatingPoint() &&
|
||||
"Expected fp->int bitcast!");
|
||||
|
||||
// Bitcasting between unpacked vector types of different element counts is
|
||||
// not a NOP because the live elements are laid out differently.
|
||||
// 01234567
|
||||
// e.g. nxv2i32 = XX??XX??
|
||||
// nxv4f16 = X?X?X?X?
|
||||
if (VT.getVectorElementCount() != SrcVT.getVectorElementCount())
|
||||
return;
|
||||
|
||||
SDValue CastResult = getSVESafeBitCast(getSVEContainerType(VT), Op, DAG);
|
||||
Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, CastResult));
|
||||
return;
|
||||
|
@ -21137,6 +21154,17 @@ SDValue AArch64TargetLowering::getSVESafeBitCast(EVT VT, SDValue Op,
|
|||
EVT PackedVT = getPackedSVEVectorVT(VT.getVectorElementType());
|
||||
EVT PackedInVT = getPackedSVEVectorVT(InVT.getVectorElementType());
|
||||
|
||||
// Safe bitcasting between unpacked vector types of different element counts
|
||||
// is currently unsupported because the following is missing the necessary
|
||||
// work to ensure the result's elements live where they're supposed to within
|
||||
// an SVE register.
|
||||
// 01234567
|
||||
// e.g. nxv2i32 = XX??XX??
|
||||
// nxv4f16 = X?X?X?X?
|
||||
assert((VT.getVectorElementCount() == InVT.getVectorElementCount() ||
|
||||
VT == PackedVT || InVT == PackedInVT) &&
|
||||
"Unexpected bitcast!");
|
||||
|
||||
// Pack input if required.
|
||||
if (InVT != PackedInVT)
|
||||
Op = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, PackedInVT, Op);
|
||||
|
|
|
@ -610,11 +610,17 @@ define <vscale x 4 x i16> @bitcast_nxv4f16_to_nxv4i16(<vscale x 4 x half> %v) #0
|
|||
ret <vscale x 4 x i16> %bc
|
||||
}
|
||||
|
||||
; TODO: Invalid code generation because the bitcast must change the in-register
|
||||
; layout when casting between unpacked scalable vector types.
|
||||
define <vscale x 4 x i16> @bitcast_nxv2f32_to_nxv4i16(<vscale x 2 x float> %v) #0 {
|
||||
; CHECK-LABEL: bitcast_nxv2f32_to_nxv4i16:
|
||||
; CHECK: // %bb.0:
|
||||
; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
|
||||
; CHECK-NEXT: addvl sp, sp, #-1
|
||||
; CHECK-NEXT: ptrue p0.d
|
||||
; CHECK-NEXT: st1w { z0.d }, p0, [sp, #1, mul vl]
|
||||
; CHECK-NEXT: ptrue p0.s
|
||||
; CHECK-NEXT: ld1h { z0.s }, p0/z, [sp, #1, mul vl]
|
||||
; CHECK-NEXT: addvl sp, sp, #1
|
||||
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
|
||||
; CHECK-NEXT: ret
|
||||
%bc = bitcast <vscale x 2 x float> %v to <vscale x 4 x i16>
|
||||
ret <vscale x 4 x i16> %bc
|
||||
|
@ -664,11 +670,17 @@ define <vscale x 2 x i32> @bitcast_nxv4i16_to_nxv2i32(<vscale x 4 x i16> %v) #0
|
|||
ret <vscale x 2 x i32> %bc
|
||||
}
|
||||
|
||||
; TODO: Invalid code generation because the bitcast must change the in-register
|
||||
; layout when casting between unpacked scalable vector types.
|
||||
define <vscale x 2 x i32> @bitcast_nxv4f16_to_nxv2i32(<vscale x 4 x half> %v) #0 {
|
||||
; CHECK-LABEL: bitcast_nxv4f16_to_nxv2i32:
|
||||
; CHECK: // %bb.0:
|
||||
; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
|
||||
; CHECK-NEXT: addvl sp, sp, #-1
|
||||
; CHECK-NEXT: ptrue p0.s
|
||||
; CHECK-NEXT: st1h { z0.s }, p0, [sp, #1, mul vl]
|
||||
; CHECK-NEXT: ptrue p0.d
|
||||
; CHECK-NEXT: ld1w { z0.d }, p0/z, [sp, #1, mul vl]
|
||||
; CHECK-NEXT: addvl sp, sp, #1
|
||||
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
|
||||
; CHECK-NEXT: ret
|
||||
%bc = bitcast <vscale x 4 x half> %v to <vscale x 2 x i32>
|
||||
ret <vscale x 2 x i32> %bc
|
||||
|
@ -682,11 +694,17 @@ define <vscale x 2 x i32> @bitcast_nxv2f32_to_nxv2i32(<vscale x 2 x float> %v) #
|
|||
ret <vscale x 2 x i32> %bc
|
||||
}
|
||||
|
||||
; TODO: Invalid code generation because the bitcast must change the in-register
|
||||
; layout when casting between unpacked scalable vector types.
|
||||
define <vscale x 2 x i32> @bitcast_nxv4bf16_to_nxv2i32(<vscale x 4 x bfloat> %v) #0 {
|
||||
; CHECK-LABEL: bitcast_nxv4bf16_to_nxv2i32:
|
||||
; CHECK: // %bb.0:
|
||||
; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
|
||||
; CHECK-NEXT: addvl sp, sp, #-1
|
||||
; CHECK-NEXT: ptrue p0.s
|
||||
; CHECK-NEXT: st1h { z0.s }, p0, [sp, #1, mul vl]
|
||||
; CHECK-NEXT: ptrue p0.d
|
||||
; CHECK-NEXT: ld1w { z0.d }, p0/z, [sp, #1, mul vl]
|
||||
; CHECK-NEXT: addvl sp, sp, #1
|
||||
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
|
||||
; CHECK-NEXT: ret
|
||||
%bc = bitcast <vscale x 4 x bfloat> %v to <vscale x 2 x i32>
|
||||
ret <vscale x 2 x i32> %bc
|
||||
|
@ -720,21 +738,33 @@ define <vscale x 4 x half> @bitcast_nxv4i16_to_nxv4f16(<vscale x 4 x i16> %v) #0
|
|||
ret <vscale x 4 x half> %bc
|
||||
}
|
||||
|
||||
; TODO: Invalid code generation because the bitcast must change the in-register
|
||||
; layout when casting between unpacked scalable vector types.
|
||||
define <vscale x 4 x half> @bitcast_nxv2i32_to_nxv4f16(<vscale x 2 x i32> %v) #0 {
|
||||
; CHECK-LABEL: bitcast_nxv2i32_to_nxv4f16:
|
||||
; CHECK: // %bb.0:
|
||||
; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
|
||||
; CHECK-NEXT: addvl sp, sp, #-1
|
||||
; CHECK-NEXT: ptrue p0.d
|
||||
; CHECK-NEXT: st1w { z0.d }, p0, [sp, #1, mul vl]
|
||||
; CHECK-NEXT: ptrue p0.s
|
||||
; CHECK-NEXT: ld1h { z0.s }, p0/z, [sp, #1, mul vl]
|
||||
; CHECK-NEXT: addvl sp, sp, #1
|
||||
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
|
||||
; CHECK-NEXT: ret
|
||||
%bc = bitcast <vscale x 2 x i32> %v to <vscale x 4 x half>
|
||||
ret <vscale x 4 x half> %bc
|
||||
}
|
||||
|
||||
; TODO: Invalid code generation because the bitcast must change the in-register
|
||||
; layout when casting between unpacked scalable vector types.
|
||||
define <vscale x 4 x half> @bitcast_nxv2f32_to_nxv4f16(<vscale x 2 x float> %v) #0 {
|
||||
; CHECK-LABEL: bitcast_nxv2f32_to_nxv4f16:
|
||||
; CHECK: // %bb.0:
|
||||
; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
|
||||
; CHECK-NEXT: addvl sp, sp, #-1
|
||||
; CHECK-NEXT: ptrue p0.d
|
||||
; CHECK-NEXT: st1w { z0.d }, p0, [sp, #1, mul vl]
|
||||
; CHECK-NEXT: ptrue p0.s
|
||||
; CHECK-NEXT: ld1h { z0.s }, p0/z, [sp, #1, mul vl]
|
||||
; CHECK-NEXT: addvl sp, sp, #1
|
||||
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
|
||||
; CHECK-NEXT: ret
|
||||
%bc = bitcast <vscale x 2 x float> %v to <vscale x 4 x half>
|
||||
ret <vscale x 4 x half> %bc
|
||||
|
@ -768,11 +798,17 @@ define <vscale x 2 x float> @bitcast_nxv8i8_to_nxv2f32(<vscale x 8 x i8> %v) #0
|
|||
ret <vscale x 2 x float> %bc
|
||||
}
|
||||
|
||||
; TODO: Invalid code generation because the bitcast must change the in-register
|
||||
; layout when casting between unpacked scalable vector types.
|
||||
define <vscale x 2 x float> @bitcast_nxv4i16_to_nxv2f32(<vscale x 4 x i16> %v) #0 {
|
||||
; CHECK-LABEL: bitcast_nxv4i16_to_nxv2f32:
|
||||
; CHECK: // %bb.0:
|
||||
; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
|
||||
; CHECK-NEXT: addvl sp, sp, #-1
|
||||
; CHECK-NEXT: ptrue p0.s
|
||||
; CHECK-NEXT: st1h { z0.s }, p0, [sp, #1, mul vl]
|
||||
; CHECK-NEXT: ptrue p0.d
|
||||
; CHECK-NEXT: ld1w { z0.d }, p0/z, [sp, #1, mul vl]
|
||||
; CHECK-NEXT: addvl sp, sp, #1
|
||||
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
|
||||
; CHECK-NEXT: ret
|
||||
%bc = bitcast <vscale x 4 x i16> %v to <vscale x 2 x float>
|
||||
ret <vscale x 2 x float> %bc
|
||||
|
@ -786,21 +822,33 @@ define <vscale x 2 x float> @bitcast_nxv2i32_to_nxv2f32(<vscale x 2 x i32> %v) #
|
|||
ret <vscale x 2 x float> %bc
|
||||
}
|
||||
|
||||
; TODO: Invalid code generation because the bitcast must change the in-register
|
||||
; layout when casting between unpacked scalable vector types.
|
||||
define <vscale x 2 x float> @bitcast_nxv4f16_to_nxv2f32(<vscale x 4 x half> %v) #0 {
|
||||
; CHECK-LABEL: bitcast_nxv4f16_to_nxv2f32:
|
||||
; CHECK: // %bb.0:
|
||||
; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
|
||||
; CHECK-NEXT: addvl sp, sp, #-1
|
||||
; CHECK-NEXT: ptrue p0.s
|
||||
; CHECK-NEXT: st1h { z0.s }, p0, [sp, #1, mul vl]
|
||||
; CHECK-NEXT: ptrue p0.d
|
||||
; CHECK-NEXT: ld1w { z0.d }, p0/z, [sp, #1, mul vl]
|
||||
; CHECK-NEXT: addvl sp, sp, #1
|
||||
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
|
||||
; CHECK-NEXT: ret
|
||||
%bc = bitcast <vscale x 4 x half> %v to <vscale x 2 x float>
|
||||
ret <vscale x 2 x float> %bc
|
||||
}
|
||||
|
||||
; TODO: Invalid code generation because the bitcast must change the in-register
|
||||
; layout when casting between unpacked scalable vector types.
|
||||
define <vscale x 2 x float> @bitcast_nxv4bf16_to_nxv2f32(<vscale x 4 x bfloat> %v) #0 {
|
||||
; CHECK-LABEL: bitcast_nxv4bf16_to_nxv2f32:
|
||||
; CHECK: // %bb.0:
|
||||
; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
|
||||
; CHECK-NEXT: addvl sp, sp, #-1
|
||||
; CHECK-NEXT: ptrue p0.s
|
||||
; CHECK-NEXT: st1h { z0.s }, p0, [sp, #1, mul vl]
|
||||
; CHECK-NEXT: ptrue p0.d
|
||||
; CHECK-NEXT: ld1w { z0.d }, p0/z, [sp, #1, mul vl]
|
||||
; CHECK-NEXT: addvl sp, sp, #1
|
||||
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
|
||||
; CHECK-NEXT: ret
|
||||
%bc = bitcast <vscale x 4 x bfloat> %v to <vscale x 2 x float>
|
||||
ret <vscale x 2 x float> %bc
|
||||
|
@ -834,11 +882,17 @@ define <vscale x 4 x bfloat> @bitcast_nxv4i16_to_nxv4bf16(<vscale x 4 x i16> %v)
|
|||
ret <vscale x 4 x bfloat> %bc
|
||||
}
|
||||
|
||||
; TODO: Invalid code generation because the bitcast must change the in-register
|
||||
; layout when casting between unpacked scalable vector types.
|
||||
define <vscale x 4 x bfloat> @bitcast_nxv2i32_to_nxv4bf16(<vscale x 2 x i32> %v) #0 {
|
||||
; CHECK-LABEL: bitcast_nxv2i32_to_nxv4bf16:
|
||||
; CHECK: // %bb.0:
|
||||
; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
|
||||
; CHECK-NEXT: addvl sp, sp, #-1
|
||||
; CHECK-NEXT: ptrue p0.d
|
||||
; CHECK-NEXT: st1w { z0.d }, p0, [sp, #1, mul vl]
|
||||
; CHECK-NEXT: ptrue p0.s
|
||||
; CHECK-NEXT: ld1h { z0.s }, p0/z, [sp, #1, mul vl]
|
||||
; CHECK-NEXT: addvl sp, sp, #1
|
||||
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
|
||||
; CHECK-NEXT: ret
|
||||
%bc = bitcast <vscale x 2 x i32> %v to <vscale x 4 x bfloat>
|
||||
ret <vscale x 4 x bfloat> %bc
|
||||
|
@ -852,11 +906,17 @@ define <vscale x 4 x bfloat> @bitcast_nxv4f16_to_nxv4bf16(<vscale x 4 x half> %v
|
|||
ret <vscale x 4 x bfloat> %bc
|
||||
}
|
||||
|
||||
; TODO: Invalid code generation because the bitcast must change the in-register
|
||||
; layout when casting between unpacked scalable vector types.
|
||||
define <vscale x 4 x bfloat> @bitcast_nxv2f32_to_nxv4bf16(<vscale x 2 x float> %v) #0 {
|
||||
; CHECK-LABEL: bitcast_nxv2f32_to_nxv4bf16:
|
||||
; CHECK: // %bb.0:
|
||||
; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
|
||||
; CHECK-NEXT: addvl sp, sp, #-1
|
||||
; CHECK-NEXT: ptrue p0.d
|
||||
; CHECK-NEXT: st1w { z0.d }, p0, [sp, #1, mul vl]
|
||||
; CHECK-NEXT: ptrue p0.s
|
||||
; CHECK-NEXT: ld1h { z0.s }, p0/z, [sp, #1, mul vl]
|
||||
; CHECK-NEXT: addvl sp, sp, #1
|
||||
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
|
||||
; CHECK-NEXT: ret
|
||||
%bc = bitcast <vscale x 2 x float> %v to <vscale x 4 x bfloat>
|
||||
ret <vscale x 4 x bfloat> %bc
|
||||
|
@ -1049,13 +1109,18 @@ define <vscale x 2 x double> @bitcast_short_i32_to_float(<vscale x 2 x i64> %v)
|
|||
ret <vscale x 2 x double> %extended
|
||||
}
|
||||
|
||||
; TODO: Invalid code generation because the bitcast must change the in-register
|
||||
; layout when casting between unpacked scalable vector types.
|
||||
define <vscale x 2 x float> @bitcast_short_half_to_float(<vscale x 4 x half> %v) #0 {
|
||||
; CHECK-LABEL: bitcast_short_half_to_float:
|
||||
; CHECK: // %bb.0:
|
||||
; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
|
||||
; CHECK-NEXT: addvl sp, sp, #-1
|
||||
; CHECK-NEXT: ptrue p0.s
|
||||
; CHECK-NEXT: fadd z0.h, p0/m, z0.h, z0.h
|
||||
; CHECK-NEXT: st1h { z0.s }, p0, [sp, #1, mul vl]
|
||||
; CHECK-NEXT: ptrue p0.d
|
||||
; CHECK-NEXT: ld1w { z0.d }, p0/z, [sp, #1, mul vl]
|
||||
; CHECK-NEXT: addvl sp, sp, #1
|
||||
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
|
||||
; CHECK-NEXT: ret
|
||||
%add = fadd <vscale x 4 x half> %v, %v
|
||||
%bitcast = bitcast <vscale x 4 x half> %add to <vscale x 2 x float>
|
||||
|
|
Loading…
Reference in New Issue