[Hexagon] Fix MachineSink not to hoist FP instructions that update USR.

Ideally we should make USR as Def for these floating point instructions.
However, it violates some assembler MCChecker rules. This patch fixes
the issue by marking these FP instructions as non-sinkable.
This commit is contained in:
Sumanth Gundapaneni 2021-08-31 22:46:21 -05:00 committed by Krzysztof Parzyszek
parent 52f347010a
commit 822448635e
3 changed files with 348 additions and 0 deletions

View File

@ -171,6 +171,23 @@ bool HexagonInstrInfo::isAsCheapAsAMove(const MachineInstr &MI) const {
return MI.isAsCheapAsAMove();
}
// Do not sink floating point instructions that updates USR register.
// Example:
// feclearexcept
// F2_conv_w2sf
// fetestexcept
// MachineSink sinks F2_conv_w2sf and we are not able to catch exceptions.
// TODO: On some of these floating point instructions, USR is marked as Use.
// In reality, these instructions also Def the USR. If USR is marked as Def,
// some of the assumptions in assembler packetization are broken.
bool HexagonInstrInfo::shouldSink(const MachineInstr &MI) const {
// Assumption: A floating point instruction that reads the USR will write
// the USR as well.
if (isFloat(MI) && MI.hasRegisterImplicitUseOperand(Hexagon::USR))
return false;
return true;
}
/// Find the hardware loop instruction used to set-up the specified loop.
/// On Hexagon, we have two instructions used to set-up the hardware loop
/// (LOOP0, LOOP1) with corresponding endloop (ENDLOOP0, ENDLOOP1) instructions

View File

@ -337,6 +337,12 @@ public:
bool isTailCall(const MachineInstr &MI) const override;
bool isAsCheapAsAMove(const MachineInstr &MI) const override;
// Return true if the instruction should be sunk by MachineSink.
// MachineSink determines on its own whether the instruction is safe to sink;
// this gives the target a hook to override the default behavior with regards
// to which instructions should be sunk.
bool shouldSink(const MachineInstr &MI) const override;
/// HexagonInstrInfo specifics.
unsigned createVR(MachineFunction *MF, MVT VT) const;

View File

@ -0,0 +1,325 @@
# RUN: llc -march=hexagon -run-pass machine-sink -o - %s | FileCheck %s
# Test that MachineSink does not sink F2_conv_w2sf.
# CHECK: name:{{.*}} main
# CHECK: J2_call @feclearexcept
# CHECK: F2_conv_w2sf
# CHECK: J2_call @fetestexcept
--- |
target datalayout = "e-m:e-p:32:32:32-a:0-n16:32-i64:64:64-i32:32:32-i16:16:16-i1:8:8-f32:32:32-f64:64:64-v32:32:32-v64:64:64-v512:512:512-v1024:1024:1024-v2048:2048:2048"
target triple = "hexagon"
@.str = private unnamed_addr constant [4 x i8] c"%d\0A\00", align 1
; Function Attrs: mustprogress nofree norecurse nosync nounwind readnone willreturn
define dso_local i32 @syst_int32_to_float32(i32 %a) local_unnamed_addr #0 {
entry:
%conv = sitofp i32 %a to float
%0 = bitcast float %conv to i32
ret i32 %0
}
; Function Attrs: argmemonly mustprogress nofree nosync nounwind willreturn
declare void @llvm.lifetime.start.p0i8(i64 immarg, i8* nocapture) #1
; Function Attrs: argmemonly mustprogress nofree nosync nounwind willreturn
declare void @llvm.lifetime.end.p0i8(i64 immarg, i8* nocapture) #1
; Function Attrs: nounwind
define dso_local i32 @main() local_unnamed_addr #2 {
entry:
%a = alloca i32, align 4
%b = alloca i32, align 4
%c = alloca i32, align 4
%a.0.a.0.a.0.a.0..sroa_cast = bitcast i32* %a to i8*
call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %a.0.a.0.a.0.a.0..sroa_cast)
store volatile i32 -16777235, i32* %a, align 4, !tbaa !3
%b.0.b.0.b.0.b.0..sroa_cast = bitcast i32* %b to i8*
call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %b.0.b.0.b.0.b.0..sroa_cast)
store volatile i32 34, i32* %b, align 4, !tbaa !3
%c.0.c.0.c.0.c.0..sroa_cast = bitcast i32* %c to i8*
call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %c.0.c.0.c.0.c.0..sroa_cast)
store volatile i32 34, i32* %c, align 4, !tbaa !3
%b.0.b.0.b.0.b.0.29 = load volatile i32, i32* %b, align 4, !tbaa !3
%cmp30 = icmp sgt i32 %b.0.b.0.b.0.b.0.29, 0
br i1 %cmp30, label %for.body, label %if.end
for.cond.for.cond.cleanup_crit_edge: ; preds = %for.body
%conv.i.le = sitofp i32 %a.0.a.0.a.0.a.0. to float
%0 = bitcast float %conv.i.le to i32
%phi.cmp = icmp ugt i32 %0, 100
br i1 %phi.cmp, label %if.then, label %if.end
for.body: ; preds = %entry, %for.body
%i.031 = phi i32 [ %inc4, %for.body ], [ 0, %entry ]
%c.0.c.0.c.0.c.0. = load volatile i32, i32* %c, align 4, !tbaa !3
%inc = add nsw i32 %c.0.c.0.c.0.c.0., 1
store volatile i32 %inc, i32* %c, align 4, !tbaa !3
%call = tail call i32 @feclearexcept(i32 31) #5
%a.0.a.0.a.0.a.0. = load volatile i32, i32* %a, align 4, !tbaa !3
%call2 = tail call i32 @fetestexcept(i32 31) #5
%call3 = tail call i32 (i8*, ...) @printf(i8* nonnull dereferenceable(1) getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i32 %call2) #5
%inc4 = add nuw nsw i32 %i.031, 1
%b.0.b.0.b.0.b.0. = load volatile i32, i32* %b, align 4, !tbaa !3
%cmp = icmp slt i32 %inc4, %b.0.b.0.b.0.b.0.
br i1 %cmp, label %for.body, label %for.cond.for.cond.cleanup_crit_edge, !llvm.loop !7
if.then: ; preds = %for.cond.for.cond.cleanup_crit_edge
%a.0.a.0.a.0.a.0.23 = load volatile i32, i32* %a, align 4, !tbaa !3
%b.0.b.0.b.0.b.0.20 = load volatile i32, i32* %b, align 4, !tbaa !3
%add = add nsw i32 %b.0.b.0.b.0.b.0.20, %a.0.a.0.a.0.a.0.23
%c.0.c.0.c.0.c.0.17 = load volatile i32, i32* %c, align 4, !tbaa !3
%add7 = add nsw i32 %add, %c.0.c.0.c.0.c.0.17
br label %cleanup
if.end: ; preds = %entry, %for.cond.for.cond.cleanup_crit_edge
%a.0.a.0.a.0.a.0.24 = load volatile i32, i32* %a, align 4, !tbaa !3
%b.0.b.0.b.0.b.0.21 = load volatile i32, i32* %b, align 4, !tbaa !3
%mul.neg = mul i32 %b.0.b.0.b.0.b.0.21, -6
%sub = add i32 %mul.neg, %a.0.a.0.a.0.a.0.24
%c.0.c.0.c.0.c.0.18 = load volatile i32, i32* %c, align 4, !tbaa !3
%mul8 = mul nsw i32 %c.0.c.0.c.0.c.0.18, 3
%add9 = add nsw i32 %sub, %mul8
br label %cleanup
cleanup: ; preds = %if.end, %if.then
%retval.0 = phi i32 [ %add7, %if.then ], [ %add9, %if.end ]
%1 = bitcast i32* %c to i8*
%2 = bitcast i32* %b to i8*
%3 = bitcast i32* %a to i8*
call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %1)
call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %2)
call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %3)
ret i32 %retval.0
}
declare dso_local i32 @feclearexcept(i32) local_unnamed_addr #3
declare dso_local i32 @fetestexcept(i32) local_unnamed_addr #3
; Function Attrs: nofree nounwind
declare dso_local noundef i32 @printf(i8* nocapture noundef readonly, ...) local_unnamed_addr #4
attributes #0 = { mustprogress nofree norecurse nosync nounwind readnone willreturn "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="hexagonv68" "target-features"="+v68,-long-calls" }
attributes #1 = { argmemonly mustprogress nofree nosync nounwind willreturn }
attributes #2 = { nounwind "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="hexagonv68" "target-features"="+v68,-long-calls" }
attributes #3 = { "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="hexagonv68" "target-features"="+v68,-long-calls" }
attributes #4 = { nofree nounwind "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="hexagonv68" "target-features"="+v68,-long-calls" }
attributes #5 = { nounwind }
!llvm.module.flags = !{!0, !1}
!0 = !{i32 1, !"wchar_size", i32 4}
!1 = !{i32 7, !"frame-pointer", i32 2}
!3 = !{!4, !4, i64 0}
!4 = !{!"int", !5, i64 0}
!5 = !{!"omnipotent char", !6, i64 0}
!6 = !{!"Simple C/C++ TBAA"}
!7 = distinct !{!7, !8}
!8 = !{!"llvm.loop.mustprogress"}
...
---
name: syst_int32_to_float32
alignment: 16
exposesReturnsTwice: false
legalized: false
regBankSelected: false
selected: false
failedISel: false
tracksRegLiveness: true
hasWinCFI: false
registers:
- { id: 0, class: intregs, preferred-register: '' }
- { id: 1, class: intregs, preferred-register: '' }
liveins:
- { reg: '$r0', virtual-reg: '%0' }
frameInfo:
isFrameAddressTaken: false
isReturnAddressTaken: false
hasStackMap: false
hasPatchPoint: false
stackSize: 0
offsetAdjustment: 0
maxAlignment: 1
adjustsStack: false
hasCalls: false
stackProtector: ''
maxCallFrameSize: 4294967295
cvBytesOfCalleeSavedRegisters: 0
hasOpaqueSPAdjustment: false
hasVAStart: false
hasMustTailInVarArgFunc: false
hasTailCall: false
localFrameSize: 0
savePoint: ''
restorePoint: ''
fixedStack: []
stack: []
callSites: []
debugValueSubstitutions: []
constants: []
machineFunctionInfo: {}
body: |
bb.0.entry:
liveins: $r0
%0:intregs = COPY $r0
%1:intregs = F2_conv_w2sf %0, implicit $usr
$r0 = COPY %1
PS_jmpret $r31, implicit-def dead $pc, implicit $r0
...
---
name: main
alignment: 16
exposesReturnsTwice: false
legalized: false
regBankSelected: false
selected: false
failedISel: false
tracksRegLiveness: true
hasWinCFI: false
registers:
- { id: 0, class: intregs, preferred-register: '' }
- { id: 1, class: intregs, preferred-register: '' }
- { id: 2, class: intregs, preferred-register: '' }
- { id: 3, class: intregs, preferred-register: '' }
- { id: 4, class: intregs, preferred-register: '' }
- { id: 5, class: intregs, preferred-register: '' }
- { id: 6, class: intregs, preferred-register: '' }
- { id: 7, class: intregs, preferred-register: '' }
- { id: 8, class: predregs, preferred-register: '' }
- { id: 9, class: intregs, preferred-register: '' }
- { id: 10, class: intregs, preferred-register: '' }
- { id: 11, class: intregs, preferred-register: '' }
- { id: 12, class: intregs, preferred-register: '' }
- { id: 13, class: intregs, preferred-register: '' }
- { id: 14, class: intregs, preferred-register: '' }
- { id: 15, class: intregs, preferred-register: '' }
- { id: 16, class: predregs, preferred-register: '' }
- { id: 17, class: intregs, preferred-register: '' }
- { id: 18, class: predregs, preferred-register: '' }
- { id: 19, class: intregs, preferred-register: '' }
- { id: 20, class: intregs, preferred-register: '' }
- { id: 21, class: intregs, preferred-register: '' }
- { id: 22, class: intregs, preferred-register: '' }
- { id: 23, class: intregs, preferred-register: '' }
- { id: 24, class: intregs, preferred-register: '' }
- { id: 25, class: intregs, preferred-register: '' }
- { id: 26, class: intregs, preferred-register: '' }
- { id: 27, class: intregs, preferred-register: '' }
liveins: []
frameInfo:
isFrameAddressTaken: false
isReturnAddressTaken: false
hasStackMap: false
hasPatchPoint: false
stackSize: 0
offsetAdjustment: 0
maxAlignment: 4
adjustsStack: false
hasCalls: true
stackProtector: ''
maxCallFrameSize: 4294967295
cvBytesOfCalleeSavedRegisters: 0
hasOpaqueSPAdjustment: false
hasVAStart: false
hasMustTailInVarArgFunc: false
hasTailCall: false
localFrameSize: 0
savePoint: ''
restorePoint: ''
fixedStack: []
stack:
- { id: 0, name: a, type: default, offset: 0, size: 4, alignment: 4,
stack-id: default, callee-saved-register: '', callee-saved-restored: true,
debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
- { id: 1, name: b, type: default, offset: 0, size: 4, alignment: 4,
stack-id: default, callee-saved-register: '', callee-saved-restored: true,
debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
- { id: 2, name: c, type: default, offset: 0, size: 4, alignment: 4,
stack-id: default, callee-saved-register: '', callee-saved-restored: true,
debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
callSites: []
debugValueSubstitutions: []
constants: []
machineFunctionInfo: {}
body: |
bb.0.entry:
successors: %bb.6(0x50000000), %bb.4(0x30000000)
S4_storeiri_io %stack.0.a, 0, -16777235 :: (volatile store (s32) into %ir.a, !tbaa !3)
S4_storeiri_io %stack.1.b, 0, 34 :: (volatile store (s32) into %ir.b, !tbaa !3)
S4_storeiri_io %stack.2.c, 0, 34 :: (volatile store (s32) into %ir.c, !tbaa !3)
%7:intregs = L2_loadri_io %stack.1.b, 0 :: (volatile dereferenceable load (s32) from %ir.b, !tbaa !3)
%8:predregs = C2_cmpgti %7, 0
%6:intregs = A2_tfrsi 0
J2_jumpf %8, %bb.4, implicit-def $pc
bb.6:
successors: %bb.2(0x80000000)
%9:intregs = A2_tfrsi 31
%13:intregs = A2_tfrsi @.str
J2_jump %bb.2, implicit-def $pc
bb.1.for.cond.for.cond.cleanup_crit_edge:
successors: %bb.4(0x40000000)
J2_jump %bb.4, implicit-def dead $pc
bb.2.for.body:
successors: %bb.2(0x7c000000), %bb.1(0x04000000)
%0:intregs = PHI %6, %bb.6, %2, %bb.2
L4_iadd_memopw_io %stack.2.c, 0, 1 :: (volatile store (s32) into %ir.c, !tbaa !3), (volatile dereferenceable load (s32) from %ir.c, !tbaa !3)
ADJCALLSTACKDOWN 0, 0, implicit-def $r29, implicit-def dead $r30, implicit $r31, implicit $r30, implicit $r29
$r0 = COPY %9
J2_call @feclearexcept, hexagoncsr, implicit-def dead $pc, implicit-def dead $r31, implicit $r29, implicit $r0, implicit-def $r29, implicit-def $r0
ADJCALLSTACKUP 0, 0, implicit-def dead $r29, implicit-def dead $r30, implicit-def dead $r31, implicit $r29
%1:intregs = L2_loadri_io %stack.0.a, 0 :: (volatile dereferenceable load (s32) from %ir.a, !tbaa !3)
ADJCALLSTACKDOWN 0, 0, implicit-def $r29, implicit-def dead $r30, implicit $r31, implicit $r30, implicit $r29
%17:intregs = F2_conv_w2sf %1, implicit $usr
$r0 = COPY %9
J2_call @fetestexcept, hexagoncsr, implicit-def dead $pc, implicit-def dead $r31, implicit $r29, implicit $r0, implicit-def $r29, implicit-def $r0
ADJCALLSTACKUP 0, 0, implicit-def dead $r29, implicit-def dead $r30, implicit-def dead $r31, implicit $r29
%11:intregs = COPY $r0
%12:intregs = COPY $r29
S2_storeri_io %12, 0, %11 :: (store (s32) into stack)
ADJCALLSTACKDOWN 4, 0, implicit-def $r29, implicit-def dead $r30, implicit $r31, implicit $r30, implicit $r29
$r0 = COPY %13
J2_call @printf, hexagoncsr, implicit-def dead $pc, implicit-def dead $r31, implicit $r29, implicit $r0, implicit-def $r29, implicit-def $r0
ADJCALLSTACKUP 4, 0, implicit-def dead $r29, implicit-def dead $r30, implicit-def dead $r31, implicit $r29
%2:intregs = nuw nsw A2_addi %0, 1
%15:intregs = L2_loadri_io %stack.1.b, 0 :: (volatile dereferenceable load (s32) from %ir.b, !tbaa !3)
%16:predregs = C2_cmpgt %15, %2
J2_jumpt %16, %bb.2, implicit-def dead $pc
J2_jump %bb.1, implicit-def dead $pc
bb.3.if.then:
successors: %bb.5(0x80000000)
%18:predregs = C2_cmpgtui %17, 100
%24:intregs = L2_loadri_io %stack.0.a, 0 :: (volatile dereferenceable load (s32) from %ir.a, !tbaa !3)
%25:intregs = L2_loadri_io %stack.1.b, 0 :: (volatile dereferenceable load (s32) from %ir.b, !tbaa !3)
%26:intregs = L2_loadri_io %stack.2.c, 0 :: (volatile dereferenceable load (s32) from %ir.c, !tbaa !3)
%3:intregs = nsw M2_acci %26, %25, %24
J2_jumpf %18, %bb.5, implicit-def dead $pc
J2_jump %bb.5, implicit-def dead $pc
bb.4.if.end:
successors: %bb.5(0x80000000)
%19:intregs = L2_loadri_io %stack.0.a, 0 :: (volatile dereferenceable load (s32) from %ir.a, !tbaa !3)
%20:intregs = L2_loadri_io %stack.1.b, 0 :: (volatile dereferenceable load (s32) from %ir.b, !tbaa !3)
%27:intregs = M2_macsin %19, %20, 6
%23:intregs = L2_loadri_io %stack.2.c, 0 :: (volatile dereferenceable load (s32) from %ir.c, !tbaa !3)
%4:intregs = nsw M2_macsip %27, %23, 3
bb.5.cleanup:
%5:intregs = PHI %4, %bb.4, %3, %bb.3
$r0 = COPY %5
PS_jmpret $r31, implicit-def dead $pc, implicit $r0
...