[X86] Add i32->float and i64->double bitcast pseudo instructions to store folding table.

We have pseudo instructions we use for bitcasts between these types.
We have them in the load folding table, but not the store folding
table. This adds them there so they can be used for stack spills.

I added an exact size check so that we don't fold when the stack slot
is larger than the GPR. Otherwise the upper bits in the stack slot
would be garbage. That would be fine for Eli's test case in PR47874,
but I'm not sure it's safe in general.

A step towards fixing PR47874. Next steps are to change the ADDSSrr_Int
pseudo instructions to use FR32 as the second source register class
instead of VR128. That will keep the coalescer from promoting the
register class of the bitcast instruction which will make the stack
slot 4 bytes instead of 16 bytes.

Reviewed By: RKSimon

Differential Revision: https://reviews.llvm.org/D89656
This commit is contained in:
Craig Topper 2020-10-19 11:50:47 -07:00
parent ae3625d752
commit e28376ec28
3 changed files with 25 additions and 14 deletions

View File

@ -300,11 +300,13 @@ static const X86MemoryFoldTableEntry MemoryFoldTable0[] = {
{ X86::MOV32rr, X86::MOV32mr, TB_FOLDED_STORE }, { X86::MOV32rr, X86::MOV32mr, TB_FOLDED_STORE },
{ X86::MOV64ri32, X86::MOV64mi32, TB_FOLDED_STORE }, { X86::MOV64ri32, X86::MOV64mi32, TB_FOLDED_STORE },
{ X86::MOV64rr, X86::MOV64mr, TB_FOLDED_STORE }, { X86::MOV64rr, X86::MOV64mr, TB_FOLDED_STORE },
{ X86::MOV64toSDrr, X86::MOV64mr, TB_FOLDED_STORE | TB_NO_REVERSE },
{ X86::MOV8ri, X86::MOV8mi, TB_FOLDED_STORE }, { X86::MOV8ri, X86::MOV8mi, TB_FOLDED_STORE },
{ X86::MOV8rr, X86::MOV8mr, TB_FOLDED_STORE }, { X86::MOV8rr, X86::MOV8mr, TB_FOLDED_STORE },
{ X86::MOV8rr_NOREX, X86::MOV8mr_NOREX, TB_FOLDED_STORE }, { X86::MOV8rr_NOREX, X86::MOV8mr_NOREX, TB_FOLDED_STORE },
{ X86::MOVAPDrr, X86::MOVAPDmr, TB_FOLDED_STORE | TB_ALIGN_16 }, { X86::MOVAPDrr, X86::MOVAPDmr, TB_FOLDED_STORE | TB_ALIGN_16 },
{ X86::MOVAPSrr, X86::MOVAPSmr, TB_FOLDED_STORE | TB_ALIGN_16 }, { X86::MOVAPSrr, X86::MOVAPSmr, TB_FOLDED_STORE | TB_ALIGN_16 },
{ X86::MOVDI2SSrr, X86::MOV32mr, TB_FOLDED_STORE | TB_NO_REVERSE },
{ X86::MOVDQArr, X86::MOVDQAmr, TB_FOLDED_STORE | TB_ALIGN_16 }, { X86::MOVDQArr, X86::MOVDQAmr, TB_FOLDED_STORE | TB_ALIGN_16 },
{ X86::MOVDQUrr, X86::MOVDQUmr, TB_FOLDED_STORE }, { X86::MOVDQUrr, X86::MOVDQUmr, TB_FOLDED_STORE },
{ X86::MOVPDI2DIrr, X86::MOVPDI2DImr, TB_FOLDED_STORE }, { X86::MOVPDI2DIrr, X86::MOVPDI2DImr, TB_FOLDED_STORE },
@ -357,6 +359,8 @@ static const X86MemoryFoldTableEntry MemoryFoldTable0[] = {
{ X86::VEXTRACTI64x4Zrr, X86::VEXTRACTI64x4Zmr, TB_FOLDED_STORE }, { X86::VEXTRACTI64x4Zrr, X86::VEXTRACTI64x4Zmr, TB_FOLDED_STORE },
{ X86::VEXTRACTPSZrr, X86::VEXTRACTPSZmr, TB_FOLDED_STORE }, { X86::VEXTRACTPSZrr, X86::VEXTRACTPSZmr, TB_FOLDED_STORE },
{ X86::VEXTRACTPSrr, X86::VEXTRACTPSmr, TB_FOLDED_STORE }, { X86::VEXTRACTPSrr, X86::VEXTRACTPSmr, TB_FOLDED_STORE },
{ X86::VMOV64toSDZrr, X86::MOV64mr, TB_FOLDED_STORE | TB_NO_REVERSE },
{ X86::VMOV64toSDrr, X86::MOV64mr, TB_FOLDED_STORE | TB_NO_REVERSE },
{ X86::VMOVAPDYrr, X86::VMOVAPDYmr, TB_FOLDED_STORE | TB_ALIGN_32 }, { X86::VMOVAPDYrr, X86::VMOVAPDYmr, TB_FOLDED_STORE | TB_ALIGN_32 },
{ X86::VMOVAPDZ128rr, X86::VMOVAPDZ128mr, TB_FOLDED_STORE | TB_ALIGN_16 }, { X86::VMOVAPDZ128rr, X86::VMOVAPDZ128mr, TB_FOLDED_STORE | TB_ALIGN_16 },
{ X86::VMOVAPDZ256rr, X86::VMOVAPDZ256mr, TB_FOLDED_STORE | TB_ALIGN_32 }, { X86::VMOVAPDZ256rr, X86::VMOVAPDZ256mr, TB_FOLDED_STORE | TB_ALIGN_32 },
@ -367,6 +371,8 @@ static const X86MemoryFoldTableEntry MemoryFoldTable0[] = {
{ X86::VMOVAPSZ256rr, X86::VMOVAPSZ256mr, TB_FOLDED_STORE | TB_ALIGN_32 }, { X86::VMOVAPSZ256rr, X86::VMOVAPSZ256mr, TB_FOLDED_STORE | TB_ALIGN_32 },
{ X86::VMOVAPSZrr, X86::VMOVAPSZmr, TB_FOLDED_STORE | TB_ALIGN_64 }, { X86::VMOVAPSZrr, X86::VMOVAPSZmr, TB_FOLDED_STORE | TB_ALIGN_64 },
{ X86::VMOVAPSrr, X86::VMOVAPSmr, TB_FOLDED_STORE | TB_ALIGN_16 }, { X86::VMOVAPSrr, X86::VMOVAPSmr, TB_FOLDED_STORE | TB_ALIGN_16 },
{ X86::VMOVDI2SSZrr, X86::MOV32mr, TB_FOLDED_STORE | TB_NO_REVERSE },
{ X86::VMOVDI2SSrr, X86::MOV32mr, TB_FOLDED_STORE | TB_NO_REVERSE },
{ X86::VMOVDQA32Z128rr, X86::VMOVDQA32Z128mr, TB_FOLDED_STORE | TB_ALIGN_16 }, { X86::VMOVDQA32Z128rr, X86::VMOVDQA32Z128mr, TB_FOLDED_STORE | TB_ALIGN_16 },
{ X86::VMOVDQA32Z256rr, X86::VMOVDQA32Z256mr, TB_FOLDED_STORE | TB_ALIGN_32 }, { X86::VMOVDQA32Z256rr, X86::VMOVDQA32Z256mr, TB_FOLDED_STORE | TB_ALIGN_32 },
{ X86::VMOVDQA32Zrr, X86::VMOVDQA32Zmr, TB_FOLDED_STORE | TB_ALIGN_64 }, { X86::VMOVDQA32Zrr, X86::VMOVDQA32Zmr, TB_FOLDED_STORE | TB_ALIGN_64 },

View File

@ -5526,6 +5526,10 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
if (I != nullptr) { if (I != nullptr) {
unsigned Opcode = I->DstOp; unsigned Opcode = I->DstOp;
bool FoldedLoad =
isTwoAddrFold || (OpNum == 0 && I->Flags & TB_FOLDED_LOAD) || OpNum > 0;
bool FoldedStore =
isTwoAddrFold || (OpNum == 0 && I->Flags & TB_FOLDED_STORE);
MaybeAlign MinAlign = MaybeAlign MinAlign =
decodeMaybeAlign((I->Flags & TB_ALIGN_MASK) >> TB_ALIGN_SHIFT); decodeMaybeAlign((I->Flags & TB_ALIGN_MASK) >> TB_ALIGN_SHIFT);
if (MinAlign && Alignment < *MinAlign) if (MinAlign && Alignment < *MinAlign)
@ -5536,20 +5540,25 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
const TargetRegisterClass *RC = getRegClass(MI.getDesc(), OpNum, const TargetRegisterClass *RC = getRegClass(MI.getDesc(), OpNum,
&RI, MF); &RI, MF);
unsigned RCSize = TRI.getRegSizeInBits(*RC) / 8; unsigned RCSize = TRI.getRegSizeInBits(*RC) / 8;
if (Size < RCSize) {
// FIXME: Allow scalar intrinsic instructions like ADDSSrm_Int.
// Check if it's safe to fold the load. If the size of the object is // Check if it's safe to fold the load. If the size of the object is
// narrower than the load width, then it's not. // narrower than the load width, then it's not.
if (Opcode != X86::MOV64rm || RCSize != 8 || Size != 4) // FIXME: Allow scalar intrinsic instructions like ADDSSrm_Int.
return nullptr; if (FoldedLoad && Size < RCSize) {
// If this is a 64-bit load, but the spill slot is 32, then we can do // If this is a 64-bit load, but the spill slot is 32, then we can do
// a 32-bit load which is implicitly zero-extended. This likely is // a 32-bit load which is implicitly zero-extended. This likely is
// due to live interval analysis remat'ing a load from stack slot. // due to live interval analysis remat'ing a load from stack slot.
if (Opcode != X86::MOV64rm || RCSize != 8 || Size != 4)
return nullptr;
if (MI.getOperand(0).getSubReg() || MI.getOperand(1).getSubReg()) if (MI.getOperand(0).getSubReg() || MI.getOperand(1).getSubReg())
return nullptr; return nullptr;
Opcode = X86::MOV32rm; Opcode = X86::MOV32rm;
NarrowToMOV32rm = true; NarrowToMOV32rm = true;
} }
// For stores, make sure the size of the object is equal to the size of
// the store. If the object is larger, the extra bits would be garbage. If
// the object is smaller we might overwrite another object or fault.
if (FoldedStore && Size != RCSize)
return nullptr;
} }
if (isTwoAddrFold) if (isTwoAddrFold)

View File

@ -9,8 +9,7 @@ define void @a(float* %arg, i32 %arg1) {
; SSE2-NEXT: testl %esi, %esi ; SSE2-NEXT: testl %esi, %esi
; SSE2-NEXT: jle LBB0_3 ; SSE2-NEXT: jle LBB0_3
; SSE2-NEXT: ## %bb.1: ## %bb2 ; SSE2-NEXT: ## %bb.1: ## %bb2
; SSE2-NEXT: movd %esi, %xmm0 ; SSE2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill
; SSE2-NEXT: movd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Folded Spill
; SSE2-NEXT: movl %esi, %eax ; SSE2-NEXT: movl %esi, %eax
; SSE2-NEXT: .p2align 4, 0x90 ; SSE2-NEXT: .p2align 4, 0x90
; SSE2-NEXT: LBB0_2: ## %bb6 ; SSE2-NEXT: LBB0_2: ## %bb6
@ -31,8 +30,7 @@ define void @a(float* %arg, i32 %arg1) {
; AVX-NEXT: testl %esi, %esi ; AVX-NEXT: testl %esi, %esi
; AVX-NEXT: jle LBB0_3 ; AVX-NEXT: jle LBB0_3
; AVX-NEXT: ## %bb.1: ## %bb2 ; AVX-NEXT: ## %bb.1: ## %bb2
; AVX-NEXT: vmovd %esi, %xmm0 ; AVX-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill
; AVX-NEXT: vmovd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Folded Spill
; AVX-NEXT: movl %esi, %eax ; AVX-NEXT: movl %esi, %eax
; AVX-NEXT: .p2align 4, 0x90 ; AVX-NEXT: .p2align 4, 0x90
; AVX-NEXT: LBB0_2: ## %bb6 ; AVX-NEXT: LBB0_2: ## %bb6
@ -78,8 +76,7 @@ define void @b(double* %arg, i64 %arg1) {
; SSE2-NEXT: testq %rsi, %rsi ; SSE2-NEXT: testq %rsi, %rsi
; SSE2-NEXT: jle LBB1_3 ; SSE2-NEXT: jle LBB1_3
; SSE2-NEXT: ## %bb.1: ## %bb2 ; SSE2-NEXT: ## %bb.1: ## %bb2
; SSE2-NEXT: movq %rsi, %xmm0 ; SSE2-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
; SSE2-NEXT: movq %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Folded Spill
; SSE2-NEXT: .p2align 4, 0x90 ; SSE2-NEXT: .p2align 4, 0x90
; SSE2-NEXT: LBB1_2: ## %bb6 ; SSE2-NEXT: LBB1_2: ## %bb6
; SSE2-NEXT: ## =>This Inner Loop Header: Depth=1 ; SSE2-NEXT: ## =>This Inner Loop Header: Depth=1
@ -99,8 +96,7 @@ define void @b(double* %arg, i64 %arg1) {
; AVX-NEXT: testq %rsi, %rsi ; AVX-NEXT: testq %rsi, %rsi
; AVX-NEXT: jle LBB1_3 ; AVX-NEXT: jle LBB1_3
; AVX-NEXT: ## %bb.1: ## %bb2 ; AVX-NEXT: ## %bb.1: ## %bb2
; AVX-NEXT: vmovq %rsi, %xmm0 ; AVX-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
; AVX-NEXT: vmovq %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Folded Spill
; AVX-NEXT: .p2align 4, 0x90 ; AVX-NEXT: .p2align 4, 0x90
; AVX-NEXT: LBB1_2: ## %bb6 ; AVX-NEXT: LBB1_2: ## %bb6
; AVX-NEXT: ## =>This Inner Loop Header: Depth=1 ; AVX-NEXT: ## =>This Inner Loop Header: Depth=1