s390/mm: better annotate 2KB pagetable fragments handling

Explicitly encode immediate value of pending remove nibble
(bits 31-28) and tracking nibble (bits 27-24) of the page
refcount whenever these nibbles are tested or changed, for
better readability. Also, add some comments describing how
the fragments are handled.

Reviewed-by: Gerald Schaefer <gerald.schaefer@linux.ibm.com>
Signed-off-by: Alexander Gordeev <agordeev@linux.ibm.com>
Signed-off-by: Heiko Carstens <hca@linux.ibm.com>
This commit is contained in:
Alexander Gordeev 2021-11-04 07:14:45 +01:00 committed by Heiko Carstens
parent c2c224932f
commit 1194372db6
1 changed files with 107 additions and 20 deletions

View File

@ -176,7 +176,75 @@ void page_table_free_pgste(struct page *page)
#endif /* CONFIG_PGSTE */
/*
* page table entry allocation/free routines.
* A 2KB-pgtable is either upper or lower half of a normal page.
* The second half of the page may be unused or used as another
* 2KB-pgtable.
*
* Whenever possible the parent page for a new 2KB-pgtable is picked
* from the list of partially allocated pages mm_context_t::pgtable_list.
* In case the list is empty a new parent page is allocated and added to
* the list.
*
* When a parent page gets fully allocated it contains 2KB-pgtables in both
* upper and lower halves and is removed from mm_context_t::pgtable_list.
*
* When 2KB-pgtable is freed from a fully allocated parent page, that
* page turns partially allocated and is added to mm_context_t::pgtable_list.
*
* If 2KB-pgtable is freed from the partially allocated parent page that
* page turns unused and gets removed from mm_context_t::pgtable_list.
* Furthermore, the unused parent page is released.
*
* As follows from the above, no unallocated or fully allocated parent
* pages are contained in mm_context_t::pgtable_list.
*
* The upper byte (bits 24-31) of the parent page _refcount is used
* for tracking contained 2KB-pgtables and has the following format:
*
*   PP  AA
* 01234567    upper byte (bits 24-31) of struct page::_refcount
*   ||  ||
*   ||  |+--- upper 2KB-pgtable is allocated
*   ||  +---- lower 2KB-pgtable is allocated
*   |+------- upper 2KB-pgtable is pending for removal
*   +-------- lower 2KB-pgtable is pending for removal
*
* (See commit 620b4e903179 ("s390: use _refcount for pgtables") on why
* using _refcount is possible).
*
* When 2KB-pgtable is allocated the corresponding AA bit is set to 1.
* The parent page is either:
* - added to mm_context_t::pgtable_list in case the second half of the
* parent page is still unallocated;
* - removed from mm_context_t::pgtable_list in case both halves of the
* parent page are allocated;
* These operations are protected with mm_context_t::lock.
*
* When 2KB-pgtable is deallocated the corresponding AA bit is set to 0
* and the corresponding PP bit is set to 1 in a single atomic operation.
* Thus, PP and AA bits corresponding to the same 2KB-pgtable are mutually
* exclusive and may never be both set to 1!
* The parent page is either:
* - added to mm_context_t::pgtable_list in case the second half of the
* parent page is still allocated;
* - removed from mm_context_t::pgtable_list in case the second half of
* the parent page is unallocated;
* These operations are protected with mm_context_t::lock.
*
* It is important to understand that mm_context_t::lock only protects
* mm_context_t::pgtable_list and AA bits, but not the parent page itself
* and PP bits.
*
* Releasing the parent page happens whenever the PP bit turns from 1 to 0,
* while both AA bits and the second PP bit are already unset. Then the
* parent page does not contain any 2KB-pgtable fragment anymore, and it has
* also been removed from mm_context_t::pgtable_list. It is safe to release
* the page therefore.
*
* PGSTE memory spaces use full 4KB-pgtables and do not need most of the
* logic described above. Both AA bits are set to 1 to denote a 4KB-pgtable
* while the PP bits are never used, nor is such a page added to or removed
* from mm_context_t::pgtable_list.
*/
unsigned long *page_table_alloc(struct mm_struct *mm)
{
@ -192,14 +260,23 @@ unsigned long *page_table_alloc(struct mm_struct *mm)
page = list_first_entry(&mm->context.pgtable_list,
struct page, lru);
mask = atomic_read(&page->_refcount) >> 24;
mask = (mask | (mask >> 4)) & 3;
if (mask != 3) {
/*
* The pending removal bits must also be checked.
* Failure to do so might lead to an impossible
* value (e.g. 0x13 or 0x23) being written to _refcount.
* Such values violate the assumption that pending and
* allocation bits are mutually exclusive, and the rest
* of the code derails as a result. That could lead to
* a whole bunch of races and corruptions.
*/
mask = (mask | (mask >> 4)) & 0x03U;
if (mask != 0x03U) {
table = (unsigned long *) page_to_virt(page);
bit = mask & 1; /* =1 -> second 2K */
if (bit)
table += PTRS_PER_PTE;
atomic_xor_bits(&page->_refcount,
1U << (bit + 24));
0x01U << (bit + 24));
list_del(&page->lru);
}
}
@ -220,12 +297,12 @@ unsigned long *page_table_alloc(struct mm_struct *mm)
table = (unsigned long *) page_to_virt(page);
if (mm_alloc_pgste(mm)) {
/* Return 4K page table with PGSTEs */
atomic_xor_bits(&page->_refcount, 3 << 24);
atomic_xor_bits(&page->_refcount, 0x03U << 24);
memset64((u64 *)table, _PAGE_INVALID, PTRS_PER_PTE);
memset64((u64 *)table + PTRS_PER_PTE, 0, PTRS_PER_PTE);
} else {
/* Return the first 2K fragment of the page */
atomic_xor_bits(&page->_refcount, 1 << 24);
atomic_xor_bits(&page->_refcount, 0x01U << 24);
memset64((u64 *)table, _PAGE_INVALID, 2 * PTRS_PER_PTE);
spin_lock_bh(&mm->context.lock);
list_add(&page->lru, &mm->context.pgtable_list);
@ -244,19 +321,24 @@ void page_table_free(struct mm_struct *mm, unsigned long *table)
/* Free 2K page table fragment of a 4K page */
bit = ((unsigned long) table & ~PAGE_MASK)/(PTRS_PER_PTE*sizeof(pte_t));
spin_lock_bh(&mm->context.lock);
/*
* Mark the page for delayed release. The actual release
* will happen outside of the critical section from this
* function or from __tlb_remove_table()
*/
mask = atomic_xor_bits(&page->_refcount, 0x11U << (bit + 24));
mask >>= 24;
if (mask & 3)
if (mask & 0x03U)
list_add(&page->lru, &mm->context.pgtable_list);
else
list_del(&page->lru);
spin_unlock_bh(&mm->context.lock);
mask = atomic_xor_bits(&page->_refcount, 0x10U << (bit + 24));
mask >>= 24;
if (mask != 0)
if (mask != 0x00U)
return;
} else {
atomic_xor_bits(&page->_refcount, 3U << 24);
atomic_xor_bits(&page->_refcount, 0x03U << 24);
}
pgtable_pte_page_dtor(page);
@ -274,43 +356,48 @@ void page_table_free_rcu(struct mmu_gather *tlb, unsigned long *table,
page = virt_to_page(table);
if (mm_alloc_pgste(mm)) {
gmap_unlink(mm, table, vmaddr);
table = (unsigned long *) ((unsigned long)table | 3);
table = (unsigned long *) ((unsigned long)table | 0x03U);
tlb_remove_table(tlb, table);
return;
}
bit = ((unsigned long) table & ~PAGE_MASK) / (PTRS_PER_PTE*sizeof(pte_t));
spin_lock_bh(&mm->context.lock);
/*
* Mark the page for delayed release. The actual release will happen
* outside of the critical section from __tlb_remove_table() or from
* page_table_free()
*/
mask = atomic_xor_bits(&page->_refcount, 0x11U << (bit + 24));
mask >>= 24;
if (mask & 3)
if (mask & 0x03U)
list_add_tail(&page->lru, &mm->context.pgtable_list);
else
list_del(&page->lru);
spin_unlock_bh(&mm->context.lock);
table = (unsigned long *) ((unsigned long) table | (1U << bit));
table = (unsigned long *) ((unsigned long) table | (0x01U << bit));
tlb_remove_table(tlb, table);
}
void __tlb_remove_table(void *_table)
{
unsigned int mask = (unsigned long) _table & 3;
unsigned int mask = (unsigned long) _table & 0x03U;
void *table = (void *)((unsigned long) _table ^ mask);
struct page *page = virt_to_page(table);
switch (mask) {
case 0: /* pmd, pud, or p4d */
case 0x00U: /* pmd, pud, or p4d */
free_pages((unsigned long) table, 2);
break;
case 1: /* lower 2K of a 4K page table */
case 2: /* higher 2K of a 4K page table */
case 0x01U: /* lower 2K of a 4K page table */
case 0x02U: /* higher 2K of a 4K page table */
mask = atomic_xor_bits(&page->_refcount, mask << (4 + 24));
mask >>= 24;
if (mask != 0)
if (mask != 0x00U)
break;
fallthrough;
case 3: /* 4K page table with pgstes */
if (mask & 3)
atomic_xor_bits(&page->_refcount, 3 << 24);
case 0x03U: /* 4K page table with pgstes */
if (mask & 0x03U)
atomic_xor_bits(&page->_refcount, 0x03U << 24);
pgtable_pte_page_dtor(page);
__free_page(page);
break;