s390/mm: better annotate 2KB pagetable fragments handling
Explicitly encode the immediate values of the pending-removal nibble (bits 31-28) and the tracking nibble (bits 27-24) of the page refcount whenever these nibbles are tested or changed, for better readability. Also, add some comments describing how the fragments are handled.

Reviewed-by: Gerald Schaefer <gerald.schaefer@linux.ibm.com>
Signed-off-by: Alexander Gordeev <agordeev@linux.ibm.com>
Signed-off-by: Heiko Carstens <hca@linux.ibm.com>
commit 1194372db6
parent c2c224932f
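To illustrate the encoding the message refers to, here is a minimal user-space sketch (not kernel code; AA_NIBBLE and PP_NIBBLE are invented names) of how the two nibbles sit in the upper byte of the page refcount:

#include <stdio.h>

/* Hypothetical helpers modelling the upper byte (bits 24-31) of
 * struct page::_refcount: bits 27-24 (AA) track allocated 2KB halves,
 * bits 31-28 (PP) mark halves that are pending removal.
 */
#define AA_NIBBLE(byte) ((byte) & 0x0fU)
#define PP_NIBBLE(byte) (((byte) >> 4) & 0x0fU)

int main(void)
{
	unsigned int refcount = 0x10U << 24;	/* lower 2K pending removal */
	unsigned int byte = refcount >> 24;

	printf("AA=0x%x PP=0x%x\n", AA_NIBBLE(byte), PP_NIBBLE(byte));	/* AA=0x0 PP=0x1 */
	return 0;
}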
@@ -176,7 +176,75 @@ void page_table_free_pgste(struct page *page)
 #endif /* CONFIG_PGSTE */
 
 /*
  * page table entry allocation/free routines.
+ * A 2KB-pgtable is either the upper or the lower half of a normal page.
+ * The second half of the page may be unused or used as another
+ * 2KB-pgtable.
+ *
+ * Whenever possible the parent page for a new 2KB-pgtable is picked
+ * from the list of partially allocated pages mm_context_t::pgtable_list.
+ * In case the list is empty a new parent page is allocated and added to
+ * the list.
+ *
+ * When a parent page gets fully allocated it contains 2KB-pgtables in both
+ * upper and lower halves and is removed from mm_context_t::pgtable_list.
+ *
+ * When a 2KB-pgtable is freed from a fully allocated parent page, that
+ * page turns partially allocated and is added to mm_context_t::pgtable_list.
+ *
+ * If a 2KB-pgtable is freed from a partially allocated parent page, that
+ * page turns unused and gets removed from mm_context_t::pgtable_list.
+ * Furthermore, the unused parent page is released.
+ *
+ * As follows from the above, no unallocated or fully allocated parent
+ * pages are contained in mm_context_t::pgtable_list.
+ *
+ * The upper byte (bits 24-31) of the parent page _refcount is used
+ * for tracking contained 2KB-pgtables and has the following format:
+ *
+ *   PP  AA
+ * 01234567    upper byte (bits 24-31) of struct page::_refcount
+ *   ||  ||
+ *   ||  |+--- lower 2KB-pgtable is allocated
+ *   ||  +---- upper 2KB-pgtable is allocated
+ *   |+------- lower 2KB-pgtable is pending for removal
+ *   +-------- upper 2KB-pgtable is pending for removal
+ *
+ * (See commit 620b4e903179 ("s390: use _refcount for pgtables") on why
+ * using _refcount is possible).
+ *
+ * When a 2KB-pgtable is allocated the corresponding AA bit is set to 1.
+ * The parent page is either:
+ *   - added to mm_context_t::pgtable_list in case the second half of the
+ *     parent page is still unallocated;
+ *   - removed from mm_context_t::pgtable_list in case both halves of the
+ *     parent page are allocated;
+ * These operations are protected with mm_context_t::lock.
+ *
+ * When a 2KB-pgtable is deallocated the corresponding AA bit is set to 0
+ * and the corresponding PP bit is set to 1 in a single atomic operation.
+ * Thus, PP and AA bits corresponding to the same 2KB-pgtable are mutually
+ * exclusive and may never be both set to 1!
+ * The parent page is either:
+ *   - added to mm_context_t::pgtable_list in case the second half of the
+ *     parent page is still allocated;
+ *   - removed from mm_context_t::pgtable_list in case the second half of
+ *     the parent page is unallocated;
+ * These operations are protected with mm_context_t::lock.
+ *
+ * It is important to understand that mm_context_t::lock only protects
+ * mm_context_t::pgtable_list and AA bits, but not the parent page itself
+ * and PP bits.
+ *
+ * Releasing the parent page happens whenever the PP bit turns from 1 to 0,
+ * while both AA bits and the second PP bit are already unset. Then the
+ * parent page does not contain any 2KB-pgtable fragment anymore, and it has
+ * also been removed from mm_context_t::pgtable_list. It is therefore safe
+ * to release the page.
+ *
+ * PGSTE memory spaces use full 4KB-pgtables and do not need most of the
+ * logic described above. Both AA bits are set to 1 to denote a 4KB-pgtable,
+ * while the PP bits are never used, nor is such a page added to or removed
+ * from mm_context_t::pgtable_list.
+ */
 unsigned long *page_table_alloc(struct mm_struct *mm)
 {
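A stand-alone replay of the state machine described above may help. This sketch uses plain XOR in place of atomic_xor_bits(), since there is no concurrency here, and walks one possible lifecycle of a parent page:

#include <assert.h>
#include <stdio.h>

/* Toy walk-through of the documented transitions, operating on the
 * upper _refcount byte only (user-space sketch, not kernel code).
 */
int main(void)
{
	unsigned int byte = 0x00U;	/* fresh parent page */

	byte ^= 0x01U;			/* allocate lower 2KB-pgtable  -> 0x01 */
	byte ^= 0x02U;			/* allocate upper 2KB-pgtable  -> 0x03 */
	byte ^= 0x11U;			/* free lower: AA->0, PP->1    -> 0x12 */

	/* PP and AA bits of the same fragment are never both set */
	assert(((byte >> 4) & byte & 0x03U) == 0x00U);

	byte ^= 0x10U;			/* delayed release of lower    -> 0x02 */
	printf("final upper byte: 0x%02x\n", byte);
	return 0;
}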
@@ -192,14 +260,23 @@ unsigned long *page_table_alloc(struct mm_struct *mm)
 			page = list_first_entry(&mm->context.pgtable_list,
 						struct page, lru);
 			mask = atomic_read(&page->_refcount) >> 24;
-			mask = (mask | (mask >> 4)) & 3;
-			if (mask != 3) {
+			/*
+			 * The pending removal bits must also be checked.
+			 * Failure to do so might lead to an impossible
+			 * value (e.g. 0x13 or 0x23) being written to _refcount.
+			 * Such values violate the assumption that pending and
+			 * allocation bits are mutually exclusive, and the rest
+			 * of the code derails as a result. That could lead to
+			 * a whole bunch of races and corruptions.
+			 */
+			mask = (mask | (mask >> 4)) & 0x03U;
+			if (mask != 0x03U) {
 				table = (unsigned long *) page_to_virt(page);
 				bit = mask & 1;	/* =1 -> second 2K */
 				if (bit)
 					table += PTRS_PER_PTE;
 				atomic_xor_bits(&page->_refcount,
-						1U << (bit + 24));
+						0x01U << (bit + 24));
 				list_del(&page->lru);
 			}
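The situation the new comment warns about can be reproduced in isolation. Assuming an upper byte of 0x10 (lower half pending removal, nothing allocated), a sketch of the two possible checks:

#include <stdio.h>

/* Why the pending nibble is folded into the availability check:
 * looking at the AA bits alone would offer a half whose PP bit is
 * still set, later producing impossible states such as 0x13 or 0x23.
 */
int main(void)
{
	unsigned int byte = 0x10U;
	unsigned int aa_only = byte & 0x03U;			/* 0x00: "both free" - wrong  */
	unsigned int folded = (byte | (byte >> 4)) & 0x03U;	/* 0x01: lower busy  - right */

	printf("aa_only=0x%02x folded=0x%02x\n", aa_only, folded);
	return 0;
}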
@@ -220,12 +297,12 @@ unsigned long *page_table_alloc(struct mm_struct *mm)
 	table = (unsigned long *) page_to_virt(page);
 	if (mm_alloc_pgste(mm)) {
 		/* Return 4K page table with PGSTEs */
-		atomic_xor_bits(&page->_refcount, 3 << 24);
+		atomic_xor_bits(&page->_refcount, 0x03U << 24);
 		memset64((u64 *)table, _PAGE_INVALID, PTRS_PER_PTE);
 		memset64((u64 *)table + PTRS_PER_PTE, 0, PTRS_PER_PTE);
 	} else {
 		/* Return the first 2K fragment of the page */
-		atomic_xor_bits(&page->_refcount, 1 << 24);
+		atomic_xor_bits(&page->_refcount, 0x01U << 24);
 		memset64((u64 *)table, _PAGE_INVALID, 2 * PTRS_PER_PTE);
 		spin_lock_bh(&mm->context.lock);
 		list_add(&page->lru, &mm->context.pgtable_list);
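For reference, the pgste branch's initialization pattern in ordinary C (a sketch only: init_pgste_table() is an invented name, and the _PAGE_INVALID value is copied from the s390 headers as an assumption):

#include <stdint.h>
#include <stdlib.h>
#include <string.h>

#define PTRS_PER_PTE	256
#define _PAGE_INVALID	0x400ULL	/* assumed value, from the s390 headers */

/* Models the 4K pgste-table initialization above: 256 invalid PTEs
 * in the lower 2KB, 256 zeroed PGSTEs in the upper 2KB.
 */
static void init_pgste_table(uint64_t *table)
{
	for (int i = 0; i < PTRS_PER_PTE; i++)
		table[i] = _PAGE_INVALID;	/* pte half */
	memset(table + PTRS_PER_PTE, 0, PTRS_PER_PTE * sizeof(uint64_t));	/* pgste half */
}

int main(void)
{
	uint64_t *table = malloc(2 * PTRS_PER_PTE * sizeof(uint64_t));

	if (table)
		init_pgste_table(table);
	free(table);
	return 0;
}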
@@ -244,19 +321,24 @@ void page_table_free(struct mm_struct *mm, unsigned long *table)
 		/* Free 2K page table fragment of a 4K page */
 		bit = ((unsigned long) table & ~PAGE_MASK)/(PTRS_PER_PTE*sizeof(pte_t));
 		spin_lock_bh(&mm->context.lock);
+		/*
+		 * Mark the page for delayed release. The actual release
+		 * will happen outside of the critical section from this
+		 * function or from __tlb_remove_table()
+		 */
 		mask = atomic_xor_bits(&page->_refcount, 0x11U << (bit + 24));
 		mask >>= 24;
-		if (mask & 3)
+		if (mask & 0x03U)
 			list_add(&page->lru, &mm->context.pgtable_list);
 		else
 			list_del(&page->lru);
 		spin_unlock_bh(&mm->context.lock);
 		mask = atomic_xor_bits(&page->_refcount, 0x10U << (bit + 24));
 		mask >>= 24;
-		if (mask != 0)
+		if (mask != 0x00U)
 			return;
 	} else {
-		atomic_xor_bits(&page->_refcount, 3U << 24);
+		atomic_xor_bits(&page->_refcount, 0x03U << 24);
 	}
 
 	pgtable_pte_page_dtor(page);
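The two-phase release can be traced with plain numbers; a sketch, assuming the lower half of a fully allocated page is freed:

#include <stdio.h>

/* Two-phase release as in page_table_free(): under the lock the fragment
 * moves AA->PP (XOR 0x11), the lock is dropped, and the page is freed only
 * if the later XOR of the PP bit (0x10) leaves the whole byte at 0x00.
 */
int main(void)
{
	unsigned int byte = 0x03U;		/* both halves allocated */

	byte ^= 0x11U;				/* step 1, under the lock -> 0x12 */
	unsigned int after = byte ^ 0x10U;	/* step 2, lockless       -> 0x02 */

	if (after != 0x00U)
		printf("0x%02x: upper half still in use, keep the page\n", after);
	return 0;
}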
@@ -274,43 +356,48 @@ void page_table_free_rcu(struct mmu_gather *tlb, unsigned long *table,
 	page = virt_to_page(table);
 	if (mm_alloc_pgste(mm)) {
 		gmap_unlink(mm, table, vmaddr);
-		table = (unsigned long *) ((unsigned long)table | 3);
+		table = (unsigned long *) ((unsigned long)table | 0x03U);
 		tlb_remove_table(tlb, table);
 		return;
 	}
 	bit = ((unsigned long) table & ~PAGE_MASK) / (PTRS_PER_PTE*sizeof(pte_t));
 	spin_lock_bh(&mm->context.lock);
+	/*
+	 * Mark the page for delayed release. The actual release will happen
+	 * outside of the critical section from __tlb_remove_table() or from
+	 * page_table_free()
+	 */
 	mask = atomic_xor_bits(&page->_refcount, 0x11U << (bit + 24));
 	mask >>= 24;
-	if (mask & 3)
+	if (mask & 0x03U)
 		list_add_tail(&page->lru, &mm->context.pgtable_list);
 	else
 		list_del(&page->lru);
 	spin_unlock_bh(&mm->context.lock);
-	table = (unsigned long *) ((unsigned long) table | (1U << bit));
+	table = (unsigned long *) ((unsigned long) table | (0x01U << bit));
 	tlb_remove_table(tlb, table);
 }
 
 void __tlb_remove_table(void *_table)
 {
-	unsigned int mask = (unsigned long) _table & 3;
+	unsigned int mask = (unsigned long) _table & 0x03U;
 	void *table = (void *)((unsigned long) _table ^ mask);
 	struct page *page = virt_to_page(table);
 
 	switch (mask) {
-	case 0:		/* pmd, pud, or p4d */
+	case 0x00U:	/* pmd, pud, or p4d */
 		free_pages((unsigned long) table, 2);
 		break;
-	case 1:		/* lower 2K of a 4K page table */
-	case 2:		/* higher 2K of a 4K page table */
+	case 0x01U:	/* lower 2K of a 4K page table */
+	case 0x02U:	/* higher 2K of a 4K page table */
 		mask = atomic_xor_bits(&page->_refcount, mask << (4 + 24));
 		mask >>= 24;
-		if (mask != 0)
+		if (mask != 0x00U)
 			break;
 		fallthrough;
-	case 3:		/* 4K page table with pgstes */
-		if (mask & 3)
-			atomic_xor_bits(&page->_refcount, 3 << 24);
+	case 0x03U:	/* 4K page table with pgstes */
+		if (mask & 0x03U)
+			atomic_xor_bits(&page->_refcount, 0x03U << 24);
 		pgtable_pte_page_dtor(page);
 		__free_page(page);
 		break;
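Finally, a sketch of the pointer tagging that feeds this switch (the address below is made up): a page table is at least 2KB aligned, so the two low bits of its address are free to carry the mask that __tlb_remove_table() decodes (0x00 for pmd/pud/p4d, 0x01/0x02 for the 2K halves, 0x03 for a pgste table).

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t table = 0x100000800ULL;	/* hypothetical 2KB-aligned address */
	unsigned int bit = 1;			/* second (higher) 2K fragment */

	uint64_t tagged = table | (0x01U << bit);	/* encode: tag becomes 0x02 */
	unsigned int mask = tagged & 0x03U;		/* decode the fragment tag */
	uint64_t addr = tagged ^ mask;			/* recover the clean address */

	printf("mask=0x%02x addr=0x%llx\n", mask, (unsigned long long)addr);
	return 0;
}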