// SPDX-License-Identifier: GPL-2.0
#include <linux/mm.h>
#include <linux/mm_inline.h>
#include <linux/sched/mm.h>
#include <linux/wait.h>
#include <linux/sched/signal.h>
#include <linux/mm_pte.h>
#include <asm/pgalloc.h>

#define MIN(x, y) ((x) < (y) ? (x) : (y))
#define MAX(x, y) ((x) > (y) ? (x) : (y))

DEFINE_STATIC_KEY_FALSE(async_fork_enabled_key);
DEFINE_STATIC_KEY_FALSE(async_fork_staging_key);

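/*
 * Iteration modes for the async-fork page table walkers:
 *
 * ITER_FAST:     fast path during fork(); the parent only write-protects
 *                the pmd entries and defers the pte copy.
 * ITER_SLOW:     the deferred pte copy, with both src and dst vma locked.
 * ITER_FALLBACK: undo the async-fork marking on the source pmds.
 * ITER_MADVISE:  fixup path where the dst vma is not locked and cannot be
 *                used directly.
 */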
typedef enum {
	ITER_FAST,
	ITER_SLOW,
	ITER_FALLBACK,
	ITER_MADVISE,
} iter_type_t;

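/*
 * Populate @new into @pud under pud_lock(), mirroring the populate step of
 * __pmd_alloc(). If another thread populated the pud first, free @new.
 */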
static void async_pmd_populate(struct mm_struct *mm, pud_t *pud,
			       unsigned long address,
			       pmd_t *new)
{
	spinlock_t *ptl;

	/* The same as what __pmd_alloc() does */
	smp_wmb();
	ptl = pud_lock(mm, pud);
#ifndef __ARCH_HAS_4LEVEL_HACK
	if (!pud_present(*pud)) {
		mm_inc_nr_pmds(mm);
		pud_populate(mm, pud, new);
	} else /* Another has populated it */
		pmd_free(mm, new);
#else
	if (!pgd_present(*pud)) {
		mm_inc_nr_pmds(mm);
		pgd_populate(mm, pud, new);
	} else /* Another has populated it */
		pmd_free(mm, new);
#endif
	spin_unlock(ptl);
}

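/*
 * Populate @new into @pmd under pmd_lock(), mirroring the populate step of
 * __pte_alloc(). If another thread populated the pmd first, free @new.
 */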
static void async_pte_populate(struct mm_struct *mm, pmd_t *pmd,
			       pgtable_t new)
{
	spinlock_t *ptl;

	/* The same as what __pte_alloc() does */
	smp_wmb();
	ptl = pmd_lock(mm, pmd);
	if (likely(pmd_none(*pmd))) {
		mm_inc_nr_ptes(mm);
		pmd_populate(mm, pmd, new);
		new = NULL;
	}
	spin_unlock(ptl);
	if (new) /* Another has populated it */
		pte_free(mm, new);
}

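/*
 * Mark @pmd for async fork by write-protecting it; the actual pte copy is
 * deferred.
 */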
static inline void async_fork_mkpmd(struct mm_struct *src_mm,
				    unsigned long addr,
				    pmd_t *pmd)
{
	spinlock_t *pmd_ptl;

	if (is_pmd_async_fork(*pmd)) {
		pr_warn("pmd=%08lx is already made async fork\n",
			pmd->pmd);
		return;
	}

	pmd_ptl = pmd_lock(src_mm, pmd);

	/* check again */
	if (is_pmd_async_fork(*pmd))
		goto unlock;

	set_pmd_at(src_mm, addr, pmd, pmd_wrprotect(*pmd));
unlock:
	spin_unlock(pmd_ptl);

	addr = addr & PMD_MASK;
	pr_debug("pmd is made async fork, addr=%lx-%lx pmd=%lx oldmm=%p\n",
		 addr, addr + PMD_SIZE, pmd->pmd, src_mm);
}

static inline void __async_fork_clean_pmd(struct mm_struct *src_mm,
					  unsigned long addr,
					  pmd_t *pmd)
{
	pmd_t tmp = *pmd;

	if (is_pmd_async_fork(*pmd)) {
		tmp = pmd_mkwrite(tmp);
	} else {
		pr_warn("pmd doesn't need async fork cleanup, pmd=%08lx mm=%p",
			tmp.pmd, src_mm);
		return;
	}

	set_pmd_at(src_mm, addr, pmd, tmp);
	pr_debug("pmd clean async fork addr=%lx pmd=%lx mm=%p\n", addr,
		 pmd->pmd, src_mm);
}

static inline void async_fork_clean_pmd(struct mm_struct *src_mm,
					unsigned long addr,
					pmd_t *pmd)
{
	spinlock_t *pmd_ptl;

	pmd_ptl = pmd_lock(src_mm, pmd);
	__async_fork_clean_pmd(src_mm, addr, pmd);
	spin_unlock(pmd_ptl);
}

static inline void async_fork_vma_unbind(struct vm_area_struct *vma)
{
	if (!vma->async_fork_vma)
		return;

	pr_debug("vma unbound, vma=%p vma=%p mm=%p\n", vma,
		 vma->async_fork_vma, vma->vm_mm);

	vma->async_fork_vma->async_fork_vma = NULL;
	vma->async_fork_vma = NULL;
}

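/*
 * Copy the pte entries of @vma in [addr, end) from @src_mm to @dst_mm,
 * following the same pattern as copy_pte_range().
 */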
static int
async_copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
		     pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma,
		     unsigned long addr, unsigned long end)
{
	spinlock_t *src_ptl, *dst_ptl;
	pte_t *orig_src_pte, *orig_dst_pte;
	pte_t *src_pte, *dst_pte;
	int rss[NR_MM_COUNTERS];
	swp_entry_t entry = (swp_entry_t){0};

	init_rss_vec(rss);
	dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, addr, &dst_ptl);
	if (!dst_pte)
		return -ENOMEM;
	src_pte = pte_offset_map(src_pmd, addr);
	src_ptl = pte_lockptr(src_mm, src_pmd);
	spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
	orig_src_pte = src_pte;
	orig_dst_pte = dst_pte;
	arch_enter_lazy_mmu_mode();

	do {
		if (pte_none(*src_pte))
			continue;
		entry.val = copy_one_pte(dst_mm, src_mm, dst_pte, src_pte,
					 vma, addr, rss);
		if (entry.val)
			break;
	} while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end);

	arch_leave_lazy_mmu_mode();
	spin_unlock(src_ptl);
	pte_unmap(orig_src_pte);
	add_mm_rss_vec(dst_mm, rss);
	pte_unmap_unlock(orig_dst_pte, dst_ptl);

	if (entry.val) {
		if (add_swap_count_continuation(entry, GFP_KERNEL) < 0)
			return -ENOMEM;
	}
	return 0;
}

/* The pmd lock must be held when calling this function. Meanwhile, the
 * dst_pmd must already be allocated.
 */
static int
async_copy_pmd_one(struct mm_struct *dst_mm, struct vm_area_struct *src_vma,
		   pmd_t *src_pmd, pmd_t *dst_pmd, unsigned long addr)
{
	unsigned long pmd_start, pmd_end, vma_start, vma_end, start, end;
	struct mm_struct *src_mm = src_vma->vm_mm;
	struct vm_area_struct *tmp;
	int count = 0;

	if (!is_pmd_async_fork(*src_pmd) || !src_vma->async_fork_vma)
		return 0;

	tmp = find_vma(src_vma->vm_mm, addr & PMD_MASK);
	if (!tmp) {
		pr_warn("no vma found for addr:%08lx\n", addr);
		return -EINVAL;
	}

	pmd_start = addr & PMD_MASK;
	pmd_end = pmd_start + PMD_SIZE;

	do {
		vma_start = tmp->vm_start;
		vma_end = tmp->vm_end;

		/* we hit the end of the pmd address range */
		if (pmd_end <= vma_start)
			break;

		/* this vma is not in the range of the current pmd */
		if (!tmp->async_fork_vma || pmd_start >= vma_end)
			goto vma_next;

		start = MAX(vma_start, pmd_start);
		end = MIN(vma_end, pmd_end);

		if (async_copy_pte_range(dst_mm, src_mm, dst_pmd,
					 src_pmd, tmp, start, end)) {
			return -ENOMEM;
		}

		count++;
		pr_debug("pte copied in %lx-%lx mm=%p\n", start, end,
			 dst_mm);
vma_next:
		tmp = tmp->vm_next;
	} while (tmp);

	if (!count)
		pr_warn("no vma found for pmd=%08lx\n", src_pmd->pmd);

	__async_fork_clean_pmd(src_mm, addr, src_pmd);
	return 0;
}

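/*
 * Walk the pmd entries in [addr, end) according to @type. For ITER_FAST
 * the return value is the number of pmds that were handled; a negative
 * value means failure.
 */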
static inline int
async_fork_iter_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
			  pud_t *dst_pud, pud_t *src_pud,
			  struct vm_area_struct *vma, unsigned long addr,
			  unsigned long end,
			  iter_type_t type)
{
	pmd_t *src_pmd, *dst_pmd = NULL;
	bool pmd_alloced = false;
	spinlock_t *pmd_ptl;
	unsigned long next;
	int ret = 0, err;

	src_pmd = pmd_offset(src_pud, addr);

#define check_pmd_alloc() do {						\
	if (!pmd_alloced) {						\
		dst_pmd = pmd_alloc(dst_mm, dst_pud, addr);		\
		if (!dst_pmd)						\
			return -ENOMEM;					\
		pmd_alloced = true;					\
	}								\
} while (0)

	do {
		next = pmd_addr_end(addr, end);

		if (is_swap_pmd(*src_pmd) || pmd_trans_huge(*src_pmd)
		    || pmd_devmap(*src_pmd)) {
			int err;

			if (type != ITER_FAST)
				continue;

			check_pmd_alloc();

			VM_BUG_ON_VMA(next - addr != HPAGE_PMD_SIZE, vma);
			pr_debug("huge pmd copy on %08lx-%08lx, src_pmd=%p dst_pmd=%p\n",
				 addr, next, src_pmd, dst_pmd);
			err = copy_huge_pmd(dst_mm, src_mm,
					    dst_pmd, src_pmd, addr, vma);
			if (err == -ENOMEM)
				return -ENOMEM;
			if (!err)
				continue;
			/* fall through */
		}

		if (pmd_none_or_clear_bad(src_pmd))
			continue;

		/* fast path during fork(), access to src mm and src vma
		 * is safe here.
		 */
		if (type == ITER_FAST) {
			if (is_pmd_async_fork(*src_pmd))
				pr_debug("pmd is already made async fork pmd=%lx\n",
					 src_pmd->pmd);
			else
				async_fork_mkpmd(src_mm, addr, src_pmd);
			ret++;
			continue;
		}

		if (!is_pmd_async_fork(*src_pmd))
			continue;

		if (type == ITER_FALLBACK) {
			async_fork_clean_pmd(src_mm, addr, src_pmd);
			continue;
		}

		/* In this case, the dst vma is not locked and can't be
		 * used directly.
		 */
		if (type == ITER_MADVISE) {
			err = __async_fork_fixup_pmd(vma, src_pmd, addr);
			if (err)
				return err;
			continue;
		}

		/* In the ITER_SLOW path, src vma and dst vma are locked.
		 * Therefore, it is safe to use dst mm and src mm here.
		 */
		check_pmd_alloc();
		if (pte_alloc(dst_mm, dst_pmd))
			return -ENOMEM;
		pmd_ptl = pmd_lock(src_mm, src_pmd);
		err = async_copy_pmd_one(dst_mm, vma, src_pmd, dst_pmd,
					 addr);
		spin_unlock(pmd_ptl);
		if (err)
			return -ENOMEM;
		might_sleep();
	} while (dst_pmd++, src_pmd++, addr = next, addr != end);
	return ret;
}

static inline int
async_fork_iter_pud_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
			  p4d_t *dst_p4d, p4d_t *src_p4d,
			  struct vm_area_struct *vma, unsigned long addr,
			  unsigned long end,
			  iter_type_t type)
{
	pud_t *src_pud, *dst_pud;
	unsigned long next;
	int ret = 0, tmp;

	src_pud = pud_offset(src_p4d, addr);
	switch (type) {
	case ITER_FAST:
		dst_pud = pud_alloc(dst_mm, dst_p4d, addr);
		if (!dst_pud)
			return -ENOMEM;
		break;
	case ITER_SLOW:
		dst_pud = pud_offset(dst_p4d, addr);
		break;
	default:
		dst_pud = NULL;
	}

	do {
		next = pud_addr_end(addr, end);

		if (pud_trans_huge(*src_pud) || pud_devmap(*src_pud)) {
			int err;

			if (type != ITER_FAST)
				continue;

			VM_BUG_ON_VMA(next - addr != HPAGE_PUD_SIZE, vma);
			err = copy_huge_pud(dst_mm, src_mm,
					    dst_pud, src_pud, addr, vma);
			if (err == -ENOMEM)
				return -ENOMEM;
			if (!err)
				continue;
			/* fall through */
		}

		if (pud_none_or_clear_bad(src_pud))
			continue;

		tmp = async_fork_iter_pmd_range(dst_mm, src_mm, dst_pud,
						src_pud, vma, addr, next,
						type);
		if (tmp < 0)
			return -ENOMEM;
		ret += tmp;
	} while (dst_pud++, src_pud++, addr = next, addr != end);
	return ret;
}

static inline int
async_fork_iter_p4d_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
			  pgd_t *dst_pgd, pgd_t *src_pgd,
			  struct vm_area_struct *vma, unsigned long addr,
			  unsigned long end,
			  iter_type_t type)
{
	p4d_t *src_p4d, *dst_p4d;
	unsigned long next;
	int ret = 0, tmp;

	src_p4d = p4d_offset(src_pgd, addr);
	switch (type) {
	case ITER_FAST:
		dst_p4d = p4d_alloc(dst_mm, dst_pgd, addr);
		if (!dst_p4d)
			return -ENOMEM;
		break;
	case ITER_SLOW:
		dst_p4d = p4d_offset(dst_pgd, addr);
		break;
	default:
		dst_p4d = NULL;
	}

	do {
		next = p4d_addr_end(addr, end);
		if (p4d_none_or_clear_bad(src_p4d))
			continue;

		tmp = async_fork_iter_pud_range(dst_mm, src_mm, dst_p4d,
						src_p4d, vma, addr,
						next, type);
		if (tmp < 0)
			return -ENOMEM;
		ret += tmp;
	} while (dst_p4d++, src_p4d++, addr = next, addr != end);
	return ret;
}

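/*
 * Walk the page tables of @src_vma in [addr, end) according to @type. The
 * dst side is only touched in the ITER_FAST and ITER_SLOW modes; for the
 * other modes @dst_mm may be NULL.
 */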
static inline int
__async_fork_iter_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
			     struct vm_area_struct *src_vma,
			     unsigned long addr, unsigned long end,
			     iter_type_t type)
{
	pgd_t *src_pgd, *dst_pgd;
	unsigned long next;
	int ret = 0, tmp;

	switch (type) {
	case ITER_FAST:
	case ITER_SLOW:
		dst_pgd = pgd_offset(dst_mm, addr);
		break;
	default:
		dst_pgd = NULL;
	}

	src_pgd = pgd_offset(src_mm, addr);
	do {
		next = pgd_addr_end(addr, end);
		if (pgd_none_or_clear_bad(src_pgd))
			continue;

		tmp = async_fork_iter_p4d_range(dst_mm, src_mm,
						dst_pgd, src_pgd,
						src_vma,
						addr, next,
						type);
		if (unlikely(tmp < 0))
			return -ENOMEM;
		ret += tmp;
	} while (dst_pgd++, src_pgd++, addr = next, addr != end);

	return ret;
}

static inline int
async_fork_iter_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
			   struct vm_area_struct *src_vma,
			   iter_type_t type)
{
	return __async_fork_iter_page_range(dst_mm, src_mm, src_vma,
					    src_vma->vm_start,
					    src_vma->vm_end,
					    type);
}

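/*
 * Lock @vma and its bound peer. The peer lock is only trylocked; on
 * contention both locks are dropped and the sequence is retried, which
 * avoids a lock-ordering deadlock between the parent and child sides.
 */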
static inline struct vm_area_struct *
async_fork_vma_lock_peer(struct vm_area_struct *vma)
{
	struct vm_area_struct *peer_vma;
retry:
	mutex_lock(async_fork_vma_lock(vma));
	peer_vma = vma->async_fork_vma;
	if (!peer_vma) {
		mutex_unlock(async_fork_vma_lock(vma));
		return NULL;
	}
	if (!mutex_trylock(async_fork_vma_lock(peer_vma))) {
		mutex_unlock(async_fork_vma_lock(vma));
		goto retry;
	}

	return peer_vma;
}

static inline void
async_fork_vma_unlock_peer(struct vm_area_struct *vma,
			   struct vm_area_struct *peer)
{
	mutex_unlock(async_fork_vma_lock(peer));
	mutex_unlock(async_fork_vma_lock(vma));
}

/* The 'oldmm' here belongs to the parent process. This function can be
 * called in either the parent or the child context. If it is called in
 * the child context, the 'oldmm' will be write-locked.
 */
static void __async_fork_fallback(struct mm_struct *oldmm,
				  struct mm_struct *mm)
{
	struct vm_area_struct *mpnt, *child_vma;
	bool do_lock = !!mm;

	pr_warn("async fork fallback, pid=%d comm=%s\n", current->pid,
		current->comm);

	for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) {
		if (!mpnt->async_fork_vma)
			continue;

		/* If the fallback is triggered by the child process, we
		 * need to hold the vma->async_fork_lock, as
		 * mpnt->async_fork_vma can be accessed by the parent
		 * process.
		 */
		if (do_lock)
			child_vma = async_fork_vma_lock_peer(mpnt);
		else
			child_vma = mpnt->async_fork_vma;

		if (!child_vma)
			continue;

		async_fork_iter_page_range(NULL, oldmm, mpnt,
					   ITER_FALLBACK);
		child_vma->async_fork_vma = NULL;
		mpnt->async_fork_vma = NULL;

		if (do_lock)
			async_fork_vma_unlock_peer(mpnt, child_vma);
	}

	spin_lock(&oldmm->async_fork_lock);
	clear_bit(ASYNC_FORK_CANDIDATE, &oldmm->async_fork_flags);
	oldmm->async_fork_mm = NULL;
	spin_unlock(&oldmm->async_fork_lock);
	wake_up_all(&oldmm->async_fork_wait);

	if (!mm)
		return;

	/* oldmm is bound to mm, and we need to unbind them now */
	clear_bit(ASYNC_FORK_PENDING, &mm->async_fork_flags);
	mm->async_fork_mm = NULL;

	mmdrop(oldmm);
	mmdrop(mm);
	static_branch_dec(&async_fork_staging_key);
	pr_debug("fallback done, mm=%p, oldmm=%p\n", mm, oldmm);
}

static inline void async_fork_fallback(struct mm_struct *oldmm,
				       struct mm_struct *mm)
{
	down_write(&oldmm->mmap_sem);
	__async_fork_fallback(oldmm, mm);
	up_write(&oldmm->mmap_sem);
}

static inline void async_fork_vma_bind(struct vm_area_struct *vma1,
				       struct vm_area_struct *vma2)
{
	vma1->async_fork_vma = vma2;
	vma2->async_fork_vma = vma1;

	pr_debug("vma is bound, vma=%p vma=%p mm=%p,%p\n", vma1, vma2,
		 vma1->vm_mm, vma2->vm_mm);
}

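/* Fix up every vma of @mm; a no-op unless @mm is an async fork candidate. */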
void async_fork_fixup_vmas(struct mm_struct *mm)
{
	struct vm_area_struct *vma;

	if (!test_bit(ASYNC_FORK_CANDIDATE, &mm->async_fork_flags))
		return;

	for (vma = mm->mmap; vma; vma = vma->vm_next) {
		async_fork_fixup_vma(vma);
		cond_resched();
	}
}

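/*
 * Called at the beginning of an async fork. Async forks on the same oldmm
 * are serialized: if a previous async fork is still in flight, wait for it
 * to finish before marking oldmm as a candidate again.
 */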
int async_fork_prepare(struct mm_struct *oldmm, struct mm_struct *mm)
{
	DEFINE_WAIT_FUNC(wait, woken_wake_function);
	int ret = 0;

	pr_debug("async fork begin on pid=%d mm=%p oldmm=%p\n",
		 current->pid, mm, oldmm);

	/* It's hard to deal with parallel async forks, as we can't handle
	 * the 'fallback' case. If a previous async fork is still in
	 * progress but the current fork fails and needs to fall back, we
	 * can't tell which vmas and which pmds we should fall back.
	 *
	 * Therefore, let's just serialize the async forks for now.
	 */

	if (!test_and_set_bit(ASYNC_FORK_CANDIDATE,
			      &oldmm->async_fork_flags))
		goto no_prev;

	pr_debug("prev async fork not finished yet, mm=%p\n", oldmm);
	spin_lock_irq(&oldmm->async_fork_lock);
	wait_event_lock_irq(oldmm->async_fork_wait,
		!test_bit(ASYNC_FORK_CANDIDATE, &oldmm->async_fork_flags),
		oldmm->async_fork_lock);

	set_bit(ASYNC_FORK_CANDIDATE, &oldmm->async_fork_flags);
	spin_unlock_irq(&oldmm->async_fork_lock);
no_prev:
	if (ret)
		pr_warn("async fork prepare failed, err=%d\n", ret);
	return ret;
}

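/*
 * Parent-side fast path of async fork: instead of copying the page table
 * of @src_vma, only write-protect its pmd entries and bind the two vmas so
 * the pte copy can be done later.
 */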
int __async_fork_fast(struct mm_struct *dst_mm, struct mm_struct *src_mm,
		      struct vm_area_struct *dst_vma,
		      struct vm_area_struct *src_vma)
{
	int err;

	pr_debug("parent async fork fast begin\n");

	pr_debug("fast path vma=%p mm=%p addr: %08lx-%08lx\n", src_vma,
		 dst_mm, src_vma->vm_start, src_vma->vm_end);
	err = async_fork_iter_page_range(dst_mm, src_mm, src_vma, ITER_FAST);

	if (err > 0)
		async_fork_vma_bind(dst_vma, src_vma);

	pr_debug("parent async fork fast end, mm=%p err=%d\n", dst_mm,
		 err);
	return err < 0 ? err : 0;
}

/* This function is called when copy_process() has finished. The 'mm' here
 * belongs to the child process.
 */
void async_fork_fast_done(struct mm_struct *mm, int err)
{
	if (!mm || !mm->async_fork_mm)
		return;

	if (err) {
		pr_warn("async fork fallback in cpr done, err=%d\n", err);
		async_fork_fallback(mm->async_fork_mm, mm);
		return;
	}
	pr_debug("async fork fast success, mm=%p, oldmm=%p\n", mm,
		 mm->async_fork_mm);
}

/* This function is called when the vma copy has finished in the fast path,
 * and only if an async fork happens.
 *
 * It is called with oldmm->mmap_sem write-locked. If there was any pending
 * async fork on the oldmm before, it has already finished handling all the
 * vmas by the time this function runs.
 */
void async_fork_mm_bind(struct mm_struct *oldmm, struct mm_struct *mm,
			int err)
{
	if (err) {
		pr_warn("async fork fallback in cpr bind, err=%d\n", err);
		__async_fork_fallback(oldmm, NULL);
		return;
	}

	WARN_ON(oldmm->async_fork_mm);

	/* it's safe to change 'oldmm->async_fork_mm' here */
	oldmm->async_fork_mm = mm;
	mm->async_fork_mm = oldmm;

	/* We have to activate the async fork now, as a page fault can
	 * happen during copy_process(). Once that happens, we need to be
	 * able to clear the write-protect flag.
	 */
	set_bit(ASYNC_FORK_PENDING, &mm->async_fork_flags);

	static_branch_inc(&async_fork_staging_key);
	mmgrab(oldmm);
	mmgrab(mm);

	pr_debug("async fork mm bound, mm=%p, oldmm=%p, pid=%d, cpid=%d\n",
		 mm, oldmm, current->pid, mm->owner->pid);
}

/* The rest of the async fork has finished. This function should be called
 * in the child context.
 */
void async_fork_rest_success(struct mm_struct *oldmm,
			     struct mm_struct *mm)
{
	WARN(!test_and_clear_bit(ASYNC_FORK_PENDING,
				 &mm->async_fork_flags),
	     "pending flags not found! pid=%d\n", current->pid);

	clear_bit(ASYNC_FORK_PENDING, &mm->async_fork_flags);
	mm->async_fork_mm = NULL;

	spin_lock(&oldmm->async_fork_lock);
	clear_bit(ASYNC_FORK_CANDIDATE, &oldmm->async_fork_flags);
	oldmm->async_fork_mm = NULL;
	spin_unlock(&oldmm->async_fork_lock);

	wake_up_all(&oldmm->async_fork_wait);

	mmdrop(oldmm);
	mmdrop(mm);
	static_branch_dec(&async_fork_staging_key);

	pr_debug("child async fork success, mm=%p, oldmm=%p\n", mm,
		 oldmm);
}

/* Copy the rest of the page tables. */
void async_fork_rest(struct mm_struct *mm)
{
	struct vm_area_struct *mpnt, *src;
	struct mm_struct *oldmm;
	int err = 0;

	if (!is_async_fork_pending(mm))
		return;

	pr_debug("child async fork begin mm=%p\n", mm);

	down_write(&mm->mmap_sem);
	oldmm = mm->async_fork_mm;

	for (mpnt = mm->mmap; mpnt; mpnt = mpnt->vm_next) {
		if (is_async_fork_fallback(mm))
			goto do_fallback;

		err = 0;
		src = async_fork_vma_lock_peer(mpnt);
		if (!src)
			continue;

		err = async_fork_iter_page_range(mm, oldmm, src,
						 ITER_SLOW);
		async_fork_vma_unbind(mpnt);
		async_fork_vma_unlock_peer(mpnt, src);
		if (err < 0) {
			async_fork_set_flags(mm, ASYNC_FORK_FALLBACK);
			goto do_fallback;
		}
	}

	if (is_async_fork_fallback(mm))
		goto do_fallback;

	async_fork_rest_success(oldmm, mm);
	up_write(&mm->mmap_sem);
	return;

do_fallback:
	pr_warn("async fork fallback in cpr rest\n");
	async_fork_fallback(oldmm, mm);
	kill_pid(task_pid(current), SIGSEGV, 1);
	up_write(&mm->mmap_sem);
}

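/*
 * Copy a single async-fork pmd to the child immediately, allocating the
 * child's pmd/pte pages first when they are missing. On failure the whole
 * async fork falls back.
 */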
int __async_fork_fixup_pmd(struct vm_area_struct *vma, pmd_t *pmd,
			   unsigned long addr)
{
	bool need_pmd = false, need_pte = false;
	struct mm_struct *dst_mm, *src_mm;
	spinlock_t *pmd_ptl;
	pgtable_t new_pte;
	pgd_t *dst_pgd;
	p4d_t *dst_p4d;
	pud_t *dst_pud;
	pmd_t *dst_pmd;

	src_mm = vma->vm_mm;
	pmd_ptl = pmd_lock(src_mm, pmd);
	if (!is_vma_async_fork(vma) || !is_pmd_async_fork(*pmd)) {
		spin_unlock(pmd_ptl);
		return 0;
	}

	dst_mm = vma->async_fork_vma->vm_mm;
	if (is_async_fork_fallback(dst_mm))
		goto fallback;

	dst_pgd = pgd_offset(dst_mm, addr);
	WARN_ON(pgd_none(*dst_pgd));

	dst_p4d = p4d_offset(dst_pgd, addr);
	WARN_ON(p4d_none(*dst_p4d));

	dst_pud = pud_offset(dst_p4d, addr);

	if (pud_none(*dst_pud)) {
		need_pmd = true;
		need_pte = true;
	} else {
		dst_pmd = pmd_offset(dst_pud, addr);
		if (pmd_none(*dst_pmd))
			need_pte = true;
	}

	if (!need_pmd && !need_pte)
		goto do_copy;

	/* alloc pmd/pte without the spin lock held */
	spin_unlock(pmd_ptl);
	if (unlikely(need_pmd)) {
		dst_pmd = pmd_alloc_one(dst_mm, addr);
		if (!dst_pmd) {
			/* the fallback path expects pmd_ptl to be held */
			spin_lock(pmd_ptl);
			goto fallback;
		}
	}
	if (need_pte) {
		new_pte = pte_alloc_one(dst_mm);
		if (!new_pte) {
			if (need_pmd)
				pmd_free(dst_mm, dst_pmd);
			/* the fallback path expects pmd_ptl to be held */
			spin_lock(pmd_ptl);
			goto fallback;
		}
	}

	spin_lock(pmd_ptl);
	if (!is_vma_async_fork(vma) || !is_pmd_async_fork(*pmd)) {
		spin_unlock(pmd_ptl);
		if (need_pmd)
			pmd_free(dst_mm, dst_pmd);
		if (need_pte)
			pte_free(dst_mm, new_pte);
		return 0;
	}

	/* populate pmd and pte with pmd_ptl held */
	if (need_pmd) {
		async_pmd_populate(dst_mm, dst_pud, addr, dst_pmd);
		dst_pmd = pmd_offset(dst_pud, addr);
	}

	if (need_pte)
		async_pte_populate(dst_mm, dst_pmd, new_pte);
do_copy:
	if (async_copy_pmd_one(dst_mm, vma, pmd, dst_pmd, addr))
		goto fallback;
	spin_unlock(pmd_ptl);
	return 0;

fallback:
	__async_fork_clean_pmd(src_mm, addr, pmd);
	async_fork_set_flags(dst_mm, ASYNC_FORK_FALLBACK);
	spin_unlock(pmd_ptl);
	pr_warn("async fork fallback in pmd fix, oldmm=%p\n", src_mm);

	return -ENOMEM;
}

/* The mmap_sem of mpnt->vm_mm must be held write-locked. */
void __async_fork_fixup_vma(struct vm_area_struct *mpnt)
{
	struct vm_area_struct *dst_vma;
	int err = 0;

	dst_vma = async_fork_vma_lock_peer(mpnt);
	if (!dst_vma)
		return;

	pr_debug("async fork fixup vma, oldmm=%p mm=%p vma=%08lx-%08lx\n",
		 mpnt->vm_mm, dst_vma->vm_mm, mpnt->vm_start,
		 mpnt->vm_end);

	if (is_async_fork_fallback(dst_vma->vm_mm))
		goto fallback;

	err = async_fork_iter_page_range(dst_vma->vm_mm, mpnt->vm_mm,
					 mpnt, ITER_SLOW) < 0;
	if (err) {
		async_fork_set_flags(dst_vma->vm_mm, ASYNC_FORK_FALLBACK);
		goto fallback;
	}

unlock:
	async_fork_vma_unbind(mpnt);
	async_fork_vma_unlock_peer(mpnt, dst_vma);

	pr_debug("async fork fixup vma finished\n");
	return;

fallback:
	pr_warn("async fork fallback in vma fix\n");
	async_fork_iter_page_range(dst_vma->vm_mm, mpnt->vm_mm,
				   mpnt, ITER_FALLBACK);
	goto unlock;
}

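/*
 * Fix up the async-fork pmds of @mpnt in [start, end) from the madvise
 * path. The dst side cannot be used directly here, so the walk runs in
 * ITER_MADVISE mode with a NULL dst_mm.
 */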
void __async_fork_madvise_vma(struct vm_area_struct *mpnt,
			      unsigned long start,
			      unsigned long end)
{
	pr_debug("async fork madvise, oldmm=%p vma=%08lx-%08lx\n",
		 mpnt->vm_mm, mpnt->vm_start, mpnt->vm_end);

	/* The mpnt will be used with the pmd lock held, so we can make
	 * sure that dst_vma is not freed.
	 */
	__async_fork_iter_page_range(NULL, mpnt->vm_mm, mpnt, start,
				     end, ITER_MADVISE);

	pr_debug("async fork madvise finished\n");
}