OpenCloudOS-Kernel/mm/async_fork.c

// SPDX-License-Identifier: GPL-2.0
#include <linux/mm.h>
#include <linux/mm_inline.h>
#include <linux/sched/mm.h>
#include <linux/wait.h>
#include <linux/sched/signal.h>
#include <linux/mm_pte.h>
#include <asm/pgalloc.h>
#define MIN(x, y) ((x) < (y) ? (x) : (y))
#define MAX(x, y) ((x) > (y) ? (x) : (y))
DEFINE_STATIC_KEY_FALSE(async_fork_enabled_key);
DEFINE_STATIC_KEY_FALSE(async_fork_staging_key);
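/* Page-table iteration modes used by the async fork walkers:
*
* ITER_FAST     - fork() fast path, runs in the parent: allocate the
*                 destination page tables down to pmd level and
*                 write-protect the source pmds instead of copying ptes.
* ITER_SLOW     - deferred copy, runs in the child with both vmas locked:
*                 copy the pte ranges of the marked pmds and clear the
*                 markers.
* ITER_FALLBACK - abandon the async fork: clear the markers on the source
*                 pmds without copying anything.
* ITER_MADVISE  - hand marked pmds to __async_fork_fixup_pmd(), used when
*                 the destination vma cannot be used directly.
*/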
typedef enum {
ITER_FAST,
ITER_SLOW,
ITER_FALLBACK,
ITER_MADVISE,
} iter_type_t;
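/* Install a freshly allocated pmd page into @pud, mirroring what
* __pmd_alloc() does. If another thread populated the pud first, the new
* pmd page is freed instead.
*/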
static void async_pmd_populate(struct mm_struct *mm, pud_t *pud,
unsigned long address,
pmd_t *new)
{
spinlock_t *ptl;
/* The same as what __pmd_alloc() does */
smp_wmb();
ptl = pud_lock(mm, pud);
#ifndef __ARCH_HAS_4LEVEL_HACK
if (!pud_present(*pud)) {
mm_inc_nr_pmds(mm);
pud_populate(mm, pud, new);
} else /* Another has populated it */
pmd_free(mm, new);
#else
if (!pgd_present(*pud)) {
mm_inc_nr_pmds(mm);
pgd_populate(mm, pud, new);
} else /* Another has populated it */
pmd_free(mm, new);
#endif
spin_unlock(ptl);
}
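/* Install a freshly allocated pte page into @pmd, mirroring what
* __pte_alloc() does. If the pmd was populated by someone else in the
* meantime, the new pte page is freed.
*/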
static void async_pte_populate(struct mm_struct *mm, pmd_t *pmd,
pgtable_t new)
{
spinlock_t *ptl;
/* The same as what __pte_alloc() does */
smp_wmb();
ptl = pmd_lock(mm, pmd);
if (likely(pmd_none(*pmd))) {
mm_inc_nr_ptes(mm);
pmd_populate(mm, pmd, new);
new = NULL;
}
spin_unlock(ptl);
/* the pmd was populated concurrently; drop the unused pte page */
if (new)
pte_free(mm, new);
}
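/* Mark @pmd for async fork by write-protecting it in the parent's page
* table. The deferred copy (or a later fixup) detects this marker and
* copies the pte range before the pmd is made writable again.
*/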
static inline void async_fork_mkpmd(struct mm_struct *src_mm,
unsigned long addr,
pmd_t *pmd)
{
spinlock_t *pmd_ptl;
if (is_pmd_async_fork(*pmd)) {
pr_warn("pmd=%08lx is already made async fork\n",
pmd->pmd);
return;
}
pmd_ptl = pmd_lock(src_mm, pmd);
/* check again */
if (is_pmd_async_fork(*pmd))
goto unlock;
set_pmd_at(src_mm, addr, pmd, pmd_wrprotect(*pmd));
unlock:
spin_unlock(pmd_ptl);
addr = addr & PMD_MASK;
pr_debug("pmd is made async fork, addr=%lx-%lx pmd=%lx oldmm=%p\n",
addr, addr + PMD_SIZE, pmd->pmd, src_mm);
}
static inline void __async_fork_clean_pmd(struct mm_struct *src_mm,
unsigned long addr,
pmd_t *pmd)
{
pmd_t tmp = *pmd;
if (is_pmd_async_fork(*pmd)) {
tmp = pmd_mkwrite(tmp);
} else {
pr_warn("pmd don't need to clean async fork, pmd=%08lx mm=%p",
tmp.pmd, src_mm);
return;
}
set_pmd_at(src_mm, addr, pmd, tmp);
pr_debug("pmd clean async fork addr=%lx pmd=%lx mm=%p\n", addr,
pmd->pmd, src_mm);
}
static inline void async_fork_clean_pmd(struct mm_struct *src_mm,
unsigned long addr,
pmd_t *pmd)
{
spinlock_t *pmd_ptl;
pmd_ptl = pmd_lock(src_mm, pmd);
__async_fork_clean_pmd(src_mm, addr, pmd);
spin_unlock(pmd_ptl);
}
static inline void async_fork_vma_unbind(struct vm_area_struct *vma)
{
if (!vma->async_fork_vma)
return;
pr_debug("vma unbinded, vma=%p vma=%p mm=%p\n", vma,
vma->async_fork_vma, vma->vm_mm);
vma->async_fork_vma->async_fork_vma = NULL;
vma->async_fork_vma = NULL;
}
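/* Copy the ptes covering [addr, end) from @src_pmd to @dst_pmd, much like
* copy_pte_range() in the regular fork path: copy_one_pte() handles each
* present entry and the rss deltas are accounted to @dst_mm.
*/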
static int
async_copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma,
unsigned long addr, unsigned long end)
{
spinlock_t *src_ptl, *dst_ptl;
pte_t *orig_src_pte, *orig_dst_pte;
pte_t *src_pte, *dst_pte;
int rss[NR_MM_COUNTERS];
swp_entry_t entry = (swp_entry_t){0};
init_rss_vec(rss);
dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, addr, &dst_ptl);
if (!dst_pte)
return -ENOMEM;
src_pte = pte_offset_map(src_pmd, addr);
src_ptl = pte_lockptr(src_mm, src_pmd);
spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
orig_src_pte = src_pte;
orig_dst_pte = dst_pte;
arch_enter_lazy_mmu_mode();
do {
if (pte_none(*src_pte))
continue;
entry.val = copy_one_pte(dst_mm, src_mm, dst_pte, src_pte,
vma, addr, rss);
if (entry.val)
break;
} while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end);
arch_leave_lazy_mmu_mode();
spin_unlock(src_ptl);
pte_unmap(orig_src_pte);
add_mm_rss_vec(dst_mm, rss);
pte_unmap_unlock(orig_dst_pte, dst_ptl);
if (entry.val) {
if (add_swap_count_continuation(entry, GFP_KERNEL) < 0)
return -ENOMEM;
}
return 0;
}
/* The pmd lock must be held when calling this function, and dst_pmd
* must already be allocated.
*/
static int
async_copy_pmd_one(struct mm_struct *dst_mm, struct vm_area_struct *src_vma,
pmd_t *src_pmd, pmd_t *dst_pmd, unsigned long addr)
{
unsigned long pmd_start, pmd_end, vma_start, vma_end, start, end;
struct mm_struct *src_mm = src_vma->vm_mm;
struct vm_area_struct *tmp;
int count = 0;
if (!is_pmd_async_fork(*src_pmd) || !src_vma->async_fork_vma)
return 0;
tmp = find_vma(src_vma->vm_mm, addr & PMD_MASK);
if (!tmp) {
pr_warn("vma not founded for addr:%08lx\n", addr);
return -EINVAL;
}
pmd_start = addr & PMD_MASK;
pmd_end = pmd_start + PMD_SIZE;
do {
vma_start = tmp->vm_start;
vma_end = tmp->vm_end;
/* we hit the end of the pmd address range */
if (pmd_end <= vma_start)
break;
/* this vma is not in the range of current pmd */
if (!tmp->async_fork_vma || pmd_start >= vma_end)
goto vma_next;
start = MAX(vma_start, pmd_start);
end = MIN(vma_end, pmd_end);
if (async_copy_pte_range(dst_mm, src_mm, dst_pmd,
src_pmd, tmp, start, end)) {
return -ENOMEM;
}
count++;
pr_debug("pte copied in %lx-%lx mm=%p\n", start, end,
dst_mm);
vma_next:
tmp = tmp->vm_next;
} while (tmp);
if (!count)
pr_warn("no vma found for pmd=%08lx\n", src_pmd->pmd);
__async_fork_clean_pmd(src_mm, addr, src_pmd);
return 0;
}
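/* Walk the pmd entries of one pud range and apply the requested iteration
* mode: mark pmds in ITER_FAST, copy and unmark them in ITER_SLOW, clear
* the markers in ITER_FALLBACK, or hand marked pmds to
* __async_fork_fixup_pmd() in ITER_MADVISE. Returns the number of pmds
* marked in ITER_FAST (zero otherwise), or a negative error.
*/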
static inline int
async_fork_iter_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
pud_t *dst_pud, pud_t *src_pud,
struct vm_area_struct *vma, unsigned long addr,
unsigned long end,
iter_type_t type)
{
pmd_t *src_pmd, *dst_pmd = NULL;
bool pmd_alloced = false;
spinlock_t *pmd_ptl;
unsigned long next;
int ret = 0, err;
src_pmd = pmd_offset(src_pud, addr);
/* Allocate the destination pmd lazily; only the modes that reach this
* point actually need it.
*/
#define check_pmd_alloc() do { \
if (!pmd_alloced) { \
dst_pmd = pmd_alloc(dst_mm, dst_pud, addr); \
if (!dst_pmd) \
return -ENOMEM; \
pmd_alloced = true; \
} \
} while (0)
do {
next = pmd_addr_end(addr, end);
if (is_swap_pmd(*src_pmd) || pmd_trans_huge(*src_pmd)
|| pmd_devmap(*src_pmd)) {
int err;
if (type != ITER_FAST)
continue;
check_pmd_alloc();
VM_BUG_ON_VMA(next-addr != HPAGE_PMD_SIZE, vma);
pr_debug("huge pmd copy on %08lx-%08lx, src_pmd=%p dst_pmd=%p\n",
addr, next, src_pmd, dst_pmd);
err = copy_huge_pmd(dst_mm, src_mm,
dst_pmd, src_pmd, addr, vma);
if (err == -ENOMEM)
return -ENOMEM;
if (!err)
continue;
/* fall through */
}
if (pmd_none_or_clear_bad(src_pmd))
continue;
/* fast path during fork(), access to src mm and src vma
* is safe here.
*/
if (type == ITER_FAST) {
if (is_pmd_async_fork(*src_pmd))
pr_debug("pmd is already made async fork pmd=%lx\n",
src_pmd->pmd);
else
async_fork_mkpmd(src_mm, addr, src_pmd);
ret++;
continue;
}
if (!is_pmd_async_fork(*src_pmd))
continue;
if (type == ITER_FALLBACK) {
async_fork_clean_pmd(src_mm, addr, src_pmd);
continue;
}
/* In this case, the dst vma is not locked and can't be
* used directly.
*/
if (type == ITER_MADVISE) {
err = __async_fork_fixup_pmd(vma, src_pmd, addr);
if (err)
return err;
continue;
}
/* In the ITER_SLOW path, both the src and dst vma are locked.
* Therefore, it is safe to use dst mm and src mm here.
*/
check_pmd_alloc();
if (pte_alloc(dst_mm, dst_pmd))
return -ENOMEM;
pmd_ptl = pmd_lock(src_mm, src_pmd);
err = async_copy_pmd_one(dst_mm, vma, src_pmd, dst_pmd,
addr);
spin_unlock(pmd_ptl);
if (err)
return -ENOMEM;
might_sleep();
} while (dst_pmd++, src_pmd++, addr = next, addr != end);
return ret;
}
static inline int
async_fork_iter_pud_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
p4d_t *dst_p4d, p4d_t *src_p4d,
struct vm_area_struct *vma, unsigned long addr,
unsigned long end,
iter_type_t type)
{
pud_t *src_pud, *dst_pud;
unsigned long next;
int ret = 0, tmp;
src_pud = pud_offset(src_p4d, addr);
switch (type) {
case ITER_FAST:
dst_pud = pud_alloc(dst_mm, dst_p4d, addr);
if (!dst_pud)
return -ENOMEM;
break;
case ITER_SLOW:
dst_pud = pud_offset(dst_p4d, addr);
break;
default:
dst_pud = NULL;
}
do {
next = pud_addr_end(addr, end);
if (pud_trans_huge(*src_pud) || pud_devmap(*src_pud)) {
int err;
if (type != ITER_FAST)
continue;
VM_BUG_ON_VMA(next-addr != HPAGE_PUD_SIZE, vma);
err = copy_huge_pud(dst_mm, src_mm,
dst_pud, src_pud, addr, vma);
if (err == -ENOMEM)
return -ENOMEM;
if (!err)
continue;
/* fall through */
}
if (pud_none_or_clear_bad(src_pud))
continue;
tmp = async_fork_iter_pmd_range(dst_mm, src_mm, dst_pud,
src_pud, vma, addr, next,
type);
if (tmp < 0)
return -ENOMEM;
ret += tmp;
} while (dst_pud++, src_pud++, addr = next, addr != end);
return ret;
}
static inline int
async_fork_iter_p4d_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
pgd_t *dst_pgd, pgd_t *src_pgd,
struct vm_area_struct *vma, unsigned long addr,
unsigned long end,
iter_type_t type)
{
p4d_t *src_p4d, *dst_p4d;
unsigned long next;
int ret = 0, tmp;
src_p4d = p4d_offset(src_pgd, addr);
switch (type) {
case ITER_FAST:
dst_p4d = p4d_alloc(dst_mm, dst_pgd, addr);
if (!dst_p4d)
return -ENOMEM;
break;
case ITER_SLOW:
dst_p4d = p4d_offset(dst_pgd, addr);
break;
default:
dst_p4d = NULL;
}
do {
next = p4d_addr_end(addr, end);
if (p4d_none_or_clear_bad(src_p4d))
continue;
tmp = async_fork_iter_pud_range(dst_mm, src_mm, dst_p4d,
src_p4d, vma, addr,
next, type);
if (tmp < 0)
return -ENOMEM;
ret += tmp;
} while (dst_p4d++, src_p4d++, addr = next, addr != end);
return ret;
}
static inline int
__async_fork_iter_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
struct vm_area_struct *src_vma,
unsigned long addr, unsigned long end,
iter_type_t type)
{
pgd_t *src_pgd, *dst_pgd;
unsigned long next;
int ret = 0, tmp;
switch (type) {
case ITER_FAST:
case ITER_SLOW:
dst_pgd = pgd_offset(dst_mm, addr);
break;
default:
dst_pgd = NULL;
}
src_pgd = pgd_offset(src_mm, addr);
do {
next = pgd_addr_end(addr, end);
if (pgd_none_or_clear_bad(src_pgd))
continue;
tmp = async_fork_iter_p4d_range(dst_mm, src_mm,
dst_pgd, src_pgd,
src_vma,
addr, next,
type);
if (unlikely(tmp < 0))
return -ENOMEM;
ret += tmp;
} while (dst_pgd++, src_pgd++, addr = next, addr != end);
return ret;
}
static inline int
async_fork_iter_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
struct vm_area_struct *src_vma,
iter_type_t type)
{
return __async_fork_iter_page_range(dst_mm, src_mm, src_vma,
src_vma->vm_start,
src_vma->vm_end,
type);
}
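/* Lock a vma together with its async fork peer. The peer lock is only
* trylocked; on contention both locks are dropped and the sequence is
* retried, which avoids an ABBA deadlock between the parent and child
* sides. Returns the peer with both locks held, or NULL (nothing held)
* if the vma is no longer bound.
*/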
static inline struct vm_area_struct *
async_fork_vma_lock_peer(struct vm_area_struct *vma)
{
struct vm_area_struct *peer_vma;
retry:
mutex_lock(async_fork_vma_lock(vma));
peer_vma = vma->async_fork_vma;
if (!peer_vma) {
mutex_unlock(async_fork_vma_lock(vma));
return NULL;
}
if (!mutex_trylock(async_fork_vma_lock(peer_vma))) {
mutex_unlock(async_fork_vma_lock(vma));
goto retry;
}
return peer_vma;
}
static inline void
async_fork_vma_unlock_peer(struct vm_area_struct *vma,
struct vm_area_struct *peer)
{
mutex_unlock(async_fork_vma_lock(peer));
mutex_unlock(async_fork_vma_lock(vma));
}
/* The 'oldmm' here belongs to the parent process. This function can be
* called in either parent or child context. If it is called in child
* context, 'oldmm' will have been write-locked.
*/
static void __async_fork_fallback(struct mm_struct *oldmm,
struct mm_struct *mm)
{
struct vm_area_struct *mpnt, *child_vma;
bool do_lock = !!mm;
pr_warn("async fork is fallback, pid=%d comm=%s\n", current->pid,
current->comm);
for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) {
if (!mpnt->async_fork_vma)
continue;
/* If the fallback is triggered by the child process, we need to
* hold the vma->async_fork_lock, as mpnt->async_fork_vma
* can be accessed by the parent process.
*/
if (do_lock)
child_vma = async_fork_vma_lock_peer(mpnt);
else
child_vma = mpnt->async_fork_vma;
if (!child_vma)
continue;
async_fork_iter_page_range(NULL, oldmm, mpnt,
ITER_FALLBACK);
child_vma->async_fork_vma = NULL;
mpnt->async_fork_vma = NULL;
if (do_lock)
async_fork_vma_unlock_peer(mpnt, child_vma);
}
spin_lock(&oldmm->async_fork_lock);
clear_bit(ASYNC_FORK_CANDIDATE, &oldmm->async_fork_flags);
oldmm->async_fork_mm = NULL;
spin_unlock(&oldmm->async_fork_lock);
wake_up_all(&oldmm->async_fork_wait);
if (!mm)
return;
/* oldmm is bound to mm; unbind them now */
clear_bit(ASYNC_FORK_PENDING, &mm->async_fork_flags);
mm->async_fork_mm = NULL;
mmdrop(oldmm);
mmdrop(mm);
static_branch_dec(&async_fork_staging_key);
pr_debug("fallback done, mm=%p, oldmm=%p\n", mm, oldmm);
}
static inline void async_fork_fallback(struct mm_struct *oldmm,
struct mm_struct *mm)
{
down_write(&oldmm->mmap_sem);
__async_fork_fallback(oldmm, mm);
up_write(&oldmm->mmap_sem);
}
static inline void async_fork_vma_bind(struct vm_area_struct *vma1,
struct vm_area_struct *vma2)
{
vma1->async_fork_vma = vma2;
vma2->async_fork_vma = vma1;
pr_debug("vma is binded, vma=%p vma=%p mm=%p,%p\n", vma1, vma2,
vma1->vm_mm, vma2->vm_mm);
}
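/* Run async_fork_fixup_vma() on every vma of @mm if the mm still has an
* async fork in flight (ASYNC_FORK_CANDIDATE set).
*/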
void async_fork_fixup_vmas(struct mm_struct *mm)
{
struct vm_area_struct *vma;
if (!test_bit(ASYNC_FORK_CANDIDATE, &mm->async_fork_flags))
return;
for (vma = mm->mmap; vma; vma = vma->vm_next) {
async_fork_fixup_vma(vma);
cond_resched();
}
}
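/* Called at the start of fork(): serialize with any async fork that is
* still in flight on @oldmm, so that only one async fork per mm is
* pending at a time (see the comment below).
*/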
int async_fork_prepare(struct mm_struct *oldmm, struct mm_struct *mm)
{
DEFINE_WAIT_FUNC(wait, woken_wake_function);
int ret = 0;
pr_debug("async fork begin on pid=%d mm=%p oldmm=%p\n",
current->pid, mm, oldmm);
/* It's hard to deal with parallel async forks, as we can't handle
* the 'fallback' case: if a previous async fork is still in progress
* and the current fork fails and needs to fall back, we can't tell
* which vmas and pmds have to be rolled back.
*
* Therefore, just serialize the async forks for now.
*/
if (!test_and_set_bit(ASYNC_FORK_CANDIDATE,
&oldmm->async_fork_flags))
goto no_prev;
pr_debug("prev async fork not finished yet,mm=%p\n", oldmm);
spin_lock_irq(&oldmm->async_fork_lock);
wait_event_lock_irq(oldmm->async_fork_wait,
!test_bit(ASYNC_FORK_CANDIDATE, &oldmm->async_fork_flags),
oldmm->async_fork_lock);
set_bit(ASYNC_FORK_CANDIDATE, &oldmm->async_fork_flags);
spin_unlock_irq(&oldmm->async_fork_lock);
no_prev:
if (ret)
pr_warn("async fork prepare failed, err=%d\n", ret);
return ret;
}
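/* Fast path, called from the parent while fork() copies @src_vma: walk
* the vma with ITER_FAST to write-protect its pmds and, if at least one
* pmd was marked, bind @dst_vma and @src_vma so that the child can do the
* real pte copy later.
*/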
int __async_fork_fast(struct mm_struct *dst_mm, struct mm_struct *src_mm,
struct vm_area_struct *dst_vma,
struct vm_area_struct *src_vma)
{
int err;
pr_debug("parent async fork fast begin\n");
pr_debug("fast path vma=%p mm=%p addr: %08lx-%08lx\n", src_vma,
dst_mm, src_vma->vm_start, src_vma->vm_end);
err = async_fork_iter_page_range(dst_mm, src_mm, src_vma, ITER_FAST);
if (err > 0)
async_fork_vma_bind(dst_vma, src_vma);
pr_debug("parent async fork fast end, mm=%p err=%d\n", dst_mm,
err);
return err < 0 ? err : 0;
}
/* This function is called when copy_process() has finished. The 'mm'
* here belongs to the child process.
*/
void async_fork_fast_done(struct mm_struct *mm, int err)
{
if (!mm || !mm->async_fork_mm)
return;
if (err) {
pr_warn("async fork fallback in cpr done, err=%d\n", err);
async_fork_fallback(mm->async_fork_mm, mm);
return;
}
pr_debug("async fork fast success, mm=%p, oldmm=%p\n", mm,
mm->async_fork_mm);
}
/* This function is called when the vma copy has finished in the fast
* path, and only if an async fork actually happens.
*
* It is called with oldmm->mmap_sem write-locked. If there was any
* pending async fork on oldmm before, it has always finished handling
* all of its vmas by the time this function runs.
*/
void async_fork_mm_bind(struct mm_struct *oldmm, struct mm_struct *mm,
int err)
{
if (err) {
pr_warn("async fork fallback in cpr bind, err=%d\n", err);
__async_fork_fallback(oldmm, NULL);
return;
}
WARN_ON(oldmm->async_fork_mm);
/* it's safe to change 'oldmm->async_fork_mm' here */
oldmm->async_fork_mm = mm;
mm->async_fork_mm = oldmm;
/* We have to activate the async fork now, as page faults can happen
* during copy_process(). Once that happens, we need to be able to
* clear the write-protect flag.
*/
set_bit(ASYNC_FORK_PENDING, &mm->async_fork_flags);
static_branch_inc(&async_fork_staging_key);
mmgrab(oldmm);
mmgrab(mm);
pr_debug("async fork mm binded, mm=%p, oldmm=%p, pid=%d, cpid=%d\n",
mm, oldmm, current->pid, mm->owner->pid);
}
/* The rest of the async fork has finished. This function should be called
* in the child context.
*/
void async_fork_rest_success(struct mm_struct *oldmm,
struct mm_struct *mm)
{
WARN(!test_and_clear_bit(ASYNC_FORK_PENDING,
&mm->async_fork_flags),
"pending flag not found! pid=%d\n", current->pid);
mm->async_fork_mm = NULL;
spin_lock(&oldmm->async_fork_lock);
clear_bit(ASYNC_FORK_CANDIDATE, &oldmm->async_fork_flags);
oldmm->async_fork_mm = NULL;
spin_unlock(&oldmm->async_fork_lock);
wake_up_all(&oldmm->async_fork_wait);
mmdrop(oldmm);
mmdrop(mm);
static_branch_dec(&async_fork_staging_key);
pr_debug("child async fork success, mm=%p, oldmm=%p\n", mm,
oldmm);
}
/* Copy the rest of the page tables. */
void async_fork_rest(struct mm_struct *mm)
{
struct vm_area_struct *mpnt, *src;
struct mm_struct *oldmm;
int err = 0;
if (!is_async_fork_pending(mm))
return;
pr_debug("child async fork begin mm=%p\n", mm);
down_write(&mm->mmap_sem);
oldmm = mm->async_fork_mm;
for (mpnt = mm->mmap; mpnt; mpnt = mpnt->vm_next) {
if (is_async_fork_fallback(mm))
goto do_fallback;
err = 0;
src = async_fork_vma_lock_peer(mpnt);
if (!src)
continue;
err = async_fork_iter_page_range(mm, oldmm, src,
ITER_SLOW);
async_fork_vma_unbind(mpnt);
async_fork_vma_unlock_peer(mpnt, src);
if (err < 0) {
async_fork_set_flags(mm, ASYNC_FORK_FALLBACK);
goto do_fallback;
}
}
if (is_async_fork_fallback(mm))
goto do_fallback;
async_fork_rest_success(oldmm, mm);
up_write(&mm->mmap_sem);
return;
do_fallback:
pr_warn("async fork fallback in cpr rest\n");
async_fork_fallback(oldmm, mm);
kill_pid(task_pid(current), SIGSEGV, 1);
up_write(&mm->mmap_sem);
}
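/* Fix up one pmd that is still marked for async fork (e.g. from the
* madvise path, see __async_fork_madvise_vma()): allocate the missing
* pmd/pte pages on the child side if necessary, copy the pte range via
* async_copy_pmd_one() and clear the marker. On allocation failure the
* child mm is flagged for fallback and the marker is cleared without
* copying the ptes.
*/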
int __async_fork_fixup_pmd(struct vm_area_struct *vma, pmd_t *pmd,
unsigned long addr)
{
bool need_pmd = false, need_pte = false;
struct mm_struct *dst_mm, *src_mm;
spinlock_t *pmd_ptl;
pgtable_t new_pte;
pgd_t *dst_pgd;
p4d_t *dst_p4d;
pud_t *dst_pud;
pmd_t *dst_pmd;
src_mm = vma->vm_mm;
pmd_ptl = pmd_lock(src_mm, pmd);
if (!is_vma_async_fork(vma) || !is_pmd_async_fork(*pmd)) {
spin_unlock(pmd_ptl);
return 0;
}
dst_mm = vma->async_fork_vma->vm_mm;
if (is_async_fork_fallback(dst_mm))
goto fallback;
dst_pgd = pgd_offset(dst_mm, addr);
WARN_ON(pgd_none(*dst_pgd));
dst_p4d = p4d_offset(dst_pgd, addr);
WARN_ON(p4d_none(*dst_p4d));
dst_pud = pud_offset(dst_p4d, addr);
if (pud_none(*dst_pud)) {
need_pmd = true;
need_pte = true;
} else {
dst_pmd = pmd_offset(dst_pud, addr);
if (pmd_none(*dst_pmd))
need_pte = true;
}
if (!need_pmd && !need_pte)
goto do_copy;
/* alloc pmd/pte without spin lock */
spin_unlock(pmd_ptl);
if (unlikely(need_pmd)) {
dst_pmd = pmd_alloc_one(dst_mm, addr);
if (!dst_pmd) {
/* the fallback path expects pmd_ptl to be held */
spin_lock(pmd_ptl);
goto fallback;
}
}
if (need_pte) {
new_pte = pte_alloc_one(dst_mm);
if (!new_pte) {
if (need_pmd)
pmd_free(dst_mm, dst_pmd);
/* the fallback path expects pmd_ptl to be held */
spin_lock(pmd_ptl);
goto fallback;
}
}
spin_lock(pmd_ptl);
if (!is_vma_async_fork(vma) || !is_pmd_async_fork(*pmd)) {
spin_unlock(pmd_ptl);
if (need_pmd)
pmd_free(dst_mm, dst_pmd);
if (need_pte)
pte_free(dst_mm, new_pte);
return 0;
}
/* populate pmd and pte with pmd_ptl held */
if (need_pmd) {
async_pmd_populate(dst_mm, dst_pud, addr, dst_pmd);
dst_pmd = pmd_offset(dst_pud, addr);
}
if (need_pte)
async_pte_populate(dst_mm, dst_pmd, new_pte);
do_copy:
if (async_copy_pmd_one(dst_mm, vma, pmd, dst_pmd, addr))
goto fallback;
spin_unlock(pmd_ptl);
return 0;
fallback:
__async_fork_clean_pmd(src_mm, addr, pmd);
async_fork_set_flags(dst_mm, ASYNC_FORK_FALLBACK);
spin_unlock(pmd_ptl);
pr_warn("async fork fallback in pmd fix, oldmm=%p\n", src_mm);
return -ENOMEM;
}
/* mpnt->vm_mm's mmap_sem must be held in write mode */
void __async_fork_fixup_vma(struct vm_area_struct *mpnt)
{
struct vm_area_struct *dst_vma;
int err = 0;
dst_vma = async_fork_vma_lock_peer(mpnt);
if (!dst_vma)
return;
pr_debug("async fork fixup vma, oldmm=%p mm=%p vma=%08lx-%08lx\n",
mpnt->vm_mm, dst_vma->vm_mm, mpnt->vm_start,
mpnt->vm_end);
if (is_async_fork_fallback(dst_vma->vm_mm))
goto fallback;
err = async_fork_iter_page_range(dst_vma->vm_mm, mpnt->vm_mm,
mpnt, ITER_SLOW) < 0;
if (err) {
async_fork_set_flags(dst_vma->vm_mm, ASYNC_FORK_FALLBACK);
goto fallback;
}
unlock:
async_fork_vma_unbind(mpnt);
async_fork_vma_unlock_peer(mpnt, dst_vma);
pr_debug("async fork fixup vma finished\n");
return;
fallback:
pr_warn("async fork fallback in vma fix\n");
async_fork_iter_page_range(dst_vma->vm_mm, mpnt->vm_mm,
mpnt, ITER_FALLBACK);
goto unlock;
}
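/* Called from the madvise path for [start, end) of @mpnt: walk the range
* with ITER_MADVISE so that any pmds still marked for async fork are
* copied to the child (or the child mm is flagged for fallback).
*/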
void __async_fork_madvise_vma(struct vm_area_struct *mpnt,
unsigned long start,
unsigned long end)
{
pr_debug("async fork madvise, oldmm=%p vma=%08lx-%08lx\n",
mpnt->vm_mm, mpnt->vm_start, mpnt->vm_end);
/* The mpnt is used with the pmd lock held, so we can be sure that
* dst_vma is not freed.
*/
__async_fork_iter_page_range(NULL, mpnt->vm_mm, mpnt, start,
end, ITER_MADVISE);
pr_debug("async fork madvise finished\n");
}