2005-04-17 06:20:36 +08:00
|
|
|
/*
|
|
|
|
* flexible mmap layout support
|
|
|
|
*
|
|
|
|
* Copyright 2003-2004 Red Hat Inc., Durham, North Carolina.
|
|
|
|
* All Rights Reserved.
|
|
|
|
*
|
|
|
|
* This program is free software; you can redistribute it and/or modify
|
|
|
|
* it under the terms of the GNU General Public License as published by
|
|
|
|
* the Free Software Foundation; either version 2 of the License, or
|
|
|
|
* (at your option) any later version.
|
|
|
|
*
|
|
|
|
* This program is distributed in the hope that it will be useful,
|
|
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
|
|
* GNU General Public License for more details.
|
|
|
|
*
|
|
|
|
* You should have received a copy of the GNU General Public License
|
|
|
|
* along with this program; if not, write to the Free Software
|
|
|
|
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
|
|
|
*
|
|
|
|
*
|
|
|
|
* Started by Ingo Molnar <mingo@elte.hu>
|
|
|
|
*/
|
|
|
|
|
2016-05-07 18:15:34 +08:00
|
|
|
#include <linux/elf-randomize.h>
|
2005-04-17 06:20:36 +08:00
|
|
|
#include <linux/personality.h>
|
|
|
|
#include <linux/mm.h>
|
2011-10-30 22:17:13 +08:00
|
|
|
#include <linux/mman.h>
|
2017-02-09 01:51:30 +08:00
|
|
|
#include <linux/sched/signal.h>
|
2017-02-09 01:51:31 +08:00
|
|
|
#include <linux/sched/mm.h>
|
2011-01-12 16:55:27 +08:00
|
|
|
#include <linux/random.h>
|
2012-02-27 17:01:52 +08:00
|
|
|
#include <linux/compat.h>
|
2015-01-15 00:51:17 +08:00
|
|
|
#include <linux/security.h>
|
2008-02-10 01:24:37 +08:00
|
|
|
#include <asm/pgalloc.h>
|
2017-02-10 04:20:24 +08:00
|
|
|
#include <asm/elf.h>
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2011-01-12 16:55:22 +08:00
|
|
|
static unsigned long stack_maxrandom_size(void)
|
|
|
|
{
|
|
|
|
if (!(current->flags & PF_RANDOMIZE))
|
|
|
|
return 0;
|
|
|
|
if (current->personality & ADDR_NO_RANDOMIZE)
|
|
|
|
return 0;
|
|
|
|
return STACK_RND_MASK << PAGE_SHIFT;
|
|
|
|
}
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/*
 * Top of mmap area (just below the process stack).
 *
 * MIN_GAP: leave at least a ~32 MB hole.
 * MAX_GAP: upper bound for that hole, five sixths of STACK_TOP.
 */
#define MIN_GAP (32 * 1024 * 1024)
#define MAX_GAP (STACK_TOP / 6 * 5)
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2011-01-12 16:55:26 +08:00
|
|
|
static inline int mmap_is_legacy(void)
|
|
|
|
{
|
|
|
|
if (current->personality & ADDR_COMPAT_LAYOUT)
|
|
|
|
return 1;
|
|
|
|
if (rlimit(RLIMIT_STACK) == RLIM_INFINITY)
|
|
|
|
return 1;
|
|
|
|
return sysctl_legacy_va_layout;
|
|
|
|
}
|
|
|
|
|
2015-04-15 06:48:00 +08:00
|
|
|
unsigned long arch_mmap_rnd(void)
|
2011-01-12 16:55:27 +08:00
|
|
|
{
|
2015-11-10 19:30:28 +08:00
|
|
|
return (get_random_int() & MMAP_RND_MASK) << PAGE_SHIFT;
|
2011-01-12 16:55:27 +08:00
|
|
|
}
|
|
|
|
|
2015-04-15 06:47:57 +08:00
|
|
|
static unsigned long mmap_base_legacy(unsigned long rnd)
|
2013-11-13 07:07:55 +08:00
|
|
|
{
|
2015-04-15 06:47:57 +08:00
|
|
|
return TASK_UNMAPPED_BASE + rnd;
|
2013-11-13 07:07:55 +08:00
|
|
|
}
|
|
|
|
|
2015-04-15 06:47:57 +08:00
|
|
|
static inline unsigned long mmap_base(unsigned long rnd)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
2010-01-14 03:44:33 +08:00
|
|
|
unsigned long gap = rlimit(RLIMIT_STACK);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
if (gap < MIN_GAP)
|
|
|
|
gap = MIN_GAP;
|
|
|
|
else if (gap > MAX_GAP)
|
|
|
|
gap = MAX_GAP;
|
2011-01-12 16:55:27 +08:00
|
|
|
gap &= PAGE_MASK;
|
2015-04-15 06:47:57 +08:00
|
|
|
return STACK_TOP - stack_maxrandom_size() - rnd - gap;
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
2015-01-15 00:51:17 +08:00
|
|
|
unsigned long
|
|
|
|
arch_get_unmapped_area(struct file *filp, unsigned long addr,
|
|
|
|
unsigned long len, unsigned long pgoff, unsigned long flags)
|
|
|
|
{
|
|
|
|
struct mm_struct *mm = current->mm;
|
|
|
|
struct vm_area_struct *vma;
|
|
|
|
struct vm_unmapped_area_info info;
|
|
|
|
|
|
|
|
if (len > TASK_SIZE - mmap_min_addr)
|
|
|
|
return -ENOMEM;
|
|
|
|
|
|
|
|
if (flags & MAP_FIXED)
|
|
|
|
return addr;
|
|
|
|
|
|
|
|
if (addr) {
|
|
|
|
addr = PAGE_ALIGN(addr);
|
|
|
|
vma = find_vma(mm, addr);
|
|
|
|
if (TASK_SIZE - len >= addr && addr >= mmap_min_addr &&
|
|
|
|
(!vma || addr + len <= vma->vm_start))
|
|
|
|
return addr;
|
|
|
|
}
|
|
|
|
|
|
|
|
info.flags = 0;
|
|
|
|
info.length = len;
|
|
|
|
info.low_limit = mm->mmap_base;
|
|
|
|
info.high_limit = TASK_SIZE;
|
2015-11-10 19:30:28 +08:00
|
|
|
if (filp || (flags & MAP_SHARED))
|
|
|
|
info.align_mask = MMAP_ALIGN_MASK << PAGE_SHIFT;
|
|
|
|
else
|
|
|
|
info.align_mask = 0;
|
2015-01-15 00:51:17 +08:00
|
|
|
info.align_offset = pgoff << PAGE_SHIFT;
|
|
|
|
return vm_unmapped_area(&info);
|
|
|
|
}
|
|
|
|
|
|
|
|
unsigned long
|
|
|
|
arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
|
|
|
|
const unsigned long len, const unsigned long pgoff,
|
|
|
|
const unsigned long flags)
|
|
|
|
{
|
|
|
|
struct vm_area_struct *vma;
|
|
|
|
struct mm_struct *mm = current->mm;
|
|
|
|
unsigned long addr = addr0;
|
|
|
|
struct vm_unmapped_area_info info;
|
|
|
|
|
|
|
|
/* requested length too big for entire address space */
|
|
|
|
if (len > TASK_SIZE - mmap_min_addr)
|
|
|
|
return -ENOMEM;
|
|
|
|
|
|
|
|
if (flags & MAP_FIXED)
|
|
|
|
return addr;
|
|
|
|
|
|
|
|
/* requesting a specific address */
|
|
|
|
if (addr) {
|
|
|
|
addr = PAGE_ALIGN(addr);
|
|
|
|
vma = find_vma(mm, addr);
|
|
|
|
if (TASK_SIZE - len >= addr && addr >= mmap_min_addr &&
|
|
|
|
(!vma || addr + len <= vma->vm_start))
|
|
|
|
return addr;
|
|
|
|
}
|
|
|
|
|
|
|
|
info.flags = VM_UNMAPPED_AREA_TOPDOWN;
|
|
|
|
info.length = len;
|
|
|
|
info.low_limit = max(PAGE_SIZE, mmap_min_addr);
|
|
|
|
info.high_limit = mm->mmap_base;
|
2015-11-10 19:30:28 +08:00
|
|
|
if (filp || (flags & MAP_SHARED))
|
|
|
|
info.align_mask = MMAP_ALIGN_MASK << PAGE_SHIFT;
|
|
|
|
else
|
|
|
|
info.align_mask = 0;
|
2015-01-15 00:51:17 +08:00
|
|
|
info.align_offset = pgoff << PAGE_SHIFT;
|
|
|
|
addr = vm_unmapped_area(&info);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* A failed mmap() very likely causes application failure,
|
|
|
|
* so fall back to the bottom-up function here. This scenario
|
|
|
|
* can happen with large stack limits and large mmap()
|
|
|
|
* allocations.
|
|
|
|
*/
|
|
|
|
if (addr & ~PAGE_MASK) {
|
|
|
|
VM_BUG_ON(addr != -ENOMEM);
|
|
|
|
info.flags = 0;
|
|
|
|
info.low_limit = TASK_UNMAPPED_BASE;
|
|
|
|
info.high_limit = TASK_SIZE;
|
|
|
|
addr = vm_unmapped_area(&info);
|
|
|
|
}
|
|
|
|
|
|
|
|
return addr;
|
|
|
|
}
|
|
|
|
|
2013-02-11 21:29:49 +08:00
|
|
|
int s390_mmap_check(unsigned long addr, unsigned long len, unsigned long flags)
|
2009-03-18 20:27:37 +08:00
|
|
|
{
|
2016-01-11 18:47:12 +08:00
|
|
|
if (is_compat_task() || TASK_SIZE >= TASK_MAX_SIZE)
|
2013-02-11 21:29:49 +08:00
|
|
|
return 0;
|
|
|
|
if (!(flags & MAP_FIXED))
|
|
|
|
addr = 0;
|
2013-10-28 21:48:30 +08:00
|
|
|
if ((addr + len) >= TASK_SIZE)
|
s390/mm: fix asce_bits handling with dynamic pagetable levels
There is a race with multi-threaded applications between context switch and
pagetable upgrade. In switch_mm() a new user_asce is built from mm->pgd and
mm->context.asce_bits, w/o holding any locks. A concurrent mmap with a
pagetable upgrade on another thread in crst_table_upgrade() could already
have set new asce_bits, but not yet the new mm->pgd. This would result in a
corrupt user_asce in switch_mm(), and eventually in a kernel panic from a
translation exception.
Fix this by storing the complete asce instead of just the asce_bits, which
can then be read atomically from switch_mm(), so that it either sees the
old value or the new value, but no mixture. Both cases are OK. Having the
old value would result in a page fault on access to the higher level memory,
but the fault handler would see the new mm->pgd, if it was a valid access
after the mmap on the other thread has completed. So as worst-case scenario
we would have a page fault loop for the racing thread until the next time
slice.
Also remove dead code and simplify the upgrade/downgrade path, there are no
upgrades from 2 levels, and only downgrades from 3 levels for compat tasks.
There are also no concurrent upgrades, because the mmap_sem is held with
down_write() in do_mmap, so the flush and table checks during upgrade can
be removed.
Reported-by: Michael Munday <munday@ca.ibm.com>
Reviewed-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
Signed-off-by: Gerald Schaefer <gerald.schaefer@de.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
2016-04-15 22:38:40 +08:00
|
|
|
return crst_table_upgrade(current->mm);
|
2009-03-18 20:27:37 +08:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2008-02-10 01:24:37 +08:00
|
|
|
static unsigned long
|
|
|
|
s390_get_unmapped_area(struct file *filp, unsigned long addr,
|
|
|
|
unsigned long len, unsigned long pgoff, unsigned long flags)
|
|
|
|
{
|
|
|
|
struct mm_struct *mm = current->mm;
|
2009-03-18 20:27:37 +08:00
|
|
|
unsigned long area;
|
2008-02-10 01:24:37 +08:00
|
|
|
int rc;
|
|
|
|
|
2009-03-18 20:27:37 +08:00
|
|
|
area = arch_get_unmapped_area(filp, addr, len, pgoff, flags);
|
|
|
|
if (!(area & ~PAGE_MASK))
|
|
|
|
return area;
|
2016-01-11 18:47:12 +08:00
|
|
|
if (area == -ENOMEM && !is_compat_task() && TASK_SIZE < TASK_MAX_SIZE) {
|
2009-03-18 20:27:37 +08:00
|
|
|
/* Upgrade the page table to 4 levels and retry. */
|
s390/mm: fix asce_bits handling with dynamic pagetable levels
There is a race with multi-threaded applications between context switch and
pagetable upgrade. In switch_mm() a new user_asce is built from mm->pgd and
mm->context.asce_bits, w/o holding any locks. A concurrent mmap with a
pagetable upgrade on another thread in crst_table_upgrade() could already
have set new asce_bits, but not yet the new mm->pgd. This would result in a
corrupt user_asce in switch_mm(), and eventually in a kernel panic from a
translation exception.
Fix this by storing the complete asce instead of just the asce_bits, which
can then be read atomically from switch_mm(), so that it either sees the
old value or the new value, but no mixture. Both cases are OK. Having the
old value would result in a page fault on access to the higher level memory,
but the fault handler would see the new mm->pgd, if it was a valid access
after the mmap on the other thread has completed. So as worst-case scenario
we would have a page fault loop for the racing thread until the next time
slice.
Also remove dead code and simplify the upgrade/downgrade path, there are no
upgrades from 2 levels, and only downgrades from 3 levels for compat tasks.
There are also no concurrent upgrades, because the mmap_sem is held with
down_write() in do_mmap, so the flush and table checks during upgrade can
be removed.
Reported-by: Michael Munday <munday@ca.ibm.com>
Reviewed-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
Signed-off-by: Gerald Schaefer <gerald.schaefer@de.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
2016-04-15 22:38:40 +08:00
|
|
|
rc = crst_table_upgrade(mm);
|
2008-02-10 01:24:37 +08:00
|
|
|
if (rc)
|
|
|
|
return (unsigned long) rc;
|
2009-03-18 20:27:37 +08:00
|
|
|
area = arch_get_unmapped_area(filp, addr, len, pgoff, flags);
|
2008-02-10 01:24:37 +08:00
|
|
|
}
|
2009-03-18 20:27:37 +08:00
|
|
|
return area;
|
2008-02-10 01:24:37 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
static unsigned long
|
2009-03-18 20:27:37 +08:00
|
|
|
s390_get_unmapped_area_topdown(struct file *filp, const unsigned long addr,
|
2008-02-10 01:24:37 +08:00
|
|
|
const unsigned long len, const unsigned long pgoff,
|
|
|
|
const unsigned long flags)
|
|
|
|
{
|
|
|
|
struct mm_struct *mm = current->mm;
|
2009-03-18 20:27:37 +08:00
|
|
|
unsigned long area;
|
2008-02-10 01:24:37 +08:00
|
|
|
int rc;
|
|
|
|
|
2009-03-18 20:27:37 +08:00
|
|
|
area = arch_get_unmapped_area_topdown(filp, addr, len, pgoff, flags);
|
|
|
|
if (!(area & ~PAGE_MASK))
|
|
|
|
return area;
|
2016-01-11 18:47:12 +08:00
|
|
|
if (area == -ENOMEM && !is_compat_task() && TASK_SIZE < TASK_MAX_SIZE) {
|
2009-03-18 20:27:37 +08:00
|
|
|
/* Upgrade the page table to 4 levels and retry. */
|
s390/mm: fix asce_bits handling with dynamic pagetable levels
There is a race with multi-threaded applications between context switch and
pagetable upgrade. In switch_mm() a new user_asce is built from mm->pgd and
mm->context.asce_bits, w/o holding any locks. A concurrent mmap with a
pagetable upgrade on another thread in crst_table_upgrade() could already
have set new asce_bits, but not yet the new mm->pgd. This would result in a
corrupt user_asce in switch_mm(), and eventually in a kernel panic from a
translation exception.
Fix this by storing the complete asce instead of just the asce_bits, which
can then be read atomically from switch_mm(), so that it either sees the
old value or the new value, but no mixture. Both cases are OK. Having the
old value would result in a page fault on access to the higher level memory,
but the fault handler would see the new mm->pgd, if it was a valid access
after the mmap on the other thread has completed. So as worst-case scenario
we would have a page fault loop for the racing thread until the next time
slice.
Also remove dead code and simplify the upgrade/downgrade path, there are no
upgrades from 2 levels, and only downgrades from 3 levels for compat tasks.
There are also no concurrent upgrades, because the mmap_sem is held with
down_write() in do_mmap, so the flush and table checks during upgrade can
be removed.
Reported-by: Michael Munday <munday@ca.ibm.com>
Reviewed-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
Signed-off-by: Gerald Schaefer <gerald.schaefer@de.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
2016-04-15 22:38:40 +08:00
|
|
|
rc = crst_table_upgrade(mm);
|
2008-02-10 01:24:37 +08:00
|
|
|
if (rc)
|
|
|
|
return (unsigned long) rc;
|
2009-03-18 20:27:37 +08:00
|
|
|
area = arch_get_unmapped_area_topdown(filp, addr, len,
|
|
|
|
pgoff, flags);
|
2008-02-10 01:24:37 +08:00
|
|
|
}
|
2009-03-18 20:27:37 +08:00
|
|
|
return area;
|
2008-02-10 01:24:37 +08:00
|
|
|
}
|
|
|
|
/*
|
|
|
|
* This function, called very early during the creation of a new
|
|
|
|
* process VM image, sets up which VM layout function to use:
|
|
|
|
*/
|
|
|
|
void arch_pick_mmap_layout(struct mm_struct *mm)
|
|
|
|
{
|
2015-04-15 06:47:57 +08:00
|
|
|
unsigned long random_factor = 0UL;
|
|
|
|
|
|
|
|
if (current->flags & PF_RANDOMIZE)
|
2015-04-15 06:48:00 +08:00
|
|
|
random_factor = arch_mmap_rnd();
|
2015-04-15 06:47:57 +08:00
|
|
|
|
2008-02-10 01:24:37 +08:00
|
|
|
/*
|
|
|
|
* Fall back to the standard layout if the personality
|
|
|
|
* bit is set, or if the expected stack growth is unlimited:
|
|
|
|
*/
|
|
|
|
if (mmap_is_legacy()) {
|
2015-04-15 06:47:57 +08:00
|
|
|
mm->mmap_base = mmap_base_legacy(random_factor);
|
2008-02-10 01:24:37 +08:00
|
|
|
mm->get_unmapped_area = s390_get_unmapped_area;
|
|
|
|
} else {
|
2015-04-15 06:47:57 +08:00
|
|
|
mm->mmap_base = mmap_base(random_factor);
|
2008-02-10 01:24:37 +08:00
|
|
|
mm->get_unmapped_area = s390_get_unmapped_area_topdown;
|
|
|
|
}
|
|
|
|
}
|