diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h index 58afc04c107e..513df75d0fc9 100644 --- a/include/uapi/linux/prctl.h +++ b/include/uapi/linux/prctl.h @@ -1,6 +1,8 @@ #ifndef _LINUX_PRCTL_H #define _LINUX_PRCTL_H +#include + /* Values to pass as first argument to prctl() */ #define PR_SET_PDEATHSIG 1 /* Second arg is a signal */ @@ -119,6 +121,31 @@ # define PR_SET_MM_ENV_END 11 # define PR_SET_MM_AUXV 12 # define PR_SET_MM_EXE_FILE 13 +# define PR_SET_MM_MAP 14 +# define PR_SET_MM_MAP_SIZE 15 + +/* + * This structure provides new memory descriptor + * map which mostly modifies /proc/pid/stat[m] + * output for a task. This mostly done in a + * sake of checkpoint/restore functionality. + */ +struct prctl_mm_map { + __u64 start_code; /* code section bounds */ + __u64 end_code; + __u64 start_data; /* data section bounds */ + __u64 end_data; + __u64 start_brk; /* heap for brk() syscall */ + __u64 brk; + __u64 start_stack; /* stack starts at */ + __u64 arg_start; /* command line arguments bounds */ + __u64 arg_end; + __u64 env_start; /* environment variables bounds */ + __u64 env_end; + __u64 *auxv; /* auxiliary vector */ + __u32 auxv_size; /* vector size */ + __u32 exe_fd; /* /proc/$pid/exe link file */ +}; /* * Set specific pid that is allowed to ptrace the current task. diff --git a/kernel/sys.c b/kernel/sys.c index 14222a1699c0..f7030b060018 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1687,6 +1687,187 @@ exit: return err; } +#ifdef CONFIG_CHECKPOINT_RESTORE +/* + * WARNING: we don't require any capability here so be very careful + * in what is allowed for modification from userspace. + */ +static int validate_prctl_map(struct prctl_mm_map *prctl_map) +{ + unsigned long mmap_max_addr = TASK_SIZE; + struct mm_struct *mm = current->mm; + int error = -EINVAL, i; + + static const unsigned char offsets[] = { + offsetof(struct prctl_mm_map, start_code), + offsetof(struct prctl_mm_map, end_code), + offsetof(struct prctl_mm_map, start_data), + offsetof(struct prctl_mm_map, end_data), + offsetof(struct prctl_mm_map, start_brk), + offsetof(struct prctl_mm_map, brk), + offsetof(struct prctl_mm_map, start_stack), + offsetof(struct prctl_mm_map, arg_start), + offsetof(struct prctl_mm_map, arg_end), + offsetof(struct prctl_mm_map, env_start), + offsetof(struct prctl_mm_map, env_end), + }; + + /* + * Make sure the members are not somewhere outside + * of allowed address space. + */ + for (i = 0; i < ARRAY_SIZE(offsets); i++) { + u64 val = *(u64 *)((char *)prctl_map + offsets[i]); + + if ((unsigned long)val >= mmap_max_addr || + (unsigned long)val < mmap_min_addr) + goto out; + } + + /* + * Make sure the pairs are ordered. + */ +#define __prctl_check_order(__m1, __op, __m2) \ + ((unsigned long)prctl_map->__m1 __op \ + (unsigned long)prctl_map->__m2) ? 0 : -EINVAL + error = __prctl_check_order(start_code, <, end_code); + error |= __prctl_check_order(start_data, <, end_data); + error |= __prctl_check_order(start_brk, <=, brk); + error |= __prctl_check_order(arg_start, <=, arg_end); + error |= __prctl_check_order(env_start, <=, env_end); + if (error) + goto out; +#undef __prctl_check_order + + error = -EINVAL; + + /* + * @brk should be after @end_data in traditional maps. + */ + if (prctl_map->start_brk <= prctl_map->end_data || + prctl_map->brk <= prctl_map->end_data) + goto out; + + /* + * Neither we should allow to override limits if they set. + */ + if (check_data_rlimit(rlimit(RLIMIT_DATA), prctl_map->brk, + prctl_map->start_brk, prctl_map->end_data, + prctl_map->start_data)) + goto out; + + /* + * Someone is trying to cheat the auxv vector. + */ + if (prctl_map->auxv_size) { + if (!prctl_map->auxv || prctl_map->auxv_size > sizeof(mm->saved_auxv)) + goto out; + } + + /* + * Finally, make sure the caller has the rights to + * change /proc/pid/exe link: only local root should + * be allowed to. + */ + if (prctl_map->exe_fd != (u32)-1) { + struct user_namespace *ns = current_user_ns(); + const struct cred *cred = current_cred(); + + if (!uid_eq(cred->uid, make_kuid(ns, 0)) || + !gid_eq(cred->gid, make_kgid(ns, 0))) + goto out; + } + + error = 0; +out: + return error; +} + +static int prctl_set_mm_map(int opt, const void __user *addr, unsigned long data_size) +{ + struct prctl_mm_map prctl_map = { .exe_fd = (u32)-1, }; + unsigned long user_auxv[AT_VECTOR_SIZE]; + struct mm_struct *mm = current->mm; + int error; + + BUILD_BUG_ON(sizeof(user_auxv) != sizeof(mm->saved_auxv)); + BUILD_BUG_ON(sizeof(struct prctl_mm_map) > 256); + + if (opt == PR_SET_MM_MAP_SIZE) + return put_user((unsigned int)sizeof(prctl_map), + (unsigned int __user *)addr); + + if (data_size != sizeof(prctl_map)) + return -EINVAL; + + if (copy_from_user(&prctl_map, addr, sizeof(prctl_map))) + return -EFAULT; + + error = validate_prctl_map(&prctl_map); + if (error) + return error; + + if (prctl_map.auxv_size) { + memset(user_auxv, 0, sizeof(user_auxv)); + if (copy_from_user(user_auxv, + (const void __user *)prctl_map.auxv, + prctl_map.auxv_size)) + return -EFAULT; + + /* Last entry must be AT_NULL as specification requires */ + user_auxv[AT_VECTOR_SIZE - 2] = AT_NULL; + user_auxv[AT_VECTOR_SIZE - 1] = AT_NULL; + } + + down_write(&mm->mmap_sem); + if (prctl_map.exe_fd != (u32)-1) + error = prctl_set_mm_exe_file_locked(mm, prctl_map.exe_fd); + downgrade_write(&mm->mmap_sem); + if (error) + goto out; + + /* + * We don't validate if these members are pointing to + * real present VMAs because application may have correspond + * VMAs already unmapped and kernel uses these members for statistics + * output in procfs mostly, except + * + * - @start_brk/@brk which are used in do_brk but kernel lookups + * for VMAs when updating these memvers so anything wrong written + * here cause kernel to swear at userspace program but won't lead + * to any problem in kernel itself + */ + + mm->start_code = prctl_map.start_code; + mm->end_code = prctl_map.end_code; + mm->start_data = prctl_map.start_data; + mm->end_data = prctl_map.end_data; + mm->start_brk = prctl_map.start_brk; + mm->brk = prctl_map.brk; + mm->start_stack = prctl_map.start_stack; + mm->arg_start = prctl_map.arg_start; + mm->arg_end = prctl_map.arg_end; + mm->env_start = prctl_map.env_start; + mm->env_end = prctl_map.env_end; + + /* + * Note this update of @saved_auxv is lockless thus + * if someone reads this member in procfs while we're + * updating -- it may get partly updated results. It's + * known and acceptable trade off: we leave it as is to + * not introduce additional locks here making the kernel + * more complex. + */ + if (prctl_map.auxv_size) + memcpy(mm->saved_auxv, user_auxv, sizeof(user_auxv)); + + error = 0; +out: + up_read(&mm->mmap_sem); + return error; +} +#endif /* CONFIG_CHECKPOINT_RESTORE */ + static int prctl_set_mm(int opt, unsigned long addr, unsigned long arg4, unsigned long arg5) { @@ -1694,9 +1875,16 @@ static int prctl_set_mm(int opt, unsigned long addr, struct vm_area_struct *vma; int error; - if (arg5 || (arg4 && opt != PR_SET_MM_AUXV)) + if (arg5 || (arg4 && (opt != PR_SET_MM_AUXV && + opt != PR_SET_MM_MAP && + opt != PR_SET_MM_MAP_SIZE))) return -EINVAL; +#ifdef CONFIG_CHECKPOINT_RESTORE + if (opt == PR_SET_MM_MAP || opt == PR_SET_MM_MAP_SIZE) + return prctl_set_mm_map(opt, (const void __user *)addr, arg4); +#endif + if (!capable(CAP_SYS_RESOURCE)) return -EPERM;