Merge branch 'akpm' (patches from Andrew)

Merge still more updates from Andrew Morton:
 "18 patches.

  Subsystems affected by this patch series: mm (memcg and cleanups) and
  epoll"

* emailed patches from Andrew Morton <akpm@linux-foundation.org>:
  mm/Kconfig: fix spelling mistake "whats" -> "what's"
  selftests/filesystems: expand epoll with epoll_pwait2
  epoll: wire up syscall epoll_pwait2
  epoll: add syscall epoll_pwait2
  epoll: convert internal api to timespec64
  epoll: eliminate unnecessary lock for zero timeout
  epoll: replace gotos with a proper loop
  epoll: pull all code between fetch_events and send_event into the loop
  epoll: simplify and optimize busy loop logic
  epoll: move eavail next to the list_empty_careful check
  epoll: pull fatal signal checks into ep_send_events()
  epoll: simplify signal handling
  epoll: check for events when removing a timed out thread from the wait queue
  mm/memcontrol: rewrite mem_cgroup_page_lruvec()
  mm, kvm: account kvm_vcpu_mmap to kmemcg
  mm/memcg: remove unused definitions
  mm/memcg: warning on !memcg after readahead page charged
  mm/memcg: bail early from swap accounting if memcg disabled
commit 1db98bcf56
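As context for the diff below: epoll_pwait2 takes the same arguments as epoll_pwait, except that the timeout is passed as a struct __kernel_timespec pointer (nanosecond resolution) instead of an int in milliseconds, and a NULL timeout blocks indefinitely. A minimal usage sketch follows; it is illustrative only, not part of this merge, and assumes __NR_epoll_pwait2 is available from the kernel headers and that libc does not yet provide a wrapper (the helper names are made up for the example).

    /* Illustrative sketch only -- not part of this merge. */
    #define _GNU_SOURCE
    #include <sys/epoll.h>
    #include <sys/syscall.h>
    #include <unistd.h>
    #include <linux/time_types.h>   /* struct __kernel_timespec */

    /* Raw wrapper; a NULL timeout blocks, like epoll_wait() with -1. */
    static int epoll_pwait2_raw(int epfd, struct epoll_event *events,
                                int maxevents,
                                const struct __kernel_timespec *timeout)
    {
            return syscall(__NR_epoll_pwait2, epfd, events, maxevents,
                           timeout, NULL, (size_t)0);
    }

    /* Wait at most 500 us -- finer than epoll_wait()'s 1 ms granularity. */
    static int wait_500us(int epfd, struct epoll_event *e)
    {
            struct __kernel_timespec ts = { .tv_sec = 0,
                                            .tv_nsec = 500 * 1000 };

            return epoll_pwait2_raw(epfd, e, 1, &ts);
    }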
arch/alpha/kernel/syscalls/syscall.tbl
@@ -480,3 +480,4 @@
 548  common  pidfd_getfd      sys_pidfd_getfd
 549  common  faccessat2       sys_faccessat2
 550  common  process_madvise  sys_process_madvise
+551  common  epoll_pwait2     sys_epoll_pwait2
arch/arm/tools/syscall.tbl
@@ -454,3 +454,4 @@
 438  common  pidfd_getfd      sys_pidfd_getfd
 439  common  faccessat2       sys_faccessat2
 440  common  process_madvise  sys_process_madvise
+441  common  epoll_pwait2     sys_epoll_pwait2
arch/arm64/include/asm/unistd.h
@@ -38,7 +38,7 @@
 #define __ARM_NR_compat_set_tls	(__ARM_NR_COMPAT_BASE + 5)
 #define __ARM_NR_COMPAT_END	(__ARM_NR_COMPAT_BASE + 0x800)
 
-#define __NR_compat_syscalls	441
+#define __NR_compat_syscalls	442
 #endif
 
 #define __ARCH_WANT_SYS_CLONE
arch/arm64/include/asm/unistd32.h
@@ -889,6 +889,8 @@ __SYSCALL(__NR_pidfd_getfd, sys_pidfd_getfd)
 __SYSCALL(__NR_faccessat2, sys_faccessat2)
 #define __NR_process_madvise 440
 __SYSCALL(__NR_process_madvise, sys_process_madvise)
+#define __NR_epoll_pwait2 441
+__SYSCALL(__NR_epoll_pwait2, sys_epoll_pwait2)
 
 /*
  * Please add new compat syscalls above this comment and update
arch/ia64/kernel/syscalls/syscall.tbl
@@ -361,3 +361,4 @@
 438  common  pidfd_getfd      sys_pidfd_getfd
 439  common  faccessat2       sys_faccessat2
 440  common  process_madvise  sys_process_madvise
+441  common  epoll_pwait2     sys_epoll_pwait2
arch/m68k/kernel/syscalls/syscall.tbl
@@ -440,3 +440,4 @@
 438  common  pidfd_getfd      sys_pidfd_getfd
 439  common  faccessat2       sys_faccessat2
 440  common  process_madvise  sys_process_madvise
+441  common  epoll_pwait2     sys_epoll_pwait2
arch/microblaze/kernel/syscalls/syscall.tbl
@@ -446,3 +446,4 @@
 438  common  pidfd_getfd      sys_pidfd_getfd
 439  common  faccessat2       sys_faccessat2
 440  common  process_madvise  sys_process_madvise
+441  common  epoll_pwait2     sys_epoll_pwait2
arch/mips/kernel/syscalls/syscall_n32.tbl
@@ -379,3 +379,4 @@
 438  n32  pidfd_getfd      sys_pidfd_getfd
 439  n32  faccessat2       sys_faccessat2
 440  n32  process_madvise  sys_process_madvise
+441  n32  epoll_pwait2     sys_epoll_pwait2
arch/mips/kernel/syscalls/syscall_n64.tbl
@@ -355,3 +355,4 @@
 438  n64  pidfd_getfd      sys_pidfd_getfd
 439  n64  faccessat2       sys_faccessat2
 440  n64  process_madvise  sys_process_madvise
+441  n64  epoll_pwait2     sys_epoll_pwait2
arch/mips/kernel/syscalls/syscall_o32.tbl
@@ -428,3 +428,4 @@
 438  o32  pidfd_getfd      sys_pidfd_getfd
 439  o32  faccessat2       sys_faccessat2
 440  o32  process_madvise  sys_process_madvise
+441  o32  epoll_pwait2     sys_epoll_pwait2  compat_sys_epoll_pwait2
arch/parisc/kernel/syscalls/syscall.tbl
@@ -438,3 +438,4 @@
 438  common  pidfd_getfd      sys_pidfd_getfd
 439  common  faccessat2       sys_faccessat2
 440  common  process_madvise  sys_process_madvise
+441  common  epoll_pwait2     sys_epoll_pwait2  compat_sys_epoll_pwait2
arch/powerpc/kernel/syscalls/syscall.tbl
@@ -530,3 +530,4 @@
 438  common  pidfd_getfd      sys_pidfd_getfd
 439  common  faccessat2       sys_faccessat2
 440  common  process_madvise  sys_process_madvise
+441  common  epoll_pwait2     sys_epoll_pwait2  compat_sys_epoll_pwait2
arch/s390/kernel/syscalls/syscall.tbl
@@ -443,3 +443,4 @@
 438  common  pidfd_getfd      sys_pidfd_getfd      sys_pidfd_getfd
 439  common  faccessat2       sys_faccessat2       sys_faccessat2
 440  common  process_madvise  sys_process_madvise  sys_process_madvise
+441  common  epoll_pwait2     sys_epoll_pwait2     sys_epoll_pwait2
arch/sh/kernel/syscalls/syscall.tbl
@@ -443,3 +443,4 @@
 438  common  pidfd_getfd      sys_pidfd_getfd
 439  common  faccessat2       sys_faccessat2
 440  common  process_madvise  sys_process_madvise
+441  common  epoll_pwait2     sys_epoll_pwait2
arch/sparc/kernel/syscalls/syscall.tbl
@@ -486,3 +486,4 @@
 438  common  pidfd_getfd      sys_pidfd_getfd
 439  common  faccessat2       sys_faccessat2
 440  common  process_madvise  sys_process_madvise
+441  common  epoll_pwait2     sys_epoll_pwait2
arch/x86/entry/syscalls/syscall_32.tbl
@@ -445,3 +445,4 @@
 438  i386  pidfd_getfd      sys_pidfd_getfd
 439  i386  faccessat2       sys_faccessat2
 440  i386  process_madvise  sys_process_madvise
+441  i386  epoll_pwait2     sys_epoll_pwait2  compat_sys_epoll_pwait2
arch/x86/entry/syscalls/syscall_64.tbl
@@ -362,6 +362,7 @@
 438  common  pidfd_getfd      sys_pidfd_getfd
 439  common  faccessat2       sys_faccessat2
 440  common  process_madvise  sys_process_madvise
+441  common  epoll_pwait2     sys_epoll_pwait2
 
 #
 # Due to a historical design error, certain syscalls are numbered differently
arch/x86/kvm/x86.c
@@ -9869,7 +9869,7 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
 
         r = -ENOMEM;
 
-        page = alloc_page(GFP_KERNEL | __GFP_ZERO);
+        page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
         if (!page)
                 goto fail_free_lapic;
         vcpu->arch.pio_data = page_address(page);
arch/xtensa/kernel/syscalls/syscall.tbl
@@ -411,3 +411,4 @@
 438  common  pidfd_getfd      sys_pidfd_getfd
 439  common  faccessat2       sys_faccessat2
 440  common  process_madvise  sys_process_madvise
+441  common  epoll_pwait2     sys_epoll_pwait2
fs/eventpoll.c (289 lines changed)
@@ -389,19 +389,24 @@ static bool ep_busy_loop_end(void *p, unsigned long start_time)
  *
  * we must do our busy polling with irqs enabled
  */
-static void ep_busy_loop(struct eventpoll *ep, int nonblock)
+static bool ep_busy_loop(struct eventpoll *ep, int nonblock)
 {
         unsigned int napi_id = READ_ONCE(ep->napi_id);
 
-        if ((napi_id >= MIN_NAPI_ID) && net_busy_loop_on())
+        if ((napi_id >= MIN_NAPI_ID) && net_busy_loop_on()) {
                 napi_busy_loop(napi_id, nonblock ? NULL : ep_busy_loop_end, ep, false,
                                BUSY_POLL_BUDGET);
-}
-
-static inline void ep_reset_busy_poll_napi_id(struct eventpoll *ep)
-{
-        if (ep->napi_id)
+                if (ep_events_available(ep))
+                        return true;
+                /*
+                 * Busy poll timed out. Drop NAPI ID for now, we can add
+                 * it back in when we have moved a socket with a valid NAPI
+                 * ID onto the ready list.
+                 */
                 ep->napi_id = 0;
+                return false;
+        }
+        return false;
 }
 
 /*
@@ -441,12 +446,9 @@ static inline void ep_set_busy_poll_napi_id(struct epitem *epi)
 
 #else
 
-static inline void ep_busy_loop(struct eventpoll *ep, int nonblock)
-{
-}
-
-static inline void ep_reset_busy_poll_napi_id(struct eventpoll *ep)
+static inline bool ep_busy_loop(struct eventpoll *ep, int nonblock)
 {
+        return false;
 }
 
 static inline void ep_set_busy_poll_napi_id(struct epitem *epi)
@@ -1625,6 +1627,14 @@ static int ep_send_events(struct eventpoll *ep,
         poll_table pt;
         int res = 0;
 
+        /*
+         * Always short-circuit for fatal signals to allow threads to make a
+         * timely exit without the chance of finding more events available and
+         * fetching repeatedly.
+         */
+        if (fatal_signal_pending(current))
+                return -EINTR;
+
         init_poll_funcptr(&pt, NULL);
 
         mutex_lock(&ep->mtx);
@@ -1702,15 +1712,25 @@ static int ep_send_events(struct eventpoll *ep,
         return res;
 }
 
-static inline struct timespec64 ep_set_mstimeout(long ms)
+static struct timespec64 *ep_timeout_to_timespec(struct timespec64 *to, long ms)
 {
-        struct timespec64 now, ts = {
-                .tv_sec = ms / MSEC_PER_SEC,
-                .tv_nsec = NSEC_PER_MSEC * (ms % MSEC_PER_SEC),
-        };
+        struct timespec64 now;
+
+        if (ms < 0)
+                return NULL;
+
+        if (!ms) {
+                to->tv_sec = 0;
+                to->tv_nsec = 0;
+                return to;
+        }
+
+        to->tv_sec = ms / MSEC_PER_SEC;
+        to->tv_nsec = NSEC_PER_MSEC * (ms % MSEC_PER_SEC);
 
         ktime_get_ts64(&now);
-        return timespec64_add_safe(now, ts);
+        *to = timespec64_add_safe(now, *to);
+        return to;
 }
 
 /**
@@ -1722,8 +1742,8 @@ static inline struct timespec64 ep_set_mstimeout(long ms)
  *          stored.
  * @maxevents: Size (in terms of number of events) of the caller event buffer.
  * @timeout: Maximum timeout for the ready events fetch operation, in
- *           milliseconds. If the @timeout is zero, the function will not block,
- *           while if the @timeout is less than zero, the function will block
+ *           timespec. If the timeout is zero, the function will not block,
+ *           while if the @timeout ptr is NULL, the function will block
  *           until at least one event has been retrieved (or an error
  *           occurred).
  *
@@ -1731,55 +1751,59 @@ static inline struct timespec64 ep_set_mstimeout(long ms)
  * error code, in case of error.
  */
 static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
-                   int maxevents, long timeout)
+                   int maxevents, struct timespec64 *timeout)
 {
-        int res = 0, eavail, timed_out = 0;
+        int res, eavail, timed_out = 0;
         u64 slack = 0;
         wait_queue_entry_t wait;
         ktime_t expires, *to = NULL;
 
         lockdep_assert_irqs_enabled();
 
-        if (timeout > 0) {
-                struct timespec64 end_time = ep_set_mstimeout(timeout);
-
-                slack = select_estimate_accuracy(&end_time);
+        if (timeout && (timeout->tv_sec | timeout->tv_nsec)) {
+                slack = select_estimate_accuracy(timeout);
                 to = &expires;
-                *to = timespec64_to_ktime(end_time);
-        } else if (timeout == 0) {
+                *to = timespec64_to_ktime(*timeout);
+        } else if (timeout) {
                 /*
                  * Avoid the unnecessary trip to the wait queue loop, if the
-                 * caller specified a non blocking operation. We still need
-                 * lock because we could race and not see an epi being added
-                 * to the ready list while in irq callback. Thus incorrectly
-                 * returning 0 back to userspace.
+                 * caller specified a non blocking operation.
                  */
                 timed_out = 1;
-
-                write_lock_irq(&ep->lock);
-                eavail = ep_events_available(ep);
-                write_unlock_irq(&ep->lock);
-
-                goto send_events;
         }
 
-fetch_events:
-
-        if (!ep_events_available(ep))
-                ep_busy_loop(ep, timed_out);
-
-        eavail = ep_events_available(ep);
-        if (eavail)
-                goto send_events;
-
         /*
-         * Busy poll timed out. Drop NAPI ID for now, we can add
-         * it back in when we have moved a socket with a valid NAPI
-         * ID onto the ready list.
+         * This call is racy: We may or may not see events that are being added
+         * to the ready list under the lock (e.g., in IRQ callbacks). For, cases
+         * with a non-zero timeout, this thread will check the ready list under
+         * lock and will added to the wait queue. For, cases with a zero
+         * timeout, the user by definition should not care and will have to
+         * recheck again.
          */
-        ep_reset_busy_poll_napi_id(ep);
+        eavail = ep_events_available(ep);
 
-        do {
+        while (1) {
+                if (eavail) {
+                        /*
+                         * Try to transfer events to user space. In case we get
+                         * 0 events and there's still timeout left over, we go
+                         * trying again in search of more luck.
+                         */
+                        res = ep_send_events(ep, events, maxevents);
+                        if (res)
+                                return res;
+                }
+
+                if (timed_out)
+                        return 0;
+
+                eavail = ep_busy_loop(ep, timed_out);
+                if (eavail)
+                        continue;
+
+                if (signal_pending(current))
+                        return -EINTR;
+
                 /*
                  * Internally init_wait() uses autoremove_wake_function(),
                  * thus wait entry is removed from the wait queue on each
@@ -1809,55 +1833,38 @@ fetch_events:
                  * important.
                  */
                 eavail = ep_events_available(ep);
-                if (!eavail) {
-                        if (signal_pending(current))
-                                res = -EINTR;
-                        else
-                                __add_wait_queue_exclusive(&ep->wq, &wait);
-                }
+                if (!eavail)
+                        __add_wait_queue_exclusive(&ep->wq, &wait);
+
                 write_unlock_irq(&ep->lock);
 
-                if (eavail || res)
-                        break;
-
-                if (!schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS)) {
-                        timed_out = 1;
-                        break;
-                }
-
-                /* We were woken up, thus go and try to harvest some events */
-                eavail = 1;
-
-        } while (0);
-
-        __set_current_state(TASK_RUNNING);
-
-        if (!list_empty_careful(&wait.entry)) {
-                write_lock_irq(&ep->lock);
-                __remove_wait_queue(&ep->wq, &wait);
-                write_unlock_irq(&ep->lock);
-        }
+                if (!eavail)
+                        timed_out = !schedule_hrtimeout_range(to, slack,
+                                                              HRTIMER_MODE_ABS);
+                __set_current_state(TASK_RUNNING);
 
-send_events:
-        if (fatal_signal_pending(current)) {
                 /*
-                 * Always short-circuit for fatal signals to allow
-                 * threads to make a timely exit without the chance of
-                 * finding more events available and fetching
-                 * repeatedly.
+                 * We were woken up, thus go and try to harvest some events.
+                 * If timed out and still on the wait queue, recheck eavail
+                 * carefully under lock, below.
                  */
-                res = -EINTR;
-        }
-        /*
-         * Try to transfer events to user space. In case we get 0 events and
-         * there's still timeout left over, we go trying again in search of
-         * more luck.
-         */
-        if (!res && eavail &&
-            !(res = ep_send_events(ep, events, maxevents)) && !timed_out)
-                goto fetch_events;
+                eavail = 1;
 
-        return res;
+                if (!list_empty_careful(&wait.entry)) {
+                        write_lock_irq(&ep->lock);
+                        /*
+                         * If the thread timed out and is not on the wait queue,
+                         * it means that the thread was woken up after its
+                         * timeout expired before it could reacquire the lock.
+                         * Thus, when wait.entry is empty, it needs to harvest
+                         * events.
+                         */
+                        if (timed_out)
+                                eavail = list_empty(&wait.entry);
+                        __remove_wait_queue(&ep->wq, &wait);
+                        write_unlock_irq(&ep->lock);
+                }
+        }
 }
 
 /**
@@ -2176,7 +2183,7 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
  * part of the user space epoll_wait(2).
  */
 static int do_epoll_wait(int epfd, struct epoll_event __user *events,
-                         int maxevents, int timeout)
+                         int maxevents, struct timespec64 *to)
 {
         int error;
         struct fd f;
@@ -2210,7 +2217,7 @@ static int do_epoll_wait(int epfd, struct epoll_event __user *events,
         ep = f.file->private_data;
 
         /* Time to fish for events ... */
-        error = ep_poll(ep, events, maxevents, timeout);
+        error = ep_poll(ep, events, maxevents, to);
 
 error_fput:
         fdput(f);
@@ -2220,16 +2227,19 @@ error_fput:
 SYSCALL_DEFINE4(epoll_wait, int, epfd, struct epoll_event __user *, events,
                 int, maxevents, int, timeout)
 {
-        return do_epoll_wait(epfd, events, maxevents, timeout);
+        struct timespec64 to;
+
+        return do_epoll_wait(epfd, events, maxevents,
+                             ep_timeout_to_timespec(&to, timeout));
 }
 
 /*
  * Implement the event wait interface for the eventpoll file. It is the kernel
  * part of the user space epoll_pwait(2).
  */
-SYSCALL_DEFINE6(epoll_pwait, int, epfd, struct epoll_event __user *, events,
-                int, maxevents, int, timeout, const sigset_t __user *, sigmask,
-                size_t, sigsetsize)
+static int do_epoll_pwait(int epfd, struct epoll_event __user *events,
+                          int maxevents, struct timespec64 *to,
+                          const sigset_t __user *sigmask, size_t sigsetsize)
 {
         int error;
 
@@ -2241,18 +2251,47 @@ SYSCALL_DEFINE6(epoll_pwait, int, epfd, struct epoll_event __user *, events,
         if (error)
                 return error;
 
-        error = do_epoll_wait(epfd, events, maxevents, timeout);
+        error = do_epoll_wait(epfd, events, maxevents, to);
 
         restore_saved_sigmask_unless(error == -EINTR);
 
         return error;
 }
 
+SYSCALL_DEFINE6(epoll_pwait, int, epfd, struct epoll_event __user *, events,
+                int, maxevents, int, timeout, const sigset_t __user *, sigmask,
+                size_t, sigsetsize)
+{
+        struct timespec64 to;
+
+        return do_epoll_pwait(epfd, events, maxevents,
+                              ep_timeout_to_timespec(&to, timeout),
+                              sigmask, sigsetsize);
+}
+
+SYSCALL_DEFINE6(epoll_pwait2, int, epfd, struct epoll_event __user *, events,
+                int, maxevents, const struct __kernel_timespec __user *, timeout,
+                const sigset_t __user *, sigmask, size_t, sigsetsize)
+{
+        struct timespec64 ts, *to = NULL;
+
+        if (timeout) {
+                if (get_timespec64(&ts, timeout))
+                        return -EFAULT;
+                to = &ts;
+                if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec))
+                        return -EINVAL;
+        }
+
+        return do_epoll_pwait(epfd, events, maxevents, to,
+                              sigmask, sigsetsize);
+}
+
 #ifdef CONFIG_COMPAT
-COMPAT_SYSCALL_DEFINE6(epoll_pwait, int, epfd,
-                       struct epoll_event __user *, events,
-                       int, maxevents, int, timeout,
-                       const compat_sigset_t __user *, sigmask,
-                       compat_size_t, sigsetsize)
+static int do_compat_epoll_pwait(int epfd, struct epoll_event __user *events,
+                                 int maxevents, struct timespec64 *timeout,
+                                 const compat_sigset_t __user *sigmask,
+                                 compat_size_t sigsetsize)
 {
         long err;
 
@@ -2265,10 +2304,46 @@ COMPAT_SYSCALL_DEFINE6(epoll_pwait, int, epfd,
                 return err;
 
         err = do_epoll_wait(epfd, events, maxevents, timeout);
 
         restore_saved_sigmask_unless(err == -EINTR);
 
         return err;
 }
 
+COMPAT_SYSCALL_DEFINE6(epoll_pwait, int, epfd,
+                       struct epoll_event __user *, events,
+                       int, maxevents, int, timeout,
+                       const compat_sigset_t __user *, sigmask,
+                       compat_size_t, sigsetsize)
+{
+        struct timespec64 to;
+
+        return do_compat_epoll_pwait(epfd, events, maxevents,
+                                     ep_timeout_to_timespec(&to, timeout),
+                                     sigmask, sigsetsize);
+}
+
+COMPAT_SYSCALL_DEFINE6(epoll_pwait2, int, epfd,
+                       struct epoll_event __user *, events,
+                       int, maxevents,
+                       const struct __kernel_timespec __user *, timeout,
+                       const compat_sigset_t __user *, sigmask,
+                       compat_size_t, sigsetsize)
+{
+        struct timespec64 ts, *to = NULL;
+
+        if (timeout) {
+                if (get_timespec64(&ts, timeout))
+                        return -EFAULT;
+                to = &ts;
+                if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec))
+                        return -EINVAL;
+        }
+
+        return do_compat_epoll_pwait(epfd, events, maxevents, to,
+                                     sigmask, sigsetsize);
+}
+
 #endif
 
 static int __init eventpoll_init(void)
include/linux/compat.h
@@ -537,6 +537,12 @@ asmlinkage long compat_sys_epoll_pwait(int epfd,
                         int maxevents, int timeout,
                         const compat_sigset_t __user *sigmask,
                         compat_size_t sigsetsize);
+asmlinkage long compat_sys_epoll_pwait2(int epfd,
+                        struct epoll_event __user *events,
+                        int maxevents,
+                        const struct __kernel_timespec __user *timeout,
+                        const compat_sigset_t __user *sigmask,
+                        compat_size_t sigsetsize);
 
 /* fs/fcntl.c */
 asmlinkage long compat_sys_fcntl(unsigned int fd, unsigned int cmd,
include/linux/memcontrol.h
@@ -620,9 +620,10 @@ mem_cgroup_nodeinfo(struct mem_cgroup *memcg, int nid)
 /**
  * mem_cgroup_lruvec - get the lru list vector for a memcg & node
  * @memcg: memcg of the wanted lruvec
+ * @pgdat: pglist_data
  *
  * Returns the lru list vector holding pages for a given @memcg &
- * @node combination. This can be the node lruvec, if the memory
+ * @pgdat combination. This can be the node lruvec, if the memory
  * controller is disabled.
  */
 static inline struct lruvec *mem_cgroup_lruvec(struct mem_cgroup *memcg,
@@ -652,7 +653,21 @@ out:
         return lruvec;
 }
 
-struct lruvec *mem_cgroup_page_lruvec(struct page *, struct pglist_data *);
+/**
+ * mem_cgroup_page_lruvec - return lruvec for isolating/putting an LRU page
+ * @page: the page
+ * @pgdat: pgdat of the page
+ *
+ * This function relies on page->mem_cgroup being stable.
+ */
+static inline struct lruvec *mem_cgroup_page_lruvec(struct page *page,
+                                                    struct pglist_data *pgdat)
+{
+        struct mem_cgroup *memcg = page_memcg(page);
+
+        VM_WARN_ON_ONCE_PAGE(!memcg, page);
+        return mem_cgroup_lruvec(memcg, pgdat);
+}
 
 static inline bool lruvec_holds_page_lru_lock(struct page *page,
                                               struct lruvec *lruvec)
@@ -913,41 +928,6 @@ static inline void mod_memcg_state(struct mem_cgroup *memcg,
         local_irq_restore(flags);
 }
 
-/**
- * mod_memcg_page_state - update page state statistics
- * @page: the page
- * @idx: page state item to account
- * @val: number of pages (positive or negative)
- *
- * The @page must be locked or the caller must use lock_page_memcg()
- * to prevent double accounting when the page is concurrently being
- * moved to another memcg:
- *
- *   lock_page(page) or lock_page_memcg(page)
- *   if (TestClearPageState(page))
- *     mod_memcg_page_state(page, state, -1);
- *   unlock_page(page) or unlock_page_memcg(page)
- *
- * Kernel pages are an exception to this, since they'll never move.
- */
-static inline void __mod_memcg_page_state(struct page *page,
-                                          int idx, int val)
-{
-        struct mem_cgroup *memcg = page_memcg(page);
-
-        if (memcg)
-                __mod_memcg_state(memcg, idx, val);
-}
-
-static inline void mod_memcg_page_state(struct page *page,
-                                        int idx, int val)
-{
-        struct mem_cgroup *memcg = page_memcg(page);
-
-        if (memcg)
-                mod_memcg_state(memcg, idx, val);
-}
-
 static inline unsigned long lruvec_page_state(struct lruvec *lruvec,
                                               enum node_stat_item idx)
 {
@@ -1395,18 +1375,6 @@ static inline void mod_memcg_state(struct mem_cgroup *memcg,
 {
 }
 
-static inline void __mod_memcg_page_state(struct page *page,
-                                          int idx,
-                                          int nr)
-{
-}
-
-static inline void mod_memcg_page_state(struct page *page,
-                                        int idx,
-                                        int nr)
-{
-}
-
 static inline unsigned long lruvec_page_state(struct lruvec *lruvec,
                                               enum node_stat_item idx)
 {
@@ -1479,34 +1447,6 @@ static inline void lruvec_memcg_debug(struct lruvec *lruvec, struct page *page)
 }
 #endif /* CONFIG_MEMCG */
 
-/* idx can be of type enum memcg_stat_item or node_stat_item */
-static inline void __inc_memcg_state(struct mem_cgroup *memcg,
-                                     int idx)
-{
-        __mod_memcg_state(memcg, idx, 1);
-}
-
-/* idx can be of type enum memcg_stat_item or node_stat_item */
-static inline void __dec_memcg_state(struct mem_cgroup *memcg,
-                                     int idx)
-{
-        __mod_memcg_state(memcg, idx, -1);
-}
-
-/* idx can be of type enum memcg_stat_item or node_stat_item */
-static inline void __inc_memcg_page_state(struct page *page,
-                                          int idx)
-{
-        __mod_memcg_page_state(page, idx, 1);
-}
-
-/* idx can be of type enum memcg_stat_item or node_stat_item */
-static inline void __dec_memcg_page_state(struct page *page,
-                                          int idx)
-{
-        __mod_memcg_page_state(page, idx, -1);
-}
-
 static inline void __inc_lruvec_kmem_state(void *p, enum node_stat_item idx)
 {
         __mod_lruvec_kmem_state(p, idx, 1);
@@ -1517,34 +1457,6 @@ static inline void __dec_lruvec_kmem_state(void *p, enum node_stat_item idx)
         __mod_lruvec_kmem_state(p, idx, -1);
 }
 
-/* idx can be of type enum memcg_stat_item or node_stat_item */
-static inline void inc_memcg_state(struct mem_cgroup *memcg,
-                                   int idx)
-{
-        mod_memcg_state(memcg, idx, 1);
-}
-
-/* idx can be of type enum memcg_stat_item or node_stat_item */
-static inline void dec_memcg_state(struct mem_cgroup *memcg,
-                                   int idx)
-{
-        mod_memcg_state(memcg, idx, -1);
-}
-
-/* idx can be of type enum memcg_stat_item or node_stat_item */
-static inline void inc_memcg_page_state(struct page *page,
-                                        int idx)
-{
-        mod_memcg_page_state(page, idx, 1);
-}
-
-/* idx can be of type enum memcg_stat_item or node_stat_item */
-static inline void dec_memcg_page_state(struct page *page,
-                                        int idx)
-{
-        mod_memcg_page_state(page, idx, -1);
-}
-
 static inline struct lruvec *parent_lruvec(struct lruvec *lruvec)
 {
         struct mem_cgroup *memcg;
@@ -1733,21 +1645,6 @@ static inline void memcg_kmem_uncharge_page(struct page *page, int order)
         __memcg_kmem_uncharge_page(page, order);
 }
 
-static inline int memcg_kmem_charge(struct mem_cgroup *memcg, gfp_t gfp,
-                                    unsigned int nr_pages)
-{
-        if (memcg_kmem_enabled())
-                return __memcg_kmem_charge(memcg, gfp, nr_pages);
-        return 0;
-}
-
-static inline void memcg_kmem_uncharge(struct mem_cgroup *memcg,
-                                       unsigned int nr_pages)
-{
-        if (memcg_kmem_enabled())
-                __memcg_kmem_uncharge(memcg, nr_pages);
-}
-
 /*
  * A helper for accessing memcg's kmem_id, used for getting
  * corresponding LRU lists.
include/linux/mmdebug.h
@@ -37,6 +37,18 @@ void dump_mm(const struct mm_struct *mm);
                         BUG();                                          \
                 }                                                       \
         } while (0)
+#define VM_WARN_ON_ONCE_PAGE(cond, page)        ({                      \
+        static bool __section(".data.once") __warned;                   \
+        int __ret_warn_once = !!(cond);                                 \
+                                                                        \
+        if (unlikely(__ret_warn_once && !__warned)) {                   \
+                dump_page(page, "VM_WARN_ON_ONCE_PAGE(" __stringify(cond)")");\
+                __warned = true;                                        \
+                WARN_ON(1);                                             \
+        }                                                               \
+        unlikely(__ret_warn_once);                                      \
+})
+
 #define VM_WARN_ON(cond) (void)WARN_ON(cond)
 #define VM_WARN_ON_ONCE(cond) (void)WARN_ON_ONCE(cond)
 #define VM_WARN_ONCE(cond, format...) (void)WARN_ONCE(cond, format)
@@ -48,6 +60,7 @@ void dump_mm(const struct mm_struct *mm);
 #define VM_BUG_ON_MM(cond, mm) VM_BUG_ON(cond)
 #define VM_WARN_ON(cond) BUILD_BUG_ON_INVALID(cond)
 #define VM_WARN_ON_ONCE(cond) BUILD_BUG_ON_INVALID(cond)
+#define VM_WARN_ON_ONCE_PAGE(cond, page)  BUILD_BUG_ON_INVALID(cond)
 #define VM_WARN_ONCE(cond, format...) BUILD_BUG_ON_INVALID(cond)
 #define VM_WARN(cond, format...) BUILD_BUG_ON_INVALID(cond)
 #endif
include/linux/syscalls.h
@@ -362,6 +362,11 @@ asmlinkage long sys_epoll_pwait(int epfd, struct epoll_event __user *events,
                                 int maxevents, int timeout,
                                 const sigset_t __user *sigmask,
                                 size_t sigsetsize);
+asmlinkage long sys_epoll_pwait2(int epfd, struct epoll_event __user *events,
+                                 int maxevents,
+                                 const struct __kernel_timespec __user *timeout,
+                                 const sigset_t __user *sigmask,
+                                 size_t sigsetsize);
 
 /* fs/fcntl.c */
 asmlinkage long sys_dup(unsigned int fildes);
include/uapi/asm-generic/unistd.h
@@ -859,9 +859,11 @@ __SYSCALL(__NR_pidfd_getfd, sys_pidfd_getfd)
 __SYSCALL(__NR_faccessat2, sys_faccessat2)
 #define __NR_process_madvise 440
 __SYSCALL(__NR_process_madvise, sys_process_madvise)
+#define __NR_epoll_pwait2 441
+__SC_COMP(__NR_epoll_pwait2, sys_epoll_pwait2, compat_sys_epoll_pwait2)
 
 #undef __NR_syscalls
-#define __NR_syscalls 441
+#define __NR_syscalls 442
 
 /*
  * 32 bit systems traditionally used different
kernel/sys_ni.c
@@ -68,6 +68,8 @@ COND_SYSCALL(epoll_create1);
 COND_SYSCALL(epoll_ctl);
 COND_SYSCALL(epoll_pwait);
 COND_SYSCALL_COMPAT(epoll_pwait);
+COND_SYSCALL(epoll_pwait2);
+COND_SYSCALL_COMPAT(epoll_pwait2);
 
 /* fs/fcntl.c */
mm/Kconfig
@@ -713,7 +713,7 @@ config ZSMALLOC_STAT
         select DEBUG_FS
         help
           This option enables code in the zsmalloc to collect various
-          statistics about whats happening in zsmalloc and exports that
+          statistics about what's happening in zsmalloc and exports that
           information to userspace via debugfs.
           If unsure, say N.
mm/memcontrol.c
@@ -1342,46 +1342,6 @@ void lruvec_memcg_debug(struct lruvec *lruvec, struct page *page)
 }
 #endif
 
-/**
- * mem_cgroup_page_lruvec - return lruvec for isolating/putting an LRU page
- * @page: the page
- * @pgdat: pgdat of the page
- *
- * This function relies on page's memcg being stable - see the
- * access rules in commit_charge().
- */
-struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct pglist_data *pgdat)
-{
-        struct mem_cgroup_per_node *mz;
-        struct mem_cgroup *memcg;
-        struct lruvec *lruvec;
-
-        if (mem_cgroup_disabled()) {
-                lruvec = &pgdat->__lruvec;
-                goto out;
-        }
-
-        memcg = page_memcg(page);
-        /*
-         * Swapcache readahead pages are added to the LRU - and
-         * possibly migrated - before they are charged.
-         */
-        if (!memcg)
-                memcg = root_mem_cgroup;
-
-        mz = mem_cgroup_page_nodeinfo(memcg, page);
-        lruvec = &mz->lruvec;
-out:
-        /*
-         * Since a node can be onlined after the mem_cgroup was created,
-         * we have to be prepared to initialize lruvec->zone here;
-         * and if offlined then reonlined, we need to reinitialize it.
-         */
-        if (unlikely(lruvec->pgdat != pgdat))
-                lruvec->pgdat = pgdat;
-        return lruvec;
-}
-
 /**
  * lock_page_lruvec - lock and return lruvec for a given page.
  * @page: the page
@@ -6987,6 +6947,7 @@ void mem_cgroup_migrate(struct page *oldpage, struct page *newpage)
                 return;
 
         memcg = page_memcg(oldpage);
+        VM_WARN_ON_ONCE_PAGE(!memcg, oldpage);
         if (!memcg)
                 return;
 
@@ -7178,12 +7139,15 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
         VM_BUG_ON_PAGE(PageLRU(page), page);
         VM_BUG_ON_PAGE(page_count(page), page);
 
+        if (mem_cgroup_disabled())
+                return;
+
         if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
                 return;
 
         memcg = page_memcg(page);
 
         /* Readahead page, never charged */
+        VM_WARN_ON_ONCE_PAGE(!memcg, page);
         if (!memcg)
                 return;
 
@@ -7242,12 +7206,15 @@ int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry)
         struct mem_cgroup *memcg;
         unsigned short oldid;
 
+        if (mem_cgroup_disabled())
+                return 0;
+
         if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
                 return 0;
 
         memcg = page_memcg(page);
 
         /* Readahead page, never charged */
+        VM_WARN_ON_ONCE_PAGE(!memcg, page);
         if (!memcg)
                 return 0;
 
tools/testing/selftests/filesystems/epoll/epoll_wakeup_test.c
@@ -1,6 +1,8 @@
 // SPDX-License-Identifier: GPL-2.0
 
 #define _GNU_SOURCE
+#include <asm/unistd.h>
+#include <linux/time_types.h>
 #include <poll.h>
 #include <unistd.h>
 #include <assert.h>
@@ -21,6 +23,19 @@ struct epoll_mtcontext
         pthread_t waiter;
 };
 
+#ifndef __NR_epoll_pwait2
+#define __NR_epoll_pwait2 -1
+#endif
+
+static inline int sys_epoll_pwait2(int fd, struct epoll_event *events,
+                                   int maxevents,
+                                   const struct __kernel_timespec *timeout,
+                                   const sigset_t *sigset, size_t sigsetsize)
+{
+        return syscall(__NR_epoll_pwait2, fd, events, maxevents, timeout,
+                       sigset, sigsetsize);
+}
+
 static void signal_handler(int signum)
 {
 }
@@ -3377,4 +3392,61 @@ TEST(epoll61)
         close(ctx.evfd);
 }
 
+/* Equivalent to basic test epoll1, but exercising epoll_pwait2. */
+TEST(epoll62)
+{
+        int efd;
+        int sfd[2];
+        struct epoll_event e;
+
+        ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, sfd), 0);
+
+        efd = epoll_create(1);
+        ASSERT_GE(efd, 0);
+
+        e.events = EPOLLIN;
+        ASSERT_EQ(epoll_ctl(efd, EPOLL_CTL_ADD, sfd[0], &e), 0);
+
+        ASSERT_EQ(write(sfd[1], "w", 1), 1);
+
+        EXPECT_EQ(sys_epoll_pwait2(efd, &e, 1, NULL, NULL, 0), 1);
+        EXPECT_EQ(sys_epoll_pwait2(efd, &e, 1, NULL, NULL, 0), 1);
+
+        close(efd);
+        close(sfd[0]);
+        close(sfd[1]);
+}
+
+/* Epoll_pwait2 basic timeout test. */
+TEST(epoll63)
+{
+        const int cfg_delay_ms = 10;
+        unsigned long long tdiff;
+        struct __kernel_timespec ts;
+        int efd;
+        int sfd[2];
+        struct epoll_event e;
+
+        ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, sfd), 0);
+
+        efd = epoll_create(1);
+        ASSERT_GE(efd, 0);
+
+        e.events = EPOLLIN;
+        ASSERT_EQ(epoll_ctl(efd, EPOLL_CTL_ADD, sfd[0], &e), 0);
+
+        ts.tv_sec = 0;
+        ts.tv_nsec = cfg_delay_ms * 1000 * 1000;
+
+        tdiff = msecs();
+        EXPECT_EQ(sys_epoll_pwait2(efd, &e, 1, &ts, NULL, 0), 0);
+        tdiff = msecs() - tdiff;
+
+        EXPECT_GE(tdiff, cfg_delay_ms);
+
+        close(efd);
+        close(sfd[0]);
+        close(sfd[1]);
+}
+
 TEST_HARNESS_MAIN
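These tests follow the standard kselftest flow; assuming a kernel source tree with the usual kselftest tooling, they can be built and run with a command along these lines:

    make -C tools/testing/selftests TARGETS=filesystems/epoll run_tests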
virt/kvm/coalesced_mmio.c
@@ -111,7 +111,7 @@ int kvm_coalesced_mmio_init(struct kvm *kvm)
 {
         struct page *page;
 
-        page = alloc_page(GFP_KERNEL | __GFP_ZERO);
+        page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
         if (!page)
                 return -ENOMEM;
 
virt/kvm/kvm_main.c
@@ -3116,7 +3116,7 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id)
         }
 
         BUILD_BUG_ON(sizeof(struct kvm_run) > PAGE_SIZE);
-        page = alloc_page(GFP_KERNEL | __GFP_ZERO);
+        page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
         if (!page) {
                 r = -ENOMEM;
                 goto vcpu_free;