Merge branch 'page-refs' (page ref overflow)
Merge page ref overflow branch.

Jann Horn reported that he can overflow the page ref count with sufficient memory (and a filesystem that is intentionally extremely slow).

Admittedly it's not exactly easy. To have more than four billion references to a page requires a minimum of 32GB of kernel memory just for the pointers to the pages, much less any metadata to keep track of those pointers. Jann needed a total of 140GB of memory and a specially crafted filesystem that leaves all reads pending (in order to not ever free the page references and just keep adding more).

Still, we have a fairly straightforward way to limit the two obvious user-controllable sources of page references: direct-IO like page references gotten through get_user_pages(), and the splice pipe page duplication. So let's just do that.

* branch page-refs:
  fs: prevent page refcount overflow in pipe_buf_get
  mm: prevent get_user_pages() from overflowing page refcount
  mm: add 'try_get_page()' helper function
  mm: make page ref count overflow check tighter and more explicit
commit 6b3a707736
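The tightened check this merge adds to get_page() (the include/linux/mm.h hunk below) relies on unsigned wraparound: (unsigned int) page_ref_count(page) + 127u <= 127u is true exactly when the 32-bit count is 0, or when it has wrapped past INT_MAX into negative territory and climbed back to within 127 increments of wrapping around to 0. A minimal standalone sketch of that arithmetic in ordinary userspace C (illustration only, not kernel code):

/* demo_overflow_check.c - userspace sketch of the mm.h macro's arithmetic */
#include <limits.h>
#include <stdio.h>

/* same expression as page_ref_zero_or_close_to_overflow() */
static int zero_or_close_to_overflow(int count)
{
	return (unsigned int)count + 127u <= 127u;
}

int main(void)
{
	int samples[] = { INT_MIN, -128, -127, -1, 0, 1, INT_MAX };
	unsigned int i;

	for (i = 0; i < sizeof(samples) / sizeof(samples[0]); i++)
		printf("count=%11d -> %s\n", samples[i],
		       zero_or_close_to_overflow(samples[i]) ? "trips" : "ok");
	return 0;
}

Only 0 and the signed range -127..-1 trip the check; every healthy positive count, up to and including INT_MAX, passes. That is why try_get_page() below separately refuses any count that is already zero or negative.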
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
@@ -2056,10 +2056,8 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe,
 		rem += pipe->bufs[(pipe->curbuf + idx) & (pipe->buffers - 1)].len;
 
 	ret = -EINVAL;
-	if (rem < len) {
-		pipe_unlock(pipe);
-		goto out;
-	}
+	if (rem < len)
+		goto out_free;
 
 	rem = len;
 	while (rem) {
@@ -2077,7 +2075,9 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe,
 			pipe->curbuf = (pipe->curbuf + 1) & (pipe->buffers - 1);
 			pipe->nrbufs--;
 		} else {
-			pipe_buf_get(pipe, ibuf);
+			if (!pipe_buf_get(pipe, ibuf))
+				goto out_free;
+
 			*obuf = *ibuf;
 			obuf->flags &= ~PIPE_BUF_FLAG_GIFT;
 			obuf->len = rem;
@@ -2100,11 +2100,11 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe,
 	ret = fuse_dev_do_write(fud, &cs, len);
 
 	pipe_lock(pipe);
+out_free:
 	for (idx = 0; idx < nbuf; idx++)
 		pipe_buf_release(pipe, &bufs[idx]);
 	pipe_unlock(pipe);
 
-out:
 	kvfree(bufs);
 	return ret;
 }
diff --git a/fs/pipe.c b/fs/pipe.c
@@ -188,9 +188,9 @@ EXPORT_SYMBOL(generic_pipe_buf_steal);
  *	in the tee() system call, when we duplicate the buffers in one
  *	pipe into another.
  */
-void generic_pipe_buf_get(struct pipe_inode_info *pipe, struct pipe_buffer *buf)
+bool generic_pipe_buf_get(struct pipe_inode_info *pipe, struct pipe_buffer *buf)
 {
-	get_page(buf->page);
+	return try_get_page(buf->page);
 }
 EXPORT_SYMBOL(generic_pipe_buf_get);
 
diff --git a/fs/splice.c b/fs/splice.c
@@ -1593,7 +1593,11 @@ retry:
 			 *	Get a reference to this pipe buffer,
 			 *	so we can copy the contents over.
 			 */
-			pipe_buf_get(ipipe, ibuf);
+			if (!pipe_buf_get(ipipe, ibuf)) {
+				if (ret == 0)
+					ret = -EFAULT;
+				break;
+			}
 			*obuf = *ibuf;
 
 			/*
@@ -1667,7 +1671,11 @@ static int link_pipe(struct pipe_inode_info *ipipe,
 		 *	Get a reference to this pipe buffer,
 		 *	so we can copy the contents over.
 		 */
-		pipe_buf_get(ipipe, ibuf);
+		if (!pipe_buf_get(ipipe, ibuf)) {
+			if (ret == 0)
+				ret = -EFAULT;
+			break;
+		}
 
 		obuf = opipe->bufs + nbuf;
 		*obuf = *ibuf;
diff --git a/include/linux/mm.h b/include/linux/mm.h
@@ -966,6 +966,10 @@ static inline bool is_pci_p2pdma_page(const struct page *page)
 }
 #endif /* CONFIG_DEV_PAGEMAP_OPS */
 
+/* 127: arbitrary random number, small enough to assemble well */
+#define page_ref_zero_or_close_to_overflow(page) \
+	((unsigned int) page_ref_count(page) + 127u <= 127u)
+
 static inline void get_page(struct page *page)
 {
 	page = compound_head(page);
@@ -973,10 +977,19 @@ static inline void get_page(struct page *page)
 	 * Getting a normal page or the head of a compound page
 	 * requires to already have an elevated page->_refcount.
 	 */
-	VM_BUG_ON_PAGE(page_ref_count(page) <= 0, page);
+	VM_BUG_ON_PAGE(page_ref_zero_or_close_to_overflow(page), page);
 	page_ref_inc(page);
 }
 
+static inline __must_check bool try_get_page(struct page *page)
+{
+	page = compound_head(page);
+	if (WARN_ON_ONCE(page_ref_count(page) <= 0))
+		return false;
+	page_ref_inc(page);
+	return true;
+}
+
 static inline void put_page(struct page *page)
 {
 	page = compound_head(page);
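Because try_get_page() is declared __must_check, every caller grows an explicit failure path instead of silently incrementing a refcount that may already have wrapped; the mm/gup.c hunks further down all follow the same shape. A hypothetical standalone sketch of that caller contract (the stand-in struct page and pin_page() are invented for illustration, not kernel code):

#include <stdbool.h>
#include <stdio.h>

#define ENOMEM 12

struct page { int _refcount; };	/* stand-in, not the kernel's struct page */

/* same contract as the kernel helper: refuse zero or negative counts */
static bool try_get_page(struct page *page)
{
	if (page->_refcount <= 0)
		return false;
	page->_refcount++;
	return true;
}

/* hypothetical caller, mirroring the FOLL_GET path in the gup.c hunks */
static int pin_page(struct page *page)
{
	if (!try_get_page(page))
		return -ENOMEM;
	return 0;
}

int main(void)
{
	struct page healthy = { 1 };
	struct page wrapped = { -5 };	/* count has overflowed past INT_MAX */

	printf("healthy: %d\n", pin_page(&healthy));	/* prints 0 */
	printf("wrapped: %d\n", pin_page(&wrapped));	/* prints -12 */
	return 0;
}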
diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h
@@ -101,18 +101,20 @@ struct pipe_buf_operations {
 	/*
 	 * Get a reference to the pipe buffer.
 	 */
-	void (*get)(struct pipe_inode_info *, struct pipe_buffer *);
+	bool (*get)(struct pipe_inode_info *, struct pipe_buffer *);
 };
 
 /**
  * pipe_buf_get - get a reference to a pipe_buffer
  * @pipe:	the pipe that the buffer belongs to
  * @buf:	the buffer to get a reference to
+ *
+ * Return: %true if the reference was successfully obtained.
  */
-static inline void pipe_buf_get(struct pipe_inode_info *pipe,
+static inline __must_check bool pipe_buf_get(struct pipe_inode_info *pipe,
 				struct pipe_buffer *buf)
 {
-	buf->ops->get(pipe, buf);
+	return buf->ops->get(pipe, buf);
 }
 
 /**
@@ -171,7 +173,7 @@ struct pipe_inode_info *alloc_pipe_info(void);
 void free_pipe_info(struct pipe_inode_info *);
 
 /* Generic pipe buffer ops functions */
-void generic_pipe_buf_get(struct pipe_inode_info *, struct pipe_buffer *);
+bool generic_pipe_buf_get(struct pipe_inode_info *, struct pipe_buffer *);
 int generic_pipe_buf_confirm(struct pipe_inode_info *, struct pipe_buffer *);
 int generic_pipe_buf_steal(struct pipe_inode_info *, struct pipe_buffer *);
 void generic_pipe_buf_release(struct pipe_inode_info *, struct pipe_buffer *);
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
@@ -7041,12 +7041,16 @@ static void buffer_pipe_buf_release(struct pipe_inode_info *pipe,
 	buf->private = 0;
 }
 
-static void buffer_pipe_buf_get(struct pipe_inode_info *pipe,
+static bool buffer_pipe_buf_get(struct pipe_inode_info *pipe,
 				struct pipe_buffer *buf)
 {
 	struct buffer_ref *ref = (struct buffer_ref *)buf->private;
 
+	if (ref->ref > INT_MAX/2)
+		return false;
+
 	ref->ref++;
+	return true;
 }
 
 /* Pipe buffer operations for a buffer. */
diff --git a/mm/gup.c b/mm/gup.c
@@ -160,8 +160,12 @@ retry:
 		goto retry;
 	}
 
-	if (flags & FOLL_GET)
-		get_page(page);
+	if (flags & FOLL_GET) {
+		if (unlikely(!try_get_page(page))) {
+			page = ERR_PTR(-ENOMEM);
+			goto out;
+		}
+	}
 	if (flags & FOLL_TOUCH) {
 		if ((flags & FOLL_WRITE) &&
 		    !pte_dirty(pte) && !PageDirty(page))
@@ -298,7 +302,10 @@ retry_locked:
 		if (pmd_trans_unstable(pmd))
 			ret = -EBUSY;
 	} else {
-		get_page(page);
+		if (unlikely(!try_get_page(page))) {
+			spin_unlock(ptl);
+			return ERR_PTR(-ENOMEM);
+		}
 		spin_unlock(ptl);
 		lock_page(page);
 		ret = split_huge_page(page);
@@ -500,7 +507,10 @@ static int get_gate_page(struct mm_struct *mm, unsigned long address,
 		if (is_device_public_page(*page))
 			goto unmap;
 	}
-	get_page(*page);
+	if (unlikely(!try_get_page(*page))) {
+		ret = -ENOMEM;
+		goto unmap;
+	}
 out:
 	ret = 0;
 unmap:
@@ -1545,6 +1555,20 @@ static void undo_dev_pagemap(int *nr, int nr_start, struct page **pages)
 	}
 }
 
+/*
+ * Return the compund head page with ref appropriately incremented,
+ * or NULL if that failed.
+ */
+static inline struct page *try_get_compound_head(struct page *page, int refs)
+{
+	struct page *head = compound_head(page);
+	if (WARN_ON_ONCE(page_ref_count(head) < 0))
+		return NULL;
+	if (unlikely(!page_cache_add_speculative(head, refs)))
+		return NULL;
+	return head;
+}
+
 #ifdef CONFIG_ARCH_HAS_PTE_SPECIAL
 static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
 			 int write, struct page **pages, int *nr)
@@ -1579,9 +1603,9 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
 
 		VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
 		page = pte_page(pte);
-		head = compound_head(page);
 
-		if (!page_cache_get_speculative(head))
+		head = try_get_compound_head(page, 1);
+		if (!head)
 			goto pte_unmap;
 
 		if (unlikely(pte_val(pte) != pte_val(*ptep))) {
@@ -1720,8 +1744,8 @@ static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
 		refs++;
 	} while (addr += PAGE_SIZE, addr != end);
 
-	head = compound_head(pmd_page(orig));
-	if (!page_cache_add_speculative(head, refs)) {
+	head = try_get_compound_head(pmd_page(orig), refs);
+	if (!head) {
 		*nr -= refs;
 		return 0;
 	}
@@ -1758,8 +1782,8 @@ static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr,
 		refs++;
 	} while (addr += PAGE_SIZE, addr != end);
 
-	head = compound_head(pud_page(orig));
-	if (!page_cache_add_speculative(head, refs)) {
+	head = try_get_compound_head(pud_page(orig), refs);
+	if (!head) {
 		*nr -= refs;
 		return 0;
 	}
@@ -1795,8 +1819,8 @@ static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr,
 		refs++;
 	} while (addr += PAGE_SIZE, addr != end);
 
-	head = compound_head(pgd_page(orig));
-	if (!page_cache_add_speculative(head, refs)) {
+	head = try_get_compound_head(pgd_page(orig), refs);
+	if (!head) {
 		*nr -= refs;
 		return 0;
 	}
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
@@ -4299,6 +4299,19 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
 
 		pfn_offset = (vaddr & ~huge_page_mask(h)) >> PAGE_SHIFT;
 		page = pte_page(huge_ptep_get(pte));
+
+		/*
+		 * Instead of doing 'try_get_page()' below in the same_page
+		 * loop, just check the count once here.
+		 */
+		if (unlikely(page_count(page) <= 0)) {
+			if (pages) {
+				spin_unlock(ptl);
+				remainder = 0;
+				err = -ENOMEM;
+				break;
+			}
+		}
same_page:
 		if (pages) {
 			pages[i] = mem_map_offset(page, pfn_offset);