Commit 6b3a7077 authored by Linus Torvalds

Merge branch 'page-refs' (page ref overflow)

Merge page ref overflow branch.

Jann Horn reported that he can overflow the page ref count with
sufficient memory (and a filesystem that is intentionally extremely
slow).

Admittedly it's not exactly easy.  To have more than four billion
references to a page requires a minimum of 32GB of kernel memory just
for the pointers to the pages, much less any metadata to keep track of
those pointers.  Jann needed a total of 140GB of memory and a specially
crafted filesystem that leaves all reads pending (in order to not ever
free the page references and just keep adding more).

Still, we have a fairly straightforward way to limit the two obvious
user-controllable sources of page references: direct-IO like page
references gotten through get_user_pages(), and the splice pipe page
duplication.  So let's just do that.

* branch page-refs:
  fs: prevent page refcount overflow in pipe_buf_get
  mm: prevent get_user_pages() from overflowing page refcount
  mm: add 'try_get_page()' helper function
  mm: make page ref count overflow check tighter and more explicit
parents 4443f8e6 15fab63e
...@@ -2056,10 +2056,8 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe, ...@@ -2056,10 +2056,8 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe,
rem += pipe->bufs[(pipe->curbuf + idx) & (pipe->buffers - 1)].len; rem += pipe->bufs[(pipe->curbuf + idx) & (pipe->buffers - 1)].len;
ret = -EINVAL; ret = -EINVAL;
if (rem < len) { if (rem < len)
pipe_unlock(pipe); goto out_free;
goto out;
}
rem = len; rem = len;
while (rem) { while (rem) {
...@@ -2077,7 +2075,9 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe, ...@@ -2077,7 +2075,9 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe,
pipe->curbuf = (pipe->curbuf + 1) & (pipe->buffers - 1); pipe->curbuf = (pipe->curbuf + 1) & (pipe->buffers - 1);
pipe->nrbufs--; pipe->nrbufs--;
} else { } else {
pipe_buf_get(pipe, ibuf); if (!pipe_buf_get(pipe, ibuf))
goto out_free;
*obuf = *ibuf; *obuf = *ibuf;
obuf->flags &= ~PIPE_BUF_FLAG_GIFT; obuf->flags &= ~PIPE_BUF_FLAG_GIFT;
obuf->len = rem; obuf->len = rem;
...@@ -2100,11 +2100,11 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe, ...@@ -2100,11 +2100,11 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe,
ret = fuse_dev_do_write(fud, &cs, len); ret = fuse_dev_do_write(fud, &cs, len);
pipe_lock(pipe); pipe_lock(pipe);
out_free:
for (idx = 0; idx < nbuf; idx++) for (idx = 0; idx < nbuf; idx++)
pipe_buf_release(pipe, &bufs[idx]); pipe_buf_release(pipe, &bufs[idx]);
pipe_unlock(pipe); pipe_unlock(pipe);
out:
kvfree(bufs); kvfree(bufs);
return ret; return ret;
} }
......
...@@ -188,9 +188,9 @@ EXPORT_SYMBOL(generic_pipe_buf_steal); ...@@ -188,9 +188,9 @@ EXPORT_SYMBOL(generic_pipe_buf_steal);
* in the tee() system call, when we duplicate the buffers in one * in the tee() system call, when we duplicate the buffers in one
* pipe into another. * pipe into another.
*/ */
void generic_pipe_buf_get(struct pipe_inode_info *pipe, struct pipe_buffer *buf) bool generic_pipe_buf_get(struct pipe_inode_info *pipe, struct pipe_buffer *buf)
{ {
get_page(buf->page); return try_get_page(buf->page);
} }
EXPORT_SYMBOL(generic_pipe_buf_get); EXPORT_SYMBOL(generic_pipe_buf_get);
......
...@@ -1593,7 +1593,11 @@ static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe, ...@@ -1593,7 +1593,11 @@ static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe,
* Get a reference to this pipe buffer, * Get a reference to this pipe buffer,
* so we can copy the contents over. * so we can copy the contents over.
*/ */
pipe_buf_get(ipipe, ibuf); if (!pipe_buf_get(ipipe, ibuf)) {
if (ret == 0)
ret = -EFAULT;
break;
}
*obuf = *ibuf; *obuf = *ibuf;
/* /*
...@@ -1667,7 +1671,11 @@ static int link_pipe(struct pipe_inode_info *ipipe, ...@@ -1667,7 +1671,11 @@ static int link_pipe(struct pipe_inode_info *ipipe,
* Get a reference to this pipe buffer, * Get a reference to this pipe buffer,
* so we can copy the contents over. * so we can copy the contents over.
*/ */
pipe_buf_get(ipipe, ibuf); if (!pipe_buf_get(ipipe, ibuf)) {
if (ret == 0)
ret = -EFAULT;
break;
}
obuf = opipe->bufs + nbuf; obuf = opipe->bufs + nbuf;
*obuf = *ibuf; *obuf = *ibuf;
......
...@@ -966,6 +966,10 @@ static inline bool is_pci_p2pdma_page(const struct page *page) ...@@ -966,6 +966,10 @@ static inline bool is_pci_p2pdma_page(const struct page *page)
} }
#endif /* CONFIG_DEV_PAGEMAP_OPS */ #endif /* CONFIG_DEV_PAGEMAP_OPS */
/*
 * 127: arbitrary random number, small enough to assemble well.
 *
 * The count is cast to unsigned before the add, so the comparison is
 * true exactly when the count, read as a signed value, lies in
 * [-127, 0]: either zero, or a negative value within 127 increments
 * of wrapping back around to zero.
 */
#define page_ref_zero_or_close_to_overflow(page) \
((unsigned int) page_ref_count(page) + 127u <= 127u)
static inline void get_page(struct page *page) static inline void get_page(struct page *page)
{ {
page = compound_head(page); page = compound_head(page);
...@@ -973,8 +977,17 @@ static inline void get_page(struct page *page) ...@@ -973,8 +977,17 @@ static inline void get_page(struct page *page)
* Getting a normal page or the head of a compound page * Getting a normal page or the head of a compound page
* requires to already have an elevated page->_refcount. * requires to already have an elevated page->_refcount.
*/ */
VM_BUG_ON_PAGE(page_ref_count(page) <= 0, page); VM_BUG_ON_PAGE(page_ref_zero_or_close_to_overflow(page), page);
page_ref_inc(page);
}
static inline __must_check bool try_get_page(struct page *page)
{
page = compound_head(page);
if (WARN_ON_ONCE(page_ref_count(page) <= 0))
return false;
page_ref_inc(page); page_ref_inc(page);
return true;
} }
static inline void put_page(struct page *page) static inline void put_page(struct page *page)
......
...@@ -101,18 +101,20 @@ struct pipe_buf_operations { ...@@ -101,18 +101,20 @@ struct pipe_buf_operations {
/* /*
* Get a reference to the pipe buffer. * Get a reference to the pipe buffer.
*/ */
void (*get)(struct pipe_inode_info *, struct pipe_buffer *); bool (*get)(struct pipe_inode_info *, struct pipe_buffer *);
}; };
/** /**
* pipe_buf_get - get a reference to a pipe_buffer * pipe_buf_get - get a reference to a pipe_buffer
* @pipe: the pipe that the buffer belongs to * @pipe: the pipe that the buffer belongs to
* @buf: the buffer to get a reference to * @buf: the buffer to get a reference to
*
* Return: %true if the reference was successfully obtained.
*/ */
static inline void pipe_buf_get(struct pipe_inode_info *pipe, static inline __must_check bool pipe_buf_get(struct pipe_inode_info *pipe,
struct pipe_buffer *buf) struct pipe_buffer *buf)
{ {
buf->ops->get(pipe, buf); return buf->ops->get(pipe, buf);
} }
/** /**
...@@ -171,7 +173,7 @@ struct pipe_inode_info *alloc_pipe_info(void); ...@@ -171,7 +173,7 @@ struct pipe_inode_info *alloc_pipe_info(void);
void free_pipe_info(struct pipe_inode_info *); void free_pipe_info(struct pipe_inode_info *);
/* Generic pipe buffer ops functions */ /* Generic pipe buffer ops functions */
void generic_pipe_buf_get(struct pipe_inode_info *, struct pipe_buffer *); bool generic_pipe_buf_get(struct pipe_inode_info *, struct pipe_buffer *);
int generic_pipe_buf_confirm(struct pipe_inode_info *, struct pipe_buffer *); int generic_pipe_buf_confirm(struct pipe_inode_info *, struct pipe_buffer *);
int generic_pipe_buf_steal(struct pipe_inode_info *, struct pipe_buffer *); int generic_pipe_buf_steal(struct pipe_inode_info *, struct pipe_buffer *);
void generic_pipe_buf_release(struct pipe_inode_info *, struct pipe_buffer *); void generic_pipe_buf_release(struct pipe_inode_info *, struct pipe_buffer *);
......
...@@ -7041,12 +7041,16 @@ static void buffer_pipe_buf_release(struct pipe_inode_info *pipe, ...@@ -7041,12 +7041,16 @@ static void buffer_pipe_buf_release(struct pipe_inode_info *pipe,
buf->private = 0; buf->private = 0;
} }
static void buffer_pipe_buf_get(struct pipe_inode_info *pipe, static bool buffer_pipe_buf_get(struct pipe_inode_info *pipe,
struct pipe_buffer *buf) struct pipe_buffer *buf)
{ {
struct buffer_ref *ref = (struct buffer_ref *)buf->private; struct buffer_ref *ref = (struct buffer_ref *)buf->private;
if (ref->ref > INT_MAX/2)
return false;
ref->ref++; ref->ref++;
return true;
} }
/* Pipe buffer operations for a buffer. */ /* Pipe buffer operations for a buffer. */
......
...@@ -160,8 +160,12 @@ static struct page *follow_page_pte(struct vm_area_struct *vma, ...@@ -160,8 +160,12 @@ static struct page *follow_page_pte(struct vm_area_struct *vma,
goto retry; goto retry;
} }
if (flags & FOLL_GET) if (flags & FOLL_GET) {
get_page(page); if (unlikely(!try_get_page(page))) {
page = ERR_PTR(-ENOMEM);
goto out;
}
}
if (flags & FOLL_TOUCH) { if (flags & FOLL_TOUCH) {
if ((flags & FOLL_WRITE) && if ((flags & FOLL_WRITE) &&
!pte_dirty(pte) && !PageDirty(page)) !pte_dirty(pte) && !PageDirty(page))
...@@ -298,7 +302,10 @@ static struct page *follow_pmd_mask(struct vm_area_struct *vma, ...@@ -298,7 +302,10 @@ static struct page *follow_pmd_mask(struct vm_area_struct *vma,
if (pmd_trans_unstable(pmd)) if (pmd_trans_unstable(pmd))
ret = -EBUSY; ret = -EBUSY;
} else { } else {
get_page(page); if (unlikely(!try_get_page(page))) {
spin_unlock(ptl);
return ERR_PTR(-ENOMEM);
}
spin_unlock(ptl); spin_unlock(ptl);
lock_page(page); lock_page(page);
ret = split_huge_page(page); ret = split_huge_page(page);
...@@ -500,7 +507,10 @@ static int get_gate_page(struct mm_struct *mm, unsigned long address, ...@@ -500,7 +507,10 @@ static int get_gate_page(struct mm_struct *mm, unsigned long address,
if (is_device_public_page(*page)) if (is_device_public_page(*page))
goto unmap; goto unmap;
} }
get_page(*page); if (unlikely(!try_get_page(*page))) {
ret = -ENOMEM;
goto unmap;
}
out: out:
ret = 0; ret = 0;
unmap: unmap:
...@@ -1545,6 +1555,20 @@ static void undo_dev_pagemap(int *nr, int nr_start, struct page **pages) ...@@ -1545,6 +1555,20 @@ static void undo_dev_pagemap(int *nr, int nr_start, struct page **pages)
} }
} }
/*
 * Return the compound head page with ref appropriately incremented,
 * or NULL if that failed.
 *
 * @page: any page of the (possibly compound) page to pin.
 * @refs: number of references to add to the head page.
 *
 * Two failure paths both yield NULL:
 *  - the head's ref count is already negative (trips WARN_ON_ONCE);
 *    presumably this means the count has overflowed - confirm against
 *    the page_ref_count() definition;
 *  - page_cache_add_speculative() reports that it could not add the
 *    references.
 */
static inline struct page *try_get_compound_head(struct page *page, int refs)
{
struct page *head = compound_head(page);
/* Refuse to take more references on a head whose count is negative. */
if (WARN_ON_ONCE(page_ref_count(head) < 0))
return NULL;
/* Take all @refs references on the head in one speculative add. */
if (unlikely(!page_cache_add_speculative(head, refs)))
return NULL;
return head;
}
#ifdef CONFIG_ARCH_HAS_PTE_SPECIAL #ifdef CONFIG_ARCH_HAS_PTE_SPECIAL
static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end, static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
int write, struct page **pages, int *nr) int write, struct page **pages, int *nr)
...@@ -1579,9 +1603,9 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end, ...@@ -1579,9 +1603,9 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
VM_BUG_ON(!pfn_valid(pte_pfn(pte))); VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
page = pte_page(pte); page = pte_page(pte);
head = compound_head(page);
if (!page_cache_get_speculative(head)) head = try_get_compound_head(page, 1);
if (!head)
goto pte_unmap; goto pte_unmap;
if (unlikely(pte_val(pte) != pte_val(*ptep))) { if (unlikely(pte_val(pte) != pte_val(*ptep))) {
...@@ -1720,8 +1744,8 @@ static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr, ...@@ -1720,8 +1744,8 @@ static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
refs++; refs++;
} while (addr += PAGE_SIZE, addr != end); } while (addr += PAGE_SIZE, addr != end);
head = compound_head(pmd_page(orig)); head = try_get_compound_head(pmd_page(orig), refs);
if (!page_cache_add_speculative(head, refs)) { if (!head) {
*nr -= refs; *nr -= refs;
return 0; return 0;
} }
...@@ -1758,8 +1782,8 @@ static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr, ...@@ -1758,8 +1782,8 @@ static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr,
refs++; refs++;
} while (addr += PAGE_SIZE, addr != end); } while (addr += PAGE_SIZE, addr != end);
head = compound_head(pud_page(orig)); head = try_get_compound_head(pud_page(orig), refs);
if (!page_cache_add_speculative(head, refs)) { if (!head) {
*nr -= refs; *nr -= refs;
return 0; return 0;
} }
...@@ -1795,8 +1819,8 @@ static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr, ...@@ -1795,8 +1819,8 @@ static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr,
refs++; refs++;
} while (addr += PAGE_SIZE, addr != end); } while (addr += PAGE_SIZE, addr != end);
head = compound_head(pgd_page(orig)); head = try_get_compound_head(pgd_page(orig), refs);
if (!page_cache_add_speculative(head, refs)) { if (!head) {
*nr -= refs; *nr -= refs;
return 0; return 0;
} }
......
...@@ -4299,6 +4299,19 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, ...@@ -4299,6 +4299,19 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
pfn_offset = (vaddr & ~huge_page_mask(h)) >> PAGE_SHIFT; pfn_offset = (vaddr & ~huge_page_mask(h)) >> PAGE_SHIFT;
page = pte_page(huge_ptep_get(pte)); page = pte_page(huge_ptep_get(pte));
/*
* Instead of doing 'try_get_page()' below in the same_page
* loop, just check the count once here.
*/
if (unlikely(page_count(page) <= 0)) {
if (pages) {
spin_unlock(ptl);
remainder = 0;
err = -ENOMEM;
break;
}
}
same_page: same_page:
if (pages) { if (pages) {
pages[i] = mem_map_offset(page, pfn_offset); pages[i] = mem_map_offset(page, pfn_offset);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment