mm: avoid taking rmap locks in move_ptes()

During mremap(), the destination VMA is generally placed after the
original VMA in rmap traversal order: in move_vma(), we always have
new_pgoff >= vma->vm_pgoff, and as a result new_vma->vm_pgoff >=
vma->vm_pgoff unless vma_merge() merged the new VMA with an adjacent one.

When the destination VMA is placed after the original in rmap traversal
order, we can avoid taking the rmap locks in move_ptes().
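
As a rough illustration of that ordering check (a standalone sketch, not
kernel code; struct vma_stub and the helper name are invented for this
example), the decision introduced in the mm/mmap.c hunk below boils down
to a single vm_pgoff comparison:

#include <stdbool.h>
#include <stdio.h>

/* Illustrative stand-in for the relevant field of struct vm_area_struct. */
struct vma_stub {
	unsigned long vm_pgoff;	/* page offset of the vma's start */
};

/*
 * Mirrors the check added to copy_vma(): the rmap locks can only be
 * skipped when the destination vma is strictly after the source vma
 * in rmap traversal order (i.e. has a larger vm_pgoff).
 */
static bool move_needs_rmap_locks(const struct vma_stub *src,
				  const struct vma_stub *dst)
{
	return dst->vm_pgoff <= src->vm_pgoff;
}

int main(void)
{
	struct vma_stub src        = { .vm_pgoff = 16 };
	struct vma_stub dst_after  = { .vm_pgoff = 32 };	/* common mremap case */
	struct vma_stub dst_merged = { .vm_pgoff = 8 };		/* vma_merge()d with a lower neighbor */

	printf("after:  need locks = %d\n", move_needs_rmap_locks(&src, &dst_after));
	printf("merged: need locks = %d\n", move_needs_rmap_locks(&src, &dst_merged));
	return 0;
}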

Essentially, this reintroduces the optimization that had been disabled in
"mm anon rmap: remove anon_vma_moveto_tail".  The difference is that we
don't try to impose the rmap traversal order; instead we just rely on
things being in the desired order in the common case and fall back to
taking locks in the uncommon case.  Also we skip the i_mmap_mutex in
addition to the anon_vma lock: in both cases, the vmas are traversed in
increasing vm_pgoff order with ties resolved in tree insertion order.
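
For context, a minimal userspace example (assumed for illustration, not
part of the patch) of the kind of mremap() call that exercises this path:
when the mapping cannot be grown in place, the kernel relocates it via
move_vma() and move_page_tables(), which with this change skip the rmap
locks in the common case.

#define _GNU_SOURCE
#include <sys/mman.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
	size_t len = 4UL << 20;		/* 4 MiB */

	char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (p == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	memset(p, 0xaa, len);		/* fault in the pages so there are ptes to move */

	/* Grow the mapping, allowing the kernel to move it if necessary. */
	char *q = mremap(p, len, 2 * len, MREMAP_MAYMOVE);
	if (q == MAP_FAILED) {
		perror("mremap");
		return 1;
	}
	printf("mapping %s (%p -> %p)\n",
	       q == p ? "grew in place" : "was moved", (void *)p, (void *)q);
	return 0;
}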

Signed-off-by: Michel Lespinasse <walken@google.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Daniel Santos <daniel.santos@pobox.com>
Cc: Hugh Dickins <hughd@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
commit 38a76013ad (parent 523d4e2008)
Author: Michel Lespinasse, 2012-10-08 16:31:50 -07:00
Committed by: Linus Torvalds
4 changed files, 49 insertions(+), 23 deletions(-)

fs/exec.c

@@ -603,7 +603,7 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift)
 	 * process cleanup to remove whatever mess we made.
 	 */
 	if (length != move_page_tables(vma, old_start,
-				       vma, new_start, length))
+				       vma, new_start, length, false))
 		return -ENOMEM;
 
 	lru_add_drain();

include/linux/mm.h

@@ -1060,7 +1060,8 @@ vm_is_stack(struct task_struct *task, struct vm_area_struct *vma, int in_group);
 
 extern unsigned long move_page_tables(struct vm_area_struct *vma,
 		unsigned long old_addr, struct vm_area_struct *new_vma,
-		unsigned long new_addr, unsigned long len);
+		unsigned long new_addr, unsigned long len,
+		bool need_rmap_locks);
 extern unsigned long do_mremap(unsigned long addr,
 			       unsigned long old_len, unsigned long new_len,
 			       unsigned long flags, unsigned long new_addr);
@@ -1410,7 +1411,8 @@ extern void __vma_link_rb(struct mm_struct *, struct vm_area_struct *,
 	struct rb_node **, struct rb_node *);
 extern void unlink_file_vma(struct vm_area_struct *);
 extern struct vm_area_struct *copy_vma(struct vm_area_struct **,
-	unsigned long addr, unsigned long len, pgoff_t pgoff);
+	unsigned long addr, unsigned long len, pgoff_t pgoff,
+	bool *need_rmap_locks);
 extern void exit_mmap(struct mm_struct *);
 
 extern int mm_take_all_locks(struct mm_struct *mm);

mm/mmap.c

@@ -2371,7 +2371,8 @@ int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma)
  * prior to moving page table entries, to effect an mremap move.
  */
 struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
-	unsigned long addr, unsigned long len, pgoff_t pgoff)
+	unsigned long addr, unsigned long len, pgoff_t pgoff,
+	bool *need_rmap_locks)
 {
 	struct vm_area_struct *vma = *vmap;
 	unsigned long vma_start = vma->vm_start;
@@ -2413,8 +2414,9 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
 			 * linear if there are no pages mapped yet.
 			 */
 			VM_BUG_ON(faulted_in_anon_vma);
-			*vmap = new_vma;
+			*vmap = vma = new_vma;
 		}
+		*need_rmap_locks = (new_vma->vm_pgoff <= vma->vm_pgoff);
 	} else {
 		new_vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
 		if (new_vma) {
@@ -2434,6 +2436,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
 			if (new_vma->vm_ops && new_vma->vm_ops->open)
 				new_vma->vm_ops->open(new_vma);
 			vma_link(mm, new_vma, prev, rb_link, rb_parent);
+			*need_rmap_locks = false;
 		}
 	}
 	return new_vma;

mm/mremap.c

@@ -71,26 +71,42 @@ static pmd_t *alloc_new_pmd(struct mm_struct *mm, struct vm_area_struct *vma,
 static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
 		unsigned long old_addr, unsigned long old_end,
 		struct vm_area_struct *new_vma, pmd_t *new_pmd,
-		unsigned long new_addr)
+		unsigned long new_addr, bool need_rmap_locks)
 {
 	struct address_space *mapping = NULL;
-	struct anon_vma *anon_vma = vma->anon_vma;
+	struct anon_vma *anon_vma = NULL;
 	struct mm_struct *mm = vma->vm_mm;
 	pte_t *old_pte, *new_pte, pte;
 	spinlock_t *old_ptl, *new_ptl;
 
-	if (vma->vm_file) {
-		/*
-		 * Subtle point from Rajesh Venkatasubramanian: before
-		 * moving file-based ptes, we must lock truncate_pagecache
-		 * out, since it might clean the dst vma before the src vma,
-		 * and we propagate stale pages into the dst afterward.
-		 */
-		mapping = vma->vm_file->f_mapping;
-		mutex_lock(&mapping->i_mmap_mutex);
+	/*
+	 * When need_rmap_locks is true, we take the i_mmap_mutex and anon_vma
+	 * locks to ensure that rmap will always observe either the old or the
+	 * new ptes. This is the easiest way to avoid races with
+	 * truncate_pagecache(), page migration, etc...
+	 *
+	 * When need_rmap_locks is false, we use other ways to avoid
+	 * such races:
+	 *
+	 * - During exec() shift_arg_pages(), we use a specially tagged vma
+	 *   which rmap call sites look for using is_vma_temporary_stack().
+	 *
+	 * - During mremap(), new_vma is often known to be placed after vma
+	 *   in rmap traversal order. This ensures rmap will always observe
+	 *   either the old pte, or the new pte, or both (the page table locks
+	 *   serialize access to individual ptes, but only rmap traversal
+	 *   order guarantees that we won't miss both the old and new ptes).
+	 */
+	if (need_rmap_locks) {
+		if (vma->vm_file) {
+			mapping = vma->vm_file->f_mapping;
+			mutex_lock(&mapping->i_mmap_mutex);
+		}
+		if (vma->anon_vma) {
+			anon_vma = vma->anon_vma;
+			anon_vma_lock(anon_vma);
+		}
 	}
-	if (anon_vma)
-		anon_vma_lock(anon_vma);
 
 	/*
 	 * We don't have to worry about the ordering of src and dst
@@ -127,7 +143,8 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
 
 unsigned long move_page_tables(struct vm_area_struct *vma,
 		unsigned long old_addr, struct vm_area_struct *new_vma,
-		unsigned long new_addr, unsigned long len)
+		unsigned long new_addr, unsigned long len,
+		bool need_rmap_locks)
 {
 	unsigned long extent, next, old_end;
 	pmd_t *old_pmd, *new_pmd;
@@ -174,7 +191,7 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
 		if (extent > LATENCY_LIMIT)
 			extent = LATENCY_LIMIT;
 		move_ptes(vma, old_pmd, old_addr, old_addr + extent,
-				new_vma, new_pmd, new_addr);
+				new_vma, new_pmd, new_addr, need_rmap_locks);
 		need_flush = true;
 	}
 	if (likely(need_flush))
@@ -198,6 +215,7 @@ static unsigned long move_vma(struct vm_area_struct *vma,
 	unsigned long hiwater_vm;
 	int split = 0;
 	int err;
+	bool need_rmap_locks;
 
 	/*
 	 * We'd prefer to avoid failure later on in do_munmap:
@@ -219,18 +237,21 @@ static unsigned long move_vma(struct vm_area_struct *vma,
 		return err;
 
 	new_pgoff = vma->vm_pgoff + ((old_addr - vma->vm_start) >> PAGE_SHIFT);
-	new_vma = copy_vma(&vma, new_addr, new_len, new_pgoff);
+	new_vma = copy_vma(&vma, new_addr, new_len, new_pgoff,
+			   &need_rmap_locks);
 	if (!new_vma)
 		return -ENOMEM;
 
-	moved_len = move_page_tables(vma, old_addr, new_vma, new_addr, old_len);
+	moved_len = move_page_tables(vma, old_addr, new_vma, new_addr, old_len,
+				     need_rmap_locks);
 	if (moved_len < old_len) {
 		/*
 		 * On error, move entries back from new area to old,
 		 * which will succeed since page tables still there,
 		 * and then proceed to unmap new area instead of old.
 		 */
-		move_page_tables(new_vma, new_addr, vma, old_addr, moved_len);
+		move_page_tables(new_vma, new_addr, vma, old_addr, moved_len,
+				 true);
 		vma = new_vma;
 		old_len = new_len;
 		old_addr = new_addr;