swap: make each swap partition have one address_space

When I use several fast SSDs to do swap, swapper_space.tree_lock is
heavily contended.  This patch makes each swap partition have its own
address_space to reduce the lock contention.  There is an array of
address_space for swap.  The swap entry type is the index into the array.

In my test with 3 SSDs, this increases the swapout throughput by 20%.

[akpm@linux-foundation.org: revert unneeded change to __add_to_swap_cache]
Signed-off-by: Shaohua Li <shli@fusionio.com>
Cc: Hugh Dickins <hughd@google.com>
Acked-by: Rik van Riel <riel@redhat.com>
Acked-by: Minchan Kim <minchan@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

Change-Id: I8503ace83342398bf7be3d2216616868cca65311
This commit is contained in:
Shaohua Li 2013-02-22 16:34:37 -08:00 committed by Artem Borisov
parent 08e00bc8bc
commit df680e4101
8 changed files with 66 additions and 34 deletions

View File

@ -40,7 +40,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
* sysctl_overcommit_ratio / 100) + total_swap_pages; * sysctl_overcommit_ratio / 100) + total_swap_pages;
cached = global_page_state(NR_FILE_PAGES) - cached = global_page_state(NR_FILE_PAGES) -
total_swapcache_pages - i.bufferram; total_swapcache_pages() - i.bufferram;
if (cached < 0) if (cached < 0)
cached = 0; cached = 0;
@ -109,7 +109,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
K(i.freeram), K(i.freeram),
K(i.bufferram), K(i.bufferram),
K(cached), K(cached),
K(total_swapcache_pages), K(total_swapcache_pages()),
K(pages[LRU_ACTIVE_ANON] + pages[LRU_ACTIVE_FILE]), K(pages[LRU_ACTIVE_ANON] + pages[LRU_ACTIVE_FILE]),
K(pages[LRU_INACTIVE_ANON] + pages[LRU_INACTIVE_FILE]), K(pages[LRU_INACTIVE_ANON] + pages[LRU_INACTIVE_FILE]),
K(pages[LRU_ACTIVE_ANON]), K(pages[LRU_ACTIVE_ANON]),

View File

@ -8,7 +8,7 @@
#include <linux/memcontrol.h> #include <linux/memcontrol.h>
#include <linux/sched.h> #include <linux/sched.h>
#include <linux/node.h> #include <linux/node.h>
#include <linux/fs.h>
#include <linux/atomic.h> #include <linux/atomic.h>
#include <asm/page.h> #include <asm/page.h>
@ -319,8 +319,9 @@ extern int swap_writepage(struct page *page, struct writeback_control *wbc);
extern void end_swap_bio_read(struct bio *bio, int err); extern void end_swap_bio_read(struct bio *bio, int err);
/* linux/mm/swap_state.c */ /* linux/mm/swap_state.c */
extern struct address_space swapper_space; extern struct address_space swapper_spaces[];
#define total_swapcache_pages swapper_space.nrpages #define swap_address_space(entry) (&swapper_spaces[swp_type(entry)])
extern unsigned long total_swapcache_pages(void);
extern void show_swap_cache_info(void); extern void show_swap_cache_info(void);
extern int add_to_swap(struct page *); extern int add_to_swap(struct page *);
extern int add_to_swap_cache(struct page *, swp_entry_t, gfp_t); extern int add_to_swap_cache(struct page *, swp_entry_t, gfp_t);
@ -388,7 +389,7 @@ mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout)
#define nr_swap_pages 0L #define nr_swap_pages 0L
#define total_swap_pages 0L #define total_swap_pages 0L
#define total_swapcache_pages 0UL #define total_swapcache_pages() 0UL
#define si_swapinfo(val) \ #define si_swapinfo(val) \
do { (val)->freeswap = (val)->totalswap = 0; } while (0) do { (val)->freeswap = (val)->totalswap = 0; } while (0)

View File

@ -5186,7 +5186,7 @@ static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
* Because lookup_swap_cache() updates some statistics counter, * Because lookup_swap_cache() updates some statistics counter,
* we call find_get_page() with swapper_space directly. * we call find_get_page() with swapper_space directly.
*/ */
page = find_get_page(&swapper_space, ent.val); page = find_get_page(swap_address_space(ent), ent.val);
if (do_swap_account) if (do_swap_account)
entry->val = ent.val; entry->val = ent.val;
@ -5229,7 +5229,7 @@ static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
swp_entry_t swap = radix_to_swp_entry(page); swp_entry_t swap = radix_to_swp_entry(page);
if (do_swap_account) if (do_swap_account)
*entry = swap; *entry = swap;
page = find_get_page(&swapper_space, swap.val); page = find_get_page(swap_address_space(swap), swap.val);
} }
#endif #endif
return page; return page;

View File

@ -75,7 +75,7 @@ static unsigned char mincore_page(struct address_space *mapping, pgoff_t pgoff)
/* shmem/tmpfs may return swap: account for swapcache page too. */ /* shmem/tmpfs may return swap: account for swapcache page too. */
if (radix_tree_exceptional_entry(page)) { if (radix_tree_exceptional_entry(page)) {
swp_entry_t swap = radix_to_swp_entry(page); swp_entry_t swap = radix_to_swp_entry(page);
page = find_get_page(&swapper_space, swap.val); page = find_get_page(swap_address_space(swap), swap.val);
} }
#endif #endif
if (page) { if (page) {
@ -135,7 +135,8 @@ static void mincore_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
} else { } else {
#ifdef CONFIG_SWAP #ifdef CONFIG_SWAP
pgoff = entry.val; pgoff = entry.val;
*vec = mincore_page(&swapper_space, pgoff); *vec = mincore_page(swap_address_space(entry),
pgoff);
#else #else
WARN_ON(1); WARN_ON(1);
*vec = 1; *vec = 1;

View File

@ -843,9 +843,14 @@ EXPORT_SYMBOL(pagevec_lookup_tag);
void __init swap_setup(void) void __init swap_setup(void)
{ {
unsigned long megs = totalram_pages >> (20 - PAGE_SHIFT); unsigned long megs = totalram_pages >> (20 - PAGE_SHIFT);
#ifdef CONFIG_SWAP #ifdef CONFIG_SWAP
bdi_init(swapper_space.backing_dev_info); int i;
bdi_init(swapper_spaces[0].backing_dev_info);
for (i = 0; i < MAX_SWAPFILES; i++) {
spin_lock_init(&swapper_spaces[i].tree_lock);
INIT_LIST_HEAD(&swapper_spaces[i].i_mmap_nonlinear);
}
#endif #endif
/* Use a smaller cluster for small-memory machines */ /* Use a smaller cluster for small-memory machines */

View File

@ -35,12 +35,12 @@ static struct backing_dev_info swap_backing_dev_info = {
.capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_SWAP_BACKED, .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_SWAP_BACKED,
}; };
struct address_space swapper_space = { struct address_space swapper_spaces[MAX_SWAPFILES] = {
.page_tree = RADIX_TREE_INIT(GFP_ATOMIC|__GFP_NOWARN), [0 ... MAX_SWAPFILES - 1] = {
.tree_lock = __SPIN_LOCK_UNLOCKED(swapper_space.tree_lock), .page_tree = RADIX_TREE_INIT(GFP_ATOMIC|__GFP_NOWARN),
.a_ops = &swap_aops, .a_ops = &swap_aops,
.i_mmap_nonlinear = LIST_HEAD_INIT(swapper_space.i_mmap_nonlinear), .backing_dev_info = &swap_backing_dev_info,
.backing_dev_info = &swap_backing_dev_info, }
}; };
#define INC_CACHE_INFO(x) do { swap_cache_info.x++; } while (0) #define INC_CACHE_INFO(x) do { swap_cache_info.x++; } while (0)
@ -52,9 +52,19 @@ static struct {
unsigned long find_total; unsigned long find_total;
} swap_cache_info; } swap_cache_info;
unsigned long total_swapcache_pages(void)
{
int i;
unsigned long ret = 0;
for (i = 0; i < MAX_SWAPFILES; i++)
ret += swapper_spaces[i].nrpages;
return ret;
}
void show_swap_cache_info(void) void show_swap_cache_info(void)
{ {
printk("%lu pages in swap cache\n", total_swapcache_pages); printk("%lu pages in swap cache\n", total_swapcache_pages());
printk("Swap cache stats: add %lu, delete %lu, find %lu/%lu\n", printk("Swap cache stats: add %lu, delete %lu, find %lu/%lu\n",
swap_cache_info.add_total, swap_cache_info.del_total, swap_cache_info.add_total, swap_cache_info.del_total,
swap_cache_info.find_success, swap_cache_info.find_total); swap_cache_info.find_success, swap_cache_info.find_total);
@ -69,6 +79,7 @@ void show_swap_cache_info(void)
static int __add_to_swap_cache(struct page *page, swp_entry_t entry) static int __add_to_swap_cache(struct page *page, swp_entry_t entry)
{ {
int error; int error;
struct address_space *address_space;
VM_BUG_ON(!PageLocked(page)); VM_BUG_ON(!PageLocked(page));
VM_BUG_ON(PageSwapCache(page)); VM_BUG_ON(PageSwapCache(page));
@ -78,14 +89,16 @@ static int __add_to_swap_cache(struct page *page, swp_entry_t entry)
SetPageSwapCache(page); SetPageSwapCache(page);
set_page_private(page, entry.val); set_page_private(page, entry.val);
spin_lock_irq(&swapper_space.tree_lock); address_space = swap_address_space(entry);
error = radix_tree_insert(&swapper_space.page_tree, entry.val, page); spin_lock_irq(&address_space->tree_lock);
error = radix_tree_insert(&address_space->page_tree,
entry.val, page);
if (likely(!error)) { if (likely(!error)) {
total_swapcache_pages++; address_space->nrpages++;
__inc_zone_page_state(page, NR_FILE_PAGES); __inc_zone_page_state(page, NR_FILE_PAGES);
INC_CACHE_INFO(add_total); INC_CACHE_INFO(add_total);
} }
spin_unlock_irq(&swapper_space.tree_lock); spin_unlock_irq(&address_space->tree_lock);
if (unlikely(error)) { if (unlikely(error)) {
/* /*
@ -121,14 +134,19 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask)
*/ */
void __delete_from_swap_cache(struct page *page) void __delete_from_swap_cache(struct page *page)
{ {
swp_entry_t entry;
struct address_space *address_space;
VM_BUG_ON(!PageLocked(page)); VM_BUG_ON(!PageLocked(page));
VM_BUG_ON(!PageSwapCache(page)); VM_BUG_ON(!PageSwapCache(page));
VM_BUG_ON(PageWriteback(page)); VM_BUG_ON(PageWriteback(page));
radix_tree_delete(&swapper_space.page_tree, page_private(page)); entry.val = page_private(page);
address_space = swap_address_space(entry);
radix_tree_delete(&address_space->page_tree, page_private(page));
set_page_private(page, 0); set_page_private(page, 0);
ClearPageSwapCache(page); ClearPageSwapCache(page);
total_swapcache_pages--; address_space->nrpages--;
__dec_zone_page_state(page, NR_FILE_PAGES); __dec_zone_page_state(page, NR_FILE_PAGES);
INC_CACHE_INFO(del_total); INC_CACHE_INFO(del_total);
} }
@ -194,12 +212,14 @@ int add_to_swap(struct page *page)
void delete_from_swap_cache(struct page *page) void delete_from_swap_cache(struct page *page)
{ {
swp_entry_t entry; swp_entry_t entry;
struct address_space *address_space;
entry.val = page_private(page); entry.val = page_private(page);
spin_lock_irq(&swapper_space.tree_lock); address_space = swap_address_space(entry);
spin_lock_irq(&address_space->tree_lock);
__delete_from_swap_cache(page); __delete_from_swap_cache(page);
spin_unlock_irq(&swapper_space.tree_lock); spin_unlock_irq(&address_space->tree_lock);
swapcache_free(entry, page); swapcache_free(entry, page);
page_cache_release(page); page_cache_release(page);
@ -262,7 +282,7 @@ struct page * lookup_swap_cache(swp_entry_t entry)
{ {
struct page *page; struct page *page;
page = find_get_page(&swapper_space, entry.val); page = find_get_page(swap_address_space(entry), entry.val);
if (page) if (page)
INC_CACHE_INFO(find_success); INC_CACHE_INFO(find_success);
@ -289,7 +309,8 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
* called after lookup_swap_cache() failed, re-calling * called after lookup_swap_cache() failed, re-calling
* that would confuse statistics. * that would confuse statistics.
*/ */
found_page = find_get_page(&swapper_space, entry.val); found_page = find_get_page(swap_address_space(entry),
entry.val);
if (found_page) if (found_page)
break; break;

View File

@ -76,7 +76,7 @@ __try_to_reclaim_swap(struct swap_info_struct *si, unsigned long offset)
struct page *page; struct page *page;
int ret = 0; int ret = 0;
page = find_get_page(&swapper_space, entry.val); page = find_get_page(swap_address_space(entry), entry.val);
if (!page) if (!page)
return 0; return 0;
/* /*
@ -735,7 +735,8 @@ int free_swap_and_cache(swp_entry_t entry)
p = swap_info_get(entry); p = swap_info_get(entry);
if (p) { if (p) {
if (swap_entry_free(p, entry, 1) == SWAP_HAS_CACHE) { if (swap_entry_free(p, entry, 1) == SWAP_HAS_CACHE) {
page = find_get_page(&swapper_space, entry.val); page = find_get_page(swap_address_space(entry),
entry.val);
if (page && !trylock_page(page)) { if (page && !trylock_page(page)) {
page_cache_release(page); page_cache_release(page);
page = NULL; page = NULL;

View File

@ -360,9 +360,12 @@ struct address_space *page_mapping(struct page *page)
VM_BUG_ON(PageSlab(page)); VM_BUG_ON(PageSlab(page));
#ifdef CONFIG_SWAP #ifdef CONFIG_SWAP
if (unlikely(PageSwapCache(page))) if (unlikely(PageSwapCache(page))) {
mapping = &swapper_space; swp_entry_t entry;
else
entry.val = page_private(page);
mapping = swap_address_space(entry);
} else
#endif #endif
if ((unsigned long)mapping & PAGE_MAPPING_ANON) if ((unsigned long)mapping & PAGE_MAPPING_ANON)
mapping = NULL; mapping = NULL;