mm: Backport ZCache from QC kernel 3.18

Change-Id: I3edff3a56cf6525f13430ab93309272d1faecfe1
Signed-off-by: Kevin F. Haggerty <haggertk@lineageos.org>
This commit is contained in:
Steve Kondik 2016-04-14 02:52:21 -07:00 committed by Francescodario Cuzzocrea
parent c780c66637
commit b23d0a2d3c
16 changed files with 1778 additions and 3084 deletions

View File

@ -82,8 +82,6 @@ source "drivers/staging/iio/Kconfig"
source "drivers/staging/zram/Kconfig"
source "drivers/staging/zcache/Kconfig"
source "drivers/staging/zsmalloc/Kconfig"
source "drivers/staging/wlags49_h2/Kconfig"

View File

@ -33,7 +33,6 @@ obj-$(CONFIG_VME_BUS) += vme/
obj-$(CONFIG_DX_SEP) += sep/
obj-$(CONFIG_IIO) += iio/
obj-$(CONFIG_ZRAM) += zram/
obj-$(CONFIG_ZCACHE) += zcache/
obj-$(CONFIG_ZSMALLOC) += zsmalloc/
obj-$(CONFIG_WLAGS49_H2) += wlags49_h2/
obj-$(CONFIG_WLAGS49_H25) += wlags49_h25/

View File

@ -41,6 +41,7 @@
#include <linux/delay.h>
#include <linux/swap.h>
#include <linux/fs.h>
#include <linux/zcache.h>
#include <linux/ratelimit.h>
@ -199,7 +200,7 @@ static int lowmem_shrink(struct shrinker *s, struct shrink_control *sc)
is_active_high = (global_page_state(NR_ACTIVE_FILE) >
global_page_state(NR_INACTIVE_FILE)) ? 1 : 0;
#endif
other_file = global_page_state(NR_FILE_PAGES);
other_file = global_page_state(NR_FILE_PAGES) + zcache_pages();
#if defined(CONFIG_CMA_PAGE_COUNTING) && defined(CONFIG_EXCLUDE_LRU_LIVING_IN_CMA)
if (get_nr_swap_pages() < SSWAP_LMK_THRESHOLD && cma_page_ratio >= CMA_PAGE_RATIO
@ -446,7 +447,7 @@ static int android_oom_handler(struct notifier_block *nb,
nr_cma_inactive_file = global_page_state(NR_CMA_INACTIVE_FILE);
nr_cma_active_file = global_page_state(NR_CMA_ACTIVE_FILE);
other_file = global_page_state(NR_FILE_PAGES) -
other_file = global_page_state(NR_FILE_PAGES) + zcache_pages() -
global_page_state(NR_SHMEM) -
total_swapcache_pages -
nr_cma_inactive_file -

View File

@ -1,14 +0,0 @@
config ZCACHE
bool "Dynamic compression of swap pages and clean pagecache pages"
# X86 dependency is because zsmalloc uses non-portable pte/tlb
# functions
depends on (CLEANCACHE || FRONTSWAP) && CRYPTO=y && X86
select ZSMALLOC
select CRYPTO_LZO
default n
help
Zcache doubles RAM efficiency while providing a significant
performance boost on many workloads. Zcache uses
compression and an in-kernel implementation of transcendent
memory to store clean page cache pages and swap in RAM,
providing a noticeable reduction in disk I/O.

View File

@ -1,3 +0,0 @@
zcache-y := zcache-main.o tmem.o
obj-$(CONFIG_ZCACHE) += zcache.o

View File

@ -1,770 +0,0 @@
/*
* In-kernel transcendent memory (generic implementation)
*
* Copyright (c) 2009-2011, Dan Magenheimer, Oracle Corp.
*
* The primary purpose of Transcendent Memory ("tmem") is to map object-oriented
* "handles" (triples containing a pool id, and object id, and an index), to
* pages in a page-accessible memory (PAM). Tmem references the PAM pages via
* an abstract "pampd" (PAM page-descriptor), which can be operated on by a
* set of functions (pamops). Each pampd contains some representation of
* PAGE_SIZE bytes worth of data. Tmem must support potentially millions of
* pages and must be able to insert, find, and delete these pages at a
* potential frequency of thousands per second concurrently across many CPUs,
* (and, if used with KVM, across many vcpus across many guests).
* Tmem is tracked with a hierarchy of data structures, organized by
* the elements in a handle-tuple: pool_id, object_id, and page index.
* One or more "clients" (e.g. guests) each provide one or more tmem_pools.
* Each pool, contains a hash table of rb_trees of tmem_objs. Each
* tmem_obj contains a radix-tree-like tree of pointers, with intermediate
* nodes called tmem_objnodes. Each leaf pointer in this tree points to
* a pampd, which is accessible only through a small set of callbacks
* registered by the PAM implementation (see tmem_register_pamops). Tmem
* does all memory allocation via a set of callbacks registered by the tmem
* host implementation (e.g. see tmem_register_hostops).
*/
#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/atomic.h>
#include "tmem.h"
/* data structure sentinels used for debugging... see tmem.h */
#define POOL_SENTINEL 0x87658765
#define OBJ_SENTINEL 0x12345678
#define OBJNODE_SENTINEL 0xfedcba09
/*
 * tmem_register_hostops - install the host's allocation callbacks.
 * @m: callback table supplied by the tmem host implementation; its
 *     contents are copied into the file-local tmem_hostops.
 *
 * Registration also fills in the objnode height-to-max-index table,
 * so this must be called before any objnode tree is used.
 */
static struct tmem_hostops tmem_hostops;
static void tmem_objnode_tree_init(void);

void tmem_register_hostops(struct tmem_hostops *m)
{
	/* keep a private copy of the caller's callback table */
	tmem_hostops = *m;
	/* independent of the copy above: precompute the h2max table */
	tmem_objnode_tree_init();
}
/*
 * A tmem host implementation must use this function to register
 * callbacks for a page-accessible memory (PAM) implementation.
 *
 * The table pointed to by m is copied by value into the file-local
 * tmem_pamops; all later pampd operations in this file go through
 * that copy.
 */
static struct tmem_pamops tmem_pamops;
void tmem_register_pamops(struct tmem_pamops *m)
{
tmem_pamops = *m;
}
/*
* Oid's are potentially very sparse and tmem_objs may have an indeterminately
* short life, being added and deleted at a relatively high frequency.
* So an rb_tree is an ideal data structure to manage tmem_objs. But because
* of the potentially huge number of tmem_objs, each pool manages a hashtable
* of rb_trees to reduce search, insert, delete, and rebalancing time.
* Each hashbucket also has a lock to manage concurrent access.
*
* The following routines manage tmem_objs. When any tmem_obj is accessed,
* the hashbucket lock must be held.
*/
/*
 * Search the rb_tree of hashbucket @hb for the object whose oid equals
 * *@oidp.  Returns the matching tmem_obj, or NULL when the tree holds no
 * such object.  Per the rule stated for this group of routines, the
 * caller is expected to hold the hashbucket lock.
 */
static struct tmem_obj *tmem_obj_find(struct tmem_hashbucket *hb,
					struct tmem_oid *oidp)
{
	struct rb_node *node = hb->obj_rb_root.rb_node;

	while (node != NULL) {
		struct tmem_obj *candidate;
		int cmp;

		BUG_ON(RB_EMPTY_NODE(node));
		candidate = rb_entry(node, struct tmem_obj, rb_tree_node);
		cmp = tmem_oid_compare(oidp, &candidate->oid);
		if (cmp == 0)
			return candidate;	/* exact match */
		if (cmp == -1)
			node = node->rb_left;
		else if (cmp == 1)
			node = node->rb_right;
	}
	return NULL;
}
static void tmem_pampd_destroy_all_in_obj(struct tmem_obj *);
/*
 * Tear down an object that has no more pampds in it and unlink it from
 * the rb_tree of hashbucket hb.
 *
 * This does not release the tmem_obj's memory: every caller in this file
 * follows it with (*tmem_hostops.obj_free)(obj, pool).
 * BUGs if obj is NULL, still has pampds, or has no pool.
 */
static void tmem_obj_free(struct tmem_obj *obj, struct tmem_hashbucket *hb)
{
struct tmem_pool *pool;
BUG_ON(obj == NULL);
ASSERT_SENTINEL(obj, OBJ);
BUG_ON(obj->pampd_count > 0);
pool = obj->pool;
BUG_ON(pool == NULL);
if (obj->objnode_tree_root != NULL) /* may be "stump" with no leaves */
tmem_pampd_destroy_all_in_obj(obj);
/* after the cleanup above no objnodes may remain */
BUG_ON(obj->objnode_tree_root != NULL);
BUG_ON((long)obj->objnode_count != 0);
atomic_dec(&pool->obj_count);
BUG_ON(atomic_read(&pool->obj_count) < 0);
/* mark the object as dead before unlinking it */
INVERT_SENTINEL(obj, OBJ);
obj->pool = NULL;
tmem_oid_set_invalid(&obj->oid);
rb_erase(&obj->rb_tree_node, &hb->obj_rb_root);
}
/*
 * Initialize a freshly allocated tmem_obj and insert it into the rb_tree
 * of hashbucket @hb, keyed by *@oidp (called only if find failed).
 *
 * Bumps @pool's object count, resets the object's tree and counters,
 * lets the PAM implementation see the new object via new_obj(), and then
 * links it into the tree.  Finding an object with the same oid already
 * in the tree is a BUG.
 */
static void tmem_obj_init(struct tmem_obj *obj, struct tmem_hashbucket *hb,
					struct tmem_pool *pool,
					struct tmem_oid *oidp)
{
	struct rb_root *tree = &hb->obj_rb_root;
	struct rb_node **link = &tree->rb_node;
	struct rb_node *parent = NULL;

	BUG_ON(pool == NULL);
	atomic_inc(&pool->obj_count);

	obj->objnode_tree_height = 0;
	obj->objnode_tree_root = NULL;
	obj->pool = pool;
	obj->oid = *oidp;
	obj->objnode_count = 0;
	obj->pampd_count = 0;
	(*tmem_pamops.new_obj)(obj);
	SET_SENTINEL(obj, OBJ);

	/* descend to the empty link where the new node belongs */
	while (*link != NULL) {
		struct tmem_obj *cur;
		int cmp;

		BUG_ON(RB_EMPTY_NODE(*link));
		cur = rb_entry(*link, struct tmem_obj, rb_tree_node);
		parent = *link;
		cmp = tmem_oid_compare(oidp, &cur->oid);
		if (cmp == 0)
			BUG(); /* already present; should never happen! */
		else if (cmp == -1)
			link = &(*link)->rb_left;
		else if (cmp == 1)
			link = &(*link)->rb_right;
	}
	rb_link_node(&obj->rb_tree_node, parent, link);
	rb_insert_color(&obj->rb_tree_node, tree);
}
/*
* Tmem is managed as a set of tmem_pools with certain attributes, such as
* "ephemeral" vs "persistent". These attributes apply to all tmem_objs
* and all pampds that belong to a tmem_pool. A tmem_pool is created
* or deleted relatively rarely (for example, when a filesystem is
* mounted or unmounted).
*/
/*
 * Flush all data from a pool and, optionally, unlink it from its pool list.
 *
 * @pool:    pool to empty; must not be NULL (BUG otherwise).
 * @destroy: when true, the pool is also removed from the list it was
 *           linked into via pool->pool_list.
 *
 * Each hashbucket is processed under its own lock: every object has all
 * of its pampds destroyed, is unlinked by tmem_obj_free(), and is then
 * handed back to the host through obj_free().  The tmem_pool structure
 * itself is not freed here.
 */
static void tmem_pool_flush(struct tmem_pool *pool, bool destroy)
{
	struct rb_node *rbnode;
	struct tmem_obj *obj;
	struct tmem_hashbucket *hb;
	int i;

	/* validate pool before forming any pointer into it */
	BUG_ON(pool == NULL);
	for (i = 0; i < TMEM_HASH_BUCKETS; i++) {
		hb = &pool->hashbucket[i];
		spin_lock(&hb->lock);
		rbnode = rb_first(&hb->obj_rb_root);
		while (rbnode != NULL) {
			obj = rb_entry(rbnode, struct tmem_obj, rb_tree_node);
			/* step first: tmem_obj_free() erases obj's rb node */
			rbnode = rb_next(rbnode);
			tmem_pampd_destroy_all_in_obj(obj);
			tmem_obj_free(obj, hb);
			(*tmem_hostops.obj_free)(obj, pool);
		}
		spin_unlock(&hb->lock);
	}
	if (destroy)
		list_del(&pool->pool_list);
}
/*
* A tmem_obj contains a radix-tree-like tree in which the intermediate
* nodes are called tmem_objnodes. (The kernel lib/radix-tree.c implementation
* is very specialized and tuned for specific uses and is not particularly
* suited for use from this code, though some code from the core algorithms has
* been reused, thus the copyright notices below). Each tmem_objnode contains
* a set of pointers which point to either a set of intermediate tmem_objnodes
* or a set of pampds.
*
* Portions Copyright (C) 2001 Momchil Velikov
* Portions Copyright (C) 2001 Christoph Hellwig
* Portions Copyright (C) 2005 SGI, Christoph Lameter <clameter@sgi.com>
*/
/*
 * One step of a root-to-leaf walk, recorded by tmem_pampd_delete_from_obj()
 * so that it can walk back up and free objnodes that became empty.
 */
struct tmem_objnode_tree_path {
struct tmem_objnode *objnode; /* node visited at this level (NULL marks the start of the path) */
int offset; /* index into objnode->slots[] that the walk followed */
};
/*
 * Height-to-maximum-index table: entry [h] is the largest index that an
 * objnode tree of height h can address.
 */
static unsigned long tmem_objnode_tree_h2max[OBJNODE_TREE_MAX_PATH + 1];

/*
 * Fill in tmem_objnode_tree_h2max[].  A tree of height h addresses
 * h * OBJNODE_TREE_MAP_SHIFT index bits; once that reaches the width of an
 * unsigned long the entry saturates at ~0UL.
 */
static void tmem_objnode_tree_init(void)
{
	unsigned int height;

	for (height = 0; height < ARRAY_SIZE(tmem_objnode_tree_h2max);
	     height++) {
		unsigned int bits = height * OBJNODE_TREE_MAP_SHIFT;
		unsigned long maxindex = ~0UL;

		/* shift in two steps so a zero-bit tree never shifts by the full width */
		if (bits < OBJNODE_TREE_INDEX_BITS)
			maxindex = (~0UL >>
				(OBJNODE_TREE_INDEX_BITS - bits - 1)) >> 1;
		tmem_objnode_tree_h2max[height] = maxindex;
	}
}
/*
 * Obtain a new, empty objnode for @obj from the host's objnode_alloc()
 * callback.  On success the node is tied to @obj, all of its slots are
 * cleared and @obj's objnode count is incremented.  Returns NULL if the
 * host could not supply a node.
 */
static struct tmem_objnode *tmem_objnode_alloc(struct tmem_obj *obj)
{
	struct tmem_objnode *node;

	ASSERT_SENTINEL(obj, OBJ);
	BUG_ON(obj->pool == NULL);
	ASSERT_SENTINEL(obj->pool, POOL);

	node = (*tmem_hostops.objnode_alloc)(obj->pool);
	if (unlikely(node == NULL))
		return NULL;

	node->obj = obj;
	SET_SENTINEL(node, OBJNODE);
	memset(&node->slots, 0, sizeof(node->slots));
	node->slots_in_use = 0;
	obj->objnode_count++;
	return node;
}
static void tmem_objnode_free(struct tmem_objnode *objnode)
{
struct tmem_pool *pool;
int i;
BUG_ON(objnode == NULL);
for (i = 0; i < OBJNODE_TREE_MAP_SIZE; i++)
BUG_ON(objnode->slots[i] != NULL);
ASSERT_SENTINEL(objnode, OBJNODE);
INVERT_SENTINEL(objnode, OBJNODE);
BUG_ON(objnode->obj == NULL);
ASSERT_SENTINEL(objnode->obj, OBJ);
pool = objnode->obj->pool;
BUG_ON(pool == NULL);
ASSERT_SENTINEL(pool, POOL);
objnode->obj->objnode_count--;
objnode->obj = NULL;
(*tmem_hostops.objnode_free)(objnode, pool);
}
/*
 * Locate the tree slot that corresponds to @index in @obj.
 *
 * Returns a pointer to the slot, not the pampd itself, so callers can
 * read or overwrite the entry.  Returns NULL when @index is beyond what
 * the current tree height can address.  A non-NULL return may point at
 * a slot holding NULL (the walk stops at the first empty link), so
 * callers must check the slot's contents as well.
 */
static void **__tmem_pampd_lookup_in_obj(struct tmem_obj *obj, uint32_t index)
{
	struct tmem_objnode **slot;
	unsigned int levels, shift;

	BUG_ON(obj == NULL);
	ASSERT_SENTINEL(obj, OBJ);
	BUG_ON(obj->pool == NULL);
	ASSERT_SENTINEL(obj->pool, POOL);

	levels = obj->objnode_tree_height;
	if (index > tmem_objnode_tree_h2max[obj->objnode_tree_height])
		return NULL;

	slot = &obj->objnode_tree_root;
	/* height 0: the root pointer itself is the only slot */
	if (levels == 0)
		return (void **)slot;

	shift = (levels - 1) * OBJNODE_TREE_MAP_SHIFT;
	for (; levels > 0; levels--) {
		if (*slot == NULL)
			break;
		slot = (struct tmem_objnode **)
			((*slot)->slots +
			 ((index >> shift) & OBJNODE_TREE_MAP_MASK));
		shift -= OBJNODE_TREE_MAP_SHIFT;
	}
	return (void **)slot;
}
/*
 * Return the pampd stored at @index in @obj, or NULL if there is none.
 */
static void *tmem_pampd_lookup_in_obj(struct tmem_obj *obj, uint32_t index)
{
	void **slotp = __tmem_pampd_lookup_in_obj(obj, index);

	if (slotp == NULL)
		return NULL;
	return *slotp;
}
/*
 * Swap the pampd stored at @index in @obj for @new_pampd and release the
 * old one through the PAM free() callback.  Returns @new_pampd on success,
 * or NULL (leaving the tree untouched) when no pampd exists at @index.
 */
static void *tmem_pampd_replace_in_obj(struct tmem_obj *obj, uint32_t index,
					void *new_pampd)
{
	void **slotp = __tmem_pampd_lookup_in_obj(obj, index);
	void *old_pampd;

	if (slotp == NULL || *slotp == NULL)
		return NULL;

	old_pampd = *slotp;
	*slotp = new_pampd;
	(*tmem_pamops.free)(old_pampd, obj->pool, NULL, 0);
	return new_pampd;
}
/*
 * Insert pampd at position index in obj's objnode tree.
 *
 * Returns 0 on success or -ENOMEM if an objnode could not be allocated;
 * in the latter case some objnodes may already have been added.
 * It is a BUG if a pampd is already stored at index.
 */
static int tmem_pampd_add_to_obj(struct tmem_obj *obj, uint32_t index,
void *pampd)
{
int ret = 0;
struct tmem_objnode *objnode = NULL, *newnode, *slot;
unsigned int height, shift;
int offset = 0;
/* if necessary, extend the tree to be higher */
if (index > tmem_objnode_tree_h2max[obj->objnode_tree_height]) {
height = obj->objnode_tree_height + 1;
if (index > tmem_objnode_tree_h2max[height])
while (index > tmem_objnode_tree_h2max[height])
height++;
/* empty tree: just record the needed height, nodes are added below */
if (obj->objnode_tree_root == NULL) {
obj->objnode_tree_height = height;
goto insert;
}
/* stack new roots on top; the old root becomes slot 0 of each */
do {
newnode = tmem_objnode_alloc(obj);
if (!newnode) {
ret = -ENOMEM;
goto out;
}
newnode->slots[0] = obj->objnode_tree_root;
newnode->slots_in_use = 1;
obj->objnode_tree_root = newnode;
obj->objnode_tree_height++;
} while (height > obj->objnode_tree_height);
}
insert:
slot = obj->objnode_tree_root;
height = obj->objnode_tree_height;
shift = (height-1) * OBJNODE_TREE_MAP_SHIFT;
/* walk from the root to the leaf level, creating missing nodes */
while (height > 0) {
if (slot == NULL) {
/* add a child objnode. */
slot = tmem_objnode_alloc(obj);
if (!slot) {
ret = -ENOMEM;
goto out;
}
if (objnode) {
objnode->slots[offset] = slot;
objnode->slots_in_use++;
} else
obj->objnode_tree_root = slot;
}
/* go down a level */
offset = (index >> shift) & OBJNODE_TREE_MAP_MASK;
objnode = slot;
slot = objnode->slots[offset];
shift -= OBJNODE_TREE_MAP_SHIFT;
height--;
}
/* the target slot must be empty */
BUG_ON(slot != NULL);
if (objnode) {
objnode->slots_in_use++;
objnode->slots[offset] = pampd;
} else
/* height 0: the pampd is stored directly in the root pointer */
obj->objnode_tree_root = pampd;
obj->pampd_count++;
out:
return ret;
}
/*
 * Remove the pampd stored at index from obj's objnode tree and return it,
 * or return NULL if nothing is stored there.
 *
 * The pampd itself is not freed; that is left to the caller.  Objnodes
 * left empty by the removal are freed, and the tree height is reduced
 * while the root has only slot 0 in use.  obj->pampd_count is decremented
 * when a pampd was removed.
 */
static void *tmem_pampd_delete_from_obj(struct tmem_obj *obj, uint32_t index)
{
struct tmem_objnode_tree_path path[OBJNODE_TREE_MAX_PATH + 1];
struct tmem_objnode_tree_path *pathp = path;
struct tmem_objnode *slot = NULL;
unsigned int height, shift;
int offset;
BUG_ON(obj == NULL);
ASSERT_SENTINEL(obj, OBJ);
BUG_ON(obj->pool == NULL);
ASSERT_SENTINEL(obj->pool, POOL);
height = obj->objnode_tree_height;
/* index not addressable at the current height: nothing to delete */
if (index > tmem_objnode_tree_h2max[height])
goto out;
slot = obj->objnode_tree_root;
/* height 0: the root pointer is the pampd itself */
if (height == 0 && obj->objnode_tree_root) {
obj->objnode_tree_root = NULL;
goto out;
}
shift = (height - 1) * OBJNODE_TREE_MAP_SHIFT;
/* path[0] is a terminator for the upward walk below */
pathp->objnode = NULL;
/* descend to the leaf, recording each node and the slot taken */
do {
if (slot == NULL)
goto out;
pathp++;
offset = (index >> shift) & OBJNODE_TREE_MAP_MASK;
pathp->offset = offset;
pathp->objnode = slot;
slot = slot->slots[offset];
shift -= OBJNODE_TREE_MAP_SHIFT;
height--;
} while (height > 0);
if (slot == NULL)
goto out;
/* walk back up, clearing the slot and freeing nodes that became empty */
while (pathp->objnode) {
pathp->objnode->slots[pathp->offset] = NULL;
pathp->objnode->slots_in_use--;
if (pathp->objnode->slots_in_use) {
if (pathp->objnode == obj->objnode_tree_root) {
/* shrink the tree while the root only uses slot 0 */
while (obj->objnode_tree_height > 0 &&
obj->objnode_tree_root->slots_in_use == 1 &&
obj->objnode_tree_root->slots[0]) {
struct tmem_objnode *to_free =
obj->objnode_tree_root;
obj->objnode_tree_root =
to_free->slots[0];
obj->objnode_tree_height--;
to_free->slots[0] = NULL;
to_free->slots_in_use = 0;
tmem_objnode_free(to_free);
}
}
goto out;
}
tmem_objnode_free(pathp->objnode); /* 0 slots used, free it */
pathp--;
}
/* every node on the path was freed: the tree is now empty */
obj->objnode_tree_height = 0;
obj->objnode_tree_root = NULL;
out:
if (slot != NULL)
obj->pampd_count--;
BUG_ON(obj->pampd_count < 0);
return slot;
}
/*
 * Recursively empty the subtree rooted at @objnode, which sits @ht levels
 * above the pampds.  At the lowest level (@ht == 1) each occupied slot is a
 * pampd and is released through the PAM free() callback; at higher levels
 * each occupied slot is a child objnode that is emptied and then freed.
 * @objnode itself is left for the caller to free.
 */
static void tmem_objnode_node_destroy(struct tmem_obj *obj,
					struct tmem_objnode *objnode,
					unsigned int ht)
{
	int idx;

	if (ht == 0)
		return;

	for (idx = 0; idx < OBJNODE_TREE_MAP_SIZE; idx++) {
		void *entry = objnode->slots[idx];

		if (entry == NULL)
			continue;
		if (ht == 1) {
			obj->pampd_count--;
			(*tmem_pamops.free)(entry, obj->pool, NULL, 0);
		} else {
			tmem_objnode_node_destroy(obj, entry, ht - 1);
			tmem_objnode_free(entry);
		}
		objnode->slots[idx] = NULL;
	}
}
/*
 * Release every pampd and objnode held by @obj and notify the PAM
 * implementation through free_obj().  Does nothing, including no
 * free_obj() call, when the object's tree is already empty.
 */
static void tmem_pampd_destroy_all_in_obj(struct tmem_obj *obj)
{
	struct tmem_objnode *root = obj->objnode_tree_root;

	if (root == NULL)
		return;

	if (obj->objnode_tree_height != 0) {
		tmem_objnode_node_destroy(obj, root,
					obj->objnode_tree_height);
		tmem_objnode_free(root);
		obj->objnode_tree_height = 0;
	} else {
		/* height 0: the root pointer is a single pampd */
		obj->pampd_count--;
		(*tmem_pamops.free)(root, obj->pool, NULL, 0);
	}
	obj->objnode_tree_root = NULL;
	(*tmem_pamops.free_obj)(obj->pool, obj);
}
/*
* Tmem is operated on by a set of well-defined actions:
* "put", "get", "flush", "flush_object", "new pool" and "destroy pool".
* (The tmem ABI allows for subpages and exchanges but these operations
* are not included in this implementation.)
*
* These "tmem core" operations are implemented in the following functions.
*/
/*
* "Put" a page, e.g. copy a page from the kernel into newly allocated
* PAM space (if such space is available). Tmem_put is complicated by
* a corner case: What if a page with matching handle already exists in
* tmem? To guarantee coherency, one of two actions is necessary: Either
* the data for the page must be overwritten, or the page must be
* "flushed" so that the data is not accessible to a subsequent "get".
* Since these "duplicate puts" are relatively rare, this implementation
* always flushes for simplicity.
*/
/*
 * Returns 0 on success.  Returns -ENOMEM if the object could not be
 * allocated, if the PAM create() callback returned NULL, or if the
 * objnode tree could not be extended.  The whole operation runs under
 * the lock of the hashbucket selected by the oid hash.
 */
int tmem_put(struct tmem_pool *pool, struct tmem_oid *oidp, uint32_t index,
char *data, size_t size, bool raw, bool ephemeral)
{
/*
 * obj is the object being worked on; exactly one of objfound/objnew
 * aliases it.  objnew marks an object that must be freed again if the
 * put fails (newly created, or emptied by the dup-put flush below).
 */
struct tmem_obj *obj = NULL, *objfound = NULL, *objnew = NULL;
void *pampd = NULL, *pampd_del = NULL;
int ret = -ENOMEM;
struct tmem_hashbucket *hb;
hb = &pool->hashbucket[tmem_oid_hash(oidp)];
spin_lock(&hb->lock);
obj = objfound = tmem_obj_find(hb, oidp);
if (obj != NULL) {
pampd = tmem_pampd_lookup_in_obj(objfound, index);
if (pampd != NULL) {
/* if found, is a dup put, flush the old one */
pampd_del = tmem_pampd_delete_from_obj(obj, index);
BUG_ON(pampd_del != pampd);
(*tmem_pamops.free)(pampd, pool, oidp, index);
/* object is now empty: treat it like a new one on failure */
if (obj->pampd_count == 0) {
objnew = obj;
objfound = NULL;
}
pampd = NULL;
}
} else {
obj = objnew = (*tmem_hostops.obj_alloc)(pool);
if (unlikely(obj == NULL)) {
ret = -ENOMEM;
goto out;
}
tmem_obj_init(obj, hb, pool, oidp);
}
BUG_ON(obj == NULL);
BUG_ON(((objnew != obj) && (objfound != obj)) || (objnew == objfound));
pampd = (*tmem_pamops.create)(data, size, raw, ephemeral,
obj->pool, &obj->oid, index);
if (unlikely(pampd == NULL))
goto free;
ret = tmem_pampd_add_to_obj(obj, index, pampd);
if (unlikely(ret == -ENOMEM))
/* may have partially built objnode tree ("stump") */
goto delete_and_free;
goto out;
delete_and_free:
(void)tmem_pampd_delete_from_obj(obj, index);
free:
if (pampd)
(*tmem_pamops.free)(pampd, pool, NULL, 0);
/* drop the object again if this put created or emptied it */
if (objnew) {
tmem_obj_free(objnew, hb);
(*tmem_hostops.obj_free)(objnew, pool);
}
out:
spin_unlock(&hb->lock);
return ret;
}
/*
* "Get" a page, e.g. if one can be found, copy the tmem page with the
* matching handle from PAM space to the kernel. By tmem definition,
* when a "get" is successful on an ephemeral page, the page is "flushed",
* and when a "get" is successful on a persistent page, the page is retained
* in tmem. Note that to preserve
* coherency, "get" can never be skipped if tmem contains the data.
* That is, if a get is done with a certain handle and fails, any
* subsequent "get" must also fail (unless of course there is a
* "put" done with the same handle).
*/
/*
 * get_and_free selects whether the page is removed from tmem:
 * 1 always removes it, 0 removes it only for ephemeral pools, and any
 * other value leaves it in place.
 *
 * Returns 0 on success, -1 if no object or page matches the handle, or
 * the negative value returned by the PAM get_data()/get_data_and_free()
 * callback.
 */
int tmem_get(struct tmem_pool *pool, struct tmem_oid *oidp, uint32_t index,
char *data, size_t *size, bool raw, int get_and_free)
{
struct tmem_obj *obj;
void *pampd;
bool ephemeral = is_ephemeral(pool);
int ret = -1;
struct tmem_hashbucket *hb;
bool free = (get_and_free == 1) || ((get_and_free == 0) && ephemeral);
bool lock_held = false;
hb = &pool->hashbucket[tmem_oid_hash(oidp)];
spin_lock(&hb->lock);
lock_held = true;
obj = tmem_obj_find(hb, oidp);
if (obj == NULL)
goto out;
/* when freeing, unlink the pampd from the tree before fetching its data */
if (free)
pampd = tmem_pampd_delete_from_obj(obj, index);
else
pampd = tmem_pampd_lookup_in_obj(obj, index);
if (pampd == NULL)
goto out;
if (free) {
/* that was the object's last page: release the object too */
if (obj->pampd_count == 0) {
tmem_obj_free(obj, hb);
(*tmem_hostops.obj_free)(obj, pool);
obj = NULL;
}
}
/* for a remote pampd the bucket lock is dropped before the data callback runs */
if (tmem_pamops.is_remote(pampd)) {
lock_held = false;
spin_unlock(&hb->lock);
}
if (free)
ret = (*tmem_pamops.get_data_and_free)(
data, size, raw, pampd, pool, oidp, index);
else
ret = (*tmem_pamops.get_data)(
data, size, raw, pampd, pool, oidp, index);
if (ret < 0)
goto out;
/* any non-negative callback result is reported as plain success */
ret = 0;
out:
if (lock_held)
spin_unlock(&hb->lock);
return ret;
}
/*
 * "Flush" the page matching the handle (pool, oid, index), if any, so
 * that a later "get" with the same handle fails until a new "put" is
 * done.  The object is released as well once its last page is gone.
 *
 * Returns 0 if a page was flushed, -1 if no matching object or page
 * exists.
 */
int tmem_flush_page(struct tmem_pool *pool,
				struct tmem_oid *oidp, uint32_t index)
{
	struct tmem_hashbucket *bucket =
		&pool->hashbucket[tmem_oid_hash(oidp)];
	struct tmem_obj *obj;
	void *pampd = NULL;
	int ret = -1;

	spin_lock(&bucket->lock);
	obj = tmem_obj_find(bucket, oidp);
	if (obj != NULL)
		pampd = tmem_pampd_delete_from_obj(obj, index);
	if (pampd != NULL) {
		(*tmem_pamops.free)(pampd, pool, oidp, index);
		if (obj->pampd_count == 0) {
			tmem_obj_free(obj, bucket);
			(*tmem_hostops.obj_free)(obj, pool);
		}
		ret = 0;
	}
	spin_unlock(&bucket->lock);
	return ret;
}
/*
 * Replace the page matching the handle with @new_pampd so that any
 * subsequent "get" sees the new page.
 *
 * Returns -1 if no object matches the oid.  Otherwise the result is
 * whatever the PAM replace_in_obj() callback returns; that callback is
 * handed @new_pampd when a page was replaced and NULL when the object had
 * no page at @index.
 */
int tmem_replace(struct tmem_pool *pool, struct tmem_oid *oidp,
			uint32_t index, void *new_pampd)
{
	struct tmem_hashbucket *bucket =
		&pool->hashbucket[tmem_oid_hash(oidp)];
	struct tmem_obj *obj;
	int ret = -1;

	spin_lock(&bucket->lock);
	obj = tmem_obj_find(bucket, oidp);
	if (obj != NULL) {
		new_pampd = tmem_pampd_replace_in_obj(obj, index, new_pampd);
		ret = (*tmem_pamops.replace_in_obj)(new_pampd, obj);
	}
	spin_unlock(&bucket->lock);
	return ret;
}
/*
 * "Flush" all pages in tmem matching this oid and release the object.
 *
 * Returns 0 if the object existed, -1 otherwise.
 */
int tmem_flush_object(struct tmem_pool *pool, struct tmem_oid *oidp)
{
	struct tmem_hashbucket *bucket =
		&pool->hashbucket[tmem_oid_hash(oidp)];
	struct tmem_obj *obj;
	int ret = -1;

	spin_lock(&bucket->lock);
	obj = tmem_obj_find(bucket, oidp);
	if (obj != NULL) {
		tmem_pampd_destroy_all_in_obj(obj);
		tmem_obj_free(obj, bucket);
		(*tmem_hostops.obj_free)(obj, pool);
		ret = 0;
	}
	spin_unlock(&bucket->lock);
	return ret;
}
/*
 * "Flush" all pages (and tmem_objs) from this tmem_pool and unlink the
 * pool from the pool list.
 *
 * Returns 0 on success, -1 if @pool is NULL.
 */
int tmem_destroy_pool(struct tmem_pool *pool)
{
	if (pool == NULL)
		return -1;

	tmem_pool_flush(pool, 1);
	return 0;
}
/* every pool set up by tmem_new_pool() is linked onto this list */
static LIST_HEAD(tmem_global_pool_list);

/*
 * Initialize a caller-provided tmem_pool: empty rb_tree and fresh lock in
 * every hashbucket, zero object count, and the persistent/shared
 * attributes taken from the TMEM_POOL_PERSIST / TMEM_POOL_SHARED bits of
 * @flags.  The pool is appended to tmem_global_pool_list.
 */
void tmem_new_pool(struct tmem_pool *pool, uint32_t flags)
{
	int bucket;

	for (bucket = 0; bucket < TMEM_HASH_BUCKETS; bucket++) {
		struct tmem_hashbucket *hb = &pool->hashbucket[bucket];

		hb->obj_rb_root = RB_ROOT;
		spin_lock_init(&hb->lock);
	}

	INIT_LIST_HEAD(&pool->pool_list);
	atomic_set(&pool->obj_count, 0);
	SET_SENTINEL(pool, POOL);
	list_add_tail(&pool->pool_list, &tmem_global_pool_list);

	pool->persistent = (flags & TMEM_POOL_PERSIST) != 0;
	pool->shared = (flags & TMEM_POOL_SHARED) != 0;
}

View File

@ -1,206 +0,0 @@
/*
* tmem.h
*
* Transcendent memory
*
* Copyright (c) 2009-2011, Dan Magenheimer, Oracle Corp.
*/
#ifndef _TMEM_H_
#define _TMEM_H_
#include <linux/types.h>
#include <linux/highmem.h>
#include <linux/hash.h>
#include <linux/atomic.h>
/*
* These are pre-defined by the Xen<->Linux ABI
*/
/* NOTE(review): presumably the tmem operation codes of that ABI; confirm against the ABI definition */
#define TMEM_PUT_PAGE 4
#define TMEM_GET_PAGE 5
#define TMEM_FLUSH_PAGE 6
#define TMEM_FLUSH_OBJECT 7
/* pool flag bits; tmem_new_pool() tests PERSIST and SHARED in its flags argument */
#define TMEM_POOL_PERSIST 1
#define TMEM_POOL_SHARED 2
#define TMEM_POOL_PRECOMPRESSED 4
/* NOTE(review): looks like the position/width of a page-size field and a reserved-bits mask within the pool flags word; confirm */
#define TMEM_POOL_PAGESIZE_SHIFT 4
#define TMEM_POOL_PAGESIZE_MASK 0xf
#define TMEM_POOL_RESERVED_BITS 0x00ffff00
/*
 * Sentinels have proven very useful for debugging but can be removed
 * or disabled before final merge.
 *
 * DECL_SENTINEL adds a "sentinel" member to a structure.  The other
 * macros take a pointer expression _x and a name _y; the expected value
 * is the constant named <_y>_SENTINEL (e.g. OBJ -> OBJ_SENTINEL).
 * SET_ stores the value, INVERT_ stores its bitwise complement (used to
 * mark a structure as dead), and the ASSERT_ variants WARN when the
 * stored value does not match.
 *
 * _x is parenthesized in every expansion so that any pointer expression
 * (for example &var) can be passed safely.
 */
#define SENTINELS
#ifdef SENTINELS
#define DECL_SENTINEL uint32_t sentinel;
#define SET_SENTINEL(_x, _y) ((_x)->sentinel = _y##_SENTINEL)
#define INVERT_SENTINEL(_x, _y) ((_x)->sentinel = ~_y##_SENTINEL)
#define ASSERT_SENTINEL(_x, _y) WARN_ON((_x)->sentinel != _y##_SENTINEL)
#define ASSERT_INVERTED_SENTINEL(_x, _y) \
	WARN_ON((_x)->sentinel != ~_y##_SENTINEL)
#else
#define DECL_SENTINEL
#define SET_SENTINEL(_x, _y) do { } while (0)
#define INVERT_SENTINEL(_x, _y) do { } while (0)
#define ASSERT_SENTINEL(_x, _y) do { } while (0)
#define ASSERT_INVERTED_SENTINEL(_x, _y) do { } while (0)
#endif
#define ASSERT_SPINLOCK(_l) lockdep_assert_held(_l)
/*
* A pool is the highest-level data structure managed by tmem and
* usually corresponds to a large independent set of pages such as
* a filesystem. Each pool has an id, and certain attributes and counters.
* It also contains a set of hash buckets, each of which contains an rbtree
* of objects and a lock to manage concurrency within the pool.
*/
/* number of hash bits used to pick a bucket; see tmem_oid_hash() */
#define TMEM_HASH_BUCKET_BITS 8
/* number of hashbuckets in every pool */
#define TMEM_HASH_BUCKETS (1<<TMEM_HASH_BUCKET_BITS)
/* one bucket of a pool's object hash table */
struct tmem_hashbucket {
struct rb_root obj_rb_root; /* rb_tree of the tmem_objs that hash to this bucket */
spinlock_t lock; /* manages concurrent access to this bucket */
};
struct tmem_pool {
void *client; /* "up" for some clients, avoids table lookup */
struct list_head pool_list; /* links the pool onto a list of pools (set up by tmem_new_pool) */
uint32_t pool_id; /* NOTE(review): presumably assigned by the tmem host; not set by the tmem core */
bool persistent; /* set from the TMEM_POOL_PERSIST flag in tmem_new_pool() */
bool shared; /* set from the TMEM_POOL_SHARED flag in tmem_new_pool() */
atomic_t obj_count; /* number of tmem_objs currently in this pool */
atomic_t refcount; /* NOTE(review): not touched by the tmem core; presumably for the host's use */
struct tmem_hashbucket hashbucket[TMEM_HASH_BUCKETS]; /* object hash table, indexed by tmem_oid_hash() */
DECL_SENTINEL
};
/*
 * Pool attribute tests.  _p is a pointer to a struct tmem_pool; it is
 * parenthesized so that any pointer expression may be passed.
 */
#define is_persistent(_p) ((_p)->persistent)
#define is_ephemeral(_p) (!((_p)->persistent))
/*
 * An object id ("oid") is large: 192-bits (to ensure, for example, files
 * in a modern filesystem can be uniquely identified).
 */
struct tmem_oid {
	uint64_t oid[3];
};

/*
 * Mark *oidp as invalid by setting all 192 bits.
 *
 * The all-ones value is built as a uint64_t: a plain -1UL is only 32 bits
 * wide where unsigned long is 32 bits and would leave the upper half of
 * each word zero.
 */
static inline void tmem_oid_set_invalid(struct tmem_oid *oidp)
{
	oidp->oid[0] = oidp->oid[1] = oidp->oid[2] = ~(uint64_t)0;
}

/* Return true unless *oidp holds the all-ones "invalid" value. */
static inline bool tmem_oid_valid(struct tmem_oid *oidp)
{
	return oidp->oid[0] != ~(uint64_t)0 ||
		oidp->oid[1] != ~(uint64_t)0 ||
		oidp->oid[2] != ~(uint64_t)0;
}

/*
 * Three-way comparison of two oids, most significant word (oid[2]) first.
 * Returns -1 if *left orders before *right, 1 if after, 0 if equal.
 */
static inline int tmem_oid_compare(struct tmem_oid *left,
					struct tmem_oid *right)
{
	int word;

	for (word = 2; word >= 0; word--) {
		if (left->oid[word] < right->oid[word])
			return -1;
		if (left->oid[word] > right->oid[word])
			return 1;
	}
	return 0;
}
/*
 * Map an oid to a hashbucket index: the three oid words are XOR-folded
 * together and reduced to TMEM_HASH_BUCKET_BITS bits with hash_long().
 */
static inline unsigned tmem_oid_hash(struct tmem_oid *oidp)
{
	uint64_t folded = oidp->oid[0];

	folded ^= oidp->oid[1];
	folded ^= oidp->oid[2];
	return hash_long(folded, TMEM_HASH_BUCKET_BITS);
}
/*
* A tmem_obj contains an identifier (oid), pointers to the parent
* pool and the rb_tree to which it belongs, counters, and an ordered
* set of pampds, structured in a radix-tree-like tree. The intermediate
* nodes of the tree are called tmem_objnodes.
*/
struct tmem_objnode;
struct tmem_obj {
struct tmem_oid oid; /* identifier; also the key in the hashbucket's rb_tree */
struct tmem_pool *pool; /* owning pool; set to NULL by tmem_obj_free() */
struct rb_node rb_tree_node; /* linkage in the hashbucket's rb_tree */
struct tmem_objnode *objnode_tree_root; /* root of the objnode tree; at height 0 it holds a pampd directly */
unsigned int objnode_tree_height; /* levels of objnodes between the root pointer and the pampds */
unsigned long objnode_count; /* objnodes currently allocated for this object */
long pampd_count; /* pampds currently stored in this object */
void *extra; /* for private use by pampd implementation */
DECL_SENTINEL
};
/* index bits consumed per tree level */
#define OBJNODE_TREE_MAP_SHIFT 6
/* slots per objnode */
#define OBJNODE_TREE_MAP_SIZE (1UL << OBJNODE_TREE_MAP_SHIFT)
/* mask that extracts one level's slot number from a shifted index */
#define OBJNODE_TREE_MAP_MASK (OBJNODE_TREE_MAP_SIZE-1)
/* width in bits of an unsigned long */
#define OBJNODE_TREE_INDEX_BITS (8 /* CHAR_BIT */ * sizeof(unsigned long))
/* sizes the height-to-max-index table and the path array used when deleting */
#define OBJNODE_TREE_MAX_PATH \
(OBJNODE_TREE_INDEX_BITS/OBJNODE_TREE_MAP_SHIFT + 2)
/* intermediate node of a tmem_obj's radix-tree-like tree */
struct tmem_objnode {
struct tmem_obj *obj; /* object this node belongs to */
DECL_SENTINEL
void *slots[OBJNODE_TREE_MAP_SIZE]; /* child objnodes, or pampds at the lowest level */
unsigned int slots_in_use; /* number of non-NULL entries in slots[] */
};
/* pampd abstract datatype methods provided by the PAM implementation */
struct tmem_pamops {
/* tmem_put(): build a pampd from (data, size, raw, ephemeral, pool, oid, index); NULL on failure */
void *(*create)(char *, size_t, bool, int,
struct tmem_pool *, struct tmem_oid *, uint32_t);
/* tmem_get(): fetch the data of a pampd that stays in tmem; negative return is an error */
int (*get_data)(char *, size_t *, bool, void *, struct tmem_pool *,
struct tmem_oid *, uint32_t);
/* tmem_get(): as get_data, for a pampd that has already been removed from its object */
int (*get_data_and_free)(char *, size_t *, bool, void *,
struct tmem_pool *, struct tmem_oid *,
uint32_t);
/* release a pampd; some callers pass a NULL oid and index 0 */
void (*free)(void *, struct tmem_pool *, struct tmem_oid *, uint32_t);
/* called after all pampds and objnodes of an object have been destroyed */
void (*free_obj)(struct tmem_pool *, struct tmem_obj *);
/* when this returns true, tmem_get() drops the hashbucket lock before fetching the data */
bool (*is_remote)(void *);
/* called from tmem_obj_init() for every newly initialized object */
void (*new_obj)(struct tmem_obj *);
/* called from tmem_replace(); its return value becomes tmem_replace()'s result */
int (*replace_in_obj)(void *, struct tmem_obj *);
};
extern void tmem_register_pamops(struct tmem_pamops *m);
/* memory allocation methods provided by the host implementation */
struct tmem_hostops {
/* allocate a tmem_obj for the given pool; NULL on failure */
struct tmem_obj *(*obj_alloc)(struct tmem_pool *);
/* release a tmem_obj that tmem_obj_free() has already torn down */
void (*obj_free)(struct tmem_obj *, struct tmem_pool *);
/* allocate a tmem_objnode for the given pool; NULL on failure */
struct tmem_objnode *(*objnode_alloc)(struct tmem_pool *);
/* release an emptied tmem_objnode */
void (*objnode_free)(struct tmem_objnode *, struct tmem_pool *);
};
extern void tmem_register_hostops(struct tmem_hostops *m);
/* core tmem accessor functions */
extern int tmem_put(struct tmem_pool *, struct tmem_oid *, uint32_t index,
char *, size_t, bool, bool);
extern int tmem_get(struct tmem_pool *, struct tmem_oid *, uint32_t index,
char *, size_t *, bool, int);
extern int tmem_replace(struct tmem_pool *, struct tmem_oid *, uint32_t index,
void *);
extern int tmem_flush_page(struct tmem_pool *, struct tmem_oid *,
uint32_t index);
extern int tmem_flush_object(struct tmem_pool *, struct tmem_oid *);
extern int tmem_destroy_pool(struct tmem_pool *);
extern void tmem_new_pool(struct tmem_pool *, uint32_t);
#endif /* _TMEM_H */

File diff suppressed because it is too large Load Diff

View File

@ -112,6 +112,9 @@ enum pageflags {
#ifdef CONFIG_SCFS_LOWER_PAGECACHE_INVALIDATION
PG_scfslower,
PG_nocache,
#endif
#ifdef CONFIG_ZCACHE
PG_was_active,
#endif
__NR_PAGEFLAGS,
#if defined(CONFIG_CMA_PAGE_COUNTING)
@ -217,6 +220,11 @@ PAGEFLAG(Reserved, reserved) __CLEARPAGEFLAG(Reserved, reserved)
PAGEFLAG(SwapBacked, swapbacked) __CLEARPAGEFLAG(SwapBacked, swapbacked)
__PAGEFLAG(SlobFree, slob_free)
#ifdef CONFIG_ZCACHE
PAGEFLAG(WasActive, was_active)
#else
PAGEFLAG_FALSE(WasActive)
#endif
/*
* Private page markings that may be used by the filesystem that owns the page

22
include/linux/zbud.h Normal file
View File

@ -0,0 +1,22 @@
#ifndef _ZBUD_H_
#define _ZBUD_H_
#include <linux/types.h>
struct zbud_pool;
struct zbud_ops {
int (*evict)(struct zbud_pool *pool, unsigned long handle);
};
struct zbud_pool *zbud_create_pool(gfp_t gfp, struct zbud_ops *ops);
void zbud_destroy_pool(struct zbud_pool *pool);
int zbud_alloc(struct zbud_pool *pool, int size, gfp_t gfp,
unsigned long *handle);
void zbud_free(struct zbud_pool *pool, unsigned long handle);
int zbud_reclaim_page(struct zbud_pool *pool, unsigned int retries);
void *zbud_map(struct zbud_pool *pool, unsigned long handle);
void zbud_unmap(struct zbud_pool *pool, unsigned long handle);
u64 zbud_get_pool_size(struct zbud_pool *pool);
#endif /* _ZBUD_H_ */

22
include/linux/zcache.h Normal file
View File

@ -0,0 +1,22 @@
/*
* Copyright (c) 2015, The Linux Foundation. All rights reserved.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 and
* only version 2 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*/
#ifndef _LINUX_ZCACHE_H
#define _LINUX_ZCACHE_H
#include <linux/types.h>

#ifdef CONFIG_ZCACHE
/*
 * Page count reported by zcache.  Callers add it to their file-page
 * totals.  NOTE(review): presumably the number of pages currently held
 * in zcache; confirm against the definition in mm/zcache.c.
 */
extern u64 zcache_pages(void);
#else
/*
 * Stub for kernels built without zcache: always reports 0 pages.
 * It must be static inline because this header is included from more
 * than one translation unit; a plain definition here would produce
 * duplicate-symbol link errors.
 */
static inline u64 zcache_pages(void) { return 0; }
#endif
#endif /* _LINUX_ZCACHE_H */

View File

@ -507,11 +507,36 @@ config MIN_DIRTY_THRESH_PAGES
disables this option.)
Do not use it if you unsure.
config MMAP_READAROUND_LIMIT
int "Limit mmap readaround upperbound"
default 0
help
Inappropriate mmap readaround size can hurt device performance
during the sluggish situation. Add the hard upper-limit for
mmap readaround.
mmap readaround.
config ZBUD
tristate "Low density storage for compressed pages"
default n
help
A special purpose allocator for storing compressed pages.
It is designed to store up to two compressed pages per physical
page. While this design limits storage density, it has simple and
deterministic reclaim properties that make it preferable to a higher
density approach when reclaim will be used.
config ZCACHE
bool "Compressed cache for file pages (EXPERIMENTAL)"
depends on CRYPTO && CLEANCACHE
select CRYPTO_LZO
select ZBUD
default n
help
A compressed cache for file pages.
It takes active file pages that are in the process of being reclaimed
and attempts to compress them into a dynamically allocated RAM-based
memory pool.
If this process is successful, when those file pages are needed again, the
I/O read operation is avoided. This results in significant performance
gains under memory pressure for systems full of file pages.

View File

@ -53,3 +53,5 @@ obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o
obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o
obj-$(CONFIG_CLEANCACHE) += cleancache.o
obj-$(CONFIG_ZSMALLOC_NEW) += zsmalloc.o
obj-$(CONFIG_ZCACHE) += zcache.o
obj-$(CONFIG_ZBUD) += zbud.o

View File

@ -1303,6 +1303,7 @@ putback_inactive_pages(struct mem_cgroup_zone *mz,
while (!list_empty(page_list)) {
struct page *page = lru_to_page(page_list);
int lru;
int file;
VM_BUG_ON(PageLRU(page));
list_del(&page->lru);
@ -1315,8 +1316,12 @@ putback_inactive_pages(struct mem_cgroup_zone *mz,
SetPageLRU(page);
lru = page_lru(page);
add_page_to_lru_list(zone, page, lru);
file = is_file_lru(lru);
if (IS_ENABLED(CONFIG_ZCACHE))
if (file)
SetPageWasActive(page);
if (is_active_lru(lru)) {
int file = is_file_lru(lru);
int numpages = hpage_nr_pages(page);
reclaim_stat->recent_rotated[file] += numpages;
}
@ -1612,6 +1617,12 @@ static void shrink_active_list(unsigned long nr_to_scan,
}
ClearPageActive(page); /* we are de-activating */
if (IS_ENABLED(CONFIG_ZCACHE))
/*
* For zcache to know whether the page is from active
* file list
*/
SetPageWasActive(page);
list_add(&page->lru, &l_inactive);
}

527
mm/zbud.c Normal file
View File

@ -0,0 +1,527 @@
/*
* zbud.c
*
* Copyright (C) 2013, Seth Jennings, IBM
*
* Concepts based on zcache internal zbud allocator by Dan Magenheimer.
*
* zbud is a special purpose allocator for storing compressed pages. Contrary
* to what its name may suggest, zbud is not a buddy allocator, but rather an
* allocator that "buddies" two compressed pages together in a single memory
* page.
*
* While this design limits storage density, it has simple and deterministic
* reclaim properties that make it preferable to a higher density approach when
* reclaim will be used.
*
* zbud works by storing compressed pages, or "zpages", together in pairs in a
* single memory page called a "zbud page". The first buddy is "left
* justified" at the beginning of the zbud page, and the last buddy is "right
* justified" at the end of the zbud page. The benefit is that if either
* buddy is freed, the freed buddy space, coalesced with whatever slack space
* that existed between the buddies, results in the largest possible free region
* within the zbud page.
*
* zbud also provides an attractive lower bound on density. The ratio of zpages
* to zbud pages can not be less than 1. This ensures that zbud can never "do
* harm" by using more pages to store zpages than the uncompressed zpages would
* have used on their own.
*
* zbud pages are divided into "chunks". The size of the chunks is fixed at
* compile time and determined by NCHUNKS_ORDER below. Dividing zbud pages
* into chunks allows organizing unbuddied zbud pages into a manageable number
* of unbuddied lists according to the number of free chunks available in the
* zbud page.
*
* The zbud API differs from that of conventional allocators in that the
* allocation function, zbud_alloc(), returns an opaque handle to the user,
* not a dereferenceable pointer. The user must map the handle using
* zbud_map() in order to get a usable pointer by which to access the
* allocation data and unmap the handle with zbud_unmap() when operations
* on the allocation data are complete.
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/atomic.h>
#include <linux/list.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/preempt.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/zbud.h>
/*****************
* Structures
*****************/
/*
* NCHUNKS_ORDER determines the internal allocation granularity, effectively
* adjusting internal fragmentation. It also determines the number of
* freelists maintained in each pool. NCHUNKS_ORDER of 6 means that the
* allocation granularity will be in chunks of size PAGE_SIZE/64, and there
* will be 64 freelists per pool.
*/
#define NCHUNKS_ORDER 6
/* log2 of the chunk size in bytes */
#define CHUNK_SHIFT (PAGE_SHIFT - NCHUNKS_ORDER)
/* Size of one chunk in bytes */
#define CHUNK_SIZE (1 << CHUNK_SHIFT)
/* Number of chunks in a page; also the length of zbud_pool.unbuddied[] */
#define NCHUNKS (PAGE_SIZE >> CHUNK_SHIFT)
/* Space reserved for the zbud header at the start of a zbud page: one chunk */
#define ZHDR_SIZE_ALIGNED CHUNK_SIZE
/**
* struct zbud_pool - stores metadata for each zbud pool
* @lock: protects all pool fields and first|last_chunk fields of any
* zbud page in the pool
* @unbuddied: array of lists tracking zbud pages that only contain one buddy;
* the list each zbud page is added to depends on the size of
* its free region (index is the number of free chunks).
* @buddied: list tracking the zbud pages that contain two buddies;
* these zbud pages are full
* @lru: list tracking the zbud pages in LRU order by most recently
* added buddy.
* @pages_nr: number of zbud pages in the pool.
* @ops: pointer to a structure of user defined operations specified at
* pool creation time.
*
* This structure is allocated at pool creation time and maintains metadata
* pertaining to a particular zbud pool.
*/
struct zbud_pool {
spinlock_t lock;
struct list_head unbuddied[NCHUNKS];
struct list_head buddied;
struct list_head lru;
u64 pages_nr;
struct zbud_ops *ops;
};
/*
* struct zbud_header - zbud page metadata occupying the first chunk of each
* zbud page.
* @buddy: links the zbud page into the unbuddied/buddied lists in the pool
* @lru: links the zbud page into the lru list in the pool
* @first_chunks: the size of the first buddy in chunks, 0 if free
* @last_chunks: the size of the last buddy in chunks, 0 if free
* @under_reclaim: set by zbud_reclaim_page() while it is evicting the buddies
* of this page; while set, zbud_free() only zeroes the chunk
* count and leaves list handling and freeing of the page to
* zbud_reclaim_page()
*/
struct zbud_header {
struct list_head buddy;
struct list_head lru;
unsigned int first_chunks;
unsigned int last_chunks;
bool under_reclaim;
};
/*****************
* Helpers
*****************/
/*
* Just to make the code easier to read: names the two allocation slots of a
* zbud page.
*/
enum buddy {
FIRST, /* slot that starts right after the zbud header */
LAST /* slot that ends at the end of the page */
};
/*
 * Converts an allocation size in bytes to the number of zbud chunks needed
 * to hold it, rounding up to a whole chunk.
 */
static int size_to_chunks(int size)
{
	int padded = size + CHUNK_SIZE - 1;

	return padded >> CHUNK_SHIFT;
}
/*
* Iterates _iter over the unbuddied list indices, starting at _begin and
* stopping before NCHUNKS. _iter must be an assignable integer variable; it
* is evaluated on every pass of the loop.
*/
#define for_each_unbuddied_list(_iter, _begin) \
for ((_iter) = (_begin); (_iter) < NCHUNKS; (_iter)++)
/* Initializes the zbud header of a newly allocated zbud page */
static struct zbud_header *init_zbud_page(struct page *page)
{
struct zbud_header *zhdr = page_address(page);
zhdr->first_chunks = 0;
zhdr->last_chunks = 0;
INIT_LIST_HEAD(&zhdr->buddy);
INIT_LIST_HEAD(&zhdr->lru);
zhdr->under_reclaim = 0;
return zhdr;
}
/*
 * Frees the page that backs the given zbud header. The header must not be
 * touched afterwards since it is stored inside that page.
 */
static void free_zbud_page(struct zbud_header *zhdr)
{
	struct page *backing = virt_to_page(zhdr);

	__free_page(backing);
}
/*
 * Encodes the handle of a particular buddy within a zbud page.
 * Pool lock should be held as this function accesses first|last_chunks.
 *
 * For now the handle is simply the address of the buddy's data, but callers
 * must treat it as opaque:
 *  - FIRST buddy: starts right behind the zbud header (first chunk);
 *  - any other value (LAST): ends at the end of the page, so it starts
 *    last_chunks chunks before the page end.
 */
static unsigned long encode_handle(struct zbud_header *zhdr, enum buddy bud)
{
	unsigned long offset;

	offset = (bud == FIRST) ?
			ZHDR_SIZE_ALIGNED :
			PAGE_SIZE - (zhdr->last_chunks << CHUNK_SHIFT);

	return (unsigned long)zhdr + offset;
}
/*
 * Returns the zbud header of the page a handle points into. The header sits
 * at the start of the page, so masking off the in-page offset is enough.
 */
static struct zbud_header *handle_to_zbud_header(unsigned long handle)
{
	unsigned long page_start = handle & PAGE_MASK;

	return (struct zbud_header *)page_start;
}
/*
 * Returns the number of free chunks in a zbud page.
 *
 * A free buddy has a length of zero, so no special casing is needed: the
 * used chunks are simply both buddy lengths plus one chunk for the zbud
 * header.
 */
static int num_free_chunks(struct zbud_header *zhdr)
{
	unsigned int used = zhdr->first_chunks + zhdr->last_chunks + 1;

	return NCHUNKS - used;
}
/*****************
* API Functions
*****************/
/**
 * zbud_create_pool() - create a new zbud pool
 * @gfp:	gfp flags when allocating the zbud pool structure
 * @ops:	user-defined operations for the zbud pool
 *
 * The new pool starts empty: no zbud pages, all lists initialized.
 *
 * Return: pointer to the new zbud pool or NULL if the metadata allocation
 * failed.
 */
struct zbud_pool *zbud_create_pool(gfp_t gfp, struct zbud_ops *ops)
{
	struct zbud_pool *new_pool = kmalloc(sizeof(*new_pool), gfp);
	int idx;

	if (new_pool == NULL)
		return NULL;

	new_pool->ops = ops;
	new_pool->pages_nr = 0;
	spin_lock_init(&new_pool->lock);
	INIT_LIST_HEAD(&new_pool->lru);
	INIT_LIST_HEAD(&new_pool->buddied);
	for_each_unbuddied_list(idx, 0)
		INIT_LIST_HEAD(&new_pool->unbuddied[idx]);

	return new_pool;
}
/**
* zbud_destroy_pool() - destroys an existing zbud pool
* @pool: the zbud pool to be destroyed
*
* The pool should be emptied before this function is called: only the pool
* structure itself is released here, zbud pages still linked into the pool
* are not freed.
*/
void zbud_destroy_pool(struct zbud_pool *pool)
{
kfree(pool);
}
/**
 * zbud_alloc() - allocates a region of a given size
 * @pool:	zbud pool from which to allocate
 * @size:	size in bytes of the desired allocation
 * @gfp:	gfp flags used if the pool needs to grow
 * @handle:	handle of the new allocation
 *
 * This function will attempt to find a free region in the pool large enough to
 * satisfy the allocation request. A search of the unbuddied lists is
 * performed first. If no suitable free region is found, then a new page is
 * allocated and added to the pool to satisfy the request. The pool lock is
 * dropped while the new page is being allocated.
 *
 * gfp should not set __GFP_HIGHMEM as highmem pages cannot be used
 * as zbud pool pages.
 *
 * Return: 0 if success and handle is set, otherwise -EINVAL if the size or
 * gfp arguments are invalid, -ENOSPC if the size does not fit into a zbud
 * page next to the zbud header, or -ENOMEM if the pool was unable to
 * allocate a new page.
 */
int zbud_alloc(struct zbud_pool *pool, int size, gfp_t gfp,
	       unsigned long *handle)
{
	struct zbud_header *zhdr = NULL;
	struct list_head *dest;
	struct page *page;
	enum buddy bud;
	int chunks, idx;

	if (size <= 0 || (gfp & __GFP_HIGHMEM))
		return -EINVAL;
	if (size > PAGE_SIZE - ZHDR_SIZE_ALIGNED)
		return -ENOSPC;
	chunks = size_to_chunks(size);

	spin_lock(&pool->lock);

	/* First, try to find an unbuddied zbud page with enough free chunks. */
	for_each_unbuddied_list(idx, chunks) {
		if (list_empty(&pool->unbuddied[idx]))
			continue;

		zhdr = list_first_entry(&pool->unbuddied[idx],
					struct zbud_header, buddy);
		list_del(&zhdr->buddy);
		/* Use whichever slot of that page is still free */
		bud = zhdr->first_chunks ? LAST : FIRST;
		goto found;
	}

	/* Couldn't find unbuddied zbud page, create new one */
	spin_unlock(&pool->lock);
	page = alloc_page(gfp);
	if (!page)
		return -ENOMEM;
	spin_lock(&pool->lock);
	pool->pages_nr++;
	zhdr = init_zbud_page(page);
	bud = FIRST;

found:
	if (bud == FIRST)
		zhdr->first_chunks = chunks;
	else
		zhdr->last_chunks = chunks;

	/*
	 * A page with both buddies in use goes on the buddied list, otherwise
	 * it goes on the unbuddied list matching its free chunk count.
	 */
	if (zhdr->first_chunks && zhdr->last_chunks)
		dest = &pool->buddied;
	else
		dest = &pool->unbuddied[num_free_chunks(zhdr)];
	list_add(&zhdr->buddy, dest);

	/* Add/move zbud page to beginning of LRU */
	if (!list_empty(&zhdr->lru))
		list_del(&zhdr->lru);
	list_add(&zhdr->lru, &pool->lru);

	*handle = encode_handle(zhdr, bud);
	spin_unlock(&pool->lock);

	return 0;
}
/**
* zbud_free() - frees the allocation associated with the given handle
* @pool: pool in which the allocation resided
* @handle: handle associated with the allocation returned by zbud_alloc()
*
* In the case that the zbud page in which the allocation resides is under
* reclaim, as indicated by the PG_reclaim flag being set, this function
* only sets the first|last_chunks to 0. The page is actually freed
* once both buddies are evicted (see zbud_reclaim_page() below).
*/
void zbud_free(struct zbud_pool *pool, unsigned long handle)
{
struct zbud_header *zhdr;
int freechunks;
spin_lock(&pool->lock);
zhdr = handle_to_zbud_header(handle);
/* If first buddy, handle will be page aligned */
if ((handle - ZHDR_SIZE_ALIGNED) & ~PAGE_MASK)
zhdr->last_chunks = 0;
else
zhdr->first_chunks = 0;
if (zhdr->under_reclaim) {
/* zbud page is under reclaim, reclaim will free */
spin_unlock(&pool->lock);
return;
}
/* Remove from existing buddy list */
list_del(&zhdr->buddy);
if (zhdr->first_chunks == 0 && zhdr->last_chunks == 0) {
/* zbud page is empty, free */
list_del(&zhdr->lru);
free_zbud_page(zhdr);
pool->pages_nr--;
} else {
/* Add to unbuddied list */
freechunks = num_free_chunks(zhdr);
list_add(&zhdr->buddy, &pool->unbuddied[freechunks]);
}
spin_unlock(&pool->lock);
}
/*
* Returns the entry that embeds the last node ((ptr)->prev) of the list
* headed by ptr. No emptiness check is done here; callers must make sure
* the list is not empty.
*/
#define list_tail_entry(ptr, type, member) \
list_entry((ptr)->prev, type, member)
/**
* zbud_reclaim_page() - evicts allocations from a pool page and frees it
* @pool: pool from which a page will attempt to be evicted
* @retries: number of pages on the LRU list for which eviction will
* be attempted before failing
*
* zbud reclaim is different from normal system reclaim in that the reclaim is
* done from the bottom, up. This is because only the bottom layer, zbud, has
* information on how the allocations are organized within each zbud page. This
* has the potential to create interesting locking situations between zbud and
* the user, however.
*
* To avoid these, this is how zbud_reclaim_page() should be called:
* The user detects a page should be reclaimed and calls zbud_reclaim_page().
* zbud_reclaim_page() will remove a zbud page from the pool LRU list and call
* the user-defined eviction handler with the pool and handle as arguments.
*
* If the handle can not be evicted, the eviction handler should return
* non-zero. zbud_reclaim_page() will add the zbud page back to the
* appropriate list and try the next zbud page on the LRU up to
* a user defined number of retries.
*
* If the handle is successfully evicted, the eviction handler should
* return 0 _and_ should have called zbud_free() on the handle. zbud_free()
* contains logic to delay freeing the page if the page is under reclaim,
* as indicated by the under_reclaim flag in the header of the zbud page.
*
* If all buddies in the zbud page are successfully evicted, then the
* zbud page can be freed.
*
* Returns: 0 if page is successfully freed, otherwise -EINVAL if there are
* no pages to evict, an eviction handler is not registered or @retries is 0,
* -EAGAIN if the retry limit was hit.
*/
int zbud_reclaim_page(struct zbud_pool *pool, unsigned int retries)
{
int i, ret, freechunks;
struct zbud_header *zhdr;
unsigned long first_handle = 0, last_handle = 0;
spin_lock(&pool->lock);
if (!pool->ops || !pool->ops->evict || list_empty(&pool->lru) ||
retries == 0) {
spin_unlock(&pool->lock);
return -EINVAL;
}
/* NOTE(review): i is int while retries is unsigned int */
for (i = 0; i < retries; i++) {
/* Pages are added at the head of the LRU, so the tail is the oldest */
zhdr = list_tail_entry(&pool->lru, struct zbud_header, lru);
list_del(&zhdr->lru);
list_del(&zhdr->buddy);
/* Protect zbud page against free */
zhdr->under_reclaim = true;
/*
* We need encode the handles before unlocking, since we can
* race with free that will set (first|last)_chunks to 0
*/
first_handle = 0;
last_handle = 0;
if (zhdr->first_chunks)
first_handle = encode_handle(zhdr, FIRST);
if (zhdr->last_chunks)
last_handle = encode_handle(zhdr, LAST);
/* The eviction callbacks run without the pool lock held */
spin_unlock(&pool->lock);
/* Issue the eviction callback(s) */
if (first_handle) {
ret = pool->ops->evict(pool, first_handle);
if (ret)
goto next;
}
if (last_handle) {
ret = pool->ops->evict(pool, last_handle);
if (ret)
goto next;
}
/* Reached on eviction failure and by falling through on success */
next:
spin_lock(&pool->lock);
zhdr->under_reclaim = false;
if (zhdr->first_chunks == 0 && zhdr->last_chunks == 0) {
/*
* Both buddies are now free, free the zbud page and
* return success.
*/
free_zbud_page(zhdr);
pool->pages_nr--;
spin_unlock(&pool->lock);
return 0;
} else if (zhdr->first_chunks == 0 ||
zhdr->last_chunks == 0) {
/* add to unbuddied list */
freechunks = num_free_chunks(zhdr);
list_add(&zhdr->buddy, &pool->unbuddied[freechunks]);
} else {
/* add to buddied list */
list_add(&zhdr->buddy, &pool->buddied);
}
/* add to beginning of LRU */
list_add(&zhdr->lru, &pool->lru);
}
spin_unlock(&pool->lock);
return -EAGAIN;
}
/**
 * zbud_map() - maps the allocation associated with the given handle
 * @pool:	pool in which the allocation resides
 * @handle:	handle associated with the allocation to be mapped
 *
 * For zbud the handle already is the address of the allocation data, so
 * mapping is a plain conversion. Other allocators implementing this
 * allocation API could encode more complex information in the handle and
 * create temporary mappings to make the data accessible to the user.
 *
 * Returns: a pointer to the mapped allocation
 */
void *zbud_map(struct zbud_pool *pool, unsigned long handle)
{
	void *addr = (void *)handle;

	return addr;
}
/**
* zbud_unmap() - unmaps the allocation associated with the given handle
* @pool: pool in which the allocation resides
* @handle: handle associated with the allocation to be unmapped
*
* Nothing to do for zbud: zbud_map() creates no mapping that would have to
* be torn down.
*/
void zbud_unmap(struct zbud_pool *pool, unsigned long handle)
{
}
/**
 * zbud_get_pool_size() - gets the zbud pool size in pages
 * @pool:	pool whose size is being queried
 *
 * Returns: size in pages of the given pool. The pool lock need not be
 * taken to access pages_nr.
 */
u64 zbud_get_pool_size(struct zbud_pool *pool)
{
	u64 nr_pages = pool->pages_nr;

	return nr_pages;
}
/*
* Module init: checks at build time that the zbud header fits into the space
* reserved for it and logs that zbud is loaded. Always returns 0.
*/
static int __init init_zbud(void)
{
/* Make sure the zbud header will fit in one chunk */
BUILD_BUG_ON(sizeof(struct zbud_header) > ZHDR_SIZE_ALIGNED);
pr_info("loaded\n");
return 0;
}
/* Module exit: only logs that zbud is unloaded; there is no state to tear down here. */
static void __exit exit_zbud(void)
{
pr_info("unloaded\n");
}
/* Register the module entry/exit hooks and the module metadata */
module_init(init_zbud);
module_exit(exit_zbud);
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Seth Jennings <sjenning@linux.vnet.ibm.com>");
MODULE_DESCRIPTION("Buddy Allocator for Compressed Pages");

1155
mm/zcache.c Normal file

File diff suppressed because it is too large Load Diff