mm: Backport ZRAM/ZSMALLOC from Google kernel

Change-Id: Ib07ead1e23e816c96552254c049016825a164f2c

UPSTREAM: zram/zcomp: use GFP_NOIO to allocate streams

(cherry picked from commit 3d5fe03a3ea013060ebba2a811aeb0f23f56aefa)

We can end up allocating a new compression stream with GFP_KERNEL from
within the IO path, which may result in nested (recursive) IO
operations.  That can introduce problems if the IO path in question is a
reclaimer, holding some locks that will deadlock nested IOs.

Allocate streams and working memory using GFP_NOIO flag, forbidding
recursive IO and FS operations.

An example:

  inconsistent {IN-RECLAIM_FS-W} -> {RECLAIM_FS-ON-W} usage.
  git/20158 [HC0[0]:SC0[0]:HE1:SE1] takes:
   (jbd2_handle){+.+.?.}, at:  start_this_handle+0x4ca/0x555
  {IN-RECLAIM_FS-W} state was registered at:
     __lock_acquire+0x8da/0x117b
     lock_acquire+0x10c/0x1a7
     start_this_handle+0x52d/0x555
     jbd2__journal_start+0xb4/0x237
     __ext4_journal_start_sb+0x108/0x17e
     ext4_dirty_inode+0x32/0x61
     __mark_inode_dirty+0x16b/0x60c
     iput+0x11e/0x274
     __dentry_kill+0x148/0x1b8
     shrink_dentry_list+0x274/0x44a
     prune_dcache_sb+0x4a/0x55
     super_cache_scan+0xfc/0x176
     shrink_slab.part.14.constprop.25+0x2a2/0x4d3
     shrink_zone+0x74/0x140
     kswapd+0x6b7/0x930
     kthread+0x107/0x10f
     ret_from_fork+0x3f/0x70
  irq event stamp: 138297
  hardirqs last  enabled at (138297):  debug_check_no_locks_freed+0x113/0x12f
  hardirqs last disabled at (138296):  debug_check_no_locks_freed+0x33/0x12f
  softirqs last  enabled at (137818):  __do_softirq+0x2d3/0x3e9
  softirqs last disabled at (137813):  irq_exit+0x41/0x95

               other info that might help us debug this:
   Possible unsafe locking scenario:
         CPU0
         ----
    lock(jbd2_handle);
    <Interrupt>
      lock(jbd2_handle);

                *** DEADLOCK ***
  5 locks held by git/20158:
   #0:  (sb_writers#7){.+.+.+}, at: [<ffffffff81155411>] mnt_want_write+0x24/0x4b
   #1:  (&type->i_mutex_dir_key#2/1){+.+.+.}, at: [<ffffffff81145087>] lock_rename+0xd9/0xe3
   #2:  (&sb->s_type->i_mutex_key#11){+.+.+.}, at: [<ffffffff8114f8e2>] lock_two_nondirectories+0x3f/0x6b
   #3:  (&sb->s_type->i_mutex_key#11/4){+.+.+.}, at: [<ffffffff8114f909>] lock_two_nondirectories+0x66/0x6b
   #4:  (jbd2_handle){+.+.?.}, at: [<ffffffff811e31db>] start_this_handle+0x4ca/0x555

               stack backtrace:
  CPU: 2 PID: 20158 Comm: git Not tainted 4.1.0-rc7-next-20150615-dbg-00016-g8bdf555-dirty #211
  Call Trace:
    dump_stack+0x4c/0x6e
    mark_lock+0x384/0x56d
    mark_held_locks+0x5f/0x76
    lockdep_trace_alloc+0xb2/0xb5
    kmem_cache_alloc_trace+0x32/0x1e2
    zcomp_strm_alloc+0x25/0x73 [zram]
    zcomp_strm_multi_find+0xe7/0x173 [zram]
    zcomp_strm_find+0xc/0xe [zram]
    zram_bvec_rw+0x2ca/0x7e0 [zram]
    zram_make_request+0x1fa/0x301 [zram]
    generic_make_request+0x9c/0xdb
    submit_bio+0xf7/0x120
    ext4_io_submit+0x2e/0x43
    ext4_bio_write_page+0x1b7/0x300
    mpage_submit_page+0x60/0x77
    mpage_map_and_submit_buffers+0x10f/0x21d
    ext4_writepages+0xc8c/0xe1b
    do_writepages+0x23/0x2c
    __filemap_fdatawrite_range+0x84/0x8b
    filemap_flush+0x1c/0x1e
    ext4_alloc_da_blocks+0xb8/0x117
    ext4_rename+0x132/0x6dc
    ? mark_held_locks+0x5f/0x76
    ext4_rename2+0x29/0x2b
    vfs_rename+0x540/0x636
    SyS_renameat2+0x359/0x44d
    SyS_rename+0x1e/0x20
    entry_SYSCALL_64_fastpath+0x12/0x6f

[minchan@kernel.org: add stable mark]
Signed-off-by: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
Acked-by: Minchan Kim <minchan@kernel.org>
Cc: Kyeongdon Kim <kyeongdon.kim@lge.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

UPSTREAM: zram: try vmalloc() after kmalloc()

(cherry picked from commit d913897abace843bba20249f3190167f7895e9c3)

When we're using LZ4 multi compression streams for zram swap, we found
page allocation failure messages while running system tests.  This did
not happen just once, but a few times (2 - 5 times per test).  Also, some
of the failures kept recurring while trying order-3 allocations.

In order to create the private data for parallel compression, we have to
call kzalloc() with order 2/3 at runtime (lzo/lz4).  But if there is no
order 2/3 sized memory available at that time, the page allocation fails.
This patch uses vmalloc() as a fallback for kmalloc(), which prevents
the page allocation failure warning.

After applying this, we never saw the warning message while running the
tests; it also reduced process startup latency by about 60-120ms in each case.

For reference a call trace :

    Binder_1: page allocation failure: order:3, mode:0x10c0d0
    CPU: 0 PID: 424 Comm: Binder_1 Tainted: GW 3.10.49-perf-g991d02b-dirty #20
    Call trace:
      dump_backtrace+0x0/0x270
      show_stack+0x10/0x1c
      dump_stack+0x1c/0x28
      warn_alloc_failed+0xfc/0x11c
      __alloc_pages_nodemask+0x724/0x7f0
      __get_free_pages+0x14/0x5c
      kmalloc_order_trace+0x38/0xd8
      zcomp_lz4_create+0x2c/0x38
      zcomp_strm_alloc+0x34/0x78
      zcomp_strm_multi_find+0x124/0x1ec
      zcomp_strm_find+0xc/0x18
      zram_bvec_rw+0x2fc/0x780
      zram_make_request+0x25c/0x2d4
      generic_make_request+0x80/0xbc
      submit_bio+0xa4/0x15c
      __swap_writepage+0x218/0x230
      swap_writepage+0x3c/0x4c
      shrink_page_list+0x51c/0x8d0
      shrink_inactive_list+0x3f8/0x60c
      shrink_lruvec+0x33c/0x4cc
      shrink_zone+0x3c/0x100
      try_to_free_pages+0x2b8/0x54c
      __alloc_pages_nodemask+0x514/0x7f0
      __get_free_pages+0x14/0x5c
      proc_info_read+0x50/0xe4
      vfs_read+0xa0/0x12c
      SyS_read+0x44/0x74
    DMA: 3397*4kB (MC) 26*8kB (RC) 0*16kB 0*32kB 0*64kB 0*128kB 0*256kB
         0*512kB 0*1024kB 0*2048kB 0*4096kB = 13796kB

[minchan@kernel.org: change vmalloc gfp and adding comment about gfp]
[sergey.senozhatsky@gmail.com: tweak comments and styles]
Signed-off-by: Kyeongdon Kim <kyeongdon.kim@lge.com>
Signed-off-by: Minchan Kim <minchan@kernel.org>
Acked-by: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
Sergey Senozhatsky <sergey.senozhatsky.work@gmail.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

UPSTREAM: zram: pass gfp from zcomp frontend to backend

(cherry picked from commit 75d8947a36d0c9aedd69118d1f14bf424005c7c2)

Each zcomp backend uses its own gfp flag, but that is pointless because the
context in which they are called is driven by the upper layer (i.e. the
zcomp frontend).  Moreover, the zcomp frontend can call them in different
contexts.  In one context (i.e. the zram init part) it is better to make
sure the allocation succeeds; in the other context (i.e. further stream
allocation for accelerating I/O speed) the allocation is just optional.  So
let's pass gfp down from the driver (i.e. the zcomp frontend), following
the normal MM convention.

[sergey.senozhatsky@gmail.com: add missing __vmalloc zero and highmem gfps]
Signed-off-by: Minchan Kim <minchan@kernel.org>
Signed-off-by: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

UPSTREAM: zram/zcomp: do not zero out zcomp private pages

(cherry picked from commit e02d238c9852a91b30da9ea32ce36d1416cdc683)

Do not __GFP_ZERO allocated zcomp ->private pages.  We keep allocated
streams around and use them for read/write requests, so we supply a
zeroed out ->private to compression algorithm as a scratch buffer only
once -- the first time we use that stream.  For the rest of IO requests
served by this stream ->private usually contains some temporary data
from the previous requests.

Signed-off-by: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
Acked-by: Minchan Kim <minchan@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

UPSTREAM: block: disable entropy contributions for nonrot devices

(cherry picked from commit b277da0a8a594308e17881f4926879bd5fca2a2d)

Clear QUEUE_FLAG_ADD_RANDOM in all block drivers that set
QUEUE_FLAG_NONROT.

Historically, all block devices have automatically made entropy
contributions.  But as previously stated in commit e2e1a148 ("block: add
sysfs knob for turning off disk entropy contributions"):
    - On SSD disks, the completion times aren't as random as they
      are for rotational drives. So it's questionable whether they
      should contribute to the random pool in the first place.
    - Calling add_disk_randomness() has a lot of overhead.

There are more reliable sources for randomness than non-rotational block
devices.  From a security perspective it is better to err on the side of
caution than to allow entropy contributions from unreliable "random"
sources.

Change-Id: I2a4f86bacee8786e2cb1a82d45156338f79d64e0
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
Signed-off-by: Kevin F. Haggerty <haggertk@lineageos.org>
This commit is contained in:
Steve Kondik 2016-04-14 04:43:41 -07:00 committed by Francescodario Cuzzocrea
parent 9fe207732b
commit b0bda35dbd
30 changed files with 1070 additions and 1699 deletions

View file

@ -1,68 +1,70 @@
zsmalloc Memory Allocator
zsmalloc
--------
Overview
This allocator is designed for use with zram. Thus, the allocator is
supposed to work well under low memory conditions. In particular, it
never attempts higher order page allocation which is very likely to
fail under memory pressure. On the other hand, if we just use single
(0-order) pages, it would suffer from very high fragmentation --
any object of size PAGE_SIZE/2 or larger would occupy an entire page.
This was one of the major issues with its predecessor (xvmalloc).
zmalloc a new slab-based memory allocator,
zsmalloc, for storing compressed pages. It is designed for
low fragmentation and high allocation success rate on
large object, but <= PAGE_SIZE allocations.
To overcome these issues, zsmalloc allocates a bunch of 0-order pages
and links them together using various 'struct page' fields. These linked
pages act as a single higher-order page i.e. an object can span 0-order
page boundaries. The code refers to these linked pages as a single entity
called zspage.
zsmalloc differs from the kernel slab allocator in two primary
ways to achieve these design goals.
For simplicity, zsmalloc can only allocate objects of size up to PAGE_SIZE
since this satisfies the requirements of all its current users (in the
worst case, page is incompressible and is thus stored "as-is" i.e. in
uncompressed form). For allocation requests larger than this size, failure
is returned (see zs_malloc).
zsmalloc never requires high order page allocations to back
slabs, or "size classes" in zsmalloc terms. Instead it allows
multiple single-order pages to be stitched together into a
"zspage" which backs the slab. This allows for higher allocation
success rate under memory pressure.
Additionally, zs_malloc() does not return a dereferenceable pointer.
Instead, it returns an opaque handle (unsigned long) which encodes actual
location of the allocated object. The reason for this indirection is that
zsmalloc does not keep zspages permanently mapped since that would cause
issues on 32-bit systems where the VA region for kernel space mappings
is very small. So, before using the allocating memory, the object has to
be mapped using zs_map_object() to get a usable pointer and subsequently
unmapped using zs_unmap_object().
Also, zsmalloc allows objects to span page boundaries within the
zspage. This allows for lower fragmentation than could be had
with the kernel slab allocator for objects between PAGE_SIZE/2
and PAGE_SIZE. With the kernel slab allocator, if a page compresses
to 60% of its original size, the memory savings gained through
compression is lost in fragmentation because another object of
the same size can't be stored in the leftover space.
stat
----
This ability to span pages results in zsmalloc allocations not being
directly addressable by the user. The user is given an
non-dereferenceable handle in response to an allocation request.
That handle must be mapped, using zs_map_object(), which returns
a pointer to the mapped region that can be used. The mapping is
necessary since the object data may reside in two different
noncontiguous pages.
With CONFIG_ZSMALLOC_STAT, we could see zsmalloc internal information via
/sys/kernel/debug/zsmalloc/<user name>. Here is a sample of stat output:
For 32-bit systems, zsmalloc has the added benefit of being
able to back slabs with HIGHMEM pages, something not possible
with the kernel slab allocators (SLAB or SLUB).
# cat /sys/kernel/debug/zsmalloc/zram0/classes
Usage:
class size almost_full almost_empty obj_allocated obj_used pages_used pages_per_zspage
..
..
9 176 0 1 186 129 8 4
10 192 1 0 2880 2872 135 3
11 208 0 1 819 795 42 2
12 224 0 1 219 159 12 4
..
..
#include <linux/zsmalloc.h>
/* create a new pool */
struct zs_pool *pool = zs_create_pool("mypool", GFP_KERNEL);
class: index
size: object size zspage stores
almost_empty: the number of ZS_ALMOST_EMPTY zspages(see below)
almost_full: the number of ZS_ALMOST_FULL zspages(see below)
obj_allocated: the number of objects allocated
obj_used: the number of objects allocated to the user
pages_used: the number of pages allocated for the class
pages_per_zspage: the number of 0-order pages to make a zspage
/* allocate a 256 byte object */
unsigned long handle = zs_malloc(pool, 256);
We assign a zspage to ZS_ALMOST_EMPTY fullness group when:
n <= N / f, where
n = number of allocated objects
N = total number of objects zspage can store
f = fullness_threshold_frac(ie, 4 at the moment)
/*
* Map the object to get a dereferenceable pointer in "read-write mode"
* (see zsmalloc.h for additional modes)
*/
void *ptr = zs_map_object(pool, handle, ZS_MM_RW);
/* do something with ptr */
/*
* Unmap the object when done dealing with it. You should try to
* minimize the time for which the object is mapped since preemption
* is disabled during the mapped period.
*/
zs_unmap_object(pool, handle);
/* free the object */
zs_free(pool, handle);
/* destroy the pool */
zs_destroy_pool(pool);
Similarly, we assign zspage to:
ZS_ALMOST_FULL when n > N / f
ZS_EMPTY when n == 0
ZS_FULL when n == N

View file

@ -555,4 +555,6 @@ config BLK_DEV_RBD
If unsure, say N.
source "drivers/block/zram/Kconfig"
endif # BLK_DEV

View file

@ -42,4 +42,6 @@ obj-$(CONFIG_BLK_DEV_DRBD) += drbd/
obj-$(CONFIG_BLK_DEV_RBD) += rbd.o
obj-$(CONFIG_BLK_DEV_PCIESSD_MTIP32XX) += mtip32xx/
obj-$(CONFIG_ZRAM) += zram/
swim_mod-y := swim.o swim_asm.o

View file

@ -3632,6 +3632,7 @@ skip_create_disk:
/* Set device limits. */
set_bit(QUEUE_FLAG_NONROT, &dd->queue->queue_flags);
clear_bit(QUEUE_FLAG_ADD_RANDOM, &dd->queue->queue_flags);
blk_queue_max_segments(dd->queue, MTIP_MAX_SG);
blk_queue_physical_block_size(dd->queue, 4096);
blk_queue_io_min(dd->queue, 4096);

View file

@ -14,7 +14,16 @@ config ZRAM
disks and maybe many more.
See zram.txt for more information.
Project home: <https://compcache.googlecode.com/>
config ZRAM_LZ4_COMPRESS
bool "Enable LZ4 algorithm support"
depends on ZRAM
select LZ4_COMPRESS
select LZ4_DECOMPRESS
default n
help
This option enables LZ4 compression algorithm support. Compression
algorithm can be changed using `comp_algorithm' device attribute.
config ZRAM_DEBUG
bool "Compressed RAM block device debug support"

View file

@ -0,0 +1,5 @@
zram-y := zcomp_lzo.o zcomp.o zram_drv.o
zram-$(CONFIG_ZRAM_LZ4_COMPRESS) += zcomp_lz4.o
obj-$(CONFIG_ZRAM) += zram.o

361
drivers/block/zram/zcomp.c Normal file
View file

@ -0,0 +1,361 @@
/*
* Copyright (C) 2014 Sergey Senozhatsky.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*/
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/err.h>
#include <linux/slab.h>
#include <linux/wait.h>
#include <linux/sched.h>
#include "zcomp.h"
#include "zcomp_lzo.h"
#ifdef CONFIG_ZRAM_LZ4_COMPRESS
#include "zcomp_lz4.h"
#endif
/*
 * single zcomp_strm backend
 *
 * Holds exactly one stream. strm_lock is taken by
 * zcomp_strm_single_find() and dropped by zcomp_strm_single_release(),
 * so only one user at a time works with zstrm.
 */
struct zcomp_strm_single {
/* locked in ->strm_find, unlocked in ->strm_release */
struct mutex strm_lock;
/* the one and only stream, allocated in zcomp_strm_single_create() */
struct zcomp_strm *zstrm;
};
/*
 * multi zcomp_strm backend
 *
 * Streams that are not in use sit on idle_strm; new ones are allocated
 * on demand until avail_strm reaches max_strm, after which callers
 * sleep on strm_wait until a stream is released.
 */
struct zcomp_strm_multi {
/* protect strm list */
spinlock_t strm_lock;
/* max possible number of zstrm streams */
int max_strm;
/* number of available zstrm streams */
int avail_strm;
/* list of available strms */
struct list_head idle_strm;
/* woken in zcomp_strm_multi_release() when a stream goes idle */
wait_queue_head_t strm_wait;
};
/*
 * NULL-terminated table of the compression backends compiled into this
 * module: lzo is always listed, lz4 only with CONFIG_ZRAM_LZ4_COMPRESS.
 */
static struct zcomp_backend *backends[] = {
&zcomp_lzo,
#ifdef CONFIG_ZRAM_LZ4_COMPRESS
&zcomp_lz4,
#endif
NULL
};
static struct zcomp_backend *find_backend(const char *compress)
{
int i = 0;
while (backends[i]) {
if (sysfs_streq(compress, backends[i]->name))
break;
i++;
}
return backends[i];
}
static void zcomp_strm_free(struct zcomp *comp, struct zcomp_strm *zstrm)
{
if (zstrm->private)
comp->backend->destroy(zstrm->private);
free_pages((unsigned long)zstrm->buffer, 1);
kfree(zstrm);
}
/*
 * Allocate a new zcomp_strm with ->private set up by the backend and
 * ->buffer pointing at two zeroed pages (one for the compressed data,
 * one extra for the case when the compressed size is larger than the
 * original). All allocations use the caller's gfp flags.
 *
 * Returns the stream, or NULL if any of the allocations failed.
 */
static struct zcomp_strm *zcomp_strm_alloc(struct zcomp *comp, gfp_t flags)
{
	struct zcomp_strm *strm;

	strm = kmalloc(sizeof(*strm), flags);
	if (strm == NULL)
		return NULL;

	strm->private = comp->backend->create(flags);
	strm->buffer = (void *)__get_free_pages(flags | __GFP_ZERO, 1);
	if (strm->private && strm->buffer)
		return strm;

	/* partial failure: undo whatever did get allocated */
	zcomp_strm_free(comp, strm);
	return NULL;
}
/*
 * Get a stream for the caller: take an idle one if available, otherwise
 * allocate a new one while the stream limit permits, otherwise sleep
 * until another user hands one back via zcomp_strm_release().
 * Loops until it has a stream to return.
 */
static struct zcomp_strm *zcomp_strm_multi_find(struct zcomp *comp)
{
struct zcomp_strm_multi *zs = comp->stream;
struct zcomp_strm *zstrm;
while (1) {
spin_lock(&zs->strm_lock);
/* fast path: reuse the first idle stream */
if (!list_empty(&zs->idle_strm)) {
zstrm = list_entry(zs->idle_strm.next,
struct zcomp_strm, list);
list_del(&zstrm->list);
spin_unlock(&zs->strm_lock);
return zstrm;
}
/* zstrm streams limit reached, wait for idle stream */
if (zs->avail_strm >= zs->max_strm) {
spin_unlock(&zs->strm_lock);
wait_event(zs->strm_wait, !list_empty(&zs->idle_strm));
continue;
}
/*
 * allocate new zstrm stream; the slot is reserved by bumping
 * avail_strm before the lock is dropped for the allocation
 */
zs->avail_strm++;
spin_unlock(&zs->strm_lock);
/*
 * This function can be called in swapout/fs write path
 * so we can't use GFP_FS|IO. And it assumes we already
 * have at least one stream in zram initialization so we
 * don't do best effort to allocate more stream in here.
 * A default stream will work well without further multiple
 * streams. That's why we use NORETRY | NOWARN.
 */
zstrm = zcomp_strm_alloc(comp, GFP_NOIO | __GFP_NORETRY |
__GFP_NOWARN);
if (!zstrm) {
/* allocation failed: give the slot back and wait for an idle stream */
spin_lock(&zs->strm_lock);
zs->avail_strm--;
spin_unlock(&zs->strm_lock);
wait_event(zs->strm_wait, !list_empty(&zs->idle_strm));
continue;
}
break;
}
return zstrm;
}
/* add stream back to idle list and wake up waiter or free the stream */
static void zcomp_strm_multi_release(struct zcomp *comp, struct zcomp_strm *zstrm)
{
struct zcomp_strm_multi *zs = comp->stream;
spin_lock(&zs->strm_lock);
if (zs->avail_strm <= zs->max_strm) {
list_add(&zstrm->list, &zs->idle_strm);
spin_unlock(&zs->strm_lock);
wake_up(&zs->strm_wait);
return;
}
zs->avail_strm--;
spin_unlock(&zs->strm_lock);
zcomp_strm_free(comp, zstrm);
}
/*
 * Change the max_strm limit. If the limit was lowered, idle streams
 * are freed right away until the stream count fits the new limit or
 * no idle stream is left. Always returns true.
 */
static bool zcomp_strm_multi_set_max_streams(struct zcomp *comp, int num_strm)
{
	struct zcomp_strm_multi *multi = comp->stream;
	struct zcomp_strm *victim;

	spin_lock(&multi->strm_lock);
	multi->max_strm = num_strm;
	for (;;) {
		if (multi->avail_strm <= num_strm)
			break;
		if (list_empty(&multi->idle_strm))
			break;
		victim = list_entry(multi->idle_strm.next,
				struct zcomp_strm, list);
		list_del(&victim->list);
		zcomp_strm_free(comp, victim);
		multi->avail_strm--;
	}
	spin_unlock(&multi->strm_lock);
	return true;
}
static void zcomp_strm_multi_destroy(struct zcomp *comp)
{
struct zcomp_strm_multi *zs = comp->stream;
struct zcomp_strm *zstrm;
while (!list_empty(&zs->idle_strm)) {
zstrm = list_entry(zs->idle_strm.next,
struct zcomp_strm, list);
list_del(&zstrm->list);
zcomp_strm_free(comp, zstrm);
}
kfree(zs);
}
static int zcomp_strm_multi_create(struct zcomp *comp, int max_strm)
{
struct zcomp_strm *zstrm;
struct zcomp_strm_multi *zs;
comp->destroy = zcomp_strm_multi_destroy;
comp->strm_find = zcomp_strm_multi_find;
comp->strm_release = zcomp_strm_multi_release;
comp->set_max_streams = zcomp_strm_multi_set_max_streams;
zs = kmalloc(sizeof(struct zcomp_strm_multi), GFP_KERNEL);
if (!zs)
return -ENOMEM;
comp->stream = zs;
spin_lock_init(&zs->strm_lock);
INIT_LIST_HEAD(&zs->idle_strm);
init_waitqueue_head(&zs->strm_wait);
zs->max_strm = max_strm;
zs->avail_strm = 1;
zstrm = zcomp_strm_alloc(comp, GFP_KERNEL);
if (!zstrm) {
kfree(zs);
return -ENOMEM;
}
list_add(&zstrm->list, &zs->idle_strm);
return 0;
}
/*
 * Take the backend mutex and return the single stream. The mutex stays
 * held until zcomp_strm_single_release() is called.
 */
static struct zcomp_strm *zcomp_strm_single_find(struct zcomp *comp)
{
	struct zcomp_strm_single *single = comp->stream;
	struct zcomp_strm *strm;

	mutex_lock(&single->strm_lock);
	strm = single->zstrm;
	return strm;
}
/*
 * Counterpart of zcomp_strm_single_find(): drop the backend mutex.
 * @zstrm is not used, there is only one stream.
 */
static void zcomp_strm_single_release(struct zcomp *comp,
		struct zcomp_strm *zstrm)
{
	struct zcomp_strm_single *single = comp->stream;

	mutex_unlock(&single->strm_lock);
}
/*
 * The single-stream backend supports only max_comp_streams == 1, so
 * every request to change the limit is rejected: always returns false.
 */
static bool zcomp_strm_single_set_max_streams(struct zcomp *comp, int num_strm)
{
	const bool limit_changed = false;

	return limit_changed;
}
/* Free the single stream and then the backend structure holding it. */
static void zcomp_strm_single_destroy(struct zcomp *comp)
{
	struct zcomp_strm_single *single = comp->stream;
	struct zcomp_strm *strm = single->zstrm;

	zcomp_strm_free(comp, strm);
	kfree(single);
}
static int zcomp_strm_single_create(struct zcomp *comp)
{
struct zcomp_strm_single *zs;
comp->destroy = zcomp_strm_single_destroy;
comp->strm_find = zcomp_strm_single_find;
comp->strm_release = zcomp_strm_single_release;
comp->set_max_streams = zcomp_strm_single_set_max_streams;
zs = kmalloc(sizeof(struct zcomp_strm_single), GFP_KERNEL);
if (!zs)
return -ENOMEM;
comp->stream = zs;
mutex_init(&zs->strm_lock);
zs->zstrm = zcomp_strm_alloc(comp, GFP_KERNEL);
if (!zs->zstrm) {
kfree(zs);
return -ENOMEM;
}
return 0;
}
/* show available compressors */
ssize_t zcomp_available_show(const char *comp, char *buf)
{
ssize_t sz = 0;
int i = 0;
while (backends[i]) {
if (sysfs_streq(comp, backends[i]->name))
sz += scnprintf(buf + sz, PAGE_SIZE - sz - 2,
"[%s] ", backends[i]->name);
else
sz += scnprintf(buf + sz, PAGE_SIZE - sz - 2,
"%s ", backends[i]->name);
i++;
}
sz += scnprintf(buf + sz, PAGE_SIZE - sz, "\n");
return sz;
}
/*
 * Forward a stream-limit change to the stream backend installed in
 * @comp and return that backend's verdict.
 */
bool zcomp_set_max_streams(struct zcomp *comp, int num_strm)
{
	bool changed = (*comp->set_max_streams)(comp, num_strm);

	return changed;
}
/* Obtain a compression stream through @comp's ->strm_find callback. */
struct zcomp_strm *zcomp_strm_find(struct zcomp *comp)
{
	struct zcomp_strm *strm = (*comp->strm_find)(comp);

	return strm;
}
/* Give @zstrm back through @comp's ->strm_release callback. */
void zcomp_strm_release(struct zcomp *comp, struct zcomp_strm *zstrm)
{
	(*comp->strm_release)(comp, zstrm);
}
/*
 * Compress @src with @comp's backend. The output goes to the stream's
 * ->buffer, its length is stored in *@dst_len and the stream's
 * ->private is passed to the backend as working memory. Returns the
 * backend's return value.
 */
int zcomp_compress(struct zcomp *comp, struct zcomp_strm *zstrm,
		const unsigned char *src, size_t *dst_len)
{
	struct zcomp_backend *be = comp->backend;

	return be->compress(src, zstrm->buffer, dst_len, zstrm->private);
}
/*
 * Decompress @src_len bytes at @src into @dst using @comp's backend.
 * No stream is involved. Returns the backend's return value.
 */
int zcomp_decompress(struct zcomp *comp, const unsigned char *src,
		size_t src_len, unsigned char *dst)
{
	struct zcomp_backend *be = comp->backend;

	return be->decompress(src, src_len, dst);
}
/*
 * Destroy a compressor: let the stream backend free its state via
 * ->destroy, then free the zcomp structure itself.
 */
void zcomp_destroy(struct zcomp *comp)
{
	(*comp->destroy)(comp);
	kfree(comp);
}
/*
* search available compressors for requested algorithm.
* allocate new zcomp and initialize it. return compressing
* backend pointer or ERR_PTR if things went bad. ERR_PTR(-EINVAL)
* if requested algorithm is not supported, ERR_PTR(-ENOMEM) in
* case of allocation error.
*/
struct zcomp *zcomp_create(const char *compress, int max_strm)
{
struct zcomp *comp;
struct zcomp_backend *backend;
backend = find_backend(compress);
if (!backend)
return ERR_PTR(-EINVAL);
comp = kzalloc(sizeof(struct zcomp), GFP_KERNEL);
if (!comp)
return ERR_PTR(-ENOMEM);
comp->backend = backend;
if (max_strm > 1)
zcomp_strm_multi_create(comp, max_strm);
else
zcomp_strm_single_create(comp);
if (!comp->stream) {
kfree(comp);
return ERR_PTR(-ENOMEM);
}
return comp;
}

View file

@ -0,0 +1,68 @@
/*
* Copyright (C) 2014 Sergey Senozhatsky.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*/
#ifndef _ZCOMP_H_
#define _ZCOMP_H_
#include <linux/mutex.h>
/*
 * A compression stream: the output buffer plus the backend's working
 * memory used for one compression at a time.
 */
struct zcomp_strm {
/* compression/decompression buffer */
void *buffer;
/*
 * The private data of the compression stream, only compression
 * stream backend can touch this (e.g. compression algorithm
 * working memory)
 */
void *private;
/* used in multi stream backend, protected by backend strm_lock */
struct list_head list;
};
/*
 * static compression backend
 *
 * One instance exists per compression algorithm; it is a table of
 * callbacks plus the algorithm name.
 */
struct zcomp_backend {
/* compress src into dst, storing the output length in *dst_len */
int (*compress)(const unsigned char *src, unsigned char *dst,
size_t *dst_len, void *private);
/* decompress src_len bytes of src into dst */
int (*decompress)(const unsigned char *src, size_t src_len,
unsigned char *dst);
/* allocate the per-stream private working memory with the given gfp flags */
void *(*create)(gfp_t flags);
/* free what ->create returned */
void (*destroy)(void *private);
/* algorithm name, matched against the user-supplied compressor name */
const char *name;
};
/*
 * dynamic per-device compression frontend
 *
 * Combines a static backend (the algorithm) with a stream backend
 * (single or multi) that is reached through the callbacks below.
 */
struct zcomp {
/* stream backend state; its concrete type is known only to the callbacks */
void *stream;
/* the compression algorithm in use */
struct zcomp_backend *backend;
/* get a stream to work with */
struct zcomp_strm *(*strm_find)(struct zcomp *comp);
/* hand back a stream obtained from ->strm_find */
void (*strm_release)(struct zcomp *comp, struct zcomp_strm *zstrm);
/* change the stream limit; returns false if not supported */
bool (*set_max_streams)(struct zcomp *comp, int num_strm);
/* free the stream backend state */
void (*destroy)(struct zcomp *comp);
};
/* list the available compressors in buf, marking the one named comp */
ssize_t zcomp_available_show(const char *comp, char *buf);
/* create a compressor for algorithm comp; returns ERR_PTR() on failure */
struct zcomp *zcomp_create(const char *comp, int max_strm);
/* free a compressor created by zcomp_create() */
void zcomp_destroy(struct zcomp *comp);
/* get a stream; must be paired with zcomp_strm_release() */
struct zcomp_strm *zcomp_strm_find(struct zcomp *comp);
void zcomp_strm_release(struct zcomp *comp, struct zcomp_strm *zstrm);
/* compress src into zstrm->buffer, output length goes to *dst_len */
int zcomp_compress(struct zcomp *comp, struct zcomp_strm *zstrm,
const unsigned char *src, size_t *dst_len);
/* decompress src_len bytes of src into dst */
int zcomp_decompress(struct zcomp *comp, const unsigned char *src,
size_t src_len, unsigned char *dst);
/* change the stream limit; returns false if the backend cannot do it */
bool zcomp_set_max_streams(struct zcomp *comp, int num_strm);
#endif /* _ZCOMP_H_ */

View file

@ -0,0 +1,56 @@
/*
* Copyright (C) 2014 Sergey Senozhatsky.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*/
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/lz4.h>
#include <linux/vmalloc.h>
#include <linux/mm.h>
#include "zcomp_lz4.h"
/*
 * Allocate LZ4 working memory with the caller's gfp flags. kmalloc()
 * is tried first; if it fails, fall back to __vmalloc() (with
 * __GFP_HIGHMEM added). Returns NULL when both attempts fail.
 */
static void *zcomp_lz4_create(gfp_t flags)
{
	void *mem = kmalloc(LZ4_MEM_COMPRESS, flags);

	if (mem)
		return mem;
	return __vmalloc(LZ4_MEM_COMPRESS, flags | __GFP_HIGHMEM,
			PAGE_KERNEL);
}
/*
 * Free working memory obtained from zcomp_lz4_create(). kvfree() is
 * used because the memory may come from either kmalloc() or
 * __vmalloc().
 */
static void zcomp_lz4_destroy(void *private)
{
	kvfree(private);
}
/*
 * Compress PAGE_SIZE bytes from @src into @dst with LZ4, using
 * @private as working memory; the output length is stored in *@dst_len.
 * Returns 0 on success.
 */
static int zcomp_lz4_compress(const unsigned char *src, unsigned char *dst,
		size_t *dst_len, void *private)
{
	int err;

	err = lz4_compress(src, PAGE_SIZE, dst, dst_len, private);
	return err;
}
/*
 * Decompress @src_len bytes of LZ4 data from @src into @dst; the output
 * is limited to PAGE_SIZE bytes. Returns 0 on success.
 */
static int zcomp_lz4_decompress(const unsigned char *src, size_t src_len,
		unsigned char *dst)
{
	size_t out_len = PAGE_SIZE;
	int err;

	err = lz4_decompress_unknownoutputsize(src, src_len, dst, &out_len);
	return err;
}
/* LZ4 backend descriptor, selectable under the name "lz4" */
struct zcomp_backend zcomp_lz4 = {
.compress = zcomp_lz4_compress,
.decompress = zcomp_lz4_decompress,
.create = zcomp_lz4_create,
.destroy = zcomp_lz4_destroy,
.name = "lz4",
};

View file

@ -0,0 +1,17 @@
/*
* Copyright (C) 2014 Sergey Senozhatsky.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*/
#ifndef _ZCOMP_LZ4_H_
#define _ZCOMP_LZ4_H_
#include "zcomp.h"
extern struct zcomp_backend zcomp_lz4;
#endif /* _ZCOMP_LZ4_H_ */

View file

@ -0,0 +1,56 @@
/*
* Copyright (C) 2014 Sergey Senozhatsky.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*/
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/lzo.h>
#include <linux/vmalloc.h>
#include <linux/mm.h>
#include "zcomp_lzo.h"
/*
 * Allocate LZO working memory with the caller's gfp flags. kmalloc()
 * is tried first; if it fails, fall back to __vmalloc() (with
 * __GFP_HIGHMEM added). Returns NULL when both attempts fail.
 */
static void *lzo_create(gfp_t flags)
{
	void *mem = kmalloc(LZO1X_MEM_COMPRESS, flags);

	if (mem)
		return mem;
	return __vmalloc(LZO1X_MEM_COMPRESS, flags | __GFP_HIGHMEM,
			PAGE_KERNEL);
}
/*
 * Free working memory obtained from lzo_create(). kvfree() is used
 * because the memory may come from either kmalloc() or __vmalloc().
 */
static void lzo_destroy(void *private)
{
	kvfree(private);
}
/*
 * Compress PAGE_SIZE bytes from @src into @dst with LZO1X-1, using
 * @private as working memory; the output length is stored in *@dst_len.
 * Returns 0 on success, otherwise the LZO error code.
 */
static int lzo_compress(const unsigned char *src, unsigned char *dst,
		size_t *dst_len, void *private)
{
	int rc;

	rc = lzo1x_1_compress(src, PAGE_SIZE, dst, dst_len, private);
	if (rc == LZO_E_OK)
		return 0;
	return rc;
}
/*
 * Decompress @src_len bytes of LZO data from @src into @dst; the output
 * is limited to PAGE_SIZE bytes. Returns 0 on success, otherwise the
 * LZO error code.
 */
static int lzo_decompress(const unsigned char *src, size_t src_len,
		unsigned char *dst)
{
	size_t out_len = PAGE_SIZE;
	int rc;

	rc = lzo1x_decompress_safe(src, src_len, dst, &out_len);
	if (rc == LZO_E_OK)
		return 0;
	return rc;
}
/* LZO backend descriptor, selectable under the name "lzo" */
struct zcomp_backend zcomp_lzo = {
.compress = lzo_compress,
.decompress = lzo_decompress,
.create = lzo_create,
.destroy = lzo_destroy,
.name = "lzo",
};

View file

@ -0,0 +1,17 @@
/*
* Copyright (C) 2014 Sergey Senozhatsky.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*/
#ifndef _ZCOMP_LZO_H_
#define _ZCOMP_LZO_H_
#include "zcomp.h"
extern struct zcomp_backend zcomp_lzo;
#endif /* _ZCOMP_LZO_H_ */

View file

@ -2,6 +2,7 @@
* Compressed RAM block device
*
* Copyright (C) 2008, 2009, 2010 Nitin Gupta
* 2012, 2013 Minchan Kim
*
* This code is released using a dual license strategy: BSD/GPL
* You can choose the licence that better fits your requirements.
@ -9,7 +10,6 @@
* Released under the terms of 3-clause BSD License
* Released under the terms of GNU General Public License Version 2.0
*
* Project home: http://compcache.googlecode.com
*/
#define KMSG_COMPONENT "zram"
@ -29,26 +29,36 @@
#include <linux/genhd.h>
#include <linux/highmem.h>
#include <linux/slab.h>
#include <linux/lzo.h>
#include <linux/string.h>
#include <linux/vmalloc.h>
#include <linux/ratelimit.h>
#include <linux/err.h>
#include "zram_drv.h"
/* Globals */
static int zram_major;
static struct zram *zram_devices;
/*
* We don't need to see memory allocation errors more than once every 1
* second to know that a problem is occurring.
*/
#define ALLOC_ERROR_LOG_RATE_MS 1000
static const char *default_compressor = "lzo";
/* Module params (documentation at end) */
static unsigned int num_devices = 1;
/*
 * Generate a read-only sysfs attribute called <name>: defines
 * zram_attr_<name>_show(), which prints the atomic64 counter
 * zram->stats.<name> as an unsigned decimal followed by a newline, and
 * the matching dev_attr_<name> (mode S_IRUGO, no store handler).
 */
#define ZRAM_ATTR_RO(name) \
static ssize_t zram_attr_##name##_show(struct device *d, \
struct device_attribute *attr, char *b) \
{ \
struct zram *zram = dev_to_zram(d); \
return scnprintf(b, PAGE_SIZE, "%llu\n", \
(u64)atomic64_read(&zram->stats.name)); \
} \
static struct device_attribute dev_attr_##name = \
__ATTR(name, S_IRUGO, zram_attr_##name##_show, NULL);
/*
 * Tell whether the device has been initialized: returns 1 when
 * zram->meta is set and 0 otherwise.
 */
static inline int init_done(struct zram *zram)
{
	if (zram->meta == NULL)
		return 0;
	return 1;
}
static inline struct zram *dev_to_zram(struct device *dev)
{
return (struct zram *)dev_to_disk(dev)->private_data;
@ -59,59 +69,20 @@ static ssize_t disksize_show(struct device *dev,
{
struct zram *zram = dev_to_zram(dev);
return sprintf(buf, "%llu\n", zram->disksize);
return scnprintf(buf, PAGE_SIZE, "%llu\n", zram->disksize);
}
static ssize_t initstate_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
u32 val;
struct zram *zram = dev_to_zram(dev);
return sprintf(buf, "%u\n", zram->init_done);
}
down_read(&zram->init_lock);
val = init_done(zram);
up_read(&zram->init_lock);
static ssize_t num_reads_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
struct zram *zram = dev_to_zram(dev);
return sprintf(buf, "%llu\n",
(u64)atomic64_read(&zram->stats.num_reads));
}
static ssize_t num_writes_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
struct zram *zram = dev_to_zram(dev);
return sprintf(buf, "%llu\n",
(u64)atomic64_read(&zram->stats.num_writes));
}
static ssize_t invalid_io_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
struct zram *zram = dev_to_zram(dev);
return sprintf(buf, "%llu\n",
(u64)atomic64_read(&zram->stats.invalid_io));
}
static ssize_t notify_free_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
struct zram *zram = dev_to_zram(dev);
return sprintf(buf, "%llu\n",
(u64)atomic64_read(&zram->stats.notify_free));
}
static ssize_t zero_pages_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
struct zram *zram = dev_to_zram(dev);
return sprintf(buf, "%u\n", zram->stats.pages_zero);
return scnprintf(buf, PAGE_SIZE, "%u\n", val);
}
static ssize_t orig_data_size_show(struct device *dev,
@ -119,17 +90,8 @@ static ssize_t orig_data_size_show(struct device *dev,
{
struct zram *zram = dev_to_zram(dev);
return sprintf(buf, "%llu\n",
(u64)(zram->stats.pages_stored) << PAGE_SHIFT);
}
static ssize_t compr_data_size_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
struct zram *zram = dev_to_zram(dev);
return sprintf(buf, "%llu\n",
(u64)atomic64_read(&zram->stats.compr_size));
return scnprintf(buf, PAGE_SIZE, "%llu\n",
(u64)(atomic64_read(&zram->stats.pages_stored)) << PAGE_SHIFT);
}
static ssize_t mem_used_total_show(struct device *dev,
@ -140,13 +102,84 @@ static ssize_t mem_used_total_show(struct device *dev,
struct zram_meta *meta = zram->meta;
down_read(&zram->init_lock);
if (zram->init_done)
if (init_done(zram))
val = zs_get_total_size_bytes(meta->mem_pool);
up_read(&zram->init_lock);
return sprintf(buf, "%llu\n", val);
return scnprintf(buf, PAGE_SIZE, "%llu\n", val);
}
/*
 * sysfs show handler for max_comp_streams.
 *
 * Samples zram->max_comp_streams under the read side of init_lock and
 * prints it as a signed decimal followed by a newline.
 * Returns the number of characters written to @buf.
 */
static ssize_t max_comp_streams_show(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct zram *zram = dev_to_zram(dev);
	int streams;

	down_read(&zram->init_lock);
	streams = zram->max_comp_streams;
	up_read(&zram->init_lock);

	return scnprintf(buf, PAGE_SIZE, "%d\n", streams);
}
/*
 * sysfs store handler for max_comp_streams.
 *
 * Parses @buf as an integer (base auto-detected by kstrtoint) and rejects
 * values below 1 with -EINVAL.  When the device is already initialised the
 * new limit is first pushed to the compression backend through
 * zcomp_set_max_streams(); if that call reports failure the stored value is
 * left untouched and -EINVAL is returned.
 *
 * Returns @len on success or a negative errno.
 */
static ssize_t max_comp_streams_store(struct device *dev,
		struct device_attribute *attr, const char *buf, size_t len)
{
	struct zram *zram = dev_to_zram(dev);
	int streams;
	int ret;

	ret = kstrtoint(buf, 0, &streams);
	if (ret < 0)
		return ret;
	if (streams < 1)
		return -EINVAL;

	down_write(&zram->init_lock);
	if (init_done(zram) && !zcomp_set_max_streams(zram->comp, streams)) {
		pr_info("Cannot change max compression streams\n");
		ret = -EINVAL;
	} else {
		zram->max_comp_streams = streams;
		ret = len;
	}
	up_write(&zram->init_lock);

	return ret;
}
/*
 * sysfs show handler for comp_algorithm.
 *
 * Delegates formatting to zcomp_available_show(), passing the currently
 * configured compressor name, while holding init_lock for reading.
 * Returns whatever zcomp_available_show() returns.
 */
static ssize_t comp_algorithm_show(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct zram *zram = dev_to_zram(dev);
	size_t written;

	down_read(&zram->init_lock);
	written = zcomp_available_show(zram->compressor, buf);
	up_read(&zram->init_lock);

	return written;
}
/*
 * sysfs store handler for comp_algorithm.
 *
 * Records the compressor name to use for the next device initialisation.
 * The algorithm cannot be changed once the device is initialised; in that
 * case -EBUSY is returned and the stored name is left untouched.
 *
 * The name is copied with strlcpy(), so it is always NUL-terminated and
 * truncated to fit zram->compressor.  sysfs writes produced by a plain
 * "echo name > comp_algorithm" carry a trailing newline; it is stripped so
 * that the stored name (and any log message that prints it) contains only
 * the algorithm name.
 *
 * Returns @len on success or a negative errno.
 */
static ssize_t comp_algorithm_store(struct device *dev,
		struct device_attribute *attr, const char *buf, size_t len)
{
	struct zram *zram = dev_to_zram(dev);
	size_t sz;

	down_write(&zram->init_lock);
	if (init_done(zram)) {
		up_write(&zram->init_lock);
		pr_info("Can't change algorithm for initialized device\n");
		return -EBUSY;
	}
	strlcpy(zram->compressor, buf, sizeof(zram->compressor));

	/* ignore trailing newline */
	sz = strlen(zram->compressor);
	if (sz > 0 && zram->compressor[sz - 1] == '\n')
		zram->compressor[sz - 1] = '\0';

	up_write(&zram->init_lock);
	return len;
}
/* flag operations need meta->tb_lock */
static int zram_test_flag(struct zram_meta *meta, u32 index,
enum zram_pageflags flag)
{
@ -178,7 +211,8 @@ static inline int valid_io_request(struct zram *zram, struct bio *bio)
u64 start, end, bound;
/* unaligned request */
if (unlikely(bio->bi_sector & (ZRAM_SECTOR_PER_LOGICAL_BLOCK - 1)))
if (unlikely(bio->bi_sector &
(ZRAM_SECTOR_PER_LOGICAL_BLOCK - 1)))
return 0;
if (unlikely(bio->bi_size & (ZRAM_LOGICAL_BLOCK_SIZE - 1)))
return 0;
@ -197,8 +231,6 @@ static inline int valid_io_request(struct zram *zram, struct bio *bio)
static void zram_meta_free(struct zram_meta *meta)
{
zs_destroy_pool(meta->mem_pool);
kfree(meta->compress_workmem);
free_pages((unsigned long)meta->compress_buffer, 1);
vfree(meta->table);
kfree(meta);
}
@ -210,39 +242,24 @@ static struct zram_meta *zram_meta_alloc(u64 disksize)
if (!meta)
goto out;
meta->compress_workmem = kzalloc(LZO1X_MEM_COMPRESS, GFP_KERNEL);
if (!meta->compress_workmem)
goto free_meta;
meta->compress_buffer =
(void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, 1);
if (!meta->compress_buffer) {
pr_err("Error allocating compressor buffer space\n");
goto free_workmem;
}
num_pages = disksize >> PAGE_SHIFT;
meta->table = vzalloc(num_pages * sizeof(*meta->table));
if (!meta->table) {
pr_err("Error allocating zram address table\n");
goto free_buffer;
goto free_meta;
}
meta->mem_pool = zs_create_pool(GFP_NOIO | __GFP_HIGHMEM |
__GFP_NOWARN);
meta->mem_pool = zs_create_pool(GFP_NOIO | __GFP_HIGHMEM);
if (!meta->mem_pool) {
pr_err("Error creating memory pool\n");
goto free_table;
}
rwlock_init(&meta->tb_lock);
return meta;
free_table:
vfree(meta->table);
free_buffer:
free_pages((unsigned long)meta->compress_buffer, 1);
free_workmem:
kfree(meta->compress_workmem);
free_meta:
kfree(meta);
meta = NULL;
@ -287,11 +304,11 @@ static void handle_zero_page(struct bio_vec *bvec)
flush_dcache_page(page);
}
/* NOTE: caller should hold meta->tb_lock with write-side */
static void zram_free_page(struct zram *zram, size_t index)
{
struct zram_meta *meta = zram->meta;
unsigned long handle = meta->table[index].handle;
u16 size = meta->table[index].size;
if (unlikely(!handle)) {
/*
@ -300,21 +317,15 @@ static void zram_free_page(struct zram *zram, size_t index)
*/
if (zram_test_flag(meta, index, ZRAM_ZERO)) {
zram_clear_flag(meta, index, ZRAM_ZERO);
zram->stats.pages_zero--;
atomic64_dec(&zram->stats.zero_pages);
}
return;
}
if (unlikely(size > max_zpage_size))
zram->stats.bad_compress--;
zs_free(meta->mem_pool, handle);
if (size <= PAGE_SIZE / 2)
zram->stats.good_compress--;
atomic64_sub(meta->table[index].size, &zram->stats.compr_size);
zram->stats.pages_stored--;
atomic64_sub(meta->table[index].size, &zram->stats.compr_data_size);
atomic64_dec(&zram->stats.pages_stored);
meta->table[index].handle = 0;
meta->table[index].size = 0;
@ -322,27 +333,32 @@ static void zram_free_page(struct zram *zram, size_t index)
static int zram_decompress_page(struct zram *zram, char *mem, u32 index)
{
int ret = LZO_E_OK;
size_t clen = PAGE_SIZE;
int ret = 0;
unsigned char *cmem;
struct zram_meta *meta = zram->meta;
unsigned long handle = meta->table[index].handle;
unsigned long handle;
u16 size;
read_lock(&meta->tb_lock);
handle = meta->table[index].handle;
size = meta->table[index].size;
if (!handle || zram_test_flag(meta, index, ZRAM_ZERO)) {
read_unlock(&meta->tb_lock);
clear_page(mem);
return 0;
}
cmem = zs_map_object(meta->mem_pool, handle, ZS_MM_RO);
if (meta->table[index].size == PAGE_SIZE)
if (size == PAGE_SIZE)
copy_page(mem, cmem);
else
ret = lzo1x_decompress_safe(cmem, meta->table[index].size,
mem, &clen);
ret = zcomp_decompress(zram->comp, cmem, size, mem);
zs_unmap_object(meta->mem_pool, handle);
read_unlock(&meta->tb_lock);
/* Should NEVER happen. Return bio error if it does. */
if (unlikely(ret != LZO_E_OK)) {
if (unlikely(ret)) {
pr_err("Decompression failed! err=%d, page=%u\n", ret, index);
atomic64_inc(&zram->stats.failed_reads);
return ret;
@ -360,11 +376,14 @@ static int zram_bvec_read(struct zram *zram, struct bio_vec *bvec,
struct zram_meta *meta = zram->meta;
page = bvec->bv_page;
read_lock(&meta->tb_lock);
if (unlikely(!meta->table[index].handle) ||
zram_test_flag(meta, index, ZRAM_ZERO)) {
read_unlock(&meta->tb_lock);
handle_zero_page(bvec);
return 0;
}
read_unlock(&meta->tb_lock);
if (is_partial_io(bvec))
/* Use a temporary buffer to decompress the page */
@ -382,7 +401,7 @@ static int zram_bvec_read(struct zram *zram, struct bio_vec *bvec,
ret = zram_decompress_page(zram, uncmem, index);
/* Should NEVER happen. Return bio error if it does. */
if (unlikely(ret != LZO_E_OK))
if (unlikely(ret))
goto out_cleanup;
if (is_partial_io(bvec))
@ -407,11 +426,10 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index,
struct page *page;
unsigned char *user_mem, *cmem, *src, *uncmem = NULL;
struct zram_meta *meta = zram->meta;
static unsigned long zram_rs_time;
struct zcomp_strm *zstrm;
bool locked = false;
page = bvec->bv_page;
src = meta->compress_buffer;
if (is_partial_io(bvec)) {
/*
* This is a partial IO. We need to read the full page
@ -427,6 +445,8 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index,
goto out;
}
zstrm = zcomp_strm_find(zram->comp);
locked = true;
user_mem = kmap_atomic(page);
if (is_partial_io(bvec)) {
@ -439,52 +459,41 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index,
}
if (page_zero_filled(uncmem)) {
kunmap_atomic(user_mem);
if (user_mem)
kunmap_atomic(user_mem);
/* Free memory associated with this sector now. */
write_lock(&zram->meta->tb_lock);
zram_free_page(zram, index);
zram->stats.pages_zero++;
zram_set_flag(meta, index, ZRAM_ZERO);
write_unlock(&zram->meta->tb_lock);
atomic64_inc(&zram->stats.zero_pages);
ret = 0;
goto out;
}
/*
* zram_slot_free_notify could miss free so that let's
* double check.
*/
if (unlikely(meta->table[index].handle ||
zram_test_flag(meta, index, ZRAM_ZERO)))
zram_free_page(zram, index);
ret = lzo1x_1_compress(uncmem, PAGE_SIZE, src, &clen,
meta->compress_workmem);
ret = zcomp_compress(zram->comp, zstrm, uncmem, &clen);
if (!is_partial_io(bvec)) {
kunmap_atomic(user_mem);
user_mem = NULL;
uncmem = NULL;
}
if (unlikely(ret != LZO_E_OK)) {
if (unlikely(ret)) {
pr_err("Compression failed! err=%d\n", ret);
goto out;
}
src = zstrm->buffer;
if (unlikely(clen > max_zpage_size)) {
zram->stats.bad_compress++;
clen = PAGE_SIZE;
src = NULL;
if (is_partial_io(bvec))
src = uncmem;
}
handle = zs_malloc(meta->mem_pool, clen);
if (!handle) {
if (printk_timed_ratelimit(&zram_rs_time,
ALLOC_ERROR_LOG_RATE_MS))
pr_info("Error allocating memory for compressed page: %u, size=%zu\n",
index, clen);
pr_info("Error allocating memory for compressed page: %u, size=%zu\n",
index, clen);
ret = -ENOMEM;
goto out;
}
@ -498,82 +507,104 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index,
memcpy(cmem, src, clen);
}
zcomp_strm_release(zram->comp, zstrm);
locked = false;
zs_unmap_object(meta->mem_pool, handle);
/*
* Free memory associated with this sector
* before overwriting unused sectors.
*/
write_lock(&zram->meta->tb_lock);
zram_free_page(zram, index);
meta->table[index].handle = handle;
meta->table[index].size = clen;
write_unlock(&zram->meta->tb_lock);
/* Update stats */
atomic64_add(clen, &zram->stats.compr_size);
zram->stats.pages_stored++;
if (clen <= PAGE_SIZE / 2)
zram->stats.good_compress++;
atomic64_add(clen, &zram->stats.compr_data_size);
atomic64_inc(&zram->stats.pages_stored);
out:
if (locked)
zcomp_strm_release(zram->comp, zstrm);
if (is_partial_io(bvec))
kfree(uncmem);
if (ret)
atomic64_inc(&zram->stats.failed_writes);
return ret;
}
static void handle_pending_slot_free(struct zram *zram)
{
struct zram_slot_free *free_rq;
spin_lock(&zram->slot_free_lock);
while (zram->slot_free_rq) {
free_rq = zram->slot_free_rq;
zram->slot_free_rq = free_rq->next;
zram_free_page(zram, free_rq->index);
kfree(free_rq);
}
spin_unlock(&zram->slot_free_lock);
}
static int zram_bvec_rw(struct zram *zram, struct bio_vec *bvec, u32 index,
int offset, struct bio *bio, int rw)
int offset, struct bio *bio)
{
int ret;
int rw = bio_data_dir(bio);
if (rw == READ) {
down_read(&zram->lock);
handle_pending_slot_free(zram);
atomic64_inc(&zram->stats.num_reads);
ret = zram_bvec_read(zram, bvec, index, offset, bio);
up_read(&zram->lock);
} else {
down_write(&zram->lock);
handle_pending_slot_free(zram);
atomic64_inc(&zram->stats.num_writes);
ret = zram_bvec_write(zram, bvec, index, offset);
up_write(&zram->lock);
}
return ret;
}
/*
 * zram_bio_discard - handler on discard request
 * @zram: device whose slots are released
 * @index: physical block index in PAGE_SIZE units
 * @offset: byte offset within physical block
 * @bio: the discard request; only bio->bi_size is read here
 *
 * Frees every physical block that is completely covered by the request.
 * Blocks that are only partially covered (at the head or the tail of the
 * range) are left untouched.
 */
static void zram_bio_discard(struct zram *zram, u32 index,
			     int offset, struct bio *bio)
{
	size_t n = bio->bi_size;

	/*
	 * zram manages data in physical block size units. Because logical block
	 * size isn't identical with physical block size on some arch, we
	 * could get a discard request pointing to a specific offset within a
	 * certain physical block. Although we can handle this request by
	 * reading that physical block and decompressing and partially zeroing
	 * and re-compressing and then re-storing it, this isn't reasonable
	 * because our intent with a discard request is to save memory. So
	 * skipping this logical block is appropriate here.
	 */
	if (offset) {
		/*
		 * The request starts @offset bytes into block @index, so only
		 * PAGE_SIZE - offset bytes of it fall inside that block.  If
		 * the whole request fits there, no block is fully covered.
		 */
		if (n <= (PAGE_SIZE - offset))
			return;

		/* Skip the partially covered head block. */
		n -= (PAGE_SIZE - offset);
		index++;
	}

	while (n >= PAGE_SIZE) {
		/*
		 * Discard request can be large so the lock hold times could be
		 * lengthy. So take the lock once per page.
		 */
		write_lock(&zram->meta->tb_lock);
		zram_free_page(zram, index);
		write_unlock(&zram->meta->tb_lock);
		index++;
		n -= PAGE_SIZE;
	}
}
static void zram_reset_device(struct zram *zram, bool reset_capacity)
{
size_t index;
struct zram_meta *meta;
flush_work(&zram->free_work);
down_write(&zram->init_lock);
if (!zram->init_done) {
if (!init_done(zram)) {
up_write(&zram->init_lock);
return;
}
meta = zram->meta;
zram->init_done = 0;
/* Free all pages that are still in this zram device */
for (index = 0; index < zram->disksize >> PAGE_SHIFT; index++) {
unsigned long handle = meta->table[index].handle;
@ -583,6 +614,9 @@ static void zram_reset_device(struct zram *zram, bool reset_capacity)
zs_free(meta->mem_pool, handle);
}
zcomp_destroy(zram->comp);
zram->max_comp_streams = 1;
zram_meta_free(zram->meta);
zram->meta = NULL;
/* Reset stats */
@ -594,37 +628,14 @@ static void zram_reset_device(struct zram *zram, bool reset_capacity)
up_write(&zram->init_lock);
}
/*
 * Attach freshly allocated metadata to the device and mark it initialised.
 *
 * Logs an informational message when the requested disksize exceeds twice
 * the amount of RAM (totalram_pages << PAGE_SHIFT), flags the queue as
 * non-rotational, then publishes @meta and sets init_done.
 */
static void zram_init_device(struct zram *zram, struct zram_meta *meta)
{
	unsigned long ram_bytes = totalram_pages << PAGE_SHIFT;

	if (zram->disksize > 2 * ram_bytes) {
		pr_info(
		"There is little point creating a zram of greater than "
		"twice the size of memory since we expect a 2:1 compression "
		"ratio. Note that zram uses about 0.1%% of the size of "
		"the disk when not in use so a huge zram is "
		"wasteful.\n"
		"\tMemory Size: %lu kB\n"
		"\tSize you selected: %llu kB\n"
		"Continuing anyway ...\n",
		ram_bytes >> 10, zram->disksize >> 10
		);
	}

	/* zram devices sort of resembles non-rotational disks */
	queue_flag_set_unlocked(QUEUE_FLAG_NONROT, zram->disk->queue);

	zram->meta = meta;
	zram->init_done = 1;

	pr_debug("Initialization done!\n");
}
static ssize_t disksize_store(struct device *dev,
struct device_attribute *attr, const char *buf, size_t len)
{
u64 disksize;
struct zcomp *comp;
struct zram_meta *meta;
struct zram *zram = dev_to_zram(dev);
int err;
disksize = memparse(buf, NULL);
if (!disksize)
@ -632,20 +643,37 @@ static ssize_t disksize_store(struct device *dev,
disksize = PAGE_ALIGN(disksize);
meta = zram_meta_alloc(disksize);
down_write(&zram->init_lock);
if (zram->init_done) {
up_write(&zram->init_lock);
zram_meta_free(meta);
pr_info("Cannot change disksize for initialized device\n");
return -EBUSY;
if (!meta)
return -ENOMEM;
comp = zcomp_create(zram->compressor, zram->max_comp_streams);
if (IS_ERR(comp)) {
pr_info("Cannot initialise %s compressing backend\n",
zram->compressor);
err = PTR_ERR(comp);
goto out_free_meta;
}
down_write(&zram->init_lock);
if (init_done(zram)) {
pr_info("Cannot change disksize for initialized device\n");
err = -EBUSY;
goto out_destroy_comp;
}
zram->meta = meta;
zram->comp = comp;
zram->disksize = disksize;
set_capacity(zram->disk, zram->disksize >> SECTOR_SHIFT);
zram_init_device(zram, meta);
up_write(&zram->init_lock);
return len;
out_destroy_comp:
up_write(&zram->init_lock);
zcomp_destroy(comp);
out_free_meta:
zram_meta_free(meta);
return err;
}
static ssize_t reset_store(struct device *dev,
@ -659,42 +687,51 @@ static ssize_t reset_store(struct device *dev,
zram = dev_to_zram(dev);
bdev = bdget_disk(zram->disk, 0);
if (!bdev)
return -ENOMEM;
/* Do not reset an active device! */
if (bdev->bd_holders)
return -EBUSY;
if (bdev->bd_holders) {
ret = -EBUSY;
goto out;
}
ret = kstrtou16(buf, 10, &do_reset);
if (ret)
return ret;
goto out;
if (!do_reset)
return -EINVAL;
if (!do_reset) {
ret = -EINVAL;
goto out;
}
/* Make sure all pending I/O is finished */
if (bdev)
fsync_bdev(bdev);
fsync_bdev(bdev);
bdput(bdev);
zram_reset_device(zram, true);
return len;
out:
bdput(bdev);
return ret;
}
static void __zram_make_request(struct zram *zram, struct bio *bio, int rw)
static void __zram_make_request(struct zram *zram, struct bio *bio)
{
int i, offset;
int offset, i;
u32 index;
struct bio_vec *bvec;
switch (rw) {
case READ:
atomic64_inc(&zram->stats.num_reads);
break;
case WRITE:
atomic64_inc(&zram->stats.num_writes);
break;
}
index = bio->bi_sector >> SECTORS_PER_PAGE_SHIFT;
offset = (bio->bi_sector & (SECTORS_PER_PAGE - 1)) << SECTOR_SHIFT;
offset = (bio->bi_sector &
(SECTORS_PER_PAGE - 1)) << SECTOR_SHIFT;
if (unlikely(bio->bi_rw & REQ_DISCARD)) {
zram_bio_discard(zram, index, offset, bio);
bio_endio(bio, 0);
return;
}
bio_for_each_segment(bvec, bio, i) {
int max_transfer_size = PAGE_SIZE - offset;
@ -710,16 +747,15 @@ static void __zram_make_request(struct zram *zram, struct bio *bio, int rw)
bv.bv_len = max_transfer_size;
bv.bv_offset = bvec->bv_offset;
if (zram_bvec_rw(zram, &bv, index, offset, bio, rw) < 0)
if (zram_bvec_rw(zram, &bv, index, offset, bio) < 0)
goto out;
bv.bv_len = bvec->bv_len - max_transfer_size;
bv.bv_offset += max_transfer_size;
if (zram_bvec_rw(zram, &bv, index+1, 0, bio, rw) < 0)
if (zram_bvec_rw(zram, &bv, index + 1, 0, bio) < 0)
goto out;
} else
if (zram_bvec_rw(zram, bvec, index, offset, bio, rw)
< 0)
if (zram_bvec_rw(zram, bvec, index, offset, bio) < 0)
goto out;
update_position(&index, &offset, bvec);
@ -741,7 +777,7 @@ static void zram_make_request(struct request_queue *queue, struct bio *bio)
struct zram *zram = queue->queuedata;
down_read(&zram->init_lock);
if (unlikely(!zram->init_done))
if (unlikely(!init_done(zram)))
goto error;
if (!valid_io_request(zram, bio)) {
@ -749,7 +785,7 @@ static void zram_make_request(struct request_queue *queue, struct bio *bio)
goto error;
}
__zram_make_request(zram, bio, bio_data_dir(bio));
__zram_make_request(zram, bio);
up_read(&zram->init_lock);
return;
@ -759,40 +795,19 @@ error:
bio_io_error(bio);
}
/*
 * Work handler for zram->free_work: processes the pending slot-free
 * requests while holding zram->lock for writing.
 */
static void zram_slot_free(struct work_struct *work)
{
	struct zram *zram = container_of(work, struct zram, free_work);

	down_write(&zram->lock);
	handle_pending_slot_free(zram);
	up_write(&zram->lock);
}
/*
 * Push @free_rq onto the head of the device's pending slot-free list,
 * under slot_free_lock.
 */
static void add_slot_free(struct zram *zram, struct zram_slot_free *free_rq)
{
	spin_lock(&zram->slot_free_lock);

	/* new request becomes the list head */
	free_rq->next = zram->slot_free_rq;
	zram->slot_free_rq = free_rq;

	spin_unlock(&zram->slot_free_lock);
}
static void zram_slot_free_notify(struct block_device *bdev,
unsigned long index)
{
struct zram *zram;
struct zram_slot_free *free_rq;
struct zram_meta *meta;
zram = bdev->bd_disk->private_data;
meta = zram->meta;
write_lock(&meta->tb_lock);
zram_free_page(zram, index);
write_unlock(&meta->tb_lock);
atomic64_inc(&zram->stats.notify_free);
free_rq = kmalloc(sizeof(struct zram_slot_free), GFP_ATOMIC);
if (!free_rq)
return;
free_rq->index = index;
add_slot_free(zram, free_rq);
schedule_work(&zram->free_work);
}
static const struct block_device_operations zram_devops = {
@ -804,14 +819,21 @@ static DEVICE_ATTR(disksize, S_IRUGO | S_IWUSR,
disksize_show, disksize_store);
static DEVICE_ATTR(initstate, S_IRUGO, initstate_show, NULL);
static DEVICE_ATTR(reset, S_IWUSR, NULL, reset_store);
static DEVICE_ATTR(num_reads, S_IRUGO, num_reads_show, NULL);
static DEVICE_ATTR(num_writes, S_IRUGO, num_writes_show, NULL);
static DEVICE_ATTR(invalid_io, S_IRUGO, invalid_io_show, NULL);
static DEVICE_ATTR(notify_free, S_IRUGO, notify_free_show, NULL);
static DEVICE_ATTR(zero_pages, S_IRUGO, zero_pages_show, NULL);
static DEVICE_ATTR(orig_data_size, S_IRUGO, orig_data_size_show, NULL);
static DEVICE_ATTR(compr_data_size, S_IRUGO, compr_data_size_show, NULL);
static DEVICE_ATTR(mem_used_total, S_IRUGO, mem_used_total_show, NULL);
static DEVICE_ATTR(max_comp_streams, S_IRUGO | S_IWUSR,
max_comp_streams_show, max_comp_streams_store);
static DEVICE_ATTR(comp_algorithm, S_IRUGO | S_IWUSR,
comp_algorithm_show, comp_algorithm_store);
ZRAM_ATTR_RO(num_reads);
ZRAM_ATTR_RO(num_writes);
ZRAM_ATTR_RO(failed_reads);
ZRAM_ATTR_RO(failed_writes);
ZRAM_ATTR_RO(invalid_io);
ZRAM_ATTR_RO(notify_free);
ZRAM_ATTR_RO(zero_pages);
ZRAM_ATTR_RO(compr_data_size);
static struct attribute *zram_disk_attrs[] = {
&dev_attr_disksize.attr,
@ -819,12 +841,16 @@ static struct attribute *zram_disk_attrs[] = {
&dev_attr_reset.attr,
&dev_attr_num_reads.attr,
&dev_attr_num_writes.attr,
&dev_attr_failed_reads.attr,
&dev_attr_failed_writes.attr,
&dev_attr_invalid_io.attr,
&dev_attr_notify_free.attr,
&dev_attr_zero_pages.attr,
&dev_attr_orig_data_size.attr,
&dev_attr_compr_data_size.attr,
&dev_attr_mem_used_total.attr,
&dev_attr_max_comp_streams.attr,
&dev_attr_comp_algorithm.attr,
NULL,
};
@ -836,13 +862,8 @@ static int create_device(struct zram *zram, int device_id)
{
int ret = -ENOMEM;
init_rwsem(&zram->lock);
init_rwsem(&zram->init_lock);
INIT_WORK(&zram->free_work, zram_slot_free);
spin_lock_init(&zram->slot_free_lock);
zram->slot_free_rq = NULL;
zram->queue = blk_alloc_queue(GFP_KERNEL);
if (!zram->queue) {
pr_err("Error allocating disk queue for device %d\n",
@ -870,7 +891,9 @@ static int create_device(struct zram *zram, int device_id)
/* Actual capacity set using sysfs (/sys/block/zram<id>/disksize) */
set_capacity(zram->disk, 0);
/* zram devices sort of resembles non-rotational disks */
queue_flag_set_unlocked(QUEUE_FLAG_NONROT, zram->disk->queue);
queue_flag_clear_unlocked(QUEUE_FLAG_ADD_RANDOM, zram->disk->queue);
/*
* To ensure that we always get PAGE_SIZE aligned
* and n*PAGE_SIZED sized I/O requests.
@ -880,6 +903,21 @@ static int create_device(struct zram *zram, int device_id)
ZRAM_LOGICAL_BLOCK_SIZE);
blk_queue_io_min(zram->disk->queue, PAGE_SIZE);
blk_queue_io_opt(zram->disk->queue, PAGE_SIZE);
zram->disk->queue->limits.discard_granularity = PAGE_SIZE;
zram->disk->queue->limits.max_discard_sectors = UINT_MAX;
/*
* zram_bio_discard() will clear all logical blocks if logical block
* size is identical with physical block size(PAGE_SIZE). But if it is
* different, we will skip discarding some parts of logical blocks in
* the part of the request range which isn't aligned to physical block
* size. So we can't ensure that all discarded logical blocks are
* zeroed.
*/
if (ZRAM_LOGICAL_BLOCK_SIZE == PAGE_SIZE)
zram->disk->queue->limits.discard_zeroes_data = 1;
else
zram->disk->queue->limits.discard_zeroes_data = 0;
queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, zram->disk->queue);
add_disk(zram->disk);
@ -889,8 +927,9 @@ static int create_device(struct zram *zram, int device_id)
pr_warn("Error creating sysfs group");
goto out_free_disk;
}
zram->init_done = 0;
strlcpy(zram->compressor, default_compressor, sizeof(zram->compressor));
zram->meta = NULL;
zram->max_comp_streams = 1;
return 0;
out_free_disk:
@ -907,13 +946,10 @@ static void destroy_device(struct zram *zram)
sysfs_remove_group(&disk_to_dev(zram->disk)->kobj,
&zram_disk_attr_group);
if (zram->disk) {
del_gendisk(zram->disk);
put_disk(zram->disk);
}
del_gendisk(zram->disk);
put_disk(zram->disk);
if (zram->queue)
blk_cleanup_queue(zram->queue);
blk_cleanup_queue(zram->queue);
}
static int __init zram_init(void)
@ -992,4 +1028,3 @@ MODULE_PARM_DESC(num_devices, "Number of zram devices");
MODULE_LICENSE("Dual BSD/GPL");
MODULE_AUTHOR("Nitin Gupta <ngupta@vflare.org>");
MODULE_DESCRIPTION("Compressed RAM Block Device");
MODULE_ALIAS("devname:zram");

View file

@ -2,6 +2,7 @@
* Compressed RAM block device
*
* Copyright (C) 2008, 2009, 2010 Nitin Gupta
* 2012, 2013 Minchan Kim
*
* This code is released using a dual license strategy: BSD/GPL
* You can choose the licence that better fits your requirements.
@ -9,16 +10,15 @@
* Released under the terms of 3-clause BSD License
* Released under the terms of GNU General Public License Version 2.0
*
* Project home: http://compcache.googlecode.com
*/
#ifndef _ZRAM_DRV_H_
#define _ZRAM_DRV_H_
#include <linux/spinlock.h>
#include <linux/mutex.h>
#include <linux/zsmalloc.h>
#include "../zsmalloc/zsmalloc.h"
#include "zcomp.h"
/*
* Some arbitrary value. This is just to catch
@ -32,7 +32,7 @@ static const unsigned max_num_devices = 32;
* Pages that compress to size greater than this are stored
* uncompressed in memory.
*/
static const size_t max_zpage_size = PAGE_SIZE / 10 * 9;
static const size_t max_zpage_size = PAGE_SIZE / 4 * 3;
/*
* NOTE: max_zpage_size must be less than or equal to:
@ -65,52 +65,33 @@ enum zram_pageflags {
struct table {
unsigned long handle;
u16 size; /* object size (excluding header) */
u8 count; /* object ref count (not yet used) */
u8 flags;
} __aligned(4);
/*
* All 64bit fields should only be manipulated by 64bit atomic accessors.
* All modifications to 32bit counter should be protected by zram->lock.
*/
struct zram_stats {
atomic64_t compr_size; /* compressed size of pages stored */
atomic64_t compr_data_size; /* compressed size of pages stored */
atomic64_t num_reads; /* failed + successful */
atomic64_t num_writes; /* --do-- */
atomic64_t failed_reads; /* should NEVER! happen */
atomic64_t failed_writes; /* can happen when memory is too low */
atomic64_t invalid_io; /* non-page-aligned I/O requests */
atomic64_t notify_free; /* no. of swap slot free notifications */
u32 pages_zero; /* no. of zero filled pages */
u32 pages_stored; /* no. of pages currently stored */
u32 good_compress; /* % of pages with compression ratio<=50% */
u32 bad_compress; /* % of pages with compression ratio>=75% */
atomic64_t zero_pages; /* no. of zero filled pages */
atomic64_t pages_stored; /* no. of pages currently stored */
};
struct zram_meta {
void *compress_workmem;
void *compress_buffer;
rwlock_t tb_lock; /* protect table */
struct table *table;
struct zs_pool *mem_pool;
};
struct zram_slot_free {
unsigned long index;
struct zram_slot_free *next;
};
struct zram {
struct zram_meta *meta;
struct rw_semaphore lock; /* protect compression buffers, table,
* 32bit stat counters against concurrent
* notifications, reads and writes */
struct work_struct free_work; /* handle pending free request */
struct zram_slot_free *slot_free_rq; /* list head of free request */
struct request_queue *queue;
struct gendisk *disk;
int init_done;
struct zcomp *comp;
/* Prevent concurrent execution of device init, reset and R/W request */
struct rw_semaphore init_lock;
/*
@ -118,8 +99,8 @@ struct zram {
* we can store in a disk.
*/
u64 disksize; /* bytes */
spinlock_t slot_free_lock;
int max_comp_streams;
struct zram_stats stats;
char compressor[10];
};
#endif

View file

@ -686,8 +686,10 @@ static void ide_disk_setup(ide_drive_t *drive)
printk(KERN_INFO "%s: max request size: %dKiB\n", drive->name,
queue_max_sectors(q) / 2);
if (ata_id_is_ssd(id))
if (ata_id_is_ssd(id)) {
queue_flag_set_unlocked(QUEUE_FLAG_NONROT, q);
queue_flag_clear_unlocked(QUEUE_FLAG_ADD_RANDOM, q);
}
/* calculate drive capacity, and select LBA if possible */
ide_disk_get_capacity(drive);

View file

@ -294,6 +294,7 @@ int mmc_init_queue(struct mmc_queue *mq, struct mmc_card *card,
blk_queue_prep_rq(mq->queue, mmc_prep_request);
queue_flag_set_unlocked(QUEUE_FLAG_NONROT, mq->queue);
queue_flag_clear_unlocked(QUEUE_FLAG_ADD_RANDOM, mq->queue);
if (mmc_can_erase(card))
mmc_queue_setup_discard(mq->queue, card);

View file

@ -441,6 +441,7 @@ int add_mtd_blktrans_dev(struct mtd_blktrans_dev *new)
blk_queue_logical_block_size(new->rq, tr->blksize);
queue_flag_set_unlocked(QUEUE_FLAG_NONROT, new->rq);
queue_flag_clear_unlocked(QUEUE_FLAG_ADD_RANDOM, new->rq);
if (tr->discard) {
queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, new->rq);

View file

@ -2390,8 +2390,10 @@ static void sd_read_block_characteristics(struct scsi_disk *sdkp)
rot = get_unaligned_be16(&buffer[4]);
if (rot == 1)
if (rot == 1) {
queue_flag_set_unlocked(QUEUE_FLAG_NONROT, sdkp->disk->queue);
queue_flag_clear_unlocked(QUEUE_FLAG_ADD_RANDOM, sdkp->disk->queue);
}
out:
kfree(buffer);

View file

@ -80,10 +80,6 @@ source "drivers/staging/sep/Kconfig"
source "drivers/staging/iio/Kconfig"
source "drivers/staging/zram/Kconfig"
source "drivers/staging/zsmalloc/Kconfig"
source "drivers/staging/wlags49_h2/Kconfig"
source "drivers/staging/wlags49_h25/Kconfig"

View file

@ -32,8 +32,6 @@ obj-$(CONFIG_VT6656) += vt6656/
obj-$(CONFIG_VME_BUS) += vme/
obj-$(CONFIG_DX_SEP) += sep/
obj-$(CONFIG_IIO) += iio/
obj-$(CONFIG_ZRAM) += zram/
obj-$(CONFIG_ZSMALLOC) += zsmalloc/
obj-$(CONFIG_WLAGS49_H2) += wlags49_h2/
obj-$(CONFIG_WLAGS49_H25) += wlags49_h25/
obj-$(CONFIG_FB_SM7XX) += sm7xx/

View file

@ -1,3 +0,0 @@
zram-y := zram_drv.o
obj-$(CONFIG_ZRAM) += zram.o

View file

@ -1,77 +0,0 @@
zram: Compressed RAM based block devices
----------------------------------------
Project home: http://compcache.googlecode.com/
* Introduction
The zram module creates RAM based block devices named /dev/zram<id>
(<id> = 0, 1, ...). Pages written to these disks are compressed and stored
in memory itself. These disks allow very fast I/O and compression provides
good amounts of memory savings. Some of the usecases include /tmp storage,
use as swap disks, various caches under /var and maybe many more :)
Statistics for individual zram devices are exported through sysfs nodes at
/sys/block/zram<id>/
* Usage
Following shows a typical sequence of steps for using zram.
1) Load Module:
modprobe zram num_devices=4
This creates 4 devices: /dev/zram{0,1,2,3}
(num_devices parameter is optional. Default: 1)
2) Set Disksize
Set disk size by writing the value to sysfs node 'disksize'.
The value can be either in bytes or you can use mem suffixes.
Examples:
# Initialize /dev/zram0 with 50MB disksize
echo $((50*1024*1024)) > /sys/block/zram0/disksize
# Using mem suffixes
echo 256K > /sys/block/zram0/disksize
echo 512M > /sys/block/zram0/disksize
echo 1G > /sys/block/zram0/disksize
3) Activate:
mkswap /dev/zram0
swapon /dev/zram0
mkfs.ext4 /dev/zram1
mount /dev/zram1 /tmp
4) Stats:
Per-device statistics are exported as various nodes under
/sys/block/zram<id>/
disksize
num_reads
num_writes
invalid_io
notify_free
discard
zero_pages
orig_data_size
compr_data_size
mem_used_total
5) Deactivate:
swapoff /dev/zram0
umount /dev/zram1
6) Reset:
Write any positive value to 'reset' sysfs node
echo 1 > /sys/block/zram0/reset
echo 1 > /sys/block/zram1/reset
This frees all the memory allocated for the given device and
resets the disksize to zero. You must set the disksize again
before reusing the device.
Please report any problems at:
- Mailing list: linux-mm-cc at laptop dot org
- Issue tracker: http://code.google.com/p/compcache/issues/list
Nitin Gupta
ngupta@vflare.org

View file

@ -1,10 +0,0 @@
config ZSMALLOC
bool "Memory allocator for compressed pages"
default n
help
zsmalloc is a slab-based memory allocator designed to store
compressed RAM pages. zsmalloc uses virtual memory mapping
in order to reduce fragmentation. However, this results in a
non-standard allocator interface where a handle, not a pointer, is
returned by an alloc(). This handle must be mapped in order to
access the allocated space.

View file

@ -1,3 +0,0 @@
zsmalloc-y := zsmalloc-main.o
obj-$(CONFIG_ZSMALLOC) += zsmalloc.o

File diff suppressed because it is too large Load diff

View file

@ -1,43 +0,0 @@
/*
* zsmalloc memory allocator
*
* Copyright (C) 2011 Nitin Gupta
*
* This code is released using a dual license strategy: BSD/GPL
* You can choose the license that better fits your requirements.
*
* Released under the terms of 3-clause BSD License
* Released under the terms of GNU General Public License Version 2.0
*/
#ifndef _ZS_MALLOC_H_
#define _ZS_MALLOC_H_
#include <linux/types.h>
/*
* zsmalloc mapping modes
*
* NOTE: These only make a difference when a mapped object spans pages
*/
/* Access mode a caller passes as the 'mm' argument of zs_map_object(). */
enum zs_mapmode {
ZS_MM_RW, /* normal read-write mapping */
ZS_MM_RO, /* read-only (no copy-out at unmap time) */
ZS_MM_WO /* write-only (no copy-in at map time) */
};
struct zs_pool;
struct zs_pool *zs_create_pool(gfp_t flags);
void zs_destroy_pool(struct zs_pool *pool);
unsigned long zs_malloc(struct zs_pool *pool, size_t size);
void zs_free(struct zs_pool *pool, unsigned long obj);
void *zs_map_object(struct zs_pool *pool, unsigned long handle,
enum zs_mapmode mm);
void zs_unmap_object(struct zs_pool *pool, unsigned long handle);
u64 zs_get_total_size_bytes(struct zs_pool *pool);
#endif

View file

@ -2,6 +2,7 @@
* zsmalloc memory allocator
*
* Copyright (C) 2011 Nitin Gupta
* Copyright (C) 2012, 2013 Minchan Kim
*
* This code is released using a dual license strategy: BSD/GPL
* You can choose the license that better fits your requirements.
@ -14,37 +15,24 @@
#define _ZS_MALLOC_H_
#include <linux/types.h>
#include <linux/mm_types.h>
/*
* zsmalloc mapping modes
*
* NOTE: These only make a difference when a mapped object spans pages.
* They also have no effect when PGTABLE_MAPPING is selected.
*/
* NOTE: These only make a difference when a mapped object spans pages
*/
enum zs_mapmode {
ZS_MM_RW, /* normal read-write mapping */
ZS_MM_RO, /* read-only (no copy-out at unmap time) */
ZS_MM_WO /* write-only (no copy-in at map time) */
/*
* NOTE: ZS_MM_WO should only be used for initializing new
* (uninitialized) allocations. Partial writes to already
* initialized allocations should use ZS_MM_RW to preserve the
* existing data.
*/
};
/*
 * Page allocation/free callbacks that can be handed to zs_create_pool().
 * The pool calls ->alloc() for each page it needs when building a zspage
 * and ->free() for each page when a zspage is released.
 */
struct zs_ops {
struct page * (*alloc)(gfp_t); /* allocate one page with the given gfp flags */
void (*free)(struct page *); /* release a page obtained from ->alloc() */
};
struct zs_pool;
struct zs_pool *zs_create_pool(gfp_t flags, struct zs_ops *ops);
struct zs_pool *zs_create_pool(gfp_t flags);
void zs_destroy_pool(struct zs_pool *pool);
unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t flags);
unsigned long zs_malloc(struct zs_pool *pool, size_t size);
void zs_free(struct zs_pool *pool, unsigned long obj);
void *zs_map_object(struct zs_pool *pool, unsigned long handle,

View file

@ -540,3 +540,15 @@ config ZCACHE
If this process is successful, when those file pages are needed again, the
I/O read operation is avoided. This results in significant performance
gains under memory pressure for systems full of file pages.
config ZSMALLOC
tristate "Memory allocator for compressed pages"
depends on MMU
default n
help
zsmalloc is a slab-based memory allocator designed to store
compressed RAM pages. zsmalloc uses virtual memory mapping
in order to reduce fragmentation. However, this results in a
non-standard allocator interface where a handle, not a pointer, is
returned by an alloc(). This handle must be mapped in order to
access the allocated space.

View file

@ -55,3 +55,4 @@ obj-$(CONFIG_CLEANCACHE) += cleancache.o
obj-$(CONFIG_ZSMALLOC_NEW) += zsmalloc.o
obj-$(CONFIG_ZCACHE) += zcache.o
obj-$(CONFIG_ZBUD) += zbud.o
obj-$(CONFIG_ZSMALLOC) += zsmalloc.o

View file

@ -2,6 +2,7 @@
* zsmalloc memory allocator
*
* Copyright (C) 2011 Nitin Gupta
* Copyright (C) 2012, 2013 Minchan Kim
*
* This code is released using a dual license strategy: BSD/GPL
* You can choose the license that better fits your requirements.
@ -27,21 +28,6 @@
* page boundaries. The code refers to these linked pages as a single entity
* called zspage.
*
* For simplicity, zsmalloc can only allocate objects of size up to PAGE_SIZE
* since this satisfies the requirements of all its current users (in the
* worst case, page is incompressible and is thus stored "as-is" i.e. in
* uncompressed form). For allocation requests larger than this size, failure
* is returned (see zs_malloc).
*
* Additionally, zs_malloc() does not return a dereferenceable pointer.
* Instead, it returns an opaque handle (unsigned long) which encodes actual
* location of the allocated object. The reason for this indirection is that
* zsmalloc does not keep zspages permanently mapped since that would cause
* issues on 32-bit systems where the VA region for kernel space mappings
* is very small. So, before using the allocated memory, the object has to
* be mapped using zs_map_object() to get a usable pointer and subsequently
* unmapped using zs_unmap_object().
*
* Following is how we use various fields and flags of underlying
* struct page(s) to form a zspage.
*
@ -61,6 +47,8 @@
* page->freelist: points to the first free object in zspage.
* Free objects are linked together using in-place
* metadata.
* page->objects: maximum number of objects we can store in this
* zspage (class->zspage_order * PAGE_SIZE / class->size)
* page->lru: links together first pages of various zspages.
* Basically forming list of zspages in a fullness group.
* page->mapping: class index and fullness group of the zspage
@ -71,6 +59,10 @@
*
*/
#ifdef CONFIG_ZSMALLOC_DEBUG
#define DEBUG
#endif
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/bitops.h>
@ -87,7 +79,6 @@
#include <linux/hardirq.h>
#include <linux/spinlock.h>
#include <linux/types.h>
#include <linux/zsmalloc.h>
/*
@ -107,7 +98,7 @@
/*
* Object location (<PFN>, <obj_idx>) is encoded as
* a single (unsigned long) handle value.
* a single (void *) handle value.
*
* Note that object index <obj_idx> is relative to system
* page <PFN> it is stored in, so for each sub-page belonging
@ -215,7 +206,7 @@ struct link_free {
struct zs_pool {
struct size_class size_class[ZS_SIZE_CLASSES];
struct zs_ops *ops;
gfp_t flags; /* allocation flags used when growing pool */
};
/*
@ -227,8 +218,19 @@ struct zs_pool {
#define CLASS_IDX_MASK ((1 << CLASS_IDX_BITS) - 1)
#define FULLNESS_MASK ((1 << FULLNESS_BITS) - 1)
/*
* By default, zsmalloc uses a copy-based object mapping method to access
* allocations that span two pages. However, if a particular architecture
* performs VM mapping faster than copying, then it should be added here
* so that USE_PGTABLE_MAPPING is defined. This causes zsmalloc to use
* page table mapping rather than copying for object mapping.
*/
#if defined(CONFIG_ARM) && !defined(MODULE)
#define USE_PGTABLE_MAPPING
#endif
struct mapping_area {
#ifdef CONFIG_PGTABLE_MAPPING
#ifdef USE_PGTABLE_MAPPING
struct vm_struct *vm; /* vm area for mapping object that span pages */
#else
char *vm_buf; /* copy buffer for objects that span pages */
@ -237,21 +239,6 @@ struct mapping_area {
enum zs_mapmode vm_mm; /* mapping mode */
};
/* default page alloc/free ops */
/*
 * Default page allocator for a pool: allocates a single page with
 * alloc_page() using the caller-supplied gfp flags and returns whatever
 * alloc_page() returns.
 */
struct page *zs_alloc_page(gfp_t flags)
{
return alloc_page(flags);
}
/*
 * Default page release routine for a pool: hands the single page back
 * with __free_page().
 */
void zs_free_page(struct page *page)
{
__free_page(page);
}
/*
 * Default callback table, wiring ->alloc/->free to zs_alloc_page() and
 * zs_free_page(). zs_create_pool() installs it when the caller passes
 * no ops of its own.
 */
struct zs_ops zs_default_ops = {
.alloc = zs_alloc_page,
.free = zs_free_page
};
/* per-cpu VM mapping areas for zspage accesses that cross page boundaries */
static DEFINE_PER_CPU(struct mapping_area, zs_map_area);
@ -288,13 +275,6 @@ static void set_zspage_mapping(struct page *page, unsigned int class_idx,
page->mapping = (struct address_space *)m;
}
/*
* zsmalloc divides the pool into various size classes where each
* class maintains a list of zspages where each zspage is divided
* into equal sized chunks. Each allocation falls into one of these
* classes depending on its size. This function returns index of the
* size class which has chunk size big enough to hold the given size.
*/
static int get_size_class_index(int size)
{
int idx = 0;
@ -306,22 +286,14 @@ static int get_size_class_index(int size)
return idx;
}
/*
* For each size class, zspages are divided into different groups
* depending on how "full" they are. This was done so that we could
* easily find empty or nearly empty zspages when we try to shrink
* the pool (not yet implemented). This function returns fullness
* status of the given page.
*/
static enum fullness_group get_fullness_group(struct page *page,
struct size_class *class)
static enum fullness_group get_fullness_group(struct page *page)
{
int inuse, max_objects;
enum fullness_group fg;
BUG_ON(!is_first_page(page));
inuse = page->inuse;
max_objects = class->pages_per_zspage * PAGE_SIZE / class->size;
max_objects = page->objects;
if (inuse == 0)
fg = ZS_EMPTY;
@ -335,12 +307,6 @@ static enum fullness_group get_fullness_group(struct page *page,
return fg;
}
/*
* Each size class maintains various freelists and zspages are assigned
* to one of these freelists based on the number of live objects they
* have. This function inserts the given zspage into the freelist
* identified by <class, fullness_group>.
*/
static void insert_zspage(struct page *page, struct size_class *class,
enum fullness_group fullness)
{
@ -358,10 +324,6 @@ static void insert_zspage(struct page *page, struct size_class *class,
*head = page;
}
/*
* This function removes the given zspage from the freelist identified
* by <class, fullness_group>.
*/
static void remove_zspage(struct page *page, struct size_class *class,
enum fullness_group fullness)
{
@ -383,15 +345,6 @@ static void remove_zspage(struct page *page, struct size_class *class,
list_del_init(&page->lru);
}
/*
* Each size class maintains zspages in different fullness groups depending
* on the number of live objects they contain. When allocating or freeing
* objects, the fullness status of the page can change, say, from ALMOST_FULL
* to ALMOST_EMPTY when freeing an object. This function checks if such
* a status change has occurred for the given page and accordingly moves the
* page from the freelist of the old fullness group to that of the new
* fullness group.
*/
static enum fullness_group fix_fullness_group(struct zs_pool *pool,
struct page *page)
{
@ -402,11 +355,11 @@ static enum fullness_group fix_fullness_group(struct zs_pool *pool,
BUG_ON(!is_first_page(page));
get_zspage_mapping(page, &class_idx, &currfg);
class = &pool->size_class[class_idx];
newfg = get_fullness_group(page, class);
newfg = get_fullness_group(page);
if (newfg == currfg)
goto out;
class = &pool->size_class[class_idx];
remove_zspage(page, class, currfg);
insert_zspage(page, class, newfg);
set_zspage_mapping(page, class_idx, newfg);
@ -470,14 +423,19 @@ static struct page *get_next_page(struct page *page)
if (is_last_page(page))
next = NULL;
else if (is_first_page(page))
next = (struct page *)page->private;
next = (struct page *)page_private(page);
else
next = list_entry(page->lru.next, struct page, lru);
return next;
}
/* Encode <page, obj_idx> as a single handle value */
/*
* Encode <page, obj_idx> as a single handle value.
* On hardware platforms with physical memory starting at 0x0 the pfn
* could be 0 so we ensure that the handle will never be 0 by adjusting the
* encoded obj_idx value before encoding.
*/
static void *obj_location_to_handle(struct page *page, unsigned long obj_idx)
{
unsigned long handle;
@ -488,17 +446,21 @@ static void *obj_location_to_handle(struct page *page, unsigned long obj_idx)
}
handle = page_to_pfn(page) << OBJ_INDEX_BITS;
handle |= (obj_idx & OBJ_INDEX_MASK);
handle |= ((obj_idx + 1) & OBJ_INDEX_MASK);
return (void *)handle;
}
/* Decode <page, obj_idx> pair from the given object handle */
/*
* Decode <page, obj_idx> pair from the given object handle. We adjust the
* decoded obj_idx back to its original value since it was adjusted in
* obj_location_to_handle().
*/
static void obj_handle_to_location(unsigned long handle, struct page **page,
unsigned long *obj_idx)
{
*page = pfn_to_page(handle >> OBJ_INDEX_BITS);
*obj_idx = handle & OBJ_INDEX_MASK;
*obj_idx = (handle & OBJ_INDEX_MASK) - 1;
}
static unsigned long obj_idx_to_offset(struct page *page,
@ -522,7 +484,7 @@ static void reset_page(struct page *page)
reset_page_mapcount(page);
}
static void free_zspage(struct zs_ops *ops, struct page *first_page)
static void free_zspage(struct page *first_page)
{
struct page *nextp, *tmp, *head_extra;
@ -532,7 +494,7 @@ static void free_zspage(struct zs_ops *ops, struct page *first_page)
head_extra = (struct page *)page_private(first_page);
reset_page(first_page);
ops->free(first_page);
__free_page(first_page);
/* zspage with only 1 system page */
if (!head_extra)
@ -541,10 +503,10 @@ static void free_zspage(struct zs_ops *ops, struct page *first_page)
list_for_each_entry_safe(nextp, tmp, &head_extra->lru, lru) {
list_del(&nextp->lru);
reset_page(nextp);
ops->free(nextp);
__free_page(nextp);
}
reset_page(head_extra);
ops->free(head_extra);
__free_page(head_extra);
}
/* Initialize a newly allocated zspage */
@ -596,8 +558,7 @@ static void init_zspage(struct page *first_page, struct size_class *class)
/*
* Allocate a zspage for the given size class
*/
static struct page *alloc_zspage(struct zs_ops *ops, struct size_class *class,
gfp_t flags)
static struct page *alloc_zspage(struct size_class *class, gfp_t flags)
{
int i, error;
struct page *first_page = NULL, *uninitialized_var(prev_page);
@ -617,7 +578,7 @@ static struct page *alloc_zspage(struct zs_ops *ops, struct size_class *class,
for (i = 0; i < class->pages_per_zspage; i++) {
struct page *page;
page = ops->alloc(flags);
page = alloc_page(flags);
if (!page)
goto cleanup;
@ -629,7 +590,7 @@ static struct page *alloc_zspage(struct zs_ops *ops, struct size_class *class,
first_page->inuse = 0;
}
if (i == 1)
first_page->private = (unsigned long)page;
set_page_private(first_page, (unsigned long)page);
if (i >= 1)
page->first_page = first_page;
if (i >= 2)
@ -642,12 +603,14 @@ static struct page *alloc_zspage(struct zs_ops *ops, struct size_class *class,
init_zspage(first_page, class);
first_page->freelist = obj_location_to_handle(first_page, 0);
/* Maximum number of objects we can store in this zspage */
first_page->objects = class->pages_per_zspage * PAGE_SIZE / class->size;
error = 0; /* Success */
cleanup:
if (unlikely(error) && first_page) {
free_zspage(ops, first_page);
free_zspage(first_page);
first_page = NULL;
}
@ -668,7 +631,7 @@ static struct page *find_get_zspage(struct size_class *class)
return page;
}
#ifdef CONFIG_PGTABLE_MAPPING
#ifdef USE_PGTABLE_MAPPING
static inline int __zs_cpu_up(struct mapping_area *area)
{
/*
@ -702,14 +665,11 @@ static inline void __zs_unmap_object(struct mapping_area *area,
struct page *pages[2], int off, int size)
{
unsigned long addr = (unsigned long)area->vm_addr;
unsigned long end = addr + (PAGE_SIZE * 2);
flush_cache_vunmap(addr, end);
unmap_kernel_range_noflush(addr, PAGE_SIZE * 2);
flush_tlb_kernel_range(addr, end);
unmap_kernel_range(addr, PAGE_SIZE * 2);
}
#else /* CONFIG_PGTABLE_MAPPING*/
#else /* USE_PGTABLE_MAPPING */
static inline int __zs_cpu_up(struct mapping_area *area)
{
@ -787,7 +747,7 @@ out:
pagefault_enable();
}
#endif /* CONFIG_PGTABLE_MAPPING */
#endif /* USE_PGTABLE_MAPPING */
static int zs_cpu_notifier(struct notifier_block *nb, unsigned long action,
void *pcpu)
@ -820,21 +780,32 @@ static void zs_exit(void)
{
int cpu;
cpu_notifier_register_begin();
for_each_online_cpu(cpu)
zs_cpu_notifier(NULL, CPU_DEAD, (void *)(long)cpu);
unregister_cpu_notifier(&zs_cpu_nb);
__unregister_cpu_notifier(&zs_cpu_nb);
cpu_notifier_register_done();
}
static int zs_init(void)
{
int cpu, ret;
register_cpu_notifier(&zs_cpu_nb);
cpu_notifier_register_begin();
__register_cpu_notifier(&zs_cpu_nb);
for_each_online_cpu(cpu) {
ret = zs_cpu_notifier(NULL, CPU_UP_PREPARE, (void *)(long)cpu);
if (notifier_to_errno(ret))
if (notifier_to_errno(ret)) {
cpu_notifier_register_done();
goto fail;
}
}
cpu_notifier_register_done();
return 0;
fail:
zs_exit();
@ -844,7 +815,6 @@ fail:
/**
* zs_create_pool - Creates an allocation pool to work from.
* @flags: allocation flags used to allocate pool metadata
* @ops: allocation/free callbacks for expanding the pool
*
* This function must be called before anything when using
* the zsmalloc allocator.
@ -852,13 +822,13 @@ fail:
* On success, a pointer to the newly created pool is returned,
* otherwise NULL.
*/
struct zs_pool *zs_create_pool(gfp_t flags, struct zs_ops *ops)
struct zs_pool *zs_create_pool(gfp_t flags)
{
int i, ovhd_size;
struct zs_pool *pool;
ovhd_size = roundup(sizeof(*pool), PAGE_SIZE);
pool = kzalloc(ovhd_size, flags);
pool = kzalloc(ovhd_size, GFP_KERNEL);
if (!pool)
return NULL;
@ -878,10 +848,7 @@ struct zs_pool *zs_create_pool(gfp_t flags, struct zs_ops *ops)
}
if (ops)
pool->ops = ops;
else
pool->ops = &zs_default_ops;
pool->flags = flags;
return pool;
}
@ -897,8 +864,7 @@ void zs_destroy_pool(struct zs_pool *pool)
for (fg = 0; fg < _ZS_NR_FULLNESS_GROUPS; fg++) {
if (class->fullness_list[fg]) {
pr_info("Freeing non-empty class with size "
"%db, fullness group %d\n",
pr_info("Freeing non-empty class with size %db, fullness group %d\n",
class->size, fg);
}
}
@ -916,7 +882,7 @@ EXPORT_SYMBOL_GPL(zs_destroy_pool);
* otherwise 0.
* Allocation requests with size > ZS_MAX_ALLOC_SIZE will fail.
*/
unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t flags)
unsigned long zs_malloc(struct zs_pool *pool, size_t size)
{
unsigned long obj;
struct link_free *link;
@ -938,7 +904,7 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t flags)
if (!first_page) {
spin_unlock(&class->lock);
first_page = alloc_zspage(pool->ops, class, flags);
first_page = alloc_zspage(class, pool->flags);
if (unlikely(!first_page))
return 0;
@ -1004,7 +970,7 @@ void zs_free(struct zs_pool *pool, unsigned long obj)
spin_unlock(&class->lock);
if (fullness == ZS_EMPTY)
free_zspage(pool->ops, first_page);
free_zspage(first_page);
}
EXPORT_SYMBOL_GPL(zs_free);
@ -1021,7 +987,7 @@ EXPORT_SYMBOL_GPL(zs_free);
* against nested mappings.
*
* This function returns with preemption and page faults disabled.
*/
*/
void *zs_map_object(struct zs_pool *pool, unsigned long handle,
enum zs_mapmode mm)
{