mirror of
https://github.com/followmsi/android_kernel_google_msm.git
synced 2024-11-06 23:17:41 +00:00
5f7e2650e1
memfd_create() is similar to mmap(MAP_ANON), but returns a file-descriptor that you can pass to mmap(). It can support sealing and avoids any connection to user-visible mount-points. Thus, it's not subject to quotas on mounted file-systems, but can be used like malloc()'ed memory, but with a file-descriptor to it. memfd_create() returns the raw shmem file, so calls like ftruncate() can be used to modify the underlying inode. Also calls like fstat() will return proper information and mark the file as regular file. If you want sealing, you can specify MFD_ALLOW_SEALING. Otherwise, sealing is not supported (like on all other regular files). Compared to O_TMPFILE, it does not require a tmpfs mount-point and is not subject to a filesystem size limit. It is still properly accounted to memcg limits, though, and to the same overcommit or no-overcommit accounting as all user memory. Change-Id: Iaf959293e2c490523aeb46d56cc45b0e7bbe7bf5 Signed-off-by: David Herrmann <dh.herrmann@gmail.com> Acked-by: Hugh Dickins <hughd@google.com> Cc: Michael Kerrisk <mtk.manpages@gmail.com> Cc: Ryan Lortie <desrt@desrt.ca> Cc: Lennart Poettering <lennart@poettering.net> Cc: Daniel Mack <zonque@gmail.com> Cc: Andy Lutomirski <luto@amacapital.net> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> Signed-off-by: Angelo G. Del Regno <kholk11@gmail.com>
2913 lines
72 KiB
C
2913 lines
72 KiB
C
/*
|
|
* Resizable virtual memory filesystem for Linux.
|
|
*
|
|
* Copyright (C) 2000 Linus Torvalds.
|
|
* 2000 Transmeta Corp.
|
|
* 2000-2001 Christoph Rohland
|
|
* 2000-2001 SAP AG
|
|
* 2002 Red Hat Inc.
|
|
* Copyright (C) 2002-2011 Hugh Dickins.
|
|
* Copyright (C) 2011 Google Inc.
|
|
* Copyright (C) 2002-2005 VERITAS Software Corporation.
|
|
* Copyright (C) 2004 Andi Kleen, SuSE Labs
|
|
*
|
|
* Extended attribute support for tmpfs:
|
|
* Copyright (c) 2004, Luke Kenneth Casson Leighton <lkcl@lkcl.net>
|
|
* Copyright (c) 2004 Red Hat, Inc., James Morris <jmorris@redhat.com>
|
|
*
|
|
* tiny-shmem:
|
|
* Copyright (c) 2004, 2008 Matt Mackall <mpm@selenic.com>
|
|
*
|
|
* This file is released under the GPL.
|
|
*/
|
|
|
|
#include <linux/fs.h>
|
|
#include <linux/init.h>
|
|
#include <linux/vfs.h>
|
|
#include <linux/mount.h>
|
|
#include <linux/pagemap.h>
|
|
#include <linux/file.h>
|
|
#include <linux/mm.h>
|
|
#include <linux/export.h>
|
|
#include <linux/swap.h>
|
|
|
|
static struct vfsmount *shm_mnt;
|
|
|
|
#ifdef CONFIG_SHMEM
|
|
/*
|
|
* This virtual memory filesystem is heavily based on the ramfs. It
|
|
* extends ramfs by the ability to use swap and honor resource limits
|
|
* which makes it a completely usable filesystem.
|
|
*/
|
|
|
|
#include <linux/xattr.h>
|
|
#include <linux/exportfs.h>
|
|
#include <linux/posix_acl.h>
|
|
#include <linux/generic_acl.h>
|
|
#include <linux/mman.h>
|
|
#include <linux/string.h>
|
|
#include <linux/slab.h>
|
|
#include <linux/backing-dev.h>
|
|
#include <linux/shmem_fs.h>
|
|
#include <linux/writeback.h>
|
|
#include <linux/blkdev.h>
|
|
#include <linux/pagevec.h>
|
|
#include <linux/percpu_counter.h>
|
|
#include <linux/splice.h>
|
|
#include <linux/security.h>
|
|
#include <linux/swapops.h>
|
|
#include <linux/mempolicy.h>
|
|
#include <linux/namei.h>
|
|
#include <linux/ctype.h>
|
|
#include <linux/migrate.h>
|
|
#include <linux/highmem.h>
|
|
#include <linux/seq_file.h>
|
|
#include <linux/syscalls.h>
|
|
#include <linux/magic.h>
|
|
#include <uapi/linux/memfd.h>
|
|
|
|
#include <asm/uaccess.h>
|
|
#include <asm/pgtable.h>
|
|
|
|
/* How many 512-byte units fit in one page-cache page (used for i_blocks) */
#define BLOCKS_PER_PAGE		(PAGE_CACHE_SIZE / 512)
/* Page count charged for an object of @size bytes, rounded up to a page */
#define VM_ACCT(size)		(PAGE_CACHE_ALIGN(size) >> PAGE_SHIFT)

/* Each directory entry is pretended to have this size in the dir's i_size */
#define BOGO_DIRENT_SIZE	20

/* Symlinks up to this length are kmalloc'ed rather than kept in a swappable page */
#define SHORT_SYMLINK_LEN	128
|
|
|
|
/*
|
|
* vmtruncate_range() communicates with shmem_fault via
|
|
* inode->i_private (with i_mutex making sure that it has only one user at
|
|
* a time): we would prefer not to enlarge the shmem inode just for that.
|
|
*/
|
|
struct shmem_falloc {
|
|
wait_queue_head_t *waitq; /* faults into hole wait for punch to end */
|
|
pgoff_t start; /* start of range currently being fallocated */
|
|
pgoff_t next; /* the next page offset to be fallocated */
|
|
};
|
|
|
|
struct shmem_xattr {
|
|
struct list_head list; /* anchored by shmem_inode_info->xattr_list */
|
|
char *name; /* xattr name */
|
|
size_t size;
|
|
char value[0];
|
|
};
|
|
|
|
/* Tells shmem_getpage what it is allowed to do when looking up a page */
enum sgp_type {
	/* stay within i_size, never allocate a page */
	SGP_READ,
	/* stay within i_size, allocation permitted */
	SGP_CACHE,
	/* same as SGP_CACHE, but a newly allocated page is marked dirty */
	SGP_DIRTY,
	/* may go beyond i_size, allocation permitted */
	SGP_WRITE,
};
|
|
|
|
#ifdef CONFIG_TMPFS
|
|
static unsigned long shmem_default_max_blocks(void)
|
|
{
|
|
return totalram_pages / 2;
|
|
}
|
|
|
|
static unsigned long shmem_default_max_inodes(void)
|
|
{
|
|
return min(totalram_pages - totalhigh_pages, totalram_pages / 2);
|
|
}
|
|
#endif
|
|
|
|
static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
|
|
struct page **pagep, enum sgp_type sgp, gfp_t gfp, int *fault_type);
|
|
|
|
/*
 * Convenience wrapper around shmem_getpage_gfp() that takes the gfp mask
 * from the inode's own mapping.
 */
static inline int shmem_getpage(struct inode *inode, pgoff_t index,
	struct page **pagep, enum sgp_type sgp, int *fault_type)
{
	gfp_t mapping_gfp = mapping_gfp_mask(inode->i_mapping);

	return shmem_getpage_gfp(inode, index, pagep, sgp,
				 mapping_gfp, fault_type);
}
|
|
|
|
static inline struct shmem_sb_info *SHMEM_SB(struct super_block *sb)
|
|
{
|
|
return sb->s_fs_info;
|
|
}
|
|
|
|
/*
|
|
* shmem_file_setup pre-accounts the whole fixed size of a VM object,
|
|
* for shared memory and for shared anonymous (/dev/zero) mappings
|
|
* (unless MAP_NORESERVE and sysctl_overcommit_memory <= 1),
|
|
* consistent with the pre-accounting of private mappings ...
|
|
*/
|
|
static inline int shmem_acct_size(unsigned long flags, loff_t size)
|
|
{
|
|
return (flags & VM_NORESERVE) ?
|
|
0 : security_vm_enough_memory_mm(current->mm, VM_ACCT(size));
|
|
}
|
|
|
|
/* Undo shmem_acct_size(): nothing was charged when VM_NORESERVE is set. */
static inline void shmem_unacct_size(unsigned long flags, loff_t size)
{
	if (flags & VM_NORESERVE)
		return;

	vm_unacct_memory(VM_ACCT(size));
}
|
|
|
|
/*
|
|
* ... whereas tmpfs objects are accounted incrementally as
|
|
* pages are allocated, in order to allow huge sparse files.
|
|
* shmem_getpage reports shmem_acct_block failure as -ENOSPC not -ENOMEM,
|
|
* so that a failure on a sparse tmpfs mapping will give SIGBUS not OOM.
|
|
*/
|
|
static inline int shmem_acct_block(unsigned long flags)
|
|
{
|
|
return (flags & VM_NORESERVE) ?
|
|
security_vm_enough_memory_mm(current->mm, VM_ACCT(PAGE_CACHE_SIZE)) : 0;
|
|
}
|
|
|
|
static inline void shmem_unacct_blocks(unsigned long flags, long pages)
|
|
{
|
|
if (flags & VM_NORESERVE)
|
|
vm_unacct_memory(pages * VM_ACCT(PAGE_CACHE_SIZE));
|
|
}
|
|
|
|
static const struct super_operations shmem_ops;
|
|
static const struct address_space_operations shmem_aops;
|
|
static const struct file_operations shmem_file_operations;
|
|
static const struct inode_operations shmem_inode_operations;
|
|
static const struct inode_operations shmem_dir_inode_operations;
|
|
static const struct inode_operations shmem_special_inode_operations;
|
|
static const struct vm_operations_struct shmem_vm_ops;
|
|
|
|
/* Backing device description shared by shmem mappings */
static struct backing_dev_info shmem_backing_dev_info  __read_mostly = {
	/* readahead is switched off */
	.ra_pages	= 0,
	.capabilities	= BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_SWAP_BACKED,
};

/* List of inodes with swapped-out pages, and the mutex guarding it */
static LIST_HEAD(shmem_swaplist);
static DEFINE_MUTEX(shmem_swaplist_mutex);
|
|
|
|
static int shmem_reserve_inode(struct super_block *sb)
|
|
{
|
|
struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
|
|
if (sbinfo->max_inodes) {
|
|
spin_lock(&sbinfo->stat_lock);
|
|
if (!sbinfo->free_inodes) {
|
|
spin_unlock(&sbinfo->stat_lock);
|
|
return -ENOSPC;
|
|
}
|
|
sbinfo->free_inodes--;
|
|
spin_unlock(&sbinfo->stat_lock);
|
|
}
|
|
return 0;
|
|
}
|
|
|
|
static void shmem_free_inode(struct super_block *sb)
|
|
{
|
|
struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
|
|
if (sbinfo->max_inodes) {
|
|
spin_lock(&sbinfo->stat_lock);
|
|
sbinfo->free_inodes++;
|
|
spin_unlock(&sbinfo->stat_lock);
|
|
}
|
|
}
|
|
|
|
/**
|
|
* shmem_recalc_inode - recalculate the block usage of an inode
|
|
* @inode: inode to recalc
|
|
*
|
|
* We have to calculate the free blocks since the mm can drop
|
|
* undirtied hole pages behind our back.
|
|
*
|
|
* But normally info->alloced == inode->i_mapping->nrpages + info->swapped
|
|
* So mm freed is info->alloced - (inode->i_mapping->nrpages + info->swapped)
|
|
*
|
|
* It has to be called with the spinlock held.
|
|
*/
|
|
static void shmem_recalc_inode(struct inode *inode)
|
|
{
|
|
struct shmem_inode_info *info = SHMEM_I(inode);
|
|
long freed;
|
|
|
|
freed = info->alloced - info->swapped - inode->i_mapping->nrpages;
|
|
if (freed > 0) {
|
|
struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
|
|
if (sbinfo->max_blocks)
|
|
percpu_counter_add(&sbinfo->used_blocks, -freed);
|
|
info->alloced -= freed;
|
|
inode->i_blocks -= freed * BLOCKS_PER_PAGE;
|
|
shmem_unacct_blocks(info->flags, freed);
|
|
}
|
|
}
|
|
|
|
/*
|
|
* Replace item expected in radix tree by a new item, while holding tree lock.
|
|
*/
|
|
static int shmem_radix_tree_replace(struct address_space *mapping,
|
|
pgoff_t index, void *expected, void *replacement)
|
|
{
|
|
void **pslot;
|
|
void *item = NULL;
|
|
|
|
VM_BUG_ON(!expected);
|
|
pslot = radix_tree_lookup_slot(&mapping->page_tree, index);
|
|
if (pslot)
|
|
item = radix_tree_deref_slot_protected(pslot,
|
|
&mapping->tree_lock);
|
|
if (item != expected)
|
|
return -ENOENT;
|
|
if (replacement)
|
|
radix_tree_replace_slot(pslot, replacement);
|
|
else
|
|
radix_tree_delete(&mapping->page_tree, index);
|
|
return 0;
|
|
}
|
|
|
|
/*
|
|
* Like add_to_page_cache_locked, but error if expected item has gone.
|
|
*/
|
|
static int shmem_add_to_page_cache(struct page *page,
|
|
struct address_space *mapping,
|
|
pgoff_t index, gfp_t gfp, void *expected)
|
|
{
|
|
int error = 0;
|
|
|
|
VM_BUG_ON(!PageLocked(page));
|
|
VM_BUG_ON(!PageSwapBacked(page));
|
|
|
|
if (!expected)
|
|
error = radix_tree_preload(gfp & GFP_RECLAIM_MASK);
|
|
if (!error) {
|
|
page_cache_get(page);
|
|
page->mapping = mapping;
|
|
page->index = index;
|
|
|
|
spin_lock_irq(&mapping->tree_lock);
|
|
if (!expected)
|
|
error = radix_tree_insert(&mapping->page_tree,
|
|
index, page);
|
|
else
|
|
error = shmem_radix_tree_replace(mapping, index,
|
|
expected, page);
|
|
if (!error) {
|
|
mapping->nrpages++;
|
|
__inc_zone_page_state(page, NR_FILE_PAGES);
|
|
__inc_zone_page_state(page, NR_SHMEM);
|
|
spin_unlock_irq(&mapping->tree_lock);
|
|
} else {
|
|
page->mapping = NULL;
|
|
spin_unlock_irq(&mapping->tree_lock);
|
|
page_cache_release(page);
|
|
}
|
|
if (!expected)
|
|
radix_tree_preload_end();
|
|
}
|
|
if (error)
|
|
mem_cgroup_uncharge_cache_page(page);
|
|
return error;
|
|
}
|
|
|
|
/*
|
|
* Like delete_from_page_cache, but substitutes swap for page.
|
|
*/
|
|
static void shmem_delete_from_page_cache(struct page *page, void *radswap)
|
|
{
|
|
struct address_space *mapping = page->mapping;
|
|
int error;
|
|
|
|
spin_lock_irq(&mapping->tree_lock);
|
|
error = shmem_radix_tree_replace(mapping, page->index, page, radswap);
|
|
page->mapping = NULL;
|
|
mapping->nrpages--;
|
|
__dec_zone_page_state(page, NR_FILE_PAGES);
|
|
__dec_zone_page_state(page, NR_SHMEM);
|
|
spin_unlock_irq(&mapping->tree_lock);
|
|
page_cache_release(page);
|
|
BUG_ON(error);
|
|
}
|
|
|
|
/*
|
|
* Like find_get_pages, but collecting swap entries as well as pages.
|
|
*/
|
|
static unsigned shmem_find_get_pages_and_swap(struct address_space *mapping,
|
|
pgoff_t start, unsigned int nr_pages,
|
|
struct page **pages, pgoff_t *indices)
|
|
{
|
|
unsigned int i;
|
|
unsigned int ret;
|
|
unsigned int nr_found;
|
|
|
|
rcu_read_lock();
|
|
restart:
|
|
nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree,
|
|
(void ***)pages, indices, start, nr_pages);
|
|
ret = 0;
|
|
for (i = 0; i < nr_found; i++) {
|
|
struct page *page;
|
|
repeat:
|
|
page = radix_tree_deref_slot((void **)pages[i]);
|
|
if (unlikely(!page))
|
|
continue;
|
|
if (radix_tree_exception(page)) {
|
|
if (radix_tree_deref_retry(page))
|
|
goto restart;
|
|
/*
|
|
* Otherwise, we must be storing a swap entry
|
|
* here as an exceptional entry: so return it
|
|
* without attempting to raise page count.
|
|
*/
|
|
goto export;
|
|
}
|
|
if (!page_cache_get_speculative(page))
|
|
goto repeat;
|
|
|
|
/* Has the page moved? */
|
|
if (unlikely(page != *((void **)pages[i]))) {
|
|
page_cache_release(page);
|
|
goto repeat;
|
|
}
|
|
export:
|
|
indices[ret] = indices[i];
|
|
pages[ret] = page;
|
|
ret++;
|
|
}
|
|
if (unlikely(!ret && nr_found))
|
|
goto restart;
|
|
rcu_read_unlock();
|
|
return ret;
|
|
}
|
|
|
|
/*
|
|
* Remove swap entry from radix tree, free the swap and its page cache.
|
|
*/
|
|
static int shmem_free_swap(struct address_space *mapping,
|
|
pgoff_t index, void *radswap)
|
|
{
|
|
int error;
|
|
|
|
spin_lock_irq(&mapping->tree_lock);
|
|
error = shmem_radix_tree_replace(mapping, index, radswap, NULL);
|
|
spin_unlock_irq(&mapping->tree_lock);
|
|
if (!error)
|
|
free_swap_and_cache(radix_to_swp_entry(radswap));
|
|
return error;
|
|
}
|
|
|
|
/*
|
|
* Pagevec may contain swap entries, so shuffle up pages before releasing.
|
|
*/
|
|
static void shmem_deswap_pagevec(struct pagevec *pvec)
|
|
{
|
|
int i, j;
|
|
|
|
for (i = 0, j = 0; i < pagevec_count(pvec); i++) {
|
|
struct page *page = pvec->pages[i];
|
|
if (!radix_tree_exceptional_entry(page))
|
|
pvec->pages[j++] = page;
|
|
}
|
|
pvec->nr = j;
|
|
}
|
|
|
|
/*
|
|
* SysV IPC SHM_UNLOCK restore Unevictable pages to their evictable lists.
|
|
*/
|
|
void shmem_unlock_mapping(struct address_space *mapping)
|
|
{
|
|
struct pagevec pvec;
|
|
pgoff_t indices[PAGEVEC_SIZE];
|
|
pgoff_t index = 0;
|
|
|
|
pagevec_init(&pvec, 0);
|
|
/*
|
|
* Minor point, but we might as well stop if someone else SHM_LOCKs it.
|
|
*/
|
|
while (!mapping_unevictable(mapping)) {
|
|
/*
|
|
* Avoid pagevec_lookup(): find_get_pages() returns 0 as if it
|
|
* has finished, if it hits a row of PAGEVEC_SIZE swap entries.
|
|
*/
|
|
pvec.nr = shmem_find_get_pages_and_swap(mapping, index,
|
|
PAGEVEC_SIZE, pvec.pages, indices);
|
|
if (!pvec.nr)
|
|
break;
|
|
index = indices[pvec.nr - 1] + 1;
|
|
shmem_deswap_pagevec(&pvec);
|
|
check_move_unevictable_pages(pvec.pages, pvec.nr);
|
|
pagevec_release(&pvec);
|
|
cond_resched();
|
|
}
|
|
}
|
|
|
|
/*
|
|
* Remove range of pages and swap entries from radix tree, and free them.
|
|
*/
|
|
void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
|
|
{
|
|
struct address_space *mapping = inode->i_mapping;
|
|
struct shmem_inode_info *info = SHMEM_I(inode);
|
|
pgoff_t start = (lstart + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
|
|
unsigned partial = lstart & (PAGE_CACHE_SIZE - 1);
|
|
pgoff_t end = (lend >> PAGE_CACHE_SHIFT);
|
|
struct pagevec pvec;
|
|
pgoff_t indices[PAGEVEC_SIZE];
|
|
long nr_swaps_freed = 0;
|
|
pgoff_t index;
|
|
int i;
|
|
|
|
BUG_ON((lend & (PAGE_CACHE_SIZE - 1)) != (PAGE_CACHE_SIZE - 1));
|
|
|
|
pagevec_init(&pvec, 0);
|
|
index = start;
|
|
while (index <= end) {
|
|
pvec.nr = shmem_find_get_pages_and_swap(mapping, index,
|
|
min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1,
|
|
pvec.pages, indices);
|
|
if (!pvec.nr)
|
|
break;
|
|
mem_cgroup_uncharge_start();
|
|
for (i = 0; i < pagevec_count(&pvec); i++) {
|
|
struct page *page = pvec.pages[i];
|
|
|
|
index = indices[i];
|
|
if (index > end)
|
|
break;
|
|
|
|
if (radix_tree_exceptional_entry(page)) {
|
|
nr_swaps_freed += !shmem_free_swap(mapping,
|
|
index, page);
|
|
continue;
|
|
}
|
|
|
|
if (!trylock_page(page))
|
|
continue;
|
|
if (page->mapping == mapping) {
|
|
VM_BUG_ON(PageWriteback(page));
|
|
truncate_inode_page(mapping, page);
|
|
}
|
|
unlock_page(page);
|
|
}
|
|
shmem_deswap_pagevec(&pvec);
|
|
pagevec_release(&pvec);
|
|
mem_cgroup_uncharge_end();
|
|
cond_resched();
|
|
index++;
|
|
}
|
|
|
|
if (partial) {
|
|
struct page *page = NULL;
|
|
shmem_getpage(inode, start - 1, &page, SGP_READ, NULL);
|
|
if (page) {
|
|
zero_user_segment(page, partial, PAGE_CACHE_SIZE);
|
|
set_page_dirty(page);
|
|
unlock_page(page);
|
|
page_cache_release(page);
|
|
}
|
|
}
|
|
|
|
index = start;
|
|
while (index <= end) {
|
|
cond_resched();
|
|
pvec.nr = shmem_find_get_pages_and_swap(mapping, index,
|
|
min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1,
|
|
pvec.pages, indices);
|
|
if (!pvec.nr) {
|
|
/* If all gone or hole-punch, we're done */
|
|
if (index == start || end != -1)
|
|
break;
|
|
/* But if truncating, restart to make sure all gone */
|
|
index = start;
|
|
continue;
|
|
}
|
|
mem_cgroup_uncharge_start();
|
|
for (i = 0; i < pagevec_count(&pvec); i++) {
|
|
struct page *page = pvec.pages[i];
|
|
|
|
index = indices[i];
|
|
if (index > end)
|
|
break;
|
|
|
|
if (radix_tree_exceptional_entry(page)) {
|
|
if (shmem_free_swap(mapping, index, page)) {
|
|
/* Swap was replaced by page: retry */
|
|
index--;
|
|
break;
|
|
}
|
|
nr_swaps_freed++;
|
|
continue;
|
|
}
|
|
|
|
lock_page(page);
|
|
if (page->mapping == mapping) {
|
|
VM_BUG_ON(PageWriteback(page));
|
|
truncate_inode_page(mapping, page);
|
|
} else {
|
|
/* Page was replaced by swap: retry */
|
|
unlock_page(page);
|
|
index--;
|
|
break;
|
|
}
|
|
unlock_page(page);
|
|
}
|
|
shmem_deswap_pagevec(&pvec);
|
|
pagevec_release(&pvec);
|
|
mem_cgroup_uncharge_end();
|
|
index++;
|
|
}
|
|
|
|
spin_lock(&info->lock);
|
|
info->swapped -= nr_swaps_freed;
|
|
shmem_recalc_inode(inode);
|
|
spin_unlock(&info->lock);
|
|
|
|
inode->i_ctime = inode->i_mtime = CURRENT_TIME;
|
|
}
|
|
EXPORT_SYMBOL_GPL(shmem_truncate_range);
|
|
|
|
static int shmem_setattr(struct dentry *dentry, struct iattr *attr)
|
|
{
|
|
struct inode *inode = dentry->d_inode;
|
|
int error;
|
|
|
|
error = inode_change_ok(inode, attr);
|
|
if (error)
|
|
return error;
|
|
|
|
if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) {
|
|
loff_t oldsize = inode->i_size;
|
|
loff_t newsize = attr->ia_size;
|
|
|
|
if (newsize != oldsize) {
|
|
i_size_write(inode, newsize);
|
|
inode->i_ctime = inode->i_mtime = CURRENT_TIME;
|
|
}
|
|
if (newsize < oldsize) {
|
|
loff_t holebegin = round_up(newsize, PAGE_SIZE);
|
|
unmap_mapping_range(inode->i_mapping, holebegin, 0, 1);
|
|
shmem_truncate_range(inode, newsize, (loff_t)-1);
|
|
/* unmap again to remove racily COWed private pages */
|
|
unmap_mapping_range(inode->i_mapping, holebegin, 0, 1);
|
|
}
|
|
}
|
|
|
|
setattr_copy(inode, attr);
|
|
#ifdef CONFIG_TMPFS_POSIX_ACL
|
|
if (attr->ia_valid & ATTR_MODE)
|
|
error = generic_acl_chmod(inode);
|
|
#endif
|
|
return error;
|
|
}
|
|
|
|
static void shmem_evict_inode(struct inode *inode)
|
|
{
|
|
struct shmem_inode_info *info = SHMEM_I(inode);
|
|
struct shmem_xattr *xattr, *nxattr;
|
|
|
|
if (inode->i_mapping->a_ops == &shmem_aops) {
|
|
shmem_unacct_size(info->flags, inode->i_size);
|
|
inode->i_size = 0;
|
|
shmem_truncate_range(inode, 0, (loff_t)-1);
|
|
if (!list_empty(&info->swaplist)) {
|
|
mutex_lock(&shmem_swaplist_mutex);
|
|
list_del_init(&info->swaplist);
|
|
mutex_unlock(&shmem_swaplist_mutex);
|
|
}
|
|
} else
|
|
kfree(info->symlink);
|
|
|
|
list_for_each_entry_safe(xattr, nxattr, &info->xattr_list, list) {
|
|
kfree(xattr->name);
|
|
kfree(xattr);
|
|
}
|
|
WARN_ON(inode->i_blocks);
|
|
shmem_free_inode(inode->i_sb);
|
|
end_writeback(inode);
|
|
}
|
|
|
|
/*
|
|
* If swap found in inode, free it and move page from swapcache to filecache.
|
|
*/
|
|
static int shmem_unuse_inode(struct shmem_inode_info *info,
|
|
swp_entry_t swap, struct page *page)
|
|
{
|
|
struct address_space *mapping = info->vfs_inode.i_mapping;
|
|
void *radswap;
|
|
pgoff_t index;
|
|
int error;
|
|
|
|
radswap = swp_to_radix_entry(swap);
|
|
index = radix_tree_locate_item(&mapping->page_tree, radswap);
|
|
if (index == -1)
|
|
return 0;
|
|
|
|
/*
|
|
* Move _head_ to start search for next from here.
|
|
* But be careful: shmem_evict_inode checks list_empty without taking
|
|
* mutex, and there's an instant in list_move_tail when info->swaplist
|
|
* would appear empty, if it were the only one on shmem_swaplist.
|
|
*/
|
|
if (shmem_swaplist.next != &info->swaplist)
|
|
list_move_tail(&shmem_swaplist, &info->swaplist);
|
|
|
|
/*
|
|
* We rely on shmem_swaplist_mutex, not only to protect the swaplist,
|
|
* but also to hold up shmem_evict_inode(): so inode cannot be freed
|
|
* beneath us (pagelock doesn't help until the page is in pagecache).
|
|
*/
|
|
error = shmem_add_to_page_cache(page, mapping, index,
|
|
GFP_NOWAIT, radswap);
|
|
/* which does mem_cgroup_uncharge_cache_page on error */
|
|
|
|
if (error != -ENOMEM) {
|
|
/*
|
|
* Truncation and eviction use free_swap_and_cache(), which
|
|
* only does trylock page: if we raced, best clean up here.
|
|
*/
|
|
delete_from_swap_cache(page);
|
|
set_page_dirty(page);
|
|
if (!error) {
|
|
spin_lock(&info->lock);
|
|
info->swapped--;
|
|
spin_unlock(&info->lock);
|
|
swap_free(swap);
|
|
}
|
|
error = 1; /* not an error, but entry was found */
|
|
}
|
|
return error;
|
|
}
|
|
|
|
/*
|
|
* Search through swapped inodes to find and replace swap by page.
|
|
*/
|
|
int shmem_unuse(swp_entry_t swap, struct page *page)
|
|
{
|
|
struct list_head *this, *next;
|
|
struct shmem_inode_info *info;
|
|
int found = 0;
|
|
int error;
|
|
|
|
/*
|
|
* Charge page using GFP_KERNEL while we can wait, before taking
|
|
* the shmem_swaplist_mutex which might hold up shmem_writepage().
|
|
* Charged back to the user (not to caller) when swap account is used.
|
|
*/
|
|
error = mem_cgroup_cache_charge(page, current->mm, GFP_KERNEL);
|
|
if (error)
|
|
goto out;
|
|
/* No radix_tree_preload: swap entry keeps a place for page in tree */
|
|
|
|
mutex_lock(&shmem_swaplist_mutex);
|
|
list_for_each_safe(this, next, &shmem_swaplist) {
|
|
info = list_entry(this, struct shmem_inode_info, swaplist);
|
|
if (info->swapped)
|
|
found = shmem_unuse_inode(info, swap, page);
|
|
else
|
|
list_del_init(&info->swaplist);
|
|
cond_resched();
|
|
if (found)
|
|
break;
|
|
}
|
|
mutex_unlock(&shmem_swaplist_mutex);
|
|
|
|
if (!found)
|
|
mem_cgroup_uncharge_cache_page(page);
|
|
if (found < 0)
|
|
error = found;
|
|
out:
|
|
unlock_page(page);
|
|
page_cache_release(page);
|
|
return error;
|
|
}
|
|
|
|
/*
|
|
* Move the page from the page cache to the swap cache.
|
|
*/
|
|
static int shmem_writepage(struct page *page, struct writeback_control *wbc)
|
|
{
|
|
struct shmem_inode_info *info;
|
|
struct address_space *mapping;
|
|
struct inode *inode;
|
|
swp_entry_t swap;
|
|
pgoff_t index;
|
|
|
|
BUG_ON(!PageLocked(page));
|
|
mapping = page->mapping;
|
|
index = page->index;
|
|
inode = mapping->host;
|
|
info = SHMEM_I(inode);
|
|
if (info->flags & VM_LOCKED)
|
|
goto redirty;
|
|
if (!total_swap_pages)
|
|
goto redirty;
|
|
|
|
/*
|
|
* shmem_backing_dev_info's capabilities prevent regular writeback or
|
|
* sync from ever calling shmem_writepage; but a stacking filesystem
|
|
* might use ->writepage of its underlying filesystem, in which case
|
|
* tmpfs should write out to swap only in response to memory pressure,
|
|
* and not for the writeback threads or sync.
|
|
*/
|
|
if (!wbc->for_reclaim) {
|
|
WARN_ON_ONCE(1); /* Still happens? Tell us about it! */
|
|
goto redirty;
|
|
}
|
|
swap = get_swap_page();
|
|
if (!swap.val)
|
|
goto redirty;
|
|
|
|
/*
|
|
* Add inode to shmem_unuse()'s list of swapped-out inodes,
|
|
* if it's not already there. Do it now before the page is
|
|
* moved to swap cache, when its pagelock no longer protects
|
|
* the inode from eviction. But don't unlock the mutex until
|
|
* we've incremented swapped, because shmem_unuse_inode() will
|
|
* prune a !swapped inode from the swaplist under this mutex.
|
|
*/
|
|
mutex_lock(&shmem_swaplist_mutex);
|
|
if (list_empty(&info->swaplist))
|
|
list_add_tail(&info->swaplist, &shmem_swaplist);
|
|
|
|
if (add_to_swap_cache(page, swap, GFP_ATOMIC) == 0) {
|
|
swap_shmem_alloc(swap);
|
|
shmem_delete_from_page_cache(page, swp_to_radix_entry(swap));
|
|
|
|
spin_lock(&info->lock);
|
|
info->swapped++;
|
|
shmem_recalc_inode(inode);
|
|
spin_unlock(&info->lock);
|
|
|
|
mutex_unlock(&shmem_swaplist_mutex);
|
|
BUG_ON(page_mapped(page));
|
|
swap_writepage(page, wbc);
|
|
return 0;
|
|
}
|
|
|
|
mutex_unlock(&shmem_swaplist_mutex);
|
|
swapcache_free(swap, NULL);
|
|
redirty:
|
|
set_page_dirty(page);
|
|
if (wbc->for_reclaim)
|
|
return AOP_WRITEPAGE_ACTIVATE; /* Return with page locked */
|
|
unlock_page(page);
|
|
return 0;
|
|
}
|
|
|
|
#ifdef CONFIG_NUMA
|
|
#ifdef CONFIG_TMPFS
|
|
static void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol)
|
|
{
|
|
char buffer[64];
|
|
|
|
if (!mpol || mpol->mode == MPOL_DEFAULT)
|
|
return; /* show nothing */
|
|
|
|
mpol_to_str(buffer, sizeof(buffer), mpol, 1);
|
|
|
|
seq_printf(seq, ",mpol=%s", buffer);
|
|
}
|
|
|
|
static struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo)
|
|
{
|
|
struct mempolicy *mpol = NULL;
|
|
if (sbinfo->mpol) {
|
|
spin_lock(&sbinfo->stat_lock); /* prevent replace/use races */
|
|
mpol = sbinfo->mpol;
|
|
mpol_get(mpol);
|
|
spin_unlock(&sbinfo->stat_lock);
|
|
}
|
|
return mpol;
|
|
}
|
|
#endif /* CONFIG_TMPFS */
|
|
|
|
/*
 * Read @swap back in through swapin_readahead(), using the inode's shared
 * policy for @index by way of a throwaway vma.
 */
static struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp,
			struct shmem_inode_info *info, pgoff_t index)
{
	struct vm_area_struct pseudo_vma;
	struct page *result;

	/* A pseudo vma whose only purpose is to carry the policy */
	pseudo_vma.vm_start = 0;
	pseudo_vma.vm_pgoff = index;
	pseudo_vma.vm_ops = NULL;
	pseudo_vma.vm_policy = mpol_shared_policy_lookup(&info->policy, index);

	result = swapin_readahead(swap, gfp, &pseudo_vma, 0);

	/* Release the reference mpol_shared_policy_lookup() took */
	mpol_cond_put(pseudo_vma.vm_policy);

	return result;
}
|
|
|
|
/*
 * Allocate one page with alloc_page_vma(), using the inode's shared
 * policy for @index by way of a throwaway vma.
 */
static struct page *shmem_alloc_page(gfp_t gfp,
			struct shmem_inode_info *info, pgoff_t index)
{
	struct vm_area_struct pseudo_vma;
	struct page *result;

	/* A pseudo vma whose only purpose is to carry the policy */
	pseudo_vma.vm_start = 0;
	pseudo_vma.vm_pgoff = index;
	pseudo_vma.vm_ops = NULL;
	pseudo_vma.vm_policy = mpol_shared_policy_lookup(&info->policy, index);

	result = alloc_page_vma(gfp, &pseudo_vma, 0);

	/* Release the reference mpol_shared_policy_lookup() took */
	mpol_cond_put(pseudo_vma.vm_policy);

	return result;
}
|
|
#else /* !CONFIG_NUMA */
|
|
#ifdef CONFIG_TMPFS
|
|
/* !CONFIG_NUMA: there is no memory policy to show. */
static inline void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol)
{
	/* intentionally empty */
}
|
|
#endif /* CONFIG_TMPFS */
|
|
|
|
/* !CONFIG_NUMA: swap in without any vma; @info and @index are unused. */
static inline struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp,
			struct shmem_inode_info *info, pgoff_t index)
{
	struct page *result = swapin_readahead(swap, gfp, NULL, 0);

	return result;
}
|
|
|
|
/* !CONFIG_NUMA: plain page allocation; @info and @index are unused. */
static inline struct page *shmem_alloc_page(gfp_t gfp,
			struct shmem_inode_info *info, pgoff_t index)
{
	struct page *result = alloc_page(gfp);

	return result;
}
|
|
#endif /* CONFIG_NUMA */
|
|
|
|
#if !defined(CONFIG_NUMA) || !defined(CONFIG_TMPFS)
|
|
static inline struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo)
|
|
{
|
|
return NULL;
|
|
}
|
|
#endif
|
|
|
|
/*
|
|
* shmem_getpage_gfp - find page in cache, or get from swap, or allocate
|
|
*
|
|
* If we allocate a new one we do not mark it dirty. That's up to the
|
|
* vm. If we swap it in we mark it dirty since we also free the swap
|
|
* entry since a page cannot live in both the swap and page cache
|
|
*/
|
|
static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
|
|
struct page **pagep, enum sgp_type sgp, gfp_t gfp, int *fault_type)
|
|
{
|
|
struct address_space *mapping = inode->i_mapping;
|
|
struct shmem_inode_info *info;
|
|
struct shmem_sb_info *sbinfo;
|
|
struct page *page;
|
|
swp_entry_t swap;
|
|
int error;
|
|
int once = 0;
|
|
|
|
if (index > (MAX_LFS_FILESIZE >> PAGE_CACHE_SHIFT))
|
|
return -EFBIG;
|
|
repeat:
|
|
swap.val = 0;
|
|
page = find_lock_page(mapping, index);
|
|
if (radix_tree_exceptional_entry(page)) {
|
|
swap = radix_to_swp_entry(page);
|
|
page = NULL;
|
|
}
|
|
|
|
if (sgp != SGP_WRITE &&
|
|
((loff_t)index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) {
|
|
error = -EINVAL;
|
|
goto failed;
|
|
}
|
|
|
|
if (page || (sgp == SGP_READ && !swap.val)) {
|
|
/*
|
|
* Once we can get the page lock, it must be uptodate:
|
|
* if there were an error in reading back from swap,
|
|
* the page would not be inserted into the filecache.
|
|
*/
|
|
BUG_ON(page && !PageUptodate(page));
|
|
*pagep = page;
|
|
return 0;
|
|
}
|
|
|
|
/*
|
|
* Fast cache lookup did not find it:
|
|
* bring it back from swap or allocate.
|
|
*/
|
|
info = SHMEM_I(inode);
|
|
sbinfo = SHMEM_SB(inode->i_sb);
|
|
|
|
if (swap.val) {
|
|
/* Look it up and read it in.. */
|
|
page = lookup_swap_cache(swap);
|
|
if (!page) {
|
|
/* here we actually do the io */
|
|
if (fault_type)
|
|
*fault_type |= VM_FAULT_MAJOR;
|
|
page = shmem_swapin(swap, gfp, info, index);
|
|
if (!page) {
|
|
error = -ENOMEM;
|
|
goto failed;
|
|
}
|
|
}
|
|
|
|
/* We have to do this with page locked to prevent races */
|
|
lock_page(page);
|
|
if (!PageUptodate(page)) {
|
|
error = -EIO;
|
|
goto failed;
|
|
}
|
|
wait_on_page_writeback(page);
|
|
|
|
/* Someone may have already done it for us */
|
|
if (page->mapping) {
|
|
if (page->mapping == mapping &&
|
|
page->index == index)
|
|
goto done;
|
|
error = -EEXIST;
|
|
goto failed;
|
|
}
|
|
|
|
error = mem_cgroup_cache_charge(page, current->mm,
|
|
gfp & GFP_RECLAIM_MASK);
|
|
if (!error)
|
|
error = shmem_add_to_page_cache(page, mapping, index,
|
|
gfp, swp_to_radix_entry(swap));
|
|
if (error)
|
|
goto failed;
|
|
|
|
spin_lock(&info->lock);
|
|
info->swapped--;
|
|
shmem_recalc_inode(inode);
|
|
spin_unlock(&info->lock);
|
|
|
|
delete_from_swap_cache(page);
|
|
set_page_dirty(page);
|
|
swap_free(swap);
|
|
|
|
} else {
|
|
if (shmem_acct_block(info->flags)) {
|
|
error = -ENOSPC;
|
|
goto failed;
|
|
}
|
|
if (sbinfo->max_blocks) {
|
|
if (percpu_counter_compare(&sbinfo->used_blocks,
|
|
sbinfo->max_blocks) >= 0) {
|
|
error = -ENOSPC;
|
|
goto unacct;
|
|
}
|
|
percpu_counter_inc(&sbinfo->used_blocks);
|
|
}
|
|
|
|
page = shmem_alloc_page(gfp, info, index);
|
|
if (!page) {
|
|
error = -ENOMEM;
|
|
goto decused;
|
|
}
|
|
|
|
SetPageSwapBacked(page);
|
|
__set_page_locked(page);
|
|
error = mem_cgroup_cache_charge(page, current->mm,
|
|
gfp & GFP_RECLAIM_MASK);
|
|
if (!error)
|
|
error = shmem_add_to_page_cache(page, mapping, index,
|
|
gfp, NULL);
|
|
if (error)
|
|
goto decused;
|
|
lru_cache_add_anon(page);
|
|
|
|
spin_lock(&info->lock);
|
|
info->alloced++;
|
|
inode->i_blocks += BLOCKS_PER_PAGE;
|
|
shmem_recalc_inode(inode);
|
|
spin_unlock(&info->lock);
|
|
|
|
clear_highpage(page);
|
|
flush_dcache_page(page);
|
|
SetPageUptodate(page);
|
|
if (sgp == SGP_DIRTY)
|
|
set_page_dirty(page);
|
|
}
|
|
done:
|
|
/* Perhaps the file has been truncated since we checked */
|
|
if (sgp != SGP_WRITE &&
|
|
((loff_t)index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) {
|
|
error = -EINVAL;
|
|
goto trunc;
|
|
}
|
|
*pagep = page;
|
|
return 0;
|
|
|
|
/*
|
|
* Error recovery.
|
|
*/
|
|
trunc:
|
|
ClearPageDirty(page);
|
|
delete_from_page_cache(page);
|
|
spin_lock(&info->lock);
|
|
info->alloced--;
|
|
inode->i_blocks -= BLOCKS_PER_PAGE;
|
|
spin_unlock(&info->lock);
|
|
decused:
|
|
if (sbinfo->max_blocks)
|
|
percpu_counter_add(&sbinfo->used_blocks, -1);
|
|
unacct:
|
|
shmem_unacct_blocks(info->flags, 1);
|
|
failed:
|
|
if (swap.val && error != -EINVAL) {
|
|
struct page *test = find_get_page(mapping, index);
|
|
if (test && !radix_tree_exceptional_entry(test))
|
|
page_cache_release(test);
|
|
/* Have another try if the entry has changed */
|
|
if (test != swp_to_radix_entry(swap))
|
|
error = -EEXIST;
|
|
}
|
|
if (page) {
|
|
unlock_page(page);
|
|
page_cache_release(page);
|
|
}
|
|
if (error == -ENOSPC && !once++) {
|
|
info = SHMEM_I(inode);
|
|
spin_lock(&info->lock);
|
|
shmem_recalc_inode(inode);
|
|
spin_unlock(&info->lock);
|
|
goto repeat;
|
|
}
|
|
if (error == -EEXIST)
|
|
goto repeat;
|
|
return error;
|
|
}
|
|
|
|
static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
|
|
{
|
|
struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
|
|
int error;
|
|
int ret = VM_FAULT_LOCKED;
|
|
|
|
/*
|
|
* Trinity finds that probing a hole which tmpfs is punching can
|
|
* prevent the hole-punch from ever completing: which in turn
|
|
* locks writers out with its hold on i_mutex. So refrain from
|
|
* faulting pages into the hole while it's being punched. Although
|
|
* shmem_truncate_range() does remove the additions, it may be unable to
|
|
* keep up, as each new page needs its own unmap_mapping_range() call,
|
|
* and the i_mmap tree grows ever slower to scan if new vmas are added.
|
|
*
|
|
* It does not matter if we sometimes reach this check just before the
|
|
* hole-punch begins, so that one fault then races with the punch:
|
|
* we just need to make racing faults a rare case.
|
|
*
|
|
* The implementation below would be much simpler if we just used a
|
|
* standard mutex or completion: but we cannot take i_mutex in fault,
|
|
* and bloating every shmem inode for this unlikely case would be sad.
|
|
*/
|
|
if (unlikely(inode->i_private)) {
|
|
struct shmem_falloc *shmem_falloc;
|
|
|
|
spin_lock(&inode->i_lock);
|
|
shmem_falloc = inode->i_private;
|
|
if (shmem_falloc &&
|
|
vmf->pgoff >= shmem_falloc->start &&
|
|
vmf->pgoff < shmem_falloc->next) {
|
|
wait_queue_head_t *shmem_falloc_waitq;
|
|
DEFINE_WAIT(shmem_fault_wait);
|
|
|
|
ret = VM_FAULT_NOPAGE;
|
|
if ((vmf->flags & FAULT_FLAG_ALLOW_RETRY) &&
|
|
!(vmf->flags & FAULT_FLAG_RETRY_NOWAIT)) {
|
|
/* It's polite to up mmap_sem if we can */
|
|
up_read(&vma->vm_mm->mmap_sem);
|
|
ret = VM_FAULT_RETRY;
|
|
}
|
|
|
|
shmem_falloc_waitq = shmem_falloc->waitq;
|
|
prepare_to_wait(shmem_falloc_waitq, &shmem_fault_wait,
|
|
TASK_UNINTERRUPTIBLE);
|
|
spin_unlock(&inode->i_lock);
|
|
schedule();
|
|
|
|
/*
|
|
* shmem_falloc_waitq points into the vmtruncate_range()
|
|
* stack of the hole-punching task: shmem_falloc_waitq
|
|
* is usually invalid by the time we reach here, but
|
|
* finish_wait() does not dereference it in that case;
|
|
* though i_lock needed lest racing with wake_up_all().
|
|
*/
|
|
spin_lock(&inode->i_lock);
|
|
finish_wait(shmem_falloc_waitq, &shmem_fault_wait);
|
|
spin_unlock(&inode->i_lock);
|
|
return ret;
|
|
}
|
|
spin_unlock(&inode->i_lock);
|
|
}
|
|
|
|
error = shmem_getpage(inode, vmf->pgoff, &vmf->page, SGP_CACHE, &ret);
|
|
if (error)
|
|
return ((error == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS);
|
|
|
|
if (ret & VM_FAULT_MAJOR) {
|
|
count_vm_event(PGMAJFAULT);
|
|
mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
|
|
}
|
|
return ret;
|
|
}
|
|
|
|
/*
 * Punch the byte range [lstart, lend] out of @inode.  Called with i_mutex
 * held by vmtruncate_range().
 *
 * A struct shmem_falloc describing the whole pages inside the range, and
 * pointing at an on-stack wait queue, is published in inode->i_private
 * (under i_lock) for the duration of the punch, then cleared again and the
 * wait queue woken once the range has been truncated.
 */
static void vmtruncate_range_locked(struct inode *inode, loff_t lstart,
				    loff_t lend)
{
	DECLARE_WAIT_QUEUE_HEAD_ONSTACK(punch_waitq);
	struct address_space *mapping = inode->i_mapping;
	struct shmem_falloc punch;
	/* First and last byte of the whole pages inside the range. */
	loff_t first_byte = round_up(lstart, PAGE_SIZE);
	loff_t last_byte = round_down(1 + lend, PAGE_SIZE) - 1;

	punch.waitq = &punch_waitq;
	punch.start = first_byte >> PAGE_SHIFT;
	punch.next = (last_byte + 1) >> PAGE_SHIFT;

	spin_lock(&inode->i_lock);
	inode->i_private = &punch;
	spin_unlock(&inode->i_lock);

	/* Only unmap when at least one whole page is covered. */
	if ((u64)last_byte > (u64)first_byte)
		unmap_mapping_range(mapping, first_byte,
				    1 + last_byte - first_byte, 0);
	shmem_truncate_range(inode, lstart, lend);
	/* No need to unmap again: hole-punching leaves COWed pages */

	spin_lock(&inode->i_lock);
	inode->i_private = NULL;
	wake_up_all(&punch_waitq);
	spin_unlock(&inode->i_lock);
}

/*
 * Punch a hole [lstart, lend] in @inode.
 *
 * Returns -ENOSYS unless the inode's truncate_range operation is
 * shmem_truncate_range (only CONFIG_SHMEM shmem.c ever supported
 * i_op->truncate_range()), otherwise performs the punch under i_mutex and
 * returns 0.
 */
int vmtruncate_range(struct inode *inode, loff_t lstart, loff_t lend)
{
	if (inode->i_op->truncate_range != shmem_truncate_range)
		return -ENOSYS;

	mutex_lock(&inode->i_mutex);
	vmtruncate_range_locked(inode, lstart, lend);
	mutex_unlock(&inode->i_mutex);
	return 0;
}
|
|
|
|
#ifdef CONFIG_NUMA
|
|
static int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *mpol)
|
|
{
|
|
struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
|
|
return mpol_set_shared_policy(&SHMEM_I(inode)->policy, vma, mpol);
|
|
}
|
|
|
|
static struct mempolicy *shmem_get_policy(struct vm_area_struct *vma,
|
|
unsigned long addr)
|
|
{
|
|
struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
|
|
pgoff_t index;
|
|
|
|
index = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
|
|
return mpol_shared_policy_lookup(&SHMEM_I(inode)->policy, index);
|
|
}
|
|
#endif
|
|
|
|
int shmem_lock(struct file *file, int lock, struct user_struct *user)
|
|
{
|
|
struct inode *inode = file->f_path.dentry->d_inode;
|
|
struct shmem_inode_info *info = SHMEM_I(inode);
|
|
int retval = -ENOMEM;
|
|
|
|
spin_lock(&info->lock);
|
|
if (lock && !(info->flags & VM_LOCKED)) {
|
|
if (!user_shm_lock(inode->i_size, user))
|
|
goto out_nomem;
|
|
info->flags |= VM_LOCKED;
|
|
mapping_set_unevictable(file->f_mapping);
|
|
}
|
|
if (!lock && (info->flags & VM_LOCKED) && user) {
|
|
user_shm_unlock(inode->i_size, user);
|
|
info->flags &= ~VM_LOCKED;
|
|
mapping_clear_unevictable(file->f_mapping);
|
|
}
|
|
retval = 0;
|
|
|
|
out_nomem:
|
|
spin_unlock(&info->lock);
|
|
return retval;
|
|
}
|
|
|
|
static int shmem_mmap(struct file *file, struct vm_area_struct *vma)
|
|
{
|
|
file_accessed(file);
|
|
vma->vm_ops = &shmem_vm_ops;
|
|
vma->vm_flags |= VM_CAN_NONLINEAR;
|
|
return 0;
|
|
}
|
|
|
|
static struct inode *shmem_get_inode(struct super_block *sb, const struct inode *dir,
|
|
umode_t mode, dev_t dev, unsigned long flags)
|
|
{
|
|
struct inode *inode;
|
|
struct shmem_inode_info *info;
|
|
struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
|
|
|
|
if (shmem_reserve_inode(sb))
|
|
return NULL;
|
|
|
|
inode = new_inode(sb);
|
|
if (inode) {
|
|
inode->i_ino = get_next_ino();
|
|
inode_init_owner(inode, dir, mode);
|
|
inode->i_blocks = 0;
|
|
inode->i_mapping->backing_dev_info = &shmem_backing_dev_info;
|
|
inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
|
|
inode->i_generation = get_seconds();
|
|
info = SHMEM_I(inode);
|
|
memset(info, 0, (char *)inode - (char *)info);
|
|
spin_lock_init(&info->lock);
|
|
info->flags = flags & VM_NORESERVE;
|
|
INIT_LIST_HEAD(&info->swaplist);
|
|
INIT_LIST_HEAD(&info->xattr_list);
|
|
cache_no_acl(inode);
|
|
|
|
switch (mode & S_IFMT) {
|
|
default:
|
|
inode->i_op = &shmem_special_inode_operations;
|
|
init_special_inode(inode, mode, dev);
|
|
break;
|
|
case S_IFREG:
|
|
inode->i_mapping->a_ops = &shmem_aops;
|
|
inode->i_op = &shmem_inode_operations;
|
|
inode->i_fop = &shmem_file_operations;
|
|
mpol_shared_policy_init(&info->policy,
|
|
shmem_get_sbmpol(sbinfo));
|
|
break;
|
|
case S_IFDIR:
|
|
inc_nlink(inode);
|
|
/* Some things misbehave if size == 0 on a directory */
|
|
inode->i_size = 2 * BOGO_DIRENT_SIZE;
|
|
inode->i_op = &shmem_dir_inode_operations;
|
|
inode->i_fop = &simple_dir_operations;
|
|
break;
|
|
case S_IFLNK:
|
|
/*
|
|
* Must not load anything in the rbtree,
|
|
* mpol_free_shared_policy will not be called.
|
|
*/
|
|
mpol_shared_policy_init(&info->policy, NULL);
|
|
break;
|
|
}
|
|
} else
|
|
shmem_free_inode(sb);
|
|
return inode;
|
|
}
|
|
|
|
#ifdef CONFIG_TMPFS
|
|
static const struct inode_operations shmem_symlink_inode_operations;
|
|
static const struct inode_operations shmem_short_symlink_operations;
|
|
|
|
#ifdef CONFIG_TMPFS_XATTR
|
|
static int shmem_initxattrs(struct inode *, const struct xattr *, void *);
|
|
#else
|
|
#define shmem_initxattrs NULL
|
|
#endif
|
|
|
|
static int
|
|
shmem_write_begin(struct file *file, struct address_space *mapping,
|
|
loff_t pos, unsigned len, unsigned flags,
|
|
struct page **pagep, void **fsdata)
|
|
{
|
|
struct inode *inode = mapping->host;
|
|
pgoff_t index = pos >> PAGE_CACHE_SHIFT;
|
|
return shmem_getpage(inode, index, pagep, SGP_WRITE, NULL);
|
|
}
|
|
|
|
static int
|
|
shmem_write_end(struct file *file, struct address_space *mapping,
|
|
loff_t pos, unsigned len, unsigned copied,
|
|
struct page *page, void *fsdata)
|
|
{
|
|
struct inode *inode = mapping->host;
|
|
|
|
if (pos + copied > inode->i_size)
|
|
i_size_write(inode, pos + copied);
|
|
|
|
set_page_dirty(page);
|
|
unlock_page(page);
|
|
page_cache_release(page);
|
|
|
|
return copied;
|
|
}
|
|
|
|
/*
 * Core read loop for shmem files.
 *
 * Starting at *@ppos, pages are fetched one at a time with shmem_getpage()
 * and passed to @actor together with the offset and byte count valid in
 * that page.  The loop ends at end of file, on a shmem_getpage() error
 * (-EINVAL is reported as success with nothing more read), when @actor
 * consumes fewer bytes than offered, or when desc->count reaches zero.
 * *@ppos is advanced past what was consumed and the access time updated.
 */
static void do_shmem_file_read(struct file *filp, loff_t *ppos, read_descriptor_t *desc, read_actor_t actor)
{
	struct inode *inode = filp->f_path.dentry->d_inode;
	struct address_space *mapping = inode->i_mapping;
	enum sgp_type sgp = SGP_READ;
	unsigned long offset;
	pgoff_t index;

	/*
	 * A read issued with KERNEL_DS might be for a stacking filesystem.
	 * When reading holes of a sparse file we then actually need to
	 * allocate those pages, and even mark them dirty, so it cannot
	 * exceed the max_blocks limit: use SGP_DIRTY.
	 */
	if (segment_eq(get_fs(), KERNEL_DS))
		sgp = SGP_DIRTY;

	index = *ppos >> PAGE_CACHE_SHIFT;
	offset = *ppos & ~PAGE_CACHE_MASK;

	while (1) {
		struct page *page = NULL;
		unsigned long nr, ret;
		loff_t i_size = i_size_read(inode);
		pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;

		/* Nothing left to read at or beyond end of file. */
		if (index > end_index)
			break;
		if (index == end_index) {
			nr = i_size & ~PAGE_CACHE_MASK;
			if (nr <= offset)
				break;
		}

		desc->error = shmem_getpage(inode, index, &page, sgp, NULL);
		if (desc->error) {
			if (desc->error == -EINVAL)
				desc->error = 0;
			break;
		}
		if (page)
			unlock_page(page);

		/*
		 * The size must be sampled again after getting the page:
		 * reads (unlike writes) are called without i_mutex
		 * protection against truncate.
		 */
		i_size = i_size_read(inode);
		end_index = i_size >> PAGE_CACHE_SHIFT;
		nr = PAGE_CACHE_SIZE;
		if (index == end_index) {
			nr = i_size & ~PAGE_CACHE_MASK;
			if (nr <= offset) {
				if (page)
					page_cache_release(page);
				break;
			}
		}
		nr -= offset;

		if (!page) {
			/* No page was handed back: read from the zero page. */
			page = ZERO_PAGE(0);
			page_cache_get(page);
		} else {
			/*
			 * If users can be writing to this page using
			 * arbitrary virtual addresses, take care about
			 * potential aliasing before reading the page on
			 * the kernel side.
			 */
			if (mapping_writably_mapped(mapping))
				flush_dcache_page(page);
			/* Mark the page accessed if we read the beginning. */
			if (!offset)
				mark_page_accessed(page);
		}

		/*
		 * The actor returns how many bytes it actually used, which
		 * may differ from how much of a user buffer was filled (it
		 * may be padding etc), so only the position is updated
		 * here; the actor updates the buffer pointers and the
		 * remaining count itself.
		 */
		ret = actor(desc, page, offset, nr);
		offset += ret;
		index += offset >> PAGE_CACHE_SHIFT;
		offset &= ~PAGE_CACHE_MASK;

		page_cache_release(page);
		if (ret != nr || !desc->count)
			break;

		cond_resched();
	}

	*ppos = ((loff_t) index << PAGE_CACHE_SHIFT) + offset;
	file_accessed(filp);
}
|
|
|
|
/*
 * aio_read for shmem files: validate the iovec, then read each non-empty
 * segment in turn through do_shmem_file_read() at iocb->ki_pos.
 *
 * Returns the total number of bytes read; an error from
 * generic_segment_checks(), or a read error hit before any byte was
 * transferred, is returned instead.  Reading stops at the first error or
 * the first segment that could not be filled completely.
 */
static ssize_t shmem_file_aio_read(struct kiocb *iocb,
		const struct iovec *iov, unsigned long nr_segs, loff_t pos)
{
	struct file *filp = iocb->ki_filp;
	loff_t *ppos = &iocb->ki_pos;
	unsigned long seg;
	ssize_t retval;
	size_t count;

	retval = generic_segment_checks(iov, &nr_segs, &count, VERIFY_WRITE);
	if (retval)
		return retval;

	for (seg = 0; seg < nr_segs; seg++) {
		const struct iovec *vec = &iov[seg];
		read_descriptor_t desc;

		desc.written = 0;
		desc.arg.buf = vec->iov_base;
		desc.count = vec->iov_len;
		if (!desc.count)
			continue;
		desc.error = 0;

		do_shmem_file_read(filp, ppos, &desc, file_read_actor);
		retval += desc.written;

		if (desc.error) {
			/* Report the error only if nothing was read at all. */
			if (!retval)
				retval = desc.error;
			break;
		}
		/* Short segment: nothing more to be had. */
		if (desc.count > 0)
			break;
	}

	return retval;
}
|
|
|
|
/*
 * splice_read for shmem files: move up to @len bytes starting at *@ppos
 * from the file's pages into @pipe.
 *
 * Pages are gathered first (whatever is contiguously present in the page
 * cache, then the rest through shmem_getpage()), each is then re-validated
 * and trimmed against the current i_size, and the surviving pages are
 * handed to splice_to_pipe().
 *
 * Returns 0 at or past end of file, -ENOMEM if the splice descriptor cannot
 * be grown, otherwise the result of splice_to_pipe() or, when no page was
 * queued, the last shmem_getpage() status.  *@ppos advances by a positive
 * result.
 */
static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos,
				struct pipe_inode_info *pipe, size_t len,
				unsigned int flags)
{
	struct address_space *mapping = in->f_mapping;
	struct inode *inode = mapping->host;
	struct page *pages[PIPE_DEF_BUFFERS];
	struct partial_page partial[PIPE_DEF_BUFFERS];
	struct splice_pipe_desc spd = {
		.pages = pages,
		.partial = partial,
		.nr_pages_max = PIPE_DEF_BUFFERS,
		.flags = flags,
		.ops = &page_cache_pipe_buf_ops,
		.spd_release = spd_release_page,
	};
	unsigned int loff, nr_pages, req_pages;
	pgoff_t index, end_index;
	loff_t isize, left;
	struct page *page;
	int error, page_nr;

	isize = i_size_read(inode);
	if (unlikely(*ppos >= isize))
		return 0;

	/* Never offer more than what lies between *ppos and end of file. */
	left = isize - *ppos;
	if (unlikely(left < len))
		len = left;

	if (splice_grow_spd(pipe, &spd))
		return -ENOMEM;

	index = *ppos >> PAGE_CACHE_SHIFT;
	loff = *ppos & ~PAGE_CACHE_MASK;
	req_pages = (len + loff + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
	nr_pages = min(req_pages, pipe->buffers);

	/* Take what is already contiguously present in the page cache... */
	spd.nr_pages = find_get_pages_contig(mapping, index,
					     nr_pages, spd.pages);
	index += spd.nr_pages;
	error = 0;

	/* ...and fetch the remainder one page at a time. */
	while (spd.nr_pages < nr_pages) {
		error = shmem_getpage(inode, index, &page, SGP_CACHE, NULL);
		if (error)
			break;
		unlock_page(page);
		spd.pages[spd.nr_pages++] = page;
		index++;
	}

	/* Second pass: restart at the first page and count the usable ones. */
	index = *ppos >> PAGE_CACHE_SHIFT;
	nr_pages = spd.nr_pages;
	spd.nr_pages = 0;

	for (page_nr = 0; page_nr < nr_pages; page_nr++) {
		unsigned int this_len;

		if (!len)
			break;

		this_len = min_t(unsigned long, len, PAGE_CACHE_SIZE - loff);
		page = spd.pages[page_nr];

		/* Replace a page that is stale or no longer belongs to us. */
		if (!PageUptodate(page) || page->mapping != mapping) {
			error = shmem_getpage(inode, index, &page,
					      SGP_CACHE, NULL);
			if (error)
				break;
			unlock_page(page);
			page_cache_release(spd.pages[page_nr]);
			spd.pages[page_nr] = page;
		}

		/* The file may have shrunk since the size was first read. */
		isize = i_size_read(inode);
		end_index = (isize - 1) >> PAGE_CACHE_SHIFT;
		if (unlikely(!isize || index > end_index))
			break;

		if (end_index == index) {
			unsigned int plen;

			/* Bytes valid in the last page of the file. */
			plen = ((isize - 1) & ~PAGE_CACHE_MASK) + 1;
			if (plen <= loff)
				break;

			this_len = min(this_len, plen - loff);
			/* Makes len reach zero below, ending the loop. */
			len = this_len;
		}

		spd.partial[page_nr].offset = loff;
		spd.partial[page_nr].len = this_len;
		len -= this_len;
		loff = 0;
		spd.nr_pages++;
		index++;
	}

	/* Drop the references on every page that was not queued. */
	while (page_nr < nr_pages)
		page_cache_release(spd.pages[page_nr++]);

	if (spd.nr_pages)
		error = splice_to_pipe(pipe, &spd);

	splice_shrink_spd(&spd);

	if (error > 0) {
		*ppos += error;
		file_accessed(in);
	}
	return error;
}
|
|
|
|
static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf)
|
|
{
|
|
struct shmem_sb_info *sbinfo = SHMEM_SB(dentry->d_sb);
|
|
|
|
buf->f_type = TMPFS_MAGIC;
|
|
buf->f_bsize = PAGE_CACHE_SIZE;
|
|
buf->f_namelen = NAME_MAX;
|
|
if (sbinfo->max_blocks) {
|
|
buf->f_blocks = sbinfo->max_blocks;
|
|
buf->f_bavail =
|
|
buf->f_bfree = sbinfo->max_blocks -
|
|
percpu_counter_sum(&sbinfo->used_blocks);
|
|
}
|
|
if (sbinfo->max_inodes) {
|
|
buf->f_files = sbinfo->max_inodes;
|
|
buf->f_ffree = sbinfo->free_inodes;
|
|
}
|
|
/* else leave those fields 0 like simple_statfs */
|
|
return 0;
|
|
}
|
|
|
|
/*
|
|
* File creation. Allocate an inode, and we're done..
|
|
*/
|
|
static int
|
|
shmem_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev)
|
|
{
|
|
struct inode *inode;
|
|
int error = -ENOSPC;
|
|
|
|
inode = shmem_get_inode(dir->i_sb, dir, mode, dev, VM_NORESERVE);
|
|
if (inode) {
|
|
error = security_inode_init_security(inode, dir,
|
|
&dentry->d_name,
|
|
shmem_initxattrs, NULL);
|
|
if (error) {
|
|
if (error != -EOPNOTSUPP) {
|
|
iput(inode);
|
|
return error;
|
|
}
|
|
}
|
|
#ifdef CONFIG_TMPFS_POSIX_ACL
|
|
error = generic_acl_init(inode, dir);
|
|
if (error) {
|
|
iput(inode);
|
|
return error;
|
|
}
|
|
#else
|
|
error = 0;
|
|
#endif
|
|
dir->i_size += BOGO_DIRENT_SIZE;
|
|
dir->i_ctime = dir->i_mtime = CURRENT_TIME;
|
|
d_instantiate(dentry, inode);
|
|
dget(dentry); /* Extra count - pin the dentry in core */
|
|
}
|
|
return error;
|
|
}
|
|
|
|
static int
|
|
shmem_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
|
|
{
|
|
struct inode *inode;
|
|
int error = -ENOSPC;
|
|
|
|
inode = shmem_get_inode(dir->i_sb, dir, mode, 0, VM_NORESERVE);
|
|
if (inode) {
|
|
error = security_inode_init_security(inode, dir,
|
|
NULL,
|
|
shmem_initxattrs, NULL);
|
|
if (error) {
|
|
if (error != -EOPNOTSUPP) {
|
|
iput(inode);
|
|
return error;
|
|
}
|
|
}
|
|
#ifdef CONFIG_TMPFS_POSIX_ACL
|
|
error = generic_acl_init(inode, dir);
|
|
if (error) {
|
|
iput(inode);
|
|
return error;
|
|
}
|
|
#else
|
|
error = 0;
|
|
#endif
|
|
d_tmpfile(dentry, inode);
|
|
}
|
|
return error;
|
|
}
|
|
|
|
/*
 * Create a directory: a shmem_mknod() with S_IFDIR added to @mode, plus an
 * extra link on the parent when that succeeds.  Returns 0 or the error
 * from shmem_mknod().
 */
static int shmem_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
{
	int error = shmem_mknod(dir, dentry, mode | S_IFDIR, 0);

	if (error)
		return error;

	inc_nlink(dir);
	return 0;
}
|
|
|
|
static int shmem_create(struct inode *dir, struct dentry *dentry, umode_t mode,
|
|
bool excl)
|
|
{
|
|
return shmem_mknod(dir, dentry, mode | S_IFREG, 0);
|
|
}
|
|
|
|
/*
|
|
* Link a file..
|
|
*/
|
|
static int shmem_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry)
|
|
{
|
|
struct inode *inode = old_dentry->d_inode;
|
|
int ret;
|
|
|
|
/*
|
|
* No ordinary (disk based) filesystem counts links as inodes;
|
|
* but each new link needs a new dentry, pinning lowmem, and
|
|
* tmpfs dentries cannot be pruned until they are unlinked.
|
|
*/
|
|
ret = shmem_reserve_inode(inode->i_sb);
|
|
if (ret)
|
|
goto out;
|
|
|
|
dir->i_size += BOGO_DIRENT_SIZE;
|
|
inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
|
|
inc_nlink(inode);
|
|
ihold(inode); /* New dentry reference */
|
|
dget(dentry); /* Extra pinning count for the created dentry */
|
|
d_instantiate(dentry, inode);
|
|
out:
|
|
return ret;
|
|
}
|
|
|
|
static int shmem_unlink(struct inode *dir, struct dentry *dentry)
|
|
{
|
|
struct inode *inode = dentry->d_inode;
|
|
|
|
if (inode->i_nlink > 1 && !S_ISDIR(inode->i_mode))
|
|
shmem_free_inode(inode->i_sb);
|
|
|
|
dir->i_size -= BOGO_DIRENT_SIZE;
|
|
inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
|
|
drop_nlink(inode);
|
|
dput(dentry); /* Undo the count from "create" - this does all the work */
|
|
return 0;
|
|
}
|
|
|
|
static int shmem_rmdir(struct inode *dir, struct dentry *dentry)
|
|
{
|
|
if (!simple_empty(dentry))
|
|
return -ENOTEMPTY;
|
|
|
|
drop_nlink(dentry->d_inode);
|
|
drop_nlink(dir);
|
|
return shmem_unlink(dir, dentry);
|
|
}
|
|
|
|
/*
|
|
* The VFS layer already does all the dentry stuff for rename,
|
|
* we just have to decrement the usage count for the target if
|
|
* it exists so that the VFS layer correctly free's it when it
|
|
* gets overwritten.
|
|
*/
|
|
static int shmem_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry)
|
|
{
|
|
struct inode *inode = old_dentry->d_inode;
|
|
int they_are_dirs = S_ISDIR(inode->i_mode);
|
|
|
|
if (!simple_empty(new_dentry))
|
|
return -ENOTEMPTY;
|
|
|
|
if (new_dentry->d_inode) {
|
|
(void) shmem_unlink(new_dir, new_dentry);
|
|
if (they_are_dirs) {
|
|
drop_nlink(new_dentry->d_inode);
|
|
drop_nlink(old_dir);
|
|
}
|
|
} else if (they_are_dirs) {
|
|
drop_nlink(old_dir);
|
|
inc_nlink(new_dir);
|
|
}
|
|
|
|
old_dir->i_size -= BOGO_DIRENT_SIZE;
|
|
new_dir->i_size += BOGO_DIRENT_SIZE;
|
|
old_dir->i_ctime = old_dir->i_mtime =
|
|
new_dir->i_ctime = new_dir->i_mtime =
|
|
inode->i_ctime = CURRENT_TIME;
|
|
return 0;
|
|
}
|
|
|
|
static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *symname)
|
|
{
|
|
int error;
|
|
int len;
|
|
struct inode *inode;
|
|
struct page *page;
|
|
char *kaddr;
|
|
struct shmem_inode_info *info;
|
|
|
|
len = strlen(symname) + 1;
|
|
if (len > PAGE_CACHE_SIZE)
|
|
return -ENAMETOOLONG;
|
|
|
|
inode = shmem_get_inode(dir->i_sb, dir, S_IFLNK|S_IRWXUGO, 0, VM_NORESERVE);
|
|
if (!inode)
|
|
return -ENOSPC;
|
|
|
|
error = security_inode_init_security(inode, dir, &dentry->d_name,
|
|
shmem_initxattrs, NULL);
|
|
if (error) {
|
|
if (error != -EOPNOTSUPP) {
|
|
iput(inode);
|
|
return error;
|
|
}
|
|
error = 0;
|
|
}
|
|
|
|
info = SHMEM_I(inode);
|
|
inode->i_size = len-1;
|
|
if (len <= SHORT_SYMLINK_LEN) {
|
|
info->symlink = kmemdup(symname, len, GFP_KERNEL);
|
|
if (!info->symlink) {
|
|
iput(inode);
|
|
return -ENOMEM;
|
|
}
|
|
inode->i_op = &shmem_short_symlink_operations;
|
|
} else {
|
|
error = shmem_getpage(inode, 0, &page, SGP_WRITE, NULL);
|
|
if (error) {
|
|
iput(inode);
|
|
return error;
|
|
}
|
|
inode->i_mapping->a_ops = &shmem_aops;
|
|
inode->i_op = &shmem_symlink_inode_operations;
|
|
kaddr = kmap_atomic(page);
|
|
memcpy(kaddr, symname, len);
|
|
kunmap_atomic(kaddr);
|
|
set_page_dirty(page);
|
|
unlock_page(page);
|
|
page_cache_release(page);
|
|
}
|
|
dir->i_size += BOGO_DIRENT_SIZE;
|
|
dir->i_ctime = dir->i_mtime = CURRENT_TIME;
|
|
d_instantiate(dentry, inode);
|
|
dget(dentry);
|
|
return 0;
|
|
}
|
|
|
|
static void *shmem_follow_short_symlink(struct dentry *dentry, struct nameidata *nd)
|
|
{
|
|
nd_set_link(nd, SHMEM_I(dentry->d_inode)->symlink);
|
|
return NULL;
|
|
}
|
|
|
|
static void *shmem_follow_link(struct dentry *dentry, struct nameidata *nd)
|
|
{
|
|
struct page *page = NULL;
|
|
int error = shmem_getpage(dentry->d_inode, 0, &page, SGP_READ, NULL);
|
|
nd_set_link(nd, error ? ERR_PTR(error) : kmap(page));
|
|
if (page)
|
|
unlock_page(page);
|
|
return page;
|
|
}
|
|
|
|
/*
 * put_link counterpart of shmem_follow_link(): unless the published link
 * is an error pointer, unmap the page passed as @cookie, mark it accessed
 * and drop a reference on it.
 */
static void shmem_put_link(struct dentry *dentry, struct nameidata *nd, void *cookie)
{
	struct page *page = cookie;

	if (IS_ERR(nd_get_link(nd)))
		return;

	kunmap(page);
	mark_page_accessed(page);
	page_cache_release(page);
}
|
|
|
|
#ifdef CONFIG_TMPFS_XATTR
|
|
/*
|
|
* Superblocks without xattr inode operations may get some security.* xattr
|
|
* support from the LSM "for free". As soon as we have any other xattrs
|
|
* like ACLs, we also need to implement the security.* handlers at
|
|
* filesystem level, though.
|
|
*/
|
|
|
|
/*
|
|
* Allocate new xattr and copy in the value; but leave the name to callers.
|
|
*/
|
|
static struct shmem_xattr *shmem_xattr_alloc(const void *value, size_t size)
|
|
{
|
|
struct shmem_xattr *new_xattr;
|
|
size_t len;
|
|
|
|
/* wrap around? */
|
|
len = sizeof(*new_xattr) + size;
|
|
if (len <= sizeof(*new_xattr))
|
|
return NULL;
|
|
|
|
new_xattr = kmalloc(len, GFP_KERNEL);
|
|
if (!new_xattr)
|
|
return NULL;
|
|
|
|
new_xattr->size = size;
|
|
memcpy(new_xattr->value, value, size);
|
|
return new_xattr;
|
|
}
|
|
|
|
/*
|
|
* Callback for security_inode_init_security() for acquiring xattrs.
|
|
*/
|
|
static int shmem_initxattrs(struct inode *inode,
|
|
const struct xattr *xattr_array,
|
|
void *fs_info)
|
|
{
|
|
struct shmem_inode_info *info = SHMEM_I(inode);
|
|
const struct xattr *xattr;
|
|
struct shmem_xattr *new_xattr;
|
|
size_t len;
|
|
|
|
for (xattr = xattr_array; xattr->name != NULL; xattr++) {
|
|
new_xattr = shmem_xattr_alloc(xattr->value, xattr->value_len);
|
|
if (!new_xattr)
|
|
return -ENOMEM;
|
|
|
|
len = strlen(xattr->name) + 1;
|
|
new_xattr->name = kmalloc(XATTR_SECURITY_PREFIX_LEN + len,
|
|
GFP_KERNEL);
|
|
if (!new_xattr->name) {
|
|
kfree(new_xattr);
|
|
return -ENOMEM;
|
|
}
|
|
|
|
memcpy(new_xattr->name, XATTR_SECURITY_PREFIX,
|
|
XATTR_SECURITY_PREFIX_LEN);
|
|
memcpy(new_xattr->name + XATTR_SECURITY_PREFIX_LEN,
|
|
xattr->name, len);
|
|
|
|
spin_lock(&info->lock);
|
|
list_add(&new_xattr->list, &info->xattr_list);
|
|
spin_unlock(&info->lock);
|
|
}
|
|
|
|
return 0;
|
|
}
|
|
|
|
/*
 * Look up the xattr called @name on @dentry's inode.
 *
 * Returns the value's size; when @buffer is non-NULL the value is also
 * copied into it, or -ERANGE is returned if @size is too small.  Returns
 * -ENODATA when no attribute of that name exists.  The list is walked
 * under info->lock.
 */
static int shmem_xattr_get(struct dentry *dentry, const char *name,
			   void *buffer, size_t size)
{
	struct shmem_inode_info *info = SHMEM_I(dentry->d_inode);
	struct shmem_xattr *cur;
	int ret = -ENODATA;

	spin_lock(&info->lock);
	list_for_each_entry(cur, &info->xattr_list, list) {
		if (strcmp(name, cur->name) != 0)
			continue;

		if (!buffer) {
			/* Size query only. */
			ret = cur->size;
		} else if (size < cur->size) {
			ret = -ERANGE;
		} else {
			memcpy(buffer, cur->value, cur->size);
			ret = cur->size;
		}
		break;
	}
	spin_unlock(&info->lock);

	return ret;
}
|
|
|
|
/*
 * Set, replace or remove the xattr called @name on @inode.
 *
 * @value: new value, or NULL to remove the attribute
 * @size:  length of @value in bytes
 * @flags: XATTR_CREATE fails with -EEXIST if the attribute already exists;
 *         XATTR_REPLACE fails with -ENODATA if it does not
 *
 * Returns 0 on success, -ENOMEM if the new attribute cannot be allocated,
 * or the -EEXIST / -ENODATA cases above.  Removing an attribute that does
 * not exist, without XATTR_REPLACE, is a successful no-op.
 */
static int shmem_xattr_set(struct inode *inode, const char *name,
			   const void *value, size_t size, int flags)
{
	struct shmem_inode_info *info = SHMEM_I(inode);
	struct shmem_xattr *xattr;
	struct shmem_xattr *new_xattr = NULL;
	int err = 0;

	/* value == NULL means remove */
	if (value) {
		new_xattr = shmem_xattr_alloc(value, size);
		if (!new_xattr)
			return -ENOMEM;

		new_xattr->name = kstrdup(name, GFP_KERNEL);
		if (!new_xattr->name) {
			kfree(new_xattr);
			return -ENOMEM;
		}
	}

	/*
	 * From here on, "xattr" ends up pointing at whichever entry has to
	 * be freed once the lock is dropped: the replaced or removed old
	 * entry, the unused new one on failure, or NULL.
	 */
	spin_lock(&info->lock);
	list_for_each_entry(xattr, &info->xattr_list, list) {
		if (!strcmp(name, xattr->name)) {
			if (flags & XATTR_CREATE) {
				xattr = new_xattr;
				err = -EEXIST;
			} else if (new_xattr) {
				list_replace(&xattr->list, &new_xattr->list);
			} else {
				list_del(&xattr->list);
			}
			goto out;
		}
	}
	if (flags & XATTR_REPLACE) {
		xattr = new_xattr;
		err = -ENODATA;
	} else {
		/*
		 * new_xattr is NULL when asked to remove an attribute that
		 * is not there: nothing to add in that case, and it must
		 * not be dereferenced.
		 */
		if (new_xattr)
			list_add(&new_xattr->list, &info->xattr_list);
		xattr = NULL;
	}
out:
	spin_unlock(&info->lock);
	if (xattr)
		kfree(xattr->name);
	kfree(xattr);
	return err;
}
|
|
|
|
static const struct xattr_handler *shmem_xattr_handlers[] = {
|
|
#ifdef CONFIG_TMPFS_POSIX_ACL
|
|
&generic_acl_access_handler,
|
|
&generic_acl_default_handler,
|
|
#endif
|
|
NULL
|
|
};
|
|
|
|
static int shmem_xattr_validate(const char *name)
|
|
{
|
|
struct { const char *prefix; size_t len; } arr[] = {
|
|
{ XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN },
|
|
{ XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN }
|
|
};
|
|
int i;
|
|
|
|
for (i = 0; i < ARRAY_SIZE(arr); i++) {
|
|
size_t preflen = arr[i].len;
|
|
if (strncmp(name, arr[i].prefix, preflen) == 0) {
|
|
if (!name[preflen])
|
|
return -EINVAL;
|
|
return 0;
|
|
}
|
|
}
|
|
return -EOPNOTSUPP;
|
|
}
|
|
|
|
/*
 * getxattr inode operation.
 *
 * Names in the system.* namespace are synthetic attributes and are passed
 * to generic_getxattr(), which resolves a handler via sb->s_xattr.
 * Everything else must pass shmem_xattr_validate() and is then read from
 * the inode's own xattr list.
 */
static ssize_t shmem_getxattr(struct dentry *dentry, const char *name,
			      void *buffer, size_t size)
{
	int err;

	if (strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN) == 0)
		return generic_getxattr(dentry, name, buffer, size);

	err = shmem_xattr_validate(name);
	if (err != 0)
		return err;

	return shmem_xattr_get(dentry, name, buffer, size);
}
|
|
|
|
/*
 * setxattr inode operation.
 *
 * Names in the system.* namespace are synthetic attributes and are passed
 * to generic_setxattr(), which resolves a handler via sb->s_xattr.
 * Everything else must pass shmem_xattr_validate() and is then stored on
 * the inode's own xattr list.  A zero-length value is an empty attribute,
 * not a removal, so a non-NULL value is substituted for it.
 */
static int shmem_setxattr(struct dentry *dentry, const char *name,
			  const void *value, size_t size, int flags)
{
	int err;

	if (strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN) == 0)
		return generic_setxattr(dentry, name, value, size, flags);

	err = shmem_xattr_validate(name);
	if (err != 0)
		return err;

	if (!size)
		value = "";  /* empty EA, do not remove */

	return shmem_xattr_set(dentry->d_inode, name, value, size, flags);
}
|
|
|
|
static int shmem_removexattr(struct dentry *dentry, const char *name)
|
|
{
|
|
int err;
|
|
|
|
/*
|
|
* If this is a request for a synthetic attribute in the system.*
|
|
* namespace use the generic infrastructure to resolve a handler
|
|
* for it via sb->s_xattr.
|
|
*/
|
|
if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
|
|
return generic_removexattr(dentry, name);
|
|
|
|
err = shmem_xattr_validate(name);
|
|
if (err)
|
|
return err;
|
|
|
|
return shmem_xattr_set(dentry->d_inode, name, NULL, 0, XATTR_REPLACE);
|
|
}
|
|
|
|
static bool xattr_is_trusted(const char *name)
|
|
{
|
|
return !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN);
|
|
}
|
|
|
|
/*
 * listxattr inode operation: emit the NUL-terminated names of the inode's
 * xattrs back to back.
 *
 * With a NULL @buffer only the space required is computed.  Names in the
 * trusted.* namespace are omitted for callers without CAP_SYS_ADMIN.
 * Returns the number of bytes used (or needed), or -ERANGE when @buffer is
 * too small.
 */
static ssize_t shmem_listxattr(struct dentry *dentry, char *buffer, size_t size)
{
	bool trusted = capable(CAP_SYS_ADMIN);
	struct shmem_inode_info *info = SHMEM_I(dentry->d_inode);
	struct shmem_xattr *cur;
	size_t used = 0;

	spin_lock(&info->lock);
	list_for_each_entry(cur, &info->xattr_list, list) {
		size_t len;

		/* skip "trusted." attributes for unprivileged callers */
		if (!trusted && xattr_is_trusted(cur->name))
			continue;

		len = strlen(cur->name) + 1;
		used += len;
		if (!buffer)
			continue;

		if (size < used) {
			used = -ERANGE;
			break;
		}
		memcpy(buffer, cur->name, len);
		buffer += len;
	}
	spin_unlock(&info->lock);

	return used;
}
|
|
#endif /* CONFIG_TMPFS_XATTR */
|
|
|
|
static const struct inode_operations shmem_short_symlink_operations = {
|
|
.readlink = generic_readlink,
|
|
.follow_link = shmem_follow_short_symlink,
|
|
#ifdef CONFIG_TMPFS_XATTR
|
|
.setxattr = shmem_setxattr,
|
|
.getxattr = shmem_getxattr,
|
|
.listxattr = shmem_listxattr,
|
|
.removexattr = shmem_removexattr,
|
|
#endif
|
|
};
|
|
|
|
static const struct inode_operations shmem_symlink_inode_operations = {
|
|
.readlink = generic_readlink,
|
|
.follow_link = shmem_follow_link,
|
|
.put_link = shmem_put_link,
|
|
#ifdef CONFIG_TMPFS_XATTR
|
|
.setxattr = shmem_setxattr,
|
|
.getxattr = shmem_getxattr,
|
|
.listxattr = shmem_listxattr,
|
|
.removexattr = shmem_removexattr,
|
|
#endif
|
|
};
|
|
|
|
static struct dentry *shmem_get_parent(struct dentry *child)
|
|
{
|
|
return ERR_PTR(-ESTALE);
|
|
}
|
|
|
|
static int shmem_match(struct inode *ino, void *vfh)
|
|
{
|
|
__u32 *fh = vfh;
|
|
__u64 inum = fh[2];
|
|
inum = (inum << 32) | fh[1];
|
|
return ino->i_ino == inum && fh[0] == ino->i_generation;
|
|
}
|
|
|
|
static struct dentry *shmem_fh_to_dentry(struct super_block *sb,
|
|
struct fid *fid, int fh_len, int fh_type)
|
|
{
|
|
struct inode *inode;
|
|
struct dentry *dentry = NULL;
|
|
u64 inum;
|
|
|
|
if (fh_len < 3)
|
|
return NULL;
|
|
|
|
inum = fid->raw[2];
|
|
inum = (inum << 32) | fid->raw[1];
|
|
|
|
inode = ilookup5(sb, (unsigned long)(inum + fid->raw[0]),
|
|
shmem_match, fid->raw);
|
|
if (inode) {
|
|
dentry = d_find_alias(inode);
|
|
iput(inode);
|
|
}
|
|
|
|
return dentry;
|
|
}
|
|
|
|
/*
 * Export operation encode_fh: write a three-word file handle for @dentry
 * into @fh (generation, low half of i_ino, high half of i_ino).
 *
 * If *@len is below 3 it is set to 3 and 255 is returned.  Otherwise *@len
 * is set to 3 and 1 is returned.  @connectable is not used.
 *
 * Inodes are hashed here, on first encode, rather than at creation time.
 */
static int shmem_encode_fh(struct dentry *dentry, __u32 *fh, int *len,
				int connectable)
{
	struct inode *inode = dentry->d_inode;

	if (*len < 3) {
		*len = 3;
		return 255;
	}

	if (inode_unhashed(inode)) {
		/*
		 * Unfortunately insert_inode_hash is not idempotent, so as
		 * we hash inodes here rather than at creation time, we need
		 * a lock to ensure we only try to do it once: re-check
		 * under the lock before inserting.
		 */
		static DEFINE_SPINLOCK(lock);

		spin_lock(&lock);
		if (inode_unhashed(inode))
			__insert_inode_hash(inode,
					    inode->i_ino + inode->i_generation);
		spin_unlock(&lock);
	}

	fh[0] = inode->i_generation;
	fh[1] = inode->i_ino;
	fh[2] = ((__u64)inode->i_ino) >> 32;
	*len = 3;

	return 1;
}
|
|
|
|
static const struct export_operations shmem_export_ops = {
|
|
.get_parent = shmem_get_parent,
|
|
.encode_fh = shmem_encode_fh,
|
|
.fh_to_dentry = shmem_fh_to_dentry,
|
|
};
|
|
|
|
static int shmem_parse_options(char *options, struct shmem_sb_info *sbinfo,
|
|
bool remount)
|
|
{
|
|
char *this_char, *value, *rest;
|
|
|
|
while (options != NULL) {
|
|
this_char = options;
|
|
for (;;) {
|
|
/*
|
|
* NUL-terminate this option: unfortunately,
|
|
* mount options form a comma-separated list,
|
|
* but mpol's nodelist may also contain commas.
|
|
*/
|
|
options = strchr(options, ',');
|
|
if (options == NULL)
|
|
break;
|
|
options++;
|
|
if (!isdigit(*options)) {
|
|
options[-1] = '\0';
|
|
break;
|
|
}
|
|
}
|
|
if (!*this_char)
|
|
continue;
|
|
if ((value = strchr(this_char,'=')) != NULL) {
|
|
*value++ = 0;
|
|
} else {
|
|
printk(KERN_ERR
|
|
"tmpfs: No value for mount option '%s'\n",
|
|
this_char);
|
|
return 1;
|
|
}
|
|
|
|
if (!strcmp(this_char,"size")) {
|
|
unsigned long long size;
|
|
size = memparse(value,&rest);
|
|
if (*rest == '%') {
|
|
size <<= PAGE_SHIFT;
|
|
size *= totalram_pages;
|
|
do_div(size, 100);
|
|
rest++;
|
|
}
|
|
if (*rest)
|
|
goto bad_val;
|
|
sbinfo->max_blocks =
|
|
DIV_ROUND_UP(size, PAGE_CACHE_SIZE);
|
|
} else if (!strcmp(this_char,"nr_blocks")) {
|
|
sbinfo->max_blocks = memparse(value, &rest);
|
|
if (*rest)
|
|
goto bad_val;
|
|
} else if (!strcmp(this_char,"nr_inodes")) {
|
|
sbinfo->max_inodes = memparse(value, &rest);
|
|
if (*rest)
|
|
goto bad_val;
|
|
} else if (!strcmp(this_char,"mode")) {
|
|
if (remount)
|
|
continue;
|
|
sbinfo->mode = simple_strtoul(value, &rest, 8) & 07777;
|
|
if (*rest)
|
|
goto bad_val;
|
|
} else if (!strcmp(this_char,"uid")) {
|
|
if (remount)
|
|
continue;
|
|
sbinfo->uid = simple_strtoul(value, &rest, 0);
|
|
if (*rest)
|
|
goto bad_val;
|
|
} else if (!strcmp(this_char,"gid")) {
|
|
if (remount)
|
|
continue;
|
|
sbinfo->gid = simple_strtoul(value, &rest, 0);
|
|
if (*rest)
|
|
goto bad_val;
|
|
} else if (!strcmp(this_char,"mpol")) {
|
|
if (mpol_parse_str(value, &sbinfo->mpol, 1))
|
|
goto bad_val;
|
|
} else {
|
|
printk(KERN_ERR "tmpfs: Bad mount option %s\n",
|
|
this_char);
|
|
return 1;
|
|
}
|
|
}
|
|
return 0;
|
|
|
|
bad_val:
|
|
printk(KERN_ERR "tmpfs: Bad value '%s' for mount option '%s'\n",
|
|
value, this_char);
|
|
return 1;
|
|
|
|
}
|
|
|
|
static int shmem_remount_fs(struct super_block *sb, int *flags, char *data)
|
|
{
|
|
struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
|
|
struct shmem_sb_info config = *sbinfo;
|
|
unsigned long inodes;
|
|
int error = -EINVAL;
|
|
|
|
config.mpol = NULL;
|
|
if (shmem_parse_options(data, &config, true))
|
|
return error;
|
|
|
|
spin_lock(&sbinfo->stat_lock);
|
|
inodes = sbinfo->max_inodes - sbinfo->free_inodes;
|
|
if (percpu_counter_compare(&sbinfo->used_blocks, config.max_blocks) > 0)
|
|
goto out;
|
|
if (config.max_inodes < inodes)
|
|
goto out;
|
|
/*
|
|
* Those tests disallow limited->unlimited while any are in use;
|
|
* but we must separately disallow unlimited->limited, because
|
|
* in that case we have no record of how much is already in use.
|
|
*/
|
|
if (config.max_blocks && !sbinfo->max_blocks)
|
|
goto out;
|
|
if (config.max_inodes && !sbinfo->max_inodes)
|
|
goto out;
|
|
|
|
error = 0;
|
|
sbinfo->max_blocks = config.max_blocks;
|
|
sbinfo->max_inodes = config.max_inodes;
|
|
sbinfo->free_inodes = config.max_inodes - inodes;
|
|
|
|
/*
|
|
* Preserve previous mempolicy unless mpol remount option was specified.
|
|
*/
|
|
if (config.mpol) {
|
|
mpol_put(sbinfo->mpol);
|
|
sbinfo->mpol = config.mpol; /* transfers initial ref */
|
|
}
|
|
out:
|
|
spin_unlock(&sbinfo->stat_lock);
|
|
return error;
|
|
}
|
|
|
|
static int shmem_show_options(struct seq_file *seq, struct dentry *root)
|
|
{
|
|
struct shmem_sb_info *sbinfo = SHMEM_SB(root->d_sb);
|
|
|
|
if (sbinfo->max_blocks != shmem_default_max_blocks())
|
|
seq_printf(seq, ",size=%luk",
|
|
sbinfo->max_blocks << (PAGE_CACHE_SHIFT - 10));
|
|
if (sbinfo->max_inodes != shmem_default_max_inodes())
|
|
seq_printf(seq, ",nr_inodes=%lu", sbinfo->max_inodes);
|
|
if (sbinfo->mode != (S_IRWXUGO | S_ISVTX))
|
|
seq_printf(seq, ",mode=%03ho", sbinfo->mode);
|
|
if (sbinfo->uid != 0)
|
|
seq_printf(seq, ",uid=%u", sbinfo->uid);
|
|
if (sbinfo->gid != 0)
|
|
seq_printf(seq, ",gid=%u", sbinfo->gid);
|
|
shmem_show_mpol(seq, sbinfo->mpol);
|
|
return 0;
|
|
}
|
|
|
|
#define MFD_NAME_PREFIX "memfd:"
|
|
#define MFD_NAME_PREFIX_LEN (sizeof(MFD_NAME_PREFIX) - 1)
|
|
#define MFD_NAME_MAX_LEN (NAME_MAX - MFD_NAME_PREFIX_LEN)
|
|
|
|
#define MFD_ALL_FLAGS (MFD_CLOEXEC | MFD_ALLOW_SEALING)
|
|
|
|
SYSCALL_DEFINE2(memfd_create,
|
|
const char __user *, uname,
|
|
unsigned int, flags)
|
|
{
|
|
struct shmem_inode_info *info;
|
|
struct file *file;
|
|
int fd, error;
|
|
char *name;
|
|
long len;
|
|
|
|
if (flags & ~(unsigned int)MFD_ALL_FLAGS)
|
|
return -EINVAL;
|
|
|
|
/* length includes terminating zero */
|
|
len = strnlen_user(uname, MFD_NAME_MAX_LEN + 1);
|
|
if (len <= 0)
|
|
return -EFAULT;
|
|
if (len > MFD_NAME_MAX_LEN + 1)
|
|
return -EINVAL;
|
|
|
|
name = kmalloc(len + MFD_NAME_PREFIX_LEN, GFP_TEMPORARY);
|
|
if (!name)
|
|
return -ENOMEM;
|
|
|
|
strcpy(name, MFD_NAME_PREFIX);
|
|
if (copy_from_user(&name[MFD_NAME_PREFIX_LEN], uname, len)) {
|
|
error = -EFAULT;
|
|
goto err_name;
|
|
}
|
|
|
|
/* terminating-zero may have changed after strnlen_user() returned */
|
|
if (name[len + MFD_NAME_PREFIX_LEN - 1]) {
|
|
error = -EFAULT;
|
|
goto err_name;
|
|
}
|
|
|
|
fd = get_unused_fd_flags((flags & MFD_CLOEXEC) ? O_CLOEXEC : 0);
|
|
if (fd < 0) {
|
|
error = fd;
|
|
goto err_name;
|
|
}
|
|
|
|
file = shmem_file_setup(name, 0, VM_NORESERVE);
|
|
if (IS_ERR(file)) {
|
|
error = PTR_ERR(file);
|
|
goto err_fd;
|
|
}
|
|
info = SHMEM_I(file->f_path.dentry->d_inode);
|
|
file->f_mode |= FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE;
|
|
file->f_flags |= O_RDWR | O_LARGEFILE;
|
|
|
|
fd_install(fd, file);
|
|
kfree(name);
|
|
return fd;
|
|
|
|
err_fd:
|
|
put_unused_fd(fd);
|
|
err_name:
|
|
kfree(name);
|
|
return error;
|
|
}
|
|
|
|
#endif /* CONFIG_TMPFS */
|
|
|
|
static void shmem_put_super(struct super_block *sb)
|
|
{
|
|
struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
|
|
|
|
percpu_counter_destroy(&sbinfo->used_blocks);
|
|
kfree(sbinfo);
|
|
sb->s_fs_info = NULL;
|
|
}
|
|
|
|
int shmem_fill_super(struct super_block *sb, void *data, int silent)
|
|
{
|
|
struct inode *inode;
|
|
struct shmem_sb_info *sbinfo;
|
|
int err = -ENOMEM;
|
|
|
|
/* Round up to L1_CACHE_BYTES to resist false sharing */
|
|
sbinfo = kzalloc(max((int)sizeof(struct shmem_sb_info),
|
|
L1_CACHE_BYTES), GFP_KERNEL);
|
|
if (!sbinfo)
|
|
return -ENOMEM;
|
|
|
|
sbinfo->mode = S_IRWXUGO | S_ISVTX;
|
|
sbinfo->uid = current_fsuid();
|
|
sbinfo->gid = current_fsgid();
|
|
sb->s_fs_info = sbinfo;
|
|
|
|
#ifdef CONFIG_TMPFS
|
|
/*
|
|
* Per default we only allow half of the physical ram per
|
|
* tmpfs instance, limiting inodes to one per page of lowmem;
|
|
* but the internal instance is left unlimited.
|
|
*/
|
|
if (!(sb->s_flags & MS_NOUSER)) {
|
|
sbinfo->max_blocks = shmem_default_max_blocks();
|
|
sbinfo->max_inodes = shmem_default_max_inodes();
|
|
if (shmem_parse_options(data, sbinfo, false)) {
|
|
err = -EINVAL;
|
|
goto failed;
|
|
}
|
|
}
|
|
sb->s_export_op = &shmem_export_ops;
|
|
#else
|
|
sb->s_flags |= MS_NOUSER;
|
|
#endif
|
|
|
|
spin_lock_init(&sbinfo->stat_lock);
|
|
if (percpu_counter_init(&sbinfo->used_blocks, 0))
|
|
goto failed;
|
|
sbinfo->free_inodes = sbinfo->max_inodes;
|
|
|
|
sb->s_maxbytes = MAX_LFS_FILESIZE;
|
|
sb->s_blocksize = PAGE_CACHE_SIZE;
|
|
sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
|
|
sb->s_magic = TMPFS_MAGIC;
|
|
sb->s_op = &shmem_ops;
|
|
sb->s_time_gran = 1;
|
|
#ifdef CONFIG_TMPFS_XATTR
|
|
sb->s_xattr = shmem_xattr_handlers;
|
|
#endif
|
|
#ifdef CONFIG_TMPFS_POSIX_ACL
|
|
sb->s_flags |= MS_POSIXACL;
|
|
#endif
|
|
|
|
inode = shmem_get_inode(sb, NULL, S_IFDIR | sbinfo->mode, 0, VM_NORESERVE);
|
|
if (!inode)
|
|
goto failed;
|
|
inode->i_uid = sbinfo->uid;
|
|
inode->i_gid = sbinfo->gid;
|
|
sb->s_root = d_make_root(inode);
|
|
if (!sb->s_root)
|
|
goto failed;
|
|
return 0;
|
|
|
|
failed:
|
|
shmem_put_super(sb);
|
|
return err;
|
|
}
|
|
|
|
/* Slab cache for struct shmem_inode_info; set up in shmem_init_inodecache(). */
static struct kmem_cache *shmem_inode_cachep;
|
|
|
|
static struct inode *shmem_alloc_inode(struct super_block *sb)
|
|
{
|
|
struct shmem_inode_info *info;
|
|
info = kmem_cache_alloc(shmem_inode_cachep, GFP_KERNEL);
|
|
if (!info)
|
|
return NULL;
|
|
return &info->vfs_inode;
|
|
}
|
|
|
|
static void shmem_destroy_callback(struct rcu_head *head)
|
|
{
|
|
struct inode *inode = container_of(head, struct inode, i_rcu);
|
|
kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode));
|
|
}
|
|
|
|
static void shmem_destroy_inode(struct inode *inode)
|
|
{
|
|
if (S_ISREG(inode->i_mode))
|
|
mpol_free_shared_policy(&SHMEM_I(inode)->policy);
|
|
call_rcu(&inode->i_rcu, shmem_destroy_callback);
|
|
}
|
|
|
|
static void shmem_init_inode(void *foo)
|
|
{
|
|
struct shmem_inode_info *info = foo;
|
|
inode_init_once(&info->vfs_inode);
|
|
}
|
|
|
|
static int shmem_init_inodecache(void)
|
|
{
|
|
shmem_inode_cachep = kmem_cache_create("shmem_inode_cache",
|
|
sizeof(struct shmem_inode_info),
|
|
0, SLAB_PANIC, shmem_init_inode);
|
|
return 0;
|
|
}
|
|
|
|
static void shmem_destroy_inodecache(void)
|
|
{
|
|
kmem_cache_destroy(shmem_inode_cachep);
|
|
}
|
|
|
|
static const struct address_space_operations shmem_aops = {
|
|
.writepage = shmem_writepage,
|
|
.set_page_dirty = __set_page_dirty_no_writeback,
|
|
#ifdef CONFIG_TMPFS
|
|
.write_begin = shmem_write_begin,
|
|
.write_end = shmem_write_end,
|
|
#endif
|
|
.migratepage = migrate_page,
|
|
.error_remove_page = generic_error_remove_page,
|
|
};
|
|
|
|
static const struct file_operations shmem_file_operations = {
|
|
.mmap = shmem_mmap,
|
|
#ifdef CONFIG_TMPFS
|
|
.llseek = generic_file_llseek,
|
|
.read = do_sync_read,
|
|
.write = do_sync_write,
|
|
.aio_read = shmem_file_aio_read,
|
|
.aio_write = generic_file_aio_write,
|
|
.fsync = noop_fsync,
|
|
.splice_read = shmem_file_splice_read,
|
|
.splice_write = generic_file_splice_write,
|
|
#endif
|
|
};
|
|
|
|
static const struct inode_operations shmem_inode_operations = {
|
|
.setattr = shmem_setattr,
|
|
.truncate_range = shmem_truncate_range,
|
|
#ifdef CONFIG_TMPFS_XATTR
|
|
.setxattr = shmem_setxattr,
|
|
.getxattr = shmem_getxattr,
|
|
.listxattr = shmem_listxattr,
|
|
.removexattr = shmem_removexattr,
|
|
#endif
|
|
};
|
|
|
|
/*
 * Inode operations for shmem directories.  Namespace operations require
 * CONFIG_TMPFS, the xattr hooks CONFIG_TMPFS_XATTR, and ->setattr is
 * only wired up with CONFIG_TMPFS_POSIX_ACL.
 */
static const struct inode_operations shmem_dir_inode_operations = {
#ifdef CONFIG_TMPFS
	.lookup		= simple_lookup,
	.create		= shmem_create,
	.mknod		= shmem_mknod,
	.mkdir		= shmem_mkdir,
	.rmdir		= shmem_rmdir,
	.symlink	= shmem_symlink,
	.link		= shmem_link,
	.unlink		= shmem_unlink,
	.rename		= shmem_rename,
	.tmpfile	= shmem_tmpfile,
#endif
#ifdef CONFIG_TMPFS_XATTR
	.getxattr	= shmem_getxattr,
	.setxattr	= shmem_setxattr,
	.removexattr	= shmem_removexattr,
	.listxattr	= shmem_listxattr,
#endif
#ifdef CONFIG_TMPFS_POSIX_ACL
	.setattr	= shmem_setattr,
#endif
};
|
|
|
|
/*
 * Inode operations table named for "special" inodes: only the xattr
 * hooks (CONFIG_TMPFS_XATTR) and ->setattr (CONFIG_TMPFS_POSIX_ACL);
 * empty when neither option is enabled.
 */
static const struct inode_operations shmem_special_inode_operations = {
#ifdef CONFIG_TMPFS_XATTR
	.getxattr	= shmem_getxattr,
	.setxattr	= shmem_setxattr,
	.removexattr	= shmem_removexattr,
	.listxattr	= shmem_listxattr,
#endif
#ifdef CONFIG_TMPFS_POSIX_ACL
	.setattr	= shmem_setattr,
#endif
};
|
|
|
|
static const struct super_operations shmem_ops = {
|
|
.alloc_inode = shmem_alloc_inode,
|
|
.destroy_inode = shmem_destroy_inode,
|
|
#ifdef CONFIG_TMPFS
|
|
.statfs = shmem_statfs,
|
|
.remount_fs = shmem_remount_fs,
|
|
.show_options = shmem_show_options,
|
|
#endif
|
|
.evict_inode = shmem_evict_inode,
|
|
.drop_inode = generic_delete_inode,
|
|
.put_super = shmem_put_super,
|
|
};
|
|
|
|
static const struct vm_operations_struct shmem_vm_ops = {
|
|
.fault = shmem_fault,
|
|
#ifdef CONFIG_NUMA
|
|
.set_policy = shmem_set_policy,
|
|
.get_policy = shmem_get_policy,
|
|
#endif
|
|
};
|
|
|
|
static struct dentry *shmem_mount(struct file_system_type *fs_type,
|
|
int flags, const char *dev_name, void *data)
|
|
{
|
|
return mount_nodev(fs_type, flags, data, shmem_fill_super);
|
|
}
|
|
|
|
static struct file_system_type shmem_fs_type = {
|
|
.owner = THIS_MODULE,
|
|
.name = "tmpfs",
|
|
.mount = shmem_mount,
|
|
.kill_sb = kill_litter_super,
|
|
};
|
|
|
|
/*
 * shmem_init - boot-time initialisation of shmem/tmpfs
 *
 * Sets up the backing-dev info, the inode slab cache, registers the
 * "tmpfs" filesystem type and creates the internal (MS_NOUSER) mount
 * stored in shm_mnt.
 *
 * Returns 0 on success.  On failure every completed step is undone,
 * shm_mnt is set to ERR_PTR(error) and the error is returned.
 */
int __init shmem_init(void)
{
	int error;

	error = bdi_init(&shmem_backing_dev_info);
	if (error)
		goto fail;

	error = shmem_init_inodecache();
	if (error)
		goto undo_bdi;

	error = register_filesystem(&shmem_fs_type);
	if (error) {
		printk(KERN_ERR "Could not register tmpfs\n");
		goto undo_inodecache;
	}

	shm_mnt = vfs_kern_mount(&shmem_fs_type, MS_NOUSER,
				 shmem_fs_type.name, NULL);
	if (!IS_ERR(shm_mnt))
		return 0;

	/* Internal mount failed: unwind in reverse order of setup. */
	error = PTR_ERR(shm_mnt);
	printk(KERN_ERR "Could not kern_mount tmpfs\n");

	unregister_filesystem(&shmem_fs_type);
undo_inodecache:
	shmem_destroy_inodecache();
undo_bdi:
	bdi_destroy(&shmem_backing_dev_info);
fail:
	shm_mnt = ERR_PTR(error);
	return error;
}
|
|
|
|
#else /* !CONFIG_SHMEM */
|
|
|
|
/*
 * tiny-shmem: simple shmemfs and tmpfs using ramfs code
 *
 * This is intended for small systems where the benefits of the full
 * shmem code (swap-backed and resource-limited) are outweighed by
 * its complexity. On systems without swap this code should be
 * effectively equivalent, but much lighter weight.
 */
|
|
|
|
#include <linux/ramfs.h>
|
|
|
|
static struct file_system_type shmem_fs_type = {
|
|
.name = "tmpfs",
|
|
.mount = ramfs_mount,
|
|
.kill_sb = kill_litter_super,
|
|
};
|
|
|
|
/*
 * tiny-shmem initialisation: register "tmpfs" and create the internal
 * mount shm_mnt.  Either step failing is fatal (BUG_ON); returns 0.
 */
int __init shmem_init(void)
{
	int err = register_filesystem(&shmem_fs_type);

	BUG_ON(err != 0);

	shm_mnt = kern_mount(&shmem_fs_type);
	BUG_ON(IS_ERR(shm_mnt));

	return 0;
}
|
|
|
|
/* tiny-shmem stub: does nothing with @swap or @page and reports success. */
int shmem_unuse(swp_entry_t swap, struct page *page)
{
	return 0;
}
|
|
|
|
/* tiny-shmem stub: locking is a no-op that always reports success. */
int shmem_lock(struct file *file, int lock, struct user_struct *user)
{
	return 0;
}
|
|
|
|
/* tiny-shmem stub: intentionally empty. */
void shmem_unlock_mapping(struct address_space *mapping)
{
}
|
|
|
|
/*
 * tiny-shmem version: drop the page-cache pages of @inode's mapping in
 * the byte range [@lstart, @lend].
 */
void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
{
	struct address_space *mapping = inode->i_mapping;

	truncate_inode_pages_range(mapping, lstart, lend);
}
EXPORT_SYMBOL_GPL(shmem_truncate_range);
|
|
|
|
/*
 * tiny-shmem version: range truncation through i_op->truncate_range()
 * was only ever supported by the CONFIG_SHMEM shmem.c, so this always
 * fails with -ENOSYS.
 */
int vmtruncate_range(struct inode *inode, loff_t lstart, loff_t lend)
{
	return -ENOSYS;
}
|
|
|
|
/*
 * tiny-shmem: map the names used by the common code below onto their
 * generic/ramfs counterparts.  Size accounting is compiled out:
 * shmem_acct_size() always "succeeds" (0) and shmem_unacct_size() is a
 * no-op.
 */
#define shmem_vm_ops				generic_file_vm_ops
#define shmem_file_operations			ramfs_file_operations
#define shmem_get_inode(sb, dir, mode, dev, flags)	ramfs_get_inode(sb, dir, mode, dev)
#define shmem_acct_size(flags, size)		0
#define shmem_unacct_size(flags, size)		do {} while (0)
|
|
|
|
#endif /* CONFIG_SHMEM */
|
|
|
|
/* common code */
|
|
|
|
/**
|
|
* shmem_file_setup - get an unlinked file living in tmpfs
|
|
* @name: name for dentry (to be seen in /proc/<pid>/maps
|
|
* @size: size to be set for the file
|
|
* @flags: VM_NORESERVE suppresses pre-accounting of the entire object size
|
|
*/
|
|
struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags)
|
|
{
|
|
int error;
|
|
struct file *file;
|
|
struct inode *inode;
|
|
struct path path;
|
|
struct dentry *root;
|
|
struct qstr this;
|
|
|
|
if (IS_ERR(shm_mnt))
|
|
return (void *)shm_mnt;
|
|
|
|
if (size < 0 || size > MAX_LFS_FILESIZE)
|
|
return ERR_PTR(-EINVAL);
|
|
|
|
if (shmem_acct_size(flags, size))
|
|
return ERR_PTR(-ENOMEM);
|
|
|
|
error = -ENOMEM;
|
|
this.name = name;
|
|
this.len = strlen(name);
|
|
this.hash = 0; /* will go */
|
|
root = shm_mnt->mnt_root;
|
|
path.dentry = d_alloc(root, &this);
|
|
if (!path.dentry)
|
|
goto put_memory;
|
|
path.mnt = mntget(shm_mnt);
|
|
|
|
error = -ENOSPC;
|
|
inode = shmem_get_inode(root->d_sb, NULL, S_IFREG | S_IRWXUGO, 0, flags);
|
|
if (!inode)
|
|
goto put_dentry;
|
|
|
|
d_instantiate(path.dentry, inode);
|
|
inode->i_size = size;
|
|
clear_nlink(inode); /* It is unlinked */
|
|
#ifndef CONFIG_MMU
|
|
error = ramfs_nommu_expand_for_mapping(inode, size);
|
|
if (error)
|
|
goto put_dentry;
|
|
#endif
|
|
|
|
error = -ENFILE;
|
|
file = alloc_file(&path, FMODE_WRITE | FMODE_READ,
|
|
&shmem_file_operations);
|
|
if (!file)
|
|
goto put_dentry;
|
|
|
|
return file;
|
|
|
|
put_dentry:
|
|
path_put(&path);
|
|
put_memory:
|
|
shmem_unacct_size(flags, size);
|
|
return ERR_PTR(error);
|
|
}
|
|
EXPORT_SYMBOL_GPL(shmem_file_setup);
|
|
|
|
void shmem_set_file(struct vm_area_struct *vma, struct file *file)
|
|
{
|
|
if (vma->vm_file)
|
|
fput(vma->vm_file);
|
|
vma->vm_file = file;
|
|
vma->vm_ops = &shmem_vm_ops;
|
|
vma->vm_flags |= VM_CAN_NONLINEAR;
|
|
}
|
|
|
|
/**
|
|
* shmem_zero_setup - setup a shared anonymous mapping
|
|
* @vma: the vma to be mmapped is prepared by do_mmap_pgoff
|
|
*/
|
|
int shmem_zero_setup(struct vm_area_struct *vma)
|
|
{
|
|
struct file *file;
|
|
loff_t size = vma->vm_end - vma->vm_start;
|
|
|
|
file = shmem_file_setup("dev/zero", size, vma->vm_flags);
|
|
if (IS_ERR(file))
|
|
return PTR_ERR(file);
|
|
|
|
shmem_set_file(vma, file);
|
|
return 0;
|
|
}
|
|
|
|
/**
|
|
* shmem_read_mapping_page_gfp - read into page cache, using specified page allocation flags.
|
|
* @mapping: the page's address_space
|
|
* @index: the page index
|
|
* @gfp: the page allocator flags to use if allocating
|
|
*
|
|
* This behaves as a tmpfs "read_cache_page_gfp(mapping, index, gfp)",
|
|
* with any new page allocations done using the specified allocation flags.
|
|
* But read_cache_page_gfp() uses the ->readpage() method: which does not
|
|
* suit tmpfs, since it may have pages in swapcache, and needs to find those
|
|
* for itself; although drivers/gpu/drm i915 and ttm rely upon this support.
|
|
*
|
|
* i915_gem_object_get_pages_gtt() mixes __GFP_NORETRY | __GFP_NOWARN in
|
|
* with the mapping_gfp_mask(), to avoid OOMing the machine unnecessarily.
|
|
*/
|
|
struct page *shmem_read_mapping_page_gfp(struct address_space *mapping,
|
|
pgoff_t index, gfp_t gfp)
|
|
{
|
|
#ifdef CONFIG_SHMEM
|
|
struct inode *inode = mapping->host;
|
|
struct page *page;
|
|
int error;
|
|
|
|
BUG_ON(mapping->a_ops != &shmem_aops);
|
|
error = shmem_getpage_gfp(inode, index, &page, SGP_CACHE, gfp, NULL);
|
|
if (error)
|
|
page = ERR_PTR(error);
|
|
else
|
|
unlock_page(page);
|
|
return page;
|
|
#else
|
|
/*
|
|
* The tiny !SHMEM case uses ramfs without swap
|
|
*/
|
|
return read_cache_page_gfp(mapping, index, gfp);
|
|
#endif
|
|
}
|
|
EXPORT_SYMBOL_GPL(shmem_read_mapping_page_gfp);
|