From b7e570eef8454eb54d472454afdbab43ee4399a9 Mon Sep 17 00:00:00 2001 From: Arne Coucheron Date: Thu, 3 Oct 2019 04:04:47 +0200 Subject: [PATCH 01/54] Revert "shmem: fix splicing from a hole while it's punched" This reverts commit 21618a8f0f46ff327acd6a079bc05c065561e4c0. Change-Id: I1da9a6dbb1cb829c3964c3c29210872920a15898 --- mm/shmem.c | 24 +++++++++--------------- 1 file changed, 9 insertions(+), 15 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index bf771f4288cd..f7ad7fbe450e 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -501,19 +501,22 @@ void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend) } index = start; - while (index <= end) { + for ( ; ; ) { cond_resched(); pvec.nr = shmem_find_get_pages_and_swap(mapping, index, min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1, pvec.pages, indices); if (!pvec.nr) { - /* If all gone or hole-punch, we're done */ - if (index == start || end != -1) + if (index == start) break; - /* But if truncating, restart to make sure all gone */ index = start; continue; } + if (index == start && indices[0] > end) { + shmem_deswap_pagevec(&pvec); + pagevec_release(&pvec); + break; + } mem_cgroup_uncharge_start(); for (i = 0; i < pagevec_count(&pvec); i++) { struct page *page = pvec.pages[i]; @@ -523,12 +526,8 @@ void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend) break; if (radix_tree_exceptional_entry(page)) { - if (shmem_free_swap(mapping, index, page)) { - /* Swap was replaced by page: retry */ - index--; - break; - } - nr_swaps_freed++; + nr_swaps_freed += !shmem_free_swap(mapping, + index, page); continue; } @@ -536,11 +535,6 @@ void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend) if (page->mapping == mapping) { VM_BUG_ON(PageWriteback(page)); truncate_inode_page(mapping, page); - } else { - /* Page was replaced by swap: retry */ - unlock_page(page); - index--; - break; } unlock_page(page); } From 650ef9e38b95a8471cbc032903e681fe2332c27a Mon Sep 17 00:00:00 2001 From: Arne Coucheron Date: Thu, 3 Oct 2019 04:04:52 +0200 Subject: [PATCH 02/54] Revert "shmem: fix faulting into a hole, not taking i_mutex" This reverts commit 0f5a4a003f5d440c09f414f0c6fc6c591baa8b15. Change-Id: Id26b748385c0ab269a455299c8ed55e58a90d23e --- mm/shmem.c | 64 +++++++++++++++++------------------------------------- 1 file changed, 20 insertions(+), 44 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index f7ad7fbe450e..4e14a6078073 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -84,7 +84,6 @@ static struct vfsmount *shm_mnt; * a time): we would prefer not to enlarge the shmem inode just for that. */ struct shmem_falloc { - wait_queue_head_t *waitq; /* faults into hole wait for punch to end */ pgoff_t start; /* start of range currently being fallocated */ pgoff_t next; /* the next page offset to be fallocated */ }; @@ -1077,57 +1076,37 @@ static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf) * Trinity finds that probing a hole which tmpfs is punching can * prevent the hole-punch from ever completing: which in turn * locks writers out with its hold on i_mutex. So refrain from - * faulting pages into the hole while it's being punched. Although - * shmem_truncate_range() does remove the additions, it may be unable to - * keep up, as each new page needs its own unmap_mapping_range() call, - * and the i_mmap tree grows ever slower to scan if new vmas are added. - * - * It does not matter if we sometimes reach this check just before the - * hole-punch begins, so that one fault then races with the punch: - * we just need to make racing faults a rare case. - * - * The implementation below would be much simpler if we just used a - * standard mutex or completion: but we cannot take i_mutex in fault, - * and bloating every shmem inode for this unlikely case would be sad. + * faulting pages into the hole while it's being punched, and + * wait on i_mutex to be released if vmf->flags permits. */ if (unlikely(inode->i_private)) { struct shmem_falloc *shmem_falloc; spin_lock(&inode->i_lock); shmem_falloc = inode->i_private; - if (shmem_falloc && - vmf->pgoff >= shmem_falloc->start && - vmf->pgoff < shmem_falloc->next) { - wait_queue_head_t *shmem_falloc_waitq; - DEFINE_WAIT(shmem_fault_wait); - - ret = VM_FAULT_NOPAGE; + if (!shmem_falloc || + vmf->pgoff < shmem_falloc->start || + vmf->pgoff >= shmem_falloc->next) + shmem_falloc = NULL; + spin_unlock(&inode->i_lock); + /* + * i_lock has protected us from taking shmem_falloc seriously + * once return from vmtruncate_range() went back up that stack. + * i_lock does not serialize with i_mutex at all, but it does + * not matter if sometimes we wait unnecessarily, or sometimes + * miss out on waiting: we just need to make those cases rare. + */ + if (shmem_falloc) { if ((vmf->flags & FAULT_FLAG_ALLOW_RETRY) && !(vmf->flags & FAULT_FLAG_RETRY_NOWAIT)) { - /* It's polite to up mmap_sem if we can */ up_read(&vma->vm_mm->mmap_sem); - ret = VM_FAULT_RETRY; + mutex_lock(&inode->i_mutex); + mutex_unlock(&inode->i_mutex); + return VM_FAULT_RETRY; } - - shmem_falloc_waitq = shmem_falloc->waitq; - prepare_to_wait(shmem_falloc_waitq, &shmem_fault_wait, - TASK_UNINTERRUPTIBLE); - spin_unlock(&inode->i_lock); - schedule(); - - /* - * shmem_falloc_waitq points into the vmtruncate_range() - * stack of the hole-punching task: shmem_falloc_waitq - * is usually invalid by the time we reach here, but - * finish_wait() does not dereference it in that case; - * though i_lock needed lest racing with wake_up_all(). - */ - spin_lock(&inode->i_lock); - finish_wait(shmem_falloc_waitq, &shmem_fault_wait); - spin_unlock(&inode->i_lock); - return ret; + /* cond_resched? Leave that to GUP or return to user */ + return VM_FAULT_NOPAGE; } - spin_unlock(&inode->i_lock); } error = shmem_getpage(inode, vmf->pgoff, &vmf->page, SGP_CACHE, &ret); @@ -1158,9 +1137,7 @@ int vmtruncate_range(struct inode *inode, loff_t lstart, loff_t lend) struct address_space *mapping = inode->i_mapping; loff_t unmap_start = round_up(lstart, PAGE_SIZE); loff_t unmap_end = round_down(1 + lend, PAGE_SIZE) - 1; - DECLARE_WAIT_QUEUE_HEAD_ONSTACK(shmem_falloc_waitq); - shmem_falloc.waitq = &shmem_falloc_waitq; shmem_falloc.start = unmap_start >> PAGE_SHIFT; shmem_falloc.next = (unmap_end + 1) >> PAGE_SHIFT; spin_lock(&inode->i_lock); @@ -1175,7 +1152,6 @@ int vmtruncate_range(struct inode *inode, loff_t lstart, loff_t lend) spin_lock(&inode->i_lock); inode->i_private = NULL; - wake_up_all(&shmem_falloc_waitq); spin_unlock(&inode->i_lock); } mutex_unlock(&inode->i_mutex); From c4fc51e323baa7974cb47f03bb35dbb413a7d9d0 Mon Sep 17 00:00:00 2001 From: Arne Coucheron Date: Thu, 3 Oct 2019 04:04:56 +0200 Subject: [PATCH 03/54] Revert "shmem: fix faulting into a hole while it's punched" This reverts commit a62f374abb72830e1319d45ecb5e597b1944c8cd. Change-Id: I1e5047c2ea616b3cb942b22e57a4c606b0dcba2a --- mm/shmem.c | 91 --------------------------------------------------- mm/truncate.c | 25 ++++++++++++++ 2 files changed, 25 insertions(+), 91 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index 4e14a6078073..cb019e9c972f 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -78,16 +78,6 @@ static struct vfsmount *shm_mnt; /* Symlink up to this size is kmalloc'ed instead of using a swappable page */ #define SHORT_SYMLINK_LEN 128 -/* - * vmtruncate_range() communicates with shmem_fault via - * inode->i_private (with i_mutex making sure that it has only one user at - * a time): we would prefer not to enlarge the shmem inode just for that. - */ -struct shmem_falloc { - pgoff_t start; /* start of range currently being fallocated */ - pgoff_t next; /* the next page offset to be fallocated */ -}; - struct shmem_xattr { struct list_head list; /* anchored by shmem_inode_info->xattr_list */ char *name; /* xattr name */ @@ -1072,43 +1062,6 @@ static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf) int error; int ret = VM_FAULT_LOCKED; - /* - * Trinity finds that probing a hole which tmpfs is punching can - * prevent the hole-punch from ever completing: which in turn - * locks writers out with its hold on i_mutex. So refrain from - * faulting pages into the hole while it's being punched, and - * wait on i_mutex to be released if vmf->flags permits. - */ - if (unlikely(inode->i_private)) { - struct shmem_falloc *shmem_falloc; - - spin_lock(&inode->i_lock); - shmem_falloc = inode->i_private; - if (!shmem_falloc || - vmf->pgoff < shmem_falloc->start || - vmf->pgoff >= shmem_falloc->next) - shmem_falloc = NULL; - spin_unlock(&inode->i_lock); - /* - * i_lock has protected us from taking shmem_falloc seriously - * once return from vmtruncate_range() went back up that stack. - * i_lock does not serialize with i_mutex at all, but it does - * not matter if sometimes we wait unnecessarily, or sometimes - * miss out on waiting: we just need to make those cases rare. - */ - if (shmem_falloc) { - if ((vmf->flags & FAULT_FLAG_ALLOW_RETRY) && - !(vmf->flags & FAULT_FLAG_RETRY_NOWAIT)) { - up_read(&vma->vm_mm->mmap_sem); - mutex_lock(&inode->i_mutex); - mutex_unlock(&inode->i_mutex); - return VM_FAULT_RETRY; - } - /* cond_resched? Leave that to GUP or return to user */ - return VM_FAULT_NOPAGE; - } - } - error = shmem_getpage(inode, vmf->pgoff, &vmf->page, SGP_CACHE, &ret); if (error) return ((error == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS); @@ -1120,44 +1073,6 @@ static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf) return ret; } -int vmtruncate_range(struct inode *inode, loff_t lstart, loff_t lend) -{ - /* - * If the underlying filesystem is not going to provide - * a way to truncate a range of blocks (punch a hole) - - * we should return failure right now. - * Only CONFIG_SHMEM shmem.c ever supported i_op->truncate_range(). - */ - if (inode->i_op->truncate_range != shmem_truncate_range) - return -ENOSYS; - - mutex_lock(&inode->i_mutex); - { - struct shmem_falloc shmem_falloc; - struct address_space *mapping = inode->i_mapping; - loff_t unmap_start = round_up(lstart, PAGE_SIZE); - loff_t unmap_end = round_down(1 + lend, PAGE_SIZE) - 1; - - shmem_falloc.start = unmap_start >> PAGE_SHIFT; - shmem_falloc.next = (unmap_end + 1) >> PAGE_SHIFT; - spin_lock(&inode->i_lock); - inode->i_private = &shmem_falloc; - spin_unlock(&inode->i_lock); - - if ((u64)unmap_end > (u64)unmap_start) - unmap_mapping_range(mapping, unmap_start, - 1 + unmap_end - unmap_start, 0); - shmem_truncate_range(inode, lstart, lend); - /* No need to unmap again: hole-punching leaves COWed pages */ - - spin_lock(&inode->i_lock); - inode->i_private = NULL; - spin_unlock(&inode->i_lock); - } - mutex_unlock(&inode->i_mutex); - return 0; -} - #ifdef CONFIG_NUMA static int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *mpol) { @@ -2737,12 +2652,6 @@ void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend) } EXPORT_SYMBOL_GPL(shmem_truncate_range); -int vmtruncate_range(struct inode *inode, loff_t lstart, loff_t lend) -{ - /* Only CONFIG_SHMEM shmem.c ever supported i_op->truncate_range(). */ - return -ENOSYS; -} - #define shmem_vm_ops generic_file_vm_ops #define shmem_file_operations ramfs_file_operations #define shmem_get_inode(sb, dir, mode, dev, flags) ramfs_get_inode(sb, dir, mode, dev) diff --git a/mm/truncate.c b/mm/truncate.c index 57625f7ed8e1..584e3194e979 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -658,6 +658,31 @@ int vmtruncate(struct inode *inode, loff_t newsize) } EXPORT_SYMBOL(vmtruncate); +int vmtruncate_range(struct inode *inode, loff_t lstart, loff_t lend) +{ + struct address_space *mapping = inode->i_mapping; + loff_t holebegin = round_up(lstart, PAGE_SIZE); + loff_t holelen = 1 + lend - holebegin; + + /* + * If the underlying filesystem is not going to provide + * a way to truncate a range of blocks (punch a hole) - + * we should return failure right now. + */ + if (!inode->i_op->truncate_range) + return -ENOSYS; + + mutex_lock(&inode->i_mutex); + inode_dio_wait(inode); + unmap_mapping_range(mapping, holebegin, holelen, 1); + inode->i_op->truncate_range(inode, lstart, lend); + /* unmap again to remove racily COWed private pages */ + unmap_mapping_range(mapping, holebegin, holelen, 1); + mutex_unlock(&inode->i_mutex); + + return 0; +} + /** * truncate_pagecache_range - unmap and remove pagecache that is hole-punched * @inode: inode From 75f433a2beb2095730071df0f0bb02f1930e7e68 Mon Sep 17 00:00:00 2001 From: Arne Coucheron Date: Thu, 3 Oct 2019 00:49:42 +0200 Subject: [PATCH 04/54] Include uidgid.h in same header files as in newer kernels Doing this we don't have to add it when picking patches making use of functions in it. Change-Id: Ia9e2717f4dc0a69710fb3bb88b3e19f2af584776 --- include/linux/cred.h | 1 + include/linux/device.h | 1 + include/linux/fs.h | 1 + include/linux/ipc.h | 1 + include/linux/key.h | 1 + include/linux/nfs4.h | 1 + include/linux/nfs_idmap.h | 1 + include/linux/quota.h | 1 + include/linux/sched.h | 1 + include/linux/stat.h | 1 + include/linux/sunrpc/auth.h | 1 + include/net/netns/ipv4.h | 1 + 12 files changed, 12 insertions(+) diff --git a/include/linux/cred.h b/include/linux/cred.h index 7f5b298f7d49..3b3f0b318486 100644 --- a/include/linux/cred.h +++ b/include/linux/cred.h @@ -17,6 +17,7 @@ #include #include #include +#include struct user_struct; struct cred; diff --git a/include/linux/device.h b/include/linux/device.h index 9e80e90530c9..f5ad3fc57f09 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -23,6 +23,7 @@ #include #include #include +#include #include struct device; diff --git a/include/linux/fs.h b/include/linux/fs.h index d21590b2106e..ac139078c4a6 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -428,6 +428,7 @@ struct fscrypt_policy { #include #include #include +#include #include diff --git a/include/linux/ipc.h b/include/linux/ipc.h index 30e816148df4..30562c982a81 100644 --- a/include/linux/ipc.h +++ b/include/linux/ipc.h @@ -79,6 +79,7 @@ struct ipc_kludge { #ifdef __KERNEL__ #include +#include #define IPCMNI 32768 /* <= MAX_INT limit for ipc arrays (including sysctl changes) */ diff --git a/include/linux/key.h b/include/linux/key.h index eb3794de2017..4be9730b4fe8 100644 --- a/include/linux/key.h +++ b/include/linux/key.h @@ -24,6 +24,7 @@ #include #ifdef __KERNEL__ +#include /* key handle serial number */ typedef int32_t key_serial_t; diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h index 0987146b0637..6b86a236965e 100644 --- a/include/linux/nfs4.h +++ b/include/linux/nfs4.h @@ -167,6 +167,7 @@ enum nfs4_acl_whotype { #ifdef __KERNEL__ #include +#include struct nfs4_ace { uint32_t type; diff --git a/include/linux/nfs_idmap.h b/include/linux/nfs_idmap.h index 7eed2012d288..53b50c5f2543 100644 --- a/include/linux/nfs_idmap.h +++ b/include/linux/nfs_idmap.h @@ -37,6 +37,7 @@ #ifndef NFS_IDMAP_H #define NFS_IDMAP_H +#include #include /* XXX from bits/utmp.h */ diff --git a/include/linux/quota.h b/include/linux/quota.h index ffd8607ca4be..0e18a66a348d 100644 --- a/include/linux/quota.h +++ b/include/linux/quota.h @@ -35,6 +35,7 @@ #include #include +#include #define __DQUOT_VERSION__ "dquot_6.5.2" diff --git a/include/linux/sched.h b/include/linux/sched.h index e64bd55a161d..c89044df2912 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -90,6 +90,7 @@ struct sched_param { #include #include #include +#include #include diff --git a/include/linux/stat.h b/include/linux/stat.h index 611c398dab72..a918fcecaea2 100644 --- a/include/linux/stat.h +++ b/include/linux/stat.h @@ -58,6 +58,7 @@ #include #include +#include struct kstat { u64 ino; diff --git a/include/linux/sunrpc/auth.h b/include/linux/sunrpc/auth.h index 492a36d72829..6a1ce4781710 100644 --- a/include/linux/sunrpc/auth.h +++ b/include/linux/sunrpc/auth.h @@ -17,6 +17,7 @@ #include #include +#include /* size of the nodename buffer */ #define UNX_MAXNODENAME 32 diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index adbe8c6e20d9..b2e6a7b6da9f 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -5,6 +5,7 @@ #ifndef __NETNS_IPV4_H__ #define __NETNS_IPV4_H__ +#include #include struct ctl_table_header; From d0cb59457b2fa252b41659db5c672ca004701cab Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 29 May 2012 15:06:38 -0700 Subject: [PATCH 05/54] shmem: replace page if mapping excludes its zone The GMA500 GPU driver uses GEM shmem objects, but with a new twist: the backing RAM has to be below 4GB. Not a problem while the boards supported only 4GB: but now Intel's D2700MUD boards support 8GB, and their GMA3600 is managed by the GMA500 driver. shmem/tmpfs has never pretended to support hardware restrictions on the backing memory, but it might have appeared to do so before v3.1, and even now it works fine until a page is swapped out then back in. When read_cache_page_gfp() supplied a freshly allocated page for copy, that compensated for whatever choice might have been made by earlier swapin readahead; but swapoff was likely to destroy the illusion. We'd like to continue to support GMA500, so now add a new shmem_should_replace_page() check on the zone when about to move a page from swapcache to filecache (in swapin and swapoff cases), with shmem_replace_page() to allocate and substitute a suitable page (given gma500/gem.c's mapping_set_gfp_mask GFP_KERNEL | __GFP_DMA32). This does involve a minor extension to mem_cgroup_replace_page_cache() (the page may or may not have already been charged); and I've removed a comment and call to mem_cgroup_uncharge_cache_page(), which in fact is always a no-op while PageSwapCache. Also removed optimization of an unlikely path in shmem_getpage_gfp(), now that we need to check PageSwapCache more carefully (a racing caller might already have made the copy). And at one point shmem_unuse_inode() needs to use the hitherto private page_swapcount(), to guard against racing with inode eviction. It would make sense to extend shmem_should_replace_page(), to cover cpuset and NUMA mempolicy restrictions too, but set that aside for now: needs a cleanup of shmem mempolicy handling, and more testing, and ought to handle swap faults in do_swap_page() as well as shmem. Change-Id: I45bd35a42eab75c8ad1db7a7ae30b924718b2b67 Signed-off-by: Hugh Dickins Cc: Christoph Hellwig Acked-by: KAMEZAWA Hiroyuki Cc: Alan Cox Cc: Stephane Marchesin Cc: Andi Kleen Cc: Dave Airlie Cc: Daniel Vetter Cc: Rob Clark Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 17 ++++-- mm/shmem.c | 141 +++++++++++++++++++++++++++++++++++++++++------- 2 files changed, 135 insertions(+), 23 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index eafff673a652..c5775df2b010 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3372,7 +3372,7 @@ void mem_cgroup_end_migration(struct mem_cgroup *memcg, void mem_cgroup_replace_page_cache(struct page *oldpage, struct page *newpage) { - struct mem_cgroup *memcg; + struct mem_cgroup *memcg = NULL; struct page_cgroup *pc; enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE; @@ -3382,11 +3382,20 @@ void mem_cgroup_replace_page_cache(struct page *oldpage, pc = lookup_page_cgroup(oldpage); /* fix accounting on old pages */ lock_page_cgroup(pc); - memcg = pc->mem_cgroup; - mem_cgroup_charge_statistics(memcg, false, -1); - ClearPageCgroupUsed(pc); + if (PageCgroupUsed(pc)) { + memcg = pc->mem_cgroup; + mem_cgroup_charge_statistics(memcg, false, -1); + ClearPageCgroupUsed(pc); + } unlock_page_cgroup(pc); + /* + * When called from shmem_replace_page(), in some cases the + * oldpage has already been charged, and in some cases not. + */ + if (!memcg) + return; + if (PageSwapBacked(oldpage)) type = MEM_CGROUP_CHARGE_TYPE_SHMEM; diff --git a/mm/shmem.c b/mm/shmem.c index cb019e9c972f..f87d102fbe13 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -105,6 +105,9 @@ static unsigned long shmem_default_max_inodes(void) } #endif +static bool shmem_should_replace_page(struct page *page, gfp_t gfp); +static int shmem_replace_page(struct page **pagep, gfp_t gfp, + struct shmem_inode_info *info, pgoff_t index); static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, struct page **pagep, enum sgp_type sgp, gfp_t gfp, int *fault_type); @@ -606,12 +609,13 @@ static void shmem_evict_inode(struct inode *inode) * If swap found in inode, free it and move page from swapcache to filecache. */ static int shmem_unuse_inode(struct shmem_inode_info *info, - swp_entry_t swap, struct page *page) + swp_entry_t swap, struct page **pagep) { struct address_space *mapping = info->vfs_inode.i_mapping; void *radswap; pgoff_t index; - int error; + gfp_t gfp; + int error = 0; radswap = swp_to_radix_entry(swap); index = radix_tree_locate_item(&mapping->page_tree, radswap); @@ -627,22 +631,37 @@ static int shmem_unuse_inode(struct shmem_inode_info *info, if (shmem_swaplist.next != &info->swaplist) list_move_tail(&shmem_swaplist, &info->swaplist); + gfp = mapping_gfp_mask(mapping); + if (shmem_should_replace_page(*pagep, gfp)) { + mutex_unlock(&shmem_swaplist_mutex); + error = shmem_replace_page(pagep, gfp, info, index); + mutex_lock(&shmem_swaplist_mutex); + /* + * We needed to drop mutex to make that restrictive page + * allocation; but the inode might already be freed by now, + * and we cannot refer to inode or mapping or info to check. + * However, we do hold page lock on the PageSwapCache page, + * so can check if that still has our reference remaining. + */ + if (!page_swapcount(*pagep)) + error = -ENOENT; + } + /* * We rely on shmem_swaplist_mutex, not only to protect the swaplist, * but also to hold up shmem_evict_inode(): so inode cannot be freed * beneath us (pagelock doesn't help until the page is in pagecache). */ - error = shmem_add_to_page_cache(page, mapping, index, + if (!error) + error = shmem_add_to_page_cache(*pagep, mapping, index, GFP_NOWAIT, radswap); - /* which does mem_cgroup_uncharge_cache_page on error */ - if (error != -ENOMEM) { /* * Truncation and eviction use free_swap_and_cache(), which * only does trylock page: if we raced, best clean up here. */ - delete_from_swap_cache(page); - set_page_dirty(page); + delete_from_swap_cache(*pagep); + set_page_dirty(*pagep); if (!error) { spin_lock(&info->lock); info->swapped--; @@ -662,7 +681,14 @@ int shmem_unuse(swp_entry_t swap, struct page *page) struct list_head *this, *next; struct shmem_inode_info *info; int found = 0; - int error; + int error = 0; + + /* + * There's a faint possibility that swap page was replaced before + * caller locked it: it will come back later with the right page. + */ + if (unlikely(!PageSwapCache(page))) + goto out; /* * Charge page using GFP_KERNEL while we can wait, before taking @@ -678,7 +704,7 @@ int shmem_unuse(swp_entry_t swap, struct page *page) list_for_each_safe(this, next, &shmem_swaplist) { info = list_entry(this, struct shmem_inode_info, swaplist); if (info->swapped) - found = shmem_unuse_inode(info, swap, page); + found = shmem_unuse_inode(info, swap, &page); else list_del_init(&info->swaplist); cond_resched(); @@ -687,8 +713,6 @@ int shmem_unuse(swp_entry_t swap, struct page *page) } mutex_unlock(&shmem_swaplist_mutex); - if (!found) - mem_cgroup_uncharge_cache_page(page); if (found < 0) error = found; out: @@ -863,6 +887,84 @@ static inline struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo) } #endif +/* + * When a page is moved from swapcache to shmem filecache (either by the + * usual swapin of shmem_getpage_gfp(), or by the less common swapoff of + * shmem_unuse_inode()), it may have been read in earlier from swap, in + * ignorance of the mapping it belongs to. If that mapping has special + * constraints (like the gma500 GEM driver, which requires RAM below 4GB), + * we may need to copy to a suitable page before moving to filecache. + * + * In a future release, this may well be extended to respect cpuset and + * NUMA mempolicy, and applied also to anonymous pages in do_swap_page(); + * but for now it is a simple matter of zone. + */ +static bool shmem_should_replace_page(struct page *page, gfp_t gfp) +{ + return page_zonenum(page) > gfp_zone(gfp); +} + +static int shmem_replace_page(struct page **pagep, gfp_t gfp, + struct shmem_inode_info *info, pgoff_t index) +{ + struct page *oldpage, *newpage; + struct address_space *swap_mapping; + pgoff_t swap_index; + int error; + + oldpage = *pagep; + swap_index = page_private(oldpage); + swap_mapping = page_mapping(oldpage); + + /* + * We have arrived here because our zones are constrained, so don't + * limit chance of success by further cpuset and node constraints. + */ + gfp &= ~GFP_CONSTRAINT_MASK; + newpage = shmem_alloc_page(gfp, info, index); + if (!newpage) + return -ENOMEM; + VM_BUG_ON(shmem_should_replace_page(newpage, gfp)); + + *pagep = newpage; + page_cache_get(newpage); + copy_highpage(newpage, oldpage); + + VM_BUG_ON(!PageLocked(oldpage)); + __set_page_locked(newpage); + VM_BUG_ON(!PageUptodate(oldpage)); + SetPageUptodate(newpage); + VM_BUG_ON(!PageSwapBacked(oldpage)); + SetPageSwapBacked(newpage); + VM_BUG_ON(!swap_index); + set_page_private(newpage, swap_index); + VM_BUG_ON(!PageSwapCache(oldpage)); + SetPageSwapCache(newpage); + + /* + * Our caller will very soon move newpage out of swapcache, but it's + * a nice clean interface for us to replace oldpage by newpage there. + */ + spin_lock_irq(&swap_mapping->tree_lock); + error = shmem_radix_tree_replace(swap_mapping, swap_index, oldpage, + newpage); + __inc_zone_page_state(newpage, NR_FILE_PAGES); + __dec_zone_page_state(oldpage, NR_FILE_PAGES); + spin_unlock_irq(&swap_mapping->tree_lock); + BUG_ON(error); + + mem_cgroup_replace_page_cache(oldpage, newpage); + lru_cache_add_anon(newpage); + + ClearPageSwapCache(oldpage); + set_page_private(oldpage, 0); + + unlock_page(oldpage); + page_cache_release(oldpage); + page_cache_release(oldpage); + return 0; +} + /* * shmem_getpage_gfp - find page in cache, or get from swap, or allocate * @@ -931,19 +1033,20 @@ repeat: /* We have to do this with page locked to prevent races */ lock_page(page); + if (!PageSwapCache(page) || page->mapping) { + error = -EEXIST; /* try again */ + goto failed; + } if (!PageUptodate(page)) { error = -EIO; goto failed; } wait_on_page_writeback(page); - /* Someone may have already done it for us */ - if (page->mapping) { - if (page->mapping == mapping && - page->index == index) - goto done; - error = -EEXIST; - goto failed; + if (shmem_should_replace_page(page, gfp)) { + error = shmem_replace_page(&page, gfp, info, index); + if (error) + goto failed; } error = mem_cgroup_cache_charge(page, current->mm, @@ -1006,7 +1109,7 @@ repeat: if (sgp == SGP_DIRTY) set_page_dirty(page); } -done: + /* Perhaps the file has been truncated since we checked */ if (sgp != SGP_WRITE && ((loff_t)index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) { From 2b4bec446dfea3dffba124a36b0e0e170c7c1a4c Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 29 May 2012 15:06:38 -0700 Subject: [PATCH 06/54] tmpfs: enable NOSEC optimization Let tmpfs into the NOSEC optimization (avoiding file_remove_suid() overhead on most common writes): set MS_NOSEC on its superblocks. Change-Id: I72fe4524333306472995ea3d8dc7927ac98b31b5 Signed-off-by: Hugh Dickins Cc: Christoph Hellwig Cc: Andi Kleen Cc: Al Viro Cc: Cong Wang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/shmem.c | 1 + 1 file changed, 1 insertion(+) diff --git a/mm/shmem.c b/mm/shmem.c index f87d102fbe13..350af185f858 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2482,6 +2482,7 @@ int shmem_fill_super(struct super_block *sb, void *data, int silent) } } sb->s_export_op = &shmem_export_ops; + sb->s_flags |= MS_NOSEC; #else sb->s_flags |= MS_NOUSER; #endif From ad041dd6a6a1cce3cb63575a53fcf432fee5f351 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 29 May 2012 15:06:39 -0700 Subject: [PATCH 07/54] tmpfs: optimize clearing when writing Nick proposed years ago that tmpfs should avoid clearing its pages where write will overwrite them with new data, as ramfs has long done. But I messed it up and just got bad data. Tried again recently, it works fine. Here's time output for writing 4GiB 16 times on this Core i5 laptop: before: real 0m21.169s user 0m0.028s sys 0m21.057s real 0m21.382s user 0m0.016s sys 0m21.289s real 0m21.311s user 0m0.020s sys 0m21.217s after: real 0m18.273s user 0m0.032s sys 0m18.165s real 0m18.354s user 0m0.020s sys 0m18.265s real 0m18.440s user 0m0.032s sys 0m18.337s ramfs: real 0m16.860s user 0m0.028s sys 0m16.765s real 0m17.382s user 0m0.040s sys 0m17.273s real 0m17.133s user 0m0.044s sys 0m17.021s Yes, I have done perf reports, but they need more explanation than they deserve: in summary, clear_page vanishes, its cache loading shifts into copy_user_generic_unrolled; shmem_getpage_gfp goes down, and surprisingly mark_page_accessed goes way up - I think because they are respectively where the cache gets to be reloaded after being purged by clear or copy. Change-Id: I3f32de9543820c0c827e7e89b3481e4fe6670dff Suggested-by: Nick Piggin Signed-off-by: Hugh Dickins Cc: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/shmem.c | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index 350af185f858..2de40e858352 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1103,9 +1103,14 @@ repeat: shmem_recalc_inode(inode); spin_unlock(&info->lock); - clear_highpage(page); - flush_dcache_page(page); - SetPageUptodate(page); + /* + * Let SGP_WRITE caller clear ends if write does not fill page + */ + if (sgp != SGP_WRITE) { + clear_highpage(page); + flush_dcache_page(page); + SetPageUptodate(page); + } if (sgp == SGP_DIRTY) set_page_dirty(page); } @@ -1315,6 +1320,14 @@ shmem_write_end(struct file *file, struct address_space *mapping, if (pos + copied > inode->i_size) i_size_write(inode, pos + copied); + if (!PageUptodate(page)) { + if (copied < PAGE_CACHE_SIZE) { + unsigned from = pos & (PAGE_CACHE_SIZE - 1); + zero_user_segments(page, 0, from, + from + copied, PAGE_CACHE_SIZE); + } + SetPageUptodate(page); + } set_page_dirty(page); unlock_page(page); page_cache_release(page); @@ -1810,6 +1823,7 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s kaddr = kmap_atomic(page); memcpy(kaddr, symname, len); kunmap_atomic(kaddr); + SetPageUptodate(page); set_page_dirty(page); unlock_page(page); page_cache_release(page); From 89097b0ee5759d3220f215220d314e9e4c78fca2 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 29 May 2012 15:06:40 -0700 Subject: [PATCH 08/54] tmpfs: support fallocate FALLOC_FL_PUNCH_HOLE tmpfs has supported hole-punching since 2.6.16, via madvise(,,MADV_REMOVE). But nowadays fallocate(,FALLOC_FL_PUNCH_HOLE|FALLOC_FL_KEEP_SIZE,,) is the agreed way to punch holes. So add shmem_fallocate() to support that, and tweak shmem_truncate_range() to support partial pages at both the beginning and end of range (never needed for madvise, which demands rounded addr and rounds up length). Change-Id: If3817fc74a6fb6da898be7e0bedce729113b6530 Based-on-patch-by: Cong Wang Signed-off-by: Hugh Dickins Cc: Christoph Hellwig Cc: Cong Wang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/shmem.c | 68 +++++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 57 insertions(+), 11 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index 2de40e858352..5a43060d2490 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -53,6 +53,7 @@ static struct vfsmount *shm_mnt; #include #include #include +#include #include #include #include @@ -434,21 +435,23 @@ void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend) struct address_space *mapping = inode->i_mapping; struct shmem_inode_info *info = SHMEM_I(inode); pgoff_t start = (lstart + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; - unsigned partial = lstart & (PAGE_CACHE_SIZE - 1); - pgoff_t end = (lend >> PAGE_CACHE_SHIFT); + pgoff_t end = (lend + 1) >> PAGE_CACHE_SHIFT; + unsigned int partial_start = lstart & (PAGE_CACHE_SIZE - 1); + unsigned int partial_end = (lend + 1) & (PAGE_CACHE_SIZE - 1); struct pagevec pvec; pgoff_t indices[PAGEVEC_SIZE]; long nr_swaps_freed = 0; pgoff_t index; int i; - BUG_ON((lend & (PAGE_CACHE_SIZE - 1)) != (PAGE_CACHE_SIZE - 1)); + if (lend == -1) + end = -1; /* unsigned, so actually very big */ pagevec_init(&pvec, 0); index = start; - while (index <= end) { + while (index < end) { pvec.nr = shmem_find_get_pages_and_swap(mapping, index, - min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1, + min(end - index, (pgoff_t)PAGEVEC_SIZE), pvec.pages, indices); if (!pvec.nr) break; @@ -457,7 +460,7 @@ void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend) struct page *page = pvec.pages[i]; index = indices[i]; - if (index > end) + if (index >= end) break; if (radix_tree_exceptional_entry(page)) { @@ -481,22 +484,39 @@ void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend) index++; } - if (partial) { + if (partial_start) { struct page *page = NULL; shmem_getpage(inode, start - 1, &page, SGP_READ, NULL); if (page) { - zero_user_segment(page, partial, PAGE_CACHE_SIZE); + unsigned int top = PAGE_CACHE_SIZE; + if (start > end) { + top = partial_end; + partial_end = 0; + } + zero_user_segment(page, partial_start, top); set_page_dirty(page); unlock_page(page); page_cache_release(page); } } + if (partial_end) { + struct page *page = NULL; + shmem_getpage(inode, end, &page, SGP_READ, NULL); + if (page) { + zero_user_segment(page, 0, partial_end); + set_page_dirty(page); + unlock_page(page); + page_cache_release(page); + } + } + if (start >= end) + return; index = start; for ( ; ; ) { cond_resched(); pvec.nr = shmem_find_get_pages_and_swap(mapping, index, - min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1, + min(end - index, (pgoff_t)PAGEVEC_SIZE), pvec.pages, indices); if (!pvec.nr) { if (index == start) @@ -504,7 +524,7 @@ void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend) index = start; continue; } - if (index == start && indices[0] > end) { + if (index == start && indices[0] >= end) { shmem_deswap_pagevec(&pvec); pagevec_release(&pvec); break; @@ -514,7 +534,7 @@ void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend) struct page *page = pvec.pages[i]; index = indices[i]; - if (index > end) + if (index >= end) break; if (radix_tree_exceptional_entry(page)) { @@ -1587,6 +1607,31 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos, return error; } +static long shmem_fallocate(struct file *file, int mode, loff_t offset, + loff_t len) +{ + struct inode *inode = file->f_path.dentry->d_inode; + int error = -EOPNOTSUPP; + + mutex_lock(&inode->i_mutex); + + if (mode & FALLOC_FL_PUNCH_HOLE) { + struct address_space *mapping = file->f_mapping; + loff_t unmap_start = round_up(offset, PAGE_SIZE); + loff_t unmap_end = round_down(offset + len, PAGE_SIZE) - 1; + + if ((u64)unmap_end > (u64)unmap_start) + unmap_mapping_range(mapping, unmap_start, + 1 + unmap_end - unmap_start, 0); + shmem_truncate_range(inode, offset, offset + len - 1); + /* No need to unmap again: hole-punching leaves COWed pages */ + error = 0; + } + + mutex_unlock(&inode->i_mutex); + return error; +} + static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf) { struct shmem_sb_info *sbinfo = SHMEM_SB(dentry->d_sb); @@ -2599,6 +2644,7 @@ static const struct file_operations shmem_file_operations = { .fsync = noop_fsync, .splice_read = shmem_file_splice_read, .splice_write = generic_file_splice_write, + .fallocate = shmem_fallocate, #endif }; From 1c30f2c46598fb712b0a95b46e380af31ebe8729 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 29 May 2012 15:06:41 -0700 Subject: [PATCH 09/54] mm/fs: remove truncate_range Remove vmtruncate_range(), and remove the truncate_range method from struct inode_operations: only tmpfs ever supported it, and tmpfs has now converted over to using the fallocate method of file_operations. Update Documentation accordingly, adding (setlease and) fallocate lines. And while we're in mm.h, remove duplicate declarations of shmem_lock() and shmem_file_setup(): everyone is now using the ones in shmem_fs.h. Change-Id: I1452e3356333d19b778ef60d6cf7625c31b77bf8 Based-on-patch-by: Cong Wang Signed-off-by: Hugh Dickins Cc: Christoph Hellwig Cc: Cong Wang Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/filesystems/Locking | 2 -- Documentation/filesystems/vfs.txt | 13 ++++++++----- fs/bad_inode.c | 1 - include/linux/fs.h | 1 - include/linux/mm.h | 4 ---- mm/shmem.c | 1 - mm/truncate.c | 25 ------------------------- 7 files changed, 8 insertions(+), 39 deletions(-) diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking index 7248c523a411..9d21decb74ed 100644 --- a/Documentation/filesystems/Locking +++ b/Documentation/filesystems/Locking @@ -59,7 +59,6 @@ prototypes: ssize_t (*getxattr) (struct dentry *, const char *, void *, size_t); ssize_t (*listxattr) (struct dentry *, char *, size_t); int (*removexattr) (struct dentry *, const char *); - void (*truncate_range)(struct inode *, loff_t, loff_t); int (*fiemap)(struct inode *, struct fiemap_extent_info *, u64 start, u64 len); void (*update_time)(struct inode *, struct timespec *, int); int (*atomic_open)(struct inode *, struct dentry *, @@ -91,7 +90,6 @@ setxattr: yes getxattr: no listxattr: no removexattr: yes -truncate_range: yes fiemap: no update_time: no atomic_open: yes diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt index 8a0b032697cf..85866762a161 100644 --- a/Documentation/filesystems/vfs.txt +++ b/Documentation/filesystems/vfs.txt @@ -363,7 +363,6 @@ struct inode_operations { ssize_t (*getxattr) (struct dentry *, const char *, void *, size_t); ssize_t (*listxattr) (struct dentry *, char *, size_t); int (*removexattr) (struct dentry *, const char *); - void (*truncate_range)(struct inode *, loff_t, loff_t); void (*update_time)(struct inode *, struct timespec *, int); int (*atomic_open)(struct inode *, struct dentry *, struct file *, unsigned open_flag, umode_t create_mode, int *opened); @@ -476,9 +475,6 @@ otherwise noted. removexattr: called by the VFS to remove an extended attribute from a file. This method is called by removexattr(2) system call. - truncate_range: a method provided by the underlying filesystem to truncate a - range of blocks , i.e. punch a hole somewhere in a file. - update_time: called by the VFS to update a specific time or the i_version of an inode. If this is not defined the VFS will update the inode itself and call mark_inode_dirty_sync. @@ -780,7 +776,7 @@ struct file_operations ---------------------- This describes how the VFS can manipulate an open file. As of kernel -2.6.22, the following members are defined: +3.5, the following members are defined: struct file_operations { struct module *owner; @@ -810,6 +806,8 @@ struct file_operations { int (*flock) (struct file *, int, struct file_lock *); ssize_t (*splice_write)(struct pipe_inode_info *, struct file *, size_t, unsigned int); ssize_t (*splice_read)(struct file *, struct pipe_inode_info *, size_t, unsigned int); + int (*setlease)(struct file *, long arg, struct file_lock **); + long (*fallocate)(struct file *, int mode, loff_t offset, loff_t len); }; Again, all methods are called without any locks being held, unless @@ -878,6 +876,11 @@ otherwise noted. splice_read: called by the VFS to splice data from file to a pipe. This method is used by the splice(2) system call + setlease: called by the VFS to set or release a file lock lease. + setlease has the file_lock_lock held and must not sleep. + + fallocate: called by the VFS to preallocate blocks or punch a hole. + Note that the file operations are implemented by the specific filesystem in which the inode resides. When opening a device node (character or block special) most filesystems will call special diff --git a/fs/bad_inode.c b/fs/bad_inode.c index 83aefc5b58df..b1342ffb3cf6 100644 --- a/fs/bad_inode.c +++ b/fs/bad_inode.c @@ -292,7 +292,6 @@ static const struct inode_operations bad_inode_ops = .getxattr = bad_inode_getxattr, .listxattr = bad_inode_listxattr, .removexattr = bad_inode_removexattr, - /* truncate_range returns void */ }; diff --git a/include/linux/fs.h b/include/linux/fs.h index ac139078c4a6..0b65d9a8c3da 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1709,7 +1709,6 @@ struct inode_operations { ssize_t (*getxattr) (struct dentry *, const char *, void *, size_t); ssize_t (*listxattr) (struct dentry *, char *, size_t); int (*removexattr) (struct dentry *, const char *); - void (*truncate_range)(struct inode *, loff_t, loff_t); int (*fiemap)(struct inode *, struct fiemap_extent_info *, u64 start, u64 len); int (*update_time)(struct inode *, struct timespec *, int); diff --git a/include/linux/mm.h b/include/linux/mm.h index ba4a1e743297..06a95f01ea37 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -878,8 +878,6 @@ extern void pagefault_out_of_memory(void); extern void show_free_areas(unsigned int flags); extern bool skip_free_areas_node(unsigned int flags, int nid); -int shmem_lock(struct file *file, int lock, struct user_struct *user); -struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags); void shmem_set_file(struct vm_area_struct *vma, struct file *file); int shmem_zero_setup(struct vm_area_struct *); @@ -962,11 +960,9 @@ extern void truncate_pagecache(struct inode *inode, loff_t old, loff_t new); extern void truncate_setsize(struct inode *inode, loff_t newsize); void pagecache_isize_extended(struct inode *inode, loff_t from, loff_t to); extern int vmtruncate(struct inode *inode, loff_t offset); -extern int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end); void truncate_pagecache_range(struct inode *inode, loff_t offset, loff_t end); int truncate_inode_page(struct address_space *mapping, struct page *page); int generic_error_remove_page(struct address_space *mapping, struct page *page); - int invalidate_inode_page(struct page *page); #ifdef CONFIG_MMU diff --git a/mm/shmem.c b/mm/shmem.c index 5a43060d2490..18e24691bbea 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2650,7 +2650,6 @@ static const struct file_operations shmem_file_operations = { static const struct inode_operations shmem_inode_operations = { .setattr = shmem_setattr, - .truncate_range = shmem_truncate_range, #ifdef CONFIG_TMPFS_XATTR .setxattr = shmem_setxattr, .getxattr = shmem_getxattr, diff --git a/mm/truncate.c b/mm/truncate.c index 584e3194e979..57625f7ed8e1 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -658,31 +658,6 @@ int vmtruncate(struct inode *inode, loff_t newsize) } EXPORT_SYMBOL(vmtruncate); -int vmtruncate_range(struct inode *inode, loff_t lstart, loff_t lend) -{ - struct address_space *mapping = inode->i_mapping; - loff_t holebegin = round_up(lstart, PAGE_SIZE); - loff_t holelen = 1 + lend - holebegin; - - /* - * If the underlying filesystem is not going to provide - * a way to truncate a range of blocks (punch a hole) - - * we should return failure right now. - */ - if (!inode->i_op->truncate_range) - return -ENOSYS; - - mutex_lock(&inode->i_mutex); - inode_dio_wait(inode); - unmap_mapping_range(mapping, holebegin, holelen, 1); - inode->i_op->truncate_range(inode, lstart, lend); - /* unmap again to remove racily COWed private pages */ - unmap_mapping_range(mapping, holebegin, holelen, 1); - mutex_unlock(&inode->i_mutex); - - return 0; -} - /** * truncate_pagecache_range - unmap and remove pagecache that is hole-punched * @inode: inode From da86902e2fe95cbd07edd1f917b46c94d64a59f4 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 29 May 2012 15:06:40 -0700 Subject: [PATCH 10/54] mm/fs: route MADV_REMOVE to FALLOC_FL_PUNCH_HOLE Now tmpfs supports hole-punching via fallocate(), switch madvise_remove() to use do_fallocate() instead of vmtruncate_range(): which extends madvise(,,MADV_REMOVE) support from tmpfs to ext4, ocfs2 and xfs. There is one more user of vmtruncate_range() in our tree, staging/android's ashmem_shrink(): convert it to use do_fallocate() too (but if its unpinned areas are already unmapped - I don't know - then it would do better to use shmem_truncate_range() directly). Change-Id: Ic645f5fffb37066a2f86d2a4d7367118bb5c0395 Based-on-patch-by: Cong Wang Signed-off-by: Hugh Dickins Cc: Christoph Hellwig Cc: Al Viro Cc: Colin Cross Cc: John Stultz Cc: Greg Kroah-Hartman Cc: "Theodore Ts'o" Cc: Andreas Dilger Cc: Mark Fasheh Cc: Joel Becker Cc: Dave Chinner Cc: Ben Myers Cc: Michael Kerrisk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/staging/android/ashmem.c | 8 +++++--- mm/madvise.c | 15 +++++++-------- 2 files changed, 12 insertions(+), 11 deletions(-) diff --git a/drivers/staging/android/ashmem.c b/drivers/staging/android/ashmem.c index 2467303603f8..06eb673259a7 100644 --- a/drivers/staging/android/ashmem.c +++ b/drivers/staging/android/ashmem.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -375,11 +376,12 @@ static int ashmem_shrink(struct shrinker *s, struct shrink_control *sc) return -1; list_for_each_entry_safe(range, next, &ashmem_lru_list, lru) { - struct inode *inode = range->asma->file->f_dentry->d_inode; loff_t start = range->pgstart * PAGE_SIZE; - loff_t end = (range->pgend + 1) * PAGE_SIZE - 1; + loff_t end = (range->pgend + 1) * PAGE_SIZE; - vmtruncate_range(inode, start, end); + do_fallocate(range->asma->file, + FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, + start, end - start); range->purged = ASHMEM_WAS_PURGED; lru_del(range); diff --git a/mm/madvise.c b/mm/madvise.c index b075d1d1f6c8..1a58ddfb53a0 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -11,8 +11,10 @@ #include #include #include +#include #include #include +#include #include /* @@ -202,8 +204,7 @@ static long madvise_remove(struct vm_area_struct *vma, struct vm_area_struct **prev, unsigned long start, unsigned long end) { - struct address_space *mapping; - loff_t offset, endoff; + loff_t offset; int error; struct file *f; @@ -221,22 +222,20 @@ static long madvise_remove(struct vm_area_struct *vma, if ((vma->vm_flags & (VM_SHARED|VM_WRITE)) != (VM_SHARED|VM_WRITE)) return -EACCES; - mapping = vma->vm_file->f_mapping; - offset = (loff_t)(start - vma->vm_start) + ((loff_t)vma->vm_pgoff << PAGE_SHIFT); - endoff = (loff_t)(end - vma->vm_start - 1) - + ((loff_t)vma->vm_pgoff << PAGE_SHIFT); /* - * vmtruncate_range may need to take i_mutex. We need to + * Filesystem's fallocate may need to take i_mutex. We need to * explicitly grab a reference because the vma (and hence the * vma's reference to the file) can go away as soon as we drop * mmap_sem. */ get_file(f); up_read(¤t->mm->mmap_sem); - error = vmtruncate_range(mapping->host, offset, endoff); + error = do_fallocate(f, + FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, + offset, end - start); fput(f); down_read(¤t->mm->mmap_sem); return error; From 4c700db44cd8b513ee4ad1f988778c509912c10b Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 29 May 2012 15:06:41 -0700 Subject: [PATCH 11/54] tmpfs: support fallocate preallocation The systemd plumbers expressed a wish that tmpfs support preallocation. Cong Wang wrote a patch, but several kernel guys expressed scepticism: https://lkml.org/lkml/2011/11/18/137 Christoph Hellwig: What for exactly? Please explain why preallocating on tmpfs would make any sense. Kay Sievers: To be able to safely use mmap(), regarding SIGBUS, on files on the /dev/shm filesystem. The glibc fallback loop for -ENOSYS [or -EOPNOTSUPP] on fallocate is just ugly. Hugh Dickins: If tmpfs is going to support fallocate(FALLOC_FL_PUNCH_HOLE), it would seem perverse to permit the deallocation but fail the allocation. Christoph Hellwig: Agreed. Now that we do have shmem_fallocate() for hole-punching, plumb in basic support for preallocation mode too. It's fairly straightforward (though quite a few details needed attention), except for when it fails part way through. What a pity that fallocate(2) was not specified to return the length allocated, permitting short fallocations! As it is, when it fails part way through, we ought to free what has just been allocated by this system call; but must be very sure not to free any allocated earlier, or any allocated by racing accesses (not all excluded by i_mutex). But we cannot distinguish them: so in this patch simply leak allocations on partial failure (they will be freed later if the file is removed). An attractive alternative approach would have been for fallocate() not to allocate pages at all, but note reservations by entries in the radix-tree. But that would give less assurance, and, critically, would be hard to fit with mem cgroups (who owns the reservations?): allocating pages lets fallocate() behave in just the same way as write(). Change-Id: I48e9a02d087213ffd74ffd65f2f29d71bcf07eab Based-on-patch-by: Cong Wang Signed-off-by: Hugh Dickins Cc: Christoph Hellwig Cc: Cong Wang Cc: Kay Sievers Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/shmem.c | 61 +++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 60 insertions(+), 1 deletion(-) diff --git a/mm/shmem.c b/mm/shmem.c index 18e24691bbea..699655097438 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1611,7 +1611,9 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset, loff_t len) { struct inode *inode = file->f_path.dentry->d_inode; - int error = -EOPNOTSUPP; + struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); + pgoff_t start, index, end; + int error; mutex_lock(&inode->i_mutex); @@ -1626,8 +1628,65 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset, shmem_truncate_range(inode, offset, offset + len - 1); /* No need to unmap again: hole-punching leaves COWed pages */ error = 0; + goto out; } + /* We need to check rlimit even when FALLOC_FL_KEEP_SIZE */ + error = inode_newsize_ok(inode, offset + len); + if (error) + goto out; + + start = offset >> PAGE_CACHE_SHIFT; + end = (offset + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + /* Try to avoid a swapstorm if len is impossible to satisfy */ + if (sbinfo->max_blocks && end - start > sbinfo->max_blocks) { + error = -ENOSPC; + goto out; + } + + for (index = start; index < end; index++) { + struct page *page; + + /* + * Good, the fallocate(2) manpage permits EINTR: we may have + * been interrupted because we are using up too much memory. + */ + if (signal_pending(current)) + error = -EINTR; + else + error = shmem_getpage(inode, index, &page, SGP_WRITE, + NULL); + if (error) { + /* + * We really ought to free what we allocated so far, + * but it would be wrong to free pages allocated + * earlier, or already now in use: i_mutex does not + * exclude all cases. We do not know what to free. + */ + goto ctime; + } + + if (!PageUptodate(page)) { + clear_highpage(page); + flush_dcache_page(page); + SetPageUptodate(page); + } + /* + * set_page_dirty so that memory pressure will swap rather + * than free the pages we are allocating (and SGP_CACHE pages + * might still be clean: we now need to mark those dirty too). + */ + set_page_dirty(page); + unlock_page(page); + page_cache_release(page); + cond_resched(); + } + + if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size) + i_size_write(inode, offset + len); +ctime: + inode->i_ctime = CURRENT_TIME; +out: mutex_unlock(&inode->i_mutex); return error; } From 6d687e0f9b055ac1346596ea9fa6de65c05598c7 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 29 May 2012 15:06:42 -0700 Subject: [PATCH 12/54] tmpfs: undo fallocation on failure In the previous episode, we left the already-fallocated pages attached to the file when shmem_fallocate() fails part way through. Now try to do better, by extending the earlier optimization of !Uptodate pages (then always under page lock) to !Uptodate pages (outside of page lock), representing fallocated pages. And don't waste time clearing them at the time of fallocate(), leave that until later if necessary. Adapt shmem_truncate_range() to shmem_undo_range(), so that a failing fallocate can recognize and remove precisely those !Uptodate allocations which it added (and were not independently allocated by racing tasks). But unless we start playing with swapfile.c and memcontrol.c too, once one of our fallocated pages reaches shmem_writepage(), we do then have to instantiate it as an ordinarily allocated page, before swapping out. This is unsatisfactory, but improved in the next episode. Change-Id: I6220082ed23caded4c1d022ad53f387270ee4ec8 Signed-off-by: Hugh Dickins Cc: Christoph Hellwig Cc: Cong Wang Cc: Kay Sievers Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/shmem.c | 105 ++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 72 insertions(+), 33 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index 699655097438..4c4537bcbfcd 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -91,7 +91,8 @@ enum sgp_type { SGP_READ, /* don't exceed i_size, don't allocate page */ SGP_CACHE, /* don't exceed i_size, may allocate page */ SGP_DIRTY, /* like SGP_CACHE, but set new page dirty */ - SGP_WRITE, /* may exceed i_size, may allocate page */ + SGP_WRITE, /* may exceed i_size, may allocate !Uptodate page */ + SGP_FALLOC, /* like SGP_WRITE, but make existing page Uptodate */ }; #ifdef CONFIG_TMPFS @@ -429,8 +430,10 @@ void shmem_unlock_mapping(struct address_space *mapping) /* * Remove range of pages and swap entries from radix tree, and free them. + * If !unfalloc, truncate or punch hole; if unfalloc, undo failed fallocate. */ -void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend) +static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, + bool unfalloc) { struct address_space *mapping = inode->i_mapping; struct shmem_inode_info *info = SHMEM_I(inode); @@ -464,6 +467,8 @@ void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend) break; if (radix_tree_exceptional_entry(page)) { + if (unfalloc) + continue; nr_swaps_freed += !shmem_free_swap(mapping, index, page); continue; @@ -471,9 +476,11 @@ void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend) if (!trylock_page(page)) continue; - if (page->mapping == mapping) { - VM_BUG_ON(PageWriteback(page)); - truncate_inode_page(mapping, page); + if (!unfalloc || !PageUptodate(page)) { + if (page->mapping == mapping) { + VM_BUG_ON(PageWriteback(page)); + truncate_inode_page(mapping, page); + } } unlock_page(page); } @@ -519,12 +526,12 @@ void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend) min(end - index, (pgoff_t)PAGEVEC_SIZE), pvec.pages, indices); if (!pvec.nr) { - if (index == start) + if (index == start || unfalloc) break; index = start; continue; } - if (index == start && indices[0] >= end) { + if ((index == start || unfalloc) && indices[0] >= end) { shmem_deswap_pagevec(&pvec); pagevec_release(&pvec); break; @@ -538,15 +545,19 @@ void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend) break; if (radix_tree_exceptional_entry(page)) { + if (unfalloc) + continue; nr_swaps_freed += !shmem_free_swap(mapping, index, page); continue; } lock_page(page); - if (page->mapping == mapping) { - VM_BUG_ON(PageWriteback(page)); - truncate_inode_page(mapping, page); + if (!unfalloc || !PageUptodate(page)) { + if (page->mapping == mapping) { + VM_BUG_ON(PageWriteback(page)); + truncate_inode_page(mapping, page); + } } unlock_page(page); } @@ -560,7 +571,11 @@ void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend) info->swapped -= nr_swaps_freed; shmem_recalc_inode(inode); spin_unlock(&info->lock); +} +void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend) +{ + shmem_undo_range(inode, lstart, lend, false); inode->i_ctime = inode->i_mtime = CURRENT_TIME; } EXPORT_SYMBOL_GPL(shmem_truncate_range); @@ -773,6 +788,18 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) WARN_ON_ONCE(1); /* Still happens? Tell us about it! */ goto redirty; } + + /* + * This is somewhat ridiculous, but without plumbing a SWAP_MAP_FALLOC + * value into swapfile.c, the only way we can correctly account for a + * fallocated page arriving here is now to initialize it and write it. + */ + if (!PageUptodate(page)) { + clear_highpage(page); + flush_dcache_page(page); + SetPageUptodate(page); + } + swap = get_swap_page(); if (!swap.val) goto redirty; @@ -1002,6 +1029,7 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, swp_entry_t swap; int error; int once = 0; + int alloced = 0; if (index > (MAX_LFS_FILESIZE >> PAGE_CACHE_SHIFT)) return -EFBIG; @@ -1013,19 +1041,21 @@ repeat: page = NULL; } - if (sgp != SGP_WRITE && + if (sgp != SGP_WRITE && sgp != SGP_FALLOC && ((loff_t)index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) { error = -EINVAL; goto failed; } + /* fallocated page? */ + if (page && !PageUptodate(page)) { + if (sgp != SGP_READ) + goto clear; + unlock_page(page); + page_cache_release(page); + page = NULL; + } if (page || (sgp == SGP_READ && !swap.val)) { - /* - * Once we can get the page lock, it must be uptodate: - * if there were an error in reading back from swap, - * the page would not be inserted into the filecache. - */ - BUG_ON(page && !PageUptodate(page)); *pagep = page; return 0; } @@ -1122,9 +1152,18 @@ repeat: inode->i_blocks += BLOCKS_PER_PAGE; shmem_recalc_inode(inode); spin_unlock(&info->lock); + alloced = true; /* - * Let SGP_WRITE caller clear ends if write does not fill page + * Let SGP_FALLOC use the SGP_WRITE optimization on a new page. + */ + if (sgp == SGP_FALLOC) + sgp = SGP_WRITE; +clear: + /* + * Let SGP_WRITE caller clear ends if write does not fill page; + * but SGP_FALLOC on a page fallocated earlier must initialize + * it now, lest undo on failure cancel our earlier guarantee. */ if (sgp != SGP_WRITE) { clear_highpage(page); @@ -1136,10 +1175,13 @@ repeat: } /* Perhaps the file has been truncated since we checked */ - if (sgp != SGP_WRITE && + if (sgp != SGP_WRITE && sgp != SGP_FALLOC && ((loff_t)index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) { error = -EINVAL; - goto trunc; + if (alloced) + goto trunc; + else + goto failed; } *pagep = page; return 0; @@ -1148,6 +1190,7 @@ repeat: * Error recovery. */ trunc: + info = SHMEM_I(inode); ClearPageDirty(page); delete_from_page_cache(page); spin_lock(&info->lock); @@ -1155,6 +1198,7 @@ trunc: inode->i_blocks -= BLOCKS_PER_PAGE; spin_unlock(&info->lock); decused: + sbinfo = SHMEM_SB(inode->i_sb); if (sbinfo->max_blocks) percpu_counter_add(&sbinfo->used_blocks, -1); unacct: @@ -1654,25 +1698,20 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset, if (signal_pending(current)) error = -EINTR; else - error = shmem_getpage(inode, index, &page, SGP_WRITE, + error = shmem_getpage(inode, index, &page, SGP_FALLOC, NULL); if (error) { - /* - * We really ought to free what we allocated so far, - * but it would be wrong to free pages allocated - * earlier, or already now in use: i_mutex does not - * exclude all cases. We do not know what to free. - */ + /* Remove the !PageUptodate pages we added */ + shmem_undo_range(inode, + (loff_t)start << PAGE_CACHE_SHIFT, + (loff_t)index << PAGE_CACHE_SHIFT, true); goto ctime; } - if (!PageUptodate(page)) { - clear_highpage(page); - flush_dcache_page(page); - SetPageUptodate(page); - } /* - * set_page_dirty so that memory pressure will swap rather + * If !PageUptodate, leave it that way so that freeable pages + * can be recognized if we need to rollback on error later. + * But set_page_dirty so that memory pressure will swap rather * than free the pages we are allocating (and SGP_CACHE pages * might still be clean: we now need to mark those dirty too). */ From 8cb17b36c9f2e464bc632b0710c78d754af7c0cc Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 29 May 2012 15:06:42 -0700 Subject: [PATCH 13/54] tmpfs: quit when fallocate fills memory As it stands, a large fallocate() on tmpfs is liable to fill memory with pages, freed on failure except when they run into swap, at which point they become fixed into the file despite the failure. That feels quite wrong, to be consuming resources precisely when they're in short supply. Go the other way instead: shmem_fallocate() indicate the range it has fallocated to shmem_writepage(), keeping count of pages it's allocating; shmem_writepage() reactivate instead of swapping out pages fallocated by this syscall (but happily swap out those from earlier occasions), keeping count; shmem_fallocate() compare counts and give up once the reactivated pages have started to coming back to writepage (approximately: some zones would in fact recycle faster than others). This is a little unusual, but works well: although we could consider the failure to swap as a bug, and fix it later with SWAP_MAP_FALLOC handling added in swapfile.c and memcontrol.c, I doubt that we shall ever want to. (If there's no swap, an over-large fallocate() on tmpfs is limited in the same way as writing: stopped by rlimit, or by tmpfs mount size if that was set sensibly, or by __vm_enough_memory() heuristics if OVERCOMMIT_GUESS or OVERCOMMIT_NEVER. If OVERCOMMIT_ALWAYS, then it is liable to OOM-kill others as writing would, but stops and frees if interrupted.) Now that everything is freed on failure, we can then skip updating ctime. Change-Id: I3c26e625046b0a2a243982aebe147b58ce32b3d3 Signed-off-by: Hugh Dickins Cc: Christoph Hellwig Cc: Cong Wang Cc: Kay Sievers Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/shmem.c | 58 ++++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 56 insertions(+), 2 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index 4c4537bcbfcd..681a3f8bd4ad 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -86,6 +86,18 @@ struct shmem_xattr { char value[0]; }; +/* + * shmem_fallocate and shmem_writepage communicate via inode->i_private + * (with i_mutex making sure that it has only one user at a time): + * we would prefer not to enlarge the shmem inode just for that. + */ +struct shmem_falloc { + pgoff_t start; /* start of range currently being fallocated */ + pgoff_t next; /* the next page offset to be fallocated */ + pgoff_t nr_falloced; /* how many new pages have been fallocated */ + pgoff_t nr_unswapped; /* how often writepage refused to swap out */ +}; + /* Flag allocation requirements to shmem_getpage */ enum sgp_type { SGP_READ, /* don't exceed i_size, don't allocate page */ @@ -793,8 +805,28 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) * This is somewhat ridiculous, but without plumbing a SWAP_MAP_FALLOC * value into swapfile.c, the only way we can correctly account for a * fallocated page arriving here is now to initialize it and write it. + * + * That's okay for a page already fallocated earlier, but if we have + * not yet completed the fallocation, then (a) we want to keep track + * of this page in case we have to undo it, and (b) it may not be a + * good idea to continue anyway, once we're pushing into swap. So + * reactivate the page, and let shmem_fallocate() quit when too many. */ if (!PageUptodate(page)) { + if (inode->i_private) { + struct shmem_falloc *shmem_falloc; + spin_lock(&inode->i_lock); + shmem_falloc = inode->i_private; + if (shmem_falloc && + index >= shmem_falloc->start && + index < shmem_falloc->next) + shmem_falloc->nr_unswapped++; + else + shmem_falloc = NULL; + spin_unlock(&inode->i_lock); + if (shmem_falloc) + goto redirty; + } clear_highpage(page); flush_dcache_page(page); SetPageUptodate(page); @@ -1656,6 +1688,7 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset, { struct inode *inode = file->f_path.dentry->d_inode; struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); + struct shmem_falloc shmem_falloc; pgoff_t start, index, end; int error; @@ -1688,6 +1721,14 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset, goto out; } + shmem_falloc.start = start; + shmem_falloc.next = start; + shmem_falloc.nr_falloced = 0; + shmem_falloc.nr_unswapped = 0; + spin_lock(&inode->i_lock); + inode->i_private = &shmem_falloc; + spin_unlock(&inode->i_lock); + for (index = start; index < end; index++) { struct page *page; @@ -1697,6 +1738,8 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset, */ if (signal_pending(current)) error = -EINTR; + else if (shmem_falloc.nr_unswapped > shmem_falloc.nr_falloced) + error = -ENOMEM; else error = shmem_getpage(inode, index, &page, SGP_FALLOC, NULL); @@ -1705,9 +1748,17 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset, shmem_undo_range(inode, (loff_t)start << PAGE_CACHE_SHIFT, (loff_t)index << PAGE_CACHE_SHIFT, true); - goto ctime; + goto undone; } + /* + * Inform shmem_writepage() how far we have reached. + * No need for lock or barrier: we have the page lock. + */ + shmem_falloc.next++; + if (!PageUptodate(page)) + shmem_falloc.nr_falloced++; + /* * If !PageUptodate, leave it that way so that freeable pages * can be recognized if we need to rollback on error later. @@ -1723,8 +1774,11 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset, if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size) i_size_write(inode, offset + len); -ctime: inode->i_ctime = CURRENT_TIME; +undone: + spin_lock(&inode->i_lock); + inode->i_private = NULL; + spin_unlock(&inode->i_lock); out: mutex_unlock(&inode->i_mutex); return error; From f398c1f737bedf2cd28386a28a9ab67681d91a24 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Thu, 7 Jun 2012 14:21:09 -0700 Subject: [PATCH 14/54] shmem: replace_page must flush_dcache and others Commit bde05d1ccd51 ("shmem: replace page if mapping excludes its zone") is not at all likely to break for anyone, but it was an earlier version from before review feedback was incorporated. Fix that up now. * shmem_replace_page must flush_dcache_page after copy_highpage [akpm] * Expand comment on why shmem_unuse_inode needs page_swapcount [akpm] * Remove excess of VM_BUG_ONs from shmem_replace_page [wangcong] * Check page_private matches swap before calling shmem_replace_page [hughd] * shmem_replace_page allow for unexpected race in radix_tree lookup [hughd] Change-Id: Ifd86f03fdff4cb43b18d5f79ce75120b3b84d3ea Signed-off-by: Hugh Dickins Cc: Cong Wang Cc: Christoph Hellwig Cc: KAMEZAWA Hiroyuki Cc: Alan Cox Cc: Stephane Marchesin Cc: Andi Kleen Cc: Dave Airlie Cc: Daniel Vetter Cc: Rob Clark Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/shmem.c | 57 +++++++++++++++++++++++++++++++++++------------------- 1 file changed, 37 insertions(+), 20 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index 681a3f8bd4ad..1ce01cf384f8 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -685,10 +685,21 @@ static int shmem_unuse_inode(struct shmem_inode_info *info, mutex_lock(&shmem_swaplist_mutex); /* * We needed to drop mutex to make that restrictive page - * allocation; but the inode might already be freed by now, - * and we cannot refer to inode or mapping or info to check. - * However, we do hold page lock on the PageSwapCache page, - * so can check if that still has our reference remaining. + * allocation, but the inode might have been freed while we + * dropped it: although a racing shmem_evict_inode() cannot + * complete without emptying the radix_tree, our page lock + * on this swapcache page is not enough to prevent that - + * free_swap_and_cache() of our swap entry will only + * trylock_page(), removing swap from radix_tree whatever. + * + * We must not proceed to shmem_add_to_page_cache() if the + * inode has been freed, but of course we cannot rely on + * inode or mapping or info to check that. However, we can + * safely check if our swap entry is still in use (and here + * it can't have got reused for another page): if it's still + * in use, then the inode cannot have been freed yet, and we + * can safely proceed (if it's no longer in use, that tells + * nothing about the inode, but we don't need to unuse swap). */ if (!page_swapcount(*pagep)) error = -ENOENT; @@ -732,9 +743,9 @@ int shmem_unuse(swp_entry_t swap, struct page *page) /* * There's a faint possibility that swap page was replaced before - * caller locked it: it will come back later with the right page. + * caller locked it: caller will come back later with the right page. */ - if (unlikely(!PageSwapCache(page))) + if (unlikely(!PageSwapCache(page) || page_private(page) != swap.val)) goto out; /* @@ -1003,21 +1014,15 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp, newpage = shmem_alloc_page(gfp, info, index); if (!newpage) return -ENOMEM; - VM_BUG_ON(shmem_should_replace_page(newpage, gfp)); - *pagep = newpage; page_cache_get(newpage); copy_highpage(newpage, oldpage); + flush_dcache_page(newpage); - VM_BUG_ON(!PageLocked(oldpage)); __set_page_locked(newpage); - VM_BUG_ON(!PageUptodate(oldpage)); SetPageUptodate(newpage); - VM_BUG_ON(!PageSwapBacked(oldpage)); SetPageSwapBacked(newpage); - VM_BUG_ON(!swap_index); set_page_private(newpage, swap_index); - VM_BUG_ON(!PageSwapCache(oldpage)); SetPageSwapCache(newpage); /* @@ -1027,13 +1032,24 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp, spin_lock_irq(&swap_mapping->tree_lock); error = shmem_radix_tree_replace(swap_mapping, swap_index, oldpage, newpage); - __inc_zone_page_state(newpage, NR_FILE_PAGES); - __dec_zone_page_state(oldpage, NR_FILE_PAGES); + if (!error) { + __inc_zone_page_state(newpage, NR_FILE_PAGES); + __dec_zone_page_state(oldpage, NR_FILE_PAGES); + } spin_unlock_irq(&swap_mapping->tree_lock); - BUG_ON(error); - mem_cgroup_replace_page_cache(oldpage, newpage); - lru_cache_add_anon(newpage); + if (unlikely(error)) { + /* + * Is this possible? I think not, now that our callers check + * both PageSwapCache and page_private after getting page lock; + * but be defensive. Reverse old to newpage for clear and free. + */ + oldpage = newpage; + } else { + mem_cgroup_replace_page_cache(oldpage, newpage); + lru_cache_add_anon(newpage); + *pagep = newpage; + } ClearPageSwapCache(oldpage); set_page_private(oldpage, 0); @@ -1041,7 +1057,7 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp, unlock_page(oldpage); page_cache_release(oldpage); page_cache_release(oldpage); - return 0; + return error; } /* @@ -1115,7 +1131,8 @@ repeat: /* We have to do this with page locked to prevent races */ lock_page(page); - if (!PageSwapCache(page) || page->mapping) { + if (!PageSwapCache(page) || page_private(page) != swap.val || + page->mapping) { error = -EEXIST; /* try again */ goto failed; } From 32ce7758c5ca83581533fd456053c28a30e3efa5 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Wed, 11 Jul 2012 14:02:47 -0700 Subject: [PATCH 15/54] shmem: fix negative rss in memcg memory.stat When adding the page_private checks before calling shmem_replace_page(), I did realize that there is a further race, but thought it too unlikely to need a hurried fix. But independently I've been chasing why a mem cgroup's memory.stat sometimes shows negative rss after all tasks have gone: I expected it to be a stats gathering bug, but actually it's shmem swapping's fault. It's an old surprise, that when you lock_page(lookup_swap_cache(swap)), the page may have been removed from swapcache before getting the lock; or it may have been freed and reused and be back in swapcache; and it can even be using the same swap location as before (page_private same). The swapoff case is already secure against this (swap cannot be reused until the whole area has been swapped off, and a new swapped on); and shmem_getpage_gfp() is protected by shmem_add_to_page_cache()'s check for the expected radix_tree entry - but a little too late. By that time, we might have already decided to shmem_replace_page(): I don't know of a problem from that, but I'd feel more at ease not to do so spuriously. And we have already done mem_cgroup_cache_charge(), on perhaps the wrong mem cgroup: and this charge is not then undone on the error path, because PageSwapCache ends up preventing that. It's this last case which causes the occasional negative rss in memory.stat: the page is charged here as cache, but (sometimes) found to be anon when eventually it's uncharged - and in between, it's an undeserved charge on the wrong memcg. Fix this by adding an earlier check on the radix_tree entry: it's inelegant to descend the tree twice, but swapping is not the fast path, and a better solution would need a pair (try+commit) of memcg calls, and a rework of shmem_replace_page() to keep out of the swapcache. We can use the added shmem_confirm_swap() function to replace the find_get_page+page_cache_release we were already doing on the error path. And add a comment on that -EEXIST: it seems a peculiar errno to be using, but originates from its use in radix_tree_insert(). [It can be surprising to see positive rss left in a memcg's memory.stat after all tasks have gone, since it is supposed to count anonymous but not shmem. Aside from sharing anon pages via fork with a task in some other memcg, it often happens after swapping: because a swap page can't be freed while under writeback, nor while locked. So it's not an error, and these residual pages are easily freed once pressure demands.] Change-Id: I8bba4e6d7e901058b31438371ed26384aa3f8ca4 Signed-off-by: Hugh Dickins Acked-by: Johannes Weiner Cc: KAMEZAWA Hiroyuki Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/shmem.c | 41 +++++++++++++++++++++++++++++------------ 1 file changed, 29 insertions(+), 12 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index 1ce01cf384f8..6998de1fdaa9 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -265,6 +265,24 @@ static int shmem_radix_tree_replace(struct address_space *mapping, return 0; } +/* + * Sometimes, before we decide whether to proceed or to fail, we must check + * that an entry was not already brought back from swap by a racing thread. + * + * Checking page is not enough: by the time a SwapCache page is locked, it + * might be reused, and again be SwapCache, using the same swap as before. + */ +static bool shmem_confirm_swap(struct address_space *mapping, + pgoff_t index, swp_entry_t swap) +{ + void *item; + + rcu_read_lock(); + item = radix_tree_lookup(&mapping->page_tree, index); + rcu_read_unlock(); + return item == swp_to_radix_entry(swap); +} + /* * Like add_to_page_cache_locked, but error if expected item has gone. */ @@ -1132,9 +1150,9 @@ repeat: /* We have to do this with page locked to prevent races */ lock_page(page); if (!PageSwapCache(page) || page_private(page) != swap.val || - page->mapping) { + !shmem_confirm_swap(mapping, index, swap)) { error = -EEXIST; /* try again */ - goto failed; + goto unlock; } if (!PageUptodate(page)) { error = -EIO; @@ -1150,9 +1168,12 @@ repeat: error = mem_cgroup_cache_charge(page, current->mm, gfp & GFP_RECLAIM_MASK); - if (!error) + if (!error) { error = shmem_add_to_page_cache(page, mapping, index, gfp, swp_to_radix_entry(swap)); + /* We already confirmed swap, and make no allocation */ + VM_BUG_ON(error); + } if (error) goto failed; @@ -1253,14 +1274,10 @@ decused: unacct: shmem_unacct_blocks(info->flags, 1); failed: - if (swap.val && error != -EINVAL) { - struct page *test = find_get_page(mapping, index); - if (test && !radix_tree_exceptional_entry(test)) - page_cache_release(test); - /* Have another try if the entry has changed */ - if (test != swp_to_radix_entry(swap)) - error = -EEXIST; - } + if (swap.val && error != -EINVAL && + !shmem_confirm_swap(mapping, index, swap)) + error = -EEXIST; +unlock: if (page) { unlock_page(page); page_cache_release(page); @@ -1272,7 +1289,7 @@ failed: spin_unlock(&info->lock); goto repeat; } - if (error == -EEXIST) + if (error == -EEXIST) /* from above or from radix_tree_insert */ goto repeat; return error; } From 5cb6f6f9ef219abd49436876a6cac754af9207d5 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Wed, 11 Jul 2012 14:02:48 -0700 Subject: [PATCH 16/54] shmem: cleanup shmem_add_to_page_cache shmem_add_to_page_cache() has three callsites, but only one of them wants the radix_tree_preload() (an exceptional entry guarantees that the radix tree node is present in the other cases), and only that site can achieve mem_cgroup_uncharge_cache_page() (PageSwapCache makes it a no-op in the other cases). We did it this way originally to reflect add_to_page_cache_locked(); but it's confusing now, so move the radix_tree preloading and mem_cgroup uncharging to that one caller. Change-Id: Idd0fb8295f2930a01d5bd850fc5c4c097b7bb89e Signed-off-by: Hugh Dickins Acked-by: Johannes Weiner Cc: KAMEZAWA Hiroyuki Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/shmem.c | 60 ++++++++++++++++++++++++++---------------------------- 1 file changed, 29 insertions(+), 31 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index 6998de1fdaa9..9af7a534b3fd 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -290,40 +290,31 @@ static int shmem_add_to_page_cache(struct page *page, struct address_space *mapping, pgoff_t index, gfp_t gfp, void *expected) { - int error = 0; + int error; VM_BUG_ON(!PageLocked(page)); VM_BUG_ON(!PageSwapBacked(page)); - if (!expected) - error = radix_tree_preload(gfp & GFP_RECLAIM_MASK); - if (!error) { - page_cache_get(page); - page->mapping = mapping; - page->index = index; + page_cache_get(page); + page->mapping = mapping; + page->index = index; - spin_lock_irq(&mapping->tree_lock); - if (!expected) - error = radix_tree_insert(&mapping->page_tree, - index, page); - else - error = shmem_radix_tree_replace(mapping, index, - expected, page); - if (!error) { - mapping->nrpages++; - __inc_zone_page_state(page, NR_FILE_PAGES); - __inc_zone_page_state(page, NR_SHMEM); - spin_unlock_irq(&mapping->tree_lock); - } else { - page->mapping = NULL; - spin_unlock_irq(&mapping->tree_lock); - page_cache_release(page); - } - if (!expected) - radix_tree_preload_end(); + spin_lock_irq(&mapping->tree_lock); + if (!expected) + error = radix_tree_insert(&mapping->page_tree, index, page); + else + error = shmem_radix_tree_replace(mapping, index, expected, + page); + if (!error) { + mapping->nrpages++; + __inc_zone_page_state(page, NR_FILE_PAGES); + __inc_zone_page_state(page, NR_SHMEM); + spin_unlock_irq(&mapping->tree_lock); + } else { + page->mapping = NULL; + spin_unlock_irq(&mapping->tree_lock); + page_cache_release(page); } - if (error) - mem_cgroup_uncharge_cache_page(page); return error; } @@ -1210,11 +1201,18 @@ repeat: __set_page_locked(page); error = mem_cgroup_cache_charge(page, current->mm, gfp & GFP_RECLAIM_MASK); - if (!error) - error = shmem_add_to_page_cache(page, mapping, index, - gfp, NULL); if (error) goto decused; + error = radix_tree_preload(gfp & GFP_RECLAIM_MASK); + if (!error) { + error = shmem_add_to_page_cache(page, mapping, index, + gfp, NULL); + radix_tree_preload_end(); + } + if (error) { + mem_cgroup_uncharge_cache_page(page); + goto decused; + } lru_cache_add_anon(page); spin_lock(&info->lock); From 03b9f6fd8983371d517eb5bd8cbc0e02ddb26e12 Mon Sep 17 00:00:00 2001 From: Nathan Zimmer Date: Tue, 31 Jul 2012 16:46:17 -0700 Subject: [PATCH 17/54] tmpfs: distribute interleave better across nodes When tmpfs has the interleave memory policy, it always starts allocating for each file from node 0 at offset 0. When there are many small files, the lower nodes fill up disproportionately. This patch spreads out node usage by starting files at nodes other than 0, by using the inode number to bias the starting node for interleave. Change-Id: I7286adb30f7de0cfd6fafa628d3edae1756752dc Signed-off-by: Nathan Zimmer Signed-off-by: Hugh Dickins Cc: Christoph Lameter Cc: Nick Piggin Cc: Lee Schermerhorn Cc: KOSAKI Motohiro Cc: Rik van Riel Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/shmem.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index 9af7a534b3fd..8c95f589814d 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -928,7 +928,8 @@ static struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp, /* Create a pseudo vma that just contains the policy */ pvma.vm_start = 0; - pvma.vm_pgoff = index; + /* Bias interleave by inode number to distribute better across nodes */ + pvma.vm_pgoff = index + info->vfs_inode.i_ino; pvma.vm_ops = NULL; pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, index); @@ -948,7 +949,8 @@ static struct page *shmem_alloc_page(gfp_t gfp, /* Create a pseudo vma that just contains the policy */ pvma.vm_start = 0; - pvma.vm_pgoff = index; + /* Bias interleave by inode number to distribute better across nodes */ + pvma.vm_pgoff = index + info->vfs_inode.i_ino; pvma.vm_ops = NULL; pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, index); From 4f3301a0664cdd25d96f16cd6befd7b5e00599d8 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 21 Apr 2012 18:41:25 -0400 Subject: [PATCH 18/54] switch xattr syscalls to fget_light/fput_light Change-Id: Iaccab81ce6951d3250c8f2d1d21669ae6ca08848 Signed-off-by: Al Viro --- fs/xattr.c | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/fs/xattr.c b/fs/xattr.c index 3c8c1cc333c7..1d7ac3790458 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -399,11 +399,12 @@ SYSCALL_DEFINE5(lsetxattr, const char __user *, pathname, SYSCALL_DEFINE5(fsetxattr, int, fd, const char __user *, name, const void __user *,value, size_t, size, int, flags) { + int fput_needed; struct file *f; struct dentry *dentry; int error = -EBADF; - f = fget(fd); + f = fget_light(fd, &fput_needed); if (!f) return error; dentry = f->f_path.dentry; @@ -413,7 +414,7 @@ SYSCALL_DEFINE5(fsetxattr, int, fd, const char __user *, name, error = setxattr(dentry, name, value, size, flags); mnt_drop_write_file(f); } - fput(f); + fput_light(f, fput_needed); return error; } @@ -486,15 +487,16 @@ SYSCALL_DEFINE4(lgetxattr, const char __user *, pathname, SYSCALL_DEFINE4(fgetxattr, int, fd, const char __user *, name, void __user *, value, size_t, size) { + int fput_needed; struct file *f; ssize_t error = -EBADF; - f = fget(fd); + f = fget_light(fd, &fput_needed); if (!f) return error; audit_inode(NULL, f->f_path.dentry); error = getxattr(f->f_path.dentry, name, value, size); - fput(f); + fput_light(f, fput_needed); return error; } @@ -566,15 +568,16 @@ SYSCALL_DEFINE3(llistxattr, const char __user *, pathname, char __user *, list, SYSCALL_DEFINE3(flistxattr, int, fd, char __user *, list, size_t, size) { + int fput_needed; struct file *f; ssize_t error = -EBADF; - f = fget(fd); + f = fget_light(fd, &fput_needed); if (!f) return error; audit_inode(NULL, f->f_path.dentry); error = listxattr(f->f_path.dentry, list, size); - fput(f); + fput_light(f, fput_needed); return error; } @@ -634,11 +637,12 @@ SYSCALL_DEFINE2(lremovexattr, const char __user *, pathname, SYSCALL_DEFINE2(fremovexattr, int, fd, const char __user *, name) { + int fput_needed; struct file *f; struct dentry *dentry; int error = -EBADF; - f = fget(fd); + f = fget_light(fd, &fput_needed); if (!f) return error; dentry = f->f_path.dentry; @@ -648,7 +652,7 @@ SYSCALL_DEFINE2(fremovexattr, int, fd, const char __user *, name) error = removexattr(dentry, name); mnt_drop_write_file(f); } - fput(f); + fput_light(f, fput_needed); return error; } From 8e3394c402a0b7260cc6c3dac6cfd6ecb40801ef Mon Sep 17 00:00:00 2001 From: Aristeu Rozanski Date: Thu, 23 Aug 2012 16:53:28 -0400 Subject: [PATCH 19/54] xattr: extract simple_xattr code from tmpfs Extract in-memory xattr APIs from tmpfs. Will be used by cgroup. $ size vmlinux.o text data bss dec hex filename 4658782 880729 5195032 10734543 a3cbcf vmlinux.o $ size vmlinux.o text data bss dec hex filename 4658957 880729 5195032 10734718 a3cc7e vmlinux.o v7: - checkpatch warnings fixed - Implement the changes requested by Hugh Dickins: - make simple_xattrs_init and simple_xattrs_free inline - get rid of locking and list reinitialization in simple_xattrs_free, they're not needed v6: - no changes v5: - no changes v4: - move simple_xattrs_free() to fs/xattr.c v3: - in kmem_xattrs_free(), reinitialize the list - use simple_xattr_* prefix - introduce simple_xattr_add() to prevent direct list usage Change-Id: Id683f37a6190127866f75c5b4bb8fafc6491dcd8 Original-patch-by: Li Zefan Cc: Li Zefan Cc: Hillf Danton Cc: Lennart Poettering Acked-by: Hugh Dickins Signed-off-by: Li Zefan Signed-off-by: Aristeu Rozanski Signed-off-by: Tejun Heo --- fs/xattr.c | 166 +++++++++++++++++++++++++++++++++++++ include/linux/shmem_fs.h | 3 +- include/linux/xattr.h | 48 +++++++++++ mm/shmem.c | 171 +++------------------------------------ 4 files changed, 229 insertions(+), 159 deletions(-) diff --git a/fs/xattr.c b/fs/xattr.c index 1d7ac3790458..78ed976ab548 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -783,3 +783,169 @@ EXPORT_SYMBOL(generic_getxattr); EXPORT_SYMBOL(generic_listxattr); EXPORT_SYMBOL(generic_setxattr); EXPORT_SYMBOL(generic_removexattr); + +/* + * Allocate new xattr and copy in the value; but leave the name to callers. + */ +struct simple_xattr *simple_xattr_alloc(const void *value, size_t size) +{ + struct simple_xattr *new_xattr; + size_t len; + + /* wrap around? */ + len = sizeof(*new_xattr) + size; + if (len <= sizeof(*new_xattr)) + return NULL; + + new_xattr = kmalloc(len, GFP_KERNEL); + if (!new_xattr) + return NULL; + + new_xattr->size = size; + memcpy(new_xattr->value, value, size); + return new_xattr; +} + +/* + * xattr GET operation for in-memory/pseudo filesystems + */ +int simple_xattr_get(struct simple_xattrs *xattrs, const char *name, + void *buffer, size_t size) +{ + struct simple_xattr *xattr; + int ret = -ENODATA; + + spin_lock(&xattrs->lock); + list_for_each_entry(xattr, &xattrs->head, list) { + if (strcmp(name, xattr->name)) + continue; + + ret = xattr->size; + if (buffer) { + if (size < xattr->size) + ret = -ERANGE; + else + memcpy(buffer, xattr->value, xattr->size); + } + break; + } + spin_unlock(&xattrs->lock); + return ret; +} + +static int __simple_xattr_set(struct simple_xattrs *xattrs, const char *name, + const void *value, size_t size, int flags) +{ + struct simple_xattr *xattr; + struct simple_xattr *new_xattr = NULL; + int err = 0; + + /* value == NULL means remove */ + if (value) { + new_xattr = simple_xattr_alloc(value, size); + if (!new_xattr) + return -ENOMEM; + + new_xattr->name = kstrdup(name, GFP_KERNEL); + if (!new_xattr->name) { + kfree(new_xattr); + return -ENOMEM; + } + } + + spin_lock(&xattrs->lock); + list_for_each_entry(xattr, &xattrs->head, list) { + if (!strcmp(name, xattr->name)) { + if (flags & XATTR_CREATE) { + xattr = new_xattr; + err = -EEXIST; + } else if (new_xattr) { + list_replace(&xattr->list, &new_xattr->list); + } else { + list_del(&xattr->list); + } + goto out; + } + } + if (flags & XATTR_REPLACE) { + xattr = new_xattr; + err = -ENODATA; + } else { + list_add(&new_xattr->list, &xattrs->head); + xattr = NULL; + } +out: + spin_unlock(&xattrs->lock); + if (xattr) { + kfree(xattr->name); + kfree(xattr); + } + return err; + +} + +/* + * xattr SET operation for in-memory/pseudo filesystems + */ +int simple_xattr_set(struct simple_xattrs *xattrs, const char *name, + const void *value, size_t size, int flags) +{ + if (size == 0) + value = ""; /* empty EA, do not remove */ + return __simple_xattr_set(xattrs, name, value, size, flags); +} + +/* + * xattr REMOVE operation for in-memory/pseudo filesystems + */ +int simple_xattr_remove(struct simple_xattrs *xattrs, const char *name) +{ + return __simple_xattr_set(xattrs, name, NULL, 0, XATTR_REPLACE); +} + +static bool xattr_is_trusted(const char *name) +{ + return !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN); +} + +/* + * xattr LIST operation for in-memory/pseudo filesystems + */ +ssize_t simple_xattr_list(struct simple_xattrs *xattrs, char *buffer, + size_t size) +{ + bool trusted = capable(CAP_SYS_ADMIN); + struct simple_xattr *xattr; + size_t used = 0; + + spin_lock(&xattrs->lock); + list_for_each_entry(xattr, &xattrs->head, list) { + size_t len; + + /* skip "trusted." attributes for unprivileged callers */ + if (!trusted && xattr_is_trusted(xattr->name)) + continue; + + len = strlen(xattr->name) + 1; + used += len; + if (buffer) { + if (size < used) { + used = -ERANGE; + break; + } + memcpy(buffer, xattr->name, len); + buffer += len; + } + } + spin_unlock(&xattrs->lock); + + return used; +} + +void simple_xattr_list_add(struct simple_xattrs *xattrs, + struct simple_xattr *new_xattr) +{ + spin_lock(&xattrs->lock); + list_add(&new_xattr->list, &xattrs->head); + spin_unlock(&xattrs->lock); +} diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index 79ab2555b3b0..72263bdafd4a 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -5,6 +5,7 @@ #include #include #include +#include /* inode in-kernel data */ @@ -18,7 +19,7 @@ struct shmem_inode_info { }; struct shared_policy policy; /* NUMA memory alloc policy */ struct list_head swaplist; /* chain of maybes on swap */ - struct list_head xattr_list; /* list of shmem_xattr */ + struct simple_xattrs xattrs; /* list of xattrs */ struct inode vfs_inode; }; diff --git a/include/linux/xattr.h b/include/linux/xattr.h index e5d122031542..2ace7a60316d 100644 --- a/include/linux/xattr.h +++ b/include/linux/xattr.h @@ -59,7 +59,9 @@ #ifdef __KERNEL__ +#include #include +#include struct inode; struct dentry; @@ -96,6 +98,52 @@ ssize_t vfs_getxattr_alloc(struct dentry *dentry, const char *name, char **xattr_value, size_t size, gfp_t flags); int vfs_xattr_cmp(struct dentry *dentry, const char *xattr_name, const char *value, size_t size, gfp_t flags); + +struct simple_xattrs { + struct list_head head; + spinlock_t lock; +}; + +struct simple_xattr { + struct list_head list; + char *name; + size_t size; + char value[0]; +}; + +/* + * initialize the simple_xattrs structure + */ +static inline void simple_xattrs_init(struct simple_xattrs *xattrs) +{ + INIT_LIST_HEAD(&xattrs->head); + spin_lock_init(&xattrs->lock); +} + +/* + * free all the xattrs + */ +static inline void simple_xattrs_free(struct simple_xattrs *xattrs) +{ + struct simple_xattr *xattr, *node; + + list_for_each_entry_safe(xattr, node, &xattrs->head, list) { + kfree(xattr->name); + kfree(xattr); + } +} + +struct simple_xattr *simple_xattr_alloc(const void *value, size_t size); +int simple_xattr_get(struct simple_xattrs *xattrs, const char *name, + void *buffer, size_t size); +int simple_xattr_set(struct simple_xattrs *xattrs, const char *name, + const void *value, size_t size, int flags); +int simple_xattr_remove(struct simple_xattrs *xattrs, const char *name); +ssize_t simple_xattr_list(struct simple_xattrs *xattrs, char *buffer, + size_t size); +void simple_xattr_list_add(struct simple_xattrs *xattrs, + struct simple_xattr *new_xattr); + #endif /* __KERNEL__ */ #endif /* _LINUX_XATTR_H */ diff --git a/mm/shmem.c b/mm/shmem.c index 8c95f589814d..092bb384ea5c 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -79,13 +79,6 @@ static struct vfsmount *shm_mnt; /* Symlink up to this size is kmalloc'ed instead of using a swappable page */ #define SHORT_SYMLINK_LEN 128 -struct shmem_xattr { - struct list_head list; /* anchored by shmem_inode_info->xattr_list */ - char *name; /* xattr name */ - size_t size; - char value[0]; -}; - /* * shmem_fallocate and shmem_writepage communicate via inode->i_private * (with i_mutex making sure that it has only one user at a time): @@ -638,7 +631,6 @@ static int shmem_setattr(struct dentry *dentry, struct iattr *attr) static void shmem_evict_inode(struct inode *inode) { struct shmem_inode_info *info = SHMEM_I(inode); - struct shmem_xattr *xattr, *nxattr; if (inode->i_mapping->a_ops == &shmem_aops) { shmem_unacct_size(info->flags, inode->i_size); @@ -652,10 +644,7 @@ static void shmem_evict_inode(struct inode *inode) } else kfree(info->symlink); - list_for_each_entry_safe(xattr, nxattr, &info->xattr_list, list) { - kfree(xattr->name); - kfree(xattr); - } + simple_xattrs_free(&info->xattrs); WARN_ON(inode->i_blocks); shmem_free_inode(inode->i_sb); end_writeback(inode); @@ -1385,7 +1374,7 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode spin_lock_init(&info->lock); info->flags = flags & VM_NORESERVE; INIT_LIST_HEAD(&info->swaplist); - INIT_LIST_HEAD(&info->xattr_list); + simple_xattrs_init(&info->xattrs); cache_no_acl(inode); switch (mode & S_IFMT) { @@ -2100,28 +2089,6 @@ static void shmem_put_link(struct dentry *dentry, struct nameidata *nd, void *co * filesystem level, though. */ -/* - * Allocate new xattr and copy in the value; but leave the name to callers. - */ -static struct shmem_xattr *shmem_xattr_alloc(const void *value, size_t size) -{ - struct shmem_xattr *new_xattr; - size_t len; - - /* wrap around? */ - len = sizeof(*new_xattr) + size; - if (len <= sizeof(*new_xattr)) - return NULL; - - new_xattr = kmalloc(len, GFP_KERNEL); - if (!new_xattr) - return NULL; - - new_xattr->size = size; - memcpy(new_xattr->value, value, size); - return new_xattr; -} - /* * Callback for security_inode_init_security() for acquiring xattrs. */ @@ -2131,11 +2098,11 @@ static int shmem_initxattrs(struct inode *inode, { struct shmem_inode_info *info = SHMEM_I(inode); const struct xattr *xattr; - struct shmem_xattr *new_xattr; + struct simple_xattr *new_xattr; size_t len; for (xattr = xattr_array; xattr->name != NULL; xattr++) { - new_xattr = shmem_xattr_alloc(xattr->value, xattr->value_len); + new_xattr = simple_xattr_alloc(xattr->value, xattr->value_len); if (!new_xattr) return -ENOMEM; @@ -2152,91 +2119,12 @@ static int shmem_initxattrs(struct inode *inode, memcpy(new_xattr->name + XATTR_SECURITY_PREFIX_LEN, xattr->name, len); - spin_lock(&info->lock); - list_add(&new_xattr->list, &info->xattr_list); - spin_unlock(&info->lock); + simple_xattr_list_add(&info->xattrs, new_xattr); } return 0; } -static int shmem_xattr_get(struct dentry *dentry, const char *name, - void *buffer, size_t size) -{ - struct shmem_inode_info *info; - struct shmem_xattr *xattr; - int ret = -ENODATA; - - info = SHMEM_I(dentry->d_inode); - - spin_lock(&info->lock); - list_for_each_entry(xattr, &info->xattr_list, list) { - if (strcmp(name, xattr->name)) - continue; - - ret = xattr->size; - if (buffer) { - if (size < xattr->size) - ret = -ERANGE; - else - memcpy(buffer, xattr->value, xattr->size); - } - break; - } - spin_unlock(&info->lock); - return ret; -} - -static int shmem_xattr_set(struct inode *inode, const char *name, - const void *value, size_t size, int flags) -{ - struct shmem_inode_info *info = SHMEM_I(inode); - struct shmem_xattr *xattr; - struct shmem_xattr *new_xattr = NULL; - int err = 0; - - /* value == NULL means remove */ - if (value) { - new_xattr = shmem_xattr_alloc(value, size); - if (!new_xattr) - return -ENOMEM; - - new_xattr->name = kstrdup(name, GFP_KERNEL); - if (!new_xattr->name) { - kfree(new_xattr); - return -ENOMEM; - } - } - - spin_lock(&info->lock); - list_for_each_entry(xattr, &info->xattr_list, list) { - if (!strcmp(name, xattr->name)) { - if (flags & XATTR_CREATE) { - xattr = new_xattr; - err = -EEXIST; - } else if (new_xattr) { - list_replace(&xattr->list, &new_xattr->list); - } else { - list_del(&xattr->list); - } - goto out; - } - } - if (flags & XATTR_REPLACE) { - xattr = new_xattr; - err = -ENODATA; - } else { - list_add(&new_xattr->list, &info->xattr_list); - xattr = NULL; - } -out: - spin_unlock(&info->lock); - if (xattr) - kfree(xattr->name); - kfree(xattr); - return err; -} - static const struct xattr_handler *shmem_xattr_handlers[] = { #ifdef CONFIG_TMPFS_POSIX_ACL &generic_acl_access_handler, @@ -2267,6 +2155,7 @@ static int shmem_xattr_validate(const char *name) static ssize_t shmem_getxattr(struct dentry *dentry, const char *name, void *buffer, size_t size) { + struct shmem_inode_info *info = SHMEM_I(dentry->d_inode); int err; /* @@ -2281,12 +2170,13 @@ static ssize_t shmem_getxattr(struct dentry *dentry, const char *name, if (err) return err; - return shmem_xattr_get(dentry, name, buffer, size); + return simple_xattr_get(&info->xattrs, name, buffer, size); } static int shmem_setxattr(struct dentry *dentry, const char *name, const void *value, size_t size, int flags) { + struct shmem_inode_info *info = SHMEM_I(dentry->d_inode); int err; /* @@ -2301,15 +2191,12 @@ static int shmem_setxattr(struct dentry *dentry, const char *name, if (err) return err; - if (size == 0) - value = ""; /* empty EA, do not remove */ - - return shmem_xattr_set(dentry->d_inode, name, value, size, flags); - + return simple_xattr_set(&info->xattrs, name, value, size, flags); } static int shmem_removexattr(struct dentry *dentry, const char *name) { + struct shmem_inode_info *info = SHMEM_I(dentry->d_inode); int err; /* @@ -2324,45 +2211,13 @@ static int shmem_removexattr(struct dentry *dentry, const char *name) if (err) return err; - return shmem_xattr_set(dentry->d_inode, name, NULL, 0, XATTR_REPLACE); -} - -static bool xattr_is_trusted(const char *name) -{ - return !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN); + return simple_xattr_remove(&info->xattrs, name); } static ssize_t shmem_listxattr(struct dentry *dentry, char *buffer, size_t size) { - bool trusted = capable(CAP_SYS_ADMIN); - struct shmem_xattr *xattr; - struct shmem_inode_info *info; - size_t used = 0; - - info = SHMEM_I(dentry->d_inode); - - spin_lock(&info->lock); - list_for_each_entry(xattr, &info->xattr_list, list) { - size_t len; - - /* skip "trusted." attributes for unprivileged callers */ - if (!trusted && xattr_is_trusted(xattr->name)) - continue; - - len = strlen(xattr->name) + 1; - used += len; - if (buffer) { - if (size < used) { - used = -ERANGE; - break; - } - memcpy(buffer, xattr->name, len); - buffer += len; - } - } - spin_unlock(&info->lock); - - return used; + struct shmem_inode_info *info = SHMEM_I(dentry->d_inode); + return simple_xattr_list(&info->xattrs, buffer, size); } #endif /* CONFIG_TMPFS_XATTR */ From 2b361a6910aad38e4798647446918a02911dfa66 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Fri, 16 Nov 2012 14:15:03 -0800 Subject: [PATCH 20/54] tmpfs: fix shmem_getpage_gfp() VM_BUG_ON Fuzzing with trinity hit the "impossible" VM_BUG_ON(error) (which Fedora has converted to WARNING) in shmem_getpage_gfp(): WARNING: at mm/shmem.c:1151 shmem_getpage_gfp+0xa5c/0xa70() Pid: 29795, comm: trinity-child4 Not tainted 3.7.0-rc2+ #49 Call Trace: warn_slowpath_common+0x7f/0xc0 warn_slowpath_null+0x1a/0x20 shmem_getpage_gfp+0xa5c/0xa70 shmem_fault+0x4f/0xa0 __do_fault+0x71/0x5c0 handle_pte_fault+0x97/0xae0 handle_mm_fault+0x289/0x350 __do_page_fault+0x18e/0x530 do_page_fault+0x2b/0x50 page_fault+0x28/0x30 tracesys+0xe1/0xe6 Thanks to Johannes for pointing to truncation: free_swap_and_cache() only does a trylock on the page, so the page lock we've held since before confirming swap is not enough to protect against truncation. What cleanup is needed in this case? Just delete_from_swap_cache(), which takes care of the memcg uncharge. Change-Id: I6180f00af8917f2b98692a4cb2528e9a2399462b Signed-off-by: Hugh Dickins Reported-by: Dave Jones Cc: Johannes Weiner Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/shmem.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index 092bb384ea5c..8fb86c7f9c49 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1153,8 +1153,20 @@ repeat: if (!error) { error = shmem_add_to_page_cache(page, mapping, index, gfp, swp_to_radix_entry(swap)); - /* We already confirmed swap, and make no allocation */ - VM_BUG_ON(error); + /* + * We already confirmed swap under page lock, and make + * no memory allocation here, so usually no possibility + * of error; but free_swap_and_cache() only trylocks a + * page, so it is just possible that the entry has been + * truncated or holepunched since swap was confirmed. + * shmem_undo_range() will have done some of the + * unaccounting, now delete_from_swap_cache() will do + * the rest (including mem_cgroup_uncharge_swapcache). + * Reset swap.val? No, leave it so "failed" goes back to + * "repeat": reading a hole and writing should succeed. + */ + if (error) + delete_from_swap_cache(page); } if (error) goto failed; From d9b3c9f46e287f358c1e22ac0901f348935e7dd7 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Wed, 12 Dec 2012 13:52:21 -0800 Subject: [PATCH 21/54] tmpfs: support SEEK_DATA and SEEK_HOLE (reprise) Revert 3.5's commit f21f8062201f ("tmpfs: revert SEEK_DATA and SEEK_HOLE") to reinstate 4fb5ef089b28 ("tmpfs: support SEEK_DATA and SEEK_HOLE"), with the intervening additional arg to generic_file_llseek_size(). In 3.8, ext4 is expected to join btrfs, ocfs2 and xfs with proper SEEK_DATA and SEEK_HOLE support; and a good case has now been made for it on tmpfs, so let's join the party. It's quite easy for tmpfs to scan the radix_tree to support llseek's new SEEK_DATA and SEEK_HOLE options: so add them while the minutiae are still on my mind (in particular, the !PageUptodate-ness of pages fallocated but still unwritten). [akpm@linux-foundation.org: fix warning with CONFIG_TMPFS=n] Signed-off-by: Hugh Dickins Cc: Dave Chinner Cc: Jaegeuk Hanse Cc: "Theodore Ts'o" Cc: Zheng Liu Cc: Jeff liu Cc: Paul Eggert Cc: Christoph Hellwig Cc: Josef Bacik Cc: Andi Kleen Cc: Andreas Dilger Cc: Marco Stornelli Cc: Chris Mason Cc: Sunil Mushran Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Change-Id: I909ce57329dbad9a27067b62f7462a155c815143 --- mm/shmem.c | 92 +++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 91 insertions(+), 1 deletion(-) diff --git a/mm/shmem.c b/mm/shmem.c index 8fb86c7f9c49..4341ae50d19f 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1718,6 +1718,96 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos, return error; } +/* + * llseek SEEK_DATA or SEEK_HOLE through the radix_tree. + */ +static pgoff_t shmem_seek_hole_data(struct address_space *mapping, + pgoff_t index, pgoff_t end, int origin) +{ + struct page *page; + struct pagevec pvec; + pgoff_t indices[PAGEVEC_SIZE]; + bool done = false; + int i; + + pagevec_init(&pvec, 0); + pvec.nr = 1; /* start small: we may be there already */ + while (!done) { + pvec.nr = shmem_find_get_pages_and_swap(mapping, index, + pvec.nr, pvec.pages, indices); + if (!pvec.nr) { + if (origin == SEEK_DATA) + index = end; + break; + } + for (i = 0; i < pvec.nr; i++, index++) { + if (index < indices[i]) { + if (origin == SEEK_HOLE) { + done = true; + break; + } + index = indices[i]; + } + page = pvec.pages[i]; + if (page && !radix_tree_exceptional_entry(page)) { + if (!PageUptodate(page)) + page = NULL; + } + if (index >= end || + (page && origin == SEEK_DATA) || + (!page && origin == SEEK_HOLE)) { + done = true; + break; + } + } + shmem_deswap_pagevec(&pvec); + pagevec_release(&pvec); + pvec.nr = PAGEVEC_SIZE; + cond_resched(); + } + return index; +} + +static loff_t shmem_file_llseek(struct file *file, loff_t offset, int origin) +{ + struct address_space *mapping = file->f_mapping; + struct inode *inode = mapping->host; + pgoff_t start, end; + loff_t new_offset; + + if (origin != SEEK_DATA && origin != SEEK_HOLE) + return generic_file_llseek_size(file, offset, origin, + MAX_LFS_FILESIZE); + mutex_lock(&inode->i_mutex); + /* We're holding i_mutex so we can access i_size directly */ + + if (offset < 0) + offset = -EINVAL; + else if (offset >= inode->i_size) + offset = -ENXIO; + else { + start = offset >> PAGE_CACHE_SHIFT; + end = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + new_offset = shmem_seek_hole_data(mapping, start, end, origin); + new_offset <<= PAGE_CACHE_SHIFT; + if (new_offset > offset) { + if (new_offset < inode->i_size) + offset = new_offset; + else if (origin == SEEK_DATA) + offset = -ENXIO; + else + offset = inode->i_size; + } + } + + if (offset >= 0 && offset != file->f_pos) { + file->f_pos = offset; + file->f_version = 0; + } + mutex_unlock(&inode->i_mutex); + return offset; +} + static long shmem_fallocate(struct file *file, int mode, loff_t offset, loff_t len) { @@ -2689,7 +2779,7 @@ static const struct address_space_operations shmem_aops = { static const struct file_operations shmem_file_operations = { .mmap = shmem_mmap, #ifdef CONFIG_TMPFS - .llseek = generic_file_llseek, + .llseek = shmem_file_llseek, .read = do_sync_read, .write = do_sync_write, .aio_read = shmem_file_aio_read, From 841aca1716cda2bcf9db21cb9669c9cf56d7fb33 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Fri, 22 Feb 2013 16:35:17 -0800 Subject: [PATCH 22/54] mm: shmem: use new radix tree iterator In shmem_find_get_pages_and_swap(), use the faster radix tree iterator construct from commit 78c1d78488a3 ("radix-tree: introduce bit-optimized iterator"). Change-Id: Ia82af5c6cf71b6cca13397db9ab7d94ec8dbdf32 Signed-off-by: Johannes Weiner Acked-by: Hugh Dickins Cc: Konstantin Khlebnikov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/shmem.c | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index 4341ae50d19f..400b07e5ab6b 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -337,19 +337,19 @@ static unsigned shmem_find_get_pages_and_swap(struct address_space *mapping, pgoff_t start, unsigned int nr_pages, struct page **pages, pgoff_t *indices) { - unsigned int i; - unsigned int ret; - unsigned int nr_found; + void **slot; + unsigned int ret = 0; + struct radix_tree_iter iter; + + if (!nr_pages) + return 0; rcu_read_lock(); restart: - nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree, - (void ***)pages, indices, start, nr_pages); - ret = 0; - for (i = 0; i < nr_found; i++) { + radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) { struct page *page; repeat: - page = radix_tree_deref_slot((void **)pages[i]); + page = radix_tree_deref_slot(slot); if (unlikely(!page)) continue; if (radix_tree_exception(page)) { @@ -366,17 +366,16 @@ repeat: goto repeat; /* Has the page moved? */ - if (unlikely(page != *((void **)pages[i]))) { + if (unlikely(page != *slot)) { page_cache_release(page); goto repeat; } export: - indices[ret] = indices[i]; + indices[ret] = iter.index; pages[ret] = page; - ret++; + if (++ret == nr_pages) + break; } - if (unlikely(!ret && nr_found)) - goto restart; rcu_read_unlock(); return ret; } From 5a6aa8abb8698bca7c1ce4c3055cbaa69791fe40 Mon Sep 17 00:00:00 2001 From: Greg Thelen Date: Fri, 22 Feb 2013 16:36:02 -0800 Subject: [PATCH 23/54] tmpfs: fix mempolicy object leaks Fix several mempolicy leaks in the tmpfs mount logic. These leaks are slow - on the order of one object leaked per mount attempt. Leak 1 (umount doesn't free mpol allocated in mount): while true; do mount -t tmpfs -o mpol=interleave,size=100M nodev /mnt umount /mnt done Leak 2 (errors parsing remount options will leak mpol): mount -t tmpfs -o size=100M nodev /mnt while true; do mount -o remount,mpol=interleave,size=x /mnt 2> /dev/null done umount /mnt Leak 3 (multiple mpol per mount leak mpol): while true; do mount -t tmpfs -o mpol=interleave,mpol=interleave,size=100M nodev /mnt umount /mnt done This patch fixes all of the above. I could have broken the patch into three pieces but is seemed easier to review as one. [akpm@linux-foundation.org: fix handling of mpol_parse_str() errors, per Hugh] Signed-off-by: Greg Thelen Acked-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Change-Id: I96c06e9b7322b3ef146a0d04a57dc61dc801c177 --- mm/shmem.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index 400b07e5ab6b..16b144a12bde 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2423,6 +2423,7 @@ static int shmem_parse_options(char *options, struct shmem_sb_info *sbinfo, bool remount) { char *this_char, *value, *rest; + struct mempolicy *mpol = NULL; while (options != NULL) { this_char = options; @@ -2449,7 +2450,7 @@ static int shmem_parse_options(char *options, struct shmem_sb_info *sbinfo, printk(KERN_ERR "tmpfs: No value for mount option '%s'\n", this_char); - return 1; + goto error; } if (!strcmp(this_char,"size")) { @@ -2492,19 +2493,24 @@ static int shmem_parse_options(char *options, struct shmem_sb_info *sbinfo, if (*rest) goto bad_val; } else if (!strcmp(this_char,"mpol")) { - if (mpol_parse_str(value, &sbinfo->mpol, 1)) + mpol_put(mpol); + mpol = NULL; + if (mpol_parse_str(value, &mpol, 1)) goto bad_val; } else { printk(KERN_ERR "tmpfs: Bad mount option %s\n", this_char); - return 1; + goto error; } } + sbinfo->mpol = mpol; return 0; bad_val: printk(KERN_ERR "tmpfs: Bad value '%s' for mount option '%s'\n", value, this_char); +error: + mpol_put(mpol); return 1; } @@ -2647,6 +2653,7 @@ static void shmem_put_super(struct super_block *sb) struct shmem_sb_info *sbinfo = SHMEM_SB(sb); percpu_counter_destroy(&sbinfo->used_blocks); + mpol_put(sbinfo->mpol); kfree(sbinfo); sb->s_fs_info = NULL; } From 45abc720226dd359a183fc385c9fe0aa5b27248b Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 25 Jan 2013 16:32:10 -0800 Subject: [PATCH 24/54] userns: Allow the userns root to mount tmpfs. There is no backing store to tmpfs and file creation rules are the same as for any other filesystem so it is semantically safe to allow unprivileged users to mount it. ramfs is safe for the same reasons so allow either flavor of tmpfs to be mounted by a user namespace root user. The memory control group successfully limits how much memory tmpfs can consume on any system that cares about a user namespace root using tmpfs to exhaust memory the memory control group can be deployed. Change-Id: Ie237fefe71aaa39172d661945197a20e6d0280eb Signed-off-by: "Eric W. Biederman" --- mm/shmem.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mm/shmem.c b/mm/shmem.c index 16b144a12bde..d398429c6914 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2875,6 +2875,7 @@ static struct file_system_type shmem_fs_type = { .name = "tmpfs", .mount = shmem_mount, .kill_sb = kill_litter_super, + .fs_flags = FS_USERNS_MOUNT, }; int __init shmem_init(void) @@ -2932,6 +2933,7 @@ static struct file_system_type shmem_fs_type = { .name = "tmpfs", .mount = ramfs_mount, .kill_sb = kill_litter_super, + .fs_flags = FS_USERNS_MOUNT, }; int __init shmem_init(void) From 079ad001114da560c22c1f3cc9ab64591b4ac90f Mon Sep 17 00:00:00 2001 From: Mimi Zohar Date: Sun, 14 Apr 2013 09:21:47 -0400 Subject: [PATCH 25/54] evm: calculate HMAC after initializing posix acl on tmpfs Included in the EVM hmac calculation is the i_mode. Any changes to the i_mode need to be reflected in the hmac. shmem_mknod() currently calls generic_acl_init(), which modifies the i_mode, after calling security_inode_init_security(). This patch reverses the order in which they are called. Change-Id: I1b05e94748a5e59f49afe566b34e2cb79bd3cc5f Reported-by: Sven Vermeulen Signed-off-by: Mimi Zohar Acked-by: Hugh Dickins --- mm/shmem.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index d398429c6914..13479d8c5ff7 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1940,6 +1940,13 @@ shmem_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev) inode = shmem_get_inode(dir->i_sb, dir, mode, dev, VM_NORESERVE); if (inode) { +#ifdef CONFIG_TMPFS_POSIX_ACL + error = generic_acl_init(inode, dir); + if (error) { + iput(inode); + return error; + } +#endif error = security_inode_init_security(inode, dir, &dentry->d_name, shmem_initxattrs, NULL); @@ -1949,15 +1956,8 @@ shmem_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev) return error; } } -#ifdef CONFIG_TMPFS_POSIX_ACL - error = generic_acl_init(inode, dir); - if (error) { - iput(inode); - return error; - } -#else + error = 0; -#endif dir->i_size += BOGO_DIRENT_SIZE; dir->i_ctime = dir->i_mtime = CURRENT_TIME; d_instantiate(dentry, inode); From 3c1ff9b30226852b6efffb694291a9dee1a0bf1b Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 11 Sep 2013 14:26:05 -0700 Subject: [PATCH 26/54] lib/radix-tree.c: make radix_tree_node_alloc() work correctly within interrupt With users of radix_tree_preload() run from interrupt (block/blk-ioc.c is one such possible user), the following race can happen: radix_tree_preload() ... radix_tree_insert() radix_tree_node_alloc() if (rtp->nr) { ret = rtp->nodes[rtp->nr - 1]; ... radix_tree_preload() ... radix_tree_insert() radix_tree_node_alloc() if (rtp->nr) { ret = rtp->nodes[rtp->nr - 1]; And we give out one radix tree node twice. That clearly results in radix tree corruption with different results (usually OOPS) depending on which two users of radix tree race. We fix the problem by making radix_tree_node_alloc() always allocate fresh radix tree nodes when in interrupt. Using preloading when in interrupt doesn't make sense since all the allocations have to be atomic anyway and we cannot steal nodes from process-context users because some users rely on radix_tree_insert() succeeding after radix_tree_preload(). in_interrupt() check is somewhat ugly but we cannot simply key off passed gfp_mask as that is acquired from root_gfp_mask() and thus the same for all preload users. Another part of the fix is to avoid node preallocation in radix_tree_preload() when passed gfp_mask doesn't allow waiting. Again, preallocation in such case doesn't make sense and when preallocation would happen in interrupt we could possibly leak some allocated nodes. However, some users of radix_tree_preload() require following radix_tree_insert() to succeed. To avoid unexpected effects for these users, radix_tree_preload() only warns if passed gfp mask doesn't allow waiting and we provide a new function radix_tree_maybe_preload() for those users which get different gfp mask from different call sites and which are prepared to handle radix_tree_insert() failure. Change-Id: Iab513ed95e8a98cd890e68a820181a8a915da9aa Signed-off-by: Jan Kara Cc: Jens Axboe Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- block/blk-ioc.c | 2 +- fs/fscache/page.c | 2 +- include/linux/radix-tree.h | 1 + lib/radix-tree.c | 41 ++++++++++++++++++++++++++++++++++++-- mm/filemap.c | 2 +- mm/shmem.c | 2 +- mm/swap_state.c | 4 ++-- 7 files changed, 46 insertions(+), 8 deletions(-) diff --git a/block/blk-ioc.c b/block/blk-ioc.c index fb95dd2f889a..57575aa81064 100644 --- a/block/blk-ioc.c +++ b/block/blk-ioc.c @@ -350,7 +350,7 @@ struct io_cq *ioc_create_icq(struct request_queue *q, gfp_t gfp_mask) if (!icq) return NULL; - if (radix_tree_preload(gfp_mask) < 0) { + if (radix_tree_maybe_preload(gfp_mask) < 0) { kmem_cache_free(et->icq_cache, icq); return NULL; } diff --git a/fs/fscache/page.c b/fs/fscache/page.c index c9d52e1a8fa9..5c7f4538e9e9 100644 --- a/fs/fscache/page.c +++ b/fs/fscache/page.c @@ -767,7 +767,7 @@ int __fscache_write_page(struct fscache_cookie *cookie, fscache_release_write_op); op->op.flags = FSCACHE_OP_ASYNC | (1 << FSCACHE_OP_WAITING); - ret = radix_tree_preload(gfp & ~__GFP_HIGHMEM); + ret = radix_tree_maybe_preload(gfp & ~__GFP_HIGHMEM); if (ret < 0) goto nomem_free; diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h index ffc444c38b0a..403940787be1 100644 --- a/include/linux/radix-tree.h +++ b/include/linux/radix-tree.h @@ -231,6 +231,7 @@ unsigned long radix_tree_next_hole(struct radix_tree_root *root, unsigned long radix_tree_prev_hole(struct radix_tree_root *root, unsigned long index, unsigned long max_scan); int radix_tree_preload(gfp_t gfp_mask); +int radix_tree_maybe_preload(gfp_t gfp_mask); void radix_tree_init(void); void *radix_tree_tag_set(struct radix_tree_root *root, unsigned long index, unsigned int tag); diff --git a/lib/radix-tree.c b/lib/radix-tree.c index 3ac50dc55638..40d9ad96cf60 100644 --- a/lib/radix-tree.c +++ b/lib/radix-tree.c @@ -32,6 +32,7 @@ #include #include #include +#include /* in_interrupt() */ #ifdef __KERNEL__ @@ -194,7 +195,12 @@ radix_tree_node_alloc(struct radix_tree_root *root) struct radix_tree_node *ret = NULL; gfp_t gfp_mask = root_gfp_mask(root); - if (!(gfp_mask & __GFP_WAIT)) { + /* + * Preload code isn't irq safe and it doesn't make sence to use + * preloading in the interrupt anyway as all the allocations have to + * be atomic. So just do normal allocation when in interrupt. + */ + if (!(gfp_mask & __GFP_WAIT) && !in_interrupt()) { struct radix_tree_preload *rtp; /* @@ -251,7 +257,7 @@ radix_tree_node_free(struct radix_tree_node *node) * To make use of this facility, the radix tree must be initialised without * __GFP_WAIT being passed to INIT_RADIX_TREE(). */ -int radix_tree_preload(gfp_t gfp_mask) +static int __radix_tree_preload(gfp_t gfp_mask) { struct radix_tree_preload *rtp; struct radix_tree_node *node; @@ -275,8 +281,39 @@ int radix_tree_preload(gfp_t gfp_mask) out: return ret; } + +/* + * Load up this CPU's radix_tree_node buffer with sufficient objects to + * ensure that the addition of a single element in the tree cannot fail. On + * success, return zero, with preemption disabled. On error, return -ENOMEM + * with preemption not disabled. + * + * To make use of this facility, the radix tree must be initialised without + * __GFP_WAIT being passed to INIT_RADIX_TREE(). + */ +int radix_tree_preload(gfp_t gfp_mask) +{ + /* Warn on non-sensical use... */ + WARN_ON_ONCE(!(gfp_mask & __GFP_WAIT)); + return __radix_tree_preload(gfp_mask); +} EXPORT_SYMBOL(radix_tree_preload); +/* + * The same as above function, except we don't guarantee preloading happens. + * We do it, if we decide it helps. On success, return zero with preemption + * disabled. On error, return -ENOMEM with preemption not disabled. + */ +int radix_tree_maybe_preload(gfp_t gfp_mask) +{ + if (gfp_mask & __GFP_WAIT) + return __radix_tree_preload(gfp_mask); + /* Preloading doesn't help anything with this gfp mask, skip it */ + preempt_disable(); + return 0; +} +EXPORT_SYMBOL(radix_tree_maybe_preload); + /* * Return the maximum key which can be store into a * radix tree with height HEIGHT. diff --git a/mm/filemap.c b/mm/filemap.c index 96efd16a29e7..f27d1dfc3ca1 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -453,7 +453,7 @@ int add_to_page_cache_locked(struct page *page, struct address_space *mapping, if (error) goto out; - error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM); + error = radix_tree_maybe_preload(gfp_mask & ~__GFP_HIGHMEM); if (error == 0) { page_cache_get(page); page->mapping = mapping; diff --git a/mm/shmem.c b/mm/shmem.c index 13479d8c5ff7..8bb9d3d7a885 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1205,7 +1205,7 @@ repeat: gfp & GFP_RECLAIM_MASK); if (error) goto decused; - error = radix_tree_preload(gfp & GFP_RECLAIM_MASK); + error = radix_tree_maybe_preload(gfp & GFP_RECLAIM_MASK); if (!error) { error = shmem_add_to_page_cache(page, mapping, index, gfp, NULL); diff --git a/mm/swap_state.c b/mm/swap_state.c index 79b1c893005f..86065b4d02a1 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -120,7 +120,7 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask) { int error; - error = radix_tree_preload(gfp_mask); + error = radix_tree_maybe_preload(gfp_mask); if (!error) { error = __add_to_swap_cache(page, entry); radix_tree_preload_end(); @@ -326,7 +326,7 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, /* * call radix_tree_preload() while we can wait. */ - err = radix_tree_preload(gfp_mask & GFP_KERNEL); + err = radix_tree_maybe_preload(gfp_mask & GFP_KERNEL); if (err) break; From d68e74aa21ef4a59b39bcf6e3ab55f9b65cf1adb Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Mon, 2 Dec 2013 11:24:19 +0000 Subject: [PATCH 27/54] security: shmem: implement kernel private shmem inodes We have a problem where the big_key key storage implementation uses a shmem backed inode to hold the key contents. Because of this detail of implementation LSM checks are being done between processes trying to read the keys and the tmpfs backed inode. The LSM checks are already being handled on the key interface level and should not be enforced at the inode level (since the inode is an implementation detail, not a part of the security model) This patch implements a new function shmem_kernel_file_setup() which returns the equivalent to shmem_file_setup() only the underlying inode has S_PRIVATE set. This means that all LSM checks for the inode in question are skipped. It should only be used for kernel internal operations where the inode is not exposed to userspace without proper LSM checking. It is possible that some other users of shmem_file_setup() should use the new interface, but this has not been explored. Reproducing this bug is a little bit difficult. The steps I used on Fedora are: (1) Turn off selinux enforcing: setenforce 0 (2) Create a huge key k=`dd if=/dev/zero bs=8192 count=1 | keyctl padd big_key test-key @s` (3) Access the key in another context: runcon system_u:system_r:httpd_t:s0-s0:c0.c1023 keyctl print $k >/dev/null (4) Examine the audit logs: ausearch -m AVC -i --subject httpd_t | audit2allow If the last command's output includes a line that looks like: allow httpd_t user_tmpfs_t:file { open read }; There was an inode check between httpd and the tmpfs filesystem. With this patch no such denial will be seen. (NOTE! you should clear your audit log if you have tested for this previously) (Please return you box to enforcing) Change-Id: I4ff037291f99acd965ac759c2147b52e75078ceb Signed-off-by: Eric Paris Signed-off-by: David Howells cc: Hugh Dickins cc: linux-mm@kvack.org --- include/linux/shmem_fs.h | 2 ++ mm/shmem.c | 36 +++++++++++++++++++++++++++++------- 2 files changed, 31 insertions(+), 7 deletions(-) diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index 72263bdafd4a..f2410d3354d4 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -47,6 +47,8 @@ extern int shmem_init(void); extern int shmem_fill_super(struct super_block *sb, void *data, int silent); extern struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags); +extern struct file *shmem_kernel_file_setup(const char *name, loff_t size, + unsigned long flags); extern int shmem_zero_setup(struct vm_area_struct *); extern int shmem_lock(struct file *file, int lock, struct user_struct *user); extern void shmem_unlock_mapping(struct address_space *mapping); diff --git a/mm/shmem.c b/mm/shmem.c index 8bb9d3d7a885..d888564825a3 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2976,13 +2976,8 @@ EXPORT_SYMBOL_GPL(shmem_truncate_range); /* common code */ -/** - * shmem_file_setup - get an unlinked file living in tmpfs - * @name: name for dentry (to be seen in /proc//maps - * @size: size to be set for the file - * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size - */ -struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags) +static struct file *__shmem_file_setup(const char *name, loff_t size, + unsigned long flags, unsigned int i_flags) { int error; struct file *file; @@ -3015,6 +3010,7 @@ struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags if (!inode) goto put_dentry; + inode->i_flags |= i_flags; d_instantiate(path.dentry, inode); inode->i_size = size; clear_nlink(inode); /* It is unlinked */ @@ -3038,6 +3034,32 @@ put_memory: shmem_unacct_size(flags, size); return ERR_PTR(error); } + +/** + * shmem_kernel_file_setup - get an unlinked file living in tmpfs which must be + * kernel internal. There will be NO LSM permission checks against the + * underlying inode. So users of this interface must do LSM checks at a + * higher layer. The one user is the big_key implementation. LSM checks + * are provided at the key level rather than the inode level. + * @name: name for dentry (to be seen in /proc//maps + * @size: size to be set for the file + * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size + */ +struct file *shmem_kernel_file_setup(const char *name, loff_t size, unsigned long flags) +{ + return __shmem_file_setup(name, size, flags, S_PRIVATE); +} + +/** + * shmem_file_setup - get an unlinked file living in tmpfs + * @name: name for dentry (to be seen in /proc//maps + * @size: size to be set for the file + * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size + */ +struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags) +{ + return __shmem_file_setup(name, size, flags, 0); +} EXPORT_SYMBOL_GPL(shmem_file_setup); void shmem_set_file(struct vm_area_struct *vma, struct file *file) From 87b163a96cae13e720100abbea0e61f10980c717 Mon Sep 17 00:00:00 2001 From: Aristeu Rozanski Date: Tue, 11 Sep 2012 16:28:11 -0400 Subject: [PATCH 28/54] fs: add missing documentation to simple_xattr functions v2: add function documentation instead of adding a separate file under Documentation/ tj: Updated comment a bit and rolled in Randy's suggestions. Change-Id: I2be6be5f4987127c354e2cca490876085ceaea45 Cc: Li Zefan Cc: Tejun Heo Cc: Hugh Dickins Cc: Hillf Danton Cc: Lennart Poettering Cc: Randy Dunlap Signed-off-by: Aristeu Rozanski Signed-off-by: Tejun Heo --- fs/xattr.c | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/fs/xattr.c b/fs/xattr.c index 78ed976ab548..75c784d4ee81 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -884,8 +884,19 @@ out: } -/* - * xattr SET operation for in-memory/pseudo filesystems +/** + * simple_xattr_set - xattr SET operation for in-memory/pseudo filesystems + * @xattrs: target simple_xattr list + * @name: name of the new extended attribute + * @value: value of the new xattr. If %NULL, will remove the attribute + * @size: size of the new xattr + * @flags: %XATTR_{CREATE|REPLACE} + * + * %XATTR_CREATE is set, the xattr shouldn't exist already; otherwise fails + * with -EEXIST. If %XATTR_REPLACE is set, the xattr should exist; + * otherwise, fails with -ENODATA. + * + * Returns 0 on success, -errno on failure. */ int simple_xattr_set(struct simple_xattrs *xattrs, const char *name, const void *value, size_t size, int flags) @@ -942,6 +953,9 @@ ssize_t simple_xattr_list(struct simple_xattrs *xattrs, char *buffer, return used; } +/* + * Adds an extended attribute to the list + */ void simple_xattr_list_add(struct simple_xattrs *xattrs, struct simple_xattr *new_xattr) { From 4a715aabf5288d9b0e11e913ef72821c29094161 Mon Sep 17 00:00:00 2001 From: Aristeu Rozanski Date: Wed, 12 Sep 2012 10:31:13 -0400 Subject: [PATCH 29/54] xattr: mark variable as uninitialized to make both gcc and smatch happy new_xattr in __simple_xattr_set() is only initialized with a valid pointer if value is not NULL, which only happens if this function is called directly with the intention to remove an existing extended attribute. Even being safe to be this way, smatch warns about possible NULL dereference. Dan Carpenter suggested using uninitialized_var() which will make both gcc and smatch happy. Change-Id: I0a7ff73cd9bda419961e8421b1472ed14a64f71c Cc: Fengguang Wu Cc: Dan Carpenter Signed-off-by: Aristeu Rozanski Signed-off-by: Tejun Heo --- fs/xattr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/xattr.c b/fs/xattr.c index 75c784d4ee81..87a0e9449a97 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -837,7 +837,7 @@ static int __simple_xattr_set(struct simple_xattrs *xattrs, const char *name, const void *value, size_t size, int flags) { struct simple_xattr *xattr; - struct simple_xattr *new_xattr = NULL; + struct simple_xattr *uninitialized_var(new_xattr); int err = 0; /* value == NULL means remove */ From 9e2cb26fd88048d047357967279e73ad92f8735c Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Tue, 7 Feb 2012 18:52:57 -0800 Subject: [PATCH 30/54] userns: Convert vfs posix_acl support to use kuids and kgids - In setxattr if we are setting a posix acl convert uids and gids from the current user namespace into the initial user namespace, before the xattrs are passed to the underlying filesystem. Untranslatable uids and gids are represented as -1 which posix_acl_from_xattr will represent as INVALID_UID or INVALID_GID. posix_acl_valid will fail if an acl from userspace has any INVALID_UID or INVALID_GID values. In net this guarantees that untranslatable posix acls will not be stored by filesystems. - In getxattr if we are reading a posix acl convert uids and gids from the initial user namespace into the current user namespace. Uids and gids that can not be tranlsated into the current user namespace will be represented as -1. - Replace e_id in struct posix_acl_entry with an anymouns union of e_uid and e_gid. For the short term retain the e_id field until all of the users are converted. - Don't set struct posix_acl.e_id in the cases where the acl type does not use e_id. Greatly reducing the use of ACL_UNDEFINED_ID. - Rework the ordering checks in posix_acl_valid so that I use kuid_t and kgid_t types throughout the code, and so that I don't need arithmetic on uid and gid types. Change-Id: If5ca5f58195e29b3c8cb51bd7769c980dc7da3a4 Cc: Theodore Tso Cc: Andrew Morton Cc: Andreas Dilger Cc: Jan Kara Cc: Al Viro Signed-off-by: Eric W. Biederman --- fs/posix_acl.c | 30 +++++------ fs/xattr.c | 7 +++ fs/xattr_acl.c | 90 +++++++++++++++++++++++++++++++-- include/linux/posix_acl.h | 8 ++- include/linux/posix_acl_xattr.h | 12 +++++ 5 files changed, 126 insertions(+), 21 deletions(-) diff --git a/fs/posix_acl.c b/fs/posix_acl.c index 1c61b8d58f9f..082c3cc09b63 100644 --- a/fs/posix_acl.c +++ b/fs/posix_acl.c @@ -78,7 +78,8 @@ posix_acl_valid(const struct posix_acl *acl) { const struct posix_acl_entry *pa, *pe; int state = ACL_USER_OBJ; - unsigned int id = 0; /* keep gcc happy */ + kuid_t prev_uid = INVALID_UID; + kgid_t prev_gid = INVALID_GID; int needs_mask = 0; FOREACH_ACL_ENTRY(pa, acl, pe) { @@ -87,7 +88,6 @@ posix_acl_valid(const struct posix_acl *acl) switch (pa->e_tag) { case ACL_USER_OBJ: if (state == ACL_USER_OBJ) { - id = 0; state = ACL_USER; break; } @@ -96,16 +96,17 @@ posix_acl_valid(const struct posix_acl *acl) case ACL_USER: if (state != ACL_USER) return -EINVAL; - if (pa->e_id == ACL_UNDEFINED_ID || - pa->e_id < id) + if (!uid_valid(pa->e_uid)) return -EINVAL; - id = pa->e_id + 1; + if (uid_valid(prev_uid) && + uid_lte(pa->e_uid, prev_uid)) + return -EINVAL; + prev_uid = pa->e_uid; needs_mask = 1; break; case ACL_GROUP_OBJ: if (state == ACL_USER) { - id = 0; state = ACL_GROUP; break; } @@ -114,10 +115,12 @@ posix_acl_valid(const struct posix_acl *acl) case ACL_GROUP: if (state != ACL_GROUP) return -EINVAL; - if (pa->e_id == ACL_UNDEFINED_ID || - pa->e_id < id) + if (!gid_valid(pa->e_gid)) return -EINVAL; - id = pa->e_id + 1; + if (gid_valid(prev_gid) && + gid_lte(pa->e_gid, prev_gid)) + return -EINVAL; + prev_gid = pa->e_gid; needs_mask = 1; break; @@ -201,15 +204,12 @@ posix_acl_from_mode(umode_t mode, gfp_t flags) return ERR_PTR(-ENOMEM); acl->a_entries[0].e_tag = ACL_USER_OBJ; - acl->a_entries[0].e_id = ACL_UNDEFINED_ID; acl->a_entries[0].e_perm = (mode & S_IRWXU) >> 6; acl->a_entries[1].e_tag = ACL_GROUP_OBJ; - acl->a_entries[1].e_id = ACL_UNDEFINED_ID; acl->a_entries[1].e_perm = (mode & S_IRWXG) >> 3; acl->a_entries[2].e_tag = ACL_OTHER; - acl->a_entries[2].e_id = ACL_UNDEFINED_ID; acl->a_entries[2].e_perm = (mode & S_IRWXO); return acl; } @@ -230,11 +230,11 @@ posix_acl_permission(struct inode *inode, const struct posix_acl *acl, int want) switch(pa->e_tag) { case ACL_USER_OBJ: /* (May have been checked already) */ - if (inode->i_uid == current_fsuid()) + if (uid_eq(inode->i_uid, current_fsuid())) goto check_perm; break; case ACL_USER: - if (pa->e_id == current_fsuid()) + if (uid_eq(pa->e_uid, current_fsuid())) goto mask; break; case ACL_GROUP_OBJ: @@ -245,7 +245,7 @@ posix_acl_permission(struct inode *inode, const struct posix_acl *acl, int want) } break; case ACL_GROUP: - if (in_group_p(pa->e_id)) { + if (in_group_p(pa->e_gid)) { found = 1; if ((pa->e_perm & want) == want) goto mask; diff --git a/fs/xattr.c b/fs/xattr.c index 87a0e9449a97..be25ff4f7399 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -20,6 +20,7 @@ #include #include #include +#include #include @@ -347,6 +348,9 @@ setxattr(struct dentry *d, const char __user *name, const void __user *value, error = -EFAULT; goto out; } + if ((strcmp(kname, XATTR_NAME_POSIX_ACL_ACCESS) == 0) || + (strcmp(kname, XATTR_NAME_POSIX_ACL_DEFAULT) == 0)) + posix_acl_fix_xattr_from_user(kvalue, size); } error = vfs_setxattr(d, kname, kvalue, size, flags); @@ -445,6 +449,9 @@ getxattr(struct dentry *d, const char __user *name, void __user *value, error = vfs_getxattr(d, kname, kvalue, size); if (error > 0) { + if ((strcmp(kname, XATTR_NAME_POSIX_ACL_ACCESS) == 0) || + (strcmp(kname, XATTR_NAME_POSIX_ACL_DEFAULT) == 0)) + posix_acl_fix_xattr_to_user(kvalue, size); if (size && copy_to_user(value, kvalue, error)) error = -EFAULT; } else if (error == -ERANGE && size >= XATTR_SIZE_MAX) { diff --git a/fs/xattr_acl.c b/fs/xattr_acl.c index 69d06b07b169..bf472ca1b348 100644 --- a/fs/xattr_acl.c +++ b/fs/xattr_acl.c @@ -9,7 +9,65 @@ #include #include #include +#include +/* + * Fix up the uids and gids in posix acl extended attributes in place. + */ +static void posix_acl_fix_xattr_userns( + struct user_namespace *to, struct user_namespace *from, + void *value, size_t size) +{ + posix_acl_xattr_header *header = (posix_acl_xattr_header *)value; + posix_acl_xattr_entry *entry = (posix_acl_xattr_entry *)(header+1), *end; + int count; + kuid_t uid; + kgid_t gid; + + if (!value) + return; + if (size < sizeof(posix_acl_xattr_header)) + return; + if (header->a_version != cpu_to_le32(POSIX_ACL_XATTR_VERSION)) + return; + + count = posix_acl_xattr_count(size); + if (count < 0) + return; + if (count == 0) + return; + + for (end = entry + count; entry != end; entry++) { + switch(le16_to_cpu(entry->e_tag)) { + case ACL_USER: + uid = make_kuid(from, le32_to_cpu(entry->e_id)); + entry->e_id = cpu_to_le32(from_kuid(to, uid)); + break; + case ACL_GROUP: + gid = make_kgid(from, le32_to_cpu(entry->e_id)); + entry->e_id = cpu_to_le32(from_kuid(to, uid)); + break; + default: + break; + } + } +} + +void posix_acl_fix_xattr_from_user(void *value, size_t size) +{ + struct user_namespace *user_ns = current_user_ns(); + if (user_ns == &init_user_ns) + return; + posix_acl_fix_xattr_userns(&init_user_ns, user_ns, value, size); +} + +void posix_acl_fix_xattr_to_user(void *value, size_t size) +{ + struct user_namespace *user_ns = current_user_ns(); + if (user_ns == &init_user_ns) + return; + posix_acl_fix_xattr_userns(user_ns, &init_user_ns, value, size); +} /* * Convert from extended attribute to in-memory representation. @@ -50,12 +108,21 @@ posix_acl_from_xattr(const void *value, size_t size) case ACL_GROUP_OBJ: case ACL_MASK: case ACL_OTHER: - acl_e->e_id = ACL_UNDEFINED_ID; break; case ACL_USER: + acl_e->e_uid = + make_kuid(&init_user_ns, + le32_to_cpu(entry->e_id)); + if (!uid_valid(acl_e->e_uid)) + goto fail; + break; case ACL_GROUP: - acl_e->e_id = le32_to_cpu(entry->e_id); + acl_e->e_gid = + make_kgid(&init_user_ns, + le32_to_cpu(entry->e_id)); + if (!gid_valid(acl_e->e_gid)) + goto fail; break; default: @@ -89,9 +156,22 @@ posix_acl_to_xattr(const struct posix_acl *acl, void *buffer, size_t size) ext_acl->a_version = cpu_to_le32(POSIX_ACL_XATTR_VERSION); for (n=0; n < acl->a_count; n++, ext_entry++) { - ext_entry->e_tag = cpu_to_le16(acl->a_entries[n].e_tag); - ext_entry->e_perm = cpu_to_le16(acl->a_entries[n].e_perm); - ext_entry->e_id = cpu_to_le32(acl->a_entries[n].e_id); + const struct posix_acl_entry *acl_e = &acl->a_entries[n]; + ext_entry->e_tag = cpu_to_le16(acl_e->e_tag); + ext_entry->e_perm = cpu_to_le16(acl_e->e_perm); + switch(acl_e->e_tag) { + case ACL_USER: + ext_entry->e_id = + cpu_to_le32(from_kuid(&init_user_ns, acl_e->e_uid)); + break; + case ACL_GROUP: + ext_entry->e_id = + cpu_to_le32(from_kgid(&init_user_ns, acl_e->e_gid)); + break; + default: + ext_entry->e_id = cpu_to_le32(ACL_UNDEFINED_ID); + break; + } } return real_size; } diff --git a/include/linux/posix_acl.h b/include/linux/posix_acl.h index 9c4bbc29e30a..2ae0bba45f12 100644 --- a/include/linux/posix_acl.h +++ b/include/linux/posix_acl.h @@ -36,7 +36,13 @@ struct posix_acl_entry { short e_tag; unsigned short e_perm; - unsigned int e_id; + union { + kuid_t e_uid; + kgid_t e_gid; +#ifndef CONFIG_UIDGID_STRICT_TYPE_CHECKS + unsigned int e_id; +#endif + }; }; struct posix_acl { diff --git a/include/linux/posix_acl_xattr.h b/include/linux/posix_acl_xattr.h index 6e53c34035cd..8bd5fcf06916 100644 --- a/include/linux/posix_acl_xattr.h +++ b/include/linux/posix_acl_xattr.h @@ -52,6 +52,18 @@ posix_acl_xattr_count(size_t size) return size / sizeof(posix_acl_xattr_entry); } +#ifdef CONFIG_FS_POSIX_ACL +void posix_acl_fix_xattr_from_user(void *value, size_t size); +void posix_acl_fix_xattr_to_user(void *value, size_t size); +#else +static inline void posix_acl_fix_xattr_from_user(void *value, size_t size) +{ +} +static inline void posix_acl_fix_xattr_to_user(void *value, size_t size) +{ +} +#endif + struct posix_acl *posix_acl_from_xattr(const void *value, size_t size); int posix_acl_to_xattr(const struct posix_acl *acl, void *buffer, size_t size); From c859ce625157db6dd636e1be054b4d3c0c339dce Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 10 Sep 2012 20:17:44 -0700 Subject: [PATCH 31/54] userns: Pass a userns parameter into posix_acl_to_xattr and posix_acl_from_xattr - Pass the user namespace the uid and gid values in the xattr are stored in into posix_acl_from_xattr. - Pass the user namespace kuid and kgid values should be converted into when storing uid and gid values in an xattr in posix_acl_to_xattr. - Modify all callers of posix_acl_from_xattr and posix_acl_to_xattr to pass in &init_user_ns. In the short term this change is not strictly needed but it makes the code clearer. In the longer term this change is necessary to be able to mount filesystems outside of the initial user namespace that natively store posix acls in the linux xattr format. Change-Id: I7c2b18f16ec9d7ded49135cedc2e91a71e078087 Cc: Theodore Tso Cc: Andrew Morton Cc: Andreas Dilger Cc: Jan Kara Cc: Al Viro Signed-off-by: "Eric W. Biederman" --- fs/9p/acl.c | 8 ++++---- fs/btrfs/acl.c | 8 ++++---- fs/ext2/acl.c | 4 ++-- fs/ext3/acl.c | 4 ++-- fs/ext4/acl.c | 4 ++-- fs/f2fs/acl.c | 4 ++-- fs/generic_acl.c | 4 ++-- fs/gfs2/acl.c | 14 +++++++------- fs/jffs2/acl.c | 4 ++-- fs/jfs/acl.c | 4 ++-- fs/jfs/xattr.c | 4 ++-- fs/nfs/nfs3acl.c | 4 ++-- fs/nfsd/vfs.c | 8 ++++---- fs/ocfs2/acl.c | 4 ++-- fs/reiserfs/xattr_acl.c | 4 ++-- fs/xattr_acl.c | 14 ++++++++------ fs/xfs/xfs_acl.c | 4 ++-- include/linux/posix_acl_xattr.h | 6 ++++-- 18 files changed, 55 insertions(+), 51 deletions(-) diff --git a/fs/9p/acl.c b/fs/9p/acl.c index a4188cfcc9f9..27886efe87f3 100644 --- a/fs/9p/acl.c +++ b/fs/9p/acl.c @@ -37,7 +37,7 @@ static struct posix_acl *__v9fs_get_acl(struct p9_fid *fid, char *name) return ERR_PTR(-ENOMEM); size = v9fs_fid_xattr_get(fid, name, value, size); if (size > 0) { - acl = posix_acl_from_xattr(value, size); + acl = posix_acl_from_xattr(&init_user_ns, value, size); if (IS_ERR(acl)) goto err_out; } @@ -131,7 +131,7 @@ static int v9fs_set_acl(struct dentry *dentry, int type, struct posix_acl *acl) buffer = kmalloc(size, GFP_KERNEL); if (!buffer) return -ENOMEM; - retval = posix_acl_to_xattr(acl, buffer, size); + retval = posix_acl_to_xattr(&init_user_ns, acl, buffer, size); if (retval < 0) goto err_free_out; switch (type) { @@ -251,7 +251,7 @@ static int v9fs_xattr_get_acl(struct dentry *dentry, const char *name, return PTR_ERR(acl); if (acl == NULL) return -ENODATA; - error = posix_acl_to_xattr(acl, buffer, size); + error = posix_acl_to_xattr(&init_user_ns, acl, buffer, size); posix_acl_release(acl); return error; @@ -304,7 +304,7 @@ static int v9fs_xattr_set_acl(struct dentry *dentry, const char *name, return -EPERM; if (value) { /* update the cached acl value */ - acl = posix_acl_from_xattr(value, size); + acl = posix_acl_from_xattr(&init_user_ns, value, size); if (IS_ERR(acl)) return PTR_ERR(acl); else if (acl) { diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c index 9f55b545ea44..0b537ad2f704 100644 --- a/fs/btrfs/acl.c +++ b/fs/btrfs/acl.c @@ -61,7 +61,7 @@ struct posix_acl *btrfs_get_acl(struct inode *inode, int type) size = __btrfs_getxattr(inode, name, value, size); } if (size > 0) { - acl = posix_acl_from_xattr(value, size); + acl = posix_acl_from_xattr(&init_user_ns, value, size); } else if (size == -ENOENT || size == -ENODATA || size == 0) { /* FIXME, who returns -ENOENT? I think nobody */ acl = NULL; @@ -91,7 +91,7 @@ static int btrfs_xattr_acl_get(struct dentry *dentry, const char *name, return PTR_ERR(acl); if (acl == NULL) return -ENODATA; - ret = posix_acl_to_xattr(acl, value, size); + ret = posix_acl_to_xattr(&init_user_ns, acl, value, size); posix_acl_release(acl); return ret; @@ -141,7 +141,7 @@ static int btrfs_set_acl(struct btrfs_trans_handle *trans, goto out; } - ret = posix_acl_to_xattr(acl, value, size); + ret = posix_acl_to_xattr(&init_user_ns, acl, value, size); if (ret < 0) goto out; } @@ -169,7 +169,7 @@ static int btrfs_xattr_acl_set(struct dentry *dentry, const char *name, return -EOPNOTSUPP; if (value) { - acl = posix_acl_from_xattr(value, size); + acl = posix_acl_from_xattr(&init_user_ns, value, size); if (IS_ERR(acl)) return PTR_ERR(acl); diff --git a/fs/ext2/acl.c b/fs/ext2/acl.c index e38a9b61af3f..3b067882ae9d 100644 --- a/fs/ext2/acl.c +++ b/fs/ext2/acl.c @@ -346,7 +346,7 @@ ext2_xattr_get_acl(struct dentry *dentry, const char *name, void *buffer, return PTR_ERR(acl); if (acl == NULL) return -ENODATA; - error = posix_acl_to_xattr(acl, buffer, size); + error = posix_acl_to_xattr(&init_user_ns, acl, buffer, size); posix_acl_release(acl); return error; @@ -367,7 +367,7 @@ ext2_xattr_set_acl(struct dentry *dentry, const char *name, const void *value, return -EPERM; if (value) { - acl = posix_acl_from_xattr(value, size); + acl = posix_acl_from_xattr(&init_user_ns, value, size); if (IS_ERR(acl)) return PTR_ERR(acl); else if (acl) { diff --git a/fs/ext3/acl.c b/fs/ext3/acl.c index f3326ca34159..e8190c21dab5 100644 --- a/fs/ext3/acl.c +++ b/fs/ext3/acl.c @@ -365,7 +365,7 @@ ext3_xattr_get_acl(struct dentry *dentry, const char *name, void *buffer, return PTR_ERR(acl); if (acl == NULL) return -ENODATA; - error = posix_acl_to_xattr(acl, buffer, size); + error = posix_acl_to_xattr(&init_user_ns, acl, buffer, size); posix_acl_release(acl); return error; @@ -388,7 +388,7 @@ ext3_xattr_set_acl(struct dentry *dentry, const char *name, const void *value, return -EPERM; if (value) { - acl = posix_acl_from_xattr(value, size); + acl = posix_acl_from_xattr(&init_user_ns, value, size); if (IS_ERR(acl)) return PTR_ERR(acl); else if (acl) { diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c index 5d419a496d96..1b24291b6764 100644 --- a/fs/ext4/acl.c +++ b/fs/ext4/acl.c @@ -370,7 +370,7 @@ ext4_xattr_get_acl(struct dentry *dentry, const char *name, void *buffer, return PTR_ERR(acl); if (acl == NULL) return -ENODATA; - error = posix_acl_to_xattr(acl, buffer, size); + error = posix_acl_to_xattr(&init_user_ns, acl, buffer, size); posix_acl_release(acl); return error; @@ -393,7 +393,7 @@ ext4_xattr_set_acl(struct dentry *dentry, const char *name, const void *value, return -EPERM; if (value) { - acl = posix_acl_from_xattr(value, size); + acl = posix_acl_from_xattr(&init_user_ns, value, size); if (IS_ERR(acl)) return PTR_ERR(acl); else if (acl) { diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c index 1ad03916074a..f295fbe3bf81 100644 --- a/fs/f2fs/acl.c +++ b/fs/f2fs/acl.c @@ -343,7 +343,7 @@ static int f2fs_xattr_get_acl(struct dentry *dentry, const char *name, return PTR_ERR(acl); if (!acl) return -ENODATA; - error = posix_acl_to_xattr(acl, buffer, size); + error = posix_acl_to_xattr(&init_user_ns, acl, buffer, size); posix_acl_release(acl); return error; @@ -365,7 +365,7 @@ static int f2fs_xattr_set_acl(struct dentry *dentry, const char *name, return -EPERM; if (value) { - acl = posix_acl_from_xattr(value, size); + acl = posix_acl_from_xattr(&init_user_ns, value, size); if (IS_ERR(acl)) return PTR_ERR(acl); if (acl) { diff --git a/fs/generic_acl.c b/fs/generic_acl.c index e9c0746e8205..21408084c3b3 100644 --- a/fs/generic_acl.c +++ b/fs/generic_acl.c @@ -56,7 +56,7 @@ generic_acl_get(struct dentry *dentry, const char *name, void *buffer, acl = get_cached_acl(dentry->d_inode, type); if (!acl) return -ENODATA; - error = posix_acl_to_xattr(acl, buffer, size); + error = posix_acl_to_xattr(&init_user_ns, acl, buffer, size); posix_acl_release(acl); return error; @@ -77,7 +77,7 @@ generic_acl_set(struct dentry *dentry, const char *name, const void *value, if (!inode_owner_or_capable(inode)) return -EPERM; if (value) { - acl = posix_acl_from_xattr(value, size); + acl = posix_acl_from_xattr(&init_user_ns, value, size); if (IS_ERR(acl)) return PTR_ERR(acl); } diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c index 846611f2e415..ea4b4585ae11 100644 --- a/fs/gfs2/acl.c +++ b/fs/gfs2/acl.c @@ -63,7 +63,7 @@ struct posix_acl *gfs2_get_acl(struct inode *inode, int type) if (len == 0) return NULL; - acl = posix_acl_from_xattr(data, len); + acl = posix_acl_from_xattr(&init_user_ns, data, len); kfree(data); return acl; } @@ -92,13 +92,13 @@ static int gfs2_acl_set(struct inode *inode, int type, struct posix_acl *acl) const char *name = gfs2_acl_name(type); BUG_ON(name == NULL); - len = posix_acl_to_xattr(acl, NULL, 0); + len = posix_acl_to_xattr(&init_user_ns, acl, NULL, 0); if (len == 0) return 0; data = kmalloc(len, GFP_NOFS); if (data == NULL) return -ENOMEM; - error = posix_acl_to_xattr(acl, data, len); + error = posix_acl_to_xattr(&init_user_ns, acl, data, len); if (error < 0) goto out; error = __gfs2_xattr_set(inode, name, data, len, 0, GFS2_EATYPE_SYS); @@ -172,12 +172,12 @@ int gfs2_acl_chmod(struct gfs2_inode *ip, struct iattr *attr) if (error) return error; - len = posix_acl_to_xattr(acl, NULL, 0); + len = posix_acl_to_xattr(&init_user_ns, acl, NULL, 0); data = kmalloc(len, GFP_NOFS); error = -ENOMEM; if (data == NULL) goto out; - posix_acl_to_xattr(acl, data, len); + posix_acl_to_xattr(&init_user_ns, acl, data, len); error = gfs2_xattr_acl_chmod(ip, attr, data); kfree(data); set_cached_acl(&ip->i_inode, ACL_TYPE_ACCESS, acl); @@ -218,7 +218,7 @@ static int gfs2_xattr_system_get(struct dentry *dentry, const char *name, if (acl == NULL) return -ENODATA; - error = posix_acl_to_xattr(acl, buffer, size); + error = posix_acl_to_xattr(&init_user_ns, acl, buffer, size); posix_acl_release(acl); return error; @@ -251,7 +251,7 @@ static int gfs2_xattr_system_set(struct dentry *dentry, const char *name, if (!value) goto set_acl; - acl = posix_acl_from_xattr(value, size); + acl = posix_acl_from_xattr(&init_user_ns, value, size); if (!acl) { /* * acl_set_file(3) may request that we set default ACLs with diff --git a/fs/jffs2/acl.c b/fs/jffs2/acl.c index 398e5c71c502..85b3c4c01963 100644 --- a/fs/jffs2/acl.c +++ b/fs/jffs2/acl.c @@ -362,7 +362,7 @@ static int jffs2_acl_getxattr(struct dentry *dentry, const char *name, return PTR_ERR(acl); if (!acl) return -ENODATA; - rc = posix_acl_to_xattr(acl, buffer, size); + rc = posix_acl_to_xattr(&init_user_ns, acl, buffer, size); posix_acl_release(acl); return rc; @@ -380,7 +380,7 @@ static int jffs2_acl_setxattr(struct dentry *dentry, const char *name, return -EPERM; if (value) { - acl = posix_acl_from_xattr(value, size); + acl = posix_acl_from_xattr(&init_user_ns, value, size); if (IS_ERR(acl)) return PTR_ERR(acl); if (acl) { diff --git a/fs/jfs/acl.c b/fs/jfs/acl.c index 45559dc3ea2f..d254d6d35995 100644 --- a/fs/jfs/acl.c +++ b/fs/jfs/acl.c @@ -64,7 +64,7 @@ struct posix_acl *jfs_get_acl(struct inode *inode, int type) else acl = ERR_PTR(size); } else { - acl = posix_acl_from_xattr(value, size); + acl = posix_acl_from_xattr(&init_user_ns, value, size); } kfree(value); if (!IS_ERR(acl)) @@ -100,7 +100,7 @@ static int jfs_set_acl(tid_t tid, struct inode *inode, int type, value = kmalloc(size, GFP_KERNEL); if (!value) return -ENOMEM; - rc = posix_acl_to_xattr(acl, value, size); + rc = posix_acl_to_xattr(&init_user_ns, acl, value, size); if (rc < 0) goto out; } diff --git a/fs/jfs/xattr.c b/fs/jfs/xattr.c index 41a0735314fe..c79b1d7a53e2 100644 --- a/fs/jfs/xattr.c +++ b/fs/jfs/xattr.c @@ -685,7 +685,7 @@ static int can_set_system_xattr(struct inode *inode, const char *name, * POSIX_ACL_XATTR_ACCESS is tied to i_mode */ if (strcmp(name, POSIX_ACL_XATTR_ACCESS) == 0) { - acl = posix_acl_from_xattr(value, value_len); + acl = posix_acl_from_xattr(&init_user_ns, value, value_len); if (IS_ERR(acl)) { rc = PTR_ERR(acl); printk(KERN_ERR "posix_acl_from_xattr returned %d\n", @@ -710,7 +710,7 @@ static int can_set_system_xattr(struct inode *inode, const char *name, return 0; } else if (strcmp(name, POSIX_ACL_XATTR_DEFAULT) == 0) { - acl = posix_acl_from_xattr(value, value_len); + acl = posix_acl_from_xattr(&init_user_ns, value, value_len); if (IS_ERR(acl)) { rc = PTR_ERR(acl); printk(KERN_ERR "posix_acl_from_xattr returned %d\n", diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c index e4498dc351a8..4a1aafba6a20 100644 --- a/fs/nfs/nfs3acl.c +++ b/fs/nfs/nfs3acl.c @@ -70,7 +70,7 @@ ssize_t nfs3_getxattr(struct dentry *dentry, const char *name, if (type == ACL_TYPE_ACCESS && acl->a_count == 0) error = -ENODATA; else - error = posix_acl_to_xattr(acl, buffer, size); + error = posix_acl_to_xattr(&init_user_ns, acl, buffer, size); posix_acl_release(acl); } else error = -ENODATA; @@ -92,7 +92,7 @@ int nfs3_setxattr(struct dentry *dentry, const char *name, else return -EOPNOTSUPP; - acl = posix_acl_from_xattr(value, size); + acl = posix_acl_from_xattr(&init_user_ns, value, size); if (IS_ERR(acl)) return PTR_ERR(acl); error = nfs3_proc_setacl(inode, type, acl); diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 8ace1bf90a99..ed90f834ef38 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -517,7 +517,7 @@ set_nfsv4_acl_one(struct dentry *dentry, struct posix_acl *pacl, char *key) if (buf == NULL) goto out; - len = posix_acl_to_xattr(pacl, buf, buflen); + len = posix_acl_to_xattr(&init_user_ns, pacl, buf, buflen); if (len < 0) { error = len; goto out; @@ -586,7 +586,7 @@ _get_posix_acl(struct dentry *dentry, char *key) if (buflen <= 0) return ERR_PTR(buflen); - pacl = posix_acl_from_xattr(buf, buflen); + pacl = posix_acl_from_xattr(&init_user_ns, buf, buflen); kfree(buf); return pacl; } @@ -2299,7 +2299,7 @@ nfsd_get_posix_acl(struct svc_fh *fhp, int type) if (size < 0) return ERR_PTR(size); - acl = posix_acl_from_xattr(value, size); + acl = posix_acl_from_xattr(&init_user_ns, value, size); kfree(value); return acl; } @@ -2332,7 +2332,7 @@ nfsd_set_posix_acl(struct svc_fh *fhp, int type, struct posix_acl *acl) value = kmalloc(size, GFP_KERNEL); if (!value) return -ENOMEM; - error = posix_acl_to_xattr(acl, value, size); + error = posix_acl_to_xattr(&init_user_ns, acl, value, size); if (error < 0) goto getout; size = error; diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c index eed868e9a89d..3283555e4701 100644 --- a/fs/ocfs2/acl.c +++ b/fs/ocfs2/acl.c @@ -446,7 +446,7 @@ static int ocfs2_xattr_get_acl(struct dentry *dentry, const char *name, return PTR_ERR(acl); if (acl == NULL) return -ENODATA; - ret = posix_acl_to_xattr(acl, buffer, size); + ret = posix_acl_to_xattr(&init_user_ns, acl, buffer, size); posix_acl_release(acl); return ret; @@ -469,7 +469,7 @@ static int ocfs2_xattr_set_acl(struct dentry *dentry, const char *name, return -EPERM; if (value) { - acl = posix_acl_from_xattr(value, size); + acl = posix_acl_from_xattr(&init_user_ns, value, size); if (IS_ERR(acl)) return PTR_ERR(acl); else if (acl) { diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c index 502254b64324..690d5457af2d 100644 --- a/fs/reiserfs/xattr_acl.c +++ b/fs/reiserfs/xattr_acl.c @@ -30,7 +30,7 @@ posix_acl_set(struct dentry *dentry, const char *name, const void *value, return -EPERM; if (value) { - acl = posix_acl_from_xattr(value, size); + acl = posix_acl_from_xattr(&init_user_ns, value, size); if (IS_ERR(acl)) { return PTR_ERR(acl); } else if (acl) { @@ -77,7 +77,7 @@ posix_acl_get(struct dentry *dentry, const char *name, void *buffer, return PTR_ERR(acl); if (acl == NULL) return -ENODATA; - error = posix_acl_to_xattr(acl, buffer, size); + error = posix_acl_to_xattr(&init_user_ns, acl, buffer, size); posix_acl_release(acl); return error; diff --git a/fs/xattr_acl.c b/fs/xattr_acl.c index bf472ca1b348..11efd830b5f5 100644 --- a/fs/xattr_acl.c +++ b/fs/xattr_acl.c @@ -73,7 +73,8 @@ void posix_acl_fix_xattr_to_user(void *value, size_t size) * Convert from extended attribute to in-memory representation. */ struct posix_acl * -posix_acl_from_xattr(const void *value, size_t size) +posix_acl_from_xattr(struct user_namespace *user_ns, + const void *value, size_t size) { posix_acl_xattr_header *header = (posix_acl_xattr_header *)value; posix_acl_xattr_entry *entry = (posix_acl_xattr_entry *)(header+1), *end; @@ -112,14 +113,14 @@ posix_acl_from_xattr(const void *value, size_t size) case ACL_USER: acl_e->e_uid = - make_kuid(&init_user_ns, + make_kuid(user_ns, le32_to_cpu(entry->e_id)); if (!uid_valid(acl_e->e_uid)) goto fail; break; case ACL_GROUP: acl_e->e_gid = - make_kgid(&init_user_ns, + make_kgid(user_ns, le32_to_cpu(entry->e_id)); if (!gid_valid(acl_e->e_gid)) goto fail; @@ -141,7 +142,8 @@ EXPORT_SYMBOL (posix_acl_from_xattr); * Convert from in-memory to extended attribute representation. */ int -posix_acl_to_xattr(const struct posix_acl *acl, void *buffer, size_t size) +posix_acl_to_xattr(struct user_namespace *user_ns, const struct posix_acl *acl, + void *buffer, size_t size) { posix_acl_xattr_header *ext_acl = (posix_acl_xattr_header *)buffer; posix_acl_xattr_entry *ext_entry = ext_acl->a_entries; @@ -162,11 +164,11 @@ posix_acl_to_xattr(const struct posix_acl *acl, void *buffer, size_t size) switch(acl_e->e_tag) { case ACL_USER: ext_entry->e_id = - cpu_to_le32(from_kuid(&init_user_ns, acl_e->e_uid)); + cpu_to_le32(from_kuid(user_ns, acl_e->e_uid)); break; case ACL_GROUP: ext_entry->e_id = - cpu_to_le32(from_kgid(&init_user_ns, acl_e->e_gid)); + cpu_to_le32(from_kgid(user_ns, acl_e->e_gid)); break; default: ext_entry->e_id = cpu_to_le32(ACL_UNDEFINED_ID); diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c index c0f97c319b75..fb853c7b2b75 100644 --- a/fs/xfs/xfs_acl.c +++ b/fs/xfs/xfs_acl.c @@ -337,7 +337,7 @@ xfs_xattr_acl_get(struct dentry *dentry, const char *name, if (acl == NULL) return -ENODATA; - error = posix_acl_to_xattr(acl, value, size); + error = posix_acl_to_xattr(&init_user_ns, acl, value, size); posix_acl_release(acl); return error; @@ -361,7 +361,7 @@ xfs_xattr_acl_set(struct dentry *dentry, const char *name, if (!value) goto set_acl; - acl = posix_acl_from_xattr(value, size); + acl = posix_acl_from_xattr(&init_user_ns, value, size); if (!acl) { /* * acl_set_file(3) may request that we set default ACLs with diff --git a/include/linux/posix_acl_xattr.h b/include/linux/posix_acl_xattr.h index 8bd5fcf06916..ad93ad0f1db0 100644 --- a/include/linux/posix_acl_xattr.h +++ b/include/linux/posix_acl_xattr.h @@ -64,7 +64,9 @@ static inline void posix_acl_fix_xattr_to_user(void *value, size_t size) } #endif -struct posix_acl *posix_acl_from_xattr(const void *value, size_t size); -int posix_acl_to_xattr(const struct posix_acl *acl, void *buffer, size_t size); +struct posix_acl *posix_acl_from_xattr(struct user_namespace *user_ns, + const void *value, size_t size); +int posix_acl_to_xattr(struct user_namespace *user_ns, + const struct posix_acl *acl, void *buffer, size_t size); #endif /* _POSIX_ACL_XATTR_H */ From 33fd466d8f8fa6ecaefb2cbca2467d7441d1570b Mon Sep 17 00:00:00 2001 From: Mimi Zohar Date: Wed, 23 Mar 2011 17:23:06 -0400 Subject: [PATCH 32/54] vfs: extend vfs_removexattr locking This patch takes the i_mutex lock before security_inode_removexattr(), instead of after, in preparation of calling ima_inode_removexattr(). Change-Id: I969fa5536599242a3403930379c49d733b1e1295 Signed-off-by: Mimi Zohar Signed-off-by: Dmitry Kasatkin --- fs/xattr.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/fs/xattr.c b/fs/xattr.c index be25ff4f7399..b323ab9062ca 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -296,11 +296,13 @@ vfs_removexattr(struct dentry *dentry, const char *name) if (error) return error; - error = security_inode_removexattr(dentry, name); - if (error) - return error; - mutex_lock(&inode->i_mutex); + error = security_inode_removexattr(dentry, name); + if (error) { + mutex_unlock(&inode->i_mutex); + return error; + } + error = inode->i_op->removexattr(dentry, name); mutex_unlock(&inode->i_mutex); From 489f0a30ac60b7f5c7e8d7665a9117a6abf5b3fe Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Tue, 9 Oct 2012 15:11:55 -0700 Subject: [PATCH 33/54] userns: Fix posix_acl_file_xattr_userns gid conversion The code needs to be from_kgid(make_kgid(...)...) not from_kuid(make_kgid(...)...). Doh! Change-Id: Ib3e4a257e76ba85e92bb4fe272c6af7c4dc60b5b Reported-by: Jan Kara Signed-off-by: "Eric W. Biederman" --- fs/xattr_acl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/xattr_acl.c b/fs/xattr_acl.c index 11efd830b5f5..9fbea87fdb6e 100644 --- a/fs/xattr_acl.c +++ b/fs/xattr_acl.c @@ -45,7 +45,7 @@ static void posix_acl_fix_xattr_userns( break; case ACL_GROUP: gid = make_kgid(from, le32_to_cpu(entry->e_id)); - entry->e_id = cpu_to_le32(from_kuid(to, uid)); + entry->e_id = cpu_to_le32(from_kgid(to, gid)); break; default: break; From 2fa9d2fc3227dff5c600d72642cf62b28e566836 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Wed, 17 Oct 2012 20:41:15 -0700 Subject: [PATCH 34/54] fs, xattr: fix bug when removing a name not in xattr list Commit 38f38657444d ("xattr: extract simple_xattr code from tmpfs") moved some code from tmpfs but introduced a subtle bug along the way. If the name passed to simple_xattr_remove() does not exist in the list of xattrs, then it is possible to call kfree(new_xattr) when new_xattr is actually initialized to itself on the stack via uninitialized_var(). This causes a BUG() since the memory was not allocated via the slab allocator and was not bypassed through to the page allocator because it was too large. Initialize the local variable to NULL so the kfree() never takes place. Change-Id: I0f090df631e871657fb31914dce57c13e81e25c2 Reported-by: Fengguang Wu Signed-off-by: David Rientjes Acked-by: Hugh Dickins Acked-by: Aristeu Rozanski Signed-off-by: Linus Torvalds --- fs/xattr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/xattr.c b/fs/xattr.c index b323ab9062ca..709dc820dbc0 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -846,7 +846,7 @@ static int __simple_xattr_set(struct simple_xattrs *xattrs, const char *name, const void *value, size_t size, int flags) { struct simple_xattr *xattr; - struct simple_xattr *uninitialized_var(new_xattr); + struct simple_xattr *new_xattr = NULL; int err = 0; /* value == NULL means remove */ From 92a2cf6e765c4d4107da70d42e4a50194fe6f5c0 Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Thu, 23 Jan 2014 15:56:15 -0800 Subject: [PATCH 35/54] userns: relax the posix_acl_valid() checks So far, POSIX ACLs are using a canonical representation that keeps all ACL entries in a strict order; the ACL_USER and ACL_GROUP entries for specific users and groups are ordered by user and group identifier, respectively. The user-space code provides ACL entries in this order; the kernel verifies that the ACL entry order is correct in posix_acl_valid(). User namespaces allow to arbitrary map user and group identifiers which can cause the ACL_USER and ACL_GROUP entry order to differ between user space and the kernel; posix_acl_valid() would then fail. Work around this by allowing ACL_USER and ACL_GROUP entries to be in any order in the kernel. The effect is only minor: file permission checks will pick the first matching ACL_USER entry, and check all matching ACL_GROUP entries. (The libacl user-space library and getfacl / setfacl tools will not create ACLs with duplicate user or group idenfifiers; they will handle ACLs with entries in an arbitrary order correctly.) Change-Id: Ib73a93c56fb8029102ba2aec8ea3b56a7467fb86 Signed-off-by: Andreas Gruenbacher Cc: Eric W. Biederman Cc: Theodore Tso Cc: Christoph Hellwig Cc: Andreas Dilger Cc: Jan Kara Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/posix_acl.c | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/fs/posix_acl.c b/fs/posix_acl.c index 082c3cc09b63..7fe05f1f0c33 100644 --- a/fs/posix_acl.c +++ b/fs/posix_acl.c @@ -78,8 +78,6 @@ posix_acl_valid(const struct posix_acl *acl) { const struct posix_acl_entry *pa, *pe; int state = ACL_USER_OBJ; - kuid_t prev_uid = INVALID_UID; - kgid_t prev_gid = INVALID_GID; int needs_mask = 0; FOREACH_ACL_ENTRY(pa, acl, pe) { @@ -98,10 +96,6 @@ posix_acl_valid(const struct posix_acl *acl) return -EINVAL; if (!uid_valid(pa->e_uid)) return -EINVAL; - if (uid_valid(prev_uid) && - uid_lte(pa->e_uid, prev_uid)) - return -EINVAL; - prev_uid = pa->e_uid; needs_mask = 1; break; @@ -117,10 +111,6 @@ posix_acl_valid(const struct posix_acl *acl) return -EINVAL; if (!gid_valid(pa->e_gid)) return -EINVAL; - if (gid_valid(prev_gid) && - gid_lte(pa->e_gid, prev_gid)) - return -EINVAL; - prev_gid = pa->e_gid; needs_mask = 1; break; From 467c6423de29cbae620454f3b20ad358cc7c2763 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 20 Dec 2013 05:16:37 -0800 Subject: [PATCH 36/54] fs: merge xattr_acl.c into posix_acl.c Change-Id: I13a2b882e8ccd34cf858155d3a40beb3643ef216 Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro --- fs/Makefile | 2 +- fs/posix_acl.c | 180 +++++++++++++++++++++++++++++++++++++++++++++++-- fs/xattr_acl.c | 180 ------------------------------------------------- 3 files changed, 174 insertions(+), 188 deletions(-) delete mode 100644 fs/xattr_acl.c diff --git a/fs/Makefile b/fs/Makefile index c59aeff632a8..a8bb29c1f491 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -46,7 +46,7 @@ obj-$(CONFIG_BINFMT_SOM) += binfmt_som.o obj-$(CONFIG_BINFMT_FLAT) += binfmt_flat.o obj-$(CONFIG_FS_MBCACHE) += mbcache.o -obj-$(CONFIG_FS_POSIX_ACL) += posix_acl.o xattr_acl.o +obj-$(CONFIG_FS_POSIX_ACL) += posix_acl.o obj-$(CONFIG_NFS_COMMON) += nfs_common/ obj-$(CONFIG_GENERIC_ACL) += generic_acl.o diff --git a/fs/posix_acl.c b/fs/posix_acl.c index 7fe05f1f0c33..916036a4a1ff 100644 --- a/fs/posix_acl.c +++ b/fs/posix_acl.c @@ -1,10 +1,8 @@ /* - * linux/fs/posix_acl.c + * Copyright (C) 2002,2003 by Andreas Gruenbacher * - * Copyright (C) 2002 by Andreas Gruenbacher - * - * Fixes from William Schumacher incorporated on 15 March 2001. - * (Reported by Charles Bertsch, ). + * Fixes from William Schumacher incorporated on 15 March 2001. + * (Reported by Charles Bertsch, ). */ /* @@ -18,9 +16,9 @@ #include #include #include +#include #include - -#include +#include EXPORT_SYMBOL(posix_acl_init); EXPORT_SYMBOL(posix_acl_alloc); @@ -445,3 +443,171 @@ posix_acl_chmod(struct posix_acl **acl, gfp_t gfp, umode_t mode) return err; } EXPORT_SYMBOL(posix_acl_chmod); + +/* + * Fix up the uids and gids in posix acl extended attributes in place. + */ +static void posix_acl_fix_xattr_userns( + struct user_namespace *to, struct user_namespace *from, + void *value, size_t size) +{ + posix_acl_xattr_header *header = (posix_acl_xattr_header *)value; + posix_acl_xattr_entry *entry = (posix_acl_xattr_entry *)(header+1), *end; + int count; + kuid_t uid; + kgid_t gid; + + if (!value) + return; + if (size < sizeof(posix_acl_xattr_header)) + return; + if (header->a_version != cpu_to_le32(POSIX_ACL_XATTR_VERSION)) + return; + + count = posix_acl_xattr_count(size); + if (count < 0) + return; + if (count == 0) + return; + + for (end = entry + count; entry != end; entry++) { + switch(le16_to_cpu(entry->e_tag)) { + case ACL_USER: + uid = make_kuid(from, le32_to_cpu(entry->e_id)); + entry->e_id = cpu_to_le32(from_kuid(to, uid)); + break; + case ACL_GROUP: + gid = make_kgid(from, le32_to_cpu(entry->e_id)); + entry->e_id = cpu_to_le32(from_kgid(to, gid)); + break; + default: + break; + } + } +} + +void posix_acl_fix_xattr_from_user(void *value, size_t size) +{ + struct user_namespace *user_ns = current_user_ns(); + if (user_ns == &init_user_ns) + return; + posix_acl_fix_xattr_userns(&init_user_ns, user_ns, value, size); +} + +void posix_acl_fix_xattr_to_user(void *value, size_t size) +{ + struct user_namespace *user_ns = current_user_ns(); + if (user_ns == &init_user_ns) + return; + posix_acl_fix_xattr_userns(user_ns, &init_user_ns, value, size); +} + +/* + * Convert from extended attribute to in-memory representation. + */ +struct posix_acl * +posix_acl_from_xattr(struct user_namespace *user_ns, + const void *value, size_t size) +{ + posix_acl_xattr_header *header = (posix_acl_xattr_header *)value; + posix_acl_xattr_entry *entry = (posix_acl_xattr_entry *)(header+1), *end; + int count; + struct posix_acl *acl; + struct posix_acl_entry *acl_e; + + if (!value) + return NULL; + if (size < sizeof(posix_acl_xattr_header)) + return ERR_PTR(-EINVAL); + if (header->a_version != cpu_to_le32(POSIX_ACL_XATTR_VERSION)) + return ERR_PTR(-EOPNOTSUPP); + + count = posix_acl_xattr_count(size); + if (count < 0) + return ERR_PTR(-EINVAL); + if (count == 0) + return NULL; + + acl = posix_acl_alloc(count, GFP_NOFS); + if (!acl) + return ERR_PTR(-ENOMEM); + acl_e = acl->a_entries; + + for (end = entry + count; entry != end; acl_e++, entry++) { + acl_e->e_tag = le16_to_cpu(entry->e_tag); + acl_e->e_perm = le16_to_cpu(entry->e_perm); + + switch(acl_e->e_tag) { + case ACL_USER_OBJ: + case ACL_GROUP_OBJ: + case ACL_MASK: + case ACL_OTHER: + break; + + case ACL_USER: + acl_e->e_uid = + make_kuid(user_ns, + le32_to_cpu(entry->e_id)); + if (!uid_valid(acl_e->e_uid)) + goto fail; + break; + case ACL_GROUP: + acl_e->e_gid = + make_kgid(user_ns, + le32_to_cpu(entry->e_id)); + if (!gid_valid(acl_e->e_gid)) + goto fail; + break; + + default: + goto fail; + } + } + return acl; + +fail: + posix_acl_release(acl); + return ERR_PTR(-EINVAL); +} +EXPORT_SYMBOL (posix_acl_from_xattr); + +/* + * Convert from in-memory to extended attribute representation. + */ +int +posix_acl_to_xattr(struct user_namespace *user_ns, const struct posix_acl *acl, + void *buffer, size_t size) +{ + posix_acl_xattr_header *ext_acl = (posix_acl_xattr_header *)buffer; + posix_acl_xattr_entry *ext_entry = ext_acl->a_entries; + int real_size, n; + + real_size = posix_acl_xattr_size(acl->a_count); + if (!buffer) + return real_size; + if (real_size > size) + return -ERANGE; + + ext_acl->a_version = cpu_to_le32(POSIX_ACL_XATTR_VERSION); + + for (n=0; n < acl->a_count; n++, ext_entry++) { + const struct posix_acl_entry *acl_e = &acl->a_entries[n]; + ext_entry->e_tag = cpu_to_le16(acl_e->e_tag); + ext_entry->e_perm = cpu_to_le16(acl_e->e_perm); + switch(acl_e->e_tag) { + case ACL_USER: + ext_entry->e_id = + cpu_to_le32(from_kuid(user_ns, acl_e->e_uid)); + break; + case ACL_GROUP: + ext_entry->e_id = + cpu_to_le32(from_kgid(user_ns, acl_e->e_gid)); + break; + default: + ext_entry->e_id = cpu_to_le32(ACL_UNDEFINED_ID); + break; + } + } + return real_size; +} +EXPORT_SYMBOL (posix_acl_to_xattr); diff --git a/fs/xattr_acl.c b/fs/xattr_acl.c deleted file mode 100644 index 9fbea87fdb6e..000000000000 --- a/fs/xattr_acl.c +++ /dev/null @@ -1,180 +0,0 @@ -/* - * linux/fs/xattr_acl.c - * - * Almost all from linux/fs/ext2/acl.c: - * Copyright (C) 2001 by Andreas Gruenbacher, - */ - -#include -#include -#include -#include -#include - -/* - * Fix up the uids and gids in posix acl extended attributes in place. - */ -static void posix_acl_fix_xattr_userns( - struct user_namespace *to, struct user_namespace *from, - void *value, size_t size) -{ - posix_acl_xattr_header *header = (posix_acl_xattr_header *)value; - posix_acl_xattr_entry *entry = (posix_acl_xattr_entry *)(header+1), *end; - int count; - kuid_t uid; - kgid_t gid; - - if (!value) - return; - if (size < sizeof(posix_acl_xattr_header)) - return; - if (header->a_version != cpu_to_le32(POSIX_ACL_XATTR_VERSION)) - return; - - count = posix_acl_xattr_count(size); - if (count < 0) - return; - if (count == 0) - return; - - for (end = entry + count; entry != end; entry++) { - switch(le16_to_cpu(entry->e_tag)) { - case ACL_USER: - uid = make_kuid(from, le32_to_cpu(entry->e_id)); - entry->e_id = cpu_to_le32(from_kuid(to, uid)); - break; - case ACL_GROUP: - gid = make_kgid(from, le32_to_cpu(entry->e_id)); - entry->e_id = cpu_to_le32(from_kgid(to, gid)); - break; - default: - break; - } - } -} - -void posix_acl_fix_xattr_from_user(void *value, size_t size) -{ - struct user_namespace *user_ns = current_user_ns(); - if (user_ns == &init_user_ns) - return; - posix_acl_fix_xattr_userns(&init_user_ns, user_ns, value, size); -} - -void posix_acl_fix_xattr_to_user(void *value, size_t size) -{ - struct user_namespace *user_ns = current_user_ns(); - if (user_ns == &init_user_ns) - return; - posix_acl_fix_xattr_userns(user_ns, &init_user_ns, value, size); -} - -/* - * Convert from extended attribute to in-memory representation. - */ -struct posix_acl * -posix_acl_from_xattr(struct user_namespace *user_ns, - const void *value, size_t size) -{ - posix_acl_xattr_header *header = (posix_acl_xattr_header *)value; - posix_acl_xattr_entry *entry = (posix_acl_xattr_entry *)(header+1), *end; - int count; - struct posix_acl *acl; - struct posix_acl_entry *acl_e; - - if (!value) - return NULL; - if (size < sizeof(posix_acl_xattr_header)) - return ERR_PTR(-EINVAL); - if (header->a_version != cpu_to_le32(POSIX_ACL_XATTR_VERSION)) - return ERR_PTR(-EOPNOTSUPP); - - count = posix_acl_xattr_count(size); - if (count < 0) - return ERR_PTR(-EINVAL); - if (count == 0) - return NULL; - - acl = posix_acl_alloc(count, GFP_NOFS); - if (!acl) - return ERR_PTR(-ENOMEM); - acl_e = acl->a_entries; - - for (end = entry + count; entry != end; acl_e++, entry++) { - acl_e->e_tag = le16_to_cpu(entry->e_tag); - acl_e->e_perm = le16_to_cpu(entry->e_perm); - - switch(acl_e->e_tag) { - case ACL_USER_OBJ: - case ACL_GROUP_OBJ: - case ACL_MASK: - case ACL_OTHER: - break; - - case ACL_USER: - acl_e->e_uid = - make_kuid(user_ns, - le32_to_cpu(entry->e_id)); - if (!uid_valid(acl_e->e_uid)) - goto fail; - break; - case ACL_GROUP: - acl_e->e_gid = - make_kgid(user_ns, - le32_to_cpu(entry->e_id)); - if (!gid_valid(acl_e->e_gid)) - goto fail; - break; - - default: - goto fail; - } - } - return acl; - -fail: - posix_acl_release(acl); - return ERR_PTR(-EINVAL); -} -EXPORT_SYMBOL (posix_acl_from_xattr); - -/* - * Convert from in-memory to extended attribute representation. - */ -int -posix_acl_to_xattr(struct user_namespace *user_ns, const struct posix_acl *acl, - void *buffer, size_t size) -{ - posix_acl_xattr_header *ext_acl = (posix_acl_xattr_header *)buffer; - posix_acl_xattr_entry *ext_entry = ext_acl->a_entries; - int real_size, n; - - real_size = posix_acl_xattr_size(acl->a_count); - if (!buffer) - return real_size; - if (real_size > size) - return -ERANGE; - - ext_acl->a_version = cpu_to_le32(POSIX_ACL_XATTR_VERSION); - - for (n=0; n < acl->a_count; n++, ext_entry++) { - const struct posix_acl_entry *acl_e = &acl->a_entries[n]; - ext_entry->e_tag = cpu_to_le16(acl_e->e_tag); - ext_entry->e_perm = cpu_to_le16(acl_e->e_perm); - switch(acl_e->e_tag) { - case ACL_USER: - ext_entry->e_id = - cpu_to_le32(from_kuid(user_ns, acl_e->e_uid)); - break; - case ACL_GROUP: - ext_entry->e_id = - cpu_to_le32(from_kgid(user_ns, acl_e->e_gid)); - break; - default: - ext_entry->e_id = cpu_to_le32(ACL_UNDEFINED_ID); - break; - } - } - return real_size; -} -EXPORT_SYMBOL (posix_acl_to_xattr); From d23a995d471c09ecb7b9813a7de5c2555540b1b9 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 20 Dec 2013 05:16:38 -0800 Subject: [PATCH 37/54] fs: add get_acl helper Factor out the code to get an ACL either from the inode or disk from check_acl, so that it can be used elsewhere later on. Change-Id: I81fab0da8228eaee0ba14b9b9942071caa12aeae Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Signed-off-by: Al Viro --- fs/namei.c | 24 +++--------------------- fs/posix_acl.c | 27 +++++++++++++++++++++++++++ include/linux/posix_acl.h | 2 ++ 3 files changed, 32 insertions(+), 21 deletions(-) diff --git a/fs/namei.c b/fs/namei.c index 4acf5091a822..12378d1aef23 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -230,27 +230,9 @@ static int check_acl(struct inode *inode, int mask) return posix_acl_permission(inode, acl, mask & ~MAY_NOT_BLOCK); } - acl = get_cached_acl(inode, ACL_TYPE_ACCESS); - - /* - * A filesystem can force a ACL callback by just never filling the - * ACL cache. But normally you'd fill the cache either at inode - * instantiation time, or on the first ->get_acl call. - * - * If the filesystem doesn't have a get_acl() function at all, we'll - * just create the negative cache entry. - */ - if (acl == ACL_NOT_CACHED) { - if (inode->i_op->get_acl) { - acl = inode->i_op->get_acl(inode, ACL_TYPE_ACCESS); - if (IS_ERR(acl)) - return PTR_ERR(acl); - } else { - set_cached_acl(inode, ACL_TYPE_ACCESS, NULL); - return -EAGAIN; - } - } - + acl = get_acl(inode, ACL_TYPE_ACCESS); + if (IS_ERR(acl)) + return PTR_ERR(acl); if (acl) { int error = posix_acl_permission(inode, acl, mask); posix_acl_release(acl); diff --git a/fs/posix_acl.c b/fs/posix_acl.c index 916036a4a1ff..d63f7c851bdc 100644 --- a/fs/posix_acl.c +++ b/fs/posix_acl.c @@ -26,6 +26,33 @@ EXPORT_SYMBOL(posix_acl_valid); EXPORT_SYMBOL(posix_acl_equiv_mode); EXPORT_SYMBOL(posix_acl_from_mode); +struct posix_acl *get_acl(struct inode *inode, int type) +{ + struct posix_acl *acl; + + acl = get_cached_acl(inode, type); + if (acl != ACL_NOT_CACHED) + return acl; + + if (!IS_POSIXACL(inode)) + return NULL; + + /* + * A filesystem can force a ACL callback by just never filling the + * ACL cache. But normally you'd fill the cache either at inode + * instantiation time, or on the first ->get_acl call. + * + * If the filesystem doesn't have a get_acl() function at all, we'll + * just create the negative cache entry. + */ + if (!inode->i_op->get_acl) { + set_cached_acl(inode, type, NULL); + return NULL; + } + return inode->i_op->get_acl(inode, type); +} +EXPORT_SYMBOL(get_acl); + /* * Init a fresh posix_acl */ diff --git a/include/linux/posix_acl.h b/include/linux/posix_acl.h index 2ae0bba45f12..f37b94074bc3 100644 --- a/include/linux/posix_acl.h +++ b/include/linux/posix_acl.h @@ -176,4 +176,6 @@ static inline void cache_no_acl(struct inode *inode) #endif } +struct posix_acl *get_acl(struct inode *inode, int type); + #endif /* __LINUX_POSIX_ACL_H */ From 83325ea5e4ff61bba46562d33a8cefdb08bb7f49 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 20 Dec 2013 05:16:40 -0800 Subject: [PATCH 38/54] fs: add generic xattr_acl handlers With the ->set_acl inode operation we can implement the Posix ACL xattr handlers in generic code instead of duplicating them all over the tree. Change-Id: I473c270b801d7faf3338d68a29c749ff929fc575 Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Signed-off-by: Al Viro --- fs/posix_acl.c | 102 ++++++++++++++++++++++++++++++++ include/linux/posix_acl_xattr.h | 3 + 2 files changed, 105 insertions(+) diff --git a/fs/posix_acl.c b/fs/posix_acl.c index d63f7c851bdc..1dc6d8b5cfee 100644 --- a/fs/posix_acl.c +++ b/fs/posix_acl.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include @@ -638,3 +639,104 @@ posix_acl_to_xattr(struct user_namespace *user_ns, const struct posix_acl *acl, return real_size; } EXPORT_SYMBOL (posix_acl_to_xattr); + +static int +posix_acl_xattr_get(struct dentry *dentry, const char *name, + void *value, size_t size, int type) +{ + struct posix_acl *acl; + int error; + + if (!IS_POSIXACL(dentry->d_inode)) + return -EOPNOTSUPP; + if (S_ISLNK(dentry->d_inode->i_mode)) + return -EOPNOTSUPP; + + acl = get_acl(dentry->d_inode, type); + if (IS_ERR(acl)) + return PTR_ERR(acl); + if (acl == NULL) + return -ENODATA; + + error = posix_acl_to_xattr(&init_user_ns, acl, value, size); + posix_acl_release(acl); + + return error; +} + +static int +posix_acl_xattr_set(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags, int type) +{ + struct inode *inode = dentry->d_inode; + struct posix_acl *acl = NULL; + int ret; + + if (!IS_POSIXACL(inode)) + return -EOPNOTSUPP; + if (!inode->i_op->set_acl) + return -EOPNOTSUPP; + + if (type == ACL_TYPE_DEFAULT && !S_ISDIR(inode->i_mode)) + return value ? -EACCES : 0; + if (!inode_owner_or_capable(inode)) + return -EPERM; + + if (value) { + acl = posix_acl_from_xattr(&init_user_ns, value, size); + if (IS_ERR(acl)) + return PTR_ERR(acl); + + if (acl) { + ret = posix_acl_valid(acl); + if (ret) + goto out; + } + } + + ret = inode->i_op->set_acl(inode, acl, type); +out: + posix_acl_release(acl); + return ret; +} + +static size_t +posix_acl_xattr_list(struct dentry *dentry, char *list, size_t list_size, + const char *name, size_t name_len, int type) +{ + const char *xname; + size_t size; + + if (!IS_POSIXACL(dentry->d_inode)) + return -EOPNOTSUPP; + if (S_ISLNK(dentry->d_inode->i_mode)) + return -EOPNOTSUPP; + + if (type == ACL_TYPE_ACCESS) + xname = POSIX_ACL_XATTR_ACCESS; + else + xname = POSIX_ACL_XATTR_DEFAULT; + + size = strlen(xname) + 1; + if (list && size <= list_size) + memcpy(list, xname, size); + return size; +} + +const struct xattr_handler posix_acl_access_xattr_handler = { + .prefix = POSIX_ACL_XATTR_ACCESS, + .flags = ACL_TYPE_ACCESS, + .list = posix_acl_xattr_list, + .get = posix_acl_xattr_get, + .set = posix_acl_xattr_set, +}; +EXPORT_SYMBOL_GPL(posix_acl_access_xattr_handler); + +const struct xattr_handler posix_acl_default_xattr_handler = { + .prefix = POSIX_ACL_XATTR_DEFAULT, + .flags = ACL_TYPE_DEFAULT, + .list = posix_acl_xattr_list, + .get = posix_acl_xattr_get, + .set = posix_acl_xattr_set, +}; +EXPORT_SYMBOL_GPL(posix_acl_default_xattr_handler); diff --git a/include/linux/posix_acl_xattr.h b/include/linux/posix_acl_xattr.h index ad93ad0f1db0..6f14ee295822 100644 --- a/include/linux/posix_acl_xattr.h +++ b/include/linux/posix_acl_xattr.h @@ -69,4 +69,7 @@ struct posix_acl *posix_acl_from_xattr(struct user_namespace *user_ns, int posix_acl_to_xattr(struct user_namespace *user_ns, const struct posix_acl *acl, void *buffer, size_t size); +extern const struct xattr_handler posix_acl_access_xattr_handler; +extern const struct xattr_handler posix_acl_default_xattr_handler; + #endif /* _POSIX_ACL_XATTR_H */ From 1c88c9342a80ae69365031cd6f71ceadf7ef41b4 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 20 Dec 2013 05:16:41 -0800 Subject: [PATCH 39/54] fs: make posix_acl_chmod more useful Rename the current posix_acl_chmod to __posix_acl_chmod and add a fully featured ACL chmod helper that uses the ->set_acl inode operation. Change-Id: I503ed1049a28ad01d32fe3fa85d8fc9b7e12610f Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Signed-off-by: Al Viro --- fs/9p/acl.c | 2 +- fs/btrfs/acl.c | 2 +- fs/ext2/acl.c | 2 +- fs/ext3/acl.c | 2 +- fs/ext4/acl.c | 2 +- fs/f2fs/acl.c | 2 +- fs/generic_acl.c | 2 +- fs/gfs2/acl.c | 2 +- fs/jffs2/acl.c | 2 +- fs/jfs/acl.c | 2 +- fs/ocfs2/acl.c | 2 +- fs/posix_acl.c | 30 +++++++++++++++++++++++++++--- fs/reiserfs/xattr_acl.c | 2 +- fs/xfs/xfs_acl.c | 2 +- include/linux/posix_acl.h | 17 +++++++++++++---- 15 files changed, 53 insertions(+), 20 deletions(-) diff --git a/fs/9p/acl.c b/fs/9p/acl.c index 27886efe87f3..57469bf7927c 100644 --- a/fs/9p/acl.c +++ b/fs/9p/acl.c @@ -160,7 +160,7 @@ int v9fs_acl_chmod(struct dentry *dentry) return -EOPNOTSUPP; acl = v9fs_get_cached_acl(inode, ACL_TYPE_ACCESS); if (acl) { - retval = posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode); + retval = __posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode); if (retval) return retval; retval = v9fs_set_acl(dentry, ACL_TYPE_ACCESS, acl); diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c index 0b537ad2f704..2263ada10f24 100644 --- a/fs/btrfs/acl.c +++ b/fs/btrfs/acl.c @@ -250,7 +250,7 @@ int btrfs_acl_chmod(struct inode *inode) if (IS_ERR_OR_NULL(acl)) return PTR_ERR(acl); - ret = posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode); + ret = __posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode); if (ret) return ret; ret = btrfs_set_acl(NULL, inode, acl, ACL_TYPE_ACCESS); diff --git a/fs/ext2/acl.c b/fs/ext2/acl.c index 3b067882ae9d..ad86d761696e 100644 --- a/fs/ext2/acl.c +++ b/fs/ext2/acl.c @@ -292,7 +292,7 @@ ext2_acl_chmod(struct inode *inode) acl = ext2_get_acl(inode, ACL_TYPE_ACCESS); if (IS_ERR(acl) || !acl) return PTR_ERR(acl); - error = posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode); + error = __posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode); if (error) return error; error = ext2_set_acl(inode, ACL_TYPE_ACCESS, acl); diff --git a/fs/ext3/acl.c b/fs/ext3/acl.c index e8190c21dab5..523522e40c3d 100644 --- a/fs/ext3/acl.c +++ b/fs/ext3/acl.c @@ -298,7 +298,7 @@ ext3_acl_chmod(struct inode *inode) acl = ext3_get_acl(inode, ACL_TYPE_ACCESS); if (IS_ERR(acl) || !acl) return PTR_ERR(acl); - error = posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode); + error = __posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode); if (error) return error; retry: diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c index 1b24291b6764..b8a48cc9af91 100644 --- a/fs/ext4/acl.c +++ b/fs/ext4/acl.c @@ -303,7 +303,7 @@ ext4_acl_chmod(struct inode *inode) acl = ext4_get_acl(inode, ACL_TYPE_ACCESS); if (IS_ERR(acl) || !acl) return PTR_ERR(acl); - error = posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode); + error = __posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode); if (error) return error; retry: diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c index f295fbe3bf81..40d9b743791b 100644 --- a/fs/f2fs/acl.c +++ b/fs/f2fs/acl.c @@ -298,7 +298,7 @@ int f2fs_acl_chmod(struct inode *inode) if (IS_ERR(acl) || !acl) return PTR_ERR(acl); - error = posix_acl_chmod(&acl, GFP_KERNEL, mode); + error = __posix_acl_chmod(&acl, GFP_KERNEL, mode); if (error) return error; diff --git a/fs/generic_acl.c b/fs/generic_acl.c index 21408084c3b3..f07ec18cd6c0 100644 --- a/fs/generic_acl.c +++ b/fs/generic_acl.c @@ -154,7 +154,7 @@ generic_acl_chmod(struct inode *inode) return -EOPNOTSUPP; acl = get_cached_acl(inode, ACL_TYPE_ACCESS); if (acl) { - error = posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode); + error = __posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode); if (error) return error; set_cached_acl(inode, ACL_TYPE_ACCESS, acl); diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c index ea4b4585ae11..65e70e1644bf 100644 --- a/fs/gfs2/acl.c +++ b/fs/gfs2/acl.c @@ -168,7 +168,7 @@ int gfs2_acl_chmod(struct gfs2_inode *ip, struct iattr *attr) if (!acl) return gfs2_setattr_simple(inode, attr); - error = posix_acl_chmod(&acl, GFP_NOFS, attr->ia_mode); + error = __posix_acl_chmod(&acl, GFP_NOFS, attr->ia_mode); if (error) return error; diff --git a/fs/jffs2/acl.c b/fs/jffs2/acl.c index 85b3c4c01963..029dcc2d2432 100644 --- a/fs/jffs2/acl.c +++ b/fs/jffs2/acl.c @@ -320,7 +320,7 @@ int jffs2_acl_chmod(struct inode *inode) acl = jffs2_get_acl(inode, ACL_TYPE_ACCESS); if (IS_ERR(acl) || !acl) return PTR_ERR(acl); - rc = posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode); + rc = __posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode); if (rc) return rc; rc = jffs2_set_acl(inode, ACL_TYPE_ACCESS, acl); diff --git a/fs/jfs/acl.c b/fs/jfs/acl.c index d254d6d35995..9c0fca8073da 100644 --- a/fs/jfs/acl.c +++ b/fs/jfs/acl.c @@ -161,7 +161,7 @@ int jfs_acl_chmod(struct inode *inode) if (IS_ERR(acl) || !acl) return PTR_ERR(acl); - rc = posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode); + rc = __posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode); if (rc) return rc; diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c index 3283555e4701..7ad14af22cab 100644 --- a/fs/ocfs2/acl.c +++ b/fs/ocfs2/acl.c @@ -321,7 +321,7 @@ int ocfs2_acl_chmod(struct inode *inode) acl = ocfs2_get_acl(inode, ACL_TYPE_ACCESS); if (IS_ERR(acl) || !acl) return PTR_ERR(acl); - ret = posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode); + ret = __posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode); if (ret) return ret; ret = ocfs2_set_acl(NULL, inode, NULL, ACL_TYPE_ACCESS, diff --git a/fs/posix_acl.c b/fs/posix_acl.c index 1dc6d8b5cfee..254a760e7a20 100644 --- a/fs/posix_acl.c +++ b/fs/posix_acl.c @@ -391,7 +391,7 @@ EXPORT_SYMBOL(posix_acl_update_mode); /* * Modify the ACL for the chmod syscall. */ -static int posix_acl_chmod_masq(struct posix_acl *acl, umode_t mode) +static int __posix_acl_chmod_masq(struct posix_acl *acl, umode_t mode) { struct posix_acl_entry *group_obj = NULL, *mask_obj = NULL; struct posix_acl_entry *pa, *pe; @@ -455,12 +455,12 @@ posix_acl_create(struct posix_acl **acl, gfp_t gfp, umode_t *mode_p) EXPORT_SYMBOL(posix_acl_create); int -posix_acl_chmod(struct posix_acl **acl, gfp_t gfp, umode_t mode) +__posix_acl_chmod(struct posix_acl **acl, gfp_t gfp, umode_t mode) { struct posix_acl *clone = posix_acl_clone(*acl, gfp); int err = -ENOMEM; if (clone) { - err = posix_acl_chmod_masq(clone, mode); + err = __posix_acl_chmod_masq(clone, mode); if (err) { posix_acl_release(clone); clone = NULL; @@ -470,6 +470,30 @@ posix_acl_chmod(struct posix_acl **acl, gfp_t gfp, umode_t mode) *acl = clone; return err; } +EXPORT_SYMBOL(__posix_acl_chmod); + +int +posix_acl_chmod(struct inode *inode) +{ + struct posix_acl *acl; + int ret = 0; + + if (!IS_POSIXACL(inode)) + return 0; + if (!inode->i_op->set_acl) + return -EOPNOTSUPP; + + acl = get_acl(inode, ACL_TYPE_ACCESS); + if (IS_ERR_OR_NULL(acl)) + return PTR_ERR(acl); + + ret = __posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode); + if (ret) + return ret; + ret = inode->i_op->set_acl(inode, acl, ACL_TYPE_ACCESS); + posix_acl_release(acl); + return ret; +} EXPORT_SYMBOL(posix_acl_chmod); /* diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c index 690d5457af2d..8882b8dcbaac 100644 --- a/fs/reiserfs/xattr_acl.c +++ b/fs/reiserfs/xattr_acl.c @@ -440,7 +440,7 @@ int reiserfs_acl_chmod(struct inode *inode) return 0; if (IS_ERR(acl)) return PTR_ERR(acl); - error = posix_acl_chmod(&acl, GFP_NOFS, inode->i_mode); + error = __posix_acl_chmod(&acl, GFP_NOFS, inode->i_mode); if (error) return error; diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c index fb853c7b2b75..de4f76f02aed 100644 --- a/fs/xfs/xfs_acl.c +++ b/fs/xfs/xfs_acl.c @@ -315,7 +315,7 @@ xfs_acl_chmod(struct inode *inode) if (IS_ERR(acl) || !acl) return PTR_ERR(acl); - error = posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode); + error = __posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode); if (error) return error; diff --git a/include/linux/posix_acl.h b/include/linux/posix_acl.h index f37b94074bc3..eb18661174ff 100644 --- a/include/linux/posix_acl.h +++ b/include/linux/posix_acl.h @@ -89,13 +89,15 @@ extern int posix_acl_permission(struct inode *, const struct posix_acl *, int); extern struct posix_acl *posix_acl_from_mode(umode_t, gfp_t); extern int posix_acl_equiv_mode(const struct posix_acl *, umode_t *); extern int posix_acl_create(struct posix_acl **, gfp_t, umode_t *); -extern int posix_acl_chmod(struct posix_acl **, gfp_t, umode_t); +extern int __posix_acl_chmod(struct posix_acl **, gfp_t, umode_t); extern int posix_acl_update_mode(struct inode *, umode_t *, struct posix_acl **); extern struct posix_acl *get_posix_acl(struct inode *, int); extern int set_posix_acl(struct inode *, int, struct posix_acl *); #ifdef CONFIG_FS_POSIX_ACL +extern int posix_acl_chmod(struct inode *); + static inline struct posix_acl **acl_by_type(struct inode *inode, int type) { switch (type) { @@ -166,15 +168,22 @@ static inline void forget_all_cached_acls(struct inode *inode) if (old_default != ACL_NOT_CACHED) posix_acl_release(old_default); } -#endif static inline void cache_no_acl(struct inode *inode) { -#ifdef CONFIG_FS_POSIX_ACL inode->i_acl = NULL; inode->i_default_acl = NULL; -#endif } +#else +static inline int posix_acl_chmod(struct inode *inode) +{ + return 0; +} + +static inline void cache_no_acl(struct inode *inode) +{ +} +#endif /* CONFIG_FS_POSIX_ACL */ struct posix_acl *get_acl(struct inode *inode, int type); From c2d22a63dab0391971643c1037cb6f4e8b159fad Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 20 Dec 2013 05:16:42 -0800 Subject: [PATCH 40/54] fs: make posix_acl_create more useful Rename the current posix_acl_created to __posix_acl_create and add a fully featured helper to set up the ACLs on file creation that uses get_acl(). Change-Id: I7d8de350fe89ef3d2f9ff6eaa2c198b5403d33fc Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Signed-off-by: Al Viro --- fs/9p/acl.c | 2 +- fs/btrfs/acl.c | 2 +- fs/ext2/acl.c | 2 +- fs/ext3/acl.c | 2 +- fs/ext4/acl.c | 2 +- fs/f2fs/acl.c | 2 +- fs/generic_acl.c | 2 +- fs/gfs2/acl.c | 2 +- fs/jffs2/acl.c | 2 +- fs/jfs/acl.c | 2 +- fs/nfs/nfs3acl.c | 2 +- fs/ocfs2/acl.c | 2 +- fs/posix_acl.c | 57 ++++++++++++++++++++++++++++++++++++--- fs/reiserfs/xattr_acl.c | 2 +- fs/xfs/xfs_acl.c | 4 +-- include/linux/posix_acl.h | 15 ++++++++--- 16 files changed, 80 insertions(+), 22 deletions(-) diff --git a/fs/9p/acl.c b/fs/9p/acl.c index 57469bf7927c..5893c2bd0f96 100644 --- a/fs/9p/acl.c +++ b/fs/9p/acl.c @@ -199,7 +199,7 @@ int v9fs_acl_mode(struct inode *dir, umode_t *modep, if (acl) { if (S_ISDIR(mode)) *dpacl = posix_acl_dup(acl); - retval = posix_acl_create(&acl, GFP_NOFS, &mode); + retval = __posix_acl_create(&acl, GFP_NOFS, &mode); if (retval < 0) return retval; if (retval > 0) diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c index 2263ada10f24..4916676fa9af 100644 --- a/fs/btrfs/acl.c +++ b/fs/btrfs/acl.c @@ -220,7 +220,7 @@ int btrfs_init_acl(struct btrfs_trans_handle *trans, if (ret) goto failed; } - ret = posix_acl_create(&acl, GFP_NOFS, &inode->i_mode); + ret = __posix_acl_create(&acl, GFP_NOFS, &inode->i_mode); if (ret < 0) return ret; diff --git a/fs/ext2/acl.c b/fs/ext2/acl.c index ad86d761696e..c73d8094c00f 100644 --- a/fs/ext2/acl.c +++ b/fs/ext2/acl.c @@ -252,7 +252,7 @@ ext2_init_acl(struct inode *inode, struct inode *dir) if (error) goto cleanup; } - error = posix_acl_create(&acl, GFP_KERNEL, &inode->i_mode); + error = __posix_acl_create(&acl, GFP_KERNEL, &inode->i_mode); if (error < 0) return error; if (error > 0) { diff --git a/fs/ext3/acl.c b/fs/ext3/acl.c index 523522e40c3d..6a4ef3add85c 100644 --- a/fs/ext3/acl.c +++ b/fs/ext3/acl.c @@ -255,7 +255,7 @@ ext3_init_acl(handle_t *handle, struct inode *inode, struct inode *dir) if (error) goto cleanup; } - error = posix_acl_create(&acl, GFP_NOFS, &inode->i_mode); + error = __posix_acl_create(&acl, GFP_NOFS, &inode->i_mode); if (error < 0) return error; diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c index b8a48cc9af91..1ac74f355bb9 100644 --- a/fs/ext4/acl.c +++ b/fs/ext4/acl.c @@ -259,7 +259,7 @@ ext4_init_acl(handle_t *handle, struct inode *inode, struct inode *dir) if (error) goto cleanup; } - error = posix_acl_create(&acl, GFP_NOFS, &inode->i_mode); + error = __posix_acl_create(&acl, GFP_NOFS, &inode->i_mode); if (error < 0) return error; diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c index 40d9b743791b..ee440ce6994d 100644 --- a/fs/f2fs/acl.c +++ b/fs/f2fs/acl.c @@ -272,7 +272,7 @@ int f2fs_init_acl(struct inode *inode, struct inode *dir, struct page *ipage, if (error) goto cleanup; } - error = posix_acl_create(&acl, GFP_KERNEL, &inode->i_mode); + error = __posix_acl_create(&acl, GFP_KERNEL, &inode->i_mode); if (error < 0) return error; if (error > 0) diff --git a/fs/generic_acl.c b/fs/generic_acl.c index f07ec18cd6c0..83921c155c90 100644 --- a/fs/generic_acl.c +++ b/fs/generic_acl.c @@ -124,7 +124,7 @@ generic_acl_init(struct inode *inode, struct inode *dir) if (acl) { if (S_ISDIR(inode->i_mode)) set_cached_acl(inode, ACL_TYPE_DEFAULT, acl); - error = posix_acl_create(&acl, GFP_KERNEL, &inode->i_mode); + error = __posix_acl_create(&acl, GFP_KERNEL, &inode->i_mode); if (error < 0) return error; if (error > 0) diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c index 65e70e1644bf..5dd1a4e22b8e 100644 --- a/fs/gfs2/acl.c +++ b/fs/gfs2/acl.c @@ -137,7 +137,7 @@ int gfs2_acl_create(struct gfs2_inode *dip, struct inode *inode) goto out; } - error = posix_acl_create(&acl, GFP_NOFS, &mode); + error = __posix_acl_create(&acl, GFP_NOFS, &mode); if (error < 0) return error; diff --git a/fs/jffs2/acl.c b/fs/jffs2/acl.c index 029dcc2d2432..3530a853a100 100644 --- a/fs/jffs2/acl.c +++ b/fs/jffs2/acl.c @@ -280,7 +280,7 @@ int jffs2_init_acl_pre(struct inode *dir_i, struct inode *inode, umode_t *i_mode if (S_ISDIR(*i_mode)) set_cached_acl(inode, ACL_TYPE_DEFAULT, acl); - rc = posix_acl_create(&acl, GFP_KERNEL, i_mode); + rc = __posix_acl_create(&acl, GFP_KERNEL, i_mode); if (rc < 0) return rc; if (rc > 0) diff --git a/fs/jfs/acl.c b/fs/jfs/acl.c index 9c0fca8073da..28d529ae9a4a 100644 --- a/fs/jfs/acl.c +++ b/fs/jfs/acl.c @@ -132,7 +132,7 @@ int jfs_init_acl(tid_t tid, struct inode *inode, struct inode *dir) if (rc) goto cleanup; } - rc = posix_acl_create(&acl, GFP_KERNEL, &inode->i_mode); + rc = __posix_acl_create(&acl, GFP_KERNEL, &inode->i_mode); if (rc < 0) goto cleanup; /* posix_acl_release(NULL) is no-op */ if (rc > 0) diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c index 4a1aafba6a20..e85967587d74 100644 --- a/fs/nfs/nfs3acl.c +++ b/fs/nfs/nfs3acl.c @@ -428,7 +428,7 @@ int nfs3_proc_set_default_acl(struct inode *dir, struct inode *inode, if (!dfacl) return 0; acl = posix_acl_dup(dfacl); - error = posix_acl_create(&acl, GFP_KERNEL, &mode); + error = __posix_acl_create(&acl, GFP_KERNEL, &mode); if (error < 0) goto out_release_dfacl; error = nfs3_proc_setacls(inode, acl, S_ISDIR(inode->i_mode) ? diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c index 7ad14af22cab..79eea54cf36f 100644 --- a/fs/ocfs2/acl.c +++ b/fs/ocfs2/acl.c @@ -372,7 +372,7 @@ int ocfs2_init_acl(handle_t *handle, goto cleanup; } mode = inode->i_mode; - ret = posix_acl_create(&acl, GFP_NOFS, &mode); + ret = __posix_acl_create(&acl, GFP_NOFS, &mode); if (ret < 0) return ret; diff --git a/fs/posix_acl.c b/fs/posix_acl.c index 254a760e7a20..b2d5e4b7fdb6 100644 --- a/fs/posix_acl.c +++ b/fs/posix_acl.c @@ -437,7 +437,7 @@ static int __posix_acl_chmod_masq(struct posix_acl *acl, umode_t mode) } int -posix_acl_create(struct posix_acl **acl, gfp_t gfp, umode_t *mode_p) +__posix_acl_create(struct posix_acl **acl, gfp_t gfp, umode_t *mode_p) { struct posix_acl *clone = posix_acl_clone(*acl, gfp); int err = -ENOMEM; @@ -452,7 +452,7 @@ posix_acl_create(struct posix_acl **acl, gfp_t gfp, umode_t *mode_p) *acl = clone; return err; } -EXPORT_SYMBOL(posix_acl_create); +EXPORT_SYMBOL(__posix_acl_create); int __posix_acl_chmod(struct posix_acl **acl, gfp_t gfp, umode_t mode) @@ -473,7 +473,7 @@ __posix_acl_chmod(struct posix_acl **acl, gfp_t gfp, umode_t mode) EXPORT_SYMBOL(__posix_acl_chmod); int -posix_acl_chmod(struct inode *inode) +posix_acl_chmod(struct inode *inode, umode_t mode) { struct posix_acl *acl; int ret = 0; @@ -487,7 +487,7 @@ posix_acl_chmod(struct inode *inode) if (IS_ERR_OR_NULL(acl)) return PTR_ERR(acl); - ret = __posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode); + ret = __posix_acl_chmod(&acl, GFP_KERNEL, mode); if (ret) return ret; ret = inode->i_op->set_acl(inode, acl, ACL_TYPE_ACCESS); @@ -496,6 +496,55 @@ posix_acl_chmod(struct inode *inode) } EXPORT_SYMBOL(posix_acl_chmod); +int +posix_acl_create(struct inode *dir, umode_t *mode, + struct posix_acl **default_acl, struct posix_acl **acl) +{ + struct posix_acl *p; + int ret; + + if (S_ISLNK(*mode) || !IS_POSIXACL(dir)) + goto no_acl; + + p = get_acl(dir, ACL_TYPE_DEFAULT); + if (IS_ERR(p)) + return PTR_ERR(p); + + if (!p) { + *mode &= ~current_umask(); + goto no_acl; + } + + *acl = posix_acl_clone(p, GFP_NOFS); + if (!*acl) + return -ENOMEM; + + ret = posix_acl_create_masq(*acl, mode); + if (ret < 0) { + posix_acl_release(*acl); + return -ENOMEM; + } + + if (ret == 0) { + posix_acl_release(*acl); + *acl = NULL; + } + + if (!S_ISDIR(*mode)) { + posix_acl_release(p); + *default_acl = NULL; + } else { + *default_acl = p; + } + return 0; + +no_acl: + *default_acl = NULL; + *acl = NULL; + return 0; +} +EXPORT_SYMBOL_GPL(posix_acl_create); + /* * Fix up the uids and gids in posix acl extended attributes in place. */ diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c index 8882b8dcbaac..c087e633b3b2 100644 --- a/fs/reiserfs/xattr_acl.c +++ b/fs/reiserfs/xattr_acl.c @@ -358,7 +358,7 @@ reiserfs_inherit_default_acl(struct reiserfs_transaction_handle *th, /* Now we reconcile the new ACL and the mode, potentially modifying both */ - err = posix_acl_create(&acl, GFP_NOFS, &inode->i_mode); + err = __posix_acl_create(&acl, GFP_NOFS, &inode->i_mode); if (err < 0) return err; diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c index de4f76f02aed..6c1c66141022 100644 --- a/fs/xfs/xfs_acl.c +++ b/fs/xfs/xfs_acl.c @@ -278,12 +278,12 @@ xfs_inherit_acl(struct inode *inode, struct posix_acl *acl) goto out; } - error = posix_acl_create(&acl, GFP_KERNEL, &mode); + error = __posix_acl_create(&acl, GFP_KERNEL, &mode); if (error < 0) return error; /* - * If posix_acl_create returns a positive value we need to + * If __posix_acl_create returns a positive value we need to * inherit a permission that can't be represented using the Unix * mode bits and we actually need to set an ACL. */ diff --git a/include/linux/posix_acl.h b/include/linux/posix_acl.h index eb18661174ff..ab859d5387ff 100644 --- a/include/linux/posix_acl.h +++ b/include/linux/posix_acl.h @@ -88,7 +88,7 @@ extern int posix_acl_valid(const struct posix_acl *); extern int posix_acl_permission(struct inode *, const struct posix_acl *, int); extern struct posix_acl *posix_acl_from_mode(umode_t, gfp_t); extern int posix_acl_equiv_mode(const struct posix_acl *, umode_t *); -extern int posix_acl_create(struct posix_acl **, gfp_t, umode_t *); +extern int __posix_acl_create(struct posix_acl **, gfp_t, umode_t *); extern int __posix_acl_chmod(struct posix_acl **, gfp_t, umode_t); extern int posix_acl_update_mode(struct inode *, umode_t *, struct posix_acl **); @@ -96,7 +96,9 @@ extern struct posix_acl *get_posix_acl(struct inode *, int); extern int set_posix_acl(struct inode *, int, struct posix_acl *); #ifdef CONFIG_FS_POSIX_ACL -extern int posix_acl_chmod(struct inode *); +extern int posix_acl_chmod(struct inode *, umode_t); +extern int posix_acl_create(struct inode *, umode_t *, struct posix_acl **, + struct posix_acl **); static inline struct posix_acl **acl_by_type(struct inode *inode, int type) { @@ -175,7 +177,7 @@ static inline void cache_no_acl(struct inode *inode) inode->i_default_acl = NULL; } #else -static inline int posix_acl_chmod(struct inode *inode) +static inline int posix_acl_chmod(struct inode *inode, umode_t mode) { return 0; } @@ -183,6 +185,13 @@ static inline int posix_acl_chmod(struct inode *inode) static inline void cache_no_acl(struct inode *inode) { } + +static inline int posix_acl_create(struct inode *inode, umode_t *mode, + struct posix_acl **default_acl, struct posix_acl **acl) +{ + *default_acl = *acl = NULL; + return 0; +} #endif /* CONFIG_FS_POSIX_ACL */ struct posix_acl *get_acl(struct inode *inode, int type); From bbc68ff6796367d8ed5a8b419855867dbaa7a473 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 20 Dec 2013 05:16:54 -0800 Subject: [PATCH 41/54] fs: remove generic_acl And instead convert tmpfs to use the new generic ACL code, with two stub methods provided for in-memory filesystems. Change-Id: Ide27840378dbbc062c32ded2a6420c1b6c28f57e Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro --- fs/Kconfig | 6 +- fs/Makefile | 1 - fs/generic_acl.c | 180 ------------------------------------ fs/posix_acl.c | 36 ++++++++ include/linux/generic_acl.h | 14 --- include/linux/posix_acl.h | 9 ++ mm/shmem.c | 34 +++---- 7 files changed, 61 insertions(+), 219 deletions(-) delete mode 100644 fs/generic_acl.c delete mode 100644 include/linux/generic_acl.h diff --git a/fs/Kconfig b/fs/Kconfig index fac6e8043a62..99d2a37904d0 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -81,10 +81,6 @@ config CUSE If you want to develop or use userspace character device based on CUSE, answer Y or M. -config GENERIC_ACL - bool - select FS_POSIX_ACL - menu "Caches" source "fs/fscache/Kconfig" @@ -132,7 +128,7 @@ config TMPFS_POSIX_ACL bool "Tmpfs POSIX Access Control Lists" depends on TMPFS select TMPFS_XATTR - select GENERIC_ACL + select FS_POSIX_ACL help POSIX Access Control Lists (ACLs) support additional access rights for users and groups beyond the standard owner/group/world scheme, diff --git a/fs/Makefile b/fs/Makefile index a8bb29c1f491..8e8dc4f2a6d0 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -48,7 +48,6 @@ obj-$(CONFIG_BINFMT_FLAT) += binfmt_flat.o obj-$(CONFIG_FS_MBCACHE) += mbcache.o obj-$(CONFIG_FS_POSIX_ACL) += posix_acl.o obj-$(CONFIG_NFS_COMMON) += nfs_common/ -obj-$(CONFIG_GENERIC_ACL) += generic_acl.o obj-$(CONFIG_FHANDLE) += fhandle.o diff --git a/fs/generic_acl.c b/fs/generic_acl.c deleted file mode 100644 index 83921c155c90..000000000000 --- a/fs/generic_acl.c +++ /dev/null @@ -1,180 +0,0 @@ -/* - * (C) 2005 Andreas Gruenbacher - * - * This file is released under the GPL. - * - * Generic ACL support for in-memory filesystems. - */ - -#include -#include -#include -#include -#include -#include - - -static size_t -generic_acl_list(struct dentry *dentry, char *list, size_t list_size, - const char *name, size_t name_len, int type) -{ - struct posix_acl *acl; - const char *xname; - size_t size; - - acl = get_cached_acl(dentry->d_inode, type); - if (!acl) - return 0; - posix_acl_release(acl); - - switch (type) { - case ACL_TYPE_ACCESS: - xname = POSIX_ACL_XATTR_ACCESS; - break; - case ACL_TYPE_DEFAULT: - xname = POSIX_ACL_XATTR_DEFAULT; - break; - default: - return 0; - } - size = strlen(xname) + 1; - if (list && size <= list_size) - memcpy(list, xname, size); - return size; -} - -static int -generic_acl_get(struct dentry *dentry, const char *name, void *buffer, - size_t size, int type) -{ - struct posix_acl *acl; - int error; - - if (strcmp(name, "") != 0) - return -EINVAL; - - acl = get_cached_acl(dentry->d_inode, type); - if (!acl) - return -ENODATA; - error = posix_acl_to_xattr(&init_user_ns, acl, buffer, size); - posix_acl_release(acl); - - return error; -} - -static int -generic_acl_set(struct dentry *dentry, const char *name, const void *value, - size_t size, int flags, int type) -{ - struct inode *inode = dentry->d_inode; - struct posix_acl *acl = NULL; - int error; - - if (strcmp(name, "") != 0) - return -EINVAL; - if (S_ISLNK(inode->i_mode)) - return -EOPNOTSUPP; - if (!inode_owner_or_capable(inode)) - return -EPERM; - if (value) { - acl = posix_acl_from_xattr(&init_user_ns, value, size); - if (IS_ERR(acl)) - return PTR_ERR(acl); - } - if (acl) { - error = posix_acl_valid(acl); - if (error) - goto failed; - switch (type) { - case ACL_TYPE_ACCESS: - error = posix_acl_update_mode(inode, &inode->i_mode, &acl); - if (error) - goto failed; - inode->i_ctime = CURRENT_TIME; - break; - case ACL_TYPE_DEFAULT: - if (!S_ISDIR(inode->i_mode)) { - error = -EINVAL; - goto failed; - } - break; - } - } - set_cached_acl(inode, type, acl); - error = 0; -failed: - posix_acl_release(acl); - return error; -} - -/** - * generic_acl_init - Take care of acl inheritance at @inode create time - * - * Files created inside a directory with a default ACL inherit the - * directory's default ACL. - */ -int -generic_acl_init(struct inode *inode, struct inode *dir) -{ - struct posix_acl *acl = NULL; - int error; - - if (!S_ISLNK(inode->i_mode)) - acl = get_cached_acl(dir, ACL_TYPE_DEFAULT); - if (acl) { - if (S_ISDIR(inode->i_mode)) - set_cached_acl(inode, ACL_TYPE_DEFAULT, acl); - error = __posix_acl_create(&acl, GFP_KERNEL, &inode->i_mode); - if (error < 0) - return error; - if (error > 0) - set_cached_acl(inode, ACL_TYPE_ACCESS, acl); - } else { - inode->i_mode &= ~current_umask(); - } - error = 0; - - posix_acl_release(acl); - return error; -} - -/** - * generic_acl_chmod - change the access acl of @inode upon chmod() - * - * A chmod also changes the permissions of the owner, group/mask, and - * other ACL entries. - */ -int -generic_acl_chmod(struct inode *inode) -{ - struct posix_acl *acl; - int error = 0; - - if (S_ISLNK(inode->i_mode)) - return -EOPNOTSUPP; - acl = get_cached_acl(inode, ACL_TYPE_ACCESS); - if (acl) { - error = __posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode); - if (error) - return error; - set_cached_acl(inode, ACL_TYPE_ACCESS, acl); - posix_acl_release(acl); - } - return error; -} - -const struct xattr_handler generic_acl_access_handler = { - .prefix = POSIX_ACL_XATTR_ACCESS, - .flags = ACL_TYPE_ACCESS, - .list = generic_acl_list, - .get = generic_acl_get, - .set = generic_acl_set, -}; - -const struct xattr_handler generic_acl_default_handler = { - .prefix = POSIX_ACL_XATTR_DEFAULT, - .flags = ACL_TYPE_DEFAULT, - .list = generic_acl_list, - .get = generic_acl_get, - .set = generic_acl_set, -}; diff --git a/fs/posix_acl.c b/fs/posix_acl.c index b2d5e4b7fdb6..a5012e8551e3 100644 --- a/fs/posix_acl.c +++ b/fs/posix_acl.c @@ -813,3 +813,39 @@ const struct xattr_handler posix_acl_default_xattr_handler = { .set = posix_acl_xattr_set, }; EXPORT_SYMBOL_GPL(posix_acl_default_xattr_handler); + +int simple_set_acl(struct inode *inode, struct posix_acl *acl, int type) +{ + int error; + + if (type == ACL_TYPE_ACCESS) { + error = posix_acl_equiv_mode(acl, &inode->i_mode); + if (error < 0) + return 0; + if (error == 0) + acl = NULL; + } + + inode->i_ctime = CURRENT_TIME; + set_cached_acl(inode, type, acl); + return 0; +} + +int simple_acl_create(struct inode *dir, struct inode *inode) +{ + struct posix_acl *default_acl, *acl; + int error; + + error = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl); + if (error) + return error; + + set_cached_acl(inode, ACL_TYPE_DEFAULT, default_acl); + set_cached_acl(inode, ACL_TYPE_ACCESS, acl); + + if (default_acl) + posix_acl_release(default_acl); + if (acl) + posix_acl_release(acl); + return 0; +} diff --git a/include/linux/generic_acl.h b/include/linux/generic_acl.h deleted file mode 100644 index b6d657544ef1..000000000000 --- a/include/linux/generic_acl.h +++ /dev/null @@ -1,14 +0,0 @@ -#ifndef LINUX_GENERIC_ACL_H -#define LINUX_GENERIC_ACL_H - -#include - -struct inode; - -extern const struct xattr_handler generic_acl_access_handler; -extern const struct xattr_handler generic_acl_default_handler; - -int generic_acl_init(struct inode *, struct inode *); -int generic_acl_chmod(struct inode *); - -#endif /* LINUX_GENERIC_ACL_H */ diff --git a/include/linux/posix_acl.h b/include/linux/posix_acl.h index ab859d5387ff..a956831b9d6d 100644 --- a/include/linux/posix_acl.h +++ b/include/linux/posix_acl.h @@ -100,6 +100,9 @@ extern int posix_acl_chmod(struct inode *, umode_t); extern int posix_acl_create(struct inode *, umode_t *, struct posix_acl **, struct posix_acl **); +extern int simple_set_acl(struct inode *, struct posix_acl *, int); +extern int simple_acl_create(struct inode *, struct inode *); + static inline struct posix_acl **acl_by_type(struct inode *inode, int type) { switch (type) { @@ -182,6 +185,12 @@ static inline int posix_acl_chmod(struct inode *inode, umode_t mode) return 0; } +#define simple_set_acl NULL + +static inline int simple_acl_create(struct inode *dir, struct inode *inode) +{ + return 0; +} static inline void cache_no_acl(struct inode *inode) { } diff --git a/mm/shmem.c b/mm/shmem.c index d888564825a3..1ee2a4747b6e 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -43,7 +43,7 @@ static struct vfsmount *shm_mnt; #include #include #include -#include +#include #include #include #include @@ -620,10 +620,8 @@ static int shmem_setattr(struct dentry *dentry, struct iattr *attr) } setattr_copy(inode, attr); -#ifdef CONFIG_TMPFS_POSIX_ACL if (attr->ia_valid & ATTR_MODE) - error = generic_acl_chmod(inode); -#endif + error = posix_acl_chmod(inode, inode->i_mode); return error; } @@ -1940,22 +1938,14 @@ shmem_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev) inode = shmem_get_inode(dir->i_sb, dir, mode, dev, VM_NORESERVE); if (inode) { -#ifdef CONFIG_TMPFS_POSIX_ACL - error = generic_acl_init(inode, dir); - if (error) { - iput(inode); - return error; - } -#endif + error = simple_acl_create(dir, inode); + if (error) + goto out_iput; error = security_inode_init_security(inode, dir, &dentry->d_name, shmem_initxattrs, NULL); - if (error) { - if (error != -EOPNOTSUPP) { - iput(inode); - return error; - } - } + if (error && error != -EOPNOTSUPP) + goto out_iput; error = 0; dir->i_size += BOGO_DIRENT_SIZE; @@ -1964,6 +1954,9 @@ shmem_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev) dget(dentry); /* Extra count - pin the dentry in core */ } return error; +out_iput: + iput(inode); + return error; } static int @@ -2228,8 +2221,8 @@ static int shmem_initxattrs(struct inode *inode, static const struct xattr_handler *shmem_xattr_handlers[] = { #ifdef CONFIG_TMPFS_POSIX_ACL - &generic_acl_access_handler, - &generic_acl_default_handler, + &posix_acl_access_xattr_handler, + &posix_acl_default_xattr_handler, #endif NULL }; @@ -2804,6 +2797,7 @@ static const struct inode_operations shmem_inode_operations = { .getxattr = shmem_getxattr, .listxattr = shmem_listxattr, .removexattr = shmem_removexattr, + .set_acl = simple_set_acl, #endif }; @@ -2828,6 +2822,7 @@ static const struct inode_operations shmem_dir_inode_operations = { #endif #ifdef CONFIG_TMPFS_POSIX_ACL .setattr = shmem_setattr, + .set_acl = simple_set_acl, #endif }; @@ -2840,6 +2835,7 @@ static const struct inode_operations shmem_special_inode_operations = { #endif #ifdef CONFIG_TMPFS_POSIX_ACL .setattr = shmem_setattr, + .set_acl = simple_set_acl, #endif }; From 9e84eb08f645e65e1acb47336c33f2a514c41333 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 20 Dec 2013 05:16:39 -0800 Subject: [PATCH 42/54] fs: add a set_acl inode operation This will allow moving all the Posix ACL handling into the VFS and clean up tons of cruft in the filesystems. Change-Id: I99d1ac617acb0da73722f0f977357f6a1ed4efab Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Signed-off-by: Al Viro --- include/linux/fs.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/linux/fs.h b/include/linux/fs.h index 0b65d9a8c3da..30accce1d895 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1712,6 +1712,7 @@ struct inode_operations { int (*fiemap)(struct inode *, struct fiemap_extent_info *, u64 start, u64 len); int (*update_time)(struct inode *, struct timespec *, int); + int (*set_acl)(struct inode *, struct posix_acl *, int); int (*atomic_open)(struct inode *, struct dentry *, struct file *, unsigned open_flag, umode_t create_mode, int *opened); From 0153fc02cfaf54e4f8bcd751714a17d3e10b30c5 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 31 Jan 2014 14:25:19 -0500 Subject: [PATCH 43/54] fs: get_acl() must be allowed to return EOPNOTSUPP posix_acl_xattr_get requires get_acl() to return EOPNOTSUPP if the filesystem cannot support acls. This is needed for NFS, which can't know whether or not the server supports acls until it tries to get/set one. This patch converts posix_acl_chmod and posix_acl_create to deal with EOPNOTSUPP return values from get_acl(). Change-Id: I931423ae763a4950056c7b20938be6f1f4536e24 Reported-by: Russell King Link: http://lkml.kernel.org/r/20140130140834.GW15937@n2100.arm.linux.org.uk Cc: Al Viro viro@zeniv.linux.org.uk> Reviewed-by: Christoph Hellwig Tested-by: Takashi Iwai Signed-off-by: Trond Myklebust --- fs/posix_acl.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/fs/posix_acl.c b/fs/posix_acl.c index a5012e8551e3..37865a0fe016 100644 --- a/fs/posix_acl.c +++ b/fs/posix_acl.c @@ -484,8 +484,11 @@ posix_acl_chmod(struct inode *inode, umode_t mode) return -EOPNOTSUPP; acl = get_acl(inode, ACL_TYPE_ACCESS); - if (IS_ERR_OR_NULL(acl)) + if (IS_ERR_OR_NULL(acl)) { + if (acl == ERR_PTR(-EOPNOTSUPP)) + return 0; return PTR_ERR(acl); + } ret = __posix_acl_chmod(&acl, GFP_KERNEL, mode); if (ret) @@ -507,14 +510,15 @@ posix_acl_create(struct inode *dir, umode_t *mode, goto no_acl; p = get_acl(dir, ACL_TYPE_DEFAULT); - if (IS_ERR(p)) + if (IS_ERR(p)) { + if (p == ERR_PTR(-EOPNOTSUPP)) + goto apply_umask; return PTR_ERR(p); - - if (!p) { - *mode &= ~current_umask(); - goto no_acl; } + if (!p) + goto apply_umask; + *acl = posix_acl_clone(p, GFP_NOFS); if (!*acl) return -ENOMEM; @@ -538,6 +542,8 @@ posix_acl_create(struct inode *dir, umode_t *mode, } return 0; +apply_umask: + *mode &= ~current_umask(); no_acl: *default_acl = NULL; *acl = NULL; From 37d48b7539c97a2d3831e7110c7775d1eac770bd Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 14 Feb 2014 12:05:49 +0300 Subject: [PATCH 44/54] fs: NULL dereference in posix_acl_to_xattr() commit 47ba9734403770a4c5e685b01f0a72b835dd4fff upstream. This patch moves the dereference of "buffer" after the check for NULL. The only place which passes a NULL parameter is gfs2_set_acl(). Change-Id: I7ede500c05e646e4c07238d159b8f182a1fbf80d Signed-off-by: Dan Carpenter Signed-off-by: Steven Whitehouse Signed-off-by: Greg Kroah-Hartman --- fs/posix_acl.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/posix_acl.c b/fs/posix_acl.c index 37865a0fe016..29947dbf69ae 100644 --- a/fs/posix_acl.c +++ b/fs/posix_acl.c @@ -686,7 +686,7 @@ posix_acl_to_xattr(struct user_namespace *user_ns, const struct posix_acl *acl, void *buffer, size_t size) { posix_acl_xattr_header *ext_acl = (posix_acl_xattr_header *)buffer; - posix_acl_xattr_entry *ext_entry = ext_acl->a_entries; + posix_acl_xattr_entry *ext_entry; int real_size, n; real_size = posix_acl_xattr_size(acl->a_count); @@ -694,7 +694,8 @@ posix_acl_to_xattr(struct user_namespace *user_ns, const struct posix_acl *acl, return real_size; if (real_size > size) return -ERANGE; - + + ext_entry = ext_acl->a_entries; ext_acl->a_version = cpu_to_le32(POSIX_ACL_XATTR_VERSION); for (n=0; n < acl->a_count; n++, ext_entry++) { From df3b3ae066c560a5b5fe0086203eb3adcafa0efe Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Mon, 23 Jun 2014 13:22:06 -0700 Subject: [PATCH 45/54] shmem: fix faulting into a hole while it's punched commit f00cdc6df7d7cfcabb5b740911e6788cb0802bdb upstream. Trinity finds that mmap access to a hole while it's punched from shmem can prevent the madvise(MADV_REMOVE) or fallocate(FALLOC_FL_PUNCH_HOLE) from completing, until the reader chooses to stop; with the puncher's hold on i_mutex locking out all other writers until it can complete. It appears that the tmpfs fault path is too light in comparison with its hole-punching path, lacking an i_data_sem to obstruct it; but we don't want to slow down the common case. Extend shmem_fallocate()'s existing range notification mechanism, so shmem_fault() can refrain from faulting pages into the hole while it's punched, waiting instead on i_mutex (when safe to sleep; or repeatedly faulting when not). [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Hugh Dickins Reported-by: Sasha Levin Tested-by: Sasha Levin Cc: Dave Jones Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman Change-Id: Icb632d07fb5c6c315ea3914aecdfae52c6536dc1 --- mm/shmem.c | 56 ++++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 52 insertions(+), 4 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index 1ee2a4747b6e..92f06fa16388 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -80,11 +80,12 @@ static struct vfsmount *shm_mnt; #define SHORT_SYMLINK_LEN 128 /* - * shmem_fallocate and shmem_writepage communicate via inode->i_private - * (with i_mutex making sure that it has only one user at a time): - * we would prefer not to enlarge the shmem inode just for that. + * shmem_fallocate communicates with shmem_fault or shmem_writepage via + * inode->i_private (with i_mutex making sure that it has only one user at + * a time): we would prefer not to enlarge the shmem inode just for that. */ struct shmem_falloc { + int mode; /* FALLOC_FL mode currently operating */ pgoff_t start; /* start of range currently being fallocated */ pgoff_t next; /* the next page offset to be fallocated */ pgoff_t nr_falloced; /* how many new pages have been fallocated */ @@ -824,6 +825,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) spin_lock(&inode->i_lock); shmem_falloc = inode->i_private; if (shmem_falloc && + !shmem_falloc->mode && index >= shmem_falloc->start && index < shmem_falloc->next) shmem_falloc->nr_unswapped++; @@ -1298,6 +1300,44 @@ static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf) int error; int ret = VM_FAULT_LOCKED; + /* + * Trinity finds that probing a hole which tmpfs is punching can + * prevent the hole-punch from ever completing: which in turn + * locks writers out with its hold on i_mutex. So refrain from + * faulting pages into the hole while it's being punched, and + * wait on i_mutex to be released if vmf->flags permits. + */ + if (unlikely(inode->i_private)) { + struct shmem_falloc *shmem_falloc; + + spin_lock(&inode->i_lock); + shmem_falloc = inode->i_private; + if (!shmem_falloc || + shmem_falloc->mode != FALLOC_FL_PUNCH_HOLE || + vmf->pgoff < shmem_falloc->start || + vmf->pgoff >= shmem_falloc->next) + shmem_falloc = NULL; + spin_unlock(&inode->i_lock); + /* + * i_lock has protected us from taking shmem_falloc seriously + * once return from shmem_fallocate() went back up that stack. + * i_lock does not serialize with i_mutex at all, but it does + * not matter if sometimes we wait unnecessarily, or sometimes + * miss out on waiting: we just need to make those cases rare. + */ + if (shmem_falloc) { + if ((vmf->flags & FAULT_FLAG_ALLOW_RETRY) && + !(vmf->flags & FAULT_FLAG_RETRY_NOWAIT)) { + up_read(&vma->vm_mm->mmap_sem); + mutex_lock(&inode->i_mutex); + mutex_unlock(&inode->i_mutex); + return VM_FAULT_RETRY; + } + /* cond_resched? Leave that to GUP or return to user */ + return VM_FAULT_NOPAGE; + } + } + error = shmem_getpage(inode, vmf->pgoff, &vmf->page, SGP_CACHE, &ret); if (error) return ((error == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS); @@ -1816,18 +1856,26 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset, mutex_lock(&inode->i_mutex); + shmem_falloc.mode = mode & ~FALLOC_FL_KEEP_SIZE; + if (mode & FALLOC_FL_PUNCH_HOLE) { struct address_space *mapping = file->f_mapping; loff_t unmap_start = round_up(offset, PAGE_SIZE); loff_t unmap_end = round_down(offset + len, PAGE_SIZE) - 1; + shmem_falloc.start = unmap_start >> PAGE_SHIFT; + shmem_falloc.next = (unmap_end + 1) >> PAGE_SHIFT; + spin_lock(&inode->i_lock); + inode->i_private = &shmem_falloc; + spin_unlock(&inode->i_lock); + if ((u64)unmap_end > (u64)unmap_start) unmap_mapping_range(mapping, unmap_start, 1 + unmap_end - unmap_start, 0); shmem_truncate_range(inode, offset, offset + len - 1); /* No need to unmap again: hole-punching leaves COWed pages */ error = 0; - goto out; + goto undone; } /* We need to check rlimit even when FALLOC_FL_KEEP_SIZE */ From d2a4f1e1a3fed74ff388799c6d7f16d0e81a4d26 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Wed, 23 Jul 2014 14:00:10 -0700 Subject: [PATCH 46/54] shmem: fix faulting into a hole, not taking i_mutex commit 8e205f779d1443a94b5ae81aa359cb535dd3021e upstream. Commit f00cdc6df7d7 ("shmem: fix faulting into a hole while it's punched") was buggy: Sasha sent a lockdep report to remind us that grabbing i_mutex in the fault path is a no-no (write syscall may already hold i_mutex while faulting user buffer). We tried a completely different approach (see following patch) but that proved inadequate: good enough for a rational workload, but not good enough against trinity - which forks off so many mappings of the object that contention on i_mmap_mutex while hole-puncher holds i_mutex builds into serious starvation when concurrent faults force the puncher to fall back to single-page unmap_mapping_range() searches of the i_mmap tree. So return to the original umbrella approach, but keep away from i_mutex this time. We really don't want to bloat every shmem inode with a new mutex or completion, just to protect this unlikely case from trinity. So extend the original with wait_queue_head on stack at the hole-punch end, and wait_queue item on the stack at the fault end. This involves further use of i_lock to guard against the races: lockdep has been happy so far, and I see fs/inode.c:unlock_new_inode() holds i_lock around wake_up_bit(), which is comparable to what we do here. i_lock is more convenient, but we could switch to shmem's info->lock. This issue has been tagged with CVE-2014-4171, which will require commit f00cdc6df7d7 and this and the following patch to be backported: we suggest to 3.1+, though in fact the trinity forkbomb effect might go back as far as 2.6.16, when madvise(,,MADV_REMOVE) came in - or might not, since much has changed, with i_mmap_mutex a spinlock before 3.0. Anyone running trinity on 3.0 and earlier? I don't think we need care. Change-Id: Ia8f6ca7b0c1ef0c14766bc81fee73e47bdb1738a Signed-off-by: Hugh Dickins Reported-by: Sasha Levin Tested-by: Sasha Levin Cc: Vlastimil Babka Cc: Konstantin Khlebnikov Cc: Johannes Weiner Cc: Lukas Czerner Cc: Dave Jones Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- mm/shmem.c | 78 ++++++++++++++++++++++++++++++++++++------------------ 1 file changed, 52 insertions(+), 26 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index 92f06fa16388..fe1e2313c6ad 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -85,7 +85,7 @@ static struct vfsmount *shm_mnt; * a time): we would prefer not to enlarge the shmem inode just for that. */ struct shmem_falloc { - int mode; /* FALLOC_FL mode currently operating */ + wait_queue_head_t *waitq; /* faults into hole wait for punch to end */ pgoff_t start; /* start of range currently being fallocated */ pgoff_t next; /* the next page offset to be fallocated */ pgoff_t nr_falloced; /* how many new pages have been fallocated */ @@ -825,7 +825,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) spin_lock(&inode->i_lock); shmem_falloc = inode->i_private; if (shmem_falloc && - !shmem_falloc->mode && + !shmem_falloc->waitq && index >= shmem_falloc->start && index < shmem_falloc->next) shmem_falloc->nr_unswapped++; @@ -1304,38 +1304,58 @@ static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf) * Trinity finds that probing a hole which tmpfs is punching can * prevent the hole-punch from ever completing: which in turn * locks writers out with its hold on i_mutex. So refrain from - * faulting pages into the hole while it's being punched, and - * wait on i_mutex to be released if vmf->flags permits. + * faulting pages into the hole while it's being punched. Although + * shmem_undo_range() does remove the additions, it may be unable to + * keep up, as each new page needs its own unmap_mapping_range() call, + * and the i_mmap tree grows ever slower to scan if new vmas are added. + * + * It does not matter if we sometimes reach this check just before the + * hole-punch begins, so that one fault then races with the punch: + * we just need to make racing faults a rare case. + * + * The implementation below would be much simpler if we just used a + * standard mutex or completion: but we cannot take i_mutex in fault, + * and bloating every shmem inode for this unlikely case would be sad. */ if (unlikely(inode->i_private)) { struct shmem_falloc *shmem_falloc; spin_lock(&inode->i_lock); shmem_falloc = inode->i_private; - if (!shmem_falloc || - shmem_falloc->mode != FALLOC_FL_PUNCH_HOLE || - vmf->pgoff < shmem_falloc->start || - vmf->pgoff >= shmem_falloc->next) - shmem_falloc = NULL; - spin_unlock(&inode->i_lock); - /* - * i_lock has protected us from taking shmem_falloc seriously - * once return from shmem_fallocate() went back up that stack. - * i_lock does not serialize with i_mutex at all, but it does - * not matter if sometimes we wait unnecessarily, or sometimes - * miss out on waiting: we just need to make those cases rare. - */ - if (shmem_falloc) { + if (shmem_falloc && + shmem_falloc->waitq && + vmf->pgoff >= shmem_falloc->start && + vmf->pgoff < shmem_falloc->next) { + wait_queue_head_t *shmem_falloc_waitq; + DEFINE_WAIT(shmem_fault_wait); + + ret = VM_FAULT_NOPAGE; if ((vmf->flags & FAULT_FLAG_ALLOW_RETRY) && !(vmf->flags & FAULT_FLAG_RETRY_NOWAIT)) { + /* It's polite to up mmap_sem if we can */ up_read(&vma->vm_mm->mmap_sem); - mutex_lock(&inode->i_mutex); - mutex_unlock(&inode->i_mutex); - return VM_FAULT_RETRY; + ret = VM_FAULT_RETRY; } - /* cond_resched? Leave that to GUP or return to user */ - return VM_FAULT_NOPAGE; + + shmem_falloc_waitq = shmem_falloc->waitq; + prepare_to_wait(shmem_falloc_waitq, &shmem_fault_wait, + TASK_UNINTERRUPTIBLE); + spin_unlock(&inode->i_lock); + schedule(); + + /* + * shmem_falloc_waitq points into the shmem_fallocate() + * stack of the hole-punching task: shmem_falloc_waitq + * is usually invalid by the time we reach here, but + * finish_wait() does not dereference it in that case; + * though i_lock needed lest racing with wake_up_all(). + */ + spin_lock(&inode->i_lock); + finish_wait(shmem_falloc_waitq, &shmem_fault_wait); + spin_unlock(&inode->i_lock); + return ret; } + spin_unlock(&inode->i_lock); } error = shmem_getpage(inode, vmf->pgoff, &vmf->page, SGP_CACHE, &ret); @@ -1856,13 +1876,13 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset, mutex_lock(&inode->i_mutex); - shmem_falloc.mode = mode & ~FALLOC_FL_KEEP_SIZE; - if (mode & FALLOC_FL_PUNCH_HOLE) { struct address_space *mapping = file->f_mapping; loff_t unmap_start = round_up(offset, PAGE_SIZE); loff_t unmap_end = round_down(offset + len, PAGE_SIZE) - 1; + DECLARE_WAIT_QUEUE_HEAD_ONSTACK(shmem_falloc_waitq); + shmem_falloc.waitq = &shmem_falloc_waitq; shmem_falloc.start = unmap_start >> PAGE_SHIFT; shmem_falloc.next = (unmap_end + 1) >> PAGE_SHIFT; spin_lock(&inode->i_lock); @@ -1874,8 +1894,13 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset, 1 + unmap_end - unmap_start, 0); shmem_truncate_range(inode, offset, offset + len - 1); /* No need to unmap again: hole-punching leaves COWed pages */ + + spin_lock(&inode->i_lock); + inode->i_private = NULL; + wake_up_all(&shmem_falloc_waitq); + spin_unlock(&inode->i_lock); error = 0; - goto undone; + goto out; } /* We need to check rlimit even when FALLOC_FL_KEEP_SIZE */ @@ -1891,6 +1916,7 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset, goto out; } + shmem_falloc.waitq = NULL; shmem_falloc.start = start; shmem_falloc.next = start; shmem_falloc.nr_falloced = 0; From 5561fecf89f108db53271100baf424708efc3c2f Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Wed, 23 Jul 2014 14:00:13 -0700 Subject: [PATCH 47/54] shmem: fix splicing from a hole while it's punched commit b1a366500bd537b50c3aad26dc7df083ec03a448 upstream. shmem_fault() is the actual culprit in trinity's hole-punch starvation, and the most significant cause of such problems: since a page faulted is one that then appears page_mapped(), needing unmap_mapping_range() and i_mmap_mutex to be unmapped again. But it is not the only way in which a page can be brought into a hole in the radix_tree while that hole is being punched; and Vlastimil's testing implies that if enough other processors are busy filling in the hole, then shmem_undo_range() can be kept from completing indefinitely. shmem_file_splice_read() is the main other user of SGP_CACHE, which can instantiate shmem pagecache pages in the read-only case (without holding i_mutex, so perhaps concurrently with a hole-punch). Probably it's silly not to use SGP_READ already (using the ZERO_PAGE for holes): which ought to be safe, but might bring surprises - not a change to be rushed. shmem_read_mapping_page_gfp() is an internal interface used by drivers/gpu/drm GEM (and next by uprobes): it should be okay. And shmem_file_read_iter() uses the SGP_DIRTY variant of SGP_CACHE, when called internally by the kernel (perhaps for a stacking filesystem, which might rely on holes to be reserved): it's unclear whether it could be provoked to keep hole-punch busy or not. We could apply the same umbrella as now used in shmem_fault() to shmem_file_splice_read() and the others; but it looks ugly, and use over a range raises questions - should it actually be per page? can these get starved themselves? The origin of this part of the problem is my v3.1 commit d0823576bf4b ("mm: pincer in truncate_inode_pages_range"), once it was duplicated into shmem.c. It seemed like a nice idea at the time, to ensure (barring RCU lookup fuzziness) that there's an instant when the entire hole is empty; but the indefinitely repeated scans to ensure that make it vulnerable. Revert that "enhancement" to hole-punch from shmem_undo_range(), but retain the unproblematic rescanning when it's truncating; add a couple of comments there. Remove the "indices[0] >= end" test: that is now handled satisfactorily by the inner loop, and mem_cgroup_uncharge_start()/end() are too light to be worth avoiding here. But if we do not always loop indefinitely, we do need to handle the case of swap swizzled back to page before shmem_free_swap() gets it: add a retry for that case, as suggested by Konstantin Khlebnikov; and for the case of page swizzled back to swap, as suggested by Johannes Weiner. Change-Id: I8fdc4c0d5ea757161b9cfff11a355402dbb58edf Signed-off-by: Hugh Dickins Reported-by: Sasha Levin Suggested-by: Vlastimil Babka Cc: Konstantin Khlebnikov Cc: Johannes Weiner Cc: Lukas Czerner Cc: Dave Jones Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- mm/shmem.c | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index fe1e2313c6ad..82b5d49c18b3 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -534,22 +534,19 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, return; index = start; - for ( ; ; ) { + while (index < end) { cond_resched(); pvec.nr = shmem_find_get_pages_and_swap(mapping, index, min(end - index, (pgoff_t)PAGEVEC_SIZE), pvec.pages, indices); if (!pvec.nr) { - if (index == start || unfalloc) + /* If all gone or hole-punch or unfalloc, we're done */ + if (index == start || end != -1) break; + /* But if truncating, restart to make sure all gone */ index = start; continue; } - if ((index == start || unfalloc) && indices[0] >= end) { - shmem_deswap_pagevec(&pvec); - pagevec_release(&pvec); - break; - } mem_cgroup_uncharge_start(); for (i = 0; i < pagevec_count(&pvec); i++) { struct page *page = pvec.pages[i]; @@ -561,8 +558,12 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, if (radix_tree_exceptional_entry(page)) { if (unfalloc) continue; - nr_swaps_freed += !shmem_free_swap(mapping, - index, page); + if (shmem_free_swap(mapping, index, page)) { + /* Swap was replaced by page: retry */ + index--; + break; + } + nr_swaps_freed++; continue; } @@ -571,6 +572,11 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, if (page->mapping == mapping) { VM_BUG_ON(PageWriteback(page)); truncate_inode_page(mapping, page); + } else { + /* Page was replaced by swap: retry */ + unlock_page(page); + index--; + break; } } unlock_page(page); From 78963a7d8a7126f99d2ea5ecb562afed0cf95813 Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Wed, 22 May 2013 14:50:31 +0900 Subject: [PATCH 48/54] sched: add cond_resched_rcu() helper This is intended for use in loops which read data protected by RCU and may have a large number of iterations. Such an example is dumping the list of connections known to IPVS: ip_vs_conn_array() and ip_vs_conn_seq_next(). The benefits are for CONFIG_PREEMPT_RCU=y where we save CPU cycles by moving rcu_read_lock and rcu_read_unlock out of large loops but still allowing the current task to be preempted after every loop iteration for the CONFIG_PREEMPT_RCU=n case. The call to cond_resched() is not needed when CONFIG_PREEMPT_RCU=y. Thanks to Paul E. McKenney for explaining this and for the final version that checks the context with CONFIG_DEBUG_ATOMIC_SLEEP=y for all possible configurations. The function can be empty in the CONFIG_PREEMPT_RCU case, rcu_read_lock and rcu_read_unlock are not needed in this case because the task can be preempted on indication from scheduler. Thanks to Peter Zijlstra for catching this and for his help in trying a solution that changes __might_sleep. Initial cond_resched_rcu_lock() function suggested by Eric Dumazet. Change-Id: Ia218ce69df122c64c3aed2166a1a7507d41fbcad Tested-by: Julian Anastasov Signed-off-by: Julian Anastasov Signed-off-by: Simon Horman Acked-by: Peter Zijlstra Signed-off-by: Pablo Neira Ayuso --- include/linux/sched.h | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/include/linux/sched.h b/include/linux/sched.h index c89044df2912..5da4eed8c6f0 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2722,6 +2722,15 @@ extern int __cond_resched_softirq(void); __cond_resched_softirq(); \ }) +static inline void cond_resched_rcu(void) +{ +#if defined(CONFIG_DEBUG_ATOMIC_SLEEP) || !defined(CONFIG_PREEMPT_RCU) + rcu_read_unlock(); + cond_resched(); + rcu_read_lock(); +#endif +} + /* * Does a critical section need to be broken due to another * task waiting?: (technically does not depend on CONFIG_PREEMPT, From 19b70600eaab9f32e5b3f17487e4342f111bdc36 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 11 Sep 2013 14:20:20 -0700 Subject: [PATCH 49/54] mm: mmap_region: kill correct_wcount/inode, use allow_write_access() correct_wcount and inode in mmap_region() just complicate the code. This boolean was needed previously, when deny_write_access() was called before vma_merge(), now we can simply check VM_DENYWRITE and do allow_write_access() if it is set. allow_write_access() checks file != NULL, so this is safe even if it was possible to use VM_DENYWRITE && !file. Just we need to ensure we use the same file which was deny_write_access()'ed, so the patch also moves "file = vma->vm_file" down after allow_write_access(). Change-Id: I80d3674ce40fb97a80b128ea63edc86ca1770d20 Signed-off-by: Oleg Nesterov Cc: Hugh Dickins Cc: Al Viro Cc: Colin Cross Cc: David Rientjes Cc: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmap.c | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/mm/mmap.c b/mm/mmap.c index e846196cc494..2c545ec5e99a 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1291,11 +1291,9 @@ unsigned long mmap_region(struct file *file, unsigned long addr, { struct mm_struct *mm = current->mm; struct vm_area_struct *vma, *prev; - int correct_wcount = 0; int error; struct rb_node **rb_link, *rb_parent; unsigned long charged = 0; - struct inode *inode = file ? file->f_path.dentry->d_inode : NULL; /* Check against address space limit. */ if (!may_expand_vm(mm, len >> PAGE_SHIFT)) { @@ -1384,7 +1382,6 @@ munmap_back: error = deny_write_access(file); if (error) goto free_vma; - correct_wcount = 1; } vma->vm_file = file; get_file(file); @@ -1426,11 +1423,10 @@ munmap_back: } vma_link(mm, vma, prev, rb_link, rb_parent); - file = vma->vm_file; - /* Once vma denies write, undo our temporary denial count */ - if (correct_wcount) - atomic_inc(&inode->i_writecount); + if (vm_flags & VM_DENYWRITE) + allow_write_access(file); + file = vma->vm_file; out: perf_event_mmap(vma); @@ -1444,8 +1440,8 @@ out: return addr; unmap_and_free_vma: - if (correct_wcount) - atomic_inc(&inode->i_writecount); + if (vm_flags & VM_DENYWRITE) + allow_write_access(file); vma->vm_file = NULL; fput(file); From f54eb0ab107fdb61c1572b3584ebab03cb0fa687 Mon Sep 17 00:00:00 2001 From: David Herrmann Date: Fri, 8 Aug 2014 14:25:25 -0700 Subject: [PATCH 50/54] mm: allow drivers to prevent new writable mappings This patch (of 6): The i_mmap_writable field counts existing writable mappings of an address_space. To allow drivers to prevent new writable mappings, make this counter signed and prevent new writable mappings if it is negative. This is modelled after i_writecount and DENYWRITE. This will be required by the shmem-sealing infrastructure to prevent any new writable mappings after the WRITE seal has been set. In case there exists a writable mapping, this operation will fail with EBUSY. Note that we rely on the fact that iff you already own a writable mapping, you can increase the counter without using the helpers. This is the same that we do for i_writecount. Change-Id: Id16c5b650e451956a4f6df004483cb63197c613c Signed-off-by: David Herrmann Acked-by: Hugh Dickins Cc: Michael Kerrisk Cc: Ryan Lortie Cc: Lennart Poettering Cc: Daniel Mack Cc: Andy Lutomirski Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/inode.c | 1 + include/linux/fs.h | 29 +++++++++++++++++++++++++++-- kernel/fork.c | 2 +- mm/mmap.c | 30 ++++++++++++++++++++++++------ mm/swap_state.c | 1 + 5 files changed, 54 insertions(+), 9 deletions(-) diff --git a/fs/inode.c b/fs/inode.c index d930c1523ab0..8b37e668a4fb 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -164,6 +164,7 @@ int inode_init_always(struct super_block *sb, struct inode *inode) mapping->a_ops = &empty_aops; mapping->host = inode; mapping->flags = 0; + atomic_set(&mapping->i_mmap_writable, 0); mapping_set_gfp_mask(mapping, GFP_HIGHUSER_MOVABLE); mapping->assoc_mapping = NULL; mapping->backing_dev_info = &default_backing_dev_info; diff --git a/include/linux/fs.h b/include/linux/fs.h index 30accce1d895..24ada387dc96 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -684,7 +684,7 @@ struct address_space { struct inode *host; /* owner: inode, block_device */ struct radix_tree_root page_tree; /* radix tree of all pages */ spinlock_t tree_lock; /* and lock protecting it */ - unsigned int i_mmap_writable;/* count VM_SHARED mappings */ + atomic_t i_mmap_writable;/* count VM_SHARED mappings */ struct prio_tree_root i_mmap; /* tree of private and shared mappings */ struct list_head i_mmap_nonlinear;/*list VM_NONLINEAR mappings */ struct mutex i_mmap_mutex; /* protect tree, count, list */ @@ -766,10 +766,35 @@ static inline int mapping_mapped(struct address_space *mapping) * Note that i_mmap_writable counts all VM_SHARED vmas: do_mmap_pgoff * marks vma as VM_SHARED if it is shared, and the file was opened for * writing i.e. vma may be mprotected writable even if now readonly. + * + * If i_mmap_writable is negative, no new writable mappings are allowed. You + * can only deny writable mappings, if none exists right now. */ static inline int mapping_writably_mapped(struct address_space *mapping) { - return mapping->i_mmap_writable != 0; + return atomic_read(&mapping->i_mmap_writable) > 0; +} + +static inline int mapping_map_writable(struct address_space *mapping) +{ + return atomic_inc_unless_negative(&mapping->i_mmap_writable) ? + 0 : -EPERM; +} + +static inline void mapping_unmap_writable(struct address_space *mapping) +{ + atomic_dec(&mapping->i_mmap_writable); +} + +static inline int mapping_deny_writable(struct address_space *mapping) +{ + return atomic_dec_unless_positive(&mapping->i_mmap_writable) ? + 0 : -EBUSY; +} + +static inline void mapping_allow_writable(struct address_space *mapping) +{ + atomic_inc(&mapping->i_mmap_writable); } /* diff --git a/kernel/fork.c b/kernel/fork.c index 8959d4f1da63..8d805b800c03 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -416,7 +416,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) atomic_dec(&inode->i_writecount); mutex_lock(&mapping->i_mmap_mutex); if (tmp->vm_flags & VM_SHARED) - mapping->i_mmap_writable++; + atomic_inc(&mapping->i_mmap_writable); flush_dcache_mmap_lock(mapping); /* insert tmp into the share list, just after mpnt */ vma_prio_tree_add(tmp, mpnt); diff --git a/mm/mmap.c b/mm/mmap.c index 2c545ec5e99a..ddd34ed92051 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -207,7 +207,7 @@ static void __remove_shared_vm_struct(struct vm_area_struct *vma, if (vma->vm_flags & VM_DENYWRITE) atomic_inc(&file->f_path.dentry->d_inode->i_writecount); if (vma->vm_flags & VM_SHARED) - mapping->i_mmap_writable--; + mapping_unmap_writable(mapping); flush_dcache_mmap_lock(mapping); if (unlikely(vma->vm_flags & VM_NONLINEAR)) @@ -453,7 +453,7 @@ static void __vma_link_file(struct vm_area_struct *vma) if (vma->vm_flags & VM_DENYWRITE) atomic_dec(&file->f_path.dentry->d_inode->i_writecount); if (vma->vm_flags & VM_SHARED) - mapping->i_mmap_writable++; + atomic_inc(&mapping->i_mmap_writable); flush_dcache_mmap_lock(mapping); if (unlikely(vma->vm_flags & VM_NONLINEAR)) @@ -1383,6 +1383,17 @@ munmap_back: if (error) goto free_vma; } + if (vm_flags & VM_SHARED) { + error = mapping_map_writable(file->f_mapping); + if (error) + goto allow_write_and_free_vma; + } + + /* ->mmap() can change vma->vm_file, but must guarantee that + * vma_link() below can deny write-access if VM_DENYWRITE is set + * and map writably if VM_SHARED is set. This usually means the + * new file must not have been exposed to user-space, yet. + */ vma->vm_file = file; get_file(file); error = file->f_op->mmap(file, vma); @@ -1424,8 +1435,12 @@ munmap_back: vma_link(mm, vma, prev, rb_link, rb_parent); /* Once vma denies write, undo our temporary denial count */ - if (vm_flags & VM_DENYWRITE) - allow_write_access(file); + if (file) { + if (vm_flags & VM_SHARED) + mapping_unmap_writable(file->f_mapping); + if (vm_flags & VM_DENYWRITE) + allow_write_access(file); + } file = vma->vm_file; out: perf_event_mmap(vma); @@ -1440,14 +1455,17 @@ out: return addr; unmap_and_free_vma: - if (vm_flags & VM_DENYWRITE) - allow_write_access(file); vma->vm_file = NULL; fput(file); /* Undo any partial mapping done by a device driver. */ unmap_region(mm, vma, prev, vma->vm_start, vma->vm_end); charged = 0; + if (vm_flags & VM_SHARED) + mapping_unmap_writable(file->f_mapping); +allow_write_and_free_vma: + if (vm_flags & VM_DENYWRITE) + allow_write_access(file); free_vma: kmem_cache_free(vm_area_cachep, vma); unacct_error: diff --git a/mm/swap_state.c b/mm/swap_state.c index 86065b4d02a1..1d181b3353c2 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -38,6 +38,7 @@ static struct backing_dev_info swap_backing_dev_info = { struct address_space swapper_spaces[MAX_SWAPFILES] = { [0 ... MAX_SWAPFILES - 1] = { .page_tree = RADIX_TREE_INIT(GFP_ATOMIC|__GFP_NOWARN), + .i_mmap_writable = ATOMIC_INIT(0), .a_ops = &swap_aops, .backing_dev_info = &swap_backing_dev_info, } From 8412a7833c1a59162d9dcb1012a2bae9ab23a438 Mon Sep 17 00:00:00 2001 From: David Herrmann Date: Fri, 8 Aug 2014 14:25:27 -0700 Subject: [PATCH 51/54] shm: add sealing API If two processes share a common memory region, they usually want some guarantees to allow safe access. This often includes: - one side cannot overwrite data while the other reads it - one side cannot shrink the buffer while the other accesses it - one side cannot grow the buffer beyond previously set boundaries If there is a trust-relationship between both parties, there is no need for policy enforcement. However, if there's no trust relationship (eg., for general-purpose IPC) sharing memory-regions is highly fragile and often not possible without local copies. Look at the following two use-cases: 1) A graphics client wants to share its rendering-buffer with a graphics-server. The memory-region is allocated by the client for read/write access and a second FD is passed to the server. While scanning out from the memory region, the server has no guarantee that the client doesn't shrink the buffer at any time, requiring rather cumbersome SIGBUS handling. 2) A process wants to perform an RPC on another process. To avoid huge bandwidth consumption, zero-copy is preferred. After a message is assembled in-memory and a FD is passed to the remote side, both sides want to be sure that neither modifies this shared copy, anymore. The source may have put sensible data into the message without a separate copy and the target may want to parse the message inline, to avoid a local copy. While SIGBUS handling, POSIX mandatory locking and MAP_DENYWRITE provide ways to achieve most of this, the first one is unproportionally ugly to use in libraries and the latter two are broken/racy or even disabled due to denial of service attacks. This patch introduces the concept of SEALING. If you seal a file, a specific set of operations is blocked on that file forever. Unlike locks, seals can only be set, never removed. Hence, once you verified a specific set of seals is set, you're guaranteed that no-one can perform the blocked operations on this file, anymore. An initial set of SEALS is introduced by this patch: - SHRINK: If SEAL_SHRINK is set, the file in question cannot be reduced in size. This affects ftruncate() and open(O_TRUNC). - GROW: If SEAL_GROW is set, the file in question cannot be increased in size. This affects ftruncate(), fallocate() and write(). - WRITE: If SEAL_WRITE is set, no write operations (besides resizing) are possible. This affects fallocate(PUNCH_HOLE), mmap() and write(). - SEAL: If SEAL_SEAL is set, no further seals can be added to a file. This basically prevents the F_ADD_SEAL operation on a file and can be set to prevent others from adding further seals that you don't want. The described use-cases can easily use these seals to provide safe use without any trust-relationship: 1) The graphics server can verify that a passed file-descriptor has SEAL_SHRINK set. This allows safe scanout, while the client is allowed to increase buffer size for window-resizing on-the-fly. Concurrent writes are explicitly allowed. 2) For general-purpose IPC, both processes can verify that SEAL_SHRINK, SEAL_GROW and SEAL_WRITE are set. This guarantees that neither process can modify the data while the other side parses it. Furthermore, it guarantees that even with writable FDs passed to the peer, it cannot increase the size to hit memory-limits of the source process (in case the file-storage is accounted to the source). The new API is an extension to fcntl(), adding two new commands: F_GET_SEALS: Return a bitset describing the seals on the file. This can be called on any FD if the underlying file supports sealing. F_ADD_SEALS: Change the seals of a given file. This requires WRITE access to the file and F_SEAL_SEAL may not already be set. Furthermore, the underlying file must support sealing and there may not be any existing shared mapping of that file. Otherwise, EBADF/EPERM is returned. The given seals are _added_ to the existing set of seals on the file. You cannot remove seals again. The fcntl() handler is currently specific to shmem and disabled on all files. A file needs to explicitly support sealing for this interface to work. A separate syscall is added in a follow-up, which creates files that support sealing. There is no intention to support this on other file-systems. Semantics are unclear for non-volatile files and we lack any use-case right now. Therefore, the implementation is specific to shmem. Change-Id: I2d6247d3287c61dbe6bafabf56554e80b414f938 Signed-off-by: David Herrmann Acked-by: Hugh Dickins Cc: Michael Kerrisk Cc: Ryan Lortie Cc: Lennart Poettering Cc: Daniel Mack Cc: Andy Lutomirski Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fcntl.c | 5 ++ include/linux/fcntl.h | 15 ++++ include/linux/shmem_fs.h | 17 +++++ mm/shmem.c | 143 +++++++++++++++++++++++++++++++++++++++ 4 files changed, 180 insertions(+) diff --git a/fs/fcntl.c b/fs/fcntl.c index 281c0da783e6..9a1e67c05478 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include @@ -420,6 +421,10 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg, case F_GETPIPE_SZ: err = pipe_fcntl(filp, cmd, arg); break; + case F_ADD_SEALS: + case F_GET_SEALS: + err = shmem_fcntl(filp, cmd, arg); + break; default: break; } diff --git a/include/linux/fcntl.h b/include/linux/fcntl.h index f550f894ba15..a558f455316a 100644 --- a/include/linux/fcntl.h +++ b/include/linux/fcntl.h @@ -27,6 +27,21 @@ #define F_SETPIPE_SZ (F_LINUX_SPECIFIC_BASE + 7) #define F_GETPIPE_SZ (F_LINUX_SPECIFIC_BASE + 8) +/* + * Set/Get seals + */ +#define F_ADD_SEALS (F_LINUX_SPECIFIC_BASE + 9) +#define F_GET_SEALS (F_LINUX_SPECIFIC_BASE + 10) + +/* + * Types of seals + */ +#define F_SEAL_SEAL 0x0001 /* prevent further seals from being set */ +#define F_SEAL_SHRINK 0x0002 /* prevent file from shrinking */ +#define F_SEAL_GROW 0x0004 /* prevent file from growing */ +#define F_SEAL_WRITE 0x0008 /* prevent writes */ +/* (1U << 31) is reserved for signed error codes */ + /* * Types of directory notifications that may be requested. */ diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index f2410d3354d4..b2eb7f0c9cd4 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -1,6 +1,7 @@ #ifndef __SHMEM_FS_H #define __SHMEM_FS_H +#include #include #include #include @@ -11,6 +12,7 @@ struct shmem_inode_info { spinlock_t lock; + unsigned int seals; /* shmem seals */ unsigned long flags; unsigned long alloced; /* data pages alloced to file */ union { @@ -64,4 +66,19 @@ static inline struct page *shmem_read_mapping_page( mapping_gfp_mask(mapping)); } +#ifdef CONFIG_TMPFS + +extern int shmem_add_seals(struct file *file, unsigned int seals); +extern int shmem_get_seals(struct file *file); +extern long shmem_fcntl(struct file *file, unsigned int cmd, unsigned long arg); + +#else + +static inline long shmem_fcntl(struct file *f, unsigned int c, unsigned long a) +{ + return -EINVAL; +} + +#endif + #endif diff --git a/mm/shmem.c b/mm/shmem.c index 82b5d49c18b3..f6b30537c353 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -65,6 +65,7 @@ static struct vfsmount *shm_mnt; #include #include #include +#include #include #include @@ -603,6 +604,7 @@ EXPORT_SYMBOL_GPL(shmem_truncate_range); static int shmem_setattr(struct dentry *dentry, struct iattr *attr) { struct inode *inode = dentry->d_inode; + struct shmem_inode_info *info = SHMEM_I(inode); int error; error = inode_change_ok(inode, attr); @@ -613,6 +615,11 @@ static int shmem_setattr(struct dentry *dentry, struct iattr *attr) loff_t oldsize = inode->i_size; loff_t newsize = attr->ia_size; + /* protected by i_mutex */ + if ((newsize < oldsize && (info->seals & F_SEAL_SHRINK)) || + (newsize > oldsize && (info->seals & F_SEAL_GROW))) + return -EPERM; + if (newsize != oldsize) { i_size_write(inode, newsize); inode->i_ctime = inode->i_mtime = CURRENT_TIME; @@ -1447,6 +1454,7 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode info = SHMEM_I(inode); memset(info, 0, (char *)inode - (char *)info); spin_lock_init(&info->lock); + info->seals = F_SEAL_SEAL; info->flags = flags & VM_NORESERVE; INIT_LIST_HEAD(&info->swaplist); simple_xattrs_init(&info->xattrs); @@ -1500,7 +1508,17 @@ shmem_write_begin(struct file *file, struct address_space *mapping, struct page **pagep, void **fsdata) { struct inode *inode = mapping->host; + struct shmem_inode_info *info = SHMEM_I(inode); pgoff_t index = pos >> PAGE_CACHE_SHIFT; + + /* i_mutex is held by caller */ + if (unlikely(info->seals)) { + if (info->seals & F_SEAL_WRITE) + return -EPERM; + if ((info->seals & F_SEAL_GROW) && pos + len > inode->i_size) + return -EPERM; + } + return shmem_getpage(inode, index, pagep, SGP_WRITE, NULL); } @@ -1871,11 +1889,125 @@ static loff_t shmem_file_llseek(struct file *file, loff_t offset, int origin) return offset; } +static int shmem_wait_for_pins(struct address_space *mapping) +{ + return 0; +} + +#define F_ALL_SEALS (F_SEAL_SEAL | \ + F_SEAL_SHRINK | \ + F_SEAL_GROW | \ + F_SEAL_WRITE) + +int shmem_add_seals(struct file *file, unsigned int seals) +{ + struct inode *inode = file->f_path.dentry->d_inode; + struct shmem_inode_info *info = SHMEM_I(inode); + int error; + + /* + * SEALING + * Sealing allows multiple parties to share a shmem-file but restrict + * access to a specific subset of file operations. Seals can only be + * added, but never removed. This way, mutually untrusted parties can + * share common memory regions with a well-defined policy. A malicious + * peer can thus never perform unwanted operations on a shared object. + * + * Seals are only supported on special shmem-files and always affect + * the whole underlying inode. Once a seal is set, it may prevent some + * kinds of access to the file. Currently, the following seals are + * defined: + * SEAL_SEAL: Prevent further seals from being set on this file + * SEAL_SHRINK: Prevent the file from shrinking + * SEAL_GROW: Prevent the file from growing + * SEAL_WRITE: Prevent write access to the file + * + * As we don't require any trust relationship between two parties, we + * must prevent seals from being removed. Therefore, sealing a file + * only adds a given set of seals to the file, it never touches + * existing seals. Furthermore, the "setting seals"-operation can be + * sealed itself, which basically prevents any further seal from being + * added. + * + * Semantics of sealing are only defined on volatile files. Only + * anonymous shmem files support sealing. More importantly, seals are + * never written to disk. Therefore, there's no plan to support it on + * other file types. + */ + + if (file->f_op != &shmem_file_operations) + return -EINVAL; + if (!(file->f_mode & FMODE_WRITE)) + return -EPERM; + if (seals & ~(unsigned int)F_ALL_SEALS) + return -EINVAL; + + mutex_lock(&inode->i_mutex); + + if (info->seals & F_SEAL_SEAL) { + error = -EPERM; + goto unlock; + } + + if ((seals & F_SEAL_WRITE) && !(info->seals & F_SEAL_WRITE)) { + error = mapping_deny_writable(file->f_mapping); + if (error) + goto unlock; + + error = shmem_wait_for_pins(file->f_mapping); + if (error) { + mapping_allow_writable(file->f_mapping); + goto unlock; + } + } + + info->seals |= seals; + error = 0; + +unlock: + mutex_unlock(&inode->i_mutex); + return error; +} +EXPORT_SYMBOL_GPL(shmem_add_seals); + +int shmem_get_seals(struct file *file) +{ + if (file->f_op != &shmem_file_operations) + return -EINVAL; + + return SHMEM_I(file->f_path.dentry->d_inode)->seals; +} +EXPORT_SYMBOL_GPL(shmem_get_seals); + +long shmem_fcntl(struct file *file, unsigned int cmd, unsigned long arg) +{ + long error; + + switch (cmd) { + case F_ADD_SEALS: + /* disallow upper 32bit */ + if (arg > UINT_MAX) + return -EINVAL; + + error = shmem_add_seals(file, arg); + break; + case F_GET_SEALS: + error = shmem_get_seals(file); + break; + default: + error = -EINVAL; + break; + } + + return error; +} + static long shmem_fallocate(struct file *file, int mode, loff_t offset, loff_t len) { struct inode *inode = file->f_path.dentry->d_inode; struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); + struct shmem_inode_info *info = SHMEM_I(inode); struct shmem_falloc shmem_falloc; pgoff_t start, index, end; int error; @@ -1888,6 +2020,12 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset, loff_t unmap_end = round_down(offset + len, PAGE_SIZE) - 1; DECLARE_WAIT_QUEUE_HEAD_ONSTACK(shmem_falloc_waitq); + /* protected by i_mutex */ + if (info->seals & F_SEAL_WRITE) { + error = -EPERM; + goto out; + } + shmem_falloc.waitq = &shmem_falloc_waitq; shmem_falloc.start = unmap_start >> PAGE_SHIFT; shmem_falloc.next = (unmap_end + 1) >> PAGE_SHIFT; @@ -1914,6 +2052,11 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset, if (error) goto out; + if ((info->seals & F_SEAL_GROW) && offset + len > inode->i_size) { + error = -EPERM; + goto out; + } + start = offset >> PAGE_CACHE_SHIFT; end = (offset + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; /* Try to avoid a swapstorm if len is impossible to satisfy */ From e2520dfb2086ff46abecf1d9c7099fbbcace1d1b Mon Sep 17 00:00:00 2001 From: David Herrmann Date: Fri, 8 Aug 2014 14:25:36 -0700 Subject: [PATCH 52/54] shm: wait for pins to be released when sealing If we set SEAL_WRITE on a file, we must make sure there cannot be any ongoing write-operations on the file. For write() calls, we simply lock the inode mutex, for mmap() we simply verify there're no writable mappings. However, there might be pages pinned by AIO, Direct-IO and similar operations via GUP. We must make sure those do not write to the memfd file after we set SEAL_WRITE. As there is no way to notify GUP users to drop pages or to wait for them to be done, we implement the wait ourself: When setting SEAL_WRITE, we check all pages for their ref-count. If it's bigger than 1, we know there's some user of the page. We then mark the page and wait for up to 150ms for those ref-counts to be dropped. If the ref-counts are not dropped in time, we refuse the seal operation. Change-Id: I964c29a82f2a9a1077647b29b2073c9ef4e0a05a Signed-off-by: David Herrmann Acked-by: Hugh Dickins Cc: Michael Kerrisk Cc: Ryan Lortie Cc: Lennart Poettering Cc: Daniel Mack Cc: Andy Lutomirski Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/shmem.c | 110 ++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 109 insertions(+), 1 deletion(-) diff --git a/mm/shmem.c b/mm/shmem.c index f6b30537c353..d9eb4fc9cefe 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1889,9 +1889,117 @@ static loff_t shmem_file_llseek(struct file *file, loff_t offset, int origin) return offset; } +/* + * We need a tag: a new tag would expand every radix_tree_node by 8 bytes, + * so reuse a tag which we firmly believe is never set or cleared on shmem. + */ +#define SHMEM_TAG_PINNED PAGECACHE_TAG_TOWRITE +#define LAST_SCAN 4 /* about 150ms max */ + +static void shmem_tag_pins(struct address_space *mapping) +{ + struct radix_tree_iter iter; + void **slot; + pgoff_t start; + struct page *page; + + lru_add_drain(); + start = 0; + rcu_read_lock(); + +restart: + radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) { + page = radix_tree_deref_slot(slot); + if (!page || radix_tree_exception(page)) { + if (radix_tree_deref_retry(page)) + goto restart; + } else if (page_count(page) - page_mapcount(page) > 1) { + spin_lock_irq(&mapping->tree_lock); + radix_tree_tag_set(&mapping->page_tree, iter.index, + SHMEM_TAG_PINNED); + spin_unlock_irq(&mapping->tree_lock); + } + + if (need_resched()) { + cond_resched_rcu(); + start = iter.index + 1; + goto restart; + } + } + rcu_read_unlock(); +} + +/* + * Setting SEAL_WRITE requires us to verify there's no pending writer. However, + * via get_user_pages(), drivers might have some pending I/O without any active + * user-space mappings (eg., direct-IO, AIO). Therefore, we look at all pages + * and see whether it has an elevated ref-count. If so, we tag them and wait for + * them to be dropped. + * The caller must guarantee that no new user will acquire writable references + * to those pages to avoid races. + */ static int shmem_wait_for_pins(struct address_space *mapping) { - return 0; + struct radix_tree_iter iter; + void **slot; + pgoff_t start; + struct page *page; + int error, scan; + + shmem_tag_pins(mapping); + + error = 0; + for (scan = 0; scan <= LAST_SCAN; scan++) { + if (!radix_tree_tagged(&mapping->page_tree, SHMEM_TAG_PINNED)) + break; + + if (!scan) + lru_add_drain_all(); + else if (schedule_timeout_killable((HZ << scan) / 200)) + scan = LAST_SCAN; + + start = 0; + rcu_read_lock(); +restart: + radix_tree_for_each_tagged(slot, &mapping->page_tree, &iter, + start, SHMEM_TAG_PINNED) { + + page = radix_tree_deref_slot(slot); + if (radix_tree_exception(page)) { + if (radix_tree_deref_retry(page)) + goto restart; + + page = NULL; + } + + if (page && + page_count(page) - page_mapcount(page) != 1) { + if (scan < LAST_SCAN) + goto continue_resched; + + /* + * On the last scan, we clean up all those tags + * we inserted; but make a note that we still + * found pages pinned. + */ + error = -EBUSY; + } + + spin_lock_irq(&mapping->tree_lock); + radix_tree_tag_clear(&mapping->page_tree, + iter.index, SHMEM_TAG_PINNED); + spin_unlock_irq(&mapping->tree_lock); +continue_resched: + if (need_resched()) { + cond_resched_rcu(); + start = iter.index + 1; + goto restart; + } + } + rcu_read_unlock(); + } + + return error; } #define F_ALL_SEALS (F_SEAL_SEAL | \ From a76d50eb14c3e0b8387cf7501dca181e5c33938d Mon Sep 17 00:00:00 2001 From: David Drysdale Date: Tue, 9 Sep 2014 14:50:57 -0700 Subject: [PATCH 53/54] shm: add memfd.h to export list The new header file memfd.h from commit 9183df25fe7b ("shm: add memfd_create() syscall") should be exported. Change-Id: I07ea7ae25765ece11180c0dcaed749918bf03a11 Signed-off-by: David Drysdale Reviewed-by: David Herrmann Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/Kbuild | 1 + 1 file changed, 1 insertion(+) diff --git a/include/Kbuild b/include/Kbuild index 5f65ac2887a2..6890e19a6a85 100644 --- a/include/Kbuild +++ b/include/Kbuild @@ -11,3 +11,4 @@ header-y += drm/ header-y += xen/ header-y += scsi/ header-y += media/ +header-y += uapi/linux/memfd.h From a136bb84aa2132fe250c531ee1267bdb01511a64 Mon Sep 17 00:00:00 2001 From: Matsvei Niaverau Date: Fri, 11 Dec 2020 21:35:16 +0100 Subject: [PATCH 54/54] Revert "shm: add memfd.h to export list" This reverts commit a76d50eb14c3e0b8387cf7501dca181e5c33938d. Reason for revert: It breaks build on 17.1 Change-Id: I7584be16228db1b1537a1b4ad44e4cd183f6af7c --- include/Kbuild | 1 - 1 file changed, 1 deletion(-) diff --git a/include/Kbuild b/include/Kbuild index 6890e19a6a85..5f65ac2887a2 100644 --- a/include/Kbuild +++ b/include/Kbuild @@ -11,4 +11,3 @@ header-y += drm/ header-y += xen/ header-y += scsi/ header-y += media/ -header-y += uapi/linux/memfd.h