From 3748b2f15b06ea1861df39d5e9693dcd6e9542b1 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Wed, 28 Mar 2012 14:42:39 -0700 Subject: [PATCH 01/35] procfs: fix /proc/statm bda7bad62bc4 ("procfs: speed up /proc/pid/stat, statm") broke /proc/statm - 'text' is printed twice by mistake. Signed-off-by: KAMEZAWA Hiroyuki Reported-by: Ulrich Drepper Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/array.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/proc/array.c b/fs/proc/array.c index fbb53c249086..f9bd395b3473 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -550,7 +550,7 @@ int proc_pid_statm(struct seq_file *m, struct pid_namespace *ns, seq_put_decimal_ull(m, ' ', shared); seq_put_decimal_ull(m, ' ', text); seq_put_decimal_ull(m, ' ', 0); - seq_put_decimal_ull(m, ' ', text); + seq_put_decimal_ull(m, ' ', data); seq_put_decimal_ull(m, ' ', 0); seq_putc(m, '\n'); From 623e3db9f9b7d6e7b2a99180f9cf0825c936ab7a Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Wed, 28 Mar 2012 14:42:40 -0700 Subject: [PATCH 02/35] mm for fs: add truncate_pagecache_range() Holepunching filesystems ext4 and xfs are using truncate_inode_pages_range but forgetting to unmap pages first (ocfs2 remembers). This is not really a bug, since races already require truncate_inode_page() to handle that case once the page is locked; but it can be very inefficient if the file being punched happens to be mapped into many vmas. Provide a drop-in replacement truncate_pagecache_range() which does the unmapping pass first, handling the awkward mismatch between arguments to truncate_inode_pages_range() and arguments to unmap_mapping_range(). Note that holepunching does not unmap privately COWed pages in the range: POSIX requires that we do so when truncating, but it's hard to justify, difficult to implement without an i_size cutoff, and no filesystem is attempting to implement it. 
Signed-off-by: Hugh Dickins Cc: "Theodore Ts'o" Cc: Andreas Dilger Cc: Mark Fasheh Cc: Joel Becker Cc: Ben Myers Cc: Alex Elder Cc: Christoph Hellwig Cc: Dave Chinner Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 2 +- mm/truncate.c | 40 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 41 insertions(+), 1 deletion(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index cf7982336103..630068184265 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -954,7 +954,7 @@ extern void truncate_pagecache(struct inode *inode, loff_t old, loff_t new); extern void truncate_setsize(struct inode *inode, loff_t newsize); extern int vmtruncate(struct inode *inode, loff_t offset); extern int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end); - +void truncate_pagecache_range(struct inode *inode, loff_t offset, loff_t end); int truncate_inode_page(struct address_space *mapping, struct page *page); int generic_error_remove_page(struct address_space *mapping, struct page *page); diff --git a/mm/truncate.c b/mm/truncate.c index 18aded3a89fc..61a183b89df6 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -626,3 +626,43 @@ int vmtruncate_range(struct inode *inode, loff_t lstart, loff_t lend) return 0; } + +/** + * truncate_pagecache_range - unmap and remove pagecache that is hole-punched + * @inode: inode + * @lstart: offset of beginning of hole + * @lend: offset of last byte of hole + * + * This function should typically be called before the filesystem + * releases resources associated with the freed range (eg. deallocates + * blocks). This way, pagecache will always stay logically coherent + * with on-disk format, and the filesystem would not have to deal with + * situations such as writepage being called for a page that has already + * had its underlying blocks deallocated. 
+ */ +void truncate_pagecache_range(struct inode *inode, loff_t lstart, loff_t lend) +{ + struct address_space *mapping = inode->i_mapping; + loff_t unmap_start = round_up(lstart, PAGE_SIZE); + loff_t unmap_end = round_down(1 + lend, PAGE_SIZE) - 1; + /* + * This rounding is currently just for example: unmap_mapping_range + * expands its hole outwards, whereas we want it to contract the hole + * inwards. However, existing callers of truncate_pagecache_range are + * doing their own page rounding first; and truncate_inode_pages_range + * currently BUGs if lend is not pagealigned-1 (it handles partial + * page at start of hole, but not partial page at end of hole). Note + * unmap_mapping_range allows holelen 0 for all, and we allow lend -1. + */ + + /* + * Unlike in truncate_pagecache, unmap_mapping_range is called only + * once (before truncating pagecache), and without "even_cows" flag: + * hole-punching should not remove private COWed pages from the hole. + */ + if ((u64)unmap_end > (u64)unmap_start) + unmap_mapping_range(mapping, unmap_start, + 1 + unmap_end - unmap_start, 0); + truncate_inode_pages_range(mapping, lstart, lend); +} +EXPORT_SYMBOL(truncate_pagecache_range); From 45f83cefe3a5f0476ac3f96382ebfdc3fe4caab2 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Wed, 28 Mar 2012 14:42:40 -0700 Subject: [PATCH 03/35] mm: thp: fix up pmd_trans_unstable() locations pmd_trans_unstable() should be called before pmd_offset_map() in the locations where the mmap_sem is held for reading. 
Signed-off-by: Andrea Arcangeli Cc: Mel Gorman Cc: Hugh Dickins Cc: Larry Woodman Cc: Ulrich Obergfell Cc: Rik van Riel Cc: Mark Salter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/task_mmu.c | 5 ++--- mm/memcontrol.c | 4 ++++ 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 9694cc283511..c283832d411d 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -781,9 +781,6 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, int err = 0; pagemap_entry_t pme = make_pme(PM_NOT_PRESENT); - if (pmd_trans_unstable(pmd)) - return 0; - /* find the first VMA at or above 'addr' */ vma = find_vma(walk->mm, addr); spin_lock(&walk->mm->page_table_lock); @@ -802,6 +799,8 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, return err; } + if (pmd_trans_unstable(pmd)) + return 0; for (; addr != end; addr += PAGE_SIZE) { /* check to see if we've left 'vma' behind diff --git a/mm/memcontrol.c b/mm/memcontrol.c index b2ee6df0e9bb..7d698df4a067 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5306,6 +5306,8 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd, return 0; } + if (pmd_trans_unstable(pmd)) + return 0; pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); for (; addr != end; pte++, addr += PAGE_SIZE) if (get_mctgt_type(vma, addr, *pte, NULL)) @@ -5502,6 +5504,8 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, return 0; } + if (pmd_trans_unstable(pmd)) + return 0; retry: pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); for (; addr != end; addr += PAGE_SIZE) { From 29fd66d289f2981e11c550f8b411a6d3d38be0cf Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Wed, 28 Mar 2012 14:42:41 -0700 Subject: [PATCH 04/35] mm, coredump: fail allocations when coredumping instead of oom killing The size of coredump files is limited by RLIMIT_CORE, however, allocating large amounts of memory results in three 
negative consequences: - the coredumping process may be chosen for oom kill and quickly deplete all memory reserves in oom conditions preventing further progress from being made or tasks from exiting, - the coredumping process may cause other processes to be oom killed without fault of their own as the result of a SIGSEGV, for example, in the coredumping process, or - the coredumping process may result in a livelock while writing to the dump file if it needs memory to allocate while other threads are in the exit path waiting on the coredumper to complete. This is fixed by implying __GFP_NORETRY in the page allocator for coredumping processes when reclaim has failed so the allocations fail and the process continues to exit. Signed-off-by: David Rientjes Cc: Mel Gorman Cc: KAMEZAWA Hiroyuki Cc: Minchan Kim Cc: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index caea788628e4..c313afcc8e5a 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2308,6 +2308,10 @@ rebalance: if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) { if (oom_killer_disabled) goto nopage; + /* Coredumps can quickly deplete all memory reserves */ + if ((current->flags & PF_DUMPCORE) && + !(gfp_mask & __GFP_NOFAIL)) + goto nopage; page = __alloc_pages_may_oom(gfp_mask, order, zonelist, high_zoneidx, nodemask, preferred_zone, From d15cab975459fb6092eeba1be72c13621337784f Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Wed, 28 Mar 2012 14:42:42 -0700 Subject: [PATCH 05/35] swapon: check validity of swap_flags Most system calls taking flags first check that the flags passed in are valid, and that helps userspace to detect when new flags are supported. But swapon never did so: start checking now, to help if we ever want to support more swap_flags in future. 
It's difficult to get stray bits set in an int, and swapon is not widely used, so this is most unlikely to break any userspace; but we can just revert if it turns out to do so. Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 3 +++ mm/swapfile.c | 3 +++ 2 files changed, 6 insertions(+) diff --git a/include/linux/swap.h b/include/linux/swap.h index b86b5c20617d..8dc0ea7caf02 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -21,6 +21,9 @@ struct bio; #define SWAP_FLAG_PRIO_SHIFT 0 #define SWAP_FLAG_DISCARD 0x10000 /* discard swap cluster after use */ +#define SWAP_FLAGS_VALID (SWAP_FLAG_PRIO_MASK | SWAP_FLAG_PREFER | \ + SWAP_FLAG_DISCARD) + static inline int current_is_kswapd(void) { return current->flags & PF_KSWAPD; diff --git a/mm/swapfile.c b/mm/swapfile.c index dae42f380d6e..fafc26d1b1dc 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -2022,6 +2022,9 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) struct page *page = NULL; struct inode *inode = NULL; + if (swap_flags & ~SWAP_FLAGS_VALID) + return -EINVAL; + if (!capable(CAP_SYS_ADMIN)) return -EPERM; From 3fc498f165304dc913f1d13b5ac9ab4c758ee7ab Mon Sep 17 00:00:00 2001 From: Gilad Ben-Yossef Date: Wed, 28 Mar 2012 14:42:43 -0700 Subject: [PATCH 06/35] smp: introduce a generic on_each_cpu_mask() function We have lots of infrastructure in place to partition multi-core systems such that we have a group of CPUs that are dedicated to specific task: cgroups, scheduler and interrupt affinity, and cpuisol= boot parameter. Still, kernel code will at times interrupt all CPUs in the system via IPIs for various needs. These IPIs are useful and cannot be avoided altogether, but in certain cases it is possible to interrupt only specific CPUs that have useful work to do and not the entire system. 
This patch set, inspired by discussions with Peter Zijlstra and Frederic Weisbecker when testing the nohz task patch set, is a first stab at trying to explore doing this by locating the places where such global IPI calls are being made and turning the global IPI into an IPI for a specific group of CPUs. The purpose of the patch set is to get feedback if this is the right way to go for dealing with this issue and indeed, if the issue is even worth dealing with at all. Based on the feedback from this patch set I plan to offer further patches that address similar issues in other code paths. This patch creates an on_each_cpu_mask() and on_each_cpu_cond() infrastructure API (the former derived from existing arch specific versions in Tile and Arm) and uses them to turn several global IPI invocations into per CPU group invocations. Core kernel: on_each_cpu_mask() calls a function on processors specified by cpumask, which may or may not include the local processor. You must not call this function with disabled interrupts or from a hardware interrupt handler or from a bottom half handler. arch/arm: Note that the generic version is a little different than the Arm one: 1. It has the mask as first parameter 2. It calls the function on the calling CPU with interrupts disabled, but this should be OK since the function is called on the other CPUs with interrupts disabled anyway. arch/tile: The API is the same as the tile private one, but the generic version also calls the function on the local CPU with interrupts disabled in the UP case. This is OK since the function is called on the other CPUs with interrupts disabled.
Signed-off-by: Gilad Ben-Yossef Reviewed-by: Christoph Lameter Acked-by: Chris Metcalf Acked-by: Peter Zijlstra Cc: Frederic Weisbecker Cc: Russell King Cc: Pekka Enberg Cc: Matt Mackall Cc: Rik van Riel Cc: Andi Kleen Cc: Sasha Levin Cc: Mel Gorman Cc: Alexander Viro Cc: Avi Kivity Acked-by: Michal Nazarewicz Cc: Kosaki Motohiro Cc: Milton Miller Cc: Russell King Acked-by: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm/kernel/smp_tlb.c | 20 +++++--------------- arch/tile/include/asm/smp.h | 7 ------- arch/tile/kernel/smp.c | 19 ------------------- include/linux/smp.h | 22 ++++++++++++++++++++++ kernel/smp.c | 29 +++++++++++++++++++++++++++++ 5 files changed, 56 insertions(+), 41 deletions(-) diff --git a/arch/arm/kernel/smp_tlb.c b/arch/arm/kernel/smp_tlb.c index 7dcb35285be7..02c5d2ce23bf 100644 --- a/arch/arm/kernel/smp_tlb.c +++ b/arch/arm/kernel/smp_tlb.c @@ -13,18 +13,6 @@ #include #include -static void on_each_cpu_mask(void (*func)(void *), void *info, int wait, - const struct cpumask *mask) -{ - preempt_disable(); - - smp_call_function_many(mask, func, info, wait); - if (cpumask_test_cpu(smp_processor_id(), mask)) - func(info); - - preempt_enable(); -} - /**********************************************************************/ /* @@ -87,7 +75,7 @@ void flush_tlb_all(void) void flush_tlb_mm(struct mm_struct *mm) { if (tlb_ops_need_broadcast()) - on_each_cpu_mask(ipi_flush_tlb_mm, mm, 1, mm_cpumask(mm)); + on_each_cpu_mask(mm_cpumask(mm), ipi_flush_tlb_mm, mm, 1); else local_flush_tlb_mm(mm); } @@ -98,7 +86,8 @@ void flush_tlb_page(struct vm_area_struct *vma, unsigned long uaddr) struct tlb_args ta; ta.ta_vma = vma; ta.ta_start = uaddr; - on_each_cpu_mask(ipi_flush_tlb_page, &ta, 1, mm_cpumask(vma->vm_mm)); + on_each_cpu_mask(mm_cpumask(vma->vm_mm), ipi_flush_tlb_page, + &ta, 1); } else local_flush_tlb_page(vma, uaddr); } @@ -121,7 +110,8 @@ void flush_tlb_range(struct vm_area_struct *vma, ta.ta_vma = vma; ta.ta_start = 
start; ta.ta_end = end; - on_each_cpu_mask(ipi_flush_tlb_range, &ta, 1, mm_cpumask(vma->vm_mm)); + on_each_cpu_mask(mm_cpumask(vma->vm_mm), ipi_flush_tlb_range, + &ta, 1); } else local_flush_tlb_range(vma, start, end); } diff --git a/arch/tile/include/asm/smp.h b/arch/tile/include/asm/smp.h index 532124ae4b12..1aa759aeb5b3 100644 --- a/arch/tile/include/asm/smp.h +++ b/arch/tile/include/asm/smp.h @@ -43,10 +43,6 @@ void evaluate_message(int tag); /* Boot a secondary cpu */ void online_secondary(void); -/* Call a function on a specified set of CPUs (may include this one). */ -extern void on_each_cpu_mask(const struct cpumask *mask, - void (*func)(void *), void *info, bool wait); - /* Topology of the supervisor tile grid, and coordinates of boot processor */ extern HV_Topology smp_topology; @@ -91,9 +87,6 @@ void print_disabled_cpus(void); #else /* !CONFIG_SMP */ -#define on_each_cpu_mask(mask, func, info, wait) \ - do { if (cpumask_test_cpu(0, (mask))) func(info); } while (0) - #define smp_master_cpu 0 #define smp_height 1 #define smp_width 1 diff --git a/arch/tile/kernel/smp.c b/arch/tile/kernel/smp.c index c52224d5ed45..a44e103c5a63 100644 --- a/arch/tile/kernel/smp.c +++ b/arch/tile/kernel/smp.c @@ -87,25 +87,6 @@ void send_IPI_allbutself(int tag) send_IPI_many(&mask, tag); } - -/* - * Provide smp_call_function_mask, but also run function locally - * if specified in the mask. - */ -void on_each_cpu_mask(const struct cpumask *mask, void (*func)(void *), - void *info, bool wait) -{ - int cpu = get_cpu(); - smp_call_function_many(mask, func, info, wait); - if (cpumask_test_cpu(cpu, mask)) { - local_irq_disable(); - func(info); - local_irq_enable(); - } - put_cpu(); -} - - /* * Functions related to starting/stopping cpus. 
*/ diff --git a/include/linux/smp.h b/include/linux/smp.h index 8cc38d3bab0c..d0adb7898d54 100644 --- a/include/linux/smp.h +++ b/include/linux/smp.h @@ -101,6 +101,13 @@ static inline void call_function_init(void) { } */ int on_each_cpu(smp_call_func_t func, void *info, int wait); +/* + * Call a function on processors specified by mask, which might include + * the local one. + */ +void on_each_cpu_mask(const struct cpumask *mask, smp_call_func_t func, + void *info, bool wait); + /* * Mark the boot cpu "online" so that it can call console drivers in * printk() and can access its per-cpu storage. @@ -132,6 +139,21 @@ static inline int up_smp_call_function(smp_call_func_t func, void *info) local_irq_enable(); \ 0; \ }) +/* + * Note we still need to test the mask even for UP + * because we actually can get an empty mask from + * code that on SMP might call us without the local + * CPU in the mask. + */ +#define on_each_cpu_mask(mask, func, info, wait) \ + do { \ + if (cpumask_test_cpu(0, (mask))) { \ + local_irq_disable(); \ + (func)(info); \ + local_irq_enable(); \ + } \ + } while (0) + static inline void smp_send_reschedule(int cpu) { } #define num_booting_cpus() 1 #define smp_prepare_boot_cpu() do {} while (0) diff --git a/kernel/smp.c b/kernel/smp.c index db197d60489b..a081e6ce0e0a 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -701,3 +701,32 @@ int on_each_cpu(void (*func) (void *info), void *info, int wait) return ret; } EXPORT_SYMBOL(on_each_cpu); + +/** + * on_each_cpu_mask(): Run a function on processors specified by + * cpumask, which may include the local processor. + * @mask: The set of cpus to run on (only runs on online subset). + * @func: The function to run. This must be fast and non-blocking. + * @info: An arbitrary pointer to pass to the function. + * @wait: If true, wait (atomically) until function has completed + * on other CPUs. + * + * If @wait is true, then returns once @func has returned. 
+ * + * You must not call this function with disabled interrupts or + * from a hardware interrupt handler or from a bottom half handler. + */ +void on_each_cpu_mask(const struct cpumask *mask, smp_call_func_t func, + void *info, bool wait) +{ + int cpu = get_cpu(); + + smp_call_function_many(mask, func, info, wait); + if (cpumask_test_cpu(cpu, mask)) { + local_irq_disable(); + func(info); + local_irq_enable(); + } + put_cpu(); +} +EXPORT_SYMBOL(on_each_cpu_mask); From b3a7e98e024ffa9f7e4554dd720c508015c4a831 Mon Sep 17 00:00:00 2001 From: Gilad Ben-Yossef Date: Wed, 28 Mar 2012 14:42:43 -0700 Subject: [PATCH 07/35] smp: add func to IPI cpus based on parameter func Add the on_each_cpu_cond() function that wraps on_each_cpu_mask() and calculates the cpumask of cpus to IPI by calling a function supplied as a parameter in order to determine whether to IPI each specific cpu. The function works around allocation failure of cpumask variable in CONFIG_CPUMASK_OFFSTACK=y by iterating over cpus, sending one IPI at a time via smp_call_function_single(). The function is useful since it allows separating the specific code that decides in each case whether to IPI a specific cpu for a specific request from the common boilerplate code of handling creating the mask, handling failures etc. [akpm@linux-foundation.org: s/gfpflags/gfp_flags/] [akpm@linux-foundation.org: avoid double-evaluation of `info' (per Michal), parenthesise evaluation of `cond_func'] [akpm@linux-foundation.org: s/CPU/CPUs, use all 80 cols in comment] Signed-off-by: Gilad Ben-Yossef Cc: Chris Metcalf Cc: Christoph Lameter Acked-by: Peter Zijlstra Cc: Frederic Weisbecker Cc: Russell King Cc: Pekka Enberg Cc: Matt Mackall Cc: Sasha Levin Cc: Rik van Riel Cc: Andi Kleen Cc: Alexander Viro Cc: Avi Kivity Acked-by: Michal Nazarewicz Cc: Kosaki Motohiro Cc: Milton Miller Reviewed-by: "Srivatsa S. 
Bhat" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/smp.h | 24 ++++++++++++++++++ kernel/smp.c | 61 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 85 insertions(+) diff --git a/include/linux/smp.h b/include/linux/smp.h index d0adb7898d54..10530d92c04b 100644 --- a/include/linux/smp.h +++ b/include/linux/smp.h @@ -108,6 +108,15 @@ int on_each_cpu(smp_call_func_t func, void *info, int wait); void on_each_cpu_mask(const struct cpumask *mask, smp_call_func_t func, void *info, bool wait); +/* + * Call a function on each processor for which the supplied function + * cond_func returns true. This may include the local + * processor. + */ +void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info), + smp_call_func_t func, void *info, bool wait, + gfp_t gfp_flags); + /* * Mark the boot cpu "online" so that it can call console drivers in * printk() and can access its per-cpu storage. @@ -153,6 +162,21 @@ static inline int up_smp_call_function(smp_call_func_t func, void *info) local_irq_enable(); \ } \ } while (0) +/* + * Preemption is disabled here to make sure the cond_func is called under the + * same conditions in UP and SMP. + */ +#define on_each_cpu_cond(cond_func, func, info, wait, gfp_flags)\ + do { \ + void *__info = (info); \ + preempt_disable(); \ + if ((cond_func)(0, __info)) { \ + local_irq_disable(); \ + (func)(__info); \ + local_irq_enable(); \ + } \ + preempt_enable(); \ + } while (0) static inline void smp_send_reschedule(int cpu) { } #define num_booting_cpus() 1 #define smp_prepare_boot_cpu() do {} while (0) diff --git a/kernel/smp.c b/kernel/smp.c index a081e6ce0e0a..2f8b10ecf759 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -730,3 +730,64 @@ void on_each_cpu_mask(const struct cpumask *mask, smp_call_func_t func, put_cpu(); } EXPORT_SYMBOL(on_each_cpu_mask); + +/* + * on_each_cpu_cond(): Call a function on each processor for which + * the supplied function cond_func returns true, optionally waiting + * for all the required CPUs to finish. 
This may include the local + * processor. + * @cond_func: A callback function that is passed a cpu id and + * the info parameter. The function is called + * with preemption disabled. The function should + * return a boolean value indicating whether to IPI + * the specified CPU. + * @func: The function to run on all applicable CPUs. + * This must be fast and non-blocking. + * @info: An arbitrary pointer to pass to both functions. + * @wait: If true, wait (atomically) until function has + * completed on other CPUs. + * @gfp_flags: GFP flags to use when allocating the cpumask + * used internally by the function. + * + * The function might sleep if the GFP flags indicate a non + * atomic allocation is allowed. + * + * Preemption is disabled to protect against CPUs going offline but not online. + * CPUs going online during the call will not be seen or sent an IPI. + * + * You must not call this function with disabled interrupts or + * from a hardware interrupt handler or from a bottom half handler. + */ +void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info), + smp_call_func_t func, void *info, bool wait, + gfp_t gfp_flags) +{ + cpumask_var_t cpus; + int cpu, ret; + + might_sleep_if(gfp_flags & __GFP_WAIT); + + if (likely(zalloc_cpumask_var(&cpus, (gfp_flags|__GFP_NOWARN)))) { + preempt_disable(); + for_each_online_cpu(cpu) + if (cond_func(cpu, info)) + cpumask_set_cpu(cpu, cpus); + on_each_cpu_mask(cpus, func, info, wait); + preempt_enable(); + free_cpumask_var(cpus); + } else { + /* + * No free cpumask, bother. No matter, we'll + * just have to IPI them one by one. 
+ */ + preempt_disable(); + for_each_online_cpu(cpu) + if (cond_func(cpu, info)) { + ret = smp_call_function_single(cpu, func, + info, wait); + WARN_ON_ONCE(ret); + } + preempt_enable(); + } +} +EXPORT_SYMBOL(on_each_cpu_cond); From a8364d5555b2030d093cde0f07951628e55454e1 Mon Sep 17 00:00:00 2001 From: Gilad Ben-Yossef Date: Wed, 28 Mar 2012 14:42:44 -0700 Subject: [PATCH 08/35] slub: only IPI CPUs that have per cpu obj to flush flush_all() is called for each kmem_cache_destroy(). So every cache being destroyed dynamically ends up sending an IPI to each CPU in the system, regardless if the cache has ever been used there. For example, if you close the InfiniBand ipath driver char device file, the close file ops calls kmem_cache_destroy(). So running some infiniband config tool on a single CPU dedicated to system tasks might interrupt the rest of the 127 CPUs dedicated to some CPU intensive or latency sensitive task. I suspect there is a good chance that every line in the output of "git grep kmem_cache_destroy linux/ | grep '\->'" has a similar scenario. This patch attempts to rectify this issue by sending an IPI to flush the per cpu objects back to the free lists only to CPUs that seem to have such objects. The check which CPU to IPI is racy but we don't care since asking a CPU without per cpu objects to flush does no damage and as far as I can tell the flush_all by itself is racy against allocs on remote CPUs anyway, so if you required the flush_all to be deterministic, you had to arrange for locking regardless. Without this patch the following artificial test case: $ cd /sys/kernel/slab $ for DIR in *; do cat $DIR/alloc_calls > /dev/null; done produces 166 IPIs on a cpuset isolated CPU. With it it produces none. The code path of memory allocation failure for CPUMASK_OFFSTACK=y config was tested using fault injection framework. 
Signed-off-by: Gilad Ben-Yossef Acked-by: Christoph Lameter Cc: Chris Metcalf Acked-by: Peter Zijlstra Cc: Frederic Weisbecker Cc: Russell King Cc: Pekka Enberg Cc: Matt Mackall Cc: Sasha Levin Cc: Rik van Riel Cc: Andi Kleen Cc: Mel Gorman Cc: Alexander Viro Cc: Avi Kivity Cc: Michal Nazarewicz Cc: Kosaki Motohiro Cc: Milton Miller Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slub.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/mm/slub.c b/mm/slub.c index f4a6229848fd..dcbb1926cb7f 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2028,9 +2028,17 @@ static void flush_cpu_slab(void *d) __flush_cpu_slab(s, smp_processor_id()); } +static bool has_cpu_slab(int cpu, void *info) +{ + struct kmem_cache *s = info; + struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu); + + return !!(c->page); +} + static void flush_all(struct kmem_cache *s) { - on_each_cpu(flush_cpu_slab, s, 1); + on_each_cpu_cond(has_cpu_slab, flush_cpu_slab, s, 1, GFP_ATOMIC); } /* From 42be35d0390b966253136a285f507f5ad00fd9e8 Mon Sep 17 00:00:00 2001 From: Gilad Ben-Yossef Date: Wed, 28 Mar 2012 14:42:45 -0700 Subject: [PATCH 09/35] fs: only send IPI to invalidate LRU BH when needed In several code paths, such as when unmounting a file system (but not only) we send an IPI to ask each cpu to invalidate its local LRU BHs. On multi-core systems, many cpus may not have any LRU BH because they are idle or because they have not performed any file system accesses since the last invalidation (e.g. CPUs crunching on high performance computing nodes that write results to shared memory, or that only use filesystems that do not use the bh layer). This can lead to loss of performance each time someone switches the KVM (the virtual keyboard and screen type, not the hypervisor) if it has a USB storage stuck in. This patch attempts to only send an IPI to cpus that have LRU BH. 
Signed-off-by: Gilad Ben-Yossef Acked-by: Peter Zijlstra Cc: Alexander Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/buffer.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/fs/buffer.c b/fs/buffer.c index 70e2017edd70..36d66653b931 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -1384,10 +1384,23 @@ static void invalidate_bh_lru(void *arg) } put_cpu_var(bh_lrus); } + +static bool has_bh_in_lru(int cpu, void *dummy) +{ + struct bh_lru *b = per_cpu_ptr(&bh_lrus, cpu); + int i; + for (i = 0; i < BH_LRU_SIZE; i++) { + if (b->bhs[i]) + return 1; + } + + return 0; +} + void invalidate_bh_lrus(void) { - on_each_cpu(invalidate_bh_lru, NULL, 1); + on_each_cpu_cond(has_bh_in_lru, invalidate_bh_lru, NULL, 1, GFP_KERNEL); } EXPORT_SYMBOL_GPL(invalidate_bh_lrus); From 74046494ea68676d29ef6501a4bd950f08112a2c Mon Sep 17 00:00:00 2001 From: Gilad Ben-Yossef Date: Wed, 28 Mar 2012 14:42:45 -0700 Subject: [PATCH 10/35] mm: only IPI CPUs to drain local pages if they exist Calculate a cpumask of CPUs with per-cpu pages in any zone and only send an IPI requesting CPUs to drain these pages to the buddy allocator if they actually have pages when asked to flush. This patch saves 85%+ of IPIs asking to drain per-cpu pages in case of severe memory pressure that leads to OOM since in these cases multiple, possibly concurrent, allocation requests end up in the direct reclaim code path so when the per-cpu pages end up reclaimed on first allocation failure for most of the proceeding allocation attempts until the memory pressure is off (possibly via the OOM killer) there are no per-cpu pages on most CPUs (and there can easily be hundreds of them). This also has the side effect of shortening the average latency of direct reclaim by 1 or more order of magnitude since waiting for all the CPUs to ACK the IPI takes a long time. 
Tested by running "hackbench 400" on an 8 CPU x86 VM and observing the difference between the number of direct reclaim attempts that end up in drain_all_pages() and those where more than 1/2 of the online CPUs had any per-cpu page in them, using the vmstat counters introduced in the next patch in the series and using /proc/interrupts. In the test scenario, this was seen to save around 3600 global IPIs after triggering an OOM on a concurrent workload: $ cat /proc/vmstat | tail -n 2 pcp_global_drain 0 pcp_global_ipi_saved 0 $ cat /proc/interrupts | grep CAL CAL: 1 2 1 2 2 2 2 2 Function call interrupts $ hackbench 400 [OOM messages snipped] $ cat /proc/vmstat | tail -n 2 pcp_global_drain 3647 pcp_global_ipi_saved 3642 $ cat /proc/interrupts | grep CAL CAL: 6 13 6 3 3 3 1 2 7 Function call interrupts Please note that if the global drain is removed from the direct reclaim path as a patch from Mel Gorman currently suggests this should be replaced with an on_each_cpu_cond invocation. Signed-off-by: Gilad Ben-Yossef Acked-by: Mel Gorman Cc: KOSAKI Motohiro Acked-by: Christoph Lameter Acked-by: Peter Zijlstra Cc: Pekka Enberg Cc: Rik van Riel Cc: Andi Kleen Acked-by: Michal Nazarewicz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 40 ++++++++++++++++++++++++++++++++++++++-- 1 file changed, 38 insertions(+), 2 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index c313afcc8e5a..a712fb9e04ce 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1161,11 +1161,47 @@ void drain_local_pages(void *arg) } /* - * Spill all the per-cpu pages from all CPUs back into the buddy allocator + * Spill all the per-cpu pages from all CPUs back into the buddy allocator. 
+ * + * Note that this code is protected against sending an IPI to an offline + * CPU but does not guarantee sending an IPI to newly hotplugged CPUs: + * on_each_cpu_mask() blocks hotplug and won't talk to offlined CPUs but + * nothing keeps CPUs from showing up after we populated the cpumask and + * before the call to on_each_cpu_mask(). */ void drain_all_pages(void) { - on_each_cpu(drain_local_pages, NULL, 1); + int cpu; + struct per_cpu_pageset *pcp; + struct zone *zone; + + /* + * Allocate in the BSS so we wont require allocation in + * direct reclaim path for CONFIG_CPUMASK_OFFSTACK=y + */ + static cpumask_t cpus_with_pcps; + + /* + * We don't care about racing with CPU hotplug event + * as offline notification will cause the notified + * cpu to drain that CPU pcps and on_each_cpu_mask + * disables preemption as part of its processing + */ + for_each_online_cpu(cpu) { + bool has_pcps = false; + for_each_populated_zone(zone) { + pcp = per_cpu_ptr(zone->pageset, cpu); + if (pcp->pcp.count) { + has_pcps = true; + break; + } + } + if (has_pcps) + cpumask_set_cpu(cpu, &cpus_with_pcps); + else + cpumask_clear_cpu(cpu, &cpus_with_pcps); + } + on_each_cpu_mask(&cpus_with_pcps, drain_local_pages, NULL, 1); } #ifdef CONFIG_HIBERNATION From 38b93780a5381961ad92d24ab9a12a964189a3a4 Mon Sep 17 00:00:00 2001 From: "Srivatsa S. Bhat" Date: Wed, 28 Mar 2012 14:42:46 -0700 Subject: [PATCH 11/35] lib/cpumask.c: remove __any_online_cpu() __any_online_cpu() is not optimal and also unnecessary. So, replace its use by faster cpumask_* operations. Signed-off-by: Srivatsa S. 
Bhat Cc: Eric Dumazet Cc: Venkatesh Pallipadi Cc: Rusty Russell Cc: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/cpumask.h | 3 +-- lib/cpumask.c | 12 ------------ 2 files changed, 1 insertion(+), 14 deletions(-) diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index 7b9b75a529be..1ffdb9856bb9 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -810,11 +810,10 @@ static inline const struct cpumask *get_cpu_mask(unsigned int cpu) #else /* NR_CPUS > 1 */ int __first_cpu(const cpumask_t *srcp); int __next_cpu(int n, const cpumask_t *srcp); -int __any_online_cpu(const cpumask_t *mask); #define first_cpu(src) __first_cpu(&(src)) #define next_cpu(n, src) __next_cpu((n), &(src)) -#define any_online_cpu(mask) __any_online_cpu(&(mask)) +#define any_online_cpu(mask) cpumask_any_and(&mask, cpu_online_mask) #define for_each_cpu_mask(cpu, mask) \ for ((cpu) = -1; \ (cpu) = next_cpu((cpu), (mask)), \ diff --git a/lib/cpumask.c b/lib/cpumask.c index 0b660118ed91..402a54ac35cb 100644 --- a/lib/cpumask.c +++ b/lib/cpumask.c @@ -26,18 +26,6 @@ int __next_cpu_nr(int n, const cpumask_t *srcp) EXPORT_SYMBOL(__next_cpu_nr); #endif -int __any_online_cpu(const cpumask_t *mask) -{ - int cpu; - - for_each_cpu(cpu, mask) { - if (cpu_online(cpu)) - break; - } - return cpu; -} -EXPORT_SYMBOL(__any_online_cpu); - /** * cpumask_next_and - get the next cpu in *src1p & *src2p * @n: the cpu prior to the place to search (ie. return will be > @n) From 7d7f98488b203cbf78538698cf5d937f670d96d3 Mon Sep 17 00:00:00 2001 From: "Srivatsa S. Bhat" Date: Wed, 28 Mar 2012 14:42:46 -0700 Subject: [PATCH 12/35] arch/ia64: remove references to cpu_*_map This was marked as obsolete for quite a while now.. Now it is time to remove it altogether. And while doing this, get rid of first_cpu() as well. 
Also, remove the redundant setting of cpu_online_mask in smp_prepare_cpus() because the generic code would have already set cpu 0 in cpu_online_mask. Reported-by: Tony Luck Signed-off-by: Srivatsa S. Bhat Cc: Rusty Russell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ia64/kernel/acpi.c | 6 +++--- arch/ia64/kernel/irq_ia64.c | 8 ++++---- arch/ia64/kernel/mca.c | 6 ++++-- arch/ia64/kernel/msi_ia64.c | 4 ++-- arch/ia64/kernel/setup.c | 2 +- arch/ia64/kernel/smp.c | 2 +- arch/ia64/kernel/smpboot.c | 19 +++++++------------ arch/ia64/kernel/topology.c | 3 ++- 8 files changed, 24 insertions(+), 26 deletions(-) diff --git a/arch/ia64/kernel/acpi.c b/arch/ia64/kernel/acpi.c index 2d801bfe16ac..19bb1eefffb4 100644 --- a/arch/ia64/kernel/acpi.c +++ b/arch/ia64/kernel/acpi.c @@ -844,7 +844,7 @@ early_param("additional_cpus", setup_additional_cpus); * are onlined, or offlined. The reason is per-cpu data-structures * are allocated by some modules at init time, and dont expect to * do this dynamically on cpu arrival/departure. - * cpu_present_map on the other hand can change dynamically. + * cpu_present_mask on the other hand can change dynamically. * In case when cpu_hotplug is not compiled, then we resort to current * behaviour, which is cpu_possible == cpu_present. 
* - Ashok Raj @@ -922,7 +922,7 @@ static int __cpuinit _acpi_map_lsapic(acpi_handle handle, int *pcpu) acpi_map_cpu2node(handle, cpu, physid); - cpu_set(cpu, cpu_present_map); + set_cpu_present(cpu, true); ia64_cpu_to_sapicid[cpu] = physid; acpi_processor_set_pdc(handle); @@ -941,7 +941,7 @@ EXPORT_SYMBOL(acpi_map_lsapic); int acpi_unmap_lsapic(int cpu) { ia64_cpu_to_sapicid[cpu] = -1; - cpu_clear(cpu, cpu_present_map); + set_cpu_present(cpu, false); #ifdef CONFIG_ACPI_NUMA /* NUMA specific cleanup's */ diff --git a/arch/ia64/kernel/irq_ia64.c b/arch/ia64/kernel/irq_ia64.c index 782c3a357f24..51da77226b29 100644 --- a/arch/ia64/kernel/irq_ia64.c +++ b/arch/ia64/kernel/irq_ia64.c @@ -118,7 +118,7 @@ static inline int find_unassigned_vector(cpumask_t domain) cpumask_t mask; int pos, vector; - cpus_and(mask, domain, cpu_online_map); + cpumask_and(&mask, &domain, cpu_online_mask); if (cpus_empty(mask)) return -EINVAL; @@ -141,7 +141,7 @@ static int __bind_irq_vector(int irq, int vector, cpumask_t domain) BUG_ON((unsigned)irq >= NR_IRQS); BUG_ON((unsigned)vector >= IA64_NUM_VECTORS); - cpus_and(mask, domain, cpu_online_map); + cpumask_and(&mask, &domain, cpu_online_mask); if (cpus_empty(mask)) return -EINVAL; if ((cfg->vector == vector) && cpus_equal(cfg->domain, domain)) @@ -179,7 +179,7 @@ static void __clear_irq_vector(int irq) BUG_ON(cfg->vector == IRQ_VECTOR_UNASSIGNED); vector = cfg->vector; domain = cfg->domain; - cpus_and(mask, cfg->domain, cpu_online_map); + cpumask_and(&mask, &cfg->domain, cpu_online_mask); for_each_cpu_mask(cpu, mask) per_cpu(vector_irq, cpu)[vector] = -1; cfg->vector = IRQ_VECTOR_UNASSIGNED; @@ -322,7 +322,7 @@ void irq_complete_move(unsigned irq) if (unlikely(cpu_isset(smp_processor_id(), cfg->old_domain))) return; - cpus_and(cleanup_mask, cfg->old_domain, cpu_online_map); + cpumask_and(&cleanup_mask, &cfg->old_domain, cpu_online_mask); cfg->move_cleanup_count = cpus_weight(cleanup_mask); for_each_cpu_mask(i, cleanup_mask) 
platform_send_ipi(i, IA64_IRQ_MOVE_VECTOR, IA64_IPI_DM_INT, 0); diff --git a/arch/ia64/kernel/mca.c b/arch/ia64/kernel/mca.c index 8192009cb924..26dbbd3c3053 100644 --- a/arch/ia64/kernel/mca.c +++ b/arch/ia64/kernel/mca.c @@ -1515,7 +1515,8 @@ static void ia64_mca_cmc_poll (unsigned long dummy) { /* Trigger a CMC interrupt cascade */ - platform_send_ipi(first_cpu(cpu_online_map), IA64_CMCP_VECTOR, IA64_IPI_DM_INT, 0); + platform_send_ipi(cpumask_first(cpu_online_mask), IA64_CMCP_VECTOR, + IA64_IPI_DM_INT, 0); } /* @@ -1591,7 +1592,8 @@ static void ia64_mca_cpe_poll (unsigned long dummy) { /* Trigger a CPE interrupt cascade */ - platform_send_ipi(first_cpu(cpu_online_map), IA64_CPEP_VECTOR, IA64_IPI_DM_INT, 0); + platform_send_ipi(cpumask_first(cpu_online_mask), IA64_CPEP_VECTOR, + IA64_IPI_DM_INT, 0); } #endif /* CONFIG_ACPI */ diff --git a/arch/ia64/kernel/msi_ia64.c b/arch/ia64/kernel/msi_ia64.c index 94e0db72d4a6..fb2f1e622877 100644 --- a/arch/ia64/kernel/msi_ia64.c +++ b/arch/ia64/kernel/msi_ia64.c @@ -57,7 +57,7 @@ int ia64_setup_msi_irq(struct pci_dev *pdev, struct msi_desc *desc) return irq; irq_set_msi_desc(irq, desc); - cpus_and(mask, irq_to_domain(irq), cpu_online_map); + cpumask_and(&mask, &(irq_to_domain(irq)), cpu_online_mask); dest_phys_id = cpu_physical_id(first_cpu(mask)); vector = irq_to_vector(irq); @@ -179,7 +179,7 @@ msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_msg *msg) unsigned dest; cpumask_t mask; - cpus_and(mask, irq_to_domain(irq), cpu_online_map); + cpumask_and(&mask, &(irq_to_domain(irq)), cpu_online_mask); dest = cpu_physical_id(first_cpu(mask)); msg->address_hi = 0; diff --git a/arch/ia64/kernel/setup.c b/arch/ia64/kernel/setup.c index cd57d7312de0..4d1a5508a0ed 100644 --- a/arch/ia64/kernel/setup.c +++ b/arch/ia64/kernel/setup.c @@ -486,7 +486,7 @@ mark_bsp_online (void) { #ifdef CONFIG_SMP /* If we register an early console, allow CPU 0 to printk */ - cpu_set(smp_processor_id(), cpu_online_map); + 
set_cpu_online(smp_processor_id(), true); #endif } diff --git a/arch/ia64/kernel/smp.c b/arch/ia64/kernel/smp.c index 0bd537b4ea6b..855197981962 100644 --- a/arch/ia64/kernel/smp.c +++ b/arch/ia64/kernel/smp.c @@ -77,7 +77,7 @@ stop_this_cpu(void) /* * Remove this CPU: */ - cpu_clear(smp_processor_id(), cpu_online_map); + set_cpu_online(smp_processor_id(), false); max_xtp(); local_irq_disable(); cpu_halt(); diff --git a/arch/ia64/kernel/smpboot.c b/arch/ia64/kernel/smpboot.c index 559097986672..90916beddf07 100644 --- a/arch/ia64/kernel/smpboot.c +++ b/arch/ia64/kernel/smpboot.c @@ -401,7 +401,7 @@ smp_callin (void) /* Setup the per cpu irq handling data structures */ __setup_vector_irq(cpuid); notify_cpu_starting(cpuid); - cpu_set(cpuid, cpu_online_map); + set_cpu_online(cpuid, true); per_cpu(cpu_state, cpuid) = CPU_ONLINE; spin_unlock(&vector_lock); ipi_call_unlock_irq(); @@ -548,7 +548,7 @@ do_rest: if (!cpu_isset(cpu, cpu_callin_map)) { printk(KERN_ERR "Processor 0x%x/0x%x is stuck.\n", cpu, sapicid); ia64_cpu_to_sapicid[cpu] = -1; - cpu_clear(cpu, cpu_online_map); /* was set in smp_callin() */ + set_cpu_online(cpu, false); /* was set in smp_callin() */ return -EINVAL; } return 0; @@ -578,8 +578,7 @@ smp_build_cpu_map (void) } ia64_cpu_to_sapicid[0] = boot_cpu_id; - cpus_clear(cpu_present_map); - set_cpu_present(0, true); + init_cpu_present(cpumask_of(0)); set_cpu_possible(0, true); for (cpu = 1, i = 0; i < smp_boot_data.cpu_count; i++) { sapicid = smp_boot_data.cpu_phys_id[i]; @@ -606,10 +605,6 @@ smp_prepare_cpus (unsigned int max_cpus) smp_setup_percpu_timer(); - /* - * We have the boot CPU online for sure. 
- */ - cpu_set(0, cpu_online_map); cpu_set(0, cpu_callin_map); local_cpu_data->loops_per_jiffy = loops_per_jiffy; @@ -633,7 +628,7 @@ smp_prepare_cpus (unsigned int max_cpus) void __devinit smp_prepare_boot_cpu(void) { - cpu_set(smp_processor_id(), cpu_online_map); + set_cpu_online(smp_processor_id(), true); cpu_set(smp_processor_id(), cpu_callin_map); set_numa_node(cpu_to_node_map[smp_processor_id()]); per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE; @@ -690,7 +685,7 @@ int migrate_platform_irqs(unsigned int cpu) /* * Now re-target the CPEI to a different processor */ - new_cpei_cpu = any_online_cpu(cpu_online_map); + new_cpei_cpu = cpumask_any(cpu_online_mask); mask = cpumask_of(new_cpei_cpu); set_cpei_target_cpu(new_cpei_cpu); data = irq_get_irq_data(ia64_cpe_irq); @@ -732,10 +727,10 @@ int __cpu_disable(void) return -EBUSY; } - cpu_clear(cpu, cpu_online_map); + set_cpu_online(cpu, false); if (migrate_platform_irqs(cpu)) { - cpu_set(cpu, cpu_online_map); + set_cpu_online(cpu, true); return -EBUSY; } diff --git a/arch/ia64/kernel/topology.c b/arch/ia64/kernel/topology.c index 9deb21dbf629..c64460b9c704 100644 --- a/arch/ia64/kernel/topology.c +++ b/arch/ia64/kernel/topology.c @@ -220,7 +220,8 @@ static ssize_t show_shared_cpu_map(struct cache_info *this_leaf, char *buf) ssize_t len; cpumask_t shared_cpu_map; - cpus_and(shared_cpu_map, this_leaf->shared_cpu_map, cpu_online_map); + cpumask_and(&shared_cpu_map, + &this_leaf->shared_cpu_map, cpu_online_mask); len = cpumask_scnprintf(buf, NR_CPUS+1, &shared_cpu_map); len += sprintf(buf+len, "\n"); return len; From d034cfab4f7b9e768c5c1caaa56c5bd4805d2b92 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Wed, 28 Mar 2012 14:42:47 -0700 Subject: [PATCH 13/35] kexec: crash: don't save swapper_pg_dir for !CONFIG_MMU configurations nommu platforms don't have very interesting swapper_pg_dir pointers and usually just #define them to NULL, meaning that we can't include them in the vmcoreinfo on the kexec crash path. 
This patch only saves the swapper_pg_dir if we have an MMU. Signed-off-by: Will Deacon Reviewed-by: Simon Horman Cc: "Eric W. Biederman" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kexec.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/kexec.c b/kernel/kexec.c index a6a675cb9818..769e347c5196 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -1462,7 +1462,9 @@ static int __init crash_save_vmcoreinfo_init(void) VMCOREINFO_SYMBOL(init_uts_ns); VMCOREINFO_SYMBOL(node_online_map); +#ifdef CONFIG_MMU VMCOREINFO_SYMBOL(swapper_pg_dir); +#endif VMCOREINFO_SYMBOL(_stext); VMCOREINFO_SYMBOL(vmlist); From eaa3be6add6f327ab0a633e4fee8e6f2cc8c8a4c Mon Sep 17 00:00:00 2001 From: Zhenzhong Duan Date: Wed, 28 Mar 2012 14:42:47 -0700 Subject: [PATCH 14/35] kexec: add further check to crashkernel When using crashkernel=2M-256M, the kernel doesn't give any warning. This is misleading sometimes. Signed-off-by: Zhenzhong Duan Acked-by: "Eric W. Biederman" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kexec.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/kernel/kexec.c b/kernel/kexec.c index 769e347c5196..3288c9b29bae 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -1359,6 +1359,10 @@ static int __init parse_crashkernel_simple(char *cmdline, if (*cur == '@') *crash_base = memparse(cur+1, &cur); + else if (*cur != ' ' && *cur != '\0') { + pr_warning("crashkernel: unrecognized char\n"); + return -EINVAL; + } return 0; } From 09c71bfd8384278c42f56380365940508194cec0 Mon Sep 17 00:00:00 2001 From: Dave Young Date: Wed, 28 Mar 2012 14:42:47 -0700 Subject: [PATCH 15/35] kdump x86: fix total mem size calculation for reservation crashkernel reservation needs to know the total memory size. Currently get_total_mem simply uses max_pfn - min_low_pfn. This is wrong because it will include memory holes in the middle. 
Especially for a kvm guest with memory > 0xe0000000, the qemu code splits memory as below: if (ram_size >= 0xe0000000 ) { above_4g_mem_size = ram_size - 0xe0000000; below_4g_mem_size = 0xe0000000; } else { below_4g_mem_size = ram_size; } So for a 4G mem guest, seabios will insert a 512M usable region beyond 4G. Thus in the above case max_pfn - min_low_pfn will be more than the original memsize. Fix this issue by using memblock_phys_mem_size() to get the total memsize. Signed-off-by: Dave Young Reviewed-by: WANG Cong Reviewed-by: Simon Horman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/setup.c | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 88638883176a..ab77aae4ad9b 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -509,15 +509,6 @@ static void __init memblock_x86_reserve_range_setup_data(void) #ifdef CONFIG_KEXEC -static inline unsigned long long get_total_mem(void) -{ - unsigned long long total; - - total = max_pfn - min_low_pfn; - - return total << PAGE_SHIFT; -} - /* * Keep the crash kernel below this limit. On 32 bits earlier kernels * would limit the kernel to the low 512 MiB due to mapping restrictions. @@ -536,7 +527,7 @@ static void __init reserve_crashkernel(void) unsigned long long crash_size, crash_base; int ret; - total_mem = get_total_mem(); + total_mem = memblock_phys_mem_size(); ret = parse_crashkernel(boot_command_line, total_mem, &crash_size, &crash_base); From b88e769368a88cf28e53db158b84eda096144bce Mon Sep 17 00:00:00 2001 From: Srinivas_Gowda Date: Wed, 28 Mar 2012 14:42:48 -0700 Subject: [PATCH 16/35] ipmi: decrease the IPMI message transaction time in interrupt mode Call the event handler immediately after starting the next message. This change considerably decreases the IPMI transaction time (cuts off ~9ms for a single ipmitool transaction). 
Signed-off-by: Srinivas_Gowda Signed-off-by: Corey Minyard Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/char/ipmi/ipmi_si_intf.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/char/ipmi/ipmi_si_intf.c b/drivers/char/ipmi/ipmi_si_intf.c index 50fcf9c04569..73ebbb1a3269 100644 --- a/drivers/char/ipmi/ipmi_si_intf.c +++ b/drivers/char/ipmi/ipmi_si_intf.c @@ -932,8 +932,10 @@ static void sender(void *send_info, spin_unlock_irqrestore(&smi_info->msg_lock, flags); spin_lock_irqsave(&smi_info->si_lock, flags); - if (smi_info->si_state == SI_NORMAL && smi_info->curr_msg == NULL) + if (smi_info->si_state == SI_NORMAL && smi_info->curr_msg == NULL) { start_next_msg(smi_info); + smi_event_handler(smi_info, 0); + } spin_unlock_irqrestore(&smi_info->si_lock, flags); } From 828dc9da50f9632bbc5bc9dfa510619d13135015 Mon Sep 17 00:00:00 2001 From: Matthew Garrett Date: Wed, 28 Mar 2012 14:42:48 -0700 Subject: [PATCH 17/35] ipmi: increase KCS timeouts We currently time out and retry KCS transactions after 1 second of waiting for IBF or OBF. This appears to be too short for some hardware. The IPMI spec says "All system software wait loops should include error timeouts. For simplicity, such timeouts are not shown explicitly in the flow diagrams. A five-second timeout or greater is recommended". Change the timeout to five seconds to satisfy the slow hardware. Signed-off-by: Matthew Garrett Signed-off-by: Corey Minyard Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/char/ipmi/ipmi_kcs_sm.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/char/ipmi/ipmi_kcs_sm.c b/drivers/char/ipmi/ipmi_kcs_sm.c index cf82fedae099..e53fc24c6af3 100644 --- a/drivers/char/ipmi/ipmi_kcs_sm.c +++ b/drivers/char/ipmi/ipmi_kcs_sm.c @@ -118,8 +118,8 @@ enum kcs_states { #define MAX_KCS_WRITE_SIZE IPMI_MAX_MSG_LENGTH /* Timeouts in microseconds. 
*/ -#define IBF_RETRY_TIMEOUT 1000000 -#define OBF_RETRY_TIMEOUT 1000000 +#define IBF_RETRY_TIMEOUT 5000000 +#define OBF_RETRY_TIMEOUT 5000000 #define MAX_ERROR_RETRIES 10 #define ERROR0_OBF_WAIT_JIFFIES (2*HZ) From 7adf579c8babf62026e6aab1dee85e6b104d9936 Mon Sep 17 00:00:00 2001 From: Corey Minyard Date: Wed, 28 Mar 2012 14:42:49 -0700 Subject: [PATCH 18/35] ipmi: use a tasklet for handling received messages The IPMI driver would release a lock, deliver a message, then relock. This is obviously ugly, and this patch converts the message handler interface to use a tasklet to schedule work. This lets the receive handler be called from an interrupt handler with interrupts enabled. Signed-off-by: Corey Minyard Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/char/ipmi/ipmi_msghandler.c | 141 +++++++++++++++++----------- drivers/char/ipmi/ipmi_si_intf.c | 14 +-- 2 files changed, 88 insertions(+), 67 deletions(-) diff --git a/drivers/char/ipmi/ipmi_msghandler.c b/drivers/char/ipmi/ipmi_msghandler.c index 58c0e6387cf7..289ab506b79b 100644 --- a/drivers/char/ipmi/ipmi_msghandler.c +++ b/drivers/char/ipmi/ipmi_msghandler.c @@ -46,6 +46,7 @@ #include #include #include +#include #define PFX "IPMI message handler: " @@ -53,6 +54,8 @@ static struct ipmi_recv_msg *ipmi_alloc_recv_msg(void); static int ipmi_init_msghandler(void); +static void smi_recv_tasklet(unsigned long); +static void handle_new_recv_msgs(ipmi_smi_t intf); static int initialized; @@ -355,12 +358,15 @@ struct ipmi_smi { int curr_seq; /* - * Messages that were delayed for some reason (out of memory, - * for instance), will go in here to be processed later in a - * periodic timer interrupt. + * Messages queued for delivery. If delivery fails (out of memory + * for instance), They will stay in here to be processed later in a + * periodic timer interrupt. The tasklet is for handling received + * messages directly from the handler. 
*/ spinlock_t waiting_msgs_lock; struct list_head waiting_msgs; + atomic_t watchdog_pretimeouts_to_deliver; + struct tasklet_struct recv_tasklet; /* * The list of command receivers that are registered for commands @@ -493,6 +499,8 @@ static void clean_up_interface_data(ipmi_smi_t intf) struct cmd_rcvr *rcvr, *rcvr2; struct list_head list; + tasklet_kill(&intf->recv_tasklet); + free_smi_msg_list(&intf->waiting_msgs); free_recv_msg_list(&intf->waiting_events); @@ -2792,6 +2800,9 @@ void ipmi_poll_interface(ipmi_user_t user) if (intf->handlers->poll) intf->handlers->poll(intf->send_info); + + /* In case something came in */ + handle_new_recv_msgs(intf); } EXPORT_SYMBOL(ipmi_poll_interface); @@ -2860,6 +2871,10 @@ int ipmi_register_smi(struct ipmi_smi_handlers *handlers, #endif spin_lock_init(&intf->waiting_msgs_lock); INIT_LIST_HEAD(&intf->waiting_msgs); + tasklet_init(&intf->recv_tasklet, + smi_recv_tasklet, + (unsigned long) intf); + atomic_set(&intf->watchdog_pretimeouts_to_deliver, 0); spin_lock_init(&intf->events_lock); INIT_LIST_HEAD(&intf->waiting_events); intf->waiting_events_count = 0; @@ -3622,11 +3637,11 @@ static int handle_bmc_rsp(ipmi_smi_t intf, } /* - * Handle a new message. Return 1 if the message should be requeued, + * Handle a received message. Return 1 if the message should be requeued, * 0 if the message should be freed, or -1 if the message should not * be freed or requeued. */ -static int handle_new_recv_msg(ipmi_smi_t intf, +static int handle_one_recv_msg(ipmi_smi_t intf, struct ipmi_smi_msg *msg) { int requeue; @@ -3784,12 +3799,72 @@ static int handle_new_recv_msg(ipmi_smi_t intf, return requeue; } +/* + * If there are messages in the queue or pretimeouts, handle them. + */ +static void handle_new_recv_msgs(ipmi_smi_t intf) +{ + struct ipmi_smi_msg *smi_msg; + unsigned long flags = 0; + int rv; + int run_to_completion = intf->run_to_completion; + + /* See if any waiting messages need to be processed. 
*/ + if (!run_to_completion) + spin_lock_irqsave(&intf->waiting_msgs_lock, flags); + while (!list_empty(&intf->waiting_msgs)) { + smi_msg = list_entry(intf->waiting_msgs.next, + struct ipmi_smi_msg, link); + list_del(&smi_msg->link); + if (!run_to_completion) + spin_unlock_irqrestore(&intf->waiting_msgs_lock, flags); + rv = handle_one_recv_msg(intf, smi_msg); + if (!run_to_completion) + spin_lock_irqsave(&intf->waiting_msgs_lock, flags); + if (rv == 0) { + /* Message handled */ + ipmi_free_smi_msg(smi_msg); + } else if (rv < 0) { + /* Fatal error on the message, del but don't free. */ + } else { + /* + * To preserve message order, quit if we + * can't handle a message. + */ + list_add(&smi_msg->link, &intf->waiting_msgs); + break; + } + } + if (!run_to_completion) + spin_unlock_irqrestore(&intf->waiting_msgs_lock, flags); + + /* + * If the pretimout count is non-zero, decrement one from it and + * deliver pretimeouts to all the users. + */ + if (atomic_add_unless(&intf->watchdog_pretimeouts_to_deliver, -1, 0)) { + ipmi_user_t user; + + rcu_read_lock(); + list_for_each_entry_rcu(user, &intf->users, link) { + if (user->handler->ipmi_watchdog_pretimeout) + user->handler->ipmi_watchdog_pretimeout( + user->handler_data); + } + rcu_read_unlock(); + } +} + +static void smi_recv_tasklet(unsigned long val) +{ + handle_new_recv_msgs((ipmi_smi_t) val); +} + /* Handle a new message from the lower layer. */ void ipmi_smi_msg_received(ipmi_smi_t intf, struct ipmi_smi_msg *msg) { unsigned long flags = 0; /* keep us warning-free. 
*/ - int rv; int run_to_completion; @@ -3843,31 +3918,11 @@ void ipmi_smi_msg_received(ipmi_smi_t intf, run_to_completion = intf->run_to_completion; if (!run_to_completion) spin_lock_irqsave(&intf->waiting_msgs_lock, flags); - if (!list_empty(&intf->waiting_msgs)) { - list_add_tail(&msg->link, &intf->waiting_msgs); - if (!run_to_completion) - spin_unlock_irqrestore(&intf->waiting_msgs_lock, flags); - goto out; - } + list_add_tail(&msg->link, &intf->waiting_msgs); if (!run_to_completion) spin_unlock_irqrestore(&intf->waiting_msgs_lock, flags); - rv = handle_new_recv_msg(intf, msg); - if (rv > 0) { - /* - * Could not handle the message now, just add it to a - * list to handle later. - */ - run_to_completion = intf->run_to_completion; - if (!run_to_completion) - spin_lock_irqsave(&intf->waiting_msgs_lock, flags); - list_add_tail(&msg->link, &intf->waiting_msgs); - if (!run_to_completion) - spin_unlock_irqrestore(&intf->waiting_msgs_lock, flags); - } else if (rv == 0) { - ipmi_free_smi_msg(msg); - } - + tasklet_schedule(&intf->recv_tasklet); out: return; } @@ -3875,16 +3930,8 @@ EXPORT_SYMBOL(ipmi_smi_msg_received); void ipmi_smi_watchdog_pretimeout(ipmi_smi_t intf) { - ipmi_user_t user; - - rcu_read_lock(); - list_for_each_entry_rcu(user, &intf->users, link) { - if (!user->handler->ipmi_watchdog_pretimeout) - continue; - - user->handler->ipmi_watchdog_pretimeout(user->handler_data); - } - rcu_read_unlock(); + atomic_set(&intf->watchdog_pretimeouts_to_deliver, 1); + tasklet_schedule(&intf->recv_tasklet); } EXPORT_SYMBOL(ipmi_smi_watchdog_pretimeout); @@ -3998,28 +4045,12 @@ static void ipmi_timeout_handler(long timeout_period) ipmi_smi_t intf; struct list_head timeouts; struct ipmi_recv_msg *msg, *msg2; - struct ipmi_smi_msg *smi_msg, *smi_msg2; unsigned long flags; int i; rcu_read_lock(); list_for_each_entry_rcu(intf, &ipmi_interfaces, link) { - /* See if any waiting messages need to be processed. 
*/ - spin_lock_irqsave(&intf->waiting_msgs_lock, flags); - list_for_each_entry_safe(smi_msg, smi_msg2, - &intf->waiting_msgs, link) { - if (!handle_new_recv_msg(intf, smi_msg)) { - list_del(&smi_msg->link); - ipmi_free_smi_msg(smi_msg); - } else { - /* - * To preserve message order, quit if we - * can't handle a message. - */ - break; - } - } - spin_unlock_irqrestore(&intf->waiting_msgs_lock, flags); + tasklet_schedule(&intf->recv_tasklet); /* * Go through the seq table and find any messages that diff --git a/drivers/char/ipmi/ipmi_si_intf.c b/drivers/char/ipmi/ipmi_si_intf.c index 73ebbb1a3269..01e53cd105dd 100644 --- a/drivers/char/ipmi/ipmi_si_intf.c +++ b/drivers/char/ipmi/ipmi_si_intf.c @@ -320,16 +320,8 @@ static int register_xaction_notifier(struct notifier_block *nb) static void deliver_recv_msg(struct smi_info *smi_info, struct ipmi_smi_msg *msg) { - /* Deliver the message to the upper layer with the lock - released. */ - - if (smi_info->run_to_completion) { - ipmi_smi_msg_received(smi_info->intf, msg); - } else { - spin_unlock(&(smi_info->si_lock)); - ipmi_smi_msg_received(smi_info->intf, msg); - spin_lock(&(smi_info->si_lock)); - } + /* Deliver the message to the upper layer. */ + ipmi_smi_msg_received(smi_info->intf, msg); } static void return_hosed_msg(struct smi_info *smi_info, int cCode) @@ -481,9 +473,7 @@ static void handle_flags(struct smi_info *smi_info) start_clear_flags(smi_info); smi_info->msg_flags &= ~WDT_PRE_TIMEOUT_INT; - spin_unlock(&(smi_info->si_lock)); ipmi_smi_watchdog_pretimeout(smi_info->intf); - spin_lock(&(smi_info->si_lock)); } else if (smi_info->msg_flags & RECEIVE_MSG_AVAIL) { /* Messages available. 
*/ smi_info->curr_msg = ipmi_alloc_smi_msg(); From 895dcfd1cab84d7e1c22af645a7f2f3c9bb5f24e Mon Sep 17 00:00:00 2001 From: Corey Minyard Date: Wed, 28 Mar 2012 14:42:49 -0700 Subject: [PATCH 19/35] ipmi: fix message handling during panics The part of the IPMI driver that delivered panic information to the event log and extended the watchdog timeout during a panic was not properly handling the messages. It used static messages to avoid allocation, but wasn't properly waiting for these, or wasn't properly handling the refcounts. Signed-off-by: Corey Minyard Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/char/ipmi/ipmi_msghandler.c | 103 +++++++++++++--------------- drivers/char/ipmi/ipmi_watchdog.c | 17 +++-- 2 files changed, 56 insertions(+), 64 deletions(-) diff --git a/drivers/char/ipmi/ipmi_msghandler.c b/drivers/char/ipmi/ipmi_msghandler.c index 289ab506b79b..5c1820c2a853 100644 --- a/drivers/char/ipmi/ipmi_msghandler.c +++ b/drivers/char/ipmi/ipmi_msghandler.c @@ -2794,16 +2794,18 @@ channel_handler(ipmi_smi_t intf, struct ipmi_recv_msg *msg) return; } -void ipmi_poll_interface(ipmi_user_t user) +static void ipmi_poll(ipmi_smi_t intf) { - ipmi_smi_t intf = user->intf; - if (intf->handlers->poll) intf->handlers->poll(intf->send_info); - /* In case something came in */ handle_new_recv_msgs(intf); } + +void ipmi_poll_interface(ipmi_user_t user) +{ + ipmi_poll(user->intf); +} EXPORT_SYMBOL(ipmi_poll_interface); int ipmi_register_smi(struct ipmi_smi_handlers *handlers, @@ -4204,12 +4206,48 @@ EXPORT_SYMBOL(ipmi_free_recv_msg); #ifdef CONFIG_IPMI_PANIC_EVENT +static atomic_t panic_done_count = ATOMIC_INIT(0); + static void dummy_smi_done_handler(struct ipmi_smi_msg *msg) { + atomic_dec(&panic_done_count); } static void dummy_recv_done_handler(struct ipmi_recv_msg *msg) { + atomic_dec(&panic_done_count); +} + +/* + * Inside a panic, send a message and wait for a response. 
+ */ +static void ipmi_panic_request_and_wait(ipmi_smi_t intf, + struct ipmi_addr *addr, + struct kernel_ipmi_msg *msg) +{ + struct ipmi_smi_msg smi_msg; + struct ipmi_recv_msg recv_msg; + int rv; + + smi_msg.done = dummy_smi_done_handler; + recv_msg.done = dummy_recv_done_handler; + atomic_add(2, &panic_done_count); + rv = i_ipmi_request(NULL, + intf, + addr, + 0, + msg, + intf, + &smi_msg, + &recv_msg, + 0, + intf->channels[0].address, + intf->channels[0].lun, + 0, 1); /* Don't retry, and don't wait. */ + if (rv) + atomic_sub(2, &panic_done_count); + while (atomic_read(&panic_done_count) != 0) + ipmi_poll(intf); } #ifdef CONFIG_IPMI_PANIC_STRING @@ -4248,8 +4286,6 @@ static void send_panic_events(char *str) unsigned char data[16]; struct ipmi_system_interface_addr *si; struct ipmi_addr addr; - struct ipmi_smi_msg smi_msg; - struct ipmi_recv_msg recv_msg; si = (struct ipmi_system_interface_addr *) &addr; si->addr_type = IPMI_SYSTEM_INTERFACE_ADDR_TYPE; @@ -4277,9 +4313,6 @@ static void send_panic_events(char *str) data[7] = str[2]; } - smi_msg.done = dummy_smi_done_handler; - recv_msg.done = dummy_recv_done_handler; - /* For every registered interface, send the event. */ list_for_each_entry_rcu(intf, &ipmi_interfaces, link) { if (!intf->handlers) @@ -4289,18 +4322,7 @@ static void send_panic_events(char *str) intf->run_to_completion = 1; /* Send the event announcing the panic. */ intf->handlers->set_run_to_completion(intf->send_info, 1); - i_ipmi_request(NULL, - intf, - &addr, - 0, - &msg, - intf, - &smi_msg, - &recv_msg, - 0, - intf->channels[0].address, - intf->channels[0].lun, - 0, 1); /* Don't retry, and don't wait. 
*/ + ipmi_panic_request_and_wait(intf, &addr, &msg); } #ifdef CONFIG_IPMI_PANIC_STRING @@ -4348,18 +4370,7 @@ static void send_panic_events(char *str) msg.data = NULL; msg.data_len = 0; intf->null_user_handler = device_id_fetcher; - i_ipmi_request(NULL, - intf, - &addr, - 0, - &msg, - intf, - &smi_msg, - &recv_msg, - 0, - intf->channels[0].address, - intf->channels[0].lun, - 0, 1); /* Don't retry, and don't wait. */ + ipmi_panic_request_and_wait(intf, &addr, &msg); if (intf->local_event_generator) { /* Request the event receiver from the local MC. */ @@ -4368,18 +4379,7 @@ static void send_panic_events(char *str) msg.data = NULL; msg.data_len = 0; intf->null_user_handler = event_receiver_fetcher; - i_ipmi_request(NULL, - intf, - &addr, - 0, - &msg, - intf, - &smi_msg, - &recv_msg, - 0, - intf->channels[0].address, - intf->channels[0].lun, - 0, 1); /* no retry, and no wait. */ + ipmi_panic_request_and_wait(intf, &addr, &msg); } intf->null_user_handler = NULL; @@ -4436,18 +4436,7 @@ static void send_panic_events(char *str) strncpy(data+5, p, 11); p += size; - i_ipmi_request(NULL, - intf, - &addr, - 0, - &msg, - intf, - &smi_msg, - &recv_msg, - 0, - intf->channels[0].address, - intf->channels[0].lun, - 0, 1); /* no retry, and no wait. 
*/ + ipmi_panic_request_and_wait(intf, &addr, &msg); } } #endif /* CONFIG_IPMI_PANIC_STRING */ diff --git a/drivers/char/ipmi/ipmi_watchdog.c b/drivers/char/ipmi/ipmi_watchdog.c index 34767a6d7f42..57a53ba7758c 100644 --- a/drivers/char/ipmi/ipmi_watchdog.c +++ b/drivers/char/ipmi/ipmi_watchdog.c @@ -520,6 +520,7 @@ static void panic_halt_ipmi_heartbeat(void) msg.cmd = IPMI_WDOG_RESET_TIMER; msg.data = NULL; msg.data_len = 0; + atomic_add(2, &panic_done_count); rv = ipmi_request_supply_msgs(watchdog_user, (struct ipmi_addr *) &addr, 0, @@ -528,8 +529,8 @@ static void panic_halt_ipmi_heartbeat(void) &panic_halt_heartbeat_smi_msg, &panic_halt_heartbeat_recv_msg, 1); - if (!rv) - atomic_add(2, &panic_done_count); + if (rv) + atomic_sub(2, &panic_done_count); } static struct ipmi_smi_msg panic_halt_smi_msg = { @@ -553,16 +554,18 @@ static void panic_halt_ipmi_set_timeout(void) /* Wait for the messages to be free. */ while (atomic_read(&panic_done_count) != 0) ipmi_poll_interface(watchdog_user); + atomic_add(2, &panic_done_count); rv = i_ipmi_set_timeout(&panic_halt_smi_msg, &panic_halt_recv_msg, &send_heartbeat_now); - if (!rv) { - atomic_add(2, &panic_done_count); - if (send_heartbeat_now) - panic_halt_ipmi_heartbeat(); - } else + if (rv) { + atomic_sub(2, &panic_done_count); printk(KERN_WARNING PFX "Unable to extend the watchdog timeout."); + } else { + if (send_heartbeat_now) + panic_halt_ipmi_heartbeat(); + } while (atomic_read(&panic_done_count) != 0) ipmi_poll_interface(watchdog_user); } From f60adf42ad55405d1b17e9e5c33fdb63f1eb8861 Mon Sep 17 00:00:00 2001 From: Corey Minyard Date: Wed, 28 Mar 2012 14:42:50 -0700 Subject: [PATCH 20/35] ipmi: simplify locking Now that the IPMI driver is using a tasklet, we can simplify the locking in the driver and get rid of the message lock. 
Signed-off-by: Corey Minyard Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/char/ipmi/ipmi_si_intf.c | 54 +++++++++++++------------------- 1 file changed, 21 insertions(+), 33 deletions(-) diff --git a/drivers/char/ipmi/ipmi_si_intf.c b/drivers/char/ipmi/ipmi_si_intf.c index 01e53cd105dd..3c7e693018d9 100644 --- a/drivers/char/ipmi/ipmi_si_intf.c +++ b/drivers/char/ipmi/ipmi_si_intf.c @@ -171,7 +171,6 @@ struct smi_info { struct si_sm_handlers *handlers; enum si_type si_type; spinlock_t si_lock; - spinlock_t msg_lock; struct list_head xmit_msgs; struct list_head hp_xmit_msgs; struct ipmi_smi_msg *curr_msg; @@ -350,13 +349,6 @@ static enum si_sm_result start_next_msg(struct smi_info *smi_info) struct timeval t; #endif - /* - * No need to save flags, we aleady have interrupts off and we - * already hold the SMI lock. - */ - if (!smi_info->run_to_completion) - spin_lock(&(smi_info->msg_lock)); - /* Pick the high priority queue first. */ if (!list_empty(&(smi_info->hp_xmit_msgs))) { entry = smi_info->hp_xmit_msgs.next; @@ -394,9 +386,6 @@ static enum si_sm_result start_next_msg(struct smi_info *smi_info) rv = SI_SM_CALL_WITHOUT_DELAY; } out: - if (!smi_info->run_to_completion) - spin_unlock(&(smi_info->msg_lock)); - return rv; } @@ -879,19 +868,6 @@ static void sender(void *send_info, printk("**Enqueue: %d.%9.9d\n", t.tv_sec, t.tv_usec); #endif - /* - * last_timeout_jiffies is updated here to avoid - * smi_timeout() handler passing very large time_diff - * value to smi_event_handler() that causes - * the send command to abort. 
- */ - smi_info->last_timeout_jiffies = jiffies; - - mod_timer(&smi_info->si_timer, jiffies + SI_TIMEOUT_JIFFIES); - - if (smi_info->thread) - wake_up_process(smi_info->thread); - if (smi_info->run_to_completion) { /* * If we are running to completion, then throw it in @@ -914,15 +890,26 @@ static void sender(void *send_info, return; } - spin_lock_irqsave(&smi_info->msg_lock, flags); + spin_lock_irqsave(&smi_info->si_lock, flags); if (priority > 0) list_add_tail(&msg->link, &smi_info->hp_xmit_msgs); else list_add_tail(&msg->link, &smi_info->xmit_msgs); - spin_unlock_irqrestore(&smi_info->msg_lock, flags); - spin_lock_irqsave(&smi_info->si_lock, flags); if (smi_info->si_state == SI_NORMAL && smi_info->curr_msg == NULL) { + /* + * last_timeout_jiffies is updated here to avoid + * smi_timeout() handler passing very large time_diff + * value to smi_event_handler() that causes + * the send command to abort. + */ + smi_info->last_timeout_jiffies = jiffies; + + mod_timer(&smi_info->si_timer, jiffies + SI_TIMEOUT_JIFFIES); + + if (smi_info->thread) + wake_up_process(smi_info->thread); + start_next_msg(smi_info); smi_event_handler(smi_info, 0); } @@ -1026,16 +1013,19 @@ static int ipmi_thread(void *data) static void poll(void *send_info) { struct smi_info *smi_info = send_info; - unsigned long flags; + unsigned long flags = 0; + int run_to_completion = smi_info->run_to_completion; /* * Make sure there is some delay in the poll loop so we can * drive time forward and timeout things. 
*/ udelay(10); - spin_lock_irqsave(&smi_info->si_lock, flags); + if (!run_to_completion) + spin_lock_irqsave(&smi_info->si_lock, flags); smi_event_handler(smi_info, 10); - spin_unlock_irqrestore(&smi_info->si_lock, flags); + if (!run_to_completion) + spin_unlock_irqrestore(&smi_info->si_lock, flags); } static void request_events(void *send_info) @@ -1672,10 +1662,8 @@ static struct smi_info *smi_info_alloc(void) { struct smi_info *info = kzalloc(sizeof(*info), GFP_KERNEL); - if (info) { + if (info) spin_lock_init(&info->si_lock); - spin_lock_init(&info->msg_lock); - } return info; } From 423a5bb49ec530ec8bbfc73fd2ded83da8e58684 Mon Sep 17 00:00:00 2001 From: Corey Minyard Date: Wed, 28 Mar 2012 14:42:50 -0700 Subject: [PATCH 21/35] ipmi: use locks on watchdog timeout set on reboot The IPMI watchdog timer clears or extends the timer on reboot/shutdown. It was using the non-locking routine for setting the watchdog timer, but this was causing race conditions. Instead, use the locking version to avoid the races. It seems to work fine. Signed-off-by: Corey Minyard Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/char/ipmi/ipmi_watchdog.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/char/ipmi/ipmi_watchdog.c b/drivers/char/ipmi/ipmi_watchdog.c index 57a53ba7758c..99dc1daa4c37 100644 --- a/drivers/char/ipmi/ipmi_watchdog.c +++ b/drivers/char/ipmi/ipmi_watchdog.c @@ -1167,7 +1167,7 @@ static int wdog_reboot_handler(struct notifier_block *this, if (code == SYS_POWER_OFF || code == SYS_HALT) { /* Disable the WDT if we are shutting down. 
*/ ipmi_watchdog_state = WDOG_TIMEOUT_NONE; - panic_halt_ipmi_set_timeout(); + ipmi_set_timeout(IPMI_SET_TIMEOUT_NO_HB); } else if (ipmi_watchdog_state != WDOG_TIMEOUT_NONE) { /* Set a long timer to let the reboot happens, but reboot if it hangs, but only if the watchdog @@ -1175,7 +1175,7 @@ static int wdog_reboot_handler(struct notifier_block *this, timeout = 120; pretimeout = 0; ipmi_watchdog_state = WDOG_TIMEOUT_RESET; - panic_halt_ipmi_set_timeout(); + ipmi_set_timeout(IPMI_SET_TIMEOUT_NO_HB); } } return NOTIFY_OK; From 5a04cca6c39cdd0b8c75b0628da634248f381b62 Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Wed, 28 Mar 2012 14:42:50 -0700 Subject: [PATCH 22/35] sysctl: use bitmap library functions Use bitmap_set() instead of using set_bit() for each bit. This conversion is valid because the bitmap is private in the function call and atomic bitops were unnecessary. This also includes minor change. - Use bitmap_copy() for shorter typing Signed-off-by: Akinobu Mita Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/kernel/sysctl.c b/kernel/sysctl.c index d48ff4fd44c3..dbd70bdc1765 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include @@ -2393,9 +2394,7 @@ int proc_do_large_bitmap(struct ctl_table *table, int write, } } - while (val_a <= val_b) - set_bit(val_a++, tmp_bitmap); - + bitmap_set(tmp_bitmap, val_a, val_b - val_a + 1); first = 0; proc_skip_char(&kbuf, &left, '\n'); } @@ -2438,8 +2437,7 @@ int proc_do_large_bitmap(struct ctl_table *table, int write, if (*ppos) bitmap_or(bitmap, bitmap, tmp_bitmap, bitmap_len); else - memcpy(bitmap, tmp_bitmap, - BITS_TO_LONGS(bitmap_len) * sizeof(unsigned long)); + bitmap_copy(bitmap, tmp_bitmap, bitmap_len); } kfree(tmp_bitmap); *lenp -= left; From cf3f89214ef6a33fad60856bc5ffd7bb2fc4709b Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: 
Wed, 28 Mar 2012 14:42:51 -0700 Subject: [PATCH 23/35] pidns: add reboot_pid_ns() to handle the reboot syscall In the case of a child pid namespace, rebooting the system does not really make sense. When the pid namespace is used in conjunction with the other namespaces in order to create a linux container, the reboot syscall leads to some problems. A container can reboot the host. That can be fixed by dropping the sys_reboot capability but we are unable to correctly poweroff/halt/reboot a container and the container stays stuck at the shutdown time with the container's init process waiting indefinitely. After several attempts, no solution from userspace was found to reliably handle the shutdown from a container. This patch proposes to make the init process of the child pid namespace exit with a signal status set to: SIGINT if the child pid namespace called "halt/poweroff" and SIGHUP if the child pid namespace called "reboot". When the reboot syscall is called and we are not in the initial pid namespace, we kill the pid namespace for "HALT", "POWEROFF", "RESTART", and "RESTART2". Otherwise we return EINVAL. Returning EINVAL is also an easy way to check if this feature is supported by the kernel when invoking another 'reboot' option like CAD. This way the parent process of the child pid namespace knows if it rebooted or not and can take the right decision.
Test case: ========== #include #include #include #include #include #include #include #include #include static int do_reboot(void *arg) { int *cmd = arg; if (reboot(*cmd)) printf("failed to reboot(%d): %m\n", *cmd); } int test_reboot(int cmd, int sig) { long stack_size = 4096; void *stack = alloca(stack_size) + stack_size; int status; pid_t ret; ret = clone(do_reboot, stack, CLONE_NEWPID | SIGCHLD, &cmd); if (ret < 0) { printf("failed to clone: %m\n"); return -1; } if (wait(&status) < 0) { printf("unexpected wait error: %m\n"); return -1; } if (!WIFSIGNALED(status)) { printf("child process exited but was not signaled\n"); return -1; } if (WTERMSIG(status) != sig) { printf("signal termination is not the one expected\n"); return -1; } return 0; } int main(int argc, char *argv[]) { int status; status = test_reboot(LINUX_REBOOT_CMD_RESTART, SIGHUP); if (status < 0) return 1; printf("reboot(LINUX_REBOOT_CMD_RESTART) succeed\n"); status = test_reboot(LINUX_REBOOT_CMD_RESTART2, SIGHUP); if (status < 0) return 1; printf("reboot(LINUX_REBOOT_CMD_RESTART2) succeed\n"); status = test_reboot(LINUX_REBOOT_CMD_HALT, SIGINT); if (status < 0) return 1; printf("reboot(LINUX_REBOOT_CMD_HALT) succeed\n"); status = test_reboot(LINUX_REBOOT_CMD_POWER_OFF, SIGINT); if (status < 0) return 1; printf("reboot(LINUX_REBOOT_CMD_POWERR_OFF) succeed\n"); status = test_reboot(LINUX_REBOOT_CMD_CAD_ON, -1); if (status >= 0) { printf("reboot(LINUX_REBOOT_CMD_CAD_ON) should have failed\n"); return 1; } printf("reboot(LINUX_REBOOT_CMD_CAD_ON) has failed as expected\n"); return 0; } [akpm@linux-foundation.org: tweak and add comments] [akpm@linux-foundation.org: checkpatch fixes] Signed-off-by: Daniel Lezcano Acked-by: Serge Hallyn Tested-by: Serge Hallyn Reviewed-by: Oleg Nesterov Cc: Michael Kerrisk Cc: "Eric W. 
Biederman" Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/pid_namespace.h | 8 +++++++- kernel/pid_namespace.c | 33 +++++++++++++++++++++++++++++++++ kernel/sys.c | 9 +++++++++ 3 files changed, 49 insertions(+), 1 deletion(-) diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h index f5bd679be46b..b067bd8c49d0 100644 --- a/include/linux/pid_namespace.h +++ b/include/linux/pid_namespace.h @@ -33,6 +33,7 @@ struct pid_namespace { #endif gid_t pid_gid; int hide_pid; + int reboot; /* group exit code if this pidns was rebooted */ }; extern struct pid_namespace init_pid_ns; @@ -48,6 +49,7 @@ static inline struct pid_namespace *get_pid_ns(struct pid_namespace *ns) extern struct pid_namespace *copy_pid_ns(unsigned long flags, struct pid_namespace *ns); extern void free_pid_ns(struct kref *kref); extern void zap_pid_ns_processes(struct pid_namespace *pid_ns); +extern int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd); static inline void put_pid_ns(struct pid_namespace *ns) { @@ -75,11 +77,15 @@ static inline void put_pid_ns(struct pid_namespace *ns) { } - static inline void zap_pid_ns_processes(struct pid_namespace *ns) { BUG(); } + +static inline int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd) +{ + return 0; +} #endif /* CONFIG_PID_NS */ extern struct pid_namespace *task_active_pid_ns(struct task_struct *tsk); diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index 17b232869a04..57bc1fd35b3c 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -15,6 +15,7 @@ #include #include #include +#include #define BITS_PER_PAGE (PAGE_SIZE*8) @@ -183,6 +184,9 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns) rc = sys_wait4(-1, NULL, __WALL, NULL); } while (rc != -ECHILD); + if (pid_ns->reboot) + current->signal->group_exit_code = pid_ns->reboot; + acct_exit_ns(pid_ns); return; } @@ -217,6 +221,35 @@ static struct ctl_table pid_ns_ctl_table[] = { static struct ctl_path 
kern_path[] = { { .procname = "kernel", }, { } }; +int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd) +{ + if (pid_ns == &init_pid_ns) + return 0; + + switch (cmd) { + case LINUX_REBOOT_CMD_RESTART2: + case LINUX_REBOOT_CMD_RESTART: + pid_ns->reboot = SIGHUP; + break; + + case LINUX_REBOOT_CMD_POWER_OFF: + case LINUX_REBOOT_CMD_HALT: + pid_ns->reboot = SIGINT; + break; + default: + return -EINVAL; + } + + read_lock(&tasklist_lock); + force_sig(SIGKILL, pid_ns->child_reaper); + read_unlock(&tasklist_lock); + + do_exit(0); + + /* Not reached */ + return 0; +} + static __init int pid_namespaces_init(void) { pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC); diff --git a/kernel/sys.c b/kernel/sys.c index 9eb7fcab8df6..e7006eb6c1e4 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -444,6 +444,15 @@ SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd, magic2 != LINUX_REBOOT_MAGIC2C)) return -EINVAL; + /* + * If pid namespaces are enabled and the current task is in a child + * pid_namespace, the command is handled by reboot_pid_ns() which will + * call do_exit(). + */ + ret = reboot_pid_ns(task_active_pid_ns(current), cmd); + if (ret) + return ret; + /* Instead of trying to make the power_off code look like * halt when pm_power_off is not set do it the easy way. */ From f4507164e7796b66c371ff9a63154f1c884a2433 Mon Sep 17 00:00:00 2001 From: Wanlong Gao Date: Wed, 28 Mar 2012 14:42:51 -0700 Subject: [PATCH 24/35] nbd: rename the nbd_device variable from lo to nbd rename the nbd_device variable from "lo" to "nbd", since "lo" is just a name copied from loop.c. 
Signed-off-by: Wanlong Gao Cc: Paul Clements Cc: Jens Axboe Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/nbd.c | 295 ++++++++++++++++++++++---------------------- 1 file changed, 148 insertions(+), 147 deletions(-) diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index c3f0ee16594d..864db101ad2d 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -39,7 +39,7 @@ #include -#define LO_MAGIC 0x68797548 +#define NBD_MAGIC 0x68797548 #ifdef NDEBUG #define dprintk(flags, fmt...) @@ -116,7 +116,7 @@ static void nbd_end_request(struct request *req) spin_unlock_irqrestore(q->queue_lock, flags); } -static void sock_shutdown(struct nbd_device *lo, int lock) +static void sock_shutdown(struct nbd_device *nbd, int lock) { /* Forcibly shutdown the socket causing all listeners * to error @@ -125,14 +125,14 @@ static void sock_shutdown(struct nbd_device *lo, int lock) * there should be a more generic interface rather than * calling socket ops directly here */ if (lock) - mutex_lock(&lo->tx_lock); - if (lo->sock) { - dev_warn(disk_to_dev(lo->disk), "shutting down socket\n"); - kernel_sock_shutdown(lo->sock, SHUT_RDWR); - lo->sock = NULL; + mutex_lock(&nbd->tx_lock); + if (nbd->sock) { + dev_warn(disk_to_dev(nbd->disk), "shutting down socket\n"); + kernel_sock_shutdown(nbd->sock, SHUT_RDWR); + nbd->sock = NULL; } if (lock) - mutex_unlock(&lo->tx_lock); + mutex_unlock(&nbd->tx_lock); } static void nbd_xmit_timeout(unsigned long arg) @@ -147,17 +147,17 @@ static void nbd_xmit_timeout(unsigned long arg) /* * Send or receive packet. 
*/ -static int sock_xmit(struct nbd_device *lo, int send, void *buf, int size, +static int sock_xmit(struct nbd_device *nbd, int send, void *buf, int size, int msg_flags) { - struct socket *sock = lo->sock; + struct socket *sock = nbd->sock; int result; struct msghdr msg; struct kvec iov; sigset_t blocked, oldset; if (unlikely(!sock)) { - dev_err(disk_to_dev(lo->disk), + dev_err(disk_to_dev(nbd->disk), "Attempted %s on closed socket in sock_xmit\n", (send ? "send" : "recv")); return -EINVAL; @@ -181,15 +181,15 @@ static int sock_xmit(struct nbd_device *lo, int send, void *buf, int size, if (send) { struct timer_list ti; - if (lo->xmit_timeout) { + if (nbd->xmit_timeout) { init_timer(&ti); ti.function = nbd_xmit_timeout; ti.data = (unsigned long)current; - ti.expires = jiffies + lo->xmit_timeout; + ti.expires = jiffies + nbd->xmit_timeout; add_timer(&ti); } result = kernel_sendmsg(sock, &msg, &iov, 1, size); - if (lo->xmit_timeout) + if (nbd->xmit_timeout) del_timer_sync(&ti); } else result = kernel_recvmsg(sock, &msg, &iov, 1, size, @@ -201,7 +201,7 @@ static int sock_xmit(struct nbd_device *lo, int send, void *buf, int size, task_pid_nr(current), current->comm, dequeue_signal_lock(current, ¤t->blocked, &info)); result = -EINTR; - sock_shutdown(lo, !send); + sock_shutdown(nbd, !send); break; } @@ -219,18 +219,19 @@ static int sock_xmit(struct nbd_device *lo, int send, void *buf, int size, return result; } -static inline int sock_send_bvec(struct nbd_device *lo, struct bio_vec *bvec, +static inline int sock_send_bvec(struct nbd_device *nbd, struct bio_vec *bvec, int flags) { int result; void *kaddr = kmap(bvec->bv_page); - result = sock_xmit(lo, 1, kaddr + bvec->bv_offset, bvec->bv_len, flags); + result = sock_xmit(nbd, 1, kaddr + bvec->bv_offset, + bvec->bv_len, flags); kunmap(bvec->bv_page); return result; } /* always call with the tx_lock held */ -static int nbd_send_req(struct nbd_device *lo, struct request *req) +static int nbd_send_req(struct nbd_device *nbd, 
struct request *req) { int result, flags; struct nbd_request request; @@ -243,14 +244,14 @@ static int nbd_send_req(struct nbd_device *lo, struct request *req) memcpy(request.handle, &req, sizeof(req)); dprintk(DBG_TX, "%s: request %p: sending control (%s@%llu,%uB)\n", - lo->disk->disk_name, req, + nbd->disk->disk_name, req, nbdcmd_to_ascii(nbd_cmd(req)), (unsigned long long)blk_rq_pos(req) << 9, blk_rq_bytes(req)); - result = sock_xmit(lo, 1, &request, sizeof(request), + result = sock_xmit(nbd, 1, &request, sizeof(request), (nbd_cmd(req) == NBD_CMD_WRITE) ? MSG_MORE : 0); if (result <= 0) { - dev_err(disk_to_dev(lo->disk), + dev_err(disk_to_dev(nbd->disk), "Send control failed (result %d)\n", result); goto error_out; } @@ -267,10 +268,10 @@ static int nbd_send_req(struct nbd_device *lo, struct request *req) if (!rq_iter_last(req, iter)) flags = MSG_MORE; dprintk(DBG_TX, "%s: request %p: sending %d bytes data\n", - lo->disk->disk_name, req, bvec->bv_len); - result = sock_send_bvec(lo, bvec, flags); + nbd->disk->disk_name, req, bvec->bv_len); + result = sock_send_bvec(nbd, bvec, flags); if (result <= 0) { - dev_err(disk_to_dev(lo->disk), + dev_err(disk_to_dev(nbd->disk), "Send data failed (result %d)\n", result); goto error_out; @@ -283,25 +284,25 @@ error_out: return -EIO; } -static struct request *nbd_find_request(struct nbd_device *lo, +static struct request *nbd_find_request(struct nbd_device *nbd, struct request *xreq) { struct request *req, *tmp; int err; - err = wait_event_interruptible(lo->active_wq, lo->active_req != xreq); + err = wait_event_interruptible(nbd->active_wq, nbd->active_req != xreq); if (unlikely(err)) goto out; - spin_lock(&lo->queue_lock); - list_for_each_entry_safe(req, tmp, &lo->queue_head, queuelist) { + spin_lock(&nbd->queue_lock); + list_for_each_entry_safe(req, tmp, &nbd->queue_head, queuelist) { if (req != xreq) continue; list_del_init(&req->queuelist); - spin_unlock(&lo->queue_lock); + spin_unlock(&nbd->queue_lock); return req; } - 
spin_unlock(&lo->queue_lock); + spin_unlock(&nbd->queue_lock); err = -ENOENT; @@ -309,78 +310,78 @@ out: return ERR_PTR(err); } -static inline int sock_recv_bvec(struct nbd_device *lo, struct bio_vec *bvec) +static inline int sock_recv_bvec(struct nbd_device *nbd, struct bio_vec *bvec) { int result; void *kaddr = kmap(bvec->bv_page); - result = sock_xmit(lo, 0, kaddr + bvec->bv_offset, bvec->bv_len, + result = sock_xmit(nbd, 0, kaddr + bvec->bv_offset, bvec->bv_len, MSG_WAITALL); kunmap(bvec->bv_page); return result; } /* NULL returned = something went wrong, inform userspace */ -static struct request *nbd_read_stat(struct nbd_device *lo) +static struct request *nbd_read_stat(struct nbd_device *nbd) { int result; struct nbd_reply reply; struct request *req; reply.magic = 0; - result = sock_xmit(lo, 0, &reply, sizeof(reply), MSG_WAITALL); + result = sock_xmit(nbd, 0, &reply, sizeof(reply), MSG_WAITALL); if (result <= 0) { - dev_err(disk_to_dev(lo->disk), + dev_err(disk_to_dev(nbd->disk), "Receive control failed (result %d)\n", result); goto harderror; } if (ntohl(reply.magic) != NBD_REPLY_MAGIC) { - dev_err(disk_to_dev(lo->disk), "Wrong magic (0x%lx)\n", + dev_err(disk_to_dev(nbd->disk), "Wrong magic (0x%lx)\n", (unsigned long)ntohl(reply.magic)); result = -EPROTO; goto harderror; } - req = nbd_find_request(lo, *(struct request **)reply.handle); + req = nbd_find_request(nbd, *(struct request **)reply.handle); if (IS_ERR(req)) { result = PTR_ERR(req); if (result != -ENOENT) goto harderror; - dev_err(disk_to_dev(lo->disk), "Unexpected reply (%p)\n", + dev_err(disk_to_dev(nbd->disk), "Unexpected reply (%p)\n", reply.handle); result = -EBADR; goto harderror; } if (ntohl(reply.error)) { - dev_err(disk_to_dev(lo->disk), "Other side returned error (%d)\n", + dev_err(disk_to_dev(nbd->disk), "Other side returned error (%d)\n", ntohl(reply.error)); req->errors++; return req; } dprintk(DBG_RX, "%s: request %p: got reply\n", - lo->disk->disk_name, req); + nbd->disk->disk_name, 
req); if (nbd_cmd(req) == NBD_CMD_READ) { struct req_iterator iter; struct bio_vec *bvec; rq_for_each_segment(bvec, req, iter) { - result = sock_recv_bvec(lo, bvec); + result = sock_recv_bvec(nbd, bvec); if (result <= 0) { - dev_err(disk_to_dev(lo->disk), "Receive data failed (result %d)\n", + dev_err(disk_to_dev(nbd->disk), "Receive data failed (result %d)\n", result); req->errors++; return req; } dprintk(DBG_RX, "%s: request %p: got %d bytes data\n", - lo->disk->disk_name, req, bvec->bv_len); + nbd->disk->disk_name, req, bvec->bv_len); } } return req; harderror: - lo->harderror = result; + nbd->harderror = result; return NULL; } @@ -398,48 +399,48 @@ static struct device_attribute pid_attr = { .show = pid_show, }; -static int nbd_do_it(struct nbd_device *lo) +static int nbd_do_it(struct nbd_device *nbd) { struct request *req; int ret; - BUG_ON(lo->magic != LO_MAGIC); + BUG_ON(nbd->magic != NBD_MAGIC); - lo->pid = task_pid_nr(current); - ret = device_create_file(disk_to_dev(lo->disk), &pid_attr); + nbd->pid = task_pid_nr(current); + ret = device_create_file(disk_to_dev(nbd->disk), &pid_attr); if (ret) { - dev_err(disk_to_dev(lo->disk), "device_create_file failed!\n"); - lo->pid = 0; + dev_err(disk_to_dev(nbd->disk), "device_create_file failed!\n"); + nbd->pid = 0; return ret; } - while ((req = nbd_read_stat(lo)) != NULL) + while ((req = nbd_read_stat(nbd)) != NULL) nbd_end_request(req); - device_remove_file(disk_to_dev(lo->disk), &pid_attr); - lo->pid = 0; + device_remove_file(disk_to_dev(nbd->disk), &pid_attr); + nbd->pid = 0; return 0; } -static void nbd_clear_que(struct nbd_device *lo) +static void nbd_clear_que(struct nbd_device *nbd) { struct request *req; - BUG_ON(lo->magic != LO_MAGIC); + BUG_ON(nbd->magic != NBD_MAGIC); /* - * Because we have set lo->sock to NULL under the tx_lock, all + * Because we have set nbd->sock to NULL under the tx_lock, all * modifications to the list must have completed by now. For * the same reason, the active_req must be NULL. 
* * As a consequence, we don't need to take the spin lock while * purging the list here. */ - BUG_ON(lo->sock); - BUG_ON(lo->active_req); + BUG_ON(nbd->sock); + BUG_ON(nbd->active_req); - while (!list_empty(&lo->queue_head)) { - req = list_entry(lo->queue_head.next, struct request, + while (!list_empty(&nbd->queue_head)) { + req = list_entry(nbd->queue_head.next, struct request, queuelist); list_del_init(&req->queuelist); req->errors++; @@ -448,7 +449,7 @@ static void nbd_clear_que(struct nbd_device *lo) } -static void nbd_handle_req(struct nbd_device *lo, struct request *req) +static void nbd_handle_req(struct nbd_device *nbd, struct request *req) { if (req->cmd_type != REQ_TYPE_FS) goto error_out; @@ -456,8 +457,8 @@ static void nbd_handle_req(struct nbd_device *lo, struct request *req) nbd_cmd(req) = NBD_CMD_READ; if (rq_data_dir(req) == WRITE) { nbd_cmd(req) = NBD_CMD_WRITE; - if (lo->flags & NBD_READ_ONLY) { - dev_err(disk_to_dev(lo->disk), + if (nbd->flags & NBD_READ_ONLY) { + dev_err(disk_to_dev(nbd->disk), "Write on read-only\n"); goto error_out; } @@ -465,29 +466,29 @@ static void nbd_handle_req(struct nbd_device *lo, struct request *req) req->errors = 0; - mutex_lock(&lo->tx_lock); - if (unlikely(!lo->sock)) { - mutex_unlock(&lo->tx_lock); - dev_err(disk_to_dev(lo->disk), + mutex_lock(&nbd->tx_lock); + if (unlikely(!nbd->sock)) { + mutex_unlock(&nbd->tx_lock); + dev_err(disk_to_dev(nbd->disk), "Attempted send on closed socket\n"); goto error_out; } - lo->active_req = req; + nbd->active_req = req; - if (nbd_send_req(lo, req) != 0) { - dev_err(disk_to_dev(lo->disk), "Request send failed\n"); + if (nbd_send_req(nbd, req) != 0) { + dev_err(disk_to_dev(nbd->disk), "Request send failed\n"); req->errors++; nbd_end_request(req); } else { - spin_lock(&lo->queue_lock); - list_add(&req->queuelist, &lo->queue_head); - spin_unlock(&lo->queue_lock); + spin_lock(&nbd->queue_lock); + list_add(&req->queuelist, &nbd->queue_head); + spin_unlock(&nbd->queue_lock); } - 
lo->active_req = NULL; - mutex_unlock(&lo->tx_lock); - wake_up_all(&lo->active_wq); + nbd->active_req = NULL; + mutex_unlock(&nbd->tx_lock); + wake_up_all(&nbd->active_wq); return; @@ -498,28 +499,28 @@ error_out: static int nbd_thread(void *data) { - struct nbd_device *lo = data; + struct nbd_device *nbd = data; struct request *req; set_user_nice(current, -20); - while (!kthread_should_stop() || !list_empty(&lo->waiting_queue)) { + while (!kthread_should_stop() || !list_empty(&nbd->waiting_queue)) { /* wait for something to do */ - wait_event_interruptible(lo->waiting_wq, + wait_event_interruptible(nbd->waiting_wq, kthread_should_stop() || - !list_empty(&lo->waiting_queue)); + !list_empty(&nbd->waiting_queue)); /* extract request */ - if (list_empty(&lo->waiting_queue)) + if (list_empty(&nbd->waiting_queue)) continue; - spin_lock_irq(&lo->queue_lock); - req = list_entry(lo->waiting_queue.next, struct request, + spin_lock_irq(&nbd->queue_lock); + req = list_entry(nbd->waiting_queue.next, struct request, queuelist); list_del_init(&req->queuelist); - spin_unlock_irq(&lo->queue_lock); + spin_unlock_irq(&nbd->queue_lock); /* handle request */ - nbd_handle_req(lo, req); + nbd_handle_req(nbd, req); } return 0; } @@ -527,7 +528,7 @@ static int nbd_thread(void *data) /* * We always wait for result of write, for now. 
It would be nice to make it optional * in future - * if ((rq_data_dir(req) == WRITE) && (lo->flags & NBD_WRITE_NOCHK)) + * if ((rq_data_dir(req) == WRITE) && (nbd->flags & NBD_WRITE_NOCHK)) * { printk( "Warning: Ignoring result!\n"); nbd_end_request( req ); } */ @@ -536,19 +537,19 @@ static void do_nbd_request(struct request_queue *q) struct request *req; while ((req = blk_fetch_request(q)) != NULL) { - struct nbd_device *lo; + struct nbd_device *nbd; spin_unlock_irq(q->queue_lock); dprintk(DBG_BLKDEV, "%s: request %p: dequeued (flags=%x)\n", req->rq_disk->disk_name, req, req->cmd_type); - lo = req->rq_disk->private_data; + nbd = req->rq_disk->private_data; - BUG_ON(lo->magic != LO_MAGIC); + BUG_ON(nbd->magic != NBD_MAGIC); - if (unlikely(!lo->sock)) { - dev_err(disk_to_dev(lo->disk), + if (unlikely(!nbd->sock)) { + dev_err(disk_to_dev(nbd->disk), "Attempted send on closed socket\n"); req->errors++; nbd_end_request(req); @@ -556,11 +557,11 @@ static void do_nbd_request(struct request_queue *q) continue; } - spin_lock_irq(&lo->queue_lock); - list_add_tail(&req->queuelist, &lo->waiting_queue); - spin_unlock_irq(&lo->queue_lock); + spin_lock_irq(&nbd->queue_lock); + list_add_tail(&req->queuelist, &nbd->waiting_queue); + spin_unlock_irq(&nbd->queue_lock); - wake_up(&lo->waiting_wq); + wake_up(&nbd->waiting_wq); spin_lock_irq(q->queue_lock); } @@ -568,32 +569,32 @@ static void do_nbd_request(struct request_queue *q) /* Must be called with tx_lock held */ -static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *lo, +static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd, unsigned int cmd, unsigned long arg) { switch (cmd) { case NBD_DISCONNECT: { struct request sreq; - dev_info(disk_to_dev(lo->disk), "NBD_DISCONNECT\n"); + dev_info(disk_to_dev(nbd->disk), "NBD_DISCONNECT\n"); blk_rq_init(NULL, &sreq); sreq.cmd_type = REQ_TYPE_SPECIAL; nbd_cmd(&sreq) = NBD_CMD_DISC; - if (!lo->sock) + if (!nbd->sock) return -EINVAL; - nbd_send_req(lo, 
&sreq); + nbd_send_req(nbd, &sreq); return 0; } case NBD_CLEAR_SOCK: { struct file *file; - lo->sock = NULL; - file = lo->file; - lo->file = NULL; - nbd_clear_que(lo); - BUG_ON(!list_empty(&lo->queue_head)); + nbd->sock = NULL; + file = nbd->file; + nbd->file = NULL; + nbd_clear_que(nbd); + BUG_ON(!list_empty(&nbd->queue_head)); if (file) fput(file); return 0; @@ -601,14 +602,14 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *lo, case NBD_SET_SOCK: { struct file *file; - if (lo->file) + if (nbd->file) return -EBUSY; file = fget(arg); if (file) { struct inode *inode = file->f_path.dentry->d_inode; if (S_ISSOCK(inode->i_mode)) { - lo->file = file; - lo->sock = SOCKET_I(inode); + nbd->file = file; + nbd->sock = SOCKET_I(inode); if (max_part > 0) bdev->bd_invalidated = 1; return 0; @@ -620,29 +621,29 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *lo, } case NBD_SET_BLKSIZE: - lo->blksize = arg; - lo->bytesize &= ~(lo->blksize-1); - bdev->bd_inode->i_size = lo->bytesize; - set_blocksize(bdev, lo->blksize); - set_capacity(lo->disk, lo->bytesize >> 9); + nbd->blksize = arg; + nbd->bytesize &= ~(nbd->blksize-1); + bdev->bd_inode->i_size = nbd->bytesize; + set_blocksize(bdev, nbd->blksize); + set_capacity(nbd->disk, nbd->bytesize >> 9); return 0; case NBD_SET_SIZE: - lo->bytesize = arg & ~(lo->blksize-1); - bdev->bd_inode->i_size = lo->bytesize; - set_blocksize(bdev, lo->blksize); - set_capacity(lo->disk, lo->bytesize >> 9); + nbd->bytesize = arg & ~(nbd->blksize-1); + bdev->bd_inode->i_size = nbd->bytesize; + set_blocksize(bdev, nbd->blksize); + set_capacity(nbd->disk, nbd->bytesize >> 9); return 0; case NBD_SET_TIMEOUT: - lo->xmit_timeout = arg * HZ; + nbd->xmit_timeout = arg * HZ; return 0; case NBD_SET_SIZE_BLOCKS: - lo->bytesize = ((u64) arg) * lo->blksize; - bdev->bd_inode->i_size = lo->bytesize; - set_blocksize(bdev, lo->blksize); - set_capacity(lo->disk, lo->bytesize >> 9); + nbd->bytesize = ((u64) arg) * 
nbd->blksize; + bdev->bd_inode->i_size = nbd->bytesize; + set_blocksize(bdev, nbd->blksize); + set_capacity(nbd->disk, nbd->bytesize >> 9); return 0; case NBD_DO_IT: { @@ -650,38 +651,38 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *lo, struct file *file; int error; - if (lo->pid) + if (nbd->pid) return -EBUSY; - if (!lo->file) + if (!nbd->file) return -EINVAL; - mutex_unlock(&lo->tx_lock); + mutex_unlock(&nbd->tx_lock); - thread = kthread_create(nbd_thread, lo, lo->disk->disk_name); + thread = kthread_create(nbd_thread, nbd, nbd->disk->disk_name); if (IS_ERR(thread)) { - mutex_lock(&lo->tx_lock); + mutex_lock(&nbd->tx_lock); return PTR_ERR(thread); } wake_up_process(thread); - error = nbd_do_it(lo); + error = nbd_do_it(nbd); kthread_stop(thread); - mutex_lock(&lo->tx_lock); + mutex_lock(&nbd->tx_lock); if (error) return error; - sock_shutdown(lo, 0); - file = lo->file; - lo->file = NULL; - nbd_clear_que(lo); - dev_warn(disk_to_dev(lo->disk), "queue cleared\n"); + sock_shutdown(nbd, 0); + file = nbd->file; + nbd->file = NULL; + nbd_clear_que(nbd); + dev_warn(disk_to_dev(nbd->disk), "queue cleared\n"); if (file) fput(file); - lo->bytesize = 0; + nbd->bytesize = 0; bdev->bd_inode->i_size = 0; - set_capacity(lo->disk, 0); + set_capacity(nbd->disk, 0); if (max_part > 0) ioctl_by_bdev(bdev, BLKRRPART, 0); - return lo->harderror; + return nbd->harderror; } case NBD_CLEAR_QUE: @@ -689,14 +690,14 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *lo, * This is for compatibility only. The queue is always cleared * by NBD_DO_IT or NBD_CLEAR_SOCK. 
*/ - BUG_ON(!lo->sock && !list_empty(&lo->queue_head)); + BUG_ON(!nbd->sock && !list_empty(&nbd->queue_head)); return 0; case NBD_PRINT_DEBUG: - dev_info(disk_to_dev(lo->disk), + dev_info(disk_to_dev(nbd->disk), "next = %p, prev = %p, head = %p\n", - lo->queue_head.next, lo->queue_head.prev, - &lo->queue_head); + nbd->queue_head.next, nbd->queue_head.prev, + &nbd->queue_head); return 0; } return -ENOTTY; @@ -705,21 +706,21 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *lo, static int nbd_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, unsigned long arg) { - struct nbd_device *lo = bdev->bd_disk->private_data; + struct nbd_device *nbd = bdev->bd_disk->private_data; int error; if (!capable(CAP_SYS_ADMIN)) return -EPERM; - BUG_ON(lo->magic != LO_MAGIC); + BUG_ON(nbd->magic != NBD_MAGIC); /* Anyone capable of this syscall can do *real bad* things */ dprintk(DBG_IOCTL, "%s: nbd_ioctl cmd=%s(0x%x) arg=%lu\n", - lo->disk->disk_name, ioctl_cmd_to_ascii(cmd), cmd, arg); + nbd->disk->disk_name, ioctl_cmd_to_ascii(cmd), cmd, arg); - mutex_lock(&lo->tx_lock); - error = __nbd_ioctl(bdev, lo, cmd, arg); - mutex_unlock(&lo->tx_lock); + mutex_lock(&nbd->tx_lock); + error = __nbd_ioctl(bdev, nbd, cmd, arg); + mutex_unlock(&nbd->tx_lock); return error; } @@ -805,7 +806,7 @@ static int __init nbd_init(void) for (i = 0; i < nbds_max; i++) { struct gendisk *disk = nbd_dev[i].disk; nbd_dev[i].file = NULL; - nbd_dev[i].magic = LO_MAGIC; + nbd_dev[i].magic = NBD_MAGIC; nbd_dev[i].flags = 0; INIT_LIST_HEAD(&nbd_dev[i].waiting_queue); spin_lock_init(&nbd_dev[i].queue_lock); From 4c619aa0ba171c092a0ae5d969364deb82dbe371 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 28 Mar 2012 14:42:52 -0700 Subject: [PATCH 25/35] fs/proc/namespaces.c: prevent crash when ns_entries[] is empty If CONFIG_NET_NS, CONFIG_UTS_NS and CONFIG_IPC_NS are disabled, ns_entries[] becomes empty and things like ns_entries[ARRAY_SIZE(ns_entries) - 1] will explode. 
Reported-by: Richard Weinberger Cc: "Eric W. Biederman" Cc: Daniel Lezcano Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/namespaces.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c index 3551f1f839eb..0d9e23a39e49 100644 --- a/fs/proc/namespaces.c +++ b/fs/proc/namespaces.c @@ -156,15 +156,15 @@ static struct dentry *proc_ns_dir_lookup(struct inode *dir, if (!ptrace_may_access(task, PTRACE_MODE_READ)) goto out; - last = &ns_entries[ARRAY_SIZE(ns_entries) - 1]; - for (entry = ns_entries; entry <= last; entry++) { + last = &ns_entries[ARRAY_SIZE(ns_entries)]; + for (entry = ns_entries; entry < last; entry++) { if (strlen((*entry)->name) != len) continue; if (!memcmp(dentry->d_name.name, (*entry)->name, len)) break; } error = ERR_PTR(-ENOENT); - if (entry > last) + if (entry == last) goto out; error = proc_ns_instantiate(dir, dentry, task, *entry); From 78c1d78488a3c45685d993130c9f17102dc79a54 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Wed, 28 Mar 2012 14:42:53 -0700 Subject: [PATCH 26/35] radix-tree: introduce bit-optimized iterator A series of radix tree cleanups, and usage of them in the core pagecache code. Micro-benchmark: lookup 14 slots (typical page-vector size) in radix-tree where each slot is filled and tagged before/after - nsec per full scan through tree * Intel Sandy Bridge i7-2620M 4Mb L3 New code always faster * AMD Athlon 6000+ 2x1Mb L2, without L3 New code generally faster, Minor degradation (marked with "*") for huge sparse trees * i386 on Sandy Bridge New code faster for common cases: tagged and dense trees. Some degradations for non-tagged lookup on sparse trees. Ideally, an __ffs() analog for finding the first non-zero long element in an array would help here, because gcc sometimes cannot optimize this loop correctly.
Numbers: CPU: Intel Sandy Bridge i7-2620M 4Mb L3 radix-tree with 1024 slots: tagged lookup step 1 before 7156 after 3613 step 2 before 5399 after 2696 step 3 before 4779 after 1928 step 4 before 4456 after 1429 step 5 before 4292 after 1213 step 6 before 4183 after 1052 step 7 before 4157 after 951 step 8 before 4016 after 812 step 9 before 3952 after 851 step 10 before 3937 after 732 step 11 before 4023 after 709 step 12 before 3872 after 657 step 13 before 3892 after 633 step 14 before 3720 after 591 step 15 before 3879 after 578 step 16 before 3561 after 513 normal lookup step 1 before 4266 after 3301 step 2 before 2695 after 2129 step 3 before 2083 after 1712 step 4 before 1801 after 1534 step 5 before 1628 after 1313 step 6 before 1551 after 1263 step 7 before 1475 after 1185 step 8 before 1432 after 1167 step 9 before 1373 after 1092 step 10 before 1339 after 1134 step 11 before 1292 after 1056 step 12 before 1319 after 1030 step 13 before 1276 after 1004 step 14 before 1256 after 987 step 15 before 1228 after 992 step 16 before 1247 after 999 radix-tree with 1024*1024*128 slots: tagged lookup step 1 before 1086102841 after 674196409 step 2 before 816839155 after 498138306 step 7 before 599728907 after 240676762 step 15 before 555729253 after 185219677 step 63 before 606637748 after 128585664 step 64 before 608384432 after 102945089 step 65 before 596987114 after 123996019 step 128 before 304459225 after 56783056 step 256 before 158846855 after 31232481 step 512 before 86085652 after 18950595 step 12345 before 6517189 after 1674057 normal lookup step 1 before 626064869 after 544418266 step 2 before 418809975 after 336321473 step 7 before 242303598 after 207755560 step 15 before 208380563 after 176496355 step 63 before 186854206 after 167283638 step 64 before 176188060 after 170143976 step 65 before 185139608 after 167487116 step 128 before 88181865 after 86913490 step 256 before 45733628 after 45143534 step 512 before 24506038 after 23859036 step 12345 before 
2177425 after 2018662 * AMD Athlon 6000+ 2x1Mb L2, without L3 radix-tree with 1024 slots: tag-lookup step 1 before 8164 after 5379 step 2 before 5818 after 5581 step 3 before 4959 after 4213 step 4 before 4371 after 3386 step 5 before 4204 after 2997 step 6 before 4950 after 2744 step 7 before 4598 after 2480 step 8 before 4251 after 2288 step 9 before 4262 after 2243 step 10 before 4175 after 2131 step 11 before 3999 after 2024 step 12 before 3979 after 1994 step 13 before 3842 after 1929 step 14 before 3750 after 1810 step 15 before 3735 after 1810 step 16 before 3532 after 1660 normal-lookup step 1 before 7875 after 5847 step 2 before 4808 after 4071 step 3 before 4073 after 3462 step 4 before 3677 after 3074 step 5 before 4308 after 2978 step 6 before 3911 after 3807 step 7 before 3635 after 3522 step 8 before 3313 after 3202 step 9 before 3280 after 3257 step 10 before 3166 after 3083 step 11 before 3066 after 3026 step 12 before 2985 after 2982 step 13 before 2925 after 2924 step 14 before 2834 after 2808 step 15 before 2805 after 2803 step 16 before 2647 after 2622 radix-tree with 1024*1024*128 slots: tag-lookup step 1 before 1288059720 after 951736580 step 2 before 961292300 after 884212140 step 7 before 768905140 after 547267580 step 15 before 771319480 after 456550640 step 63 before 504847640 after 242704304 step 64 before 392484800 after 177920786 step 65 before 491162160 after 246895264 step 128 before 208084064 after 97348392 step 256 before 112401035 after 51408126 step 512 before 75825834 after 29145070 step 12345 before 5603166 after 2847330 normal-lookup step 1 before 1025677120 after 861375100 step 2 before 647220080 after 572258540 step 7 before 505518960 after 484041813 step 15 before 430483053 after 444815320 * step 63 before 388113453 after 404250546 * step 64 before 374154666 after 396027440 * step 65 before 381423973 after 396704853 * step 128 before 190078700 after 202619384 * step 256 before 100886756 after 102829108 * step 512 before 
64074505 after 56158720 step 12345 before 4237289 after 4422299 * * i686 on Sandy bridge radix-tree with 1024 slots: tagged lookup step 1 before 7990 after 4019 step 2 before 5698 after 2897 step 3 before 5013 after 2475 step 4 before 4630 after 1721 step 5 before 4346 after 1759 step 6 before 4299 after 1556 step 7 before 4098 after 1513 step 8 before 4115 after 1222 step 9 before 3983 after 1390 step 10 before 4077 after 1207 step 11 before 3921 after 1231 step 12 before 3894 after 1116 step 13 before 3840 after 1147 step 14 before 3799 after 1090 step 15 before 3797 after 1059 step 16 before 3783 after 745 normal lookup step 1 before 5103 after 3499 step 2 before 3299 after 2550 step 3 before 2489 after 2370 step 4 before 2034 after 2302 * step 5 before 1846 after 2268 * step 6 before 1752 after 2249 * step 7 before 1679 after 2164 * step 8 before 1627 after 2153 * step 9 before 1542 after 2095 * step 10 before 1479 after 2109 * step 11 before 1469 after 2009 * step 12 before 1445 after 2039 * step 13 before 1411 after 2013 * step 14 before 1374 after 2046 * step 15 before 1340 after 1975 * step 16 before 1331 after 2000 * radix-tree with 1024*1024*128 slots: tagged lookup step 1 before 1225865377 after 667153553 step 2 before 842427423 after 471533007 step 7 before 609296153 after 276260116 step 15 before 544232060 after 226859105 step 63 before 519209199 after 141343043 step 64 before 588980279 after 141951339 step 65 before 521099710 after 138282060 step 128 before 298476778 after 83390628 step 256 before 149358342 after 43602609 step 512 before 76994713 after 22911077 step 12345 before 5328666 after 1472111 normal lookup step 1 before 819284564 after 533635310 step 2 before 512421605 after 364956155 step 7 before 271443305 after 305721345 * step 15 before 223591630 after 273960216 * step 63 before 190320247 after 217770207 * step 64 before 178538168 after 267411372 * step 65 before 186400423 after 215347937 * step 128 before 88106045 after 140540612 * step 
256 before 44812420 after 70660377 * step 512 before 24435438 after 36328275 * step 12345 before 2123924 after 2148062 * bloat-o-meter delta for this patchset + patchset with related shmem cleanups bloat-o-meter: x86_64 add/remove: 4/3 grow/shrink: 5/6 up/down: 928/-939 (-11) function old new delta radix_tree_next_chunk - 499 +499 shmem_unuse 428 554 +126 shmem_radix_tree_replace 131 227 +96 find_get_pages_tag 354 419 +65 find_get_pages_contig 345 407 +62 find_get_pages 362 396 +34 __kstrtab_radix_tree_next_chunk - 22 +22 __ksymtab_radix_tree_next_chunk - 16 +16 __kcrctab_radix_tree_next_chunk - 8 +8 radix_tree_gang_lookup_slot 204 203 -1 static.shmem_xattr_set 384 381 -3 radix_tree_gang_lookup_tag_slot 208 191 -17 radix_tree_gang_lookup 231 187 -44 radix_tree_gang_lookup_tag 247 199 -48 shmem_unlock_mapping 278 190 -88 __lookup 217 - -217 __lookup_tag 242 - -242 radix_tree_locate_item 279 - -279 bloat-o-meter: i386 add/remove: 3/3 grow/shrink: 8/9 up/down: 1075/-1275 (-200) function old new delta radix_tree_next_chunk - 757 +757 shmem_unuse 352 449 +97 find_get_pages_contig 269 322 +53 shmem_radix_tree_replace 113 154 +41 find_get_pages_tag 277 318 +41 dcache_dir_lseek 426 458 +32 __kstrtab_radix_tree_next_chunk - 22 +22 vc_do_resize 968 977 +9 snd_pcm_lib_read1 725 733 +8 __ksymtab_radix_tree_next_chunk - 8 +8 netlbl_cipsov4_list 1120 1127 +7 find_get_pages 293 291 -2 new_slab 467 459 -8 bitfill_unaligned_rev 425 417 -8 radix_tree_gang_lookup_tag_slot 177 146 -31 blk_dump_cmd 267 229 -38 radix_tree_gang_lookup_slot 212 134 -78 shmem_unlock_mapping 221 128 -93 radix_tree_gang_lookup_tag 275 162 -113 radix_tree_gang_lookup 255 126 -129 __lookup 227 - -227 __lookup_tag 271 - -271 radix_tree_locate_item 277 - -277 This patch: Implement a clean, simple and effective radix-tree iteration routine. 
Iteration is divided into two phases: * look up the next chunk in a radix-tree leaf node * iterate through the slots in this chunk The main iterator function radix_tree_next_chunk() returns a pointer to the first slot and stores the index one past the chunk's last slot in struct radix_tree_iter. For tagged iteration it also constructs a bitmask of tags for the returned chunk. All additional logic is implemented as static-inline functions and macros. Also adds radix_tree_find_next_bit(), a static-inline variant of find_next_bit() optimized for small constant-size arrays, because find_next_bit() is too heavy for searching in an array with one or two long elements. [akpm@linux-foundation.org: rework comments a bit] Signed-off-by: Konstantin Khlebnikov Tested-by: Hugh Dickins Cc: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/radix-tree.h | 196 +++++++++++++++++++++++++++++++++++++ lib/radix-tree.c | 151 ++++++++++++++++++++++++++++ 2 files changed, 347 insertions(+) diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h index e9a48234e693..0d04cd69ab9b 100644 --- a/include/linux/radix-tree.h +++ b/include/linux/radix-tree.h @@ -2,6 +2,7 @@ * Copyright (C) 2001 Momchil Velikov * Portions Copyright (C) 2001 Christoph Hellwig * Copyright (C) 2006 Nick Piggin + * Copyright (C) 2012 Konstantin Khlebnikov * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as @@ -257,4 +258,199 @@ static inline void radix_tree_preload_end(void) preempt_enable(); } +/** + * struct radix_tree_iter - radix tree iterator state + * + * @index: index of current slot + * @next_index: next-to-last index for this chunk + * @tags: bit-mask for tag-iterating + * + * This radix tree iterator works in terms of "chunks" of slots. A chunk is a + * subinterval of slots contained within one radix tree leaf node.
It is + * described by a pointer to its first slot and a struct radix_tree_iter + * which holds the chunk's position in the tree and its size. For tagged + * iteration radix_tree_iter also holds the slots' bit-mask for one chosen + * radix tree tag. + */ +struct radix_tree_iter { + unsigned long index; + unsigned long next_index; + unsigned long tags; +}; + +#define RADIX_TREE_ITER_TAG_MASK 0x00FF /* tag index in lower byte */ +#define RADIX_TREE_ITER_TAGGED 0x0100 /* lookup tagged slots */ +#define RADIX_TREE_ITER_CONTIG 0x0200 /* stop at first hole */ + +/** + * radix_tree_iter_init - initialize radix tree iterator + * + * @iter: pointer to iterator state + * @start: iteration starting index + * Returns: NULL + */ +static __always_inline void ** +radix_tree_iter_init(struct radix_tree_iter *iter, unsigned long start) +{ + /* + * Leave iter->tags uninitialized. radix_tree_next_chunk() will fill it + * in the case of a successful tagged chunk lookup. If the lookup was + * unsuccessful or non-tagged then nobody cares about ->tags. + * + * Set index to zero to bypass next_index overflow protection. + * See the comment in radix_tree_next_chunk() for details. + */ + iter->index = 0; + iter->next_index = start; + return NULL; +} + +/** + * radix_tree_next_chunk - find next chunk of slots for iteration + * + * @root: radix tree root + * @iter: iterator state + * @flags: RADIX_TREE_ITER_* flags and tag index + * Returns: pointer to chunk first slot, or NULL if there no more left + * + * This function looks up the next chunk in the radix tree starting from + * @iter->next_index. It returns a pointer to the chunk's first slot. + * Also it fills @iter with data about chunk: position in the tree (index), + * its end (next_index), and constructs a bit mask for tagged iterating (tags). 
+ */ +void **radix_tree_next_chunk(struct radix_tree_root *root, + struct radix_tree_iter *iter, unsigned flags); + +/** + * radix_tree_chunk_size - get current chunk size + * + * @iter: pointer to radix tree iterator + * Returns: current chunk size + */ +static __always_inline unsigned +radix_tree_chunk_size(struct radix_tree_iter *iter) +{ + return iter->next_index - iter->index; +} + +/** + * radix_tree_next_slot - find next slot in chunk + * + * @slot: pointer to current slot + * @iter: pointer to interator state + * @flags: RADIX_TREE_ITER_*, should be constant + * Returns: pointer to next slot, or NULL if there no more left + * + * This function updates @iter->index in the case of a successful lookup. + * For tagged lookup it also eats @iter->tags. + */ +static __always_inline void ** +radix_tree_next_slot(void **slot, struct radix_tree_iter *iter, unsigned flags) +{ + if (flags & RADIX_TREE_ITER_TAGGED) { + iter->tags >>= 1; + if (likely(iter->tags & 1ul)) { + iter->index++; + return slot + 1; + } + if (!(flags & RADIX_TREE_ITER_CONTIG) && likely(iter->tags)) { + unsigned offset = __ffs(iter->tags); + + iter->tags >>= offset; + iter->index += offset + 1; + return slot + offset + 1; + } + } else { + unsigned size = radix_tree_chunk_size(iter) - 1; + + while (size--) { + slot++; + iter->index++; + if (likely(*slot)) + return slot; + if (flags & RADIX_TREE_ITER_CONTIG) + break; + } + } + return NULL; +} + +/** + * radix_tree_for_each_chunk - iterate over chunks + * + * @slot: the void** variable for pointer to chunk first slot + * @root: the struct radix_tree_root pointer + * @iter: the struct radix_tree_iter pointer + * @start: iteration starting index + * @flags: RADIX_TREE_ITER_* and tag index + * + * Locks can be released and reacquired between iterations. 
+ */ +#define radix_tree_for_each_chunk(slot, root, iter, start, flags) \ + for (slot = radix_tree_iter_init(iter, start) ; \ + (slot = radix_tree_next_chunk(root, iter, flags)) ;) + +/** + * radix_tree_for_each_chunk_slot - iterate over slots in one chunk + * + * @slot: the void** variable, at the beginning points to chunk first slot + * @iter: the struct radix_tree_iter pointer + * @flags: RADIX_TREE_ITER_*, should be constant + * + * This macro is designed to be nested inside radix_tree_for_each_chunk(). + * @slot points to the radix tree slot, @iter->index contains its index. + */ +#define radix_tree_for_each_chunk_slot(slot, iter, flags) \ + for (; slot ; slot = radix_tree_next_slot(slot, iter, flags)) + +/** + * radix_tree_for_each_slot - iterate over non-empty slots + * + * @slot: the void** variable for pointer to slot + * @root: the struct radix_tree_root pointer + * @iter: the struct radix_tree_iter pointer + * @start: iteration starting index + * + * @slot points to radix tree slot, @iter->index contains its index. + */ +#define radix_tree_for_each_slot(slot, root, iter, start) \ + for (slot = radix_tree_iter_init(iter, start) ; \ + slot || (slot = radix_tree_next_chunk(root, iter, 0)) ; \ + slot = radix_tree_next_slot(slot, iter, 0)) + +/** + * radix_tree_for_each_contig - iterate over contiguous slots + * + * @slot: the void** variable for pointer to slot + * @root: the struct radix_tree_root pointer + * @iter: the struct radix_tree_iter pointer + * @start: iteration starting index + * + * @slot points to radix tree slot, @iter->index contains its index. 
+ */ +#define radix_tree_for_each_contig(slot, root, iter, start) \ + for (slot = radix_tree_iter_init(iter, start) ; \ + slot || (slot = radix_tree_next_chunk(root, iter, \ + RADIX_TREE_ITER_CONTIG)) ; \ + slot = radix_tree_next_slot(slot, iter, \ + RADIX_TREE_ITER_CONTIG)) + +/** + * radix_tree_for_each_tagged - iterate over tagged slots + * + * @slot: the void** variable for pointer to slot + * @root: the struct radix_tree_root pointer + * @iter: the struct radix_tree_iter pointer + * @start: iteration starting index + * @tag: tag index + * + * @slot points to radix tree slot, @iter->index contains its index. + */ +#define radix_tree_for_each_tagged(slot, root, iter, start, tag) \ + for (slot = radix_tree_iter_init(iter, start) ; \ + slot || (slot = radix_tree_next_chunk(root, iter, \ + RADIX_TREE_ITER_TAGGED | tag)) ; \ + slot = radix_tree_next_slot(slot, iter, \ + RADIX_TREE_ITER_TAGGED)) + #endif /* _LINUX_RADIX_TREE_H */ diff --git a/lib/radix-tree.c b/lib/radix-tree.c index 3e69c2b66c94..fefa76e6ff96 100644 --- a/lib/radix-tree.c +++ b/lib/radix-tree.c @@ -3,6 +3,7 @@ * Portions Copyright (C) 2001 Christoph Hellwig * Copyright (C) 2005 SGI, Christoph Lameter * Copyright (C) 2006 Nick Piggin + * Copyright (C) 2012 Konstantin Khlebnikov * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as @@ -146,6 +147,43 @@ static inline int any_tag_set(struct radix_tree_node *node, unsigned int tag) } return 0; } + +/** + * radix_tree_find_next_bit - find the next set bit in a memory region + * + * @addr: The address to base the search on + * @size: The bitmap size in bits + * @offset: The bitnumber to start searching at + * + * Unrollable variant of find_next_bit() for constant size arrays. + * Tail bits starting from size to roundup(size, BITS_PER_LONG) must be zero. + * Returns next bit offset, or size if nothing found. 
+ */ +static __always_inline unsigned long +radix_tree_find_next_bit(const unsigned long *addr, + unsigned long size, unsigned long offset) +{ + if (!__builtin_constant_p(size)) + return find_next_bit(addr, size, offset); + + if (offset < size) { + unsigned long tmp; + + addr += offset / BITS_PER_LONG; + tmp = *addr >> (offset % BITS_PER_LONG); + if (tmp) + return __ffs(tmp) + offset; + offset = (offset + BITS_PER_LONG) & ~(BITS_PER_LONG - 1); + while (offset < size) { + tmp = *++addr; + if (tmp) + return __ffs(tmp) + offset; + offset += BITS_PER_LONG; + } + } + return size; +} + /* * This assumes that the caller has performed appropriate preallocation, and * that the caller has pinned this thread of control to the current CPU. @@ -612,6 +650,119 @@ int radix_tree_tag_get(struct radix_tree_root *root, } EXPORT_SYMBOL(radix_tree_tag_get); +/** + * radix_tree_next_chunk - find next chunk of slots for iteration + * + * @root: radix tree root + * @iter: iterator state + * @flags: RADIX_TREE_ITER_* flags and tag index + * Returns: pointer to chunk first slot, or NULL if iteration is over + */ +void **radix_tree_next_chunk(struct radix_tree_root *root, + struct radix_tree_iter *iter, unsigned flags) +{ + unsigned shift, tag = flags & RADIX_TREE_ITER_TAG_MASK; + struct radix_tree_node *rnode, *node; + unsigned long index, offset; + + if ((flags & RADIX_TREE_ITER_TAGGED) && !root_tag_get(root, tag)) + return NULL; + + /* + * Catch next_index overflow after ~0UL. iter->index never overflows + * during iterating; it can be zero only at the beginning. + * And we cannot overflow iter->next_index in a single step, + * because RADIX_TREE_MAP_SHIFT < BITS_PER_LONG. 
+ */ + index = iter->next_index; + if (!index && iter->index) + return NULL; + + rnode = rcu_dereference_raw(root->rnode); + if (radix_tree_is_indirect_ptr(rnode)) { + rnode = indirect_to_ptr(rnode); + } else if (rnode && !index) { + /* Single-slot tree */ + iter->index = 0; + iter->next_index = 1; + iter->tags = 1; + return (void **)&root->rnode; + } else + return NULL; + +restart: + shift = (rnode->height - 1) * RADIX_TREE_MAP_SHIFT; + offset = index >> shift; + + /* Index outside of the tree */ + if (offset >= RADIX_TREE_MAP_SIZE) + return NULL; + + node = rnode; + while (1) { + if ((flags & RADIX_TREE_ITER_TAGGED) ? + !test_bit(offset, node->tags[tag]) : + !node->slots[offset]) { + /* Hole detected */ + if (flags & RADIX_TREE_ITER_CONTIG) + return NULL; + + if (flags & RADIX_TREE_ITER_TAGGED) + offset = radix_tree_find_next_bit( + node->tags[tag], + RADIX_TREE_MAP_SIZE, + offset + 1); + else + while (++offset < RADIX_TREE_MAP_SIZE) { + if (node->slots[offset]) + break; + } + index &= ~((RADIX_TREE_MAP_SIZE << shift) - 1); + index += offset << shift; + /* Overflow after ~0UL */ + if (!index) + return NULL; + if (offset == RADIX_TREE_MAP_SIZE) + goto restart; + } + + /* This is leaf-node */ + if (!shift) + break; + + node = rcu_dereference_raw(node->slots[offset]); + if (node == NULL) + goto restart; + shift -= RADIX_TREE_MAP_SHIFT; + offset = (index >> shift) & RADIX_TREE_MAP_MASK; + } + + /* Update the iterator state */ + iter->index = index; + iter->next_index = (index | RADIX_TREE_MAP_MASK) + 1; + + /* Construct iter->tags bit-mask from node->tags[tag] array */ + if (flags & RADIX_TREE_ITER_TAGGED) { + unsigned tag_long, tag_bit; + + tag_long = offset / BITS_PER_LONG; + tag_bit = offset % BITS_PER_LONG; + iter->tags = node->tags[tag][tag_long] >> tag_bit; + /* This never happens if RADIX_TREE_TAG_LONGS == 1 */ + if (tag_long < RADIX_TREE_TAG_LONGS - 1) { + /* Pick tags from next element */ + if (tag_bit) + iter->tags |= node->tags[tag][tag_long + 1] << + 
(BITS_PER_LONG - tag_bit); + /* Clip chunk size, here only BITS_PER_LONG tags */ + iter->next_index = index + BITS_PER_LONG; + } + } + + return node->slots + offset; +} +EXPORT_SYMBOL(radix_tree_next_chunk); + /** * radix_tree_range_tag_if_tagged - for each item in given range set given * tag if item has another tag set From cebbd29e1c2f7a969919f19f74583070840163d7 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Wed, 28 Mar 2012 14:42:53 -0700 Subject: [PATCH 27/35] radix-tree: rewrite gang lookup using iterator Rewrite radix_tree_gang_lookup_* functions using the new radix-tree iterator. Signed-off-by: Konstantin Khlebnikov Tested-by: Hugh Dickins Cc: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/radix-tree.c | 291 ++++++----------------------------------------- 1 file changed, 33 insertions(+), 258 deletions(-) diff --git a/lib/radix-tree.c b/lib/radix-tree.c index fefa76e6ff96..86516f5588e3 100644 --- a/lib/radix-tree.c +++ b/lib/radix-tree.c @@ -968,57 +968,6 @@ unsigned long radix_tree_prev_hole(struct radix_tree_root *root, } EXPORT_SYMBOL(radix_tree_prev_hole); -static unsigned int -__lookup(struct radix_tree_node *slot, void ***results, unsigned long *indices, - unsigned long index, unsigned int max_items, unsigned long *next_index) -{ - unsigned int nr_found = 0; - unsigned int shift, height; - unsigned long i; - - height = slot->height; - if (height == 0) - goto out; - shift = (height-1) * RADIX_TREE_MAP_SHIFT; - - for ( ; height > 1; height--) { - i = (index >> shift) & RADIX_TREE_MAP_MASK; - for (;;) { - if (slot->slots[i] != NULL) - break; - index &= ~((1UL << shift) - 1); - index += 1UL << shift; - if (index == 0) - goto out; /* 32-bit wraparound */ - i++; - if (i == RADIX_TREE_MAP_SIZE) - goto out; - } - - shift -= RADIX_TREE_MAP_SHIFT; - slot = rcu_dereference_raw(slot->slots[i]); - if (slot == NULL) - goto out; - } - - /* Bottom level: grab some items */ - for (i = index & RADIX_TREE_MAP_MASK; i < 
RADIX_TREE_MAP_SIZE; i++) { - if (slot->slots[i]) { - results[nr_found] = &(slot->slots[i]); - if (indices) - indices[nr_found] = index; - if (++nr_found == max_items) { - index++; - goto out; - } - } - index++; - } -out: - *next_index = index; - return nr_found; -} - /** * radix_tree_gang_lookup - perform multiple lookup on a radix tree * @root: radix tree root @@ -1042,48 +991,19 @@ unsigned int radix_tree_gang_lookup(struct radix_tree_root *root, void **results, unsigned long first_index, unsigned int max_items) { - unsigned long max_index; - struct radix_tree_node *node; - unsigned long cur_index = first_index; - unsigned int ret; + struct radix_tree_iter iter; + void **slot; + unsigned int ret = 0; - node = rcu_dereference_raw(root->rnode); - if (!node) + if (unlikely(!max_items)) return 0; - if (!radix_tree_is_indirect_ptr(node)) { - if (first_index > 0) - return 0; - results[0] = node; - return 1; - } - node = indirect_to_ptr(node); - - max_index = radix_tree_maxindex(node->height); - - ret = 0; - while (ret < max_items) { - unsigned int nr_found, slots_found, i; - unsigned long next_index; /* Index of next search */ - - if (cur_index > max_index) + radix_tree_for_each_slot(slot, root, &iter, first_index) { + results[ret] = indirect_to_ptr(rcu_dereference_raw(*slot)); + if (!results[ret]) + continue; + if (++ret == max_items) break; - slots_found = __lookup(node, (void ***)results + ret, NULL, - cur_index, max_items - ret, &next_index); - nr_found = 0; - for (i = 0; i < slots_found; i++) { - struct radix_tree_node *slot; - slot = *(((void ***)results)[ret + i]); - if (!slot) - continue; - results[ret + nr_found] = - indirect_to_ptr(rcu_dereference_raw(slot)); - nr_found++; - } - ret += nr_found; - if (next_index == 0) - break; - cur_index = next_index; } return ret; @@ -1113,112 +1033,25 @@ radix_tree_gang_lookup_slot(struct radix_tree_root *root, void ***results, unsigned long *indices, unsigned long first_index, unsigned int max_items) { - unsigned long 
max_index; - struct radix_tree_node *node; - unsigned long cur_index = first_index; - unsigned int ret; + struct radix_tree_iter iter; + void **slot; + unsigned int ret = 0; - node = rcu_dereference_raw(root->rnode); - if (!node) + if (unlikely(!max_items)) return 0; - if (!radix_tree_is_indirect_ptr(node)) { - if (first_index > 0) - return 0; - results[0] = (void **)&root->rnode; + radix_tree_for_each_slot(slot, root, &iter, first_index) { + results[ret] = slot; if (indices) - indices[0] = 0; - return 1; - } - node = indirect_to_ptr(node); - - max_index = radix_tree_maxindex(node->height); - - ret = 0; - while (ret < max_items) { - unsigned int slots_found; - unsigned long next_index; /* Index of next search */ - - if (cur_index > max_index) + indices[ret] = iter.index; + if (++ret == max_items) break; - slots_found = __lookup(node, results + ret, - indices ? indices + ret : NULL, - cur_index, max_items - ret, &next_index); - ret += slots_found; - if (next_index == 0) - break; - cur_index = next_index; } return ret; } EXPORT_SYMBOL(radix_tree_gang_lookup_slot); -/* - * FIXME: the two tag_get()s here should use find_next_bit() instead of - * open-coding the search. 
- */ -static unsigned int -__lookup_tag(struct radix_tree_node *slot, void ***results, unsigned long index, - unsigned int max_items, unsigned long *next_index, unsigned int tag) -{ - unsigned int nr_found = 0; - unsigned int shift, height; - - height = slot->height; - if (height == 0) - goto out; - shift = (height-1) * RADIX_TREE_MAP_SHIFT; - - while (height > 0) { - unsigned long i = (index >> shift) & RADIX_TREE_MAP_MASK ; - - for (;;) { - if (tag_get(slot, tag, i)) - break; - index &= ~((1UL << shift) - 1); - index += 1UL << shift; - if (index == 0) - goto out; /* 32-bit wraparound */ - i++; - if (i == RADIX_TREE_MAP_SIZE) - goto out; - } - height--; - if (height == 0) { /* Bottom level: grab some items */ - unsigned long j = index & RADIX_TREE_MAP_MASK; - - for ( ; j < RADIX_TREE_MAP_SIZE; j++) { - index++; - if (!tag_get(slot, tag, j)) - continue; - /* - * Even though the tag was found set, we need to - * recheck that we have a non-NULL node, because - * if this lookup is lockless, it may have been - * subsequently deleted. - * - * Similar care must be taken in any place that - * lookup ->slots[x] without a lock (ie. can't - * rely on its value remaining the same). 
- */ - if (slot->slots[j]) { - results[nr_found++] = &(slot->slots[j]); - if (nr_found == max_items) - goto out; - } - } - } - shift -= RADIX_TREE_MAP_SHIFT; - slot = rcu_dereference_raw(slot->slots[i]); - if (slot == NULL) - break; - } -out: - *next_index = index; - return nr_found; -} - /** * radix_tree_gang_lookup_tag - perform multiple lookup on a radix tree * based on a tag @@ -1237,52 +1070,19 @@ radix_tree_gang_lookup_tag(struct radix_tree_root *root, void **results, unsigned long first_index, unsigned int max_items, unsigned int tag) { - struct radix_tree_node *node; - unsigned long max_index; - unsigned long cur_index = first_index; - unsigned int ret; + struct radix_tree_iter iter; + void **slot; + unsigned int ret = 0; - /* check the root's tag bit */ - if (!root_tag_get(root, tag)) + if (unlikely(!max_items)) return 0; - node = rcu_dereference_raw(root->rnode); - if (!node) - return 0; - - if (!radix_tree_is_indirect_ptr(node)) { - if (first_index > 0) - return 0; - results[0] = node; - return 1; - } - node = indirect_to_ptr(node); - - max_index = radix_tree_maxindex(node->height); - - ret = 0; - while (ret < max_items) { - unsigned int nr_found, slots_found, i; - unsigned long next_index; /* Index of next search */ - - if (cur_index > max_index) + radix_tree_for_each_tagged(slot, root, &iter, first_index, tag) { + results[ret] = indirect_to_ptr(rcu_dereference_raw(*slot)); + if (!results[ret]) + continue; + if (++ret == max_items) break; - slots_found = __lookup_tag(node, (void ***)results + ret, - cur_index, max_items - ret, &next_index, tag); - nr_found = 0; - for (i = 0; i < slots_found; i++) { - struct radix_tree_node *slot; - slot = *(((void ***)results)[ret + i]); - if (!slot) - continue; - results[ret + nr_found] = - indirect_to_ptr(rcu_dereference_raw(slot)); - nr_found++; - } - ret += nr_found; - if (next_index == 0) - break; - cur_index = next_index; } return ret; @@ -1307,42 +1107,17 @@ radix_tree_gang_lookup_tag_slot(struct radix_tree_root 
*root, void ***results, unsigned long first_index, unsigned int max_items, unsigned int tag) { - struct radix_tree_node *node; - unsigned long max_index; - unsigned long cur_index = first_index; - unsigned int ret; + struct radix_tree_iter iter; + void **slot; + unsigned int ret = 0; - /* check the root's tag bit */ - if (!root_tag_get(root, tag)) + if (unlikely(!max_items)) return 0; - node = rcu_dereference_raw(root->rnode); - if (!node) - return 0; - - if (!radix_tree_is_indirect_ptr(node)) { - if (first_index > 0) - return 0; - results[0] = (void **)&root->rnode; - return 1; - } - node = indirect_to_ptr(node); - - max_index = radix_tree_maxindex(node->height); - - ret = 0; - while (ret < max_items) { - unsigned int slots_found; - unsigned long next_index; /* Index of next search */ - - if (cur_index > max_index) + radix_tree_for_each_tagged(slot, root, &iter, first_index, tag) { + results[ret] = slot; + if (++ret == max_items) break; - slots_found = __lookup_tag(node, results + ret, - cur_index, max_items - ret, &next_index, tag); - ret += slots_found; - if (next_index == 0) - break; - cur_index = next_index; } return ret; From 0fc9d1040313047edf6a39fd4d7c7defdca97c62 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Wed, 28 Mar 2012 14:42:54 -0700 Subject: [PATCH 28/35] radix-tree: use iterators in find_get_pages* functions Replace radix_tree_gang_lookup_slot() and radix_tree_gang_lookup_tag_slot() in the page-cache lookup functions with direct iteration over the radix tree using the new iterator. This avoids the double scanning and pointer copying. The iterator doesn't stop after nr_pages page-get failures in a row; it continues the lookup until the end of the radix tree. Thus we can safely remove these restart conditions. Unfortunately, the old implementation didn't forbid nr_pages == 0; this corner case does not fit into the new code, so the patch adds an extra check at the beginning.
Signed-off-by: Konstantin Khlebnikov Tested-by: Hugh Dickins Cc: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 86 +++++++++++++++++++++++----------------------------- 1 file changed, 38 insertions(+), 48 deletions(-) diff --git a/mm/filemap.c b/mm/filemap.c index c3811bc6b9e3..79c4b2b0b14e 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -813,20 +813,19 @@ EXPORT_SYMBOL(find_or_create_page); unsigned find_get_pages(struct address_space *mapping, pgoff_t start, unsigned int nr_pages, struct page **pages) { - unsigned int i; - unsigned int ret; - unsigned int nr_found, nr_skip; + struct radix_tree_iter iter; + void **slot; + unsigned ret = 0; + + if (unlikely(!nr_pages)) + return 0; rcu_read_lock(); restart: - nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree, - (void ***)pages, NULL, start, nr_pages); - ret = 0; - nr_skip = 0; - for (i = 0; i < nr_found; i++) { + radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) { struct page *page; repeat: - page = radix_tree_deref_slot((void **)pages[i]); + page = radix_tree_deref_slot(slot); if (unlikely(!page)) continue; @@ -837,7 +836,7 @@ repeat: * when entry at index 0 moves out of or back * to root: none yet gotten, safe to restart. */ - WARN_ON(start | i); + WARN_ON(iter.index); goto restart; } /* @@ -845,7 +844,6 @@ repeat: * here as an exceptional entry: so skip over it - * we only reach this from invalidate_mapping_pages(). */ - nr_skip++; continue; } @@ -853,21 +851,16 @@ repeat: goto repeat; /* Has the page moved? */ - if (unlikely(page != *((void **)pages[i]))) { + if (unlikely(page != *slot)) { page_cache_release(page); goto repeat; } pages[ret] = page; - ret++; + if (++ret == nr_pages) + break; } - /* - * If all entries were removed before we could secure them, - * try again, because callers stop trying once 0 is returned. 
- */ - if (unlikely(!ret && nr_found > nr_skip)) - goto restart; rcu_read_unlock(); return ret; } @@ -887,21 +880,22 @@ repeat: unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index, unsigned int nr_pages, struct page **pages) { - unsigned int i; - unsigned int ret; - unsigned int nr_found; + struct radix_tree_iter iter; + void **slot; + unsigned int ret = 0; + + if (unlikely(!nr_pages)) + return 0; rcu_read_lock(); restart: - nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree, - (void ***)pages, NULL, index, nr_pages); - ret = 0; - for (i = 0; i < nr_found; i++) { + radix_tree_for_each_contig(slot, &mapping->page_tree, &iter, index) { struct page *page; repeat: - page = radix_tree_deref_slot((void **)pages[i]); + page = radix_tree_deref_slot(slot); + /* The hole, there no reason to continue */ if (unlikely(!page)) - continue; + break; if (radix_tree_exception(page)) { if (radix_tree_deref_retry(page)) { @@ -924,7 +918,7 @@ repeat: goto repeat; /* Has the page moved? */ - if (unlikely(page != *((void **)pages[i]))) { + if (unlikely(page != *slot)) { page_cache_release(page); goto repeat; } @@ -934,14 +928,14 @@ repeat: * otherwise we can get both false positives and false * negatives, which is just confusing to the caller. 
*/ - if (page->mapping == NULL || page->index != index) { + if (page->mapping == NULL || page->index != iter.index) { page_cache_release(page); break; } pages[ret] = page; - ret++; - index++; + if (++ret == nr_pages) + break; } rcu_read_unlock(); return ret; @@ -962,19 +956,20 @@ EXPORT_SYMBOL(find_get_pages_contig); unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index, int tag, unsigned int nr_pages, struct page **pages) { - unsigned int i; - unsigned int ret; - unsigned int nr_found; + struct radix_tree_iter iter; + void **slot; + unsigned ret = 0; + + if (unlikely(!nr_pages)) + return 0; rcu_read_lock(); restart: - nr_found = radix_tree_gang_lookup_tag_slot(&mapping->page_tree, - (void ***)pages, *index, nr_pages, tag); - ret = 0; - for (i = 0; i < nr_found; i++) { + radix_tree_for_each_tagged(slot, &mapping->page_tree, + &iter, *index, tag) { struct page *page; repeat: - page = radix_tree_deref_slot((void **)pages[i]); + page = radix_tree_deref_slot(slot); if (unlikely(!page)) continue; @@ -998,21 +993,16 @@ repeat: goto repeat; /* Has the page moved? */ - if (unlikely(page != *((void **)pages[i]))) { + if (unlikely(page != *slot)) { page_cache_release(page); goto repeat; } pages[ret] = page; - ret++; + if (++ret == nr_pages) + break; } - /* - * If all entries were removed before we could secure them, - * try again, because callers stop trying once 0 is returned. - */ - if (unlikely(!ret && nr_found)) - goto restart; rcu_read_unlock(); if (ret) From f467f7140339355978994ffcc23d569e7b4cea4d Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 28 Mar 2012 14:42:54 -0700 Subject: [PATCH 29/35] selftests: launch individual selftests from the main Makefile Remove the run_tests script and launch the selftests by calling "make run_tests" from the selftests top directory instead. This delegates to the Makefile in each selftest directory, where it is decided how to launch the local test. 
This removes the need to add each selftest directory to the now removed "run_tests" top script. Signed-off-by: Frederic Weisbecker Cc: Dave Young Cc: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- tools/testing/selftests/Makefile | 5 +++++ tools/testing/selftests/breakpoints/Makefile | 7 +++++-- tools/testing/selftests/run_tests | 8 -------- 3 files changed, 10 insertions(+), 10 deletions(-) delete mode 100644 tools/testing/selftests/run_tests diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile index 4ec84018cc13..b1119f0db518 100644 --- a/tools/testing/selftests/Makefile +++ b/tools/testing/selftests/Makefile @@ -5,6 +5,11 @@ all: make -C $$TARGET; \ done; +run_tests: + for TARGET in $(TARGETS); do \ + make -C $$TARGET run_tests; \ + done; + clean: for TARGET in $(TARGETS); do \ make -C $$TARGET clean; \ diff --git a/tools/testing/selftests/breakpoints/Makefile b/tools/testing/selftests/breakpoints/Makefile index f362722cdce7..931278035f5c 100644 --- a/tools/testing/selftests/breakpoints/Makefile +++ b/tools/testing/selftests/breakpoints/Makefile @@ -11,10 +11,13 @@ endif all: ifeq ($(ARCH),x86) - gcc breakpoint_test.c -o run_test + gcc breakpoint_test.c -o breakpoint_test else echo "Not an x86 target, can't build breakpoints selftests" endif +run_tests: + ./breakpoint_test + clean: - rm -fr run_test + rm -fr breakpoint_test diff --git a/tools/testing/selftests/run_tests b/tools/testing/selftests/run_tests deleted file mode 100644 index 320718a4e6bf..000000000000 --- a/tools/testing/selftests/run_tests +++ /dev/null @@ -1,8 +0,0 @@ -#!/bin/bash - -TARGETS=breakpoints - -for TARGET in $TARGETS -do - $TARGET/run_test -done From cab6b0560080c6da5107c5d7dbba6372f7b288ab Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 28 Mar 2012 14:42:54 -0700 Subject: [PATCH 30/35] selftests/Makefile: make `run_tests' depend on `all' So a "make run_tests" will build the tests before trying to run them. 
Acked-by: Frederic Weisbecker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- tools/testing/selftests/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile index b1119f0db518..9203cd77fc33 100644 --- a/tools/testing/selftests/Makefile +++ b/tools/testing/selftests/Makefile @@ -5,7 +5,7 @@ all: make -C $$TARGET; \ done; -run_tests: +run_tests: all for TARGET in $(TARGETS); do \ make -C $$TARGET run_tests; \ done; From c6dd897f3bfc54a44942d742d6dfa842e33d88e0 Mon Sep 17 00:00:00 2001 From: Dave Young Date: Wed, 28 Mar 2012 14:42:55 -0700 Subject: [PATCH 31/35] mm: move page-types.c from Documentation to tools/vm tools/ is the better place for vm tools which are used by many people. Moving them to tools also makes them available to more users instead of hiding them in the Documentation folder. This patch moves page-types.c to tools/vm/page-types.c. Also add a Makefile in tools/vm and fix two coding style problems: a) change const array to 'const char * const', b) change a space to tab for indent.
Signed-off-by: Dave Young Acked-by: Wu Fengguang Cc: Christoph Lameter Cc: Pekka Enberg Cc: Frederic Weisbecker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/vm/Makefile | 2 +- tools/vm/Makefile | 11 +++++++++++ {Documentation => tools}/vm/page-types.c | 6 +++--- 3 files changed, 15 insertions(+), 4 deletions(-) create mode 100644 tools/vm/Makefile rename {Documentation => tools}/vm/page-types.c (99%) diff --git a/Documentation/vm/Makefile b/Documentation/vm/Makefile index 3fa4d0668864..e538864bfc63 100644 --- a/Documentation/vm/Makefile +++ b/Documentation/vm/Makefile @@ -2,7 +2,7 @@ obj- := dummy.o # List of programs to build -hostprogs-y := page-types hugepage-mmap hugepage-shm map_hugetlb +hostprogs-y := hugepage-mmap hugepage-shm map_hugetlb # Tell kbuild to always build the programs always := $(hostprogs-y) diff --git a/tools/vm/Makefile b/tools/vm/Makefile new file mode 100644 index 000000000000..3823d4b1fa75 --- /dev/null +++ b/tools/vm/Makefile @@ -0,0 +1,11 @@ +# Makefile for vm tools + +CC = $(CROSS_COMPILE)gcc +CFLAGS = -Wall -Wextra + +all: page-types +%: %.c + $(CC) $(CFLAGS) -o $@ $^ + +clean: + $(RM) page-types diff --git a/Documentation/vm/page-types.c b/tools/vm/page-types.c similarity index 99% rename from Documentation/vm/page-types.c rename to tools/vm/page-types.c index 0b13f02d4059..7dab7b25b5c6 100644 --- a/Documentation/vm/page-types.c +++ b/tools/vm/page-types.c @@ -124,7 +124,7 @@ #define BIT(name) (1ULL << KPF_##name) #define BITS_COMPOUND (BIT(COMPOUND_HEAD) | BIT(COMPOUND_TAIL)) -static const char *page_flag_names[] = { +static const char * const page_flag_names[] = { [KPF_LOCKED] = "L:locked", [KPF_ERROR] = "E:error", [KPF_REFERENCED] = "R:referenced", @@ -166,7 +166,7 @@ static const char *page_flag_names[] = { }; -static const char *debugfs_known_mountpoints[] = { +static const char * const debugfs_known_mountpoints[] = { "/sys/kernel/debug", "/debug", 0, @@ -215,7 +215,7 @@ static int 
hwpoison_forget_fd; static unsigned long total_pages; static unsigned long nr_pages[HASH_SIZE]; -static uint64_t page_flags[HASH_SIZE]; +static uint64_t page_flags[HASH_SIZE]; /* From 63e315535abe0d820d0e3db4c06bc5de74aeefc8 Mon Sep 17 00:00:00 2001 From: Dave Young Date: Wed, 28 Mar 2012 14:42:55 -0700 Subject: [PATCH 32/35] mm: move slabinfo.c to tools/vm We have tools/vm/ folder for vm tools, so move slabinfo.c from tools/slub/ to tools/vm/ Signed-off-by: Dave Young Cc: Wu Fengguang Acked-by: Christoph Lameter Cc: Pekka Enberg Cc: Frederic Weisbecker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- tools/vm/Makefile | 4 ++-- tools/{slub => vm}/slabinfo.c | 0 2 files changed, 2 insertions(+), 2 deletions(-) rename tools/{slub => vm}/slabinfo.c (100%) diff --git a/tools/vm/Makefile b/tools/vm/Makefile index 3823d4b1fa75..8e30e5c40f8a 100644 --- a/tools/vm/Makefile +++ b/tools/vm/Makefile @@ -3,9 +3,9 @@ CC = $(CROSS_COMPILE)gcc CFLAGS = -Wall -Wextra -all: page-types +all: page-types slabinfo %: %.c $(CC) $(CFLAGS) -o $@ $^ clean: - $(RM) page-types + $(RM) page-types slabinfo diff --git a/tools/slub/slabinfo.c b/tools/vm/slabinfo.c similarity index 100% rename from tools/slub/slabinfo.c rename to tools/vm/slabinfo.c From f0f57b2b1488251970c25deea0ea150a8d0911ed Mon Sep 17 00:00:00 2001 From: Dave Young Date: Wed, 28 Mar 2012 14:42:56 -0700 Subject: [PATCH 33/35] mm: move hugepage test examples to tools/testing/selftests/vm hugepage-mmap.c, hugepage-shm.c and map_hugetlb.c in Documentation/vm are simple pass/fail tests, It's better to promote them to tools/testing/selftests. Thanks suggestion of Andrew Morton about this. They all need firstly setting up proper nr_hugepages and hugepage-mmap need to mount hugetlbfs. So I add a shell script run_vmtests to do such work which will call the three test programs and check the return value of them. Changes to original code including below: a. add run_vmtests script b. 
return an error when the bytes read back by read_bytes do not match the bytes written. c. coding style fixes: do not use assignment in if condition [akpm@linux-foundation.org: build the targets before trying to execute them] [akpm@linux-foundation.org: Documentation/vm/ no longer has a Makefile. Fixes "make clean"] Signed-off-by: Dave Young Cc: Wu Fengguang Cc: Christoph Lameter Cc: Pekka Enberg Cc: Frederic Weisbecker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/Makefile | 2 +- Documentation/vm/Makefile | 8 -- tools/testing/selftests/Makefile | 2 +- tools/testing/selftests/vm/Makefile | 14 ++++ .../testing/selftests}/vm/hugepage-mmap.c | 13 ++-- .../testing/selftests}/vm/hugepage-shm.c | 10 ++- .../testing/selftests}/vm/map_hugetlb.c | 10 ++- tools/testing/selftests/vm/run_vmtests | 77 +++++++++++++++++++ 8 files changed, 112 insertions(+), 24 deletions(-) delete mode 100644 Documentation/vm/Makefile create mode 100644 tools/testing/selftests/vm/Makefile rename {Documentation => tools/testing/selftests}/vm/hugepage-mmap.c (93%) rename {Documentation => tools/testing/selftests}/vm/hugepage-shm.c (94%) rename {Documentation => tools/testing/selftests}/vm/map_hugetlb.c (94%) create mode 100644 tools/testing/selftests/vm/run_vmtests diff --git a/Documentation/Makefile b/Documentation/Makefile index 9b4bc5c76f33..30b656ece7aa 100644 --- a/Documentation/Makefile +++ b/Documentation/Makefile @@ -1,3 +1,3 @@ obj-m := DocBook/ accounting/ auxdisplay/ connector/ \ filesystems/ filesystems/configfs/ ia64/ laptops/ networking/ \ - pcmcia/ spi/ timers/ vm/ watchdog/src/ + pcmcia/ spi/ timers/ watchdog/src/ diff --git a/Documentation/vm/Makefile b/Documentation/vm/Makefile deleted file mode 100644 index e538864bfc63..000000000000 --- a/Documentation/vm/Makefile +++ /dev/null @@ -1,8 +0,0 @@ -# kbuild trick to avoid linker error. Can be omitted if a module is built.
-obj- := dummy.o - -# List of programs to build -hostprogs-y := hugepage-mmap hugepage-shm map_hugetlb - -# Tell kbuild to always build the programs -always := $(hostprogs-y) diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile index 9203cd77fc33..28bc57ee757c 100644 --- a/tools/testing/selftests/Makefile +++ b/tools/testing/selftests/Makefile @@ -1,4 +1,4 @@ -TARGETS = breakpoints +TARGETS = breakpoints vm all: for TARGET in $(TARGETS); do \ diff --git a/tools/testing/selftests/vm/Makefile b/tools/testing/selftests/vm/Makefile new file mode 100644 index 000000000000..b336b24aa6c0 --- /dev/null +++ b/tools/testing/selftests/vm/Makefile @@ -0,0 +1,14 @@ +# Makefile for vm selftests + +CC = $(CROSS_COMPILE)gcc +CFLAGS = -Wall -Wextra + +all: hugepage-mmap hugepage-shm map_hugetlb +%: %.c + $(CC) $(CFLAGS) -o $@ $^ + +run_tests: all + /bin/sh ./run_vmtests + +clean: + $(RM) hugepage-mmap hugepage-shm map_hugetlb diff --git a/Documentation/vm/hugepage-mmap.c b/tools/testing/selftests/vm/hugepage-mmap.c similarity index 93% rename from Documentation/vm/hugepage-mmap.c rename to tools/testing/selftests/vm/hugepage-mmap.c index db0dd9a33d54..a10f310d2362 100644 --- a/Documentation/vm/hugepage-mmap.c +++ b/tools/testing/selftests/vm/hugepage-mmap.c @@ -22,7 +22,7 @@ #include #include -#define FILE_NAME "/mnt/hugepagefile" +#define FILE_NAME "huge/hugepagefile" #define LENGTH (256UL*1024*1024) #define PROTECTION (PROT_READ | PROT_WRITE) @@ -48,7 +48,7 @@ static void write_bytes(char *addr) *(addr + i) = (char)i; } -static void read_bytes(char *addr) +static int read_bytes(char *addr) { unsigned long i; @@ -56,14 +56,15 @@ static void read_bytes(char *addr) for (i = 0; i < LENGTH; i++) if (*(addr + i) != (char)i) { printf("Mismatch at %lu\n", i); - break; + return 1; } + return 0; } int main(void) { void *addr; - int fd; + int fd, ret; fd = open(FILE_NAME, O_CREAT | O_RDWR, 0755); if (fd < 0) { @@ -81,11 +82,11 @@ int main(void) printf("Returned 
address is %p\n", addr); check_bytes(addr); write_bytes(addr); - read_bytes(addr); + ret = read_bytes(addr); munmap(addr, LENGTH); close(fd); unlink(FILE_NAME); - return 0; + return ret; } diff --git a/Documentation/vm/hugepage-shm.c b/tools/testing/selftests/vm/hugepage-shm.c similarity index 94% rename from Documentation/vm/hugepage-shm.c rename to tools/testing/selftests/vm/hugepage-shm.c index 07956d8592c9..0d0ef4fc0c04 100644 --- a/Documentation/vm/hugepage-shm.c +++ b/tools/testing/selftests/vm/hugepage-shm.c @@ -57,8 +57,8 @@ int main(void) unsigned long i; char *shmaddr; - if ((shmid = shmget(2, LENGTH, - SHM_HUGETLB | IPC_CREAT | SHM_R | SHM_W)) < 0) { + shmid = shmget(2, LENGTH, SHM_HUGETLB | IPC_CREAT | SHM_R | SHM_W); + if (shmid < 0) { perror("shmget"); exit(1); } @@ -82,14 +82,16 @@ int main(void) dprintf("Starting the Check..."); for (i = 0; i < LENGTH; i++) - if (shmaddr[i] != (char)i) + if (shmaddr[i] != (char)i) { printf("\nIndex %lu mismatched\n", i); + exit(3); + } dprintf("Done.\n"); if (shmdt((const void *)shmaddr) != 0) { perror("Detach failure"); shmctl(shmid, IPC_RMID, NULL); - exit(3); + exit(4); } shmctl(shmid, IPC_RMID, NULL); diff --git a/Documentation/vm/map_hugetlb.c b/tools/testing/selftests/vm/map_hugetlb.c similarity index 94% rename from Documentation/vm/map_hugetlb.c rename to tools/testing/selftests/vm/map_hugetlb.c index eda1a6d3578a..ac56639dd4a9 100644 --- a/Documentation/vm/map_hugetlb.c +++ b/tools/testing/selftests/vm/map_hugetlb.c @@ -44,7 +44,7 @@ static void write_bytes(char *addr) *(addr + i) = (char)i; } -static void read_bytes(char *addr) +static int read_bytes(char *addr) { unsigned long i; @@ -52,13 +52,15 @@ static void read_bytes(char *addr) for (i = 0; i < LENGTH; i++) if (*(addr + i) != (char)i) { printf("Mismatch at %lu\n", i); - break; + return 1; } + return 0; } int main(void) { void *addr; + int ret; addr = mmap(ADDR, LENGTH, PROTECTION, FLAGS, 0, 0); if (addr == MAP_FAILED) { @@ -69,9 +71,9 @@ int 
main(void) printf("Returned address is %p\n", addr); check_bytes(addr); write_bytes(addr); - read_bytes(addr); + ret = read_bytes(addr); munmap(addr, LENGTH); - return 0; + return ret; } diff --git a/tools/testing/selftests/vm/run_vmtests b/tools/testing/selftests/vm/run_vmtests new file mode 100644 index 000000000000..8b40bd5e5cc2 --- /dev/null +++ b/tools/testing/selftests/vm/run_vmtests @@ -0,0 +1,77 @@ +#!/bin/bash +#please run as root + +#we need 256M, below is the size in kB +needmem=262144 +mnt=./huge + +#get pagesize and freepages from /proc/meminfo +while read name size unit; do + if [ "$name" = "HugePages_Free:" ]; then + freepgs=$size + fi + if [ "$name" = "Hugepagesize:" ]; then + pgsize=$size + fi +done < /proc/meminfo + +#set proper nr_hugepages +if [ -n "$freepgs" ] && [ -n "$pgsize" ]; then + nr_hugepgs=`cat /proc/sys/vm/nr_hugepages` + needpgs=`expr $needmem / $pgsize` + if [ $freepgs -lt $needpgs ]; then + lackpgs=$(( $needpgs - $freepgs )) + echo $(( $lackpgs + $nr_hugepgs )) > /proc/sys/vm/nr_hugepages + if [ $? -ne 0 ]; then + echo "Please run this test as root" + exit 1 + fi + fi +else + echo "no hugetlbfs support in kernel?" + exit 1 +fi + +mkdir $mnt +mount -t hugetlbfs none $mnt + +echo "--------------------" +echo "runing hugepage-mmap" +echo "--------------------" +./hugepage-mmap +if [ $? -ne 0 ]; then + echo "[FAIL]" +else + echo "[PASS]" +fi + +shmmax=`cat /proc/sys/kernel/shmmax` +shmall=`cat /proc/sys/kernel/shmall` +echo 268435456 > /proc/sys/kernel/shmmax +echo 4194304 > /proc/sys/kernel/shmall +echo "--------------------" +echo "runing hugepage-shm" +echo "--------------------" +./hugepage-shm +if [ $? -ne 0 ]; then + echo "[FAIL]" +else + echo "[PASS]" +fi +echo $shmmax > /proc/sys/kernel/shmmax +echo $shmall > /proc/sys/kernel/shmall + +echo "--------------------" +echo "runing map_hugetlb" +echo "--------------------" +./map_hugetlb +if [ $? 
-ne 0 ]; then + echo "[FAIL]" +else + echo "[PASS]" +fi + +#cleanup +umount $mnt +rm -rf $mnt +echo $nr_hugepgs > /proc/sys/vm/nr_hugepages From 82edb4baa762c98008fcea6393e85bffedab2b3c Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 28 Mar 2012 14:42:56 -0700 Subject: [PATCH 34/35] crc32: add help text for the algorithm select option Add help text to the crc32 algorithm selection option in Kconfig. Signed-off-by: Darrick J. Wong Reported-by: Stefan Richter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/Kconfig | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/lib/Kconfig b/lib/Kconfig index a0e5900a9d85..4a8aba2e5cc0 100644 --- a/lib/Kconfig +++ b/lib/Kconfig @@ -88,6 +88,10 @@ choice prompt "CRC32 implementation" depends on CRC32 default CRC32_SLICEBY8 + help + This option allows a kernel builder to override the default choice + of CRC32 algorithm. Choose the default ("slice by 8") unless you + know that you need one of the others. config CRC32_SLICEBY8 bool "Slice by 8 bytes" From 8da00edc1069f01c34510fa405dc15d96c090a3f Mon Sep 17 00:00:00 2001 From: Masanari Iida Date: Wed, 28 Mar 2012 14:42:56 -0700 Subject: [PATCH 35/35] backlight: fix typo in tosa_lcd.c Fix typo in drivers/video/backlight/tosa_lcd.c "tosa_lcd_reume" should be "tosa_lcd_resume". Signed-off-by: Masanari Iida Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/video/backlight/tosa_lcd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/video/backlight/tosa_lcd.c b/drivers/video/backlight/tosa_lcd.c index a2161f631a83..2231aec23918 100644 --- a/drivers/video/backlight/tosa_lcd.c +++ b/drivers/video/backlight/tosa_lcd.c @@ -271,7 +271,7 @@ static int tosa_lcd_resume(struct spi_device *spi) } #else #define tosa_lcd_suspend NULL -#define tosa_lcd_reume NULL +#define tosa_lcd_resume NULL #endif static struct spi_driver tosa_lcd_driver = {