From fb50b020c5331c8c4bee0eb875865f5f8be6c03a Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Fri, 16 Nov 2012 13:53:09 -0800 Subject: [PATCH 001/105] x86: Move some contents of page_64_types.h into pgtable_64.h and page_64.h This patch is meant to clean up the fact that we have several functions in page_64_types.h which really don't belong there. I found this issue when I tried to replace __phys_addr with an inline function. It resulted in the realmode bits generating compile warnings about types. In order to resolve that I am relocating the address translation to page_64.h, since this is in keeping with where these functions are located on 32-bit. In addition I have relocated the declarations for several functions defined in init_64.c to pgtable_64.h, as this seems to be where most of the functions related to memory initialization were already located. [ hpa: added missing #include to apic_numachip.c, as reported by Yinghai Lu. ] Signed-off-by: Alexander Duyck Link: http://lkml.kernel.org/r/20121116215244.8521.31505.stgit@ahduyck-cp1.jf.intel.com Signed-off-by: H. Peter Anvin Cc: Yinghai Lu Cc: Daniel J Blueman --- arch/x86/include/asm/page_64.h | 19 +++++++++++++++++++ arch/x86/include/asm/page_64_types.h | 22 ---------------------- arch/x86/include/asm/pgtable_64.h | 5 +++++ arch/x86/kernel/apic/apic_numachip.c | 1 + 4 files changed, 25 insertions(+), 22 deletions(-) diff --git a/arch/x86/include/asm/page_64.h b/arch/x86/include/asm/page_64.h index 072694ed81a5..4150999fc20e 100644 --- a/arch/x86/include/asm/page_64.h +++ b/arch/x86/include/asm/page_64.h @@ -3,4 +3,23 @@ #include <asm/page_64_types.h> +#ifndef __ASSEMBLY__ + +/* duplicated to the one in bootmem.h */ +extern unsigned long max_pfn; +extern unsigned long phys_base; + +extern unsigned long __phys_addr(unsigned long); + +#define __phys_reloc_hide(x) (x) + +#ifdef CONFIG_FLATMEM +#define pfn_valid(pfn) ((pfn) < max_pfn) +#endif + +void clear_page(void *page); +void copy_page(void *to, void *from); + +#endif /* !__ASSEMBLY__ */ + #endif /* _ASM_X86_PAGE_64_H */ diff --git a/arch/x86/include/asm/page_64_types.h b/arch/x86/include/asm/page_64_types.h index 320f7bb95f76..8b491e66eaa8 100644 --- a/arch/x86/include/asm/page_64_types.h +++ b/arch/x86/include/asm/page_64_types.h @@ -50,26 +50,4 @@ #define KERNEL_IMAGE_SIZE (512 * 1024 * 1024) #define KERNEL_IMAGE_START _AC(0xffffffff80000000, UL) -#ifndef __ASSEMBLY__ -void clear_page(void *page); -void copy_page(void *to, void *from); - -/* duplicated to the one in bootmem.h */ -extern unsigned long max_pfn; -extern unsigned long phys_base; - -extern unsigned long __phys_addr(unsigned long); -#define __phys_reloc_hide(x) (x) - -#define vmemmap ((struct page *)VMEMMAP_START) - -extern void init_extra_mapping_uc(unsigned long phys, unsigned long size); -extern void init_extra_mapping_wb(unsigned long phys, unsigned long size); - -#endif /* !__ASSEMBLY__ */ - -#ifdef CONFIG_FLATMEM -#define pfn_valid(pfn) ((pfn) < max_pfn) -#endif - #endif /* _ASM_X86_PAGE_64_DEFS_H */ diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h index 47356f9df82e..b5d30ad39022 100644 --- a/arch/x86/include/asm/pgtable_64.h +++ b/arch/x86/include/asm/pgtable_64.h @@ -183,6 +183,11 @@ extern void cleanup_highmap(void); #define __HAVE_ARCH_PTE_SAME +#define vmemmap ((struct page *)VMEMMAP_START) + +extern void init_extra_mapping_uc(unsigned long phys, unsigned long size); +extern void init_extra_mapping_wb(unsigned long phys, unsigned long size); + #endif /* !__ASSEMBLY__ */ #endif /* _ASM_X86_PGTABLE_64_H */ diff
--git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c index a65829ac2b9a..ae9196f31261 100644 --- a/arch/x86/kernel/apic/apic_numachip.c +++ b/arch/x86/kernel/apic/apic_numachip.c @@ -27,6 +27,7 @@ #include #include #include +#include <asm/pgtable.h> static int numachip_system __read_mostly;

From 0bdf525f04afd3a32c14e5a8778771f9c9e0f074 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Fri, 16 Nov 2012 13:53:51 -0800 Subject: [PATCH 002/105] x86: Improve __phys_addr performance by making use of carry flags and inlining This patch is meant to improve overall system performance when making use of the __phys_addr call. To do this I have implemented several changes. First, if CONFIG_DEBUG_VIRTUAL is not defined, __phys_addr is made an inline, similar to how this is currently handled on 32-bit. However, in order to do this it is required to export phys_base so that it is available if __phys_addr is used in kernel modules. The second change is to streamline the code by making use of the carry flag on an add operation instead of performing a compare on a 64-bit value. The advantage of this is that it allows us to significantly reduce the overall size of the call. On my Xeon E5 system the entire __phys_addr inline call consumes a little less than 32 bytes and 5 instructions. I also applied similar logic to the debug version of the function. My testing shows that the debug version of the function with this patch applied is slightly faster than the non-debug version without the patch. Finally I also applied the same logic changes to __virt_addr_valid, since it used the same general code flow as __phys_addr and could achieve similar gains through these changes. Signed-off-by: Alexander Duyck Link: http://lkml.kernel.org/r/20121116215315.8521.46270.stgit@ahduyck-cp1.jf.intel.com Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/page_64.h | 14 +++++++++++ arch/x86/kernel/x8664_ksyms_64.c | 3 +++ arch/x86/mm/physaddr.c | 40 ++++++++++++++++++++------------ 3 files changed, 42 insertions(+), 15 deletions(-) diff --git a/arch/x86/include/asm/page_64.h b/arch/x86/include/asm/page_64.h index 4150999fc20e..5138174c2101 100644 --- a/arch/x86/include/asm/page_64.h +++ b/arch/x86/include/asm/page_64.h @@ -9,7 +9,21 @@ extern unsigned long max_pfn; extern unsigned long phys_base; +static inline unsigned long __phys_addr_nodebug(unsigned long x) +{ + unsigned long y = x - __START_KERNEL_map; + + /* use the carry flag to determine if x was < __START_KERNEL_map */ + x = y + ((x > y) ? phys_base : (__START_KERNEL_map - PAGE_OFFSET)); + + return x; +} + +#ifdef CONFIG_DEBUG_VIRTUAL extern unsigned long __phys_addr(unsigned long); +#else +#define __phys_addr(x) __phys_addr_nodebug(x) +#endif #define __phys_reloc_hide(x) (x) diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c index 1330dd102950..b014d9414d08 100644 --- a/arch/x86/kernel/x8664_ksyms_64.c +++ b/arch/x86/kernel/x8664_ksyms_64.c @@ -59,6 +59,9 @@ EXPORT_SYMBOL(memcpy); EXPORT_SYMBOL(__memcpy); EXPORT_SYMBOL(memmove); +#ifndef CONFIG_DEBUG_VIRTUAL +EXPORT_SYMBOL(phys_base); +#endif EXPORT_SYMBOL(empty_zero_page); #ifndef CONFIG_PARAVIRT EXPORT_SYMBOL(native_load_gs_index); diff --git a/arch/x86/mm/physaddr.c b/arch/x86/mm/physaddr.c index d2e2735327b4..fd40d75109ef 100644 --- a/arch/x86/mm/physaddr.c +++ b/arch/x86/mm/physaddr.c @@ -8,33 +8,43 @@ #ifdef CONFIG_X86_64 +#ifdef CONFIG_DEBUG_VIRTUAL unsigned long __phys_addr(unsigned long x) { - if (x >= __START_KERNEL_map) { - x -= __START_KERNEL_map; - VIRTUAL_BUG_ON(x >= KERNEL_IMAGE_SIZE); - x += phys_base; + unsigned long y = x - __START_KERNEL_map; + + /* use the carry flag to determine if x was < __START_KERNEL_map */ + if (unlikely(x > y)) { + x = y + phys_base; + + VIRTUAL_BUG_ON(y >= KERNEL_IMAGE_SIZE); } else { - VIRTUAL_BUG_ON(x < PAGE_OFFSET); - x -= PAGE_OFFSET; - VIRTUAL_BUG_ON(!phys_addr_valid(x)); + x = y + (__START_KERNEL_map - PAGE_OFFSET); + + /* carry flag will be set if starting x was >= PAGE_OFFSET */ + VIRTUAL_BUG_ON((x > y) || !phys_addr_valid(x)); } + return x; } EXPORT_SYMBOL(__phys_addr); +#endif bool __virt_addr_valid(unsigned long x) { - if (x >= __START_KERNEL_map) { - x -= __START_KERNEL_map; - if (x >= KERNEL_IMAGE_SIZE) + unsigned long y = x - __START_KERNEL_map; + + /* use the carry flag to determine if x was < __START_KERNEL_map */ + if (unlikely(x > y)) { + x = y + phys_base; + + if (y >= KERNEL_IMAGE_SIZE) return false; - x += phys_base; } else { - if (x < PAGE_OFFSET) - return false; - x -= PAGE_OFFSET; - if (!phys_addr_valid(x)) + x = y + (__START_KERNEL_map - PAGE_OFFSET); + + /* carry flag will be set if starting x was >= PAGE_OFFSET */ + if ((x > y) || !phys_addr_valid(x)) return false; }
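[ Editorial aside: the carry-flag trick above is plain unsigned arithmetic and can be sanity-checked in user space. The sketch below is illustrative only; KERNEL_MAP, PAGE_OFF and PHYS_BASE are stand-in constants, not the real kernel layout, and a 64-bit unsigned long is assumed. ]

#include <assert.h>

#define KERNEL_MAP 0xffffffff80000000UL /* stand-in for __START_KERNEL_map */
#define PAGE_OFF   0xffff880000000000UL /* stand-in for PAGE_OFFSET */
#define PHYS_BASE  0x1000000UL          /* stand-in for phys_base */

/* Model of __phys_addr_nodebug(): after y = x - KERNEL_MAP, the unsigned
 * compare (x > y) is true exactly when x >= KERNEL_MAP, i.e. when the
 * subtraction did not wrap; that is the same condition the kernel gets
 * for free from the carry flag of a single add/sub.
 */
static unsigned long phys_addr_model(unsigned long x)
{
	unsigned long y = x - KERNEL_MAP;

	return y + ((x > y) ? PHYS_BASE : (KERNEL_MAP - PAGE_OFF));
}

int main(void)
{
	/* kernel text address: offset from KERNEL_MAP plus PHYS_BASE */
	assert(phys_addr_model(KERNEL_MAP + 0x2000) == 0x2000 + PHYS_BASE);
	/* direct mapping address: reduces to x - PAGE_OFF */
	assert(phys_addr_model(PAGE_OFF + 0x5000) == 0x5000);
	return 0;
}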
Once all supported versions of gcc understand it, we can * remove this Voodoo magic stuff. (i.e. once gcc3.x is deprecated) */ -#define __pa_symbol(x) __pa(__phys_reloc_hide((unsigned long)(x))) +#define __pa_symbol(x) \ + __phys_addr_symbol(__phys_reloc_hide((unsigned long)(x))) #define __va(x) ((void *)((unsigned long)(x)+PAGE_OFFSET)) diff --git a/arch/x86/include/asm/page_32.h b/arch/x86/include/asm/page_32.h index da4e762406f7..4d550d04b609 100644 --- a/arch/x86/include/asm/page_32.h +++ b/arch/x86/include/asm/page_32.h @@ -15,6 +15,7 @@ extern unsigned long __phys_addr(unsigned long); #else #define __phys_addr(x) __phys_addr_nodebug(x) #endif +#define __phys_addr_symbol(x) __phys_addr(x) #define __phys_reloc_hide(x) RELOC_HIDE((x), 0) #ifdef CONFIG_FLATMEM diff --git a/arch/x86/include/asm/page_64.h b/arch/x86/include/asm/page_64.h index 5138174c2101..0f1ddee6a0ce 100644 --- a/arch/x86/include/asm/page_64.h +++ b/arch/x86/include/asm/page_64.h @@ -21,8 +21,11 @@ static inline unsigned long __phys_addr_nodebug(unsigned long x) #ifdef CONFIG_DEBUG_VIRTUAL extern unsigned long __phys_addr(unsigned long); +extern unsigned long __phys_addr_symbol(unsigned long); #else #define __phys_addr(x) __phys_addr_nodebug(x) +#define __phys_addr_symbol(x) \ + ((unsigned long)(x) - __START_KERNEL_map + phys_base) #endif #define __phys_reloc_hide(x) (x) diff --git a/arch/x86/mm/physaddr.c b/arch/x86/mm/physaddr.c index fd40d75109ef..c73feddd518d 100644 --- a/arch/x86/mm/physaddr.c +++ b/arch/x86/mm/physaddr.c @@ -28,6 +28,17 @@ unsigned long __phys_addr(unsigned long x) return x; } EXPORT_SYMBOL(__phys_addr); + +unsigned long __phys_addr_symbol(unsigned long x) +{ + unsigned long y = x - __START_KERNEL_map; + + /* only check upper bounds since lower bounds will trigger carry */ + VIRTUAL_BUG_ON(y >= KERNEL_IMAGE_SIZE); + + return y + phys_base; +} +EXPORT_SYMBOL(__phys_addr_symbol); #endif bool __virt_addr_valid(unsigned long x) From 05a476b6e3795f205806662bf09ab95774266292 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Fri, 16 Nov 2012 13:56:35 -0800 Subject: [PATCH 004/105] x86: Drop 4 unnecessary calls to __pa_symbol While debugging the __pa_symbol inline patch I found that there were a couple spots where __pa_symbol was used as follows: __pa_symbol(x) - __pa_symbol(y) The compiler had reduced them to: x - y Since we also support a debug case where __pa_symbol is a function call it would probably be useful to just change the two cases I found so that they are always just treated as "x - y". As such I am casting the values to phys_addr_t and then doing simple subtraction so that the correct type and value is returned. Signed-off-by: Alexander Duyck Link: http://lkml.kernel.org/r/20121116215552.8521.68085.stgit@ahduyck-cp1.jf.intel.com Signed-off-by: H. 
Peter Anvin --- arch/x86/kernel/head32.c | 4 ++-- arch/x86/kernel/head64.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c index c18f59d10101..f15db0c40713 100644 --- a/arch/x86/kernel/head32.c +++ b/arch/x86/kernel/head32.c @@ -30,8 +30,8 @@ static void __init i386_default_early_setup(void) void __init i386_start_kernel(void) { - memblock_reserve(__pa_symbol(&_text), - __pa_symbol(&__bss_stop) - __pa_symbol(&_text)); + memblock_reserve(__pa_symbol(_text), + (phys_addr_t)__bss_stop - (phys_addr_t)_text); #ifdef CONFIG_BLK_DEV_INITRD /* Reserve INITRD */ diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 037df57a99ac..42f5df134341 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -97,8 +97,8 @@ void __init x86_64_start_reservations(char *real_mode_data) { copy_bootdata(__va(real_mode_data)); - memblock_reserve(__pa_symbol(&_text), - __pa_symbol(&__bss_stop) - __pa_symbol(&_text)); + memblock_reserve(__pa_symbol(_text), + (phys_addr_t)__bss_stop - (phys_addr_t)_text); #ifdef CONFIG_BLK_DEV_INITRD /* Reserve INITRD */ From fc8d782677f163dee76427fdd8a92bebd2b50b23 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Fri, 16 Nov 2012 13:57:13 -0800 Subject: [PATCH 005/105] x86: Use __pa_symbol instead of __pa on C visible symbols When I made an attempt at separating __pa_symbol and __pa I found that there were a number of cases where __pa was used on an obvious symbol. I also caught one non-obvious case as _brk_start and _brk_end are based on the address of __brk_base which is a C visible symbol. In mark_rodata_ro I was able to reduce the overhead of kernel symbol to virtual memory translation by using a combination of __va(__pa_symbol()) instead of page_address(virt_to_page()). Signed-off-by: Alexander Duyck Link: http://lkml.kernel.org/r/20121116215640.8521.80483.stgit@ahduyck-cp1.jf.intel.com Signed-off-by: H. 
Peter Anvin --- arch/x86/kernel/cpu/intel.c | 2 +- arch/x86/kernel/setup.c | 16 ++++++++-------- arch/x86/mm/init_64.c | 18 ++++++++---------- arch/x86/mm/pageattr.c | 8 ++++---- arch/x86/platform/efi/efi.c | 4 ++-- arch/x86/realmode/init.c | 8 ++++---- 6 files changed, 27 insertions(+), 29 deletions(-) diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 198e019a531a..2249e7e44521 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -168,7 +168,7 @@ int __cpuinit ppro_with_ram_bug(void) #ifdef CONFIG_X86_F00F_BUG static void __cpuinit trap_init_f00f_bug(void) { - __set_fixmap(FIX_F00F_IDT, __pa(&idt_table), PAGE_KERNEL_RO); + __set_fixmap(FIX_F00F_IDT, __pa_symbol(idt_table), PAGE_KERNEL_RO); /* * Update the IDT descriptor and reload the IDT so that diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index ca45696f30fb..2702c5d4acd2 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -300,8 +300,8 @@ static void __init cleanup_highmap(void) static void __init reserve_brk(void) { if (_brk_end > _brk_start) - memblock_reserve(__pa(_brk_start), - __pa(_brk_end) - __pa(_brk_start)); + memblock_reserve(__pa_symbol(_brk_start), + _brk_end - _brk_start); /* Mark brk area as locked down and no longer taking any new allocations */ @@ -761,12 +761,12 @@ void __init setup_arch(char **cmdline_p) init_mm.end_data = (unsigned long) _edata; init_mm.brk = _brk_end; - code_resource.start = virt_to_phys(_text); - code_resource.end = virt_to_phys(_etext)-1; - data_resource.start = virt_to_phys(_etext); - data_resource.end = virt_to_phys(_edata)-1; - bss_resource.start = virt_to_phys(&__bss_start); - bss_resource.end = virt_to_phys(&__bss_stop)-1; + code_resource.start = __pa_symbol(_text); + code_resource.end = __pa_symbol(_etext)-1; + data_resource.start = __pa_symbol(_etext); + data_resource.end = __pa_symbol(_edata)-1; + bss_resource.start = __pa_symbol(__bss_start); + bss_resource.end = __pa_symbol(__bss_stop)-1; #ifdef CONFIG_CMDLINE_BOOL #ifdef CONFIG_CMDLINE_OVERRIDE diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 3baff255adac..0374a10f4fb7 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -770,12 +770,10 @@ void set_kernel_text_ro(void) void mark_rodata_ro(void) { unsigned long start = PFN_ALIGN(_text); - unsigned long rodata_start = - ((unsigned long)__start_rodata + PAGE_SIZE - 1) & PAGE_MASK; + unsigned long rodata_start = PFN_ALIGN(__start_rodata); unsigned long end = (unsigned long) &__end_rodata_hpage_align; - unsigned long text_end = PAGE_ALIGN((unsigned long) &__stop___ex_table); - unsigned long rodata_end = PAGE_ALIGN((unsigned long) &__end_rodata); - unsigned long data_start = (unsigned long) &_sdata; + unsigned long text_end = PFN_ALIGN(&__stop___ex_table); + unsigned long rodata_end = PFN_ALIGN(&__end_rodata); printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n", (end - start) >> 10); @@ -800,12 +798,12 @@ void mark_rodata_ro(void) #endif free_init_pages("unused kernel memory", - (unsigned long) page_address(virt_to_page(text_end)), - (unsigned long) - page_address(virt_to_page(rodata_start))); + (unsigned long) __va(__pa_symbol(text_end)), + (unsigned long) __va(__pa_symbol(rodata_start))); + free_init_pages("unused kernel memory", - (unsigned long) page_address(virt_to_page(rodata_end)), - (unsigned long) page_address(virt_to_page(data_start))); + (unsigned long) __va(__pa_symbol(rodata_end)), + (unsigned long) __va(__pa_symbol(_sdata))); } #endif diff --git 
a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index a718e0d23503..40f92f3aea54 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -94,12 +94,12 @@ static inline void split_page_count(int level) { } static inline unsigned long highmap_start_pfn(void) { - return __pa(_text) >> PAGE_SHIFT; + return __pa_symbol(_text) >> PAGE_SHIFT; } static inline unsigned long highmap_end_pfn(void) { - return __pa(roundup(_brk_end, PMD_SIZE)) >> PAGE_SHIFT; + return __pa_symbol(roundup(_brk_end, PMD_SIZE)) >> PAGE_SHIFT; } #endif @@ -276,8 +276,8 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address, * The .rodata section needs to be read-only. Using the pfn * catches all aliases. */ - if (within(pfn, __pa((unsigned long)__start_rodata) >> PAGE_SHIFT, - __pa((unsigned long)__end_rodata) >> PAGE_SHIFT)) + if (within(pfn, __pa_symbol(__start_rodata) >> PAGE_SHIFT, + __pa_symbol(__end_rodata) >> PAGE_SHIFT)) pgprot_val(forbidden) |= _PAGE_RW; #if defined(CONFIG_X86_64) && defined(CONFIG_DEBUG_RODATA) diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index ad4439145f85..1b600266265e 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -410,8 +410,8 @@ void __init efi_reserve_boot_services(void) * - Not within any part of the kernel * - Not the bios reserved area */ - if ((start+size >= virt_to_phys(_text) && start <= virt_to_phys(_end)) || + if ((start+size >= __pa_symbol(_text) && start <= __pa_symbol(_end)) || !e820_all_mapped(start, start+size, E820_RAM) || memblock_is_region_reserved(start, size)) { /* Could not reserve, skip it */ diff --git a/arch/x86/realmode/init.c b/arch/x86/realmode/init.c index cbca565af5bd..80450261215c 100644 --- a/arch/x86/realmode/init.c +++ b/arch/x86/realmode/init.c @@ -62,9 +62,9 @@ void __init setup_real_mode(void) __va(real_mode_header->trampoline_header); #ifdef CONFIG_X86_32 - trampoline_header->start = __pa(startup_32_smp); + trampoline_header->start = __pa_symbol(startup_32_smp); trampoline_header->gdt_limit = __BOOT_DS + 7; - trampoline_header->gdt_base = __pa(boot_gdt); + trampoline_header->gdt_base = __pa_symbol(boot_gdt); #else /* * Some AMD processors will #GP(0) if EFER.LMA is set in WRMSR @@ -78,8 +78,8 @@ void __init setup_real_mode(void) *trampoline_cr4_features = read_cr4(); trampoline_pgd = (u64 *) __va(real_mode_header->trampoline_pgd); - trampoline_pgd[0] = __pa(level3_ident_pgt) + _KERNPG_TABLE; - trampoline_pgd[511] = __pa(level3_kernel_pgt) + _KERNPG_TABLE; + trampoline_pgd[0] = __pa_symbol(level3_ident_pgt) + _KERNPG_TABLE; + trampoline_pgd[511] = __pa_symbol(level3_kernel_pgt) + _KERNPG_TABLE; #endif } From 217f155e9fc68bf2a6c58a7b47e0d1ce68d78818 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Fri, 16 Nov 2012 13:57:32 -0800 Subject: [PATCH 006/105] x86/ftrace: Use __pa_symbol instead of __pa on C visible symbols Instead of using __pa, which is meant to be a general function for converting virtual addresses to physical addresses, we can use __pa_symbol, which is the preferred way of decoding kernel text virtual addresses to physical addresses. In this case we are not directly converting C visible symbols; however, if we know that the instruction pointer is somewhere between _text and _etext, we know that we are going to be translating an address from the kernel text space. Cc: Steven Rostedt Cc: Frederic Weisbecker Signed-off-by: Alexander Duyck Link: http://lkml.kernel.org/r/20121116215718.8521.24026.stgit@ahduyck-cp1.jf.intel.com Signed-off-by: H.
Peter Anvin --- arch/x86/kernel/ftrace.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 1d414029f1d8..42a392a9fd02 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -89,7 +89,7 @@ do_ftrace_mod_code(unsigned long ip, const void *new_code) * kernel identity mapping to modify code. */ if (within(ip, (unsigned long)_text, (unsigned long)_etext)) - ip = (unsigned long)__va(__pa(ip)); + ip = (unsigned long)__va(__pa_symbol(ip)); return probe_kernel_write((void *)ip, new_code, MCOUNT_INSN_SIZE); } @@ -279,7 +279,7 @@ static int ftrace_write(unsigned long ip, const char *val, int size) * kernel identity mapping to modify code. */ if (within(ip, (unsigned long)_text, (unsigned long)_etext)) - ip = (unsigned long)__va(__pa(ip)); + ip = (unsigned long)__va(__pa_symbol(ip)); return probe_kernel_write((void *)ip, val, size); } From afd51a0e32cd79261f0e823400886ed322a355ac Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Fri, 16 Nov 2012 13:57:43 -0800 Subject: [PATCH 007/105] x86/acpi: Use __pa_symbol instead of __pa on C visible symbols This change just updates one spot where __pa was being used when __pa_symbol should have been used. By using __pa_symbol we are able to drop a few extra lines of code as we don't have to test to see if the virtual pointer is a part of the kernel text or just standard virtual memory. Cc: Len Brown Cc: Pavel Machek Acked-by: "Rafael J. Wysocki" Signed-off-by: Alexander Duyck Link: http://lkml.kernel.org/r/20121116215737.8521.51167.stgit@ahduyck-cp1.jf.intel.com Signed-off-by: H. Peter Anvin --- arch/x86/kernel/acpi/sleep.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c index 11676cf65aee..f146a3c10814 100644 --- a/arch/x86/kernel/acpi/sleep.c +++ b/arch/x86/kernel/acpi/sleep.c @@ -69,7 +69,7 @@ int acpi_suspend_lowlevel(void) #ifndef CONFIG_64BIT header->pmode_entry = (u32)&wakeup_pmode_return; - header->pmode_cr3 = (u32)__pa(&initial_page_table); + header->pmode_cr3 = (u32)__pa_symbol(initial_page_table); saved_magic = 0x12345678; #else /* CONFIG_64BIT */ #ifdef CONFIG_SMP From 6a3956bd242926f8956992f6ed7805b0811be003 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Fri, 16 Nov 2012 13:58:12 -0800 Subject: [PATCH 008/105] x86/lguest: Use __pa_symbol instead of __pa on C visible symbols The function lguest_write_cr3 is using __pa to convert swapper_pg_dir and initial_page_table from virtual addresses to physical. The correct function to use for these values is __pa_symbol since they are C visible symbols. Cc: Rusty Russell Signed-off-by: Alexander Duyck Link: http://lkml.kernel.org/r/20121116215748.8521.83556.stgit@ahduyck-cp1.jf.intel.com Signed-off-by: H. 
Peter Anvin --- arch/x86/lguest/boot.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index 642d8805bc1b..139dd353c2f2 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -552,7 +552,8 @@ static void lguest_write_cr3(unsigned long cr3) current_cr3 = cr3; /* These two page tables are simple, linear, and used during boot */ - if (cr3 != __pa(swapper_pg_dir) && cr3 != __pa(initial_page_table)) + if (cr3 != __pa_symbol(swapper_pg_dir) && + cr3 != __pa_symbol(initial_page_table)) cr3_changed = true; }

From fa62aafea9e415cd1efd8c4054106112fe809f19 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 16 Nov 2012 19:38:38 -0800 Subject: [PATCH 009/105] x86, mm: Add global page_size_mask and probe one time only We currently pass around use_gbpages and use_pse for calculating the page table size. Later we will need to call init_memory_mapping for every RAM range one by one, which means those calculations would be done several times. That information is the same for all RAM ranges, so it can be stored in page_size_mask and probed one time only. Move that probing code out of init_memory_mapping into a separate function, probe_page_size_mask(), and call it before all of the init_memory_mapping calls. Suggested-by: Ingo Molnar Signed-off-by: Yinghai Lu Link: http://lkml.kernel.org/r/1353123563-3103-2-git-send-email-yinghai@kernel.org Reviewed-by: Pekka Enberg Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/pgtable.h | 1 + arch/x86/kernel/setup.c | 1 + arch/x86/mm/init.c | 55 ++++++++++++++++------------------ 3 files changed, 27 insertions(+), 30 deletions(-) diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index a1f780d45f76..98ac76dc4eae 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -602,6 +602,7 @@ static inline int pgd_none(pgd_t pgd) #ifndef __ASSEMBLY__ extern int direct_gbpages; +void probe_page_size_mask(void); /* local pte updates need not use xchg for locking */ static inline pte_t native_local_ptep_get_and_clear(pte_t *ptep) diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index ca45696f30fb..01fb5f9baf90 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -913,6 +913,7 @@ void __init setup_arch(char **cmdline_p) setup_real_mode(); init_gbpages(); + probe_page_size_mask(); /* max_pfn_mapped is updated here */ max_low_pfn_mapped = init_memory_mapping(0, max_low_pfn< Date: Fri, 16 Nov 2012 19:38:39 -0800 Subject: [PATCH 010/105] x86, mm: Split out split_mem_range from init_memory_mapping This makes init_memory_mapping smaller and more readable. -v2: use 0 instead of nr_range as input parameter, found by Yasuaki Ishimatsu. Suggested-by: Ingo Molnar Signed-off-by: Yinghai Lu Link: http://lkml.kernel.org/r/1353123563-3103-3-git-send-email-yinghai@kernel.org Reviewed-by: Pekka Enberg Cc: Yasuaki Ishimatsu Signed-off-by: H. Peter Anvin --- arch/x86/mm/init.c | 41 +++++++++++++++++++++++++---------------- 1 file changed, 25 insertions(+), 16 deletions(-) diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index aa5b0da03f6d..6368b86b84e2 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -146,25 +146,13 @@ static int __meminit save_mr(struct map_range *mr, int nr_range, return nr_range; } -/* - * Setup the direct mapping of the physical memory at PAGE_OFFSET. - * This runs before bootmem is initialized and gets pages directly from - * the physical memory. To access them they are temporarily mapped.
- */ -unsigned long __init_refok init_memory_mapping(unsigned long start, - unsigned long end) +static int __meminit split_mem_range(struct map_range *mr, int nr_range, + unsigned long start, + unsigned long end) { unsigned long start_pfn, end_pfn; - unsigned long ret = 0; unsigned long pos; - struct map_range mr[NR_RANGE_MR]; - int nr_range, i; - - printk(KERN_INFO "init_memory_mapping: [mem %#010lx-%#010lx]\n", - start, end - 1); - - memset(mr, 0, sizeof(mr)); - nr_range = 0; + int i; /* head if not big page alignment ? */ start_pfn = start >> PAGE_SHIFT; @@ -258,6 +246,27 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, (mr[i].page_size_mask & (1< Date: Fri, 16 Nov 2012 19:38:40 -0800 Subject: [PATCH 011/105] x86, mm: Move down find_early_table_space() It will need to call split_mem_range(). Move it down after that to avoid extra declaration. Signed-off-by: Yinghai Lu Link: http://lkml.kernel.org/r/1353123563-3103-4-git-send-email-yinghai@kernel.org Signed-off-by: H. Peter Anvin --- arch/x86/mm/init.c | 117 +++++++++++++++++++++++---------------------- 1 file changed, 59 insertions(+), 58 deletions(-) diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 6368b86b84e2..701abbc24735 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -36,64 +36,6 @@ struct map_range { }; static int page_size_mask; -/* - * First calculate space needed for kernel direct mapping page tables to cover - * mr[0].start to mr[nr_range - 1].end, while accounting for possible 2M and 1GB - * pages. Then find enough contiguous space for those page tables. - */ -static void __init find_early_table_space(struct map_range *mr, int nr_range) -{ - int i; - unsigned long puds = 0, pmds = 0, ptes = 0, tables; - unsigned long start = 0, good_end; - phys_addr_t base; - - for (i = 0; i < nr_range; i++) { - unsigned long range, extra; - - range = mr[i].end - mr[i].start; - puds += (range + PUD_SIZE - 1) >> PUD_SHIFT; - - if (mr[i].page_size_mask & (1 << PG_LEVEL_1G)) { - extra = range - ((range >> PUD_SHIFT) << PUD_SHIFT); - pmds += (extra + PMD_SIZE - 1) >> PMD_SHIFT; - } else { - pmds += (range + PMD_SIZE - 1) >> PMD_SHIFT; - } - - if (mr[i].page_size_mask & (1 << PG_LEVEL_2M)) { - extra = range - ((range >> PMD_SHIFT) << PMD_SHIFT); -#ifdef CONFIG_X86_32 - extra += PMD_SIZE; -#endif - ptes += (extra + PAGE_SIZE - 1) >> PAGE_SHIFT; - } else { - ptes += (range + PAGE_SIZE - 1) >> PAGE_SHIFT; - } - } - - tables = roundup(puds * sizeof(pud_t), PAGE_SIZE); - tables += roundup(pmds * sizeof(pmd_t), PAGE_SIZE); - tables += roundup(ptes * sizeof(pte_t), PAGE_SIZE); - -#ifdef CONFIG_X86_32 - /* for fixmap */ - tables += roundup(__end_of_fixed_addresses * sizeof(pte_t), PAGE_SIZE); -#endif - good_end = max_pfn_mapped << PAGE_SHIFT; - - base = memblock_find_in_range(start, good_end, tables, PAGE_SIZE); - if (!base) - panic("Cannot find space for the kernel page tables"); - - pgt_buf_start = base >> PAGE_SHIFT; - pgt_buf_end = pgt_buf_start; - pgt_buf_top = pgt_buf_start + (tables >> PAGE_SHIFT); - - printk(KERN_DEBUG "kernel direct mapping tables up to %#lx @ [mem %#010lx-%#010lx]\n", - mr[nr_range - 1].end - 1, pgt_buf_start << PAGE_SHIFT, - (pgt_buf_top << PAGE_SHIFT) - 1); -} void probe_page_size_mask(void) { @@ -249,6 +191,65 @@ static int __meminit split_mem_range(struct map_range *mr, int nr_range, return nr_range; } +/* + * First calculate space needed for kernel direct mapping page tables to cover + * mr[0].start to mr[nr_range - 1].end, while accounting for possible 2M and 1GB + * pages. 
Then find enough contiguous space for those page tables. + */ +static void __init find_early_table_space(struct map_range *mr, int nr_range) +{ + int i; + unsigned long puds = 0, pmds = 0, ptes = 0, tables; + unsigned long start = 0, good_end; + phys_addr_t base; + + for (i = 0; i < nr_range; i++) { + unsigned long range, extra; + + range = mr[i].end - mr[i].start; + puds += (range + PUD_SIZE - 1) >> PUD_SHIFT; + + if (mr[i].page_size_mask & (1 << PG_LEVEL_1G)) { + extra = range - ((range >> PUD_SHIFT) << PUD_SHIFT); + pmds += (extra + PMD_SIZE - 1) >> PMD_SHIFT; + } else { + pmds += (range + PMD_SIZE - 1) >> PMD_SHIFT; + } + + if (mr[i].page_size_mask & (1 << PG_LEVEL_2M)) { + extra = range - ((range >> PMD_SHIFT) << PMD_SHIFT); +#ifdef CONFIG_X86_32 + extra += PMD_SIZE; +#endif + ptes += (extra + PAGE_SIZE - 1) >> PAGE_SHIFT; + } else { + ptes += (range + PAGE_SIZE - 1) >> PAGE_SHIFT; + } + } + + tables = roundup(puds * sizeof(pud_t), PAGE_SIZE); + tables += roundup(pmds * sizeof(pmd_t), PAGE_SIZE); + tables += roundup(ptes * sizeof(pte_t), PAGE_SIZE); + +#ifdef CONFIG_X86_32 + /* for fixmap */ + tables += roundup(__end_of_fixed_addresses * sizeof(pte_t), PAGE_SIZE); +#endif + good_end = max_pfn_mapped << PAGE_SHIFT; + + base = memblock_find_in_range(start, good_end, tables, PAGE_SIZE); + if (!base) + panic("Cannot find space for the kernel page tables"); + + pgt_buf_start = base >> PAGE_SHIFT; + pgt_buf_end = pgt_buf_start; + pgt_buf_top = pgt_buf_start + (tables >> PAGE_SHIFT); + + printk(KERN_DEBUG "kernel direct mapping tables up to %#lx @ [mem %#010lx-%#010lx]\n", + mr[nr_range - 1].end - 1, pgt_buf_start << PAGE_SHIFT, + (pgt_buf_top << PAGE_SHIFT) - 1); +} + /* * Setup the direct mapping of the physical memory at PAGE_OFFSET. * This runs before bootmem is initialized and gets pages directly from

From 22ddfcaa0dbae992332381d41b8a1fbc72269a13 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 16 Nov 2012 19:38:41 -0800 Subject: [PATCH 012/105] x86, mm: Move init_memory_mapping calling out of setup.c init_memory_mapping is now called two times; later it will be called for every RAM range. We can put all the related init_memory_mapping calls together and move them out of setup.c. This actually reverts commit 1bbbbe7 ("x86: Exclude E820_RESERVED regions and memory holes above 4 GB from direct mapping"); that will be addressed later with a complete solution that includes handling the holes under 4G. Signed-off-by: Yinghai Lu Link: http://lkml.kernel.org/r/1353123563-3103-5-git-send-email-yinghai@kernel.org Reviewed-by: Pekka Enberg Signed-off-by: H.
Peter Anvin --- arch/x86/include/asm/init.h | 1 - arch/x86/include/asm/pgtable.h | 2 +- arch/x86/kernel/setup.c | 27 +-------------------------- arch/x86/mm/init.c | 19 ++++++++++++++++++- 4 files changed, 20 insertions(+), 29 deletions(-) diff --git a/arch/x86/include/asm/init.h b/arch/x86/include/asm/init.h index adcc0ae73d09..4f13998be59a 100644 --- a/arch/x86/include/asm/init.h +++ b/arch/x86/include/asm/init.h @@ -12,7 +12,6 @@ kernel_physical_mapping_init(unsigned long start, unsigned long end, unsigned long page_size_mask); - extern unsigned long __initdata pgt_buf_start; extern unsigned long __meminitdata pgt_buf_end; extern unsigned long __meminitdata pgt_buf_top; diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 98ac76dc4eae..dd1a88832d25 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -602,7 +602,7 @@ static inline int pgd_none(pgd_t pgd) #ifndef __ASSEMBLY__ extern int direct_gbpages; -void probe_page_size_mask(void); +void init_mem_mapping(void); /* local pte updates need not use xchg for locking */ static inline pte_t native_local_ptep_get_and_clear(pte_t *ptep) diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 01fb5f9baf90..23b079fb93fc 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -913,34 +913,9 @@ void __init setup_arch(char **cmdline_p) setup_real_mode(); init_gbpages(); - probe_page_size_mask(); - /* max_pfn_mapped is updated here */ - max_low_pfn_mapped = init_memory_mapping(0, max_low_pfn< max_low_pfn) { - int i; - unsigned long start, end; - unsigned long start_pfn, end_pfn; - - for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, - NULL) { - - end = PFN_PHYS(end_pfn); - if (end <= (1UL<<32)) - continue; - - start = PFN_PHYS(start_pfn); - max_pfn_mapped = init_memory_mapping( - max((1UL<<32), start), end); - } - - /* can we preseve max_low_pfn ?*/ - max_low_pfn = max_pfn; - } -#endif memblock.current_limit = get_max_mapped(); dma_contiguous_reserve(0); diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 701abbc24735..9e17f9e18a21 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -37,7 +37,7 @@ struct map_range { static int page_size_mask; -void probe_page_size_mask(void) +static void __init probe_page_size_mask(void) { #if !defined(CONFIG_DEBUG_PAGEALLOC) && !defined(CONFIG_KMEMCHECK) /* @@ -315,6 +315,23 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, return ret >> PAGE_SHIFT; } +void __init init_mem_mapping(void) +{ + probe_page_size_mask(); + + /* max_pfn_mapped is updated here */ + max_low_pfn_mapped = init_memory_mapping(0, max_low_pfn< max_low_pfn) { + max_pfn_mapped = init_memory_mapping(1UL<<32, + max_pfn< Date: Fri, 16 Nov 2012 19:38:42 -0800 Subject: [PATCH 013/105] x86, mm: Revert back good_end setting for 64bit After | commit 8548c84da2f47e71bbbe300f55edb768492575f7 | Author: Takashi Iwai | Date: Sun Oct 23 23:19:12 2011 +0200 | | x86: Fix S4 regression | | Commit 4b239f458 ("x86-64, mm: Put early page table high") causes a S4 | regression since 2.6.39, namely the machine reboots occasionally at S4 | resume. It doesn't happen always, overall rate is about 1/20. But, | like other bugs, once when this happens, it continues to happen. | | This patch fixes the problem by essentially reverting the memory | assignment in the older way. there are some page tables around 512M again, and that will prevent kdump from finding 512M under 768M. We need to revert that revert, so we can put the page tables high again for 64-bit. Takashi agreed that the S4 regression could be something else. https://lkml.org/lkml/2012/6/15/182 Signed-off-by: Yinghai Lu Link: http://lkml.kernel.org/r/1353123563-3103-6-git-send-email-yinghai@kernel.org Signed-off-by: H. Peter Anvin --- arch/x86/mm/init.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 9e17f9e18a21..dbef4ffe8d31 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -234,8 +234,8 @@ static void __init find_early_table_space(struct map_range *mr, int nr_range) #ifdef CONFIG_X86_32 /* for fixmap */ tables += roundup(__end_of_fixed_addresses * sizeof(pte_t), PAGE_SIZE); -#endif good_end = max_pfn_mapped << PAGE_SHIFT; +#endif base = memblock_find_in_range(start, good_end, tables, PAGE_SIZE); if (!base)
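[ Editorial aside: a toy model of the "put the page tables high" placement discussed above: scan free memory top-down and take the highest block that fits below good_end, which is the behavior memblock_find_in_range() gives find_early_table_space(). The free ranges and sizes below are invented for the demo. ]

#include <stdio.h>

struct mem_range { unsigned long start, end; }; /* [start, end), bytes */

/* pretend free-memory layout: one block low, one block high */
static struct mem_range free_ranges[] = {
	{ 0x00100000UL, 0x20000000UL }, /* 1M .. 512M */
	{ 0x40000000UL, 0x80000000UL }, /* 1G .. 2G   */
};

/* Walk the free ranges from the top down and return the highest base
 * that fits size bytes below good_end; 0 means no fit (the kernel would
 * panic("Cannot find space for the kernel page tables") instead).
 */
static unsigned long find_in_range_top_down(unsigned long good_end,
					    unsigned long size)
{
	int n = (int)(sizeof(free_ranges) / sizeof(free_ranges[0]));

	for (int i = n - 1; i >= 0; i--) {
		unsigned long end = free_ranges[i].end < good_end ?
				    free_ranges[i].end : good_end;

		if (end > free_ranges[i].start &&
		    end - free_ranges[i].start >= size)
			return end - size;
	}
	return 0;
}

int main(void)
{
	/* 4M of page tables land just under 2G, not in low memory */
	printf("pgt base: %#lx\n",
	       find_in_range_top_down(0x80000000UL, 0x400000UL));
	return 0;
}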
From 84f1ae30bb68d8da98bca7ff2c2b825b2ac8c9a5 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 16 Nov 2012 19:38:43 -0800 Subject: [PATCH 014/105] x86, mm: Change find_early_table_space() parameters Call split_mem_range() inside the function. Signed-off-by: Yinghai Lu Link: http://lkml.kernel.org/r/1353123563-3103-7-git-send-email-yinghai@kernel.org Signed-off-by: H. Peter Anvin --- arch/x86/mm/init.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index dbef4ffe8d31..51f919febf64 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -196,12 +196,18 @@ static int __meminit split_mem_range(struct map_range *mr, int nr_range, * mr[0].start to mr[nr_range - 1].end, while accounting for possible 2M and 1GB * pages. Then find enough contiguous space for those page tables. */ -static void __init find_early_table_space(struct map_range *mr, int nr_range) +static void __init find_early_table_space(unsigned long start, unsigned long end) { int i; unsigned long puds = 0, pmds = 0, ptes = 0, tables; - unsigned long start = 0, good_end; + unsigned long good_end; phys_addr_t base; + struct map_range mr[NR_RANGE_MR]; + int nr_range; + + memset(mr, 0, sizeof(mr)); + nr_range = 0; + nr_range = split_mem_range(mr, nr_range, start, end); for (i = 0; i < nr_range; i++) { unsigned long range, extra; @@ -276,7 +282,7 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, * nodes are discovered. */ if (!after_bootmem) - find_early_table_space(mr, nr_range); + find_early_table_space(start, end); for (i = 0; i < nr_range; i++) ret = kernel_physical_mapping_init(mr[i].start, mr[i].end,

From c14fa0b63b5b4234667c03fdc3314c0881caa514 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 16 Nov 2012 19:38:44 -0800 Subject: [PATCH 015/105] x86, mm: Find early page table buffer together We should not do that on every call to init_memory_mapping. At the same time we need to move early_memtest down, and can then remove the after_bootmem check. -v2: fix one early_memtest with 32-bit by passing max_pfn_mapped instead. Signed-off-by: Yinghai Lu Link: http://lkml.kernel.org/r/1353123563-3103-8-git-send-email-yinghai@kernel.org Signed-off-by: H. Peter Anvin --- arch/x86/mm/init.c | 66 ++++++++++++++++++++++++---------------------- 1 file changed, 34 insertions(+), 32 deletions(-) diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 51f919febf64..1ce0d033fafc 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -274,16 +274,6 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, memset(mr, 0, sizeof(mr)); nr_range = split_mem_range(mr, 0, start, end); - /* - * Find space for the kernel direct mapping tables.
- * - * Later we should allocate these tables in the local node of the - * memory mapped. Unfortunately this is done currently before the - * nodes are discovered. - */ - if (!after_bootmem) - find_early_table_space(start, end); for (i = 0; i < nr_range; i++) ret = kernel_physical_mapping_init(mr[i].start, mr[i].end, mr[i].page_size_mask); @@ -296,6 +286,36 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, __flush_tlb_all(); + return ret >> PAGE_SHIFT; +} + +void __init init_mem_mapping(void) +{ + probe_page_size_mask(); + + /* + * Find space for the kernel direct mapping tables. + * + * Later we should allocate these tables in the local node of the + * memory mapped. Unfortunately this is done currently before the + * nodes are discovered. + */ +#ifdef CONFIG_X86_64 + find_early_table_space(0, max_pfn< max_low_pfn) { + max_pfn_mapped = init_memory_mapping(1UL<<32, + max_pfn< pgt_buf_start) + if (pgt_buf_end > pgt_buf_start) x86_init.mapping.pagetable_reserve(PFN_PHYS(pgt_buf_start), PFN_PHYS(pgt_buf_end)); - if (!after_bootmem) - early_memtest(start, end); + /* stop the wrong using */ + pgt_buf_top = 0; - return ret >> PAGE_SHIFT; -} - -void __init init_mem_mapping(void) -{ - probe_page_size_mask(); - - /* max_pfn_mapped is updated here */ - max_low_pfn_mapped = init_memory_mapping(0, max_low_pfn< max_low_pfn) { - max_pfn_mapped = init_memory_mapping(1UL<<32, - max_pfn<

From 4e37a890474b89ca49ad3b7f4b5e6f579ef99d21 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 16 Nov 2012 19:38:45 -0800 Subject: [PATCH 016/105] x86, mm: Separate out calculate_table_space_size() calculate_table_space_size() should take the physical address range that will need to be mapped, while find_early_table_space() should take the range that the page table buffer should be placed in. Separate the page table size calculation from finding the early page table space, to reduce confusion. Signed-off-by: Yinghai Lu Link: http://lkml.kernel.org/r/1353123563-3103-9-git-send-email-yinghai@kernel.org Reviewed-by: Pekka Enberg Signed-off-by: H. Peter Anvin --- arch/x86/mm/init.c | 38 +++++++++++++++++++++++++----------- 1 file changed, 27 insertions(+), 11 deletions(-) diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 1ce0d033fafc..7b961d0b1389 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -196,12 +196,10 @@ static int __meminit split_mem_range(struct map_range *mr, int nr_range, * mr[0].start to mr[nr_range - 1].end, while accounting for possible 2M and 1GB * pages. Then find enough contiguous space for those page tables. */ -static void __init find_early_table_space(unsigned long start, unsigned long end) +static unsigned long __init calculate_table_space_size(unsigned long start, unsigned long end) { int i; unsigned long puds = 0, pmds = 0, ptes = 0, tables; - unsigned long good_end; - phys_addr_t base; struct map_range mr[NR_RANGE_MR]; int nr_range; @@ -240,9 +238,17 @@ #ifdef CONFIG_X86_32 /* for fixmap */ tables += roundup(__end_of_fixed_addresses * sizeof(pte_t), PAGE_SIZE); - good_end = max_pfn_mapped << PAGE_SHIFT; #endif + return tables; +} + +static void __init find_early_table_space(unsigned long start, + unsigned long good_end, + unsigned long tables) +{ + phys_addr_t base; + base = memblock_find_in_range(start, good_end, tables, PAGE_SIZE); if (!base) panic("Cannot find space for the kernel page tables"); @@ -250,10 +256,6 @@ static void __init find_early_table_space(unsigned long start, unsigned long end pgt_buf_start = base >> PAGE_SHIFT; pgt_buf_end = pgt_buf_start; pgt_buf_top = pgt_buf_start + (tables >> PAGE_SHIFT); - - printk(KERN_DEBUG "kernel direct mapping tables up to %#lx @ [mem %#010lx-%#010lx]\n", - mr[nr_range - 1].end - 1, pgt_buf_start << PAGE_SHIFT, - (pgt_buf_top << PAGE_SHIFT) - 1); } /* @@ -291,6 +293,8 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, void __init init_mem_mapping(void) { + unsigned long tables, good_end, end; + probe_page_size_mask(); /* @@ -301,10 +305,18 @@ void __init init_mem_mapping(void) * nodes are discovered. */ #ifdef CONFIG_X86_64 - find_early_table_space(0, max_pfn< pgt_buf_start) { + printk(KERN_DEBUG "kernel direct mapping tables up to %#lx @ [mem %#010lx-%#010lx] final\n", + end - 1, pgt_buf_start << PAGE_SHIFT, + (pgt_buf_end << PAGE_SHIFT) - 1); x86_init.mapping.pagetable_reserve(PFN_PHYS(pgt_buf_start), PFN_PHYS(pgt_buf_end)); + } /* stop the wrong using */ pgt_buf_top = 0;
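[ Editorial aside: a rough user-space model of what calculate_table_space_size() computes. It is simplified to 4K mappings only (the real code discounts ranges covered by 2M/1G pages) and uses fixed 4-level x86-64 spans; the 4G range is just an example. ]

#include <stdio.h>

#define PAGE_SIZE 4096UL
#define PTE_SPAN  (PAGE_SIZE << 9) /* memory covered by one PTE page: 2M   */
#define PMD_SPAN  (PTE_SPAN << 9)  /* memory covered by one PMD page: 1G   */
#define PUD_SPAN  (PMD_SPAN << 9)  /* memory covered by one PUD page: 512G */

static unsigned long roundup_div(unsigned long len, unsigned long span)
{
	return (len + span - 1) / span;
}

int main(void)
{
	unsigned long start = 0, end = 4UL << 30; /* map the first 4G */
	unsigned long range = end - start;

	/* each 4K table page holds 512 eight-byte entries per level */
	unsigned long ptes = roundup_div(range, PTE_SPAN);
	unsigned long pmds = roundup_div(range, PMD_SPAN);
	unsigned long puds = roundup_div(range, PUD_SPAN);

	/* about 8M of PTE pages dominate the total for a 4G range */
	printf("tables: %lu KiB\n",
	       (ptes + pmds + puds) * PAGE_SIZE / 1024);
	return 0;
}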
Signed-off-by: Jacob Shin Link: http://lkml.kernel.org/r/1353123563-3103-11-git-send-email-yinghai@kernel.org Signed-off-by: Yinghai Lu Reviewed-by: Pekka Enberg Signed-off-by: H. Peter Anvin --- arch/x86/kernel/setup.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 4bd89218cbc3..d85cbd96525d 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -832,6 +832,20 @@ void __init setup_arch(char **cmdline_p) insert_resource(&iomem_resource, &data_resource); insert_resource(&iomem_resource, &bss_resource); + /* + * Complain if .text .data and .bss are not marked as E820_RAM and + * attempt to fix it by adding the range. We may have a confused BIOS, + * or the user may have incorrectly supplied it via memmap=exactmap. If + * we really are running on top non-RAM, we will crash later anyways. + */ + if (!e820_all_mapped(code_resource.start, __pa(__brk_limit), E820_RAM)) { + pr_warn(".text .data .bss are not marked as E820_RAM!\n"); + + e820_add_region(code_resource.start, + __pa(__brk_limit) - code_resource.start + 1, + E820_RAM); + } + trim_bios_range(); #ifdef CONFIG_X86_32 if (ppro_with_ram_bug()) { From dda56e134059b840631fdfd034784056b627c2a6 Mon Sep 17 00:00:00 2001 From: Jacob Shin Date: Fri, 16 Nov 2012 19:38:48 -0800 Subject: [PATCH 019/105] x86, mm: Fixup code testing if a pfn is direct mapped Update code that previously assumed pfns [ 0 - max_low_pfn_mapped ) and [ 4GB - max_pfn_mapped ) were always direct mapped, to now look up pfn_mapped ranges instead. -v2: change applying sequence to keep git bisecting working. so add dummy pfn_range_is_mapped(). - Yinghai Lu Signed-off-by: Jacob Shin Link: http://lkml.kernel.org/r/1353123563-3103-12-git-send-email-yinghai@kernel.org Signed-off-by: Yinghai Lu Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/page_types.h | 8 ++++++++ arch/x86/kernel/cpu/amd.c | 8 +++----- arch/x86/platform/efi/efi.c | 7 +++---- 3 files changed, 14 insertions(+), 9 deletions(-) diff --git a/arch/x86/include/asm/page_types.h b/arch/x86/include/asm/page_types.h index e21fdd10479f..45aae6e5f649 100644 --- a/arch/x86/include/asm/page_types.h +++ b/arch/x86/include/asm/page_types.h @@ -51,6 +51,14 @@ static inline phys_addr_t get_max_mapped(void) return (phys_addr_t)max_pfn_mapped << PAGE_SHIFT; } +static inline bool pfn_range_is_mapped(unsigned long start_pfn, + unsigned long end_pfn) +{ + return end_pfn <= max_low_pfn_mapped || + (end_pfn > (1UL << (32 - PAGE_SHIFT)) && + end_pfn <= max_pfn_mapped); +} + extern unsigned long init_memory_mapping(unsigned long start, unsigned long end); diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index f7e98a2c0d12..9619ba6528ca 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -676,12 +676,10 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) * benefit in doing so. 
*/ if (!rdmsrl_safe(MSR_K8_TSEG_ADDR, &tseg)) { + unsigned long pfn = tseg >> PAGE_SHIFT; + printk(KERN_DEBUG "tseg: %010llx\n", tseg); - if ((tseg>>PMD_SHIFT) < - (max_low_pfn_mapped>>(PMD_SHIFT-PAGE_SHIFT)) || - ((tseg>>PMD_SHIFT) < - (max_pfn_mapped>>(PMD_SHIFT-PAGE_SHIFT)) && - (tseg>>PMD_SHIFT) >= (1ULL<<(32 - PMD_SHIFT)))) + if (pfn_range_is_mapped(pfn, pfn + 1)) set_memory_4k((unsigned long)__va(tseg), 1); } } diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index ad4439145f85..36e53f0a9ce3 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -835,7 +835,7 @@ void __init efi_enter_virtual_mode(void) efi_memory_desc_t *md, *prev_md = NULL; efi_status_t status; unsigned long size; - u64 end, systab, end_pfn; + u64 end, systab, start_pfn, end_pfn; void *p, *va, *new_memmap = NULL; int count = 0; @@ -888,10 +888,9 @@ void __init efi_enter_virtual_mode(void) size = md->num_pages << EFI_PAGE_SHIFT; end = md->phys_addr + size; + start_pfn = PFN_DOWN(md->phys_addr); end_pfn = PFN_UP(end); - if (end_pfn <= max_low_pfn_mapped || (end_pfn > (1UL << (32 - PAGE_SHIFT)) && end_pfn <= max_pfn_mapped)) { + if (pfn_range_is_mapped(start_pfn, end_pfn)) { va = __va(md->phys_addr); if (!(md->attribute & EFI_MEMORY_WB))

From 8eb5779f6b9c7e390c92f451edaafc039e06e743 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 16 Nov 2012 19:38:49 -0800 Subject: [PATCH 020/105] x86, mm: use pfn_range_is_mapped() with CPA We are going to map RAM only, so being under max_low_pfn_mapped, or between 4G and max_pfn_mapped, does not mean a pfn is mapped at all. Use pfn_range_is_mapped() directly. Signed-off-by: Yinghai Lu Link: http://lkml.kernel.org/r/1353123563-3103-13-git-send-email-yinghai@kernel.org Signed-off-by: H. Peter Anvin --- arch/x86/mm/pageattr.c | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index a718e0d23503..44acfcd6c16f 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -551,16 +551,10 @@ static int split_large_page(pte_t *kpte, unsigned long address) for (i = 0; i < PTRS_PER_PTE; i++, pfn += pfninc) set_pte(&pbase[i], pfn_pte(pfn, ref_prot)); - if (address >= (unsigned long)__va(0) && - address < (unsigned long)__va(max_low_pfn_mapped << PAGE_SHIFT) + if (pfn_range_is_mapped(PFN_DOWN(__pa(address)), + PFN_DOWN(__pa(address)) + 1)) split_page_count(level); -#ifdef CONFIG_X86_64 - if (address >= (unsigned long)__va(1UL<<32) && address < (unsigned long)__va(max_pfn_mapped << PAGE_SHIFT)) - split_page_count(level); -#endif /* * Install the new, split up pagetable. * @@ -729,13 +723,9 @@ static int cpa_process_alias(struct cpa_data *cpa) unsigned long vaddr; int ret; - if (cpa->pfn >= max_pfn_mapped) + if (!pfn_range_is_mapped(cpa->pfn, cpa->pfn + 1)) return 0; -#ifdef CONFIG_X86_64 - if (cpa->pfn >= max_low_pfn_mapped && cpa->pfn < (1UL<<(32-PAGE_SHIFT))) - return 0; -#endif /* * No need to redo, when the primary call touched the direct * mapping already:
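[ Editorial aside: a self-contained model of the interim pfn_range_is_mapped() helper these call sites rely on, showing which ranges it treats as mapped. Note that it only checks end_pfn against the two assumed windows; PATCH 023 later replaces this with a real lookup of the mapped ranges. The max_low_pfn_mapped and max_pfn_mapped values below are made up. ]

#include <stdbool.h>
#include <stdio.h>

#define PAGE_SHIFT 12
static unsigned long max_low_pfn_mapped = 0x80000;  /* 2G / 4K, made up */
static unsigned long max_pfn_mapped     = 0x200000; /* 8G / 4K, made up */

/* same logic as the inline helper added in PATCH 019: mapped means the
 * range ends below max_low_pfn_mapped, or ends inside the over-4G window
 * that runs up to max_pfn_mapped
 */
static bool pfn_range_is_mapped(unsigned long start_pfn, unsigned long end_pfn)
{
	(void)start_pfn; /* the interim check only looks at end_pfn */
	return end_pfn <= max_low_pfn_mapped ||
	       (end_pfn > (1UL << (32 - PAGE_SHIFT)) &&
		end_pfn <= max_pfn_mapped);
}

int main(void)
{
	/* below 2G: mapped; in the 2G..4G gap: not; 4G..8G window: mapped */
	printf("%d %d %d\n",
	       pfn_range_is_mapped(0x1000, 0x2000),
	       pfn_range_is_mapped(0x90000, 0xa0000),
	       pfn_range_is_mapped(0x150000, 0x180000));
	return 0;
}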
From 5101730cb0613b91d40b9bb7be6bb023d2f6aa24 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 16 Nov 2012 19:38:50 -0800 Subject: [PATCH 021/105] x86, mm: use pfn_range_is_mapped() with gart We are going to map RAM only, so being under max_low_pfn_mapped, or between 4G and max_pfn_mapped, does not mean a pfn is mapped at all. Use pfn_range_is_mapped() directly. Signed-off-by: Yinghai Lu Link: http://lkml.kernel.org/r/1353123563-3103-14-git-send-email-yinghai@kernel.org Signed-off-by: H. Peter Anvin --- arch/x86/kernel/amd_gart_64.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/amd_gart_64.c b/arch/x86/kernel/amd_gart_64.c index e66311200cbd..b574b295a2f9 100644 --- a/arch/x86/kernel/amd_gart_64.c +++ b/arch/x86/kernel/amd_gart_64.c @@ -768,10 +768,9 @@ int __init gart_iommu_init(void) aper_base = info.aper_base; end_pfn = (aper_base>>PAGE_SHIFT) + (aper_size>>PAGE_SHIFT); - if (end_pfn > max_low_pfn_mapped) { - start_pfn = (aper_base>>PAGE_SHIFT); + start_pfn = PFN_DOWN(aper_base); + if (!pfn_range_is_mapped(start_pfn, end_pfn)) init_memory_mapping(start_pfn< Date: Fri, 16 Nov 2012 19:38:51 -0800 Subject: [PATCH 022/105] x86, mm: use pfn_range_is_mapped() with reserve_initrd We are going to map RAM only, so being under max_low_pfn_mapped, or between 4G and max_pfn_mapped, does not mean a range is mapped at all. Use pfn_range_is_mapped() to find out if the range is mapped for the initrd. An unmapped initrd can happen when the bootloader puts the initrd in a mapped range but the user uses memmap= to carve some of that range out. Also, during copying we need to use early_memremap to map the original initrd for access. Signed-off-by: Yinghai Lu Link: http://lkml.kernel.org/r/1353123563-3103-15-git-send-email-yinghai@kernel.org Signed-off-by: H. Peter Anvin --- arch/x86/kernel/setup.c | 52 ++++++++++++++++------------------- 1 file changed, 28 insertions(+), 24 deletions(-) diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index d85cbd96525d..bd52f9da17cc 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -317,20 +317,19 @@ static void __init relocate_initrd(void) u64 ramdisk_image = boot_params.hdr.ramdisk_image; u64 ramdisk_size = boot_params.hdr.ramdisk_size; u64 area_size = PAGE_ALIGN(ramdisk_size); - u64 end_of_lowmem = max_low_pfn_mapped << PAGE_SHIFT; u64 ramdisk_here; unsigned long slop, clen, mapaddr; char *p, *q; - /* We need to move the initrd down into lowmem */ - ramdisk_here = memblock_find_in_range(0, end_of_lowmem, area_size, - PAGE_SIZE); + /* We need to move the initrd down into directly mapped mem */ + ramdisk_here = memblock_find_in_range(0, PFN_PHYS(max_low_pfn_mapped), + area_size, PAGE_SIZE); if (!ramdisk_here) panic("Cannot find place for new RAMDISK of size %lld\n", ramdisk_size); - /* Note: this includes all the lowmem currently occupied by + /* Note: this includes all the mem currently occupied by the initrd, we rely on that fact to keep the data intact.
*/ memblock_reserve(ramdisk_here, area_size); initrd_start = ramdisk_here + PAGE_OFFSET; @@ -340,17 +339,7 @@ static void __init relocate_initrd(void) q = (char *)initrd_start; - /* Copy any lowmem portion of the initrd */ - if (ramdisk_image < end_of_lowmem) { - clen = end_of_lowmem - ramdisk_image; - p = (char *)__va(ramdisk_image); - memcpy(q, p, clen); - q += clen; - ramdisk_image += clen; - ramdisk_size -= clen; - } - - /* Copy the highmem portion of the initrd */ + /* Copy the initrd */ while (ramdisk_size) { slop = ramdisk_image & ~PAGE_MASK; clen = ramdisk_size; @@ -364,7 +353,7 @@ static void __init relocate_initrd(void) ramdisk_image += clen; ramdisk_size -= clen; } - /* high pages is not converted by early_res_to_bootmem */ + ramdisk_image = boot_params.hdr.ramdisk_image; ramdisk_size = boot_params.hdr.ramdisk_size; printk(KERN_INFO "Move RAMDISK from [mem %#010llx-%#010llx] to" @@ -373,13 +362,27 @@ static void __init relocate_initrd(void) ramdisk_here, ramdisk_here + ramdisk_size - 1); } +static u64 __init get_mem_size(unsigned long limit_pfn) +{ + int i; + u64 mapped_pages = 0; + unsigned long start_pfn, end_pfn; + + for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, NULL) { + start_pfn = min_t(unsigned long, start_pfn, limit_pfn); + end_pfn = min_t(unsigned long, end_pfn, limit_pfn); + mapped_pages += end_pfn - start_pfn; + } + + return mapped_pages << PAGE_SHIFT; +} static void __init reserve_initrd(void) { /* Assume only end is not page aligned */ u64 ramdisk_image = boot_params.hdr.ramdisk_image; u64 ramdisk_size = boot_params.hdr.ramdisk_size; u64 ramdisk_end = PAGE_ALIGN(ramdisk_image + ramdisk_size); - u64 end_of_lowmem = max_low_pfn_mapped << PAGE_SHIFT; + u64 mapped_size; if (!boot_params.hdr.type_of_loader || !ramdisk_image || !ramdisk_size) @@ -387,18 +390,19 @@ static void __init reserve_initrd(void) initrd_start = 0; - if (ramdisk_size >= (end_of_lowmem>>1)) { + mapped_size = get_mem_size(max_low_pfn_mapped); + if (ramdisk_size >= (mapped_size>>1)) panic("initrd too large to handle, " "disabling initrd (%lld needed, %lld available)\n", - ramdisk_size, end_of_lowmem>>1); - } + ramdisk_size, mapped_size>>1); printk(KERN_INFO "RAMDISK: [mem %#010llx-%#010llx]\n", ramdisk_image, ramdisk_end - 1); - - if (ramdisk_end <= end_of_lowmem) { - /* All in lowmem, easy case */ + if (ramdisk_end <= (max_low_pfn_mapped< Date: Fri, 16 Nov 2012 19:38:52 -0800 Subject: [PATCH 023/105] x86, mm: Only direct map addresses that are marked as E820_RAM Currently direct mappings are created for [ 0 to max_low_pfn< Link: http://lkml.kernel.org/r/1353123563-3103-16-git-send-email-yinghai@kernel.org Signed-off-by: Yinghai Lu Reviewed-by: Pekka Enberg Signed-off-by: H. 
Peter Anvin --- arch/x86/include/asm/page_types.h | 8 +- arch/x86/kernel/setup.c | 8 +- arch/x86/mm/init.c | 120 +++++++++++++++++++++++++++--- arch/x86/mm/init_64.c | 6 +- 4 files changed, 117 insertions(+), 25 deletions(-) diff --git a/arch/x86/include/asm/page_types.h b/arch/x86/include/asm/page_types.h index 45aae6e5f649..54c97879195e 100644 --- a/arch/x86/include/asm/page_types.h +++ b/arch/x86/include/asm/page_types.h @@ -51,13 +51,7 @@ static inline phys_addr_t get_max_mapped(void) return (phys_addr_t)max_pfn_mapped << PAGE_SHIFT; } -static inline bool pfn_range_is_mapped(unsigned long start_pfn, - unsigned long end_pfn) -{ - return end_pfn <= max_low_pfn_mapped || - (end_pfn > (1UL << (32 - PAGE_SHIFT)) && - end_pfn <= max_pfn_mapped); -} +bool pfn_range_is_mapped(unsigned long start_pfn, unsigned long end_pfn); extern unsigned long init_memory_mapping(unsigned long start, unsigned long end); diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index bd52f9da17cc..68dffeceb193 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -116,9 +116,11 @@ #include /* - * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries. - * The direct mapping extends to max_pfn_mapped, so that we can directly access - * apertures, ACPI and other tables without having to play with fixmaps. + * max_low_pfn_mapped: highest direct mapped pfn under 4GB + * max_pfn_mapped: highest direct mapped pfn over 4GB + * + * The direct mapping only covers E820_RAM regions, so the ranges and gaps are + * represented by pfn_mapped */ unsigned long max_low_pfn_mapped; unsigned long max_pfn_mapped; diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 7b961d0b1389..bb44e9f2cc49 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -243,6 +243,38 @@ static unsigned long __init calculate_table_space_size(unsigned long start, unsi return tables; } +static unsigned long __init calculate_all_table_space_size(void) +{ + unsigned long start_pfn, end_pfn; + unsigned long tables; + int i; + + /* the ISA range is always mapped regardless of memory holes */ + tables = calculate_table_space_size(0, ISA_END_ADDRESS); + + for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, NULL) { + u64 start = start_pfn << PAGE_SHIFT; + u64 end = end_pfn << PAGE_SHIFT; + + if (end <= ISA_END_ADDRESS) + continue; + + if (start < ISA_END_ADDRESS) + start = ISA_END_ADDRESS; +#ifdef CONFIG_X86_32 + /* on 32 bit, we only map up to max_low_pfn */ + if ((start >> PAGE_SHIFT) >= max_low_pfn) + continue; + + if ((end >> PAGE_SHIFT) > max_low_pfn) + end = max_low_pfn << PAGE_SHIFT; +#endif + tables += calculate_table_space_size(start, end); + } + + return tables; +} + static void __init find_early_table_space(unsigned long start, unsigned long good_end, unsigned long tables) @@ -258,6 +290,34 @@ static void __init find_early_table_space(unsigned long start, pgt_buf_top = pgt_buf_start + (tables >> PAGE_SHIFT); } +static struct range pfn_mapped[E820_X_MAX]; +static int nr_pfn_mapped; + +static void add_pfn_range_mapped(unsigned long start_pfn, unsigned long end_pfn) +{ + nr_pfn_mapped = add_range_with_merge(pfn_mapped, E820_X_MAX, + nr_pfn_mapped, start_pfn, end_pfn); + nr_pfn_mapped = clean_sort_range(pfn_mapped, E820_X_MAX); + + max_pfn_mapped = max(max_pfn_mapped, end_pfn); + + if (start_pfn < (1UL<<(32-PAGE_SHIFT))) + max_low_pfn_mapped = max(max_low_pfn_mapped, + min(end_pfn, 1UL<<(32-PAGE_SHIFT))); +} + +bool pfn_range_is_mapped(unsigned long start_pfn, unsigned long end_pfn) +{ + int i; + 
+ for (i = 0; i < nr_pfn_mapped; i++) + if ((start_pfn >= pfn_mapped[i].start) && + (end_pfn <= pfn_mapped[i].end)) + return true; + + return false; +} + /* * Setup the direct mapping of the physical memory at PAGE_OFFSET. * This runs before bootmem is initialized and gets pages directly from @@ -288,9 +348,55 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, __flush_tlb_all(); + add_pfn_range_mapped(start >> PAGE_SHIFT, ret >> PAGE_SHIFT); + return ret >> PAGE_SHIFT; } +/* + * Iterate through E820 memory map and create direct mappings for only E820_RAM + * regions. We cannot simply create direct mappings for all pfns from + * [0 to max_low_pfn) and [4GB to max_pfn) because of possible memory holes in + * high addresses that cannot be marked as UC by fixed/variable range MTRRs. + * Depending on the alignment of E820 ranges, this may possibly result in using + * smaller size (i.e. 4K instead of 2M or 1G) page tables. + */ +static void __init init_all_memory_mapping(void) +{ + unsigned long start_pfn, end_pfn; + int i; + + /* the ISA range is always mapped regardless of memory holes */ + init_memory_mapping(0, ISA_END_ADDRESS); + + for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, NULL) { + u64 start = (u64)start_pfn << PAGE_SHIFT; + u64 end = (u64)end_pfn << PAGE_SHIFT; + + if (end <= ISA_END_ADDRESS) + continue; + + if (start < ISA_END_ADDRESS) + start = ISA_END_ADDRESS; +#ifdef CONFIG_X86_32 + /* on 32 bit, we only map up to max_low_pfn */ + if ((start >> PAGE_SHIFT) >= max_low_pfn) + continue; + + if ((end >> PAGE_SHIFT) > max_low_pfn) + end = max_low_pfn << PAGE_SHIFT; +#endif + init_memory_mapping(start, end); + } + +#ifdef CONFIG_X86_64 + if (max_pfn > max_low_pfn) { + /* can we preseve max_low_pfn ?*/ + max_low_pfn = max_pfn; + } +#endif +} + void __init init_mem_mapping(void) { unsigned long tables, good_end, end; @@ -311,23 +417,15 @@ void __init init_mem_mapping(void) end = max_low_pfn << PAGE_SHIFT; good_end = max_pfn_mapped << PAGE_SHIFT; #endif - tables = calculate_table_space_size(0, end); + tables = calculate_all_table_space_size(); find_early_table_space(0, good_end, tables); printk(KERN_DEBUG "kernel direct mapping tables up to %#lx @ [mem %#010lx-%#010lx] prealloc\n", end - 1, pgt_buf_start << PAGE_SHIFT, (pgt_buf_top << PAGE_SHIFT) - 1); - max_low_pfn_mapped = init_memory_mapping(0, max_low_pfn< max_low_pfn) { - max_pfn_mapped = init_memory_mapping(1UL<<32, - max_pfn<node_zones + ZONE_NORMAL; - unsigned long last_mapped_pfn, start_pfn = start >> PAGE_SHIFT; + unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; int ret; - last_mapped_pfn = init_memory_mapping(start, start + size); - if (last_mapped_pfn > max_pfn_mapped) - max_pfn_mapped = last_mapped_pfn; + init_memory_mapping(start, start + size); ret = __add_pages(nid, zone, start_pfn, nr_pages); WARN_ON_ONCE(ret); From 74f27655dda84604d8bab47872020dcce5c88731 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 16 Nov 2012 19:38:53 -0800 Subject: [PATCH 024/105] x86, mm: relocate initrd under all mem for 64bit instead of under 4g. For 64bit, we can use any mapped mem instead of low mem. Signed-off-by: Yinghai Lu Link: http://lkml.kernel.org/r/1353123563-3103-17-git-send-email-yinghai@kernel.org Signed-off-by: H. 
Peter Anvin --- arch/x86/kernel/setup.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 68dffeceb193..94f922a73c54 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -324,7 +324,7 @@ static void __init relocate_initrd(void) char *p, *q; /* We need to move the initrd down into directly mapped mem */ - ramdisk_here = memblock_find_in_range(0, PFN_PHYS(max_low_pfn_mapped), + ramdisk_here = memblock_find_in_range(0, PFN_PHYS(max_pfn_mapped), area_size, PAGE_SIZE); if (!ramdisk_here) @@ -392,7 +392,7 @@ static void __init reserve_initrd(void) initrd_start = 0; - mapped_size = get_mem_size(max_low_pfn_mapped); + mapped_size = get_mem_size(max_pfn_mapped); if (ramdisk_size >= (mapped_size>>1)) panic("initrd too large to handle, " "disabling initrd (%lld needed, %lld available)\n", @@ -401,8 +401,7 @@ static void __init reserve_initrd(void) printk(KERN_INFO "RAMDISK: [mem %#010llx-%#010llx]\n", ramdisk_image, ramdisk_end - 1); - if (ramdisk_end <= (max_low_pfn_mapped< Date: Fri, 16 Nov 2012 19:38:54 -0800 Subject: [PATCH 025/105] x86, mm: Align start address to correct big page size We are going to use a buffer in the BRK to map a small range just under the memory top, and then use that newly mapped ram to map the ram range under it. The ram range that is mapped first may be only page aligned, but the ranges around it are ram too, so we could use a bigger page size to map it and avoid breaking it down into small pages. We will adjust page_size_mask in the following patch: x86, mm: Use big page size for small memory range so that a big page size is used for such a small ram range. Before that patch, this patch makes sure the start address is aligned down according to the bigger page size; otherwise the entry in the page table will not have the correct value. Signed-off-by: Yinghai Lu Link: http://lkml.kernel.org/r/1353123563-3103-18-git-send-email-yinghai@kernel.org Signed-off-by: H.
Peter Anvin --- arch/x86/mm/init_32.c | 1 + arch/x86/mm/init_64.c | 5 +++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 11a58001b4ce..27f7fc69cf8a 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -310,6 +310,7 @@ repeat: __pgprot(PTE_IDENT_ATTR | _PAGE_PSE); + pfn &= PMD_MASK >> PAGE_SHIFT; addr2 = (pfn + PTRS_PER_PTE-1) * PAGE_SIZE + PAGE_OFFSET + PAGE_SIZE-1; diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 32c7e3847cf6..869372a5d3cf 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -464,7 +464,7 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end, pages++; spin_lock(&init_mm.page_table_lock); set_pte((pte_t *)pmd, - pfn_pte(address >> PAGE_SHIFT, + pfn_pte((address & PMD_MASK) >> PAGE_SHIFT, __pgprot(pgprot_val(prot) | _PAGE_PSE))); spin_unlock(&init_mm.page_table_lock); last_map_addr = next; @@ -541,7 +541,8 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end, pages++; spin_lock(&init_mm.page_table_lock); set_pte((pte_t *)pud, - pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL_LARGE)); + pfn_pte((addr & PUD_MASK) >> PAGE_SHIFT, + PAGE_KERNEL_LARGE)); spin_unlock(&init_mm.page_table_lock); last_map_addr = next; continue; From aeebe84cc96cde4181807bc67c300c550d0ef123 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 16 Nov 2012 19:38:55 -0800 Subject: [PATCH 026/105] x86, mm: Use big page size for small memory range We could map small range in the middle of big range at first, so should use big page size at first to avoid using small page size to break down page table. Only can set big page bit when that range has ram area around it. -v2: fix 32bit boundary checking. We can not count ram above max_low_pfn for 32 bit. Signed-off-by: Yinghai Lu Link: http://lkml.kernel.org/r/1353123563-3103-19-git-send-email-yinghai@kernel.org Signed-off-by: H. Peter Anvin --- arch/x86/mm/init.c | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index bb44e9f2cc49..da591ebc8d12 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -88,6 +88,40 @@ static int __meminit save_mr(struct map_range *mr, int nr_range, return nr_range; } +/* + * adjust the page_size_mask for small range to go with + * big page size instead small one if nearby are ram too. + */ +static void __init_refok adjust_range_page_size_mask(struct map_range *mr, + int nr_range) +{ + int i; + + for (i = 0; i < nr_range; i++) { + if ((page_size_mask & (1<> PAGE_SHIFT) > max_low_pfn) + continue; +#endif + + if (memblock_is_region_memory(start, end - start)) + mr[i].page_size_mask |= 1< Date: Fri, 16 Nov 2012 19:38:56 -0800 Subject: [PATCH 027/105] x86, mm: Don't clear page table if range is ram After we add code use buffer in BRK to pre-map buf for page table in following patch: x86, mm: setup page table in top-down it should be safe to remove early_memmap for page table accessing. Instead we get panic with that. It turns out that we clear the initial page table wrongly for next range that is separated by holes. And it only happens when we are trying to map ram range one by one. We need to check if the range is ram before clearing page table. We change the loop structure to remove the extra little loop and use one loop only, and in that loop will caculate next at first, and check if [addr,next) is covered by E820_RAM. -v2: E820_RESERVED_KERN is treated as E820_RAM. 
EFI one change some E820_RAM to that, so next kernel by kexec will know that range is used already. Signed-off-by: Yinghai Lu Link: http://lkml.kernel.org/r/1353123563-3103-20-git-send-email-yinghai@kernel.org Signed-off-by: H. Peter Anvin --- arch/x86/mm/init_64.c | 42 ++++++++++++++++++++---------------------- 1 file changed, 20 insertions(+), 22 deletions(-) diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 869372a5d3cf..fa28e3e29741 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -363,20 +363,20 @@ static unsigned long __meminit phys_pte_init(pte_t *pte_page, unsigned long addr, unsigned long end, pgprot_t prot) { - unsigned pages = 0; + unsigned long pages = 0, next; unsigned long last_map_addr = end; int i; pte_t *pte = pte_page + pte_index(addr); - for(i = pte_index(addr); i < PTRS_PER_PTE; i++, addr += PAGE_SIZE, pte++) { - + for (i = pte_index(addr); i < PTRS_PER_PTE; i++, addr = next, pte++) { + next = (addr & PAGE_MASK) + PAGE_SIZE; if (addr >= end) { - if (!after_bootmem) { - for(; i < PTRS_PER_PTE; i++, pte++) - set_pte(pte, __pte(0)); - } - break; + if (!after_bootmem && + !e820_any_mapped(addr & PAGE_MASK, next, E820_RAM) && + !e820_any_mapped(addr & PAGE_MASK, next, E820_RESERVED_KERN)) + set_pte(pte, __pte(0)); + continue; } /* @@ -419,15 +419,14 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end, pte_t *pte; pgprot_t new_prot = prot; - if (address >= end) { - if (!after_bootmem) { - for (; i < PTRS_PER_PMD; i++, pmd++) - set_pmd(pmd, __pmd(0)); - } - break; - } - next = (address & PMD_MASK) + PMD_SIZE; + if (address >= end) { + if (!after_bootmem && + !e820_any_mapped(address & PMD_MASK, next, E820_RAM) && + !e820_any_mapped(address & PMD_MASK, next, E820_RESERVED_KERN)) + set_pmd(pmd, __pmd(0)); + continue; + } if (pmd_val(*pmd)) { if (!pmd_large(*pmd)) { @@ -497,13 +496,12 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end, pmd_t *pmd; pgprot_t prot = PAGE_KERNEL; - if (addr >= end) - break; - next = (addr & PUD_MASK) + PUD_SIZE; - - if (!after_bootmem && !e820_any_mapped(addr, next, 0)) { - set_pud(pud, __pud(0)); + if (addr >= end) { + if (!after_bootmem && + !e820_any_mapped(addr & PUD_MASK, next, E820_RAM) && + !e820_any_mapped(addr & PUD_MASK, next, E820_RESERVED_KERN)) + set_pud(pud, __pud(0)); continue; } From f763ad1d3870abb811ec7520b4c1adc56471a3a4 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 16 Nov 2012 19:38:57 -0800 Subject: [PATCH 028/105] x86, mm: Break down init_all_memory_mapping Will replace that with top-down page table initialization. New API need to take range: init_range_memory_mapping() Signed-off-by: Yinghai Lu Link: http://lkml.kernel.org/r/1353123563-3103-21-git-send-email-yinghai@kernel.org Signed-off-by: H. Peter Anvin --- arch/x86/mm/init.c | 41 +++++++++++++++++++---------------------- 1 file changed, 19 insertions(+), 22 deletions(-) diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index da591ebc8d12..c688ea3887f2 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -398,40 +398,30 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, * Depending on the alignment of E820 ranges, this may possibly result in using * smaller size (i.e. 4K instead of 2M or 1G) page tables. 
*/ -static void __init init_all_memory_mapping(void) +static void __init init_range_memory_mapping(unsigned long range_start, + unsigned long range_end) { unsigned long start_pfn, end_pfn; int i; - /* the ISA range is always mapped regardless of memory holes */ - init_memory_mapping(0, ISA_END_ADDRESS); - for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, NULL) { u64 start = (u64)start_pfn << PAGE_SHIFT; u64 end = (u64)end_pfn << PAGE_SHIFT; - if (end <= ISA_END_ADDRESS) + if (end <= range_start) continue; - if (start < ISA_END_ADDRESS) - start = ISA_END_ADDRESS; -#ifdef CONFIG_X86_32 - /* on 32 bit, we only map up to max_low_pfn */ - if ((start >> PAGE_SHIFT) >= max_low_pfn) + if (start < range_start) + start = range_start; + + if (start >= range_end) continue; - if ((end >> PAGE_SHIFT) > max_low_pfn) - end = max_low_pfn << PAGE_SHIFT; -#endif + if (end > range_end) + end = range_end; + init_memory_mapping(start, end); } - -#ifdef CONFIG_X86_64 - if (max_pfn > max_low_pfn) { - /* can we preseve max_low_pfn ?*/ - max_low_pfn = max_pfn; - } -#endif } void __init init_mem_mapping(void) @@ -461,8 +451,15 @@ void __init init_mem_mapping(void) (pgt_buf_top << PAGE_SHIFT) - 1); max_pfn_mapped = 0; /* will get exact value next */ - init_all_memory_mapping(); - + /* the ISA range is always mapped regardless of memory holes */ + init_memory_mapping(0, ISA_END_ADDRESS); + init_range_memory_mapping(ISA_END_ADDRESS, end); +#ifdef CONFIG_X86_64 + if (max_pfn > max_low_pfn) { + /* can we preseve max_low_pfn ?*/ + max_low_pfn = max_pfn; + } +#endif /* * Reserve the kernel pagetable pages we used (pgt_buf_start - * pgt_buf_end) and free the other ones (pgt_buf_end - pgt_buf_top) From 8d57470d8f859635deffe3919d7d4867b488b85a Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 16 Nov 2012 19:38:58 -0800 Subject: [PATCH 029/105] x86, mm: setup page table in top-down Get pgt_buf early from BRK, and use it to map PMD_SIZE from top at first. Then use mapped pages to map more ranges below, and keep looping until all pages get mapped. alloc_low_page will use page from BRK at first, after that buffer is used up, will use memblock to find and reserve pages for page table usage. Introduce min_pfn_mapped to make sure find new pages from mapped ranges, that will be updated when lower pages get mapped. Also add step_size to make sure that don't try to map too big range with limited mapped pages initially, and increase the step_size when we have more mapped pages on hand. We don't need to call pagetable_reserve anymore, reserve work is done in alloc_low_page() directly. At last we can get rid of calculation and find early pgt related code. -v2: update to after fix_xen change, also use MACRO for initial pgt_buf size and add comments with it. -v3: skip big reserved range in memblock.reserved near end. -v4: don't need fix_xen change now. -v5: add changelog about moving about reserving pagetable to alloc_low_page. Suggested-by: "H. Peter Anvin" Signed-off-by: Yinghai Lu Link: http://lkml.kernel.org/r/1353123563-3103-22-git-send-email-yinghai@kernel.org Signed-off-by: H. 
Peter Anvin --- arch/x86/include/asm/page_types.h | 1 + arch/x86/include/asm/pgtable.h | 1 + arch/x86/kernel/setup.c | 3 + arch/x86/mm/init.c | 210 +++++++++--------------------- arch/x86/mm/init_32.c | 17 ++- arch/x86/mm/init_64.c | 17 ++- 6 files changed, 94 insertions(+), 155 deletions(-) diff --git a/arch/x86/include/asm/page_types.h b/arch/x86/include/asm/page_types.h index 54c97879195e..9f6f3e66e84d 100644 --- a/arch/x86/include/asm/page_types.h +++ b/arch/x86/include/asm/page_types.h @@ -45,6 +45,7 @@ extern int devmem_is_allowed(unsigned long pagenr); extern unsigned long max_low_pfn_mapped; extern unsigned long max_pfn_mapped; +extern unsigned long min_pfn_mapped; static inline phys_addr_t get_max_mapped(void) { diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index dd1a88832d25..6991a3e1bf81 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -603,6 +603,7 @@ static inline int pgd_none(pgd_t pgd) extern int direct_gbpages; void init_mem_mapping(void); +void early_alloc_pgt_buf(void); /* local pte updates need not use xchg for locking */ static inline pte_t native_local_ptep_get_and_clear(pte_t *ptep) diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 94f922a73c54..f7634092931b 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -124,6 +124,7 @@ */ unsigned long max_low_pfn_mapped; unsigned long max_pfn_mapped; +unsigned long min_pfn_mapped; #ifdef CONFIG_DMI RESERVE_BRK(dmi_alloc, 65536); @@ -900,6 +901,8 @@ void __init setup_arch(char **cmdline_p) reserve_ibft_region(); + early_alloc_pgt_buf(); + /* * Need to conclude brk, before memblock_x86_fill() * it could use memblock_find_in_range, could overlap with diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index c688ea3887f2..2393d0099e7f 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -21,6 +21,21 @@ unsigned long __initdata pgt_buf_start; unsigned long __meminitdata pgt_buf_end; unsigned long __meminitdata pgt_buf_top; +/* need 4 4k for initial PMD_SIZE, 4k for 0-ISA_END_ADDRESS */ +#define INIT_PGT_BUF_SIZE (5 * PAGE_SIZE) +RESERVE_BRK(early_pgt_alloc, INIT_PGT_BUF_SIZE); +void __init early_alloc_pgt_buf(void) +{ + unsigned long tables = INIT_PGT_BUF_SIZE; + phys_addr_t base; + + base = __pa(extend_brk(tables, PAGE_SIZE)); + + pgt_buf_start = base >> PAGE_SHIFT; + pgt_buf_end = pgt_buf_start; + pgt_buf_top = pgt_buf_start + (tables >> PAGE_SHIFT); +} + int after_bootmem; int direct_gbpages @@ -228,105 +243,6 @@ static int __meminit split_mem_range(struct map_range *mr, int nr_range, return nr_range; } -/* - * First calculate space needed for kernel direct mapping page tables to cover - * mr[0].start to mr[nr_range - 1].end, while accounting for possible 2M and 1GB - * pages. Then find enough contiguous space for those page tables. 
- */ -static unsigned long __init calculate_table_space_size(unsigned long start, unsigned long end) -{ - int i; - unsigned long puds = 0, pmds = 0, ptes = 0, tables; - struct map_range mr[NR_RANGE_MR]; - int nr_range; - - memset(mr, 0, sizeof(mr)); - nr_range = 0; - nr_range = split_mem_range(mr, nr_range, start, end); - - for (i = 0; i < nr_range; i++) { - unsigned long range, extra; - - range = mr[i].end - mr[i].start; - puds += (range + PUD_SIZE - 1) >> PUD_SHIFT; - - if (mr[i].page_size_mask & (1 << PG_LEVEL_1G)) { - extra = range - ((range >> PUD_SHIFT) << PUD_SHIFT); - pmds += (extra + PMD_SIZE - 1) >> PMD_SHIFT; - } else { - pmds += (range + PMD_SIZE - 1) >> PMD_SHIFT; - } - - if (mr[i].page_size_mask & (1 << PG_LEVEL_2M)) { - extra = range - ((range >> PMD_SHIFT) << PMD_SHIFT); -#ifdef CONFIG_X86_32 - extra += PMD_SIZE; -#endif - ptes += (extra + PAGE_SIZE - 1) >> PAGE_SHIFT; - } else { - ptes += (range + PAGE_SIZE - 1) >> PAGE_SHIFT; - } - } - - tables = roundup(puds * sizeof(pud_t), PAGE_SIZE); - tables += roundup(pmds * sizeof(pmd_t), PAGE_SIZE); - tables += roundup(ptes * sizeof(pte_t), PAGE_SIZE); - -#ifdef CONFIG_X86_32 - /* for fixmap */ - tables += roundup(__end_of_fixed_addresses * sizeof(pte_t), PAGE_SIZE); -#endif - - return tables; -} - -static unsigned long __init calculate_all_table_space_size(void) -{ - unsigned long start_pfn, end_pfn; - unsigned long tables; - int i; - - /* the ISA range is always mapped regardless of memory holes */ - tables = calculate_table_space_size(0, ISA_END_ADDRESS); - - for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, NULL) { - u64 start = start_pfn << PAGE_SHIFT; - u64 end = end_pfn << PAGE_SHIFT; - - if (end <= ISA_END_ADDRESS) - continue; - - if (start < ISA_END_ADDRESS) - start = ISA_END_ADDRESS; -#ifdef CONFIG_X86_32 - /* on 32 bit, we only map up to max_low_pfn */ - if ((start >> PAGE_SHIFT) >= max_low_pfn) - continue; - - if ((end >> PAGE_SHIFT) > max_low_pfn) - end = max_low_pfn << PAGE_SHIFT; -#endif - tables += calculate_table_space_size(start, end); - } - - return tables; -} - -static void __init find_early_table_space(unsigned long start, - unsigned long good_end, - unsigned long tables) -{ - phys_addr_t base; - - base = memblock_find_in_range(start, good_end, tables, PAGE_SIZE); - if (!base) - panic("Cannot find space for the kernel page tables"); - - pgt_buf_start = base >> PAGE_SHIFT; - pgt_buf_end = pgt_buf_start; - pgt_buf_top = pgt_buf_start + (tables >> PAGE_SHIFT); -} - static struct range pfn_mapped[E820_X_MAX]; static int nr_pfn_mapped; @@ -391,17 +307,14 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, } /* - * Iterate through E820 memory map and create direct mappings for only E820_RAM - * regions. We cannot simply create direct mappings for all pfns from - * [0 to max_low_pfn) and [4GB to max_pfn) because of possible memory holes in - * high addresses that cannot be marked as UC by fixed/variable range MTRRs. - * Depending on the alignment of E820 ranges, this may possibly result in using - * smaller size (i.e. 4K instead of 2M or 1G) page tables. + * would have hole in the middle or ends, and only ram parts will be mapped. 
*/ -static void __init init_range_memory_mapping(unsigned long range_start, +static unsigned long __init init_range_memory_mapping( + unsigned long range_start, unsigned long range_end) { unsigned long start_pfn, end_pfn; + unsigned long mapped_ram_size = 0; int i; for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, NULL) { @@ -421,71 +334,70 @@ static void __init init_range_memory_mapping(unsigned long range_start, end = range_end; init_memory_mapping(start, end); + + mapped_ram_size += end - start; } + + return mapped_ram_size; } +/* (PUD_SHIFT-PMD_SHIFT)/2 */ +#define STEP_SIZE_SHIFT 5 void __init init_mem_mapping(void) { - unsigned long tables, good_end, end; + unsigned long end, real_end, start, last_start; + unsigned long step_size; + unsigned long addr; + unsigned long mapped_ram_size = 0; + unsigned long new_mapped_ram_size; probe_page_size_mask(); - /* - * Find space for the kernel direct mapping tables. - * - * Later we should allocate these tables in the local node of the - * memory mapped. Unfortunately this is done currently before the - * nodes are discovered. - */ #ifdef CONFIG_X86_64 end = max_pfn << PAGE_SHIFT; - good_end = end; #else end = max_low_pfn << PAGE_SHIFT; - good_end = max_pfn_mapped << PAGE_SHIFT; #endif - tables = calculate_all_table_space_size(); - find_early_table_space(0, good_end, tables); - printk(KERN_DEBUG "kernel direct mapping tables up to %#lx @ [mem %#010lx-%#010lx] prealloc\n", - end - 1, pgt_buf_start << PAGE_SHIFT, - (pgt_buf_top << PAGE_SHIFT) - 1); - max_pfn_mapped = 0; /* will get exact value next */ /* the ISA range is always mapped regardless of memory holes */ init_memory_mapping(0, ISA_END_ADDRESS); - init_range_memory_mapping(ISA_END_ADDRESS, end); + + /* xen has big range in reserved near end of ram, skip it at first */ + addr = memblock_find_in_range(ISA_END_ADDRESS, end, PMD_SIZE, + PAGE_SIZE); + real_end = addr + PMD_SIZE; + + /* step_size need to be small so pgt_buf from BRK could cover it */ + step_size = PMD_SIZE; + max_pfn_mapped = 0; /* will get exact value next */ + min_pfn_mapped = real_end >> PAGE_SHIFT; + last_start = start = real_end; + while (last_start > ISA_END_ADDRESS) { + if (last_start > step_size) { + start = round_down(last_start - 1, step_size); + if (start < ISA_END_ADDRESS) + start = ISA_END_ADDRESS; + } else + start = ISA_END_ADDRESS; + new_mapped_ram_size = init_range_memory_mapping(start, + last_start); + last_start = start; + min_pfn_mapped = last_start >> PAGE_SHIFT; + /* only increase step_size after big range get mapped */ + if (new_mapped_ram_size > mapped_ram_size) + step_size <<= STEP_SIZE_SHIFT; + mapped_ram_size += new_mapped_ram_size; + } + + if (real_end < end) + init_range_memory_mapping(real_end, end); + #ifdef CONFIG_X86_64 if (max_pfn > max_low_pfn) { /* can we preseve max_low_pfn ?*/ max_low_pfn = max_pfn; } #endif - /* - * Reserve the kernel pagetable pages we used (pgt_buf_start - - * pgt_buf_end) and free the other ones (pgt_buf_end - pgt_buf_top) - * so that they can be reused for other purposes. - * - * On native it just means calling memblock_reserve, on Xen it also - * means marking RW the pagetable pages that we allocated before - * but that haven't been used. - * - * In fact on xen we mark RO the whole range pgt_buf_start - - * pgt_buf_top, because we have to make sure that when - * init_memory_mapping reaches the pagetable pages area, it maps - * RO all the pagetable pages, including the ones that are beyond - * pgt_buf_end at that time. 
- */ - if (pgt_buf_end > pgt_buf_start) { - printk(KERN_DEBUG "kernel direct mapping tables up to %#lx @ [mem %#010lx-%#010lx] final\n", - end - 1, pgt_buf_start << PAGE_SHIFT, - (pgt_buf_end << PAGE_SHIFT) - 1); - x86_init.mapping.pagetable_reserve(PFN_PHYS(pgt_buf_start), - PFN_PHYS(pgt_buf_end)); - } - - /* stop the wrong using */ - pgt_buf_top = 0; - early_memtest(0, max_pfn_mapped << PAGE_SHIFT); } diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 27f7fc69cf8a..7bb11064a9e1 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -61,11 +61,22 @@ bool __read_mostly __vmalloc_start_set = false; static __init void *alloc_low_page(void) { - unsigned long pfn = pgt_buf_end++; + unsigned long pfn; void *adr; - if (pfn >= pgt_buf_top) - panic("alloc_low_page: ran out of memory"); + if ((pgt_buf_end + 1) >= pgt_buf_top) { + unsigned long ret; + if (min_pfn_mapped >= max_pfn_mapped) + panic("alloc_low_page: ran out of memory"); + ret = memblock_find_in_range(min_pfn_mapped << PAGE_SHIFT, + max_pfn_mapped << PAGE_SHIFT, + PAGE_SIZE, PAGE_SIZE); + if (!ret) + panic("alloc_low_page: can not alloc memory"); + memblock_reserve(ret, PAGE_SIZE); + pfn = ret >> PAGE_SHIFT; + } else + pfn = pgt_buf_end++; adr = __va(pfn * PAGE_SIZE); clear_page(adr); diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index fa28e3e29741..eefaea637b3d 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -316,7 +316,7 @@ void __init cleanup_highmap(void) static __ref void *alloc_low_page(unsigned long *phys) { - unsigned long pfn = pgt_buf_end++; + unsigned long pfn; void *adr; if (after_bootmem) { @@ -326,8 +326,19 @@ static __ref void *alloc_low_page(unsigned long *phys) return adr; } - if (pfn >= pgt_buf_top) - panic("alloc_low_page: ran out of memory"); + if ((pgt_buf_end + 1) >= pgt_buf_top) { + unsigned long ret; + if (min_pfn_mapped >= max_pfn_mapped) + panic("alloc_low_page: ran out of memory"); + ret = memblock_find_in_range(min_pfn_mapped << PAGE_SHIFT, + max_pfn_mapped << PAGE_SHIFT, + PAGE_SIZE, PAGE_SIZE); + if (!ret) + panic("alloc_low_page: can not alloc memory"); + memblock_reserve(ret, PAGE_SIZE); + pfn = ret >> PAGE_SHIFT; + } else + pfn = pgt_buf_end++; adr = early_memremap(pfn * PAGE_SIZE, PAGE_SIZE); clear_page(adr); From 973dc4f3fad5890bc7b694148ad4c825b9af6dc1 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 16 Nov 2012 19:38:59 -0800 Subject: [PATCH 030/105] x86, mm: Remove early_memremap workaround for page table accessing on 64bit We try to put page table high to make room for kdump, and at that time those ranges are not mapped yet, and have to use ioremap to access it. Now after patch that pre-map page table top down. x86, mm: setup page table in top-down We do not need that workaround anymore. Just use __va to return directly mapping address. Signed-off-by: Yinghai Lu Link: http://lkml.kernel.org/r/1353123563-3103-23-git-send-email-yinghai@kernel.org Acked-by: Stefano Stabellini Signed-off-by: H. 
Peter Anvin --- arch/x86/mm/init_64.c | 38 ++++---------------------------------- 1 file changed, 4 insertions(+), 34 deletions(-) diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index eefaea637b3d..5ee924237689 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -340,36 +340,12 @@ static __ref void *alloc_low_page(unsigned long *phys) } else pfn = pgt_buf_end++; - adr = early_memremap(pfn * PAGE_SIZE, PAGE_SIZE); + adr = __va(pfn * PAGE_SIZE); clear_page(adr); *phys = pfn * PAGE_SIZE; return adr; } -static __ref void *map_low_page(void *virt) -{ - void *adr; - unsigned long phys, left; - - if (after_bootmem) - return virt; - - phys = __pa(virt); - left = phys & (PAGE_SIZE - 1); - adr = early_memremap(phys & PAGE_MASK, PAGE_SIZE); - adr = (void *)(((unsigned long)adr) | left); - - return adr; -} - -static __ref void unmap_low_page(void *adr) -{ - if (after_bootmem) - return; - - early_iounmap((void *)((unsigned long)adr & PAGE_MASK), PAGE_SIZE); -} - static unsigned long __meminit phys_pte_init(pte_t *pte_page, unsigned long addr, unsigned long end, pgprot_t prot) @@ -442,10 +418,9 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end, if (pmd_val(*pmd)) { if (!pmd_large(*pmd)) { spin_lock(&init_mm.page_table_lock); - pte = map_low_page((pte_t *)pmd_page_vaddr(*pmd)); + pte = (pte_t *)pmd_page_vaddr(*pmd); last_map_addr = phys_pte_init(pte, address, end, prot); - unmap_low_page(pte); spin_unlock(&init_mm.page_table_lock); continue; } @@ -483,7 +458,6 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end, pte = alloc_low_page(&pte_phys); last_map_addr = phys_pte_init(pte, address, end, new_prot); - unmap_low_page(pte); spin_lock(&init_mm.page_table_lock); pmd_populate_kernel(&init_mm, pmd, __va(pte_phys)); @@ -518,10 +492,9 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end, if (pud_val(*pud)) { if (!pud_large(*pud)) { - pmd = map_low_page(pmd_offset(pud, 0)); + pmd = pmd_offset(pud, 0); last_map_addr = phys_pmd_init(pmd, addr, end, page_size_mask, prot); - unmap_low_page(pmd); __flush_tlb_all(); continue; } @@ -560,7 +533,6 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end, pmd = alloc_low_page(&pmd_phys); last_map_addr = phys_pmd_init(pmd, addr, end, page_size_mask, prot); - unmap_low_page(pmd); spin_lock(&init_mm.page_table_lock); pud_populate(&init_mm, pud, __va(pmd_phys)); @@ -596,17 +568,15 @@ kernel_physical_mapping_init(unsigned long start, next = end; if (pgd_val(*pgd)) { - pud = map_low_page((pud_t *)pgd_page_vaddr(*pgd)); + pud = (pud_t *)pgd_page_vaddr(*pgd); last_map_addr = phys_pud_init(pud, __pa(start), __pa(end), page_size_mask); - unmap_low_page(pud); continue; } pud = alloc_low_page(&pud_phys); last_map_addr = phys_pud_init(pud, __pa(start), __pa(next), page_size_mask); - unmap_low_page(pud); spin_lock(&init_mm.page_table_lock); pgd_populate(&init_mm, pgd, __va(pud_phys)); From 868bf4d6b94c980d3ad87f892a5e528b8ee2c320 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 16 Nov 2012 19:39:00 -0800 Subject: [PATCH 031/105] x86, mm: Remove parameter in alloc_low_page for 64bit Now all page table buf are pre-mapped, and could use virtual address directly. So don't need to remember physical address anymore. Remove that phys pointer in alloc_low_page(), and that will allow us to merge alloc_low_page between 64bit and 32bit. 
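[ Editor's note: the phys out-parameter can go away because every page handed out by alloc_low_page() is now in the direct mapping, so its physical address is recoverable with __pa() whenever a caller needs it. A minimal user-space sketch of that round trip, using an illustrative PAGE_OFFSET rather than the kernel's real constant and assuming a 64-bit host: ]

    #include <stdio.h>
    #include <stdint.h>

    #define PAGE_SHIFT  12
    #define PAGE_OFFSET 0xffff880000000000UL  /* illustrative value only */
    /* simplified stand-ins for the kernel's direct-mapping helpers */
    #define __va(x) ((void *)((uint64_t)(x) + PAGE_OFFSET))
    #define __pa(x) ((uint64_t)(uintptr_t)(x) - PAGE_OFFSET)

    int main(void)
    {
        uint64_t pfn = 0x1234;

        void *adr = __va(pfn << PAGE_SHIFT);  /* phys -> virt, as alloc_low_page() now does */
        uint64_t phys = __pa(adr);            /* virt -> phys, rederived on demand */

        printf("pfn recovered: %#llx\n", (unsigned long long)(phys >> PAGE_SHIFT));
        return 0;
    }

[ Since the mapping is a fixed offset, carrying the physical address through an extra pointer argument adds nothing; dropping it is what lets the 32-bit and 64-bit versions converge. ]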
Signed-off-by: Yinghai Lu Link: http://lkml.kernel.org/r/1353123563-3103-24-git-send-email-yinghai@kernel.org Acked-by: Stefano Stabellini Signed-off-by: H. Peter Anvin --- arch/x86/mm/init_64.c | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 5ee924237689..19608203fa5d 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -314,14 +314,13 @@ void __init cleanup_highmap(void) } } -static __ref void *alloc_low_page(unsigned long *phys) +static __ref void *alloc_low_page(void) { unsigned long pfn; void *adr; if (after_bootmem) { adr = (void *)get_zeroed_page(GFP_ATOMIC | __GFP_NOTRACK); - *phys = __pa(adr); return adr; } @@ -342,7 +341,6 @@ static __ref void *alloc_low_page(unsigned long *phys) adr = __va(pfn * PAGE_SIZE); clear_page(adr); - *phys = pfn * PAGE_SIZE; return adr; } @@ -401,7 +399,6 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end, int i = pmd_index(address); for (; i < PTRS_PER_PMD; i++, address = next) { - unsigned long pte_phys; pmd_t *pmd = pmd_page + pmd_index(address); pte_t *pte; pgprot_t new_prot = prot; @@ -456,11 +453,11 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end, continue; } - pte = alloc_low_page(&pte_phys); + pte = alloc_low_page(); last_map_addr = phys_pte_init(pte, address, end, new_prot); spin_lock(&init_mm.page_table_lock); - pmd_populate_kernel(&init_mm, pmd, __va(pte_phys)); + pmd_populate_kernel(&init_mm, pmd, pte); spin_unlock(&init_mm.page_table_lock); } update_page_count(PG_LEVEL_2M, pages); @@ -476,7 +473,6 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end, int i = pud_index(addr); for (; i < PTRS_PER_PUD; i++, addr = next) { - unsigned long pmd_phys; pud_t *pud = pud_page + pud_index(addr); pmd_t *pmd; pgprot_t prot = PAGE_KERNEL; @@ -530,12 +526,12 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end, continue; } - pmd = alloc_low_page(&pmd_phys); + pmd = alloc_low_page(); last_map_addr = phys_pmd_init(pmd, addr, end, page_size_mask, prot); spin_lock(&init_mm.page_table_lock); - pud_populate(&init_mm, pud, __va(pmd_phys)); + pud_populate(&init_mm, pud, pmd); spin_unlock(&init_mm.page_table_lock); } __flush_tlb_all(); @@ -560,7 +556,6 @@ kernel_physical_mapping_init(unsigned long start, for (; start < end; start = next) { pgd_t *pgd = pgd_offset_k(start); - unsigned long pud_phys; pud_t *pud; next = (start + PGDIR_SIZE) & PGDIR_MASK; @@ -574,12 +569,12 @@ kernel_physical_mapping_init(unsigned long start, continue; } - pud = alloc_low_page(&pud_phys); + pud = alloc_low_page(); last_map_addr = phys_pud_init(pud, __pa(start), __pa(next), page_size_mask); spin_lock(&init_mm.page_table_lock); - pgd_populate(&init_mm, pgd, __va(pud_phys)); + pgd_populate(&init_mm, pgd, pud); spin_unlock(&init_mm.page_table_lock); pgd_changed = true; } From 5c51bdbe4c74dce7996d0bbfa39974775cc3f13c Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 16 Nov 2012 19:39:01 -0800 Subject: [PATCH 032/105] x86, mm: Merge alloc_low_page between 64bit and 32bit They are almost same except 64 bit need to handle after_bootmem case. Add mm_internal.h to make that alloc_low_page() only to be accessible from arch/x86/mm/init*.c Signed-off-by: Yinghai Lu Link: http://lkml.kernel.org/r/1353123563-3103-25-git-send-email-yinghai@kernel.org Signed-off-by: H. 
Peter Anvin --- arch/x86/mm/init.c | 34 ++++++++++++++++++++++++++++++++++ arch/x86/mm/init_32.c | 26 ++------------------------ arch/x86/mm/init_64.c | 32 ++------------------------------ arch/x86/mm/mm_internal.h | 6 ++++++ 4 files changed, 44 insertions(+), 54 deletions(-) create mode 100644 arch/x86/mm/mm_internal.h diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 2393d0099e7f..848189229b2d 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -17,10 +17,44 @@ #include #include /* for MAX_DMA_PFN */ +#include "mm_internal.h" + unsigned long __initdata pgt_buf_start; unsigned long __meminitdata pgt_buf_end; unsigned long __meminitdata pgt_buf_top; +__ref void *alloc_low_page(void) +{ + unsigned long pfn; + void *adr; + +#ifdef CONFIG_X86_64 + if (after_bootmem) { + adr = (void *)get_zeroed_page(GFP_ATOMIC | __GFP_NOTRACK); + + return adr; + } +#endif + + if ((pgt_buf_end + 1) >= pgt_buf_top) { + unsigned long ret; + if (min_pfn_mapped >= max_pfn_mapped) + panic("alloc_low_page: ran out of memory"); + ret = memblock_find_in_range(min_pfn_mapped << PAGE_SHIFT, + max_pfn_mapped << PAGE_SHIFT, + PAGE_SIZE, PAGE_SIZE); + if (!ret) + panic("alloc_low_page: can not alloc memory"); + memblock_reserve(ret, PAGE_SIZE); + pfn = ret >> PAGE_SHIFT; + } else + pfn = pgt_buf_end++; + + adr = __va(pfn * PAGE_SIZE); + clear_page(adr); + return adr; +} + /* need 4 4k for initial PMD_SIZE, 4k for 0-ISA_END_ADDRESS */ #define INIT_PGT_BUF_SIZE (5 * PAGE_SIZE) RESERVE_BRK(early_pgt_alloc, INIT_PGT_BUF_SIZE); diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 7bb11064a9e1..a7f2df1cdcfd 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -53,36 +53,14 @@ #include #include +#include "mm_internal.h" + unsigned long highstart_pfn, highend_pfn; static noinline int do_test_wp_bit(void); bool __read_mostly __vmalloc_start_set = false; -static __init void *alloc_low_page(void) -{ - unsigned long pfn; - void *adr; - - if ((pgt_buf_end + 1) >= pgt_buf_top) { - unsigned long ret; - if (min_pfn_mapped >= max_pfn_mapped) - panic("alloc_low_page: ran out of memory"); - ret = memblock_find_in_range(min_pfn_mapped << PAGE_SHIFT, - max_pfn_mapped << PAGE_SHIFT, - PAGE_SIZE, PAGE_SIZE); - if (!ret) - panic("alloc_low_page: can not alloc memory"); - memblock_reserve(ret, PAGE_SIZE); - pfn = ret >> PAGE_SHIFT; - } else - pfn = pgt_buf_end++; - - adr = __va(pfn * PAGE_SIZE); - clear_page(adr); - return adr; -} - /* * Creates a middle page table and puts a pointer to it in the * given global directory entry. 
This only returns the gd entry diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 19608203fa5d..1d53defc99c9 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -54,6 +54,8 @@ #include #include +#include "mm_internal.h" + static int __init parse_direct_gbpages_off(char *arg) { direct_gbpages = 0; @@ -314,36 +316,6 @@ void __init cleanup_highmap(void) } } -static __ref void *alloc_low_page(void) -{ - unsigned long pfn; - void *adr; - - if (after_bootmem) { - adr = (void *)get_zeroed_page(GFP_ATOMIC | __GFP_NOTRACK); - - return adr; - } - - if ((pgt_buf_end + 1) >= pgt_buf_top) { - unsigned long ret; - if (min_pfn_mapped >= max_pfn_mapped) - panic("alloc_low_page: ran out of memory"); - ret = memblock_find_in_range(min_pfn_mapped << PAGE_SHIFT, - max_pfn_mapped << PAGE_SHIFT, - PAGE_SIZE, PAGE_SIZE); - if (!ret) - panic("alloc_low_page: can not alloc memory"); - memblock_reserve(ret, PAGE_SIZE); - pfn = ret >> PAGE_SHIFT; - } else - pfn = pgt_buf_end++; - - adr = __va(pfn * PAGE_SIZE); - clear_page(adr); - return adr; -} - static unsigned long __meminit phys_pte_init(pte_t *pte_page, unsigned long addr, unsigned long end, pgprot_t prot) diff --git a/arch/x86/mm/mm_internal.h b/arch/x86/mm/mm_internal.h new file mode 100644 index 000000000000..b3f993a2555e --- /dev/null +++ b/arch/x86/mm/mm_internal.h @@ -0,0 +1,6 @@ +#ifndef __X86_MM_INTERNAL_H +#define __X86_MM_INTERNAL_H + +void *alloc_low_page(void); + +#endif /* __X86_MM_INTERNAL_H */ From 9985b4c6fa7d660f685918a58282275e9e35d8e0 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 16 Nov 2012 19:39:02 -0800 Subject: [PATCH 033/105] x86, mm: Move min_pfn_mapped back to mm/init.c Also change it to static. Signed-off-by: Yinghai Lu Link: http://lkml.kernel.org/r/1353123563-3103-26-git-send-email-yinghai@kernel.org Signed-off-by: H. 
Peter Anvin --- arch/x86/include/asm/page_types.h | 1 - arch/x86/kernel/setup.c | 1 - arch/x86/mm/init.c | 2 ++ 3 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/page_types.h b/arch/x86/include/asm/page_types.h index 9f6f3e66e84d..54c97879195e 100644 --- a/arch/x86/include/asm/page_types.h +++ b/arch/x86/include/asm/page_types.h @@ -45,7 +45,6 @@ extern int devmem_is_allowed(unsigned long pagenr); extern unsigned long max_low_pfn_mapped; extern unsigned long max_pfn_mapped; -extern unsigned long min_pfn_mapped; static inline phys_addr_t get_max_mapped(void) { diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index f7634092931b..20151941cce8 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -124,7 +124,6 @@ */ unsigned long max_low_pfn_mapped; unsigned long max_pfn_mapped; -unsigned long min_pfn_mapped; #ifdef CONFIG_DMI RESERVE_BRK(dmi_alloc, 65536); diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 848189229b2d..6392bf9a3947 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -23,6 +23,8 @@ unsigned long __initdata pgt_buf_start; unsigned long __meminitdata pgt_buf_end; unsigned long __meminitdata pgt_buf_top; +static unsigned long min_pfn_mapped; + __ref void *alloc_low_page(void) { unsigned long pfn; From 6f80b68e9e515547edbacb0c37491730bf766db5 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 16 Nov 2012 19:39:03 -0800 Subject: [PATCH 034/105] x86, mm, Xen: Remove mapping_pagetable_reserve() Page table area are pre-mapped now after x86, mm: setup page table in top-down x86, mm: Remove early_memremap workaround for page table accessing on 64bit mapping_pagetable_reserve is not used anymore, so remove it. Also remove operation in mask_rw_pte(), as modified allow_low_page always return pages that are already mapped, moreover xen_alloc_pte_init, xen_alloc_pmd_init, etc, will mark the page RO before hooking it into the pagetable automatically. -v2: add changelog about mask_rw_pte() from Stefano. Signed-off-by: Yinghai Lu Link: http://lkml.kernel.org/r/1353123563-3103-27-git-send-email-yinghai@kernel.org Cc: Stefano Stabellini Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/pgtable_types.h | 1 - arch/x86/include/asm/x86_init.h | 12 ------------ arch/x86/kernel/x86_init.c | 4 ---- arch/x86/mm/init.c | 4 ---- arch/x86/xen/mmu.c | 28 ---------------------------- 5 files changed, 49 deletions(-) diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index ec8a1fc9505d..79738f20aaf5 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -301,7 +301,6 @@ int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn, /* Install a pte for a particular vaddr in kernel space. */ void set_pte_vaddr(unsigned long vaddr, pte_t pte); -extern void native_pagetable_reserve(u64 start, u64 end); #ifdef CONFIG_X86_32 extern void native_pagetable_init(void); #else diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index 57693498519c..3b2ce8fc995a 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -68,17 +68,6 @@ struct x86_init_oem { void (*banner)(void); }; -/** - * struct x86_init_mapping - platform specific initial kernel pagetable setup - * @pagetable_reserve: reserve a range of addresses for kernel pagetable usage - * - * For more details on the purpose of this hook, look in - * init_memory_mapping and the commit that added it. 
- */ -struct x86_init_mapping { - void (*pagetable_reserve)(u64 start, u64 end); -}; - /** * struct x86_init_paging - platform specific paging functions * @pagetable_init: platform specific paging initialization call to setup @@ -136,7 +125,6 @@ struct x86_init_ops { struct x86_init_mpparse mpparse; struct x86_init_irqs irqs; struct x86_init_oem oem; - struct x86_init_mapping mapping; struct x86_init_paging paging; struct x86_init_timers timers; struct x86_init_iommu iommu; diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index 7a3d075a814a..50cf83ecd32e 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -62,10 +62,6 @@ struct x86_init_ops x86_init __initdata = { .banner = default_banner, }, - .mapping = { - .pagetable_reserve = native_pagetable_reserve, - }, - .paging = { .pagetable_init = native_pagetable_init, }, diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 6392bf9a3947..21173fcdb4a1 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -112,10 +112,6 @@ static void __init probe_page_size_mask(void) __supported_pte_mask |= _PAGE_GLOBAL; } } -void __init native_pagetable_reserve(u64 start, u64 end) -{ - memblock_reserve(start, end - start); -} #ifdef CONFIG_X86_32 #define NR_RANGE_MR 3 diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index dcf5f2dd91ec..bbb883f58bc4 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -1178,20 +1178,6 @@ static void xen_exit_mmap(struct mm_struct *mm) static void xen_post_allocator_init(void); -static __init void xen_mapping_pagetable_reserve(u64 start, u64 end) -{ - /* reserve the range used */ - native_pagetable_reserve(start, end); - - /* set as RW the rest */ - printk(KERN_DEBUG "xen: setting RW the range %llx - %llx\n", end, - PFN_PHYS(pgt_buf_top)); - while (end < PFN_PHYS(pgt_buf_top)) { - make_lowmem_page_readwrite(__va(end)); - end += PAGE_SIZE; - } -} - #ifdef CONFIG_X86_64 static void __init xen_cleanhighmap(unsigned long vaddr, unsigned long vaddr_end) @@ -1503,19 +1489,6 @@ static pte_t __init mask_rw_pte(pte_t *ptep, pte_t pte) #else /* CONFIG_X86_64 */ static pte_t __init mask_rw_pte(pte_t *ptep, pte_t pte) { - unsigned long pfn = pte_pfn(pte); - - /* - * If the new pfn is within the range of the newly allocated - * kernel pagetable, and it isn't being mapped into an - * early_ioremap fixmap slot as a freshly allocated page, make sure - * it is RO. - */ - if (((!is_early_ioremap_ptep(ptep) && - pfn >= pgt_buf_start && pfn < pgt_buf_top)) || - (is_early_ioremap_ptep(ptep) && pfn != (pgt_buf_end - 1))) - pte = pte_wrprotect(pte); - return pte; } #endif /* CONFIG_X86_64 */ @@ -2197,7 +2170,6 @@ static const struct pv_mmu_ops xen_mmu_ops __initconst = { void __init xen_init_mmu_ops(void) { - x86_init.mapping.pagetable_reserve = xen_mapping_pagetable_reserve; x86_init.paging.pagetable_init = xen_pagetable_init; pv_mmu_ops = xen_mmu_ops; From 22c8ca2ac256bb681be791858b35502b5d37e73b Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 16 Nov 2012 19:39:04 -0800 Subject: [PATCH 035/105] x86, mm: Add alloc_low_pages(num) 32bit kmap mapping needs pages to be used for low to high. At this point those pages are still from pgt_buf_* from BRK, so it is ok now. But we want to move early_ioremap_page_table_range_init() out of init_memory_mapping() and only call it one time later, that will make page_table_range_init/page_table_kmap_check/alloc_low_page to use memblock to get page. memblock allocation for pages are from high to low. 
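[ Editor's note: the ordering problem can be sketched outside the kernel: reserve one contiguous block up front and hand pages out sequentially, so consumers see strictly ascending addresses even though the backing allocator prefers high addresses. The names below are hypothetical stand-ins, not kernel code: ]

    #include <stdio.h>
    #include <stdlib.h>

    #define PAGE_SIZE 4096

    static char *block;            /* stands in for the one-shot memblock reservation */
    static unsigned int next_page;

    /* hand out pages low-to-high from the block, one per call */
    static void *alloc_low_page_sim(void)
    {
        return block + (next_page++) * PAGE_SIZE;
    }

    int main(void)
    {
        block = malloc(8 * PAGE_SIZE);
        if (!block)
            return 1;

        char *a = alloc_low_page_sim();
        char *b = alloc_low_page_sim();
        printf("ascending order: %s\n", b > a ? "yes" : "no");  /* always yes */

        free(block);
        return 0;
    }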
Without such a low-to-high guarantee we would get a panic from page_table_kmap_check(), which has a BUG_ON that checks the ordering. This patch adds alloc_low_pages() to make it possible to allocate several pages at once up front, and then hand them out one by one from low to high. Signed-off-by: Yinghai Lu Link: http://lkml.kernel.org/r/1353123563-3103-28-git-send-email-yinghai@kernel.org Cc: Andrew Morton Signed-off-by: H. Peter Anvin --- arch/x86/mm/init.c | 33 +++++++++++++++++++++------------ arch/x86/mm/mm_internal.h | 6 +++++- 2 files changed, 26 insertions(+), 13 deletions(-) diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 21173fcdb4a1..02cea14c6d0c 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -25,36 +25,45 @@ unsigned long __meminitdata pgt_buf_top; static unsigned long min_pfn_mapped; -__ref void *alloc_low_page(void) +__ref void *alloc_low_pages(unsigned int num) { unsigned long pfn; - void *adr; + int i; #ifdef CONFIG_X86_64 if (after_bootmem) { - adr = (void *)get_zeroed_page(GFP_ATOMIC | __GFP_NOTRACK); + unsigned int order; - return adr; + order = get_order((unsigned long)num << PAGE_SHIFT); + return (void *)__get_free_pages(GFP_ATOMIC | __GFP_NOTRACK | + __GFP_ZERO, order); } #endif - if ((pgt_buf_end + 1) >= pgt_buf_top) { + if ((pgt_buf_end + num) >= pgt_buf_top) { unsigned long ret; if (min_pfn_mapped >= max_pfn_mapped) panic("alloc_low_page: ran out of memory"); ret = memblock_find_in_range(min_pfn_mapped << PAGE_SHIFT, max_pfn_mapped << PAGE_SHIFT, - PAGE_SIZE, PAGE_SIZE); + PAGE_SIZE * num , PAGE_SIZE); if (!ret) panic("alloc_low_page: can not alloc memory"); - memblock_reserve(ret, PAGE_SIZE); + memblock_reserve(ret, PAGE_SIZE * num); pfn = ret >> PAGE_SHIFT; - } else - pfn = pgt_buf_end++; + } else { + pfn = pgt_buf_end; + pgt_buf_end += num; + } - adr = __va(pfn * PAGE_SIZE); - clear_page(adr); - return adr; + for (i = 0; i < num; i++) { + void *adr; + + adr = __va((pfn + i) << PAGE_SHIFT); + clear_page(adr); + } + + return __va(pfn << PAGE_SHIFT); } /* need 4 4k for initial PMD_SIZE, 4k for 0-ISA_END_ADDRESS */ diff --git a/arch/x86/mm/mm_internal.h b/arch/x86/mm/mm_internal.h index b3f993a2555e..7e3b88ee078a 100644 --- a/arch/x86/mm/mm_internal.h +++ b/arch/x86/mm/mm_internal.h @@ -1,6 +1,10 @@ #ifndef __X86_MM_INTERNAL_H #define __X86_MM_INTERNAL_H -void *alloc_low_page(void); +void *alloc_low_pages(unsigned int num); +static inline void *alloc_low_page(void) +{ + return alloc_low_pages(1); +} #endif /* __X86_MM_INTERNAL_H */ From ddd3509df8f8d4f1cf4784f559d702ce00dc8846 Mon Sep 17 00:00:00 2001 From: Stefano Stabellini Date: Fri, 16 Nov 2012 19:39:05 -0800 Subject: [PATCH 036/105] x86, mm: Add pointer about Xen mmu requirement for alloc_low_pages Add a link for more information: 279b706 x86,xen: introduce x86_init.mapping.pagetable_reserve -v2: updated per comments from hpa to include the commit name. Signed-off-by: Yinghai Lu Link: http://lkml.kernel.org/r/1353123563-3103-29-git-send-email-yinghai@kernel.org Signed-off-by: H. Peter Anvin --- arch/x86/mm/init.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 02cea14c6d0c..cb4f8ba70ecc 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -25,6 +25,15 @@ unsigned long __meminitdata pgt_buf_top; static unsigned long min_pfn_mapped; +/* + * Pages returned are already directly mapped. + * + * Changing that is likely to break Xen, see commit: + * + * 279b706 x86,xen: introduce x86_init.mapping.pagetable_reserve + * + * for detailed information.
+ */ __ref void *alloc_low_pages(unsigned int num) { unsigned long pfn; From 719272c45b821d38608fc333700bde1a89c56c59 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 16 Nov 2012 19:39:06 -0800 Subject: [PATCH 037/105] x86, mm: only call early_ioremap_page_table_range_init() once On 32bit, before patcheset that only set page table for ram, we only call that one time. Now, we are calling that during every init_memory_mapping if we have holes under max_low_pfn. We should only call it one time after all ranges under max_low_page get mapped just like we did before. Also that could avoid the risk to run out of pgt_buf in BRK. Need to update page_table_range_init() to count the pages for kmap page table at first, and use new added alloc_low_pages() to get pages in sequence. That will conform to the requirement that pages need to be in low to high order. Signed-off-by: Yinghai Lu Link: http://lkml.kernel.org/r/1353123563-3103-30-git-send-email-yinghai@kernel.org Signed-off-by: H. Peter Anvin --- arch/x86/mm/init.c | 13 +++++------- arch/x86/mm/init_32.c | 47 +++++++++++++++++++++++++++++++++++++------ 2 files changed, 46 insertions(+), 14 deletions(-) diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index cb4f8ba70ecc..bed4888c6f4f 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -343,14 +343,6 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, ret = kernel_physical_mapping_init(mr[i].start, mr[i].end, mr[i].page_size_mask); -#ifdef CONFIG_X86_32 - early_ioremap_page_table_range_init(); - - load_cr3(swapper_pg_dir); -#endif - - __flush_tlb_all(); - add_pfn_range_mapped(start >> PAGE_SHIFT, ret >> PAGE_SHIFT); return ret >> PAGE_SHIFT; @@ -447,7 +439,12 @@ void __init init_mem_mapping(void) /* can we preseve max_low_pfn ?*/ max_low_pfn = max_pfn; } +#else + early_ioremap_page_table_range_init(); + load_cr3(swapper_pg_dir); + __flush_tlb_all(); #endif + early_memtest(0, max_pfn_mapped << PAGE_SHIFT); } diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index a7f2df1cdcfd..0ae1ba8bc1b9 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -135,8 +135,39 @@ pte_t * __init populate_extra_pte(unsigned long vaddr) return one_page_table_init(pmd) + pte_idx; } +static unsigned long __init +page_table_range_init_count(unsigned long start, unsigned long end) +{ + unsigned long count = 0; +#ifdef CONFIG_HIGHMEM + int pmd_idx_kmap_begin = fix_to_virt(FIX_KMAP_END) >> PMD_SHIFT; + int pmd_idx_kmap_end = fix_to_virt(FIX_KMAP_BEGIN) >> PMD_SHIFT; + int pgd_idx, pmd_idx; + unsigned long vaddr; + + if (pmd_idx_kmap_begin == pmd_idx_kmap_end) + return 0; + + vaddr = start; + pgd_idx = pgd_index(vaddr); + + for ( ; (pgd_idx < PTRS_PER_PGD) && (vaddr != end); pgd_idx++) { + for (; (pmd_idx < PTRS_PER_PMD) && (vaddr != end); + pmd_idx++) { + if ((vaddr >> PMD_SHIFT) >= pmd_idx_kmap_begin && + (vaddr >> PMD_SHIFT) <= pmd_idx_kmap_end) + count++; + vaddr += PMD_SIZE; + } + pmd_idx = 0; + } +#endif + return count; +} + static pte_t *__init page_table_kmap_check(pte_t *pte, pmd_t *pmd, - unsigned long vaddr, pte_t *lastpte) + unsigned long vaddr, pte_t *lastpte, + void **adr) { #ifdef CONFIG_HIGHMEM /* @@ -150,16 +181,15 @@ static pte_t *__init page_table_kmap_check(pte_t *pte, pmd_t *pmd, if (pmd_idx_kmap_begin != pmd_idx_kmap_end && (vaddr >> PMD_SHIFT) >= pmd_idx_kmap_begin - && (vaddr >> PMD_SHIFT) <= pmd_idx_kmap_end - && ((__pa(pte) >> PAGE_SHIFT) < pgt_buf_start - || (__pa(pte) >> PAGE_SHIFT) >= pgt_buf_end)) { + && (vaddr >> PMD_SHIFT) <= pmd_idx_kmap_end) 
{ pte_t *newpte; int i; BUG_ON(after_bootmem); - newpte = alloc_low_page(); + newpte = *adr; for (i = 0; i < PTRS_PER_PTE; i++) set_pte(newpte + i, pte[i]); + *adr = (void *)(((unsigned long)(*adr)) + PAGE_SIZE); paravirt_alloc_pte(&init_mm, __pa(newpte) >> PAGE_SHIFT); set_pmd(pmd, __pmd(__pa(newpte)|_PAGE_TABLE)); @@ -193,6 +223,11 @@ page_table_range_init(unsigned long start, unsigned long end, pgd_t *pgd_base) pgd_t *pgd; pmd_t *pmd; pte_t *pte = NULL; + unsigned long count = page_table_range_init_count(start, end); + void *adr = NULL; + + if (count) + adr = alloc_low_pages(count); vaddr = start; pgd_idx = pgd_index(vaddr); @@ -205,7 +240,7 @@ page_table_range_init(unsigned long start, unsigned long end, pgd_t *pgd_base) for (; (pmd_idx < PTRS_PER_PMD) && (vaddr != end); pmd++, pmd_idx++) { pte = page_table_kmap_check(one_page_table_init(pmd), - pmd, vaddr, pte); + pmd, vaddr, pte, &adr); vaddr += PMD_SIZE; } From cf47065961b48727b4e47bc3e2e67f4996878437 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 16 Nov 2012 19:39:07 -0800 Subject: [PATCH 038/105] x86, mm: Move back pgt_buf_* to mm/init.c Also change them to static. Signed-off-by: Yinghai Lu Link: http://lkml.kernel.org/r/1353123563-3103-31-git-send-email-yinghai@kernel.org Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/init.h | 4 ---- arch/x86/mm/init.c | 6 +++--- 2 files changed, 3 insertions(+), 7 deletions(-) diff --git a/arch/x86/include/asm/init.h b/arch/x86/include/asm/init.h index 4f13998be59a..626ea8d31b6d 100644 --- a/arch/x86/include/asm/init.h +++ b/arch/x86/include/asm/init.h @@ -12,8 +12,4 @@ kernel_physical_mapping_init(unsigned long start, unsigned long end, unsigned long page_size_mask); -extern unsigned long __initdata pgt_buf_start; -extern unsigned long __meminitdata pgt_buf_end; -extern unsigned long __meminitdata pgt_buf_top; - #endif /* _ASM_X86_INIT_32_H */ diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index bed4888c6f4f..3cadf1013cea 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -19,9 +19,9 @@ #include "mm_internal.h" -unsigned long __initdata pgt_buf_start; -unsigned long __meminitdata pgt_buf_end; -unsigned long __meminitdata pgt_buf_top; +static unsigned long __initdata pgt_buf_start; +static unsigned long __initdata pgt_buf_end; +static unsigned long __initdata pgt_buf_top; static unsigned long min_pfn_mapped; From 148b20989e0b83cb301e1fcd9e987c7abde05333 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 16 Nov 2012 19:39:08 -0800 Subject: [PATCH 039/105] x86, mm: Move init_gbpages() out of setup.c Put it in mm/init.c, and call it from probe_page_mask(). init_mem_mapping is calling probe_page_mask at first. So calling sequence is not changed. Signed-off-by: Yinghai Lu Link: http://lkml.kernel.org/r/1353123563-3103-32-git-send-email-yinghai@kernel.org Signed-off-by: H. 
Peter Anvin --- arch/x86/kernel/setup.c | 15 +-------------- arch/x86/mm/init.c | 12 ++++++++++++ 2 files changed, 13 insertions(+), 14 deletions(-) diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 20151941cce8..85b62f1c8071 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -282,18 +282,7 @@ void * __init extend_brk(size_t size, size_t align) return ret; } -#ifdef CONFIG_X86_64 -static void __init init_gbpages(void) -{ - if (direct_gbpages && cpu_has_gbpages) - printk(KERN_INFO "Using GB pages for direct mapping\n"); - else - direct_gbpages = 0; -} -#else -static inline void init_gbpages(void) -{ -} +#ifdef CONFIG_X86_32 static void __init cleanup_highmap(void) { } @@ -933,8 +922,6 @@ void __init setup_arch(char **cmdline_p) setup_real_mode(); - init_gbpages(); - init_mem_mapping(); memblock.current_limit = get_max_mapped(); diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 3cadf1013cea..8168bf8fcda7 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -98,6 +98,16 @@ int direct_gbpages #endif ; +static void __init init_gbpages(void) +{ +#ifdef CONFIG_X86_64 + if (direct_gbpages && cpu_has_gbpages) + printk(KERN_INFO "Using GB pages for direct mapping\n"); + else + direct_gbpages = 0; +#endif +} + struct map_range { unsigned long start; unsigned long end; @@ -108,6 +118,8 @@ static int page_size_mask; static void __init probe_page_size_mask(void) { + init_gbpages(); + #if !defined(CONFIG_DEBUG_PAGEALLOC) && !defined(CONFIG_KMEMCHECK) /* * For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages. From f836e35a98ab3b2f0d4c8730610e4a4a7f533505 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 16 Nov 2012 19:39:09 -0800 Subject: [PATCH 040/105] x86, mm: change low/hignmem_pfn_init to static on 32bit Signed-off-by: Yinghai Lu Link: http://lkml.kernel.org/r/1353123563-3103-33-git-send-email-yinghai@kernel.org Signed-off-by: H. Peter Anvin --- arch/x86/mm/init_32.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 0ae1ba8bc1b9..322ee56ea1fe 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -575,7 +575,7 @@ early_param("highmem", parse_highmem); * artificially via the highmem=x boot parameter then create * it: */ -void __init lowmem_pfn_init(void) +static void __init lowmem_pfn_init(void) { /* max_low_pfn is 0, we already have early_res support */ max_low_pfn = max_pfn; @@ -611,7 +611,7 @@ void __init lowmem_pfn_init(void) * We have more RAM than fits into lowmem - we try to put it into * highmem, also taking the highmem=x boot parameter into account: */ -void __init highmem_pfn_init(void) +static void __init highmem_pfn_init(void) { max_low_pfn = MAXMEM_PFN; From c8dcdb9ce463ad4a660099a74a850f4f6fc81c41 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 16 Nov 2012 19:39:10 -0800 Subject: [PATCH 041/105] x86, mm: Move function declaration into mm_internal.h They are only for mm/init*.c. Signed-off-by: Yinghai Lu Link: http://lkml.kernel.org/r/1353123563-3103-34-git-send-email-yinghai@kernel.org Signed-off-by: H. 
Peter Anvin --- arch/x86/include/asm/init.h | 16 +++------------- arch/x86/mm/mm_internal.h | 7 +++++++ 2 files changed, 10 insertions(+), 13 deletions(-)
diff --git a/arch/x86/include/asm/init.h b/arch/x86/include/asm/init.h index 626ea8d31b6d..bac770b7cbc4 100644 --- a/arch/x86/include/asm/init.h +++ b/arch/x86/include/asm/init.h @@ -1,15 +1,5 @@ -#ifndef _ASM_X86_INIT_32_H -#define _ASM_X86_INIT_32_H +#ifndef _ASM_X86_INIT_H +#define _ASM_X86_INIT_H -#ifdef CONFIG_X86_32 -extern void __init early_ioremap_page_table_range_init(void); -#endif -extern void __init zone_sizes_init(void); - -extern unsigned long __init -kernel_physical_mapping_init(unsigned long start, - unsigned long end, - unsigned long page_size_mask); - -#endif /* _ASM_X86_INIT_32_H */ +#endif /* _ASM_X86_INIT_H */
diff --git a/arch/x86/mm/mm_internal.h b/arch/x86/mm/mm_internal.h index 7e3b88ee078a..dc79ac121774 100644 --- a/arch/x86/mm/mm_internal.h +++ b/arch/x86/mm/mm_internal.h @@ -7,4 +7,11 @@ static inline void *alloc_low_page(void) return alloc_low_pages(1); } +void early_ioremap_page_table_range_init(void); + +unsigned long kernel_physical_mapping_init(unsigned long start, + unsigned long end, + unsigned long page_size_mask); +void zone_sizes_init(void); + #endif /* __X86_MM_INTERNAL_H */
From 11ed9e927d573d78beda6e6a166612666ae97064 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 16 Nov 2012 19:39:11 -0800 Subject: [PATCH 042/105] x86, mm: Add check before clear pte above max_low_pfn on 32bit While testing a patch that adjusts page_size_mask to map small RAM ranges with large pages, I found that the page table was set up wrongly on 32-bit: native_pagetable_init() wrongly cleared ptes for pmds with large-page support. This patch: 1. adds more comments about why we expect a pte there; 2. adds a BUG check, so the next time the page table setup is messed up we find the problem earlier; 3. treats max_low_pfn as an excluded boundary of the low memory mapping, so the check starts from max_low_pfn instead of max_low_pfn + 1; 4. prints a message when a pte actually gets cleared (or should we use WARN() to find out why anything above max_low_pfn got mapped, so we can fix that?). Signed-off-by: Yinghai Lu Link: http://lkml.kernel.org/r/1353123563-3103-35-git-send-email-yinghai@kernel.org Signed-off-by: H. Peter Anvin --- arch/x86/mm/init_32.c | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-)
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 322ee56ea1fe..19ef9f018012 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -480,9 +480,14 @@ void __init native_pagetable_init(void) /* * Remove any mappings which extend past the end of physical - * memory from the boot time page table: + * memory from the boot time page table. + * In the virtual address space we should have at least two pages + * between VMALLOC_END and pkmap or fixmap, per the VMALLOC_END + * definition, and max_low_pfn is set to the physical address of + * VMALLOC_END. If the initial memory mapping did its job right, + * the ptes near max_low_pfn are in use, or a whole pmd is not present. */ - for (pfn = max_low_pfn + 1; pfn < 1<<(32-PAGE_SHIFT); pfn++) { + for (pfn = max_low_pfn; pfn < 1<<(32-PAGE_SHIFT); pfn++) { va = PAGE_OFFSET + (pfn<> PAGE_SHIFT);
From 5a0d3aeeeffbd1534a510fc10c4ab7c99c45afce Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 16 Nov 2012 19:39:12 -0800 Subject: [PATCH 043/105] x86, mm: use round_up/down in split_mem_range() to replace the open-coded inline versions of those round-up and round-down calculations; a stand-alone sketch of the helpers follows below.
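A stand-alone sketch of what these helpers compute (the macros below are simplified power-of-two-only versions of the kernel's round_up()/round_down(), and the sample addresses are made-up illustrations, not values taken from the patch):

#include <stdio.h>

/* Simplified: valid only when 'a' is a power of two, as in the kernel. */
#define round_up(x, a)   (((x) + ((a) - 1)) & ~((a) - 1))
#define round_down(x, a) ((x) & ~((a) - 1))

#define PAGE_SHIFT 12
#define PMD_SIZE   (1UL << 21)  /* 2 MiB large-page size on x86 */

int main(void)
{
	unsigned long pos = 0x00123456UL;  /* hypothetical unaligned start */
	unsigned long end = 0x40000000UL;  /* hypothetical end of the range */

	/* Matches the rewritten expressions in split_mem_range(): */
	unsigned long start_pfn = round_up(pos, PMD_SIZE) >> PAGE_SHIFT;
	unsigned long end_pfn   = round_down(end, PMD_SIZE) >> PAGE_SHIFT;

	printf("2M-aligned pfn range: [0x%lx, 0x%lx)\n", start_pfn, end_pfn);
	return 0;
}

The shift-and-mask pairs the patch removes compute exactly these two values; naming the operation makes the head/body/tail structure of the range splitting much easier to follow.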
Signed-off-by: Yinghai Lu Link: http://lkml.kernel.org/r/1353123563-3103-36-git-send-email-yinghai@kernel.org Signed-off-by: H. Peter Anvin --- arch/x86/mm/init.c | 30 ++++++++++++------------------ 1 file changed, 12 insertions(+), 18 deletions(-) diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 8168bf8fcda7..0e625e606e5d 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -218,13 +218,11 @@ static int __meminit split_mem_range(struct map_range *mr, int nr_range, * slowdowns. */ if (pos == 0) - end_pfn = 1<<(PMD_SHIFT - PAGE_SHIFT); + end_pfn = PMD_SIZE >> PAGE_SHIFT; else - end_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT) - << (PMD_SHIFT - PAGE_SHIFT); + end_pfn = round_up(pos, PMD_SIZE) >> PAGE_SHIFT; #else /* CONFIG_X86_64 */ - end_pfn = ((pos + (PMD_SIZE - 1)) >> PMD_SHIFT) - << (PMD_SHIFT - PAGE_SHIFT); + end_pfn = round_up(pos, PMD_SIZE) >> PAGE_SHIFT; #endif if (end_pfn > (end >> PAGE_SHIFT)) end_pfn = end >> PAGE_SHIFT; @@ -234,15 +232,13 @@ static int __meminit split_mem_range(struct map_range *mr, int nr_range, } /* big page (2M) range */ - start_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT) - << (PMD_SHIFT - PAGE_SHIFT); + start_pfn = round_up(pos, PMD_SIZE) >> PAGE_SHIFT; #ifdef CONFIG_X86_32 - end_pfn = (end>>PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT); + end_pfn = round_down(end, PMD_SIZE) >> PAGE_SHIFT; #else /* CONFIG_X86_64 */ - end_pfn = ((pos + (PUD_SIZE - 1))>>PUD_SHIFT) - << (PUD_SHIFT - PAGE_SHIFT); - if (end_pfn > ((end>>PMD_SHIFT)<<(PMD_SHIFT - PAGE_SHIFT))) - end_pfn = ((end>>PMD_SHIFT)<<(PMD_SHIFT - PAGE_SHIFT)); + end_pfn = round_up(pos, PUD_SIZE) >> PAGE_SHIFT; + if (end_pfn > (round_down(end, PMD_SIZE) >> PAGE_SHIFT)) + end_pfn = round_down(end, PMD_SIZE) >> PAGE_SHIFT; #endif if (start_pfn < end_pfn) { @@ -253,9 +249,8 @@ static int __meminit split_mem_range(struct map_range *mr, int nr_range, #ifdef CONFIG_X86_64 /* big page (1G) range */ - start_pfn = ((pos + (PUD_SIZE - 1))>>PUD_SHIFT) - << (PUD_SHIFT - PAGE_SHIFT); - end_pfn = (end >> PUD_SHIFT) << (PUD_SHIFT - PAGE_SHIFT); + start_pfn = round_up(pos, PUD_SIZE) >> PAGE_SHIFT; + end_pfn = round_down(end, PUD_SIZE) >> PAGE_SHIFT; if (start_pfn < end_pfn) { nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, page_size_mask & @@ -264,9 +259,8 @@ static int __meminit split_mem_range(struct map_range *mr, int nr_range, } /* tail is not big page (1G) alignment */ - start_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT) - << (PMD_SHIFT - PAGE_SHIFT); - end_pfn = (end >> PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT); + start_pfn = round_up(pos, PMD_SIZE) >> PAGE_SHIFT; + end_pfn = round_down(end, PMD_SIZE) >> PAGE_SHIFT; if (start_pfn < end_pfn) { nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, page_size_mask & (1< Date: Fri, 16 Nov 2012 19:39:13 -0800 Subject: [PATCH 044/105] x86, mm: use PFN_DOWN in split_mem_range() to replace own inline version for shifting. Signed-off-by: Yinghai Lu Link: http://lkml.kernel.org/r/1353123563-3103-37-git-send-email-yinghai@kernel.org Signed-off-by: H. Peter Anvin --- arch/x86/mm/init.c | 44 ++++++++++++++++++++++---------------------- 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 0e625e606e5d..1cca052b2cbd 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -208,8 +208,8 @@ static int __meminit split_mem_range(struct map_range *mr, int nr_range, int i; /* head if not big page alignment ? 
*/ - start_pfn = start >> PAGE_SHIFT; - pos = start_pfn << PAGE_SHIFT; + start_pfn = PFN_DOWN(start); + pos = PFN_PHYS(start_pfn); #ifdef CONFIG_X86_32 /* * Don't use a large page for the first 2/4MB of memory @@ -218,59 +218,59 @@ static int __meminit split_mem_range(struct map_range *mr, int nr_range, * slowdowns. */ if (pos == 0) - end_pfn = PMD_SIZE >> PAGE_SHIFT; + end_pfn = PFN_DOWN(PMD_SIZE); else - end_pfn = round_up(pos, PMD_SIZE) >> PAGE_SHIFT; + end_pfn = PFN_DOWN(round_up(pos, PMD_SIZE)); #else /* CONFIG_X86_64 */ - end_pfn = round_up(pos, PMD_SIZE) >> PAGE_SHIFT; + end_pfn = PFN_DOWN(round_up(pos, PMD_SIZE)); #endif - if (end_pfn > (end >> PAGE_SHIFT)) - end_pfn = end >> PAGE_SHIFT; + if (end_pfn > PFN_DOWN(end)) + end_pfn = PFN_DOWN(end); if (start_pfn < end_pfn) { nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0); - pos = end_pfn << PAGE_SHIFT; + pos = PFN_PHYS(end_pfn); } /* big page (2M) range */ - start_pfn = round_up(pos, PMD_SIZE) >> PAGE_SHIFT; + start_pfn = PFN_DOWN(round_up(pos, PMD_SIZE)); #ifdef CONFIG_X86_32 - end_pfn = round_down(end, PMD_SIZE) >> PAGE_SHIFT; + end_pfn = PFN_DOWN(round_down(end, PMD_SIZE)); #else /* CONFIG_X86_64 */ - end_pfn = round_up(pos, PUD_SIZE) >> PAGE_SHIFT; - if (end_pfn > (round_down(end, PMD_SIZE) >> PAGE_SHIFT)) - end_pfn = round_down(end, PMD_SIZE) >> PAGE_SHIFT; + end_pfn = PFN_DOWN(round_up(pos, PUD_SIZE)); + if (end_pfn > PFN_DOWN(round_down(end, PMD_SIZE))) + end_pfn = PFN_DOWN(round_down(end, PMD_SIZE)); #endif if (start_pfn < end_pfn) { nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, page_size_mask & (1<> PAGE_SHIFT; - end_pfn = round_down(end, PUD_SIZE) >> PAGE_SHIFT; + start_pfn = PFN_DOWN(round_up(pos, PUD_SIZE)); + end_pfn = PFN_DOWN(round_down(end, PUD_SIZE)); if (start_pfn < end_pfn) { nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, page_size_mask & ((1<> PAGE_SHIFT; - end_pfn = round_down(end, PMD_SIZE) >> PAGE_SHIFT; + start_pfn = PFN_DOWN(round_up(pos, PMD_SIZE)); + end_pfn = PFN_DOWN(round_down(end, PMD_SIZE)); if (start_pfn < end_pfn) { nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, page_size_mask & (1<>PAGE_SHIFT; - end_pfn = end>>PAGE_SHIFT; + start_pfn = PFN_DOWN(pos); + end_pfn = PFN_DOWN(end); nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0); /* try to merge same page size and continuous */ From 1829ae9ad7380bf17333ab9ad1610631d9cb8664 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 16 Nov 2012 19:39:14 -0800 Subject: [PATCH 045/105] x86, mm: use pfn instead of pos in split_mem_range could save some bit shifting operations. Signed-off-by: Yinghai Lu Link: http://lkml.kernel.org/r/1353123563-3103-38-git-send-email-yinghai@kernel.org Signed-off-by: H. Peter Anvin --- arch/x86/mm/init.c | 29 ++++++++++++++--------------- 1 file changed, 14 insertions(+), 15 deletions(-) diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 1cca052b2cbd..4bf1c5374928 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -204,12 +204,11 @@ static int __meminit split_mem_range(struct map_range *mr, int nr_range, unsigned long end) { unsigned long start_pfn, end_pfn; - unsigned long pos; + unsigned long pfn; int i; /* head if not big page alignment ? */ - start_pfn = PFN_DOWN(start); - pos = PFN_PHYS(start_pfn); + pfn = start_pfn = PFN_DOWN(start); #ifdef CONFIG_X86_32 /* * Don't use a large page for the first 2/4MB of memory @@ -217,26 +216,26 @@ static int __meminit split_mem_range(struct map_range *mr, int nr_range, * and overlapping MTRRs into large pages can cause * slowdowns. 
*/ - if (pos == 0) + if (pfn == 0) end_pfn = PFN_DOWN(PMD_SIZE); else - end_pfn = PFN_DOWN(round_up(pos, PMD_SIZE)); + end_pfn = round_up(pfn, PFN_DOWN(PMD_SIZE)); #else /* CONFIG_X86_64 */ - end_pfn = PFN_DOWN(round_up(pos, PMD_SIZE)); + end_pfn = round_up(pfn, PFN_DOWN(PMD_SIZE)); #endif if (end_pfn > PFN_DOWN(end)) end_pfn = PFN_DOWN(end); if (start_pfn < end_pfn) { nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0); - pos = PFN_PHYS(end_pfn); + pfn = end_pfn; } /* big page (2M) range */ - start_pfn = PFN_DOWN(round_up(pos, PMD_SIZE)); + start_pfn = round_up(pfn, PFN_DOWN(PMD_SIZE)); #ifdef CONFIG_X86_32 end_pfn = PFN_DOWN(round_down(end, PMD_SIZE)); #else /* CONFIG_X86_64 */ - end_pfn = PFN_DOWN(round_up(pos, PUD_SIZE)); + end_pfn = round_up(pfn, PFN_DOWN(PUD_SIZE)); if (end_pfn > PFN_DOWN(round_down(end, PMD_SIZE))) end_pfn = PFN_DOWN(round_down(end, PMD_SIZE)); #endif @@ -244,32 +243,32 @@ static int __meminit split_mem_range(struct map_range *mr, int nr_range, if (start_pfn < end_pfn) { nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, page_size_mask & (1< Date: Fri, 16 Nov 2012 19:39:15 -0800 Subject: [PATCH 046/105] x86, mm: use limit_pfn for end pfn instead of shifting end to get that. Signed-off-by: Yinghai Lu Link: http://lkml.kernel.org/r/1353123563-3103-39-git-send-email-yinghai@kernel.org Signed-off-by: H. Peter Anvin --- arch/x86/mm/init.c | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 4bf1c5374928..f410dc6f843e 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -203,10 +203,12 @@ static int __meminit split_mem_range(struct map_range *mr, int nr_range, unsigned long start, unsigned long end) { - unsigned long start_pfn, end_pfn; + unsigned long start_pfn, end_pfn, limit_pfn; unsigned long pfn; int i; + limit_pfn = PFN_DOWN(end); + /* head if not big page alignment ? 
*/ pfn = start_pfn = PFN_DOWN(start); #ifdef CONFIG_X86_32 @@ -223,8 +225,8 @@ static int __meminit split_mem_range(struct map_range *mr, int nr_range, #else /* CONFIG_X86_64 */ end_pfn = round_up(pfn, PFN_DOWN(PMD_SIZE)); #endif - if (end_pfn > PFN_DOWN(end)) - end_pfn = PFN_DOWN(end); + if (end_pfn > limit_pfn) + end_pfn = limit_pfn; if (start_pfn < end_pfn) { nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0); pfn = end_pfn; @@ -233,11 +235,11 @@ static int __meminit split_mem_range(struct map_range *mr, int nr_range, /* big page (2M) range */ start_pfn = round_up(pfn, PFN_DOWN(PMD_SIZE)); #ifdef CONFIG_X86_32 - end_pfn = PFN_DOWN(round_down(end, PMD_SIZE)); + end_pfn = round_down(limit_pfn, PFN_DOWN(PMD_SIZE)); #else /* CONFIG_X86_64 */ end_pfn = round_up(pfn, PFN_DOWN(PUD_SIZE)); - if (end_pfn > PFN_DOWN(round_down(end, PMD_SIZE))) - end_pfn = PFN_DOWN(round_down(end, PMD_SIZE)); + if (end_pfn > round_down(limit_pfn, PFN_DOWN(PMD_SIZE))) + end_pfn = round_down(limit_pfn, PFN_DOWN(PMD_SIZE)); #endif if (start_pfn < end_pfn) { @@ -249,7 +251,7 @@ static int __meminit split_mem_range(struct map_range *mr, int nr_range, #ifdef CONFIG_X86_64 /* big page (1G) range */ start_pfn = round_up(pfn, PFN_DOWN(PUD_SIZE)); - end_pfn = PFN_DOWN(round_down(end, PUD_SIZE)); + end_pfn = round_down(limit_pfn, PFN_DOWN(PUD_SIZE)); if (start_pfn < end_pfn) { nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, page_size_mask & @@ -259,7 +261,7 @@ static int __meminit split_mem_range(struct map_range *mr, int nr_range, /* tail is not big page (1G) alignment */ start_pfn = round_up(pfn, PFN_DOWN(PMD_SIZE)); - end_pfn = PFN_DOWN(round_down(end, PMD_SIZE)); + end_pfn = round_down(limit_pfn, PFN_DOWN(PMD_SIZE)); if (start_pfn < end_pfn) { nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, page_size_mask & (1< Date: Fri, 16 Nov 2012 19:39:16 -0800 Subject: [PATCH 047/105] x86, mm: Unifying after_bootmem for 32bit and 64bit after_bootmem has different meanings on 32-bit and 64-bit: on 32-bit it means "bootmem is ready", on 64-bit it means "bootmem has been destroyed". Merge them and make 32-bit behave the same as 64-bit. On 32-bit the code mixes alloc_bootmem_pages() and alloc_low_page() depending on whether after_bootmem is set. alloc_bootmem is just a wrapper around memblock on x86, and alloc_low_page() now uses memblock too, so we can drop the bootmem path and use alloc_low_page() only. At the same time, make alloc_low_page() handle the real after_bootmem case on 32-bit, because alloc_bootmem_pages() could fall back to slab as well. Finally, move the point where after_bootmem gets set on 32-bit to match 64-bit. Signed-off-by: Yinghai Lu Link: http://lkml.kernel.org/r/1353123563-3103-40-git-send-email-yinghai@kernel.org Signed-off-by: H.
Peter Anvin --- arch/x86/mm/init.c | 2 -- arch/x86/mm/init_32.c | 21 ++++----------------- 2 files changed, 4 insertions(+), 19 deletions(-) diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index f410dc6f843e..2a27e5ac1e3c 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -39,7 +39,6 @@ __ref void *alloc_low_pages(unsigned int num) unsigned long pfn; int i; -#ifdef CONFIG_X86_64 if (after_bootmem) { unsigned int order; @@ -47,7 +46,6 @@ __ref void *alloc_low_pages(unsigned int num) return (void *)__get_free_pages(GFP_ATOMIC | __GFP_NOTRACK | __GFP_ZERO, order); } -#endif if ((pgt_buf_end + num) >= pgt_buf_top) { unsigned long ret; diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 19ef9f018012..f4fc4a28393a 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -73,10 +73,7 @@ static pmd_t * __init one_md_table_init(pgd_t *pgd) #ifdef CONFIG_X86_PAE if (!(pgd_val(*pgd) & _PAGE_PRESENT)) { - if (after_bootmem) - pmd_table = (pmd_t *)alloc_bootmem_pages(PAGE_SIZE); - else - pmd_table = (pmd_t *)alloc_low_page(); + pmd_table = (pmd_t *)alloc_low_page(); paravirt_alloc_pmd(&init_mm, __pa(pmd_table) >> PAGE_SHIFT); set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT)); pud = pud_offset(pgd, 0); @@ -98,17 +95,7 @@ static pmd_t * __init one_md_table_init(pgd_t *pgd) static pte_t * __init one_page_table_init(pmd_t *pmd) { if (!(pmd_val(*pmd) & _PAGE_PRESENT)) { - pte_t *page_table = NULL; - - if (after_bootmem) { -#if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_KMEMCHECK) - page_table = (pte_t *) alloc_bootmem_pages(PAGE_SIZE); -#endif - if (!page_table) - page_table = - (pte_t *)alloc_bootmem_pages(PAGE_SIZE); - } else - page_table = (pte_t *)alloc_low_page(); + pte_t *page_table = (pte_t *)alloc_low_page(); paravirt_alloc_pte(&init_mm, __pa(page_table) >> PAGE_SHIFT); set_pmd(pmd, __pmd(__pa(page_table) | _PAGE_TABLE)); @@ -708,8 +695,6 @@ void __init setup_bootmem_allocator(void) printk(KERN_INFO " mapped low ram: 0 - %08lx\n", max_pfn_mapped< Date: Fri, 16 Nov 2012 19:39:17 -0800 Subject: [PATCH 048/105] x86, mm: Move after_bootmem to mm_internel.h it is only used in arch/x86/mm/init*.c Signed-off-by: Yinghai Lu Link: http://lkml.kernel.org/r/1353123563-3103-41-git-send-email-yinghai@kernel.org Signed-off-by: H. Peter Anvin --- arch/x86/mm/mm_internal.h | 2 ++ include/linux/mm.h | 1 - 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/mm/mm_internal.h b/arch/x86/mm/mm_internal.h index dc79ac121774..6b563a118891 100644 --- a/arch/x86/mm/mm_internal.h +++ b/arch/x86/mm/mm_internal.h @@ -14,4 +14,6 @@ unsigned long kernel_physical_mapping_init(unsigned long start, unsigned long page_size_mask); void zone_sizes_init(void); +extern int after_bootmem; + #endif /* __X86_MM_INTERNAL_H */ diff --git a/include/linux/mm.h b/include/linux/mm.h index bcaab4e6fe91..64d5271a3d36 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1355,7 +1355,6 @@ extern void __init mmap_init(void); extern void show_mem(unsigned int flags); extern void si_meminfo(struct sysinfo * val); extern void si_meminfo_node(struct sysinfo *val, int nid); -extern int after_bootmem; extern __printf(3, 4) void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...); From b8fd39c036ab982aa087b7ee671f86e2574d31f2 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 16 Nov 2012 19:39:18 -0800 Subject: [PATCH 049/105] x86, mm: Use clamp_t() in init_range_memory_mapping save some lines, and make code more readable. 
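A minimal stand-alone sketch of the clamping idiom this patch introduces (clamp_val() below is a simplified stand-in for the kernel macro, and the ranges are made-up example values):

#include <stdio.h>

/* Simplified stand-in; the kernel's clamp_val() also pins operand types. */
#define clamp_val(val, lo, hi) \
	((val) < (lo) ? (lo) : ((val) > (hi) ? (hi) : (val)))

int main(void)
{
	/* Hypothetical mapping window and one memory block, in bytes. */
	unsigned long long r_start = 0x00100000ULL, r_end = 0x20000000ULL;
	unsigned long long blk_start = 0x0ULL, blk_end = 0x40000000ULL;

	/* Two clamps replace the four head/tail boundary checks: */
	unsigned long long start = clamp_val(blk_start, r_start, r_end);
	unsigned long long end   = clamp_val(blk_end,   r_start, r_end);

	/* An empty intersection clamps to start >= end and is skipped. */
	if (start < end)
		printf("map [0x%llx, 0x%llx)\n", start, end);
	return 0;
}

Clamping both endpoints of a block into the requested window yields the intersection directly, which is why the chain of explicit "continue"/truncate branches collapses as shown in the diff below.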
Signed-off-by: Yinghai Lu Link: http://lkml.kernel.org/r/1353123563-3103-42-git-send-email-yinghai@kernel.org Signed-off-by: H. Peter Anvin --- arch/x86/mm/init.c | 21 +++++---------------- 1 file changed, 5 insertions(+), 16 deletions(-) diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 2a27e5ac1e3c..6f85de8a1f28 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -357,31 +357,20 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, * would have hole in the middle or ends, and only ram parts will be mapped. */ static unsigned long __init init_range_memory_mapping( - unsigned long range_start, - unsigned long range_end) + unsigned long r_start, + unsigned long r_end) { unsigned long start_pfn, end_pfn; unsigned long mapped_ram_size = 0; int i; for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, NULL) { - u64 start = (u64)start_pfn << PAGE_SHIFT; - u64 end = (u64)end_pfn << PAGE_SHIFT; - - if (end <= range_start) + u64 start = clamp_val(PFN_PHYS(start_pfn), r_start, r_end); + u64 end = clamp_val(PFN_PHYS(end_pfn), r_start, r_end); + if (start >= end) continue; - if (start < range_start) - start = range_start; - - if (start >= range_end) - continue; - - if (end > range_end) - end = range_end; - init_memory_mapping(start, end); - mapped_ram_size += end - start; } From 94b43c3d86dddf95064fc83e9087448b35f985ff Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 16 Nov 2012 19:39:19 -0800 Subject: [PATCH 050/105] x86, mm: kill numa_free_all_bootmem() Now NO_BOOTMEM version free_all_bootmem_node() does not really do free_bootmem at all, and it only call register_page_bootmem_info_node instead. That is confusing, try to kill that free_all_bootmem_node(). Before that, this patch will remove numa_free_all_bootmem(). That function could be replaced with register_page_bootmem_info() and free_all_bootmem(); Signed-off-by: Yinghai Lu Link: http://lkml.kernel.org/r/1353123563-3103-43-git-send-email-yinghai@kernel.org Signed-off-by: H. 
Peter Anvin --- arch/x86/include/asm/numa_64.h | 2 -- arch/x86/mm/init_64.c | 15 +++++++++++---- arch/x86/mm/numa_64.c | 13 ------------- 3 files changed, 11 insertions(+), 19 deletions(-) diff --git a/arch/x86/include/asm/numa_64.h b/arch/x86/include/asm/numa_64.h index 0c05f7ae46e8..fe4d2d410ad2 100644 --- a/arch/x86/include/asm/numa_64.h +++ b/arch/x86/include/asm/numa_64.h @@ -1,6 +1,4 @@ #ifndef _ASM_X86_NUMA_64_H #define _ASM_X86_NUMA_64_H -extern unsigned long numa_free_all_bootmem(void); - #endif /* _ASM_X86_NUMA_64_H */ diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 1d53defc99c9..41785305f645 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -629,6 +629,16 @@ EXPORT_SYMBOL_GPL(arch_add_memory); static struct kcore_list kcore_vsyscall; +static void __init register_page_bootmem_info(void) +{ +#ifdef CONFIG_NUMA + int i; + + for_each_online_node(i) + register_page_bootmem_info_node(NODE_DATA(i)); +#endif +} + void __init mem_init(void) { long codesize, reservedpages, datasize, initsize; @@ -641,11 +651,8 @@ void __init mem_init(void) reservedpages = 0; /* this will put all low memory onto the freelists */ -#ifdef CONFIG_NUMA - totalram_pages = numa_free_all_bootmem(); -#else + register_page_bootmem_info(); totalram_pages = free_all_bootmem(); -#endif absent_pages = absent_pages_in_range(0, max_pfn); reservedpages = max_pfn - totalram_pages - absent_pages; diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index 92e27119ee1a..9405ffc91502 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -10,16 +10,3 @@ void __init initmem_init(void) { x86_numa_init(); } - -unsigned long __init numa_free_all_bootmem(void) -{ - unsigned long pages = 0; - int i; - - for_each_online_node(i) - pages += free_all_bootmem_node(NODE_DATA(i)); - - pages += free_low_memory_core_early(MAX_NUMNODES); - - return pages; -} From c074eaac2ab264c94520efff7e896b771de885ae Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 16 Nov 2012 19:39:20 -0800 Subject: [PATCH 051/105] x86, mm: kill numa_64.h Signed-off-by: Yinghai Lu Link: http://lkml.kernel.org/r/1353123563-3103-44-git-send-email-yinghai@kernel.org Signed-off-by: H. 
Peter Anvin --- arch/x86/include/asm/numa.h | 2 -- arch/x86/include/asm/numa_64.h | 4 ---- arch/x86/kernel/acpi/boot.c | 1 - arch/x86/kernel/cpu/amd.c | 1 - arch/x86/kernel/cpu/intel.c | 1 - arch/x86/kernel/setup.c | 3 --- 6 files changed, 12 deletions(-) delete mode 100644 arch/x86/include/asm/numa_64.h diff --git a/arch/x86/include/asm/numa.h b/arch/x86/include/asm/numa.h index 49119fcea2dc..52560a2038e1 100644 --- a/arch/x86/include/asm/numa.h +++ b/arch/x86/include/asm/numa.h @@ -54,8 +54,6 @@ static inline int numa_cpu_node(int cpu) #ifdef CONFIG_X86_32 # include -#else -# include #endif #ifdef CONFIG_NUMA diff --git a/arch/x86/include/asm/numa_64.h b/arch/x86/include/asm/numa_64.h deleted file mode 100644 index fe4d2d410ad2..000000000000 --- a/arch/x86/include/asm/numa_64.h +++ /dev/null @@ -1,4 +0,0 @@ -#ifndef _ASM_X86_NUMA_64_H -#define _ASM_X86_NUMA_64_H - -#endif /* _ASM_X86_NUMA_64_H */ diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index e651f7a589ac..4b23aa18518d 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -51,7 +51,6 @@ EXPORT_SYMBOL(acpi_disabled); #ifdef CONFIG_X86_64 # include -# include #endif /* X86 */ #define BAD_MADT_ENTRY(entry, end) ( \ diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 9619ba6528ca..913f94f9e8d9 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -12,7 +12,6 @@ #include #ifdef CONFIG_X86_64 -# include # include # include #endif diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 198e019a531a..3b547cc4bd03 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -17,7 +17,6 @@ #ifdef CONFIG_X86_64 #include -#include #endif #include "cpu.h" diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 85b62f1c8071..6d29d1fcf068 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -108,9 +108,6 @@ #include #include #include -#ifdef CONFIG_X86_64 -#include -#endif #include #include #include From 961f8fa04b1c3dfe35c2d3ee7ccba28f9b0e0e09 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 16 Nov 2012 19:39:21 -0800 Subject: [PATCH 052/105] sparc, mm: Remove calling of free_all_bootmem_node() Now NO_BOOTMEM version free_all_bootmem_node() does not really do free_bootmem at all, and it only call register_page_bootmem_info_node instead. That is confusing, try to kill that free_all_bootmem_node(). Before that, this patch will remove calling of free_all_bootmem_node() We add register_page_bootmem_info() to call register_page_bootmem_info_node directly. Also could use free_all_bootmem() for numa case, and it is just the same as free_low_memory_core_early(). Signed-off-by: Yinghai Lu Link: http://lkml.kernel.org/r/1353123563-3103-45-git-send-email-yinghai@kernel.org Cc: "David S. Miller" Cc: Andrew Morton Cc: sparclinux@vger.kernel.org Acked-by: "David S. Miller" Signed-off-by: H. 
Peter Anvin --- arch/sparc/mm/init_64.c | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-)
diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c index 9e28a118e6a4..b24bac238e34 100644 --- a/arch/sparc/mm/init_64.c +++ b/arch/sparc/mm/init_64.c @@ -2021,6 +2021,16 @@ static void __init patch_tlb_miss_handler_bitmap(void) flushi(&valid_addr_bitmap_insn[0]); } +static void __init register_page_bootmem_info(void) +{ +#ifdef CONFIG_NEED_MULTIPLE_NODES + int i; + + for_each_online_node(i) + if (NODE_DATA(i)->node_spanned_pages) + register_page_bootmem_info_node(NODE_DATA(i)); +#endif +} void __init mem_init(void) { unsigned long codepages, datapages, initpages; @@ -2038,20 +2048,8 @@ void __init mem_init(void) high_memory = __va(last_valid_pfn << PAGE_SHIFT); -#ifdef CONFIG_NEED_MULTIPLE_NODES - { - int i; - for_each_online_node(i) { - if (NODE_DATA(i)->node_spanned_pages != 0) { - totalram_pages += - free_all_bootmem_node(NODE_DATA(i)); - } - } - totalram_pages += free_low_memory_core_early(MAX_NUMNODES); - } -#else + register_page_bootmem_info(); totalram_pages = free_all_bootmem(); -#endif /* We subtract one to account for the mem_map_zero page * allocated below.
From 600cc5b7f6371706679490d7ee108015ae57ac2f Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 16 Nov 2012 19:39:22 -0800 Subject: [PATCH 053/105] mm: Kill NO_BOOTMEM version free_all_bootmem_node() The NO_BOOTMEM version of free_all_bootmem_node() does not really free bootmem at all; it only calls register_page_bootmem_info_node() for online nodes instead. That is confusing. Now that its two callers in x86 and sparc have been removed, we can kill free_all_bootmem_node(). Signed-off-by: Yinghai Lu Link: http://lkml.kernel.org/r/1353123563-3103-46-git-send-email-yinghai@kernel.org Signed-off-by: H. Peter Anvin --- mm/nobootmem.c | 14 -------------- 1 file changed, 14 deletions(-)
diff --git a/mm/nobootmem.c b/mm/nobootmem.c index bd82f6b31411..ecc2f13d557d 100644 --- a/mm/nobootmem.c +++ b/mm/nobootmem.c @@ -137,20 +137,6 @@ unsigned long __init free_low_memory_core_early(int nodeid) return count; } -/** - * free_all_bootmem_node - release a node's free pages to the buddy allocator - * @pgdat: node to be released - * - * Returns the number of pages actually released. - */ -unsigned long __init free_all_bootmem_node(pg_data_t *pgdat) -{ - register_page_bootmem_info_node(pgdat); - - /* free_low_memory_core_early(MAX_NUMNODES) will be called later */ - return 0; -} - /** * free_all_bootmem - release free pages to the buddy allocator *
From 9710f581bb4c35589ac046b0cfc0deb7f369fc85 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 16 Nov 2012 19:39:23 -0800 Subject: [PATCH 054/105] x86, mm: Let "memmap=" take more entries one time Currently "memmap=" can take only one entry at a time; with more entries we have to repeat memmap= for each of them. For PXE booting we have a command-line length limit, and those extra "memmap=" prefixes waste too much space. This patch lets memmap= take several entries at a time, separated by ','. Signed-off-by: Yinghai Lu Link: http://lkml.kernel.org/r/1353123563-3103-47-git-send-email-yinghai@kernel.org Signed-off-by: H.
Peter Anvin --- arch/x86/kernel/e820.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index df06ade26bef..d32abeabbda5 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -835,7 +835,7 @@ static int __init parse_memopt(char *p) } early_param("mem", parse_memopt); -static int __init parse_memmap_opt(char *p) +static int __init parse_memmap_one(char *p) { char *oldp; u64 start_at, mem_size; @@ -877,6 +877,20 @@ static int __init parse_memmap_opt(char *p) return *p == '\0' ? 0 : -EINVAL; } +static int __init parse_memmap_opt(char *str) +{ + while (str) { + char *k = strchr(str, ','); + + if (k) + *k++ = 0; + + parse_memmap_one(str); + str = k; + } + + return 0; +} early_param("memmap", parse_memmap_opt); void __init finish_e820_parsing(void) From bbee3aec3472fc2ca10b6b1020aec84567ea25ce Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Mon, 19 Nov 2012 10:31:37 -0800 Subject: [PATCH 055/105] x86: Fix warning about cast from pointer to integer of different size This patch fixes a warning reported by the kbuild test robot where we were casting a pointer to a physical address which represents an integer of a different size. Per the suggestion of Peter Anvin I am replacing it and one other spot where I made a similar cast with an unsigned long. Signed-off-by: Alexander Duyck Link: http://lkml.kernel.org/r/20121119182927.3655.7641.stgit@ahduyck-cp1.jf.intel.com Signed-off-by: H. Peter Anvin --- arch/x86/kernel/head32.c | 2 +- arch/x86/kernel/head64.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c index f15db0c40713..e17554832991 100644 --- a/arch/x86/kernel/head32.c +++ b/arch/x86/kernel/head32.c @@ -31,7 +31,7 @@ static void __init i386_default_early_setup(void) void __init i386_start_kernel(void) { memblock_reserve(__pa_symbol(_text), - (phys_addr_t)__bss_stop - (phys_addr_t)_text); + (unsigned long)__bss_stop - (unsigned long)_text); #ifdef CONFIG_BLK_DEV_INITRD /* Reserve INITRD */ diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 42f5df134341..7b215a50ec1e 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -98,7 +98,7 @@ void __init x86_64_start_reservations(char *real_mode_data) copy_bootdata(__va(real_mode_data)); memblock_reserve(__pa_symbol(_text), - (phys_addr_t)__bss_stop - (phys_addr_t)_text); + (unsigned long)__bss_stop - (unsigned long)_text); #ifdef CONFIG_BLK_DEV_INITRD /* Reserve INITRD */ From 5e4bf1a55da976a5ed60901bb8801f1024ef9774 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 20 Nov 2012 13:02:51 +0100 Subject: [PATCH 056/105] x86/mm: Don't flush the TLB on #WP pmd fixups If we have a write protection #PF and fix up the pmd then the hugetlb code [the only user of pmdp_set_access_flags], in its do_huge_pmd_wp_page() page fault resolution function calls pmdp_set_access_flags() to mark the pmd permissive again, and flushes the TLB. This TLB flush is unnecessary: a flush on #PF is guaranteed on most (all?) x86 CPUs, and even in the worst-case we'll generate a spurious fault. So remove it. 
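For illustration only, the resulting control flow can be sketched outside the kernel like this; the types and helpers here are simplified stand-ins, not the kernel's own:

/* Simplified model of the post-patch pmdp_set_access_flags() flow. */
typedef struct { unsigned long val; } pmd_sketch_t;

static int pmd_same_sketch(pmd_sketch_t a, pmd_sketch_t b)
{
	return a.val == b.val;
}

int set_access_flags_sketch(pmd_sketch_t *pmdp, pmd_sketch_t entry, int dirty)
{
	int changed = !pmd_same_sketch(*pmdp, entry);

	if (changed && dirty) {
		*pmdp = entry;
		/*
		 * Deliberately no TLB flush here: the entry only became
		 * more permissive, so a stale TLB entry can at worst
		 * trigger one spurious fault, which the #PF path resolves.
		 */
	}
	return changed;
}

The key design point is that TLB invalidation is only required when permissions are reduced; a permission upgrade can rely on the architecture's fault-and-rewalk behavior.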
Cc: Linus Torvalds Cc: Andrew Morton Cc: Peter Zijlstra Cc: Paul Turner Cc: Lee Schermerhorn Cc: Andrea Arcangeli Cc: Rik van Riel Cc: Johannes Weiner Cc: Christoph Lameter Cc: Mel Gorman Cc: Hugh Dickins Link: http://lkml.kernel.org/r/20121120120251.GA15742@gmail.com Signed-off-by: Ingo Molnar --- arch/x86/mm/pgtable.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-)
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 8573b83a63d0..8a828d773e58 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -328,7 +328,12 @@ int pmdp_set_access_flags(struct vm_area_struct *vma, if (changed && dirty) { *pmdp = entry; pmd_update_defer(vma->vm_mm, address, pmdp); - flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); + /* + * We had a write-protection fault here and changed the pmd + * to be more permissive. No need to flush the TLB for that, + * #PF is architecturally guaranteed to do that and in the + * worst-case we'll generate a spurious fault. + */ } return changed;
From a25b9316841c5afa226f8f70a457861b35276a92 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Tue, 22 Jan 2013 13:24:30 -0800 Subject: [PATCH 057/105] x86, mm: Make DEBUG_VIRTUAL work earlier in boot The KVM code has some repeated bugs in it around use of __pa() on per-cpu data. Those data are not in an area on which using __pa() is valid. However, the calls also happen early enough in boot that __vmalloc_start_set is not yet set, and thus the CONFIG_DEBUG_VIRTUAL debugging does not catch them. This adds a check to also verify __pa() calls against max_low_pfn, which we can use earlier in boot than is_vmalloc_addr(). However, if we are super-early in boot, max_low_pfn=0 and this will trip on every call, so also make sure that max_low_pfn is set before we try to use it. With this patch applied, CONFIG_DEBUG_VIRTUAL will actually catch the bug I was chasing (fixed later in this series). I'd love to find a generic way so that any __pa() call on percpu areas could do a BUG_ON(), but there don't appear to be any nice and easy ways to check if an address is a percpu one. Anybody have ideas on a way to do this? Signed-off-by: Dave Hansen Link: http://lkml.kernel.org/r/20130122212430.F46F8159@kernel.stglabs.ibm.com Signed-off-by: H. Peter Anvin --- arch/x86/mm/numa.c | 2 +- arch/x86/mm/pat.c | 4 ++-- arch/x86/mm/physaddr.c | 9 ++++++++- 3 files changed, 11 insertions(+), 4 deletions(-)
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index 2d125be1bae9..76604eb9e4b0 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -219,7 +219,7 @@ static void __init setup_node_data(int nid, u64 start, u64 end) */ nd = alloc_remap(nid, nd_size); if (nd) { - nd_pa = __pa(nd); + nd_pa = __phys_addr_nodebug(nd); remapped = true; } else { nd_pa = memblock_alloc_nid(nd_size, SMP_CACHE_BYTES, nid);
diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index 0eb572eda406..2610bd93c896 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -560,10 +560,10 @@ int kernel_map_sync_memtype(u64 base, unsigned long size, unsigned long flags) { unsigned long id_sz; - if (base >= __pa(high_memory)) + if (base > __pa(high_memory-1)) return 0; - id_sz = (__pa(high_memory) < base + size) ? + id_sz = (__pa(high_memory-1) <= base + size) ?
__pa(high_memory) - base : size; diff --git a/arch/x86/mm/physaddr.c b/arch/x86/mm/physaddr.c index c73feddd518d..e666cbbb9261 100644 --- a/arch/x86/mm/physaddr.c +++ b/arch/x86/mm/physaddr.c @@ -1,3 +1,4 @@ +#include #include #include #include @@ -68,10 +69,16 @@ EXPORT_SYMBOL(__virt_addr_valid); #ifdef CONFIG_DEBUG_VIRTUAL unsigned long __phys_addr(unsigned long x) { + unsigned long phys_addr = x - PAGE_OFFSET; /* VMALLOC_* aren't constants */ VIRTUAL_BUG_ON(x < PAGE_OFFSET); VIRTUAL_BUG_ON(__vmalloc_start_set && is_vmalloc_addr((void *) x)); - return x - PAGE_OFFSET; + /* max_low_pfn is set early, but not _that_ early */ + if (max_low_pfn) { + VIRTUAL_BUG_ON((phys_addr >> PAGE_SHIFT) > max_low_pfn); + BUG_ON(slow_virt_to_phys((void *)x) != phys_addr); + } + return phys_addr; } EXPORT_SYMBOL(__phys_addr); #endif From 4cbeb51b860c57ba8b2ae50c4016ee7a41f5fbd5 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Tue, 22 Jan 2013 13:24:31 -0800 Subject: [PATCH 058/105] x86, mm: Pagetable level size/shift/mask helpers I plan to use lookup_address() to walk the kernel pagetables in a later patch. It returns a "pte" and the level in the pagetables where the "pte" was found. The level is just an enum and needs to be converted to a useful value in order to do address calculations with it. These helpers will be used in at least two places. This also gives the anonymous enum a real name so that no one gets confused about what they should be passing in to these helpers. "PTE_SHIFT" was chosen for naming consistency with the other pagetable levels (PGD/PUD/PMD_SHIFT). Cc: H. Peter Anvin Signed-off-by: Dave Hansen Link: http://lkml.kernel.org/r/20130122212431.405D3A8C@kernel.stglabs.ibm.com Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/pgtable.h | 14 ++++++++++++++ arch/x86/include/asm/pgtable_types.h | 2 +- 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 5199db2923d3..bc28e6fe7052 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -390,6 +390,7 @@ pte_t *populate_extra_pte(unsigned long vaddr); #ifndef __ASSEMBLY__ #include +#include static inline int pte_none(pte_t pte) { @@ -781,6 +782,19 @@ static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count) memcpy(dst, src, count * sizeof(pgd_t)); } +#define PTE_SHIFT ilog2(PTRS_PER_PTE) +static inline int page_level_shift(enum pg_level level) +{ + return (PAGE_SHIFT - PTE_SHIFT) + level * PTE_SHIFT; +} +static inline unsigned long page_level_size(enum pg_level level) +{ + return 1UL << page_level_shift(level); +} +static inline unsigned long page_level_mask(enum pg_level level) +{ + return ~(page_level_size(level) - 1); +} #include #endif /* __ASSEMBLY__ */ diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index 3c32db8c539d..6c297e7998cc 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -331,7 +331,7 @@ extern void native_pagetable_init(void); struct seq_file; extern void arch_report_meminfo(struct seq_file *m); -enum { +enum pg_level { PG_LEVEL_NONE, PG_LEVEL_4K, PG_LEVEL_2M, From f3c4fbb68e93b10c781c0cc462a9d80770244da6 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Tue, 22 Jan 2013 13:24:32 -0800 Subject: [PATCH 059/105] x86, mm: Use new pagetable helpers in try_preserve_large_page() try_preserve_large_page() can be slightly simplified by using the new page_level_*() helpers. 
This also moves the 'level' over to the new pg_level enum type. Signed-off-by: Dave Hansen Link: http://lkml.kernel.org/r/20130122212432.14F3D993@kernel.stglabs.ibm.com Signed-off-by: H. Peter Anvin --- arch/x86/mm/pageattr.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 40f92f3aea54..2a5c9ab710b9 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -396,7 +396,7 @@ try_preserve_large_page(pte_t *kpte, unsigned long address, pte_t new_pte, old_pte, *tmp; pgprot_t old_prot, new_prot, req_prot; int i, do_split = 1; - unsigned int level; + enum pg_level level; if (cpa->force_split) return 1; @@ -412,15 +412,12 @@ try_preserve_large_page(pte_t *kpte, unsigned long address, switch (level) { case PG_LEVEL_2M: - psize = PMD_PAGE_SIZE; - pmask = PMD_PAGE_MASK; - break; #ifdef CONFIG_X86_64 case PG_LEVEL_1G: - psize = PUD_PAGE_SIZE; - pmask = PUD_PAGE_MASK; - break; #endif + psize = page_level_size(level); + pmask = page_level_mask(level); + break; default: do_split = -EINVAL; goto out_unlock; From d765653445129b7c476758040e3079480775f80a Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Tue, 22 Jan 2013 13:24:33 -0800 Subject: [PATCH 060/105] x86, mm: Create slow_virt_to_phys() This is necessary because __pa() does not work on some kinds of memory, like vmalloc() or the alloc_remap() areas on 32-bit NUMA systems. We have some functions to do conversions _like_ this in the vmalloc() code (like vmalloc_to_page()), but they do not work on sizes other than 4k pages. We would potentially need to be able to handle all the page sizes that we use for the kernel linear mapping (4k, 2M, 1G). In practice, on 32-bit NUMA systems, the percpu areas get stuck in the alloc_remap() area. Any __pa() call on them will break and basically return garbage. This patch introduces a new function slow_virt_to_phys(), which walks the kernel page tables on x86 and should do precisely the same logical thing as __pa(), but actually work on a wider range of memory. It should work on the normal linear mapping, vmalloc(), kmap(), etc... Signed-off-by: Dave Hansen Link: http://lkml.kernel.org/r/20130122212433.4D1FCA62@kernel.stglabs.ibm.com Acked-by: Rik van Riel Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/pgtable_types.h | 1 + arch/x86/mm/pageattr.c | 31 ++++++++++++++++++++++++++++ 2 files changed, 32 insertions(+) diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index 6c297e7998cc..9f82690f81ed 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -352,6 +352,7 @@ static inline void update_page_count(int level, unsigned long pages) { } * as a pte too. */ extern pte_t *lookup_address(unsigned long address, unsigned int *level); +extern phys_addr_t slow_virt_to_phys(void *__address); #endif /* !__ASSEMBLY__ */ diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 2a5c9ab710b9..6d13d2a3f825 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -363,6 +363,37 @@ pte_t *lookup_address(unsigned long address, unsigned int *level) } EXPORT_SYMBOL_GPL(lookup_address); +/* + * This is necessary because __pa() does not work on some + * kinds of memory, like vmalloc() or the alloc_remap() + * areas on 32-bit NUMA systems. The percpu areas can + * end up in this kind of memory, for instance. 
+ * + * This could be optimized, but it is only intended to be + * used at initialization time, and keeping it + * unoptimized should increase the testing coverage for + * the more obscure platforms. + */ +phys_addr_t slow_virt_to_phys(void *__virt_addr) +{ + unsigned long virt_addr = (unsigned long)__virt_addr; + phys_addr_t phys_addr; + unsigned long offset; + enum pg_level level; + unsigned long psize; + unsigned long pmask; + pte_t *pte; + + pte = lookup_address(virt_addr, &level); + BUG_ON(!pte); + psize = page_level_size(level); + pmask = page_level_mask(level); + offset = virt_addr & ~pmask; + phys_addr = pte_pfn(*pte) << PAGE_SHIFT; + return (phys_addr | offset); +} +EXPORT_SYMBOL_GPL(slow_virt_to_phys); + /* * Set the new pmd in all the pgds we know about: */
From 5dfd486c4750c9278c63fa96e6e85bdd2fb58e9d Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Tue, 22 Jan 2013 13:24:35 -0800 Subject: [PATCH 061/105] x86, kvm: Fix kvm's use of __pa() on percpu areas In short, it is illegal to call __pa() on an address holding a percpu variable. This replaces those __pa() calls with slow_virt_to_phys(). All of the cases in this patch are in boot time (or CPU hotplug time at worst) code, so the slow pagetable walking in slow_virt_to_phys() is not expected to have a performance impact. The times when this actually matters are pretty obscure (certain 32-bit NUMA systems), but it _does_ happen. It is important to keep KVM guests working on these systems because the real hardware is getting harder and harder to find. This bug first manifested as a plain hang at boot after this message: CPU 0 irqstacks, hard=f3018000 soft=f301a000 or, sometimes, it would actually make it out to the console: [ 0.000000] BUG: unable to handle kernel paging request at ffffffff I eventually traced it down to the KVM async pagefault code. This can be worked around by disabling that code either at compile-time, or on the kernel command-line. The kvm async pagefault code was injecting page faults into the guest which the guest misinterpreted because its "reason" was not being properly sent from the host. The guest passes a physical address of a per-cpu async page fault structure via an MSR to the host. Since __pa() is broken on percpu data, the physical address it sent was basically bogus and the host went scribbling on random data. The guest never saw the real reason for the page fault (it was injected by the host), assumed that the kernel had taken a _real_ page fault, and panic()'d. The behavior varied, though, depending on what got corrupted by the bad write. Signed-off-by: Dave Hansen Link: http://lkml.kernel.org/r/20130122212435.4905663F@kernel.stglabs.ibm.com Acked-by: Rik van Riel Reviewed-by: Marcelo Tosatti Signed-off-by: H.
Peter Anvin --- arch/x86/kernel/kvm.c | 9 +++++---- arch/x86/kernel/kvmclock.c | 4 ++-- 2 files changed, 7 insertions(+), 6 deletions(-)
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 9c2bd8bd4b4c..aa7e58b82b39 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -297,9 +297,9 @@ static void kvm_register_steal_time(void) memset(st, 0, sizeof(*st)); - wrmsrl(MSR_KVM_STEAL_TIME, (__pa(st) | KVM_MSR_ENABLED)); + wrmsrl(MSR_KVM_STEAL_TIME, (slow_virt_to_phys(st) | KVM_MSR_ENABLED)); printk(KERN_INFO "kvm-stealtime: cpu %d, msr %lx\n", - cpu, __pa(st)); + cpu, slow_virt_to_phys(st)); } static DEFINE_PER_CPU(unsigned long, kvm_apic_eoi) = KVM_PV_EOI_DISABLED; @@ -324,7 +324,7 @@ void __cpuinit kvm_guest_cpu_init(void) return; if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF) && kvmapf) { - u64 pa = __pa(&__get_cpu_var(apf_reason)); + u64 pa = slow_virt_to_phys(&__get_cpu_var(apf_reason)); #ifdef CONFIG_PREEMPT pa |= KVM_ASYNC_PF_SEND_ALWAYS; @@ -340,7 +340,8 @@ void __cpuinit kvm_guest_cpu_init(void) /* Size alignment is implied but just to make it explicit. */ BUILD_BUG_ON(__alignof__(kvm_apic_eoi) < 4); __get_cpu_var(kvm_apic_eoi) = 0; - pa = __pa(&__get_cpu_var(kvm_apic_eoi)) | KVM_MSR_ENABLED; + pa = slow_virt_to_phys(&__get_cpu_var(kvm_apic_eoi)) + | KVM_MSR_ENABLED; wrmsrl(MSR_KVM_PV_EOI_EN, pa); }
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index 220a360010f8..9f966dc0b9e4 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -162,8 +162,8 @@ int kvm_register_clock(char *txt) int low, high, ret; struct pvclock_vcpu_time_info *src = &hv_clock[cpu].pvti; - low = (int)__pa(src) | 1; - high = ((u64)__pa(src) >> 32); + low = (int)slow_virt_to_phys(src) | 1; + high = ((u64)slow_virt_to_phys(src) >> 32); ret = native_write_msr_safe(msr_kvm_system_time, low, high); printk(KERN_INFO "kvm-clock: cpu %d, msr %x:%x, %s\n", cpu, high, low, txt);
From c9b3234a6abadaa12684083d39552939baaed1f4 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Thu, 24 Jan 2013 12:19:42 -0800 Subject: [PATCH 062/105] x86, mm: Fix page table early allocation offset checking While debugging loading a kernel above 4G, I found that one page in the BRK area pre-allocated for early page-table allocation was never used. pgt_buf_top is the first address that cannot be used, so the check should be whether the new end is above that top; otherwise the last page never gets used. Fix that check, and also print out allocations from the pre-allocated BRK area to catch possible bugs later. Getting that page back for page tables, however, triggers a bug in page-table allocation with Xen: we must avoid using a page as a page table to map a range that overlaps that very page. Add a check for such overlap and, when it happens, use a memblock allocation instead. That fixes the crash Stefan found on a Xen PV guest with 2G of memory. Signed-off-by: Yinghai Lu Link: http://lkml.kernel.org/r/1359058816-7615-2-git-send-email-yinghai@kernel.org Acked-by: Stefano Stabellini Tested-by: Stefano Stabellini Signed-off-by: H. Peter Anvin --- arch/x86/mm/init.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-)
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 6f85de8a1f28..78d1ef3eab66 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -25,6 +25,8 @@ static unsigned long __initdata pgt_buf_top; static unsigned long min_pfn_mapped; +static bool __initdata can_use_brk_pgt = true; + /* * Pages returned are already directly mapped. * @@ -47,7 +49,7 @@ __ref void *alloc_low_pages(unsigned int num) __GFP_ZERO, order); } - if ((pgt_buf_end + num) >= pgt_buf_top) { + if ((pgt_buf_end + num) > pgt_buf_top || !can_use_brk_pgt) { unsigned long ret; if (min_pfn_mapped >= max_pfn_mapped) panic("alloc_low_page: ran out of memory"); @@ -61,6 +63,8 @@ __ref void *alloc_low_pages(unsigned int num) } else { pfn = pgt_buf_end; pgt_buf_end += num; + printk(KERN_DEBUG "BRK [%#010lx, %#010lx] PGTABLE\n", + pfn << PAGE_SHIFT, (pgt_buf_end << PAGE_SHIFT) - 1); } for (i = 0; i < num; i++) { @@ -370,8 +374,15 @@ static unsigned long __init init_range_memory_mapping( if (start >= end) continue; + /* + * If the range overlaps the BRK page-table pages, we need + * to allocate the pgt buffer from memblock instead. + */ + can_use_brk_pgt = max(start, (u64)pgt_buf_end<= + min(end, (u64)pgt_buf_top<
From e8c57d40519d7226acb8e662f3ab496202ebc7a6 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Thu, 24 Jan 2013 12:19:45 -0800 Subject: [PATCH 063/105] x86: Factor out e820_add_kernel_range() Separate out the reservation of the kernel static memory areas into a separate function. Also add support for the case where memmap=xxM$yyM is used without exactmap: the reserved range has to be removed first, before the E820_RAM range is added, otherwise the added E820_RAM range would be ignored. Signed-off-by: Yinghai Lu Link: http://lkml.kernel.org/r/1359058816-7615-5-git-send-email-yinghai@kernel.org Cc: Jacob Shin Signed-off-by: H. Peter Anvin --- arch/x86/kernel/setup.c | 36 ++++++++++++++++++++++-------------- 1 file changed, 22 insertions(+), 14 deletions(-)
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 268193746cd8..5552d04b0cc1 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -702,6 +702,27 @@ static void __init trim_bios_range(void) sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); } +/* called before trim_bios_range() to spare extra sanitize */ +static void __init e820_add_kernel_range(void) +{ + u64 start = __pa_symbol(_text); + u64 size = __pa_symbol(_end) - start; + + /* + * Complain if .text .data and .bss are not marked as E820_RAM and + * attempt to fix it by adding the range. We may have a confused BIOS, + * or the user may have used memmap=exactmap or memmap=xxM$yyM to + * exclude the kernel range. If we really are running on top of + * non-RAM, we will crash later anyways. + */ + if (e820_all_mapped(start, start + size, E820_RAM)) + return; + + pr_warn(".text .data .bss are not marked as E820_RAM!\n"); + e820_remove_range(start, size, E820_RAM, 0); + e820_add_region(start, size, E820_RAM); +} + static int __init parse_reservelow(char *p) { unsigned long long size; @@ -897,20 +918,7 @@ void __init setup_arch(char **cmdline_p) insert_resource(&iomem_resource, &data_resource); insert_resource(&iomem_resource, &bss_resource); - /* - * Complain if .text .data and .bss are not marked as E820_RAM and - * attempt to fix it by adding the range. We may have a confused BIOS, - * or the user may have incorrectly supplied it via memmap=exactmap. If - * we really are running on top non-RAM, we will crash later anyways.
- */ - if (!e820_all_mapped(code_resource.start, __pa(__brk_limit), E820_RAM)) { - pr_warn(".text .data .bss are not marked as E820_RAM!\n"); - - e820_add_region(code_resource.start, - __pa(__brk_limit) - code_resource.start + 1, - E820_RAM); - } - + e820_add_kernel_range(); trim_bios_range(); #ifdef CONFIG_X86_32 if (ppro_with_ram_bug()) {
From c2bdee594ebcf4a531afe795baf18da509438392 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Thu, 24 Jan 2013 12:19:46 -0800 Subject: [PATCH 064/105] x86, 64bit, mm: Make pgd next calculation consistent with pud/pmd Calculate 'next' for the pgd just like we do for the pud and pmd: round down and add the size. Also, do not bounds-check with 'next'; just pass 'end' down to phys_pud_init() instead, because the loop in phys_pud_init() stops at PTRS_PER_PUD and thus handles a possibly bigger 'end' properly. Signed-off-by: Yinghai Lu Link: http://lkml.kernel.org/r/1359058816-7615-6-git-send-email-yinghai@kernel.org Signed-off-by: H. Peter Anvin --- arch/x86/mm/init_64.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-)
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 191ab12f5ff3..d7af907c07f4 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -530,9 +530,7 @@ kernel_physical_mapping_init(unsigned long start, pgd_t *pgd = pgd_offset_k(start); pud_t *pud; - next = (start + PGDIR_SIZE) & PGDIR_MASK; - if (next > end) - next = end; + next = (start & PGDIR_MASK) + PGDIR_SIZE; if (pgd_val(*pgd)) { pud = (pud_t *)pgd_page_vaddr(*pgd); @@ -542,7 +540,7 @@ kernel_physical_mapping_init(unsigned long start, pud = alloc_low_page(); - last_map_addr = phys_pud_init(pud, __pa(start), __pa(next), + last_map_addr = phys_pud_init(pud, __pa(start), __pa(end), page_size_mask); spin_lock(&init_mm.page_table_lock);
From 231b3642a3c73fb9f1221dcb96fe8c0fbb658dfd Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Thu, 24 Jan 2013 12:19:47 -0800 Subject: [PATCH 065/105] x86, realmode: Set real_mode permissions early Trampoline code is executed by APs with the kernel low mapping on 64-bit, so the trampoline code needs to be marked executable early, before we boot the APs. I found the problem after switching to the #PF-handler way of setting up the early page tables: we no longer set the initial kernel low mapping executable in arch/x86/kernel/head_64.S. Change to an early_initcall, which makes sure the trampoline is marked executable in time. -v2: Merge two comments according to Borislav Petkov Signed-off-by: Yinghai Lu Link: http://lkml.kernel.org/r/1359058816-7615-7-git-send-email-yinghai@kernel.org Signed-off-by: H. Peter Anvin --- arch/x86/realmode/init.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-)
diff --git a/arch/x86/realmode/init.c b/arch/x86/realmode/init.c index cbca565af5bd..c44ea7cf5741 100644 --- a/arch/x86/realmode/init.c +++ b/arch/x86/realmode/init.c @@ -84,10 +84,12 @@ void __init setup_real_mode(void) } /* - * set_real_mode_permissions() gets called very early, to guarantee the - * availability of low memory. This is before the proper kernel page + * setup_real_mode() gets called very early, to guarantee the + * availability of low memory. This is before the proper kernel page * tables are set up, so we cannot set page permissions in that - * function. Thus, we use an arch_initcall instead. + * function. Also the trampoline code will be executed by APs, so we + * need to mark it executable at do_pre_smp_initcalls() at the latest, + * thus run it as an early_initcall(). */ static int __init set_real_mode_permissions(void) { @@ -111,5 +113,4 @@ static int __init set_real_mode_permissions(void) return 0; } - -arch_initcall(set_real_mode_permissions); +early_initcall(set_real_mode_permissions);
From aece27851d44bde62fc0587e06f5e8e27fd96e5f Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Thu, 24 Jan 2013 12:19:48 -0800 Subject: [PATCH 066/105] x86, 64bit, mm: Add generic kernel/ident mapping helper This is a simple version of kernel_physical_mapping_init(); it builds a page table that will be used later. A struct x86_mapping_info controls: 1. the alloc_pgt_page method, 2. whether PMD entries are executable, and 3. whether the pgd holds the kernel low mapping or an identity mapping. It will be used to replace several local versions in kexec, hibernation, etc. Signed-off-by: Yinghai Lu Link: http://lkml.kernel.org/r/1359058816-7615-8-git-send-email-yinghai@kernel.org Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/init.h | 9 +++++ arch/x86/mm/init_64.c | 74 +++++++++++++++++++++++++++++++++++++ 2 files changed, 83 insertions(+)
diff --git a/arch/x86/include/asm/init.h b/arch/x86/include/asm/init.h index bac770b7cbc4..223042086f4e 100644 --- a/arch/x86/include/asm/init.h +++ b/arch/x86/include/asm/init.h @@ -1,5 +1,14 @@ #ifndef _ASM_X86_INIT_H #define _ASM_X86_INIT_H +struct x86_mapping_info { + void *(*alloc_pgt_page)(void *); /* allocate buf for page table */ + void *context; /* context for alloc_pgt_page */ + unsigned long pmd_flag; /* page flag for PMD entry */ + bool kernel_mapping; /* kernel mapping or ident mapping */ +}; + +int kernel_ident_mapping_init(struct x86_mapping_info *info, pgd_t *pgd_page, + unsigned long addr, unsigned long end); #endif /* _ASM_X86_INIT_H */
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index d7af907c07f4..9fbb85c8d930 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -56,6 +56,80 @@ #include "mm_internal.h" +static void ident_pmd_init(unsigned long pmd_flag, pmd_t *pmd_page, + unsigned long addr, unsigned long end) +{ + addr &= PMD_MASK; + for (; addr < end; addr += PMD_SIZE) { + pmd_t *pmd = pmd_page + pmd_index(addr); + + if (!pmd_present(*pmd)) + set_pmd(pmd, __pmd(addr | pmd_flag)); + } +} +static int ident_pud_init(struct x86_mapping_info *info, pud_t *pud_page, + unsigned long addr, unsigned long end) +{ + unsigned long next; + + for (; addr < end; addr = next) { + pud_t *pud = pud_page + pud_index(addr); + pmd_t *pmd; + + next = (addr & PUD_MASK) + PUD_SIZE; + if (next > end) + next = end; + + if (pud_present(*pud)) { + pmd = pmd_offset(pud, 0); + ident_pmd_init(info->pmd_flag, pmd, addr, next); + continue; + } + pmd = (pmd_t *)info->alloc_pgt_page(info->context); + if (!pmd) + return -ENOMEM; + ident_pmd_init(info->pmd_flag, pmd, addr, next); + set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE)); + } + + return 0; +} + +int kernel_ident_mapping_init(struct x86_mapping_info *info, pgd_t *pgd_page, + unsigned long addr, unsigned long end) +{ + unsigned long next; + int result; + int off = info->kernel_mapping ? pgd_index(__PAGE_OFFSET) : 0; + + for (; addr < end; addr = next) { + pgd_t *pgd = pgd_page + pgd_index(addr) + off; + pud_t *pud; + + next = (addr & PGDIR_MASK) + PGDIR_SIZE; + if (next > end) + next = end; + + if (pgd_present(*pgd)) { + pud = pud_offset(pgd, 0); + result = ident_pud_init(info, pud, addr, next); + if (result) + return result; + continue; + } + + pud = (pud_t *)info->alloc_pgt_page(info->context); + if (!pud) + return -ENOMEM; + result = ident_pud_init(info, pud, addr, next); + if (result) + return result; + set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE)); + } + + return 0; +} + static int __init parse_direct_gbpages_off(char *arg) { direct_gbpages = 0;
From fa2bbce985ca97943305cdc81d9626e6810ed7f2 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Thu, 24 Jan 2013 12:19:49 -0800 Subject: [PATCH 067/105] x86, 64bit: Copy struct boot_params early We want to support struct boot_params (formerly known as the zero page, or real-mode data) above the 4 GiB mark. We will have a #PF handler that sets up page tables for not-yet-accessible RAM early, but we want to confine it to the path before x86_64_start_reservations() so the code change stays on the native path only. We will also need the ramdisk info in struct boot_params to access the microcode blob in the ramdisk from x86_64_start_kernel(), so copying struct boot_params early makes accessing the ramdisk info simple. Signed-off-by: Yinghai Lu Link: http://lkml.kernel.org/r/1359058816-7615-9-git-send-email-yinghai@kernel.org Cc: Alexander Duyck Cc: Fenghua Yu Signed-off-by: H. Peter Anvin --- arch/x86/kernel/head64.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-)
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 849fc9e63c2f..7785e66840a4 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -89,6 +89,8 @@ void __init x86_64_start_kernel(char * real_mode_data) } load_idt((const struct desc_ptr *)&idt_descr); + copy_bootdata(__va(real_mode_data)); + if (console_loglevel == 10) early_printk("Kernel alive\n"); @@ -97,7 +99,9 @@ void __init x86_64_start_kernel(char * real_mode_data) void __init x86_64_start_reservations(char *real_mode_data) { - copy_bootdata(__va(real_mode_data)); + /* version is always not zero if it is copied */ + if (!boot_params.hdr.version) + copy_bootdata(__va(real_mode_data)); memblock_reserve(__pa_symbol(&_text), __pa_symbol(&__bss_stop) - __pa_symbol(&_text));
From 9735e91e9c29c0d8fe432aef1152e43e50bdb316 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Thu, 24 Jan 2013 12:19:50 -0800 Subject: [PATCH 068/105] x86, 64bit, realmode: Use init_level4_pgt to set trampoline_pgd directly With the #PF-handler way of setting up the early page tables, level3_ident will go away on the 64-bit native path. So just use the entries in init_level4_pgt to set up trampoline_pgd. Signed-off-by: Yinghai Lu Link: http://lkml.kernel.org/r/1359058816-7615-10-git-send-email-yinghai@kernel.org Cc: Jarkko Sakkinen Acked-by: Jarkko Sakkinen Signed-off-by: H.
Peter Anvin --- arch/x86/realmode/init.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/realmode/init.c b/arch/x86/realmode/init.c index c44ea7cf5741..ffee06ac7de9 100644 --- a/arch/x86/realmode/init.c +++ b/arch/x86/realmode/init.c @@ -78,8 +78,8 @@ void __init setup_real_mode(void) *trampoline_cr4_features = read_cr4(); trampoline_pgd = (u64 *) __va(real_mode_header->trampoline_pgd); - trampoline_pgd[0] = __pa(level3_ident_pgt) + _KERNPG_TABLE; - trampoline_pgd[511] = __pa(level3_kernel_pgt) + _KERNPG_TABLE; + trampoline_pgd[0] = init_level4_pgt[pgd_index(__PAGE_OFFSET)].pgd; + trampoline_pgd[511] = init_level4_pgt[511].pgd; #endif } From 4f7b92263ad68cdc72b11808320d9c881bfa857e Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Thu, 24 Jan 2013 12:19:51 -0800 Subject: [PATCH 069/105] x86, realmode: Separate real_mode reserve and setup After we switch to using the #PF handler to help set up the page tables, init_level4_pgt will only have entries set after init_mem_mapping(), so copying init_level4_pgt into trampoline_pgd has to happen after that point. Split the reserve and setup parts, and move the setup after init_mem_mapping(). Signed-off-by: Yinghai Lu Link: http://lkml.kernel.org/r/1359058816-7615-11-git-send-email-yinghai@kernel.org Cc: Jarkko Sakkinen Acked-by: Jarkko Sakkinen Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/realmode.h | 3 ++- arch/x86/kernel/setup.c | 4 +++- arch/x86/realmode/init.c | 32 ++++++++++++++++++------------ 3 files changed, 25 insertions(+), 14 deletions(-) diff --git a/arch/x86/include/asm/realmode.h b/arch/x86/include/asm/realmode.h index fe1ec5bcd846..9c6b890d5e7a 100644 --- a/arch/x86/include/asm/realmode.h +++ b/arch/x86/include/asm/realmode.h @@ -58,6 +58,7 @@ extern unsigned char boot_gdt[]; extern unsigned char secondary_startup_64[]; #endif -extern void __init setup_real_mode(void); +void reserve_real_mode(void); +void setup_real_mode(void); #endif /* _ARCH_X86_REALMODE_H */ diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 5552d04b0cc1..85a8290801df 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -999,12 +999,14 @@ void __init setup_arch(char **cmdline_p) printk(KERN_DEBUG "initial memory mapped: [mem 0x00000000-%#010lx]\n", (max_pfn_mapped<<PAGE_SHIFT)-1); From: Yinghai Lu Date: Thu, 24 Jan 2013 12:19:52 -0800 Subject: [PATCH 070/105] x86, 64bit: Use a #PF handler to materialize early mappings on demand Linear mode (CR0.PG = 0) is mutually exclusive with 64-bit mode; all 64-bit code has to use page tables. This makes it awkward, before we have first set up properly all-covering page tables, to access objects that are outside the static kernel range. So far we have dealt with that simply by mapping a fixed amount of low memory, but that fails in at least two upcoming use cases: 1. We will support loading and running the kernel, struct boot_params, ramdisk, command line, etc. above the 4 GiB mark. 2. We need to access the ramdisk early to get the microcode update in place as early as possible. We could use early_ioremap() to access them too, but it would make the code messy and hard to unify with 32-bit. Hence, set up a #PF handler and use a fixed number of buffers to set up page tables on demand. If the buffers fill up then we simply flush them and start over. These buffers are all in __initdata, so it does not increase RAM usage at runtime. Thus, with the help of the #PF handler, we can set up the final kernel mapping from a blank slate, and switch to init_level4_pgt later.
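The "fixed number of buffers, flush and start over" scheme can be sketched compactly. Because the early mappings are only a cache that the #PF handler can rebuild at any time, exhausting the pool is handled by throwing every mapping away rather than by allocating more memory. A simplified sketch, with names shortened from the real early_dynamic_pgts/next_early_pgt (not the exact code):

#define NR_EARLY_PGTS 64	/* EARLY_DYNAMIC_PAGE_TABLES in the real code */

static unsigned long early_pgts[NR_EARLY_PGTS][512] __initdata;
static unsigned int next_pgt;

/*
 * Hand out one page-sized table per call. On exhaustion, wipe all the
 * early mappings (clear the early PGD, reload CR3) and start handing
 * out buffers from the beginning again; anything still needed will
 * simply fault back in on demand.
 */
static void *alloc_early_pgt(void)
{
	if (next_pgt >= NR_EARLY_PGTS) {
		reset_early_page_tables();
		next_pgt = 0;
	}
	return early_pgts[next_pgt++];
}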
During the switchover in head_64.S, before #PF handler is available, we use three pages to handle kernel crossing 1G, 512G boundaries with sharing page by playing games with page aliasing: the same page is mapped twice in the higher-level tables with appropriate wraparound. The kernel region itself will be properly mapped; other mappings may be spurious. early_make_pgtable is using kernel high mapping address to access pages to set page table. -v4: Add phys_base offset to make kexec happy, and add init_mapping_kernel() - Yinghai -v5: fix compiling with xen, and add back ident level3 and level2 for xen also move back init_level4_pgt from BSS to DATA again. because we have to clear it anyway. - Yinghai -v6: switch to init_level4_pgt in init_mem_mapping. - Yinghai -v7: remove not needed clear_page for init_level4_page it is with fill 512,8,0 already in head_64.S - Yinghai -v8: we need to keep that handler alive until init_mem_mapping and don't let early_trap_init to trash that early #PF handler. So split early_trap_pf_init out and move it down. - Yinghai -v9: switchover only cover kernel space instead of 1G so could avoid touch possible mem holes. - Yinghai -v11: change far jmp back to far return to initial_code, that is needed to fix failure that is reported by Konrad on AMD systems. - Yinghai Signed-off-by: Yinghai Lu Link: http://lkml.kernel.org/r/1359058816-7615-12-git-send-email-yinghai@kernel.org Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/pgtable_64_types.h | 4 + arch/x86/include/asm/processor.h | 1 + arch/x86/kernel/head64.c | 81 ++++++++- arch/x86/kernel/head_64.S | 214 ++++++++++++++---------- arch/x86/kernel/setup.c | 2 + arch/x86/kernel/traps.c | 9 + arch/x86/mm/init.c | 3 +- 7 files changed, 221 insertions(+), 93 deletions(-) diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h index 766ea16fbbbd..2d883440cb9a 100644 --- a/arch/x86/include/asm/pgtable_64_types.h +++ b/arch/x86/include/asm/pgtable_64_types.h @@ -1,6 +1,8 @@ #ifndef _ASM_X86_PGTABLE_64_DEFS_H #define _ASM_X86_PGTABLE_64_DEFS_H +#include + #ifndef __ASSEMBLY__ #include @@ -60,4 +62,6 @@ typedef struct { pteval_t pte; } pte_t; #define MODULES_END _AC(0xffffffffff000000, UL) #define MODULES_LEN (MODULES_END - MODULES_VADDR) +#define EARLY_DYNAMIC_PAGE_TABLES 64 + #endif /* _ASM_X86_PGTABLE_64_DEFS_H */ diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 888184b2fc85..bdee8bd318ea 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -731,6 +731,7 @@ extern void enable_sep_cpu(void); extern int sysenter_setup(void); extern void early_trap_init(void); +void early_trap_pf_init(void); /* Defined in head.S */ extern struct desc_ptr early_gdt_descr; diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 7785e66840a4..f57df05ea126 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -27,11 +27,73 @@ #include #include -static void __init zap_identity_mappings(void) +/* + * Manage page tables very early on. 
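The handler below indexes each paging level straight from the faulting address. For reference, this is the standard 4-level decomposition of an x86-64 virtual address with 4 KiB pages (a standalone illustration, not kernel code):

#include <stdio.h>

/* 9 index bits per level plus a 12-bit page offset: 48-bit addresses. */
#define PGDIR_SHIFT	39
#define PUD_SHIFT	30
#define PMD_SHIFT	21
#define LEVEL_MASK	0x1ffUL

int main(void)
{
	unsigned long addr = 0xffffffff81234567UL;	/* example kernel address */

	printf("pgd %lu, pud %lu, pmd %lu, offset in 2M page %#lx\n",
	       (addr >> PGDIR_SHIFT) & LEVEL_MASK,
	       (addr >> PUD_SHIFT) & LEVEL_MASK,
	       (addr >> PMD_SHIFT) & LEVEL_MASK,
	       addr & ((1UL << PMD_SHIFT) - 1));
	return 0;
}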
+ */ +extern pgd_t early_level4_pgt[PTRS_PER_PGD]; +extern pmd_t early_dynamic_pgts[EARLY_DYNAMIC_PAGE_TABLES][PTRS_PER_PMD]; +static unsigned int __initdata next_early_pgt = 2; + +/* Wipe all early page tables except for the kernel symbol map */ +static void __init reset_early_page_tables(void) { - pgd_t *pgd = pgd_offset_k(0UL); - pgd_clear(pgd); - __flush_tlb_all(); + unsigned long i; + + for (i = 0; i < PTRS_PER_PGD-1; i++) + early_level4_pgt[i].pgd = 0; + + next_early_pgt = 0; + + write_cr3(__pa(early_level4_pgt)); +} + +/* Create a new PMD entry */ +int __init early_make_pgtable(unsigned long address) +{ + unsigned long physaddr = address - __PAGE_OFFSET; + unsigned long i; + pgdval_t pgd, *pgd_p; + pudval_t *pud_p; + pmdval_t pmd, *pmd_p; + + /* Invalid address or early pgt is done ? */ + if (physaddr >= MAXMEM || read_cr3() != __pa(early_level4_pgt)) + return -1; + + i = (address >> PGDIR_SHIFT) & (PTRS_PER_PGD - 1); + pgd_p = &early_level4_pgt[i].pgd; + pgd = *pgd_p; + + /* + * The use of __START_KERNEL_map rather than __PAGE_OFFSET here is + * critical -- __PAGE_OFFSET would point us back into the dynamic + * range and we might end up looping forever... + */ + if (pgd && next_early_pgt < EARLY_DYNAMIC_PAGE_TABLES) { + pud_p = (pudval_t *)((pgd & PTE_PFN_MASK) + __START_KERNEL_map - phys_base); + } else { + if (next_early_pgt >= EARLY_DYNAMIC_PAGE_TABLES-1) + reset_early_page_tables(); + + pud_p = (pudval_t *)early_dynamic_pgts[next_early_pgt++]; + for (i = 0; i < PTRS_PER_PUD; i++) + pud_p[i] = 0; + + *pgd_p = (pgdval_t)pud_p - __START_KERNEL_map + phys_base + _KERNPG_TABLE; + } + i = (address >> PUD_SHIFT) & (PTRS_PER_PUD - 1); + pud_p += i; + + pmd_p = (pmdval_t *)early_dynamic_pgts[next_early_pgt++]; + pmd = (physaddr & PUD_MASK) + (__PAGE_KERNEL_LARGE & ~_PAGE_GLOBAL); + for (i = 0; i < PTRS_PER_PMD; i++) { + pmd_p[i] = pmd; + pmd += PMD_SIZE; + } + + *pud_p = (pudval_t)pmd_p - __START_KERNEL_map + phys_base + _KERNPG_TABLE; + + return 0; } /* Don't add a printk in there. printk relies on the PDA which is not initialized @@ -72,12 +134,13 @@ void __init x86_64_start_kernel(char * real_mode_data) (__START_KERNEL & PGDIR_MASK))); BUILD_BUG_ON(__fix_to_virt(__end_of_fixed_addresses) <= MODULES_END); + /* Kill off the identity-map trampoline */ + reset_early_page_tables(); + /* clear bss before set_intr_gate with early_idt_handler */ clear_bss(); - /* Make NULL pointers segfault */ - zap_identity_mappings(); - + /* XXX - this is wrong... we need to build page tables from scratch */ max_pfn_mapped = KERNEL_IMAGE_SIZE >> PAGE_SHIFT; for (i = 0; i < NUM_EXCEPTION_VECTORS; i++) { @@ -94,6 +157,10 @@ void __init x86_64_start_kernel(char * real_mode_data) if (console_loglevel == 10) early_printk("Kernel alive\n"); + clear_page(init_level4_pgt); + /* set init_level4_pgt kernel high mapping*/ + init_level4_pgt[511] = early_level4_pgt[511]; + x86_64_start_reservations(real_mode_data); } diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 980053c4b9cc..d94f6d68be2a 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -47,14 +47,13 @@ L3_START_KERNEL = pud_index(__START_KERNEL_map) .code64 .globl startup_64 startup_64: - /* * At this point the CPU runs in 64bit mode CS.L = 1 CS.D = 1, * and someone has loaded an identity mapped page table * for us. These identity mapped page tables map all of the * kernel pages and possibly all of memory. * - * %esi holds a physical pointer to real_mode_data. + * %rsi holds a physical pointer to real_mode_data. 
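One subtlety in early_make_pgtable() above: __pa()/__va() are not usable this early, so virtual-to-physical conversion is open-coded against the kernel's high mapping, the only mapping guaranteed to exist at this point; phys_base is the delta between the address the kernel was linked at and where it was actually loaded. A sketch of the idiom (illustrative only, with the constant written out):

/*
 * Illustrative only: convert a kernel-image virtual address to a
 * physical address through the high mapping. This mirrors the
 * "- __START_KERNEL_map + phys_base" arithmetic in the code above.
 */
#define __START_KERNEL_map	0xffffffff80000000UL

static inline unsigned long early_virt_to_phys(unsigned long vaddr,
					       unsigned long phys_base)
{
	return vaddr - __START_KERNEL_map + phys_base;
}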
* * We come here either directly from a 64bit bootloader, or from * arch/x86_64/boot/compressed/head.S. @@ -66,7 +65,8 @@ startup_64: * tables and then reload them. */ - /* Compute the delta between the address I am compiled to run at and the + /* + * Compute the delta between the address I am compiled to run at and the * address I am actually running at. */ leaq _text(%rip), %rbp @@ -78,45 +78,62 @@ startup_64: testl %eax, %eax jnz bad_address - /* Is the address too large? */ - leaq _text(%rip), %rdx - movq $PGDIR_SIZE, %rax - cmpq %rax, %rdx - jae bad_address - - /* Fixup the physical addresses in the page table + /* + * Is the address too large? */ - addq %rbp, init_level4_pgt + 0(%rip) - addq %rbp, init_level4_pgt + (L4_PAGE_OFFSET*8)(%rip) - addq %rbp, init_level4_pgt + (L4_START_KERNEL*8)(%rip) + leaq _text(%rip), %rax + shrq $MAX_PHYSMEM_BITS, %rax + jnz bad_address - addq %rbp, level3_ident_pgt + 0(%rip) + /* + * Fixup the physical addresses in the page table + */ + addq %rbp, early_level4_pgt + (L4_START_KERNEL*8)(%rip) addq %rbp, level3_kernel_pgt + (510*8)(%rip) addq %rbp, level3_kernel_pgt + (511*8)(%rip) addq %rbp, level2_fixmap_pgt + (506*8)(%rip) - /* Add an Identity mapping if I am above 1G */ + /* + * Set up the identity mapping for the switchover. These + * entries should *NOT* have the global bit set! This also + * creates a bunch of nonsense entries but that is fine -- + * it avoids problems around wraparound. + */ leaq _text(%rip), %rdi - andq $PMD_PAGE_MASK, %rdi + leaq early_level4_pgt(%rip), %rbx + movq %rdi, %rax + shrq $PGDIR_SHIFT, %rax + + leaq (4096 + _KERNPG_TABLE)(%rbx), %rdx + movq %rdx, 0(%rbx,%rax,8) + movq %rdx, 8(%rbx,%rax,8) + + addq $4096, %rdx movq %rdi, %rax shrq $PUD_SHIFT, %rax - andq $(PTRS_PER_PUD - 1), %rax - jz ident_complete - - leaq (level2_spare_pgt - __START_KERNEL_map + _KERNPG_TABLE)(%rbp), %rdx - leaq level3_ident_pgt(%rip), %rbx - movq %rdx, 0(%rbx, %rax, 8) + andl $(PTRS_PER_PUD-1), %eax + movq %rdx, (4096+0)(%rbx,%rax,8) + movq %rdx, (4096+8)(%rbx,%rax,8) + addq $8192, %rbx movq %rdi, %rax - shrq $PMD_SHIFT, %rax - andq $(PTRS_PER_PMD - 1), %rax - leaq __PAGE_KERNEL_IDENT_LARGE_EXEC(%rdi), %rdx - leaq level2_spare_pgt(%rip), %rbx - movq %rdx, 0(%rbx, %rax, 8) -ident_complete: + shrq $PMD_SHIFT, %rdi + addq $(__PAGE_KERNEL_LARGE_EXEC & ~_PAGE_GLOBAL), %rax + leaq (_end - 1)(%rip), %rcx + shrq $PMD_SHIFT, %rcx + subq %rdi, %rcx + incl %ecx + +1: + andq $(PTRS_PER_PMD - 1), %rdi + movq %rax, (%rbx,%rdi,8) + incq %rdi + addq $PMD_SIZE, %rax + decl %ecx + jnz 1b /* * Fixup the kernel text+data virtual addresses. Note that @@ -124,7 +141,6 @@ ident_complete: * cleanup_highmap() fixes this up along with the mappings * beyond _end. */ - leaq level2_kernel_pgt(%rip), %rdi leaq 4096(%rdi), %r8 /* See if it is a valid page table entry */ @@ -139,17 +155,14 @@ ident_complete: /* Fixup phys_base */ addq %rbp, phys_base(%rip) - /* Due to ENTRY(), sometimes the empty space gets filled with - * zeros. Better take a jmp than relying on empty space being - * filled with 0x90 (nop) - */ - jmp secondary_startup_64 + movq $(early_level4_pgt - __START_KERNEL_map), %rax + jmp 1f ENTRY(secondary_startup_64) /* * At this point the CPU runs in 64bit mode CS.L = 1 CS.D = 1, * and someone has loaded a mapped page table. * - * %esi holds a physical pointer to real_mode_data. + * %rsi holds a physical pointer to real_mode_data. * * We come here either from startup_64 (using physical addresses) * or from trampoline.S (using virtual addresses). 
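The paired stores to 0(%rbx,%rax,8) and 8(%rbx,%rax,8) in the identity-map setup above are the "page aliasing" game from the changelog: installing the same lower-level table at index i and i+1 guarantees coverage even when the kernel image straddles the boundary between the two slots, at the cost of a few harmless spurious mappings. In C terms the idea is roughly:

/*
 * Rough C rendering of the switchover trick: point two adjacent
 * entries at the same next-level table so a region crossing the
 * i/i+1 boundary is mapped either way. The extra alias is harmless
 * because only the kernel image is accessed through these tables.
 */
static void install_aliased(unsigned long *table, unsigned int i,
			    unsigned long next_phys, unsigned long flags)
{
	table[i]     = next_phys | flags;
	table[i + 1] = next_phys | flags;
}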
@@ -159,12 +172,14 @@ ENTRY(secondary_startup_64) * after the boot processor executes this code. */ + movq $(init_level4_pgt - __START_KERNEL_map), %rax +1: + /* Enable PAE mode and PGE */ - movl $(X86_CR4_PAE | X86_CR4_PGE), %eax - movq %rax, %cr4 + movl $(X86_CR4_PAE | X86_CR4_PGE), %ecx + movq %rcx, %cr4 /* Setup early boot stage 4 level pagetables. */ - movq $(init_level4_pgt - __START_KERNEL_map), %rax addq phys_base(%rip), %rax movq %rax, %cr3 @@ -196,7 +211,7 @@ ENTRY(secondary_startup_64) movq %rax, %cr0 /* Setup a boot time stack */ - movq stack_start(%rip),%rsp + movq stack_start(%rip), %rsp /* zero EFLAGS after setting rsp */ pushq $0 @@ -236,15 +251,33 @@ ENTRY(secondary_startup_64) movl initial_gs+4(%rip),%edx wrmsr - /* esi is pointer to real mode structure with interesting info. + /* rsi is pointer to real mode structure with interesting info. pass it to C */ - movl %esi, %edi + movq %rsi, %rdi /* Finally jump to run C code and to be on real kernel address * Since we are running on identity-mapped space we have to jump * to the full 64bit address, this is only possible as indirect * jump. In addition we need to ensure %cs is set so we make this * a far return. + * + * Note: do not change to far jump indirect with 64bit offset. + * + * AMD does not support far jump indirect with 64bit offset. + * AMD64 Architecture Programmer's Manual, Volume 3: states only + * JMP FAR mem16:16 FF /5 Far jump indirect, + * with the target specified by a far pointer in memory. + * JMP FAR mem16:32 FF /5 Far jump indirect, + * with the target specified by a far pointer in memory. + * + * Intel64 does support 64bit offset. + * Software Developer Manual Vol 2: states: + * FF /5 JMP m16:16 Jump far, absolute indirect, + * address given in m16:16 + * FF /5 JMP m16:32 Jump far, absolute indirect, + * address given in m16:32. + * REX.W + FF /5 JMP m16:64 Jump far, absolute indirect, + * address given in m16:64. */ movq initial_code(%rip),%rax pushq $0 # fake return address to stop unwinder @@ -270,13 +303,13 @@ ENDPROC(start_cpu0) /* SMP bootup changes these two */ __REFDATA - .align 8 - ENTRY(initial_code) + .balign 8 + GLOBAL(initial_code) .quad x86_64_start_kernel - ENTRY(initial_gs) + GLOBAL(initial_gs) .quad INIT_PER_CPU_VAR(irq_stack_union) - ENTRY(stack_start) + GLOBAL(stack_start) .quad init_thread_union+THREAD_SIZE-8 .word 0 __FINITDATA @@ -284,7 +317,7 @@ ENDPROC(start_cpu0) bad_address: jmp bad_address - .section ".init.text","ax" + __INIT .globl early_idt_handlers early_idt_handlers: # 104(%rsp) %rflags @@ -321,14 +354,22 @@ ENTRY(early_idt_handler) pushq %r11 # 0(%rsp) cmpl $__KERNEL_CS,96(%rsp) - jne 10f + jne 11f + cmpl $14,72(%rsp) # Page fault? 
+ jnz 10f + GET_CR2_INTO(%rdi) # can clobber any volatile register if pv + call early_make_pgtable + andl %eax,%eax + jz 20f # All good + +10: leaq 88(%rsp),%rdi # Pointer to %rip call early_fixup_exception andl %eax,%eax jnz 20f # Found an exception entry -10: +11: #ifdef CONFIG_EARLY_PRINTK GET_CR2_INTO(%r9) # can clobber any volatile register if pv movl 80(%rsp),%r8d # error code @@ -350,7 +391,7 @@ ENTRY(early_idt_handler) 1: hlt jmp 1b -20: # Exception table entry found +20: # Exception table entry found or page table generated popq %r11 popq %r10 popq %r9 @@ -364,6 +405,8 @@ ENTRY(early_idt_handler) decl early_recursion_flag(%rip) INTERRUPT_RETURN + __INITDATA + .balign 4 early_recursion_flag: .long 0 @@ -374,11 +417,10 @@ early_idt_msg: early_idt_ripmsg: .asciz "RIP %s\n" #endif /* CONFIG_EARLY_PRINTK */ - .previous #define NEXT_PAGE(name) \ .balign PAGE_SIZE; \ -ENTRY(name) +GLOBAL(name) /* Automate the creation of 1 to 1 mapping pmd entries */ #define PMDS(START, PERM, COUNT) \ @@ -388,24 +430,37 @@ ENTRY(name) i = i + 1 ; \ .endr - .data - /* - * This default setting generates an ident mapping at address 0x100000 - * and a mapping for the kernel that precisely maps virtual address - * 0xffffffff80000000 to physical address 0x000000. (always using - * 2Mbyte large pages provided by PAE mode) - */ -NEXT_PAGE(init_level4_pgt) - .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE - .org init_level4_pgt + L4_PAGE_OFFSET*8, 0 - .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE - .org init_level4_pgt + L4_START_KERNEL*8, 0 - /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */ + __INITDATA +NEXT_PAGE(early_level4_pgt) + .fill 511,8,0 .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE +NEXT_PAGE(early_dynamic_pgts) + .fill 512*EARLY_DYNAMIC_PAGE_TABLES,8,0 + + .data + +#ifndef CONFIG_XEN +NEXT_PAGE(init_level4_pgt) + .fill 512,8,0 +#else +NEXT_PAGE(init_level4_pgt) + .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE + .org init_level4_pgt + L4_PAGE_OFFSET*8, 0 + .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE + .org init_level4_pgt + L4_START_KERNEL*8, 0 + /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */ + .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE + NEXT_PAGE(level3_ident_pgt) .quad level2_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE - .fill 511,8,0 + .fill 511, 8, 0 +NEXT_PAGE(level2_ident_pgt) + /* Since I easily can, map the first 1G. + * Don't set NX because code runs from these pages. + */ + PMDS(0, __PAGE_KERNEL_IDENT_LARGE_EXEC, PTRS_PER_PMD) +#endif NEXT_PAGE(level3_kernel_pgt) .fill L3_START_KERNEL,8,0 @@ -413,21 +468,6 @@ NEXT_PAGE(level3_kernel_pgt) .quad level2_kernel_pgt - __START_KERNEL_map + _KERNPG_TABLE .quad level2_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE -NEXT_PAGE(level2_fixmap_pgt) - .fill 506,8,0 - .quad level1_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE - /* 8MB reserved for vsyscalls + a 2MB hole = 4 + 1 entries */ - .fill 5,8,0 - -NEXT_PAGE(level1_fixmap_pgt) - .fill 512,8,0 - -NEXT_PAGE(level2_ident_pgt) - /* Since I easily can, map the first 1G. - * Don't set NX because code runs from these pages. - */ - PMDS(0, __PAGE_KERNEL_IDENT_LARGE_EXEC, PTRS_PER_PMD) - NEXT_PAGE(level2_kernel_pgt) /* * 512 MB kernel mapping. 
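The PMDS() assembler macro used to build these tables simply emits COUNT consecutive 2 MiB entries starting at physical address START. A C rendering of the same loop, for reference (a sketch, not kernel code):

#define PMD_SHIFT	21

/*
 * Equivalent of PMDS(START, PERM, COUNT): COUNT consecutive 2 MiB
 * large-page entries covering [start, start + (count << PMD_SHIFT)).
 */
static void fill_pmds(unsigned long *pmd, unsigned long start,
		      unsigned long perm, unsigned int count)
{
	for (unsigned int i = 0; i < count; i++)
		pmd[i] = (start + ((unsigned long)i << PMD_SHIFT)) | perm;
}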
We spend a full page on this pagetable @@ -442,11 +482,16 @@ NEXT_PAGE(level2_kernel_pgt) PMDS(0, __PAGE_KERNEL_LARGE_EXEC, KERNEL_IMAGE_SIZE/PMD_SIZE) -NEXT_PAGE(level2_spare_pgt) - .fill 512, 8, 0 +NEXT_PAGE(level2_fixmap_pgt) + .fill 506,8,0 + .quad level1_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE + /* 8MB reserved for vsyscalls + a 2MB hole = 4 + 1 entries */ + .fill 5,8,0 + +NEXT_PAGE(level1_fixmap_pgt) + .fill 512,8,0 #undef PMDS -#undef NEXT_PAGE .data .align 16 @@ -472,6 +517,5 @@ ENTRY(nmi_idt_table) .skip IDT_ENTRIES * 16 __PAGE_ALIGNED_BSS - .align PAGE_SIZE -ENTRY(empty_zero_page) +NEXT_PAGE(empty_zero_page) .skip PAGE_SIZE diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 85a8290801df..db9c41dae8d7 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -1005,6 +1005,8 @@ void __init setup_arch(char **cmdline_p) init_mem_mapping(); + early_trap_pf_init(); + setup_real_mode(); memblock.current_limit = get_max_mapped(); diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index ecffca11f4e9..68bda7a84159 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -688,10 +688,19 @@ void __init early_trap_init(void) set_intr_gate_ist(X86_TRAP_DB, &debug, DEBUG_STACK); /* int3 can be called from all */ set_system_intr_gate_ist(X86_TRAP_BP, &int3, DEBUG_STACK); +#ifdef CONFIG_X86_32 set_intr_gate(X86_TRAP_PF, &page_fault); +#endif load_idt(&idt_descr); } +void __init early_trap_pf_init(void) +{ +#ifdef CONFIG_X86_64 + set_intr_gate(X86_TRAP_PF, &page_fault); +#endif +} + void __init trap_init(void) { int i; diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 78d1ef3eab66..3364a7643a4c 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -446,9 +446,10 @@ void __init init_mem_mapping(void) } #else early_ioremap_page_table_range_init(); +#endif + load_cr3(swapper_pg_dir); __flush_tlb_all(); -#endif early_memtest(0, max_pfn_mapped << PAGE_SHIFT); } From 6b9c75aca6cba4d99a6e8d8274b1788d4d4b50d9 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Thu, 24 Jan 2013 12:19:53 -0800 Subject: [PATCH 071/105] x86, 64bit: #PF handler set page to cover only 2M per #PF We only map a single 2 MiB page per #PF, even though we should be able to do this a full gigabyte at a time with no additional memory cost. This is a workaround for a broken AMD reference BIOS (and its derivatives in shipping system) which maps a large chunk of memory as WB in the MTRR system but will #MC if the processor wanders off and tries to prefetch that memory, which can happen any time the memory is mapped in the TLB. Signed-off-by: Yinghai Lu Link: http://lkml.kernel.org/r/1359058816-7615-13-git-send-email-yinghai@kernel.org Cc: Alexander Duyck [ hpa: rewrote the patch description ] Signed-off-by: H. Peter Anvin --- arch/x86/kernel/head64.c | 40 ++++++++++++++++++++++++---------------- 1 file changed, 24 insertions(+), 16 deletions(-) diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index f57df05ea126..816fc85c9bb3 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -53,15 +53,15 @@ int __init early_make_pgtable(unsigned long address) unsigned long physaddr = address - __PAGE_OFFSET; unsigned long i; pgdval_t pgd, *pgd_p; - pudval_t *pud_p; + pudval_t pud, *pud_p; pmdval_t pmd, *pmd_p; /* Invalid address or early pgt is done ? 
*/ if (physaddr >= MAXMEM || read_cr3() != __pa(early_level4_pgt)) return -1; - i = (address >> PGDIR_SHIFT) & (PTRS_PER_PGD - 1); - pgd_p = &early_level4_pgt[i].pgd; +again: + pgd_p = &early_level4_pgt[pgd_index(address)].pgd; pgd = *pgd_p; /* @@ -69,29 +69,37 @@ int __init early_make_pgtable(unsigned long address) * critical -- __PAGE_OFFSET would point us back into the dynamic * range and we might end up looping forever... */ - if (pgd && next_early_pgt < EARLY_DYNAMIC_PAGE_TABLES) { + if (pgd) pud_p = (pudval_t *)((pgd & PTE_PFN_MASK) + __START_KERNEL_map - phys_base); - } else { - if (next_early_pgt >= EARLY_DYNAMIC_PAGE_TABLES-1) + else { + if (next_early_pgt >= EARLY_DYNAMIC_PAGE_TABLES) { reset_early_page_tables(); + goto again; + } pud_p = (pudval_t *)early_dynamic_pgts[next_early_pgt++]; for (i = 0; i < PTRS_PER_PUD; i++) pud_p[i] = 0; - *pgd_p = (pgdval_t)pud_p - __START_KERNEL_map + phys_base + _KERNPG_TABLE; } - i = (address >> PUD_SHIFT) & (PTRS_PER_PUD - 1); - pud_p += i; + pud_p += pud_index(address); + pud = *pud_p; - pmd_p = (pmdval_t *)early_dynamic_pgts[next_early_pgt++]; - pmd = (physaddr & PUD_MASK) + (__PAGE_KERNEL_LARGE & ~_PAGE_GLOBAL); - for (i = 0; i < PTRS_PER_PMD; i++) { - pmd_p[i] = pmd; - pmd += PMD_SIZE; + if (pud) + pmd_p = (pmdval_t *)((pud & PTE_PFN_MASK) + __START_KERNEL_map - phys_base); + else { + if (next_early_pgt >= EARLY_DYNAMIC_PAGE_TABLES) { + reset_early_page_tables(); + goto again; + } + + pmd_p = (pmdval_t *)early_dynamic_pgts[next_early_pgt++]; + for (i = 0; i < PTRS_PER_PMD; i++) + pmd_p[i] = 0; + *pud_p = (pudval_t)pmd_p - __START_KERNEL_map + phys_base + _KERNPG_TABLE; } - - *pud_p = (pudval_t)pmd_p - __START_KERNEL_map + phys_base + _KERNPG_TABLE; + pmd = (physaddr & PMD_MASK) + (__PAGE_KERNEL_LARGE & ~_PAGE_GLOBAL); + pmd_p[pmd_index(address)] = pmd; return 0; } From 100542306f644fc580857a8ca4896fb12b794d41 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Thu, 24 Jan 2013 12:19:54 -0800 Subject: [PATCH 072/105] x86, 64bit: Don't set max_pfn_mapped wrong value early on native path We are not having max_pfn_mapped set correctly until init_memory_mapping. So don't print its initial value for 64bit Also need to use KERNEL_IMAGE_SIZE directly for highmap cleanup. -v2: update comments about max_pfn_mapped according to Stefano Stabellini. Signed-off-by: Yinghai Lu Link: http://lkml.kernel.org/r/1359058816-7615-14-git-send-email-yinghai@kernel.org Acked-by: Borislav Petkov Signed-off-by: H. Peter Anvin --- arch/x86/kernel/head64.c | 3 --- arch/x86/kernel/setup.c | 2 ++ arch/x86/mm/init_64.c | 10 +++++++++- 3 files changed, 11 insertions(+), 4 deletions(-) diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 816fc85c9bb3..f3b19685918e 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -148,9 +148,6 @@ void __init x86_64_start_kernel(char * real_mode_data) /* clear bss before set_intr_gate with early_idt_handler */ clear_bss(); - /* XXX - this is wrong... 
we need to build page tables from scratch */ - max_pfn_mapped = KERNEL_IMAGE_SIZE >> PAGE_SHIFT; - for (i = 0; i < NUM_EXCEPTION_VECTORS; i++) { #ifdef CONFIG_EARLY_PRINTK set_intr_gate(i, &early_idt_handlers[i]); diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index db9c41dae8d7..d58083a2e158 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -996,8 +996,10 @@ void __init setup_arch(char **cmdline_p) setup_bios_corruption_check(); #endif +#ifdef CONFIG_X86_32 printk(KERN_DEBUG "initial memory mapped: [mem 0x00000000-%#010lx]\n", (max_pfn_mapped<<PAGE_SHIFT)-1); +#endif From: Yinghai Lu Date: Thu, 24 Jan 2013 12:19:55 -0800 Subject: [PATCH 073/105] x86: Merge early_reserve_initrd for 32bit and 64bit They are the same, so move them out of head32.c/head64.c into setup.c. We are using memblock, which handles overlapping properly, so we don't need an early reservation just to hold the location; we only need to make sure the ramdisk is reserved before memblock is used to find free memory. Signed-off-by: Yinghai Lu Link: http://lkml.kernel.org/r/1359058816-7615-15-git-send-email-yinghai@kernel.org Reviewed-by: Pekka Enberg Signed-off-by: H. Peter Anvin --- arch/x86/kernel/head32.c | 11 ----------- arch/x86/kernel/head64.c | 11 ----------- arch/x86/kernel/setup.c | 22 ++++++++++++++++++---- 3 files changed, 18 insertions(+), 26 deletions(-) diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c index 6773c918b8cc..a795b54de7d3 100644 --- a/arch/x86/kernel/head32.c +++ b/arch/x86/kernel/head32.c @@ -36,17 +36,6 @@ void __init i386_start_kernel(void) memblock_reserve(__pa_symbol(&_text), __pa_symbol(&__bss_stop) - __pa_symbol(&_text)); -#ifdef CONFIG_BLK_DEV_INITRD - /* Reserve INITRD */ - if (boot_params.hdr.type_of_loader && boot_params.hdr.ramdisk_image) { - /* Assume only end is not page aligned */ - u64 ramdisk_image = boot_params.hdr.ramdisk_image; - u64 ramdisk_size = boot_params.hdr.ramdisk_size; - u64 ramdisk_end = PAGE_ALIGN(ramdisk_image + ramdisk_size); - memblock_reserve(ramdisk_image, ramdisk_end - ramdisk_image); - } -#endif - /* Call the subarch specific early setup function */ switch (boot_params.hdr.hardware_subarch) { case X86_SUBARCH_MRST: diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index f3b19685918e..b88a1fab2158 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -178,17 +178,6 @@ void __init x86_64_start_reservations(char *real_mode_data) memblock_reserve(__pa_symbol(&_text), __pa_symbol(&__bss_stop) - __pa_symbol(&_text)); -#ifdef CONFIG_BLK_DEV_INITRD - /* Reserve INITRD */ - if (boot_params.hdr.type_of_loader && boot_params.hdr.ramdisk_image) { - /* Assume only end is not page aligned */ - unsigned long ramdisk_image = boot_params.hdr.ramdisk_image; - unsigned long ramdisk_size = boot_params.hdr.ramdisk_size; - unsigned long ramdisk_end = PAGE_ALIGN(ramdisk_image + ramdisk_size); - memblock_reserve(ramdisk_image, ramdisk_end - ramdisk_image); - } -#endif - reserve_ebda_region(); /* diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index d58083a2e158..8e356923cbd0 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -360,6 +360,19 @@ static u64 __init get_mem_size(unsigned long limit_pfn) return mapped_pages << PAGE_SHIFT; } +static void __init early_reserve_initrd(void) +{ + /* Assume only end is not page aligned */ + u64 ramdisk_image = boot_params.hdr.ramdisk_image; + u64 ramdisk_size = boot_params.hdr.ramdisk_size; + u64 ramdisk_end = PAGE_ALIGN(ramdisk_image + ramdisk_size); + + if
(!boot_params.hdr.type_of_loader || + !ramdisk_image || !ramdisk_size) + return; /* No initrd provided by bootloader */ + + memblock_reserve(ramdisk_image, ramdisk_end - ramdisk_image); +} static void __init reserve_initrd(void) { /* Assume only end is not page aligned */ @@ -386,10 +399,6 @@ static void __init reserve_initrd(void) if (pfn_range_is_mapped(PFN_DOWN(ramdisk_image), PFN_DOWN(ramdisk_end))) { /* All are mapped, easy case */ - /* - * don't need to reserve again, already reserved early - * in i386_start_kernel - */ initrd_start = ramdisk_image + PAGE_OFFSET; initrd_end = initrd_start + ramdisk_size; return; @@ -400,6 +409,9 @@ static void __init reserve_initrd(void) memblock_free(ramdisk_image, ramdisk_end - ramdisk_image); } #else +static void __init early_reserve_initrd(void) +{ +} static void __init reserve_initrd(void) { } @@ -760,6 +772,8 @@ early_param("reservelow", parse_reservelow); void __init setup_arch(char **cmdline_p) { + early_reserve_initrd(); + #ifdef CONFIG_X86_32 memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data)); visws_early_detect(); From a8a51a88d5152aa40e5e07dcdd939c7fafc42224 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Thu, 24 Jan 2013 12:19:56 -0800 Subject: [PATCH 074/105] x86: Add get_ramdisk_image/size() There are several places to find ramdisk information early for reserving and relocating. Use accessor functions to make code more readable and consistent. Later will add ext_ramdisk_image/size in those functions to support loading ramdisk above 4g. Signed-off-by: Yinghai Lu Link: http://lkml.kernel.org/r/1359058816-7615-16-git-send-email-yinghai@kernel.org Signed-off-by: H. Peter Anvin --- arch/x86/kernel/setup.c | 29 +++++++++++++++++++++-------- 1 file changed, 21 insertions(+), 8 deletions(-) diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 8e356923cbd0..83b38617ff59 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -294,12 +294,25 @@ static void __init reserve_brk(void) #ifdef CONFIG_BLK_DEV_INITRD +static u64 __init get_ramdisk_image(void) +{ + u64 ramdisk_image = boot_params.hdr.ramdisk_image; + + return ramdisk_image; +} +static u64 __init get_ramdisk_size(void) +{ + u64 ramdisk_size = boot_params.hdr.ramdisk_size; + + return ramdisk_size; +} + #define MAX_MAP_CHUNK (NR_FIX_BTMAPS << PAGE_SHIFT) static void __init relocate_initrd(void) { /* Assume only end is not page aligned */ - u64 ramdisk_image = boot_params.hdr.ramdisk_image; - u64 ramdisk_size = boot_params.hdr.ramdisk_size; + u64 ramdisk_image = get_ramdisk_image(); + u64 ramdisk_size = get_ramdisk_size(); u64 area_size = PAGE_ALIGN(ramdisk_size); u64 ramdisk_here; unsigned long slop, clen, mapaddr; @@ -338,8 +351,8 @@ static void __init relocate_initrd(void) ramdisk_size -= clen; } - ramdisk_image = boot_params.hdr.ramdisk_image; - ramdisk_size = boot_params.hdr.ramdisk_size; + ramdisk_image = get_ramdisk_image(); + ramdisk_size = get_ramdisk_size(); printk(KERN_INFO "Move RAMDISK from [mem %#010llx-%#010llx] to" " [mem %#010llx-%#010llx]\n", ramdisk_image, ramdisk_image + ramdisk_size - 1, @@ -363,8 +376,8 @@ static u64 __init get_mem_size(unsigned long limit_pfn) static void __init early_reserve_initrd(void) { /* Assume only end is not page aligned */ - u64 ramdisk_image = boot_params.hdr.ramdisk_image; - u64 ramdisk_size = boot_params.hdr.ramdisk_size; + u64 ramdisk_image = get_ramdisk_image(); + u64 ramdisk_size = get_ramdisk_size(); u64 ramdisk_end = PAGE_ALIGN(ramdisk_image + ramdisk_size); if (!boot_params.hdr.type_of_loader || 
@@ -376,8 +389,8 @@ static void __init early_reserve_initrd(void) static void __init reserve_initrd(void) { /* Assume only end is not page aligned */ - u64 ramdisk_image = boot_params.hdr.ramdisk_image; - u64 ramdisk_size = boot_params.hdr.ramdisk_size; + u64 ramdisk_image = get_ramdisk_image(); + u64 ramdisk_size = get_ramdisk_size(); u64 ramdisk_end = PAGE_ALIGN(ramdisk_image + ramdisk_size); u64 mapped_size; From f1da834cd902f5e5df0b11a3948fc43c6071b590 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Thu, 24 Jan 2013 12:19:57 -0800 Subject: [PATCH 075/105] x86, boot: Add get_cmd_line_ptr() Add an accessor function for the command line address. Later we will add support for holding a 64-bit address via ext_cmd_line_ptr. Signed-off-by: Yinghai Lu Link: http://lkml.kernel.org/r/1359058816-7615-17-git-send-email-yinghai@kernel.org Cc: Gokul Caushik Cc: Josh Triplett Cc: Joe Millenbach Cc: Alexander Duyck Signed-off-by: H. Peter Anvin --- arch/x86/boot/compressed/cmdline.c | 10 ++++++++-- arch/x86/kernel/head64.c | 13 +++++++++++-- 2 files changed, 19 insertions(+), 4 deletions(-) diff --git a/arch/x86/boot/compressed/cmdline.c b/arch/x86/boot/compressed/cmdline.c index 10f6b1178c68..b4c913c5c4ad 100644 --- a/arch/x86/boot/compressed/cmdline.c +++ b/arch/x86/boot/compressed/cmdline.c @@ -13,13 +13,19 @@ static inline char rdfs8(addr_t addr) return *((char *)(fs + addr)); } #include "../cmdline.c" +static unsigned long get_cmd_line_ptr(void) +{ + unsigned long cmd_line_ptr = real_mode->hdr.cmd_line_ptr; + + return cmd_line_ptr; +} int cmdline_find_option(const char *option, char *buffer, int bufsize) { - return __cmdline_find_option(real_mode->hdr.cmd_line_ptr, option, buffer, bufsize); + return __cmdline_find_option(get_cmd_line_ptr(), option, buffer, bufsize); } int cmdline_find_option_bool(const char *option) { - return __cmdline_find_option_bool(real_mode->hdr.cmd_line_ptr, option); + return __cmdline_find_option_bool(get_cmd_line_ptr(), option); } #endif diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index b88a1fab2158..62c8ce44cac4 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -112,14 +112,23 @@ static void __init clear_bss(void) (unsigned long) __bss_stop - (unsigned long) __bss_start); } +static unsigned long get_cmd_line_ptr(void) +{ + unsigned long cmd_line_ptr = boot_params.hdr.cmd_line_ptr; + + return cmd_line_ptr; +} + static void __init copy_bootdata(char *real_mode_data) { char * command_line; + unsigned long cmd_line_ptr; memcpy(&boot_params, real_mode_data, sizeof boot_params); sanitize_boot_params(&boot_params); - if (boot_params.hdr.cmd_line_ptr) { - command_line = __va(boot_params.hdr.cmd_line_ptr); + cmd_line_ptr = get_cmd_line_ptr(); + if (cmd_line_ptr) { + command_line = __va(cmd_line_ptr); memcpy(boot_command_line, command_line, COMMAND_LINE_SIZE); } } From 16a4baa642cf448742aaf150c4daa093f9dbebbb Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Thu, 24 Jan 2013 12:19:58 -0800 Subject: [PATCH 076/105] x86, boot: Move checking of cmd_line_ptr out of common path cmdline.c::__cmdline_find_option... are shared between 16-bit setup code and 32/64 bit decompressor code. for 32/64 only path via kexec, we should not check if ptr is less 1M. as those cmdline could be put above 1M, or even 4G. Move out accessible checking out of __cmdline_find_option() So decompressor in misc.c can parse cmdline correctly. Signed-off-by: Yinghai Lu Link: http://lkml.kernel.org/r/1359058816-7615-18-git-send-email-yinghai@kernel.org Signed-off-by: H. 
Peter Anvin --- arch/x86/boot/boot.h | 14 ++++++++++++-- arch/x86/boot/cmdline.c | 8 ++++---- 2 files changed, 16 insertions(+), 6 deletions(-) diff --git a/arch/x86/boot/boot.h b/arch/x86/boot/boot.h index 18997e5a1053..7fadf806f7ca 100644 --- a/arch/x86/boot/boot.h +++ b/arch/x86/boot/boot.h @@ -289,12 +289,22 @@ int __cmdline_find_option(u32 cmdline_ptr, const char *option, char *buffer, int int __cmdline_find_option_bool(u32 cmdline_ptr, const char *option); static inline int cmdline_find_option(const char *option, char *buffer, int bufsize) { - return __cmdline_find_option(boot_params.hdr.cmd_line_ptr, option, buffer, bufsize); + u32 cmd_line_ptr = boot_params.hdr.cmd_line_ptr; + + if (cmd_line_ptr >= 0x100000) + return -1; /* inaccessible */ + + return __cmdline_find_option(cmd_line_ptr, option, buffer, bufsize); } static inline int cmdline_find_option_bool(const char *option) { - return __cmdline_find_option_bool(boot_params.hdr.cmd_line_ptr, option); + u32 cmd_line_ptr = boot_params.hdr.cmd_line_ptr; + + if (cmd_line_ptr >= 0x100000) + return -1; /* inaccessible */ + + return __cmdline_find_option_bool(cmd_line_ptr, option); } diff --git a/arch/x86/boot/cmdline.c b/arch/x86/boot/cmdline.c index 6b3b6f708c04..768f00f32469 100644 --- a/arch/x86/boot/cmdline.c +++ b/arch/x86/boot/cmdline.c @@ -41,8 +41,8 @@ int __cmdline_find_option(u32 cmdline_ptr, const char *option, char *buffer, int st_bufcpy /* Copying this to buffer */ } state = st_wordstart; - if (!cmdline_ptr || cmdline_ptr >= 0x100000) - return -1; /* No command line, or inaccessible */ + if (!cmdline_ptr) + return -1; /* No command line */ cptr = cmdline_ptr & 0xf; set_fs(cmdline_ptr >> 4); @@ -111,8 +111,8 @@ int __cmdline_find_option_bool(u32 cmdline_ptr, const char *option) st_wordskip, /* Miscompare, skip */ } state = st_wordstart; - if (!cmdline_ptr || cmdline_ptr >= 0x100000) - return -1; /* No command line, or inaccessible */ + if (!cmdline_ptr) + return -1; /* No command line */ cptr = cmdline_ptr & 0xf; set_fs(cmdline_ptr >> 4); From 3db07e70f0b4742f8daeda5c4aa8fbe7aeb3799e Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Thu, 24 Jan 2013 12:19:59 -0800 Subject: [PATCH 077/105] x86, boot: Pass cmd_line_ptr with unsigned long instead boot/compressed/misc.c is used for bzImage in 64bit and 32bit, and cmd_line_ptr could point to buffer that is above 4g, cmd_line_ptr should be 64bit otherwise high 32bit will be capped out. So need to change data type to unsigned long, that will be 64bit get correct address of command line buffer. And it is still ok with 32bit bzImage, because unsigned long on 32bit kernel is still 32bit. Signed-off-by: Yinghai Lu Link: http://lkml.kernel.org/r/1359058816-7615-19-git-send-email-yinghai@kernel.org Signed-off-by: H. 
Peter Anvin --- arch/x86/boot/boot.h | 8 ++++---- arch/x86/boot/cmdline.c | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/x86/boot/boot.h b/arch/x86/boot/boot.h index 7fadf806f7ca..5b7531966b84 100644 --- a/arch/x86/boot/boot.h +++ b/arch/x86/boot/boot.h @@ -285,11 +285,11 @@ struct biosregs { void intcall(u8 int_no, const struct biosregs *ireg, struct biosregs *oreg); /* cmdline.c */ -int __cmdline_find_option(u32 cmdline_ptr, const char *option, char *buffer, int bufsize); -int __cmdline_find_option_bool(u32 cmdline_ptr, const char *option); +int __cmdline_find_option(unsigned long cmdline_ptr, const char *option, char *buffer, int bufsize); +int __cmdline_find_option_bool(unsigned long cmdline_ptr, const char *option); static inline int cmdline_find_option(const char *option, char *buffer, int bufsize) { - u32 cmd_line_ptr = boot_params.hdr.cmd_line_ptr; + unsigned long cmd_line_ptr = boot_params.hdr.cmd_line_ptr; if (cmd_line_ptr >= 0x100000) return -1; /* inaccessible */ @@ -299,7 +299,7 @@ static inline int cmdline_find_option(const char *option, char *buffer, int bufs static inline int cmdline_find_option_bool(const char *option) { - u32 cmd_line_ptr = boot_params.hdr.cmd_line_ptr; + unsigned long cmd_line_ptr = boot_params.hdr.cmd_line_ptr; if (cmd_line_ptr >= 0x100000) return -1; /* inaccessible */ diff --git a/arch/x86/boot/cmdline.c b/arch/x86/boot/cmdline.c index 768f00f32469..625d21b0cd3f 100644 --- a/arch/x86/boot/cmdline.c +++ b/arch/x86/boot/cmdline.c @@ -27,7 +27,7 @@ static inline int myisspace(u8 c) * Returns the length of the argument (regardless of if it was * truncated to fit in the buffer), or -1 on not found. */ -int __cmdline_find_option(u32 cmdline_ptr, const char *option, char *buffer, int bufsize) +int __cmdline_find_option(unsigned long cmdline_ptr, const char *option, char *buffer, int bufsize) { addr_t cptr; char c; @@ -99,7 +99,7 @@ int __cmdline_find_option(u32 cmdline_ptr, const char *option, char *buffer, int * Returns the position of that option (starts counting with 1) * or 0 on not found */ -int __cmdline_find_option_bool(u32 cmdline_ptr, const char *option) +int __cmdline_find_option_bool(unsigned long cmdline_ptr, const char *option) { addr_t cptr; char c; From 187a8a73cee295b9407de0d6bfba65471a1f39d6 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Thu, 24 Jan 2013 12:20:00 -0800 Subject: [PATCH 078/105] x86, boot: Move verify_cpu.S and no_longmode down We need to move some code to 32bit section in following patch: x86, boot: Move lldt/ltr out of 64bit code section but that will push startup_64 down from 0x200. According to hpa, we can not change startup_64 position and that is an ABI. We could move function verify_cpu and no_longmode down, because verify_cpu is used via function call and no_longmode will not return, then we don't need to add extra code for jumping back. Signed-off-by: Yinghai Lu Link: http://lkml.kernel.org/r/1359058816-7615-20-git-send-email-yinghai@kernel.org Cc: Matt Fleming Signed-off-by: H. 
Peter Anvin --- arch/x86/boot/compressed/head_64.S | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S index 2c4b171eec33..fb984c0c0c99 100644 --- a/arch/x86/boot/compressed/head_64.S +++ b/arch/x86/boot/compressed/head_64.S @@ -176,14 +176,6 @@ ENTRY(startup_32) lret ENDPROC(startup_32) -no_longmode: - /* This isn't an x86-64 CPU so hang */ -1: - hlt - jmp 1b - -#include "../../kernel/verify_cpu.S" - /* * Be careful here startup_64 needs to be at a predictable * address so I can export it in an ELF header. Bootloaders @@ -349,6 +341,15 @@ relocated: */ jmp *%rbp + .code32 +no_longmode: + /* This isn't an x86-64 CPU so hang */ +1: + hlt + jmp 1b + +#include "../../kernel/verify_cpu.S" + .data gdt: .word gdt_end - gdt From d3c433bf9a01b6951286ec2cbf52e3549623d878 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Thu, 24 Jan 2013 12:20:01 -0800 Subject: [PATCH 079/105] x86, boot: Move lldt/ltr out of 64bit code section commit 08da5a2ca x86_64: Early segment setup for VT sets up LDT and TR into a valid state in order to speed up boot decompression under VT. Those code are put in code64, and it is using GDT that is only loaded from code32 path. That breaks booting with 64bit bootloader that does not go through code32 path and jump to startup_64 directly, and it has different GDT. Move those lines into code32 after their GDT is loaded. Signed-off-by: Yinghai Lu Link: http://lkml.kernel.org/r/1359058816-7615-21-git-send-email-yinghai@kernel.org Cc: Zachary Amsden Cc: Matt Fleming Signed-off-by: H. Peter Anvin --- arch/x86/boot/compressed/head_64.S | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S index fb984c0c0c99..5c80b94f6c4a 100644 --- a/arch/x86/boot/compressed/head_64.S +++ b/arch/x86/boot/compressed/head_64.S @@ -154,6 +154,12 @@ ENTRY(startup_32) btsl $_EFER_LME, %eax wrmsr + /* After gdt is loaded */ + xorl %eax, %eax + lldt %ax + movl $0x20, %eax + ltr %ax + /* * Setup for the jump to 64bit mode * @@ -239,9 +245,6 @@ preferred_addr: movl %eax, %ss movl %eax, %fs movl %eax, %gs - lldt %ax - movl $0x20, %eax - ltr %ax /* * Compute the decompressed kernel start address. It is where From 577af55d802d9fe114287e750504e09e7c677c9c Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Thu, 24 Jan 2013 12:20:02 -0800 Subject: [PATCH 080/105] x86, kexec: Remove 1024G limitation for kexec buffer on 64bit Now 64bit kernel supports more than 1T ram and kexec tools could find buffer above 1T, remove that obsolete limitation. and use MAXMEM instead. Tested on system with more than 1024G ram. Signed-off-by: Yinghai Lu Link: http://lkml.kernel.org/r/1359058816-7615-22-git-send-email-yinghai@kernel.org Cc: Eric W. Biederman Signed-off-by: H. 
Peter Anvin --- arch/x86/include/asm/kexec.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/include/asm/kexec.h b/arch/x86/include/asm/kexec.h index 6080d2694bad..17483a492f18 100644 --- a/arch/x86/include/asm/kexec.h +++ b/arch/x86/include/asm/kexec.h @@ -48,11 +48,11 @@ # define vmcore_elf_check_arch_cross(x) ((x)->e_machine == EM_X86_64) #else /* Maximum physical address we can use pages from */ -# define KEXEC_SOURCE_MEMORY_LIMIT (0xFFFFFFFFFFUL) +# define KEXEC_SOURCE_MEMORY_LIMIT (MAXMEM-1) /* Maximum address we can reach in physical address mode */ -# define KEXEC_DESTINATION_MEMORY_LIMIT (0xFFFFFFFFFFUL) +# define KEXEC_DESTINATION_MEMORY_LIMIT (MAXMEM-1) /* Maximum address we can use for the control pages */ -# define KEXEC_CONTROL_MEMORY_LIMIT (0xFFFFFFFFFFUL) +# define KEXEC_CONTROL_MEMORY_LIMIT (MAXMEM-1) /* Allocate one page for the pdp and the second for the code */ # define KEXEC_CONTROL_PAGE_SIZE (4096UL + 4096UL) From 084d1283986a530828b8898f206adf44d5d3146d Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Thu, 24 Jan 2013 12:20:03 -0800 Subject: [PATCH 081/105] x86, kexec: Set ident mapping for kernel that is above max_pfn When first kernel is booted with memmap= or mem= to limit max_pfn. kexec can load second kernel above that max_pfn. We need to set ident mapping for whole image in this case instead of just for first 2M. Signed-off-by: Yinghai Lu Link: http://lkml.kernel.org/r/1359058816-7615-23-git-send-email-yinghai@kernel.org Signed-off-by: H. Peter Anvin --- arch/x86/kernel/machine_kexec_64.c | 43 +++++++++++++++++++++++++----- 1 file changed, 37 insertions(+), 6 deletions(-) diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index b3ea9db39db6..be14ee120c43 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -56,6 +56,25 @@ out: return result; } +static int ident_mapping_init(struct kimage *image, pgd_t *level4p, + unsigned long mstart, unsigned long mend) +{ + int result; + + mstart = round_down(mstart, PMD_SIZE); + mend = round_up(mend - 1, PMD_SIZE); + + while (mstart < mend) { + result = init_one_level2_page(image, level4p, mstart); + if (result) + return result; + + mstart += PMD_SIZE; + } + + return 0; +} + static void init_level2_page(pmd_t *level2p, unsigned long addr) { unsigned long end_addr; @@ -184,22 +203,34 @@ err: return result; } - static int init_pgtable(struct kimage *image, unsigned long start_pgtable) { + unsigned long mstart, mend; pgd_t *level4p; int result; + int i; + level4p = (pgd_t *)__va(start_pgtable); result = init_level4_page(image, level4p, 0, max_pfn << PAGE_SHIFT); if (result) return result; + /* - * image->start may be outside 0 ~ max_pfn, for example when - * jump back to original kernel from kexeced kernel + * segments's mem ranges could be outside 0 ~ max_pfn, + * for example when jump back to original kernel from kexeced kernel. + * or first kernel is booted with user mem map, and second kernel + * could be loaded out of that range. 
*/ - result = init_one_level2_page(image, level4p, image->start); - if (result) - return result; + for (i = 0; i < image->nr_segments; i++) { + mstart = image->segment[i].mem; + mend = mstart + image->segment[i].memsz; + + result = ident_mapping_init(image, level4p, mstart, mend); + + if (result) + return result; + } + return init_transition_pgtable(image, level4p); } From 9ebdc79f7a177d3098b89ba8ef2dd2b235163685 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Thu, 24 Jan 2013 12:20:04 -0800 Subject: [PATCH 082/105] x86, kexec: Replace ident_mapping_init and init_level4_page Now ident_mapping_init is checking if pgd/pud is present for every 2M, so several 2Ms are in same PUD, it will keep checking if pud is there with same pud. init_level4_page just does not check existing pgd/pud. We could use generic mapping_init with different settings in info to replace those two local grown version functions. Signed-off-by: Yinghai Lu Link: http://lkml.kernel.org/r/1359058816-7615-24-git-send-email-yinghai@kernel.org Signed-off-by: H. Peter Anvin --- arch/x86/kernel/machine_kexec_64.c | 161 +++++------------------------ 1 file changed, 26 insertions(+), 135 deletions(-) diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index be14ee120c43..d2d7e023a8c8 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -16,144 +16,12 @@ #include #include +#include #include #include #include #include -static int init_one_level2_page(struct kimage *image, pgd_t *pgd, - unsigned long addr) -{ - pud_t *pud; - pmd_t *pmd; - struct page *page; - int result = -ENOMEM; - - addr &= PMD_MASK; - pgd += pgd_index(addr); - if (!pgd_present(*pgd)) { - page = kimage_alloc_control_pages(image, 0); - if (!page) - goto out; - pud = (pud_t *)page_address(page); - clear_page(pud); - set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE)); - } - pud = pud_offset(pgd, addr); - if (!pud_present(*pud)) { - page = kimage_alloc_control_pages(image, 0); - if (!page) - goto out; - pmd = (pmd_t *)page_address(page); - clear_page(pmd); - set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE)); - } - pmd = pmd_offset(pud, addr); - if (!pmd_present(*pmd)) - set_pmd(pmd, __pmd(addr | __PAGE_KERNEL_LARGE_EXEC)); - result = 0; -out: - return result; -} - -static int ident_mapping_init(struct kimage *image, pgd_t *level4p, - unsigned long mstart, unsigned long mend) -{ - int result; - - mstart = round_down(mstart, PMD_SIZE); - mend = round_up(mend - 1, PMD_SIZE); - - while (mstart < mend) { - result = init_one_level2_page(image, level4p, mstart); - if (result) - return result; - - mstart += PMD_SIZE; - } - - return 0; -} - -static void init_level2_page(pmd_t *level2p, unsigned long addr) -{ - unsigned long end_addr; - - addr &= PAGE_MASK; - end_addr = addr + PUD_SIZE; - while (addr < end_addr) { - set_pmd(level2p++, __pmd(addr | __PAGE_KERNEL_LARGE_EXEC)); - addr += PMD_SIZE; - } -} - -static int init_level3_page(struct kimage *image, pud_t *level3p, - unsigned long addr, unsigned long last_addr) -{ - unsigned long end_addr; - int result; - - result = 0; - addr &= PAGE_MASK; - end_addr = addr + PGDIR_SIZE; - while ((addr < last_addr) && (addr < end_addr)) { - struct page *page; - pmd_t *level2p; - - page = kimage_alloc_control_pages(image, 0); - if (!page) { - result = -ENOMEM; - goto out; - } - level2p = (pmd_t *)page_address(page); - init_level2_page(level2p, addr); - set_pud(level3p++, __pud(__pa(level2p) | _KERNPG_TABLE)); - addr += PUD_SIZE; - } - /* clear the unused entries */ - while (addr 
< end_addr) { - pud_clear(level3p++); - addr += PUD_SIZE; - } -out: - return result; -} - - -static int init_level4_page(struct kimage *image, pgd_t *level4p, - unsigned long addr, unsigned long last_addr) -{ - unsigned long end_addr; - int result; - - result = 0; - addr &= PAGE_MASK; - end_addr = addr + (PTRS_PER_PGD * PGDIR_SIZE); - while ((addr < last_addr) && (addr < end_addr)) { - struct page *page; - pud_t *level3p; - - page = kimage_alloc_control_pages(image, 0); - if (!page) { - result = -ENOMEM; - goto out; - } - level3p = (pud_t *)page_address(page); - result = init_level3_page(image, level3p, addr, last_addr); - if (result) - goto out; - set_pgd(level4p++, __pgd(__pa(level3p) | _KERNPG_TABLE)); - addr += PGDIR_SIZE; - } - /* clear the unused entries */ - while (addr < end_addr) { - pgd_clear(level4p++); - addr += PGDIR_SIZE; - } -out: - return result; -} - static void free_transition_pgtable(struct kimage *image) { free_page((unsigned long)image->arch.pud); @@ -203,15 +71,37 @@ err: return result; } +static void *alloc_pgt_page(void *data) +{ + struct kimage *image = (struct kimage *)data; + struct page *page; + void *p = NULL; + + page = kimage_alloc_control_pages(image, 0); + if (page) { + p = page_address(page); + clear_page(p); + } + + return p; +} + static int init_pgtable(struct kimage *image, unsigned long start_pgtable) { + struct x86_mapping_info info = { + .alloc_pgt_page = alloc_pgt_page, + .context = image, + .pmd_flag = __PAGE_KERNEL_LARGE_EXEC, + }; unsigned long mstart, mend; pgd_t *level4p; int result; int i; level4p = (pgd_t *)__va(start_pgtable); - result = init_level4_page(image, level4p, 0, max_pfn << PAGE_SHIFT); + clear_page(level4p); + result = kernel_ident_mapping_init(&info, level4p, + 0, max_pfn << PAGE_SHIFT); if (result) return result; @@ -225,7 +115,8 @@ static int init_pgtable(struct kimage *image, unsigned long start_pgtable) mstart = image->segment[i].mem; mend = mstart + image->segment[i].memsz; - result = ident_mapping_init(image, level4p, mstart, mend); + result = kernel_ident_mapping_init(&info, + level4p, mstart, mend); if (result) return result; From 0e691cf824f76adefb4498fe39c300aba2c2575a Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Thu, 24 Jan 2013 12:20:05 -0800 Subject: [PATCH 083/105] x86, kexec, 64bit: Only set ident mapping for ram. We should set mappings only for usable memory ranges under max_pfn. Otherwise we cause the same problem that was fixed by "x86, mm: Only direct map addresses that are marked as E820_RAM". This patch exposes the pfn_mapped array and only sets up ident mappings for the ranges in that array. It relies on the new kernel_ident_mapping_init, which can handle existing pgd/pud entries between different calls. Signed-off-by: Yinghai Lu Link: http://lkml.kernel.org/r/1359058816-7615-25-git-send-email-yinghai@kernel.org Cc: Alexander Duyck Signed-off-by: H.
Peter Anvin --- arch/x86/include/asm/page.h | 4 ++++ arch/x86/kernel/machine_kexec_64.c | 13 +++++++++---- arch/x86/mm/init.c | 4 ++-- 3 files changed, 15 insertions(+), 6 deletions(-) diff --git a/arch/x86/include/asm/page.h b/arch/x86/include/asm/page.h index 8ca82839288a..100a20c7b98d 100644 --- a/arch/x86/include/asm/page.h +++ b/arch/x86/include/asm/page.h @@ -17,6 +17,10 @@ struct page; +#include +extern struct range pfn_mapped[]; +extern int nr_pfn_mapped; + static inline void clear_user_page(void *page, unsigned long vaddr, struct page *pg) { diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index d2d7e023a8c8..4eabc160696f 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -100,10 +100,15 @@ static int init_pgtable(struct kimage *image, unsigned long start_pgtable) level4p = (pgd_t *)__va(start_pgtable); clear_page(level4p); - result = kernel_ident_mapping_init(&info, level4p, - 0, max_pfn << PAGE_SHIFT); - if (result) - return result; + for (i = 0; i < nr_pfn_mapped; i++) { + mstart = pfn_mapped[i].start << PAGE_SHIFT; + mend = pfn_mapped[i].end << PAGE_SHIFT; + + result = kernel_ident_mapping_init(&info, + level4p, mstart, mend); + if (result) + return result; + } /* * segments's mem ranges could be outside 0 ~ max_pfn, diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 3364a7643a4c..d41815265a0b 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -302,8 +302,8 @@ static int __meminit split_mem_range(struct map_range *mr, int nr_range, return nr_range; } -static struct range pfn_mapped[E820_X_MAX]; -static int nr_pfn_mapped; +struct range pfn_mapped[E820_X_MAX]; +int nr_pfn_mapped; static void add_pfn_range_mapped(unsigned long start_pfn, unsigned long end_pfn) { From ee92d815027a76ef92f3ec7b155b0c8aa345f239 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Mon, 28 Jan 2013 20:16:44 -0800 Subject: [PATCH 084/105] x86, boot: Support loading bzImage, boot_params and ramdisk above 4G xloadflags bit 1 indicates that we can load the kernel and all data structures above 4G; it is set if the kernel is relocatable and 64-bit. The bootloader checks whether xloadflags bit 1 is set to decide if it can load the ramdisk and kernel high above 4G. When it loads the ramdisk above 4G, the bootloader fills in ext_ramdisk_image/size with the high 32 bits. The kernel uses get_ramdisk_image/size, which fold in ext_ramdisk_image/size, to get the right position for the ramdisk. Signed-off-by: Yinghai Lu Cc: Rob Landley Cc: Matt Fleming Cc: Gokul Caushik Cc: Josh Triplett Cc: Joe Millenbach Link: http://lkml.kernel.org/r/1359058816-7615-26-git-send-email-yinghai@kernel.org Signed-off-by: H. Peter Anvin
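As a sketch of the loader side of this convention (set_ramdisk_addr is a hypothetical helper made up for illustration; the boot_params/setup_header fields are the ones used in the diff below):

	/* Hypothetical loader-side helper: split a 64-bit ramdisk address
	 * into the legacy 32-bit header field plus the new ext_* high half. */
	static void set_ramdisk_addr(struct boot_params *bp, u64 addr, u64 size)
	{
		bp->hdr.ramdisk_image = (u32)addr;		/* low 32 bits */
		bp->ext_ramdisk_image = (u32)(addr >> 32);	/* high 32 bits */
		bp->hdr.ramdisk_size = (u32)size;
		bp->ext_ramdisk_size = (u32)(size >> 32);
	}

The kernel side recombines the two halves exactly the way get_ramdisk_image()/get_ramdisk_size() do in the diff: low | (u64)high << 32.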
--- arch/x86/boot/compressed/cmdline.c | 2 ++ arch/x86/boot/header.S | 10 +++++++++- arch/x86/kernel/head64.c | 2 ++ arch/x86/kernel/setup.c | 4 ++++ 4 files changed, 17 insertions(+), 1 deletion(-) diff --git a/arch/x86/boot/compressed/cmdline.c b/arch/x86/boot/compressed/cmdline.c index b4c913c5c4ad..bffd73b45b1f 100644 --- a/arch/x86/boot/compressed/cmdline.c +++ b/arch/x86/boot/compressed/cmdline.c @@ -17,6 +17,8 @@ static unsigned long get_cmd_line_ptr(void) { unsigned long cmd_line_ptr = real_mode->hdr.cmd_line_ptr; + cmd_line_ptr |= (u64)real_mode->ext_cmd_line_ptr << 32; + return cmd_line_ptr; } int cmdline_find_option(const char *option, char *buffer, int bufsize) diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S index 944ce595f767..9ec06a1f6d61 100644 --- a/arch/x86/boot/header.S +++ b/arch/x86/boot/header.S @@ -374,6 +374,14 @@ xloadflags: #else # define XLF0 0 #endif + +#if defined(CONFIG_RELOCATABLE) && defined(CONFIG_X86_64) + /* kernel/boot_param/ramdisk could be loaded above 4g */ +# define XLF1 XLF_CAN_BE_LOADED_ABOVE_4G +#else +# define XLF1 0 +#endif + #ifdef CONFIG_EFI_STUB # ifdef CONFIG_X86_64 # define XLF23 XLF_EFI_HANDOVER_64 /* 64-bit EFI handover ok */ @@ -383,7 +391,7 @@ xloadflags: #else # define XLF23 0 #endif - .word XLF0 | XLF23 + .word XLF0 | XLF1 | XLF23 cmdline_size: .long COMMAND_LINE_SIZE-1 #length of the command line, #added with boot protocol diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 62c8ce44cac4..6873b070d72c 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -116,6 +116,8 @@ static unsigned long get_cmd_line_ptr(void) { unsigned long cmd_line_ptr = boot_params.hdr.cmd_line_ptr; + cmd_line_ptr |= (u64)boot_params.ext_cmd_line_ptr << 32; + return cmd_line_ptr; } diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 83b38617ff59..519f2bc4950a 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -298,12 +298,16 @@ static u64 __init get_ramdisk_image(void) { u64 ramdisk_image = boot_params.hdr.ramdisk_image; + ramdisk_image |= (u64)boot_params.ext_ramdisk_image << 32; + return ramdisk_image; } static u64 __init get_ramdisk_size(void) { u64 ramdisk_size = boot_params.hdr.ramdisk_size; + ramdisk_size |= (u64)boot_params.ext_ramdisk_size << 32; + return ramdisk_size; } From 8ee2f2dfdbdfe1851fcc0191b2d4faa4c26a39fb Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Thu, 24 Jan 2013 12:20:07 -0800 Subject: [PATCH 085/105] x86, boot: Update comments about entries for 64bit image The 64-bit entry point is now fixed at 0x200 and cannot be changed anymore. Update the comments to reflect that, and also document it in boot.txt. -v2: fix some grammar errors Signed-off-by: Yinghai Lu Link: http://lkml.kernel.org/r/1359058816-7615-27-git-send-email-yinghai@kernel.org Cc: Rob Landley Cc: Matt Fleming Signed-off-by: H. Peter Anvin --- Documentation/x86/boot.txt | 38 ++++++++++++++++++++++++++++++ arch/x86/boot/compressed/head_64.S | 22 ++++++++++------- 2 files changed, 51 insertions(+), 9 deletions(-) diff --git a/Documentation/x86/boot.txt b/Documentation/x86/boot.txt index 3edb4c2887a1..0e383169839a 100644 --- a/Documentation/x86/boot.txt +++ b/Documentation/x86/boot.txt @@ -1054,6 +1054,44 @@ must have read/write permission; CS must be __BOOT_CS and DS, ES, SS must be __BOOT_DS; interrupt must be disabled; %esi must hold the base address of the struct boot_params; %ebp, %edi and %ebx must be zero.
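As an illustration of the header-copy step described in the new 64-bit section below (a sketch only: copy_setup_header and its arguments are illustrative, the offsets come from the protocol text):

	/* Copy the setup header from a loaded bzImage into a zeroed
	 * struct boot_params; per the protocol text, the header ends at
	 * 0x0202 plus the byte value at offset 0x0201. */
	static void copy_setup_header(struct boot_params *bp, const u8 *image)
	{
		unsigned int end = 0x0202 + image[0x0201];

		memset(bp, 0, sizeof(*bp));
		memcpy((u8 *)bp + 0x01f1, image + 0x01f1, end - 0x01f1);
	}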
+**** 64-bit BOOT PROTOCOL + +For machines with 64-bit CPUs and a 64-bit kernel, a 64-bit bootloader +can be used, and we need a 64-bit boot protocol. + +In the 64-bit boot protocol, the first step in loading a Linux kernel +should be to set up the boot parameters (struct boot_params, +traditionally known as the "zero page"). The memory for struct boot_params +can be allocated anywhere (even above 4G) and should be initialized to all zero. +Then, the setup header at offset 0x01f1 of the kernel image should be +loaded into struct boot_params and examined. The end of the setup header +can be calculated as follows: + + 0x0202 + byte value at offset 0x0201 + +In addition to reading/modifying/writing the setup header of the struct +boot_params as in the 16-bit boot protocol, the boot loader should +also fill the additional fields of the struct boot_params as described +in zero-page.txt. + +After setting up the struct boot_params, the boot loader can load the +64-bit kernel in the same way as in the 16-bit boot protocol, but the +kernel can be loaded above 4G. + +In the 64-bit boot protocol, the kernel is started by jumping to the +64-bit kernel entry point, which is the start address of the loaded +64-bit kernel plus 0x200. + +At entry, the CPU must be in 64-bit mode with paging enabled. +The range of setup_header.init_size bytes from the start address of the +loaded kernel, the zero page and the command line buffer must have ident +mappings; a GDT must be loaded with the descriptors for selectors +__BOOT_CS(0x10) and __BOOT_DS(0x18); both descriptors must be 4G flat +segment; __BOOT_CS must have execute/read permission, and __BOOT_DS +must have read/write permission; CS must be __BOOT_CS and DS, ES, SS +must be __BOOT_DS; interrupts must be disabled; %rsi must hold the base +address of the struct boot_params. + **** EFI HANDOVER PROTOCOL This protocol allows boot loaders to defer initialisation to the EFI diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S index 5c80b94f6c4a..d9ae9a4ffcb9 100644 --- a/arch/x86/boot/compressed/head_64.S +++ b/arch/x86/boot/compressed/head_64.S @@ -37,6 +37,12 @@ __HEAD .code32 ENTRY(startup_32) + /* + * 32bit entry is 0 and it is ABI so immutable! + * If we come here directly from a bootloader, + * kernel(text+data+bss+brk), ramdisk, zero_page, command line + * all need to be under the 4G limit. + */ cld /* * Test KEEP_SEGMENTS flag to see if the bootloader is asking @@ -182,20 +188,18 @@ ENTRY(startup_32) lret ENDPROC(startup_32) - /* - * Be careful here startup_64 needs to be at a predictable - * address so I can export it in an ELF header. Bootloaders - * should look at the ELF header to find this address, as - * it may change in the future. - */ .code64 .org 0x200 ENTRY(startup_64) /* + * 64bit entry is 0x200 and it is ABI so immutable! * We come here either from startup_32 or directly from a - * 64bit bootloader. If we come here from a bootloader we depend on - * an identity mapped page table being provied that maps our - * entire text+data+bss and hopefully all of memory. + * 64bit bootloader. + * If we come here from a bootloader, kernel(text+data+bss+brk), + * ramdisk, zero_page, command line could be above 4G. + * We depend on an identity mapped page table being provided + * that maps our entire kernel(text+data+bss+brk), zero page + * and command line.
*/ #ifdef CONFIG_EFI_STUB /* From d1af6d045fba6b070fa81f54dfe9227214be99ea Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Thu, 24 Jan 2013 12:20:08 -0800 Subject: [PATCH 086/105] x86, boot: Not need to check setup_header version for setup_data That check is for bootloaders. setup_data lives in setup_header, and the bootloader copies that from the bzImage, so an old bootloader should already keep it as 0. Old kexec-tools has, until now, set setup_data to 0 for ELF images, so it is OK. Signed-off-by: Yinghai Lu Link: http://lkml.kernel.org/r/1359058816-7615-28-git-send-email-yinghai@kernel.org Signed-off-by: H. Peter Anvin --- arch/x86/kernel/setup.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 519f2bc4950a..b80bee10982f 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -439,8 +439,6 @@ static void __init parse_setup_data(void) struct setup_data *data; u64 pa_data; - if (boot_params.hdr.version < 0x0209) - return; pa_data = boot_params.hdr.setup_data; while (pa_data) { u32 data_len, map_len; @@ -476,8 +474,6 @@ static void __init e820_reserve_setup_data(void) u64 pa_data; int found = 0; - if (boot_params.hdr.version < 0x0209) - return; pa_data = boot_params.hdr.setup_data; while (pa_data) { data = early_memremap(pa_data, sizeof(*data)); @@ -501,8 +497,6 @@ static void __init memblock_x86_reserve_range_setup_data(void) struct setup_data *data; u64 pa_data; - if (boot_params.hdr.version < 0x0209) - return; pa_data = boot_params.hdr.setup_data; while (pa_data) { data = early_memremap(pa_data, sizeof(*data)); From 595ad9af8584908ea5fb698b836169d05b99f186 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Thu, 24 Jan 2013 12:20:09 -0800 Subject: [PATCH 087/105] memblock: Add memblock_mem_size() Use it to get the memory size under limit_pfn, replacing the local version in x86's reserve_initrd. -v2: remove a not-needed cast, pointed out by HPA. Signed-off-by: Yinghai Lu Link: http://lkml.kernel.org/r/1359058816-7615-29-git-send-email-yinghai@kernel.org Signed-off-by: H. Peter Anvin
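For context, a sketch of how this helper ends up being used (the max_pfn_mapped call is in the diff below; the 4G call site appears later in this series in the crashkernel-low patch):

	/* Size-check the initrd against memory mapped so far: */
	u64 mapped_size = memblock_mem_size(max_pfn_mapped);

	/* Total memory below 4G: pass the pfn of the 4G boundary. */
	unsigned long total_low_mem = memblock_mem_size(1UL << (32 - PAGE_SHIFT));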
--- arch/x86/kernel/setup.c | 16 +--------------- include/linux/memblock.h | 1 + mm/memblock.c | 17 +++++++++++++++++ 3 files changed, 19 insertions(+), 15 deletions(-) diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index b80bee10982f..bbe8cdf7515e 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -363,20 +363,6 @@ static void __init relocate_initrd(void) ramdisk_here, ramdisk_here + ramdisk_size - 1); } -static u64 __init get_mem_size(unsigned long limit_pfn) -{ - int i; - u64 mapped_pages = 0; - unsigned long start_pfn, end_pfn; - - for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, NULL) { - start_pfn = min_t(unsigned long, start_pfn, limit_pfn); - end_pfn = min_t(unsigned long, end_pfn, limit_pfn); - mapped_pages += end_pfn - start_pfn; - } - - return mapped_pages << PAGE_SHIFT; -} static void __init early_reserve_initrd(void) { /* Assume only end is not page aligned */ @@ -404,7 +390,7 @@ static void __init reserve_initrd(void) initrd_start = 0; - mapped_size = get_mem_size(max_pfn_mapped); + mapped_size = memblock_mem_size(max_pfn_mapped); if (ramdisk_size >= (mapped_size>>1)) panic("initrd too large to handle, " "disabling initrd (%lld needed, %lld available)\n", diff --git a/include/linux/memblock.h b/include/linux/memblock.h index d452ee191066..f388203db7e8 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -155,6 +155,7 @@ phys_addr_t memblock_alloc_base(phys_addr_t size, phys_addr_t align, phys_addr_t __memblock_alloc_base(phys_addr_t size, phys_addr_t align, phys_addr_t max_addr); phys_addr_t memblock_phys_mem_size(void); +phys_addr_t memblock_mem_size(unsigned long limit_pfn); phys_addr_t memblock_start_of_DRAM(void); phys_addr_t memblock_end_of_DRAM(void); void memblock_enforce_memory_limit(phys_addr_t memory_limit); diff --git a/mm/memblock.c b/mm/memblock.c index 88adc8afb610..b8d9147e5c08 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -828,6 +828,23 @@ phys_addr_t __init memblock_phys_mem_size(void) return memblock.memory.total_size; } +phys_addr_t __init memblock_mem_size(unsigned long limit_pfn) +{ + unsigned long pages = 0; + struct memblock_region *r; + unsigned long start_pfn, end_pfn; + + for_each_memblock(memory, r) { + start_pfn = memblock_region_memory_base_pfn(r); + end_pfn = memblock_region_memory_end_pfn(r); + start_pfn = min_t(unsigned long, start_pfn, limit_pfn); + end_pfn = min_t(unsigned long, end_pfn, limit_pfn); + pages += end_pfn - start_pfn; + } + + return (phys_addr_t)pages << PAGE_SHIFT; +} + /* lowest address */ phys_addr_t __init_memblock memblock_start_of_DRAM(void) { From 7d41a8a4a2b2438621a9159477bff36a11d79a42 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Thu, 24 Jan 2013 12:20:10 -0800 Subject: [PATCH 088/105] x86, kdump: Remove crashkernel range find limit for 64bit Now the kexec'd kernel/ramdisk can be above 4G, so remove the 896 MiB limit for 64-bit. Signed-off-by: Yinghai Lu Link: http://lkml.kernel.org/r/1359058816-7615-30-git-send-email-yinghai@kernel.org Signed-off-by: H. Peter Anvin --- arch/x86/kernel/setup.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index bbe8cdf7515e..4778ddeedc8a 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -501,13 +501,11 @@ static void __init memblock_x86_reserve_range_setup_data(void) /* * Keep the crash kernel below this limit. On 32 bits earlier kernels * would limit the kernel to the low 512 MiB due to mapping restrictions.
- * On 64 bits, kexec-tools currently limits us to 896 MiB; increase this - * limit once kexec-tools are fixed. */ #ifdef CONFIG_X86_32 # define CRASH_KERNEL_ADDR_MAX (512 << 20) #else -# define CRASH_KERNEL_ADDR_MAX (896 << 20) +# define CRASH_KERNEL_ADDR_MAX MAXMEM #endif From 0212f9159694be61c6bc52e925fa76643e0c1abf Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Thu, 24 Jan 2013 12:20:11 -0800 Subject: [PATCH 089/105] x86: Add Crash kernel low reservation During the kdump kernel's booting stage, it needs to find low RAM for the swiotlb buffer when the system does not support intel iommu/dmar remapping. kexec-tools appends memmap=exactmap and the range from /proc/iomem with "Crash kernel", and that range is above 4G for 64-bit after boot protocol 2.12. We need to add another range in /proc/iomem like "Crash kernel low", so kexec-tools can find that info and append it to the kdump kernel command line. Try to reserve some memory under 4G if the normal "Crash kernel" is above 4G. The user can specify the size with crashkernel_low=XX[KMG]. -v2: fix a warning found by Fengguang's test robot. -v3: move the get_mem_size change out to another patch, to solve a compile warning found by Borislav Petkov. -v4: the user must specify crashkernel_low if the system does not support intel or amd iommu. Signed-off-by: Yinghai Lu Link: http://lkml.kernel.org/r/1359058816-7615-31-git-send-email-yinghai@kernel.org Cc: Eric Biederman Cc: Rob Landley Signed-off-by: H. Peter Anvin --- Documentation/kernel-parameters.txt | 3 +++ arch/x86/kernel/setup.c | 42 +++++++++++++++++++++++++++-- include/linux/kexec.h | 3 +++ kernel/kexec.c | 34 +++++++++++++++++++---- 4 files changed, 75 insertions(+), 7 deletions(-) diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 363e348bff9b..da0e0773ca96 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -594,6 +594,9 @@ bytes respectively. Such letter suffixes can also be entirely omitted. is selected automatically. Check Documentation/kdump/kdump.txt for further details. + crashkernel_low=size[KMG] + [KNL, x86] parts under 4G. + crashkernel=range1:size1[,range2:size2,...][@offset] [KNL] Same as above, but depends on the memory in the running system.
The syntax of range is diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 4778ddeedc8a..5dc47c3e537b 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -508,8 +508,44 @@ static void __init memblock_x86_reserve_range_setup_data(void) # define CRASH_KERNEL_ADDR_MAX MAXMEM #endif +static void __init reserve_crashkernel_low(void) +{ +#ifdef CONFIG_X86_64 + const unsigned long long alignment = 16<<20; /* 16M */ + unsigned long long low_base = 0, low_size = 0; + unsigned long total_low_mem; + unsigned long long base; + int ret; + + total_low_mem = memblock_mem_size(1UL<<(32-PAGE_SHIFT)); + ret = parse_crashkernel_low(boot_command_line, total_low_mem, + &low_size, &base); + if (ret != 0 || low_size <= 0) + return; + + low_base = memblock_find_in_range(low_size, (1ULL<<32), + low_size, alignment); + + if (!low_base) { + pr_info("crashkernel low reservation failed - No suitable area found.\n"); + + return; + } + + memblock_reserve(low_base, low_size); + pr_info("Reserving %ldMB of low memory at %ldMB for crashkernel (System low RAM: %ldMB)\n", + (unsigned long)(low_size >> 20), + (unsigned long)(low_base >> 20), + (unsigned long)(total_low_mem >> 20)); + crashk_low_res.start = low_base; + crashk_low_res.end = low_base + low_size - 1; + insert_resource(&iomem_resource, &crashk_low_res); +#endif +} + static void __init reserve_crashkernel(void) { + const unsigned long long alignment = 16<<20; /* 16M */ unsigned long long total_mem; unsigned long long crash_size, crash_base; int ret; @@ -523,8 +559,6 @@ static void __init reserve_crashkernel(void) /* 0 means: find the address automatically */ if (crash_base <= 0) { - const unsigned long long alignment = 16<<20; /* 16M */ - /* * kexec want bzImage is below CRASH_KERNEL_ADDR_MAX */ @@ -535,6 +569,7 @@ static void __init reserve_crashkernel(void) pr_info("crashkernel reservation failed - No suitable area found.\n"); return; } + } else { unsigned long long start; @@ -556,6 +591,9 @@ static void __init reserve_crashkernel(void) crashk_res.start = crash_base; crashk_res.end = crash_base + crash_size - 1; insert_resource(&iomem_resource, &crashk_res); + + if (crash_base >= (1ULL<<32)) + reserve_crashkernel_low(); } #else static void __init reserve_crashkernel(void) diff --git a/include/linux/kexec.h b/include/linux/kexec.h index d0b8458a703a..d2e6927bbaae 100644 --- a/include/linux/kexec.h +++ b/include/linux/kexec.h @@ -191,6 +191,7 @@ extern struct kimage *kexec_crash_image; /* Location of a reserved region to hold the crash kernel. 
*/ extern struct resource crashk_res; +extern struct resource crashk_low_res; typedef u32 note_buf_t[KEXEC_NOTE_BYTES/4]; extern note_buf_t __percpu *crash_notes; extern u32 vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4]; @@ -199,6 +200,8 @@ extern size_t vmcoreinfo_max_size; int __init parse_crashkernel(char *cmdline, unsigned long long system_ram, unsigned long long *crash_size, unsigned long long *crash_base); +int parse_crashkernel_low(char *cmdline, unsigned long long system_ram, + unsigned long long *crash_size, unsigned long long *crash_base); int crash_shrink_memory(unsigned long new_size); size_t crash_get_memory_size(void); void crash_free_reserved_phys_range(unsigned long begin, unsigned long end); diff --git a/kernel/kexec.c b/kernel/kexec.c index 5e4bd7864c5d..2436ffcec91f 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -54,6 +54,12 @@ struct resource crashk_res = { .end = 0, .flags = IORESOURCE_BUSY | IORESOURCE_MEM }; +struct resource crashk_low_res = { + .name = "Crash kernel low", + .start = 0, + .end = 0, + .flags = IORESOURCE_BUSY | IORESOURCE_MEM +}; int kexec_should_crash(struct task_struct *p) { @@ -1369,10 +1375,11 @@ static int __init parse_crashkernel_simple(char *cmdline, * That function is the entry point for command line parsing and should be * called from the arch-specific code. */ -int __init parse_crashkernel(char *cmdline, +static int __init __parse_crashkernel(char *cmdline, unsigned long long system_ram, unsigned long long *crash_size, - unsigned long long *crash_base) + unsigned long long *crash_base, + const char *name) { char *p = cmdline, *ck_cmdline = NULL; char *first_colon, *first_space; @@ -1382,16 +1389,16 @@ int __init parse_crashkernel(char *cmdline, *crash_base = 0; /* find crashkernel and use the last one if there are more */ - p = strstr(p, "crashkernel="); + p = strstr(p, name); while (p) { ck_cmdline = p; - p = strstr(p+1, "crashkernel="); + p = strstr(p+1, name); } if (!ck_cmdline) return -EINVAL; - ck_cmdline += 12; /* strlen("crashkernel=") */ + ck_cmdline += strlen(name); /* * if the commandline contains a ':', then that's the extended @@ -1409,6 +1416,23 @@ int __init parse_crashkernel(char *cmdline, return 0; } +int __init parse_crashkernel(char *cmdline, + unsigned long long system_ram, + unsigned long long *crash_size, + unsigned long long *crash_base) +{ + return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base, + "crashkernel="); +} + +int __init parse_crashkernel_low(char *cmdline, + unsigned long long system_ram, + unsigned long long *crash_size, + unsigned long long *crash_base) +{ + return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base, + "crashkernel_low="); +} static void update_vmcoreinfo_note(void) { From 6c902b656c4a808d9c6f40a387b166455efecd62 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Thu, 24 Jan 2013 12:20:12 -0800 Subject: [PATCH 090/105] x86: Merge early kernel reserve for 32bit and 64bit They are the same, and we could move them out from head32/64.c to setup.c. We are using memblock, and it could handle overlapping properly, so we don't need to reserve some at first to hold the location, and just need to make sure we reserve them before we are using memblock to find free mem to use. Signed-off-by: Yinghai Lu Link: http://lkml.kernel.org/r/1359058816-7615-32-git-send-email-yinghai@kernel.org Cc: Alexander Duyck Signed-off-by: H. 
Peter Anvin --- arch/x86/kernel/head32.c | 9 --------- arch/x86/kernel/head64.c | 9 --------- arch/x86/kernel/setup.c | 9 +++++++++ 3 files changed, 9 insertions(+), 18 deletions(-) diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c index a795b54de7d3..138463a24877 100644 --- a/arch/x86/kernel/head32.c +++ b/arch/x86/kernel/head32.c @@ -33,9 +33,6 @@ void __init i386_start_kernel(void) { sanitize_boot_params(&boot_params); - memblock_reserve(__pa_symbol(&_text), - __pa_symbol(&__bss_stop) - __pa_symbol(&_text)); - /* Call the subarch specific early setup function */ switch (boot_params.hdr.hardware_subarch) { case X86_SUBARCH_MRST: @@ -49,11 +46,5 @@ void __init i386_start_kernel(void) break; } - /* - * At this point everything still needed from the boot loader - * or BIOS or kernel text should be early reserved or marked not - * RAM in e820. All other memory is free game. - */ - start_kernel(); } diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 6873b070d72c..57334f4cd3af 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -186,16 +186,7 @@ void __init x86_64_start_reservations(char *real_mode_data) if (!boot_params.hdr.version) copy_bootdata(__va(real_mode_data)); - memblock_reserve(__pa_symbol(&_text), - __pa_symbol(&__bss_stop) - __pa_symbol(&_text)); - reserve_ebda_region(); - /* - * At this point everything still needed from the boot loader - * or BIOS or kernel text should be early reserved or marked not - * RAM in e820. All other memory is free game. - */ - start_kernel(); } diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 5dc47c3e537b..a74701af74e3 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -805,8 +805,17 @@ early_param("reservelow", parse_reservelow); void __init setup_arch(char **cmdline_p) { + memblock_reserve(__pa_symbol(_text), + (unsigned long)__bss_stop - (unsigned long)_text); + early_reserve_initrd(); + /* + * At this point everything still needed from the boot loader + * or BIOS or kernel text should be early reserved or marked not + * RAM in e820. All other memory is free game. + */ + #ifdef CONFIG_X86_32 memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data)); visws_early_detect(); From 72212675d1c96f5db8ec6fb35701879911193158 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Thu, 24 Jan 2013 12:20:13 -0800 Subject: [PATCH 091/105] x86, 64bit, mm: Mark data/bss/brk to nx HPA said we should not have RW and +x set at the same time.
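In code terms, the change amounts to widening one set_memory_nx() range (a sketch; rodata_start, end and all_end match the variables visible in the diff further down):

	/* Before: only [rodata_start, end) was made non-executable. */
	set_memory_nx(rodata_start, (end - rodata_start) >> PAGE_SHIFT);

	/* After: everything from rodata through brk loses execute; text keeps it. */
	unsigned long all_end = PFN_ALIGN(&_end);
	set_memory_nx(rodata_start, (all_end - rodata_start) >> PAGE_SHIFT);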
For this kernel layout: [ 0.000000] Kernel Layout: [ 0.000000] .text: [0x01000000-0x021434f8] [ 0.000000] .rodata: [0x02200000-0x02a13fff] [ 0.000000] .data: [0x02c00000-0x02dc763f] [ 0.000000] .init: [0x02dc9000-0x0312cfff] [ 0.000000] .bss: [0x0313b000-0x03dd6fff] [ 0.000000] .brk: [0x03dd7000-0x03dfffff] Before the patch, we have: ---[ High Kernel Mapping ]--- 0xffffffff80000000-0xffffffff81000000 16M pmd 0xffffffff81000000-0xffffffff82200000 18M ro PSE GLB x pmd 0xffffffff82200000-0xffffffff82c00000 10M ro PSE GLB NX pmd 0xffffffff82c00000-0xffffffff82dc9000 1828K RW GLB x pte 0xffffffff82dc9000-0xffffffff82e00000 220K RW GLB NX pte 0xffffffff82e00000-0xffffffff83000000 2M RW PSE GLB NX pmd 0xffffffff83000000-0xffffffff8313a000 1256K RW GLB NX pte 0xffffffff8313a000-0xffffffff83200000 792K RW GLB x pte 0xffffffff83200000-0xffffffff83e00000 12M RW PSE GLB x pmd 0xffffffff83e00000-0xffffffffa0000000 450M pmd After the patch, we get: ---[ High Kernel Mapping ]--- 0xffffffff80000000-0xffffffff81000000 16M pmd 0xffffffff81000000-0xffffffff82200000 18M ro PSE GLB x pmd 0xffffffff82200000-0xffffffff82c00000 10M ro PSE GLB NX pmd 0xffffffff82c00000-0xffffffff82e00000 2M RW GLB NX pte 0xffffffff82e00000-0xffffffff83000000 2M RW PSE GLB NX pmd 0xffffffff83000000-0xffffffff83200000 2M RW GLB NX pte 0xffffffff83200000-0xffffffff83e00000 12M RW PSE GLB NX pmd 0xffffffff83e00000-0xffffffffa0000000 450M pmd So data, bss and brk get NX. Signed-off-by: Yinghai Lu Link: http://lkml.kernel.org/r/1359058816-7615-33-git-send-email-yinghai@kernel.org Signed-off-by: H. Peter Anvin --- arch/x86/mm/init_64.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index dc67337d9bf0..e2fcbc34c9df 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -810,6 +810,7 @@ void mark_rodata_ro(void) unsigned long text_end = PAGE_ALIGN((unsigned long) &__stop___ex_table); unsigned long rodata_end = PAGE_ALIGN((unsigned long) &__end_rodata); unsigned long data_start = (unsigned long) &_sdata; + unsigned long all_end = PFN_ALIGN(&_end); printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n", (end - start) >> 10); @@ -818,10 +819,10 @@ void mark_rodata_ro(void) kernel_set_to_readonly = 1; /* - * The rodata section (but not the kernel text!) should also be - * not-executable. + * The rodata/data/bss/brk section (but not the kernel text!) + * should also be not-executable. */ - set_memory_nx(rodata_start, (end - rodata_start) >> PAGE_SHIFT); + set_memory_nx(rodata_start, (all_end - rodata_start) >> PAGE_SHIFT); rodata_test(); From 8b78c21d72d9dbcb7230e97423a2cd8d8402c20c Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Thu, 24 Jan 2013 12:20:14 -0800 Subject: [PATCH 092/105] x86, 64bit, mm: hibernate use generic mapping_init We should set mappings only for usable memory ranges under max_pfn. Otherwise we cause the same problem that was fixed by "x86, mm: Only direct map addresses that are marked as E820_RAM". Make it map only the ranges in the pfn_mapped array. Signed-off-by: Yinghai Lu Link: http://lkml.kernel.org/r/1359058816-7615-34-git-send-email-yinghai@kernel.org Cc: Pavel Machek Cc: Rafael J. Wysocki Cc: linux-pm@vger.kernel.org Signed-off-by: H.
Peter Anvin --- arch/x86/power/hibernate_64.c | 64 ++++++++++++----------------------- 1 file changed, 21 insertions(+), 43 deletions(-) diff --git a/arch/x86/power/hibernate_64.c b/arch/x86/power/hibernate_64.c index 460f314d13e5..a0fde91c16cf 100644 --- a/arch/x86/power/hibernate_64.c +++ b/arch/x86/power/hibernate_64.c @@ -11,6 +11,8 @@ #include #include #include + +#include #include #include #include @@ -39,41 +41,21 @@ pgd_t *temp_level4_pgt; void *relocated_restore_code; -static int res_phys_pud_init(pud_t *pud, unsigned long address, unsigned long end) +static void *alloc_pgt_page(void *context) { - long i, j; - - i = pud_index(address); - pud = pud + i; - for (; i < PTRS_PER_PUD; pud++, i++) { - unsigned long paddr; - pmd_t *pmd; - - paddr = address + i*PUD_SIZE; - if (paddr >= end) - break; - - pmd = (pmd_t *)get_safe_page(GFP_ATOMIC); - if (!pmd) - return -ENOMEM; - set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE)); - for (j = 0; j < PTRS_PER_PMD; pmd++, j++, paddr += PMD_SIZE) { - unsigned long pe; - - if (paddr >= end) - break; - pe = __PAGE_KERNEL_LARGE_EXEC | paddr; - pe &= __supported_pte_mask; - set_pmd(pmd, __pmd(pe)); - } - } - return 0; + return (void *)get_safe_page(GFP_ATOMIC); } static int set_up_temporary_mappings(void) { - unsigned long start, end, next; - int error; + struct x86_mapping_info info = { + .alloc_pgt_page = alloc_pgt_page, + .pmd_flag = __PAGE_KERNEL_LARGE_EXEC, + .kernel_mapping = true, + }; + unsigned long mstart, mend; + int result; + int i; temp_level4_pgt = (pgd_t *)get_safe_page(GFP_ATOMIC); if (!temp_level4_pgt) @@ -84,21 +66,17 @@ static int set_up_temporary_mappings(void) init_level4_pgt[pgd_index(__START_KERNEL_map)]); /* Set up the direct mapping from scratch */ - start = (unsigned long)pfn_to_kaddr(0); - end = (unsigned long)pfn_to_kaddr(max_pfn); + for (i = 0; i < nr_pfn_mapped; i++) { + mstart = pfn_mapped[i].start << PAGE_SHIFT; + mend = pfn_mapped[i].end << PAGE_SHIFT; - for (; start < end; start = next) { - pud_t *pud = (pud_t *)get_safe_page(GFP_ATOMIC); - if (!pud) - return -ENOMEM; - next = start + PGDIR_SIZE; - if (next > end) - next = end; - if ((error = res_phys_pud_init(pud, __pa(start), __pa(next)))) - return error; - set_pgd(temp_level4_pgt + pgd_index(start), - mk_kernel_pgd(__pa(pud))); + result = kernel_ident_mapping_init(&info, temp_level4_pgt, + mstart, mend); + + if (result) + return result; } + return 0; } From 38fa4175e60d98fb1c9815fb14f8057576dade73 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Thu, 24 Jan 2013 12:20:15 -0800 Subject: [PATCH 093/105] mm: Add alloc_bootmem_low_pages_nopanic() We don't need to panic in some case, like for swiotlb preallocating. Signed-off-by: Yinghai Lu Link: http://lkml.kernel.org/r/1359058816-7615-35-git-send-email-yinghai@kernel.org Signed-off-by: H. 
Peter Anvin --- include/linux/bootmem.h | 5 +++++ mm/bootmem.c | 8 ++++++++ mm/nobootmem.c | 8 ++++++++ 3 files changed, 21 insertions(+) diff --git a/include/linux/bootmem.h b/include/linux/bootmem.h index 3f778c27f825..3cd16ba82f15 100644 --- a/include/linux/bootmem.h +++ b/include/linux/bootmem.h @@ -99,6 +99,9 @@ void *___alloc_bootmem_node_nopanic(pg_data_t *pgdat, extern void *__alloc_bootmem_low(unsigned long size, unsigned long align, unsigned long goal); +void *__alloc_bootmem_low_nopanic(unsigned long size, + unsigned long align, + unsigned long goal); extern void *__alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size, unsigned long align, @@ -132,6 +135,8 @@ extern void *__alloc_bootmem_low_node(pg_data_t *pgdat, #define alloc_bootmem_low(x) \ __alloc_bootmem_low(x, SMP_CACHE_BYTES, 0) +#define alloc_bootmem_low_pages_nopanic(x) \ + __alloc_bootmem_low_nopanic(x, PAGE_SIZE, 0) #define alloc_bootmem_low_pages(x) \ __alloc_bootmem_low(x, PAGE_SIZE, 0) #define alloc_bootmem_low_pages_node(pgdat, x) \ diff --git a/mm/bootmem.c b/mm/bootmem.c index b93376c39b61..2b0bcb019ec2 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c @@ -833,6 +833,14 @@ void * __init __alloc_bootmem_low(unsigned long size, unsigned long align, return ___alloc_bootmem(size, align, goal, ARCH_LOW_ADDRESS_LIMIT); } +void * __init __alloc_bootmem_low_nopanic(unsigned long size, + unsigned long align, + unsigned long goal) +{ + return ___alloc_bootmem_nopanic(size, align, goal, + ARCH_LOW_ADDRESS_LIMIT); +} + /** * __alloc_bootmem_low_node - allocate low boot memory from a specific node * @pgdat: node to allocate from diff --git a/mm/nobootmem.c b/mm/nobootmem.c index 03d152a76acf..5e07d36e381e 100644 --- a/mm/nobootmem.c +++ b/mm/nobootmem.c @@ -391,6 +391,14 @@ void * __init __alloc_bootmem_low(unsigned long size, unsigned long align, return ___alloc_bootmem(size, align, goal, ARCH_LOW_ADDRESS_LIMIT); } +void * __init __alloc_bootmem_low_nopanic(unsigned long size, + unsigned long align, + unsigned long goal) +{ + return ___alloc_bootmem_nopanic(size, align, goal, + ARCH_LOW_ADDRESS_LIMIT); +} + /** * __alloc_bootmem_low_node - allocate low boot memory from a specific node * @pgdat: node to allocate from From ac2cbab21f318e19bc176a7f38a120cec835220f Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Thu, 24 Jan 2013 12:20:16 -0800 Subject: [PATCH 094/105] x86: Don't panic if can not alloc buffer for swiotlb On the normal boot path on a system with iommu support, the swiotlb buffer is allocated early at first and then we try to initialize the iommu; if the intel or AMD iommu can be set up properly, the swiotlb buffer is freed. The early allocation is done with bootmem and can panic when we try to use kdump with the buffer above 4G only, or with memmap= limiting memory to under 4G, for example memmap=4095M$1M to remove memory under 4G. According to Eric, add a _nopanic version and no_iotlb_memory to fail map_single later if swiotlb is still needed. -v2: don't pass nopanic, and use an -ENOMEM return value, according to Eric. Panic early instead of using swiotlb_full to panic... according to Eric/Konrad. -v3: make swiotlb_init nopanic, but this will affect: arm64, ia64, powerpc, tile, unicore32, x86. -v4: clean up swiotlb_init by removing swiotlb_init_with_default_size. Suggested-by: Eric W.
Biederman Signed-off-by: Yinghai Lu Link: http://lkml.kernel.org/r/1359058816-7615-36-git-send-email-yinghai@kernel.org Reviewed-and-tested-by: Konrad Rzeszutek Wilk Cc: Joerg Roedel Cc: Ralf Baechle Cc: Jeremy Fitzhardinge Cc: Kyungmin Park Cc: Marek Szyprowski Cc: Arnd Bergmann Cc: Andrzej Pietrasiewicz Cc: linux-mips@linux-mips.org Cc: xen-devel@lists.xensource.com Cc: virtualization@lists.linux-foundation.org Cc: Shuah Khan Signed-off-by: H. Peter Anvin --- arch/mips/cavium-octeon/dma-octeon.c | 3 +- drivers/xen/swiotlb-xen.c | 4 ++- include/linux/swiotlb.h | 2 +- lib/swiotlb.c | 47 +++++++++++++++++----------- 4 files changed, 35 insertions(+), 21 deletions(-) diff --git a/arch/mips/cavium-octeon/dma-octeon.c b/arch/mips/cavium-octeon/dma-octeon.c index 41dd00884975..02f244475207 100644 --- a/arch/mips/cavium-octeon/dma-octeon.c +++ b/arch/mips/cavium-octeon/dma-octeon.c @@ -317,7 +317,8 @@ void __init plat_swiotlb_setup(void) octeon_swiotlb = alloc_bootmem_low_pages(swiotlbsize); - swiotlb_init_with_tbl(octeon_swiotlb, swiotlb_nslabs, 1); + if (swiotlb_init_with_tbl(octeon_swiotlb, swiotlb_nslabs, 1) == -ENOMEM) + panic("Cannot allocate SWIOTLB buffer"); mips_dma_map_ops = &octeon_linear_dma_map_ops.dma_map_ops; } diff --git a/drivers/xen/swiotlb-xen.c b/drivers/xen/swiotlb-xen.c index af47e7594460..1d94316f0ea4 100644 --- a/drivers/xen/swiotlb-xen.c +++ b/drivers/xen/swiotlb-xen.c @@ -231,7 +231,9 @@ retry: } start_dma_addr = xen_virt_to_bus(xen_io_tlb_start); if (early) { - swiotlb_init_with_tbl(xen_io_tlb_start, xen_io_tlb_nslabs, verbose); + if (swiotlb_init_with_tbl(xen_io_tlb_start, xen_io_tlb_nslabs, + verbose)) + panic("Cannot allocate SWIOTLB buffer"); rc = 0; } else rc = swiotlb_late_init_with_tbl(xen_io_tlb_start, xen_io_tlb_nslabs); diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h index 071d62c214a6..2de42f9401d2 100644 --- a/include/linux/swiotlb.h +++ b/include/linux/swiotlb.h @@ -23,7 +23,7 @@ extern int swiotlb_force; #define IO_TLB_SHIFT 11 extern void swiotlb_init(int verbose); -extern void swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose); +int swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose); extern unsigned long swiotlb_nr_tbl(void); extern int swiotlb_late_init_with_tbl(char *tlb, unsigned long nslabs); diff --git a/lib/swiotlb.c b/lib/swiotlb.c index 196b06984dec..bfe02b8fc55b 100644 --- a/lib/swiotlb.c +++ b/lib/swiotlb.c @@ -122,11 +122,18 @@ static dma_addr_t swiotlb_virt_to_bus(struct device *hwdev, return phys_to_dma(hwdev, virt_to_phys(address)); } +static bool no_iotlb_memory; + void swiotlb_print_info(void) { unsigned long bytes = io_tlb_nslabs << IO_TLB_SHIFT; unsigned char *vstart, *vend; + if (no_iotlb_memory) { + pr_warn("software IO TLB: No low mem\n"); + return; + } + vstart = phys_to_virt(io_tlb_start); vend = phys_to_virt(io_tlb_end); @@ -136,7 +143,7 @@ void swiotlb_print_info(void) bytes >> 20, vstart, vend - 1); } -void __init swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose) +int __init swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose) { void *v_overflow_buffer; unsigned long i, bytes; @@ -150,9 +157,10 @@ void __init swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose) /* * Get the overflow emergency buffer */ - v_overflow_buffer = alloc_bootmem_low_pages(PAGE_ALIGN(io_tlb_overflow)); + v_overflow_buffer = alloc_bootmem_low_pages_nopanic( + PAGE_ALIGN(io_tlb_overflow)); if (!v_overflow_buffer) - panic("Cannot allocate SWIOTLB overflow 
buffer!\n"); + return -ENOMEM; io_tlb_overflow_buffer = __pa(v_overflow_buffer); @@ -169,15 +177,19 @@ void __init swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose) if (verbose) swiotlb_print_info(); + + return 0; } /* * Statically reserve bounce buffer space and initialize bounce buffer data * structures for the software IO TLB used to implement the DMA API. */ -static void __init -swiotlb_init_with_default_size(size_t default_size, int verbose) +void __init +swiotlb_init(int verbose) { + /* default to 64MB */ + size_t default_size = 64UL<<20; unsigned char *vstart; unsigned long bytes; @@ -188,20 +200,16 @@ swiotlb_init_with_default_size(size_t default_size, int verbose) bytes = io_tlb_nslabs << IO_TLB_SHIFT; - /* - * Get IO TLB memory from the low pages - */ - vstart = alloc_bootmem_low_pages(PAGE_ALIGN(bytes)); - if (!vstart) - panic("Cannot allocate SWIOTLB buffer"); + /* Get IO TLB memory from the low pages */ + vstart = alloc_bootmem_low_pages_nopanic(PAGE_ALIGN(bytes)); + if (vstart && !swiotlb_init_with_tbl(vstart, io_tlb_nslabs, verbose)) + return; - swiotlb_init_with_tbl(vstart, io_tlb_nslabs, verbose); -} - -void __init -swiotlb_init(int verbose) -{ - swiotlb_init_with_default_size(64 * (1<<20), verbose); /* default to 64MB */ + if (io_tlb_start) + free_bootmem(io_tlb_start, + PAGE_ALIGN(io_tlb_nslabs << IO_TLB_SHIFT)); + pr_warn("Cannot allocate SWIOTLB buffer"); + no_iotlb_memory = true; } /* @@ -405,6 +413,9 @@ phys_addr_t swiotlb_tbl_map_single(struct device *hwdev, unsigned long offset_slots; unsigned long max_slots; + if (no_iotlb_memory) + panic("Can not allocate SWIOTLB buffer earlier and can't now provide you with the DMA bounce buffer"); + mask = dma_get_seg_boundary(hwdev); tbl_dma_addr &= mask; From 1e9209edc71b851d81f0316ca03a0e6335c0ef9a Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Sun, 27 Jan 2013 01:18:21 +0100 Subject: [PATCH 095/105] x86/numa: Use __pa_nodebug() instead MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ... and fix the following warning: arch/x86/mm/numa.c: In function ‘setup_node_data’: arch/x86/mm/numa.c:222:3: warning: passing argument 1 of ‘__phys_addr_nodebug’ makes integer from pointer without a cast Signed-off-by: Borislav Petkov Acked-by: Dave Hansen Link: http://lkml.kernel.org/r/1359245901-8512-1-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/mm/numa.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index 76604eb9e4b0..b2313c6739f5 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -219,7 +219,7 @@ static void __init setup_node_data(int nid, u64 start, u64 end) */ nd = alloc_remap(nid, nd_size); if (nd) { - nd_pa = __phys_addr_nodebug(nd); + nd_pa = __pa_nodebug(nd); remapped = true; } else { nd_pa = memblock_alloc_nid(nd_size, SMP_CACHE_BYTES, nid); From f03574f2d5b2d6229dcdf2d322848065f72953c7 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Wed, 30 Jan 2013 16:56:16 -0800 Subject: [PATCH 096/105] x86-32, mm: Rip out x86_32 NUMA remapping code This code was an optimization for 32-bit NUMA systems. It has probably been the cause of a number of subtle bugs over the years, although the conditions to excite them would have been hard to trigger. Essentially, we remap part of the kernel linear mapping area, and then sometimes part of that area gets freed back in to the bootmem allocator. If those pages get used by kernel data structures (say mem_map[] or a dentry), there's no big deal. 
But, if anyone ever tried to use the linear mapping for these pages _and_ cared about their physical address, bad things happen. For instance, say you passed __GFP_ZERO to the page allocator and then happened to get handed one of these pages, it zero the remapped page, but it would make a pte to the _old_ page. There are probably a hundred other ways that it could screw with things. We don't need to hang on to performance optimizations for these old boxes any more. All my 32-bit NUMA systems are long dead and buried, and I probably had access to more than most people. This code is causing real things to break today: https://lkml.org/lkml/2013/1/9/376 I looked in to actually fixing this, but it requires surgery to way too much brittle code, as well as stuff like per_cpu_ptr_to_phys(). [ hpa: Cc: this for -stable, since it is a memory corruption issue. However, an alternative is to simply mark NUMA as depends BROKEN rather than EXPERIMENTAL in the X86_32 subclause... ] Link: http://lkml.kernel.org/r/20130131005616.1C79F411@kernel.stglabs.ibm.com Signed-off-by: H. Peter Anvin Cc: --- arch/x86/Kconfig | 4 - arch/x86/mm/numa.c | 3 - arch/x86/mm/numa_32.c | 161 ------------------------------------ arch/x86/mm/numa_internal.h | 6 -- 4 files changed, 174 deletions(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 79795af59810..108efcb21c9e 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1253,10 +1253,6 @@ config NODES_SHIFT Specify the maximum number of NUMA Nodes available on the target system. Increases memory reserved to accommodate various tables. -config HAVE_ARCH_ALLOC_REMAP - def_bool y - depends on X86_32 && NUMA - config ARCH_HAVE_MEMORY_PRESENT def_bool y depends on X86_32 && DISCONTIGMEM diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index b2313c6739f5..61c2b6f5ff88 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -205,9 +205,6 @@ static void __init setup_node_data(int nid, u64 start, u64 end) if (end && (end - start) < NODE_MIN_SIZE) return; - /* initialize remap allocator before aligning to ZONE_ALIGN */ - init_alloc_remap(nid, start, end); - start = roundup(start, ZONE_ALIGN); printk(KERN_INFO "Initmem setup node %d [mem %#010Lx-%#010Lx]\n", diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c index 534255a36b6b..73a6d7395bd3 100644 --- a/arch/x86/mm/numa_32.c +++ b/arch/x86/mm/numa_32.c @@ -73,167 +73,6 @@ unsigned long node_memmap_size_bytes(int nid, unsigned long start_pfn, extern unsigned long highend_pfn, highstart_pfn; -#define LARGE_PAGE_BYTES (PTRS_PER_PTE * PAGE_SIZE) - -static void *node_remap_start_vaddr[MAX_NUMNODES]; -void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags); - -/* - * Remap memory allocator - */ -static unsigned long node_remap_start_pfn[MAX_NUMNODES]; -static void *node_remap_end_vaddr[MAX_NUMNODES]; -static void *node_remap_alloc_vaddr[MAX_NUMNODES]; - -/** - * alloc_remap - Allocate remapped memory - * @nid: NUMA node to allocate memory from - * @size: The size of allocation - * - * Allocate @size bytes from the remap area of NUMA node @nid. The - * size of the remap area is predetermined by init_alloc_remap() and - * only the callers considered there should call this function. For - * more info, please read the comment on top of init_alloc_remap(). - * - * The caller must be ready to handle allocation failure from this - * function and fall back to regular memory allocator in such cases. - * - * CONTEXT: - * Single CPU early boot context. 
- * - * RETURNS: - * Pointer to the allocated memory on success, %NULL on failure. - */ -void *alloc_remap(int nid, unsigned long size) -{ - void *allocation = node_remap_alloc_vaddr[nid]; - - size = ALIGN(size, L1_CACHE_BYTES); - - if (!allocation || (allocation + size) > node_remap_end_vaddr[nid]) - return NULL; - - node_remap_alloc_vaddr[nid] += size; - memset(allocation, 0, size); - - return allocation; -} - -#ifdef CONFIG_HIBERNATION -/** - * resume_map_numa_kva - add KVA mapping to the temporary page tables created - * during resume from hibernation - * @pgd_base - temporary resume page directory - */ -void resume_map_numa_kva(pgd_t *pgd_base) -{ - int node; - - for_each_online_node(node) { - unsigned long start_va, start_pfn, nr_pages, pfn; - - start_va = (unsigned long)node_remap_start_vaddr[node]; - start_pfn = node_remap_start_pfn[node]; - nr_pages = (node_remap_end_vaddr[node] - - node_remap_start_vaddr[node]) >> PAGE_SHIFT; - - printk(KERN_DEBUG "%s: node %d\n", __func__, node); - - for (pfn = 0; pfn < nr_pages; pfn += PTRS_PER_PTE) { - unsigned long vaddr = start_va + (pfn << PAGE_SHIFT); - pgd_t *pgd = pgd_base + pgd_index(vaddr); - pud_t *pud = pud_offset(pgd, vaddr); - pmd_t *pmd = pmd_offset(pud, vaddr); - - set_pmd(pmd, pfn_pmd(start_pfn + pfn, - PAGE_KERNEL_LARGE_EXEC)); - - printk(KERN_DEBUG "%s: %08lx -> pfn %08lx\n", - __func__, vaddr, start_pfn + pfn); - } - } -} -#endif - -/** - * init_alloc_remap - Initialize remap allocator for a NUMA node - * @nid: NUMA node to initizlie remap allocator for - * - * NUMA nodes may end up without any lowmem. As allocating pgdat and - * memmap on a different node with lowmem is inefficient, a special - * remap allocator is implemented which can be used by alloc_remap(). - * - * For each node, the amount of memory which will be necessary for - * pgdat and memmap is calculated and two memory areas of the size are - * allocated - one in the node and the other in lowmem; then, the area - * in the node is remapped to the lowmem area. - * - * As pgdat and memmap must be allocated in lowmem anyway, this - * doesn't waste lowmem address space; however, the actual lowmem - * which gets remapped over is wasted. The amount shouldn't be - * problematic on machines this feature will be used. - * - * Initialization failure isn't fatal. alloc_remap() is used - * opportunistically and the callers will fall back to other memory - * allocation mechanisms on failure. - */ -void __init init_alloc_remap(int nid, u64 start, u64 end) -{ - unsigned long start_pfn = start >> PAGE_SHIFT; - unsigned long end_pfn = end >> PAGE_SHIFT; - unsigned long size, pfn; - u64 node_pa, remap_pa; - void *remap_va; - - /* - * The acpi/srat node info can show hot-add memroy zones where - * memory could be added but not currently present. 
- */ - printk(KERN_DEBUG "node %d pfn: [%lx - %lx]\n", - nid, start_pfn, end_pfn); - - /* calculate the necessary space aligned to large page size */ - size = node_memmap_size_bytes(nid, start_pfn, end_pfn); - size += ALIGN(sizeof(pg_data_t), PAGE_SIZE); - size = ALIGN(size, LARGE_PAGE_BYTES); - - /* allocate node memory and the lowmem remap area */ - node_pa = memblock_find_in_range(start, end, size, LARGE_PAGE_BYTES); - if (!node_pa) { - pr_warning("remap_alloc: failed to allocate %lu bytes for node %d\n", - size, nid); - return; - } - memblock_reserve(node_pa, size); - - remap_pa = memblock_find_in_range(min_low_pfn << PAGE_SHIFT, - max_low_pfn << PAGE_SHIFT, - size, LARGE_PAGE_BYTES); - if (!remap_pa) { - pr_warning("remap_alloc: failed to allocate %lu bytes remap area for node %d\n", - size, nid); - memblock_free(node_pa, size); - return; - } - memblock_reserve(remap_pa, size); - remap_va = phys_to_virt(remap_pa); - - /* perform actual remap */ - for (pfn = 0; pfn < size >> PAGE_SHIFT; pfn += PTRS_PER_PTE) - set_pmd_pfn((unsigned long)remap_va + (pfn << PAGE_SHIFT), - (node_pa >> PAGE_SHIFT) + pfn, - PAGE_KERNEL_LARGE); - - /* initialize remap allocator parameters */ - node_remap_start_pfn[nid] = node_pa >> PAGE_SHIFT; - node_remap_start_vaddr[nid] = remap_va; - node_remap_end_vaddr[nid] = remap_va + size; - node_remap_alloc_vaddr[nid] = remap_va; - - printk(KERN_DEBUG "remap_alloc: node %d [%08llx-%08llx) -> [%p-%p)\n", - nid, node_pa, node_pa + size, remap_va, remap_va + size); -} - void __init initmem_init(void) { x86_numa_init(); diff --git a/arch/x86/mm/numa_internal.h b/arch/x86/mm/numa_internal.h index 7178c3afe05e..ad86ec91e640 100644 --- a/arch/x86/mm/numa_internal.h +++ b/arch/x86/mm/numa_internal.h @@ -21,12 +21,6 @@ void __init numa_reset_distance(void); void __init x86_numa_init(void); -#ifdef CONFIG_X86_64 -static inline void init_alloc_remap(int nid, u64 start, u64 end) { } -#else -void __init init_alloc_remap(int nid, u64 start, u64 end); -#endif - #ifdef CONFIG_NUMA_EMU void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt); From bb112aec5ee41427e9b9726e3d57b896709598ed Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Thu, 31 Jan 2013 13:53:10 -0800 Subject: [PATCH 097/105] x86-32, mm: Remove reference to resume_map_numa_kva() Remove reference to removed function resume_map_numa_kva(). Signed-off-by: H. Peter Anvin Cc: Dave Hansen Cc: Link: http://lkml.kernel.org/r/20130131005616.1C79F411@kernel.stglabs.ibm.com --- arch/x86/include/asm/mmzone_32.h | 6 ------ arch/x86/power/hibernate_32.c | 2 -- 2 files changed, 8 deletions(-) diff --git a/arch/x86/include/asm/mmzone_32.h b/arch/x86/include/asm/mmzone_32.h index eb05fb3b02fb..8a9b3e288cb4 100644 --- a/arch/x86/include/asm/mmzone_32.h +++ b/arch/x86/include/asm/mmzone_32.h @@ -14,12 +14,6 @@ extern struct pglist_data *node_data[]; #include -extern void resume_map_numa_kva(pgd_t *pgd); - -#else /* !CONFIG_NUMA */ - -static inline void resume_map_numa_kva(pgd_t *pgd) {} - #endif /* CONFIG_NUMA */ #ifdef CONFIG_DISCONTIGMEM diff --git a/arch/x86/power/hibernate_32.c b/arch/x86/power/hibernate_32.c index 74202c1910cd..7d28c885d238 100644 --- a/arch/x86/power/hibernate_32.c +++ b/arch/x86/power/hibernate_32.c @@ -129,8 +129,6 @@ static int resume_physical_mapping_init(pgd_t *pgd_base) } } - resume_map_numa_kva(pgd_base); - return 0; } From 07f4207a305c834f528d08428df4531744e25678 Mon Sep 17 00:00:00 2001 From: "H. 
Peter Anvin" Date: Thu, 31 Jan 2013 14:00:48 -0800 Subject: [PATCH 098/105] x86-32, mm: Remove reference to alloc_remap() We have removed the remap allocator for x86-32, and x86-64 never had it (and doesn't need it). Remove residual reference to it. Reported-by: Yinghai Lu Signed-off-by: H. Peter Anvin Cc: Dave Hansen Cc: Link: http://lkml.kernel.org/r/CAE9FiQVn6_QZi3fNQ-JHYiR-7jeDJ5hT0SyT_%2BzVvfOj=PzF3w@mail.gmail.com --- arch/x86/mm/numa.c | 29 +++++++++++------------------ 1 file changed, 11 insertions(+), 18 deletions(-) diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index 61c2b6f5ff88..8504f3698753 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -193,7 +193,6 @@ int __init numa_add_memblk(int nid, u64 start, u64 end) static void __init setup_node_data(int nid, u64 start, u64 end) { const size_t nd_size = roundup(sizeof(pg_data_t), PAGE_SIZE); - bool remapped = false; u64 nd_pa; void *nd; int tnid; @@ -211,28 +210,22 @@ static void __init setup_node_data(int nid, u64 start, u64 end) nid, start, end - 1); /* - * Allocate node data. Try remap allocator first, node-local - * memory and then any node. Never allocate in DMA zone. + * Allocate node data. Try node-local memory and then any node. + * Never allocate in DMA zone. */ - nd = alloc_remap(nid, nd_size); - if (nd) { - nd_pa = __pa_nodebug(nd); - remapped = true; - } else { - nd_pa = memblock_alloc_nid(nd_size, SMP_CACHE_BYTES, nid); - if (!nd_pa) { - pr_err("Cannot find %zu bytes in node %d\n", - nd_size, nid); - return; - } - nd = __va(nd_pa); + nd_pa = memblock_alloc_nid(nd_size, SMP_CACHE_BYTES, nid); + if (!nd_pa) { + pr_err("Cannot find %zu bytes in node %d\n", + nd_size, nid); + return; } + nd = __va(nd_pa); /* report and initialize */ - printk(KERN_INFO " NODE_DATA [mem %#010Lx-%#010Lx]%s\n", - nd_pa, nd_pa + nd_size - 1, remapped ? " (remapped)" : ""); + printk(KERN_INFO " NODE_DATA [mem %#010Lx-%#010Lx]\n", + nd_pa, nd_pa + nd_size - 1); tnid = early_pfn_to_nid(nd_pa >> PAGE_SHIFT); - if (!remapped && tnid != nid) + if (tnid != nid) printk(KERN_INFO " NODE_DATA(%d) on node %d\n", nid, tnid); node_data[nid] = nd; From 96477b4cd705c5416346aef262b0a1116cfcdd80 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ville=20Syrj=C3=A4l=C3=A4?= Date: Wed, 12 Dec 2012 13:34:03 +0200 Subject: [PATCH 099/105] x86-32: Add support for 64bit get_user() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Implement __get_user_8() for x86-32. It will return the 64-bit result in edx:eax register pair, and ecx is used to pass in the address and return the error value. For consistency, change the register assignment for all other __get_user_x() variants, so that address is passed in ecx/rcx, the error value is returned in ecx/rcx, and eax/rax contains the actual value. [ hpa: I modified the patch so that it does NOT change the calling conventions for the existing callsites, this also means that the code is completely unchanged for 64 bits. Instead, continue to use eax for address input/error output and use the ecx:edx register pair for the output. ] This is a partial refresh of a patch [1] by Jamie Lokier from 2004. Only the minimal changes to implement 64bit get_user() were picked from the original patch. [1] http://article.gmane.org/gmane.linux.kernel/198823 Originally-by: Jamie Lokier Signed-off-by: Ville Syrjälä Link: http://lkml.kernel.org/r/1355312043-11467-1-git-send-email-ville.syrjala@linux.intel.com Signed-off-by: H. 
Peter Anvin --- arch/x86/include/asm/uaccess.h | 19 +++++++++++++---- arch/x86/kernel/i386_ksyms_32.c | 1 + arch/x86/lib/getuser.S | 37 ++++++++++++++++++++++++++++----- 3 files changed, 48 insertions(+), 9 deletions(-) diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index 1709801d18ec..1e963267d44e 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -151,8 +151,15 @@ extern int __get_user_bad(void); * On error, the variable @x is set to zero. */ #ifdef CONFIG_X86_32 -#define __get_user_8(__ret_gu, __val_gu, ptr) \ - __get_user_x(X, __ret_gu, __val_gu, ptr) +#define __get_user_8(ret, x, ptr) \ +do { \ + register unsigned long long __xx asm("%edx"); \ + asm volatile("call __get_user_8" \ + : "=a" (ret), "=r" (__xx) \ + : "0" (ptr)); \ + (x) = __xx; \ +} while (0) + #else #define __get_user_8(__ret_gu, __val_gu, ptr) \ __get_user_x(8, __ret_gu, __val_gu, ptr) @@ -162,6 +169,7 @@ extern int __get_user_bad(void); ({ \ int __ret_gu; \ unsigned long __val_gu; \ + unsigned long long __val_gu8; \ __chk_user_ptr(ptr); \ might_fault(); \ switch (sizeof(*(ptr))) { \ @@ -175,13 +183,16 @@ extern int __get_user_bad(void); __get_user_x(4, __ret_gu, __val_gu, ptr); \ break; \ case 8: \ - __get_user_8(__ret_gu, __val_gu, ptr); \ + __get_user_8(__ret_gu, __val_gu8, ptr); \ break; \ default: \ __get_user_x(X, __ret_gu, __val_gu, ptr); \ break; \ } \ - (x) = (__typeof__(*(ptr)))__val_gu; \ + if (sizeof(*(ptr)) == 8) \ + (x) = (__typeof__(*(ptr)))__val_gu8; \ + else \ + (x) = (__typeof__(*(ptr)))__val_gu; \ __ret_gu; \ }) diff --git a/arch/x86/kernel/i386_ksyms_32.c b/arch/x86/kernel/i386_ksyms_32.c index 9c3bd4a2050e..0fa69127209a 100644 --- a/arch/x86/kernel/i386_ksyms_32.c +++ b/arch/x86/kernel/i386_ksyms_32.c @@ -26,6 +26,7 @@ EXPORT_SYMBOL(csum_partial_copy_generic); EXPORT_SYMBOL(__get_user_1); EXPORT_SYMBOL(__get_user_2); EXPORT_SYMBOL(__get_user_4); +EXPORT_SYMBOL(__get_user_8); EXPORT_SYMBOL(__put_user_1); EXPORT_SYMBOL(__put_user_2); diff --git a/arch/x86/lib/getuser.S b/arch/x86/lib/getuser.S index 156b9c804670..d3bf9f99ca77 100644 --- a/arch/x86/lib/getuser.S +++ b/arch/x86/lib/getuser.S @@ -15,11 +15,10 @@ * __get_user_X * * Inputs: %[r|e]ax contains the address. - * The register is modified, but all changes are undone - * before returning because the C code doesn't know about it. 
* * Outputs: %[r|e]ax is error code (0 or -EFAULT) * %[r|e]dx contains zero-extended value + * %ecx contains the high half for 32-bit __get_user_8 * * * These functions should not modify any other registers, @@ -79,22 +78,35 @@ ENTRY(__get_user_4) CFI_ENDPROC ENDPROC(__get_user_4) -#ifdef CONFIG_X86_64 ENTRY(__get_user_8) CFI_STARTPROC +#ifdef CONFIG_X86_64 add $7,%_ASM_AX jc bad_get_user GET_THREAD_INFO(%_ASM_DX) cmp TI_addr_limit(%_ASM_DX),%_ASM_AX - jae bad_get_user + jae bad_get_user ASM_STAC 4: movq -7(%_ASM_AX),%_ASM_DX xor %eax,%eax ASM_CLAC ret +#else + add $7,%_ASM_AX + jc bad_get_user_8 + GET_THREAD_INFO(%_ASM_DX) + cmp TI_addr_limit(%_ASM_DX),%_ASM_AX + jae bad_get_user_8 + ASM_STAC +4: mov -7(%_ASM_AX),%edx +5: mov -3(%_ASM_AX),%ecx + xor %eax,%eax + ASM_CLAC + ret +#endif CFI_ENDPROC ENDPROC(__get_user_8) -#endif + bad_get_user: CFI_STARTPROC @@ -105,9 +117,24 @@ bad_get_user: CFI_ENDPROC END(bad_get_user) +#ifdef CONFIG_X86_32 +bad_get_user_8: + CFI_STARTPROC + xor %edx,%edx + xor %ecx,%ecx + mov $(-EFAULT),%_ASM_AX + ASM_CLAC + ret + CFI_ENDPROC +END(bad_get_user_8) +#endif + _ASM_EXTABLE(1b,bad_get_user) _ASM_EXTABLE(2b,bad_get_user) _ASM_EXTABLE(3b,bad_get_user) #ifdef CONFIG_X86_64 _ASM_EXTABLE(4b,bad_get_user) +#else + _ASM_EXTABLE(4b,bad_get_user_8) + _ASM_EXTABLE(5b,bad_get_user_8) #endif From 136867f517cbc3f8a91f035677911a6b503c3323 Mon Sep 17 00:00:00 2001 From: Shuah Khan Date: Tue, 5 Feb 2013 19:57:22 -0700 Subject: [PATCH 100/105] x86/kvm: Fix compile warning in kvm_register_steal_time() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix the following compile warning in kvm_register_steal_time(): CC arch/x86/kernel/kvm.o arch/x86/kernel/kvm.c: In function ‘kvm_register_steal_time’: arch/x86/kernel/kvm.c:302:3: warning: format ‘%lx’ expects argument of type ‘long unsigned int’, but argument 3 has type ‘phys_addr_t’ [-Wformat] Introduced via: 5dfd486c4750 x86, kvm: Fix kvm's use of __pa() on percpu areas d76565344512 x86, mm: Create slow_virt_to_phys() f3c4fbb68e93 x86, mm: Use new pagetable helpers in try_preserve_large_page() 4cbeb51b860c x86, mm: Pagetable level size/shift/mask helpers a25b9316841c x86, mm: Make DEBUG_VIRTUAL work earlier in boot Signed-off-by: Shuah Khan Acked-by: Gleb Natapov Cc: Marcelo Tosatti Cc: Dave Hansen Cc: Rik van Riel Cc: shuahkhan@gmail.com Cc: avi@redhat.com Cc: gleb@redhat.com Cc: mst@redhat.com Link: http://lkml.kernel.org/r/1360119442.8356.8.camel@lorien2 Signed-off-by: Ingo Molnar --- arch/x86/kernel/kvm.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index aa7e58b82b39..9cec20253093 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -298,8 +298,8 @@ static void kvm_register_steal_time(void) memset(st, 0, sizeof(*st)); wrmsrl(MSR_KVM_STEAL_TIME, (slow_virt_to_phys(st) | KVM_MSR_ENABLED)); - printk(KERN_INFO "kvm-stealtime: cpu %d, msr %lx\n", - cpu, slow_virt_to_phys(st)); + pr_info("kvm-stealtime: cpu %d, msr %llx\n", + cpu, (unsigned long long) slow_virt_to_phys(st)); } static DEFINE_PER_CPU(unsigned long, kvm_apic_eoi) = KVM_PV_EOI_DISABLED; From b390784dc1649f6e6c5e66e5f53c21e715ccf39b Mon Sep 17 00:00:00 2001 From: "H. 
Peter Anvin" Date: Mon, 11 Feb 2013 16:27:28 -0800 Subject: [PATCH 101/105] x86, mm: Use a bitfield to mask nuisance get_user() warnings MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Even though it is never executed, gcc wants to warn for casting from a large integer to a pointer. Furthermore, using a variable with __typeof__() doesn't work because __typeof__ retains storage specifiers (const, restrict, volatile). However, we can declare a bitfield using sizeof(), which is legal because sizeof() is a constant expression. This quiets the warning, although the code generated isn't 100% identical from the baseline before 96477b4 x86-32: Add support for 64bit get_user(): [x86-mb is baseline, x86-mm is this commit] text data bss filename 113716147 15858380 35037184 tip.x86-mb/o.i386-allconfig/vmlinux 113716145 15858380 35037184 tip.x86-mm/o.i386-allconfig/vmlinux 12989837 3597944 12255232 tip.x86-mb/o.i386-modconfig/vmlinux 12989831 3597944 12255232 tip.x86-mm/o.i386-modconfig/vmlinux 1462784 237608 1401988 tip.x86-mb/o.i386-noconfig/vmlinux 1462837 237608 1401964 tip.x86-mm/o.i386-noconfig/vmlinux 7938994 553688 7639040 tip.x86-mb/o.i386-pae/vmlinux 7943136 557784 7639040 tip.x86-mm/o.i386-pae/vmlinux 7186126 510572 6574080 tip.x86-mb/o.i386/vmlinux 7186124 510572 6574080 tip.x86-mm/o.i386/vmlinux 103747269 33578856 65888256 tip.x86-mb/o.x86_64-allconfig/vmlinux 103746949 33578856 65888256 tip.x86-mm/o.x86_64-allconfig/vmlinux 12116695 11035832 20160512 tip.x86-mb/o.x86_64-modconfig/vmlinux 12116567 11035832 20160512 tip.x86-mm/o.x86_64-modconfig/vmlinux 1700790 380524 511808 tip.x86-mb/o.x86_64-noconfig/vmlinux 1700790 380524 511808 tip.x86-mm/o.x86_64-noconfig/vmlinux 12413612 1133376 1101824 tip.x86-mb/o.x86_64/vmlinux 12413484 1133376 1101824 tip.x86-mm/o.x86_64/vmlinux Cc: Jamie Lokier Cc: Ville Syrjälä Cc: Borislav Petkov Cc: Russell King Cc: Linus Torvalds Link: http://lkml.kernel.org/r/20130209110031.GA17833@n2100.arm.linux.org.uk Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/uaccess.h | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index 1e963267d44e..a8d12653f304 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -168,31 +168,29 @@ do { \ #define get_user(x, ptr) \ ({ \ int __ret_gu; \ - unsigned long __val_gu; \ - unsigned long long __val_gu8; \ + struct { \ + unsigned long long __val_n : 8*sizeof(*(ptr)); \ + } __val_gu; \ __chk_user_ptr(ptr); \ might_fault(); \ switch (sizeof(*(ptr))) { \ case 1: \ - __get_user_x(1, __ret_gu, __val_gu, ptr); \ + __get_user_x(1, __ret_gu, __val_gu.__val_n, ptr); \ break; \ case 2: \ - __get_user_x(2, __ret_gu, __val_gu, ptr); \ + __get_user_x(2, __ret_gu, __val_gu.__val_n, ptr); \ break; \ case 4: \ - __get_user_x(4, __ret_gu, __val_gu, ptr); \ + __get_user_x(4, __ret_gu, __val_gu.__val_n, ptr); \ break; \ case 8: \ - __get_user_8(__ret_gu, __val_gu8, ptr); \ + __get_user_8(__ret_gu, __val_gu.__val_n, ptr); \ break; \ default: \ - __get_user_x(X, __ret_gu, __val_gu, ptr); \ + __get_user_x(X, __ret_gu, __val_gu.__val_n, ptr); \ break; \ } \ - if (sizeof(*(ptr)) == 8) \ - (x) = (__typeof__(*(ptr)))__val_gu8; \ - else \ - (x) = (__typeof__(*(ptr)))__val_gu; \ + (x) = (__typeof__(*(ptr)))__val_gu.__val_n; \ __ret_gu; \ }) From 16640165c9079e2cf36fdcfca093f29663a716f7 Mon Sep 17 00:00:00 2001 From: "H. 
Peter Anvin" Date: Mon, 11 Feb 2013 23:14:48 -0800 Subject: [PATCH 102/105] x86: Be consistent with data size in getuser.S Consistently use the data register by name and use a sized assembly instruction in getuser.S. There is never any reason to macroize it, and being inconsistent in the same file is just annoying. No actual code change. Signed-off-by: H. Peter Anvin --- arch/x86/lib/getuser.S | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/x86/lib/getuser.S b/arch/x86/lib/getuser.S index d3bf9f99ca77..a4512359656a 100644 --- a/arch/x86/lib/getuser.S +++ b/arch/x86/lib/getuser.S @@ -41,7 +41,7 @@ ENTRY(__get_user_1) cmp TI_addr_limit(%_ASM_DX),%_ASM_AX jae bad_get_user ASM_STAC -1: movzb (%_ASM_AX),%edx +1: movzbl (%_ASM_AX),%edx xor %eax,%eax ASM_CLAC ret @@ -71,7 +71,7 @@ ENTRY(__get_user_4) cmp TI_addr_limit(%_ASM_DX),%_ASM_AX jae bad_get_user ASM_STAC -3: mov -3(%_ASM_AX),%edx +3: movl -3(%_ASM_AX),%edx xor %eax,%eax ASM_CLAC ret @@ -87,7 +87,7 @@ ENTRY(__get_user_8) cmp TI_addr_limit(%_ASM_DX),%_ASM_AX jae bad_get_user ASM_STAC -4: movq -7(%_ASM_AX),%_ASM_DX +4: movq -7(%_ASM_AX),%rdx xor %eax,%eax ASM_CLAC ret @@ -98,8 +98,8 @@ ENTRY(__get_user_8) cmp TI_addr_limit(%_ASM_DX),%_ASM_AX jae bad_get_user_8 ASM_STAC -4: mov -7(%_ASM_AX),%edx -5: mov -3(%_ASM_AX),%ecx +4: movl -7(%_ASM_AX),%edx +5: movl -3(%_ASM_AX),%ecx xor %eax,%eax ASM_CLAC ret From 3578baaed4613a9fc09bab9f79f6ce2ac682e8a3 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Tue, 12 Feb 2013 11:47:31 -0800 Subject: [PATCH 103/105] x86, mm: Redesign get_user with a __builtin_choose_expr hack MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Instead of using a bitfield, use an odd little trick using typeof, __builtin_choose_expr, and sizeof. __builtin_choose_expr is explicitly defined to not convert its type (its argument is required to be a constant expression) so this should be well-defined. The code is still not 100% preturbation-free versus the baseline before 64-bit get_user(), but the differences seem to be very small, mostly related to padding and to gcc deciding when to spill registers. Cc: Jamie Lokier Cc: Ville Syrjälä Cc: Borislav Petkov Cc: Russell King Cc: Linus Torvalds Cc: H. J. Lu Link: http://lkml.kernel.org/r/511A8922.6050908@zytor.com Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/uaccess.h | 57 +++++++++------------------------- 1 file changed, 14 insertions(+), 43 deletions(-) diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index a8d12653f304..d710a2555fd6 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -125,13 +125,12 @@ extern int __get_user_4(void); extern int __get_user_8(void); extern int __get_user_bad(void); -#define __get_user_x(size, ret, x, ptr) \ - asm volatile("call __get_user_" #size \ - : "=a" (ret), "=d" (x) \ - : "0" (ptr)) \ - -/* Careful: we have to cast the result to the type of the pointer - * for sign reasons */ +/* + * This is a type: either unsigned long, if the argument fits into + * that type, or otherwise unsigned long long. + */ +#define __inttype(x) \ +__typeof__(__builtin_choose_expr(sizeof(x) > sizeof(0UL), 0ULL, 0UL)) /** * get_user: - Get a simple variable from user space. @@ -149,48 +148,20 @@ extern int __get_user_bad(void); * * Returns zero on success, or -EFAULT on error. * On error, the variable @x is set to zero. + * + * Careful: we have to cast the result to the type of the pointer + * for sign reasons. 
*/ -#ifdef CONFIG_X86_32 -#define __get_user_8(ret, x, ptr) \ -do { \ - register unsigned long long __xx asm("%edx"); \ - asm volatile("call __get_user_8" \ - : "=a" (ret), "=r" (__xx) \ - : "0" (ptr)); \ - (x) = __xx; \ -} while (0) - -#else -#define __get_user_8(__ret_gu, __val_gu, ptr) \ - __get_user_x(8, __ret_gu, __val_gu, ptr) -#endif - #define get_user(x, ptr) \ ({ \ int __ret_gu; \ - struct { \ - unsigned long long __val_n : 8*sizeof(*(ptr)); \ - } __val_gu; \ + register __inttype(*(ptr)) __val_gu asm("%edx"); \ __chk_user_ptr(ptr); \ might_fault(); \ - switch (sizeof(*(ptr))) { \ - case 1: \ - __get_user_x(1, __ret_gu, __val_gu.__val_n, ptr); \ - break; \ - case 2: \ - __get_user_x(2, __ret_gu, __val_gu.__val_n, ptr); \ - break; \ - case 4: \ - __get_user_x(4, __ret_gu, __val_gu.__val_n, ptr); \ - break; \ - case 8: \ - __get_user_8(__ret_gu, __val_gu.__val_n, ptr); \ - break; \ - default: \ - __get_user_x(X, __ret_gu, __val_gu.__val_n, ptr); \ - break; \ - } \ - (x) = (__typeof__(*(ptr)))__val_gu.__val_n; \ + asm volatile("call __get_user_%P3" \ + : "=a" (__ret_gu), "=r" (__val_gu) \ + : "0" (ptr), "i" (sizeof(*(ptr)))); \ + (x) = (__typeof__(*(ptr))) __val_gu; \ __ret_gu; \ }) From ff52c3b02b3f73178bfe0c219cd22abdcb0e46c3 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Tue, 12 Feb 2013 15:37:02 -0800 Subject: [PATCH 104/105] x86, doc: Clarify the use of asm("%edx") in uaccess.h MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Put in a comment that explains that the use of asm("%edx") in uaccess.h doesn't necessarily mean %edx alone. Cc: Jamie Lokier Cc: Ville Syrjälä Cc: Borislav Petkov Cc: Russell King Cc: Linus Torvalds Cc: H. J. Lu Link: http://lkml.kernel.org/r/511ACDFB.1050707@zytor.com Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/uaccess.h | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index d710a2555fd6..5ee26875baea 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -148,9 +148,16 @@ __typeof__(__builtin_choose_expr(sizeof(x) > sizeof(0UL), 0ULL, 0UL)) * * Returns zero on success, or -EFAULT on error. * On error, the variable @x is set to zero. - * + */ +/* * Careful: we have to cast the result to the type of the pointer * for sign reasons. + * + * The use of %edx as the register specifier is a bit of a + * simplification, as gcc only cares about it as the starting point + * and not size: for a 64-bit value it will use %ecx:%edx on 32 bits + * (%ecx being the next register in gcc's x86 register sequence), and + * %rdx on 64 bits. */ #define get_user(x, ptr) \ ({ \ From 95c9608478d639dcffc14ea47b31bff021a99ed1 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Thu, 14 Feb 2013 14:02:52 -0800 Subject: [PATCH 105/105] x86, mm: Move reserving low memory later in initialization Move the reservation of low memory, except for the 4K which actually does belong to the BIOS, later in the initialization; in particular, after we have already reserved the trampoline. The current code locates the trampoline as high as possible, so by deferring the allocation we will still be able to reserve as much memory as is possible. This allows us to run with reservelow=640k without getting a crash on system startup. Signed-off-by: H. 
Peter Anvin Link: http://lkml.kernel.org/n/tip-0y9dqmmsousf69wutxwl3kkf@git.kernel.org --- arch/x86/kernel/setup.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 8354399b3aae..0aebd776018e 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -608,8 +608,6 @@ static __init void reserve_ibft_region(void) memblock_reserve(addr, size); } -static unsigned reserve_low = CONFIG_X86_RESERVE_LOW << 10; - static bool __init snb_gfx_workaround_needed(void) { #ifdef CONFIG_PCI @@ -698,8 +696,7 @@ static void __init trim_bios_range(void) * since some BIOSes are known to corrupt low memory. See the * Kconfig help text for X86_RESERVE_LOW. */ - e820_update_range(0, ALIGN(reserve_low, PAGE_SIZE), - E820_RAM, E820_RESERVED); + e820_update_range(0, PAGE_SIZE, E820_RAM, E820_RESERVED); /* * special case: Some BIOSen report the PC BIOS @@ -711,6 +708,8 @@ static void __init trim_bios_range(void) sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); } +static unsigned reserve_low = CONFIG_X86_RESERVE_LOW << 10; + static int __init parse_reservelow(char *p) { unsigned long long size; @@ -733,6 +732,11 @@ static int __init parse_reservelow(char *p) early_param("reservelow", parse_reservelow); +static void __init trim_low_memory_range(void) +{ + memblock_reserve(0, ALIGN(reserve_low, PAGE_SIZE)); +} + /* * Determine if we were loaded by an EFI loader. If so, then we have also been * passed the efi memmap, systab, etc., so we should use these data structures @@ -987,6 +991,7 @@ void __init setup_arch(char **cmdline_p) setup_real_mode(); trim_platform_memory_ranges(); + trim_low_memory_range(); init_gbpages();
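
As a footnote to the get_user() rework in patches 101-104 above, the __builtin_choose_expr trick is easy to exercise outside the kernel. What follows is a minimal user-space sketch, not part of any patch in this series; it assumes only a GNU-compatible compiler (gcc or clang), and the file name and test variables are made up for illustration:

	/* inttype_demo.c -- standalone sketch of the __inttype() selection
	 * from patch 103.  __builtin_choose_expr() takes a constant first
	 * argument and, unlike the ?: operator, performs no type conversion
	 * on its result, so __typeof__ sees exactly unsigned long or
	 * unsigned long long depending on sizeof(x). */
	#include <stdio.h>

	#define __inttype(x) \
		__typeof__(__builtin_choose_expr(sizeof(x) > sizeof(0UL), 0ULL, 0UL))

	int main(void)
	{
		char c = 0;
		long long ll = 0;

		/* c and ll are referenced only inside sizeof(), so they are
		 * never evaluated; only their types matter. */
		printf("sizeof(__inttype(c))  = %zu\n", sizeof(__inttype(c)));
		printf("sizeof(__inttype(ll)) = %zu\n", sizeof(__inttype(ll)));
		return 0;
	}

Built with gcc -m32 this should print 4 and 8 (only the 64-bit value widens past unsigned long), while gcc -m64 should print 8 and 8; in both cases the selected type is exactly one or two registers wide, which is what lets a single asm("%edx") register variable cover %edx, %ecx:%edx, or %rdx as described in patch 104.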