* New static EFI runtime services virtual mapping layout which is

groundwork for kexec support on EFI - Borislav Petkov
 -----BEGIN PGP SIGNATURE-----
 Version: GnuPG v1.4.13 (GNU/Linux)
 
 iQIcBAABAgAGBQJSfMDjAAoJEC84WcCNIz1V0LIP/2WyTbJR6bL0HXwnGLpxmxag
 v0VgnKRhypNboA3WEu4a9as6AdExqB7qsiWIipHuDSMj/vkfZgHAKTd2f1iRxmsJ
 RZxzwV6YzvsWkdXjvpCoLWKSsvQDug++BAIti1PitW6RXRjo01t3ymo/Ho1CQrpI
 hNJbB3bbihMF+uqFvdSpO0KZtZE6EtnylrfBeuo0GzqqJdTGe1MmqlWmyUlEy5JW
 ZiHV8E/xTjh3N675tWPcT9hGVfCyOXXu/kPrXsJTXrdYyZL9qgA9b8SVRLs6DctX
 wVgL9lNv4wobsmZJ5DxkYl9+TaF7rbshUeIJbzrQyMVJjb3TpXk/ZpspDMAEjL7e
 bb76c1bAx4xZuUatR/f1ykkWKAEryAhXHkvwcbIBjebW33if1MgGJLk5udJQQv6H
 j+J9ROH38MDr0Geg+pM2RnCyTz8l+q+8Mfu4Yh9TSte+ttB6fr9phs3/G+fdSUn1
 0vI627v1HWzDcBh4eZjjslzJviR8PldsGVT3EsIaOnHGtk/9FPz/7n4efph4v7+9
 yqTkLvQHxsAx7f0tR/qRpkEIQ9WTMXO0IO79OC13QTSATJSl+WSPTJM7ccqOgn+P
 h89ssBnzlwmFHkuvTi599KVHdzOZrWTsB2zROn+NnSchJ+YAY6TXwznk3/MNo9rB
 d+euVcffL4yfWZ7Bzj8Z
 =w6ff
 -----END PGP SIGNATURE-----

Merge tag 'efi-next' of git://git.kernel.org/pub/scm/linux/kernel/git/mfleming/efi into x86/efi

Pull EFI virtual mapping changes from Matt Fleming:

  * New static EFI runtime services virtual mapping layout which is
    groundwork for kexec support on EFI. (Borislav Petkov)

Signed-off-by: Ingo Molnar <mingo@kernel.org>
This commit is contained in:
Ingo Molnar 2013-11-26 12:23:04 +01:00
commit 61d0669775
10 changed files with 755 additions and 70 deletions

View file

@ -890,6 +890,12 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
edd= [EDD]
Format: {"off" | "on" | "skip[mbr]"}
efi= [EFI]
Format: { "old_map" }
old_map [X86-64]: switch to the old ioremap-based EFI
runtime services mapping. 32-bit still uses this one by
default.
efi_no_storage_paranoia [EFI; X86]
Using this parameter you can use more than 50% of
your efi variable storage. Use this parameter only if

View file

@ -28,4 +28,11 @@ reference.
Current X86-64 implementations only support 40 bits of address space,
but we support up to 46 bits. This expands into MBZ space in the page tables.
->trampoline_pgd:
We map EFI runtime services in the aforementioned PGD in the virtual
range of 64Gb (arbitrarily set, can be raised if needed)
0xffffffef00000000 - 0xffffffff00000000
-Andi Kleen, Jul 2004

View file

@ -1,6 +1,24 @@
#ifndef _ASM_X86_EFI_H
#define _ASM_X86_EFI_H
/*
* We map the EFI regions needed for runtime services non-contiguously,
* with preserved alignment on virtual addresses starting from -4G down
* for a total max space of 64G. This way, we provide for stable runtime
* services addresses across kernels so that a kexec'd kernel can still
* use them.
*
* This is the main reason why we're doing stable VA mappings for RT
* services.
*
* This flag is used in conjuction with a chicken bit called
* "efi=old_map" which can be used as a fallback to the old runtime
* services mapping method in case there's some b0rkage with a
* particular EFI implementation (haha, it is hard to hold up the
* sarcasm here...).
*/
#define EFI_OLD_MEMMAP EFI_ARCH_1
#ifdef CONFIG_X86_32
#define EFI_LOADER_SIGNATURE "EL32"
@ -69,24 +87,31 @@ extern u64 efi_call6(void *fp, u64 arg1, u64 arg2, u64 arg3,
efi_call6((f), (u64)(a1), (u64)(a2), (u64)(a3), \
(u64)(a4), (u64)(a5), (u64)(a6))
#define _efi_call_virtX(x, f, ...) \
({ \
efi_status_t __s; \
\
efi_sync_low_kernel_mappings(); \
preempt_disable(); \
__s = efi_call##x((void *)efi.systab->runtime->f, __VA_ARGS__); \
preempt_enable(); \
__s; \
})
#define efi_call_virt0(f) \
efi_call0((efi.systab->runtime->f))
#define efi_call_virt1(f, a1) \
efi_call1((efi.systab->runtime->f), (u64)(a1))
#define efi_call_virt2(f, a1, a2) \
efi_call2((efi.systab->runtime->f), (u64)(a1), (u64)(a2))
#define efi_call_virt3(f, a1, a2, a3) \
efi_call3((efi.systab->runtime->f), (u64)(a1), (u64)(a2), \
(u64)(a3))
#define efi_call_virt4(f, a1, a2, a3, a4) \
efi_call4((efi.systab->runtime->f), (u64)(a1), (u64)(a2), \
(u64)(a3), (u64)(a4))
#define efi_call_virt5(f, a1, a2, a3, a4, a5) \
efi_call5((efi.systab->runtime->f), (u64)(a1), (u64)(a2), \
(u64)(a3), (u64)(a4), (u64)(a5))
#define efi_call_virt6(f, a1, a2, a3, a4, a5, a6) \
efi_call6((efi.systab->runtime->f), (u64)(a1), (u64)(a2), \
(u64)(a3), (u64)(a4), (u64)(a5), (u64)(a6))
_efi_call_virtX(0, f)
#define efi_call_virt1(f, a1) \
_efi_call_virtX(1, f, (u64)(a1))
#define efi_call_virt2(f, a1, a2) \
_efi_call_virtX(2, f, (u64)(a1), (u64)(a2))
#define efi_call_virt3(f, a1, a2, a3) \
_efi_call_virtX(3, f, (u64)(a1), (u64)(a2), (u64)(a3))
#define efi_call_virt4(f, a1, a2, a3, a4) \
_efi_call_virtX(4, f, (u64)(a1), (u64)(a2), (u64)(a3), (u64)(a4))
#define efi_call_virt5(f, a1, a2, a3, a4, a5) \
_efi_call_virtX(5, f, (u64)(a1), (u64)(a2), (u64)(a3), (u64)(a4), (u64)(a5))
#define efi_call_virt6(f, a1, a2, a3, a4, a5, a6) \
_efi_call_virtX(6, f, (u64)(a1), (u64)(a2), (u64)(a3), (u64)(a4), (u64)(a5), (u64)(a6))
extern void __iomem *efi_ioremap(unsigned long addr, unsigned long size,
u32 type, u64 attribute);
@ -95,12 +120,17 @@ extern void __iomem *efi_ioremap(unsigned long addr, unsigned long size,
extern int add_efi_memmap;
extern unsigned long x86_efi_facility;
extern struct efi_scratch efi_scratch;
extern void efi_set_executable(efi_memory_desc_t *md, bool executable);
extern int efi_memblock_x86_reserve_range(void);
extern void efi_call_phys_prelog(void);
extern void efi_call_phys_epilog(void);
extern void efi_unmap_memmap(void);
extern void efi_memory_uc(u64 addr, unsigned long size);
extern void __init efi_map_region(efi_memory_desc_t *md);
extern void efi_sync_low_kernel_mappings(void);
extern void efi_setup_page_tables(void);
extern void __init old_map_region(efi_memory_desc_t *md);
#ifdef CONFIG_EFI

View file

@ -382,7 +382,8 @@ static inline void update_page_count(int level, unsigned long pages) { }
*/
extern pte_t *lookup_address(unsigned long address, unsigned int *level);
extern phys_addr_t slow_virt_to_phys(void *__address);
extern int kernel_map_pages_in_pgd(pgd_t *pgd, u64 pfn, unsigned long address,
unsigned numpages, unsigned long page_flags);
#endif /* !__ASSEMBLY__ */
#endif /* _ASM_X86_PGTABLE_DEFS_H */

View file

@ -30,6 +30,7 @@
*/
struct cpa_data {
unsigned long *vaddr;
pgd_t *pgd;
pgprot_t mask_set;
pgprot_t mask_clr;
int numpages;
@ -322,17 +323,9 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address,
return prot;
}
/*
* Lookup the page table entry for a virtual address. Return a pointer
* to the entry and the level of the mapping.
*
* Note: We return pud and pmd either when the entry is marked large
* or when the present bit is not set. Otherwise we would return a
* pointer to a nonexisting mapping.
*/
pte_t *lookup_address(unsigned long address, unsigned int *level)
static pte_t *__lookup_address_in_pgd(pgd_t *pgd, unsigned long address,
unsigned int *level)
{
pgd_t *pgd = pgd_offset_k(address);
pud_t *pud;
pmd_t *pmd;
@ -361,8 +354,31 @@ pte_t *lookup_address(unsigned long address, unsigned int *level)
return pte_offset_kernel(pmd, address);
}
/*
* Lookup the page table entry for a virtual address. Return a pointer
* to the entry and the level of the mapping.
*
* Note: We return pud and pmd either when the entry is marked large
* or when the present bit is not set. Otherwise we would return a
* pointer to a nonexisting mapping.
*/
pte_t *lookup_address(unsigned long address, unsigned int *level)
{
return __lookup_address_in_pgd(pgd_offset_k(address), address, level);
}
EXPORT_SYMBOL_GPL(lookup_address);
static pte_t *_lookup_address_cpa(struct cpa_data *cpa, unsigned long address,
unsigned int *level)
{
if (cpa->pgd)
return __lookup_address_in_pgd(cpa->pgd + pgd_index(address),
address, level);
return lookup_address(address, level);
}
/*
* This is necessary because __pa() does not work on some
* kinds of memory, like vmalloc() or the alloc_remap()
@ -437,7 +453,7 @@ try_preserve_large_page(pte_t *kpte, unsigned long address,
* Check for races, another CPU might have split this page
* up already:
*/
tmp = lookup_address(address, &level);
tmp = _lookup_address_cpa(cpa, address, &level);
if (tmp != kpte)
goto out_unlock;
@ -543,7 +559,8 @@ out_unlock:
}
static int
__split_large_page(pte_t *kpte, unsigned long address, struct page *base)
__split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address,
struct page *base)
{
pte_t *pbase = (pte_t *)page_address(base);
unsigned long pfn, pfninc = 1;
@ -556,7 +573,7 @@ __split_large_page(pte_t *kpte, unsigned long address, struct page *base)
* Check for races, another CPU might have split this page
* up for us already:
*/
tmp = lookup_address(address, &level);
tmp = _lookup_address_cpa(cpa, address, &level);
if (tmp != kpte) {
spin_unlock(&pgd_lock);
return 1;
@ -632,7 +649,8 @@ __split_large_page(pte_t *kpte, unsigned long address, struct page *base)
return 0;
}
static int split_large_page(pte_t *kpte, unsigned long address)
static int split_large_page(struct cpa_data *cpa, pte_t *kpte,
unsigned long address)
{
struct page *base;
@ -644,15 +662,390 @@ static int split_large_page(pte_t *kpte, unsigned long address)
if (!base)
return -ENOMEM;
if (__split_large_page(kpte, address, base))
if (__split_large_page(cpa, kpte, address, base))
__free_page(base);
return 0;
}
static bool try_to_free_pte_page(pte_t *pte)
{
int i;
for (i = 0; i < PTRS_PER_PTE; i++)
if (!pte_none(pte[i]))
return false;
free_page((unsigned long)pte);
return true;
}
static bool try_to_free_pmd_page(pmd_t *pmd)
{
int i;
for (i = 0; i < PTRS_PER_PMD; i++)
if (!pmd_none(pmd[i]))
return false;
free_page((unsigned long)pmd);
return true;
}
static bool unmap_pte_range(pmd_t *pmd, unsigned long start, unsigned long end)
{
pte_t *pte = pte_offset_kernel(pmd, start);
while (start < end) {
set_pte(pte, __pte(0));
start += PAGE_SIZE;
pte++;
}
if (try_to_free_pte_page((pte_t *)pmd_page_vaddr(*pmd))) {
pmd_clear(pmd);
return true;
}
return false;
}
static void __unmap_pmd_range(pud_t *pud, pmd_t *pmd,
unsigned long start, unsigned long end)
{
if (unmap_pte_range(pmd, start, end))
if (try_to_free_pmd_page((pmd_t *)pud_page_vaddr(*pud)))
pud_clear(pud);
}
static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end)
{
pmd_t *pmd = pmd_offset(pud, start);
/*
* Not on a 2MB page boundary?
*/
if (start & (PMD_SIZE - 1)) {
unsigned long next_page = (start + PMD_SIZE) & PMD_MASK;
unsigned long pre_end = min_t(unsigned long, end, next_page);
__unmap_pmd_range(pud, pmd, start, pre_end);
start = pre_end;
pmd++;
}
/*
* Try to unmap in 2M chunks.
*/
while (end - start >= PMD_SIZE) {
if (pmd_large(*pmd))
pmd_clear(pmd);
else
__unmap_pmd_range(pud, pmd, start, start + PMD_SIZE);
start += PMD_SIZE;
pmd++;
}
/*
* 4K leftovers?
*/
if (start < end)
return __unmap_pmd_range(pud, pmd, start, end);
/*
* Try again to free the PMD page if haven't succeeded above.
*/
if (!pud_none(*pud))
if (try_to_free_pmd_page((pmd_t *)pud_page_vaddr(*pud)))
pud_clear(pud);
}
static void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end)
{
pud_t *pud = pud_offset(pgd, start);
/*
* Not on a GB page boundary?
*/
if (start & (PUD_SIZE - 1)) {
unsigned long next_page = (start + PUD_SIZE) & PUD_MASK;
unsigned long pre_end = min_t(unsigned long, end, next_page);
unmap_pmd_range(pud, start, pre_end);
start = pre_end;
pud++;
}
/*
* Try to unmap in 1G chunks?
*/
while (end - start >= PUD_SIZE) {
if (pud_large(*pud))
pud_clear(pud);
else
unmap_pmd_range(pud, start, start + PUD_SIZE);
start += PUD_SIZE;
pud++;
}
/*
* 2M leftovers?
*/
if (start < end)
unmap_pmd_range(pud, start, end);
/*
* No need to try to free the PUD page because we'll free it in
* populate_pgd's error path
*/
}
static int alloc_pte_page(pmd_t *pmd)
{
pte_t *pte = (pte_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK);
if (!pte)
return -1;
set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE));
return 0;
}
static int alloc_pmd_page(pud_t *pud)
{
pmd_t *pmd = (pmd_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK);
if (!pmd)
return -1;
set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE));
return 0;
}
static void populate_pte(struct cpa_data *cpa,
unsigned long start, unsigned long end,
unsigned num_pages, pmd_t *pmd, pgprot_t pgprot)
{
pte_t *pte;
pte = pte_offset_kernel(pmd, start);
while (num_pages-- && start < end) {
/* deal with the NX bit */
if (!(pgprot_val(pgprot) & _PAGE_NX))
cpa->pfn &= ~_PAGE_NX;
set_pte(pte, pfn_pte(cpa->pfn >> PAGE_SHIFT, pgprot));
start += PAGE_SIZE;
cpa->pfn += PAGE_SIZE;
pte++;
}
}
static int populate_pmd(struct cpa_data *cpa,
unsigned long start, unsigned long end,
unsigned num_pages, pud_t *pud, pgprot_t pgprot)
{
unsigned int cur_pages = 0;
pmd_t *pmd;
/*
* Not on a 2M boundary?
*/
if (start & (PMD_SIZE - 1)) {
unsigned long pre_end = start + (num_pages << PAGE_SHIFT);
unsigned long next_page = (start + PMD_SIZE) & PMD_MASK;
pre_end = min_t(unsigned long, pre_end, next_page);
cur_pages = (pre_end - start) >> PAGE_SHIFT;
cur_pages = min_t(unsigned int, num_pages, cur_pages);
/*
* Need a PTE page?
*/
pmd = pmd_offset(pud, start);
if (pmd_none(*pmd))
if (alloc_pte_page(pmd))
return -1;
populate_pte(cpa, start, pre_end, cur_pages, pmd, pgprot);
start = pre_end;
}
/*
* We mapped them all?
*/
if (num_pages == cur_pages)
return cur_pages;
while (end - start >= PMD_SIZE) {
/*
* We cannot use a 1G page so allocate a PMD page if needed.
*/
if (pud_none(*pud))
if (alloc_pmd_page(pud))
return -1;
pmd = pmd_offset(pud, start);
set_pmd(pmd, __pmd(cpa->pfn | _PAGE_PSE | massage_pgprot(pgprot)));
start += PMD_SIZE;
cpa->pfn += PMD_SIZE;
cur_pages += PMD_SIZE >> PAGE_SHIFT;
}
/*
* Map trailing 4K pages.
*/
if (start < end) {
pmd = pmd_offset(pud, start);
if (pmd_none(*pmd))
if (alloc_pte_page(pmd))
return -1;
populate_pte(cpa, start, end, num_pages - cur_pages,
pmd, pgprot);
}
return num_pages;
}
static int populate_pud(struct cpa_data *cpa, unsigned long start, pgd_t *pgd,
pgprot_t pgprot)
{
pud_t *pud;
unsigned long end;
int cur_pages = 0;
end = start + (cpa->numpages << PAGE_SHIFT);
/*
* Not on a Gb page boundary? => map everything up to it with
* smaller pages.
*/
if (start & (PUD_SIZE - 1)) {
unsigned long pre_end;
unsigned long next_page = (start + PUD_SIZE) & PUD_MASK;
pre_end = min_t(unsigned long, end, next_page);
cur_pages = (pre_end - start) >> PAGE_SHIFT;
cur_pages = min_t(int, (int)cpa->numpages, cur_pages);
pud = pud_offset(pgd, start);
/*
* Need a PMD page?
*/
if (pud_none(*pud))
if (alloc_pmd_page(pud))
return -1;
cur_pages = populate_pmd(cpa, start, pre_end, cur_pages,
pud, pgprot);
if (cur_pages < 0)
return cur_pages;
start = pre_end;
}
/* We mapped them all? */
if (cpa->numpages == cur_pages)
return cur_pages;
pud = pud_offset(pgd, start);
/*
* Map everything starting from the Gb boundary, possibly with 1G pages
*/
while (end - start >= PUD_SIZE) {
set_pud(pud, __pud(cpa->pfn | _PAGE_PSE | massage_pgprot(pgprot)));
start += PUD_SIZE;
cpa->pfn += PUD_SIZE;
cur_pages += PUD_SIZE >> PAGE_SHIFT;
pud++;
}
/* Map trailing leftover */
if (start < end) {
int tmp;
pud = pud_offset(pgd, start);
if (pud_none(*pud))
if (alloc_pmd_page(pud))
return -1;
tmp = populate_pmd(cpa, start, end, cpa->numpages - cur_pages,
pud, pgprot);
if (tmp < 0)
return cur_pages;
cur_pages += tmp;
}
return cur_pages;
}
/*
* Restrictions for kernel page table do not necessarily apply when mapping in
* an alternate PGD.
*/
static int populate_pgd(struct cpa_data *cpa, unsigned long addr)
{
pgprot_t pgprot = __pgprot(_KERNPG_TABLE);
bool allocd_pgd = false;
pgd_t *pgd_entry;
pud_t *pud = NULL; /* shut up gcc */
int ret;
pgd_entry = cpa->pgd + pgd_index(addr);
/*
* Allocate a PUD page and hand it down for mapping.
*/
if (pgd_none(*pgd_entry)) {
pud = (pud_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK);
if (!pud)
return -1;
set_pgd(pgd_entry, __pgd(__pa(pud) | _KERNPG_TABLE));
allocd_pgd = true;
}
pgprot_val(pgprot) &= ~pgprot_val(cpa->mask_clr);
pgprot_val(pgprot) |= pgprot_val(cpa->mask_set);
ret = populate_pud(cpa, addr, pgd_entry, pgprot);
if (ret < 0) {
unmap_pud_range(pgd_entry, addr,
addr + (cpa->numpages << PAGE_SHIFT));
if (allocd_pgd) {
/*
* If I allocated this PUD page, I can just as well
* free it in this error path.
*/
pgd_clear(pgd_entry);
free_page((unsigned long)pud);
}
return ret;
}
cpa->numpages = ret;
return 0;
}
static int __cpa_process_fault(struct cpa_data *cpa, unsigned long vaddr,
int primary)
{
if (cpa->pgd)
return populate_pgd(cpa, vaddr);
/*
* Ignore all non primary paths.
*/
@ -697,7 +1090,7 @@ static int __change_page_attr(struct cpa_data *cpa, int primary)
else
address = *cpa->vaddr;
repeat:
kpte = lookup_address(address, &level);
kpte = _lookup_address_cpa(cpa, address, &level);
if (!kpte)
return __cpa_process_fault(cpa, address, primary);
@ -761,7 +1154,7 @@ repeat:
/*
* We have to split the large page:
*/
err = split_large_page(kpte, address);
err = split_large_page(cpa, kpte, address);
if (!err) {
/*
* Do a global flush tlb after splitting the large page
@ -910,6 +1303,8 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages,
int ret, cache, checkalias;
unsigned long baddr = 0;
memset(&cpa, 0, sizeof(cpa));
/*
* Check, if we are requested to change a not supported
* feature:
@ -1356,6 +1751,7 @@ static int __set_pages_p(struct page *page, int numpages)
{
unsigned long tempaddr = (unsigned long) page_address(page);
struct cpa_data cpa = { .vaddr = &tempaddr,
.pgd = NULL,
.numpages = numpages,
.mask_set = __pgprot(_PAGE_PRESENT | _PAGE_RW),
.mask_clr = __pgprot(0),
@ -1374,6 +1770,7 @@ static int __set_pages_np(struct page *page, int numpages)
{
unsigned long tempaddr = (unsigned long) page_address(page);
struct cpa_data cpa = { .vaddr = &tempaddr,
.pgd = NULL,
.numpages = numpages,
.mask_set = __pgprot(0),
.mask_clr = __pgprot(_PAGE_PRESENT | _PAGE_RW),
@ -1434,6 +1831,36 @@ bool kernel_page_present(struct page *page)
#endif /* CONFIG_DEBUG_PAGEALLOC */
int kernel_map_pages_in_pgd(pgd_t *pgd, u64 pfn, unsigned long address,
unsigned numpages, unsigned long page_flags)
{
int retval = -EINVAL;
struct cpa_data cpa = {
.vaddr = &address,
.pfn = pfn,
.pgd = pgd,
.numpages = numpages,
.mask_set = __pgprot(0),
.mask_clr = __pgprot(0),
.flags = 0,
};
if (!(__supported_pte_mask & _PAGE_NX))
goto out;
if (!(page_flags & _PAGE_NX))
cpa.mask_clr = __pgprot(_PAGE_NX);
cpa.mask_set = __pgprot(_PAGE_PRESENT | page_flags);
retval = __change_page_attr_set_clr(&cpa, 0);
__flush_tlb_all();
out:
return retval;
}
/*
* The testcases use internal knowledge of the implementation that shouldn't
* be exposed to the rest of the kernel. Include these directly here.

View file

@ -12,6 +12,8 @@
* Bibo Mao <bibo.mao@intel.com>
* Chandramouli Narayanan <mouli@linux.intel.com>
* Huang Ying <ying.huang@intel.com>
* Copyright (C) 2013 SuSE Labs
* Borislav Petkov <bp@suse.de> - runtime services VA mapping
*
* Copied from efi_32.c to eliminate the duplicated code between EFI
* 32/64 support code. --ying 2007-10-26
@ -51,7 +53,7 @@
#include <asm/x86_init.h>
#include <asm/rtc.h>
#define EFI_DEBUG 1
#define EFI_DEBUG
#define EFI_MIN_RESERVE 5120
@ -398,9 +400,9 @@ int __init efi_memblock_x86_reserve_range(void)
return 0;
}
#if EFI_DEBUG
static void __init print_efi_memmap(void)
{
#ifdef EFI_DEBUG
efi_memory_desc_t *md;
void *p;
int i;
@ -415,8 +417,8 @@ static void __init print_efi_memmap(void)
md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT),
(md->num_pages >> (20 - EFI_PAGE_SHIFT)));
}
}
#endif /* EFI_DEBUG */
}
void __init efi_reserve_boot_services(void)
{
@ -696,10 +698,7 @@ void __init efi_init(void)
x86_platform.set_wallclock = efi_set_rtc_mmss;
}
#endif
#if EFI_DEBUG
print_efi_memmap();
#endif
}
void __init efi_late_init(void)
@ -748,21 +747,56 @@ void efi_memory_uc(u64 addr, unsigned long size)
set_memory_uc(addr, npages);
}
void __init old_map_region(efi_memory_desc_t *md)
{
u64 start_pfn, end_pfn, end;
unsigned long size;
void *va;
start_pfn = PFN_DOWN(md->phys_addr);
size = md->num_pages << PAGE_SHIFT;
end = md->phys_addr + size;
end_pfn = PFN_UP(end);
if (pfn_range_is_mapped(start_pfn, end_pfn)) {
va = __va(md->phys_addr);
if (!(md->attribute & EFI_MEMORY_WB))
efi_memory_uc((u64)(unsigned long)va, size);
} else
va = efi_ioremap(md->phys_addr, size,
md->type, md->attribute);
md->virt_addr = (u64) (unsigned long) va;
if (!va)
pr_err("ioremap of 0x%llX failed!\n",
(unsigned long long)md->phys_addr);
}
/*
* This function will switch the EFI runtime services to virtual mode.
* Essentially, look through the EFI memmap and map every region that
* has the runtime attribute bit set in its memory descriptor and update
* that memory descriptor with the virtual address obtained from ioremap().
* This enables the runtime services to be called without having to
* Essentially, we look through the EFI memmap and map every region that
* has the runtime attribute bit set in its memory descriptor into the
* ->trampoline_pgd page table using a top-down VA allocation scheme.
*
* The old method which used to update that memory descriptor with the
* virtual address obtained from ioremap() is still supported when the
* kernel is booted with efi=old_map on its command line. Same old
* method enabled the runtime services to be called without having to
* thunk back into physical mode for every invocation.
*
* The new method does a pagetable switch in a preemption-safe manner
* so that we're in a different address space when calling a runtime
* function. For function arguments passing we do copy the PGDs of the
* kernel page table into ->trampoline_pgd prior to each call.
*/
void __init efi_enter_virtual_mode(void)
{
efi_memory_desc_t *md, *prev_md = NULL;
efi_status_t status;
void *p, *new_memmap = NULL;
unsigned long size;
u64 end, systab, start_pfn, end_pfn;
void *p, *va, *new_memmap = NULL;
efi_status_t status;
u64 end, systab;
int count = 0;
efi.systab = NULL;
@ -771,7 +805,6 @@ void __init efi_enter_virtual_mode(void)
* We don't do virtual mode, since we don't do runtime services, on
* non-native EFI
*/
if (!efi_is_native()) {
efi_unmap_memmap();
return;
@ -802,6 +835,7 @@ void __init efi_enter_virtual_mode(void)
continue;
}
prev_md = md;
}
for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
@ -814,36 +848,24 @@ void __init efi_enter_virtual_mode(void)
continue;
}
efi_map_region(md);
size = md->num_pages << EFI_PAGE_SHIFT;
end = md->phys_addr + size;
start_pfn = PFN_DOWN(md->phys_addr);
end_pfn = PFN_UP(end);
if (pfn_range_is_mapped(start_pfn, end_pfn)) {
va = __va(md->phys_addr);
if (!(md->attribute & EFI_MEMORY_WB))
efi_memory_uc((u64)(unsigned long)va, size);
} else
va = efi_ioremap(md->phys_addr, size,
md->type, md->attribute);
md->virt_addr = (u64) (unsigned long) va;
if (!va) {
pr_err("ioremap of 0x%llX failed!\n",
(unsigned long long)md->phys_addr);
continue;
}
systab = (u64) (unsigned long) efi_phys.systab;
if (md->phys_addr <= systab && systab < end) {
systab += md->virt_addr - md->phys_addr;
efi.systab = (efi_system_table_t *) (unsigned long) systab;
}
new_memmap = krealloc(new_memmap,
(count + 1) * memmap.desc_size,
GFP_KERNEL);
if (!new_memmap)
goto err_out;
memcpy(new_memmap + (count * memmap.desc_size), md,
memmap.desc_size);
count++;
@ -851,6 +873,9 @@ void __init efi_enter_virtual_mode(void)
BUG_ON(!efi.systab);
efi_setup_page_tables();
efi_sync_low_kernel_mappings();
status = phys_efi_set_virtual_address_map(
memmap.desc_size * count,
memmap.desc_size,
@ -883,7 +908,8 @@ void __init efi_enter_virtual_mode(void)
efi.query_variable_info = virt_efi_query_variable_info;
efi.update_capsule = virt_efi_update_capsule;
efi.query_capsule_caps = virt_efi_query_capsule_caps;
if (__supported_pte_mask & _PAGE_NX)
if (efi_enabled(EFI_OLD_MEMMAP) && (__supported_pte_mask & _PAGE_NX))
runtime_code_page_mkexec();
kfree(new_memmap);
@ -894,6 +920,11 @@ void __init efi_enter_virtual_mode(void)
EFI_VARIABLE_BOOTSERVICE_ACCESS |
EFI_VARIABLE_RUNTIME_ACCESS,
0, NULL);
return;
err_out:
pr_err("Error reallocating memory, EFI runtime non-functional!\n");
}
/*
@ -1013,3 +1044,15 @@ efi_status_t efi_query_variable_store(u32 attributes, unsigned long size)
return EFI_SUCCESS;
}
EXPORT_SYMBOL_GPL(efi_query_variable_store);
static int __init parse_efi_cmdline(char *str)
{
if (*str == '=')
str++;
if (!strncmp(str, "old_map", 7))
set_bit(EFI_OLD_MEMMAP, &x86_efi_facility);
return 0;
}
early_param("efi", parse_efi_cmdline);

View file

@ -37,9 +37,16 @@
* claim EFI runtime service handler exclusively and to duplicate a memory in
* low memory space say 0 - 3G.
*/
static unsigned long efi_rt_eflags;
void efi_sync_low_kernel_mappings(void) {}
void efi_setup_page_tables(void) {}
void __init efi_map_region(efi_memory_desc_t *md)
{
old_map_region(md);
}
void efi_call_phys_prelog(void)
{
struct desc_ptr gdt_descr;

View file

@ -38,10 +38,28 @@
#include <asm/efi.h>
#include <asm/cacheflush.h>
#include <asm/fixmap.h>
#include <asm/realmode.h>
static pgd_t *save_pgd __initdata;
static unsigned long efi_flags __initdata;
/*
* We allocate runtime services regions bottom-up, starting from -4G, i.e.
* 0xffff_ffff_0000_0000 and limit EFI VA mapping space to 64G.
*/
static u64 efi_va = -4 * (1UL << 30);
#define EFI_VA_END (-68 * (1UL << 30))
/*
* Scratch space used for switching the pagetable in the EFI stub
*/
struct efi_scratch {
u64 r15;
u64 prev_cr3;
pgd_t *efi_pgt;
bool use_pgd;
};
static void __init early_code_mapping_set_exec(int executable)
{
efi_memory_desc_t *md;
@ -65,6 +83,9 @@ void __init efi_call_phys_prelog(void)
int pgd;
int n_pgds;
if (!efi_enabled(EFI_OLD_MEMMAP))
return;
early_code_mapping_set_exec(1);
local_irq_save(efi_flags);
@ -86,6 +107,10 @@ void __init efi_call_phys_epilog(void)
*/
int pgd;
int n_pgds = DIV_ROUND_UP((max_pfn << PAGE_SHIFT) , PGDIR_SIZE);
if (!efi_enabled(EFI_OLD_MEMMAP))
return;
for (pgd = 0; pgd < n_pgds; pgd++)
set_pgd(pgd_offset_k(pgd * PGDIR_SIZE), save_pgd[pgd]);
kfree(save_pgd);
@ -94,6 +119,90 @@ void __init efi_call_phys_epilog(void)
early_code_mapping_set_exec(0);
}
/*
* Add low kernel mappings for passing arguments to EFI functions.
*/
void efi_sync_low_kernel_mappings(void)
{
unsigned num_pgds;
pgd_t *pgd = (pgd_t *)__va(real_mode_header->trampoline_pgd);
if (efi_enabled(EFI_OLD_MEMMAP))
return;
num_pgds = pgd_index(MODULES_END - 1) - pgd_index(PAGE_OFFSET);
memcpy(pgd + pgd_index(PAGE_OFFSET),
init_mm.pgd + pgd_index(PAGE_OFFSET),
sizeof(pgd_t) * num_pgds);
}
void efi_setup_page_tables(void)
{
efi_scratch.efi_pgt = (pgd_t *)(unsigned long)real_mode_header->trampoline_pgd;
if (!efi_enabled(EFI_OLD_MEMMAP))
efi_scratch.use_pgd = true;
}
static void __init __map_region(efi_memory_desc_t *md, u64 va)
{
pgd_t *pgd = (pgd_t *)__va(real_mode_header->trampoline_pgd);
unsigned long pf = 0, size;
u64 end;
if (!(md->attribute & EFI_MEMORY_WB))
pf |= _PAGE_PCD;
size = md->num_pages << PAGE_SHIFT;
end = va + size;
if (kernel_map_pages_in_pgd(pgd, md->phys_addr, va, md->num_pages, pf))
pr_warn("Error mapping PA 0x%llx -> VA 0x%llx!\n",
md->phys_addr, va);
}
void __init efi_map_region(efi_memory_desc_t *md)
{
unsigned long size = md->num_pages << PAGE_SHIFT;
u64 pa = md->phys_addr;
if (efi_enabled(EFI_OLD_MEMMAP))
return old_map_region(md);
/*
* Make sure the 1:1 mappings are present as a catch-all for b0rked
* firmware which doesn't update all internal pointers after switching
* to virtual mode and would otherwise crap on us.
*/
__map_region(md, md->phys_addr);
efi_va -= size;
/* Is PA 2M-aligned? */
if (!(pa & (PMD_SIZE - 1))) {
efi_va &= PMD_MASK;
} else {
u64 pa_offset = pa & (PMD_SIZE - 1);
u64 prev_va = efi_va;
/* get us the same offset within this 2M page */
efi_va = (efi_va & PMD_MASK) + pa_offset;
if (efi_va > prev_va)
efi_va -= PMD_SIZE;
}
if (efi_va < EFI_VA_END) {
pr_warn(FW_WARN "VA address range overflow!\n");
return;
}
/* Do the VA map */
__map_region(md, efi_va);
md->virt_addr = efi_va;
}
void __iomem *__init efi_ioremap(unsigned long phys_addr, unsigned long size,
u32 type, u64 attribute)
{

View file

@ -34,10 +34,47 @@
mov %rsi, %cr0; \
mov (%rsp), %rsp
/* stolen from gcc */
.macro FLUSH_TLB_ALL
movq %r15, efi_scratch(%rip)
movq %r14, efi_scratch+8(%rip)
movq %cr4, %r15
movq %r15, %r14
andb $0x7f, %r14b
movq %r14, %cr4
movq %r15, %cr4
movq efi_scratch+8(%rip), %r14
movq efi_scratch(%rip), %r15
.endm
.macro SWITCH_PGT
cmpb $0, efi_scratch+24(%rip)
je 1f
movq %r15, efi_scratch(%rip) # r15
# save previous CR3
movq %cr3, %r15
movq %r15, efi_scratch+8(%rip) # prev_cr3
movq efi_scratch+16(%rip), %r15 # EFI pgt
movq %r15, %cr3
1:
.endm
.macro RESTORE_PGT
cmpb $0, efi_scratch+24(%rip)
je 2f
movq efi_scratch+8(%rip), %r15
movq %r15, %cr3
movq efi_scratch(%rip), %r15
FLUSH_TLB_ALL
2:
.endm
ENTRY(efi_call0)
SAVE_XMM
subq $32, %rsp
SWITCH_PGT
call *%rdi
RESTORE_PGT
addq $32, %rsp
RESTORE_XMM
ret
@ -47,7 +84,9 @@ ENTRY(efi_call1)
SAVE_XMM
subq $32, %rsp
mov %rsi, %rcx
SWITCH_PGT
call *%rdi
RESTORE_PGT
addq $32, %rsp
RESTORE_XMM
ret
@ -57,7 +96,9 @@ ENTRY(efi_call2)
SAVE_XMM
subq $32, %rsp
mov %rsi, %rcx
SWITCH_PGT
call *%rdi
RESTORE_PGT
addq $32, %rsp
RESTORE_XMM
ret
@ -68,7 +109,9 @@ ENTRY(efi_call3)
subq $32, %rsp
mov %rcx, %r8
mov %rsi, %rcx
SWITCH_PGT
call *%rdi
RESTORE_PGT
addq $32, %rsp
RESTORE_XMM
ret
@ -80,7 +123,9 @@ ENTRY(efi_call4)
mov %r8, %r9
mov %rcx, %r8
mov %rsi, %rcx
SWITCH_PGT
call *%rdi
RESTORE_PGT
addq $32, %rsp
RESTORE_XMM
ret
@ -93,7 +138,9 @@ ENTRY(efi_call5)
mov %r8, %r9
mov %rcx, %r8
mov %rsi, %rcx
SWITCH_PGT
call *%rdi
RESTORE_PGT
addq $48, %rsp
RESTORE_XMM
ret
@ -109,8 +156,15 @@ ENTRY(efi_call6)
mov %r8, %r9
mov %rcx, %r8
mov %rsi, %rcx
SWITCH_PGT
call *%rdi
RESTORE_PGT
addq $48, %rsp
RESTORE_XMM
ret
ENDPROC(efi_call6)
.data
ENTRY(efi_scratch)
.fill 3,8,0
.byte 0

View file

@ -653,6 +653,7 @@ extern int __init efi_setup_pcdp_console(char *);
#define EFI_RUNTIME_SERVICES 3 /* Can we use runtime services? */
#define EFI_MEMMAP 4 /* Can we use EFI memory map? */
#define EFI_64BIT 5 /* Is the firmware 64-bit? */
#define EFI_ARCH_1 6 /* First arch-specific bit */
#ifdef CONFIG_EFI
# ifdef CONFIG_X86