android_kernel_samsung_msm8976/arch/arm64/lib/copy_page.S
commit c182462032 (Will Deacon)
arm64: lib: improve copy_page to deal with 128 bytes at a time
We want to avoid lots of different copy_page implementations, settling
for something that is "good enough" everywhere and hopefully easy to
understand and maintain whilst we're at it.

This patch reworks our copy_page implementation based on discussions
with Cavium on the list and benchmarking on Cortex-A processors so that:

  - The loop is unrolled to copy 128 bytes per iteration

  - The reads are offset so that we read from the next 128-byte block
    in the same iteration that we store the previous block (see the C
    sketch after this list)

  - Explicit prefetch instructions are removed for now, since they hurt
    performance on CPUs with hardware prefetching

  - The loop exit condition is calculated at the start of the loop
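
The resulting structure is easiest to see in C. Below is a minimal
sketch of the same software-pipelined loop, assuming PAGE_SIZE is 4096;
the block128 type and copy_page_c name are illustrative only, and the
non-temporal hint of stnp has no C equivalent:

  #include <stddef.h>
  #include <stdint.h>

  #define PAGE_SIZE 4096

  /* One 128-byte block: sixteen 64-bit words, mirroring the eight
   * ldp/stnp register pairs used per iteration. */
  typedef struct { uint64_t w[16]; } block128;

  static void copy_page_c(void *dest, const void *src)
  {
          const block128 *s = src;
          block128 *d = dest;
          block128 cur = *s++;          /* preload the first block */
          size_t remaining = PAGE_SIZE - 128;

          do {
                  remaining -= 128;     /* exit condition computed up front */
                  block128 next = *s++; /* read the next block ...          */
                  *d++ = cur;           /* ... while storing the previous   */
                  cur = next;
          } while (remaining > 0);

          *d = cur;                     /* store the final block */
  }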

Change-Id: I0d9f3bbe4efa2751f41432a3b4b299fbb0e494be
Signed-off-by: Will Deacon <will.deacon@arm.com>
Tested-by: Andrew Pinski <apinski@cavium.com>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2017-04-18 12:17:45 +02:00

/*
 * Copyright (C) 2012 ARM Ltd.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

#include <linux/linkage.h>
#include <linux/const.h>
#include <asm/assembler.h>
#include <asm/page.h>

/*
 * Copy a page from src to dest (both are page aligned)
 *
 * Parameters:
 *	x0 - dest
 *	x1 - src
 */
ENTRY(copy_page)
	/* Preload the first 128-byte block into x2-x17. */
	ldp	x2, x3, [x1]
	ldp	x4, x5, [x1, #16]
	ldp	x6, x7, [x1, #32]
	ldp	x8, x9, [x1, #48]
	ldp	x10, x11, [x1, #64]
	ldp	x12, x13, [x1, #80]
	ldp	x14, x15, [x1, #96]
	ldp	x16, x17, [x1, #112]
	mov	x18, #(PAGE_SIZE - 128)
	add	x1, x1, #128
1:
	/*
	 * Compute the exit condition up front, then store the previous
	 * block with non-temporal stores while loading the next one.
	 */
	subs	x18, x18, #128
	stnp	x2, x3, [x0]
	ldp	x2, x3, [x1]
	stnp	x4, x5, [x0, #16]
	ldp	x4, x5, [x1, #16]
	stnp	x6, x7, [x0, #32]
	ldp	x6, x7, [x1, #32]
	stnp	x8, x9, [x0, #48]
	ldp	x8, x9, [x1, #48]
	stnp	x10, x11, [x0, #64]
	ldp	x10, x11, [x1, #64]
	stnp	x12, x13, [x0, #80]
	ldp	x12, x13, [x1, #80]
	stnp	x14, x15, [x0, #96]
	ldp	x14, x15, [x1, #96]
	stnp	x16, x17, [x0, #112]
	ldp	x16, x17, [x1, #112]
	add	x0, x0, #128
	add	x1, x1, #128
	b.gt	1b
	/* Store the final 128-byte block. */
	stnp	x2, x3, [x0]
	stnp	x4, x5, [x0, #16]
	stnp	x6, x7, [x0, #32]
	stnp	x8, x9, [x0, #48]
	stnp	x10, x11, [x0, #64]
	stnp	x12, x13, [x0, #80]
	stnp	x14, x15, [x0, #96]
	stnp	x16, x17, [x0, #112]
	ret
ENDPROC(copy_page)
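
For reference, a hypothetical caller sketch, assuming two freshly
allocated, page-aligned kernel pages; the copy_page_demo name is
invented, and real callers normally reach this routine through helpers
such as copy_user_highpage():

  #include <linux/errno.h>
  #include <linux/gfp.h>
  #include <linux/mm.h>
  #include <asm/page.h>

  /* Illustrative only: copy one page between two page-aligned buffers. */
  static int copy_page_demo(void)
  {
          struct page *src = alloc_page(GFP_KERNEL);
          struct page *dst = alloc_page(GFP_KERNEL);
          int ret = -ENOMEM;

          if (src && dst) {
                  copy_page(page_address(dst), page_address(src));
                  ret = 0;
          }
          if (src)
                  __free_page(src);
          if (dst)
                  __free_page(dst);
          return ret;
  }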