mirror of
https://github.com/team-infusion-developers/android_hardware_samsung.git
synced 2024-11-06 21:55:41 +00:00
exynos: multimedia: speed up color conversion from ARGB8888 to YUV420SP
Use NEON instructions for color conversion from ARGB8888 to YUV420SP. This greatly improves performance and can help achieve 30fps+ framerates when dealing with OMX_COLOR_FormatAndroidOpaque during Screen Recording and Screen Casting. Change-Id: Ifdaaf03e1ce6909822df3f046ef35dd977b84d17
This commit is contained in:
parent
e48f7860b9
commit
bc0da3a96d
4 changed files with 374 additions and 1 deletions
|
@ -805,7 +805,7 @@ OMX_BOOL SEC_Preprocessor_InputData(OMX_COMPONENTTYPE *pOMXComponent)
|
|||
SEC_OSAL_GetInfoFromMetaData(inputData, ppBuf);
|
||||
SEC_OSAL_LockANBHandle((OMX_U32)ppBuf[0], width, height, OMX_COLOR_FormatAndroidOpaque, &pOutBuffer);
|
||||
|
||||
csc_ARGB8888_to_YUV420SP(pVideoEnc->MFCEncInputBuffer[pVideoEnc->indexInputBuffer].YVirAddr,
|
||||
csc_ARGB8888_to_YUV420SP_NEON(pVideoEnc->MFCEncInputBuffer[pVideoEnc->indexInputBuffer].YVirAddr,
|
||||
pVideoEnc->MFCEncInputBuffer[pVideoEnc->indexInputBuffer].CVirAddr,
|
||||
pOutBuffer, width, height);
|
||||
|
||||
|
|
|
@ -15,6 +15,7 @@ LOCAL_SRC_FILES := \
|
|||
csc_linear_to_tiled_interleave_crop_neon.s \
|
||||
csc_tiled_to_linear_crop_neon.s \
|
||||
csc_tiled_to_linear_deinterleave_crop_neon.s \
|
||||
csc_ARGB8888_to_YUV420SP_NEON.s \
|
||||
csc_interleave_memcpy_neon.s \
|
||||
csc_fimc.cpp
|
||||
|
||||
|
|
|
@ -411,4 +411,11 @@ void csc_ARGB8888_to_YUV420SP(
|
|||
unsigned int width,
|
||||
unsigned int height);
|
||||
|
||||
/*
 * NEON-accelerated variant of csc_ARGB8888_to_YUV420SP(); it takes the same
 * argument list and is implemented in csc_ARGB8888_to_YUV420SP_NEON.s.
 *
 * @param y_dst    destination for the Y (luma) bytes
 * @param uv_dst   destination for the interleaved U/V (chroma) bytes
 * @param rgb_src  source pixels, 4 bytes per pixel
 * @param width    picture width in pixels
 * @param height   picture height in pixels
 *
 * NOTE(review): the assembly consumes two source rows per pass and two
 * pixels per tail step, so width and height are presumably expected to be
 * even - confirm against callers.
 */
void csc_ARGB8888_to_YUV420SP_NEON(
|
||||
unsigned char *y_dst,
|
||||
unsigned char *uv_dst,
|
||||
unsigned char *rgb_src,
|
||||
unsigned int width,
|
||||
unsigned int height);
|
||||
|
||||
#endif /*COLOR_SPACE_CONVERTOR_H_*/
|
||||
|
|
|
@ -0,0 +1,365 @@
|
|||
|
||||
.arch armv7-a
|
||||
.text
|
||||
.global csc_ARGB8888_to_YUV420SP_NEON
|
||||
.type csc_ARGB8888_to_YUV420SP_NEON, %function
|
||||
@-----------------------------------------------------------------------------
@ Helper macros (emit no code until invoked).
@-----------------------------------------------------------------------------

@ Split the 16 pixels held de-interleaved in d0-d7 (d0/d4 = B, d1/d5 = G,
@ d2/d6 = R, as loaded by vld4.8) into 16-bit lanes holding the EVEN pixels:
@ q4 = R, q5 = G, q6 = B.
.macro CSC_EVEN_RGB16
    vmov.u16    d8,  d2
    vmov.u16    d9,  d6
    vmov.u16    d10, d1
    vmov.u16    d11, d5
    vmov.u16    d12, d0
    vmov.u16    d13, d4
    vand.u16    q4, #0x00FF             @ R of pixels 0,2,4,...
    vand.u16    q5, #0x00FF             @ G of pixels 0,2,4,...
    vand.u16    q6, #0x00FF             @ B of pixels 0,2,4,...
.endm

@ Compute Y for 16 pixels and store them through \dst (post-incremented by 16).
@ In : q4-q6 = even-pixel R/G/B (from CSC_EVEN_RGB16), d0-d7 = raw planes,
@      q14 = 66, q15 = 129, q8 = 25, q10 = 0x1080.
@ Out: clobbers q0, q4-q7.
.macro CSC_STORE_Y16 dst
    vmul.u16    q7, q4, q14             @ Yeven  = 66 * R
    vmla.u16    q7, q5, q15             @ Yeven += 129 * G
    vmla.u16    q7, q6, q8              @ Yeven += 25 * B
    vadd.u16    q7, q7, q10             @ + (16 << 8) + 128
    vshr.u16    q7, q7, #8

    vmov.u16    d8,  d2
    vmov.u16    d9,  d6
    vmov.u16    d10, d1
    vmov.u16    d11, d5
    vmov.u16    d12, d0
    vmov.u16    d13, d4
    vshr.u16    q4, q4, #8              @ R of pixels 1,3,5,...
    vshr.u16    q5, q5, #8              @ G of pixels 1,3,5,...
    vshr.u16    q6, q6, #8              @ B of pixels 1,3,5,...

    vmul.u16    q0, q4, q14             @ Yodd  = 66 * R
    vmla.u16    q0, q5, q15             @ Yodd += 129 * G
    vmla.u16    q0, q6, q8              @ Yodd += 25 * B
    vadd.u16    q0, q0, q10             @ + (16 << 8) + 128
    vshr.u16    q0, q0, #8

    vtrn.8      q7, q0                  @ q7 = Y0,Y1,Y2,... in pixel order
    vst1.8      {q7}, [\dst]!
.endm

@ Scalar tail: load two pixels through \src, compute both Y values packed in
@ one register (pixel 0 in bits 0-15, pixel 1 in bits 16-31; each half stays
@ below 0x10000 so no carry crosses), store two bytes through \dst.
@ In : r6 = 0x10801080.
@ Out: r9 = pixel 0 (kept for the chroma step); clobbers r7, r10-r12, r14.
.macro CSC_Y_PAIR src, dst
    ldr     r9,  [\src], #4             @ pixel 0
    ldr     r12, [\src], #4             @ pixel 1

    ubfx    r10, r9, #16, #8            @ R0
    and     r14, r12, #0x00FF0000       @ R1 << 16
    add     r10, r10, r14
    mov     r11, #66
    mul     r7, r10, r11                @ acc  = 66 * R

    ubfx    r10, r9, #8, #8             @ G0
    mov     r14, r12, lsl #8
    and     r14, r14, #0x00FF0000       @ G1 << 16
    add     r10, r10, r14
    mov     r11, #129
    mla     r7, r10, r11, r7            @ acc += 129 * G

    and     r10, r9, #0x000000FF        @ B0
    mov     r14, r12, lsl #16
    and     r14, r14, #0x00FF0000       @ B1 << 16
    add     r10, r10, r14
    mov     r11, #25
    mla     r7, r10, r11, r7            @ acc += 25 * B

    add     r7, r7, r6                  @ + (16 << 8) + 128 in both halves
    lsr     r7, r7, #8
    strb    r7, [\dst], #1              @ Y of pixel 0
    lsr     r7, r7, #16
    strb    r7, [\dst], #1              @ Y of pixel 1
.endm

csc_ARGB8888_to_YUV420SP_NEON:
    .fnstart

    @-------------------------------------------------------------------------
    @ void csc_ARGB8888_to_YUV420SP_NEON(unsigned char *y_dst,
    @                                    unsigned char *uv_dst,
    @                                    unsigned char *rgb_src,
    @                                    unsigned int   width,
    @                                    unsigned int   height)
    @
    @ Converts 32-bit pixels (byte order in memory: B, G, R, A) to a Y plane
    @ plus an interleaved U,V plane.  Two source rows are consumed per outer
    @ iteration; chroma is taken from the even pixels of the first row only.
    @
    @   Y = ( 66*R + 129*G +  25*B + 0x1080) >> 8
    @   U = (-38*R -  74*G + 112*B + 0x8080) >> 8
    @   V = (112*R -  94*G -  18*B + 0x8080) >> 8
    @
    @ ABI : AAPCS (ARM mode).  r4-r12, lr and d8-d15 are saved and restored.
    @ In  : r0 = y_dst, r1 = uv_dst, r2 = rgb_src, r3 = width,
    @       [sp] = height.
    @ Note: returns immediately when width or height is 0.  Rows are handled
    @       in 16-pixel NEON chunks followed by a 2-pixel scalar tail, and
    @       rows are taken in pairs, so an odd width or height processes one
    @       extra column / row -- callers should pass even dimensions.
    @
    @ Register use:
    @   r0  Y destination, first row of the pair
    @   r1  UV destination
    @   r2  source, first row of the pair
    @   r3  width
    @   r4  Y destination, second row   (r0 + width)
    @   r5  source, second row          (r2 + width*4)
    @   r6  constant 0x10801080 (Y offset + rounding, two packed pixels)
    @   r7  scalar accumulator
    @   r8  pixels left in the current row (biased by -16 in the NEON loop)
    @   r9  scalar: pixel 0
    @   r10 scalar: extracted colour channel(s)
    @   r11 scalar: coefficient
    @   r12 height counter (spilled while a row pair is processed) / temp
    @   r14 scalar: temp
    @
    @   q0-q3   raw B,G,R,A planes from vld4.8 (q0 reused as Y temp)
    @   q4-q6   R, G, B widened to 16 bit
    @   q7, q8  results / temporaries
    @   q9      0x8080  chroma bias + rounding
    @   q10     0x1080  luma offset + rounding
    @   q11-q13 38, 74, 112
    @   q14,q15 94, 18 (chroma stage) then 66, 129 (luma stage)
    @-------------------------------------------------------------------------

    .equ CACHE_LINE_SIZE, 32
    .equ PRE_LOAD_OFFSET, 6

    stmfd   sp!, {r4-r12,r14}           @ 10 words -> stack args start at sp+40
    ldr     r12, [sp, #40]              @ r12 = height
    vpush   {d8-d15}                    @ q4-q7 are callee-saved under AAPCS

    cmp     r3, #0                      @ nothing to do for an empty picture
    cmpne   r12, #0
    beq     CSC_ARGB8888_DONE

    add     r4, r0, r3                  @ r4 = y_dst + width
    add     r5, r2, r3, lsl #2          @ r5 = rgb_src + width*4
    sub     r8, r3, #16                 @ r8 = width - 16
    ldr     r6, =0x10801080             @ scalar Y offset + rounding

    vmov.u16    q9,  #0x8080            @ (128 << 8) + 128
    vmov.u16    q10, #0x1000            @ (16 << 8) ...
    vorr.u16    q10, #0x0080            @ ... + 128
    vmov.u16    q11, #38
    vmov.u16    q12, #74
    vmov.u16    q13, #112

LOOP_NHEIGHT2:
    stmfd   sp!, {r12}                  @ r12 is reused as scratch in the tail

    cmp     r8, #0                      @ fewer than 16 pixels in a row:
    blt     CSC_ARGB8888_TAIL           @ skip the NEON loop entirely

LOOP_NWIDTH16:
    pld     [r2, #(CACHE_LINE_SIZE*PRE_LOAD_OFFSET)]

    @------------------------------ U/V, first row ---------------------------
    vmov.u16    q14, #94
    vmov.u16    q15, #18
    vld4.8      {d0,d1,d2,d3}, [r2]!    @ pixels 0-7  : d0=B d1=G d2=R d3=A
    vld4.8      {d4,d5,d6,d7}, [r2]!    @ pixels 8-15 : d4=B d5=G d6=R d7=A

    CSC_EVEN_RGB16                      @ q4=R q5=G q6=B (even pixels)

    vmov.u16    q8, q9                  @ U  = 0x8080
    vmla.u16    q8, q6, q13             @ U += 112 * B
    vmls.u16    q8, q4, q11             @ U -= 38 * R
    vmls.u16    q8, q5, q12             @ U -= 74 * G
    vshr.u16    q8, q8, #8

    vmov.u16    q7, q9                  @ V  = 0x8080
    vmla.u16    q7, q4, q13             @ V += 112 * R
    vmls.u16    q7, q5, q14             @ V -= 94 * G
    vmls.u16    q7, q6, q15             @ V -= 18 * B
    vshr.u16    q7, q7, #8

    vtrn.8      q8, q7                  @ q8 = U0,V0,U1,V1,...
    vst1.8      {q8}, [r1]!

    @------------------------------ Y, first row -----------------------------
    vmov.u16    q14, #66
    vmov.u16    q15, #129
    vmov.u16    q8,  #25

    CSC_STORE_Y16 r0                    @ q4-q6 still hold the even pixels

    @------------------------------ Y, second row ----------------------------
    pld     [r5, #(CACHE_LINE_SIZE*PRE_LOAD_OFFSET)]
    vld4.8      {d0,d1,d2,d3}, [r5]!
    vld4.8      {d4,d5,d6,d7}, [r5]!

    CSC_EVEN_RGB16
    CSC_STORE_Y16 r4

    subs    r8, r8, #16
    bpl     LOOP_NWIDTH16               @ at least 16 more pixels in this row

    @------------------------------ scalar tail ------------------------------
CSC_ARGB8888_TAIL:
    adds    r8, r8, #16                 @ r8 = pixels left (0..15)
    beq     NO_UNALIGNED                @ width was a multiple of 16

LOOP_NWIDTH2:
    CSC_Y_PAIR r2, r0                   @ first row: 2 x Y, r9 = pixel 0

    @ U/V from pixel 0 of the first row
    ldr     r7, =0x00008080             @ U accumulator = bias + rounding
    mov     r12, r7                     @ V accumulator = bias + rounding

    and     r10, r9, #0x000000FF        @ B
    mov     r11, #112
    mla     r7, r10, r11, r7            @ U += 112 * B
    mov     r11, #18
    mls     r12, r10, r11, r12          @ V -= 18 * B

    ubfx    r10, r9, #16, #8            @ R
    mov     r11, #38
    mls     r7, r10, r11, r7            @ U -= 38 * R
    mov     r11, #112
    mla     r12, r10, r11, r12          @ V += 112 * R

    ubfx    r10, r9, #8, #8             @ G
    mov     r11, #74
    mls     r7, r10, r11, r7            @ U -= 74 * G
    mov     r11, #94
    mls     r12, r10, r11, r12          @ V -= 94 * G

    lsr     r7, r7, #8
    strb    r7, [r1], #1                @ U
    lsr     r12, r12, #8
    strb    r12, [r1], #1               @ V

    CSC_Y_PAIR r5, r4                   @ second row: 2 x Y

    subs    r8, r8, #2
    bgt     LOOP_NWIDTH2

NO_UNALIGNED:
    @ r0/r2 now point at the second row of the pair and r4/r5 at the row
    @ after it; skipping one more row each moves them to the next pair.
    sub     r8, r3, #16                 @ r8 = width - 16
    add     r0, r0, r3
    add     r2, r2, r3, lsl #2
    add     r4, r4, r3
    add     r5, r5, r3, lsl #2

    ldmfd   sp!, {r12}                  @ restore height counter
    subs    r12, r12, #2
    bgt     LOOP_NHEIGHT2

CSC_ARGB8888_DONE:
    vpop    {d8-d15}
    ldmfd   sp!, {r4-r12,pc}            @ restore registers and return

    .ltorg                              @ keep the literals next to the code
    .fnend
    .size csc_ARGB8888_to_YUV420SP_NEON, .-csc_ARGB8888_to_YUV420SP_NEON
|
Loading…
Reference in a new issue