VP8 for ARMv8 by using NEON intrinsics 17
Add vp8_subpixelvariance_neon.c
- vp8_sub_pixel_variance16x16_neon_func
- vp8_variance_halfpixvar16x16_h_neon
- vp8_variance_halfpixvar16x16_v_neon
- vp8_variance_halfpixvar16x16_hv_neon
- vp8_sub_pixel_variance8x8_neon
Change-Id: I3e5d85b2eafc26be0eef6a777789b80e4579257b
Signed-off-by: James Yu <james.yu@linaro.org>
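The new C file maps each of these assembly kernels onto NEON intrinsics. As a rough illustration of the pattern (a minimal sketch, not the committed vp8_subpixelvariance_neon.c; the function name halfpixvar16x16_h_sketch and the plain row loop are assumptions made here), the horizontal half-pel kernel becomes:

/* Sketch of the half-pel horizontal variance kernel with NEON
 * intrinsics: average each pixel with its right neighbour, take the
 * difference against the reference block, and accumulate sum and
 * sum-of-squares in vector lanes. Illustration only. */
#include <arm_neon.h>
#include <stdint.h>

unsigned int halfpixvar16x16_h_sketch(const unsigned char *src_ptr,
                                      int src_pixels_per_line,
                                      const unsigned char *ref_ptr,
                                      int ref_pixels_per_line,
                                      unsigned int *sse) {
  int32x4_t sum_v = vdupq_n_s32(0);
  int32x4_t sse_v = vdupq_n_s32(0);
  int i;

  for (i = 0; i < 16; i++) {
    /* src_ptr[0..15] and src_ptr[1..16] for the half-pel average */
    uint8x16_t s0 = vld1q_u8(src_ptr);
    uint8x16_t s1 = vld1q_u8(src_ptr + 1);
    uint8x16_t avg = vrhaddq_u8(s0, s1);  /* (a + b + 1) >> 1 */
    uint8x16_t ref = vld1q_u8(ref_ptr);

    /* widen the differences to signed 16 bits */
    int16x8_t d_lo = vreinterpretq_s16_u16(
        vsubl_u8(vget_low_u8(avg), vget_low_u8(ref)));
    int16x8_t d_hi = vreinterpretq_s16_u16(
        vsubl_u8(vget_high_u8(avg), vget_high_u8(ref)));

    sum_v = vpadalq_s16(sum_v, d_lo);  /* sum += diff */
    sum_v = vpadalq_s16(sum_v, d_hi);
    sse_v = vmlal_s16(sse_v, vget_low_s16(d_lo), vget_low_s16(d_lo));
    sse_v = vmlal_s16(sse_v, vget_high_s16(d_lo), vget_high_s16(d_lo));
    sse_v = vmlal_s16(sse_v, vget_low_s16(d_hi), vget_low_s16(d_hi));
    sse_v = vmlal_s16(sse_v, vget_high_s16(d_hi), vget_high_s16(d_hi));

    src_ptr += src_pixels_per_line;
    ref_ptr += ref_pixels_per_line;
  }

  /* horizontal reductions of the lane accumulators */
  {
    int64x2_t sum_p = vpaddlq_s32(sum_v);
    int64x2_t sse_p = vpaddlq_s32(sse_v);
    int64_t sum = vgetq_lane_s64(sum_p, 0) + vgetq_lane_s64(sum_p, 1);
    int64_t sse64 = vgetq_lane_s64(sse_p, 0) + vgetq_lane_s64(sse_p, 1);

    *sse = (unsigned int)sse64;
    /* variance = sse - sum^2 / (16 * 16) */
    return (unsigned int)(sse64 - ((sum * sum) >> 8));
  }
}

vrhaddq_u8 computes the same rounded average, (a + b + 1) >> 1, as the vrhadd.u8 in the assembly below, and the final subtraction implements variance = sse - sum*sum/256 for a 16x16 block.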
;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS. All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

    EXPORT  |vp8_variance_halfpixvar16x16_h_neon|
    EXPORT  |vp8_variance_halfpixvar16x16_v_neon|
    EXPORT  |vp8_variance_halfpixvar16x16_hv_neon|
    EXPORT  |vp8_sub_pixel_variance16x16s_neon|

    ARM
    REQUIRE8
    PRESERVE8

    AREA ||.text||, CODE, READONLY, ALIGN=2
;================================================
;unsigned int vp8_variance_halfpixvar16x16_h_neon
;(
;   unsigned char *src_ptr,          r0
;   int src_pixels_per_line,         r1
;   unsigned char *dst_ptr,          r2
;   int dst_pixels_per_line,         r3
;   unsigned int *sse
;);
;================================================
|vp8_variance_halfpixvar16x16_h_neon| PROC
    push            {lr}
    vpush           {d8-d15}

    mov             r12, #4                 ;loop counter
    ldr             lr, [sp, #68]           ;load *sse from stack
    vmov.i8         q8, #0                  ;q8 - sum
    vmov.i8         q9, #0                  ;q9, q10 - sse
    vmov.i8         q10, #0

;First Pass: output_height lines x output_width columns (16x16)
vp8_filt_fpo16x16s_4_0_loop_neon
    vld1.u8         {d0, d1, d2, d3}, [r0], r1      ;load src data
    vld1.8          {q11}, [r2], r3
    vld1.u8         {d4, d5, d6, d7}, [r0], r1
    vld1.8          {q12}, [r2], r3
    vld1.u8         {d8, d9, d10, d11}, [r0], r1
    vld1.8          {q13}, [r2], r3
    vld1.u8         {d12, d13, d14, d15}, [r0], r1

    ;pld            [r0]
    ;pld            [r0, r1]
    ;pld            [r0, r1, lsl #1]

    vext.8          q1, q0, q1, #1          ;construct src_ptr[1]
    vext.8          q3, q2, q3, #1
    vext.8          q5, q4, q5, #1
    vext.8          q7, q6, q7, #1
    vrhadd.u8       q0, q0, q1              ;rounded average: (src_ptr[0] + src_ptr[1] + 1) >> 1
    vld1.8          {q14}, [r2], r3
    vrhadd.u8       q1, q2, q3
    vrhadd.u8       q2, q4, q5
    vrhadd.u8       q3, q6, q7
    vsubl.u8        q4, d0, d22             ;diff
    vsubl.u8        q5, d1, d23
    vsubl.u8        q6, d2, d24
    vsubl.u8        q7, d3, d25
    vsubl.u8        q0, d4, d26
    vsubl.u8        q1, d5, d27
    vsubl.u8        q2, d6, d28
    vsubl.u8        q3, d7, d29

    vpadal.s16      q8, q4                  ;sum
    vmlal.s16       q9, d8, d8              ;sse
    vmlal.s16       q10, d9, d9

    subs            r12, r12, #1

    vpadal.s16      q8, q5
    vmlal.s16       q9, d10, d10
    vmlal.s16       q10, d11, d11
    vpadal.s16      q8, q6
    vmlal.s16       q9, d12, d12
    vmlal.s16       q10, d13, d13
    vpadal.s16      q8, q7
    vmlal.s16       q9, d14, d14
    vmlal.s16       q10, d15, d15

    vpadal.s16      q8, q0                  ;sum
    vmlal.s16       q9, d0, d0              ;sse
    vmlal.s16       q10, d1, d1
    vpadal.s16      q8, q1
    vmlal.s16       q9, d2, d2
    vmlal.s16       q10, d3, d3
    vpadal.s16      q8, q2
    vmlal.s16       q9, d4, d4
    vmlal.s16       q10, d5, d5
    vpadal.s16      q8, q3
    vmlal.s16       q9, d6, d6
    vmlal.s16       q10, d7, d7

    bne             vp8_filt_fpo16x16s_4_0_loop_neon
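
    ;variance = sse - sum*sum/(16*16); the vshr by #8 below divides the
    ;squared sum by 256, the number of pixels in a 16x16 block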
    vadd.u32        q10, q9, q10            ;accumulate sse
    vpaddl.s32      q0, q8                  ;accumulate sum
    vpaddl.u32      q1, q10

    vadd.s64        d0, d0, d1
    vadd.u64        d1, d2, d3

    vmull.s32       q5, d0, d0
    vst1.32         {d1[0]}, [lr]           ;store sse
    vshr.u32        d10, d10, #8
    vsub.u32        d0, d1, d10

    vmov.32         r0, d0[0]               ;return
    vpop            {d8-d15}
    pop             {pc}

    ENDP
;================================================
;unsigned int vp8_variance_halfpixvar16x16_v_neon
;(
;   unsigned char *src_ptr,          r0
;   int src_pixels_per_line,         r1
;   unsigned char *dst_ptr,          r2
;   int dst_pixels_per_line,         r3
;   unsigned int *sse
;);
;================================================
|vp8_variance_halfpixvar16x16_v_neon| PROC
    push            {lr}
    vpush           {d8-d15}

    mov             r12, #4                 ;loop counter

    vld1.u8         {q0}, [r0], r1          ;load src data
    ldr             lr, [sp, #68]           ;load *sse from stack

    vmov.i8         q8, #0                  ;q8 - sum
    vmov.i8         q9, #0                  ;q9, q10 - sse
    vmov.i8         q10, #0

vp8_filt_spo16x16s_0_4_loop_neon
    vld1.u8         {q2}, [r0], r1
    vld1.8          {q1}, [r2], r3
    vld1.u8         {q4}, [r0], r1
    vld1.8          {q3}, [r2], r3
    vld1.u8         {q6}, [r0], r1
    vld1.8          {q5}, [r2], r3
    vld1.u8         {q15}, [r0], r1
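
    ;q0 still holds the previous row, so averaging it with q2 gives the
    ;vertical half-pel value without reloading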
    vrhadd.u8       q0, q0, q2
    vld1.8          {q7}, [r2], r3