Skip to content
Snippets Groups Projects
Commit 55c95f2d authored by James Yu's avatar James Yu Committed by Gerrit Code Review
Browse files

VP8 for ARMv8 by using NEON intrinsics 09


Add loopfiltersimplehorizontaledge_neon.c
- vp8_loop_filter_bhs_neon
- vp8_loop_filter_mbhs_neon

Change-Id: I77f9721b20585da8bf3869a3850ff0ae4b4bfeea
Signed-off-by: default avatarJames Yu <james.yu@linaro.org>
parent cf2262c4
No related branches found
No related tags found
No related merge requests found
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vp8_loop_filter_bhs_neon|
EXPORT |vp8_loop_filter_mbhs_neon|
ARM
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
; r0 unsigned char *s, PRESERVE
; r1 int p, PRESERVE
; q1 limit, PRESERVE
|vp8_loop_filter_simple_horizontal_edge_neon| PROC
vpush {d8-d15}
sub r3, r0, r1, lsl #1 ; move src pointer down by 2 lines
vld1.u8 {q7}, [r0@128], r1 ; q0
vld1.u8 {q5}, [r3@128], r1 ; p0
vld1.u8 {q8}, [r0@128] ; q1
vld1.u8 {q6}, [r3@128] ; p1
vabd.u8 q15, q6, q7 ; abs(p0 - q0)
vabd.u8 q14, q5, q8 ; abs(p1 - q1)
vqadd.u8 q15, q15, q15 ; abs(p0 - q0) * 2
vshr.u8 q14, q14, #1 ; abs(p1 - q1) / 2
vmov.u8 q0, #0x80 ; 0x80
vmov.s16 q13, #3
vqadd.u8 q15, q15, q14 ; abs(p0 - q0) * 2 + abs(p1 - q1) / 2
veor q7, q7, q0 ; qs0: q0 offset to convert to a signed value
veor q6, q6, q0 ; ps0: p0 offset to convert to a signed value
veor q5, q5, q0 ; ps1: p1 offset to convert to a signed value
veor q8, q8, q0 ; qs1: q1 offset to convert to a signed value
vcge.u8 q15, q1, q15 ; (abs(p0 - q0)*2 + abs(p1-q1)/2 > limit)*-1
vsubl.s8 q2, d14, d12 ; ( qs0 - ps0)
vsubl.s8 q3, d15, d13
vqsub.s8 q4, q5, q8 ; q4: vp8_filter = vp8_signed_char_clamp(ps1-qs1)
vmul.s16 q2, q2, q13 ; 3 * ( qs0 - ps0)
vmul.s16 q3, q3, q13
vmov.u8 q10, #0x03 ; 0x03
vmov.u8 q9, #0x04 ; 0x04
vaddw.s8 q2, q2, d8 ; vp8_filter + 3 * ( qs0 - ps0)
vaddw.s8 q3, q3, d9
vqmovn.s16 d8, q2 ; vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0))
vqmovn.s16 d9, q3
vand q14, q4, q15 ; vp8_filter &= mask
vqadd.s8 q2, q14, q10 ; Filter2 = vp8_signed_char_clamp(vp8_filter+3)
vqadd.s8 q3, q14, q9 ; Filter1 = vp8_signed_char_clamp(vp8_filter+4)
vshr.s8 q2, q2, #3 ; Filter2 >>= 3
vshr.s8 q4, q3, #3 ; Filter1 >>= 3
sub r0, r0, r1
;calculate output
vqadd.s8 q11, q6, q2 ; u = vp8_signed_char_clamp(ps0 + Filter2)
vqsub.s8 q10, q7, q4 ; u = vp8_signed_char_clamp(qs0 - Filter1)
veor q6, q11, q0 ; *op0 = u^0x80
veor q7, q10, q0 ; *oq0 = u^0x80
vst1.u8 {q6}, [r3@128] ; store op0
vst1.u8 {q7}, [r0@128] ; store oq0
vpop {d8-d15}
bx lr
ENDP ; |vp8_loop_filter_simple_horizontal_edge_neon|
; r0 unsigned char *y
; r1 int ystride
; r2 const unsigned char *blimit
|vp8_loop_filter_bhs_neon| PROC
push {r4, lr}
ldrb r3, [r2] ; load blim from mem
vdup.s8 q1, r3 ; duplicate blim
add r0, r0, r1, lsl #2 ; src = y_ptr + 4 * y_stride
bl vp8_loop_filter_simple_horizontal_edge_neon
; vp8_loop_filter_simple_horizontal_edge_neon preserves r0, r1 and q1
add r0, r0, r1, lsl #2 ; src = y_ptr + 8* y_stride
bl vp8_loop_filter_simple_horizontal_edge_neon
add r0, r0, r1, lsl #2 ; src = y_ptr + 12 * y_stride
pop {r4, lr}
b vp8_loop_filter_simple_horizontal_edge_neon
ENDP ;|vp8_loop_filter_bhs_neon|
; r0 unsigned char *y
; r1 int ystride
; r2 const unsigned char *blimit
|vp8_loop_filter_mbhs_neon| PROC
ldrb r3, [r2] ; load blim from mem
vdup.s8 q1, r3 ; duplicate mblim
b vp8_loop_filter_simple_horizontal_edge_neon
ENDP ;|vp8_loop_filter_bhs_neon|
END
/*
* Copyright (c) 2014 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include <arm_neon.h>
static inline void vp8_loop_filter_simple_horizontal_edge_neon(
unsigned char *s,
int p,
const unsigned char *blimit) {
uint8_t *sp;
uint8x16_t qblimit, q0u8;
uint8x16_t q5u8, q6u8, q7u8, q8u8, q9u8, q10u8, q14u8, q15u8;
int16x8_t q2s16, q3s16, q13s16;
int8x8_t d8s8, d9s8;
int8x16_t q2s8, q3s8, q4s8, q10s8, q11s8, q14s8;
qblimit = vdupq_n_u8(*blimit);
sp = s - (p << 1);
q5u8 = vld1q_u8(sp);
sp += p;
q6u8 = vld1q_u8(sp);
sp += p;
q7u8 = vld1q_u8(sp);
sp += p;
q8u8 = vld1q_u8(sp);
q15u8 = vabdq_u8(q6u8, q7u8);
q14u8 = vabdq_u8(q5u8, q8u8);
q15u8 = vqaddq_u8(q15u8, q15u8);
q14u8 = vshrq_n_u8(q14u8, 1);
q0u8 = vdupq_n_u8(0x80);
q13s16 = vdupq_n_s16(3);
q15u8 = vqaddq_u8(q15u8, q14u8);
q5u8 = veorq_u8(q5u8, q0u8);
q6u8 = veorq_u8(q6u8, q0u8);
q7u8 = veorq_u8(q7u8, q0u8);
q8u8 = veorq_u8(q8u8, q0u8);
q15u8 = vcgeq_u8(qblimit, q15u8);
q2s16 = vsubl_s8(vget_low_s8(vreinterpretq_s8_u8(q7u8)),
vget_low_s8(vreinterpretq_s8_u8(q6u8)));
q3s16 = vsubl_s8(vget_high_s8(vreinterpretq_s8_u8(q7u8)),
vget_high_s8(vreinterpretq_s8_u8(q6u8)));
q4s8 = vqsubq_s8(vreinterpretq_s8_u8(q5u8),
vreinterpretq_s8_u8(q8u8));
q2s16 = vmulq_s16(q2s16, q13s16);
q3s16 = vmulq_s16(q3s16, q13s16);
q10u8 = vdupq_n_u8(3);
q9u8 = vdupq_n_u8(4);
q2s16 = vaddw_s8(q2s16, vget_low_s8(q4s8));
q3s16 = vaddw_s8(q3s16, vget_high_s8(q4s8));
d8s8 = vqmovn_s16(q2s16);
d9s8 = vqmovn_s16(q3s16);
q4s8 = vcombine_s8(d8s8, d9s8);
q14s8 = vandq_s8(q4s8, vreinterpretq_s8_u8(q15u8));
q2s8 = vqaddq_s8(q14s8, vreinterpretq_s8_u8(q10u8));
q3s8 = vqaddq_s8(q14s8, vreinterpretq_s8_u8(q9u8));
q2s8 = vshrq_n_s8(q2s8, 3);
q3s8 = vshrq_n_s8(q3s8, 3);
q11s8 = vqaddq_s8(vreinterpretq_s8_u8(q6u8), q2s8);
q10s8 = vqsubq_s8(vreinterpretq_s8_u8(q7u8), q3s8);
q6u8 = veorq_u8(vreinterpretq_u8_s8(q11s8), q0u8);
q7u8 = veorq_u8(vreinterpretq_u8_s8(q10s8), q0u8);
vst1q_u8(s, q7u8);
s -= p;
vst1q_u8(s, q6u8);
return;
}
void vp8_loop_filter_bhs_neon(
unsigned char *y_ptr,
int y_stride,
const unsigned char *blimit) {
y_ptr += y_stride * 4;
vp8_loop_filter_simple_horizontal_edge_neon(y_ptr, y_stride, blimit);
y_ptr += y_stride * 4;
vp8_loop_filter_simple_horizontal_edge_neon(y_ptr, y_stride, blimit);
y_ptr += y_stride * 4;
vp8_loop_filter_simple_horizontal_edge_neon(y_ptr, y_stride, blimit);
return;
}
void vp8_loop_filter_mbhs_neon(
unsigned char *y_ptr,
int y_stride,
const unsigned char *blimit) {
vp8_loop_filter_simple_horizontal_edge_neon(y_ptr, y_stride, blimit);
return;
}
......@@ -159,7 +159,6 @@ VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/vp8_variance_halfpixvar16x16_
VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/vp8_variance_halfpixvar16x16_hv_armv6$(ASM)
# common (neon)
VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/loopfiltersimplehorizontaledge_neon$(ASM)
VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/loopfiltersimpleverticaledge_neon$(ASM)
VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/mbloopfilter_neon$(ASM)
VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/shortidct4x4llm_neon$(ASM)
......@@ -186,6 +185,7 @@ VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/dequantizeb_neon.c
VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/idct_dequant_full_2x_neon.c
VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/iwalsh_neon.c
VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/loopfilter_neon.c
VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/loopfiltersimplehorizontaledge_neon.c
$(eval $(call rtcd_h_template,vp8_rtcd,vp8/common/rtcd_defs.pl))
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment