From 36e9b82080605f8f015ad1bf388b00cb7f83f325 Mon Sep 17 00:00:00 2001 From: hkuang <hkuang@google.com> Date: Mon, 26 Aug 2013 16:28:57 -0700 Subject: [PATCH] Add neon optimize vp9_short_idct8x8_1_add. Change-Id: I0b15d5e3b0eb97abb9ab5ec08e88b61f8723aaf4 --- .../arm/neon/vp9_short_idct8x8_1_add_neon.asm | 88 +++++++++++++++++++ vp9/common/vp9_rtcd_defs.sh | 2 +- vp9/vp9_common.mk | 1 + 3 files changed, 90 insertions(+), 1 deletion(-) create mode 100644 vp9/common/arm/neon/vp9_short_idct8x8_1_add_neon.asm diff --git a/vp9/common/arm/neon/vp9_short_idct8x8_1_add_neon.asm b/vp9/common/arm/neon/vp9_short_idct8x8_1_add_neon.asm new file mode 100644 index 0000000000..923804f90a --- /dev/null +++ b/vp9/common/arm/neon/vp9_short_idct8x8_1_add_neon.asm @@ -0,0 +1,88 @@ +; +; Copyright (c) 2013 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. +; + + + EXPORT |vp9_short_idct8x8_1_add_neon| + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 + +;void vp9_short_idct8x8_1_add_neon(int16_t *input, uint8_t *dest, +; int dest_stride) +; +; r0 int16_t input +; r1 uint8_t *dest +; r2 int dest_stride) + +|vp9_short_idct8x8_1_add_neon| PROC + ldrsh r0, [r0] + + ; generate cospi_16_64 = 11585 + mov r12, #0x2d00 + add r12, #0x41 + + ; out = dct_const_round_shift(input[0] * cospi_16_64) + mul r0, r0, r12 ; input[0] * cospi_16_64 + add r0, r0, #0x2000 ; +(1 << ((DCT_CONST_BITS) - 1)) + asr r0, r0, #14 ; >> DCT_CONST_BITS + + ; out = dct_const_round_shift(out * cospi_16_64) + mul r0, r0, r12 ; out * cospi_16_64 + mov r12, r1 ; save dest + add r0, r0, #0x2000 ; +(1 << ((DCT_CONST_BITS) - 1)) + asr r0, r0, #14 ; >> DCT_CONST_BITS + + ; a1 = ROUND_POWER_OF_TWO(out, 5) + add r0, r0, #16 ; + (1 <<((5) - 1)) + asr r0, r0, #5 ; >> 5 + + vdup.s16 q0, r0 ; duplicate a1 + + ; load destination data + vld1.64 {d2}, [r1], r2 + vld1.64 {d3}, [r1], r2 + vld1.64 {d4}, [r1], r2 + vld1.64 {d5}, [r1], r2 + vld1.64 {d6}, [r1], r2 + vld1.64 {d7}, [r1], r2 + vld1.64 {d16}, [r1], r2 + vld1.64 {d17}, [r1] + + vaddw.u8 q9, q0, d2 ; dest[x] + a1 + vaddw.u8 q10, q0, d3 ; dest[x] + a1 + vaddw.u8 q11, q0, d4 ; dest[x] + a1 + vaddw.u8 q12, q0, d5 ; dest[x] + a1 + vqmovun.s16 d2, q9 ; clip_pixel + vqmovun.s16 d3, q10 ; clip_pixel + vqmovun.s16 d30, q11 ; clip_pixel + vqmovun.s16 d31, q12 ; clip_pixel + vst1.64 {d2}, [r12], r2 + vst1.64 {d3}, [r12], r2 + vst1.64 {d30}, [r12], r2 + vst1.64 {d31}, [r12], r2 + + vaddw.u8 q9, q0, d6 ; dest[x] + a1 + vaddw.u8 q10, q0, d7 ; dest[x] + a1 + vaddw.u8 q11, q0, d16 ; dest[x] + a1 + vaddw.u8 q12, q0, d17 ; dest[x] + a1 + vqmovun.s16 d2, q9 ; clip_pixel + vqmovun.s16 d3, q10 ; clip_pixel + vqmovun.s16 d30, q11 ; clip_pixel + vqmovun.s16 d31, q12 ; clip_pixel + vst1.64 {d2}, [r12], r2 + vst1.64 {d3}, [r12], r2 + vst1.64 {d30}, [r12], r2 + vst1.64 {d31}, [r12], r2 + + bx lr + ENDP ; |vp9_short_idct8x8_1_add_neon| + + END diff --git a/vp9/common/vp9_rtcd_defs.sh b/vp9/common/vp9_rtcd_defs.sh index 30c1b26d0a..9f921acf42 100644 --- a/vp9/common/vp9_rtcd_defs.sh +++ b/vp9/common/vp9_rtcd_defs.sh @@ -301,7 +301,7 @@ prototype void vp9_short_idct4x4_add "int16_t *input, uint8_t *dest, int dest_st specialize vp9_short_idct4x4_add sse2 neon prototype void vp9_short_idct8x8_1_add "int16_t *input, uint8_t *dest, int dest_stride" -specialize vp9_short_idct8x8_1_add sse2 +specialize vp9_short_idct8x8_1_add sse2 neon prototype void vp9_short_idct8x8_add "int16_t *input, uint8_t *dest, int dest_stride" specialize vp9_short_idct8x8_add sse2 neon diff --git a/vp9/vp9_common.mk b/vp9/vp9_common.mk index d5692efb18..299da8a5f1 100644 --- a/vp9/vp9_common.mk +++ b/vp9/vp9_common.mk @@ -98,6 +98,7 @@ VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_loopfilter_neon$(ASM) VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_dc_only_idct_add_neon$(ASM) VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_short_idct4x4_1_add_neon$(ASM) VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_short_idct4x4_add_neon$(ASM) +VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_short_idct8x8_1_add_neon$(ASM) VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_short_idct8x8_add_neon$(ASM) VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_short_idct16x16_add_neon$(ASM) VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_mb_lpf_neon$(ASM) -- GitLab