Commit cf6beea6 authored by hkuang's avatar hkuang
Browse files

Add neon optimize vp9_short_idct16x16_add.

Change-Id: I27134b9a5cace2bdad53534562c91d829b48838d
parent 2c091f97
/*
* Copyright (c) 2013 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "./vp9_rtcd.h"
#include "vp9/common/vp9_common.h"
extern void vp9_short_idct16x16_add_neon_pass1(int16_t *input,
int16_t *output,
int output_stride);
extern void vp9_short_idct16x16_add_neon_pass2(int16_t *src,
int16_t *output,
int16_t *pass1Output,
int16_t skip_adding,
uint8_t *dest,
int dest_stride);
extern void save_registers();
extern void restore_registers();
void vp9_short_idct16x16_add_neon(int16_t *input,
uint8_t *dest, int dest_stride) {
int16_t pass1_output[16*16] = {0};
int16_t row_idct_output[16*16] = {0};
// save d8-d15 register values.
save_registers();
/* Parallel idct on the upper 8 rows */
// First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
// stage 6 result in pass1_output.
vp9_short_idct16x16_add_neon_pass1(input, pass1_output, 8);
// Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
// with result in pass1(pass1_output) to calculate final result in stage 7
// which will be saved into row_idct_output.
vp9_short_idct16x16_add_neon_pass2(input+1,
row_idct_output,
pass1_output,
0,
dest,
dest_stride);
/* Parallel idct on the lower 8 rows */
// First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
// stage 6 result in pass1_output.
vp9_short_idct16x16_add_neon_pass1(input+8*16, pass1_output, 8);
// Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
// with result in pass1(pass1_output) to calculate final result in stage 7
// which will be saved into row_idct_output.
vp9_short_idct16x16_add_neon_pass2(input+8*16+1,
row_idct_output+8,
pass1_output,
0,
dest,
dest_stride);
/* Parallel idct on the left 8 columns */
// First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
// stage 6 result in pass1_output.
vp9_short_idct16x16_add_neon_pass1(row_idct_output, pass1_output, 8);
// Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
// with result in pass1(pass1_output) to calculate final result in stage 7.
// Then add the result to the destination data.
vp9_short_idct16x16_add_neon_pass2(row_idct_output+1,
row_idct_output,
pass1_output,
1,
dest,
dest_stride);
/* Parallel idct on the right 8 columns */
// First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
// stage 6 result in pass1_output.
vp9_short_idct16x16_add_neon_pass1(row_idct_output+8*16, pass1_output, 8);
// Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
// with result in pass1(pass1_output) to calculate final result in stage 7.
// Then add the result to the destination data.
vp9_short_idct16x16_add_neon_pass2(row_idct_output+8*16+1,
row_idct_output+8,
pass1_output,
1,
dest+8,
dest_stride);
// restore d8-d15 register values.
restore_registers();
return;
}
This diff is collapsed.
......@@ -313,7 +313,7 @@ prototype void vp9_short_idct16x16_1_add "int16_t *input, uint8_t *dest, int des
specialize vp9_short_idct16x16_1_add sse2
prototype void vp9_short_idct16x16_add "int16_t *input, uint8_t *dest, int dest_stride"
specialize vp9_short_idct16x16_add sse2
specialize vp9_short_idct16x16_add sse2 neon
prototype void vp9_short_idct10_16x16_add "int16_t *input, uint8_t *dest, int dest_stride"
specialize vp9_short_idct10_16x16_add sse2
......
......@@ -89,11 +89,13 @@ endif
VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_idct_intrin_sse2.c
VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_convolve_neon.c
VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_idct16x16_neon.c
VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_convolve8_neon$(ASM)
VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_convolve8_avg_neon$(ASM)
VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_loopfilter_neon$(ASM)
VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_dc_only_idct_add_neon$(ASM)
VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_short_idct8x8_add_neon$(ASM)
VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_short_idct16x16_add_neon$(ASM)
VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_mb_lpf_neon$(ASM)
$(eval $(call rtcd_h_template,vp9_rtcd,vp9/common/vp9_rtcd_defs.sh))
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment