Commit 6a501462 authored by Christian Duvivier
Browse files

First draft of vp9_short_idct32x32_add_neon.

Lots of TODOs, which will be taken care of in upcoming changes. As is,
about 6x faster than the C version.

Change-Id: Ie2557b72fd2d8edca376dbf400a4d173aa5e63e0
parent 23845947
......@@ -29,8 +29,8 @@ extern void vp9_short_idct10_16x16_add_neon_pass2(int16_t *src,
int16_t skip_adding,
uint8_t *dest,
int dest_stride);
extern void save_registers();
extern void restore_registers();
extern void save_neon_registers();
extern void restore_neon_registers();
void vp9_short_idct16x16_add_neon(int16_t *input,
......@@ -39,7 +39,7 @@ void vp9_short_idct16x16_add_neon(int16_t *input,
int16_t row_idct_output[16*16] = {0};
// save d8-d15 register values.
save_registers();
save_neon_registers();
/* Parallel idct on the upper 8 rows */
// First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
......@@ -102,7 +102,7 @@ void vp9_short_idct16x16_add_neon(int16_t *input,
dest_stride);
// restore d8-d15 register values.
restore_registers();
restore_neon_registers();
return;
}
......@@ -113,7 +113,7 @@ void vp9_short_idct10_16x16_add_neon(int16_t *input,
int16_t row_idct_output[16*16] = {0};
// save d8-d15 register values.
save_registers();
save_neon_registers();
/* Parallel idct on the upper 8 rows */
// First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
......@@ -163,7 +163,7 @@ void vp9_short_idct10_16x16_add_neon(int16_t *input,
dest_stride);
// restore d8-d15 register values.
restore_registers();
restore_neon_registers();
return;
}
/*
* Copyright (c) 2013 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "vp9/common/vp9_common.h"
// defined in vp9/common/arm/neon/vp9_short_idct32x32_add_neon.asm
extern void idct32_transpose_and_transform(int16_t *transpose_buffer,
int16_t *output, int16_t *input);
extern void idct32_combine_add(uint8_t *dest, int16_t *out, int dest_stride);
// defined in vp9/common/arm/neon/vp9_short_idct16x16_add_neon.asm
extern void save_neon_registers();
extern void restore_neon_registers();
// NEON entry point for the 32x32 inverse transform with add-to-destination.
//
// The work is delegated to assembly helpers: one transpose-and-transform
// pass over the rows of `input`, a second identical pass over the result
// (the columns), and a final step that combines the output into `dest`.
//
//   input       - coefficient block read by the first pass
//   dest        - destination pixels updated by idct32_combine_add()
//   dest_stride - stride of `dest`, forwarded unchanged to the helper
//
// TODO(cd): move the creation of these buffers within the ASM file
void vp9_short_idct32x32_add_neon(int16_t *input, uint8_t *dest,
                                  int dest_stride) {
  enum { kBlockDim = 32, kStripeLines = 8 };

  // Scratch area that 8 lines are transposed into before being transformed.
  int16_t stripe_scratch[kBlockDim * kStripeLines];
  // Output of the first pass (rows transposed and transformed).
  int16_t row_pass_out[kBlockDim * kBlockDim];
  // Output of the second pass (columns transposed and transformed).
  int16_t col_pass_out[kBlockDim * kBlockDim];

  // Save the registers that have to be preserved across this function.
  // Every save must be balanced by the restore at the end.
  save_neon_registers();

  // First pass: rows.
  idct32_transpose_and_transform(stripe_scratch, row_pass_out, input);

  // Second pass: columns, fed with the output of the row pass.
  // TODO(cd): do these two steps/passes within the ASM file
  idct32_transpose_and_transform(stripe_scratch, col_pass_out, row_pass_out);

  // Combine the second-pass output and add it to dest.
  // TODO(cd): integrate this within the last storage step of the second pass
  idct32_combine_add(dest, col_pass_out, dest_stride);

  // Restore the registers saved on entry.
  restore_neon_registers();
}
// TODO(cd): Eliminate this file altogether when everything is in ASM file
......@@ -12,8 +12,8 @@
EXPORT |vp9_short_idct16x16_add_neon_pass2|
EXPORT |vp9_short_idct10_16x16_add_neon_pass1|
EXPORT |vp9_short_idct10_16x16_add_neon_pass2|
EXPORT |save_registers|
EXPORT |restore_registers|
EXPORT |save_neon_registers|
EXPORT |restore_neon_registers|
ARM
REQUIRE8
PRESERVE8
......@@ -1178,13 +1178,13 @@ end_idct10_16x16_pass2
pop {r3-r9}
bx lr
ENDP ; |vp9_short_idct10_16x16_add_neon_pass2|
;void |save_registers|()
|save_registers| PROC
;void |save_neon_registers|()
; Pushes NEON registers d8-d15 onto the stack and returns to the caller.
; Intended to be paired with |restore_neon_registers|, which pops the same
; register range.
; NOTE(review): this routine returns with sp still lowered by the vpush
; (8 d-registers); confirm that C callers do not address their locals
; relative to sp between the save and restore calls.
|save_neon_registers| PROC
vpush {d8-d15} ; store d8-d15 on the stack
bx lr ; return without popping
ENDP ; |save_neon_registers|
;void |restore_registers|()
|restore_registers| PROC
;void |restore_neon_registers|()
; Pops NEON registers d8-d15 from the stack and returns to the caller.
; Counterpart of |save_neon_registers|; the stack pointer must be where that
; routine left it, otherwise the wrong memory is loaded into d8-d15.
|restore_neon_registers| PROC
vpop {d8-d15} ; reload d8-d15 from the stack
bx lr ; return to caller
ENDP ; |restore_neon_registers|
......
This diff is collapsed.
......@@ -319,7 +319,7 @@ prototype void vp9_short_idct10_16x16_add "int16_t *input, uint8_t *dest, int de
specialize vp9_short_idct10_16x16_add sse2 neon
prototype void vp9_short_idct32x32_add "int16_t *input, uint8_t *dest, int dest_stride"
specialize vp9_short_idct32x32_add sse2
specialize vp9_short_idct32x32_add sse2 neon
prototype void vp9_short_idct1_32x32 "int16_t *input, int16_t *output"
specialize vp9_short_idct1_32x32
......
......@@ -93,6 +93,7 @@ VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_idct_intrin_sse2.c
VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_convolve_neon.c
VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_idct16x16_neon.c
VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_idct32x32_neon.c
VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_convolve8_neon$(ASM)
VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_convolve8_avg_neon$(ASM)
VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_loopfilter_neon$(ASM)
......@@ -103,6 +104,7 @@ VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_short_idct8x8_1_add_neon$(AS
VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_short_idct8x8_add_neon$(ASM)
VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_short_idct16x16_1_add_neon$(ASM)
VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_short_idct16x16_add_neon$(ASM)
VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_short_idct32x32_add_neon$(ASM)
VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_short_iht4x4_add_neon$(ASM)
VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_mb_lpf_neon$(ASM)
VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_copy_neon$(ASM)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment