Commit ed9c66f5 authored by Scott LaVarnway

Remove usage of predict buffer for decode

Instead of using the predict buffer, the decoder now writes
the predictor into the recon buffer.  For blocks with eob=0,
unnecessary idcts can be eliminated.  This gave a performance
boost of ~1.8% for the HD clips used.

Tero: Added the needed changes to the ARM side and scheduled some
      assembly code to prevent interlocks.

Patch Set 6:  Merged (I1bcdca7a95aacc3a181b9faa6b10e3a71ee24df3)
into this commit because of similarities in the idct
functions.
Patch Set 7: EC bug fix.

Change-Id: Ie31d90b5d3522e1108163f2ac491e455e3f955e6
parent 6505adf2
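The heart of the change is the per-block reconstruction dispatch: once the
predictor sits in the recon buffer, a block with no coded residual needs no
further work at all. A minimal C sketch of that path, using simplified
stand-in names (eob, qcoeff, dq, dst) rather than the decoder's actual call
sites:

    if (eob > 1)
    {
        /* full residual: dequantize, inverse transform, and add onto the
           prediction already written into the recon buffer */
        vp8_dequant_idct_add(qcoeff, dq, dst, dst_stride);
    }
    else if (eob == 1)
    {
        /* DC-only block: a single coefficient collapses the whole 4x4 idct
           into one add; note pred_ptr == dst_ptr now that the prediction
           lives in the recon buffer */
        vp8_dc_only_idct_add(qcoeff[0] * dq[0], dst, dst_stride,
                             dst, dst_stride);
    }
    /* eob == 0: the prediction is already in place, so both the idct and
       the add are eliminated outright */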
@@ -45,7 +45,6 @@ void vp8_arch_arm_common_init(VP8_COMMON *ctx)
rtcd->subpix.bilinear8x4 = vp8_bilinear_predict8x4_armv6;
rtcd->subpix.bilinear4x4 = vp8_bilinear_predict4x4_armv6;
rtcd->idct.idct1 = vp8_short_idct4x4llm_1_v6;
rtcd->idct.idct16 = vp8_short_idct4x4llm_v6_dual;
rtcd->idct.iwalsh1 = vp8_short_inv_walsh4x4_1_v6;
rtcd->idct.iwalsh16 = vp8_short_inv_walsh4x4_v6;
@@ -64,9 +63,6 @@ void vp8_arch_arm_common_init(VP8_COMMON *ctx)
rtcd->recon.copy16x16 = vp8_copy_mem16x16_v6;
rtcd->recon.copy8x8 = vp8_copy_mem8x8_v6;
rtcd->recon.copy8x4 = vp8_copy_mem8x4_v6;
rtcd->recon.recon = vp8_recon_b_armv6;
rtcd->recon.recon2 = vp8_recon2b_armv6;
rtcd->recon.recon4 = vp8_recon4b_armv6;
}
#endif
@@ -82,7 +78,6 @@ void vp8_arch_arm_common_init(VP8_COMMON *ctx)
rtcd->subpix.bilinear8x4 = vp8_bilinear_predict8x4_neon;
rtcd->subpix.bilinear4x4 = vp8_bilinear_predict4x4_neon;
rtcd->idct.idct1 = vp8_short_idct4x4llm_1_neon;
rtcd->idct.idct16 = vp8_short_idct4x4llm_neon;
rtcd->idct.iwalsh1 = vp8_short_inv_walsh4x4_1_neon;
rtcd->idct.iwalsh16 = vp8_short_inv_walsh4x4_neon;
@@ -99,10 +94,6 @@ void vp8_arch_arm_common_init(VP8_COMMON *ctx)
rtcd->recon.copy16x16 = vp8_copy_mem16x16_neon;
rtcd->recon.copy8x8 = vp8_copy_mem8x8_neon;
rtcd->recon.copy8x4 = vp8_copy_mem8x4_neon;
rtcd->recon.recon = vp8_recon_b_neon;
rtcd->recon.recon2 = vp8_recon2b_neon;
rtcd->recon.recon4 = vp8_recon4b_neon;
rtcd->recon.recon_mb = vp8_recon_mb_neon;
rtcd->recon.build_intra_predictors_mby =
vp8_build_intra_predictors_mby_neon;
rtcd->recon.build_intra_predictors_mby_s =
@@ -11,25 +11,27 @@
AREA |.text|, CODE, READONLY
;void vp8_dc_only_idct_add_v6(short input_dc, unsigned char *pred_ptr,
; unsigned char *dst_ptr, int pitch, int stride)
;void vp8_dc_only_idct_add_c(short input_dc, unsigned char *pred_ptr,
; int pred_stride, unsigned char *dst_ptr,
; int dst_stride)
; r0 input_dc
; r1 pred_ptr
; r2 dest_ptr
; r3 pitch
; sp stride
; r2 pred_stride
; r3 dst_ptr
; sp dst_stride
|vp8_dc_only_idct_add_v6| PROC
stmdb sp!, {r4 - r7, lr}
stmdb sp!, {r4 - r7}
add r0, r0, #4 ; input_dc += 4
ldr r12, c0x0000FFFF
ldr r4, [r1], r3
ldr r6, [r1], r3
ldr r4, [r1], r2
and r0, r12, r0, asr #3 ; input_dc >> 3 + mask
ldr lr, [sp, #20]
ldr r6, [r1], r2
orr r0, r0, r0, lsl #16 ; a1 | a1
ldr r12, [sp, #16] ; dst stride
uxtab16 r5, r0, r4 ; a1+2 | a1+0
uxtab16 r4, r0, r4, ror #8 ; a1+3 | a1+1
uxtab16 r7, r0, r6
@@ -40,10 +42,10 @@
usat16 r6, #8, r6
orr r5, r5, r4, lsl #8
orr r7, r7, r6, lsl #8
ldr r4, [r1], r3
ldr r4, [r1], r2
str r5, [r3], r12
ldr r6, [r1]
str r5, [r2], lr
str r7, [r2], lr
str r7, [r3], r12
uxtab16 r5, r0, r4
uxtab16 r4, r0, r4, ror #8
@@ -55,10 +57,11 @@
usat16 r6, #8, r6
orr r5, r5, r4, lsl #8
orr r7, r7, r6, lsl #8
str r5, [r2], lr
str r7, [r2]
str r5, [r3], r12
str r7, [r3]
ldmia sp!, {r4 - r7, pc}
ldmia sp!, {r4 - r7}
bx lr
ENDP ; |vp8_dc_only_idct_add_v6|
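For reference, a C sketch of the routine this ARMv6 code implements, put
together from the new function comment above; the add #4 / asr #3 rounding
and the usat16 clamps in the assembly correspond to the +4 >> 3 and the
0..255 saturation here:

    void vp8_dc_only_idct_add_c(short input_dc, unsigned char *pred_ptr,
                                int pred_stride, unsigned char *dst_ptr,
                                int dst_stride)
    {
        int a1 = ((input_dc + 4) >> 3);   /* DC term after rounding */
        int r, c;

        for (r = 0; r < 4; r++)
        {
            for (c = 0; c < 4; c++)
            {
                int a = a1 + pred_ptr[c];
                /* saturate to 8 bits, as usat16 does above */
                dst_ptr[c] = (a < 0) ? 0 : (a > 255) ? 255 : (unsigned char)a;
            }
            pred_ptr += pred_stride;
            dst_ptr += dst_stride;
        }
    }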
This diff is collapsed.
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vp8_recon_b_armv6|
EXPORT |vp8_recon2b_armv6|
EXPORT |vp8_recon4b_armv6|
AREA |.text|, CODE, READONLY ; name this block of code
prd RN r0
dif RN r1
dst RN r2
stride RN r3
;void recon_b(unsigned char *pred_ptr, short *diff_ptr, unsigned char *dst_ptr, int stride)
; R0 char* pred_ptr
; R1 short *diff_ptr
; R2 char * dst_ptr
; R3 int stride
; Description:
; Loop through the block adding the Pred and Diff together. Clamp and then
; store back into the Dst.
; Restrictions:
; all buffers are expected to be 4-byte aligned coming in and
; going out.
;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
;
;
;
;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
|vp8_recon_b_armv6| PROC
stmdb sp!, {r4 - r9, lr}
;0, 1, 2, 3
ldr r4, [prd], #16 ; 3 | 2 | 1 | 0
ldr r6, [dif, #0] ; 1 | 0
ldr r7, [dif, #4] ; 3 | 2
pkhbt r8, r6, r7, lsl #16 ; 2 | 0
pkhtb r9, r7, r6, asr #16 ; 3 | 1
uxtab16 r8, r8, r4 ; 2 | 0 + 3 | 2 | 2 | 0
uxtab16 r9, r9, r4, ror #8 ; 3 | 1 + 0 | 3 | 2 | 1
usat16 r8, #8, r8
usat16 r9, #8, r9
add dif, dif, #32
orr r8, r8, r9, lsl #8
str r8, [dst], stride
;0, 1, 2, 3
ldr r4, [prd], #16 ; 3 | 2 | 1 | 0
;; ldr r6, [dif, #8] ; 1 | 0
;; ldr r7, [dif, #12] ; 3 | 2
ldr r6, [dif, #0] ; 1 | 0
ldr r7, [dif, #4] ; 3 | 2
pkhbt r8, r6, r7, lsl #16 ; 2 | 0
pkhtb r9, r7, r6, asr #16 ; 3 | 1
uxtab16 r8, r8, r4 ; 2 | 0 + 3 | 2 | 2 | 0
uxtab16 r9, r9, r4, ror #8 ; 3 | 1 + 0 | 3 | 2 | 1
usat16 r8, #8, r8
usat16 r9, #8, r9
add dif, dif, #32
orr r8, r8, r9, lsl #8
str r8, [dst], stride
;0, 1, 2, 3
ldr r4, [prd], #16 ; 3 | 2 | 1 | 0
;; ldr r6, [dif, #16] ; 1 | 0
;; ldr r7, [dif, #20] ; 3 | 2
ldr r6, [dif, #0] ; 1 | 0
ldr r7, [dif, #4] ; 3 | 2
pkhbt r8, r6, r7, lsl #16 ; 2 | 0
pkhtb r9, r7, r6, asr #16 ; 3 | 1
uxtab16 r8, r8, r4 ; 2 | 0 + 3 | 2 | 2 | 0
uxtab16 r9, r9, r4, ror #8 ; 3 | 1 + 0 | 3 | 2 | 1
usat16 r8, #8, r8
usat16 r9, #8, r9
add dif, dif, #32
orr r8, r8, r9, lsl #8
str r8, [dst], stride
;0, 1, 2, 3
ldr r4, [prd], #16 ; 3 | 2 | 1 | 0
;; ldr r6, [dif, #24] ; 1 | 0
;; ldr r7, [dif, #28] ; 3 | 2
ldr r6, [dif, #0] ; 1 | 0
ldr r7, [dif, #4] ; 3 | 2
pkhbt r8, r6, r7, lsl #16 ; 2 | 0
pkhtb r9, r7, r6, asr #16 ; 3 | 1
uxtab16 r8, r8, r4 ; 2 | 0 + 3 | 2 | 2 | 0
uxtab16 r9, r9, r4, ror #8 ; 3 | 1 + 0 | 3 | 2 | 1
usat16 r8, #8, r8
usat16 r9, #8, r9
orr r8, r8, r9, lsl #8
str r8, [dst], stride
ldmia sp!, {r4 - r9, pc}
ENDP ; |vp8_recon_b_armv6|
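A scalar C sketch of the operation described above; the fixed 16-element
pitch assumed here matches the ldr ..., #16 post-increments on prd and the
add dif, dif, #32 steps (32 bytes = 16 shorts) in the assembly:

    void vp8_recon_b_c(unsigned char *pred_ptr, short *diff_ptr,
                       unsigned char *dst_ptr, int stride)
    {
        int r, c;

        for (r = 0; r < 4; r++)
        {
            for (c = 0; c < 4; c++)
            {
                int a = diff_ptr[c] + pred_ptr[c];   /* Pred + Diff */
                dst_ptr[c] = (a < 0) ? 0 : (a > 255) ? 255 : (unsigned char)a;
            }
            pred_ptr += 16;
            diff_ptr += 16;
            dst_ptr += stride;
        }
    }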
;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
;
;
;
;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
; R0 char *pred_ptr
; R1 short *dif_ptr
; R2 char *dst_ptr
; R3 int stride
|vp8_recon4b_armv6| PROC
stmdb sp!, {r4 - r9, lr}
mov lr, #4
recon4b_loop
;0, 1, 2, 3
ldr r4, [prd], #4 ; 3 | 2 | 1 | 0
ldr r6, [dif, #0] ; 1 | 0
ldr r7, [dif, #4] ; 3 | 2
pkhbt r8, r6, r7, lsl #16 ; 2 | 0
pkhtb r9, r7, r6, asr #16 ; 3 | 1
uxtab16 r8, r8, r4 ; 2 | 0 + 3 | 2 | 2 | 0
uxtab16 r9, r9, r4, ror #8 ; 3 | 1 + 0 | 3 | 2 | 1
usat16 r8, #8, r8
usat16 r9, #8, r9
orr r8, r8, r9, lsl #8
str r8, [dst]
;4, 5, 6, 7
ldr r4, [prd], #4
;; ldr r6, [dif, #32]
;; ldr r7, [dif, #36]
ldr r6, [dif, #8]
ldr r7, [dif, #12]
pkhbt r8, r6, r7, lsl #16
pkhtb r9, r7, r6, asr #16
uxtab16 r8, r8, r4
uxtab16 r9, r9, r4, ror #8
usat16 r8, #8, r8
usat16 r9, #8, r9
orr r8, r8, r9, lsl #8
str r8, [dst, #4]
;8, 9, 10, 11
ldr r4, [prd], #4
;; ldr r6, [dif, #64]
;; ldr r7, [dif, #68]
ldr r6, [dif, #16]
ldr r7, [dif, #20]
pkhbt r8, r6, r7, lsl #16
pkhtb r9, r7, r6, asr #16
uxtab16 r8, r8, r4
uxtab16 r9, r9, r4, ror #8
usat16 r8, #8, r8
usat16 r9, #8, r9
orr r8, r8, r9, lsl #8
str r8, [dst, #8]
;12, 13, 14, 15
ldr r4, [prd], #4
;; ldr r6, [dif, #96]
;; ldr r7, [dif, #100]
ldr r6, [dif, #24]
ldr r7, [dif, #28]
pkhbt r8, r6, r7, lsl #16
pkhtb r9, r7, r6, asr #16
uxtab16 r8, r8, r4
uxtab16 r9, r9, r4, ror #8
usat16 r8, #8, r8
usat16 r9, #8, r9
orr r8, r8, r9, lsl #8
str r8, [dst, #12]
add dst, dst, stride
;; add dif, dif, #8
add dif, dif, #32
subs lr, lr, #1
bne recon4b_loop
ldmia sp!, {r4 - r9, pc}
ENDP ; |vp8_recon4b_armv6|
;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
;
;
;
;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
; R0 char *pred_ptr
; R1 short *dif_ptr
; R2 char *dst_ptr
; R3 int stride
|vp8_recon2b_armv6| PROC
stmdb sp!, {r4 - r9, lr}
mov lr, #4
recon2b_loop
;0, 1, 2, 3
ldr r4, [prd], #4
ldr r6, [dif, #0]
ldr r7, [dif, #4]
pkhbt r8, r6, r7, lsl #16
pkhtb r9, r7, r6, asr #16
uxtab16 r8, r8, r4
uxtab16 r9, r9, r4, ror #8
usat16 r8, #8, r8
usat16 r9, #8, r9
orr r8, r8, r9, lsl #8
str r8, [dst]
;4, 5, 6, 7
ldr r4, [prd], #4
;; ldr r6, [dif, #32]
;; ldr r7, [dif, #36]
ldr r6, [dif, #8]
ldr r7, [dif, #12]
pkhbt r8, r6, r7, lsl #16
pkhtb r9, r7, r6, asr #16
uxtab16 r8, r8, r4
uxtab16 r9, r9, r4, ror #8
usat16 r8, #8, r8
usat16 r9, #8, r9
orr r8, r8, r9, lsl #8
str r8, [dst, #4]
add dst, dst, stride
;; add dif, dif, #8
add dif, dif, #16
subs lr, lr, #1
bne recon2b_loop
ldmia sp!, {r4 - r9, pc}
ENDP ; |vp8_recon2b_armv6|
END
@@ -13,16 +13,12 @@
#define IDCT_ARM_H
#if HAVE_ARMV6
extern prototype_idct(vp8_short_idct4x4llm_1_v6);
extern prototype_idct(vp8_short_idct4x4llm_v6_dual);
extern prototype_idct_scalar_add(vp8_dc_only_idct_add_v6);
extern prototype_second_order(vp8_short_inv_walsh4x4_1_v6);
extern prototype_second_order(vp8_short_inv_walsh4x4_v6);
#if !CONFIG_RUNTIME_CPU_DETECT
#undef vp8_idct_idct1
#define vp8_idct_idct1 vp8_short_idct4x4llm_1_v6
#undef vp8_idct_idct16
#define vp8_idct_idct16 vp8_short_idct4x4llm_v6_dual
@@ -38,16 +34,12 @@ extern prototype_second_order(vp8_short_inv_walsh4x4_v6);
#endif
#if HAVE_ARMV7
extern prototype_idct(vp8_short_idct4x4llm_1_neon);
extern prototype_idct(vp8_short_idct4x4llm_neon);
extern prototype_idct_scalar_add(vp8_dc_only_idct_add_neon);
extern prototype_second_order(vp8_short_inv_walsh4x4_1_neon);
extern prototype_second_order(vp8_short_inv_walsh4x4_neon);
#if !CONFIG_RUNTIME_CPU_DETECT
#undef vp8_idct_idct1
#define vp8_idct_idct1 vp8_short_idct4x4llm_1_neon
#undef vp8_idct_idct16
#define vp8_idct_idct16 vp8_short_idct4x4llm_neon
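With runtime CPU detection compiled out, the #undef/#define pairs above bind
the generic idct names directly to the ARM symbols; a simplified sketch of
vp8's RTCD invoke pattern (abbreviated from the idct.h convention):

    #if CONFIG_RUNTIME_CPU_DETECT
    #define IDCT_INVOKE(ctx, fn) (ctx)->fn       /* dispatch via rtcd table */
    #else
    #define IDCT_INVOKE(ctx, fn) vp8_idct_##fn   /* e.g. idct1 resolves to
                                                    vp8_short_idct4x4llm_1_v6 */
    #endif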
@@ -14,22 +14,26 @@
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
;void vp8_dc_only_idct_add_neon(short input_dc, unsigned char *pred_ptr,
; unsigned char *dst_ptr, int pitch, int stride)
;void vp8_dc_only_idct_add_c(short input_dc, unsigned char *pred_ptr,
; int pred_stride, unsigned char *dst_ptr,
; int dst_stride)
; r0 input_dc
; r1 pred_ptr
; r2 dst_ptr
; r3 pitch
; sp stride
; r2 pred_stride
; r3 dst_ptr
; sp dst_stride
|vp8_dc_only_idct_add_neon| PROC
add r0, r0, #4
asr r0, r0, #3
ldr r12, [sp]
vdup.16 q0, r0
vld1.32 {d2[0]}, [r1], r3
vld1.32 {d2[1]}, [r1], r3
vld1.32 {d4[0]}, [r1], r3
vld1.32 {d2[0]}, [r1], r2
vld1.32 {d2[1]}, [r1], r2
vld1.32 {d4[0]}, [r1], r2
vld1.32 {d4[1]}, [r1]
vaddw.u8 q1, q0, d2
@@ -38,12 +42,13 @@
vqmovun.s16 d2, q1
vqmovun.s16 d4, q2
vst1.32 {d2[0]}, [r2], r12
vst1.32 {d2[1]}, [r2], r12
vst1.32 {d4[0]}, [r2], r12
vst1.32 {d4[1]}, [r2]
bx lr
vst1.32 {d2[0]}, [r3], r12
vst1.32 {d2[1]}, [r3], r12
vst1.32 {d4[0]}, [r3], r12
vst1.32 {d4[1]}, [r3]
bx lr
ENDP
END
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vp8_recon16x16mb_neon|
ARM
REQUIRE8
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
; r0 unsigned char *pred_ptr,
; r1 short *diff_ptr,
; r2 unsigned char *dst_ptr,
; r3 int ystride,
; stack unsigned char *udst_ptr,
; stack unsigned char *vdst_ptr
|vp8_recon16x16mb_neon| PROC
mov r12, #4 ;loop counter for Y loop
recon16x16mb_loop_y
vld1.u8 {q12, q13}, [r0]! ;load data from pred_ptr
vld1.16 {q8, q9}, [r1]! ;load data from diff_ptr
vld1.u8 {q14, q15}, [r0]!
vld1.16 {q10, q11}, [r1]!
vmovl.u8 q0, d24 ;modify Pred data from 8 bits to 16 bits
vmovl.u8 q1, d25
vmovl.u8 q2, d26
vmovl.u8 q3, d27
vmovl.u8 q4, d28
vmovl.u8 q5, d29
vmovl.u8 q6, d30
vld1.16 {q12, q13}, [r1]!
vmovl.u8 q7, d31
vld1.16 {q14, q15}, [r1]!
pld [r0]
pld [r1]
pld [r1, #64]
vadd.s16 q0, q0, q8 ;add Diff data and Pred data together
vadd.s16 q1, q1, q9
vadd.s16 q2, q2, q10
vadd.s16 q3, q3, q11
vadd.s16 q4, q4, q12
vadd.s16 q5, q5, q13
vadd.s16 q6, q6, q14
vadd.s16 q7, q7, q15
vqmovun.s16 d0, q0 ;CLAMP() saturation
vqmovun.s16 d1, q1
vqmovun.s16 d2, q2
vqmovun.s16 d3, q3
vqmovun.s16 d4, q4
vqmovun.s16 d5, q5
vst1.u8 {q0}, [r2], r3 ;store result
vqmovun.s16 d6, q6
vst1.u8 {q1}, [r2], r3
vqmovun.s16 d7, q7
vst1.u8 {q2}, [r2], r3
subs r12, r12, #1
moveq r12, #2 ;loop counter for UV loop
vst1.u8 {q3}, [r2], r3
bne recon16x16mb_loop_y
mov r3, r3, lsr #1 ;uv_stride = ystride>>1
ldr r2, [sp] ;load upred_ptr
recon16x16mb_loop_uv
vld1.u8 {q12, q13}, [r0]! ;load data from pred_ptr
vld1.16 {q8, q9}, [r1]! ;load data from diff_ptr
vld1.u8 {q14, q15}, [r0]!
vld1.16 {q10, q11}, [r1]!
vmovl.u8 q0, d24 ;modify Pred data from 8 bits to 16 bits
vmovl.u8 q1, d25
vmovl.u8 q2, d26
vmovl.u8 q3, d27
vmovl.u8 q4, d28
vmovl.u8 q5, d29
vmovl.u8 q6, d30
vld1.16 {q12, q13}, [r1]!
vmovl.u8 q7, d31
vld1.16 {q14, q15}, [r1]!
vadd.s16 q0, q0, q8 ;add Diff data and Pred data together
vadd.s16 q1, q1, q9
vadd.s16 q2, q2, q10
vadd.s16 q3, q3, q11
vadd.s16 q4, q4, q12
vadd.s16 q5, q5, q13
vadd.s16 q6, q6, q14
vqmovun.s16 d0, q0 ;CLAMP() saturation
vadd.s16 q7, q7, q15
vqmovun.s16 d1, q1
vqmovun.s16 d2, q2
vqmovun.s16 d3, q3
vst1.u8 {d0}, [r2], r3 ;store result
vqmovun.s16 d4, q4
vst1.u8 {d1}, [r2], r3
vqmovun.s16 d5, q5
vst1.u8 {d2}, [r2], r3
vqmovun.s16 d6, q6
vst1.u8 {d3}, [r2], r3
vqmovun.s16 d7, q7
vst1.u8 {d4}, [r2], r3
subs r12, r12, #1
vst1.u8 {d5}, [r2], r3
vst1.u8 {d6}, [r2], r3
vst1.u8 {d7}, [r2], r3
ldrne r2, [sp, #4] ;load vpred_ptr
bne recon16x16mb_loop_uv
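The deleted NEON routine above is a widen, add, and saturate pass over a
whole macroblock: four iterations of 16x16 luma rows, then the two 8x8
chroma planes at half the stride. A scalar sketch of the same per-plane
operation; recon_plane is a hypothetical helper, not a function from the
tree, and the 256/320 offsets assume vp8's packed 384-byte macroblock
pred/diff layout:

    /* Hypothetical helper: add one plane of residual to its prediction,
       saturating to 8 bits; pred and diff are contiguous at width pitch. */
    static void recon_plane(unsigned char *pred, short *diff,
                            unsigned char *dst, int stride,
                            int width, int height)
    {
        int r, c;

        for (r = 0; r < height; r++)
        {
            for (c = 0; c < width; c++)
            {
                int a = pred[c] + diff[c];
                dst[c] = (a < 0) ? 0 : (a > 255) ? 255 : (unsigned char)a;
            }
            pred += width;
            diff += width;
            dst += stride;
        }
    }

    /* usage mirroring the asm: Y first, then U and V at ystride >> 1
       recon_plane(pred,       diff,       ydst, ystride,      16, 16);
       recon_plane(pred + 256, diff + 256, udst, ystride >> 1,  8,  8);
       recon_plane(pred + 320, diff + 320, vdst, ystride >> 1,  8,  8); */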