Commit f46e17fd authored by Scott LaVarnway's avatar Scott LaVarnway Committed by Gerrit Code Review

Merge "Modified the inverse walsh to output directly"

parents e2bacd58 4a91541c
......@@ -46,7 +46,6 @@ void vp8_arch_arm_common_init(VP8_COMMON *ctx)
rtcd->subpix.bilinear4x4 = vp8_bilinear_predict4x4_armv6;
rtcd->idct.idct16 = vp8_short_idct4x4llm_v6_dual;
rtcd->idct.iwalsh1 = vp8_short_inv_walsh4x4_1_v6;
rtcd->idct.iwalsh16 = vp8_short_inv_walsh4x4_v6;
rtcd->loopfilter.normal_mb_v = vp8_loop_filter_mbv_armv6;
......@@ -80,7 +79,6 @@ void vp8_arch_arm_common_init(VP8_COMMON *ctx)
rtcd->subpix.bilinear4x4 = vp8_bilinear_predict4x4_neon;
rtcd->idct.idct16 = vp8_short_idct4x4llm_neon;
rtcd->idct.iwalsh1 = vp8_short_inv_walsh4x4_1_neon;
rtcd->idct.iwalsh16 = vp8_short_inv_walsh4x4_neon;
rtcd->loopfilter.normal_mb_v = vp8_loop_filter_mbv_neon;
......
......@@ -9,7 +9,6 @@
;
EXPORT |vp8_short_inv_walsh4x4_v6|
EXPORT |vp8_short_inv_walsh4x4_1_v6|
ARM
REQUIRE8
......@@ -17,19 +16,19 @@
AREA |.text|, CODE, READONLY ; name this block of code
;short vp8_short_inv_walsh4x4_v6(short *input, short *output)
;short vp8_short_inv_walsh4x4_v6(short *input, short *mb_dqcoeff)
|vp8_short_inv_walsh4x4_v6| PROC
stmdb sp!, {r4 - r11, lr}
stmdb sp!, {r4 - r12, lr}
ldr r2, [r0], #4 ; [1 | 0]
ldr r3, [r0], #4 ; [3 | 2]
ldr r4, [r0], #4 ; [5 | 4]
ldr r5, [r0], #4 ; [7 | 6]
ldr r6, [r0], #4 ; [9 | 8]
ldr r7, [r0], #4 ; [11 | 10]
ldr r8, [r0], #4 ; [13 | 12]
ldr r9, [r0] ; [15 | 14]
ldr r2, [r0, #0] ; [1 | 0]
ldr r3, [r0, #4] ; [3 | 2]
ldr r4, [r0, #8] ; [5 | 4]
ldr r5, [r0, #12] ; [7 | 6]
ldr r6, [r0, #16] ; [9 | 8]
ldr r7, [r0, #20] ; [11 | 10]
ldr r8, [r0, #24] ; [13 | 12]
ldr r9, [r0, #28] ; [15 | 14]
qadd16 r10, r2, r8 ; a1 [1+13 | 0+12]
qadd16 r11, r4, r6 ; b1 [5+9 | 4+8]
......@@ -69,24 +68,27 @@
qadd16 r4, r4, r10 ; [b2+3|c2+3]
qadd16 r5, r5, r10 ; [a2+3|d2+3]
asr r12, r2, #3 ; [1 | x]
pkhtb r12, r12, r3, asr #19; [1 | 0]
lsl lr, r3, #16 ; [~3 | x]
lsl r2, r2, #16 ; [~2 | x]
asr lr, lr, #3 ; [3 | x]
pkhtb lr, lr, r2, asr #19 ; [3 | 2]
asr r2, r4, #3 ; [5 | x]
pkhtb r2, r2, r5, asr #19 ; [5 | 4]
lsl r3, r5, #16 ; [~7 | x]
lsl r4, r4, #16 ; [~6 | x]
asr r3, r3, #3 ; [7 | x]
pkhtb r3, r3, r4, asr #19 ; [7 | 6]
str r12, [r1], #4
str lr, [r1], #4
str r2, [r1], #4
str r3, [r1], #4
asr r12, r3, #19 ; [0]
strh r12, [r1], #32
asr lr, r2, #19 ; [1]
strh lr, [r1], #32
sxth r2, r2
sxth r3, r3
asr r2, r2, #3 ; [2]
strh r2, [r1], #32
asr r3, r3, #3 ; [3]
strh r3, [r1], #32
asr r12, r5, #19 ; [4]
strh r12, [r1], #32
asr lr, r4, #19 ; [5]
strh lr, [r1], #32
sxth r4, r4
sxth r5, r5
asr r4, r4, #3 ; [6]
strh r4, [r1], #32
asr r5, r5, #3 ; [7]
strh r5, [r1], #32
qsubaddx r2, r6, r7 ; [c1|a1] [9-10 | 8+11]
qaddsubx r3, r6, r7 ; [b1|d1] [9+10 | 8-11]
......@@ -103,50 +105,32 @@
qadd16 r8, r8, r10 ; [b2+3|c2+3]
qadd16 r9, r9, r10 ; [a2+3|d2+3]
asr r2, r6, #3 ; [9 | x]
pkhtb r2, r2, r7, asr #19 ; [9 | 8]
lsl r3, r7, #16 ; [~11| x]
lsl r4, r6, #16 ; [~10| x]
asr r3, r3, #3 ; [11 | x]
pkhtb r3, r3, r4, asr #19 ; [11 | 10]
asr r4, r8, #3 ; [13 | x]
pkhtb r4, r4, r9, asr #19 ; [13 | 12]
lsl r5, r9, #16 ; [~15| x]
lsl r6, r8, #16 ; [~14| x]
asr r5, r5, #3 ; [15 | x]
pkhtb r5, r5, r6, asr #19 ; [15 | 14]
str r2, [r1], #4
str r3, [r1], #4
str r4, [r1], #4
str r5, [r1]
ldmia sp!, {r4 - r11, pc}
asr r12, r7, #19 ; [8]
strh r12, [r1], #32
asr lr, r6, #19 ; [9]
strh lr, [r1], #32
sxth r6, r6
sxth r7, r7
asr r6, r6, #3 ; [10]
strh r6, [r1], #32
asr r7, r7, #3 ; [11]
strh r7, [r1], #32
asr r12, r9, #19 ; [12]
strh r12, [r1], #32
asr lr, r8, #19 ; [13]
strh lr, [r1], #32
sxth r8, r8
sxth r9, r9
asr r8, r8, #3 ; [14]
strh r8, [r1], #32
asr r9, r9, #3 ; [15]
strh r9, [r1], #32
ldmia sp!, {r4 - r12, pc}
ENDP ; |vp8_short_inv_walsh4x4_v6|
;void vp8_short_inv_walsh4x4_1_v6(short *input, short *output)
;
; DC-only inverse Walsh transform: computes a1 = (input[0] + 3) >> 3 and
; writes that value to all 16 consecutive shorts (32 bytes) at output.
;
;   r0 = input   (only input[0] is read)
;   r1 = output  (advanced by the post-indexed stores)
;   r2 = scratch
; No registers are saved or restored; only r1 and r2 are written.
|vp8_short_inv_walsh4x4_1_v6| PROC
ldrsh r2, [r0] ; r2 = sign-extended input[0]
add r2, r2, #3 ; input[0] + 3 (rounding bias)
asr r2, r2, #3 ; a1 = (input[0] + 3) >> 3, arithmetic shift in 32 bits
lsl r2, r2, #16 ; [a1 | 0]  a1 moved to the top halfword
orr r2, r2, r2, lsr #16 ; [a1 | a1] copy it into the bottom halfword too
; Each word store writes two shorts; 8 stores cover all 16 outputs.
str r2, [r1], #4
str r2, [r1], #4
str r2, [r1], #4
str r2, [r1], #4
str r2, [r1], #4
str r2, [r1], #4
str r2, [r1], #4
str r2, [r1] ; last pair: no write-back needed
bx lr
ENDP ; |vp8_short_inv_walsh4x4_1_v6|
; Constant Pool
c0x00030003 DCD 0x00030003
END
......@@ -25,9 +25,6 @@ extern prototype_second_order(vp8_short_inv_walsh4x4_v6);
#undef vp8_idct_idct1_scalar_add
#define vp8_idct_idct1_scalar_add vp8_dc_only_idct_add_v6
#undef vp8_idct_iwalsh1
#define vp8_idct_iwalsh1 vp8_short_inv_walsh4x4_1_v6
#undef vp8_idct_iwalsh16
#define vp8_idct_iwalsh16 vp8_short_inv_walsh4x4_v6
#endif
......@@ -46,9 +43,6 @@ extern prototype_second_order(vp8_short_inv_walsh4x4_neon);
#undef vp8_idct_idct1_scalar_add
#define vp8_idct_idct1_scalar_add vp8_dc_only_idct_add_neon
#undef vp8_idct_iwalsh1
#define vp8_idct_iwalsh1 vp8_short_inv_walsh4x4_1_neon
#undef vp8_idct_iwalsh16
#define vp8_idct_iwalsh16 vp8_short_inv_walsh4x4_neon
#endif
......
......@@ -8,7 +8,6 @@
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vp8_short_inv_walsh4x4_neon|
EXPORT |vp8_short_inv_walsh4x4_1_neon|
ARM
REQUIRE8
......@@ -16,7 +15,7 @@
AREA |.text|, CODE, READONLY ; name this block of code
;short vp8_short_inv_walsh4x4_neon(short *input, short *output)
;short vp8_short_inv_walsh4x4_neon(short *input, short *mb_dqcoeff)
|vp8_short_inv_walsh4x4_neon| PROC
; read in all four lines of values: d0->d3
......@@ -59,22 +58,30 @@
vshr.s16 q0, q0, #3 ;e/f >> 3
vshr.s16 q1, q1, #3 ;g/h >> 3
vst4.i16 {d0,d1,d2,d3}, [r1@128]
mov r2, #64
add r3, r1, #32
bx lr
ENDP ; |vp8_short_inv_walsh4x4_neon|
vst1.i16 d0[0], [r1],r2
vst1.i16 d1[0], [r3],r2
vst1.i16 d2[0], [r1],r2
vst1.i16 d3[0], [r3],r2
vst1.i16 d0[1], [r1],r2
vst1.i16 d1[1], [r3],r2
vst1.i16 d2[1], [r1],r2
vst1.i16 d3[1], [r3],r2
vst1.i16 d0[2], [r1],r2
vst1.i16 d1[2], [r3],r2
vst1.i16 d2[2], [r1],r2
vst1.i16 d3[2], [r3],r2
vst1.i16 d0[3], [r1],r2
vst1.i16 d1[3], [r3],r2
vst1.i16 d2[3], [r1]
vst1.i16 d3[3], [r3]
;void vp8_short_inv_walsh4x4_1_neon(short *input, short *output)
;
; DC-only inverse Walsh transform: computes a1 = (input[0] + 3) >> 3 and
; writes that value to all 16 consecutive shorts (32 bytes) at output.
;
;   r0 = input   (only input[0] is read; r0 is then reused to hold a1)
;   r1 = output  (both stores carry a 128-bit alignment qualifier, so
;                 output must be 16-byte aligned)
;   r2, r3, q0 = scratch
|vp8_short_inv_walsh4x4_1_neon| PROC
ldrsh r2, [r0] ; r2 = sign-extended input[0]
add r3, r2, #3 ; input[0] + 3 (rounding bias)
add r2, r1, #16 ; r2 = address of the second group of 8 outputs
asr r0, r3, #3 ; a1 = (input[0] + 3) >> 3
vdup.16 q0, r0 ; broadcast a1 into all 8 halfword lanes of q0
vst1.16 {q0}, [r1@128] ; store outputs 0..7
vst1.16 {q0}, [r2@128] ; store outputs 8..15
bx lr
ENDP ; |vp8_short_inv_walsh4x4_1_neon|
ENDP ; |vp8_short_inv_walsh4x4_neon|
END
......@@ -37,6 +37,10 @@
#define vp8_idct_idct16 vp8_short_idct4x4llm_c
#endif
extern prototype_idct(vp8_idct_idct16);
/* Add this prototype to prevent a compiler warning about an implicit
* declaration of vp8_short_idct4x4llm_c in dequantize.c when building,
* for example, the NEON-optimized version. */
extern prototype_idct(vp8_short_idct4x4llm_c);
#ifndef vp8_idct_idct1_scalar_add
#define vp8_idct_idct1_scalar_add vp8_dc_only_idct_add_c
......
......@@ -137,8 +137,9 @@ void vp8_dc_only_idct_add_c(short input_dc, unsigned char *pred_ptr,
}
void vp8_short_inv_walsh4x4_c(short *input, short *output)
void vp8_short_inv_walsh4x4_c(short *input, short *mb_dqcoeff)
{
short output[16];
int i;
int a1, b1, c1, d1;
int a2, b2, c2, d2;
......@@ -183,22 +184,21 @@ void vp8_short_inv_walsh4x4_c(short *input, short *output)
ip += 4;
op += 4;
}
for(i = 0; i < 16; i++)
{
mb_dqcoeff[i * 16] = output[i];
}
}
void vp8_short_inv_walsh4x4_1_c(short *input, short *output)
void vp8_short_inv_walsh4x4_1_c(short *input, short *mb_dqcoeff)
{
int i;
int a1;
short *op = output;
a1 = ((input[0] + 3) >> 3);
for (i = 0; i < 4; i++)
for(i = 0; i < 16; i++)
{
op[0] = a1;
op[1] = a1;
op[2] = a1;
op[3] = a1;
op += 4;
mb_dqcoeff[i * 16] = a1;
}
}
......@@ -28,18 +28,6 @@ void vp8_inverse_transform_b(const vp8_idct_rtcd_vtable_t *rtcd, BLOCKD *b,
}
/* Copy the 16 values held in block 24's diff buffer into element 0 of the
 * dqcoeff array of blocks 0..15, one value per block. */
static void recon_dcblock(MACROBLOCKD *x)
{
    BLOCKD *second_order = &x->block[24];
    int blk = 0;

    while (blk < 16)
    {
        x->block[blk].dqcoeff[0] = second_order->diff[blk];
        ++blk;
    }
}
void vp8_inverse_transform_mby(const vp8_idct_rtcd_vtable_t *rtcd, MACROBLOCKD *x)
{
int i;
......@@ -47,9 +35,7 @@ void vp8_inverse_transform_mby(const vp8_idct_rtcd_vtable_t *rtcd, MACROBLOCKD *
if(x->mode_info_context->mbmi.mode != SPLITMV)
{
/* do 2nd order transform on the dc block */
IDCT_INVOKE(rtcd, iwalsh16)(x->block[24].dqcoeff, x->block[24].diff);
recon_dcblock(x);
IDCT_INVOKE(rtcd, iwalsh16)(x->block[24].dqcoeff, x->dqcoeff);
}
for (i = 0; i < 16; i++)
......
......@@ -24,7 +24,6 @@ extern prototype_idct(vp8_short_idct4x4llm_mmx);
extern prototype_idct_scalar_add(vp8_dc_only_idct_add_mmx);
extern prototype_second_order(vp8_short_inv_walsh4x4_mmx);
extern prototype_second_order(vp8_short_inv_walsh4x4_1_mmx);
#if !CONFIG_RUNTIME_CPU_DETECT
#undef vp8_idct_idct16
......@@ -36,9 +35,6 @@ extern prototype_second_order(vp8_short_inv_walsh4x4_1_mmx);
#undef vp8_idct_iwalsh16
#define vp8_idct_iwalsh16 vp8_short_inv_walsh4x4_mmx
#undef vp8_idct_iwalsh1
#define vp8_idct_iwalsh1 vp8_short_inv_walsh4x4_1_mmx
#endif
#endif
......
......@@ -11,42 +11,6 @@
%include "vpx_ports/x86_abi_support.asm"
;void vp8_short_inv_walsh4x4_1_mmx(short *input, short *output)
;
; DC-only inverse Walsh transform: computes (input[0] + 3) >> 3 and writes
; that value to all 16 consecutive shorts (32 bytes) at output.
;
; sym(), arg() and SHADOW_ARGS_TO_STACK / UNSHADOW_ARGS come from
; vpx_ports/x86_abi_support.asm; presumably they hide the calling-convention
; differences between targets -- confirm against that file.
;
; NOTE(review): no emms is issued here, so MMX state is left live on return;
; presumably the caller clears it -- confirm.
; NOTE(review): the sum is truncated to 16 bits before the shift (psraw works
; on word lanes), whereas the C reference evaluates (input[0] + 3) >> 3 as an
; int. The two appear to differ when input[0] + 3 overflows a signed 16-bit
; value (input[0] >= 32765) -- confirm whether such inputs can occur.
global sym(vp8_short_inv_walsh4x4_1_mmx)
sym(vp8_short_inv_walsh4x4_1_mmx):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 2
push rsi
push rdi
; end prolog
mov rsi, arg(0) ; rsi = input
mov rax, 3 ; rounding bias
mov rdi, arg(1) ; rdi = output
add rax, [rsi] ; full-register load from input; only the low 16 bits (input[0] + 3) are used below
movd mm0, eax
punpcklwd mm0, mm0 ;x x val val   (low word duplicated into words 0 and 1)
punpckldq mm0, mm0 ;val val val val   (low dword duplicated into both halves)
psraw mm0, 3 ;(input[0] + 3) >> 3, arithmetic shift per 16-bit lane
; Each movq writes four shorts; four stores fill all 16 outputs.
movq [rdi + 0], mm0
movq [rdi + 8], mm0
movq [rdi + 16], mm0
movq [rdi + 24], mm0
; begin epilog
pop rdi
pop rsi
UNSHADOW_ARGS
pop rbp
ret
;void vp8_short_inv_walsh4x4_mmx(short *input, short *output)
global sym(vp8_short_inv_walsh4x4_mmx)
sym(vp8_short_inv_walsh4x4_mmx):
......@@ -159,10 +123,50 @@ sym(vp8_short_inv_walsh4x4_mmx):
psraw mm2, 3
psraw mm3, 3
movq [rdi + 0], mm0
movq [rdi + 8], mm1
movq [rdi + 16], mm2
movq [rdi + 24], mm3
; movq [rdi + 0], mm0
; movq [rdi + 8], mm1
; movq [rdi + 16], mm2
; movq [rdi + 24], mm3
movd eax, mm0
psrlq mm0, 32
mov word ptr[rdi+32*0], ax
shr eax, 16
mov word ptr[rdi+32*1], ax
movd eax, mm0
mov word ptr[rdi+32*2], ax
shr eax, 16
mov word ptr[rdi+32*3], ax
movd ecx, mm1
psrlq mm1, 32
mov word ptr[rdi+32*4], cx
shr ecx, 16
mov word ptr[rdi+32*5], cx
movd ecx, mm1
mov word ptr[rdi+32*6], cx
shr ecx, 16
mov word ptr[rdi+32*7], cx
movd eax, mm2
psrlq mm2, 32
mov word ptr[rdi+32*8], ax
shr eax, 16
mov word ptr[rdi+32*9], ax
movd eax, mm2
mov word ptr[rdi+32*10], ax
shr eax, 16
mov word ptr[rdi+32*11], ax
movd ecx, mm3
psrlq mm3, 32
mov word ptr[rdi+32*12], cx
shr ecx, 16
mov word ptr[rdi+32*13], cx
movd ecx, mm3
mov word ptr[rdi+32*14], cx
shr ecx, 16
mov word ptr[rdi+32*15], cx
; begin epilog
pop rdi
......
......@@ -96,8 +96,50 @@ sym(vp8_short_inv_walsh4x4_sse2):
psraw xmm5, 3
psraw xmm1, 3
movdqa [rdi + 0], xmm5
movdqa [rdi + 16], xmm1
;; movdqa [rdi + 0], xmm5
;; movdqa [rdi + 16], xmm1
movd eax, xmm5
psrldq xmm5, 4
mov word ptr[rdi+32*0], ax
shr eax, 16
mov word ptr[rdi+32*1], ax
movd eax, xmm5
psrldq xmm5, 4
mov word ptr[rdi+32*2], ax
shr eax, 16
mov word ptr[rdi+32*3], ax
movd eax, xmm5
psrldq xmm5, 4
mov word ptr[rdi+32*4], ax
shr eax, 16
mov word ptr[rdi+32*5], ax
movd eax, xmm5
mov word ptr[rdi+32*6], ax
shr eax, 16
mov word ptr[rdi+32*7], ax
movd eax, xmm1
psrldq xmm1, 4
mov word ptr[rdi+32*8], ax
shr eax, 16
mov word ptr[rdi+32*9], ax
movd eax, xmm1
psrldq xmm1, 4
mov word ptr[rdi+32*10], ax
shr eax, 16
mov word ptr[rdi+32*11], ax
movd eax, xmm1
psrldq xmm1, 4
mov word ptr[rdi+32*12], ax
shr eax, 16
mov word ptr[rdi+32*13], ax
movd eax, xmm1
mov word ptr[rdi+32*14], ax
shr eax, 16
mov word ptr[rdi+32*15], ax
; begin epilog
pop rdi
......
......@@ -40,9 +40,6 @@ void vp8_arch_x86_common_init(VP8_COMMON *ctx)
rtcd->idct.idct16 = vp8_short_idct4x4llm_mmx;
rtcd->idct.idct1_scalar_add = vp8_dc_only_idct_add_mmx;
rtcd->idct.iwalsh16 = vp8_short_inv_walsh4x4_mmx;
rtcd->idct.iwalsh1 = vp8_short_inv_walsh4x4_1_mmx;
rtcd->recon.copy8x8 = vp8_copy_mem8x8_mmx;
rtcd->recon.copy8x4 = vp8_copy_mem8x4_mmx;
......
......@@ -32,8 +32,6 @@ void vp8_arch_arm_decode_init(VP8D_COMP *pbi)
{
pbi->dequant.block = vp8_dequantize_b_v6;
pbi->dequant.idct_add = vp8_dequant_idct_add_v6;
pbi->dequant.dc_idct_add = vp8_dequant_dc_idct_add_v6;
pbi->dequant.dc_idct_add_y_block = vp8_dequant_dc_idct_add_y_block_v6;
pbi->dequant.idct_add_y_block = vp8_dequant_idct_add_y_block_v6;
pbi->dequant.idct_add_uv_block = vp8_dequant_idct_add_uv_block_v6;
}
......@@ -44,9 +42,6 @@ void vp8_arch_arm_decode_init(VP8D_COMP *pbi)
{
pbi->dequant.block = vp8_dequantize_b_neon;
pbi->dequant.idct_add = vp8_dequant_idct_add_neon;
/*This is not used: NEON always dequants two blocks at once.
pbi->dequant.dc_idct_add = vp8_dequant_dc_idct_add_neon;*/
pbi->dequant.dc_idct_add_y_block = vp8_dequant_dc_idct_add_y_block_neon;
pbi->dequant.idct_add_y_block = vp8_dequant_idct_add_y_block_neon;
pbi->dequant.idct_add_uv_block = vp8_dequant_idct_add_uv_block_neon;
}
......
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license and patent
; grant that can be found in the LICENSE file in the root of the source
; tree. All contributing project authors may be found in the AUTHORS
; file in the root of the source tree.
;
EXPORT |vp8_dequant_dc_idct_add_v6|
AREA |.text|, CODE, READONLY
;void vp8_dequant_dc_idct_add_v6(short *input, short *dq,
; unsigned char *dest, int stride, int Dc)
; r0 = input
; r1 = dq
; r2 = dst
; r3 = stride
; sp + 36 = Dc
|vp8_dequant_dc_idct_add_v6| PROC
stmdb sp!, {r4-r11, lr}
ldr r6, [sp, #36]
ldr r4, [r0] ;input
ldr r5, [r1], #4 ;dq
sub sp, sp, #4
str r3, [sp]
smultt r7, r4, r5
ldr r4, [r0, #4] ;input
ldr r5, [r1], #4 ;dq
strh r6, [r0], #2
strh r7, [r0], #2
smulbb r6, r4, r5
smultt r7, r4, r5
ldr r4, [r0, #4] ;input
ldr r5, [r1], #4 ;dq
strh r6, [r0], #2
strh r7, [r0], #2
mov r12, #3
vp8_dequant_dc_add_loop
smulbb r6, r4, r5
smultt r7, r4, r5
ldr r4, [r0, #4] ;input
ldr r5, [r1], #4 ;dq
strh r6, [r0], #2
strh r7, [r0], #2
smulbb r6, r4, r5
smultt r7, r4, r5
subs r12, r12, #1
ldrne r4, [r0, #4]
ldrne r5, [r1], #4
strh r6, [r0], #2
strh r7, [r0], #2
bne vp8_dequant_dc_add_loop
sub r0, r0, #32
mov r1, r0
; short_idct4x4llm_v6_dual
ldr r3, cospi8sqrt2minus1
ldr r4, sinpi8sqrt2