Commit 0c2529a8 authored by Tero Rintaluoma's avatar Tero Rintaluoma
Browse files

NEON FDCT updated to match current C code

- Removed fast_fdct4x4_neon and fast_fdct8x4_neon
- Uses now short_fdct4x4 and short_fdct8x4
- Gives ~1-2% speed-up on Cortex-A8/A9

Change-Id: Ib62f2cb2080ae719f8fa1d518a3a5e71278a41ec
parent 35ce4eb0
......@@ -107,8 +107,8 @@ void vp8_arch_arm_encoder_init(VP8_COMP *cpi)
cpi->rtcd.fdct.short4x4 = vp8_short_fdct4x4_neon;
cpi->rtcd.fdct.short8x4 = vp8_short_fdct8x4_neon;
cpi->rtcd.fdct.fast4x4 = vp8_fast_fdct4x4_neon;
cpi->rtcd.fdct.fast8x4 = vp8_fast_fdct8x4_neon;
cpi->rtcd.fdct.fast4x4 = vp8_short_fdct4x4_neon;
cpi->rtcd.fdct.fast8x4 = vp8_short_fdct8x4_neon;
cpi->rtcd.fdct.walsh_short4x4 = vp8_short_walsh4x4_neon;
/*cpi->rtcd.encodemb.berr = vp8_block_error_c;
......
......@@ -45,10 +45,10 @@ extern prototype_fdct(vp8_short_walsh4x4_neon);
#define vp8_fdct_short8x4 vp8_short_fdct8x4_neon
#undef vp8_fdct_fast4x4
#define vp8_fdct_fast4x4 vp8_fast_fdct4x4_neon
#define vp8_fdct_fast4x4 vp8_short_fdct4x4_neon
#undef vp8_fdct_fast8x4
#define vp8_fdct_fast8x4 vp8_fast_fdct8x4_neon
#define vp8_fdct_fast8x4 vp8_short_fdct8x4_neon
#undef vp8_fdct_walsh_short4x4
#define vp8_fdct_walsh_short4x4 vp8_short_walsh4x4_neon
......
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vp8_fast_fdct4x4_neon|
ARM
REQUIRE8
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
;void vp8_fast_fdct4x4_c(short *input, short *output, int pitch);
;NOTE:
;The input *src_diff. src_diff is calculated as:
;diff_ptr[c] = src_ptr[c] - pred_ptr[c]; (in Subtract* function)
;In which *src_ptr and *pred_ptr both are unsigned char.
;Therefore, *src_diff should be in the range of [-255, 255].
;CAUTION:
;The input values of 25th block are set in vp8_build_dcblock function, which are out of [-255, 255].
;But, VP8 encoder only uses vp8_short_fdct4x4_c for 25th block, not vp8_fast_fdct4x4_c. That makes
;it ok for assuming *input in [-255, 255] in vp8_fast_fdct4x4_c, but not ok in vp8_short_fdct4x4_c.
;-----------------------------------------------------------------------
; void vp8_fast_fdct4x4_neon(short *input, short *output, int pitch)
; 4x4 forward DCT ("fast" fixed-point variant) of a residual block.
; In:      r0  = input  (short *; successive rows are 'pitch' bytes apart)
;          r1  = output (short *; 16 contiguous coefficients are written)
;          r2  = pitch  (input row stride, in bytes)
; Scratch: r12 (coefficient pointer), NEON d0 and q1-q8.
; NOTE(review): q4-q7 (d8-d15) are callee-saved under the AAPCS VFP/NEON
; rules but are clobbered here without being preserved -- confirm the
; encoder's callers tolerate this (several old libvpx NEON routines had
; the same issue).
; NOTE(review): per the commit message above, this function is removed by
; this change; vp8_short_fdct4x4_neon is used instead.
;-----------------------------------------------------------------------
|vp8_fast_fdct4x4_neon| PROC
vld1.16 {d2}, [r0], r2 ;load input row 0, advance r0 by pitch
ldr r12, _ffdct_coeff_ ;r12 = &ffdct_coeff (via literal pool below)
vld1.16 {d3}, [r0], r2 ;row 1
vld1.16 {d4}, [r0], r2 ;row 2
vld1.16 {d0}, [r12] ;d0 = {x_c1, x_c2, x_c3, 0} (see table at end)
vld1.16 {d5}, [r0], r2 ;row 3
;First for-loop
;transpose d2, d3, d4, d5. Then, d2=ip[0], d3=ip[1], d4=ip[2], d5=ip[3]
vtrn.32 d2, d4
vtrn.32 d3, d5
vtrn.16 d2, d3
vtrn.16 d4, d5
vadd.s16 d6, d2, d5 ;ip[0]+ip[3]
vadd.s16 d7, d3, d4 ;ip[1]+ip[2]
vsub.s16 d8, d3, d4 ;ip[1]-ip[2]
vsub.s16 d9, d2, d5 ;ip[0]-ip[3]
vshl.i16 q3, q3, #1 ; a1, b1 (sums doubled, first-pass scaling)
vshl.i16 q4, q4, #1 ; c1, d1
vadd.s16 d10, d6, d7 ;temp1 = a1 + b1
vsub.s16 d11, d6, d7 ;temp2 = a1 - b1
; vqdmulh gives (x*coeff*2)>>16; the following vshr #1 halves that,
; yielding the (x*coeff)>>16 of the C reference code.
vqdmulh.s16 q6, q5, d0[1]
vqdmulh.s16 q8, q4, d0[0]
vqdmulh.s16 q7, q4, d0[2]
vshr.s16 q6, q6, #1
vshr.s16 q8, q8, #1
vshr.s16 q7, q7, #1 ;d14:temp1 = ( c1 * x_c3)>>16; d15:temp1 = (d1 * x_c3)>>16
vadd.s16 q8, q4, q8 ;d16:temp2 = ((c1 * x_c1)>>16) + c1; d17:temp2 = ((d1 * x_c1)>>16) + d1
vadd.s16 d2, d10, d12 ;op[0] = ((temp1 * x_c2 )>>16) + temp1
vadd.s16 d4, d11, d13 ;op[2] = ((temp2 * x_c2 )>>16) + temp2
vadd.s16 d3, d14, d17 ;op[1] = temp1 + temp2 -- q is not necessary, just for protection
vsub.s16 d5, d15, d16 ;op[3] = temp1 - temp2
;Second for-loop
;transpose d2, d3, d4, d5. Then, d2=ip[0], d3=ip[4], d4=ip[8], d5=ip[12]
vtrn.32 d2, d4
vtrn.32 d3, d5
vtrn.16 d2, d3
vtrn.16 d4, d5
vadd.s16 d6, d2, d5 ;a1 = ip[0]+ip[12]
vadd.s16 d7, d3, d4 ;b1 = ip[4]+ip[8]
vsub.s16 d8, d3, d4 ;c1 = ip[4]-ip[8]
vsub.s16 d9, d2, d5 ;d1 = ip[0]-ip[12]
vadd.s16 d10, d6, d7 ;temp1 = a1 + b1
vsub.s16 d11, d6, d7 ;temp2 = a1 - b1
vqdmulh.s16 q6, q5, d0[1]
vqdmulh.s16 q8, q4, d0[0]
vqdmulh.s16 q7, q4, d0[2]
vshr.s16 q6, q6, #1
vshr.s16 q8, q8, #1
vshr.s16 q7, q7, #1 ;d14:temp1 = ( c1 * x_c3)>>16; d15:temp1 = (d1 * x_c3)>>16
vadd.s16 q8, q4, q8 ;d16:temp2 = ((c1 * x_c1)>>16) + c1; d17:temp2 = ((d1 * x_c1)>>16) + d1
vadd.s16 d2, d10, d12 ;a2 = ((temp1 * x_c2 )>>16) + temp1
vadd.s16 d4, d11, d13 ;c2 = ((temp2 * x_c2 )>>16) + temp2
vadd.s16 d3, d14, d17 ;b2 = temp1 + temp2 -- q is not necessary, just for protection
vsub.s16 d5, d15, d16 ;d2 = temp1 - temp2
; Final scaling: divide every coefficient by 2, rounding toward zero.
; vclt yields all-ones (-1) in negative lanes; vsub of that mask adds 1
; to negative lanes only, so the arithmetic >>1 truncates toward zero.
vclt.s16 q3, q1, #0 ;mask = 0xFFFF where coefficient < 0
vclt.s16 q4, q2, #0
vsub.s16 q1, q1, q3 ;coeff += (coeff < 0)
vsub.s16 q2, q2, q4
vshr.s16 q1, q1, #1 ;coeff >>= 1 (round toward zero)
vshr.s16 q2, q2, #1
vst1.16 {q1, q2}, [r1] ;store all 16 output coefficients
bx lr
ENDP
;-----------------
_ffdct_coeff_
DCD ffdct_coeff ;literal-pool entry: address of the coefficient table
ffdct_coeff
; 60547 = 0xEC83
; 46341 = 0xB505
; 25080 = 0x61F8
; Packed little-endian halfwords: d0[0]=0xEC83 (x_c1), d0[1]=0xB505 (x_c2),
; d0[2]=0x61F8 (x_c3), d0[3]=0.
DCD 0xB505EC83, 0x000061F8
END
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vp8_fast_fdct8x4_neon|
ARM
REQUIRE8
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
;void vp8_fast_fdct8x4_c(short *input, short *output, int pitch); (comment fixed: was 4x4)
;NOTE:
;The input *src_diff. src_diff is calculated as:
;diff_ptr[c] = src_ptr[c] - pred_ptr[c]; (in Subtract* function)
;In which *src_ptr and *pred_ptr both are unsigned char.
;Therefore, *src_diff should be in the range of [-255, 255].
;CAUTION:
;The input values of 25th block are set in vp8_build_dcblock function, which are out of [-255, 255].
;But, VP8 encoder only uses vp8_short_fdct4x4_c for 25th block, not vp8_fast_fdct4x4_c. That makes
;it ok for assuming *input in [-255, 255] in vp8_fast_fdct4x4_c, but not ok in vp8_short_fdct4x4_c.
;-----------------------------------------------------------------------
; void vp8_fast_fdct8x4_neon(short *input, short *output, int pitch)
; Two 4x4 forward DCTs over an 8-wide, 4-high residual region: each row
; load brings in 8 shorts (one q register) and the low/high d halves are
; processed as the left and right 4x4 blocks in parallel.
; In:      r0  = input  (short *; successive rows are 'pitch' bytes apart)
;          r1  = output (short *; 32 contiguous coefficients are written)
;          r2  = pitch  (input row stride, in bytes)
; Scratch: r12 (coefficient pointer), NEON d0 and q1-q15.
; NOTE(review): q4-q7 (d8-d15) are callee-saved under the AAPCS VFP/NEON
; rules but are clobbered here without being preserved -- confirm callers
; tolerate this.
; NOTE(review): per the commit message above, this function is removed by
; this change; vp8_short_fdct8x4_neon is used instead.
;-----------------------------------------------------------------------
|vp8_fast_fdct8x4_neon| PROC
vld1.16 {q1}, [r0], r2 ;load input row 0 (8 shorts), advance by pitch
ldr r12, _ffdct8_coeff_ ;r12 = &ffdct8_coeff (via literal pool below)
vld1.16 {q2}, [r0], r2 ;row 1
vld1.16 {q3}, [r0], r2 ;row 2
vld1.16 {d0}, [r12] ;d0 = {x_c1, x_c2, x_c3, 0} (see table at end)
vld1.16 {q4}, [r0], r2 ;row 3
;First for-loop
;transpose d2, d4, d6, d8. Then, d2=ip[0], d4=ip[1], d6=ip[2], d8=ip[3]
;transpose d3, d5, d7, d9. Then, d3=ip[0], d5=ip[1], d7=ip[2], d9=ip[3]
vtrn.32 d2, d6
vtrn.32 d3, d7
vtrn.32 d4, d8
vtrn.32 d5, d9
vtrn.16 d2, d4
vtrn.16 d3, d5
vtrn.16 d6, d8
vtrn.16 d7, d9
; Left block (even d regs):
vadd.s16 d10, d2, d8 ;ip[0]+ip[3]
vadd.s16 d11, d4, d6 ;ip[1]+ip[2]
vsub.s16 d12, d4, d6 ;ip[1]-ip[2]
vsub.s16 d13, d2, d8 ;ip[0]-ip[3]
; Right block (odd d regs), same butterflies:
vadd.s16 d22, d3, d9
vadd.s16 d23, d5, d7
vsub.s16 d24, d5, d7
vsub.s16 d25, d3, d9
vshl.i16 q5, q5, #1 ; a1, b1 (sums doubled, first-pass scaling)
vshl.i16 q6, q6, #1 ; c1, d1
vshl.i16 q1, q11, #1
vshl.i16 q2, q12, #1
vadd.s16 d14, d10, d11 ;temp1 = a1 + b1
vsub.s16 d15, d10, d11 ;temp2 = a1 - b1
vadd.s16 d24, d2, d3
vsub.s16 d25, d2, d3
; vqdmulh gives (x*coeff*2)>>16; the following vshr #1 halves that,
; yielding the (x*coeff)>>16 of the C reference code.
vqdmulh.s16 q8, q7, d0[1]
vqdmulh.s16 q13, q12, d0[1]
vqdmulh.s16 q10, q6, d0[0]
vqdmulh.s16 q15, q2, d0[0]
vqdmulh.s16 q9, q6, d0[2]
vqdmulh.s16 q14, q2, d0[2]
vshr.s16 q8, q8, #1
vshr.s16 q13, q13, #1
vshr.s16 q10, q10, #1
vshr.s16 q15, q15, #1
vshr.s16 q9, q9, #1 ;d18:temp1 = ( c1 * x_c3)>>16; d19:temp1 = (d1 * x_c3)>>16
vshr.s16 q14, q14, #1 ;d28:temp1 = ( c1 * x_c3)>>16; d29:temp1 = (d1 * x_c3)>>16
vadd.s16 q10, q6, q10 ;d20:temp2 = ((c1 * x_c1)>>16) + c1; d21:temp2 = ((d1 * x_c1)>>16) + d1
vadd.s16 q15, q2, q15 ;d30:temp2 = ((c1 * x_c1)>>16) + c1; d31:temp2 = ((d1 * x_c1)>>16) + d1
vadd.s16 d2, d14, d16 ;op[0] = ((temp1 * x_c2 )>>16) + temp1
vadd.s16 d3, d24, d26 ;op[0] = ((temp1 * x_c2 )>>16) + temp1
vadd.s16 d6, d15, d17 ;op[2] = ((temp2 * x_c2 )>>16) + temp2
vadd.s16 d7, d25, d27 ;op[2] = ((temp2 * x_c2 )>>16) + temp2
vadd.s16 d4, d18, d21 ;op[1] = temp1 + temp2 -- q is not necessary, just for protection
vadd.s16 d5, d28, d31 ;op[1] = temp1 + temp2 -- q is not necessary, just for protection
vsub.s16 d8, d19, d20 ;op[3] = temp1 - temp2
vsub.s16 d9, d29, d30 ;op[3] = temp1 - temp2
;Second for-loop
;transpose d2, d4, d6, d8. Then, d2=ip[0], d4=ip[4], d6=ip[8], d8=ip[12]
;transpose d3, d5, d7, d9. Then, d3=ip[0], d5=ip[4], d7=ip[8], d9=ip[12]
vtrn.32 d2, d6
vtrn.32 d3, d7
vtrn.32 d4, d8
vtrn.32 d5, d9
vtrn.16 d2, d4
vtrn.16 d3, d5
vtrn.16 d6, d8
vtrn.16 d7, d9
vadd.s16 d10, d2, d8 ;a1 = ip[0]+ip[12]
vadd.s16 d11, d4, d6 ;b1 = ip[4]+ip[8]
vsub.s16 d12, d4, d6 ;c1 = ip[4]-ip[8]
vsub.s16 d13, d2, d8 ;d1 = ip[0]-ip[12]
vadd.s16 d2, d3, d9
vadd.s16 d4, d5, d7
vsub.s16 d24, d5, d7
vsub.s16 d25, d3, d9
vadd.s16 d14, d10, d11 ;temp1 = a1 + b1
vsub.s16 d15, d10, d11 ;temp2 = a1 - b1
vadd.s16 d22, d2, d4
vsub.s16 d23, d2, d4
vqdmulh.s16 q8, q7, d0[1]
vqdmulh.s16 q13, q11, d0[1]
vqdmulh.s16 q10, q6, d0[0]
vqdmulh.s16 q15, q12, d0[0]
vqdmulh.s16 q9, q6, d0[2]
vqdmulh.s16 q14, q12, d0[2]
vshr.s16 q8, q8, #1
vshr.s16 q13, q13, #1
vshr.s16 q10, q10, #1
vshr.s16 q15, q15, #1
vshr.s16 q9, q9, #1 ;d18:temp1 = ( c1 * x_c3)>>16; d19:temp1 = (d1 * x_c3)>>16
vshr.s16 q14, q14, #1 ;d28:temp1 = ( c1 * x_c3)>>16; d29:temp1 = (d1 * x_c3)>>16
vadd.s16 q10, q6, q10 ;d20:temp2 = ((c1 * x_c1)>>16) + c1; d21:temp2 = ((d1 * x_c1)>>16) + d1
vadd.s16 q15, q12, q15 ;d30:temp2 = ((c1 * x_c1)>>16) + c1; d31:temp2 = ((d1 * x_c1)>>16) + d1
vadd.s16 d2, d14, d16 ;a2 = ((temp1 * x_c2 )>>16) + temp1
vadd.s16 d6, d22, d26 ;a2 = ((temp1 * x_c2 )>>16) + temp1
vadd.s16 d4, d15, d17 ;c2 = ((temp2 * x_c2 )>>16) + temp2
vadd.s16 d8, d23, d27 ;c2 = ((temp2 * x_c2 )>>16) + temp2
vadd.s16 d3, d18, d21 ;b2 = temp1 + temp2 -- q is not necessary, just for protection
vadd.s16 d7, d28, d31 ;b2 = temp1 + temp2 -- q is not necessary, just for protection
vsub.s16 d5, d19, d20 ;d2 = temp1 - temp2
vsub.s16 d9, d29, d30 ;d2 = temp1 - temp2
; Final scaling: divide every coefficient by 2, rounding toward zero
; (vclt builds a -1 mask for negative lanes; vsub of that mask adds 1
; to negative lanes only, so the arithmetic >>1 truncates toward zero).
vclt.s16 q5, q1, #0 ;mask = 0xFFFF where coefficient < 0
vclt.s16 q6, q2, #0
vclt.s16 q7, q3, #0
vclt.s16 q8, q4, #0
vsub.s16 q1, q1, q5 ;coeff += (coeff < 0)
vsub.s16 q2, q2, q6
vsub.s16 q3, q3, q7
vsub.s16 q4, q4, q8
vshr.s16 q1, q1, #1 ;coeff >>= 1 (round toward zero)
vshr.s16 q2, q2, #1
vshr.s16 q3, q3, #1
vshr.s16 q4, q4, #1
vst1.16 {q1, q2}, [r1]! ;store first 16 output coefficients
vst1.16 {q3, q4}, [r1] ;store remaining 16
bx lr
ENDP
;-----------------
_ffdct8_coeff_
DCD ffdct8_coeff ;literal-pool entry: address of the coefficient table
ffdct8_coeff
; 60547 = 0xEC83
; 46341 = 0xB505
; 25080 = 0x61F8
; Packed little-endian halfwords: d0[0]=0xEC83 (x_c1), d0[1]=0xB505 (x_c2),
; d0[2]=0x61F8 (x_c3), d0[3]=0.
DCD 0xB505EC83, 0x000061F8
END
......@@ -11,134 +11,211 @@
; NOTE(review): everything below belongs to the diff hunk marked
; '@@ -11,134 +11,211 @@' above, rendered WITHOUT its +/- markers.
; It appears to interleave the REMOVED implementation of
; vp8_short_fdct4x4_neon (a two-stage 16x16 matrix multiply against
; _dct_matrix_, which is not defined in this hunk) with the ADDED
; implementation (butterfly + vmlal/vmlsl against the 'coeff' table).
; As flattened text it is NOT assemblable; consult the full post-commit
; file for the real source. Boundary notes are hedged guesses from the
; usual removed-then-added diff ordering -- verify against the commit.
EXPORT |vp8_short_fdct4x4_neon|
EXPORT |vp8_short_fdct8x4_neon|
ARM
REQUIRE8
PRESERVE8
; NOTE(review): two AREA lines follow -- presumably ALIGN=4 is the removed
; line and ALIGN=2 the added one.
AREA ||.text||, CODE, READONLY, ALIGN=4
AREA ||.text||, CODE, READONLY, ALIGN=2
ALIGN 16 ; enable use of @128 bit aligned loads
coeff
DCW 5352, 5352, 5352, 5352
DCW 2217, 2217, 2217, 2217
DCD 14500, 14500, 14500, 14500
DCD 7500, 7500, 7500, 7500
DCD 12000, 12000, 12000, 12000
DCD 51000, 51000, 51000, 51000
; r0 short *input
; r1 short *output
; r2 int pitch
; Input has a pitch, output is contiguous
;void vp8_short_fdct4x4_c(short *input, short *output, int pitch)
|vp8_short_fdct4x4_neon| PROC
; NOTE(review): from here down to the 'vst1.16 {q0, q1}, [r1]' before
; '; Part one' appears to be the REMOVED body (matrix-multiply version,
; rounding with vrshrn #14 / vrshr #16).
ldr r12, _dct_matrix_
vld1.16 d0, [r0], r2
vld1.16 d1, [r0], r2
vld1.16 d2, [r0], r2
vld1.16 d3, [r0]
vld1.16 {q2, q3}, [r12]
;first stage
vmull.s16 q11, d4, d0[0] ;i=0
vmull.s16 q12, d4, d1[0] ;i=1
vmull.s16 q13, d4, d2[0] ;i=2
vmull.s16 q14, d4, d3[0] ;i=3
vmlal.s16 q11, d5, d0[1]
vmlal.s16 q12, d5, d1[1]
vmlal.s16 q13, d5, d2[1]
vmlal.s16 q14, d5, d3[1]
vmlal.s16 q11, d6, d0[2]
vmlal.s16 q12, d6, d1[2]
vmlal.s16 q13, d6, d2[2]
vmlal.s16 q14, d6, d3[2]
vmlal.s16 q11, d7, d0[3] ;sumtemp for i=0
vmlal.s16 q12, d7, d1[3] ;sumtemp for i=1
vmlal.s16 q13, d7, d2[3] ;sumtemp for i=2
vmlal.s16 q14, d7, d3[3] ;sumtemp for i=3
; rounding
vrshrn.i32 d22, q11, #14
vrshrn.i32 d24, q12, #14
vrshrn.i32 d26, q13, #14
vrshrn.i32 d28, q14, #14
;second stage
vmull.s16 q4, d22, d4[0] ;i=0
vmull.s16 q5, d22, d4[1] ;i=1
vmull.s16 q6, d22, d4[2] ;i=2
vmull.s16 q7, d22, d4[3] ;i=3
vmlal.s16 q4, d24, d5[0]
vmlal.s16 q5, d24, d5[1]
vmlal.s16 q6, d24, d5[2]
vmlal.s16 q7, d24, d5[3]
vmlal.s16 q4, d26, d6[0]
vmlal.s16 q5, d26, d6[1]
vmlal.s16 q6, d26, d6[2]
vmlal.s16 q7, d26, d6[3]
vmlal.s16 q4, d28, d7[0] ;sumtemp for i=0
vmlal.s16 q5, d28, d7[1] ;sumtemp for i=1
vmlal.s16 q6, d28, d7[2] ;sumtemp for i=2
vmlal.s16 q7, d28, d7[3] ;sumtemp for i=3
vrshr.s32 q0, q4, #16
vrshr.s32 q1, q5, #16
vrshr.s32 q2, q6, #16
vrshr.s32 q3, q7, #16
vmovn.i32 d0, q0
vmovn.i32 d1, q1
vmovn.i32 d2, q2
vmovn.i32 d3, q3
vst1.16 {q0, q1}, [r1]
; NOTE(review): from '; Part one' below appears to be the ADDED body:
; row-pass butterflies with (x<<3) scaling, vmlal/vmlsl against the
; 5352/2217 constants, then the column pass with +7 DC rounding and the
; op[4] += (d1!=0) correction, matching the current C short_fdct4x4.
; Part one
vld1.16 {d0}, [r0@64], r2
adr r12, coeff
vld1.16 {d1}, [r0@64], r2
vld1.16 {q8}, [r12@128]! ; d16=5352, d17=2217
vld1.16 {d2}, [r0@64], r2
vld1.32 {q9, q10}, [r12@128]! ; q9=14500, q10=7500
vld1.16 {d3}, [r0@64], r2
; transpose d0=ip[0], d1=ip[1], d2=ip[2], d3=ip[3]
vtrn.32 d0, d2
vtrn.32 d1, d3
vld1.32 {q11,q12}, [r12@128] ; q11=12000, q12=51000
vtrn.16 d0, d1
vtrn.16 d2, d3
vadd.s16 d4, d0, d3 ; a1 = ip[0] + ip[3]
vadd.s16 d5, d1, d2 ; b1 = ip[1] + ip[2]
vsub.s16 d6, d1, d2 ; c1 = ip[1] - ip[2]
vsub.s16 d7, d0, d3 ; d1 = ip[0] - ip[3]
vshl.s16 q2, q2, #3 ; (a1, b1) << 3
vshl.s16 q3, q3, #3 ; (c1, d1) << 3
vadd.s16 d0, d4, d5 ; op[0] = a1 + b1
vsub.s16 d2, d4, d5 ; op[2] = a1 - b1
vmlal.s16 q9, d7, d16 ; d1*5352 + 14500
vmlal.s16 q10, d7, d17 ; d1*2217 + 7500
vmlal.s16 q9, d6, d17 ; c1*2217 + d1*5352 + 14500
vmlsl.s16 q10, d6, d16 ; d1*2217 - c1*5352 + 7500
vshrn.s32 d1, q9, #12 ; op[1] = (c1*2217 + d1*5352 + 14500)>>12
vshrn.s32 d3, q10, #12 ; op[3] = (d1*2217 - c1*5352 + 7500)>>12
; Part two
; transpose d0=ip[0], d1=ip[4], d2=ip[8], d3=ip[12]
vtrn.32 d0, d2
vtrn.32 d1, d3
vtrn.16 d0, d1
vtrn.16 d2, d3
vmov.s16 d26, #7
vadd.s16 d4, d0, d3 ; a1 = ip[0] + ip[12]
vadd.s16 d5, d1, d2 ; b1 = ip[4] + ip[8]
vsub.s16 d6, d1, d2 ; c1 = ip[4] - ip[8]
vadd.s16 d4, d4, d26 ; a1 + 7
vsub.s16 d7, d0, d3 ; d1 = ip[0] - ip[12]
vadd.s16 d0, d4, d5 ; op[0] = a1 + b1 + 7
vsub.s16 d2, d4, d5 ; op[8] = a1 - b1 + 7
vmlal.s16 q11, d7, d16 ; d1*5352 + 12000
vmlal.s16 q12, d7, d17 ; d1*2217 + 51000
vceq.s16 d4, d7, #0
vshr.s16 d0, d0, #4
vshr.s16 d2, d2, #4
vmlal.s16 q11, d6, d17 ; c1*2217 + d1*5352 + 12000
vmlsl.s16 q12, d6, d16 ; d1*2217 - c1*5352 + 51000
vmvn.s16 d4, d4
vshrn.s32 d1, q11, #16 ; op[4] = (c1*2217 + d1*5352 + 12000)>>16
vsub.s16 d1, d1, d4 ; op[4] += (d1!=0)
vshrn.s32 d3, q12, #16 ; op[12]= (d1*2217 - c1*5352 + 51000)>>16
vst1.16 {q0, q1}, [r1@128]
bx lr
ENDP
; r0 short *input
; r1 short *output
; r2 int pitch
;void vp8_short_fdct8x4_c(short *input, short *output, int pitch)
|vp8_short_fdct8x4_neon| PROC
; Store link register and input before calling
; first 4x4 fdct. Do not need to worry about
; output or pitch because those pointers are not
; touched in the 4x4 fdct function
stmdb sp!, {r0, lr}
bl vp8_short_fdct4x4_neon
; Part one
vld1.16 {q0}, [r0@128], r2
adr r12, coeff
vld1.16 {q1}, [r0@128], r2
vld1.16 {q8}, [r12@128]! ; d16=5352, d17=2217
vld1.16 {q2}, [r0@128], r2
vld1.32 {q9, q10}, [r12@128]! ; q9=14500, q10=7500
vld1.16 {q3}, [r0@128], r2
; transpose q0=ip[0], q1=ip[1], q2=ip[2], q3=ip[3]
vtrn.32 q0, q2 ; [A0|B0]
vtrn.32 q1, q3 ; [A1|B1]
vtrn.16 q0, q1 ; [A2|B2]
vtrn.16 q2, q3 ; [A3|B3]
ldmia sp!, {r0, lr}
vadd.s16 q11, q0, q3 ; a1 = ip[0] + ip[3]
vadd.s16 q12, q1, q2 ; b1 = ip[1] + ip[2]
vsub.s16 q13, q1, q2 ; c1 = ip[1] - ip[2]
vsub.s16 q14, q0, q3 ; d1 = ip[0] - ip[3]
; Move to the next block of data.
add r0, r0, #8
add r1, r1, #32
vshl.s16 q11, q11, #3 ; a1 << 3
vshl.s16 q12, q12, #3 ; b1 << 3
vshl.s16 q13, q13, #3 ; c1 << 3
vshl.s16 q14, q14, #3 ; d1 << 3
; Second time through do not store off the
; link register, just return from the 4x4 fdtc
b vp8_short_fdct4x4_neon
vadd.s16 q0, q11, q12 ; [A0 | B0] = a1 + b1
vsub.s16 q2, q11, q12 ; [A2 | B2] = a1 - b1
vmov.s16 q11, q9 ; 14500
vmov.s16 q12, q10 ; 7500
vmlal.s16 q9, d28, d16 ; A[1] = d1*5352 + 14500
vmlal.s16 q10, d28, d17 ; A[3] = d1*2217 + 7500
vmlal.s16 q11, d29, d16 ; B[1] = d1*5352 + 14500
vmlal.s16 q12, d29, d17 ; B[3] = d1*2217 + 7500
vmlal.s16 q9, d26, d17 ; A[1] = c1*2217 + d1*5352 + 14500
vmlsl.s16 q10, d26, d16 ; A[3] = d1*2217 - c1*5352 + 7500
vmlal.s16 q11, d27, d17 ; B[1] = c1*2217 + d1*5352 + 14500
vmlsl.s16 q12, d27, d16 ; B[3] = d1*2217 - c1*5352 + 7500
vshrn.s32 d2, q9, #12 ; A[1] = (c1*2217 + d1*5352 + 14500)>>12
vshrn.s32 d6, q10, #12 ; A[3] = (d1*2217 - c1*5352 + 7500)>>12
vshrn.s32 d3, q11, #12 ; B[1] = (c1*2217 + d1*5352 + 14500)>>12
vshrn.s32 d7, q12, #12 ; B[3] = (d1*2217 - c1*5352 + 7500)>>12
; Part two
vld1.32 {q9,q10}, [r12@128] ; q9=12000, q10=51000
; transpose q0=ip[0], q1=ip[4], q2=ip[8], q3=ip[12]
vtrn.32 q0, q2 ; q0=[A0 | B0]
vtrn.32 q1, q3 ; q1=[A4 | B4]
vtrn.16 q0, q1 ; q2=[A8 | B8]
vtrn.16 q2, q3 ; q3=[A12|B12]
vmov.s16 q15, #7
vadd.s16 q11, q0, q3 ; a1 = ip[0] + ip[12]
vadd.s16 q12, q1, q2 ; b1 = ip[4] + ip[8]
vadd.s16 q11, q11, q15 ; a1 + 7
vsub.s16 q13, q1, q2 ; c1 = ip[4] - ip[8]
vsub.s16 q14, q0, q3 ; d1 = ip[0] - ip[12]
vadd.s16 q0, q11, q12 ; a1 + b1 + 7
vsub.s16 q1, q11, q12 ; a1 - b1 + 7
vmov.s16 q11, q9 ; 12000
vmov.s16 q12, q10 ; 51000
vshr.s16 d0, d0, #4 ; A[0] = (a1 + b1 + 7)>>4
vshr.s16 d4, d1, #4 ; B[0] = (a1 + b1 + 7)>>4
vshr.s16 d2, d2, #4 ; A[8] = (a1 + b1 + 7)>>4
vshr.s16 d6, d3, #4 ; B[8] = (a1 + b1 + 7)>>4
vmlal.s16 q9, d28, d16 ; A[4] = d1*5352 + 12000