Commit 33df6d1f authored by Yunqing Wang
Browse files

Save NEON registers in VP8 NEON functions

Recent compilers can generate optimized code that uses NEON registers
for various operations besides floating-point operations. Therefore,
saving the callee-saved registers d8 - d15 only once, at the beginning of
the encoder/decoder, is no longer enough. This patch adds register-saving
code to the VP8 NEON functions that use those registers.

Change-Id: Ie9e44f5188cf410990c8aaaac68faceee9dffd31
parent 5ba44e37
......@@ -26,6 +26,7 @@
|vp8_build_intra_predictors_mby_neon_func| PROC
push {r4-r8, lr}
vpush {d8-d15}
cmp r3, #0
beq case_dc_pred
......@@ -37,8 +38,8 @@
beq case_tm_pred
case_dc_pred
ldr r4, [sp, #24] ; Up
ldr r5, [sp, #28] ; Left
ldr r4, [sp, #88] ; Up
ldr r5, [sp, #92] ; Left
; Default the DC average to 128
mov r12, #128
......@@ -143,6 +144,7 @@ skip_dc_pred_up_left
vst1.u8 {q0}, [r1]!
vst1.u8 {q0}, [r1]!
vpop {d8-d15}
pop {r4-r8,pc}
case_v_pred
; Copy down above row
......@@ -165,6 +167,7 @@ case_v_pred
vst1.u8 {q0}, [r1]!
vst1.u8 {q0}, [r1]!
vst1.u8 {q0}, [r1]!
vpop {d8-d15}
pop {r4-r8,pc}
case_h_pred
......@@ -224,6 +227,7 @@ case_h_pred
vst1.u8 {q2}, [r1]!
vst1.u8 {q3}, [r1]!
vpop {d8-d15}
pop {r4-r8,pc}
case_tm_pred
......@@ -293,6 +297,7 @@ case_tm_pred_loop
subs r12, r12, #1
bne case_tm_pred_loop
vpop {d8-d15}
pop {r4-r8,pc}
ENDP
......@@ -307,6 +312,7 @@ case_tm_pred_loop
|vp8_build_intra_predictors_mby_s_neon_func| PROC
push {r4-r8, lr}
vpush {d8-d15}
mov r1, r0 ; unsigned char *ypred_ptr = x->dst.y_buffer; //x->Predictor;
......@@ -320,8 +326,8 @@ case_tm_pred_loop
beq case_tm_pred_s
case_dc_pred_s
ldr r4, [sp, #24] ; Up
ldr r5, [sp, #28] ; Left
ldr r4, [sp, #88] ; Up
ldr r5, [sp, #92] ; Left
; Default the DC average to 128
mov r12, #128
......@@ -426,6 +432,7 @@ skip_dc_pred_up_left_s
vst1.u8 {q0}, [r1], r2
vst1.u8 {q0}, [r1], r2
vpop {d8-d15}
pop {r4-r8,pc}
case_v_pred_s
; Copy down above row
......@@ -448,6 +455,8 @@ case_v_pred_s
vst1.u8 {q0}, [r1], r2
vst1.u8 {q0}, [r1], r2
vst1.u8 {q0}, [r1], r2
vpop {d8-d15}
pop {r4-r8,pc}
case_h_pred_s
......@@ -507,6 +516,7 @@ case_h_pred_s
vst1.u8 {q2}, [r1], r2
vst1.u8 {q3}, [r1], r2
vpop {d8-d15}
pop {r4-r8,pc}
case_tm_pred_s
......@@ -576,6 +586,7 @@ case_tm_pred_loop_s
subs r12, r12, #1
bne case_tm_pred_loop_s
vpop {d8-d15}
pop {r4-r8,pc}
ENDP
......
......@@ -22,6 +22,7 @@
; r3 stride
|idct_dequant_0_2x_neon| PROC
push {r4, r5}
vpush {d8-d15}
add r12, r2, #4
vld1.32 {d2[0]}, [r2], r3
......@@ -72,6 +73,7 @@
vst1.32 {d4[1]}, [r2]
vst1.32 {d10[1]}, [r0]
vpop {d8-d15}
pop {r4, r5}
bx lr
......
......@@ -22,6 +22,8 @@
; r2 *dst
; r3 stride
|idct_dequant_full_2x_neon| PROC
vpush {d8-d15}
vld1.16 {q0, q1}, [r1] ; dq (same l/r)
vld1.16 {q2, q3}, [r0] ; l q
add r0, r0, #32
......@@ -184,6 +186,7 @@
vst1.32 {d3[0]}, [r2]
vst1.32 {d3[1]}, [r1]
vpop {d8-d15}
bx lr
ENDP ; |idct_dequant_full_2x_neon|
......
......@@ -24,10 +24,12 @@
; sp unsigned char thresh,
|vp8_loop_filter_horizontal_edge_y_neon| PROC
push {lr}
vpush {d8-d15}
vdup.u8 q0, r2 ; duplicate blimit
vdup.u8 q1, r3 ; duplicate limit
sub r2, r0, r1, lsl #2 ; move src pointer down by 4 lines
ldr r3, [sp, #4] ; load thresh
ldr r3, [sp, #68] ; load thresh
add r12, r2, r1
add r1, r1, r1
......@@ -52,6 +54,7 @@
vst1.u8 {q7}, [r2@128], r1 ; store oq0
vst1.u8 {q8}, [r12@128], r1 ; store oq1
vpop {d8-d15}
pop {pc}
ENDP ; |vp8_loop_filter_horizontal_edge_y_neon|
......@@ -64,10 +67,12 @@
; sp+4 unsigned char *v
|vp8_loop_filter_horizontal_edge_uv_neon| PROC
push {lr}
vpush {d8-d15}
vdup.u8 q0, r2 ; duplicate blimit
vdup.u8 q1, r3 ; duplicate limit
ldr r12, [sp, #4] ; load thresh
ldr r2, [sp, #8] ; load v ptr
ldr r12, [sp, #68] ; load thresh
ldr r2, [sp, #72] ; load v ptr
vdup.u8 q2, r12 ; duplicate thresh
sub r3, r0, r1, lsl #2 ; move u pointer down by 4 lines
......@@ -104,6 +109,7 @@
vst1.u8 {d16}, [r0@64] ; store u oq1
vst1.u8 {d17}, [r2@64] ; store v oq1
vpop {d8-d15}
pop {pc}
ENDP ; |vp8_loop_filter_horizontal_edge_uv_neon|
......@@ -120,11 +126,13 @@
|vp8_loop_filter_vertical_edge_y_neon| PROC
push {lr}
vpush {d8-d15}
vdup.u8 q0, r2 ; duplicate blimit
vdup.u8 q1, r3 ; duplicate limit
sub r2, r0, #4 ; src ptr down by 4 columns
add r1, r1, r1
ldr r3, [sp, #4] ; load thresh
ldr r3, [sp, #68] ; load thresh
add r12, r2, r1, asr #1
vld1.u8 {d6}, [r2], r1
......@@ -194,6 +202,7 @@
vst4.8 {d14[6], d15[6], d16[6], d17[6]}, [r0]
vst4.8 {d14[7], d15[7], d16[7], d17[7]}, [r12]
vpop {d8-d15}
pop {pc}
ENDP ; |vp8_loop_filter_vertical_edge_y_neon|
......@@ -210,9 +219,11 @@
; sp+4 unsigned char *v
|vp8_loop_filter_vertical_edge_uv_neon| PROC
push {lr}
vpush {d8-d15}
vdup.u8 q0, r2 ; duplicate blimit
sub r12, r0, #4 ; move u pointer down by 4 columns
ldr r2, [sp, #8] ; load v ptr
ldr r2, [sp, #72] ; load v ptr
vdup.u8 q1, r3 ; duplicate limit
sub r3, r2, #4 ; move v pointer down by 4 columns
......@@ -233,7 +244,7 @@
vld1.u8 {d20}, [r12]
vld1.u8 {d21}, [r3]
ldr r12, [sp, #4] ; load thresh
ldr r12, [sp, #68] ; load thresh
;transpose to 8x16 matrix
vtrn.32 q3, q7
......@@ -281,6 +292,7 @@
vst4.8 {d10[7], d11[7], d12[7], d13[7]}, [r0]
vst4.8 {d14[7], d15[7], d16[7], d17[7]}, [r2]
vpop {d8-d15}
pop {pc}
ENDP ; |vp8_loop_filter_vertical_edge_uv_neon|
......
......@@ -9,7 +9,6 @@
;
;EXPORT |vp8_loop_filter_simple_horizontal_edge_neon|
EXPORT |vp8_loop_filter_bhs_neon|
EXPORT |vp8_loop_filter_mbhs_neon|
ARM
......@@ -22,7 +21,7 @@
; q1 limit, PRESERVE
|vp8_loop_filter_simple_horizontal_edge_neon| PROC
vpush {d8-d15}
sub r3, r0, r1, lsl #1 ; move src pointer down by 2 lines
vld1.u8 {q7}, [r0@128], r1 ; q0
......@@ -82,6 +81,7 @@
vst1.u8 {q6}, [r3@128] ; store op0
vst1.u8 {q7}, [r0@128] ; store oq0
vpop {d8-d15}
bx lr
ENDP ; |vp8_loop_filter_simple_horizontal_edge_neon|
......
......@@ -9,7 +9,6 @@
;
;EXPORT |vp8_loop_filter_simple_vertical_edge_neon|
EXPORT |vp8_loop_filter_bvs_neon|
EXPORT |vp8_loop_filter_mbvs_neon|
ARM
......@@ -22,6 +21,8 @@
; q1 limit, PRESERVE
|vp8_loop_filter_simple_vertical_edge_neon| PROC
vpush {d8-d15}
sub r0, r0, #2 ; move src pointer down by 2 columns
add r12, r1, r1
add r3, r0, r1
......@@ -120,6 +121,7 @@
vst2.8 {d14[6], d15[6]}, [r0], r12
vst2.8 {d14[7], d15[7]}, [r3]
vpop {d8-d15}
bx lr
ENDP ; |vp8_loop_filter_simple_vertical_edge_neon|
......
......@@ -28,8 +28,10 @@
; sp unsigned char thresh,
|vp8_mbloop_filter_horizontal_edge_y_neon| PROC
push {lr}
vpush {d8-d15}
add r1, r1, r1 ; double stride
ldr r12, [sp, #4] ; load thresh
ldr r12, [sp, #68] ; load thresh
sub r0, r0, r1, lsl #1 ; move src pointer down by 4 lines
vdup.u8 q2, r12 ; thresh
add r12, r0, r1, lsr #1 ; move src pointer up by 1 line
......@@ -55,6 +57,7 @@
vst1.u8 {q8}, [r12@128] ; store oq1
vst1.u8 {q9}, [r0@128] ; store oq2
vpop {d8-d15}
pop {pc}
ENDP ; |vp8_mbloop_filter_horizontal_edge_y_neon|
......@@ -72,10 +75,12 @@
|vp8_mbloop_filter_horizontal_edge_uv_neon| PROC
push {lr}
ldr r12, [sp, #4] ; load thresh
vpush {d8-d15}
ldr r12, [sp, #68] ; load thresh
sub r0, r0, r1, lsl #2 ; move u pointer down by 4 lines
vdup.u8 q2, r12 ; thresh
ldr r12, [sp, #8] ; load v ptr
ldr r12, [sp, #72] ; load v ptr
sub r12, r12, r1, lsl #2 ; move v pointer down by 4 lines
vld1.u8 {d6}, [r0@64], r1 ; p3
......@@ -116,6 +121,7 @@
vst1.u8 {d18}, [r0@64], r1 ; store u oq2
vst1.u8 {d19}, [r12@64], r1 ; store v oq2
vpop {d8-d15}
pop {pc}
ENDP ; |vp8_mbloop_filter_horizontal_edge_uv_neon|
......@@ -130,7 +136,9 @@
; sp unsigned char thresh,
|vp8_mbloop_filter_vertical_edge_y_neon| PROC
push {lr}
ldr r12, [sp, #4] ; load thresh
vpush {d8-d15}
ldr r12, [sp, #68] ; load thresh
sub r0, r0, #4 ; move src pointer down by 4 columns
vdup.s8 q2, r12 ; thresh
add r12, r0, r1, lsl #3 ; move src pointer down by 8 lines
......@@ -208,6 +216,7 @@
vst1.8 {d20}, [r0]
vst1.8 {d21}, [r12]
vpop {d8-d15}
pop {pc}
ENDP ; |vp8_mbloop_filter_vertical_edge_y_neon|
......@@ -224,10 +233,12 @@
; sp+4 unsigned char *v
|vp8_mbloop_filter_vertical_edge_uv_neon| PROC
push {lr}
ldr r12, [sp, #4] ; load thresh
vpush {d8-d15}
ldr r12, [sp, #68] ; load thresh
sub r0, r0, #4 ; move u pointer down by 4 columns
vdup.u8 q2, r12 ; thresh
ldr r12, [sp, #8] ; load v ptr
ldr r12, [sp, #72] ; load v ptr
sub r12, r12, #4 ; move v pointer down by 4 columns
vld1.u8 {d6}, [r0], r1 ;load u data
......@@ -303,6 +314,7 @@
vst1.8 {d20}, [r0]
vst1.8 {d21}, [r12]
vpop {d8-d15}
pop {pc}
ENDP ; |vp8_mbloop_filter_vertical_edge_uv_neon|
......
......@@ -24,6 +24,7 @@
; r3 int ref_stride
|vp8_sad16x16_neon| PROC
;;
vpush {d8-d15}
vld1.8 {q0}, [r0], r1
vld1.8 {q4}, [r2], r3
......@@ -132,6 +133,7 @@
vmov.32 r0, d0[0]
vpop {d8-d15}
bx lr
ENDP
......@@ -143,6 +145,8 @@
; unsigned char *ref_ptr,
; int ref_stride)
|vp8_sad16x8_neon| PROC
vpush {d8-d15}
vld1.8 {q0}, [r0], r1
vld1.8 {q4}, [r2], r3
......@@ -200,6 +204,7 @@
vmov.32 r0, d0[0]
vpop {d8-d15}
bx lr
ENDP
......
......@@ -25,6 +25,7 @@
; int ref_stride)
|vp8_sad8x8_neon| PROC
vpush {d8-d15}
vld1.8 {d0}, [r0], r1
vld1.8 {d8}, [r2], r3
......@@ -70,6 +71,7 @@
vmov.32 r0, d0[0]
vpop {d8-d15}
bx lr
ENDP
......@@ -82,6 +84,7 @@
; int ref_stride)
|vp8_sad8x16_neon| PROC
vpush {d8-d15}
vld1.8 {d0}, [r0], r1
vld1.8 {d8}, [r2], r3
......@@ -167,6 +170,7 @@
vmov.32 r0, d0[0]
vpop {d8-d15}
bx lr
ENDP
......@@ -179,6 +183,7 @@
; int ref_stride)
|vp8_sad4x4_neon| PROC
vpush {d8-d15}
vld1.8 {d0}, [r0], r1
vld1.8 {d8}, [r2], r3
......@@ -202,6 +207,7 @@
vpaddl.u32 d0, d1
vmov.32 r0, d0[0]
vpop {d8-d15}
bx lr
ENDP
......
......@@ -37,12 +37,14 @@
; result of the multiplication that is needed in IDCT.
|vp8_short_idct4x4llm_neon| PROC
vpush {d8-d15}
adr r12, idct_coeff
vld1.16 {q1, q2}, [r0]
vld1.16 {d0}, [r12]
vswp d3, d4 ;q2(vp[4] vp[12])
ldr r0, [sp] ; stride
ldr r0, [sp, #64] ; stride
vqdmulh.s16 q3, q2, d0[2]
vqdmulh.s16 q4, q2, d0[0]
......@@ -125,6 +127,7 @@
vst1.32 d2[0], [r3], r0
vst1.32 d2[1], [r3], r0
vpop {d8-d15}
bx lr
ENDP
......
......@@ -43,10 +43,11 @@ filter16_coeff
|vp8_sixtap_predict16x16_neon| PROC
push {r4-r5, lr}
vpush {d8-d15}
adr r12, filter16_coeff
ldr r4, [sp, #12] ;load parameters from stack
ldr r5, [sp, #16] ;load parameters from stack
ldr r4, [sp, #76] ;load parameters from stack
ldr r5, [sp, #80] ;load parameters from stack
cmp r2, #0 ;skip first_pass filter if xoffset=0
beq secondpass_filter16x16_only
......@@ -291,6 +292,8 @@ secondpass_inner_loop_neon
bne filt_blk2d_sp16x16_outloop_neon
add sp, sp, #336
vpop {d8-d15}
pop {r4-r5,pc}
;--------------------
......@@ -384,6 +387,7 @@ filt_blk2d_fpo16x16_loop_neon
bne filt_blk2d_fpo16x16_loop_neon
vpop {d8-d15}
pop {r4-r5,pc}
;--------------------
......@@ -482,6 +486,7 @@ secondpass_only_inner_loop_neon
bne filt_blk2d_spo16x16_outloop_neon
vpop {d8-d15}
pop {r4-r5,pc}
ENDP
......
......@@ -35,10 +35,11 @@ filter4_coeff
|vp8_sixtap_predict4x4_neon| PROC
push {r4, lr}
vpush {d8-d15}
adr r12, filter4_coeff
ldr r4, [sp, #8] ;load parameters from stack
ldr lr, [sp, #12] ;load parameters from stack
ldr r4, [sp, #72] ;load parameters from stack
ldr lr, [sp, #76] ;load parameters from stack
cmp r2, #0 ;skip first_pass filter if xoffset=0
beq secondpass_filter4x4_only
......@@ -261,6 +262,7 @@ filter4_coeff
vst1.32 {d4[0]}, [r1]
vst1.32 {d4[1]}, [r2]
vpop {d8-d15}
pop {r4, pc}
......@@ -348,6 +350,7 @@ firstpass_filter4x4_only
vst1.32 {d28[0]}, [r1]
vst1.32 {d28[1]}, [r2]
vpop {d8-d15}
pop {r4, pc}
......@@ -413,6 +416,7 @@ secondpass_filter4x4_only
vst1.32 {d4[0]}, [r1]
vst1.32 {d4[1]}, [r2]
vpop {d8-d15}
pop {r4, pc}
ENDP
......
......@@ -35,10 +35,11 @@ filter8_coeff
|vp8_sixtap_predict8x4_neon| PROC
push {r4-r5, lr}
vpush {d8-d15}
adr r12, filter8_coeff
ldr r4, [sp, #12] ;load parameters from stack
ldr r5, [sp, #16] ;load parameters from stack
ldr r4, [sp, #76] ;load parameters from stack
ldr r5, [sp, #80] ;load parameters from stack
cmp r2, #0 ;skip first_pass filter if xoffset=0
beq secondpass_filter8x4_only
......@@ -297,6 +298,8 @@ filter8_coeff
vst1.u8 {d9}, [r4], r5
add sp, sp, #32
vpop {d8-d15}
pop {r4-r5,pc}
;--------------------
......@@ -392,6 +395,7 @@ firstpass_filter8x4_only
vst1.u8 {d24}, [r4], r5
vst1.u8 {d25}, [r4], r5
vpop {d8-d15}
pop {r4-r5,pc}
;---------------------
......@@ -464,6 +468,7 @@ secondpass_filter8x4_only
vst1.u8 {d8}, [r4], r5
vst1.u8 {d9}, [r4], r5
vpop {d8-d15}
pop {r4-r5,pc}
ENDP
......
......@@ -35,11 +35,11 @@ filter8_coeff
|vp8_sixtap_predict8x8_neon| PROC
push {r4-r5, lr}
vpush {d8-d15}
adr r12, filter8_coeff
ldr r4, [sp, #12] ;load parameters from stack
ldr r5, [sp, #16] ;load parameters from stack
ldr r4, [sp, #76] ;load parameters from stack
ldr r5, [sp, #80] ;load parameters from stack
cmp r2, #0 ;skip first_pass filter if xoffset=0
beq secondpass_filter8x8_only
......@@ -324,6 +324,8 @@ filt_blk2d_sp8x8_loop_neon
bne filt_blk2d_sp8x8_loop_neon
add sp, sp, #64
vpop {d8-d15}
pop {r4-r5,pc}
;---------------------
......@@ -428,6 +430,7 @@ filt_blk2d_fpo8x8_loop_neon
bne filt_blk2d_fpo8x8_loop_neon
vpop {d8-d15}
pop {r4-r5,pc}
;---------------------
......@@ -515,6 +518,7 @@ filt_blk2d_spo8x8_loop_neon
bne filt_blk2d_spo8x8_loop_neon
vpop {d8-d15}
pop {r4-r5,pc}
ENDP
...