diff --git a/vp8/common/arm/neon/buildintrapredictorsmby_neon.asm b/vp8/common/arm/neon/buildintrapredictorsmby_neon.asm
index e3ea91fe6c0493dc8e5c510f34fe7ce77e598b53..a8730aa04ef70b5f1b6250567459d9e17f86e89c 100644
--- a/vp8/common/arm/neon/buildintrapredictorsmby_neon.asm
+++ b/vp8/common/arm/neon/buildintrapredictorsmby_neon.asm
@@ -26,6 +26,7 @@
 |vp8_build_intra_predictors_mby_neon_func| PROC
     push            {r4-r8, lr}
+    vpush           {d8-d15}
     cmp             r3, #0
     beq             case_dc_pred
@@ -37,8 +38,8 @@
     beq             case_tm_pred
 case_dc_pred
-    ldr             r4, [sp, #24]           ; Up
-    ldr             r5, [sp, #28]           ; Left
+    ldr             r4, [sp, #88]           ; Up
+    ldr             r5, [sp, #92]           ; Left
     ; Default the DC average to 128
     mov             r12, #128
@@ -143,6 +144,7 @@ skip_dc_pred_up_left
     vst1.u8         {q0}, [r1]!
     vst1.u8         {q0}, [r1]!
+    vpop            {d8-d15}
     pop             {r4-r8,pc}
 case_v_pred
     ; Copy down above row
@@ -165,6 +167,7 @@ case_v_pred
     vst1.u8         {q0}, [r1]!
     vst1.u8         {q0}, [r1]!
     vst1.u8         {q0}, [r1]!
+    vpop            {d8-d15}
     pop             {r4-r8,pc}
 case_h_pred
@@ -224,6 +227,7 @@ case_h_pred
     vst1.u8         {q2}, [r1]!
     vst1.u8         {q3}, [r1]!
+    vpop            {d8-d15}
     pop             {r4-r8,pc}
 case_tm_pred
@@ -293,6 +297,7 @@ case_tm_pred_loop
     subs            r12, r12, #1
     bne             case_tm_pred_loop
+    vpop            {d8-d15}
     pop             {r4-r8,pc}
     ENDP
@@ -307,6 +312,7 @@ case_tm_pred_loop
 |vp8_build_intra_predictors_mby_s_neon_func| PROC
     push            {r4-r8, lr}
+    vpush           {d8-d15}
     mov             r1, r0      ; unsigned char *ypred_ptr = x->dst.y_buffer; //x->Predictor;
@@ -320,8 +326,8 @@ case_tm_pred_loop
     beq             case_tm_pred_s
 case_dc_pred_s
-    ldr             r4, [sp, #24]           ; Up
-    ldr             r5, [sp, #28]           ; Left
+    ldr             r4, [sp, #88]           ; Up
+    ldr             r5, [sp, #92]           ; Left
     ; Default the DC average to 128
     mov             r12, #128
@@ -426,6 +432,7 @@ skip_dc_pred_up_left_s
     vst1.u8         {q0}, [r1], r2
     vst1.u8         {q0}, [r1], r2
+    vpop            {d8-d15}
     pop             {r4-r8,pc}
 case_v_pred_s
     ; Copy down above row
@@ -448,6 +455,8 @@ case_v_pred_s
     vst1.u8         {q0}, [r1], r2
     vst1.u8         {q0}, [r1], r2
     vst1.u8         {q0}, [r1], r2
+
+    vpop            {d8-d15}
     pop             {r4-r8,pc}
 case_h_pred_s
@@ -507,6 +516,7 @@ case_h_pred_s
     vst1.u8         {q2}, [r1], r2
     vst1.u8         {q3}, [r1], r2
+    vpop            {d8-d15}
     pop             {r4-r8,pc}
 case_tm_pred_s
@@ -576,6 +586,7 @@ case_tm_pred_loop_s
     subs            r12, r12, #1
     bne             case_tm_pred_loop_s
+    vpop            {d8-d15}
     pop             {r4-r8,pc}
     ENDP
diff --git a/vp8/common/arm/neon/idct_dequant_0_2x_neon.asm b/vp8/common/arm/neon/idct_dequant_0_2x_neon.asm
index 6c29c55860d899a502bcd5aac71c5dfe459bd659..3a3921081c4410cebc23e88777fad6de057a3e4a 100644
--- a/vp8/common/arm/neon/idct_dequant_0_2x_neon.asm
+++ b/vp8/common/arm/neon/idct_dequant_0_2x_neon.asm
@@ -22,6 +22,7 @@
 ; r3  stride
 |idct_dequant_0_2x_neon| PROC
     push            {r4, r5}
+    vpush           {d8-d15}
     add             r12, r2, #4
     vld1.32         {d2[0]}, [r2], r3
@@ -72,6 +73,7 @@
     vst1.32         {d4[1]}, [r2]
     vst1.32         {d10[1]}, [r0]
+    vpop            {d8-d15}
     pop             {r4, r5}
     bx              lr
diff --git a/vp8/common/arm/neon/idct_dequant_full_2x_neon.asm b/vp8/common/arm/neon/idct_dequant_full_2x_neon.asm
index d5dce63f6bd72b54d4a3d964a93dd485fccc06ed..8da0fa0b7ea535a825463de665a831f2948d41b6 100644
--- a/vp8/common/arm/neon/idct_dequant_full_2x_neon.asm
+++ b/vp8/common/arm/neon/idct_dequant_full_2x_neon.asm
@@ -22,6 +22,8 @@
 ; r2  *dst
 ; r3  stride
 |idct_dequant_full_2x_neon| PROC
+    vpush           {d8-d15}
+
     vld1.16         {q0, q1}, [r1]          ; dq (same l/r)
     vld1.16         {q2, q3}, [r0]          ; l q
     add             r0, r0, #32
@@ -184,6 +186,7 @@
     vst1.32         {d3[0]}, [r2]
     vst1.32         {d3[1]}, [r1]
+    vpop            {d8-d15}
     bx              lr
 ENDP            ; |idct_dequant_full_2x_neon|
diff --git a/vp8/common/arm/neon/loopfilter_neon.asm b/vp8/common/arm/neon/loopfilter_neon.asm
index e44be0a1e34d2199c20401aabc68315f2be2cb35..c4f09c7753bfc290b547671a8d81ba16b391f284 100644
--- a/vp8/common/arm/neon/loopfilter_neon.asm
+++ b/vp8/common/arm/neon/loopfilter_neon.asm
@@ -24,10 +24,12 @@
 ; sp    unsigned char thresh,
 |vp8_loop_filter_horizontal_edge_y_neon| PROC
     push            {lr}
+    vpush           {d8-d15}
+
     vdup.u8         q0, r2                  ; duplicate blimit
     vdup.u8         q1, r3                  ; duplicate limit
     sub             r2, r0, r1, lsl #2      ; move src pointer down by 4 lines
-    ldr             r3, [sp, #4]            ; load thresh
+    ldr             r3, [sp, #68]           ; load thresh
     add             r12, r2, r1
     add             r1, r1, r1
@@ -52,6 +54,7 @@
     vst1.u8         {q7}, [r2@128], r1      ; store oq0
     vst1.u8         {q8}, [r12@128], r1     ; store oq1
+    vpop            {d8-d15}
     pop             {pc}
     ENDP        ; |vp8_loop_filter_horizontal_edge_y_neon|
@@ -64,10 +67,12 @@
 ; sp+4  unsigned char *v
 |vp8_loop_filter_horizontal_edge_uv_neon| PROC
     push            {lr}
+    vpush           {d8-d15}
+
     vdup.u8         q0, r2                  ; duplicate blimit
     vdup.u8         q1, r3                  ; duplicate limit
-    ldr             r12, [sp, #4]           ; load thresh
-    ldr             r2, [sp, #8]            ; load v ptr
+    ldr             r12, [sp, #68]          ; load thresh
+    ldr             r2, [sp, #72]           ; load v ptr
     vdup.u8         q2, r12                 ; duplicate thresh
     sub             r3, r0, r1, lsl #2      ; move u pointer down by 4 lines
@@ -104,6 +109,7 @@
     vst1.u8         {d16}, [r0@64]          ; store u oq1
     vst1.u8         {d17}, [r2@64]          ; store v oq1
+    vpop            {d8-d15}
     pop             {pc}
     ENDP        ; |vp8_loop_filter_horizontal_edge_uv_neon|
@@ -120,11 +126,13 @@
 |vp8_loop_filter_vertical_edge_y_neon| PROC
     push            {lr}
+    vpush           {d8-d15}
+
     vdup.u8         q0, r2                  ; duplicate blimit
     vdup.u8         q1, r3                  ; duplicate limit
     sub             r2, r0, #4              ; src ptr down by 4 columns
     add             r1, r1, r1
-    ldr             r3, [sp, #4]            ; load thresh
+    ldr             r3, [sp, #68]           ; load thresh
     add             r12, r2, r1, asr #1
     vld1.u8         {d6}, [r2], r1
@@ -194,6 +202,7 @@
     vst4.8          {d14[6], d15[6], d16[6], d17[6]}, [r0]
     vst4.8          {d14[7], d15[7], d16[7], d17[7]}, [r12]
+    vpop            {d8-d15}
     pop             {pc}
     ENDP        ; |vp8_loop_filter_vertical_edge_y_neon|
@@ -210,9 +219,11 @@
 ; sp+4  unsigned char *v
 |vp8_loop_filter_vertical_edge_uv_neon| PROC
     push            {lr}
+    vpush           {d8-d15}
+
     vdup.u8         q0, r2                  ; duplicate blimit
     sub             r12, r0, #4             ; move u pointer down by 4 columns
-    ldr             r2, [sp, #8]            ; load v ptr
+    ldr             r2, [sp, #72]           ; load v ptr
     vdup.u8         q1, r3                  ; duplicate limit
     sub             r3, r2, #4              ; move v pointer down by 4 columns
@@ -233,7 +244,7 @@
     vld1.u8         {d20}, [r12]
     vld1.u8         {d21}, [r3]
-    ldr             r12, [sp, #4]           ; load thresh
+    ldr             r12, [sp, #68]          ; load thresh
     ;transpose to 8x16 matrix
     vtrn.32         q3, q7
@@ -281,6 +292,7 @@
     vst4.8          {d10[7], d11[7], d12[7], d13[7]}, [r0]
     vst4.8          {d14[7], d15[7], d16[7], d17[7]}, [r2]
+    vpop            {d8-d15}
     pop             {pc}
     ENDP        ; |vp8_loop_filter_vertical_edge_uv_neon|
diff --git a/vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.asm b/vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.asm
index adf848b9c347966ecd5205b8f9a8f0a4cd46f9c2..6eb06516de05555f1586aaacedd94515238c3ef2 100644
--- a/vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.asm
+++ b/vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.asm
@@ -9,7 +9,6 @@
 ;
-    ;EXPORT |vp8_loop_filter_simple_horizontal_edge_neon|
     EXPORT  |vp8_loop_filter_bhs_neon|
     EXPORT  |vp8_loop_filter_mbhs_neon|
     ARM
@@ -22,7 +21,7 @@
 ; q1    limit, PRESERVE
 |vp8_loop_filter_simple_horizontal_edge_neon| PROC
-
+    vpush           {d8-d15}
     sub             r3, r0, r1, lsl #1      ; move src pointer down by 2 lines
     vld1.u8         {q7}, [r0@128], r1      ; q0
@@ -82,6 +81,7 @@
     vst1.u8         {q6}, [r3@128]          ; store op0
     vst1.u8         {q7}, [r0@128]          ; store oq0
+    vpop            {d8-d15}
     bx              lr
     ENDP        ; |vp8_loop_filter_simple_horizontal_edge_neon|
diff --git a/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.asm b/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.asm
index e690df2f7de9d8e3e9cd502f78c24fd70c5c6241..78d13c895aa35b440f0ec3b42f1ac8bd7ad03445 100644
--- a/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.asm
+++ b/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.asm
@@ -9,7 +9,6 @@
 ;
-    ;EXPORT |vp8_loop_filter_simple_vertical_edge_neon|
     EXPORT  |vp8_loop_filter_bvs_neon|
     EXPORT  |vp8_loop_filter_mbvs_neon|
     ARM
@@ -22,6 +21,8 @@
 ; q1    limit, PRESERVE
 |vp8_loop_filter_simple_vertical_edge_neon| PROC
+    vpush           {d8-d15}
+
     sub             r0, r0, #2              ; move src pointer down by 2 columns
     add             r12, r1, r1
     add             r3, r0, r1
@@ -120,6 +121,7 @@
     vst2.8          {d14[6], d15[6]}, [r0], r12
     vst2.8          {d14[7], d15[7]}, [r3]
+    vpop            {d8-d15}
     bx              lr
     ENDP        ; |vp8_loop_filter_simple_vertical_edge_neon|
diff --git a/vp8/common/arm/neon/mbloopfilter_neon.asm b/vp8/common/arm/neon/mbloopfilter_neon.asm
index f41c156df8b27783c36ef81ba0f1cada5f666e2c..d200c30909d5fee1bad6ac5582b2b5507df7dd7d 100644
--- a/vp8/common/arm/neon/mbloopfilter_neon.asm
+++ b/vp8/common/arm/neon/mbloopfilter_neon.asm
@@ -28,8 +28,10 @@
 ; sp    unsigned char thresh,
 |vp8_mbloop_filter_horizontal_edge_y_neon| PROC
     push            {lr}
+    vpush           {d8-d15}
+
     add             r1, r1, r1              ; double stride
-    ldr             r12, [sp, #4]           ; load thresh
+    ldr             r12, [sp, #68]          ; load thresh
     sub             r0, r0, r1, lsl #1      ; move src pointer down by 4 lines
     vdup.u8         q2, r12                 ; thresh
     add             r12, r0, r1, lsr #1     ; move src pointer up by 1 line
@@ -55,6 +57,7 @@
     vst1.u8         {q8}, [r12@128]         ; store oq1
     vst1.u8         {q9}, [r0@128]          ; store oq2
+    vpop            {d8-d15}
     pop             {pc}
     ENDP        ; |vp8_mbloop_filter_horizontal_edge_y_neon|
@@ -72,10 +75,12 @@
 |vp8_mbloop_filter_horizontal_edge_uv_neon| PROC
     push            {lr}
-    ldr             r12, [sp, #4]           ; load thresh
+    vpush           {d8-d15}
+
+    ldr             r12, [sp, #68]          ; load thresh
     sub             r0, r0, r1, lsl #2      ; move u pointer down by 4 lines
     vdup.u8         q2, r12                 ; thresh
-    ldr             r12, [sp, #8]           ; load v ptr
+    ldr             r12, [sp, #72]          ; load v ptr
     sub             r12, r12, r1, lsl #2    ; move v pointer down by 4 lines
     vld1.u8         {d6}, [r0@64], r1       ; p3
@@ -116,6 +121,7 @@
     vst1.u8         {d18}, [r0@64], r1      ; store u oq2
     vst1.u8         {d19}, [r12@64], r1     ; store v oq2
+    vpop            {d8-d15}
     pop             {pc}
     ENDP        ; |vp8_mbloop_filter_horizontal_edge_uv_neon|
@@ -130,7 +136,9 @@
 ; sp    unsigned char thresh,
 |vp8_mbloop_filter_vertical_edge_y_neon| PROC
     push            {lr}
-    ldr             r12, [sp, #4]           ; load thresh
+    vpush           {d8-d15}
+
+    ldr             r12, [sp, #68]          ; load thresh
     sub             r0, r0, #4              ; move src pointer down by 4 columns
     vdup.s8         q2, r12                 ; thresh
     add             r12, r0, r1, lsl #3     ; move src pointer down by 8 lines
@@ -208,6 +216,7 @@
     vst1.8          {d20}, [r0]
     vst1.8          {d21}, [r12]
+    vpop            {d8-d15}
     pop             {pc}
     ENDP        ; |vp8_mbloop_filter_vertical_edge_y_neon|
@@ -224,10 +233,12 @@
 ; sp+4  unsigned char *v
 |vp8_mbloop_filter_vertical_edge_uv_neon| PROC
     push            {lr}
-    ldr             r12, [sp, #4]           ; load thresh
+    vpush           {d8-d15}
+
+    ldr             r12, [sp, #68]          ; load thresh
     sub             r0, r0, #4              ; move u pointer down by 4 columns
     vdup.u8         q2, r12                 ; thresh
-    ldr             r12, [sp, #8]           ; load v ptr
+    ldr             r12, [sp, #72]          ; load v ptr
     sub             r12, r12, #4            ; move v pointer down by 4 columns
     vld1.u8         {d6}, [r0], r1          ;load u data
@@ -303,6 +314,7 @@
     vst1.8          {d20}, [r0]
     vst1.8          {d21}, [r12]
+    vpop            {d8-d15}
     pop             {pc}
     ENDP        ; |vp8_mbloop_filter_vertical_edge_uv_neon|
diff --git a/vp8/common/arm/neon/sad16_neon.asm b/vp8/common/arm/neon/sad16_neon.asm
index d7c590e15a21fa33a70b78b684e1252fd447c937..7197e5655945b8cb8640333eb5db25e2c473b893 100644
--- a/vp8/common/arm/neon/sad16_neon.asm
+++ b/vp8/common/arm/neon/sad16_neon.asm
@@ -24,6 +24,7 @@
 ; r3    int  ref_stride
 |vp8_sad16x16_neon| PROC
 ;;
+    vpush           {d8-d15}
     vld1.8          {q0}, [r0], r1
     vld1.8          {q4}, [r2], r3
@@ -132,6 +133,7 @@
     vmov.32         r0, d0[0]
+    vpop            {d8-d15}
     bx              lr
     ENDP
@@ -143,6 +145,8 @@
 ;   unsigned char *ref_ptr,
 ;   int  ref_stride)
 |vp8_sad16x8_neon| PROC
+    vpush           {d8-d15}
+
     vld1.8          {q0}, [r0], r1
     vld1.8          {q4}, [r2], r3
@@ -200,6 +204,7 @@
     vmov.32         r0, d0[0]
+    vpop            {d8-d15}
     bx              lr
     ENDP
diff --git a/vp8/common/arm/neon/sad8_neon.asm b/vp8/common/arm/neon/sad8_neon.asm
index 23ba6df93a4dd8856e96396b0372cb646329d264..6b849d9338447875c73e9d87effba08bb0e8e42f 100644
--- a/vp8/common/arm/neon/sad8_neon.asm
+++ b/vp8/common/arm/neon/sad8_neon.asm
@@ -25,6 +25,7 @@
 ;   int  ref_stride)
 |vp8_sad8x8_neon| PROC
+    vpush           {d8-d15}
     vld1.8          {d0}, [r0], r1
     vld1.8          {d8}, [r2], r3
@@ -70,6 +71,7 @@
     vmov.32         r0, d0[0]
+    vpop            {d8-d15}
     bx              lr
     ENDP
@@ -82,6 +84,7 @@
 ;   int  ref_stride)
 |vp8_sad8x16_neon| PROC
+    vpush           {d8-d15}
     vld1.8          {d0}, [r0], r1
     vld1.8          {d8}, [r2], r3
@@ -167,6 +170,7 @@
     vmov.32         r0, d0[0]
+    vpop            {d8-d15}
     bx              lr
     ENDP
@@ -179,6 +183,7 @@
 ;   int  ref_stride)
 |vp8_sad4x4_neon| PROC
+    vpush           {d8-d15}
     vld1.8          {d0}, [r0], r1
     vld1.8          {d8}, [r2], r3
@@ -202,6 +207,7 @@
     vpaddl.u32      d0, d1
     vmov.32         r0, d0[0]
+    vpop            {d8-d15}
     bx              lr
     ENDP
diff --git a/vp8/common/arm/neon/shortidct4x4llm_neon.asm b/vp8/common/arm/neon/shortidct4x4llm_neon.asm
index 67d2ab0150ddeea8ba17318a4d6e35bcad412b1b..87ca887be1ed843b98fc077f22da94d40c83a1c3 100644
--- a/vp8/common/arm/neon/shortidct4x4llm_neon.asm
+++ b/vp8/common/arm/neon/shortidct4x4llm_neon.asm
@@ -37,12 +37,14 @@
 ; result of the multiplication that is needed in IDCT.
 |vp8_short_idct4x4llm_neon| PROC
+    vpush           {d8-d15}
+
     adr             r12, idct_coeff
     vld1.16         {q1, q2}, [r0]
     vld1.16         {d0}, [r12]
     vswp            d3, d4                  ;q2(vp[4] vp[12])
-    ldr             r0, [sp]                ; stride
+    ldr             r0, [sp, #64]           ; stride
     vqdmulh.s16     q3, q2, d0[2]
     vqdmulh.s16     q4, q2, d0[0]
@@ -125,6 +127,7 @@
     vst1.32         d2[0], [r3], r0
     vst1.32         d2[1], [r3], r0
+    vpop            {d8-d15}
     bx              lr
     ENDP
diff --git a/vp8/common/arm/neon/sixtappredict16x16_neon.asm b/vp8/common/arm/neon/sixtappredict16x16_neon.asm
index 9fdafd3609ed3d1d39a60b48536bbad4fd0fb83c..dd27719bf0855b0d3d0dfb36a52f5ced987436d4 100644
--- a/vp8/common/arm/neon/sixtappredict16x16_neon.asm
+++ b/vp8/common/arm/neon/sixtappredict16x16_neon.asm
@@ -43,10 +43,11 @@ filter16_coeff
 |vp8_sixtap_predict16x16_neon| PROC
     push            {r4-r5, lr}
+    vpush           {d8-d15}
     adr             r12, filter16_coeff
-    ldr             r4, [sp, #12]           ;load parameters from stack
-    ldr             r5, [sp, #16]           ;load parameters from stack
+    ldr             r4, [sp, #76]           ;load parameters from stack
+    ldr             r5, [sp, #80]           ;load parameters from stack
     cmp             r2, #0                  ;skip first_pass filter if xoffset=0
     beq             secondpass_filter16x16_only
@@ -291,6 +292,8 @@ secondpass_inner_loop_neon
     bne             filt_blk2d_sp16x16_outloop_neon
     add             sp, sp, #336
+
+    vpop            {d8-d15}
     pop             {r4-r5,pc}
 ;--------------------
@@ -384,6 +387,7 @@ filt_blk2d_fpo16x16_loop_neon
     bne             filt_blk2d_fpo16x16_loop_neon
+    vpop            {d8-d15}
     pop             {r4-r5,pc}
 ;--------------------
@@ -482,6 +486,7 @@ secondpass_only_inner_loop_neon
     bne             filt_blk2d_spo16x16_outloop_neon
+    vpop            {d8-d15}
     pop             {r4-r5,pc}
     ENDP
diff --git a/vp8/common/arm/neon/sixtappredict4x4_neon.asm b/vp8/common/arm/neon/sixtappredict4x4_neon.asm
index a4222bc62c54d750b1cfbe2ec2505962adad3f5f..e32e71305b9f573253586954c8f1826c07d158b5 100644
--- a/vp8/common/arm/neon/sixtappredict4x4_neon.asm
+++ b/vp8/common/arm/neon/sixtappredict4x4_neon.asm
@@ -35,10 +35,11 @@ filter4_coeff
 |vp8_sixtap_predict4x4_neon| PROC
     push            {r4, lr}
+    vpush           {d8-d15}
     adr             r12, filter4_coeff
-    ldr             r4, [sp, #8]            ;load parameters from stack
-    ldr             lr, [sp, #12]           ;load parameters from stack
+    ldr             r4, [sp, #72]           ;load parameters from stack
+    ldr             lr, [sp, #76]           ;load parameters from stack
     cmp             r2, #0                  ;skip first_pass filter if xoffset=0
     beq             secondpass_filter4x4_only
@@ -261,6 +262,7 @@ filter4_coeff
     vst1.32         {d4[0]}, [r1]
     vst1.32         {d4[1]}, [r2]
+    vpop            {d8-d15}
     pop             {r4, pc}
@@ -348,6 +350,7 @@ firstpass_filter4x4_only
     vst1.32         {d28[0]}, [r1]
     vst1.32         {d28[1]}, [r2]
+    vpop            {d8-d15}
     pop             {r4, pc}
@@ -413,6 +416,7 @@ secondpass_filter4x4_only
     vst1.32         {d4[0]}, [r1]
     vst1.32         {d4[1]}, [r2]
+    vpop            {d8-d15}
     pop             {r4, pc}
     ENDP
diff --git a/vp8/common/arm/neon/sixtappredict8x4_neon.asm b/vp8/common/arm/neon/sixtappredict8x4_neon.asm
index a57ec015f2c0c7404732cbeded169dd911b79a88..d19bf8920a3230425244dc1027b58dc3fb503a65 100644
--- a/vp8/common/arm/neon/sixtappredict8x4_neon.asm
+++ b/vp8/common/arm/neon/sixtappredict8x4_neon.asm
@@ -35,10 +35,11 @@ filter8_coeff
 |vp8_sixtap_predict8x4_neon| PROC
     push            {r4-r5, lr}
+    vpush           {d8-d15}
     adr             r12, filter8_coeff
-    ldr             r4, [sp, #12]           ;load parameters from stack
-    ldr             r5, [sp, #16]           ;load parameters from stack
+    ldr             r4, [sp, #76]           ;load parameters from stack
+    ldr             r5, [sp, #80]           ;load parameters from stack
     cmp             r2, #0                  ;skip first_pass filter if xoffset=0
     beq             secondpass_filter8x4_only
@@ -297,6 +298,8 @@ filter8_coeff
     vst1.u8         {d9}, [r4], r5
     add             sp, sp, #32
+
+    vpop            {d8-d15}
     pop             {r4-r5,pc}
 ;--------------------
@@ -392,6 +395,7 @@ firstpass_filter8x4_only
     vst1.u8         {d24}, [r4], r5
     vst1.u8         {d25}, [r4], r5
+    vpop            {d8-d15}
     pop             {r4-r5,pc}
 ;---------------------
@@ -464,6 +468,7 @@ secondpass_filter8x4_only
     vst1.u8         {d8}, [r4], r5
     vst1.u8         {d9}, [r4], r5
+    vpop            {d8-d15}
     pop             {r4-r5,pc}
     ENDP
diff --git a/vp8/common/arm/neon/sixtappredict8x8_neon.asm b/vp8/common/arm/neon/sixtappredict8x8_neon.asm
index 00ed5aeefe3c9cb4458cf15f8322eda756f4792f..4b049252c16c94b87ccdf47d1724914a9487bc99 100644
--- a/vp8/common/arm/neon/sixtappredict8x8_neon.asm
+++ b/vp8/common/arm/neon/sixtappredict8x8_neon.asm
@@ -35,11 +35,11 @@ filter8_coeff
 |vp8_sixtap_predict8x8_neon| PROC
     push            {r4-r5, lr}
-
+    vpush           {d8-d15}
     adr             r12, filter8_coeff
-    ldr             r4, [sp, #12]           ;load parameters from stack
-    ldr             r5, [sp, #16]           ;load parameters from stack
+    ldr             r4, [sp, #76]           ;load parameters from stack
+    ldr             r5, [sp, #80]           ;load parameters from stack
     cmp             r2, #0                  ;skip first_pass filter if xoffset=0
     beq             secondpass_filter8x8_only
@@ -324,6 +324,8 @@ filt_blk2d_sp8x8_loop_neon
     bne             filt_blk2d_sp8x8_loop_neon
     add             sp, sp, #64
+
+    vpop            {d8-d15}
     pop             {r4-r5,pc}
 ;---------------------
@@ -428,6 +430,7 @@ filt_blk2d_fpo8x8_loop_neon
     bne             filt_blk2d_fpo8x8_loop_neon
+    vpop            {d8-d15}
     pop             {r4-r5,pc}
 ;---------------------
@@ -515,6 +518,7 @@ filt_blk2d_spo8x8_loop_neon
     bne             filt_blk2d_spo8x8_loop_neon
+    vpop            {d8-d15}
     pop             {r4-r5,pc}
     ENDP
diff --git a/vp8/common/arm/neon/variance_neon.asm b/vp8/common/arm/neon/variance_neon.asm
index e3b48327d3f5d2a9eb85d35ddcc20ea07ef01d22..8ecad72b9def22a7b9c16fc768a43b6e758cbddd 100644
--- a/vp8/common/arm/neon/variance_neon.asm
+++ b/vp8/common/arm/neon/variance_neon.asm
@@ -26,6 +26,7 @@
 ; r3    int recon_stride
 ; stack unsigned int *sse
 |vp8_variance16x16_neon| PROC
+    vpush           {q5}
     vmov.i8         q8, #0                  ;q8 - sum
     vmov.i8         q9, #0                  ;q9, q10 - sse
     vmov.i8         q10, #0
@@ -67,7 +68,7 @@ variance16x16_neon_loop
     vadd.u32        q10, q9, q10            ;accumulate sse
     vpaddl.s32      q0, q8                  ;accumulate sum
-    ldr             r12, [sp]               ;load *sse from stack
+    ldr             r12, [sp, #16]          ;load *sse from stack
     vpaddl.u32      q1, q10
     vadd.s64        d0, d0, d1
@@ -87,6 +88,8 @@ variance16x16_neon_loop
     vsub.u32        d0, d1, d10
     vmov.32         r0, d0[0]               ;return
+
+    vpop            {q5}
     bx              lr
     ENDP
@@ -99,6 +102,8 @@ variance16x16_neon_loop
 ;   int recon_stride,
 ;   unsigned int *sse)
 |vp8_variance16x8_neon| PROC
+    vpush           {q5}
+
     vmov.i8         q8, #0                  ;q8 - sum
     vmov.i8         q9, #0                  ;q9, q10 - sse
     vmov.i8         q10, #0
@@ -137,7 +142,7 @@ variance16x8_neon_loop
     vadd.u32        q10, q9, q10            ;accumulate sse
     vpaddl.s32      q0, q8                  ;accumulate sum
-    ldr             r12, [sp]               ;load *sse from stack
+    ldr             r12, [sp, #16]          ;load *sse from stack
     vpaddl.u32      q1, q10
     vadd.s64        d0, d0, d1
@@ -149,6 +154,8 @@ variance16x8_neon_loop
     vsub.u32        d0, d1, d10
     vmov.32         r0, d0[0]               ;return
+
+    vpop            {q5}
     bx              lr
     ENDP
@@ -162,6 +169,8 @@ variance16x8_neon_loop
 ;   unsigned int *sse)
 |vp8_variance8x16_neon| PROC
+    vpush           {q5}
+
     vmov.i8         q8, #0                  ;q8 - sum
     vmov.i8         q9, #0                  ;q9, q10 - sse
     vmov.i8         q10, #0
@@ -192,7 +201,7 @@ variance8x16_neon_loop
     vadd.u32        q10, q9, q10            ;accumulate sse
     vpaddl.s32      q0, q8                  ;accumulate sum
-    ldr             r12, [sp]               ;load *sse from stack
+    ldr             r12, [sp, #16]          ;load *sse from stack
     vpaddl.u32      q1, q10
     vadd.s64        d0, d0, d1
@@ -204,6 +213,8 @@ variance8x16_neon_loop
     vsub.u32        d0, d1, d10
     vmov.32         r0, d0[0]               ;return
+
+    vpop            {q5}
     bx              lr
     ENDP
@@ -215,6 +226,8 @@ variance8x16_neon_loop
 ; r3    int recon_stride
 ; stack unsigned int *sse
 |vp8_variance8x8_neon| PROC
+    vpush           {q5}
+
     vmov.i8         q8, #0                  ;q8 - sum
     vmov.i8         q9, #0                  ;q9, q10 - sse
     vmov.i8         q10, #0
@@ -257,7 +270,7 @@ variance8x8_neon_loop
     vadd.u32        q10, q9, q10            ;accumulate sse
     vpaddl.s32      q0, q8                  ;accumulate sum
-    ldr             r12, [sp]               ;load *sse from stack
+    ldr             r12, [sp, #16]          ;load *sse from stack
     vpaddl.u32      q1, q10
     vadd.s64        d0, d0, d1
@@ -269,6 +282,8 @@ variance8x8_neon_loop
     vsub.u32        d0, d1, d10
     vmov.32         r0, d0[0]               ;return
+
+    vpop            {q5}
     bx              lr
     ENDP
diff --git a/vp8/common/arm/neon/vp8_subpixelvariance16x16_neon.asm b/vp8/common/arm/neon/vp8_subpixelvariance16x16_neon.asm
index 9d22c52521c747ab7ab9fb7a8abfda4854998bce..adc5b7e3a7816242d12852486c9fb20bfee840fe 100644
--- a/vp8/common/arm/neon/vp8_subpixelvariance16x16_neon.asm
+++ b/vp8/common/arm/neon/vp8_subpixelvariance16x16_neon.asm
@@ -31,11 +31,12 @@ bilinear_taps_coeff
 |vp8_sub_pixel_variance16x16_neon_func| PROC
     push            {r4-r6, lr}
+    vpush           {d8-d15}
     adr             r12, bilinear_taps_coeff
-    ldr             r4, [sp, #16]           ;load *dst_ptr from stack
-    ldr             r5, [sp, #20]           ;load dst_pixels_per_line from stack
-    ldr             r6, [sp, #24]           ;load *sse from stack
+    ldr             r4, [sp, #80]           ;load *dst_ptr from stack
+    ldr             r5, [sp, #84]           ;load dst_pixels_per_line from stack
+    ldr             r6, [sp, #88]           ;load *sse from stack
     cmp             r2, #0                  ;skip first_pass filter if xoffset=0
     beq             secondpass_bfilter16x16_only
@@ -416,6 +417,7 @@ sub_pixel_variance16x16_neon_loop
     add             sp, sp, #528
     vmov.32         r0, d0[0]               ;return
+    vpop            {d8-d15}
     pop             {r4-r6,pc}
     ENDP
diff --git a/vp8/common/arm/neon/vp8_subpixelvariance16x16s_neon.asm b/vp8/common/arm/neon/vp8_subpixelvariance16x16s_neon.asm
index 155be4fc54b41c0d603e85f98e5334fcf2a33909..b0829af7547be4280ba668462abf9491ab655e09 100644
--- a/vp8/common/arm/neon/vp8_subpixelvariance16x16s_neon.asm
+++ b/vp8/common/arm/neon/vp8_subpixelvariance16x16s_neon.asm
@@ -31,9 +31,10 @@
 ;================================================
 |vp8_variance_halfpixvar16x16_h_neon| PROC
     push            {lr}
+    vpush           {d8-d15}
     mov             r12, #4                 ;loop counter
-    ldr             lr, [sp, #4]            ;load *sse from stack
+    ldr             lr, [sp, #68]           ;load *sse from stack
     vmov.i8         q8, #0                  ;q8 - sum
     vmov.i8         q9, #0                  ;q9, q10 - sse
     vmov.i8         q10, #0
@@ -116,6 +117,8 @@ vp8_filt_fpo16x16s_4_0_loop_neon
     vsub.u32        d0, d1, d10
     vmov.32         r0, d0[0]               ;return
+
+    vpop            {d8-d15}
     pop             {pc}
     ENDP
@@ -131,11 +134,12 @@ vp8_filt_fpo16x16s_4_0_loop_neon
 ;================================================
 |vp8_variance_halfpixvar16x16_v_neon| PROC
     push            {lr}
+    vpush           {d8-d15}
     mov             r12, #4                 ;loop counter
     vld1.u8         {q0}, [r0], r1          ;load src data
-    ldr             lr, [sp, #4]            ;load *sse from stack
+    ldr             lr, [sp, #68]           ;load *sse from stack
     vmov.i8         q8, #0                  ;q8 - sum
     vmov.i8         q9, #0                  ;q9, q10 - sse
@@ -212,6 +216,8 @@ vp8_filt_spo16x16s_0_4_loop_neon
     vsub.u32        d0, d1, d10
     vmov.32         r0, d0[0]               ;return
+
+    vpop            {d8-d15}
     pop             {pc}
     ENDP
@@ -227,10 +233,11 @@ vp8_filt_spo16x16s_0_4_loop_neon
 ;================================================
 |vp8_variance_halfpixvar16x16_hv_neon| PROC
     push            {lr}
+    vpush           {d8-d15}
     vld1.u8         {d0, d1, d2, d3}, [r0], r1  ;load src data
-    ldr             lr, [sp, #4]            ;load *sse from stack
+    ldr             lr, [sp, #68]           ;load *sse from stack
     vmov.i8         q13, #0                 ;q8 - sum
     vext.8          q1, q0, q1, #1          ;construct src_ptr[1]
@@ -331,6 +338,8 @@ vp8_filt16x16s_4_4_loop_neon
     vsub.u32        d0, d1, d10
     vmov.32         r0, d0[0]               ;return
+
+    vpop            {d8-d15}
     pop             {pc}
     ENDP
@@ -349,10 +358,11 @@ vp8_filt16x16s_4_4_loop_neon
 |vp8_sub_pixel_variance16x16s_neon| PROC
     push            {r4, lr}
+    vpush           {d8-d15}
 
-    ldr             r4, [sp, #8]            ;load *dst_ptr from stack
-    ldr             r12, [sp, #12]          ;load dst_pixels_per_line from stack
-    ldr             lr, [sp, #16]           ;load *sse from stack
+    ldr             r4, [sp, #72]           ;load *dst_ptr from stack
+    ldr             r12, [sp, #76]          ;load dst_pixels_per_line from stack
+    ldr             lr, [sp, #80]           ;load *sse from stack
     cmp             r2, #0                  ;skip first_pass filter if xoffset=0
     beq             secondpass_bfilter16x16s_only
@@ -566,6 +576,7 @@ sub_pixel_variance16x16s_neon_loop
     add             sp, sp, #256
     vmov.32         r0, d0[0]               ;return
+    vpop            {d8-d15}
     pop             {r4, pc}
     ENDP
diff --git a/vp8/common/arm/neon/vp8_subpixelvariance8x8_neon.asm b/vp8/common/arm/neon/vp8_subpixelvariance8x8_neon.asm
index f6b6847537f8e8025125b1e524845e4bd4377e06..9d9f9e0772a17b58e09ce5d68dbcebd84b45f7d1 100644
--- a/vp8/common/arm/neon/vp8_subpixelvariance8x8_neon.asm
+++ b/vp8/common/arm/neon/vp8_subpixelvariance8x8_neon.asm
@@ -26,11 +26,12 @@
 |vp8_sub_pixel_variance8x8_neon| PROC
     push            {r4-r5, lr}
+    vpush           {d8-d15}
     adr             r12, bilinear_taps_coeff
-    ldr             r4, [sp, #12]           ;load *dst_ptr from stack
-    ldr             r5, [sp, #16]           ;load dst_pixels_per_line from stack
-    ldr             lr, [sp, #20]           ;load *sse from stack
+    ldr             r4, [sp, #76]           ;load *dst_ptr from stack
+    ldr             r5, [sp, #80]           ;load dst_pixels_per_line from stack
+    ldr             lr, [sp, #84]           ;load *sse from stack
     cmp             r2, #0                  ;skip first_pass filter if xoffset=0
     beq             skip_firstpass_filter
@@ -210,6 +211,8 @@ sub_pixel_variance8x8_neon_loop
     vsub.u32        d0, d1, d10
     vmov.32         r0, d0[0]               ;return
+
+    vpop            {d8-d15}
     pop             {r4-r5, pc}
     ENDP
diff --git a/vp8/encoder/arm/neon/subtract_neon.asm b/vp8/encoder/arm/neon/subtract_neon.asm
index 5bda78678db9fe3098d4e425d73611ddbfb21533..840cb33d95723b33905e3b4b0eb74df557fb756a 100644
--- a/vp8/encoder/arm/neon/subtract_neon.asm
+++ b/vp8/encoder/arm/neon/subtract_neon.asm
@@ -65,8 +65,10 @@
 ;                         unsigned char *pred, int pred_stride)
 |vp8_subtract_mby_neon| PROC
     push            {r4-r7}
+    vpush           {d8-d15}
+
     mov             r12, #4
-    ldr             r4, [sp, #16]           ; pred_stride
+    ldr             r4, [sp, #80]           ; pred_stride
     mov             r6, #32                 ; "diff" stride x2
     add             r5, r0, #16             ; second diff pointer
@@ -101,6 +103,7 @@ subtract_mby_loop
     subs            r12, r12, #1
     bne             subtract_mby_loop
+    vpop            {d8-d15}
     pop             {r4-r7}
     bx              lr
     ENDP
@@ -112,9 +115,11 @@ subtract_mby_loop
 |vp8_subtract_mbuv_neon| PROC
     push            {r4-r7}
-    ldr             r4, [sp, #16]           ; upred
-    ldr             r5, [sp, #20]           ; vpred
-    ldr             r6, [sp, #24]           ; pred_stride
+    vpush           {d8-d15}
+
+    ldr             r4, [sp, #80]           ; upred
+    ldr             r5, [sp, #84]           ; vpred
+    ldr             r6, [sp, #88]           ; pred_stride
     add             r0, r0, #512            ; short *udiff = diff + 256;
     mov             r12, #32                ; "diff" stride x2
     add             r7, r0, #16             ; second diff pointer
@@ -191,6 +196,7 @@ subtract_mby_loop
     vst1.16         {q14}, [r0], r12
     vst1.16         {q15}, [r7], r12
+    vpop            {d8-d15}
     pop             {r4-r7}
     bx              lr
diff --git a/vp8/encoder/arm/neon/vp8_memcpy_neon.asm b/vp8/encoder/arm/neon/vp8_memcpy_neon.asm
index 5b9f11e59352f1a1f6f658a39d03c75489d4e46b..d219e2d14248375b14d69af2ee9b5ac092ed6d51 100644
--- a/vp8/encoder/arm/neon/vp8_memcpy_neon.asm
+++ b/vp8/encoder/arm/neon/vp8_memcpy_neon.asm
@@ -21,6 +21,7 @@
 ;void vp8_memcpy_partial_neon(unsigned char *dst_ptr, unsigned char *src_ptr,
 ;                             int sz);
 |vp8_memcpy_partial_neon| PROC
+    vpush           {d8-d15}
     ;pld            [r1]                    ;preload pred data
     ;pld            [r1, #128]
     ;pld            [r1, #256]
@@ -64,6 +65,7 @@ extra_copy_neon_loop
     bne             extra_copy_neon_loop
 done_copy_neon_loop
+    vpop            {d8-d15}
     bx              lr
     ENDP
diff --git a/vp8/encoder/arm/neon/vp8_mse16x16_neon.asm b/vp8/encoder/arm/neon/vp8_mse16x16_neon.asm
index 55edbf5129ed013ffa0b5ea32c5aa6682ad24e6c..f82af3ee333a08ceac25261da45796fc0b6d719d 100644
--- a/vp8/encoder/arm/neon/vp8_mse16x16_neon.asm
+++ b/vp8/encoder/arm/neon/vp8_mse16x16_neon.asm
@@ -27,6 +27,8 @@
 ;from vp8_variance().
 |vp8_mse16x16_neon| PROC
+    vpush           {q7}
+
     vmov.i8         q7, #0                  ;q7, q8, q9, q10 - sse
     vmov.i8         q8, #0
     vmov.i8         q9, #0
@@ -62,7 +64,7 @@ mse16x16_neon_loop
     vadd.u32        q7, q7, q8
     vadd.u32        q9, q9, q10
-    ldr             r12, [sp]               ;load *sse from stack
+    ldr             r12, [sp, #16]          ;load *sse from stack
     vadd.u32        q10, q7, q9
     vpaddl.u32      q1, q10
@@ -71,6 +73,7 @@ mse16x16_neon_loop
     vst1.32         {d0[0]}, [r12]
     vmov.32         r0, d0[0]
+    vpop            {q7}
     bx              lr
     ENDP
@@ -82,6 +85,8 @@ mse16x16_neon_loop
 ; r2    unsigned char *ref_ptr,
 ; r3    int recon_stride
 |vp8_get4x4sse_cs_neon| PROC
+    vpush           {q7}
+
     vld1.8          {d0}, [r0], r1          ;Load up source and reference
     vld1.8          {d4}, [r2], r3
     vld1.8          {d1}, [r0], r1
@@ -109,6 +114,8 @@ mse16x16_neon_loop
     vadd.u64        d0, d2, d3
     vmov.32         r0, d0[0]
+
+    vpop            {q7}
     bx              lr
     ENDP