diff --git a/vp8/common/arm/neon/buildintrapredictorsmby_neon.asm b/vp8/common/arm/neon/buildintrapredictorsmby_neon.asm
index e3ea91fe6c0493dc8e5c510f34fe7ce77e598b53..a8730aa04ef70b5f1b6250567459d9e17f86e89c 100644
--- a/vp8/common/arm/neon/buildintrapredictorsmby_neon.asm
+++ b/vp8/common/arm/neon/buildintrapredictorsmby_neon.asm
@@ -26,6 +26,7 @@
 
 |vp8_build_intra_predictors_mby_neon_func| PROC
     push            {r4-r8, lr}
+    vpush           {d8-d15}
 
     cmp             r3, #0
     beq             case_dc_pred
@@ -37,8 +38,8 @@
     beq             case_tm_pred
 
 case_dc_pred
-    ldr             r4, [sp, #24]       ; Up
-    ldr             r5, [sp, #28]       ; Left
+    ldr             r4, [sp, #88]       ; Up   (stack arg; offset = 24 for pushed GPRs + 64 for saved d8-d15)
+    ldr             r5, [sp, #92]       ; Left (stack arg; offset = 24 for pushed GPRs + 64 for saved d8-d15, + 4)
 
     ; Default the DC average to 128
     mov             r12, #128
@@ -143,6 +144,7 @@ skip_dc_pred_up_left
     vst1.u8         {q0}, [r1]!
     vst1.u8         {q0}, [r1]!
 
+    vpop            {d8-d15}
     pop             {r4-r8,pc}
 case_v_pred
     ; Copy down above row
@@ -165,6 +167,7 @@ case_v_pred
     vst1.u8         {q0}, [r1]!
     vst1.u8         {q0}, [r1]!
     vst1.u8         {q0}, [r1]!
+    vpop            {d8-d15}
     pop             {r4-r8,pc}
 
 case_h_pred
@@ -224,6 +227,7 @@ case_h_pred
     vst1.u8         {q2}, [r1]!
     vst1.u8         {q3}, [r1]!
 
+    vpop            {d8-d15}
     pop             {r4-r8,pc}
 
 case_tm_pred
@@ -293,6 +297,7 @@ case_tm_pred_loop
     subs            r12, r12, #1
     bne             case_tm_pred_loop
 
+    vpop            {d8-d15}
     pop             {r4-r8,pc}
 
     ENDP
@@ -307,6 +312,7 @@ case_tm_pred_loop
 
 |vp8_build_intra_predictors_mby_s_neon_func| PROC
     push            {r4-r8, lr}
+    vpush           {d8-d15}
 
     mov             r1, r0      ;   unsigned char *ypred_ptr = x->dst.y_buffer; //x->Predictor;
 
@@ -320,8 +326,8 @@ case_tm_pred_loop
     beq             case_tm_pred_s
 
 case_dc_pred_s
-    ldr             r4, [sp, #24]       ; Up
-    ldr             r5, [sp, #28]       ; Left
+    ldr             r4, [sp, #88]       ; Up   (stack arg; offset = 24 for pushed GPRs + 64 for saved d8-d15)
+    ldr             r5, [sp, #92]       ; Left (stack arg; offset = 24 for pushed GPRs + 64 for saved d8-d15, + 4)
 
     ; Default the DC average to 128
     mov             r12, #128
@@ -426,6 +432,7 @@ skip_dc_pred_up_left_s
     vst1.u8         {q0}, [r1], r2
     vst1.u8         {q0}, [r1], r2
 
+    vpop            {d8-d15}
     pop             {r4-r8,pc}
 case_v_pred_s
     ; Copy down above row
@@ -448,6 +455,8 @@ case_v_pred_s
     vst1.u8         {q0}, [r1], r2
     vst1.u8         {q0}, [r1], r2
     vst1.u8         {q0}, [r1], r2
+
+    vpop            {d8-d15}
     pop             {r4-r8,pc}
 
 case_h_pred_s
@@ -507,6 +516,7 @@ case_h_pred_s
     vst1.u8         {q2}, [r1], r2
     vst1.u8         {q3}, [r1], r2
 
+    vpop            {d8-d15}
     pop             {r4-r8,pc}
 
 case_tm_pred_s
@@ -576,6 +586,7 @@ case_tm_pred_loop_s
     subs            r12, r12, #1
     bne             case_tm_pred_loop_s
 
+    vpop            {d8-d15}
     pop             {r4-r8,pc}
 
     ENDP
diff --git a/vp8/common/arm/neon/idct_dequant_0_2x_neon.asm b/vp8/common/arm/neon/idct_dequant_0_2x_neon.asm
index 6c29c55860d899a502bcd5aac71c5dfe459bd659..3a3921081c4410cebc23e88777fad6de057a3e4a 100644
--- a/vp8/common/arm/neon/idct_dequant_0_2x_neon.asm
+++ b/vp8/common/arm/neon/idct_dequant_0_2x_neon.asm
@@ -22,6 +22,7 @@
 ; r3   stride
 |idct_dequant_0_2x_neon| PROC
     push            {r4, r5}
+    vpush           {d8-d15}
 
     add             r12, r2, #4
     vld1.32         {d2[0]}, [r2], r3
@@ -72,6 +73,7 @@
     vst1.32         {d4[1]}, [r2]
     vst1.32         {d10[1]}, [r0]
 
+    vpop            {d8-d15}
     pop             {r4, r5}
     bx              lr
 
diff --git a/vp8/common/arm/neon/idct_dequant_full_2x_neon.asm b/vp8/common/arm/neon/idct_dequant_full_2x_neon.asm
index d5dce63f6bd72b54d4a3d964a93dd485fccc06ed..8da0fa0b7ea535a825463de665a831f2948d41b6 100644
--- a/vp8/common/arm/neon/idct_dequant_full_2x_neon.asm
+++ b/vp8/common/arm/neon/idct_dequant_full_2x_neon.asm
@@ -22,6 +22,8 @@
 ; r2    *dst
 ; r3    stride
 |idct_dequant_full_2x_neon| PROC
+    vpush           {d8-d15}
+
     vld1.16         {q0, q1}, [r1]          ; dq (same l/r)
     vld1.16         {q2, q3}, [r0]          ; l q
     add             r0, r0, #32
@@ -184,6 +186,7 @@
     vst1.32         {d3[0]}, [r2]
     vst1.32         {d3[1]}, [r1]
 
+    vpop            {d8-d15}
     bx             lr
 
     ENDP           ; |idct_dequant_full_2x_neon|
diff --git a/vp8/common/arm/neon/loopfilter_neon.asm b/vp8/common/arm/neon/loopfilter_neon.asm
index e44be0a1e34d2199c20401aabc68315f2be2cb35..c4f09c7753bfc290b547671a8d81ba16b391f284 100644
--- a/vp8/common/arm/neon/loopfilter_neon.asm
+++ b/vp8/common/arm/neon/loopfilter_neon.asm
@@ -24,10 +24,12 @@
 ; sp    unsigned char thresh,
 |vp8_loop_filter_horizontal_edge_y_neon| PROC
     push        {lr}
+    vpush       {d8-d15}                   ; save callee-saved NEON regs (8 x 8 = 64 bytes)
+
     vdup.u8     q0, r2                     ; duplicate blimit
     vdup.u8     q1, r3                     ; duplicate limit
     sub         r2, r0, r1, lsl #2         ; move src pointer down by 4 lines
-    ldr         r3, [sp, #4]               ; load thresh
+    ldr         r3, [sp, #68]              ; load thresh (stack arg: 4 for lr + 64 for d8-d15)
     add         r12, r2, r1
     add         r1, r1, r1
 
@@ -52,6 +54,7 @@
     vst1.u8     {q7}, [r2@128], r1              ; store oq0
     vst1.u8     {q8}, [r12@128], r1             ; store oq1
 
+    vpop        {d8-d15}
     pop         {pc}
     ENDP        ; |vp8_loop_filter_horizontal_edge_y_neon|
 
@@ -64,10 +67,12 @@
 ; sp+4  unsigned char *v
 |vp8_loop_filter_horizontal_edge_uv_neon| PROC
     push        {lr}
+    vpush       {d8-d15}                    ; save callee-saved NEON regs (8 x 8 = 64 bytes)
+
     vdup.u8     q0, r2                      ; duplicate blimit
     vdup.u8     q1, r3                      ; duplicate limit
-    ldr         r12, [sp, #4]               ; load thresh
-    ldr         r2, [sp, #8]                ; load v ptr
+    ldr         r12, [sp, #68]              ; load thresh (stack arg: 4 for lr + 64 for d8-d15)
+    ldr         r2, [sp, #72]               ; load v ptr (next stack arg, same 68-byte bias)
     vdup.u8     q2, r12                     ; duplicate thresh
 
     sub         r3, r0, r1, lsl #2          ; move u pointer down by 4 lines
@@ -104,6 +109,7 @@
     vst1.u8     {d16}, [r0@64]                 ; store u oq1
     vst1.u8     {d17}, [r2@64]                 ; store v oq1
 
+    vpop        {d8-d15}
     pop         {pc}
     ENDP        ; |vp8_loop_filter_horizontal_edge_uv_neon|
 
@@ -120,11 +126,13 @@
 
 |vp8_loop_filter_vertical_edge_y_neon| PROC
     push        {lr}
+    vpush       {d8-d15}                   ; save callee-saved NEON regs (8 x 8 = 64 bytes)
+
     vdup.u8     q0, r2                     ; duplicate blimit
     vdup.u8     q1, r3                     ; duplicate limit
     sub         r2, r0, #4                 ; src ptr down by 4 columns
     add         r1, r1, r1
-    ldr         r3, [sp, #4]               ; load thresh
+    ldr         r3, [sp, #68]              ; load thresh (stack arg: 4 for lr + 64 for d8-d15)
     add         r12, r2, r1, asr #1
 
     vld1.u8     {d6}, [r2], r1
@@ -194,6 +202,7 @@
     vst4.8      {d14[6], d15[6], d16[6], d17[6]}, [r0]
     vst4.8      {d14[7], d15[7], d16[7], d17[7]}, [r12]
 
+    vpop        {d8-d15}
     pop         {pc}
     ENDP        ; |vp8_loop_filter_vertical_edge_y_neon|
 
@@ -210,9 +219,11 @@
 ; sp+4  unsigned char *v
 |vp8_loop_filter_vertical_edge_uv_neon| PROC
     push        {lr}
+    vpush       {d8-d15}                    ; save callee-saved NEON regs (8 x 8 = 64 bytes)
+
     vdup.u8     q0, r2                      ; duplicate blimit
     sub         r12, r0, #4                 ; move u pointer down by 4 columns
-    ldr         r2, [sp, #8]                ; load v ptr
+    ldr         r2, [sp, #72]               ; load v ptr (stack arg: old offset 8 + 64 for d8-d15)
     vdup.u8     q1, r3                      ; duplicate limit
     sub         r3, r2, #4                  ; move v pointer down by 4 columns
 
@@ -233,7 +244,7 @@
     vld1.u8     {d20}, [r12]
     vld1.u8     {d21}, [r3]
 
-    ldr        r12, [sp, #4]               ; load thresh
+    ldr        r12, [sp, #68]              ; load thresh
 
     ;transpose to 8x16 matrix
     vtrn.32     q3, q7
@@ -281,6 +292,7 @@
     vst4.8      {d10[7], d11[7], d12[7], d13[7]}, [r0]
     vst4.8      {d14[7], d15[7], d16[7], d17[7]}, [r2]
 
+    vpop        {d8-d15}
     pop         {pc}
     ENDP        ; |vp8_loop_filter_vertical_edge_uv_neon|
 
diff --git a/vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.asm b/vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.asm
index adf848b9c347966ecd5205b8f9a8f0a4cd46f9c2..6eb06516de05555f1586aaacedd94515238c3ef2 100644
--- a/vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.asm
+++ b/vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.asm
@@ -9,7 +9,6 @@
 ;
 
 
-    ;EXPORT  |vp8_loop_filter_simple_horizontal_edge_neon|
     EXPORT  |vp8_loop_filter_bhs_neon|
     EXPORT  |vp8_loop_filter_mbhs_neon|
     ARM
@@ -22,7 +21,7 @@
 ; q1    limit, PRESERVE
 
 |vp8_loop_filter_simple_horizontal_edge_neon| PROC
-
+    vpush       {d8-d15}
     sub         r3, r0, r1, lsl #1          ; move src pointer down by 2 lines
 
     vld1.u8     {q7}, [r0@128], r1          ; q0
@@ -82,6 +81,7 @@
     vst1.u8     {q6}, [r3@128]              ; store op0
     vst1.u8     {q7}, [r0@128]              ; store oq0
 
+    vpop        {d8-d15}
     bx          lr
     ENDP        ; |vp8_loop_filter_simple_horizontal_edge_neon|
 
diff --git a/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.asm b/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.asm
index e690df2f7de9d8e3e9cd502f78c24fd70c5c6241..78d13c895aa35b440f0ec3b42f1ac8bd7ad03445 100644
--- a/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.asm
+++ b/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.asm
@@ -9,7 +9,6 @@
 ;
 
 
-    ;EXPORT  |vp8_loop_filter_simple_vertical_edge_neon|
     EXPORT |vp8_loop_filter_bvs_neon|
     EXPORT |vp8_loop_filter_mbvs_neon|
     ARM
@@ -22,6 +21,8 @@
 ; q1    limit, PRESERVE
 
 |vp8_loop_filter_simple_vertical_edge_neon| PROC
+    vpush       {d8-d15}
+
     sub         r0, r0, #2                  ; move src pointer down by 2 columns
     add         r12, r1, r1
     add         r3, r0, r1
@@ -120,6 +121,7 @@
     vst2.8      {d14[6], d15[6]}, [r0], r12
     vst2.8      {d14[7], d15[7]}, [r3]
 
+    vpop        {d8-d15}
     bx          lr
     ENDP        ; |vp8_loop_filter_simple_vertical_edge_neon|
 
diff --git a/vp8/common/arm/neon/mbloopfilter_neon.asm b/vp8/common/arm/neon/mbloopfilter_neon.asm
index f41c156df8b27783c36ef81ba0f1cada5f666e2c..d200c30909d5fee1bad6ac5582b2b5507df7dd7d 100644
--- a/vp8/common/arm/neon/mbloopfilter_neon.asm
+++ b/vp8/common/arm/neon/mbloopfilter_neon.asm
@@ -28,8 +28,10 @@
 ; sp    unsigned char thresh,
 |vp8_mbloop_filter_horizontal_edge_y_neon| PROC
     push        {lr}
+    vpush       {d8-d15}                    ; save callee-saved NEON regs (8 x 8 = 64 bytes)
+
     add         r1, r1, r1                  ; double stride
-    ldr         r12, [sp, #4]               ; load thresh
+    ldr         r12, [sp, #68]              ; load thresh (stack arg: 4 for lr + 64 for d8-d15)
     sub         r0, r0, r1, lsl #1          ; move src pointer down by 4 lines
     vdup.u8     q2, r12                     ; thresh
     add         r12, r0, r1,  lsr #1        ; move src pointer up by 1 line
@@ -55,6 +57,7 @@
     vst1.u8     {q8}, [r12@128]            ; store oq1
     vst1.u8     {q9}, [r0@128]             ; store oq2
 
+    vpop        {d8-d15}
     pop         {pc}
     ENDP        ; |vp8_mbloop_filter_horizontal_edge_y_neon|
 
@@ -72,10 +75,12 @@
 
 |vp8_mbloop_filter_horizontal_edge_uv_neon| PROC
     push        {lr}
-    ldr         r12, [sp, #4]                 ; load thresh
+    vpush       {d8-d15}                      ; save callee-saved NEON regs (8 x 8 = 64 bytes)
+
+    ldr         r12, [sp, #68]                ; load thresh (stack arg: 4 for lr + 64 for d8-d15)
     sub         r0, r0, r1, lsl #2            ; move u pointer down by 4 lines
     vdup.u8     q2, r12                       ; thresh
-    ldr         r12, [sp, #8]                 ; load v ptr
+    ldr         r12, [sp, #72]                ; load v ptr (next stack arg, same 68-byte bias)
     sub         r12, r12, r1, lsl #2          ; move v pointer down by 4 lines
 
     vld1.u8     {d6}, [r0@64], r1              ; p3
@@ -116,6 +121,7 @@
     vst1.u8     {d18}, [r0@64], r1             ; store u oq2
     vst1.u8     {d19}, [r12@64], r1             ; store v oq2
 
+    vpop        {d8-d15}
     pop         {pc}
     ENDP        ; |vp8_mbloop_filter_horizontal_edge_uv_neon|
 
@@ -130,7 +136,9 @@
 ; sp    unsigned char thresh,
 |vp8_mbloop_filter_vertical_edge_y_neon| PROC
     push        {lr}
-    ldr         r12, [sp, #4]               ; load thresh
+    vpush       {d8-d15}                    ; save callee-saved NEON regs (8 x 8 = 64 bytes)
+
+    ldr         r12, [sp, #68]              ; load thresh (stack arg: 4 for lr + 64 for d8-d15)
     sub         r0, r0, #4                  ; move src pointer down by 4 columns
     vdup.s8     q2, r12                     ; thresh
     add         r12, r0, r1, lsl #3         ; move src pointer down by 8 lines
@@ -208,6 +216,7 @@
     vst1.8      {d20}, [r0]
     vst1.8      {d21}, [r12]
 
+    vpop        {d8-d15}
     pop         {pc}
     ENDP        ; |vp8_mbloop_filter_vertical_edge_y_neon|
 
@@ -224,10 +233,12 @@
 ; sp+4  unsigned char *v
 |vp8_mbloop_filter_vertical_edge_uv_neon| PROC
     push        {lr}
-    ldr         r12, [sp, #4]               ; load thresh
+    vpush       {d8-d15}                    ; save callee-saved NEON regs (8 x 8 = 64 bytes)
+
+    ldr         r12, [sp, #68]              ; load thresh (stack arg: 4 for lr + 64 for d8-d15)
     sub         r0, r0, #4                  ; move u pointer down by 4 columns
     vdup.u8     q2, r12                     ; thresh
-    ldr         r12, [sp, #8]               ; load v ptr
+    ldr         r12, [sp, #72]              ; load v ptr (next stack arg, same 68-byte bias)
     sub         r12, r12, #4                ; move v pointer down by 4 columns
 
     vld1.u8     {d6}, [r0], r1              ;load u data
@@ -303,6 +314,7 @@
     vst1.8      {d20}, [r0]
     vst1.8      {d21}, [r12]
 
+    vpop        {d8-d15}
     pop         {pc}
     ENDP        ; |vp8_mbloop_filter_vertical_edge_uv_neon|
 
diff --git a/vp8/common/arm/neon/sad16_neon.asm b/vp8/common/arm/neon/sad16_neon.asm
index d7c590e15a21fa33a70b78b684e1252fd447c937..7197e5655945b8cb8640333eb5db25e2c473b893 100644
--- a/vp8/common/arm/neon/sad16_neon.asm
+++ b/vp8/common/arm/neon/sad16_neon.asm
@@ -24,6 +24,7 @@
 ; r3    int  ref_stride
 |vp8_sad16x16_neon| PROC
 ;;
+    vpush           {d8-d15}
     vld1.8          {q0}, [r0], r1
     vld1.8          {q4}, [r2], r3
 
@@ -132,6 +133,7 @@
 
     vmov.32         r0, d0[0]
 
+    vpop            {d8-d15}
     bx              lr
 
     ENDP
@@ -143,6 +145,8 @@
 ;    unsigned char *ref_ptr,
 ;    int  ref_stride)
 |vp8_sad16x8_neon| PROC
+    vpush           {d8-d15}
+
     vld1.8          {q0}, [r0], r1
     vld1.8          {q4}, [r2], r3
 
@@ -200,6 +204,7 @@
 
     vmov.32         r0, d0[0]
 
+    vpop            {d8-d15}
     bx              lr
 
     ENDP
diff --git a/vp8/common/arm/neon/sad8_neon.asm b/vp8/common/arm/neon/sad8_neon.asm
index 23ba6df93a4dd8856e96396b0372cb646329d264..6b849d9338447875c73e9d87effba08bb0e8e42f 100644
--- a/vp8/common/arm/neon/sad8_neon.asm
+++ b/vp8/common/arm/neon/sad8_neon.asm
@@ -25,6 +25,7 @@
 ;    int  ref_stride)
 
 |vp8_sad8x8_neon| PROC
+    vpush           {d8-d15}
     vld1.8          {d0}, [r0], r1
     vld1.8          {d8}, [r2], r3
 
@@ -70,6 +71,7 @@
 
     vmov.32         r0, d0[0]
 
+    vpop            {d8-d15}
     bx              lr
 
     ENDP
@@ -82,6 +84,7 @@
 ;    int  ref_stride)
 
 |vp8_sad8x16_neon| PROC
+    vpush           {d8-d15}
     vld1.8          {d0}, [r0], r1
     vld1.8          {d8}, [r2], r3
 
@@ -167,6 +170,7 @@
 
     vmov.32         r0, d0[0]
 
+    vpop            {d8-d15}
     bx              lr
 
     ENDP
@@ -179,6 +183,7 @@
 ;    int  ref_stride)
 
 |vp8_sad4x4_neon| PROC
+    vpush           {d8-d15}
     vld1.8          {d0}, [r0], r1
     vld1.8          {d8}, [r2], r3
 
@@ -202,6 +207,7 @@
     vpaddl.u32      d0, d1
     vmov.32         r0, d0[0]
 
+    vpop            {d8-d15}
     bx              lr
 
     ENDP
diff --git a/vp8/common/arm/neon/shortidct4x4llm_neon.asm b/vp8/common/arm/neon/shortidct4x4llm_neon.asm
index 67d2ab0150ddeea8ba17318a4d6e35bcad412b1b..87ca887be1ed843b98fc077f22da94d40c83a1c3 100644
--- a/vp8/common/arm/neon/shortidct4x4llm_neon.asm
+++ b/vp8/common/arm/neon/shortidct4x4llm_neon.asm
@@ -37,12 +37,14 @@
 ; result of the multiplication that is needed in IDCT.
 
 |vp8_short_idct4x4llm_neon| PROC
+    vpush           {d8-d15}                ; save callee-saved NEON regs (8 x 8 = 64 bytes)
+
     adr             r12, idct_coeff
     vld1.16         {q1, q2}, [r0]
     vld1.16         {d0}, [r12]
 
     vswp            d3, d4                  ;q2(vp[4] vp[12])
-    ldr             r0, [sp]                ; stride
+    ldr             r0, [sp, #64]           ; stride (first stack arg, now above the 64 bytes of saved d8-d15)
 
     vqdmulh.s16     q3, q2, d0[2]
     vqdmulh.s16     q4, q2, d0[0]
@@ -125,6 +127,7 @@
     vst1.32         d2[0], [r3], r0
     vst1.32         d2[1], [r3], r0
 
+    vpop            {d8-d15}
     bx              lr
 
     ENDP
diff --git a/vp8/common/arm/neon/sixtappredict16x16_neon.asm b/vp8/common/arm/neon/sixtappredict16x16_neon.asm
index 9fdafd3609ed3d1d39a60b48536bbad4fd0fb83c..dd27719bf0855b0d3d0dfb36a52f5ced987436d4 100644
--- a/vp8/common/arm/neon/sixtappredict16x16_neon.asm
+++ b/vp8/common/arm/neon/sixtappredict16x16_neon.asm
@@ -43,10 +43,11 @@ filter16_coeff
 
 |vp8_sixtap_predict16x16_neon| PROC
     push            {r4-r5, lr}
+    vpush           {d8-d15}                ;save callee-saved NEON regs (8 x 8 = 64 bytes)
 
     adr             r12, filter16_coeff
-    ldr             r4, [sp, #12]           ;load parameters from stack
-    ldr             r5, [sp, #16]           ;load parameters from stack
+    ldr             r4, [sp, #76]           ;load parameters from stack (12 for GPRs + 64 for d8-d15)
+    ldr             r5, [sp, #80]           ;load parameters from stack (next word)
 
     cmp             r2, #0                  ;skip first_pass filter if xoffset=0
     beq             secondpass_filter16x16_only
@@ -291,6 +292,8 @@ secondpass_inner_loop_neon
     bne filt_blk2d_sp16x16_outloop_neon
 
     add             sp, sp, #336
+
+    vpop            {d8-d15}
     pop             {r4-r5,pc}
 
 ;--------------------
@@ -384,6 +387,7 @@ filt_blk2d_fpo16x16_loop_neon
 
     bne             filt_blk2d_fpo16x16_loop_neon
 
+    vpop            {d8-d15}
     pop             {r4-r5,pc}
 
 ;--------------------
@@ -482,6 +486,7 @@ secondpass_only_inner_loop_neon
 
     bne filt_blk2d_spo16x16_outloop_neon
 
+    vpop            {d8-d15}
     pop             {r4-r5,pc}
 
     ENDP
diff --git a/vp8/common/arm/neon/sixtappredict4x4_neon.asm b/vp8/common/arm/neon/sixtappredict4x4_neon.asm
index a4222bc62c54d750b1cfbe2ec2505962adad3f5f..e32e71305b9f573253586954c8f1826c07d158b5 100644
--- a/vp8/common/arm/neon/sixtappredict4x4_neon.asm
+++ b/vp8/common/arm/neon/sixtappredict4x4_neon.asm
@@ -35,10 +35,11 @@ filter4_coeff
 
 |vp8_sixtap_predict4x4_neon| PROC
     push            {r4, lr}
+    vpush           {d8-d15}                ;save callee-saved NEON regs (8 x 8 = 64 bytes)
 
     adr             r12, filter4_coeff
-    ldr             r4, [sp, #8]            ;load parameters from stack
-    ldr             lr, [sp, #12]           ;load parameters from stack
+    ldr             r4, [sp, #72]           ;load parameters from stack (8 for GPRs + 64 for d8-d15)
+    ldr             lr, [sp, #76]           ;load parameters from stack (next word)
 
     cmp             r2, #0                  ;skip first_pass filter if xoffset=0
     beq             secondpass_filter4x4_only
@@ -261,6 +262,7 @@ filter4_coeff
     vst1.32         {d4[0]}, [r1]
     vst1.32         {d4[1]}, [r2]
 
+    vpop            {d8-d15}
     pop             {r4, pc}
 
 
@@ -348,6 +350,7 @@ firstpass_filter4x4_only
     vst1.32         {d28[0]}, [r1]
     vst1.32         {d28[1]}, [r2]
 
+    vpop            {d8-d15}
     pop             {r4, pc}
 
 
@@ -413,6 +416,7 @@ secondpass_filter4x4_only
     vst1.32         {d4[0]}, [r1]
     vst1.32         {d4[1]}, [r2]
 
+    vpop            {d8-d15}
     pop             {r4, pc}
 
     ENDP
diff --git a/vp8/common/arm/neon/sixtappredict8x4_neon.asm b/vp8/common/arm/neon/sixtappredict8x4_neon.asm
index a57ec015f2c0c7404732cbeded169dd911b79a88..d19bf8920a3230425244dc1027b58dc3fb503a65 100644
--- a/vp8/common/arm/neon/sixtappredict8x4_neon.asm
+++ b/vp8/common/arm/neon/sixtappredict8x4_neon.asm
@@ -35,10 +35,11 @@ filter8_coeff
 
 |vp8_sixtap_predict8x4_neon| PROC
     push            {r4-r5, lr}
+    vpush           {d8-d15}                ;save callee-saved NEON regs (8 x 8 = 64 bytes)
 
     adr             r12, filter8_coeff
-    ldr             r4, [sp, #12]           ;load parameters from stack
-    ldr             r5, [sp, #16]           ;load parameters from stack
+    ldr             r4, [sp, #76]           ;load parameters from stack (12 for GPRs + 64 for d8-d15)
+    ldr             r5, [sp, #80]           ;load parameters from stack (next word)
 
     cmp             r2, #0                  ;skip first_pass filter if xoffset=0
     beq             secondpass_filter8x4_only
@@ -297,6 +298,8 @@ filter8_coeff
     vst1.u8         {d9}, [r4], r5
 
     add             sp, sp, #32
+
+    vpop            {d8-d15}
     pop             {r4-r5,pc}
 
 ;--------------------
@@ -392,6 +395,7 @@ firstpass_filter8x4_only
     vst1.u8         {d24}, [r4], r5
     vst1.u8         {d25}, [r4], r5
 
+    vpop            {d8-d15}
     pop             {r4-r5,pc}
 
 ;---------------------
@@ -464,6 +468,7 @@ secondpass_filter8x4_only
     vst1.u8         {d8}, [r4], r5
     vst1.u8         {d9}, [r4], r5
 
+    vpop            {d8-d15}
     pop             {r4-r5,pc}
 
     ENDP
diff --git a/vp8/common/arm/neon/sixtappredict8x8_neon.asm b/vp8/common/arm/neon/sixtappredict8x8_neon.asm
index 00ed5aeefe3c9cb4458cf15f8322eda756f4792f..4b049252c16c94b87ccdf47d1724914a9487bc99 100644
--- a/vp8/common/arm/neon/sixtappredict8x8_neon.asm
+++ b/vp8/common/arm/neon/sixtappredict8x8_neon.asm
@@ -35,11 +35,11 @@ filter8_coeff
 
 |vp8_sixtap_predict8x8_neon| PROC
     push            {r4-r5, lr}
-
+    vpush           {d8-d15}                ;save callee-saved NEON regs (8 x 8 = 64 bytes)
     adr             r12, filter8_coeff
 
-    ldr             r4, [sp, #12]           ;load parameters from stack
-    ldr             r5, [sp, #16]           ;load parameters from stack
+    ldr             r4, [sp, #76]           ;load parameters from stack (12 for GPRs + 64 for d8-d15)
+    ldr             r5, [sp, #80]           ;load parameters from stack (next word)
 
     cmp             r2, #0                  ;skip first_pass filter if xoffset=0
     beq             secondpass_filter8x8_only
@@ -324,6 +324,8 @@ filt_blk2d_sp8x8_loop_neon
     bne filt_blk2d_sp8x8_loop_neon
 
     add             sp, sp, #64
+
+    vpop            {d8-d15}
     pop             {r4-r5,pc}
 
 ;---------------------
@@ -428,6 +430,7 @@ filt_blk2d_fpo8x8_loop_neon
 
     bne             filt_blk2d_fpo8x8_loop_neon
 
+    vpop            {d8-d15}
     pop             {r4-r5,pc}
 
 ;---------------------
@@ -515,6 +518,7 @@ filt_blk2d_spo8x8_loop_neon
 
     bne filt_blk2d_spo8x8_loop_neon
 
+    vpop            {d8-d15}
     pop             {r4-r5,pc}
 
     ENDP
diff --git a/vp8/common/arm/neon/variance_neon.asm b/vp8/common/arm/neon/variance_neon.asm
index e3b48327d3f5d2a9eb85d35ddcc20ea07ef01d22..8ecad72b9def22a7b9c16fc768a43b6e758cbddd 100644
--- a/vp8/common/arm/neon/variance_neon.asm
+++ b/vp8/common/arm/neon/variance_neon.asm
@@ -26,6 +26,7 @@
 ; r3    int  recon_stride
 ; stack unsigned int *sse
 |vp8_variance16x16_neon| PROC
+    vpush           {q5}                        ;save q5 (d10-d11, 16 bytes); stack *sse arg is then at [sp, #16]
     vmov.i8         q8, #0                      ;q8 - sum
     vmov.i8         q9, #0                      ;q9, q10 - sse
     vmov.i8         q10, #0
@@ -67,7 +68,7 @@ variance16x16_neon_loop
     vadd.u32        q10, q9, q10                ;accumulate sse
     vpaddl.s32      q0, q8                      ;accumulate sum
 
-    ldr             r12, [sp]                   ;load *sse from stack
+    ldr             r12, [sp, #16]              ;load *sse from stack
 
     vpaddl.u32      q1, q10
     vadd.s64        d0, d0, d1
@@ -87,6 +88,8 @@ variance16x16_neon_loop
     vsub.u32        d0, d1, d10
 
     vmov.32         r0, d0[0]                   ;return
+
+    vpop            {q5}
     bx              lr
 
     ENDP
@@ -99,6 +102,8 @@ variance16x16_neon_loop
 ;    int  recon_stride,
 ;   unsigned int *sse)
 |vp8_variance16x8_neon| PROC
+    vpush           {q5}                        ;save q5 (d10-d11, 16 bytes); stack *sse arg is then at [sp, #16]
+
     vmov.i8         q8, #0                      ;q8 - sum
     vmov.i8         q9, #0                      ;q9, q10 - sse
     vmov.i8         q10, #0
@@ -137,7 +142,7 @@ variance16x8_neon_loop
     vadd.u32        q10, q9, q10                ;accumulate sse
     vpaddl.s32      q0, q8                      ;accumulate sum
 
-    ldr             r12, [sp]                   ;load *sse from stack
+    ldr             r12, [sp, #16]              ;load *sse from stack
 
     vpaddl.u32      q1, q10
     vadd.s64        d0, d0, d1
@@ -149,6 +154,8 @@ variance16x8_neon_loop
     vsub.u32        d0, d1, d10
 
     vmov.32         r0, d0[0]                   ;return
+
+    vpop            {q5}
     bx              lr
 
     ENDP
@@ -162,6 +169,8 @@ variance16x8_neon_loop
 ;   unsigned int *sse)
 
 |vp8_variance8x16_neon| PROC
+    vpush           {q5}                        ;save q5 (d10-d11, 16 bytes); stack *sse arg is then at [sp, #16]
+
     vmov.i8         q8, #0                      ;q8 - sum
     vmov.i8         q9, #0                      ;q9, q10 - sse
     vmov.i8         q10, #0
@@ -192,7 +201,7 @@ variance8x16_neon_loop
     vadd.u32        q10, q9, q10                ;accumulate sse
     vpaddl.s32      q0, q8                      ;accumulate sum
 
-    ldr             r12, [sp]                   ;load *sse from stack
+    ldr             r12, [sp, #16]              ;load *sse from stack
 
     vpaddl.u32      q1, q10
     vadd.s64        d0, d0, d1
@@ -204,6 +213,8 @@ variance8x16_neon_loop
     vsub.u32        d0, d1, d10
 
     vmov.32         r0, d0[0]                   ;return
+
+    vpop            {q5}
     bx              lr
 
     ENDP
@@ -215,6 +226,8 @@ variance8x16_neon_loop
 ; r3    int  recon_stride
 ; stack unsigned int *sse
 |vp8_variance8x8_neon| PROC
+    vpush           {q5}                        ;save q5 (d10-d11, 16 bytes); stack *sse arg is then at [sp, #16]
+
     vmov.i8         q8, #0                      ;q8 - sum
     vmov.i8         q9, #0                      ;q9, q10 - sse
     vmov.i8         q10, #0
@@ -257,7 +270,7 @@ variance8x8_neon_loop
     vadd.u32        q10, q9, q10                ;accumulate sse
     vpaddl.s32      q0, q8                      ;accumulate sum
 
-    ldr             r12, [sp]                   ;load *sse from stack
+    ldr             r12, [sp, #16]              ;load *sse from stack
 
     vpaddl.u32      q1, q10
     vadd.s64        d0, d0, d1
@@ -269,6 +282,8 @@ variance8x8_neon_loop
     vsub.u32        d0, d1, d10
 
     vmov.32         r0, d0[0]                   ;return
+
+    vpop            {q5}
     bx              lr
 
     ENDP
diff --git a/vp8/common/arm/neon/vp8_subpixelvariance16x16_neon.asm b/vp8/common/arm/neon/vp8_subpixelvariance16x16_neon.asm
index 9d22c52521c747ab7ab9fb7a8abfda4854998bce..adc5b7e3a7816242d12852486c9fb20bfee840fe 100644
--- a/vp8/common/arm/neon/vp8_subpixelvariance16x16_neon.asm
+++ b/vp8/common/arm/neon/vp8_subpixelvariance16x16_neon.asm
@@ -31,11 +31,12 @@ bilinear_taps_coeff
 
 |vp8_sub_pixel_variance16x16_neon_func| PROC
     push            {r4-r6, lr}
+    vpush           {d8-d15}                ;save callee-saved NEON regs (8 x 8 = 64 bytes)
 
     adr             r12, bilinear_taps_coeff
-    ldr             r4, [sp, #16]           ;load *dst_ptr from stack
-    ldr             r5, [sp, #20]           ;load dst_pixels_per_line from stack
-    ldr             r6, [sp, #24]           ;load *sse from stack
+    ldr             r4, [sp, #80]           ;load *dst_ptr from stack (16 for GPRs + 64 for d8-d15)
+    ldr             r5, [sp, #84]           ;load dst_pixels_per_line from stack
+    ldr             r6, [sp, #88]           ;load *sse from stack
 
     cmp             r2, #0                  ;skip first_pass filter if xoffset=0
     beq             secondpass_bfilter16x16_only
@@ -416,6 +417,7 @@ sub_pixel_variance16x16_neon_loop
     add             sp, sp, #528
     vmov.32         r0, d0[0]                   ;return
 
+    vpop            {d8-d15}
     pop             {r4-r6,pc}
 
     ENDP
diff --git a/vp8/common/arm/neon/vp8_subpixelvariance16x16s_neon.asm b/vp8/common/arm/neon/vp8_subpixelvariance16x16s_neon.asm
index 155be4fc54b41c0d603e85f98e5334fcf2a33909..b0829af7547be4280ba668462abf9491ab655e09 100644
--- a/vp8/common/arm/neon/vp8_subpixelvariance16x16s_neon.asm
+++ b/vp8/common/arm/neon/vp8_subpixelvariance16x16s_neon.asm
@@ -31,9 +31,10 @@
 ;================================================
 |vp8_variance_halfpixvar16x16_h_neon| PROC
     push            {lr}
+    vpush           {d8-d15}                 ;save callee-saved NEON regs (8 x 8 = 64 bytes)
 
     mov             r12, #4                  ;loop counter
-    ldr             lr, [sp, #4]           ;load *sse from stack
+    ldr             lr, [sp, #68]            ;load *sse from stack (4 for lr + 64 for d8-d15)
     vmov.i8         q8, #0                      ;q8 - sum
     vmov.i8         q9, #0                      ;q9, q10 - sse
     vmov.i8         q10, #0
@@ -116,6 +117,8 @@ vp8_filt_fpo16x16s_4_0_loop_neon
     vsub.u32        d0, d1, d10
 
     vmov.32         r0, d0[0]                   ;return
+
+    vpop            {d8-d15}
     pop             {pc}
     ENDP
 
@@ -131,11 +134,12 @@ vp8_filt_fpo16x16s_4_0_loop_neon
 ;================================================
 |vp8_variance_halfpixvar16x16_v_neon| PROC
     push            {lr}
+    vpush           {d8-d15}                    ;save callee-saved NEON regs (8 x 8 = 64 bytes)
 
     mov             r12, #4                     ;loop counter
 
     vld1.u8         {q0}, [r0], r1              ;load src data
-    ldr             lr, [sp, #4]                ;load *sse from stack
+    ldr             lr, [sp, #68]               ;load *sse from stack (4 for lr + 64 for d8-d15)
 
     vmov.i8         q8, #0                      ;q8 - sum
     vmov.i8         q9, #0                      ;q9, q10 - sse
@@ -212,6 +216,8 @@ vp8_filt_spo16x16s_0_4_loop_neon
     vsub.u32        d0, d1, d10
 
     vmov.32         r0, d0[0]                   ;return
+
+    vpop            {d8-d15}
     pop             {pc}
     ENDP
 
@@ -227,10 +233,11 @@ vp8_filt_spo16x16s_0_4_loop_neon
 ;================================================
 |vp8_variance_halfpixvar16x16_hv_neon| PROC
     push            {lr}
+    vpush           {d8-d15}                ;save callee-saved NEON regs (8 x 8 = 64 bytes)
 
     vld1.u8         {d0, d1, d2, d3}, [r0], r1      ;load src data
 
-    ldr             lr, [sp, #4]           ;load *sse from stack
+    ldr             lr, [sp, #68]           ;load *sse from stack (4 for lr + 64 for d8-d15)
     vmov.i8         q13, #0                      ;q8 - sum
     vext.8          q1, q0, q1, #1          ;construct src_ptr[1]
 
@@ -331,6 +338,8 @@ vp8_filt16x16s_4_4_loop_neon
     vsub.u32        d0, d1, d10
 
     vmov.32         r0, d0[0]                   ;return
+
+    vpop            {d8-d15}
     pop             {pc}
     ENDP
 
@@ -349,10 +358,11 @@ vp8_filt16x16s_4_4_loop_neon
 
 |vp8_sub_pixel_variance16x16s_neon| PROC
     push            {r4, lr}
+    vpush           {d8-d15}                ;save callee-saved NEON regs (8 x 8 = 64 bytes)
 
-    ldr             r4, [sp, #8]            ;load *dst_ptr from stack
-    ldr             r12, [sp, #12]          ;load dst_pixels_per_line from stack
-    ldr             lr, [sp, #16]           ;load *sse from stack
+    ldr             r4, [sp, #72]           ;load *dst_ptr from stack (8 for GPRs + 64 for d8-d15)
+    ldr             r12, [sp, #76]          ;load dst_pixels_per_line from stack
+    ldr             lr, [sp, #80]           ;load *sse from stack
 
     cmp             r2, #0                  ;skip first_pass filter if xoffset=0
     beq             secondpass_bfilter16x16s_only
@@ -566,6 +576,7 @@ sub_pixel_variance16x16s_neon_loop
     add             sp, sp, #256
     vmov.32         r0, d0[0]                   ;return
 
+    vpop            {d8-d15}
     pop             {r4, pc}
     ENDP
 
diff --git a/vp8/common/arm/neon/vp8_subpixelvariance8x8_neon.asm b/vp8/common/arm/neon/vp8_subpixelvariance8x8_neon.asm
index f6b6847537f8e8025125b1e524845e4bd4377e06..9d9f9e0772a17b58e09ce5d68dbcebd84b45f7d1 100644
--- a/vp8/common/arm/neon/vp8_subpixelvariance8x8_neon.asm
+++ b/vp8/common/arm/neon/vp8_subpixelvariance8x8_neon.asm
@@ -26,11 +26,12 @@
 
 |vp8_sub_pixel_variance8x8_neon| PROC
     push            {r4-r5, lr}
+    vpush           {d8-d15}               ;save d8-d15 (64 bytes); stack args are now 64 bytes higher
 
     adr             r12, bilinear_taps_coeff
-    ldr             r4, [sp, #12]           ;load *dst_ptr from stack
-    ldr             r5, [sp, #16]           ;load dst_pixels_per_line from stack
-    ldr             lr, [sp, #20]           ;load *sse from stack
+    ldr             r4, [sp, #76]           ;load *dst_ptr from stack (12 bytes r4/r5/lr + 64 bytes d8-d15)
+    ldr             r5, [sp, #80]           ;load dst_pixels_per_line from stack
+    ldr             lr, [sp, #84]           ;load *sse from stack
 
     cmp             r2, #0                  ;skip first_pass filter if xoffset=0
     beq             skip_firstpass_filter
@@ -210,6 +211,8 @@ sub_pixel_variance8x8_neon_loop
     vsub.u32        d0, d1, d10
 
     vmov.32         r0, d0[0]                   ;return
+
+    vpop            {d8-d15}               ;restore d8-d15 saved at entry
     pop             {r4-r5, pc}
 
     ENDP
diff --git a/vp8/encoder/arm/neon/subtract_neon.asm b/vp8/encoder/arm/neon/subtract_neon.asm
index 5bda78678db9fe3098d4e425d73611ddbfb21533..840cb33d95723b33905e3b4b0eb74df557fb756a 100644
--- a/vp8/encoder/arm/neon/subtract_neon.asm
+++ b/vp8/encoder/arm/neon/subtract_neon.asm
@@ -65,8 +65,10 @@
 ;                           unsigned char *pred, int pred_stride)
 |vp8_subtract_mby_neon| PROC
     push            {r4-r7}
+    vpush           {d8-d15}                ; save d8-d15 (64 bytes of stack)
+
     mov             r12, #4
-    ldr             r4, [sp, #16]           ; pred_stride
+    ldr             r4, [sp, #80]           ; pred_stride (16 bytes r4-r7 + 64 bytes d8-d15)
     mov             r6, #32                 ; "diff" stride x2
     add             r5, r0, #16             ; second diff pointer
 
@@ -101,6 +103,7 @@ subtract_mby_loop
     subs            r12, r12, #1
     bne             subtract_mby_loop
 
+    vpop            {d8-d15}                ; restore d8-d15 saved at entry
     pop             {r4-r7}
     bx              lr
     ENDP
@@ -112,9 +115,11 @@ subtract_mby_loop
 
 |vp8_subtract_mbuv_neon| PROC
     push            {r4-r7}
-    ldr             r4, [sp, #16]       ; upred
-    ldr             r5, [sp, #20]       ; vpred
-    ldr             r6, [sp, #24]       ; pred_stride
+    vpush           {d8-d15}            ; save d8-d15 (64 bytes of stack)
+
+    ldr             r4, [sp, #80]       ; upred (16 bytes r4-r7 + 64 bytes d8-d15)
+    ldr             r5, [sp, #84]       ; vpred
+    ldr             r6, [sp, #88]       ; pred_stride
     add             r0, r0, #512        ; short *udiff = diff + 256;
     mov             r12, #32            ; "diff" stride x2
     add             r7, r0, #16         ; second diff pointer
@@ -191,6 +196,7 @@ subtract_mby_loop
     vst1.16         {q14}, [r0], r12
     vst1.16         {q15}, [r7], r12
 
+    vpop            {d8-d15}            ; restore d8-d15 saved at entry
     pop             {r4-r7}
     bx              lr
 
diff --git a/vp8/encoder/arm/neon/vp8_memcpy_neon.asm b/vp8/encoder/arm/neon/vp8_memcpy_neon.asm
index 5b9f11e59352f1a1f6f658a39d03c75489d4e46b..d219e2d14248375b14d69af2ee9b5ac092ed6d51 100644
--- a/vp8/encoder/arm/neon/vp8_memcpy_neon.asm
+++ b/vp8/encoder/arm/neon/vp8_memcpy_neon.asm
@@ -21,6 +21,7 @@
 ;void vp8_memcpy_partial_neon(unsigned char *dst_ptr, unsigned char *src_ptr,
 ;                             int sz);
 |vp8_memcpy_partial_neon| PROC
+    vpush               {d8-d15}                    ;save d8-d15 (64 bytes of stack)
     ;pld                [r1]                        ;preload pred data
     ;pld                [r1, #128]
     ;pld                [r1, #256]
@@ -64,6 +65,7 @@ extra_copy_neon_loop
     bne             extra_copy_neon_loop
 
 done_copy_neon_loop
+    vpop            {d8-d15}                        ;restore d8-d15 saved at entry
     bx              lr
     ENDP
 
diff --git a/vp8/encoder/arm/neon/vp8_mse16x16_neon.asm b/vp8/encoder/arm/neon/vp8_mse16x16_neon.asm
index 55edbf5129ed013ffa0b5ea32c5aa6682ad24e6c..f82af3ee333a08ceac25261da45796fc0b6d719d 100644
--- a/vp8/encoder/arm/neon/vp8_mse16x16_neon.asm
+++ b/vp8/encoder/arm/neon/vp8_mse16x16_neon.asm
@@ -27,6 +27,8 @@
 ;from vp8_variance().
 
 |vp8_mse16x16_neon| PROC
+    vpush           {q7}                        ;save q7 (16 bytes of stack); it is overwritten below
+
     vmov.i8         q7, #0                      ;q7, q8, q9, q10 - sse
     vmov.i8         q8, #0
     vmov.i8         q9, #0
@@ -62,7 +64,7 @@ mse16x16_neon_loop
     vadd.u32        q7, q7, q8
     vadd.u32        q9, q9, q10
 
-    ldr             r12, [sp]               ;load *sse from stack
+    ldr             r12, [sp, #16]              ;load *sse from stack (above the 16 bytes saved for q7)
 
     vadd.u32        q10, q7, q9
     vpaddl.u32      q1, q10
@@ -71,6 +73,7 @@ mse16x16_neon_loop
     vst1.32         {d0[0]}, [r12]
     vmov.32         r0, d0[0]
 
+    vpop            {q7}                        ;restore q7 saved at entry
     bx              lr
 
     ENDP
@@ -82,6 +85,8 @@ mse16x16_neon_loop
 ; r2    unsigned char *ref_ptr,
 ; r3    int  recon_stride
 |vp8_get4x4sse_cs_neon| PROC
+    vpush           {q7}                        ;save q7 (16 bytes of stack)
+
     vld1.8          {d0}, [r0], r1              ;Load up source and reference
     vld1.8          {d4}, [r2], r3
     vld1.8          {d1}, [r0], r1
@@ -109,6 +114,8 @@ mse16x16_neon_loop
     vadd.u64        d0, d2, d3
 
     vmov.32         r0, d0[0]
+
+    vpop            {q7}                        ;restore q7 saved at entry
     bx              lr
 
     ENDP