; vpx_convolve8_avg_neon_asm.asm
;
;  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


    ; These functions are only valid when:
    ; x_step_q4 == 16
    ; w%4 == 0
    ; h%4 == 0
    ; taps == 8
    ; VP9_FILTER_WEIGHT == 128
    ; VP9_FILTER_SHIFT == 7

; Public entry points defined in this file.
    EXPORT  |vpx_convolve8_avg_horiz_neon|
    EXPORT  |vpx_convolve8_avg_vert_neon|

    ; Assemble as ARM (A32) instructions. REQUIRE8 / PRESERVE8 are the
    ; assembler's 8-byte stack alignment build attributes for this object.
    ARM
    REQUIRE8
    PRESERVE8

    ; Read-only code section; ALIGN=2 requests 2^2 = 4-byte alignment.
    AREA ||.text||, CODE, READONLY, ALIGN=2

    ; 8-tap multiply-accumulate against the coefficients held in q0.
    ;
    ; The first argument is the q register that receives the 32-bit sums;
    ; the remaining eight arguments are d registers of signed 16-bit
    ; samples. Input k is scaled by lane k of q0 (d0[0..3], then d1[0..3]).
    ; The first product overwrites the accumulator (vmull), the other seven
    ; are added to it (vmlal). Only the accumulator register is written.
    MACRO
    MULTIPLY_BY_Q0 $acc, $in0, $in1, $in2, $in3, $in4, $in5, $in6, $in7
    vmull.s16 $acc, $in0, d0[0]
    vmlal.s16 $acc, $in1, d0[1]
    vmlal.s16 $acc, $in2, d0[2]
    vmlal.s16 $acc, $in3, d0[3]
    vmlal.s16 $acc, $in4, d1[0]
    vmlal.s16 $acc, $in5, d1[1]
    vmlal.s16 $acc, $in6, d1[2]
    vmlal.s16 $acc, $in7, d1[3]
    MEND

; ----------------------------------------------------------------------------
; vpx_convolve8_avg_horiz_neon
;
; Applies the 8-tap filter_x along each row of src, rounds and narrows the
; result to 8 bits, then stores the rounded average of that result and the
; bytes already in dst. Works on 4x4 pixel blocks: the inner loop walks
; across the width, the outer loop walks down the height.
;
; Arguments (stack arguments are listed in the order they are passed):
; r0    const uint8_t *src
; r1    int src_stride
; r2    uint8_t *dst
; r3    int dst_stride
; sp[]  const int16_t *filter_x
; sp[]  int x_step_q4           ; not read here (see the constraints at the
;                               ; top of the file)
; sp[]  const int16_t *filter_y ; unused
; sp[]  int y_step_q4           ; unused
; sp[]  int w
; sp[]  int h
;
; Register use inside the function:
;   q0        the eight filter_x coefficients
;   r4        dst step after the 4th row store: -dst_stride * 3 + 4
;   r5        filter_x pointer, then reused as the prefetch base
;   r6 / r10  remaining width in the current band / saved copy of w
;   r7        remaining height
;   r8        src step after the 4th row load:  -src_stride * 3 + 4
;   r9, r12   src / dst adjustment applied at the end of each 4-row band
;   q3        the four dst rows being averaged against
;   d16, d17, d20, d22, d18, d19, d23
;             the seven most recent input columns as 16-bit values, in that
;             order (the order they are passed to MULTIPLY_BY_Q0)
; ----------------------------------------------------------------------------

|vpx_convolve8_avg_horiz_neon| PROC
    push            {r4-r10, lr}            ; 8 registers = 32 bytes

    sub             r0, r0, #3              ; adjust for taps

    ; the push moved sp down by 32 bytes, so the first stack argument is
    ; now at [sp, #32] and each following one is 4 bytes further on
    ldr             r5, [sp, #32]           ; filter_x
    ldr             r6, [sp, #48]           ; w
    ldr             r7, [sp, #52]           ; h

    vld1.s16        {q0}, [r5]              ; filter_x

    ; step used on the last of four row accesses: back up three rows and
    ; move four bytes to the right
    sub             r8, r1, r1, lsl #2      ; -src_stride * 3
    add             r8, r8, #4              ; -src_stride * 3 + 4

    sub             r4, r3, r3, lsl #2      ; -dst_stride * 3
    add             r4, r4, #4              ; -dst_stride * 3 + 4

    rsb             r9, r6, r1, lsl #2      ; reset src for outer loop
    sub             r9, r9, #7              ; src_stride * 4 - w - 7
    rsb             r12, r6, r3, lsl #2     ; reset dst for outer loop

    mov             r10, r6                 ; w loop counter

    ; ---- outer loop: one iteration per band of 4 rows ----
vpx_convolve8_avg_loop_horiz_v
    ; load 8 bytes from each of the 4 rows; the last load steps by r8, which
    ; puts r0 back on the first row, 4 bytes further along
    vld1.8          {d24}, [r0], r1
    vld1.8          {d25}, [r0], r1
    vld1.8          {d26}, [r0], r1
    vld1.8          {d27}, [r0], r8

    ; transpose so that each d register holds one column of the 4 rows
    vtrn.16         q12, q13
    vtrn.8          d24, d25
    vtrn.8          d26, d27

    pld             [r0, r1, lsl #2]

    ; widen the bytes to 16 bits
    vmovl.u8        q8, d24
    vmovl.u8        q9, d25
    vmovl.u8        q10, d26
    vmovl.u8        q11, d27

    ; save a few instructions in the inner loop
    vswp            d17, d18
    vmov            d23, d21

    add             r0, r0, #3              ; r0 is now 7 bytes past the band start

    ; ---- inner loop: one iteration per 4x4 output block ----
vpx_convolve8_avg_loop_horiz
    add             r5, r0, #64             ; prefetch base

    ; load the next 4 bytes of each row (replicated into both halves of the
    ; d register); the last load steps by r8 as above
    vld1.32         {d28[]}, [r0], r1
    vld1.32         {d29[]}, [r0], r1
    vld1.32         {d31[]}, [r0], r1
    vld1.32         {d30[]}, [r0], r8

    pld             [r5]

    vtrn.16         d28, d31
    vtrn.16         d29, d30
    vtrn.8          d28, d29
    vtrn.8          d31, d30

    pld             [r5, r1]

    ; extract to s16
    vtrn.32         q14, q15
    vmovl.u8        q12, d28
    vmovl.u8        q13, d29

    pld             [r5, r1, lsl #1]

    ; slightly out of order load to match the existing data
    ; (rows 0 and 2 go to d6, rows 1 and 3 go to d7, matching the layout of
    ; d2 / d3 after the transpose below)
    vld1.u32        {d6[0]}, [r2], r3
    vld1.u32        {d7[0]}, [r2], r3
    vld1.u32        {d6[1]}, [r2], r3
    vld1.u32        {d7[1]}, [r2], r3

    sub             r2, r2, r3, lsl #2      ; reset for store

    ; src[] * filter_x
    ; each call shifts the 8-column window one column to the right
    MULTIPLY_BY_Q0  q1,  d16, d17, d20, d22, d18, d19, d23, d24
    MULTIPLY_BY_Q0  q2,  d17, d20, d22, d18, d19, d23, d24, d26
    MULTIPLY_BY_Q0  q14, d20, d22, d18, d19, d23, d24, d26, d27
    MULTIPLY_BY_Q0  q15, d22, d18, d19, d23, d24, d26, d27, d25

    pld             [r5, -r8]

    ; += 64 >> 7
    ; (rounding shift right by 7 with saturation to unsigned 16 bits)
    vqrshrun.s32    d2, q1, #7
    vqrshrun.s32    d3, q2, #7
    vqrshrun.s32    d4, q14, #7
    vqrshrun.s32    d5, q15, #7

    ; saturate
    ; (q1 = d2:d3 and q2 = d4:d5 are narrowed from 16 to 8 bits)
    vqmovn.u16      d2, q1
    vqmovn.u16      d3, q2

    ; transpose
    vtrn.16         d2, d3
    vtrn.32         d2, d3
    vtrn.8          d2, d3

    ; average the new value and the dst value
    vrhadd.u8       q1, q1, q3

    ; store the 4 rows; the last store steps by r4, which puts r2 back on
    ; the first row, 4 bytes further along
    vst1.u32        {d2[0]}, [r2@32], r3
    vst1.u32        {d3[0]}, [r2@32], r3
    vst1.u32        {d2[1]}, [r2@32], r3
    vst1.u32        {d3[1]}, [r2@32], r4

    ; slide the column window 4 columns to the right for the next block
    vmov            q8,  q9
    vmov            d20, d23
    vmov            q11, q12
    vmov            q9,  q13

    subs            r6, r6, #4              ; w -= 4
    bgt             vpx_convolve8_avg_loop_horiz

    ; outer loop
    mov             r6, r10                 ; restore w counter
    add             r0, r0, r9              ; src += src_stride * 4 - w - 7
    add             r2, r2, r12             ; dst += dst_stride * 4 - w
    subs            r7, r7, #4              ; h -= 4
    bgt vpx_convolve8_avg_loop_horiz_v

    pop             {r4-r10, pc}

    ENDP

; ----------------------------------------------------------------------------
; vpx_convolve8_avg_vert_neon
;
; Applies the 8-tap filter_y down each column of src, rounds and narrows the
; result to 8 bits, then stores the rounded average of that result and the
; bytes already in dst. Works on 4x4 pixel blocks: the inner loop walks
; down the height, the outer loop walks across the width 4 columns at a time.
;
; Arguments as the code reads them:
; r0              src pointer
; r1              src stride
; r2              dst pointer
; r3              dst stride
; [sp, #32]       filter_y   (offsets are those in effect after the push)
; [sp, #40]       w
; [sp, #44]       h
;
; Register use inside the function:
;   q0        the eight filter_y coefficients
;   r1, r3    src / dst stride, doubled right after the arguments are read
;   r4, r7    src pointers for alternating rows (each steps by 2 rows)
;   r5, r8    dst pointers for alternating rows (each steps by 2 rows)
;   r6        remaining width
;   lr / r12  h / remaining height in the current 4-column strip
;   q3        the four dst rows being averaged against
;   d16 - d22 the seven most recent input rows as 16-bit values
; ----------------------------------------------------------------------------

|vpx_convolve8_avg_vert_neon| PROC
    push            {r4-r8, lr}             ; 6 registers = 24 bytes

    ; adjust for taps
    ; (src -= src_stride * 3)
    sub             r0, r0, r1
    sub             r0, r0, r1, lsl #1

    ldr             r4, [sp, #32]           ; filter_y
    ldr             r6, [sp, #40]           ; w
    ldr             lr, [sp, #44]           ; h

    vld1.s16        {q0}, [r4]              ; filter_y

    ; double both strides: two pointers per buffer are used below, one for
    ; each of two adjacent rows, and each pointer advances two rows at a time
    lsl             r1, r1, #1
    lsl             r3, r3, #1

    ; ---- outer loop: one iteration per strip of 4 columns ----
vpx_convolve8_avg_loop_vert_h
    mov             r4, r0                  ; src rows 0, 2, 4, ...
    add             r7, r0, r1, asr #1      ; src rows 1, 3, 5, ... (+ one original stride)
    mov             r5, r2                  ; dst rows 0, 2, ...
    add             r8, r2, r3, asr #1      ; dst rows 1, 3, ...
    mov             r12, lr                 ; h loop counter

    ; preload the first 7 rows, 4 bytes each
    vld1.u32        {d16[0]}, [r4], r1
    vld1.u32        {d16[1]}, [r7], r1
    vld1.u32        {d18[0]}, [r4], r1
    vld1.u32        {d18[1]}, [r7], r1
    vld1.u32        {d20[0]}, [r4], r1
    vld1.u32        {d20[1]}, [r7], r1
    vld1.u32        {d22[0]}, [r4], r1

    ; widen the bytes to 16 bits; afterwards d16 .. d22 hold rows 0 .. 6
    vmovl.u8        q8, d16
    vmovl.u8        q9, d18
    vmovl.u8        q10, d20
    vmovl.u8        q11, d22

    ; ---- inner loop: one iteration per 4x4 output block ----
vpx_convolve8_avg_loop_vert
    ; always process a 4x4 block at a time
    ; the next 4 rows go to d24[0], d26[0], d26[1], d24[1] in that order
    vld1.u32        {d24[0]}, [r7], r1
    vld1.u32        {d26[0]}, [r4], r1
    vld1.u32        {d26[1]}, [r7], r1
    vld1.u32        {d24[1]}, [r4], r1

    ; extract to s16
    ; afterwards the four new rows are in d24, d26, d27, d25 in that order
    vmovl.u8        q12, d24
    vmovl.u8        q13, d26

    ; load the 4 dst rows to average against
    vld1.u32        {d6[0]}, [r5@32], r3
    vld1.u32        {d6[1]}, [r8@32], r3
    vld1.u32        {d7[0]}, [r5@32], r3
    vld1.u32        {d7[1]}, [r8@32], r3

    pld             [r7]
    pld             [r4]

    ; src[] * filter_y
    ; each call shifts the 8-row window one row down; prefetches are
    ; interleaved between the calls
    MULTIPLY_BY_Q0  q1,  d16, d17, d18, d19, d20, d21, d22, d24

    pld             [r7, r1]
    pld             [r4, r1]

    MULTIPLY_BY_Q0  q2,  d17, d18, d19, d20, d21, d22, d24, d26

    pld             [r5]
    pld             [r8]

    MULTIPLY_BY_Q0  q14, d18, d19, d20, d21, d22, d24, d26, d27

    pld             [r5, r3]
    pld             [r8, r3]

    MULTIPLY_BY_Q0  q15, d19, d20, d21, d22, d24, d26, d27, d25

    ; += 64 >> 7
    ; (rounding shift right by 7 with saturation to unsigned 16 bits)
    vqrshrun.s32    d2, q1, #7
    vqrshrun.s32    d3, q2, #7
    vqrshrun.s32    d4, q14, #7
    vqrshrun.s32    d5, q15, #7

    ; saturate
    ; (q1 = d2:d3 and q2 = d4:d5 are narrowed from 16 to 8 bits)
    vqmovn.u16      d2, q1
    vqmovn.u16      d3, q2

    ; average the new value and the dst value
    vrhadd.u8       q1, q1, q3

    ; each dst pointer advanced twice by the doubled stride during the loads
    sub             r5, r5, r3, lsl #1      ; reset for store
    sub             r8, r8, r3, lsl #1

    vst1.u32        {d2[0]}, [r5@32], r3
    vst1.u32        {d2[1]}, [r8@32], r3
    vst1.u32        {d3[0]}, [r5@32], r3
    vst1.u32        {d3[1]}, [r8@32], r3

    ; slide the row window 4 rows down for the next block
    vmov            q8, q10
    vmov            d18, d22
    vmov            d19, d24
    vmov            q10, q13
    vmov            d22, d25

    subs            r12, r12, #4            ; h -= 4
    bgt             vpx_convolve8_avg_loop_vert

    ; outer loop
    ; move both base pointers 4 columns to the right
    add             r0, r0, #4
    add             r2, r2, #4
    subs            r6, r6, #4              ; w -= 4
    bgt             vpx_convolve8_avg_loop_vert_h

    pop             {r4-r8, pc}

    ENDP
    END