; File: aom_convolve8_avg_neon_asm.asm
;
; Copyright (c) 2016, Alliance for Open Media. All rights reserved
;
; This source code is subject to the terms of the BSD 2 Clause License and
; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
; was not distributed with this source code in the LICENSE file, you can
; obtain it at www.aomedia.org/license/software. If the Alliance for Open
; Media Patent License 1.0 was not distributed with this source code in the
; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
;

;


    ; These functions are only valid when all of the following hold:
    ; x_step_q4 == 16
    ; w%4 == 0
    ; h%4 == 0
    ; taps == 8
    ; AV1_FILTER_WEIGHT == 128
    ; AV1_FILTER_SHIFT == 7

    ; Entry points made visible to the linker.
    EXPORT  |aom_convolve8_avg_horiz_neon|
    EXPORT  |aom_convolve8_avg_vert_neon|
    ARM                                     ; assemble as ARM (A32) instructions
    REQUIRE8                                ; code requires an 8-byte aligned stack
    PRESERVE8                               ; code preserves 8-byte stack alignment

    ; Read-only code section, aligned to 2^2 = 4 bytes.
    AREA ||.text||, CODE, READONLY, ALIGN=2

    ; 8-tap dot product against the coefficients held in q0.
    ;   $acc       q register that receives four 32-bit sums
    ;   $t0..$t7   d registers of four 16-bit lanes, one register per tap
    ; Per lane: $acc = $t0*d0[0] + $t1*d0[1] + $t2*d0[2] + $t3*d0[3]
    ;                + $t4*d1[0] + $t5*d1[1] + $t6*d1[2] + $t7*d1[3]
    ; The first product overwrites $acc; the remaining seven accumulate.
    MACRO
    MULTIPLY_BY_Q0 $acc, $t0, $t1, $t2, $t3, $t4, $t5, $t6, $t7
    vmull.s16 $acc, $t0, d0[0]              ; tap 0 (initialises the sum)
    vmlal.s16 $acc, $t1, d0[1]              ; tap 1
    vmlal.s16 $acc, $t2, d0[2]              ; tap 2
    vmlal.s16 $acc, $t3, d0[3]              ; tap 3
    vmlal.s16 $acc, $t4, d1[0]              ; tap 4
    vmlal.s16 $acc, $t5, d1[1]              ; tap 5
    vmlal.s16 $acc, $t6, d1[2]              ; tap 6
    vmlal.s16 $acc, $t7, d1[3]              ; tap 7
    MEND

; aom_convolve8_avg_horiz_neon
;
; Applies the 8-tap filter_x along each row of a w x h block, working on one
; 4x4 tile at a time. Every filtered sample is rounded ((sum + 64) >> 7),
; clamped to 0..255 and then combined with the byte already stored in dst
; using a rounding average ((new + old + 1) >> 1).
;
; Arguments (stack offsets are relative to sp on entry, before the push):
; r0      const uint8_t *src
; r1      int src_stride
; r2      uint8_t *dst
; r3      int dst_stride
; sp[0]   const int16_t *filter_x
; sp[4]   int x_step_q4           ; never read here
; sp[8]   const int16_t *filter_y ; unused
; sp[12]  int y_step_q4           ; unused
; sp[16]  int w
; sp[20]  int h
;
; Both loops are bottom-tested and step by 4, so w and h are expected to be
; positive multiples of 4.
;
; Register roles after setup:
; q0      the eight filter taps (s16)
; r4      -3 * dst_stride + 4 : from tile row 3 back to row 0, 4 columns on
; r8      -3 * src_stride + 4 : same step for the source pointer
; r9      4 * src_stride - w - 7 : moves src to the next band of 4 rows
; r12     4 * dst_stride - w     : moves dst to the next band of 4 rows
; r6/r10  remaining width / saved copy of w
; r7      remaining height
; r5      filter pointer during setup, then prefetch base inside the loop
;
; Data layout: each d register carries one source column of the tile as four
; 16-bit lanes (one lane per row). The seven columns carried between
; iterations live in d16, d17, d20, d22, d18, d19, d23 (oldest first).

|aom_convolve8_avg_horiz_neon| PROC
    push            {r4-r10, lr}            ; 8 registers = 32 bytes

    sub             r0, r0, #3              ; adjust for taps: start 3 pixels left

    ldr             r5, [sp, #32]           ; filter_x
    ldr             r6, [sp, #48]           ; w
    ldr             r7, [sp, #52]           ; h

    vld1.s16        {q0}, [r5]              ; filter_x

    sub             r8, r1, r1, lsl #2      ; -src_stride * 3
    add             r8, r8, #4              ; -src_stride * 3 + 4

    sub             r4, r3, r3, lsl #2      ; -dst_stride * 3
    add             r4, r4, #4              ; -dst_stride * 3 + 4

    rsb             r9, r6, r1, lsl #2      ; reset src for outer loop
    sub             r9, r9, #7              ; src_stride * 4 - w - 7
    rsb             r12, r6, r3, lsl #2     ; reset dst for outer loop

    mov             r10, r6                 ; w loop counter

    ; Start of a band of 4 rows: prime the window with the first 8 source
    ; columns of each row.
aom_convolve8_avg_loop_horiz_v
    vld1.8          {d24}, [r0], r1
    vld1.8          {d25}, [r0], r1
    vld1.8          {d26}, [r0], r1
    vld1.8          {d27}, [r0], r8         ; back to row 0, 4 columns on

    ; transpose so that each 4-byte half of d24..d27 is one column
    vtrn.16         q12, q13
    vtrn.8          d24, d25
    vtrn.8          d26, d27

    pld             [r0, r1, lsl #2]        ; prefetch 4 rows below

    ; widen u8 -> 16 bits: d16 = col 0, d18 = col 1, d20 = col 2, d22 = col 3,
    ; d17 = col 4, d19 = col 5, d21 = col 6, d23 = col 7
    vmovl.u8        q8, d24
    vmovl.u8        q9, d25
    vmovl.u8        q10, d26
    vmovl.u8        q11, d27

    ; save a few instructions in the inner loop: put the columns in the
    ; registers the end-of-loop shuffle expects (col 7 is dropped here and
    ; reloaded by the first inner iteration)
    vswp            d17, d18                ; d17 = col 1, d18 = col 4
    vmov            d23, d21                ; d23 = col 6

    add             r0, r0, #3              ; r0 -> column 7 of the window

    ; One 4x4 output tile per iteration.
aom_convolve8_avg_loop_horiz
    add             r5, r0, #64             ; prefetch base, 64 bytes ahead

    ; four new source bytes per row, replicated into both halves
    vld1.32         {d28[]}, [r0], r1
    vld1.32         {d29[]}, [r0], r1
    vld1.32         {d31[]}, [r0], r1
    vld1.32         {d30[]}, [r0], r8       ; back to row 0, 4 columns on

    pld             [r5]

    vtrn.16         d28, d31
    vtrn.16         d29, d30
    vtrn.8          d28, d29
    vtrn.8          d31, d30

    pld             [r5, r1]

    ; extract to s16: the new columns end up, oldest first, in
    ; d24, d26, d27, d25
    vtrn.32         q14, q15
    vmovl.u8        q12, d28
    vmovl.u8        q13, d29

    pld             [r5, r1, lsl #1]

    ; slightly out of order load to match the existing data:
    ; d6 = dst rows 0 and 2, d7 = dst rows 1 and 3
    vld1.u32        {d6[0]}, [r2], r3
    vld1.u32        {d7[0]}, [r2], r3
    vld1.u32        {d6[1]}, [r2], r3
    vld1.u32        {d7[1]}, [r2], r3

    sub             r2, r2, r3, lsl #2      ; reset for store

    ; src[] * filter_x: q1, q2, q14, q15 = output columns 0..3
    MULTIPLY_BY_Q0  q1,  d16, d17, d20, d22, d18, d19, d23, d24
    MULTIPLY_BY_Q0  q2,  d17, d20, d22, d18, d19, d23, d24, d26
    MULTIPLY_BY_Q0  q14, d20, d22, d18, d19, d23, d24, d26, d27
    MULTIPLY_BY_Q0  q15, d22, d18, d19, d23, d24, d26, d27, d25

    pld             [r5, -r8]               ; r5 + 3 * src_stride - 4

    ; (sum + 64) >> 7, saturated from s32 to u16
    vqrshrun.s32    d2, q1, #7
    vqrshrun.s32    d3, q2, #7
    vqrshrun.s32    d4, q14, #7
    vqrshrun.s32    d5, q15, #7

    ; saturate u16 -> u8
    vqmovn.u16      d2, q1
    vqmovn.u16      d3, q2

    ; transpose back to rows: d2 = rows 0 and 2, d3 = rows 1 and 3
    vtrn.16         d2, d3
    vtrn.32         d2, d3
    vtrn.8          d2, d3

    ; average the new value and the dst value
    vrhadd.u8       q1, q1, q3

    vst1.u32        {d2[0]}, [r2@32], r3
    vst1.u32        {d3[0]}, [r2@32], r3
    vst1.u32        {d2[1]}, [r2@32], r3
    vst1.u32        {d3[1]}, [r2@32], r4    ; back to row 0, 4 columns on

    ; slide the 7-column window 4 columns to the right
    vmov            q8,  q9
    vmov            d20, d23
    vmov            q11, q12
    vmov            q9,  q13

    subs            r6, r6, #4              ; w -= 4
    bgt             aom_convolve8_avg_loop_horiz

    ; outer loop
    mov             r6, r10                 ; restore w counter
    add             r0, r0, r9              ; src += src_stride * 4 - w - 7
    add             r2, r2, r12             ; dst += dst_stride * 4 - w
    subs            r7, r7, #4              ; h -= 4
    bgt aom_convolve8_avg_loop_horiz_v

    pop             {r4-r10, pc}

    ENDP

; aom_convolve8_avg_vert_neon
;
; Applies the 8-tap filter_y down each column of a w x h block, working on
; one 4x4 tile at a time. Every filtered sample is rounded ((sum + 64) >> 7),
; clamped to 0..255 and then combined with the byte already stored in dst
; using a rounding average ((new + old + 1) >> 1).
;
; Arguments (stack offsets are relative to sp on entry, before the push):
; r0      const uint8_t *src
; r1      int src_stride
; r2      uint8_t *dst
; r3      int dst_stride
; sp[0]   const int16_t *filter_x ; unused
; sp[4]   int x_step_q4           ; unused
; sp[8]   const int16_t *filter_y
; sp[12]  int y_step_q4           ; unused
; sp[16]  int w
; sp[20]  int h
;
; Both loops are bottom-tested and step by 4, so w and h are expected to be
; positive multiples of 4.
;
; Register roles after setup:
; q0      the eight filter taps (s16)
; r1, r3  src_stride * 2 and dst_stride * 2 (doubled in place)
; r4, r7  source pointers for even / odd rows of the current strip
; r5, r8  destination pointers for even / odd rows of the current strip
; r6      remaining width
; lr/r12  h / remaining height of the current strip
;
; Data layout: each d register carries one source row of the 4-pixel-wide
; strip as four 16-bit lanes. The seven rows carried between iterations
; live in d16..d22 (oldest first).

|aom_convolve8_avg_vert_neon| PROC
    push            {r4-r8, lr}             ; 6 registers = 24 bytes

    ; adjust for taps: start 3 rows above
    sub             r0, r0, r1
    sub             r0, r0, r1, lsl #1

    ldr             r4, [sp, #32]           ; filter_y
    ldr             r6, [sp, #40]           ; w
    ldr             lr, [sp, #44]           ; h

    vld1.s16        {q0}, [r4]              ; filter_y

    ; two pointers walk alternate rows, so each one steps by twice the stride
    lsl             r1, r1, #1
    lsl             r3, r3, #1

    ; Start of a 4-pixel-wide strip.
aom_convolve8_avg_loop_vert_h
    mov             r4, r0                  ; even source rows
    add             r7, r0, r1, asr #1      ; odd source rows (r0 + src_stride)
    mov             r5, r2                  ; even dst rows
    add             r8, r2, r3, asr #1      ; odd dst rows (r2 + dst_stride)
    mov             r12, lr                 ; h loop counter

    ; prime the window with source rows 0..6, two rows per d register
    vld1.u32        {d16[0]}, [r4], r1
    vld1.u32        {d16[1]}, [r7], r1
    vld1.u32        {d18[0]}, [r4], r1
    vld1.u32        {d18[1]}, [r7], r1
    vld1.u32        {d20[0]}, [r4], r1
    vld1.u32        {d20[1]}, [r7], r1
    vld1.u32        {d22[0]}, [r4], r1

    ; widen u8 -> 16 bits: d16..d22 = rows 0..6
    vmovl.u8        q8, d16
    vmovl.u8        q9, d18
    vmovl.u8        q10, d20
    vmovl.u8        q11, d22

aom_convolve8_avg_loop_vert
    ; always process a 4x4 block at a time
    ; four new source rows: d24 = rows 7 and 10, d26 = rows 8 and 9
    vld1.u32        {d24[0]}, [r7], r1
    vld1.u32        {d26[0]}, [r4], r1
    vld1.u32        {d26[1]}, [r7], r1
    vld1.u32        {d24[1]}, [r4], r1

    ; extract to s16: d24 = row 7, d26 = row 8, d27 = row 9, d25 = row 10
    vmovl.u8        q12, d24
    vmovl.u8        q13, d26

    ; existing dst pixels: d6 = rows 0 and 1, d7 = rows 2 and 3
    vld1.u32        {d6[0]}, [r5@32], r3
    vld1.u32        {d6[1]}, [r8@32], r3
    vld1.u32        {d7[0]}, [r5@32], r3
    vld1.u32        {d7[1]}, [r8@32], r3

    pld             [r7]
    pld             [r4]

    ; src[] * filter_y: q1, q2, q14, q15 = output rows 0..3
    MULTIPLY_BY_Q0  q1,  d16, d17, d18, d19, d20, d21, d22, d24

    pld             [r7, r1]
    pld             [r4, r1]

    MULTIPLY_BY_Q0  q2,  d17, d18, d19, d20, d21, d22, d24, d26

    pld             [r5]
    pld             [r8]

    MULTIPLY_BY_Q0  q14, d18, d19, d20, d21, d22, d24, d26, d27

    pld             [r5, r3]
    pld             [r8, r3]

    MULTIPLY_BY_Q0  q15, d19, d20, d21, d22, d24, d26, d27, d25

    ; (sum + 64) >> 7, saturated from s32 to u16
    vqrshrun.s32    d2, q1, #7
    vqrshrun.s32    d3, q2, #7
    vqrshrun.s32    d4, q14, #7
    vqrshrun.s32    d5, q15, #7

    ; saturate u16 -> u8: d2 = rows 0 and 1, d3 = rows 2 and 3
    vqmovn.u16      d2, q1
    vqmovn.u16      d3, q2

    ; average the new value and the dst value
    vrhadd.u8       q1, q1, q3

    sub             r5, r5, r3, lsl #1      ; reset for store
    sub             r8, r8, r3, lsl #1

    vst1.u32        {d2[0]}, [r5@32], r3
    vst1.u32        {d2[1]}, [r8@32], r3
    vst1.u32        {d3[0]}, [r5@32], r3
    vst1.u32        {d3[1]}, [r8@32], r3

    ; slide the 7-row window down by 4 rows
    vmov            q8, q10
    vmov            d18, d22
    vmov            d19, d24
    vmov            q10, q13
    vmov            d22, d25

    subs            r12, r12, #4            ; h -= 4
    bgt             aom_convolve8_avg_loop_vert

    ; outer loop: move to the next 4-pixel-wide strip
    add             r0, r0, #4
    add             r2, r2, #4
    subs            r6, r6, #4              ; w -= 4
    bgt             aom_convolve8_avg_loop_vert_h

    pop             {r4-r8, pc}

    ENDP
    END