;
; Copyright (c) 2016, Alliance for Open Media. All rights reserved
;
; This source code is subject to the terms of the BSD 2 Clause License and
; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
; was not distributed with this source code in the LICENSE file, you can
; obtain it at www.aomedia.org/license/software. If the Alliance for Open
; Media Patent License 1.0 was not distributed with this source code in the
; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
;

;


    ; These functions are only valid when:
    ; x_step_q4 == 16 (horizontal) / y_step_q4 == 16 (vertical); neither
    ;                 routine reads its step argument, a unit step is assumed
    ; w%4 == 0
    ; h%4 == 0
    ; taps == 8
    ; AV1_FILTER_WEIGHT == 128
    ; AV1_FILTER_SHIFT == 7

    ; Entry points made visible to the linker.
    EXPORT  |aom_convolve8_horiz_neon|
    EXPORT  |aom_convolve8_vert_neon|
    ; Assemble as ARM (not Thumb) instructions.
    ARM
    ; This code requires, and preserves, 8-byte stack alignment.
    REQUIRE8
    PRESERVE8

    ; Read-only code section, aligned to 2^2 = 4 bytes.
    AREA ||.text||, CODE, READONLY, ALIGN=2

    ; MULTIPLY_BY_Q0: 8-term widening multiply-accumulate against q0.
    ;
    ;   $acc        q register receiving four 32-bit sums (overwritten)
    ;   $in0..$in7  d registers, each holding four signed 16-bit samples
    ;
    ; Computes, per lane:
    ;   $acc = $in0*d0[0] + $in1*d0[1] + $in2*d0[2] + $in3*d0[3]
    ;        + $in4*d1[0] + $in5*d1[1] + $in6*d1[2] + $in7*d1[3]
    ; where d0/d1 are the two halves of q0 (eight signed 16-bit coefficients).
    ; The first term uses vmull, so any previous contents of $acc are discarded.
    MACRO
    MULTIPLY_BY_Q0 $acc, $in0, $in1, $in2, $in3, $in4, $in5, $in6, $in7
    vmull.s16 $acc, $in0, d0[0]
    vmlal.s16 $acc, $in1, d0[1]
    vmlal.s16 $acc, $in2, d0[2]
    vmlal.s16 $acc, $in3, d0[3]
    vmlal.s16 $acc, $in4, d1[0]
    vmlal.s16 $acc, $in5, d1[1]
    vmlal.s16 $acc, $in6, d1[2]
    vmlal.s16 $acc, $in7, d1[3]
    MEND

; aom_convolve8_horiz_neon
;
; Horizontal 8-tap filter over a w x h block of 8-bit pixels, processed in
; 4x4 tiles. Each output pixel is the rounded, >>7, saturated sum of eight
; horizontally adjacent source pixels (src[-3] .. src[+4]) multiplied by the
; eight 16-bit coefficients at filter_x.
;
; Arguments (stack offsets are relative to sp after the 32-byte push below):
; r0      const uint8_t *src
; r1      int src_stride
; r2      uint8_t *dst              ; stored with a 32-bit alignment hint
; r3      int dst_stride
; sp[32]  const int16_t *filter_x
; sp[36]  int x_step_q4             ; not read
; sp[40]  const int16_t *filter_y   ; unused
; sp[44]  int y_step_q4             ; unused
; sp[48]  int w
; sp[52]  int h
;
; Both loops are bottom-tested, so at least one 4x4 tile is always processed.
;
; Register roles:
; r4      -dst_stride * 3 + 4   (dst step after the 4th row of a tile)
; r5      filter_x pointer, then prefetch base inside the inner loop
; r6      columns remaining in the current group of 4 rows
; r7      rows remaining
; r8      -src_stride * 3 + 4   (src step after the 4th row of a tile)
; r9      src_stride * 4 - w - 7 (src step to the next group of 4 rows)
; r10     saved copy of w
; r12     dst_stride * 4 - w     (dst step to the next group of 4 rows)
; q0      the eight filter coefficients
; NEON registers written: q0-q2 and q8-q15 only.

|aom_convolve8_horiz_neon| PROC
    push            {r4-r10, lr}

    sub             r0, r0, #3              ; adjust for taps

    ldr             r5, [sp, #32]           ; filter_x
    ldr             r6, [sp, #48]           ; w
    ldr             r7, [sp, #52]           ; h

    vld1.s16        {q0}, [r5]              ; filter_x

    sub             r8, r1, r1, lsl #2      ; -src_stride * 3
    add             r8, r8, #4              ; -src_stride * 3 + 4

    sub             r4, r3, r3, lsl #2      ; -dst_stride * 3
    add             r4, r4, #4              ; -dst_stride * 3 + 4

    rsb             r9, r6, r1, lsl #2      ; reset src for outer loop
    sub             r9, r9, #7              ; undo the +4 and +3 applied while priming
    rsb             r12, r6, r3, lsl #2     ; reset dst for outer loop

    mov             r10, r6                 ; w loop counter

    ; Outer loop: one iteration per group of 4 rows.
    ; Prime the pipeline with 8 source bytes from each of the 4 rows; the
    ; fourth load steps back up 3 rows and 4 bytes to the right.
aom_convolve8_loop_horiz_v
    vld1.8          {d24}, [r0], r1
    vld1.8          {d25}, [r0], r1
    vld1.8          {d26}, [r0], r1
    vld1.8          {d27}, [r0], r8

    ; transpose: afterwards each d register holds two source columns
    ; (four rows each) in its low and high halves
    vtrn.16         q12, q13
    vtrn.8          d24, d25
    vtrn.8          d26, d27

    pld             [r0, r1, lsl #2]        ; prefetch hint: 4 rows ahead

    ; widen u8 -> 16 bit; each d register now holds one column (4 rows)
    vmovl.u8        q8, d24
    vmovl.u8        q9, d25
    vmovl.u8        q10, d26
    vmovl.u8        q11, d27

    ; save a few instructions in the inner loop
    ; (columns 0..6 end up in d16, d17, d20, d22, d18, d19, d23, the order
    ; the inner loop expects; the 8th loaded column is not used)
    vswp            d17, d18
    vmov            d23, d21

    add             r0, r0, #3              ; r0 -> first column not yet loaded

    ; Inner loop: one 4x4 output tile per iteration.
aom_convolve8_loop_horiz
    add             r5, r0, #64             ; prefetch base, 64 bytes ahead

    ; load 4 new bytes from each of the 4 rows (replicated to both lanes);
    ; the fourth load steps back up 3 rows and 4 bytes to the right
    vld1.32         {d28[]}, [r0], r1
    vld1.32         {d29[]}, [r0], r1
    vld1.32         {d31[]}, [r0], r1
    vld1.32         {d30[]}, [r0], r8

    pld             [r5]

    ; transpose the 4x4 byte tile into columns
    vtrn.16         d28, d31
    vtrn.16         d29, d30
    vtrn.8          d28, d29
    vtrn.8          d31, d30

    pld             [r5, r1]

    ; extract to s16
    vtrn.32         q14, q15
    vmovl.u8        q12, d28
    vmovl.u8        q13, d29

    pld             [r5, r1, lsl #1]

    ; src[] * filter_x
    ; one accumulator per output column; lanes are the four rows
    MULTIPLY_BY_Q0  q1,  d16, d17, d20, d22, d18, d19, d23, d24
    MULTIPLY_BY_Q0  q2,  d17, d20, d22, d18, d19, d23, d24, d26
    MULTIPLY_BY_Q0  q14, d20, d22, d18, d19, d23, d24, d26, d27
    MULTIPLY_BY_Q0  q15, d22, d18, d19, d23, d24, d26, d27, d25

    pld             [r5, -r8]

    ; += 64 >> 7
    ; (rounding shift right by 7, narrowed to unsigned 16 bit with saturation)
    vqrshrun.s32    d2, q1, #7
    vqrshrun.s32    d3, q2, #7
    vqrshrun.s32    d4, q14, #7
    vqrshrun.s32    d5, q15, #7

    ; saturate
    ; (narrow unsigned 16 bit -> 8 bit)
    vqmovn.u16      d2, q1
    vqmovn.u16      d3, q2

    ; transpose
    ; (back from column order to row order for the stores)
    vtrn.16         d2, d3
    vtrn.32         d2, d3
    vtrn.8          d2, d3

    ; store 4 bytes to each of 4 rows; the @32 hint requires a 4-byte aligned
    ; address. The fourth store steps back up 3 rows and 4 bytes to the right.
    vst1.u32        {d2[0]}, [r2@32], r3
    vst1.u32        {d3[0]}, [r2@32], r3
    vst1.u32        {d2[1]}, [r2@32], r3
    vst1.u32        {d3[1]}, [r2@32], r4

    ; slide the column window 4 columns to the right for the next tile
    vmov            q8,  q9
    vmov            d20, d23
    vmov            q11, q12
    vmov            q9,  q13

    subs            r6, r6, #4              ; w -= 4
    bgt             aom_convolve8_loop_horiz

    ; outer loop
    mov             r6, r10                 ; restore w counter
    add             r0, r0, r9              ; src += src_stride * 4 - w - 7
    add             r2, r2, r12             ; dst += dst_stride * 4 - w
    subs            r7, r7, #4              ; h -= 4
    bgt aom_convolve8_loop_horiz_v

    pop             {r4-r10, pc}

    ENDP

; aom_convolve8_vert_neon
;
; Vertical 8-tap filter over a w x h block of 8-bit pixels, processed in
; 4x4 tiles, one 4-pixel-wide column strip at a time. Each output pixel is
; the rounded, >>7, saturated sum of eight vertically adjacent source pixels
; (rows -3 .. +4) multiplied by the eight 16-bit coefficients at filter_y.
;
; Arguments (stack offsets are relative to sp after the 24-byte push below):
; r0      const uint8_t *src
; r1      int src_stride
; r2      uint8_t *dst              ; stored with a 32-bit alignment hint
; r3      int dst_stride
; sp[24]  const int16_t *filter_x   ; not read
; sp[28]  int x_step_q4             ; not read
; sp[32]  const int16_t *filter_y
; sp[36]  int y_step_q4             ; not read
; sp[40]  int w
; sp[44]  int h
;
; Both loops are bottom-tested, so at least one 4x4 tile is always processed.
;
; Register roles:
; r1, r3  src/dst stride, doubled after the prologue
; r4, r7  src pointers for even / odd rows of the current strip
; r5, r8  dst pointers for even / odd rows of the current strip
; r6      columns remaining
; r12     rows remaining in the current strip
; lr      saved copy of h
; q0      the eight filter coefficients
; NEON registers written: q0-q2 and q8-q15 only.

|aom_convolve8_vert_neon| PROC
    push            {r4-r8, lr}

    ; adjust for taps
    ; (src -= 3 * src_stride)
    sub             r0, r0, r1
    sub             r0, r0, r1, lsl #1

    ldr             r4, [sp, #32]           ; filter_y
    ldr             r6, [sp, #40]           ; w
    ldr             lr, [sp, #44]           ; h

    vld1.s16        {q0}, [r4]              ; filter_y

    ; double both strides: even and odd rows are walked by separate pointers
    lsl             r1, r1, #1
    lsl             r3, r3, #1

    ; Outer loop: one iteration per 4-pixel-wide column strip.
aom_convolve8_loop_vert_h
    mov             r4, r0                  ; even src rows
    add             r7, r0, r1, asr #1      ; odd src rows (r0 + one stride)
    mov             r5, r2                  ; even dst rows
    add             r8, r2, r3, asr #1      ; odd dst rows (r2 + one stride)
    mov             r12, lr                 ; h loop counter

    ; prime the pipeline with the first 7 rows, 4 bytes each
    vld1.u32        {d16[0]}, [r4], r1
    vld1.u32        {d16[1]}, [r7], r1
    vld1.u32        {d18[0]}, [r4], r1
    vld1.u32        {d18[1]}, [r7], r1
    vld1.u32        {d20[0]}, [r4], r1
    vld1.u32        {d20[1]}, [r7], r1
    vld1.u32        {d22[0]}, [r4], r1

    ; widen u8 -> 16 bit: rows 0..6 land in d16..d22 (d23 is not used)
    vmovl.u8        q8, d16
    vmovl.u8        q9, d18
    vmovl.u8        q10, d20
    vmovl.u8        q11, d22

    ; Inner loop: one 4x4 output tile per iteration.
aom_convolve8_loop_vert
    ; always process a 4x4 block at a time
    ; (loads the next 4 rows; r7 and r4 alternate because they are one row apart)
    vld1.u32        {d24[0]}, [r7], r1
    vld1.u32        {d26[0]}, [r4], r1
    vld1.u32        {d26[1]}, [r7], r1
    vld1.u32        {d24[1]}, [r4], r1

    ; extract to s16
    ; (the new rows end up, in order, in d24, d26, d27, d25)
    vmovl.u8        q12, d24
    vmovl.u8        q13, d26

    pld             [r5]
    pld             [r8]

    ; src[] * filter_y
    ; one accumulator per output row; prefetch hints are interleaved
    MULTIPLY_BY_Q0  q1,  d16, d17, d18, d19, d20, d21, d22, d24

    pld             [r5, r3]
    pld             [r8, r3]

    MULTIPLY_BY_Q0  q2,  d17, d18, d19, d20, d21, d22, d24, d26

    pld             [r7]
    pld             [r4]

    MULTIPLY_BY_Q0  q14, d18, d19, d20, d21, d22, d24, d26, d27

    pld             [r7, r1]
    pld             [r4, r1]

    MULTIPLY_BY_Q0  q15, d19, d20, d21, d22, d24, d26, d27, d25

    ; += 64 >> 7
    ; (rounding shift right by 7, narrowed to unsigned 16 bit with saturation)
    vqrshrun.s32    d2, q1, #7
    vqrshrun.s32    d3, q2, #7
    vqrshrun.s32    d4, q14, #7
    vqrshrun.s32    d5, q15, #7

    ; saturate
    ; (narrow unsigned 16 bit -> 8 bit; d2 = rows 0,1 and d3 = rows 2,3)
    vqmovn.u16      d2, q1
    vqmovn.u16      d3, q2

    ; store 4 bytes per row; the @32 hint requires a 4-byte aligned address
    vst1.u32        {d2[0]}, [r5@32], r3
    vst1.u32        {d2[1]}, [r8@32], r3
    vst1.u32        {d3[0]}, [r5@32], r3
    vst1.u32        {d3[1]}, [r8@32], r3

    ; slide the row window down by 4 rows for the next tile
    vmov            q8, q10
    vmov            d18, d22
    vmov            d19, d24
    vmov            q10, q13
    vmov            d22, d25

    subs            r12, r12, #4            ; h -= 4
    bgt             aom_convolve8_loop_vert

    ; outer loop
    ; (advance src and dst to the next 4-pixel-wide strip)
    add             r0, r0, #4
    add             r2, r2, #4
    subs            r6, r6, #4              ; w -= 4
    bgt             aom_convolve8_loop_vert_h

    pop             {r4-r8, pc}

    ENDP
    END