;
;  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


    ; These functions are only valid when:
    ; x_step_q4 == 16
    ; w%4 == 0
    ; h%4 == 0
    ; taps == 8
    ; VP9_FILTER_WEIGHT == 128
    ; VP9_FILTER_SHIFT == 7

; Symbols made visible to the linker.
    EXPORT  |vpx_convolve8_avg_horiz_neon|
    EXPORT  |vpx_convolve8_avg_vert_neon|
    ARM                                     ; ARM (not Thumb) instruction set
    REQUIRE8                                ; needs an 8-byte aligned stack
    PRESERVE8                               ; keeps the stack 8-byte aligned

    AREA ||.text||, CODE, READONLY, ALIGN=2

    ; MULTIPLY_BY_Q0: widening signed 16-bit dot product against q0.
    ; $sum is first written with $px0 * d0[0], after which $px1..$px7 are
    ; multiplied by d0[1], d0[2], d0[3], d1[0], d1[1], d1[2], d1[3] in turn
    ; and accumulated into $sum. q0 (d0/d1) is read only.
    MACRO
    MULTIPLY_BY_Q0 $sum, $px0, $px1, $px2, $px3, $px4, $px5, $px6, $px7
    vmull.s16 $sum, $px0, d0[0]
    vmlal.s16 $sum, $px1, d0[1]
    vmlal.s16 $sum, $px2, d0[2]
    vmlal.s16 $sum, $px3, d0[3]
    vmlal.s16 $sum, $px4, d1[0]
    vmlal.s16 $sum, $px5, d1[1]
    vmlal.s16 $sum, $px6, d1[2]
    vmlal.s16 $sum, $px7, d1[3]
    MEND

; vpx_convolve8_avg_horiz_neon
;
; Runs the 8 taps of filter_x along each row of src, rounds and narrows the
; sums to bytes, then stores the rounding average of each result and the
; byte already present at that position in dst. Work is done in 4x4 tiles.
;
; r0      const uint8_t *src
; r1      int src_stride
; r2      uint8_t *dst
; r3      int dst_stride
; sp[0]   const int16_t *filter_x
; sp[4]   int x_step_q4           ; not read
; sp[8]   const int16_t *filter_y ; unused
; sp[12]  int y_step_q4           ; unused
; sp[16]  int w
; sp[20]  int h
;
; Stack offsets are relative to sp at entry. The push below stores eight
; registers (32 bytes), which is why the loads use sp+32, sp+48 and sp+52.
;
; Register roles after setup:
;   r4   4 - 3 * dst_stride  (post-increment of the 4th store in a tile)
;   r5   filter_x pointer, then reused as the prefetch base
;   r6   columns left in the current band, r10 keeps the original w
;   r7   rows left
;   r8   4 - 3 * src_stride  (post-increment of the 4th load in a tile)
;   r9   4 * src_stride - w - 7
;   r12  4 * dst_stride - w
;
; Both loops test their counter after the body, so one 4x4 tile is always
; processed.

|vpx_convolve8_avg_horiz_neon| PROC
    push            {r4-r10, lr}

    sub             r0, r0, #3              ; adjust for taps

    ldr             r5, [sp, #32]           ; filter_x
    ldr             r6, [sp, #48]           ; w
    ldr             r7, [sp, #52]           ; h

    vld1.s16        {q0}, [r5]              ; filter_x

    sub             r8, r1, r1, lsl #2      ; -src_stride * 3
    add             r8, r8, #4              ; -src_stride * 3 + 4

    sub             r4, r3, r3, lsl #2      ; -dst_stride * 3
    add             r4, r4, #4              ; -dst_stride * 3 + 4

    rsb             r9, r6, r1, lsl #2      ; reset src for outer loop
    sub             r9, r9, #7
    rsb             r12, r6, r3, lsl #2     ; reset dst for outer loop

    mov             r10, r6                 ; w loop counter

vpx_convolve8_avg_loop_horiz_v
    ; prime the band: 8 bytes from each of 4 rows; the r8 post-increment
    ; leaves r0 on the first row again, 4 bytes further right
    vld1.8          {d24}, [r0], r1
    vld1.8          {d25}, [r0], r1
    vld1.8          {d26}, [r0], r1
    vld1.8          {d27}, [r0], r8

    ; transpose so that each d register holds one column (4 rows) per half
    vtrn.16         q12, q13
    vtrn.8          d24, d25
    vtrn.8          d26, d27

    pld             [r0, r1, lsl #2]

    vmovl.u8        q8, d24
    vmovl.u8        q9, d25
    vmovl.u8        q10, d26
    vmovl.u8        q11, d27

    ; save a few instructions in the inner loop
    ; afterwards d16, d17, d20, d22, d18, d19, d23 hold columns 0..6;
    ; column 7 is dropped and read again by the inner loop
    vswp            d17, d18
    vmov            d23, d21

    add             r0, r0, #3

vpx_convolve8_avg_loop_horiz
    add             r5, r0, #64             ; prefetch base, 64 bytes ahead

    ; 4 new bytes per row, replicated to both halves; note the d31/d30 order
    vld1.32         {d28[]}, [r0], r1
    vld1.32         {d29[]}, [r0], r1
    vld1.32         {d31[]}, [r0], r1
    vld1.32         {d30[]}, [r0], r8

    pld             [r5]

    vtrn.16         d28, d31
    vtrn.16         d29, d30
    vtrn.8          d28, d29
    vtrn.8          d31, d30

    pld             [r5, r1]

    ; extract to s16
    ; the four new columns end up in d24, d26, d27, d25 (in that order)
    vtrn.32         q14, q15
    vmovl.u8        q12, d28
    vmovl.u8        q13, d29

    pld             [r5, r1, lsl #1]

    ; slightly out of order load to match the existing data
    ; d6 = dst rows 0 and 2, d7 = dst rows 1 and 3
    vld1.u32        {d6[0]}, [r2], r3
    vld1.u32        {d7[0]}, [r2], r3
    vld1.u32        {d6[1]}, [r2], r3
    vld1.u32        {d7[1]}, [r2], r3

    sub             r2, r2, r3, lsl #2      ; reset for store

    ; src[] * filter_x
    ; one accumulator per output column, each window shifted by one column
    MULTIPLY_BY_Q0  q1,  d16, d17, d20, d22, d18, d19, d23, d24
    MULTIPLY_BY_Q0  q2,  d17, d20, d22, d18, d19, d23, d24, d26
    MULTIPLY_BY_Q0  q14, d20, d22, d18, d19, d23, d24, d26, d27
    MULTIPLY_BY_Q0  q15, d22, d18, d19, d23, d24, d26, d27, d25

    pld             [r5, -r8]

    ; += 64 >> 7
    vqrshrun.s32    d2, q1, #7
    vqrshrun.s32    d3, q2, #7
    vqrshrun.s32    d4, q14, #7
    vqrshrun.s32    d5, q15, #7

    ; saturate
    vqmovn.u16      d2, q1
    vqmovn.u16      d3, q2

    ; transpose
    ; back to row order: d2 = rows 0 and 2, d3 = rows 1 and 3
    vtrn.16         d2, d3
    vtrn.32         d2, d3
    vtrn.8          d2, d3

    ; average the new value and the dst value
    vrhadd.u8       q1, q1, q3

    ; the r4 post-increment moves r2 to row 0 of the next tile to the right
    vst1.u32        {d2[0]}, [r2@32], r3
    vst1.u32        {d3[0]}, [r2@32], r3
    vst1.u32        {d2[1]}, [r2@32], r3
    vst1.u32        {d3[1]}, [r2@32], r4

    ; slide the window 4 columns: old columns 4..6 plus the 4 new ones
    ; become columns 0..6 of the next iteration
    vmov            q8,  q9
    vmov            d20, d23
    vmov            q11, q12
    vmov            q9,  q13

    subs            r6, r6, #4              ; w -= 4
    bgt             vpx_convolve8_avg_loop_horiz

    ; outer loop
    mov             r6, r10                 ; restore w counter
    add             r0, r0, r9              ; src += src_stride * 4 - w - 7
    add             r2, r2, r12             ; dst += dst_stride * 4 - w
    subs            r7, r7, #4              ; h -= 4
    bgt             vpx_convolve8_avg_loop_horiz_v

    pop             {r4-r10, pc}

    ENDP

; vpx_convolve8_avg_vert_neon
;
; Runs the 8 taps of filter_y down each column of src, rounds and narrows
; the sums to bytes, then stores the rounding average of each result and
; the byte already present at that position in dst. Work is done in 4x4
; tiles, walking down a 4-pixel-wide strip before moving 4 columns right.
;
; r0      const uint8_t *src
; r1      int src_stride
; r2      uint8_t *dst
; r3      int dst_stride
; sp[8]   const int16_t *filter_y
; sp[16]  int w
; sp[20]  int h
;
; Stack offsets are relative to sp at entry. The push below stores six
; registers (24 bytes), which is why the loads use sp+32, sp+40 and sp+44.
; The stack words at sp[0], sp[4] and sp[12] are not read.
;
; Register roles after setup:
;   r1, r3   doubled strides (r4/r7 and r5/r8 each step two rows at a time)
;   r4, r7   src pointers for even / odd rows of the strip
;   r5, r8   dst pointers for even / odd rows of the strip
;   r6       columns left
;   lr       h, copied into r12 as the row counter for every strip
;
; Both loops test their counter after the body, so one 4x4 tile is always
; processed.

|vpx_convolve8_avg_vert_neon| PROC
    push            {r4-r8, lr}

    ; adjust for taps
    sub             r0, r0, r1
    sub             r0, r0, r1, lsl #1

    ldr             r4, [sp, #32]           ; filter_y
    ldr             r6, [sp, #40]           ; w
    ldr             lr, [sp, #44]           ; h

    vld1.s16        {q0}, [r4]              ; filter_y

    lsl             r1, r1, #1
    lsl             r3, r3, #1

vpx_convolve8_avg_loop_vert_h
    mov             r4, r0
    add             r7, r0, r1, asr #1      ; r0 + original src_stride
    mov             r5, r2
    add             r8, r2, r3, asr #1      ; r2 + original dst_stride
    mov             r12, lr                 ; h loop counter

    ; prime the strip with rows 0..6, 4 bytes each
    vld1.u32        {d16[0]}, [r4], r1
    vld1.u32        {d16[1]}, [r7], r1
    vld1.u32        {d18[0]}, [r4], r1
    vld1.u32        {d18[1]}, [r7], r1
    vld1.u32        {d20[0]}, [r4], r1
    vld1.u32        {d20[1]}, [r7], r1
    vld1.u32        {d22[0]}, [r4], r1

    ; widen: d16..d22 now hold rows 0..6 (d23 is not used until rewritten)
    vmovl.u8        q8, d16
    vmovl.u8        q9, d18
    vmovl.u8        q10, d20
    vmovl.u8        q11, d22

vpx_convolve8_avg_loop_vert
    ; always process a 4x4 block at a time
    ; the four new rows land in d24[0], d26[0], d26[1], d24[1]
    vld1.u32        {d24[0]}, [r7], r1
    vld1.u32        {d26[0]}, [r4], r1
    vld1.u32        {d26[1]}, [r7], r1
    vld1.u32        {d24[1]}, [r4], r1

    ; extract to s16
    ; new rows in order: d24, d26, d27, d25
    vmovl.u8        q12, d24
    vmovl.u8        q13, d26

    ; existing dst: d6 = rows 0 and 1, d7 = rows 2 and 3
    vld1.u32        {d6[0]}, [r5@32], r3
    vld1.u32        {d6[1]}, [r8@32], r3
    vld1.u32        {d7[0]}, [r5@32], r3
    vld1.u32        {d7[1]}, [r8@32], r3

    pld             [r7]
    pld             [r4]

    ; src[] * filter_y
    ; one accumulator per output row, each window shifted down by one row
    MULTIPLY_BY_Q0  q1,  d16, d17, d18, d19, d20, d21, d22, d24

    pld             [r7, r1]
    pld             [r4, r1]

    MULTIPLY_BY_Q0  q2,  d17, d18, d19, d20, d21, d22, d24, d26

    pld             [r5]
    pld             [r8]

    MULTIPLY_BY_Q0  q14, d18, d19, d20, d21, d22, d24, d26, d27

    pld             [r5, r3]
    pld             [r8, r3]

    MULTIPLY_BY_Q0  q15, d19, d20, d21, d22, d24, d26, d27, d25

    ; += 64 >> 7
    vqrshrun.s32    d2, q1, #7
    vqrshrun.s32    d3, q2, #7
    vqrshrun.s32    d4, q14, #7
    vqrshrun.s32    d5, q15, #7

    ; saturate
    vqmovn.u16      d2, q1
    vqmovn.u16      d3, q2

    ; average the new value and the dst value
    vrhadd.u8       q1, q1, q3

    sub             r5, r5, r3, lsl #1      ; reset for store
    sub             r8, r8, r3, lsl #1

    vst1.u32        {d2[0]}, [r5@32], r3
    vst1.u32        {d2[1]}, [r8@32], r3
    vst1.u32        {d3[0]}, [r5@32], r3
    vst1.u32        {d3[1]}, [r8@32], r3

    ; slide the window 4 rows: old rows 4..6 plus the 4 new ones become
    ; rows 0..6 of the next iteration
    vmov            q8, q10
    vmov            d18, d22
    vmov            d19, d24
    vmov            q10, q13
    vmov            d22, d25

    subs            r12, r12, #4            ; h -= 4
    bgt             vpx_convolve8_avg_loop_vert

    ; outer loop
    add             r0, r0, #4
    add             r2, r2, #4
    subs            r6, r6, #4              ; w -= 4
    bgt             vpx_convolve8_avg_loop_vert_h

    pop             {r4-r8, pc}

    ENDP
    END