; loopfilter_8_neon.asm
;
;  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

; Public entry points defined in this file.
    EXPORT  |aom_lpf_horizontal_8_neon|
    EXPORT  |aom_lpf_vertical_8_neon|

; Assemble the following code as ARM (not Thumb) instructions.
    ARM

; Read-only code section, aligned to 2^2 = 4 bytes.
    AREA ||.text||, CODE, READONLY, ALIGN=2

; void aom_lpf_horizontal_8_neon(uint8_t *s, int p,
;                                const uint8_t *blimit,
;                                const uint8_t *limit,
;                                const uint8_t *thresh,
;                                int count)
;
; Filters a horizontal edge, 8 pixels (one d register per row) per loop
; iteration, for `count` iterations. Each iteration:
;   - loads the 8 rows s - 4 * p .. s + 3 * p (p3..q3), 8 bytes per row,
;   - calls aom_mbloop_filter_neon,
;   - stores the 6 rows s - 3 * p .. s + 2 * p (op2..oq2),
;   - advances s by 8 bytes.
;
; NOTE(review): an earlier comment here said both that this code works on 8
; iterations at a time and that "the aom loop filter works on 16 iterations at
; a time"; only the 8-byte step is visible in this file - confirm what the
; second sentence referred to.
;
; TODO(fgalligan): See about removing the count code as this function is only
; called with a count of 1.
;
; Arguments on entry:
; r0    uint8_t *s,
; r1    int p, /* pitch */
; r2    const uint8_t *blimit,
; r3    const uint8_t *limit,
; sp    const uint8_t *thresh,
; sp+4  int count
;
; Register use after the prologue:
; r0        s for the current iteration
; r1        2 * pitch
; r2, r3    row pointers (odd / even rows)
; r4, r5    scratch, written by aom_mbloop_filter_neon
; r6-r8     blimit, limit, thresh pointers (kept across iterations)
; r12       remaining iteration count
;
; Rows are accessed with the @64 alignment qualifier, so every row address
; (s +/- k * p, and s + 8 * i) must be 8-byte aligned.
|aom_lpf_horizontal_8_neon| PROC
    push        {r4-r8, lr}                ; 6 regs = 24 bytes; stack args are
                                           ; now at sp+24 (thresh), sp+28 (count)

    ldr         r12, [sp, #28]             ; load count
    ldr         r8, [sp, #24]              ; r8 = thresh
    mov         r6, r2                     ; r6 = blimit
    mov         r7, r3                     ; r7 = limit
    add         r1, r1, r1                 ; double pitch

    cmp         r12, #0
    beq         end_aom_mblf_h_edge        ; nothing to do for count == 0

count_mblf_h_loop
    ; aom_mbloop_filter_neon returns op2/op1/op0 in d0-d2, overwriting the
    ; thresholds, so they have to be re-broadcast on every iteration.
    vld1.8      {d0[]}, [r6]               ; duplicate *blimit
    vld1.8      {d1[]}, [r7]               ; duplicate *limit
    vld1.8      {d2[]}, [r8]               ; duplicate *thresh

    sub         r3, r0, r1, lsl #1         ; r3 = s - 4 * pitch (p3 row)
    ; lsr is a logical shift: this recovers the original pitch from the
    ; doubled value only when the doubled pitch is non-negative.
    add         r2, r3, r1, lsr #1         ; r2 = s - 3 * pitch (p2 row)

    ; r3 walks rows -4, -2, 0, +2 and r2 walks rows -3, -1, +1, +3; both step
    ; by 2 * pitch.
    vld1.u8     {d3}, [r3@64], r1          ; p3
    vld1.u8     {d4}, [r2@64], r1          ; p2
    vld1.u8     {d5}, [r3@64], r1          ; p1
    vld1.u8     {d6}, [r2@64], r1          ; p0
    vld1.u8     {d7}, [r3@64], r1          ; q0
    vld1.u8     {d16}, [r2@64], r1         ; q1
    vld1.u8     {d17}, [r3@64]             ; q2 (r3 stays at s + 2 * pitch)
    vld1.u8     {d18}, [r2@64], r1         ; q3 (r2 ends at s + 5 * pitch)

    sub         r3, r3, r1, lsl #1         ; r3 = s - 2 * pitch (p1 row)
    sub         r2, r2, r1, lsl #2         ; r2 = s - 3 * pitch (p2 row)

    bl          aom_mbloop_filter_neon

    vst1.u8     {d0}, [r2@64], r1          ; store op2
    vst1.u8     {d1}, [r3@64], r1          ; store op1
    vst1.u8     {d2}, [r2@64], r1          ; store op0
    vst1.u8     {d3}, [r3@64], r1          ; store oq0
    vst1.u8     {d4}, [r2@64], r1          ; store oq1
    vst1.u8     {d5}, [r3@64], r1          ; store oq2

    add         r0, r0, #8                 ; s += 8 (next 8 columns)
    subs        r12, r12, #1
    bne         count_mblf_h_loop

end_aom_mblf_h_edge
    pop         {r4-r8, pc}

    ENDP        ; |aom_lpf_horizontal_8_neon|

; void aom_lpf_vertical_8_neon(uint8_t *s,
;                              int pitch,
;                              const uint8_t *blimit,
;                              const uint8_t *limit,
;                              const uint8_t *thresh,
;                              int count)
;
; Filters a vertical edge, 8 rows per loop iteration, for `count` iterations.
; Each iteration:
;   - loads 8 bytes (columns s - 4 .. s + 3) from each of 8 consecutive rows,
;   - transposes the 8x8 block so that d3..d18 hold p3..q3,
;   - calls aom_mbloop_filter_neon,
;   - stores op2..oq0 to columns s - 3 .. s and oq1, oq2 to columns
;     s + 1, s + 2 of every row,
;   - advances s by 8 * pitch.
;
; Arguments on entry:
; r0    uint8_t *s,
; r1    int pitch,
; r2    const uint8_t *blimit,
; r3    const uint8_t *limit,
; sp    const uint8_t *thresh,
; sp+4  int count
;
; Register use after the prologue:
; r0        s for the current iteration
; r1        pitch
; r2, r3    load / store pointers
; r4, r5    scratch, written by aom_mbloop_filter_neon
; r6-r8     blimit, limit, thresh pointers (kept across iterations)
; r12       remaining iteration count
|aom_lpf_vertical_8_neon| PROC
    push        {r4-r8, lr}                ; 6 regs = 24 bytes; stack args are
                                           ; now at sp+24 (thresh), sp+28 (count)

    ldr         r12, [sp, #28]             ; load count
    ldr         r8, [sp, #24]              ; r8 = thresh
    mov         r6, r2                     ; r6 = blimit
    mov         r7, r3                     ; r7 = limit

    cmp         r12, #0
    beq         end_aom_mblf_v_edge        ; nothing to do for count == 0

count_mblf_v_loop
    ; aom_mbloop_filter_neon returns op2/op1/op0 in d0-d2, overwriting the
    ; thresholds, so they have to be re-broadcast on every iteration.
    vld1.8      {d0[]}, [r6]               ; duplicate *blimit
    vld1.8      {d1[]}, [r7]               ; duplicate *limit
    vld1.8      {d2[]}, [r8]               ; duplicate *thresh

    sub         r2, r0, #4                 ; r2 = s - 4 (4 columns left of edge)

    vld1.u8     {d3}, [r2], r1             ; load s data, one row per register
    vld1.u8     {d4}, [r2], r1
    vld1.u8     {d5}, [r2], r1
    vld1.u8     {d6}, [r2], r1
    vld1.u8     {d7}, [r2], r1
    vld1.u8     {d16}, [r2], r1
    vld1.u8     {d17}, [r2], r1
    vld1.u8     {d18}, [r2]

    ; transpose the 8x8 byte matrix: rows in d3..d18 become columns p3..q3
    vtrn.32     d3, d7
    vtrn.32     d4, d16
    vtrn.32     d5, d17
    vtrn.32     d6, d18

    vtrn.16     d3, d5
    vtrn.16     d4, d6
    vtrn.16     d7, d17
    vtrn.16     d16, d18

    vtrn.8      d3, d4
    vtrn.8      d5, d6
    vtrn.8      d7, d16
    vtrn.8      d17, d18

    sub         r2, r0, #3                 ; r2 = s - 3: op2 column, row 0
    add         r3, r0, #1                 ; r3 = s + 1: oq1 column, row 0

    bl          aom_mbloop_filter_neon

    ; store op2, op1, op0, oq0: lane i of d0-d3 goes to row i, columns -3..0
    vst4.8      {d0[0], d1[0], d2[0], d3[0]}, [r2], r1
    vst4.8      {d0[1], d1[1], d2[1], d3[1]}, [r2], r1
    vst4.8      {d0[2], d1[2], d2[2], d3[2]}, [r2], r1
    vst4.8      {d0[3], d1[3], d2[3], d3[3]}, [r2], r1
    vst4.8      {d0[4], d1[4], d2[4], d3[4]}, [r2], r1
    vst4.8      {d0[5], d1[5], d2[5], d3[5]}, [r2], r1
    vst4.8      {d0[6], d1[6], d2[6], d3[6]}, [r2], r1
    vst4.8      {d0[7], d1[7], d2[7], d3[7]}, [r2]

    ; store oq1, oq2: lane i of d4-d5 goes to row i, columns +1..+2
    vst2.8      {d4[0], d5[0]}, [r3], r1
    vst2.8      {d4[1], d5[1]}, [r3], r1
    vst2.8      {d4[2], d5[2]}, [r3], r1
    vst2.8      {d4[3], d5[3]}, [r3], r1
    vst2.8      {d4[4], d5[4]}, [r3], r1
    vst2.8      {d4[5], d5[5]}, [r3], r1
    vst2.8      {d4[6], d5[6]}, [r3], r1
    vst2.8      {d4[7], d5[7]}, [r3]

    add         r0, r0, r1, lsl #3         ; s += pitch * 8
    subs        r12, r12, #1
    bne         count_mblf_v_loop

end_aom_mblf_v_edge
    pop         {r4-r8, pc}
    ENDP        ; |aom_lpf_vertical_8_neon|

; void aom_mbloop_filter_neon();
; This is a helper function for the loopfilters. The individual functions do
; the necessary load, transpose (if necessary) and store. The function does
; not use registers d8-d15 and does not touch the stack.
;
; Three code paths, selected per call from the 8 "flat & mask" bytes:
;   all set   -> power_branch_only  (7-tap average for every pixel)
;   all clear -> filter_branch_only (narrow filter for every pixel)
;   mixed     -> both are computed and merged per byte with vbif/vbit
;
; Inputs:
; r0-r3, r12 PRESERVE
; d0    blimit
; d1    limit
; d2    thresh
; d3    p3
; d4    p2
; d5    p1
; d6    p0
; d7    q0
; d16   q1
; d17   q2
; d18   q3
;
; Outputs:
; d0    op2
; d1    op1
; d2    op0
; d3    oq0
; d4    oq1
; d5    oq2
;
; The outputs in d0-d2 replace blimit/limit/thresh, so a caller must reload
; those three registers before calling this function again.
;
; Clobbers: r4, r5, condition flags, d19-d31; additionally d6, d7 on the
; mixed path and d17 on the filter_branch_only path.
|aom_mbloop_filter_neon| PROC
    ; filter_mask
    vabd.u8     d19, d3, d4                ; m1 = abs(p3 - p2)
    vabd.u8     d20, d4, d5                ; m2 = abs(p2 - p1)
    vabd.u8     d21, d5, d6                ; m3 = abs(p1 - p0)
    vabd.u8     d22, d16, d7               ; m4 = abs(q1 - q0)
    vabd.u8     d23, d17, d16              ; m5 = abs(q2 - q1)
    vabd.u8     d24, d18, d17              ; m6 = abs(q3 - q2)

    ; only compare the largest value to limit
    vmax.u8     d19, d19, d20              ; m1 = max(m1, m2)
    vmax.u8     d20, d21, d22              ; m2 = max(m3, m4)

    vabd.u8     d25, d6, d4                ; m7 = abs(p0 - p2)

    vmax.u8     d23, d23, d24              ; m3 = max(m5, m6)

    vabd.u8     d26, d7, d17               ; m8 = abs(q0 - q2)

    vmax.u8     d19, d19, d20              ; m1 = max(m1, m2)

    vabd.u8     d24, d6, d7                ; m9 = abs(p0 - q0)
    vabd.u8     d27, d3, d6                ; m10 = abs(p3 - p0)
    vabd.u8     d28, d18, d7               ; m11 = abs(q3 - q0)

    vmax.u8     d19, d19, d23              ; m1 = max(m1 .. m6)

    vabd.u8     d23, d5, d16               ; a = abs(p1 - q1)
    vqadd.u8    d24, d24, d24              ; b = abs(p0 - q0) * 2

    ; d19 = 0xff where limit >= max(m1 .. m6), i.e. no difference exceeds limit
    vcge.u8     d19, d1, d19

    ; only compare the largest value to the flat threshold (1)
    vmax.u8     d25, d25, d26              ; m4 = max(m7, m8)
    vmax.u8     d26, d27, d28              ; m5 = max(m10, m11)

    vshr.u8     d23, d23, #1               ; a = a / 2

    vmax.u8     d25, d25, d26              ; m4 = max(m4, m5)

    vqadd.u8    d24, d24, d23              ; a = b + a

    vmax.u8     d20, d20, d25              ; m2 = max(m2, m4)

    vmov.u8     d23, #1
    vcge.u8     d24, d0, d24               ; 0xff where blimit >= a

    vcgt.u8     d21, d21, d2               ; (abs(p1 - p0) > thresh)*-1

    vcge.u8     d20, d23, d20              ; flat: 0xff where 1 >= m2

    vand        d19, d19, d24              ; mask

    vcgt.u8     d23, d22, d2               ; (abs(q1 - q0) > thresh)*-1

    vand        d20, d20, d19              ; flat & mask

    vmov.u8     d22, #0x80

    vorr        d23, d21, d23              ; hev

    ; This instruction will truncate the "flat & mask" masks down to 4 bits
    ; each to fit into one 32 bit arm register. The masks are the low half
    ; of q10 (d20); the narrowed result lands in the low 32 bits of d30.
    vshrn.u16   d30, q10, #4
    vmov.u32    r4, d30[0]                 ; flat & mask 4bits

    adds        r5, r4, #1                 ; Check for all 1's (Z set when
                                           ; r4 == 0xffffffff; r5 is unused)

    ; If mask and flat are 1's for all vectors, then we only need to execute
    ; the power branch for all vectors.
    beq         power_branch_only

    ; Check for 0, set flag for later. The flags are consumed by the
    ; "beq filter_branch_only" below; the NEON instructions in between do
    ; not modify them.
    cmp         r4, #0

    ; mbfilter() function
    ; filter() function
    ; convert to signed
    veor        d21, d7, d22               ; qs0
    veor        d24, d6, d22               ; ps0
    veor        d25, d5, d22               ; ps1
    veor        d26, d16, d22              ; qs1

    vmov.u8     d27, #3

    vsub.s8     d28, d21, d24              ; ( qs0 - ps0)

    vqsub.s8    d29, d25, d26              ; filter = clamp(ps1-qs1)

    vmull.s8    q15, d28, d27              ; 3 * ( qs0 - ps0)

    vand        d29, d29, d23              ; filter &= hev

    vaddw.s8    q15, q15, d29              ; filter + 3 * (qs0 - ps0)

    vmov.u8     d29, #4

    ; filter = clamp(filter + 3 * ( qs0 - ps0))
    vqmovn.s16  d28, q15

    vand        d28, d28, d19              ; filter &= mask

    vqadd.s8    d30, d28, d27              ; filter2 = clamp(filter+3)
    vqadd.s8    d29, d28, d29              ; filter1 = clamp(filter+4)
    vshr.s8     d30, d30, #3               ; filter2 >>= 3
    vshr.s8     d29, d29, #3               ; filter1 >>= 3

    vqadd.s8    d24, d24, d30              ; op0 = clamp(ps0 + filter2)
    vqsub.s8    d21, d21, d29              ; oq0 = clamp(qs0 - filter1)

    ; outer tap adjustments: ++filter1 >> 1
    vrshr.s8    d29, d29, #1
    vbic        d29, d29, d23              ; filter &= ~hev

    vqadd.s8    d25, d25, d29              ; op1 = clamp(ps1 + filter)
    vqsub.s8    d26, d26, d29              ; oq1 = clamp(qs1 - filter)

    ; If mask and flat are 0's for all vectors, then we only need to execute
    ; the filter branch for all vectors.
    beq         filter_branch_only

    ; If mask and flat are mixed then we must perform both branches and
    ; combine the data.
    veor        d24, d24, d22              ; *f_op0 = u^0x80
    veor        d21, d21, d22              ; *f_oq0 = u^0x80
    veor        d25, d25, d22              ; *f_op1 = u^0x80
    veor        d26, d26, d22              ; *f_oq1 = u^0x80

    ; At this point we have already executed the filter branch. The filter
    ; branch does not set op2 or oq2, so use p2 and q2. Execute the power
    ; branch and combine the data.
    ; q14 holds a running 16-bit sum; each output is the rounded sum >> 3.
    ; vbif writes the filter-branch value where (flat & mask) is 0, vbit
    ; writes the power-branch value where it is 1.
    vmov.u8     d23, #2
    vaddl.u8    q14, d6, d7                ; r_op2 = p0 + q0
    vmlal.u8    q14, d3, d27               ; r_op2 += p3 * 3
    vmlal.u8    q14, d4, d23               ; r_op2 += p2 * 2

    vbif        d0, d4, d20                ; op2 |= p2 & ~(flat & mask)

    vaddw.u8    q14, d5                    ; r_op2 += p1

    vbif        d1, d25, d20               ; op1 |= f_op1 & ~(flat & mask)

    vqrshrn.u16 d30, q14, #3               ; r_op2

    vsubw.u8    q14, d3                    ; r_op1 = r_op2 - p3
    vsubw.u8    q14, d4                    ; r_op1 -= p2
    vaddw.u8    q14, d5                    ; r_op1 += p1
    vaddw.u8    q14, d16                   ; r_op1 += q1

    vbif        d2, d24, d20               ; op0 |= f_op0 & ~(flat & mask)

    vqrshrn.u16 d31, q14, #3               ; r_op1

    vsubw.u8    q14, d3                    ; r_op0 = r_op1 - p3
    vsubw.u8    q14, d5                    ; r_op0 -= p1
    vaddw.u8    q14, d6                    ; r_op0 += p0
    vaddw.u8    q14, d17                   ; r_op0 += q2

    vbit        d0, d30, d20               ; op2 |= r_op2 & (flat & mask)

    vqrshrn.u16 d23, q14, #3               ; r_op0

    vsubw.u8    q14, d3                    ; r_oq0 = r_op0 - p3
    vsubw.u8    q14, d6                    ; r_oq0 -= p0
    vaddw.u8    q14, d7                    ; r_oq0 += q0

    vbit        d1, d31, d20               ; op1 |= r_op1 & (flat & mask)

    vaddw.u8    q14, d18                   ; r_oq0 += q3

    vbit        d2, d23, d20               ; op0 |= r_op0 & (flat & mask)

    vqrshrn.u16 d22, q14, #3               ; r_oq0

    vsubw.u8    q14, d4                    ; r_oq1 = r_oq0 - p2
    vsubw.u8    q14, d7                    ; r_oq1 -= q0
    vaddw.u8    q14, d16                   ; r_oq1 += q1

    vbif        d3, d21, d20               ; oq0 |= f_oq0 & ~(flat & mask)

    vaddw.u8    q14, d18                   ; r_oq1 += q3

    vbif        d4, d26, d20               ; oq1 |= f_oq1 & ~(flat & mask)

    vqrshrn.u16 d6, q14, #3                ; r_oq1

    vsubw.u8    q14, d5                    ; r_oq2 = r_oq1 - p1
    vsubw.u8    q14, d16                   ; r_oq2 -= q1
    vaddw.u8    q14, d17                   ; r_oq2 += q2
    vaddw.u8    q14, d18                   ; r_oq2 += q3

    vbif        d5, d17, d20               ; oq2 |= q2 & ~(flat & mask)

    vqrshrn.u16 d7, q14, #3                ; r_oq2

    vbit        d3, d22, d20               ; oq0 |= r_oq0 & (flat & mask)
    vbit        d4, d6, d20                ; oq1 |= r_oq1 & (flat & mask)
    vbit        d5, d7, d20                ; oq2 |= r_oq2 & (flat & mask)

    bx          lr

power_branch_only
    ; Every byte is flat & masked: outputs are written straight into d0-d5.
    ; Inputs d3-d5 are only overwritten after their last use in the sums.
    vmov.u8     d27, #3
    vmov.u8     d21, #2
    vaddl.u8    q14, d6, d7                ; op2 = p0 + q0
    vmlal.u8    q14, d3, d27               ; op2 += p3 * 3
    vmlal.u8    q14, d4, d21               ; op2 += p2 * 2
    vaddw.u8    q14, d5                    ; op2 += p1
    vqrshrn.u16 d0, q14, #3                ; op2

    vsubw.u8    q14, d3                    ; op1 = op2 - p3
    vsubw.u8    q14, d4                    ; op1 -= p2
    vaddw.u8    q14, d5                    ; op1 += p1
    vaddw.u8    q14, d16                   ; op1 += q1
    vqrshrn.u16 d1, q14, #3                ; op1

    vsubw.u8    q14, d3                    ; op0 = op1 - p3
    vsubw.u8    q14, d5                    ; op0 -= p1
    vaddw.u8    q14, d6                    ; op0 += p0
    vaddw.u8    q14, d17                   ; op0 += q2
    vqrshrn.u16 d2, q14, #3                ; op0

    vsubw.u8    q14, d3                    ; oq0 = op0 - p3
    vsubw.u8    q14, d6                    ; oq0 -= p0
    vaddw.u8    q14, d7                    ; oq0 += q0
    vaddw.u8    q14, d18                   ; oq0 += q3
    vqrshrn.u16 d3, q14, #3                ; oq0

    vsubw.u8    q14, d4                    ; oq1 = oq0 - p2
    vsubw.u8    q14, d7                    ; oq1 -= q0
    vaddw.u8    q14, d16                   ; oq1 += q1
    vaddw.u8    q14, d18                   ; oq1 += q3
    vqrshrn.u16 d4, q14, #3                ; oq1

    vsubw.u8    q14, d5                    ; oq2 = oq1 - p1
    vsubw.u8    q14, d16                   ; oq2 -= q1
    vaddw.u8    q14, d17                   ; oq2 += q2
    vaddw.u8    q14, d18                   ; oq2 += q3
    vqrshrn.u16 d5, q14, #3                ; oq2

    bx          lr

filter_branch_only
    ; TODO(fgalligan): See if we can rearrange registers so we do not need to
    ; do the 2 vswp.
    ; No byte is flat & masked: op2/oq2 are the unmodified p2/q2, the other
    ; four outputs are the filter-branch results converted back to unsigned.
    vswp        d0, d4                      ; op2 = p2
    vswp        d5, d17                     ; oq2 = q2
    veor        d2, d24, d22                ; *op0 = u^0x80
    veor        d3, d21, d22                ; *oq0 = u^0x80
    veor        d1, d25, d22                ; *op1 = u^0x80
    veor        d4, d26, d22                ; *oq1 = u^0x80

    bx          lr

    ENDP        ; |aom_mbloop_filter_neon|

; End of source file.
    END