; idct4x4_1_add_neon.asm
;
;  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license and patent
;  grant that can be found in the LICENSE file in the root of the source
;  tree. All contributing project authors may be found in the AUTHORS
;  file in the root of the source tree.
;


; Assembler setup for this file (armasm syntax).
    EXPORT  |vpx_idct4x4_1_add_neon|           ; make the routine visible to the linker
    ARM                                        ; assemble as ARM (A32), not Thumb
    REQUIRE8                                   ; this code requires 8-byte stack alignment
    PRESERVE8                                  ; this code preserves 8-byte stack alignment

    ; Read-only code section; ALIGN=2 means 2^2 = 4-byte alignment.
    AREA ||.text||, CODE, READONLY, ALIGN=2

;-----------------------------------------------------------------------
; void vpx_idct4x4_1_add_neon(int16_t *input, uint8_t *dest,
;                             int dest_stride)
;
; Derives a single value a1 from input[0] and adds it to every byte of
; a 4x4 block at dest, saturating each result to the range 0..255.
;
; In:    r0 = int16_t *input    (only input[0] is read)
;        r1 = uint8_t *dest     (4 rows of 4 bytes; read, then rewritten)
;        r2 = int dest_stride   (byte step from one row to the next)
; Out:   nothing in registers; the 4x4 block at dest is updated in place
; Clobb: r0, r1, r12, q0, d2, d4, d6, d7, q8, q9
;        (d8-d15 are not touched, the stack is not used)
;-----------------------------------------------------------------------
|vpx_idct4x4_1_add_neon| PROC
    ldrsh            r0, [r0]                  ; r0 = input[0], sign-extended

    ; generate cospi_16_64 = 11585 (0x2d00 + 0x41 = 0x2d41)
    mov              r12, #0x2d00
    add              r12, #0x41

    ; out = dct_const_round_shift(input[0] * cospi_16_64)
    mul              r0, r0, r12               ; input[0] * cospi_16_64
    add              r0, r0, #0x2000           ; +(1 << ((DCT_CONST_BITS) - 1))
    asr              r0, r0, #14               ; >> DCT_CONST_BITS

    ; out = dct_const_round_shift(out * cospi_16_64)
    mul              r0, r0, r12               ; out * cospi_16_64
    ; The constant in r12 is no longer needed after the mul above, so r12
    ; is reused to keep the original dest: r1 is advanced by the loads
    ; below and the stores must start again from the first row.
    mov              r12, r1                   ; save dest
    add              r0, r0, #0x2000           ; +(1 << ((DCT_CONST_BITS) - 1))
    asr              r0, r0, #14               ; >> DCT_CONST_BITS

    ; a1 = ROUND_POWER_OF_TWO(out, 4)
    add              r0, r0, #8                ; + (1 <<((4) - 1))
    asr              r0, r0, #4                ; >> 4

    vdup.s16         q0, r0                    ; a1 in all eight 16-bit lanes

    ; Load the four 4-byte rows: rows 0,1 -> d2, rows 2,3 -> d4.
    ; r1 steps by dest_stride after each of the first three loads.
    vld1.32          {d2[0]}, [r1], r2
    vld1.32          {d2[1]}, [r1], r2
    vld1.32          {d4[0]}, [r1], r2
    vld1.32          {d4[1]}, [r1]

    ; Widen each dest byte (unsigned) to 16 bits and add a1.
    vaddw.u8         q8, q0, d2                ; dest[x] + a1
    vaddw.u8         q9, q0, d4

    ; Narrow signed 16-bit sums to unsigned 8-bit with saturation.
    vqmovun.s16      d6, q8                    ; clip_pixel
    vqmovun.s16      d7, q9

    ; Write the rows back through the saved dest pointer, same order
    ; and stride as the loads.
    vst1.32          {d6[0]}, [r12], r2
    vst1.32          {d6[1]}, [r12], r2
    vst1.32          {d7[0]}, [r12], r2
    vst1.32          {d7[1]}, [r12]

    bx               lr
    ENDP             ; |vpx_idct4x4_1_add_neon|
; End of this source file; the assembler reads nothing past END.
    END