idct32x32_1_add_neon.asm 4.51 KB
Newer Older
1
;
Yaowu Xu's avatar
Yaowu Xu committed
2
; Copyright (c) 2016, Alliance for Open Media. All rights reserved
3
;
Yaowu Xu's avatar
Yaowu Xu committed
4 5 6 7 8 9
; This source code is subject to the terms of the BSD 2 Clause License and
; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
; was not distributed with this source code in the LICENSE file, you can
; obtain it at www.aomedia.org/license/software. If the Alliance for Open
; Media Patent License 1.0 was not distributed with this source code in the
; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10 11
;

Yaowu Xu's avatar
Yaowu Xu committed
12

Yaowu Xu's avatar
Yaowu Xu committed
13
    ; Make the function symbol defined below visible to the linker.
    EXPORT  |aom_idct32x32_1_add_neon|
14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69
    ; Assemble the following code as 32-bit ARM instructions, and declare
    ; that it both requires and preserves 8-byte alignment of the stack.
    ARM
    REQUIRE8
    PRESERVE8

    ; Read-only code section, aligned to 2^2 = 4 bytes.
    AREA ||.text||, CODE, READONLY, ALIGN=2

    ;TODO(hkuang): put the following macros in a separate
    ;file so other idct functions can also use them.
    ; LD_16x8: load eight 16-byte rows into q8-q15 (written as d16-d31).
    ;   $src    - address register, post-incremented by $stride per row
    ;   $stride - register holding the distance between rows
    MACRO
    LD_16x8          $src, $stride
    vld1.8           {d16, d17}, [$src], $stride   ; row 0 -> q8
    vld1.8           {d18, d19}, [$src], $stride   ; row 1 -> q9
    vld1.8           {d20, d21}, [$src], $stride   ; row 2 -> q10
    vld1.8           {d22, d23}, [$src], $stride   ; row 3 -> q11
    vld1.8           {d24, d25}, [$src], $stride   ; row 4 -> q12
    vld1.8           {d26, d27}, [$src], $stride   ; row 5 -> q13
    vld1.8           {d28, d29}, [$src], $stride   ; row 6 -> q14
    vld1.8           {d30, d31}, [$src], $stride   ; row 7 -> q15
    MEND

    ; ADD_DIFF_16x8: q8..q15 += $diff, per byte, with unsigned saturation.
    ;   $diff - Q register holding the value to add to every row
    MACRO
    ADD_DIFF_16x8    $diff
    vqadd.u8         q8,  $diff                ; q8  = sat_u8(q8  + diff)
    vqadd.u8         q9,  $diff                ; q9  = sat_u8(q9  + diff)
    vqadd.u8         q10, $diff                ; q10 = sat_u8(q10 + diff)
    vqadd.u8         q11, $diff                ; q11 = sat_u8(q11 + diff)
    vqadd.u8         q12, $diff                ; q12 = sat_u8(q12 + diff)
    vqadd.u8         q13, $diff                ; q13 = sat_u8(q13 + diff)
    vqadd.u8         q14, $diff                ; q14 = sat_u8(q14 + diff)
    vqadd.u8         q15, $diff                ; q15 = sat_u8(q15 + diff)
    MEND

    ; SUB_DIFF_16x8: q8..q15 -= $diff, per byte, with unsigned saturation.
    ;   $diff - Q register holding the value to subtract from every row
    MACRO
    SUB_DIFF_16x8    $diff
    vqsub.u8         q8,  $diff                ; q8  = sat_u8(q8  - diff)
    vqsub.u8         q9,  $diff                ; q9  = sat_u8(q9  - diff)
    vqsub.u8         q10, $diff                ; q10 = sat_u8(q10 - diff)
    vqsub.u8         q11, $diff                ; q11 = sat_u8(q11 - diff)
    vqsub.u8         q12, $diff                ; q12 = sat_u8(q12 - diff)
    vqsub.u8         q13, $diff                ; q13 = sat_u8(q13 - diff)
    vqsub.u8         q14, $diff                ; q14 = sat_u8(q14 - diff)
    vqsub.u8         q15, $diff                ; q15 = sat_u8(q15 - diff)
    MEND

    ; ST_16x8: store q8-q15 (written as d16-d31) as eight 16-byte rows.
    ;   $dst    - address register, post-incremented by $stride per row
    ;   $stride - register holding the distance between rows
    MACRO
    ST_16x8          $dst, $stride
    vst1.8           {d16, d17}, [$dst], $stride   ; q8  -> row 0
    vst1.8           {d18, d19}, [$dst], $stride   ; q9  -> row 1
    vst1.8           {d20, d21}, [$dst], $stride   ; q10 -> row 2
    vst1.8           {d22, d23}, [$dst], $stride   ; q11 -> row 3
    vst1.8           {d24, d25}, [$dst], $stride   ; q12 -> row 4
    vst1.8           {d26, d27}, [$dst], $stride   ; q13 -> row 5
    vst1.8           {d28, d29}, [$dst], $stride   ; q14 -> row 6
    vst1.8           {d30, d31}, [$dst], $stride   ; q15 -> row 7
    MEND

Yaowu Xu's avatar
Yaowu Xu committed
70
;void aom_idct32x32_1_add_neon(int16_t *input, uint8_t *dest,
;                              int dest_stride)
;
; DC-only 32x32 inverse transform. Only input[0] is read. It is scaled
; twice by cospi_16_64 (each time with rounding and a 14-bit shift), then
; rounded and shifted right by 6. The magnitude of the result is clamped
; to 0..255 and added to (or, if negative, subtracted from) every byte of
; the 32x32 block at dest using unsigned 8-bit saturating arithmetic.
;
; r0  int16_t *input
; r1  uint8_t *dest
; r2  int dest_stride
;
; Leaf function: modifies r0, r1, r3, r12, q0, q8-q15 and the flags,
; does not touch the stack, and returns with bx lr.

|aom_idct32x32_1_add_neon| PROC
    pld              [r1]
    add              r3, r1, #16               ; r3 dest + 16 for second loop
    ldrsh            r0, [r0]                  ; r0 = input[0], sign-extended

    ; generate cospi_16_64 = 11585
    mov              r12, #0x2d00
    add              r12, #0x41

    ; out = dct_const_round_shift(input[0] * cospi_16_64)
    mul              r0, r0, r12               ; input[0] * cospi_16_64
    add              r0, r0, #0x2000           ; +(1 << ((DCT_CONST_BITS) - 1))
    asr              r0, r0, #14               ; >> DCT_CONST_BITS

    ; out = dct_const_round_shift(out * cospi_16_64)
    mul              r0, r0, r12               ; out * cospi_16_64
    mov              r12, r1                   ; r12 = store pointer (dest)
    add              r0, r0, #0x2000           ; +(1 << ((DCT_CONST_BITS) - 1))
    asr              r0, r0, #14               ; >> DCT_CONST_BITS

    ; a1 = ROUND_POWER_OF_TWO(out, 6)
    add              r0, r0, #32               ; + (1 <<((6) - 1))
    asrs             r0, r0, #6                ; >> 6, N = sign of a1

    ; ASRS does not update V, so a signed-compare condition such as GE
    ; (N == V) would depend on whatever V the caller left behind. Branch
    ; on the sign bit alone.
    bpl              diff_positive_32_32

diff_negative_32_32
    neg              r0, r0                    ; r0 = |a1|
    usat             r0, #8, r0                ; clamp to 0..255
    vdup.u8          q0, r0                    ; q0 = |a1| in all 16 bytes
    mov              r0, #4                    ; 4 passes of 16 rows x 16 cols

diff_negative_32_32_loop
    sub              r0, #1
    LD_16x8          r1, r2
    SUB_DIFF_16x8    q0
    ST_16x8          r12, r2

    LD_16x8          r1, r2
    SUB_DIFF_16x8    q0
    ST_16x8          r12, r2
    ; After two passes the left 16 columns of all 32 rows are done;
    ; restart both pointers at dest + 16 for the right 16 columns.
    cmp              r0, #2
    moveq            r1, r3
    moveq            r12, r3
    cmp              r0, #0
    bne              diff_negative_32_32_loop
    bx               lr

diff_positive_32_32
    usat             r0, #8, r0                ; clamp to 0..255
    vdup.u8          q0, r0                    ; q0 = a1 in all 16 bytes
    mov              r0, #4                    ; 4 passes of 16 rows x 16 cols

diff_positive_32_32_loop
    sub              r0, #1
    LD_16x8          r1, r2
    ADD_DIFF_16x8    q0
    ST_16x8          r12, r2

    LD_16x8          r1, r2
    ADD_DIFF_16x8    q0
    ST_16x8          r12, r2
    ; After two passes the left 16 columns of all 32 rows are done;
    ; restart both pointers at dest + 16 for the right 16 columns.
    cmp              r0, #2
    moveq            r1, r3
    moveq            r12, r3
    cmp              r0, #0
    bne              diff_positive_32_32_loop
    bx               lr

    ENDP             ; |aom_idct32x32_1_add_neon|
147
    END