;
; Copyright (c) 2016, Alliance for Open Media. All rights reserved
;
; This source code is subject to the terms of the BSD 2 Clause License and
; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
; was not distributed with this source code in the LICENSE file, you can
; obtain it at www.aomedia.org/license/software. If the Alliance for Open
; Media Patent License 1.0 was not distributed with this source code in the
; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
;
;

    EXPORT  |aom_variance_halfpixvar16x16_hv_media|    ; make the entry point visible to the linker

    ARM                         ; assemble as ARM (not Thumb) instructions
    REQUIRE8                    ; code expects an 8-byte aligned stack
    PRESERVE8                   ; code keeps the stack 8-byte aligned

    AREA ||.text||, CODE, READONLY, ALIGN=2    ; code section, 2^2 = 4-byte aligned

;-----------------------------------------------------------------------------
; aom_variance_halfpixvar16x16_hv_media
;
; Variance of a 16x16 block whose source pixels are interpolated at the
; half-pixel position both horizontally and vertically before being compared
; with the reference block:
;     x = (a + b + 1) >> 1      a, b = horizontally adjacent pixels, row N
;     y = (c + d + 1) >> 1      c, d = the same two columns, row N+1
;     z = (x + y + 1) >> 1      value compared with the reference pixel
;
; In:
; r0    unsigned char *src_ptr
; r1    int source_stride
; r2    unsigned char *ref_ptr
; r3    int  recon_stride
; stack unsigned int *sse
;
; Out:
;   r0     sse - ((sum * sum) >> 8)
;   *sse   sum of squared differences over the 256 pixels
;
; Memory touched: each of the 16 iterations reads bytes 0..16 of source row N
; and of row N+1 (17 rows x 17 bytes overall) and bytes 0..15 of one reference
; row. Source words are loaded at byte offsets +1, +5, +9 and +13, so the
; target must permit unaligned LDR.
;
; Register roles inside the loop:
;   r0   source row N            r2   reference row
;   r9   source row N+1          r10  0x80808080 (rounding correction)
;   r8   signed running sum      r11  running sse
;   r12  rows remaining          lr   constant zero
;   r4-r7  scratch
;
; Rounding average, four bytes at a time: uhsub8(p, ~q) yields
; (p + q - 255) >> 1 in each byte lane and the XOR with 0x80 adds 128
; (mod 256), giving (p + q + 1) >> 1 without any widening.
;-----------------------------------------------------------------------------
|aom_variance_halfpixvar16x16_hv_media| PROC

    stmfd   sp!, {r4-r12, lr}   ; 10 registers = 40 bytes pushed

    pld     [r0, r1, lsl #0]
    pld     [r2, r3, lsl #0]

    mov     r8, #0              ; initialize sum = 0
    ldr     r10, c80808080
    mov     r11, #0             ; initialize sse = 0
    mov     r12, #16            ; set loop counter to 16 (=block height)
    mov     lr, #0              ; constant zero
loop
    add     r9, r0, r1          ; pointer to pixels on the next row
    ; 1st 4 pixels
    ldr     r4, [r0, #0]        ; load source pixels a, row N
    ldr     r6, [r0, #1]        ; load source pixels b, row N
    ldr     r5, [r9, #0]        ; load source pixels c, row N+1
    ldr     r7, [r9, #1]        ; load source pixels d, row N+1

    ; x = (a + b + 1) >> 1, interpolate pixels horizontally on row N
    mvn     r6, r6
    uhsub8  r4, r4, r6
    eor     r4, r4, r10
    ; y = (c + d + 1) >> 1, interpolate pixels horizontally on row N+1
    mvn     r7, r7
    uhsub8  r5, r5, r7
    eor     r5, r5, r10
    ; z = (x + y + 1) >> 1, interpolate half pixel values vertically
    mvn     r5, r5
    uhsub8  r4, r4, r5
    ldr     r5, [r2, #0]        ; load 4 ref pixels
    eor     r4, r4, r10

    ; usub8 sets the per-byte GE flags where no borrow occurred; sel then keeps
    ; those bytes and substitutes zero (lr) for the others.
    usub8   r6, r4, r5          ; calculate difference
    pld     [r0, r1, lsl #1]
    sel     r7, r6, lr          ; select bytes with positive difference
    usub8   r6, r5, r4          ; calculate difference with reversed operands
    pld     [r2, r3, lsl #1]
    sel     r6, r6, lr          ; select bytes with negative difference

    ; calculate partial sums
    usad8   r4, r7, lr          ; calculate sum of positive differences
    usad8   r5, r6, lr          ; calculate sum of negative differences
    ; each byte lane is non-zero in at most one of r6/r7, so the OR is |diff|
    orr     r6, r6, r7          ; differences of all 4 pixels
    ; calculate total sum
    ; plain add/sub: nothing reads NZCV before the subs on the loop counter
    add     r8, r8, r4          ; add positive differences to sum
    sub     r8, r8, r5          ; subtract negative differences from sum

    ; calculate sse
    uxtb16  r5, r6              ; byte (two pixels) to halfwords
    uxtb16  r7, r6, ror #8      ; another two pixels to halfwords
    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)

    ; 2nd 4 pixels
    ldr     r4, [r0, #4]        ; load source pixels a, row N
    ldr     r6, [r0, #5]        ; load source pixels b, row N
    ldr     r5, [r9, #4]        ; load source pixels c, row N+1

    smlad   r11, r7, r7, r11    ; dual signed multiply, add and accumulate (2)

    ldr     r7, [r9, #5]        ; load source pixels d, row N+1

    ; x = (a + b + 1) >> 1, interpolate pixels horizontally on row N
    mvn     r6, r6
    uhsub8  r4, r4, r6
    eor     r4, r4, r10
    ; y = (c + d + 1) >> 1, interpolate pixels horizontally on row N+1
    mvn     r7, r7
    uhsub8  r5, r5, r7
    eor     r5, r5, r10
    ; z = (x + y + 1) >> 1, interpolate half pixel values vertically
    mvn     r5, r5
    uhsub8  r4, r4, r5
    ldr     r5, [r2, #4]        ; load 4 ref pixels
    eor     r4, r4, r10

    usub8   r6, r4, r5          ; calculate difference
    sel     r7, r6, lr          ; select bytes with positive difference
    usub8   r6, r5, r4          ; calculate difference with reversed operands
    sel     r6, r6, lr          ; select bytes with negative difference

    ; calculate partial sums
    usad8   r4, r7, lr          ; calculate sum of positive differences
    usad8   r5, r6, lr          ; calculate sum of negative differences
    orr     r6, r6, r7          ; differences of all 4 pixels

    ; calculate total sum
    add     r8, r8, r4          ; add positive differences to sum
    sub     r8, r8, r5          ; subtract negative differences from sum

    ; calculate sse
    uxtb16  r5, r6              ; byte (two pixels) to halfwords
    uxtb16  r7, r6, ror #8      ; another two pixels to halfwords
    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)

    ; 3rd 4 pixels
    ldr     r4, [r0, #8]        ; load source pixels a, row N
    ldr     r6, [r0, #9]        ; load source pixels b, row N
    ldr     r5, [r9, #8]        ; load source pixels c, row N+1

    smlad   r11, r7, r7, r11    ; dual signed multiply, add and accumulate (2)

    ldr     r7, [r9, #9]        ; load source pixels d, row N+1

    ; x = (a + b + 1) >> 1, interpolate pixels horizontally on row N
    mvn     r6, r6
    uhsub8  r4, r4, r6
    eor     r4, r4, r10
    ; y = (c + d + 1) >> 1, interpolate pixels horizontally on row N+1
    mvn     r7, r7
    uhsub8  r5, r5, r7
    eor     r5, r5, r10
    ; z = (x + y + 1) >> 1, interpolate half pixel values vertically
    mvn     r5, r5
    uhsub8  r4, r4, r5
    ldr     r5, [r2, #8]        ; load 4 ref pixels
    eor     r4, r4, r10

    usub8   r6, r4, r5          ; calculate difference
    sel     r7, r6, lr          ; select bytes with positive difference
    usub8   r6, r5, r4          ; calculate difference with reversed operands
    sel     r6, r6, lr          ; select bytes with negative difference

    ; calculate partial sums
    usad8   r4, r7, lr          ; calculate sum of positive differences
    usad8   r5, r6, lr          ; calculate sum of negative differences
    orr     r6, r6, r7          ; differences of all 4 pixels

    ; calculate total sum
    add     r8, r8, r4          ; add positive differences to sum
    sub     r8, r8, r5          ; subtract negative differences from sum

    ; calculate sse
    uxtb16  r5, r6              ; byte (two pixels) to halfwords
    uxtb16  r7, r6, ror #8      ; another two pixels to halfwords
    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)

    ; 4th 4 pixels
    ldr     r4, [r0, #12]       ; load source pixels a, row N
    ldr     r6, [r0, #13]       ; load source pixels b, row N
    ldr     r5, [r9, #12]       ; load source pixels c, row N+1
    smlad   r11, r7, r7, r11    ; dual signed multiply, add and accumulate (2)
    ldr     r7, [r9, #13]       ; load source pixels d, row N+1

    ; x = (a + b + 1) >> 1, interpolate pixels horizontally on row N
    mvn     r6, r6
    uhsub8  r4, r4, r6
    eor     r4, r4, r10
    ; y = (c + d + 1) >> 1, interpolate pixels horizontally on row N+1
    mvn     r7, r7
    uhsub8  r5, r5, r7
    eor     r5, r5, r10
    ; z = (x + y + 1) >> 1, interpolate half pixel values vertically
    mvn     r5, r5
    uhsub8  r4, r4, r5
    ldr     r5, [r2, #12]       ; load 4 ref pixels
    eor     r4, r4, r10

    usub8   r6, r4, r5          ; calculate difference
    add     r0, r0, r1          ; set src_ptr to next row
    sel     r7, r6, lr          ; select bytes with positive difference
    usub8   r6, r5, r4          ; calculate difference with reversed operands
    add     r2, r2, r3          ; set dst_ptr to next row
    sel     r6, r6, lr          ; select bytes with negative difference

    ; calculate partial sums
    usad8   r4, r7, lr          ; calculate sum of positive differences
    usad8   r5, r6, lr          ; calculate sum of negative differences
    orr     r6, r6, r7          ; differences of all 4 pixels

    ; calculate total sum
    add     r8, r8, r4          ; add positive differences to sum
    sub     r8, r8, r5          ; subtract negative differences from sum

    ; calculate sse
    uxtb16  r5, r6              ; byte (two pixels) to halfwords
    uxtb16  r7, r6, ror #8      ; another two pixels to halfwords
    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
    subs    r12, r12, #1        ; one row done; Z flag drives the bne below
    smlad   r11, r7, r7, r11    ; dual signed multiply, add and accumulate (2)

    bne     loop

    ; return stuff
    ; the fifth argument sits just above the 40 bytes pushed on entry
    ldr     r6, [sp, #40]       ; get address of sse
    mul     r0, r8, r8          ; sum * sum
    str     r11, [r6]           ; store sse
    ; |sum| <= 256 * 255, so sum * sum fits in 32 bits only when treated as
    ; unsigned; hence the logical (not arithmetic) shift.
    sub     r0, r11, r0, lsr #8 ; return (sse - ((sum * sum) >> 8))

    ldmfd   sp!, {r4-r12, pc}

    ENDP

; Literal word read PC-relative by `ldr r10, c80808080` in the procedure above:
; 0x80 in every byte lane, XORed into each uhsub8 result to complete the
; per-byte rounding average (p + q + 1) >> 1.
c80808080
    DCD     0x80808080

    END