vp9_sad_sse4.asm 9.53 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"

%macro PROCESS_16X2X8 1
%if %1
        movdqa          xmm0,       XMMWORD PTR [rsi]
        movq            xmm1,       MMWORD PTR [rdi]
        movq            xmm3,       MMWORD PTR [rdi+8]
        movq            xmm2,       MMWORD PTR [rdi+16]
        punpcklqdq      xmm1,       xmm3
        punpcklqdq      xmm3,       xmm2

        movdqa          xmm2,       xmm1
        mpsadbw         xmm1,       xmm0,  0x0
        mpsadbw         xmm2,       xmm0,  0x5

        psrldq          xmm0,       8

        movdqa          xmm4,       xmm3
        mpsadbw         xmm3,       xmm0,  0x0
        mpsadbw         xmm4,       xmm0,  0x5

        paddw           xmm1,       xmm2
        paddw           xmm1,       xmm3
        paddw           xmm1,       xmm4
%else
        movdqa          xmm0,       XMMWORD PTR [rsi]
        movq            xmm5,       MMWORD PTR [rdi]
        movq            xmm3,       MMWORD PTR [rdi+8]
        movq            xmm2,       MMWORD PTR [rdi+16]
        punpcklqdq      xmm5,       xmm3
        punpcklqdq      xmm3,       xmm2

        movdqa          xmm2,       xmm5
        mpsadbw         xmm5,       xmm0,  0x0
        mpsadbw         xmm2,       xmm0,  0x5

        psrldq          xmm0,       8

        movdqa          xmm4,       xmm3
        mpsadbw         xmm3,       xmm0,  0x0
        mpsadbw         xmm4,       xmm0,  0x5

        paddw           xmm5,       xmm2
        paddw           xmm5,       xmm3
        paddw           xmm5,       xmm4

        paddw           xmm1,       xmm5
%endif
        movdqa          xmm0,       XMMWORD PTR [rsi + rax]
        movq            xmm5,       MMWORD PTR [rdi+ rdx]
        movq            xmm3,       MMWORD PTR [rdi+ rdx+8]
        movq            xmm2,       MMWORD PTR [rdi+ rdx+16]
        punpcklqdq      xmm5,       xmm3
        punpcklqdq      xmm3,       xmm2

        lea             rsi,        [rsi+rax*2]
        lea             rdi,        [rdi+rdx*2]

        movdqa          xmm2,       xmm5
        mpsadbw         xmm5,       xmm0,  0x0
        mpsadbw         xmm2,       xmm0,  0x5

        psrldq          xmm0,       8
        movdqa          xmm4,       xmm3
        mpsadbw         xmm3,       xmm0,  0x0
        mpsadbw         xmm4,       xmm0,  0x5

        paddw           xmm5,       xmm2
        paddw           xmm5,       xmm3
        paddw           xmm5,       xmm4

        paddw           xmm1,       xmm5
%endmacro

%macro PROCESS_8X2X8 1
%if %1
        movq            xmm0,       MMWORD PTR [rsi]
        movq            xmm1,       MMWORD PTR [rdi]
        movq            xmm3,       MMWORD PTR [rdi+8]
        punpcklqdq      xmm1,       xmm3

        movdqa          xmm2,       xmm1
        mpsadbw         xmm1,       xmm0,  0x0
        mpsadbw         xmm2,       xmm0,  0x5
        paddw           xmm1,       xmm2
%else
        movq            xmm0,       MMWORD PTR [rsi]
        movq            xmm5,       MMWORD PTR [rdi]
        movq            xmm3,       MMWORD PTR [rdi+8]
        punpcklqdq      xmm5,       xmm3

        movdqa          xmm2,       xmm5
        mpsadbw         xmm5,       xmm0,  0x0
        mpsadbw         xmm2,       xmm0,  0x5
        paddw           xmm5,       xmm2

        paddw           xmm1,       xmm5
%endif
        movq            xmm0,       MMWORD PTR [rsi + rax]
        movq            xmm5,       MMWORD PTR [rdi+ rdx]
        movq            xmm3,       MMWORD PTR [rdi+ rdx+8]
        punpcklqdq      xmm5,       xmm3

        lea             rsi,        [rsi+rax*2]
        lea             rdi,        [rdi+rdx*2]

        movdqa          xmm2,       xmm5
        mpsadbw         xmm5,       xmm0,  0x0
        mpsadbw         xmm2,       xmm0,  0x5
        paddw           xmm5,       xmm2

        paddw           xmm1,       xmm5
%endmacro

%macro PROCESS_4X2X8 1
%if %1
        movd            xmm0,       [rsi]
        movq            xmm1,       MMWORD PTR [rdi]
        movq            xmm3,       MMWORD PTR [rdi+8]
        punpcklqdq      xmm1,       xmm3

        mpsadbw         xmm1,       xmm0,  0x0
%else
        movd            xmm0,       [rsi]
        movq            xmm5,       MMWORD PTR [rdi]
        movq            xmm3,       MMWORD PTR [rdi+8]
        punpcklqdq      xmm5,       xmm3

        mpsadbw         xmm5,       xmm0,  0x0

        paddw           xmm1,       xmm5
%endif
        movd            xmm0,       [rsi + rax]
        movq            xmm5,       MMWORD PTR [rdi+ rdx]
        movq            xmm3,       MMWORD PTR [rdi+ rdx+8]
        punpcklqdq      xmm5,       xmm3

        lea             rsi,        [rsi+rax*2]
        lea             rdi,        [rdi+rdx*2]

        mpsadbw         xmm5,       xmm0,  0x0

        paddw           xmm1,       xmm5
%endmacro

157
158
159
160
161
162
163
164
165
166
%macro WRITE_AS_INTS 0
    mov             rdi,        arg(4)           ;Results
    pxor            xmm0, xmm0
    movdqa          xmm2, xmm1
    punpcklwd       xmm1, xmm0
    punpckhwd       xmm2, xmm0

    movdqa          [rdi],    xmm1
    movdqa          [rdi + 16],    xmm2
%endmacro
167

168
;void vp9_sad16x16x8_sse4(
169
170
171
172
173
;    const unsigned char *src_ptr,
;    int  src_stride,
;    const unsigned char *ref_ptr,
;    int  ref_stride,
;    unsigned short *sad_array);
174
global sym(vp9_sad16x16x8_sse4) PRIVATE
175
sym(vp9_sad16x16x8_sse4):
176
177
178
179
180
181
182
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    push        rsi
    push        rdi
    ; end prolog

183
184
    mov             rsi,        arg(0)           ;src_ptr
    mov             rdi,        arg(2)           ;ref_ptr
185

186
187
    movsxd          rax,        dword ptr arg(1) ;src_stride
    movsxd          rdx,        dword ptr arg(3) ;ref_stride
188

189
190
191
192
193
194
195
196
    PROCESS_16X2X8 1
    PROCESS_16X2X8 0
    PROCESS_16X2X8 0
    PROCESS_16X2X8 0
    PROCESS_16X2X8 0
    PROCESS_16X2X8 0
    PROCESS_16X2X8 0
    PROCESS_16X2X8 0
197

198
    WRITE_AS_INTS
199
200
201
202
203
204
205
206
207

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret


208
;void vp9_sad16x8x8_sse4(
209
210
211
212
213
214
;    const unsigned char *src_ptr,
;    int  src_stride,
;    const unsigned char *ref_ptr,
;    int  ref_stride,
;    unsigned short *sad_array
;);
215
global sym(vp9_sad16x8x8_sse4) PRIVATE
216
sym(vp9_sad16x8x8_sse4):
217
218
219
220
221
222
223
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    push        rsi
    push        rdi
    ; end prolog

224
225
    mov             rsi,        arg(0)           ;src_ptr
    mov             rdi,        arg(2)           ;ref_ptr
226

227
228
    movsxd          rax,        dword ptr arg(1) ;src_stride
    movsxd          rdx,        dword ptr arg(3) ;ref_stride
229

230
231
232
233
    PROCESS_16X2X8 1
    PROCESS_16X2X8 0
    PROCESS_16X2X8 0
    PROCESS_16X2X8 0
234

235
    WRITE_AS_INTS
236
237
238
239
240
241
242
243
244

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret


245
;void vp9_sad8x8x8_sse4(
246
247
248
249
250
251
;    const unsigned char *src_ptr,
;    int  src_stride,
;    const unsigned char *ref_ptr,
;    int  ref_stride,
;    unsigned short *sad_array
;);
252
global sym(vp9_sad8x8x8_sse4) PRIVATE
253
sym(vp9_sad8x8x8_sse4):
254
255
256
257
258
259
260
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    push        rsi
    push        rdi
    ; end prolog

261
262
    mov             rsi,        arg(0)           ;src_ptr
    mov             rdi,        arg(2)           ;ref_ptr
263

264
265
    movsxd          rax,        dword ptr arg(1) ;src_stride
    movsxd          rdx,        dword ptr arg(3) ;ref_stride
266

267
268
269
270
    PROCESS_8X2X8 1
    PROCESS_8X2X8 0
    PROCESS_8X2X8 0
    PROCESS_8X2X8 0
271

272
    WRITE_AS_INTS
273
274
275
276
277
278
279
280
281

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret


282
;void vp9_sad8x16x8_sse4(
283
284
285
286
287
288
;    const unsigned char *src_ptr,
;    int  src_stride,
;    const unsigned char *ref_ptr,
;    int  ref_stride,
;    unsigned short *sad_array
;);
289
global sym(vp9_sad8x16x8_sse4) PRIVATE
290
sym(vp9_sad8x16x8_sse4):
291
292
293
294
295
296
297
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    push        rsi
    push        rdi
    ; end prolog

298
299
300
301
302
    mov             rsi,        arg(0)           ;src_ptr
    mov             rdi,        arg(2)           ;ref_ptr

    movsxd          rax,        dword ptr arg(1) ;src_stride
    movsxd          rdx,        dword ptr arg(3) ;ref_stride
303

304
305
306
307
308
309
310
311
    PROCESS_8X2X8 1
    PROCESS_8X2X8 0
    PROCESS_8X2X8 0
    PROCESS_8X2X8 0
    PROCESS_8X2X8 0
    PROCESS_8X2X8 0
    PROCESS_8X2X8 0
    PROCESS_8X2X8 0
312

313
    WRITE_AS_INTS
314
315
316
317
318
319
320
321
322

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret


323
;void vp9_sad4x4x8_c(
324
325
326
327
328
329
;    const unsigned char *src_ptr,
;    int  src_stride,
;    const unsigned char *ref_ptr,
;    int  ref_stride,
;    unsigned short *sad_array
;);
330
global sym(vp9_sad4x4x8_sse4) PRIVATE
331
sym(vp9_sad4x4x8_sse4):
332
333
334
335
336
337
338
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    push        rsi
    push        rdi
    ; end prolog

339
340
    mov             rsi,        arg(0)           ;src_ptr
    mov             rdi,        arg(2)           ;ref_ptr
341

342
343
    movsxd          rax,        dword ptr arg(1) ;src_stride
    movsxd          rdx,        dword ptr arg(3) ;ref_stride
344

345
346
    PROCESS_4X2X8 1
    PROCESS_4X2X8 0
347

348
    WRITE_AS_INTS
349
350
351
352
353
354
355
356
357
358
359

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret