;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"

; Use of pmaxub instead of psubusb to compute filter mask was seen
; in ffvp8
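;
; For reference, a rough scalar sketch of what LFH_FILTER_AND_HEV_MASK below
; computes for each pixel position (an illustration only, not part of the
; build; the instruction comments are the authoritative description):
;
;   over = (abs(q3-q2) > limit) | (abs(q2-q1) > limit) | (abs(q1-q0) > limit)
;        | (abs(p3-p2) > limit) | (abs(p2-p1) > limit) | (abs(p1-p0) > limit)
;        | (abs(p0-q0)*2 + abs(p1-q1)/2 > flimit*2 + limit)
;   mask = over ? 0x00 : 0xff                                ; xmm1 on exit
;   hev  = (abs(p1-p0) > thresh) || (abs(q1-q0) > thresh)    ; xmm4 on exit,
;                                                            ; 0xff when true
;
; pmaxub folds the individual abs-difference terms into one running maximum,
; so a single saturating subtract of the limit replaces a chain of per-term
; compares.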

%macro LFH_FILTER_AND_HEV_MASK 1
%if %1
        movdqa      xmm2,                   [rdi+2*rax]       ; q3
        movdqa      xmm1,                   [rsi+2*rax]       ; q2
        movdqa      xmm4,                   [rsi+rax]         ; q1
        movdqa      xmm5,                   [rsi]             ; q0
        neg         rax                     ; negate pitch to deal with above border
%else
        movlps      xmm2,                   [rsi + rcx*2]     ; q3
        movlps      xmm1,                   [rsi + rcx]       ; q2
        movlps      xmm4,                   [rsi]             ; q1
        movlps      xmm5,                   [rsi + rax]       ; q0

        movhps      xmm2,                   [rdi + rcx*2]
        movhps      xmm1,                   [rdi + rcx]
        movhps      xmm4,                   [rdi]
        movhps      xmm5,                   [rdi + rax]

        lea         rsi,                    [rsi + rax*4]
        lea         rdi,                    [rdi + rax*4]

        movdqa      XMMWORD PTR [rsp],      xmm1              ; store q2
        movdqa      XMMWORD PTR [rsp + 16], xmm4              ; store q1
%endif

        movdqa      xmm6,                   xmm1              ; q2
        movdqa      xmm3,                   xmm4              ; q1

        psubusb     xmm1,                   xmm2              ; q2-=q3
        psubusb     xmm2,                   xmm6              ; q3-=q2

        psubusb     xmm4,                   xmm6              ; q1-=q2
        psubusb     xmm6,                   xmm3              ; q2-=q1

        por         xmm4,                   xmm6              ; abs(q2-q1)
        por         xmm1,                   xmm2              ; abs(q3-q2)

        movdqa      xmm0,                   xmm5              ; q0
        pmaxub      xmm1,                   xmm4

        psubusb     xmm5,                   xmm3              ; q0-=q1
        psubusb     xmm3,                   xmm0              ; q1-=q0

        por         xmm5,                   xmm3              ; abs(q0-q1)
        movdqa      t0,                     xmm5              ; save to t0

        pmaxub      xmm1,                   xmm5

%if %1
        movdqa      xmm2,                   [rsi+4*rax]       ; p3
        movdqa      xmm4,                   [rdi+4*rax]       ; p2
        movdqa      xmm6,                   [rsi+2*rax]       ; p1
%else
        movlps      xmm2,                   [rsi + rax]       ; p3
        movlps      xmm4,                   [rsi]             ; p2
        movlps      xmm6,                   [rsi + rcx]       ; p1

        movhps      xmm2,                   [rdi + rax]
        movhps      xmm4,                   [rdi]
        movhps      xmm6,                   [rdi + rcx]

        movdqa      XMMWORD PTR [rsp + 32], xmm4              ; store p2
        movdqa      XMMWORD PTR [rsp + 48], xmm6              ; store p1
%endif

        movdqa      xmm5,                   xmm4              ; p2
        movdqa      xmm3,                   xmm6              ; p1

        psubusb     xmm4,                   xmm2              ; p2-=p3
        psubusb     xmm2,                   xmm5              ; p3-=p2

        psubusb     xmm3,                   xmm5              ; p1-=p2
        pmaxub      xmm1,                   xmm4              ; abs(p3 - p2)

        psubusb     xmm5,                   xmm6              ; p2-=p1
        pmaxub      xmm1,                   xmm2              ; abs(p3 - p2)

        pmaxub      xmm1,                   xmm5              ; abs(p2 - p1)
        movdqa      xmm2,                   xmm6              ; p1

        pmaxub      xmm1,                   xmm3              ; abs(p2 - p1)
%if %1
        movdqa      xmm4,                   [rsi+rax]         ; p0
        movdqa      xmm3,                   [rdi]             ; q1
%else
        movlps      xmm4,                   [rsi + rcx*2]     ; p0
        movhps      xmm4,                   [rdi + rcx*2]
        movdqa      xmm3,                   q1                ; q1
%endif

        movdqa      xmm5,                   xmm4              ; p0
        psubusb     xmm4,                   xmm6              ; p0-=p1

        psubusb     xmm6,                   xmm5              ; p1-=p0

        por         xmm6,                   xmm4              ; abs(p1 - p0)
        mov         rdx,                    arg(2)            ; get flimit

        movdqa        t1,                   xmm6              ; save to t1

        movdqa      xmm4,                   xmm3              ; q1
        pmaxub      xmm1,                   xmm6

        psubusb     xmm3,                   xmm2              ; q1-=p1
        psubusb     xmm2,                   xmm4              ; p1-=q1

        psubusb     xmm1,                   xmm7
        por         xmm2,                   xmm3              ; abs(p1-q1)

        movdqa      xmm4,                   XMMWORD PTR [rdx] ; flimit

        movdqa      xmm3,                   xmm0              ; q0
        pand        xmm2,                   [GLOBAL(tfe)]     ; set lsb of each byte to zero

        mov         rdx,                    arg(4)            ; hev get thresh

        movdqa      xmm6,                   xmm5              ; p0
        psrlw       xmm2,                   1                 ; abs(p1-q1)/2

        psubusb     xmm5,                   xmm3              ; p0-=q0
        paddb       xmm4,                   xmm4              ; flimit*2 (less than 255)

        psubusb     xmm3,                   xmm6              ; q0-=p0
        por         xmm5,                   xmm3              ; abs(p0 - q0)

        paddusb     xmm5,                   xmm5              ; abs(p0-q0)*2

        paddb       xmm7,                   xmm4              ; flimit * 2 + limit (less than 255)

        movdqa      xmm4,                   t0                ; hev get abs (q1 - q0)

        movdqa      xmm3,                   t1                ; get abs (p1 - p0)

        paddusb     xmm5,                   xmm2              ; abs (p0 - q0) *2 + abs(p1-q1)/2

        movdqa      xmm2,                   XMMWORD PTR [rdx] ; hev

        psubusb     xmm5,                   xmm7              ; abs (p0 - q0) *2 + abs(p1-q1)/2  > flimit * 2 + limit
        psubusb     xmm4,                   xmm2              ; hev

        psubusb     xmm3,                   xmm2              ; hev
        por         xmm1,                   xmm5

        pxor        xmm7,                   xmm7
        paddb       xmm4,                   xmm3              ; hev abs(q1 - q0) > thresh || abs(p1 - p0) > thresh

        pcmpeqb     xmm4,                   xmm5              ; hev
        pcmpeqb     xmm3,                   xmm3              ; hev

        pcmpeqb     xmm1,                   xmm7              ; mask xmm1
        pxor        xmm4,                   xmm3              ; hev
%endmacro

%macro B_FILTER 1
%if %1 == 0
        movdqa      xmm2,                   p1                ; p1
        movdqa      xmm7,                   q1                ; q1
%elif %1 == 1
        movdqa      xmm2,                   [rsi+2*rax]       ; p1
        movdqa      xmm7,                   [rdi]             ; q1
%elif %1 == 2
        lea         rdx,                    srct

        movdqa      xmm2,                   [rdx]             ; p1
        movdqa      xmm7,                   [rdx+48]          ; q1
        movdqa      xmm6,                   [rdx+16]          ; p0
        movdqa      xmm0,                   [rdx+32]          ; q0
%endif

        pxor        xmm2,                   [GLOBAL(t80)]     ; p1 offset to convert to signed values
        pxor        xmm7,                   [GLOBAL(t80)]     ; q1 offset to convert to signed values

        psubsb      xmm2,                   xmm7              ; p1 - q1
        pxor        xmm6,                   [GLOBAL(t80)]     ; offset to convert to signed values

        pand        xmm2,                   xmm4              ; high var mask (hvm)(p1 - q1)
        pxor        xmm0,                   [GLOBAL(t80)]     ; offset to convert to signed values

        movdqa      xmm3,                   xmm0              ; q0
        psubsb      xmm0,                   xmm6              ; q0 - p0

        paddsb      xmm2,                   xmm0              ; 1 * (q0 - p0) + hvm(p1 - q1)

        paddsb      xmm2,                   xmm0              ; 2 * (q0 - p0) + hvm(p1 - q1)

        paddsb      xmm2,                   xmm0              ; 3 * (q0 - p0) + hvm(p1 - q1)

        pand        xmm1,                   xmm2              ; mask filter values we don't care about

        movdqa      xmm2,                   xmm1

        paddsb      xmm1,                   [GLOBAL(t4)]      ; 3* (q0 - p0) + hvm(p1 - q1) + 4
        paddsb      xmm2,                   [GLOBAL(t3)]      ; 3* (q0 - p0) + hvm(p1 - q1) + 3

        punpckhbw   xmm5,                   xmm2              ; axbxcxdx
        punpcklbw   xmm2,                   xmm2              ; exfxgxhx

        punpcklbw   xmm0,                   xmm1              ; exfxgxhx
        psraw       xmm5,                   11                ; sign extended shift right by 3

        punpckhbw   xmm1,                   xmm1              ; axbxcxdx
        psraw       xmm2,                   11                ; sign extended shift right by 3

        packsswb    xmm2,                   xmm5              ; (3* (q0 - p0) + hvm(p1 - q1) + 3) >> 3;
        psraw       xmm0,                   11                ; sign extended shift right by 3

        psraw       xmm1,                   11                ; sign extended shift right by 3
        movdqa      xmm5,                   xmm0              ; save results

        packsswb    xmm0,                   xmm1              ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>3
        paddsw      xmm5,                   [GLOBAL(ones)]

        paddsw      xmm1,                   [GLOBAL(ones)]
        psraw       xmm5,                   1                 ; partial shifted one more time for 2nd tap

        psraw       xmm1,                   1                 ; partial shifted one more time for 2nd tap

        paddsb      xmm6,                   xmm2              ; p0+= p0 add
        packsswb    xmm5,                   xmm1              ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>4

%if %1 == 0
        movdqa      xmm1,                   p1                ; p1
%elif %1 == 1
        movdqa      xmm1,                   [rsi+2*rax]       ; p1
%elif %1 == 2
        movdqa      xmm1,                   [rdx]             ; p1
%endif
        pandn       xmm4,                   xmm5              ; high edge variance additive
        pxor        xmm6,                   [GLOBAL(t80)]     ; unoffset

        pxor        xmm1,                   [GLOBAL(t80)]     ; reoffset
        psubsb      xmm3,                   xmm0              ; q0-= q0 add

        paddsb      xmm1,                   xmm4              ; p1+= p1 add
        pxor        xmm3,                   [GLOBAL(t80)]     ; unoffset

        pxor        xmm1,                   [GLOBAL(t80)]     ; unoffset
        psubsb      xmm7,                   xmm4              ; q1-= q1 add

        pxor        xmm7,                   [GLOBAL(t80)]     ; unoffset
%if %1 == 0
        lea         rsi,                    [rsi + rcx*2]
        lea         rdi,                    [rdi + rcx*2]
        movq        MMWORD PTR [rsi],       xmm6              ; p0
        movhps      MMWORD PTR [rdi],       xmm6
        movq        MMWORD PTR [rsi + rax], xmm1              ; p1
        movhps      MMWORD PTR [rdi + rax], xmm1
        movq        MMWORD PTR [rsi + rcx], xmm3              ; q0
        movhps      MMWORD PTR [rdi + rcx], xmm3
        movq        MMWORD PTR [rsi + rcx*2],xmm7             ; q1
        movhps      MMWORD PTR [rdi + rcx*2],xmm7
%elif %1 == 1
        movdqa      [rsi+rax],              xmm6              ; write back
        movdqa      [rsi+2*rax],            xmm1              ; write back
        movdqa      [rsi],                  xmm3              ; write back
        movdqa      [rdi],                  xmm7              ; write back
%endif

%endmacro
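
;
; For reference, a rough scalar model of the filter B_FILTER above applies
; wherever the mask is set (a sketch only, not part of the build):
;
;   a  = clamp(p1 - q1)            ; the p1/q1 term is kept only where hev
;   a  = clamp(a + 3 * (q0 - p0))
;   F1 = clamp(a + 4) >> 3
;   F2 = clamp(a + 3) >> 3
;   q0 = q0 - F1
;   p0 = p0 + F2
;   a  = (F1 + 1) >> 1             ; outer tap, applied only where hev is clear
;   q1 = q1 - a
;   p1 = p1 + a
;
; Pixels are biased by 0x80 first so the byte arithmetic is signed, and
; un-biased again before the write-back.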


;void vp8_loop_filter_horizontal_edge_sse2
;(
;    unsigned char *src_ptr,
;    int            src_pixel_step,
;    const char    *flimit,
;    const char    *limit,
;    const char    *thresh,
;    int            count
;)
global sym(vp8_loop_filter_horizontal_edge_sse2)
sym(vp8_loop_filter_horizontal_edge_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 32     ; reserve 32 bytes
    %define t0 [rsp + 0]    ;__declspec(align(16)) char t0[16];
    %define t1 [rsp + 16]   ;__declspec(align(16)) char t1[16];

        mov         rsi,                    arg(0)           ;src_ptr
        movsxd      rax,                    dword ptr arg(1) ;src_pixel_step

        mov         rdx,                    arg(3)           ;limit
        movdqa      xmm7,                   XMMWORD PTR [rdx]

        lea         rdi,                    [rsi+rax]        ; rdi points to row +1 for indirect addressing

        ; calculate breakout conditions and high edge variance
        LFH_FILTER_AND_HEV_MASK 1
        ; filter and write back the result
        B_FILTER 1

    add rsp, 32
    pop rsp
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret


;void vp8_loop_filter_horizontal_edge_uv_sse2
;(
;    unsigned char *src_ptr,
;    int            src_pixel_step,
;    const char    *flimit,
;    const char    *limit,
;    const char    *thresh,
;    unsigned char *v
;)
global sym(vp8_loop_filter_horizontal_edge_uv_sse2)
sym(vp8_loop_filter_horizontal_edge_uv_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 96       ; reserve 96 bytes
    %define q2  [rsp + 0]     ;__declspec(align(16)) char q2[16];
    %define q1  [rsp + 16]    ;__declspec(align(16)) char q1[16];
    %define p2  [rsp + 32]    ;__declspec(align(16)) char p2[16];
    %define p1  [rsp + 48]    ;__declspec(align(16)) char p1[16];
    %define t0  [rsp + 64]    ;__declspec(align(16)) char t0[16];
    %define t1  [rsp + 80]    ;__declspec(align(16)) char t1[16];

        mov         rsi,                    arg(0)             ; u
        mov         rdi,                    arg(5)             ; v
        movsxd      rax,                    dword ptr arg(1)   ; src_pixel_step
        mov         rcx,                    rax
        neg         rax                     ; negate pitch to deal with above border

        mov         rdx,                    arg(3)             ;limit
        movdqa      xmm7,                   XMMWORD PTR [rdx]

        lea         rsi,                    [rsi + rcx]
        lea         rdi,                    [rdi + rcx]

        ; calculate breakout conditions and high edge variance
        LFH_FILTER_AND_HEV_MASK 0
        ; filter and write back the result
        B_FILTER 0

    add rsp, 96
    pop rsp
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret


%macro MB_FILTER_AND_WRITEBACK 1
%if %1 == 0
        movdqa      xmm2,                   p1              ; p1
        movdqa      xmm7,                   q1              ; q1
%elif %1 == 1
        movdqa      xmm2,                   [rsi+2*rax]     ; p1
        movdqa      xmm7,                   [rdi]           ; q1

        mov         rcx,                    rax
        neg         rcx
%elif %1 == 2
        lea         rdx,                    srct

        movdqa      xmm2,                   [rdx+32]        ; p1
        movdqa      xmm7,                   [rdx+80]        ; q1
        movdqa      xmm6,                   [rdx+48]        ; p0
        movdqa      xmm0,                   [rdx+64]        ; q0
%endif

        pxor        xmm2,                   [GLOBAL(t80)]   ; p1 offset to convert to signed values
        pxor        xmm7,                   [GLOBAL(t80)]   ; q1 offset to convert to signed values
        pxor        xmm6,                   [GLOBAL(t80)]   ; offset to convert to signed values
        pxor        xmm0,                   [GLOBAL(t80)]   ; offset to convert to signed values

        psubsb      xmm2,                   xmm7            ; p1 - q1
        movdqa      xmm3,                   xmm0            ; q0

        psubsb      xmm0,                   xmm6            ; q0 - p0

        paddsb      xmm2,                   xmm0            ; 1 * (q0 - p0) + (p1 - q1)

        paddsb      xmm2,                   xmm0            ; 2 * (q0 - p0)

        paddsb      xmm2,                   xmm0            ; 3 * (q0 - p0) + (p1 - q1)

        pand        xmm1,                   xmm2            ; mask filter values we don't care about

        movdqa      xmm2,                   xmm1            ; vp8_filter

        pand        xmm2,                   xmm4            ; Filter2 = vp8_filter & hev
        pxor        xmm0,                   xmm0

        pandn       xmm4,                   xmm1            ; vp8_filter&=~hev
        pxor        xmm1,                   xmm1

        punpcklbw   xmm0,                   xmm4            ; Filter 2 (hi)
        movdqa      xmm5,                   xmm2

        punpckhbw   xmm1,                   xmm4            ; Filter 2 (lo)
        paddsb      xmm5,                   [GLOBAL(t3)]    ; vp8_signed_char_clamp(Filter2 + 3)

        pmulhw      xmm1,                   [GLOBAL(s9)]    ; Filter 2 (lo) * 9

        pmulhw      xmm0,                   [GLOBAL(s9)]    ; Filter 2 (hi) * 9

        punpckhbw   xmm7,                   xmm5            ; axbxcxdx
        paddsb      xmm2,                   [GLOBAL(t4)]    ; vp8_signed_char_clamp(Filter2 + 4)

        punpcklbw   xmm5,                   xmm5            ; exfxgxhx
        psraw       xmm7,                   11              ; sign extended shift right by 3

        psraw       xmm5,                   11              ; sign extended shift right by 3
        punpckhbw   xmm4,                   xmm2            ; axbxcxdx

        punpcklbw   xmm2,                   xmm2            ; exfxgxhx
        psraw       xmm4,                   11              ; sign extended shift right by 3

        packsswb    xmm5,                   xmm7            ; Filter2 >>=3;
        psraw       xmm2,                   11              ; sign extended shift right by 3

        packsswb    xmm2,                   xmm4            ; Filter1 >>=3;
        movdqa      xmm7,                   xmm1

        paddsb      xmm6,                   xmm5            ; ps0 = ps0 + Filter2
        movdqa      xmm4,                   xmm1

        psubsb      xmm3,                   xmm2            ; qs0 =qs0 - Filter1
        movdqa      xmm5,                   xmm0

        movdqa      xmm2,                   xmm5
        paddw       xmm0,                   [GLOBAL(s63)]   ; Filter 2 (hi) * 9 + 63

        paddw       xmm1,                   [GLOBAL(s63)]   ; Filter 2 (lo) * 9 + 63
        paddw       xmm5,                   xmm5            ; Filter 2 (hi) * 18

        paddw       xmm7,                   xmm7            ; Filter 2 (lo) * 18
        paddw       xmm5,                   xmm0            ; Filter 2 (hi) * 27 + 63

        paddw       xmm7,                   xmm1            ; Filter 2 (lo) * 27 + 63
        paddw       xmm2,                   xmm0            ; Filter 2 (hi) * 18 + 63

        paddw       xmm4,                   xmm1            ; Filter 2 (lo) * 18 + 63
        psraw       xmm0,                   7               ; (Filter 2 (hi) * 9 + 63) >> 7

        psraw       xmm1,                   7               ; (Filter 2 (lo) * 9 + 63) >> 7
        psraw       xmm2,                   7               ; (Filter 2 (hi) * 18 + 63) >> 7

        packsswb    xmm0,                   xmm1            ; u1 = vp8_signed_char_clamp((63 + Filter2 * 9)>>7)
        psraw       xmm4,                   7               ; (Filter 2 (lo) * 18 + 63) >> 7

        psraw       xmm5,                   7               ; (Filter 2 (hi) * 27 + 63) >> 7
        packsswb    xmm2,                   xmm4            ; u2 = vp8_signed_char_clamp((63 + Filter2 * 18)>>7)

        psraw       xmm7,                   7               ; (Filter 2 (lo) * 27 + 63) >> 7

        packsswb    xmm5,                   xmm7            ; u3 = vp8_signed_char_clamp((63 + Filter2 * 27)>>7)

        psubsb      xmm3,                   xmm5            ; sq = vp8_signed_char_clamp(qs0 - u3)
        paddsb      xmm6,                   xmm5            ; sp = vp8_signed_char_clamp(ps0 + u3)

%if %1 == 0
        movdqa      xmm5,                   q2              ; q2
        movdqa      xmm1,                   q1              ; q1
        movdqa      xmm4,                   p1              ; p1
        movdqa      xmm7,                   p2              ; p2

%elif %1 == 1
        movdqa      xmm5,                   XMMWORD PTR [rdi+rcx]   ; q2
        movdqa      xmm1,                   XMMWORD PTR [rdi]       ; q1
        movdqa      xmm4,                   XMMWORD PTR [rsi+rax*2] ; p1
        movdqa      xmm7,                   XMMWORD PTR [rdi+rax*4] ; p2
%elif %1 == 2
        movdqa      xmm5,                   XMMWORD PTR [rdx+96]    ; q2
        movdqa      xmm1,                   XMMWORD PTR [rdx+80]    ; q1
        movdqa      xmm4,                   XMMWORD PTR [rdx+32]    ; p1
        movdqa      xmm7,                   XMMWORD PTR [rdx+16]    ; p2
%endif

        pxor        xmm3,                   [GLOBAL(t80)]   ; *oq0 = sq^0x80
        pxor        xmm6,                   [GLOBAL(t80)]   ; *op0 = sp^0x80

        pxor        xmm1,                   [GLOBAL(t80)]
        pxor        xmm4,                   [GLOBAL(t80)]

        psubsb      xmm1,                   xmm2            ; sq = vp8_signed_char_clamp(qs1 - u2)
        paddsb      xmm4,                   xmm2            ; sp = vp8_signed_char_clamp(ps1 + u2)

        pxor        xmm1,                   [GLOBAL(t80)]   ; *oq1 = sq^0x80;
        pxor        xmm4,                   [GLOBAL(t80)]   ; *op1 = sp^0x80;

        pxor        xmm7,                   [GLOBAL(t80)]
        pxor        xmm5,                   [GLOBAL(t80)]

        paddsb      xmm7,                   xmm0            ; sp = vp8_signed_char_clamp(ps2 + u)
        psubsb      xmm5,                   xmm0            ; sq = vp8_signed_char_clamp(qs2 - u)

        pxor        xmm7,                   [GLOBAL(t80)]   ; *op2 = sp^0x80;
        pxor        xmm5,                   [GLOBAL(t80)]   ; *oq2 = sq^0x80;

%if %1 == 0
        lea         rsi,                    [rsi+rcx*2]
        lea         rdi,                    [rdi+rcx*2]

        movq        MMWORD PTR [rsi],       xmm6            ; p0
        movhps      MMWORD PTR [rdi],       xmm6
        movq        MMWORD PTR [rsi + rcx], xmm3            ; q0
        movhps      MMWORD PTR [rdi + rcx], xmm3

        movq        MMWORD PTR [rsi+rcx*2], xmm1            ; q1
        movhps      MMWORD PTR [rdi+rcx*2], xmm1

        movq        MMWORD PTR [rsi + rax], xmm4            ; p1
        movhps      MMWORD PTR [rdi + rax], xmm4

        movq        MMWORD PTR [rsi+rax*2], xmm7            ; p2
        movhps      MMWORD PTR [rdi+rax*2], xmm7

        lea         rsi,                    [rsi + rcx]
        lea         rdi,                    [rdi + rcx]

        movq        MMWORD PTR [rsi+rcx*2], xmm5            ; q2
        movhps      MMWORD PTR [rdi+rcx*2], xmm5
%elif %1 == 1
        movdqa      XMMWORD PTR [rdi+rcx],  xmm5            ; q2
        movdqa      XMMWORD PTR [rdi],      xmm1            ; q1
        movdqa      XMMWORD PTR [rsi],      xmm3            ; q0
        movdqa      XMMWORD PTR [rsi+rax  ],xmm6            ; p0
        movdqa      XMMWORD PTR [rsi+rax*2],xmm4            ; p1
        movdqa      XMMWORD PTR [rdi+rax*4],xmm7            ; p2
%elif %1 == 2
        movdqa      XMMWORD PTR [rdx+80],   xmm1            ; q1
        movdqa      XMMWORD PTR [rdx+64],   xmm3            ; q0
        movdqa      XMMWORD PTR [rdx+48],   xmm6            ; p0
        movdqa      XMMWORD PTR [rdx+32],   xmm4            ; p1
%endif

%endmacro
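
;
; For reference, a rough scalar model of the macroblock-edge filter applied by
; MB_FILTER_AND_WRITEBACK above (a sketch only, not part of the build; u1/u2/u3
; follow the names used in the instruction comments):
;
;   a        = clamp(clamp(p1 - q1) + 3 * (q0 - p0)), masked by the filter mask
;   Filter2  = a & hev                        ; hard filter on p0/q0 where hev
;   p0      += clamp(Filter2 + 3) >> 3
;   q0      -= clamp(Filter2 + 4) >> 3
;   a       &= ~hev                           ; wide filter where hev is clear
;   u3       = clamp((27 * a + 63) >> 7)  ->  p0 += u3,  q0 -= u3
;   u2       = clamp((18 * a + 63) >> 7)  ->  p1 += u2,  q1 -= u2
;   u1       = clamp(( 9 * a + 63) >> 7)  ->  p2 += u1,  q2 -= u1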


;void vp8_mbloop_filter_horizontal_edge_sse2
;(
;    unsigned char *src_ptr,
;    int            src_pixel_step,
;    const char    *flimit,
;    const char    *limit,
;    const char    *thresh,
;    int            count
;)
global sym(vp8_mbloop_filter_horizontal_edge_sse2)
sym(vp8_mbloop_filter_horizontal_edge_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 32     ; reserve 32 bytes
    %define t0 [rsp + 0]    ;__declspec(align(16)) char t0[16];
    %define t1 [rsp + 16]   ;__declspec(align(16)) char t1[16];

        mov         rsi,                    arg(0)            ;src_ptr
        movsxd      rax,                    dword ptr arg(1)  ;src_pixel_step

        mov         rdx,                    arg(3)            ;limit
        movdqa      xmm7,                   XMMWORD PTR [rdx]

        lea         rdi,                    [rsi+rax]         ; rdi points to row +1 for indirect addressing

        ; calculate breakout conditions and high edge variance
        LFH_FILTER_AND_HEV_MASK 1
        ; filter and write back the results
        MB_FILTER_AND_WRITEBACK 1

    add rsp, 32
    pop rsp
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret


;void vp8_mbloop_filter_horizontal_edge_uv_sse2
;(
;    unsigned char *u,
;    int            src_pixel_step,
;    const char    *flimit,
;    const char    *limit,
;    const char    *thresh,
;    unsigned char *v
;)
global sym(vp8_mbloop_filter_horizontal_edge_uv_sse2)
sym(vp8_mbloop_filter_horizontal_edge_uv_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 96       ; reserve 96 bytes
    %define q2  [rsp + 0]     ;__declspec(align(16)) char q2[16];
    %define q1  [rsp + 16]    ;__declspec(align(16)) char q1[16];
    %define p2  [rsp + 32]    ;__declspec(align(16)) char p2[16];
    %define p1  [rsp + 48]    ;__declspec(align(16)) char p1[16];
    %define t0  [rsp + 64]    ;__declspec(align(16)) char t0[16];
    %define t1  [rsp + 80]    ;__declspec(align(16)) char t1[16];

        mov         rsi,                    arg(0)             ; u
        mov         rdi,                    arg(5)             ; v
        movsxd      rax,                    dword ptr arg(1)   ; src_pixel_step
        mov         rcx,                    rax
        neg         rax                     ; negate pitch to deal with above border

        mov         rdx,                    arg(3)             ;limit
        movdqa      xmm7,                   XMMWORD PTR [rdx]

        lea         rsi,                    [rsi + rcx]
        lea         rdi,                    [rdi + rcx]

        ; calculate breakout conditions and high edge variance
        LFH_FILTER_AND_HEV_MASK 0
        ; filter and write back the results
        MB_FILTER_AND_WRITEBACK 0

    add rsp, 96
    pop rsp
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret


%macro TRANSPOSE_16X8 2
        movq        xmm4,               QWORD PTR [rsi]        ; xx xx xx xx xx xx xx xx 07 06 05 04 03 02 01 00
        movq        xmm1,               QWORD PTR [rdi]        ; xx xx xx xx xx xx xx xx 17 16 15 14 13 12 11 10
        movq        xmm0,               QWORD PTR [rsi+2*rax]  ; xx xx xx xx xx xx xx xx 27 26 25 24 23 22 21 20
        movq        xmm7,               QWORD PTR [rdi+2*rax]  ; xx xx xx xx xx xx xx xx 37 36 35 34 33 32 31 30
        movq        xmm5,               QWORD PTR [rsi+4*rax]  ; xx xx xx xx xx xx xx xx 47 46 45 44 43 42 41 40
        movq        xmm2,               QWORD PTR [rdi+4*rax]  ; xx xx xx xx xx xx xx xx 57 56 55 54 53 52 51 50

        punpcklbw   xmm4,               xmm1            ; 17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00

        movq        xmm1,               QWORD PTR [rdi+2*rcx]  ; xx xx xx xx xx xx xx xx 77 76 75 74 73 72 71 70

        movdqa      xmm3,               xmm4            ; 17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00
        punpcklbw   xmm0,               xmm7            ; 37 27 36 26 35 25 34 24 33 23 32 22 31 21 30 20

        movq        xmm7,               QWORD PTR [rsi+2*rcx]  ; xx xx xx xx xx xx xx xx 67 66 65 64 63 62 61 60

        punpcklbw   xmm5,               xmm2            ; 57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40
%if %1
        lea         rsi,                [rsi+rax*8]
%else
        mov         rsi,                arg(5)          ; v_ptr
%endif

        movdqa      xmm6,               xmm5            ; 57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40
        punpcklbw   xmm7,               xmm1            ; 77 67 76 66 75 65 74 64 73 63 72 62 71 61 70 60

        punpcklwd   xmm5,               xmm7            ; 73 63 53 43 72 62 52 42 71 61 51 41 70 60 50 40

        punpckhwd   xmm6,               xmm7            ; 77 67 57 47 76 66 56 46 75 65 55 45 74 64 54 44
%if %1
        lea         rdi,                [rdi+rax*8]
%else
        lea         rsi,                [rsi - 4]
%endif

        punpcklwd   xmm3,               xmm0            ; 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00
%if %1
        lea         rdx,                srct
%else
        lea         rdi,                [rsi + rax]     ; rdi points to row +1 for indirect addressing
%endif

        movdqa      xmm2,               xmm3            ; 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00
        punpckhwd   xmm4,               xmm0            ; 37 27 17 07 36 26 16 06 35 25 15 05 34 24 14 04

        movdqa      xmm7,               xmm4            ; 37 27 17 07 36 26 16 06 35 25 15 05 34 24 14 04
        punpckhdq   xmm3,               xmm5            ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02

        punpckhdq   xmm7,               xmm6            ; 77 67 57 47 37 27 17 07 76 66 56 46 36 26 16 06

        punpckldq   xmm4,               xmm6            ; 75 65 55 45 35 25 15 05 74 64 54 44 34 24 14 04

        punpckldq   xmm2,               xmm5            ; 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00

        movdqa      t0,                 xmm2            ; save to free XMM2

        movq        xmm2,               QWORD PTR [rsi]       ; xx xx xx xx xx xx xx xx 87 86 85 84 83 82 81 80
        movq        xmm6,               QWORD PTR [rdi]       ; xx xx xx xx xx xx xx xx 97 96 95 94 93 92 91 90
        movq        xmm0,               QWORD PTR [rsi+2*rax] ; xx xx xx xx xx xx xx xx a7 a6 a5 a4 a3 a2 a1 a0
        movq        xmm5,               QWORD PTR [rdi+2*rax] ; xx xx xx xx xx xx xx xx b7 b6 b5 b4 b3 b2 b1 b0
        movq        xmm1,               QWORD PTR [rsi+4*rax] ; xx xx xx xx xx xx xx xx c7 c6 c5 c4 c3 c2 c1 c0

        punpcklbw   xmm2,               xmm6            ; 97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80

        movq        xmm6,               QWORD PTR [rdi+4*rax] ; xx xx xx xx xx xx xx xx d7 d6 d5 d4 d3 d2 d1 d0

        punpcklbw   xmm0,               xmm5                  ; b7 a7 b6 a6 b5 a5 b4 a4 b3 a3 b2 a2 b1 a1 b0 a0

        movq        xmm5,               QWORD PTR [rsi+2*rcx] ; xx xx xx xx xx xx xx xx e7 e6 e5 e4 e3 e2 e1 e0

        punpcklbw   xmm1,               xmm6            ; d7 c7 d6 c6 d5 c5 d4 c4 d3 c3 d2 c2 d1 c1 d0 c0

        movq        xmm6,               QWORD PTR [rdi+2*rcx] ; xx xx xx xx xx xx xx xx f7 f6 f5 f4 f3 f2 f1 f0

        punpcklbw   xmm5,               xmm6            ; f7 e7 f6 e6 f5 e5 f4 e4 f3 e3 f2 e2 f1 e1 f0 e0

        movdqa      xmm6,               xmm1            ;
        punpckhwd   xmm6,               xmm5            ; f7 e7 d7 c7 f6 e6 d6 c6 f5 e5 d5 c5 f4 e4 d4 c4

        punpcklwd   xmm1,               xmm5            ; f3 e3 d3 c3 f2 e2 d2 c2 f1 e1 d1 c1 f0 e0 d0 c0
        movdqa      xmm5,               xmm2            ; 97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80

        punpcklwd   xmm5,               xmm0            ; b3 a3 93 83 b2 a2 92 82 b1 a1 91 81 b0 a0 90 80

        punpckhwd   xmm2,               xmm0            ; b7 a7 97 87 b6 a6 96 86 b5 a5 95 85 b4 a4 94 84

        movdqa      xmm0,               xmm5
        punpckldq   xmm0,               xmm1            ; f1 e1 d1 c1 b1 a1 91 81 f0 e0 d0 c0 b0 a0 90 80

        punpckhdq   xmm5,               xmm1            ; f3 e3 d3 c3 b3 a3 93 83 f2 e2 d2 c2 b2 a2 92 82
        movdqa      xmm1,               xmm2            ; b7 a7 97 87 b6 a6 96 86 b5 a5 95 85 b4 a4 94 84

        punpckldq   xmm1,               xmm6            ; f5 e5 d5 c5 b5 a5 95 85 f4 e4 d4 c4 b4 a4 94 84

        punpckhdq   xmm2,               xmm6            ; f7 e7 d7 c7 b7 a7 97 87 f6 e6 d6 c6 b6 a6 96 86
        movdqa      xmm6,               xmm7            ; 77 67 57 47 37 27 17 07 76 66 56 46 36 26 16 06

        punpcklqdq  xmm6,               xmm2            ; f6 e6 d6 c6 b6 a6 96 86 76 66 56 46 36 26 16 06

        punpckhqdq  xmm7,               xmm2            ; f7 e7 d7 c7 b7 a7 97 87 77 67 57 47 37 27 17 07
%if %2
        movdqa      xmm2,               xmm3            ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
        punpcklqdq  xmm2,               xmm5            ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02

        punpckhqdq  xmm3,               xmm5            ; f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03

        movdqa      [rdx],              xmm2            ; save 2

        movdqa      xmm5,               xmm4            ; 75 65 55 45 35 25 15 05 74 64 54 44 34 24 14 04
        punpcklqdq  xmm4,               xmm1            ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04

        movdqa      [rdx+16],           xmm3            ; save 3

        punpckhqdq  xmm5,               xmm1            ; f5 e5 d5 c5 b5 a5 95 85 75 65 55 45 35 25 15 05

        movdqa      [rdx+32],           xmm4            ; save 4
        movdqa      [rdx+48],           xmm5            ; save 5
        movdqa      xmm1,               t0              ; get

        movdqa      xmm2,               xmm1            ;
        punpckhqdq  xmm1,               xmm0            ; f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01

        punpcklqdq  xmm2,               xmm0            ; f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
%else
        movdqa      [rdx+112],          xmm7            ; save 7

        movdqa      [rdx+96],           xmm6            ; save 6

        movdqa      xmm2,               xmm3            ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
        punpckhqdq  xmm3,               xmm5            ; f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03

        punpcklqdq  xmm2,               xmm5            ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02

        movdqa      [rdx+32],           xmm2            ; save 2

        movdqa      xmm5,               xmm4            ; 75 65 55 45 35 25 15 05 74 64 54 44 34 24 14 04
        punpcklqdq  xmm4,               xmm1            ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04

        movdqa      [rdx+48],           xmm3            ; save 3

        punpckhqdq  xmm5,               xmm1            ; f5 e5 d5 c5 b5 a5 95 85 75 65 55 45 35 25 15 05

        movdqa      [rdx+64],           xmm4            ; save 4
        movdqa      [rdx+80],           xmm5            ; save 5
        movdqa      xmm1,               t0              ; get

        movdqa      xmm2,               xmm1
        punpckhqdq  xmm1,               xmm0            ; f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01

        punpcklqdq  xmm2,               xmm0            ; f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00

        movdqa      [rdx+16],           xmm1

        movdqa      [rdx],              xmm2
%endif
%endmacro
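
;
; Descriptive note (not from the original source): TRANSPOSE_16X8 above reads
; 8 pixels from each of 16 rows and transposes them so that each xmm register
; ends up holding one pixel column across all 16 rows, as the bit-position
; comments show.  With %2 set, only the four middle columns are spilled to the
; srct scratch area and the outer columns stay in registers; with %2 clear,
; all eight columns are stored.  This lets the vertical-edge filters run the
; same filtering arithmetic on transposed data.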

%macro LFV_FILTER_MASK_HEV_MASK 1
        movdqa      xmm0,               xmm6            ; q2
        psubusb     xmm0,               xmm7            ; q2-q3

        psubusb     xmm7,               xmm6            ; q3-q2
        movdqa      xmm4,               xmm5            ; q1

        por         xmm7,               xmm0            ; abs (q3-q2)
        psubusb     xmm4,               xmm6            ; q1-q2

        movdqa      xmm0,               xmm1
        psubusb     xmm6,               xmm5            ; q2-q1

        por         xmm6,               xmm4            ; abs (q2-q1)
        psubusb     xmm0,               xmm2            ; p2 - p3;

        psubusb     xmm2,               xmm1            ; p3 - p2;

        por         xmm0,               xmm2            ; abs(p2-p3)
%if %1
        movdqa      xmm2,               [rdx]           ; p1
%else
        movdqa      xmm2,               [rdx+32]        ; p1
%endif
        movdqa      xmm5,               xmm2            ; p1
        pmaxub      xmm0,               xmm7

        psubusb     xmm5,               xmm1            ; p1-p2
        psubusb     xmm1,               xmm2            ; p2-p1

        movdqa      xmm7,               xmm3            ; p0
        psubusb     xmm7,               xmm2            ; p0-p1

        por         xmm1,               xmm5            ; abs(p2-p1)
        pmaxub      xmm0,               xmm6

        pmaxub      xmm0,               xmm1
        movdqa      xmm1,               xmm2            ; p1

        psubusb     xmm2,               xmm3            ; p1-p0
        lea         rdx,                srct

        por         xmm2,               xmm7            ; abs(p1-p0)

        movdqa      t0,                 xmm2            ; save abs(p1-p0)

        pmaxub      xmm0,               xmm2

%if %1
        movdqa      xmm5,               [rdx+32]        ; q0
        movdqa      xmm7,               [rdx+48]        ; q1
%else
        movdqa      xmm5,               [rdx+64]        ; q0
        movdqa      xmm7,               [rdx+80]        ; q1
%endif
        mov         rdx,                arg(3)          ; limit

        movdqa      xmm6,               xmm5            ; q0
        movdqa      xmm2,               xmm7            ; q1

        psubusb     xmm5,               xmm7            ; q0-q1
        psubusb     xmm7,               xmm6            ; q1-q0

        por         xmm7,               xmm5            ; abs(q1-q0)

        movdqa      t1,                 xmm7            ; save abs(q1-q0)

        movdqa      xmm4,               XMMWORD PTR [rdx]; limit

        pmaxub      xmm0,               xmm7
        mov         rdx,                arg(2)          ; flimit

        psubusb     xmm0,               xmm4
        movdqa      xmm5,               xmm2            ; q1

        psubusb     xmm5,               xmm1            ; q1-=p1
        psubusb     xmm1,               xmm2            ; p1-=q1

        por         xmm5,               xmm1            ; abs(p1-q1)
        movdqa      xmm1,               xmm3            ; p0

        pand        xmm5,               [GLOBAL(tfe)]   ; set lsb of each byte to zero
        psubusb     xmm1,               xmm6            ; p0-q0

        psrlw       xmm5,               1               ; abs(p1-q1)/2
        psubusb     xmm6,               xmm3            ; q0-p0

        movdqa      xmm2,               XMMWORD PTR [rdx]; flimit

        mov         rdx,                arg(4)          ; get thresh

        por         xmm1,               xmm6            ; abs(q0-p0)
        paddb       xmm2,               xmm2            ; flimit*2 (less than 255)

        movdqa      xmm6,               t0              ; get abs (p1 - p0)

        paddusb     xmm1,               xmm1            ; abs(q0-p0)*2

        movdqa      xmm3,               t1              ; get abs (q1 - q0)

        movdqa      xmm7,               XMMWORD PTR [rdx]

        paddusb     xmm1,               xmm5            ; abs (p0 - q0) *2 + abs(p1-q1)/2
        psubusb     xmm6,               xmm7            ; abs(p1 - p0) > thresh

        paddb       xmm4,               xmm2            ; flimit * 2 + limit (less than 255)
        psubusb     xmm3,               xmm7            ; abs(q1 - q0) > thresh

        psubusb     xmm1,               xmm4            ; abs (p0 - q0) *2 + abs(p1-q1)/2  > flimit * 2 + limit
        por         xmm6,               xmm3            ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh

        por         xmm1,               xmm0            ; mask
        pcmpeqb     xmm6,               xmm0

        pxor        xmm0,               xmm0
        pcmpeqb     xmm4,               xmm4

        pcmpeqb     xmm1,               xmm0
        pxor        xmm4,               xmm6
%endmacro

%macro BV_TRANSPOSE 0
        ; xmm1 =    f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
        ; xmm6 =    f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03
        ; xmm3 =    f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
        ; xmm7 =    f5 e5 d5 c5 b5 a5 95 85 75 65 55 45 35 25 15 05
        movdqa      xmm2,               xmm1            ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
        punpcklbw   xmm2,               xmm6            ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02

        movdqa      xmm4,               xmm3            ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
        punpckhbw   xmm1,               xmm6            ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82

        punpcklbw   xmm4,               xmm7            ; 75 74 65 64 55 54 45 44 35 34 25 24 15 14 05 04

        punpckhbw   xmm3,               xmm7            ; f5 f4 e5 e4 d5 d4 c5 c4 b5 b4 a5 a4 95 94 85 84

        movdqa      xmm6,               xmm2            ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02
        punpcklwd   xmm2,               xmm4            ; 35 34 33 32 25 24 23 22 15 14 13 12 05 04 03 02

        punpckhwd   xmm6,               xmm4            ; 75 74 73 72 65 64 63 62 55 54 53 52 45 44 43 42
        movdqa      xmm5,               xmm1            ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82

        punpcklwd   xmm1,               xmm3            ; b5 b4 b3 b2 a5 a4 a3 a2 95 94 93 92 85 84 83 82