;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"

; Use of pmaxub instead of psubusb to compute filter mask was seen
; in ffvp8

;-----------------------------------------------------------------------
; LFH_FILTER_AND_HEV_MASK mode
;
; Computes, for 16 byte lanes along a horizontal edge, the filter mask
; and the high-edge-variance (hev) mask from rows p3..p0 / q0..q3.
;
; mode != 0 : rows are loaded 16 bytes at a time (movdqa) through
;             rsi / rdi with rax as the row stride. The callers in this
;             file enter with rdi = rsi + rax. rax is negated inside the
;             macro and is left negated on exit.
; mode == 0 : each row is assembled from two 8-byte halves, low qword
;             via rsi and high qword via rdi. The caller in this file
;             enters with rcx = stride and rax = -stride. rsi and rdi
;             are advanced by 4*rax, and q2, q1, p2, p1 are spilled to
;             [rsp], [rsp+16], [rsp+32], [rsp+48].
;
; In:    xmm7   = limit vector
;        t0, t1 = 16-byte scratch slots %define'd by the caller
;        q1     = caller's %define for the q1 spill slot (mode 0 only)
;        arg(2) = pointer to blimit, arg(4) = pointer to thresh
; Out:   xmm1   = filter mask: 0xff in lanes where every neighbour
;                 difference is within limit and
;                 |p0-q0|*2 + |p1-q1|/2 is within blimit, else 0
;        xmm4   = hev mask: 0xff where |q1-q0| or |p1-p0| exceeds thresh
;                 (see the note at the final compare)
;        xmm0   = q0, xmm6 = p0 (unsigned pixel values)
;        xmm7   = 0
; Clobb: xmm2, xmm3, xmm5, rdx (and rax / rsi / rdi as described above)
;-----------------------------------------------------------------------
%macro LFH_FILTER_AND_HEV_MASK 1
%if %1
        movdqa      xmm2,                   [rdi+2*rax]       ; q3
        movdqa      xmm1,                   [rsi+2*rax]       ; q2
        movdqa      xmm4,                   [rsi+rax]         ; q1
        movdqa      xmm5,                   [rsi]             ; q0
        neg         rax                     ; negate pitch to deal with above border
%else
        movlps      xmm2,                   [rsi + rcx*2]     ; q3
        movlps      xmm1,                   [rsi + rcx]       ; q2
        movlps      xmm4,                   [rsi]             ; q1
        movlps      xmm5,                   [rsi + rax]       ; q0

        movhps      xmm2,                   [rdi + rcx*2]     ; high halves come from the rdi plane
        movhps      xmm1,                   [rdi + rcx]
        movhps      xmm4,                   [rdi]
        movhps      xmm5,                   [rdi + rax]

        lea         rsi,                    [rsi + rax*4]     ; step both planes 4 rows by rax
        lea         rdi,                    [rdi + rax*4]

        movdqa      XMMWORD PTR [rsp],      xmm1              ; store q2
        movdqa      XMMWORD PTR [rsp + 16], xmm4              ; store q1
%endif

        ; abs(a-b) on unsigned bytes is built as sat(a-b) | sat(b-a):
        ; one of the two saturating differences is always zero.
        movdqa      xmm6,                   xmm1              ; q2
        movdqa      xmm3,                   xmm4              ; q1

        psubusb     xmm1,                   xmm2              ; sat(q2 - q3)
        psubusb     xmm2,                   xmm6              ; sat(q3 - q2)

        psubusb     xmm4,                   xmm6              ; sat(q1 - q2)
        psubusb     xmm6,                   xmm3              ; sat(q2 - q1)

        por         xmm4,                   xmm6              ; abs(q2-q1)
        por         xmm1,                   xmm2              ; abs(q3-q2)

        movdqa      xmm0,                   xmm5              ; q0
        pmaxub      xmm1,                   xmm4              ; running max of the differences

        psubusb     xmm5,                   xmm3              ; sat(q0 - q1)
        psubusb     xmm3,                   xmm0              ; sat(q1 - q0)

        por         xmm5,                   xmm3              ; abs(q0-q1)
        movdqa      t0,                     xmm5              ; save to t0 (reused for hev)

        pmaxub      xmm1,                   xmm5

%if %1
        movdqa      xmm2,                   [rsi+4*rax]       ; p3
        movdqa      xmm4,                   [rdi+4*rax]       ; p2
        movdqa      xmm6,                   [rsi+2*rax]       ; p1
%else
        movlps      xmm2,                   [rsi + rax]       ; p3
        movlps      xmm4,                   [rsi]             ; p2
        movlps      xmm6,                   [rsi + rcx]       ; p1

        movhps      xmm2,                   [rdi + rax]
        movhps      xmm4,                   [rdi]
        movhps      xmm6,                   [rdi + rcx]

        movdqa      XMMWORD PTR [rsp + 32], xmm4              ; store p2
        movdqa      XMMWORD PTR [rsp + 48], xmm6              ; store p1
%endif

        movdqa      xmm5,                   xmm4              ; p2
        movdqa      xmm3,                   xmm6              ; p1

        ; Here the two halves of each abs() are folded into the running
        ; max separately; max(sat(a-b), sat(b-a)) equals abs(a-b).
        psubusb     xmm4,                   xmm2              ; sat(p2 - p3)
        psubusb     xmm2,                   xmm5              ; sat(p3 - p2)

        psubusb     xmm3,                   xmm5              ; sat(p1 - p2)
        pmaxub      xmm1,                   xmm4              ; fold in sat(p2 - p3)

        psubusb     xmm5,                   xmm6              ; sat(p2 - p1)
        pmaxub      xmm1,                   xmm2              ; fold in sat(p3 - p2)

        pmaxub      xmm1,                   xmm5              ; fold in sat(p2 - p1)
        movdqa      xmm2,                   xmm6              ; p1

        pmaxub      xmm1,                   xmm3              ; fold in sat(p1 - p2)
%if %1
        movdqa      xmm4,                   [rsi+rax]         ; p0
        movdqa      xmm3,                   [rdi]             ; q1
%else
        movlps      xmm4,                   [rsi + rcx*2]     ; p0
        movhps      xmm4,                   [rdi + rcx*2]
        movdqa      xmm3,                   q1                ; q1 (reloaded from its spill slot)
%endif

        movdqa      xmm5,                   xmm4              ; p0
        psubusb     xmm4,                   xmm6              ; sat(p0 - p1)

        psubusb     xmm6,                   xmm5              ; sat(p1 - p0)

        por         xmm6,                   xmm4              ; abs(p1 - p0)
        mov         rdx,                    arg(2)            ; get blimit

        movdqa      t1,                     xmm6              ; save to t1 (reused for hev)

        movdqa      xmm4,                   xmm3              ; q1
        pmaxub      xmm1,                   xmm6              ; xmm1 = max of all neighbour differences

        psubusb     xmm3,                   xmm2              ; sat(q1 - p1)
        psubusb     xmm2,                   xmm4              ; sat(p1 - q1)

        psubusb     xmm1,                   xmm7              ; nonzero where the max difference exceeds limit
        por         xmm2,                   xmm3              ; abs(p1-q1)

        movdqa      xmm7,                   XMMWORD PTR [rdx] ; blimit

        movdqa      xmm3,                   xmm0              ; q0
        pand        xmm2,                   [GLOBAL(tfe)]     ; set lsb of each byte to zero so the
                                                              ; word shift below cannot leak a bit
                                                              ; into the neighbouring byte

        mov         rdx,                    arg(4)            ; hev get thresh

        movdqa      xmm6,                   xmm5              ; p0
        psrlw       xmm2,                   1                 ; abs(p1-q1)/2

        psubusb     xmm5,                   xmm3              ; sat(p0 - q0)

        psubusb     xmm3,                   xmm6              ; sat(q0 - p0)
        por         xmm5,                   xmm3              ; abs(p0 - q0)

        paddusb     xmm5,                   xmm5              ; abs(p0-q0)*2

        movdqa      xmm4,                   t0                ; hev get abs (q1 - q0)

        movdqa      xmm3,                   t1                ; get abs (p1 - p0)

        paddusb     xmm5,                   xmm2              ; abs (p0 - q0) *2 + abs(p1-q1)/2

        movdqa      xmm2,                   XMMWORD PTR [rdx] ; hev threshold

        psubusb     xmm5,                   xmm7              ; nonzero where abs (p0 - q0) *2 + abs(p1-q1)/2  > blimit
        psubusb     xmm4,                   xmm2              ; nonzero where abs(q1 - q0) > thresh

        psubusb     xmm3,                   xmm2              ; nonzero where abs(p1 - p0) > thresh
        por         xmm1,                   xmm5              ; combine limit and blimit violations

        pxor        xmm7,                   xmm7              ; xmm7 = 0
        paddb       xmm4,                   xmm3              ; hev abs(q1 - q0) > thresh || abs(p1 - p0) > thresh

        ; NOTE(review): the compare below uses xmm5, not a zeroed register.
        ; xmm5 was OR-ed into xmm1 above, so in every lane where the final
        ; mask (xmm1 == 0) is set, xmm5 is 0 and this is a compare with zero.
        pcmpeqb     xmm4,                   xmm5              ; 0xff where no hev excess
        pcmpeqb     xmm3,                   xmm3              ; xmm3 = all ones

        pcmpeqb     xmm1,                   xmm7              ; mask xmm1: 0xff where no violation
        pxor        xmm4,                   xmm3              ; invert -> hev mask
%endmacro

;-----------------------------------------------------------------------
; B_FILTER mode
;
; Applies the loop-filter adjustment to the four pixels nearest the edge
; (p1, p0, q0, q1), using the masks left by LFH_FILTER_AND_HEV_MASK.
; With all arithmetic on signed (x ^ 0x80) bytes and saturating:
;
;   f  = ((p1 - q1) & hev  +  3 * (q0 - p0)) & mask
;   F1 = (f + 4) >> 3            F2 = (f + 3) >> 3
;   p0 += F2                     q0 -= F1
;   u  = ((F1 + 1) >> 1) & ~hev
;   p1 += u                      q1 -= u
;
; mode == 0 : p1 / q1 come from the caller's p1 / q1 stack slots; results
;             are stored as 8-byte halves, low qword via rsi and high
;             qword via rdi (both are first advanced by 2*rcx).
; mode == 1 : rows are read and written 16 bytes wide via rsi/rdi/rax.
; mode == 2 : p1, p0, q0, q1 are read from the buffer named by `srct`
;             at offsets 0, 16, 32, 48; nothing is stored by the macro.
;
; In:    xmm1 = filter mask, xmm4 = hev mask
;        xmm6 = p0, xmm0 = q0 (modes 0 and 1; mode 2 loads them itself)
; Out:   xmm6 = new p0, xmm1 = new p1, xmm3 = new q0, xmm7 = new q1
;        (converted back to unsigned)
; Clobb: xmm0, xmm2, xmm4, xmm5; rdx in mode 2; rsi, rdi in mode 0
; Uses:  constants t80, t4, t3, ones via GLOBAL() - defined outside this
;        macro (t80 presumably 0x80 bytes, ones presumably word 1s).
;-----------------------------------------------------------------------
%macro B_FILTER 1
%if %1 == 0
        movdqa      xmm2,                   p1                ; p1
        movdqa      xmm7,                   q1                ; q1
%elif %1 == 1
        movdqa      xmm2,                   [rsi+2*rax]       ; p1
        movdqa      xmm7,                   [rdi]             ; q1
%elif %1 == 2
        lea         rdx,                    srct

        movdqa      xmm2,                   [rdx]             ; p1
        movdqa      xmm7,                   [rdx+48]          ; q1
        movdqa      xmm6,                   [rdx+16]          ; p0
        movdqa      xmm0,                   [rdx+32]          ; q0
%endif

        pxor        xmm2,                   [GLOBAL(t80)]     ; p1 offset to convert to signed values
        pxor        xmm7,                   [GLOBAL(t80)]     ; q1 offset to convert to signed values

        psubsb      xmm2,                   xmm7              ; p1 - q1
        pxor        xmm6,                   [GLOBAL(t80)]     ; p0 offset to convert to signed values

        pand        xmm2,                   xmm4              ; high var mask (hvm)(p1 - q1)
        pxor        xmm0,                   [GLOBAL(t80)]     ; q0 offset to convert to signed values

        movdqa      xmm3,                   xmm0              ; q0
        psubsb      xmm0,                   xmm6              ; q0 - p0

        paddsb      xmm2,                   xmm0              ; 1 * (q0 - p0) + hvm(p1 - q1)

        paddsb      xmm2,                   xmm0              ; 2 * (q0 - p0) + hvm(p1 - q1)

        paddsb      xmm2,                   xmm0              ; 3 * (q0 - p0) + hvm(p1 - q1)

        pand        xmm1,                   xmm2              ; mask filter values we don't care about

        movdqa      xmm2,                   xmm1

        paddsb      xmm1,                   [GLOBAL(t4)]      ; 3* (q0 - p0) + hvm(p1 - q1) + 4
        paddsb      xmm2,                   [GLOBAL(t3)]      ; 3* (q0 - p0) + hvm(p1 - q1) + 3

        ; Signed byte >> 3: unpack each byte into the HIGH byte of a word
        ; (the low byte is whatever the destination held), then an
        ; arithmetic word shift by 8 + 3 = 11 discards the low byte and
        ; sign-extends the result.
        punpckhbw   xmm5,                   xmm2              ; axbxcxdx  (upper 8 bytes of f+3)
        punpcklbw   xmm2,                   xmm2              ; exfxgxhx  (lower 8 bytes of f+3)

        punpcklbw   xmm0,                   xmm1              ; exfxgxhx  (lower 8 bytes of f+4)
        psraw       xmm5,                   11                ; sign extended shift right by 3

        punpckhbw   xmm1,                   xmm1              ; axbxcxdx  (upper 8 bytes of f+4)
        psraw       xmm2,                   11                ; sign extended shift right by 3

        packsswb    xmm2,                   xmm5              ; F2 = (3* (q0 - p0) + hvm(p1 - q1) + 3) >> 3;
        psraw       xmm0,                   11                ; sign extended shift right by 3

        psraw       xmm1,                   11                ; sign extended shift right by 3
        movdqa      xmm5,                   xmm0              ; save lower-half F1 words for the 2nd tap

        packsswb    xmm0,                   xmm1              ; F1 = (3* (q0 - p0) + hvm(p1 - q1) + 4) >>3
        paddsw      xmm5,                   [GLOBAL(ones)]    ; F1 + 1 (lower half, words)

        paddsw      xmm1,                   [GLOBAL(ones)]    ; F1 + 1 (upper half, words)
        psraw       xmm5,                   1                 ; partial shifted one more time for 2nd tap

        psraw       xmm1,                   1                 ; partial shifted one more time for 2nd tap

        paddsb      xmm6,                   xmm2              ; p0 += F2
        packsswb    xmm5,                   xmm1              ; (F1 + 1) >> 1

%if %1 == 0
        movdqa      xmm1,                   p1                ; p1
%elif %1 == 1
        movdqa      xmm1,                   [rsi+2*rax]       ; p1
%elif %1 == 2
        movdqa      xmm1,                   [rdx]             ; p1
%endif
        pandn       xmm4,                   xmm5              ; u = ((F1 + 1) >> 1) & ~hev
        pxor        xmm6,                   [GLOBAL(t80)]     ; p0 back to unsigned

        pxor        xmm1,                   [GLOBAL(t80)]     ; p1 to signed
        psubsb      xmm3,                   xmm0              ; q0 -= F1

        paddsb      xmm1,                   xmm4              ; p1 += u
        pxor        xmm3,                   [GLOBAL(t80)]     ; q0 back to unsigned

        pxor        xmm1,                   [GLOBAL(t80)]     ; p1 back to unsigned
        psubsb      xmm7,                   xmm4              ; q1 -= u

        pxor        xmm7,                   [GLOBAL(t80)]     ; q1 back to unsigned
%if %1 == 0
        lea         rsi,                    [rsi + rcx*2]     ; move both plane pointers to the p0 row
        lea         rdi,                    [rdi + rcx*2]
        movq        MMWORD PTR [rsi],       xmm6              ; p0
        movhps      MMWORD PTR [rdi],       xmm6
        movq        MMWORD PTR [rsi + rax], xmm1              ; p1
        movhps      MMWORD PTR [rdi + rax], xmm1
        movq        MMWORD PTR [rsi + rcx], xmm3              ; q0
        movhps      MMWORD PTR [rdi + rcx], xmm3
        movq        MMWORD PTR [rsi + rcx*2],xmm7             ; q1
        movhps      MMWORD PTR [rdi + rcx*2],xmm7
%elif %1 == 1
        movdqa      [rsi+rax],              xmm6              ; write back p0
        movdqa      [rsi+2*rax],            xmm1              ; write back p1
        movdqa      [rsi],                  xmm3              ; write back q0
        movdqa      [rdi],                  xmm7              ; write back q1
%endif

%endmacro

%if ABI_IS_32BIT

;-----------------------------------------------------------------------
;void vp8_loop_filter_horizontal_edge_sse2
;(
;    unsigned char *src_ptr,
;    int            src_pixel_step,
;    const char    *blimit,
;    const char    *limit,
;    const char    *thresh,
;    int            count
;)
;
; Filters 16 byte lanes across the horizontal edge between the row at
; src_ptr - src_pixel_step (p0) and the row at src_ptr (q0); p1, p0, q0
; and q1 are rewritten in place.
;
; Pixel rows and the blimit / limit / thresh vectors are accessed with
; movdqa, so they must be 16-byte aligned. `count` is not read here.
; Assembled only when ABI_IS_32BIT is non-zero.
;
; Stack: 32 bytes of 16-byte-aligned scratch (t0, t1).
; The prologue/epilogue macros come from vpx_ports/x86_abi_support.asm.
;-----------------------------------------------------------------------
global sym(vp8_loop_filter_horizontal_edge_sse2)
sym(vp8_loop_filter_horizontal_edge_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 32     ; reserve 32 bytes
    %define t0 [rsp + 0]    ;__declspec(align(16)) char t0[16];
    %define t1 [rsp + 16]   ;__declspec(align(16)) char t1[16];

        mov         rsi,                    arg(0)           ;src_ptr
        movsxd      rax,                    dword ptr arg(1) ;src_pixel_step

        mov         rdx,                    arg(3)           ;limit
        movdqa      xmm7,                   XMMWORD PTR [rdx] ; xmm7 = limit, consumed by the mask macro

        lea         rdi,                    [rsi+rax]        ; rdi points to row +1 for indirect addressing

        ; calculate breakout conditions and high edge variance
        LFH_FILTER_AND_HEV_MASK 1
        ; filter and write back the result
        B_FILTER 1

    add rsp, 32             ; release t0 / t1
    pop rsp                 ; undo ALIGN_STACK (presumably it pushed the old rsp - confirm in x86_abi_support.asm)
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

%endif

;-----------------------------------------------------------------------
;void vp8_loop_filter_horizontal_edge_uv_sse2
;(
;    unsigned char *u,
;    int            src_pixel_step,
;    const char    *blimit,
;    const char    *limit,
;    const char    *thresh,
;    unsigned char *v
;)
;
; Filters the horizontal edge of two 8-pixel-wide planes in one pass:
; each XMM register carries 8 bytes from the u plane (low qword, via rsi)
; and 8 bytes from the v plane (high qword, via rdi). The sixth argument
; is loaded as the v pointer, not as a count. p1, p0, q0 and q1 are
; rewritten in place in both planes.
;
; Pixel rows are accessed with 8-byte movlps / movhps / movq; the
; blimit / limit / thresh vectors are read with movdqa and must be
; 16-byte aligned.
;
; Stack: 96 bytes of 16-byte-aligned scratch (q2, q1, p2, p1, t0, t1).
; The prologue/epilogue macros come from vpx_ports/x86_abi_support.asm.
;-----------------------------------------------------------------------
global sym(vp8_loop_filter_horizontal_edge_uv_sse2)
sym(vp8_loop_filter_horizontal_edge_uv_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 96       ; reserve 96 bytes
    %define q2  [rsp + 0]     ;__declspec(align(16)) char q2[16];
    %define q1  [rsp + 16]    ;__declspec(align(16)) char q1[16];
    %define p2  [rsp + 32]    ;__declspec(align(16)) char p2[16];
    %define p1  [rsp + 48]    ;__declspec(align(16)) char p1[16];
    %define t0  [rsp + 64]    ;__declspec(align(16)) char t0[16];
    %define t1  [rsp + 80]    ;__declspec(align(16)) char t1[16];

        mov         rsi,                    arg(0)             ; u
        mov         rdi,                    arg(5)             ; v
        movsxd      rax,                    dword ptr arg(1)   ; src_pixel_step
        mov         rcx,                    rax                ; rcx = +pitch
        neg         rax                     ; negate pitch to deal with above border

        mov         rdx,                    arg(3)             ;limit
        movdqa      xmm7,                   XMMWORD PTR [rdx]  ; xmm7 = limit, consumed by the mask macro

        lea         rsi,                    [rsi + rcx]        ; both planes: point at the q1 row
        lea         rdi,                    [rdi + rcx]

        ; calculate breakout conditions and high edge variance
        LFH_FILTER_AND_HEV_MASK 0
        ; filter and write back the result
        B_FILTER 0

    add rsp, 96             ; release the six scratch slots
    pop rsp                 ; undo ALIGN_STACK (presumably it pushed the old rsp - confirm in x86_abi_support.asm)
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret


;-----------------------------------------------------------------------
; MB_FILTER_AND_WRITEBACK mode
;
; Applies the wider loop-filter adjustment that touches three pixels on
; each side of the edge (p2, p1, p0, q0, q1, q2), using the masks left by
; LFH_FILTER_AND_HEV_MASK. With signed (x ^ 0x80) saturating bytes:
;
;   f       = ((p1 - q1) + 3 * (q0 - p0)) & mask
;   Filter2 = f & hev
;   p0 += (Filter2 + 3) >> 3          q0 -= (Filter2 + 4) >> 3
;   w  = f & ~hev
;   u3 = (27 * w + 63) >> 7           p0 += u3    q0 -= u3
;   u2 = (18 * w + 63) >> 7           p1 += u2    q1 -= u2
;   u1 = ( 9 * w + 63) >> 7           p2 += u1    q2 -= u1
;
; mode == 0 : p2/p1/q1/q2 come from the caller's stack slots; results are
;             stored as 8-byte halves, low qword via rsi, high via rdi.
; mode == 1 : rows are read and written 16 bytes wide via rsi/rdi with
;             rax as stride; rcx is set to -rax by the macro.
; mode == 2 : rows are read from the buffer named by `srct`
;             (p2 +16, p1 +32, p0 +48, q0 +64, q1 +80, q2 +96); only
;             p1, p0, q0, q1 are stored back by the macro, while the new
;             p2 / q2 are left in xmm7 / xmm5.
;
; In:    xmm1 = filter mask, xmm4 = hev mask
;        xmm6 = p0, xmm0 = q0 (modes 0 and 1; mode 2 loads them itself)
; Out:   xmm7 = p2, xmm4 = p1, xmm6 = p0, xmm3 = q0, xmm1 = q1, xmm5 = q2
;        (filtered, converted back to unsigned)
; Clobb: xmm0, xmm2; rcx in mode 1; rdx in mode 2; rsi, rdi in mode 0
; Uses:  constants t80, t3, t4, s9, s63 via GLOBAL() - defined outside
;        this macro.
;-----------------------------------------------------------------------
%macro MB_FILTER_AND_WRITEBACK 1
%if %1 == 0
        movdqa      xmm2,                   p1              ; p1
        movdqa      xmm7,                   q1              ; q1
%elif %1 == 1
        movdqa      xmm2,                   [rsi+2*rax]     ; p1
        movdqa      xmm7,                   [rdi]           ; q1

        mov         rcx,                    rax
        neg         rcx                                     ; rcx = -rax, used to reach the q2 row
%elif %1 == 2
        lea         rdx,                    srct

        movdqa      xmm2,                   [rdx+32]        ; p1
        movdqa      xmm7,                   [rdx+80]        ; q1
        movdqa      xmm6,                   [rdx+48]        ; p0
        movdqa      xmm0,                   [rdx+64]        ; q0
%endif

        pxor        xmm2,                   [GLOBAL(t80)]   ; p1 offset to convert to signed values
        pxor        xmm7,                   [GLOBAL(t80)]   ; q1 offset to convert to signed values
        pxor        xmm6,                   [GLOBAL(t80)]   ; p0 offset to convert to signed values
        pxor        xmm0,                   [GLOBAL(t80)]   ; q0 offset to convert to signed values

        psubsb      xmm2,                   xmm7            ; p1 - q1
        movdqa      xmm3,                   xmm0            ; q0

        psubsb      xmm0,                   xmm6            ; q0 - p0

        paddsb      xmm2,                   xmm0            ; 1 * (q0 - p0) + (p1 - q1)

        paddsb      xmm2,                   xmm0            ; 2 * (q0 - p0) + (p1 - q1)

        paddsb      xmm2,                   xmm0            ; 3 * (q0 - p0) + (p1 - q1)

        pand        xmm1,                   xmm2            ; mask filter values we don't care about

        movdqa      xmm2,                   xmm1            ; vp8_filter

        pand        xmm2,                   xmm4            ; Filter2 = vp8_filter & hev
        pxor        xmm0,                   xmm0

        pandn       xmm4,                   xmm1            ; vp8_filter&=~hev
        pxor        xmm1,                   xmm1

        ; Unpack vp8_filter bytes into the high byte of each word
        ; (value << 8) so that pmulhw by s9 yields a signed word product.
        punpcklbw   xmm0,                   xmm4            ; vp8_filter, lower 8 bytes
        movdqa      xmm5,                   xmm2

        punpckhbw   xmm1,                   xmm4            ; vp8_filter, upper 8 bytes
        paddsb      xmm5,                   [GLOBAL(t3)]    ; vp8_signed_char_clamp(Filter2 + 3)

        pmulhw      xmm1,                   [GLOBAL(s9)]    ; vp8_filter (upper) * 9

        pmulhw      xmm0,                   [GLOBAL(s9)]    ; vp8_filter (lower) * 9

        ; Signed byte >> 3 via unpack-to-high-byte + psraw 11 (8 + 3).
        punpckhbw   xmm7,                   xmm5            ; axbxcxdx
        paddsb      xmm2,                   [GLOBAL(t4)]    ; vp8_signed_char_clamp(Filter2 + 4)

        punpcklbw   xmm5,                   xmm5            ; exfxgxhx
        psraw       xmm7,                   11              ; sign extended shift right by 3

        psraw       xmm5,                   11              ; sign extended shift right by 3
        punpckhbw   xmm4,                   xmm2            ; axbxcxdx

        punpcklbw   xmm2,                   xmm2            ; exfxgxhx
        psraw       xmm4,                   11              ; sign extended shift right by 3

        packsswb    xmm5,                   xmm7            ; (Filter2 + 3) >> 3
        psraw       xmm2,                   11              ; sign extended shift right by 3

        packsswb    xmm2,                   xmm4            ; (Filter2 + 4) >> 3
        movdqa      xmm7,                   xmm1            ; copy of upper * 9

        paddsb      xmm6,                   xmm5            ; ps0 = ps0 + ((Filter2 + 3) >> 3)
        movdqa      xmm4,                   xmm1            ; copy of upper * 9

        psubsb      xmm3,                   xmm2            ; qs0 = qs0 - ((Filter2 + 4) >> 3)
        movdqa      xmm5,                   xmm0            ; copy of lower * 9

        movdqa      xmm2,                   xmm5            ; copy of lower * 9
        paddw       xmm0,                   [GLOBAL(s63)]   ; lower * 9 + 63

        paddw       xmm1,                   [GLOBAL(s63)]   ; upper * 9 + 63
        paddw       xmm5,                   xmm5            ; lower * 18

        paddw       xmm7,                   xmm7            ; upper * 18
        paddw       xmm5,                   xmm0            ; lower * 27 + 63

        paddw       xmm7,                   xmm1            ; upper * 27 + 63
        paddw       xmm2,                   xmm0            ; lower * 18 + 63

        paddw       xmm4,                   xmm1            ; upper * 18 + 63
        psraw       xmm0,                   7               ; (lower * 9 + 63) >> 7

        psraw       xmm1,                   7               ; (upper * 9 + 63) >> 7
        psraw       xmm2,                   7               ; (lower * 18 + 63) >> 7

        packsswb    xmm0,                   xmm1            ; u1 = vp8_signed_char_clamp((63 + Filter2 * 9)>>7)
        psraw       xmm4,                   7               ; (upper * 18 + 63) >> 7

        psraw       xmm5,                   7               ; (lower * 27 + 63) >> 7
        packsswb    xmm2,                   xmm4            ; u2 = vp8_signed_char_clamp((63 + Filter2 * 18)>>7)

        psraw       xmm7,                   7               ; (upper * 27 + 63) >> 7

        packsswb    xmm5,                   xmm7            ; u3 = vp8_signed_char_clamp((63 + Filter2 * 27)>>7)

        psubsb      xmm3,                   xmm5            ; sq = vp8_signed_char_clamp(qs0 - u3)
        paddsb      xmm6,                   xmm5            ; sp = vp8_signed_char_clamp(ps0 + u3)

%if %1 == 0
        movdqa      xmm5,                   q2              ; q2
        movdqa      xmm1,                   q1              ; q1
        movdqa      xmm4,                   p1              ; p1
        movdqa      xmm7,                   p2              ; p2

%elif %1 == 1
        movdqa      xmm5,                   XMMWORD PTR [rdi+rcx]   ; q2
        movdqa      xmm1,                   XMMWORD PTR [rdi]       ; q1
        movdqa      xmm4,                   XMMWORD PTR [rsi+rax*2] ; p1
        movdqa      xmm7,                   XMMWORD PTR [rdi+rax*4] ; p2
%elif %1 == 2
        movdqa      xmm5,                   XMMWORD PTR [rdx+96]    ; q2
        movdqa      xmm1,                   XMMWORD PTR [rdx+80]    ; q1
        movdqa      xmm4,                   XMMWORD PTR [rdx+32]    ; p1
        movdqa      xmm7,                   XMMWORD PTR [rdx+16]    ; p2
%endif

        pxor        xmm3,                   [GLOBAL(t80)]   ; *oq0 = sq^0x80
        pxor        xmm6,                   [GLOBAL(t80)]   ; *op0 = sp^0x80

        pxor        xmm1,                   [GLOBAL(t80)]   ; q1 to signed
        pxor        xmm4,                   [GLOBAL(t80)]   ; p1 to signed

        psubsb      xmm1,                   xmm2            ; sq = vp8_signed_char_clamp(qs1 - u2)
        paddsb      xmm4,                   xmm2            ; sp = vp8_signed_char_clamp(ps1 + u2)

        pxor        xmm1,                   [GLOBAL(t80)]   ; *oq1 = sq^0x80;
        pxor        xmm4,                   [GLOBAL(t80)]   ; *op1 = sp^0x80;

        pxor        xmm7,                   [GLOBAL(t80)]   ; p2 to signed
        pxor        xmm5,                   [GLOBAL(t80)]   ; q2 to signed

        paddsb      xmm7,                   xmm0            ; sp = vp8_signed_char_clamp(ps2 + u1)
        psubsb      xmm5,                   xmm0            ; sq = vp8_signed_char_clamp(qs2 - u1)

        pxor        xmm7,                   [GLOBAL(t80)]   ; *op2 = sp^0x80;
        pxor        xmm5,                   [GLOBAL(t80)]   ; *oq2 = sq^0x80;

%if %1 == 0
        lea         rsi,                    [rsi+rcx*2]     ; move both plane pointers to the p0 row
        lea         rdi,                    [rdi+rcx*2]

        movq        MMWORD PTR [rsi],       xmm6            ; p0
        movhps      MMWORD PTR [rdi],       xmm6
        movq        MMWORD PTR [rsi + rcx], xmm3            ; q0
        movhps      MMWORD PTR [rdi + rcx], xmm3

        movq        MMWORD PTR [rsi+rcx*2], xmm1            ; q1
        movhps      MMWORD PTR [rdi+rcx*2], xmm1

        movq        MMWORD PTR [rsi + rax], xmm4            ; p1
        movhps      MMWORD PTR [rdi + rax], xmm4

        movq        MMWORD PTR [rsi+rax*2], xmm7            ; p2
        movhps      MMWORD PTR [rdi+rax*2], xmm7

        lea         rsi,                    [rsi + rcx]     ; one more row so q2 is reachable at +2*rcx
        lea         rdi,                    [rdi + rcx]
        movq        MMWORD PTR [rsi+rcx*2], xmm5            ; q2
        movhps      MMWORD PTR [rdi+rcx*2], xmm5
%elif %1 == 1
        movdqa      XMMWORD PTR [rdi+rcx],  xmm5            ; q2
        movdqa      XMMWORD PTR [rdi],      xmm1            ; q1
        movdqa      XMMWORD PTR [rsi],      xmm3            ; q0
        movdqa      XMMWORD PTR [rsi+rax  ],xmm6            ; p0
        movdqa      XMMWORD PTR [rsi+rax*2],xmm4            ; p1
        movdqa      XMMWORD PTR [rdi+rax*4],xmm7            ; p2
%elif %1 == 2
        movdqa      XMMWORD PTR [rdx+80],   xmm1            ; q1
        movdqa      XMMWORD PTR [rdx+64],   xmm3            ; q0
        movdqa      XMMWORD PTR [rdx+48],   xmm6            ; p0
        movdqa      XMMWORD PTR [rdx+32],   xmm4            ; p1
%endif

%endmacro


;-----------------------------------------------------------------------
;void vp8_mbloop_filter_horizontal_edge_sse2
;(
;    unsigned char *src_ptr,
;    int            src_pixel_step,
;    const char    *blimit,
;    const char    *limit,
;    const char    *thresh,
;    int            count
;)
;
; Filters 16 byte lanes across the horizontal edge between the row at
; src_ptr - src_pixel_step (p0) and the row at src_ptr (q0) using
; MB_FILTER_AND_WRITEBACK; p2, p1, p0, q0, q1 and q2 are rewritten in
; place.
;
; Pixel rows and the blimit / limit / thresh vectors are accessed with
; movdqa, so they must be 16-byte aligned. `count` is not read here.
;
; Stack: 32 bytes of 16-byte-aligned scratch (t0, t1).
; The prologue/epilogue macros come from vpx_ports/x86_abi_support.asm.
;-----------------------------------------------------------------------
global sym(vp8_mbloop_filter_horizontal_edge_sse2)
sym(vp8_mbloop_filter_horizontal_edge_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 32     ; reserve 32 bytes
    %define t0 [rsp + 0]    ;__declspec(align(16)) char t0[16];
    %define t1 [rsp + 16]   ;__declspec(align(16)) char t1[16];

        mov         rsi,                    arg(0)            ;src_ptr
        movsxd      rax,                    dword ptr arg(1)  ;src_pixel_step

        mov         rdx,                    arg(3)            ;limit
        movdqa      xmm7,                   XMMWORD PTR [rdx] ; xmm7 = limit, consumed by the mask macro

        lea         rdi,                    [rsi+rax]         ; rdi points to row +1 for indirect addressing

        ; calculate breakout conditions and high edge variance
        LFH_FILTER_AND_HEV_MASK 1
        ; filter and write back the results
        MB_FILTER_AND_WRITEBACK 1

    add rsp, 32             ; release t0 / t1
    pop rsp                 ; undo ALIGN_STACK (presumably it pushed the old rsp - confirm in x86_abi_support.asm)
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret


;-----------------------------------------------------------------------
;void vp8_mbloop_filter_horizontal_edge_uv_sse2
;(
;    unsigned char *u,
;    int            src_pixel_step,
;    const char    *blimit,
;    const char    *limit,
;    const char    *thresh,
;    unsigned char *v
;)
;
; Sets up registers and scratch space, then expands
; LFH_FILTER_AND_HEV_MASK 0 followed by MB_FILTER_AND_WRITEBACK 0.
;
; Register state handed to the macros:
;   rsi  = u + src_pixel_step
;   rdi  = v + src_pixel_step
;   rcx  = +src_pixel_step (sign-extended)
;   rax  = -src_pixel_step
;   xmm7 = 16 bytes loaded from limit
;   q2/q1/p2/p1/t0/t1 = six 16-byte aligned stack slots
;
; limit is read with movdqa, so it must be 16-byte aligned.
; blimit and thresh are not read in this body itself.
; NOTE(review): presumably the macros fetch them through arg(2)/arg(4)
; - confirm against the macro bodies.
;-----------------------------------------------------------------------
global sym(vp8_mbloop_filter_horizontal_edge_uv_sse2)
sym(vp8_mbloop_filter_horizontal_edge_uv_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 96       ; reserve 96 bytes
    %define q2  [rsp + 0]     ;__declspec(align(16)) char q2[16];
    %define q1  [rsp + 16]    ;__declspec(align(16)) char q1[16];
    %define p2  [rsp + 32]    ;__declspec(align(16)) char p2[16];
    %define p1  [rsp + 48]    ;__declspec(align(16)) char p1[16];
    %define t0  [rsp + 64]    ;__declspec(align(16)) char t0[16];
    %define t1  [rsp + 80]    ;__declspec(align(16)) char t1[16];

        mov         rsi,                    arg(0)             ; u
        mov         rdi,                    arg(5)             ; v
        movsxd      rax,                    dword ptr arg(1)   ; src_pixel_step
        mov         rcx,                    rax                ; rcx keeps the positive pitch
        neg         rax                     ; negate pitch to deal with above border

        mov         rdx,                    arg(3)             ;limit
        movdqa      xmm7,                   XMMWORD PTR [rdx]

        ; advance both plane pointers by one row
        lea         rsi,                    [rsi + rcx]
        lea         rdi,                    [rdi + rcx]

        ; calculate breakout conditions and high edge variance
        LFH_FILTER_AND_HEV_MASK 0
        ; filter and write back the results
        MB_FILTER_AND_WRITEBACK 0

    ; release the 96-byte scratch area, then restore the rsp value
    ; that is popped here as the counterpart of ALIGN_STACK
    add rsp, 96
    pop rsp
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret


;-----------------------------------------------------------------------
; TRANSPOSE_16X8 %1, %2
;
; Loads 16 rows of 8 bytes and transposes them into 8 vectors of
; 16 bytes: vector k holds byte k of rows 0..f, row 0 in the lowest lane.
; Lane comments list bytes high-to-low as <row><column>.
;
; Inputs:
;   rsi, rdi = rows 0 and 1; rows 2..5 are read at +2*rax and +4*rax
;   rcx      = rows 6/7 are read at [rsi+2*rcx] / [rdi+2*rcx]
;              NOTE(review): presumably rcx = 3*rax, set by the caller
;              - confirm
;   t0       = 16-byte aligned scratch operand defined by the includer
;
; %1 selects where rows 8..f come from:
;   %1 != 0 : rsi and rdi are advanced by 8*rax, and rdx is loaded with
;             the address of srct (defined by the includer)
;   %1 == 0 : rsi = arg(5) - 4 and rdi = rsi + rax; rdx is NOT set
;             here, so the caller must already have pointed it at the
;             output buffer
;
; %2 selects where the result lives:
;   %2 != 0 : vectors 0,1,6,7 stay in xmm2,xmm1,xmm6,xmm7;
;             vectors 2..5 are stored at [rdx], [rdx+16], [rdx+32],
;             [rdx+48] (copies also remain in xmm3, xmm4, xmm5 for
;             vectors 3, 4, 5)
;   %2 == 0 : vectors 0..7 are stored at [rdx + 16*k]
;
; All stores use movdqa, so rdx and t0 must be 16-byte aligned.
; Clobbers: xmm0-xmm7, rsi, rdi, t0, and rdx when %1 != 0.
;-----------------------------------------------------------------------
%macro TRANSPOSE_16X8 2
        ; ---- rows 0..7 ----
        movq        xmm4,               QWORD PTR [rsi]        ; xx xx xx xx xx xx xx xx 07 06 05 04 03 02 01 00
        movq        xmm1,               QWORD PTR [rdi]        ; xx xx xx xx xx xx xx xx 17 16 15 14 13 12 11 10
        movq        xmm0,               QWORD PTR [rsi+2*rax]  ; xx xx xx xx xx xx xx xx 27 26 25 24 23 22 21 20
        movq        xmm7,               QWORD PTR [rdi+2*rax]  ; xx xx xx xx xx xx xx xx 37 36 35 34 33 32 31 30
        movq        xmm5,               QWORD PTR [rsi+4*rax]  ; xx xx xx xx xx xx xx xx 47 46 45 44 43 42 41 40
        movq        xmm2,               QWORD PTR [rdi+4*rax]  ; xx xx xx xx xx xx xx xx 57 56 55 54 53 52 51 50

        punpcklbw   xmm4,               xmm1            ; 17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00

        movq        xmm1,               QWORD PTR [rdi+2*rcx]  ; xx xx xx xx xx xx xx xx 77 76 75 74 73 72 71 70

        movdqa      xmm3,               xmm4            ; 17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00
        punpcklbw   xmm0,               xmm7            ; 37 27 36 26 35 25 34 24 33 23 32 22 31 21 30 20

        movq        xmm7,               QWORD PTR [rsi+2*rcx]  ; xx xx xx xx xx xx xx xx 67 66 65 64 63 62 61 60

        punpcklbw   xmm5,               xmm2            ; 57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40
        ; rows 0..7 are all loaded; start re-pointing rsi at row 8
%if %1
        lea         rsi,                [rsi+rax*8]
%else
        mov         rsi,                arg(5)          ; v_ptr
%endif

        movdqa      xmm6,               xmm5            ; 57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40
        punpcklbw   xmm7,               xmm1            ; 77 67 76 66 75 65 74 64 73 63 72 62 71 61 70 60

        punpcklwd   xmm5,               xmm7            ; 73 63 53 43 72 62 52 42 71 61 51 41 70 60 50 40

        punpckhwd   xmm6,               xmm7            ; 77 67 57 47 76 66 56 46 75 65 55 45 74 64 54 44
%if %1
        lea         rdi,                [rdi+rax*8]
%else
        lea         rsi,                [rsi - 4]
%endif

        punpcklwd   xmm3,               xmm0            ; 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00
%if %1
        lea         rdx,                srct
%else
        lea         rdi,                [rsi + rax]     ; rdi points to row +1 for indirect addressing
%endif

        movdqa      xmm2,               xmm3            ; 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00
        punpckhwd   xmm4,               xmm0            ; 37 27 17 07 36 26 16 06 35 25 15 05 34 24 14 04

        movdqa      xmm7,               xmm4            ; 37 27 17 07 36 26 16 06 35 25 15 05 34 24 14 04
        punpckhdq   xmm3,               xmm5            ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02

        punpckhdq   xmm7,               xmm6            ; 77 67 57 47 37 27 17 07 76 66 56 46 36 26 16 06

        punpckldq   xmm4,               xmm6            ; 75 65 55 45 35 25 15 05 74 64 54 44 34 24 14 04

        punpckldq   xmm2,               xmm5            ; 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00

        movdqa      t0,                 xmm2            ; save to free XMM2

        ; ---- rows 8..f ----
        movq        xmm2,               QWORD PTR [rsi]       ; xx xx xx xx xx xx xx xx 87 86 85 84 83 82 81 80
        movq        xmm6,               QWORD PTR [rdi]       ; xx xx xx xx xx xx xx xx 97 96 95 94 93 92 91 90
        movq        xmm0,               QWORD PTR [rsi+2*rax] ; xx xx xx xx xx xx xx xx a7 a6 a5 a4 a3 a2 a1 a0
        movq        xmm5,               QWORD PTR [rdi+2*rax] ; xx xx xx xx xx xx xx xx b7 b6 b5 b4 b3 b2 b1 b0
        movq        xmm1,               QWORD PTR [rsi+4*rax] ; xx xx xx xx xx xx xx xx c7 c6 c5 c4 c3 c2 c1 c0

        punpcklbw   xmm2,               xmm6            ; 97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80

        movq        xmm6,               QWORD PTR [rdi+4*rax] ; xx xx xx xx xx xx xx xx d7 d6 d5 d4 d3 d2 d1 d0

        punpcklbw   xmm0,               xmm5                  ; b7 a7 b6 a6 b5 a5 b4 a4 b3 a3 b2 a2 b1 a1 b0 a0

        movq        xmm5,               QWORD PTR [rsi+2*rcx] ; xx xx xx xx xx xx xx xx e7 e6 e5 e4 e3 e2 e1 e0

        punpcklbw   xmm1,               xmm6            ; d7 c7 d6 c6 d5 c5 d4 c4 d3 c3 d2 c2 d1 c1 d0 c0

        movq        xmm6,               QWORD PTR [rdi+2*rcx] ; xx xx xx xx xx xx xx xx f7 f6 f5 f4 f3 f2 f1 f0

        punpcklbw   xmm5,               xmm6            ; f7 e7 f6 e6 f5 e5 f4 e4 f3 e3 f2 e2 f1 e1 f0 e0

        movdqa      xmm6,               xmm1            ; d7 c7 d6 c6 d5 c5 d4 c4 d3 c3 d2 c2 d1 c1 d0 c0
        punpckhwd   xmm6,               xmm5            ; f7 e7 d7 c7 f6 e6 d6 c6 f5 e5 d5 c5 f4 e4 d4 c4

        punpcklwd   xmm1,               xmm5            ; f3 e3 d3 c3 f2 e2 d2 c2 f1 e1 d1 c1 f0 e0 d0 c0
        movdqa      xmm5,               xmm2            ; 97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80

        punpcklwd   xmm5,               xmm0            ; b3 a3 93 83 b2 a2 92 82 b1 a1 91 81 b0 a0 90 80

        punpckhwd   xmm2,               xmm0            ; b7 a7 97 87 b6 a6 96 86 b5 a5 95 85 b4 a4 94 84

        movdqa      xmm0,               xmm5
        punpckldq   xmm0,               xmm1            ; f1 e1 d1 c1 b1 a1 91 81 f0 e0 d0 c0 b0 a0 90 80

        punpckhdq   xmm5,               xmm1            ; f3 e3 d3 c3 b3 a3 93 83 f2 e2 d2 c2 b2 a2 92 82
        movdqa      xmm1,               xmm2            ; b7 a7 97 87 b6 a6 96 86 b5 a5 95 85 b4 a4 94 84

        punpckldq   xmm1,               xmm6            ; f5 e5 d5 c5 b5 a5 95 85 f4 e4 d4 c4 b4 a4 94 84

        punpckhdq   xmm2,               xmm6            ; f7 e7 d7 c7 b7 a7 97 87 f6 e6 d6 c6 b6 a6 96 86

        ; ---- merge the two halves into full 16-byte vectors ----
        movdqa      xmm6,               xmm7            ; 77 67 57 47 37 27 17 07 76 66 56 46 36 26 16 06

        punpcklqdq  xmm6,               xmm2            ; f6 e6 d6 c6 b6 a6 96 86 76 66 56 46 36 26 16 06

        punpckhqdq  xmm7,               xmm2            ; f7 e7 d7 c7 b7 a7 97 87 77 67 57 47 37 27 17 07
%if %2
        ; vectors 6 and 7 stay in xmm6/xmm7; 2..5 go to [rdx]..[rdx+48]
        movdqa      xmm2,               xmm3            ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
        punpcklqdq  xmm2,               xmm5            ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02

        punpckhqdq  xmm3,               xmm5            ; f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03

        movdqa      [rdx],              xmm2            ; save 2

        movdqa      xmm5,               xmm4            ; 75 65 55 45 35 25 15 05 74 64 54 44 34 24 14 04
        punpcklqdq  xmm4,               xmm1            ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04

        movdqa      [rdx+16],           xmm3            ; save 3

        punpckhqdq  xmm5,               xmm1            ; f5 e5 d5 c5 b5 a5 95 85 75 65 55 45 35 25 15 05

        movdqa      [rdx+32],           xmm4            ; save 4
        movdqa      [rdx+48],           xmm5            ; save 5
        movdqa      xmm1,               t0              ; get 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00

        movdqa      xmm2,               xmm1            ;
        punpckhqdq  xmm1,               xmm0            ; f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01

        punpcklqdq  xmm2,               xmm0            ; f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
%else
        ; every vector k is stored at [rdx + 16*k]
        movdqa      [rdx+112],          xmm7            ; save 7

        movdqa      [rdx+96],           xmm6            ; save 6

        movdqa      xmm2,               xmm3            ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
        punpckhqdq  xmm3,               xmm5            ; f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03

        punpcklqdq  xmm2,               xmm5            ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02

        movdqa      [rdx+32],           xmm2            ; save 2

        movdqa      xmm5,               xmm4            ; 75 65 55 45 35 25 15 05 74 64 54 44 34 24 14 04
        punpcklqdq  xmm4,               xmm1            ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04

        movdqa      [rdx+48],           xmm3            ; save 3

        punpckhqdq  xmm5,               xmm1            ; f5 e5 d5 c5 b5 a5 95 85 75 65 55 45 35 25 15 05

        movdqa      [rdx+64],           xmm4            ; save 4
        movdqa      [rdx+80],           xmm5            ; save 5
        movdqa      xmm1,               t0              ; get 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00

        movdqa      xmm2,               xmm1
        punpckhqdq  xmm1,               xmm0            ; f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01

        punpcklqdq  xmm2,               xmm0            ; f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00

        movdqa      [rdx+16],           xmm1            ; save 1

        movdqa      [rdx],              xmm2            ; save 0
%endif
%endmacro

838
%macro LFV_FILTER_MASK_HEV_MASK 1
839
840
841
842
843
        movdqa      xmm0,               xmm6            ; q2
        psubusb     xmm0,               xmm7            ; q2-q3

        psubusb     xmm7,               xmm6            ; q3-q2
        movdqa      xmm4,               xmm5            ; q1
John Koleszar's avatar
John Koleszar committed
844

845
846
        por         xmm7,               xmm0            ; abs (q3-q2)
        psubusb     xmm4,               xmm6            ; q1-q2
John Koleszar's avatar
John Koleszar committed
847

848
        movdqa      xmm0,               xmm1
849
        psubusb     xmm6,               xmm5            ; q2-q1
John Koleszar's avatar
John Koleszar committed
850

851
        por         xmm6,               xmm4            ; abs (q2-q1)
852
        psubusb     xmm0,               xmm2            ; p2 - p3;
John Koleszar's avatar
John Koleszar committed
853

854
        psubusb     xmm2,               xmm1            ; p3 - p2;
855
856
857
858
859
860
861
        por         xmm0,               xmm2            ; abs(p2-p3)
%if %1
        movdqa      xmm2,               [rdx]           ; p1
%else
        movdqa      xmm2,               [rdx+32]        ; p1
%endif
        movdqa      xmm5,               xmm2            ; p1
862
        pmaxub      xmm0,               xmm7
John Koleszar's avatar
John Koleszar committed
863

864
865
        psubusb     xmm5,               xmm1            ; p1-p2
        psubusb     xmm1,               xmm2            ; p2-p1
John Koleszar's avatar
John Koleszar committed
866

867
868
        movdqa      xmm7,               xmm3            ; p0
        psubusb     xmm7,               xmm2            ; p0-p1
John Koleszar's avatar
John Koleszar committed
869

870
871
        por         xmm1,               xmm5            ; abs(p2-p1)
        pmaxub      xmm0,               xmm6
John Koleszar's avatar
John Koleszar committed
872

873
        pmaxub      xmm0,               xmm1
874
        movdqa      xmm1,               xmm2            ; p1
John Koleszar's avatar
John Koleszar committed
875

876
        psubusb     xmm2,               xmm3            ; p1-p0
877
878
        lea         rdx,                srct

879
        por         xmm2,               xmm7            ; abs(p1-p0)
John Koleszar's avatar
John Koleszar committed
880

881
        movdqa      t0,                 xmm2            ; save abs(p1-p0)
John Koleszar's avatar
John Koleszar committed
882

883
884
        pmaxub      xmm0,               xmm2

885
886
887
888
889
890
891
%if %1
        movdqa      xmm5,               [rdx+32]        ; q0
        movdqa      xmm7,               [rdx+48]        ; q1
%else
        movdqa      xmm5,               [rdx+64]        ; q0
        movdqa      xmm7,               [rdx+80]        ; q1
%endif
892
893
        mov         rdx,                arg(3)          ; limit

894
895
896
        movdqa      xmm6,               xmm5            ; q0
        movdqa      xmm2,               xmm7            ; q1

897
        psubusb     xmm5,               xmm7            ; q0-q1
898
        psubusb     xmm7,               xmm6            ; q1-q0
899

900
901
902
903
        por         xmm7,               xmm5            ; abs(q1-q0)

        movdqa      t1,                 xmm7            ; save abs(q1-q0)

904
        movdqa      xmm4,               XMMWORD PTR [rdx]; limit
905
906

        pmaxub      xmm0,               xmm7
Johann's avatar
Johann committed
907
        mov         rdx,                arg(2)          ; blimit
908

909
        psubusb     xmm0,               xmm4
910
        movdqa      xmm5,               xmm2            ; q1
911

912
913
        psubusb     xmm5,               xmm1            ; q1-=p1
        psubusb     xmm1,               xmm2            ; p1-=q1
914

915
        por         xmm5,               xmm1            ; abs(p1-q1)
916
917
        movdqa      xmm1,               xmm3            ; p0

918
        pand        xmm5,               [GLOBAL(tfe)]   ; set lsb of each byte to zero
919
920
        psubusb     xmm1,               xmm6            ; p0-q0

921
        psrlw       xmm5,               1               ; abs(p1-q1)/2
922
        psubusb     xmm6,               xmm3            ; q0-p0
923

Johann's avatar
Johann committed
924
        movdqa      xmm4,               XMMWORD PTR [rdx]; blimit
925

926
        mov         rdx,                arg(4)          ; get thresh
927

928
        por         xmm1,               xmm6            ; abs(q0-p0)
929

930
        movdqa      xmm6,               t0              ; get abs (q1 - q0)
John Koleszar's avatar
John Koleszar committed
931

932
        paddusb     xmm1,               xmm1            ; abs(q0-p0)*2
John Koleszar's avatar
John Koleszar committed
933

934
        movdqa      xmm3,               t1              ; get abs (p1 - p0)
John Koleszar's avatar
John Koleszar committed
935

936
        movdqa      xmm7,               XMMWORD PTR [rdx]
John Koleszar's avatar
John Koleszar committed
937

938
939
        paddusb     xmm1,               xmm5            ; abs (p0 - q0) *2 + abs(p1-q1)/2
        psubusb     xmm6,               xmm7            ; abs(q1 - q0) > thresh
John Koleszar's avatar
John Koleszar committed
940

941
        psubusb     xmm3,               xmm7            ; abs(p1 - p0)> thresh
John Koleszar's avatar
John Koleszar committed
942

Johann's avatar
Johann committed
943
        psubusb     xmm1,               xmm4            ; abs (p0 - q0) *2 + abs(p1-q1)/2  > blimit
944
        por         xmm6,               xmm3            ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh
John Koleszar's avatar
John Koleszar committed
945

946
947
        por         xmm1,               xmm0            ; mask
        pcmpeqb     xmm6,               xmm0
John Koleszar's avatar
John Koleszar committed
948

949
950
        pxor        xmm0,               xmm0
        pcmpeqb     xmm4,               xmm4