Skip to content
GitLab
Menu
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
Xiph.Org
aom-rav1e
Commits
bb550d90
Commit
bb550d90
authored
Oct 22, 2012
by
Scott LaVarnway
Committed by
Gerrit Code Review
Oct 22, 2012
Browse files
Merge "sse2 intrinsic version of vp8_mbloop_filter_horizontal_edge()" into experimental
parents
09fb253b
992b5e2d
Changes
6
Hide whitespace changes
Inline
Side-by-side
vp8/common/loopfilter_filters.c
View file @
bb550d90
...
...
@@ -7,8 +7,6 @@
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include
<stdlib.h>
#include
"vpx_config.h"
#include
"loopfilter.h"
...
...
@@ -94,6 +92,7 @@ static __inline void vp8_filter(signed char mask, uc hev, uc *op1,
*
op1
=
u
^
0x80
;
}
void
vp8_loop_filter_horizontal_edge_c
(
unsigned
char
*
s
,
...
...
@@ -218,6 +217,7 @@ static __inline void vp8_mbfilter(signed char mask, uc hev, uc flat,
Filter2
=
vp8_signed_char_clamp
(
vp8_filter
+
3
);
Filter1
>>=
3
;
Filter2
>>=
3
;
u
=
vp8_signed_char_clamp
(
qs0
-
Filter1
);
*
oq0
=
u
^
0x80
;
u
=
vp8_signed_char_clamp
(
ps0
+
Filter2
);
...
...
@@ -272,7 +272,6 @@ void vp8_mbloop_filter_horizontal_edge_c
}
void
vp8_mbloop_filter_vertical_edge_c
(
unsigned
char
*
s
,
...
...
vp8/common/rtcd_defs.sh
View file @
bb550d90
...
...
@@ -134,13 +134,13 @@ prototype void vp8_loop_filter_bv8x8 "unsigned char *y, unsigned char *u, unsign
specialize vp8_loop_filter_bv8x8
;
prototype void vp8_loop_filter_mbh
"unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi"
specialize vp8_loop_filter_mbh
;
specialize vp8_loop_filter_mbh
sse2
prototype void vp8_loop_filter_bh
"unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi"
specialize vp8_loop_filter_bh
;
prototype void vp8_loop_filter_bh8x8
"unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi"
specialize vp8_loop_filter_bh8x8
;
specialize vp8_loop_filter_bh8x8
sse2
prototype void vp8_loop_filter_simple_mbv
"unsigned char *y, int ystride, const unsigned char *blimit"
specialize vp8_loop_filter_simple_mbv mmx sse2 media neon
...
...
vp8/common/x86/loopfilter_mmx.asm
View file @
bb550d90
...
...
@@ -594,323 +594,6 @@ sym(vp8_loop_filter_vertical_edge_mmx):
ret
;void vp8_mbloop_filter_horizontal_edge_mmx
;(
; unsigned char *src_ptr,
; int src_pixel_step,
; const char *blimit,
; const char *limit,
; const char *thresh,
; int count
;)
global
sym
(
vp8_mbloop_filter_horizontal_edge_mmx
)
sym
(
vp8_mbloop_filter_horizontal_edge_mmx
):
push
rbp
mov
rbp
,
rsp
SHADOW_ARGS_TO_STACK
6
GET_GOT
rbx
push
rsi
push
rdi
; end prolog
ALIGN
_STACK
16
,
rax
sub
rsp
,
32
; reserve 32 bytes
%define t0 [rsp + 0]
;__declspec(align(16)) char t0[8];
%define t1 [rsp + 16]
;__declspec(align(16)) char t1[8];
mov
rsi
,
arg
(
0
)
;src_ptr
movsxd
rax
,
dword
ptr
arg
(
1
)
;src_pixel_step ; destination pitch?
movsxd
rcx
,
dword
ptr
arg
(
5
)
;count
.next8_mbh:
mov
rdx
,
arg
(
3
)
;limit
movq
mm7
,
[
rdx
]
mov
rdi
,
rsi
; rdi points to row +1 for indirect addressing
add
rdi
,
rax
; calculate breakout conditions
movq
mm2
,
[
rdi
+
2
*
rax
]
; q3
movq
mm1
,
[
rsi
+
2
*
rax
]
; q2
movq
mm6
,
mm1
; q2
psubusb
mm1
,
mm2
; q2-=q3
psubusb
mm2
,
mm6
; q3-=q2
por
mm1
,
mm2
; abs(q3-q2)
psubusb
mm1
,
mm7
; mm1 = abs(q3-q2), mm6 =q2, mm7 = limit
movq
mm4
,
[
rsi
+
rax
]
; q1
movq
mm3
,
mm4
; q1
psubusb
mm4
,
mm6
; q1-=q2
psubusb
mm6
,
mm3
; q2-=q1
por
mm4
,
mm6
; abs(q2-q1)
psubusb
mm4
,
mm7
por
mm1
,
mm4
; mm1 = mask, mm3=q1, mm7 = limit
movq
mm4
,
[
rsi
]
; q0
movq
mm0
,
mm4
; q0
psubusb
mm4
,
mm3
; q0-=q1
psubusb
mm3
,
mm0
; q1-=q0
por
mm4
,
mm3
; abs(q0-q1)
movq
t0
,
mm4
; save to t0
psubusb
mm4
,
mm7
por
mm1
,
mm4
; mm1 = mask, mm0=q0, mm7 = limit, t0 = abs(q0-q1)
neg
rax
; negate pitch to deal with above border
movq
mm2
,
[
rsi
+
4
*
rax
]
; p3
movq
mm4
,
[
rdi
+
4
*
rax
]
; p2
movq
mm5
,
mm4
; p2
psubusb
mm4
,
mm2
; p2-=p3
psubusb
mm2
,
mm5
; p3-=p2
por
mm4
,
mm2
; abs(p3 - p2)
psubusb
mm4
,
mm7
por
mm1
,
mm4
; mm1 = mask, mm0=q0, mm7 = limit, t0 = abs(q0-q1)
movq
mm4
,
[
rsi
+
2
*
rax
]
; p1
movq
mm3
,
mm4
; p1
psubusb
mm4
,
mm5
; p1-=p2
psubusb
mm5
,
mm3
; p2-=p1
por
mm4
,
mm5
; abs(p2 - p1)
psubusb
mm4
,
mm7
por
mm1
,
mm4
movq
mm2
,
mm3
; p1
; mm1 = mask, mm0=q0, mm7 = limit, t0 = abs(q0-q1)
movq
mm4
,
[
rsi
+
rax
]
; p0
movq
mm5
,
mm4
; p0
psubusb
mm4
,
mm3
; p0-=p1
psubusb
mm3
,
mm5
; p1-=p0
por
mm4
,
mm3
; abs(p1 - p0)
movq
t1
,
mm4
; save to t1
psubusb
mm4
,
mm7
por
mm1
,
mm4
; mm1 = mask, mm0=q0, mm7 = limit, t0 = abs(q0-q1) t1 = abs(p1-p0)
; mm5 = p0
movq
mm3
,
[
rdi
]
; q1
movq
mm4
,
mm3
; q1
psubusb
mm3
,
mm2
; q1-=p1
psubusb
mm2
,
mm4
; p1-=q1
por
mm2
,
mm3
; abs(p1-q1)
pand
mm2
,
[
GLOBAL
(
tfe
)]
; set lsb of each byte to zero
psrlw
mm2
,
1
; abs(p1-q1)/2
movq
mm6
,
mm5
; p0
movq
mm3
,
mm0
; q0
psubusb
mm5
,
mm3
; p0-=q0
psubusb
mm3
,
mm6
; q0-=p0
por
mm5
,
mm3
; abs(p0 - q0)
paddusb
mm5
,
mm5
; abs(p0-q0)*2
paddusb
mm5
,
mm2
; abs (p0 - q0) *2 + abs(p1-q1)/2
mov
rdx
,
arg
(
2
)
;blimit ; get blimit
movq
mm7
,
[
rdx
]
; blimit
psubusb
mm5
,
mm7
; abs (p0 - q0) *2 + abs(p1-q1)/2 > blimit
por
mm1
,
mm5
pxor
mm5
,
mm5
pcmpeqb
mm1
,
mm5
; mask mm1
; mm1 = mask, mm0=q0, mm7 = blimit, t0 = abs(q0-q1) t1 = abs(p1-p0)
; mm6 = p0,
; calculate high edge variance
mov
rdx
,
arg
(
4
)
;thresh ; get thresh
movq
mm7
,
[
rdx
]
;
movq
mm4
,
t0
; get abs (q1 - q0)
psubusb
mm4
,
mm7
movq
mm3
,
t1
; get abs (p1 - p0)
psubusb
mm3
,
mm7
paddb
mm4
,
mm3
; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh
pcmpeqb
mm4
,
mm5
pcmpeqb
mm5
,
mm5
pxor
mm4
,
mm5
; mm1 = mask, mm0=q0, mm7 = thresh, t0 = abs(q0-q1) t1 = abs(p1-p0)
; mm6 = p0, mm4=hev
; start work on filters
movq
mm2
,
[
rsi
+
2
*
rax
]
; p1
movq
mm7
,
[
rdi
]
; q1
pxor
mm2
,
[
GLOBAL
(
t80
)]
; p1 offset to convert to signed values
pxor
mm7
,
[
GLOBAL
(
t80
)]
; q1 offset to convert to signed values
psubsb
mm2
,
mm7
; p1 - q1
pxor
mm6
,
[
GLOBAL
(
t80
)]
; offset to convert to signed values
pxor
mm0
,
[
GLOBAL
(
t80
)]
; offset to convert to signed values
movq
mm3
,
mm0
; q0
psubsb
mm0
,
mm6
; q0 - p0
paddsb
mm2
,
mm0
; 1 * (q0 - p0) + (p1 - q1)
paddsb
mm2
,
mm0
; 2 * (q0 - p0)
paddsb
mm2
,
mm0
; 3 * (q0 - p0) + (p1 - q1)
pand
mm1
,
mm2
; mask filter values we don't care about
; mm1 = vp8_filter, mm4=hev, mm6=ps0, mm3=qs0
movq
mm2
,
mm1
; vp8_filter
pand
mm2
,
mm4
; ; Filter2 = vp8_filter & hev
movq
mm5
,
mm2
;
paddsb
mm5
,
[
GLOBAL
(
t3
)]
;
pxor
mm0
,
mm0
; 0
pxor
mm7
,
mm7
; 0
punpcklbw
mm0
,
mm5
; e0f0g0h0
psraw
mm0
,
11
; sign extended shift right by 3
punpckhbw
mm7
,
mm5
; a0b0c0d0
psraw
mm7
,
11
; sign extended shift right by 3
packsswb
mm0
,
mm7
; Filter2 >>=3;
movq
mm5
,
mm0
; Filter2
paddsb
mm2
,
[
GLOBAL
(
t4
)]
; vp8_signed_char_clamp(Filter2 + 4)
pxor
mm0
,
mm0
; 0
pxor
mm7
,
mm7
; 0
punpcklbw
mm0
,
mm2
; e0f0g0h0
psraw
mm0
,
11
; sign extended shift right by 3
punpckhbw
mm7
,
mm2
; a0b0c0d0
psraw
mm7
,
11
; sign extended shift right by 3
packsswb
mm0
,
mm7
; Filter2 >>=3;
; mm0= filter2 mm1 = vp8_filter, mm3 =qs0 mm5=s mm4 =hev mm6=ps0
psubsb
mm3
,
mm0
; qs0 =qs0 - filter1
paddsb
mm6
,
mm5
; ps0 =ps0 + Fitler2
; mm1=vp8_filter, mm3=qs0, mm4 =hev mm6=ps0
; vp8_filter &= ~hev;
; Filter2 = vp8_filter;
pandn
mm4
,
mm1
; vp8_filter&=~hev
; mm3=qs0, mm4=filter2, mm6=ps0
; u = vp8_signed_char_clamp((63 + Filter2 * 27)>>7);
; s = vp8_signed_char_clamp(qs0 - u);
; *oq0 = s^0x80;
; s = vp8_signed_char_clamp(ps0 + u);
; *op0 = s^0x80;
pxor
mm0
,
mm0
pxor
mm1
,
mm1
pxor
mm2
,
mm2
punpcklbw
mm1
,
mm4
punpckhbw
mm2
,
mm4
pmulhw
mm1
,
[
GLOBAL
(
s27
)]
pmulhw
mm2
,
[
GLOBAL
(
s27
)]
paddw
mm1
,
[
GLOBAL
(
s63
)]
paddw
mm2
,
[
GLOBAL
(
s63
)]
psraw
mm1
,
7
psraw
mm2
,
7
packsswb
mm1
,
mm2
psubsb
mm3
,
mm1
paddsb
mm6
,
mm1
pxor
mm3
,
[
GLOBAL
(
t80
)]
pxor
mm6
,
[
GLOBAL
(
t80
)]
movq
[
rsi
+
rax
],
mm6
movq
[
rsi
],
mm3
; roughly 2/7th difference across boundary
; u = vp8_signed_char_clamp((63 + Filter2 * 18)>>7);
; s = vp8_signed_char_clamp(qs1 - u);
; *oq1 = s^0x80;
; s = vp8_signed_char_clamp(ps1 + u);
; *op1 = s^0x80;
pxor
mm1
,
mm1
pxor
mm2
,
mm2
punpcklbw
mm1
,
mm4
punpckhbw
mm2
,
mm4
pmulhw
mm1
,
[
GLOBAL
(
s18
)]
pmulhw
mm2
,
[
GLOBAL
(
s18
)]
paddw
mm1
,
[
GLOBAL
(
s63
)]
paddw
mm2
,
[
GLOBAL
(
s63
)]
psraw
mm1
,
7
psraw
mm2
,
7
packsswb
mm1
,
mm2
movq
mm3
,
[
rdi
]
movq
mm6
,
[
rsi
+
rax
*
2
]
; p1
pxor
mm3
,
[
GLOBAL
(
t80
)]
pxor
mm6
,
[
GLOBAL
(
t80
)]
paddsb
mm6
,
mm1
psubsb
mm3
,
mm1
pxor
mm6
,
[
GLOBAL
(
t80
)]
pxor
mm3
,
[
GLOBAL
(
t80
)]
movq
[
rdi
],
mm3
movq
[
rsi
+
rax
*
2
],
mm6
; roughly 1/7th difference across boundary
; u = vp8_signed_char_clamp((63 + Filter2 * 9)>>7);
; s = vp8_signed_char_clamp(qs2 - u);
; *oq2 = s^0x80;
; s = vp8_signed_char_clamp(ps2 + u);
; *op2 = s^0x80;
pxor
mm1
,
mm1
pxor
mm2
,
mm2
punpcklbw
mm1
,
mm4
punpckhbw
mm2
,
mm4
pmulhw
mm1
,
[
GLOBAL
(
s9
)]
pmulhw
mm2
,
[
GLOBAL
(
s9
)]
paddw
mm1
,
[
GLOBAL
(
s63
)]
paddw
mm2
,
[
GLOBAL
(
s63
)]
psraw
mm1
,
7
psraw
mm2
,
7
packsswb
mm1
,
mm2
movq
mm6
,
[
rdi
+
rax
*
4
]
neg
rax
movq
mm3
,
[
rdi
+
rax
]
pxor
mm6
,
[
GLOBAL
(
t80
)]
pxor
mm3
,
[
GLOBAL
(
t80
)]
paddsb
mm6
,
mm1
psubsb
mm3
,
mm1
pxor
mm6
,
[
GLOBAL
(
t80
)]
pxor
mm3
,
[
GLOBAL
(
t80
)]
movq
[
rdi
+
rax
],
mm3
neg
rax
movq
[
rdi
+
rax
*
4
],
mm6
;EARLY_BREAK_OUT:
neg
rax
add
rsi
,
8
dec
rcx
jnz
.next8_mbh
add
rsp
,
32
pop
rsp
; begin epilog
pop
rdi
pop
rsi
REST
ORE_GOT
UNSHADOW_ARGS
pop
rbp
ret
;void vp8_mbloop_filter_vertical_edge_mmx
;(
; unsigned char *src_ptr,
...
...
vp8/common/x86/loopfilter_sse2.asm
View file @
bb550d90
...
...
@@ -567,115 +567,6 @@ sym(vp8_loop_filter_horizontal_edge_uv_sse2):
%endmacro
;void vp8_mbloop_filter_horizontal_edge_sse2
;(
; unsigned char *src_ptr,
; int src_pixel_step,
; const char *blimit,
; const char *limit,
; const char *thresh,
; int count
;)
global
sym
(
vp8_mbloop_filter_horizontal_edge_sse2
)
sym
(
vp8_mbloop_filter_horizontal_edge_sse2
):
push
rbp
mov
rbp
,
rsp
SHADOW_ARGS_TO_STACK
6
SAVE_XMM
7
GET_GOT
rbx
push
rsi
push
rdi
; end prolog
ALIGN
_STACK
16
,
rax
sub
rsp
,
32
; reserve 32 bytes
%define t0 [rsp + 0]
;__declspec(align(16)) char t0[16];
%define t1 [rsp + 16]
;__declspec(align(16)) char t1[16];
mov
rsi
,
arg
(
0
)
;src_ptr
movsxd
rax
,
dword
ptr
arg
(
1
)
;src_pixel_step
mov
rdx
,
arg
(
3
)
;limit
movdqa
xmm7
,
XMMWORD
PTR
[
rdx
]
lea
rdi
,
[
rsi
+
rax
]
; rdi points to row +1 for indirect addressing
; calculate breakout conditions and high edge variance
LFH_FILTER_AND_HEV_MASK
1
; filter and write back the results
MB_FILTER_AND_WRITEBACK
1
add
rsp
,
32
pop
rsp
; begin epilog
pop
rdi
pop
rsi
REST
ORE_GOT
REST
ORE_XMM
UNSHADOW_ARGS
pop
rbp
ret
;void vp8_mbloop_filter_horizontal_edge_uv_sse2
;(
; unsigned char *u,
; int src_pixel_step,
; const char *blimit,
; const char *limit,
; const char *thresh,
; unsigned char *v
;)
global
sym
(
vp8_mbloop_filter_horizontal_edge_uv_sse2
)
sym
(
vp8_mbloop_filter_horizontal_edge_uv_sse2
):
push
rbp
mov
rbp
,
rsp
SHADOW_ARGS_TO_STACK
6
SAVE_XMM
7
GET_GOT
rbx
push
rsi
push
rdi
; end prolog
ALIGN
_STACK
16
,
rax
sub
rsp
,
96
; reserve 96 bytes
%define q2 [rsp + 0]
;__declspec(align(16)) char q2[16];
%define q1 [rsp + 16]
;__declspec(align(16)) char q1[16];
%define p2 [rsp + 32]
;__declspec(align(16)) char p2[16];
%define p1 [rsp + 48]
;__declspec(align(16)) char p1[16];
%define t0 [rsp + 64]
;__declspec(align(16)) char t0[16];
%define t1 [rsp + 80]
;__declspec(align(16)) char t1[16];
mov
rsi
,
arg
(
0
)
; u
mov
rdi
,
arg
(
5
)
; v
movsxd
rax
,
dword
ptr
arg
(
1
)
; src_pixel_step
mov
rcx
,
rax
neg
rax
; negate pitch to deal with above border
mov
rdx
,
arg
(
3
)
;limit
movdqa
xmm7
,
XMMWORD
PTR
[
rdx
]
lea
rsi
,
[
rsi
+
rcx
]
lea
rdi
,
[
rdi
+
rcx
]
; calculate breakout conditions and high edge variance
LFH_FILTER_AND_HEV_MASK
0
; filter and write back the results
MB_FILTER_AND_WRITEBACK
0
add
rsp
,
96
pop
rsp
; begin epilog
pop
rdi
pop
rsi
REST
ORE_GOT
REST
ORE_XMM
UNSHADOW_ARGS
pop
rbp
ret
%macro TRANSPOSE_16X8 2
movq
xmm4
,
QWORD
PTR
[
rsi
]
; xx xx xx xx xx xx xx xx 07 06 05 04 03 02 01 00
movq
xmm1
,
QWORD
PTR
[
rdi
]
; xx xx xx xx xx xx xx xx 17 16 15 14 13 12 11 10
...
...
vp8/common/x86/loopfilter_x86.c
View file @
bb550d90
...
...
@@ -9,35 +9,26 @@
*/
#include
<emmintrin.h>
// SSE2
#include
"vpx_config.h"
#include
"vp8/common/loopfilter.h"
prototype_loopfilter
(
vp8_mbloop_filter_vertical_edge_mmx
);
prototype_loopfilter
(
vp8_mbloop_filter_horizontal_edge_mmx
);
prototype_loopfilter
(
vp8_loop_filter_vertical_edge_mmx
);
prototype_loopfilter
(
vp8_loop_filter_horizontal_edge_mmx
);
prototype_loopfilter
(
vp8_loop_filter_vertical_edge_sse2
);
prototype_loopfilter
(
vp8_loop_filter_horizontal_edge_sse2
);
prototype_loopfilter
(
vp8_mbloop_filter_vertical_edge_sse2
);
prototype_loopfilter
(
vp8_mbloop_filter_horizontal_edge_sse2
);
extern
loop_filter_uvfunction
vp8_loop_filter_horizontal_edge_uv_sse2
;
extern
loop_filter_uvfunction
vp8_loop_filter_vertical_edge_uv_sse2
;
extern
loop_filter_uvfunction
vp8_mbloop_filter_horizontal_edge_uv_sse2
;
extern
loop_filter_uvfunction
vp8_mbloop_filter_vertical_edge_uv_sse2
;
#if HAVE_MMX
/* Horizontal MB filtering */
void
vp8_loop_filter_mbh_mmx
(
unsigned
char
*
y_ptr
,
unsigned
char
*
u_ptr
,
unsigned
char
*
v_ptr
,
int
y_stride
,
int
uv_stride
,
struct
loop_filter_info
*
lfi
)
{
vp8_mbloop_filter_horizontal_edge_mmx
(
y_ptr
,
y_stride
,
lfi
->
mblim
,
lfi
->
lim
,
lfi
->
hev_thr
,
2
);
if
(
u_ptr
)
vp8_mbloop_filter_horizontal_edge_mmx
(
u_ptr
,
uv_stride
,
lfi
->
mblim
,
lfi
->
lim
,
lfi
->
hev_thr
,
1
);
if
(
v_ptr
)
vp8_mbloop_filter_horizontal_edge_mmx
(
v_ptr
,
uv_stride
,
lfi
->
mblim
,
lfi
->
lim
,
lfi
->
hev_thr
,
1
);
}
...
...
@@ -57,15 +48,7 @@ void vp8_loop_filter_mbv_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsigne
/* Horizontal B Filtering */
void
vp8_loop_filter_bh_mmx
(
unsigned
char
*
y_ptr
,
unsigned
char
*
u_ptr
,
unsigned
char
*
v_ptr
,
int
y_stride
,
int
uv_stride
,
struct
loop_filter_info
*
lfi
)
{
vp8_loop_filter_horizontal_edge_mmx
(
y_ptr
+
4
*
y_stride
,
y_stride
,
lfi
->
blim
,
lfi
->
lim
,
lfi
->
hev_thr
,
2
);
vp8_loop_filter_horizontal_edge_mmx
(
y_ptr
+
8
*
y_stride
,
y_stride
,
lfi
->
blim
,
lfi
->
lim
,
lfi
->
hev_thr
,
2
);
vp8_loop_filter_horizontal_edge_mmx
(
y_ptr
+
12
*
y_stride
,
y_stride
,
lfi
->
blim
,
lfi
->
lim
,
lfi
->
hev_thr
,
2
);
if
(
u_ptr
)
vp8_loop_filter_horizontal_edge_mmx
(
u_ptr
+
4
*
uv_stride
,
uv_stride
,
lfi
->
blim
,
lfi
->
lim
,
lfi
->
hev_thr
,
1
);
if
(
v_ptr
)
vp8_loop_filter_horizontal_edge_mmx
(
v_ptr
+
4
*
uv_stride
,
uv_stride
,
lfi
->
blim
,
lfi
->
lim
,
lfi
->
hev_thr
,
1
);
}
...
...
@@ -99,16 +82,288 @@ void vp8_loop_filter_bvs_mmx(unsigned char *y_ptr, int y_stride, const unsigned
#endif
/* Horizontal MB filtering */
#if HAVE_SSE2
void
vp8_mbloop_filter_horizontal_edge_c_sse2
(
unsigned
char
*
s
,
int
p
,
const
unsigned
char
*
_blimit
,
const
unsigned
char
*
_limit
,
const
unsigned
char
*
_thresh
,
int
count
)
{
DECLARE_ALIGNED
(
16
,
unsigned
char
,
flat_op2
[
16
]);
DECLARE_ALIGNED
(
16
,
unsigned
char
,
flat_op1
[
16
]);
DECLARE_ALIGNED
(
16
,
unsigned
char
,
flat_op0
[
16
]);
DECLARE_ALIGNED
(
16
,
unsigned
char
,
flat_oq2
[
16
]);
DECLARE_ALIGNED
(
16
,
unsigned
char
,
flat_oq1
[
16
]);
DECLARE_ALIGNED
(
16
,
unsigned
char
,
flat_oq0
[
16
]);
__m128i
mask
,
hev
,
flat
;
__m128i
thresh
,
limit
,
blimit
;
const
__m128i
zero
=
_mm_set1_epi16
(
0
);
__m128i
p4
,
p3
,
p2
,
p1
,
p0
,
q0
,
q1
,
q2
,
q3
,
q4
;
thresh
=
_mm_shuffle_epi32
(
_mm_cvtsi32_si128
(
_thresh
[
0
]
*
0x01010101
),
0
);
limit
=
_mm_shuffle_epi32
(
_mm_cvtsi32_si128
(
_limit
[
0
]
*
0x01010101
),
0
);
blimit
=
_mm_shuffle_epi32
(
_mm_cvtsi32_si128
(
_blimit
[
0
]
*
0x01010101
),
0
);
p4
=
_mm_loadu_si128
((
__m128i
*
)(
s
-
5
*
p
));
p3
=
_mm_loadu_si128
((
__m128i
*
)(
s
-
4
*
p
));
p2
=
_mm_loadu_si128
((
__m128i
*
)(
s
-
3
*
p
));
p1
=
_mm_loadu_si128
((
__m128i
*
)(
s
-
2
*
p
));
p0
=
_mm_loadu_si128
((
__m128i
*
)(
s
-
1
*
p
));
q0
=
_mm_loadu_si128
((
__m128i
*
)(
s
-
0
*
p
));
q1
=
_mm_loadu_si128
((
__m128i
*
)(
s
+
1
*
p
));
q2
=
_mm_loadu_si128
((
__m128i
*
)(
s
+
2
*
p
));
q3
=
_mm_loadu_si128
((
__m128i
*
)(
s
+
3
*
p
));
q4
=
_mm_loadu_si128
((
__m128i
*
)(
s
+
4
*
p
));
{
const
__m128i
abs_p1p0
=
_mm_or_si128
(
_mm_subs_epu8
(
p1
,
p0
),
_mm_subs_epu8
(
p0
,
p1
));
const
__m128i
abs_q1q0
=
_mm_or_si128
(
_mm_subs_epu8
(
q1
,
q0
),
_mm_subs_epu8
(
q0
,
q1
));
const
__m128i
one
=
_mm_set1_epi8
(
1
);
const
__m128i
fe
=
_mm_set1_epi8
(
0xfe
);
const
__m128i
ff
=
_mm_cmpeq_epi8
(
abs_p1p0
,
abs_p1p0
);
__m128i
abs_p0q0
=
_mm_or_si128
(
_mm_subs_epu8
(
p0
,
q0
),
_mm_subs_epu8
(
q0
,
p0
));
__m128i
abs_p1q1
=
_mm_or_si128
(
_mm_subs_epu8
(
p1
,
q1
),
_mm_subs_epu8
(
q1
,
p1
));
__m128i
work
;
flat
=
_mm_max_epu8
(
abs_p1p0
,
abs_q1q0
);
hev
=
_mm_subs_epu8
(
flat
,
thresh
);
hev
=
_mm_xor_si128
(
_mm_cmpeq_epi8
(
hev
,
zero
),
ff
);
abs_p0q0
=
_mm_adds_epu8
(
abs_p0q0
,
abs_p0q0
);
abs_p1q1
=
_mm_srli_epi16
(
_mm_and_si128
(
abs_p1q1
,
fe
),
1
);
mask
=
_mm_subs_epu8
(
_mm_adds_epu8
(
abs_p0q0
,
abs_p1q1
),
blimit
);
mask
=
_mm_xor_si128
(
_mm_cmpeq_epi8
(
mask
,
zero
),
ff
);
// mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
mask
=
_mm_max_epu8
(
flat
,
mask
);
// mask |= (abs(p1 - p0) > limit) * -1;
// mask |= (abs(q1 - q0) > limit) * -1;
work
=
_mm_max_epu8
(
_mm_or_si128
(
_mm_subs_epu8
(
p2
,
p1
),
_mm_subs_epu8
(
p1