Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
Guillaume Martres
aom-rav1e
Commits
effaa326
Commit
effaa326
authored
May 10, 2013
by
Dmitry Kovalev
Browse files
Removing unused simple loopfilter code.
Change-Id: Ic11dc052fb641687c015e1bbc37181b9babcd43e
parent
4305dd47
Changes
7
Hide whitespace changes
Inline
Side-by-side
vp9/common/vp9_loopfilter_filters.c
View file @
effaa326
...
...
@@ -282,29 +282,6 @@ static INLINE void simple_filter(int8_t mask,
*
op0
=
signed_char_clamp
(
p0
+
filter2
)
^
0x80
;
}
/* Simple loop filter applied across a horizontal edge, 16 columns wide.
 *
 * s       points at the first pixel of the row directly below the edge.
 * p       is the pitch: the offset between vertically adjacent pixels.
 * blimit  supplies the edge threshold; only blimit[0] is read.
 *
 * For each column, the two pixels above the edge and the two at/below it are
 * passed by value to simple_filter_mask(); the resulting mask is then handed
 * to simple_filter() together with pointers to the same four pixels.
 */
void vp9_loop_filter_simple_horizontal_edge_c(uint8_t *s, int p,
                                              const uint8_t *blimit) {
  int col;

  for (col = 0; col < 16; ++col, ++s) {
    /* The four taps straddling the edge in this column. */
    uint8_t *const above1 = s - 2 * p;
    uint8_t *const above0 = s - 1 * p;
    uint8_t *const below0 = s;
    uint8_t *const below1 = s + 1 * p;
    const int8_t mask = simple_filter_mask(blimit[0], *above1, *above0,
                                           *below0, *below1);

    simple_filter(mask, above1, above0, below0, below1);
  }
}
/* Simple loop filter applied across a vertical edge, 16 rows tall.
 *
 * s       points at the first pixel immediately right of the edge.
 * p       is the pitch: the offset from one row to the next.
 * blimit  supplies the edge threshold; only blimit[0] is read.
 *
 * For each row, the two pixels left of the edge and the two at/right of it
 * are passed by value to simple_filter_mask(); the resulting mask is then
 * handed to simple_filter() together with pointers to the same four pixels.
 */
void vp9_loop_filter_simple_vertical_edge_c(uint8_t *s, int p,
                                            const uint8_t *blimit) {
  int row;

  for (row = 0; row < 16; ++row, s += p) {
    /* The four taps straddling the edge in this row. */
    uint8_t *const left1 = s - 2;
    uint8_t *const left0 = s - 1;
    uint8_t *const right0 = s;
    uint8_t *const right1 = s + 1;
    const int8_t mask = simple_filter_mask(blimit[0], *left1, *left0,
                                           *right0, *right1);

    simple_filter(mask, left1, left0, right0, right1);
  }
}
/* Vertical MB Filtering */
void
vp9_loop_filter_mbv_c
(
uint8_t
*
y_ptr
,
uint8_t
*
u_ptr
,
uint8_t
*
v_ptr
,
int
y_stride
,
int
uv_stride
,
...
...
@@ -392,11 +369,6 @@ void vp9_loop_filter_bh8x8_c(uint8_t *y, uint8_t *u, uint8_t *v,
lfi
->
blim
,
lfi
->
lim
,
lfi
->
hev_thr
,
1
);
}
/* Runs the simple horizontal-edge filter on the three edges located 4, 8 and
 * 12 rows below y, in that order. y_stride and blimit are forwarded
 * unchanged to each call. */
void vp9_loop_filter_bhs_c(uint8_t *y, int y_stride, const uint8_t *blimit) {
  int row;

  for (row = 4; row <= 12; row += 4) {
    vp9_loop_filter_simple_horizontal_edge_c(y + row * y_stride, y_stride,
                                             blimit);
  }
}
void
vp9_loop_filter_bv8x8_c
(
uint8_t
*
y
,
uint8_t
*
u
,
uint8_t
*
v
,
int
y_stride
,
int
uv_stride
,
...
...
@@ -413,12 +385,6 @@ void vp9_loop_filter_bv8x8_c(uint8_t *y, uint8_t *u, uint8_t *v,
lfi
->
blim
,
lfi
->
lim
,
lfi
->
hev_thr
,
1
);
}
/* Runs the simple vertical-edge filter on the three edges located 4, 8 and
 * 12 columns to the right of y, in that order. y_stride and blimit are
 * forwarded unchanged to each call. */
void vp9_loop_filter_bvs_c(uint8_t *y, int y_stride, const uint8_t *blimit) {
  int col;

  for (col = 4; col <= 12; col += 4) {
    vp9_loop_filter_simple_vertical_edge_c(y + col, y_stride, blimit);
  }
}
static
INLINE
void
wide_mbfilter
(
int8_t
mask
,
uint8_t
hev
,
uint8_t
flat
,
uint8_t
flat2
,
uint8_t
*
op7
,
uint8_t
*
op6
,
uint8_t
*
op5
,
...
...
vp9/common/vp9_rtcd_defs.sh
View file @
effaa326
...
...
@@ -128,30 +128,6 @@ specialize vp9_loop_filter_bh sse2
prototype void vp9_loop_filter_bh8x8
"uint8_t *y, uint8_t *u, uint8_t *v, int ystride, int uv_stride, struct loop_filter_info *lfi"
specialize vp9_loop_filter_bh8x8 sse2
prototype void vp9_loop_filter_simple_mbv
"uint8_t *y, int ystride, const uint8_t *blimit"
specialize vp9_loop_filter_simple_mbv mmx sse2
vp9_loop_filter_simple_mbv_c
=
vp9_loop_filter_simple_vertical_edge_c
vp9_loop_filter_simple_mbv_mmx
=
vp9_loop_filter_simple_vertical_edge_mmx
vp9_loop_filter_simple_mbv_sse2
=
vp9_loop_filter_simple_vertical_edge_sse2
prototype void vp9_loop_filter_simple_mbh
"uint8_t *y, int ystride, const uint8_t *blimit"
specialize vp9_loop_filter_simple_mbh mmx sse2
vp9_loop_filter_simple_mbh_c
=
vp9_loop_filter_simple_horizontal_edge_c
vp9_loop_filter_simple_mbh_mmx
=
vp9_loop_filter_simple_horizontal_edge_mmx
vp9_loop_filter_simple_mbh_sse2
=
vp9_loop_filter_simple_horizontal_edge_sse2
prototype void vp9_loop_filter_simple_bv
"uint8_t *y, int ystride, const uint8_t *blimit"
specialize vp9_loop_filter_simple_bv mmx sse2
vp9_loop_filter_simple_bv_c
=
vp9_loop_filter_bvs_c
vp9_loop_filter_simple_bv_mmx
=
vp9_loop_filter_bvs_mmx
vp9_loop_filter_simple_bv_sse2
=
vp9_loop_filter_bvs_sse2
prototype void vp9_loop_filter_simple_bh
"uint8_t *y, int ystride, const uint8_t *blimit"
specialize vp9_loop_filter_simple_bh mmx sse2
vp9_loop_filter_simple_bh_c
=
vp9_loop_filter_bhs_c
vp9_loop_filter_simple_bh_mmx
=
vp9_loop_filter_bhs_mmx
vp9_loop_filter_simple_bh_sse2
=
vp9_loop_filter_bhs_sse2
prototype void vp9_lpf_mbh_w
"unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi"
specialize vp9_lpf_mbh_w sse2
...
...
vp9/common/x86/vp9_loopfilter_intrin_mmx.c
View file @
effaa326
...
...
@@ -35,16 +35,6 @@ void vp9_loop_filter_bh_mmx(unsigned char *y_ptr,
}
/* Runs the MMX simple horizontal-edge filter on the three edges located 4, 8
 * and 12 rows below y_ptr, in that order. y_stride and blimit are forwarded
 * unchanged to each call. */
void vp9_loop_filter_bhs_mmx(unsigned char *y_ptr, int y_stride,
                             const unsigned char *blimit) {
  int row;

  for (row = 4; row <= 12; row += 4) {
    vp9_loop_filter_simple_horizontal_edge_mmx(y_ptr + row * y_stride,
                                               y_stride, blimit);
  }
}
/* Vertical B Filtering */
void
vp9_loop_filter_bv_mmx
(
unsigned
char
*
y_ptr
,
unsigned
char
*
u_ptr
,
unsigned
char
*
v_ptr
,
...
...
@@ -66,9 +56,3 @@ void vp9_loop_filter_bv_mmx(unsigned char *y_ptr,
lfi
->
blim
,
lfi
->
lim
,
lfi
->
hev_thr
,
1
);
}
/* Runs the MMX simple vertical-edge filter on the three edges located 4, 8
 * and 12 columns to the right of y_ptr, in that order. y_stride and blimit
 * are forwarded unchanged to each call. */
void vp9_loop_filter_bvs_mmx(unsigned char *y_ptr, int y_stride,
                             const unsigned char *blimit) {
  int col;

  for (col = 4; col <= 12; col += 4) {
    vp9_loop_filter_simple_vertical_edge_mmx(y_ptr + col, y_stride, blimit);
  }
}
vp9/common/x86/vp9_loopfilter_intrin_sse2.c
View file @
effaa326
...
...
@@ -1115,16 +1115,6 @@ void vp9_loop_filter_bh_sse2(unsigned char *y_ptr,
v_ptr
+
4
*
uv_stride
);
}
/* Runs the SSE2 simple horizontal-edge filter on the three edges located 4,
 * 8 and 12 rows below y_ptr, in that order. y_stride and blimit are
 * forwarded unchanged to each call. */
void vp9_loop_filter_bhs_sse2(unsigned char *y_ptr, int y_stride,
                              const unsigned char *blimit) {
  int row;

  for (row = 4; row <= 12; row += 4) {
    vp9_loop_filter_simple_horizontal_edge_sse2(y_ptr + row * y_stride,
                                                y_stride, blimit);
  }
}
/* Vertical B Filtering */
void
vp9_loop_filter_bv_sse2
(
unsigned
char
*
y_ptr
,
unsigned
char
*
u_ptr
,
unsigned
char
*
v_ptr
,
...
...
@@ -1143,9 +1133,3 @@ void vp9_loop_filter_bv_sse2(unsigned char *y_ptr,
v_ptr
+
4
);
}
/* Runs the SSE2 simple vertical-edge filter on the three edges located 4, 8
 * and 12 columns to the right of y_ptr, in that order. y_stride and blimit
 * are forwarded unchanged to each call. */
void vp9_loop_filter_bvs_sse2(unsigned char *y_ptr, int y_stride,
                              const unsigned char *blimit) {
  int col;

  for (col = 4; col <= 12; col += 4) {
    vp9_loop_filter_simple_vertical_edge_sse2(y_ptr + col, y_stride, blimit);
  }
}
vp9/common/x86/vp9_loopfilter_mmx.asm
View file @
effaa326
...
...
@@ -593,349 +593,6 @@ sym(vp9_loop_filter_vertical_edge_mmx):
pop
rbp
ret
;void vp9_loop_filter_simple_horizontal_edge_mmx
;(
;    unsigned char *src_ptr,
;    int  src_pixel_step,
;    const char *blimit
;)
;
; Simple loop filter across a horizontal edge, MMX version.
;   src_ptr        -> first pixel of the row just below the edge (q0 row)
;   src_pixel_step -> distance in bytes between rows
;   blimit         -> edge threshold; 8 bytes are loaded from it per pass
; The loop runs twice and handles 8 adjacent columns per pass (16 in total).
; Only the p0 row (src_ptr - step) and the q0 row (src_ptr) are written.
global sym(vp9_loop_filter_simple_horizontal_edge_mmx) PRIVATE
sym(vp9_loop_filter_simple_horizontal_edge_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 3
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

        mov         rsi, arg(0)             ;src_ptr
        movsxd      rax, dword ptr arg(1)   ;src_pixel_step     ; destination pitch?

        mov         rcx, 2                  ; count
.nexts8_h:
        mov         rdx, arg(2)             ;blimit           ; get blimit
        movq        mm3, [rdx]              ;

        mov         rdi, rsi                ; rdi points to row +1 for indirect addressing
        add         rdi, rax
        neg         rax                     ; rax = -step, so [rsi+rax] is the row above

        ; calculate mask
        movq        mm1, [rsi+2*rax]        ; p1
        movq        mm0, [rdi]              ; q1
        movq        mm2, mm1                ; keep p1 for the filter stage
        movq        mm7, mm0                ; keep q1 for the filter stage
        movq        mm4, mm0
        psubusb     mm0, mm1                ; q1-=p1
        psubusb     mm1, mm4                ; p1-=q1
        por         mm1, mm0                ; abs(p1-q1)
        pand        mm1, [GLOBAL(tfe)]      ; set lsb of each byte to zero
        psrlw       mm1, 1                  ; abs(p1-q1)/2

        movq        mm5, [rsi+rax]          ; p0
        movq        mm4, [rsi]              ; q0
        movq        mm0, mm4                ; q0
        movq        mm6, mm5                ; p0
        psubusb     mm5, mm4                ; p0-=q0
        psubusb     mm4, mm6                ; q0-=p0
        por         mm5, mm4                ; abs(p0 - q0)
        paddusb     mm5, mm5                ; abs(p0-q0)*2
        paddusb     mm5, mm1                ; abs (p0 - q0) *2 + abs(p1-q1)/2

        psubusb     mm5, mm3                ; abs(p0 - q0) *2 + abs(p1-q1)/2  > blimit
        pxor        mm3, mm3
        pcmpeqb     mm5, mm3                ; mm5 = mask: 0xff where the sum did not exceed blimit

        ; start work on filters
        pxor        mm2, [GLOBAL(t80)]      ; p1 offset to convert to signed values
        pxor        mm7, [GLOBAL(t80)]      ; q1 offset to convert to signed values
        psubsb      mm2, mm7                ; p1 - q1

        pxor        mm6, [GLOBAL(t80)]      ; offset to convert to signed values
        pxor        mm0, [GLOBAL(t80)]      ; offset to convert to signed values
        movq        mm3, mm0                ; q0
        psubsb      mm0, mm6                ; q0 - p0
        paddsb      mm2, mm0                ; p1 - q1 + 1 * (q0 - p0)
        paddsb      mm2, mm0                ; p1 - q1 + 2 * (q0 - p0)
        paddsb      mm2, mm0                ; p1 - q1 + 3 * (q0 - p0)
        pand        mm5, mm2                ; mask filter values we don't care about

        ; do + 4 side
        paddsb      mm5, [GLOBAL(t4)]       ; 3* (q0 - p0) + (p1 - q1) + 4

        ; Per-byte arithmetic >> 3 built from word shifts: the low byte of
        ; each word is moved to the high half, shifted, and moved back; the
        ; high byte is shifted by 8+3 and moved back up.
        movq        mm0, mm5                ; get a copy of filters
        psllw       mm0, 8                  ; shift left 8: low byte into the high half
        psraw       mm0, 3                  ; arithmetic shift right 3
        psrlw       mm0, 8                  ; shift right 8: result back in the low byte
        movq        mm1, mm5                ; get a copy of filters
        psraw       mm1, 11                 ; arithmetic shift right 11
        psllw       mm1, 8                  ; shift left 8 to put it back

        por         mm0, mm1                ; put the two together to get result

        psubsb      mm3, mm0                ; q0-= q0 add
        pxor        mm3, [GLOBAL(t80)]      ; unoffset
        movq        [rsi], mm3              ; write back

        ; now do +3 side
        psubsb      mm5, [GLOBAL(t1s)]      ; +3 instead of +4

        movq        mm0, mm5                ; get a copy of filters
        psllw       mm0, 8                  ; shift left 8: low byte into the high half
        psraw       mm0, 3                  ; arithmetic shift right 3
        psrlw       mm0, 8                  ; shift right 8: result back in the low byte
        psraw       mm5, 11                 ; arithmetic shift right 11
        psllw       mm5, 8                  ; shift left 8 to put it back
        por         mm0, mm5                ; put the two together to get result

        paddsb      mm6, mm0                ; p0+= p0 add
        pxor        mm6, [GLOBAL(t80)]      ; unoffset
        movq        [rsi+rax], mm6          ; write back

        add         rsi, 8                  ; advance to the next 8 columns
        neg         rax                     ; restore +step for the next pass
        dec         rcx
        jnz         .nexts8_h

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
;void vp9_loop_filter_simple_vertical_edge_mmx
;(
;    unsigned char *src_ptr,
;    int  src_pixel_step,
;    const char *blimit
;)
;
; Simple loop filter across a vertical edge, MMX version.
;   src_ptr        -> first pixel to the right of the edge in the top row
;   src_pixel_step -> distance in bytes between rows
;   blimit         -> edge threshold; 8 bytes are loaded from it per pass
; The loop runs twice and handles 8 rows per pass (16 in total). For each row
; the 4 bytes at columns -2..+1 relative to src_ptr are loaded, the 8x4 tile
; is transposed so that p1/p0/q0/q1 each occupy one register, the filter is
; applied, and the tile is transposed back and stored to the same locations.
; In the register-content comments "RC" means row R, column C of that tile.
global sym(vp9_loop_filter_simple_vertical_edge_mmx) PRIVATE
sym(vp9_loop_filter_simple_vertical_edge_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 3
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 32                     ; reserve 32 bytes
    %define t0  [rsp + 0]                   ;__declspec(align(16)) char t0[8];
    %define t1  [rsp + 16]                  ;__declspec(align(16)) char t1[8];

        mov         rsi, arg(0)             ;src_ptr
        movsxd      rax, dword ptr arg(1)   ;src_pixel_step     ; destination pitch?

        lea         rsi, [rsi + rax*4 - 2]  ; row 4 of the tile, two columns left of the edge
        mov         rcx, 2                  ; count
.nexts8_v:

        lea         rdi, [rsi + rax]        ; rdi = row 5 of the tile
        movd        mm0, [rdi + rax * 2]    ; xx xx xx xx 73 72 71 70

        movd        mm6, [rsi + rax * 2]    ; xx xx xx xx 63 62 61 60
        punpcklbw   mm6, mm0                ; 73 63 72 62 71 61 70 60

        movd        mm0, [rsi + rax]        ; xx xx xx xx 53 52 51 50
        movd        mm4, [rsi]              ; xx xx xx xx 43 42 41 40

        punpcklbw   mm4, mm0                ; 53 43 52 42 51 41 50 40
        movq        mm5, mm4                ; 53 43 52 42 51 41 50 40

        punpcklwd   mm4, mm6                ; 71 61 51 41 70 60 50 40
        punpckhwd   mm5, mm6                ; 73 63 53 43 72 62 52 42

        neg         rax                     ; rax = -step to reach rows 0..3

        movd        mm7, [rsi + rax]        ; xx xx xx xx 33 32 31 30
        movd        mm6, [rsi + rax * 2]    ; xx xx xx xx 23 22 21 20

        punpcklbw   mm6, mm7                ; 33 23 32 22 31 21 30 20
        movd        mm1, [rdi + rax * 4]    ; xx xx xx xx 13 12 11 10

        movd        mm0, [rsi + rax * 4]    ; xx xx xx xx 03 02 01 00
        punpcklbw   mm0, mm1                ; 13 03 12 02 11 01 10 00

        movq        mm2, mm0                ; 13 03 12 02 11 01 10 00
        punpcklwd   mm0, mm6                ; 31 21 11 01 30 20 10 00

        punpckhwd   mm2, mm6                ; 33 23 13 03 32 22 12 02
        movq        mm1, mm0                ; 31 21 11 01 30 20 10 00

        punpckldq   mm0, mm4                ; 70 60 50 40 30 20 10 00       = p1
        movq        mm3, mm2                ; 33 23 13 03 32 22 12 02

        punpckhdq   mm1, mm4                ; 71 61 51 41 31 21 11 01       = p0
        punpckldq   mm2, mm5                ; 72 62 52 42 32 22 12 02       = q0

        punpckhdq   mm3, mm5                ; 73 63 53 43 33 23 13 03       = q1

        ; calculate mask
        movq        mm6, mm0                ; p1
        movq        mm7, mm3                ; q1
        psubusb     mm7, mm6                ; q1-=p1
        psubusb     mm6, mm3                ; p1-=q1
        por         mm6, mm7                ; abs(p1-q1)
        pand        mm6, [GLOBAL(tfe)]      ; set lsb of each byte to zero
        psrlw       mm6, 1                  ; abs(p1-q1)/2

        movq        mm5, mm1                ; p0
        movq        mm4, mm2                ; q0

        psubusb     mm5, mm2                ; p0-=q0
        psubusb     mm4, mm1                ; q0-=p0

        por         mm5, mm4                ; abs(p0 - q0)
        paddusb     mm5, mm5                ; abs(p0-q0)*2
        paddusb     mm5, mm6                ; abs (p0 - q0) *2 + abs(p1-q1)/2

        mov         rdx, arg(2)             ;blimit                          ; get blimit
        movq        mm7, [rdx]

        psubusb     mm5, mm7                ; abs(p0 - q0) *2 + abs(p1-q1)/2  > blimit
        pxor        mm7, mm7
        pcmpeqb     mm5, mm7                ; mm5 = mask

        ; start work on filters
        movq        t0, mm0                 ; save unmodified p1 for the write-out transpose
        movq        t1, mm3                 ; save unmodified q1 for the write-out transpose

        pxor        mm0, [GLOBAL(t80)]      ; p1 offset to convert to signed values
        pxor        mm3, [GLOBAL(t80)]      ; q1 offset to convert to signed values

        psubsb      mm0, mm3                ; p1 - q1
        movq        mm6, mm1                ; p0

        movq        mm7, mm2                ; q0
        pxor        mm6, [GLOBAL(t80)]      ; offset to convert to signed values

        pxor        mm7, [GLOBAL(t80)]      ; offset to convert to signed values
        movq        mm3, mm7                ; offseted ; q0

        psubsb      mm7, mm6                ; q0 - p0
        paddsb      mm0, mm7                ; p1 - q1 + 1 * (q0 - p0)

        paddsb      mm0, mm7                ; p1 - q1 + 2 * (q0 - p0)
        paddsb      mm0, mm7                ; p1 - q1 + 3 * (q0 - p0)

        pand        mm5, mm0                ; mask filter values we don't care about

        paddsb      mm5, [GLOBAL(t4)]       ;  3* (q0 - p0) + (p1 - q1) + 4

        ; Per-byte arithmetic >> 3 built from word shifts: the low byte of
        ; each word is moved to the high half, shifted, and moved back; the
        ; high byte is shifted by 8+3 and moved back up.
        movq        mm0, mm5                ; get a copy of filters
        psllw       mm0, 8                  ; shift left 8: low byte into the high half
        psraw       mm0, 3                  ; arithmetic shift right 3
        psrlw       mm0, 8                  ; shift right 8: result back in the low byte

        movq        mm7, mm5                ; get a copy of filters
        psraw       mm7, 11                 ; arithmetic shift right 11
        psllw       mm7, 8                  ; shift left 8 to put it back

        por         mm0, mm7                ; put the two together to get result

        psubsb      mm3, mm0                ; q0-= q0sz add
        pxor        mm3, [GLOBAL(t80)]      ; unoffset

        ; now do +3 side
        psubsb      mm5, [GLOBAL(t1s)]      ; +3 instead of +4

        movq        mm0, mm5                ; get a copy of filters
        psllw       mm0, 8                  ; shift left 8: low byte into the high half
        psraw       mm0, 3                  ; arithmetic shift right 3
        psrlw       mm0, 8                  ; shift right 8: result back in the low byte

        psraw       mm5, 11                 ; arithmetic shift right 11
        psllw       mm5, 8                  ; shift left 8 to put it back
        por         mm0, mm5                ; put the two together to get result

        paddsb      mm6, mm0                ; p0+= p0 add
        pxor        mm6, [GLOBAL(t80)]      ; unoffset

        movq        mm0, t0                 ; reload unmodified p1
        movq        mm4, t1                 ; reload unmodified q1

        ; mm0 = 70 60 50 40 30 20 10 00
        ; mm6 = 71 61 51 41 31 21 11 01
        ; mm3 = 72 62 52 42 32 22 12 02
        ; mm4 = 73 63 53 43 33 23 13 03
        ; transpose back to write out

        movq        mm1, mm0                ;
        punpcklbw   mm0, mm6                ; 31 30 21 20 11 10 01 00

        punpckhbw   mm1, mm6                ; 71 70 61 60 51 50 41 40
        movq        mm2, mm3                ;

        punpcklbw   mm2, mm4                ; 33 32 23 22 13 12 03 02
        movq        mm5, mm1                ; 71 70 61 60 51 50 41 40

        punpckhbw   mm3, mm4                ; 73 72 63 62 53 52 43 42
        movq        mm6, mm0                ; 31 30 21 20 11 10 01 00

        punpcklwd   mm0, mm2                ; 13 12 11 10 03 02 01 00
        punpckhwd   mm6, mm2                ; 33 32 31 30 23 22 21 20

        movd        [rsi+rax*4], mm0        ; write 03 02 01 00
        punpcklwd   mm1, mm3                ; 53 52 51 50 43 42 41 40

        psrlq       mm0, 32                 ; xx xx xx xx 13 12 11 10
        punpckhwd   mm5, mm3                ; 73 72 71 70 63 62 61 60

        movd        [rdi+rax*4], mm0        ; write 13 12 11 10
        movd        [rsi+rax*2], mm6        ; write 23 22 21 20

        psrlq       mm6, 32                 ; 33 32 31 30
        movd        [rsi], mm1              ; write 43 42 41 40

        movd        [rsi + rax], mm6        ; write 33 32 31 30
        neg         rax                     ; back to +step for rows 5..7

        movd        [rsi + rax*2], mm5      ; write 63 62 61 60
        psrlq       mm1, 32                 ; 53 52 51 50

        movd        [rdi], mm1              ; write out 53 52 51 50
        psrlq       mm5, 32                 ; 73 72 71 70

        movd        [rdi + rax*2], mm5      ; write 73 72 71 70

        lea         rsi, [rsi+rax*8]        ; next 8

        dec         rcx
        jnz         .nexts8_v

    add         rsp, 32                     ; release the t0/t1 scratch area
    pop         rsp
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
; NOTE: the sketch below is stale. Its calls pass six arguments, whereas
; vp9_loop_filter_simple_vertical_edge_mmx above takes three
; (src_ptr, src_pixel_step, blimit).
;void fast_loop_filter_vertical_edges_mmx(unsigned char *y_ptr,
;                  int y_stride,
;                  loop_filter_info *lfi)
;{
;
;
;    vp9_loop_filter_simple_vertical_edge_mmx(y_ptr+4, y_stride, lfi->flim,lfi->lim,lfi->thr,2);
;    vp9_loop_filter_simple_vertical_edge_mmx(y_ptr+8, y_stride, lfi->flim,lfi->lim,lfi->thr,2);
;    vp9_loop_filter_simple_vertical_edge_mmx(y_ptr+12, y_stride, lfi->flim,lfi->lim,lfi->thr,2);
;}
SECTION
_RODATA
align
16
tfe:
...
...
vp9/common/x86/vp9_loopfilter_sse2.asm
View file @
effaa326
...
...
@@ -845,372 +845,6 @@ sym(vp9_loop_filter_vertical_edge_uv_sse2):
pop
rbp
ret
;void vp9_loop_filter_simple_horizontal_edge_sse2
;(
; unsigned char *src_ptr,
; int src_pixel_step,
; const char *blimit,
;)
global
sym
(
vp9_loop_filter_simple_horizontal_edge_sse2
)
PRIVATE
sym
(
vp9_loop_filter_simple_horizontal_edge_sse2
):
push
rbp
mov
rbp
,
rsp
SHADOW_ARGS_TO_STACK
3
SAVE_XMM
7
GET_GOT
rbx
push
rsi
push
rdi
; end prolog
mov
rsi
,
arg
(
0
)
;src_ptr
movsxd
rax
,
dword
ptr
arg
(
1
)
;src_pixel_step ; destination pitch?
mov
rdx
,
arg
(
2
)
;blimit
movdqa
xmm3
,
XMMWORD
PTR
[
rdx
]
mov
rdi
,
rsi
; rdi points to row +1 for indirect addressing
add
rdi
,
rax
neg
rax
; calculate mask
movdqa
xmm1
,
[
rsi
+
2
*
rax
]
; p1
movdqa
xmm0
,
[
rdi
]
; q1
movdqa
xmm2
,
xmm1
movdqa
xmm7
,
xmm0
movdqa
xmm4
,
xmm0
psubusb
xmm0
,
xmm1
; q1-=p1
psubusb
xmm1
,
xmm4
; p1-=q1
por
xmm1
,
xmm0
; abs(p1-q1)
pand
xmm1
,
[
GLOBAL
(
tfe
)]
; set lsb of each byte to zero
psrlw
xmm1
,
1
; abs(p1-q1)/2
movdqa
xmm5
,
[
rsi
+
rax
]
; p0
movdqa
xmm4
,
[
rsi
]
; q0
movdqa
xmm0
,
xmm4
; q0
movdqa
xmm6
,
xmm5
; p0
psubusb
xmm5
,
xmm4
; p0-=q0
psubusb
xmm4
,
xmm6
; q0-=p0
por
xmm5
,
xmm4
; abs(p0 - q0)
paddusb
xmm5
,
xmm5
; abs(p0-q0)*2
paddusb
xmm5
,
xmm1
; abs (p0 - q0) *2 + abs(p1-q1)/2
psubusb
xmm5
,
xmm3
; abs(p0 - q0) *2 + abs(p1-q1)/2 > blimit
pxor
xmm3
,
xmm3
pcmpeqb
xmm5
,
xmm3
; start work on filters
pxor
xmm2
,
[
GLOBAL
(
t80
)]
; p1 offset to convert to signed values
pxor
xmm7
,
[
GLOBAL
(
t80
)]
; q1 offset to convert to signed values
psubsb
xmm2
,
xmm7
; p1 - q1
pxor
xmm6
,
[
GLOBAL
(
t80
)]
; offset to convert to signed values
pxor
xmm0
,
[
GLOBAL
(
t80
)]
; offset to convert to signed values
movdqa
xmm3
,
xmm0
; q0
psubsb
xmm0
,
xmm6
; q0 - p0
paddsb
xmm2
,
xmm0
; p1 - q1 + 1 * (q0 - p0)
paddsb
xmm2
,
xmm0
; p1 - q1 + 2 * (q0 - p0)
paddsb
xmm2
,
xmm0
; p1 - q1 + 3 * (q0 - p0)
pand
xmm5
,
xmm2
; mask filter values we don't care about
; do + 4 side
paddsb
xmm5
,
[
GLOBAL
(
t4
)]
; 3* (q0 - p0) + (p1 - q1) + 4
movdqa
xmm0
,
xmm5
; get a copy of filters
psllw
xmm0
,
8
; shift left 8
psraw
xmm0
,
3
; arithmetic shift right 11
psrlw
xmm0
,
8
movdqa
xmm1
,
xmm5
; get a copy of filters
psraw
xmm1
,
11
; arithmetic shift right 11
psllw
xmm1
,
8
; shift left 8 to put it back
por
xmm0
,
xmm1
; put the two together to get result
psubsb
xmm3
,
xmm0
; q0-= q0 add
pxor
xmm3
,
[
GLOBAL
(
t80
)]
; unoffset
movdqa
[
rsi
],
xmm3
; write back
; now do +3 side
psubsb
xmm5
,
[
GLOBAL
(
t1s
)]
; +3 instead of +4
movdqa
xmm0
,
xmm5
; get a copy of filters
psllw
xmm0
,
8
; shift left 8
psraw
xmm0
,
3
; arithmetic shift right 11
psrlw
xmm0
,
8
psraw
xmm5
,
11
; arithmetic shift right 11
psllw
xmm5
,
8
; shift left 8 to put it back
por
xmm0
,
xmm5
; put the two together to get result
paddsb
xmm6
,
xmm0
; p0+= p0 add
pxor
xmm6
,
[
GLOBAL
(
t80
)]
; unoffset
movdqa
[
rsi
+
rax
],
xmm6
; write back
; begin epilog
pop
rdi
pop
rsi
REST
ORE_GOT
REST
ORE_XMM
UNSHADOW_ARGS