Skip to content
GitLab
Projects
Groups
Snippets
Help
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
A
aom-rav1e
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Issues
0
Issues
0
List
Boards
Labels
Service Desk
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Operations
Operations
Incidents
Environments
Packages & Registries
Packages & Registries
Container Registry
Analytics
Analytics
CI / CD
Repository
Value Stream
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Xiph.Org
aom-rav1e
Commits
4a559d34
Commit
4a559d34
authored
May 10, 2013
by
Dmitry Kovalev
Committed by
Gerrit Code Review
May 10, 2013
Browse files
Options
Browse Files
Download
Plain Diff
Merge "Removing unused simple loopfilter code." into experimental
parents
9755d9fd
effaa326
Changes
7
Hide whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
0 additions
and
807 deletions
+0
-807
vp9/common/vp9_loopfilter_filters.c
vp9/common/vp9_loopfilter_filters.c
+0
-34
vp9/common/vp9_rtcd_defs.sh
vp9/common/vp9_rtcd_defs.sh
+0
-24
vp9/common/x86/vp9_loopfilter_intrin_mmx.c
vp9/common/x86/vp9_loopfilter_intrin_mmx.c
+0
-16
vp9/common/x86/vp9_loopfilter_intrin_sse2.c
vp9/common/x86/vp9_loopfilter_intrin_sse2.c
+0
-16
vp9/common/x86/vp9_loopfilter_mmx.asm
vp9/common/x86/vp9_loopfilter_mmx.asm
+0
-343
vp9/common/x86/vp9_loopfilter_sse2.asm
vp9/common/x86/vp9_loopfilter_sse2.asm
+0
-366
vp9/common/x86/vp9_loopfilter_x86.h
vp9/common/x86/vp9_loopfilter_x86.h
+0
-8
No files found.
vp9/common/vp9_loopfilter_filters.c
View file @
4a559d34
...
...
@@ -282,29 +282,6 @@ static INLINE void simple_filter(int8_t mask,
*
op0
=
signed_char_clamp
(
p0
+
filter2
)
^
0x80
;
}
/* Simple loop filter across a horizontal edge.
 *
 * s      - pointer to the first pixel of the row just below the edge
 * p      - pitch (distance in bytes between vertically adjacent pixels)
 * blimit - only blimit[0] is read; it is handed to simple_filter_mask()
 *
 * Sixteen consecutive columns are visited.  For each column the four taps
 * straddling the edge (two rows above, two rows at/below) are passed to
 * simple_filter_mask() and then to simple_filter(), which may modify them.
 */
void vp9_loop_filter_simple_horizontal_edge_c(uint8_t *s, int p,
                                              const uint8_t *blimit) {
  int col;

  for (col = 0; col < 16; ++col, ++s) {
    uint8_t *const tap_p1 = s - 2 * p;  /* two rows above the edge */
    uint8_t *const tap_p0 = s - 1 * p;  /* one row above the edge  */
    uint8_t *const tap_q0 = s;          /* first row below         */
    uint8_t *const tap_q1 = s + 1 * p;  /* second row below        */
    const int8_t mask = simple_filter_mask(blimit[0], *tap_p1, *tap_p0,
                                           *tap_q0, *tap_q1);

    simple_filter(mask, tap_p1, tap_p0, tap_q0, tap_q1);
  }
}
/* Simple loop filter across a vertical edge.
 *
 * s      - pointer to the first pixel immediately right of the edge
 * p      - pitch (distance in bytes from one row to the next)
 * blimit - only blimit[0] is read; it is handed to simple_filter_mask()
 *
 * Sixteen consecutive rows are visited.  For each row the four horizontally
 * adjacent taps s[-2], s[-1], s[0], s[1] are passed to simple_filter_mask()
 * and then to simple_filter(), which may modify them.
 */
void vp9_loop_filter_simple_vertical_edge_c(uint8_t *s, int p,
                                            const uint8_t *blimit) {
  int row;

  for (row = 0; row < 16; ++row, s += p) {
    uint8_t *const tap_p1 = s - 2;  /* two pixels left of the edge */
    uint8_t *const tap_p0 = s - 1;  /* one pixel left of the edge  */
    uint8_t *const tap_q0 = s;      /* first pixel right           */
    uint8_t *const tap_q1 = s + 1;  /* second pixel right          */
    const int8_t mask = simple_filter_mask(blimit[0], *tap_p1, *tap_p0,
                                           *tap_q0, *tap_q1);

    simple_filter(mask, tap_p1, tap_p0, tap_q0, tap_q1);
  }
}
/* Vertical MB Filtering */
void
vp9_loop_filter_mbv_c
(
uint8_t
*
y_ptr
,
uint8_t
*
u_ptr
,
uint8_t
*
v_ptr
,
int
y_stride
,
int
uv_stride
,
...
...
@@ -392,11 +369,6 @@ void vp9_loop_filter_bh8x8_c(uint8_t *y, uint8_t *u, uint8_t *v,
lfi
->
blim
,
lfi
->
lim
,
lfi
->
hev_thr
,
1
);
}
/* Runs the simple horizontal-edge filter on the three interior horizontal
 * edges located 4, 8 and 12 rows below y, in that order. */
void vp9_loop_filter_bhs_c(uint8_t *y, int y_stride, const uint8_t *blimit) {
  int edge_row;

  for (edge_row = 4; edge_row <= 12; edge_row += 4) {
    vp9_loop_filter_simple_horizontal_edge_c(y + edge_row * y_stride,
                                             y_stride, blimit);
  }
}
void
vp9_loop_filter_bv8x8_c
(
uint8_t
*
y
,
uint8_t
*
u
,
uint8_t
*
v
,
int
y_stride
,
int
uv_stride
,
...
...
@@ -413,12 +385,6 @@ void vp9_loop_filter_bv8x8_c(uint8_t *y, uint8_t *u, uint8_t *v,
lfi
->
blim
,
lfi
->
lim
,
lfi
->
hev_thr
,
1
);
}
/* Runs the simple vertical-edge filter on the three interior vertical
 * edges located 4, 8 and 12 pixels to the right of y, in that order. */
void vp9_loop_filter_bvs_c(uint8_t *y, int y_stride, const uint8_t *blimit) {
  int edge_col;

  for (edge_col = 4; edge_col <= 12; edge_col += 4) {
    vp9_loop_filter_simple_vertical_edge_c(y + edge_col, y_stride, blimit);
  }
}
static
INLINE
void
wide_mbfilter
(
int8_t
mask
,
uint8_t
hev
,
uint8_t
flat
,
uint8_t
flat2
,
uint8_t
*
op7
,
uint8_t
*
op6
,
uint8_t
*
op5
,
...
...
vp9/common/vp9_rtcd_defs.sh
View file @
4a559d34
...
...
@@ -128,30 +128,6 @@ specialize vp9_loop_filter_bh sse2
# Each entry: `prototype` declares the dispatch symbol and its argument list,
# `specialize` lists the optimized variants, and the `name_variant=impl`
# assignments alias a variant to a function with a different name.
# A shell assignment must be a single word (no spaces around `=`).

prototype void vp9_loop_filter_bh8x8 "uint8_t *y, uint8_t *u, uint8_t *v, int ystride, int uv_stride, struct loop_filter_info *lfi"
specialize vp9_loop_filter_bh8x8 sse2

# Simple filter, macroblock vertical edge.
prototype void vp9_loop_filter_simple_mbv "uint8_t *y, int ystride, const uint8_t *blimit"
specialize vp9_loop_filter_simple_mbv mmx sse2
vp9_loop_filter_simple_mbv_c=vp9_loop_filter_simple_vertical_edge_c
vp9_loop_filter_simple_mbv_mmx=vp9_loop_filter_simple_vertical_edge_mmx
vp9_loop_filter_simple_mbv_sse2=vp9_loop_filter_simple_vertical_edge_sse2

# Simple filter, macroblock horizontal edge.
prototype void vp9_loop_filter_simple_mbh "uint8_t *y, int ystride, const uint8_t *blimit"
specialize vp9_loop_filter_simple_mbh mmx sse2
vp9_loop_filter_simple_mbh_c=vp9_loop_filter_simple_horizontal_edge_c
vp9_loop_filter_simple_mbh_mmx=vp9_loop_filter_simple_horizontal_edge_mmx
vp9_loop_filter_simple_mbh_sse2=vp9_loop_filter_simple_horizontal_edge_sse2

# Simple filter, interior (block) vertical edges.
prototype void vp9_loop_filter_simple_bv "uint8_t *y, int ystride, const uint8_t *blimit"
specialize vp9_loop_filter_simple_bv mmx sse2
vp9_loop_filter_simple_bv_c=vp9_loop_filter_bvs_c
vp9_loop_filter_simple_bv_mmx=vp9_loop_filter_bvs_mmx
vp9_loop_filter_simple_bv_sse2=vp9_loop_filter_bvs_sse2

# Simple filter, interior (block) horizontal edges.
prototype void vp9_loop_filter_simple_bh "uint8_t *y, int ystride, const uint8_t *blimit"
specialize vp9_loop_filter_simple_bh mmx sse2
vp9_loop_filter_simple_bh_c=vp9_loop_filter_bhs_c
vp9_loop_filter_simple_bh_mmx=vp9_loop_filter_bhs_mmx
vp9_loop_filter_simple_bh_sse2=vp9_loop_filter_bhs_sse2

prototype void vp9_lpf_mbh_w "unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi"
specialize vp9_lpf_mbh_w sse2
...
...
vp9/common/x86/vp9_loopfilter_intrin_mmx.c
View file @
4a559d34
...
...
@@ -35,16 +35,6 @@ void vp9_loop_filter_bh_mmx(unsigned char *y_ptr,
}
/* Horizontal B filtering, simple filter (MMX): applies the simple
 * horizontal-edge filter to the edges 4, 8 and 12 rows below y_ptr. */
void vp9_loop_filter_bhs_mmx(unsigned char *y_ptr, int y_stride,
                             const unsigned char *blimit) {
  int edge_row;

  for (edge_row = 4; edge_row <= 12; edge_row += 4) {
    vp9_loop_filter_simple_horizontal_edge_mmx(y_ptr + edge_row * y_stride,
                                               y_stride, blimit);
  }
}
/* Vertical B Filtering */
void
vp9_loop_filter_bv_mmx
(
unsigned
char
*
y_ptr
,
unsigned
char
*
u_ptr
,
unsigned
char
*
v_ptr
,
...
...
@@ -66,9 +56,3 @@ void vp9_loop_filter_bv_mmx(unsigned char *y_ptr,
lfi
->
blim
,
lfi
->
lim
,
lfi
->
hev_thr
,
1
);
}
/* Vertical B filtering, simple filter (MMX): applies the simple
 * vertical-edge filter to the edges 4, 8 and 12 pixels right of y_ptr. */
void vp9_loop_filter_bvs_mmx(unsigned char *y_ptr, int y_stride,
                             const unsigned char *blimit) {
  int edge_col;

  for (edge_col = 4; edge_col <= 12; edge_col += 4) {
    vp9_loop_filter_simple_vertical_edge_mmx(y_ptr + edge_col, y_stride,
                                             blimit);
  }
}
vp9/common/x86/vp9_loopfilter_intrin_sse2.c
View file @
4a559d34
...
...
@@ -1115,16 +1115,6 @@ void vp9_loop_filter_bh_sse2(unsigned char *y_ptr,
v_ptr
+
4
*
uv_stride
);
}
/* Horizontal B filtering, simple filter (SSE2): applies the simple
 * horizontal-edge filter to the edges 4, 8 and 12 rows below y_ptr. */
void vp9_loop_filter_bhs_sse2(unsigned char *y_ptr, int y_stride,
                              const unsigned char *blimit) {
  int edge_row;

  for (edge_row = 4; edge_row <= 12; edge_row += 4) {
    vp9_loop_filter_simple_horizontal_edge_sse2(y_ptr + edge_row * y_stride,
                                                y_stride, blimit);
  }
}
/* Vertical B Filtering */
void
vp9_loop_filter_bv_sse2
(
unsigned
char
*
y_ptr
,
unsigned
char
*
u_ptr
,
unsigned
char
*
v_ptr
,
...
...
@@ -1143,9 +1133,3 @@ void vp9_loop_filter_bv_sse2(unsigned char *y_ptr,
v_ptr
+
4
);
}
/* Vertical B filtering, simple filter (SSE2): applies the simple
 * vertical-edge filter to the edges 4, 8 and 12 pixels right of y_ptr. */
void vp9_loop_filter_bvs_sse2(unsigned char *y_ptr, int y_stride,
                              const unsigned char *blimit) {
  int edge_col;

  for (edge_col = 4; edge_col <= 12; edge_col += 4) {
    vp9_loop_filter_simple_vertical_edge_sse2(y_ptr + edge_col, y_stride,
                                              blimit);
  }
}
vp9/common/x86/vp9_loopfilter_mmx.asm
View file @
4a559d34
...
...
@@ -593,349 +593,6 @@ sym(vp9_loop_filter_vertical_edge_mmx):
pop
rbp
ret
;-----------------------------------------------------------------------
;void vp9_loop_filter_simple_horizontal_edge_mmx
;(
;    unsigned char *src_ptr,
;    int  src_pixel_step,
;    const char *blimit
;)
;
; Simple loop filter across a horizontal edge, 16 pixels wide, processed
; as two passes of 8 bytes (rcx = 2, rsi advances by 8 per pass).
;
; In:    arg(0) = src_ptr         first row below the edge (q0 row)
;        arg(1) = src_pixel_step  row pitch, sign-extended to 64 bits
;        arg(2) = blimit          8 bytes are read from it on each pass
; Out:   rows [src_ptr - pitch] (p0) and [src_ptr] (q0) are rewritten
; Uses:  rax, rcx, rdx, rsi, rdi, mm0-mm7 (rsi/rdi are saved/restored)
; Note:  no emms is issued in this routine.
;        arg(), GET_GOT, GLOBAL() and the SHADOW/UNSHADOW macros come from
;        the project's ABI support include - presumably x86_abi_support.asm;
;        confirm there.  tfe, t80, t4 and t1s are per-byte constants
;        defined elsewhere in this file.
;-----------------------------------------------------------------------
global sym(vp9_loop_filter_simple_horizontal_edge_mmx) PRIVATE
sym(vp9_loop_filter_simple_horizontal_edge_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 3
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

        mov         rsi, arg(0)             ; src_ptr
        movsxd      rax, dword ptr arg(1)   ; src_pixel_step (row pitch)

        mov         rcx, 2                  ; two passes of 8 pixels
.nexts8_h:
        mov         rdx, arg(2)             ; blimit
        movq        mm3, [rdx]              ; mm3 = blimit bytes

        mov         rdi, rsi                ; rdi = row +1 (q1)
        add         rdi, rax
        neg         rax                     ; rax = -pitch for the rows above

        ; calculate mask
        movq        mm1, [rsi+2*rax]        ; p1
        movq        mm0, [rdi]              ; q1
        movq        mm2, mm1                ; keep p1 for the filter
        movq        mm7, mm0                ; keep q1 for the filter
        movq        mm4, mm0
        psubusb     mm0, mm1                ; q1-=p1
        psubusb     mm1, mm4                ; p1-=q1
        por         mm1, mm0                ; abs(p1-q1)
        pand        mm1, [GLOBAL(tfe)]      ; clear lsb of each byte so the
                                            ; word shift cannot leak across bytes
        psrlw       mm1, 1                  ; abs(p1-q1)/2

        movq        mm5, [rsi+rax]          ; p0
        movq        mm4, [rsi]              ; q0
        movq        mm0, mm4                ; q0
        movq        mm6, mm5                ; p0
        psubusb     mm5, mm4                ; p0-=q0
        psubusb     mm4, mm6                ; q0-=p0
        por         mm5, mm4                ; abs(p0 - q0)
        paddusb     mm5, mm5                ; abs(p0-q0)*2
        paddusb     mm5, mm1                ; abs(p0-q0)*2 + abs(p1-q1)/2

        psubusb     mm5, mm3                ; nonzero where the sum > blimit
        pxor        mm3, mm3
        pcmpeqb     mm5, mm3                ; mm5 = 0xff where sum <= blimit

        ; start work on filters
        pxor        mm2, [GLOBAL(t80)]      ; p1 offset to convert to signed values
        pxor        mm7, [GLOBAL(t80)]      ; q1 offset to convert to signed values
        psubsb      mm2, mm7                ; p1 - q1

        pxor        mm6, [GLOBAL(t80)]      ; p0 offset to convert to signed values
        pxor        mm0, [GLOBAL(t80)]      ; q0 offset to convert to signed values
        movq        mm3, mm0                ; q0
        psubsb      mm0, mm6                ; q0 - p0
        paddsb      mm2, mm0                ; p1 - q1 + 1 * (q0 - p0)
        paddsb      mm2, mm0                ; p1 - q1 + 2 * (q0 - p0)
        paddsb      mm2, mm0                ; p1 - q1 + 3 * (q0 - p0)
        pand        mm5, mm2                ; mask filter values we don't care about

        ; do + 4 side
        paddsb      mm5, [GLOBAL(t4)]       ; 3* (q0 - p0) + (p1 - q1) + 4

        ; per-byte signed >>3, built from word shifts:
        movq        mm0, mm5                ; get a copy of filters
        psllw       mm0, 8                  ; low byte -> high byte of each word
        psraw       mm0, 3                  ; arithmetic shift right 3
        psrlw       mm0, 8                  ; back to low byte: (low >> 3)
        movq        mm1, mm5                ; get a copy of filters
        psraw       mm1, 11                 ; high byte >> 3, now in low byte
        psllw       mm1, 8                  ; shift left 8 to put it back

        por         mm0, mm1                ; put the two together to get result

        psubsb      mm3, mm0                ; q0 -= filter
        pxor        mm3, [GLOBAL(t80)]      ; unoffset
        movq        [rsi], mm3              ; write back q0

        ; now do +3 side
        psubsb      mm5, [GLOBAL(t1s)]      ; +3 instead of +4

        movq        mm0, mm5                ; get a copy of filters
        psllw       mm0, 8                  ; low byte -> high byte of each word
        psraw       mm0, 3                  ; arithmetic shift right 3
        psrlw       mm0, 8                  ; back to low byte: (low >> 3)
        psraw       mm5, 11                 ; high byte >> 3, now in low byte
        psllw       mm5, 8                  ; shift left 8 to put it back
        por         mm0, mm5                ; put the two together to get result

        paddsb      mm6, mm0                ; p0 += filter
        pxor        mm6, [GLOBAL(t80)]      ; unoffset
        movq        [rsi+rax], mm6          ; write back p0

        add         rsi, 8                  ; next 8 pixels
        neg         rax                     ; restore +pitch for the next pass
        dec         rcx
        jnz         .nexts8_h

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
;-----------------------------------------------------------------------
;void vp9_loop_filter_simple_vertical_edge_mmx
;(
;    unsigned char *src_ptr,
;    int  src_pixel_step,
;    const char *blimit
;)
;
; Simple loop filter across a vertical edge, 16 rows tall, processed as
; two passes of 8 rows (rcx = 2, rsi advances by 8*pitch per pass).  Each
; pass loads 4 bytes (p1 p0 q0 q1) from 8 rows, transposes them into four
; 8-byte columns, filters, transposes back and stores 4 bytes per row.
;
; In:    arg(0) = src_ptr         first pixel right of the edge, top row
;        arg(1) = src_pixel_step  row pitch, sign-extended to 64 bits
;        arg(2) = blimit          8 bytes are read from it on each pass
; Out:   4 bytes starting at column -2 of each of the 16 rows are rewritten
; Uses:  rax, rcx, rdx, rsi, rdi, mm0-mm7, 32 bytes of aligned stack
; Note:  no emms is issued in this routine.
;        arg(), GET_GOT, GLOBAL(), ALIGN_STACK and the SHADOW/UNSHADOW
;        macros come from the project's ABI support include - presumably
;        x86_abi_support.asm; confirm there.  The `pop rsp` in the epilog
;        presumably undoes a push made by ALIGN_STACK - confirm.
;        tfe, t80, t4 and t1s are per-byte constants defined elsewhere in
;        this file.
;-----------------------------------------------------------------------
global sym(vp9_loop_filter_simple_vertical_edge_mmx) PRIVATE
sym(vp9_loop_filter_simple_vertical_edge_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 3
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 32                     ; reserve 32 bytes
    %define t0  [rsp + 0]                   ;__declspec(align(16)) char t0[8];
    %define t1  [rsp + 16]                  ;__declspec(align(16)) char t1[8];

        mov         rsi, arg(0)             ; src_ptr
        movsxd      rax, dword ptr arg(1)   ; src_pixel_step (row pitch)

        lea         rsi, [rsi + rax*4 - 2]  ; rsi = row 4, two pixels left of edge
        mov         rcx, 2                  ; two passes of 8 rows
.nexts8_v:

        lea         rdi, [rsi + rax]        ; rdi = row 5
        movd        mm0, [rdi + rax * 2]    ; xx xx xx xx 73 72 71 70

        movd        mm6, [rsi + rax * 2]    ; xx xx xx xx 63 62 61 60
        punpcklbw   mm6, mm0                ; 73 63 72 62 71 61 70 60

        movd        mm0, [rsi + rax]        ; xx xx xx xx 53 52 51 50
        movd        mm4, [rsi]              ; xx xx xx xx 43 42 41 40

        punpcklbw   mm4, mm0                ; 53 43 52 42 51 41 50 40
        movq        mm5, mm4                ; 53 43 52 42 51 41 50 40

        punpcklwd   mm4, mm6                ; 71 61 51 41 70 60 50 40
        punpckhwd   mm5, mm6                ; 73 63 53 43 72 62 52 42

        neg         rax                     ; rax = -pitch for rows 0-3

        movd        mm7, [rsi + rax]        ; xx xx xx xx 33 32 31 30
        movd        mm6, [rsi + rax * 2]    ; xx xx xx xx 23 22 21 20

        punpcklbw   mm6, mm7                ; 33 23 32 22 31 21 30 20
        movd        mm1, [rdi + rax * 4]    ; xx xx xx xx 13 12 11 10

        movd        mm0, [rsi + rax * 4]    ; xx xx xx xx 03 02 01 00
        punpcklbw   mm0, mm1                ; 13 03 12 02 11 01 10 00

        movq        mm2, mm0                ; 13 03 12 02 11 01 10 00
        punpcklwd   mm0, mm6                ; 31 21 11 01 30 20 10 00

        punpckhwd   mm2, mm6                ; 33 23 13 03 32 22 12 02
        movq        mm1, mm0                ; 31 21 11 01 30 20 10 00

        punpckldq   mm0, mm4                ; 70 60 50 40 30 20 10 00 = p1
        movq        mm3, mm2                ; 33 23 13 03 32 22 12 02

        punpckhdq   mm1, mm4                ; 71 61 51 41 31 21 11 01 = p0
        punpckldq   mm2, mm5                ; 72 62 52 42 32 22 12 02 = q0

        punpckhdq   mm3, mm5                ; 73 63 53 43 33 23 13 03 = q1

        ; calculate mask
        movq        mm6, mm0                ; p1
        movq        mm7, mm3                ; q1
        psubusb     mm7, mm6                ; q1-=p1
        psubusb     mm6, mm3                ; p1-=q1
        por         mm6, mm7                ; abs(p1-q1)
        pand        mm6, [GLOBAL(tfe)]      ; clear lsb of each byte so the
                                            ; word shift cannot leak across bytes
        psrlw       mm6, 1                  ; abs(p1-q1)/2

        movq        mm5, mm1                ; p0
        movq        mm4, mm2                ; q0

        psubusb     mm5, mm2                ; p0-=q0
        psubusb     mm4, mm1                ; q0-=p0

        por         mm5, mm4                ; abs(p0 - q0)
        paddusb     mm5, mm5                ; abs(p0-q0)*2
        paddusb     mm5, mm6                ; abs(p0-q0)*2 + abs(p1-q1)/2

        mov         rdx, arg(2)             ; blimit
        movq        mm7, [rdx]              ; mm7 = blimit bytes

        psubusb     mm5, mm7                ; nonzero where the sum > blimit
        pxor        mm7, mm7
        pcmpeqb     mm5, mm7                ; mm5 = mask (0xff where sum <= blimit)

        ; start work on filters
        movq        t0, mm0                 ; save unmodified p1 for write-out
        movq        t1, mm3                 ; save unmodified q1 for write-out

        pxor        mm0, [GLOBAL(t80)]      ; p1 offset to convert to signed values
        pxor        mm3, [GLOBAL(t80)]      ; q1 offset to convert to signed values

        psubsb      mm0, mm3                ; p1 - q1
        movq        mm6, mm1                ; p0

        movq        mm7, mm2                ; q0
        pxor        mm6, [GLOBAL(t80)]      ; p0 offset to convert to signed values

        pxor        mm7, [GLOBAL(t80)]      ; q0 offset to convert to signed values
        movq        mm3, mm7                ; offset q0

        psubsb      mm7, mm6                ; q0 - p0
        paddsb      mm0, mm7                ; p1 - q1 + 1 * (q0 - p0)

        paddsb      mm0, mm7                ; p1 - q1 + 2 * (q0 - p0)
        paddsb      mm0, mm7                ; p1 - q1 + 3 * (q0 - p0)

        pand        mm5, mm0                ; mask filter values we don't care about

        paddsb      mm5, [GLOBAL(t4)]       ; 3* (q0 - p0) + (p1 - q1) + 4

        ; per-byte signed >>3, built from word shifts:
        movq        mm0, mm5                ; get a copy of filters
        psllw       mm0, 8                  ; low byte -> high byte of each word
        psraw       mm0, 3                  ; arithmetic shift right 3
        psrlw       mm0, 8                  ; back to low byte: (low >> 3)

        movq        mm7, mm5                ; get a copy of filters
        psraw       mm7, 11                 ; high byte >> 3, now in low byte
        psllw       mm7, 8                  ; shift left 8 to put it back

        por         mm0, mm7                ; put the two together to get result

        psubsb      mm3, mm0                ; q0 -= filter
        pxor        mm3, [GLOBAL(t80)]      ; unoffset

        ; now do +3 side
        psubsb      mm5, [GLOBAL(t1s)]      ; +3 instead of +4

        movq        mm0, mm5                ; get a copy of filters
        psllw       mm0, 8                  ; low byte -> high byte of each word
        psraw       mm0, 3                  ; arithmetic shift right 3
        psrlw       mm0, 8                  ; back to low byte: (low >> 3)

        psraw       mm5, 11                 ; high byte >> 3, now in low byte
        psllw       mm5, 8                  ; shift left 8 to put it back
        por         mm0, mm5                ; put the two together to get result

        paddsb      mm6, mm0                ; p0 += filter
        pxor        mm6, [GLOBAL(t80)]      ; unoffset

        movq        mm0, t0                 ; reload original p1
        movq        mm4, t1                 ; reload original q1

        ; mm0 = 70 60 50 40 30 20 10 00
        ; mm6 = 71 61 51 41 31 21 11 01
        ; mm3 = 72 62 52 42 32 22 12 02
        ; mm4 = 73 63 53 43 33 23 13 03
        ; transpose back to write out

        movq        mm1, mm0                ;
        punpcklbw   mm0, mm6                ; 31 30 21 20 11 10 01 00

        punpckhbw   mm1, mm6                ; 71 70 61 60 51 50 41 40
        movq        mm2, mm3                ;

        punpcklbw   mm2, mm4                ; 33 32 23 22 13 12 03 02
        movq        mm5, mm1                ; 71 70 61 60 51 50 41 40

        punpckhbw   mm3, mm4                ; 73 72 63 62 53 52 43 42
        movq        mm6, mm0                ; 31 30 21 20 11 10 01 00

        punpcklwd   mm0, mm2                ; 13 12 11 10 03 02 01 00
        punpckhwd   mm6, mm2                ; 33 32 31 30 23 22 21 20

        movd        [rsi+rax*4], mm0        ; write 03 02 01 00
        punpcklwd   mm1, mm3                ; 53 52 51 50 43 42 41 40

        psrlq       mm0, 32                 ; xx xx xx xx 13 12 11 10
        punpckhwd   mm5, mm3                ; 73 72 71 70 63 62 61 60

        movd        [rdi+rax*4], mm0        ; write 13 12 11 10
        movd        [rsi+rax*2], mm6        ; write 23 22 21 20

        psrlq       mm6, 32                 ; 33 32 31 30
        movd        [rsi], mm1              ; write 43 42 41 40

        movd        [rsi + rax], mm6        ; write 33 32 31 30
        neg         rax                     ; restore +pitch

        movd        [rsi + rax*2], mm5      ; write 63 62 61 60
        psrlq       mm1, 32                 ; 53 52 51 50

        movd        [rdi], mm1              ; write out 53 52 51 50
        psrlq       mm5, 32                 ; 73 72 71 70

        movd        [rdi + rax*2], mm5      ; write 73 72 71 70

        lea         rsi, [rsi+rax*8]        ; next 8

        dec         rcx
        jnz         .nexts8_v

    add         rsp, 32                     ; release t0/t1
    pop         rsp                         ; undo stack alignment
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
;void fast_loop_filter_vertical_edges_mmx(unsigned char *y_ptr,
; int y_stride,
; loop_filter_info *lfi)
;{
;
;
; vp9_loop_filter_simple_vertical_edge_mmx(y_ptr+4, y_stride, lfi->flim,lfi->lim,lfi->thr,2);
; vp9_loop_filter_simple_vertical_edge_mmx(y_ptr+8, y_stride, lfi->flim,lfi->lim,lfi->thr,2);
; vp9_loop_filter_simple_vertical_edge_mmx(y_ptr+12, y_stride, lfi->flim,lfi->lim,lfi->thr,2);
;}
SECTION
_RODATA
align
16
tfe:
...
...
vp9/common/x86/vp9_loopfilter_sse2.asm
View file @
4a559d34
...
...
@@ -845,372 +845,6 @@ sym(vp9_loop_filter_vertical_edge_uv_sse2):
pop
rbp
ret
;-----------------------------------------------------------------------
;void vp9_loop_filter_simple_horizontal_edge_sse2
;(
;    unsigned char *src_ptr,
;    int  src_pixel_step,
;    const char *blimit
;)
;
; Simple loop filter across a horizontal edge, 16 pixels wide, done in a
; single pass on 128-bit registers.
;
; In:    arg(0) = src_ptr         first row below the edge (q0 row)
;        arg(1) = src_pixel_step  row pitch, sign-extended to 64 bits
;        arg(2) = blimit          16 bytes are read from it
; Out:   rows [src_ptr - pitch] (p0) and [src_ptr] (q0) are rewritten
; Uses:  rax, rdx, rsi, rdi, xmm0-xmm7 (rsi/rdi are saved/restored)
; Align: every load/store is movdqa, so blimit and all four pixel rows
;        must be 16-byte aligned.
; Note:  arg(), GET_GOT, GLOBAL(), SAVE_XMM/RESTORE_XMM and the
;        SHADOW/UNSHADOW macros come from the project's ABI support
;        include - presumably x86_abi_support.asm; confirm there.
;        tfe, t80, t4 and t1s are per-byte constants defined elsewhere in
;        this file.
;-----------------------------------------------------------------------
global sym(vp9_loop_filter_simple_horizontal_edge_sse2) PRIVATE
sym(vp9_loop_filter_simple_horizontal_edge_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 3
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

        mov         rsi, arg(0)             ; src_ptr
        movsxd      rax, dword ptr arg(1)   ; src_pixel_step (row pitch)
        mov         rdx, arg(2)             ; blimit
        movdqa      xmm3, XMMWORD PTR [rdx] ; xmm3 = blimit bytes

        mov         rdi, rsi                ; rdi = row +1 (q1)
        add         rdi, rax
        neg         rax                     ; rax = -pitch for the rows above

        ; calculate mask
        movdqa      xmm1, [rsi+2*rax]       ; p1
        movdqa      xmm0, [rdi]             ; q1
        movdqa      xmm2, xmm1              ; keep p1 for the filter
        movdqa      xmm7, xmm0              ; keep q1 for the filter
        movdqa      xmm4, xmm0
        psubusb     xmm0, xmm1              ; q1-=p1
        psubusb     xmm1, xmm4              ; p1-=q1
        por         xmm1, xmm0              ; abs(p1-q1)
        pand        xmm1, [GLOBAL(tfe)]     ; clear lsb of each byte so the
                                            ; word shift cannot leak across bytes
        psrlw       xmm1, 1                 ; abs(p1-q1)/2

        movdqa      xmm5, [rsi+rax]         ; p0
        movdqa      xmm4, [rsi]             ; q0
        movdqa      xmm0, xmm4              ; q0
        movdqa      xmm6, xmm5              ; p0
        psubusb     xmm5, xmm4              ; p0-=q0
        psubusb     xmm4, xmm6              ; q0-=p0
        por         xmm5, xmm4              ; abs(p0 - q0)
        paddusb     xmm5, xmm5              ; abs(p0-q0)*2
        paddusb     xmm5, xmm1              ; abs(p0-q0)*2 + abs(p1-q1)/2

        psubusb     xmm5, xmm3              ; nonzero where the sum > blimit
        pxor        xmm3, xmm3
        pcmpeqb     xmm5, xmm3              ; xmm5 = 0xff where sum <= blimit

        ; start work on filters
        pxor        xmm2, [GLOBAL(t80)]     ; p1 offset to convert to signed values
        pxor        xmm7, [GLOBAL(t80)]     ; q1 offset to convert to signed values
        psubsb      xmm2, xmm7              ; p1 - q1

        pxor        xmm6, [GLOBAL(t80)]     ; p0 offset to convert to signed values
        pxor        xmm0, [GLOBAL(t80)]     ; q0 offset to convert to signed values
        movdqa      xmm3, xmm0              ; q0
        psubsb      xmm0, xmm6              ; q0 - p0
        paddsb      xmm2, xmm0              ; p1 - q1 + 1 * (q0 - p0)
        paddsb      xmm2, xmm0              ; p1 - q1 + 2 * (q0 - p0)
        paddsb      xmm2, xmm0              ; p1 - q1 + 3 * (q0 - p0)
        pand        xmm5, xmm2              ; mask filter values we don't care about

        ; do + 4 side
        paddsb      xmm5, [GLOBAL(t4)]      ; 3* (q0 - p0) + (p1 - q1) + 4

        ; per-byte signed >>3, built from word shifts:
        movdqa      xmm0, xmm5              ; get a copy of filters
        psllw       xmm0, 8                 ; low byte -> high byte of each word
        psraw       xmm0, 3                 ; arithmetic shift right 3
        psrlw       xmm0, 8                 ; back to low byte: (low >> 3)
        movdqa      xmm1, xmm5              ; get a copy of filters
        psraw       xmm1, 11                ; high byte >> 3, now in low byte
        psllw       xmm1, 8                 ; shift left 8 to put it back

        por         xmm0, xmm1              ; put the two together to get result

        psubsb      xmm3, xmm0              ; q0 -= filter
        pxor        xmm3, [GLOBAL(t80)]     ; unoffset
        movdqa      [rsi], xmm3             ; write back q0

        ; now do +3 side
        psubsb      xmm5, [GLOBAL(t1s)]     ; +3 instead of +4

        movdqa      xmm0, xmm5              ; get a copy of filters
        psllw       xmm0, 8                 ; low byte -> high byte of each word
        psraw       xmm0, 3                 ; arithmetic shift right 3
        psrlw       xmm0, 8                 ; back to low byte: (low >> 3)
        psraw       xmm5, 11                ; high byte >> 3, now in low byte
        psllw       xmm5, 8                 ; shift left 8 to put it back
        por         xmm0, xmm5              ; put the two together to get result

        paddsb      xmm6, xmm0              ; p0 += filter
        pxor        xmm6, [GLOBAL(t80)]     ; unoffset
        movdqa      [rsi+rax], xmm6         ; write back p0

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
;void vp9_loop_filter_simple_vertical_edge_sse2
;(
; unsigned char *src_ptr,
; int src_pixel_step,
; const char *blimit,
;)
global
sym
(
vp9_loop_filter_simple_vertical_edge_sse2
)
PRIVATE
sym
(
vp9_loop_filter_simple_vertical_edge_sse2
):
push
rbp
; save old base pointer value.
mov
rbp
,
rsp
; set new base pointer value.
SHADOW_ARGS_TO_STACK
3
SAVE_XMM
7
GET_GOT
rbx
; save callee-saved reg
push
rsi
push
rdi
; end prolog
ALIGN
_STACK
16
,
rax
sub
rsp
,
32
; reserve 32 bytes
%define t0 [rsp + 0]
;__declspec(align(16)) char t0[16];