Skip to content
GitLab
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
Xiph.Org
aom-rav1e
Commits
0d3aeda3
Commit
0d3aeda3
authored
Jan 31, 2017
by
Tom Finegan
Browse files
Remove unused assembly sources and associated tests.
Change-Id: Ic8386743b1852ca1074528d04e2adc1d191b091b
parent
dbfec2a8
Changes
6
Expand all
Hide whitespace changes
Inline
Side-by-side
CMakeLists.txt
View file @
0d3aeda3
...
...
@@ -374,8 +374,6 @@ set(AOM_UNIT_TEST_SOURCES
#"${AOM_ROOT}/test/accounting_test.cc"
"
${
AOM_ROOT
}
/test/acm_random.h"
"
${
AOM_ROOT
}
/test/active_map_test.cc"
# not in test.mk
#"${AOM_ROOT}/test/add_noise_test.cc"
"
${
AOM_ROOT
}
/test/altref_test.cc"
"
${
AOM_ROOT
}
/test/android"
# requires CONFIG_ANS
...
...
aom_dsp/deblock.c
deleted
100644 → 0
View file @
dbfec2a8
/*
* Copyright (c) 2016, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*
*/
#include
<stdlib.h>
#include
"aom/aom_integer.h"
/* Table of 440 small offsets, every entry in the range [0, 14].
 * aom_mbpost_proc_down_c() indexes into it (starting from a rand()-derived
 * position) and adds the selected entry to a pixel sum just before the
 * ">> 4" normalisation. */
const int16_t aom_rv[] = {
  8,  5,  2,  2,  8,  12, 4,  9,  8,  3,  0,  3,  9,  0,  0,  0,  8,  3,  14, 4,
  10, 1,  11, 14, 1,  14, 9,  6,  12, 11, 8,  6,  10, 0,  0,  8,  9,  0,  3,  14,
  8,  11, 13, 4,  2,  9,  0,  3,  9,  6,  1,  2,  3,  14, 13, 1,  8,  2,  9,  7,
  3,  3,  1,  13, 13, 6,  6,  5,  2,  7,  11, 9,  11, 8,  7,  3,  2,  0,  13, 13,
  14, 4,  12, 5,  12, 10, 8,  10, 13, 10, 4,  14, 4,  10, 0,  8,  11, 1,  13, 7,
  7,  14, 6,  14, 13, 2,  13, 5,  4,  4,  0,  10, 0,  5,  13, 2,  12, 7,  11, 13,
  8,  0,  4,  10, 7,  2,  7,  2,  2,  5,  3,  4,  7,  3,  3,  14, 14, 5,  9,  13,
  3,  14, 3,  6,  3,  0,  11, 8,  13, 1,  13, 1,  12, 0,  10, 9,  7,  6,  2,  8,
  5,  2,  13, 7,  1,  13, 14, 7,  6,  7,  9,  6,  10, 11, 7,  8,  7,  5,  14, 8,
  4,  4,  0,  8,  7,  10, 0,  8,  14, 11, 3,  12, 5,  7,  14, 3,  14, 5,  2,  6,
  11, 12, 12, 8,  0,  11, 13, 1,  2,  0,  5,  10, 14, 7,  8,  0,  4,  11, 0,  8,
  0,  3,  10, 5,  8,  0,  11, 6,  7,  8,  10, 7,  13, 9,  2,  5,  1,  5,  10, 2,
  4,  3,  5,  6,  10, 8,  9,  4,  11, 14, 0,  10, 0,  5,  13, 2,  12, 7,  11, 13,
  8,  0,  4,  10, 7,  2,  7,  2,  2,  5,  3,  4,  7,  3,  3,  14, 14, 5,  9,  13,
  3,  14, 3,  6,  3,  0,  11, 8,  13, 1,  13, 1,  12, 0,  10, 9,  7,  6,  2,  8,
  5,  2,  13, 7,  1,  13, 14, 7,  6,  7,  9,  6,  10, 11, 7,  8,  7,  5,  14, 8,
  4,  4,  0,  8,  7,  10, 0,  8,  14, 11, 3,  12, 5,  7,  14, 3,  14, 5,  2,  6,
  11, 12, 12, 8,  0,  11, 13, 1,  2,  0,  5,  10, 14, 7,  8,  0,  4,  11, 0,  8,
  0,  3,  10, 5,  8,  0,  11, 6,  7,  8,  10, 7,  13, 9,  2,  5,  1,  5,  10, 2,
  4,  3,  5,  6,  10, 8,  9,  4,  11, 14, 3,  8,  3,  7,  8,  5,  11, 4,  12, 3,
  11, 9,  14, 8,  14, 13, 4,  3,  1,  2,  14, 6,  5,  4,  4,  11, 4,  6,  2,  1,
  5,  8,  8,  12, 13, 5,  14, 10, 12, 13, 0,  9,  5,  5,  11, 10, 13, 9,  10, 13,
};
/* Five-tap smoothing step shared by the vertical and horizontal passes.
 *
 * centre      - pixel being filtered
 * near_a/far_a - the two neighbours on one side (1 and 2 samples away)
 * near_b/far_b - the two neighbours on the other side
 * limit       - per-column threshold
 *
 * When every neighbour differs from `centre` by strictly less than `limit`
 * the result is a rounded average of averages; otherwise `centre` is
 * returned untouched. */
static unsigned char post_proc_tap5(int centre, int far_a, int near_a,
                                    int near_b, int far_b, int limit) {
  int side_a, side_b, blend;

  if (abs(centre - far_a) >= limit) return (unsigned char)centre;
  if (abs(centre - near_a) >= limit) return (unsigned char)centre;
  if (abs(centre - near_b) >= limit) return (unsigned char)centre;
  if (abs(centre - far_b) >= limit) return (unsigned char)centre;

  side_a = (far_a + near_a + 1) >> 1;
  side_b = (far_b + near_b + 1) >> 1;
  blend = (side_a + side_b + 1) >> 1;
  return (unsigned char)((blend + centre + 1) >> 1);
}

/* Filters `size` rows of `cols` pixels.
 *
 * For each row the pixels are first filtered vertically (reading two rows
 * above and two rows below the current source row) into dst_ptr, and the
 * destination row is then filtered horizontally in place.
 *
 * src_ptr / dst_ptr       - first pixel of the first row to process
 * src_pixels_per_line,
 * dst_pixels_per_line     - row strides of the two buffers
 * cols                    - pixels per row
 * f                       - cols thresholds, one per column
 * size                    - number of rows
 *
 * The horizontal pass writes two bytes to the left of each destination row
 * and two bytes past its end, so dst_ptr must have that much border. */
void aom_post_proc_down_and_across_mb_row_c(unsigned char *src_ptr,
                                            unsigned char *dst_ptr,
                                            int src_pixels_per_line,
                                            int dst_pixels_per_line, int cols,
                                            unsigned char *f, int size) {
  int rows_left;

  for (rows_left = size; rows_left > 0; --rows_left) {
    const unsigned char *const in = src_ptr;
    unsigned char *const line = dst_ptr;
    unsigned char pending[4]; /* results held back until no longer needed */
    unsigned char edge;
    int x;

    /* Vertical pass: source row -> destination row. */
    for (x = 0; x < cols; ++x) {
      line[x] = post_proc_tap5(in[x], in[x - 2 * src_pixels_per_line],
                               in[x - src_pixels_per_line],
                               in[x + src_pixels_per_line],
                               in[x + 2 * src_pixels_per_line], f[x]);
    }

    /* Horizontal pass, in place: replicate the end pixels into the border. */
    edge = line[0];
    line[-1] = edge;
    line[-2] = edge;
    edge = line[cols - 1];
    line[cols + 1] = edge;
    line[cols] = edge;

    for (x = 0; x < cols; ++x) {
      pending[x & 3] = post_proc_tap5(line[x], line[x - 2], line[x - 1],
                                      line[x + 1], line[x + 2], f[x]);
      /* Pixel x-2 is no longer read by later columns, so it can be
       * overwritten with its filtered value now. */
      if (x >= 2) line[x - 2] = pending[(x - 2) & 3];
    }

    /* Flush the two results still held back. */
    line[x - 2] = pending[(x - 2) & 3];
    line[x - 1] = pending[(x - 1) & 3];

    src_ptr += src_pixels_per_line;
    dst_ptr += dst_pixels_per_line;
  }
}
/* Horizontal in-place smoothing of `rows` rows of `cols` pixels.
 *
 * For every pixel a running sum and sum of squares over a 15-pixel window
 * (7 left, the pixel, 7 right) are maintained. When
 * 15 * sumsq - sum * sum is below `flimit`, the pixel is replaced by
 * (8 + sum + pixel) >> 4; otherwise it is kept.
 *
 * src    - first pixel of the first row
 * pitch  - row stride
 * rows   - number of rows
 * cols   - pixels per row
 * flimit - threshold described above
 *
 * Each row's borders are overwritten: 8 bytes to the left of the row and 17
 * bytes past its end. On return the 8 left-border bytes of every processed
 * row are zero. */
void aom_mbpost_proc_across_ip_c(unsigned char *src, int pitch, int rows,
                                 int cols, int flimit) {
  int r, c, i;
  unsigned char *s = src;
  unsigned char d[16]; /* ring of results delayed by 8 columns */

  for (r = 0; r < rows; r++) {
    int sumsq = 0;
    int sum = 0;

    /* Replicate the first pixel into the left border. */
    for (i = -8; i < 0; i++) s[i] = s[0];

    /* 17 avoids valgrind warning - we buffer values in c in d
     * and only write them when we've read 8 ahead...
     */
    for (i = 0; i < 17; i++) s[i + cols] = s[cols - 1];

    /* Clear the whole ring for every row. Slots 8..15 are stored to the left
     * border while c < 8, before the main loop has written them; slot 15
     * used to be left uninitialised (or stale from the previous row). */
    for (i = 0; i < 16; i++) d[i] = 0;

    /* Prime the window with s[-8..6]; the first loop step slides it to
     * s[-7..7]. */
    for (i = -8; i <= 6; i++) {
      sumsq += s[i] * s[i];
      sum += s[i];
    }

    for (c = 0; c < cols + 8; c++) {
      /* Slide the window: s[c + 7] enters, s[c - 8] leaves.
       * x * y == s[c + 7]^2 - s[c - 8]^2. */
      int x = s[c + 7] - s[c - 8];
      int y = s[c + 7] + s[c - 8];

      sum += x;
      sumsq += x * y;

      d[c & 15] = s[c];

      if (sumsq * 15 - sum * sum < flimit) {
        d[c & 15] = (8 + sum + s[c]) >> 4;
      }

      /* s[c - 8] has just left the window, so it is safe to overwrite. */
      s[c - 8] = d[(c - 8) & 15];
    }

    s += pitch;
  }
}
/* Vertical in-place smoothing, processed one column at a time.
 *
 * For every pixel a running sum and sum of squares over a 15-row window
 * (7 above, the pixel, 7 below) are maintained. When
 * 15 * sumsq - sum * sum is below `flimit`, the pixel is replaced by
 * (offset + sum + pixel) >> 4, where `offset` comes from aom_rv; otherwise
 * it is kept.
 *
 * dst    - first pixel of the first row
 * pitch  - row stride
 * rows   - number of rows
 * cols   - number of columns
 * flimit - threshold described above
 *
 * The starting position in aom_rv is chosen with rand(), once per call.
 * Each column's borders are overwritten: 8 rows above the first row and 17
 * rows below the last one. */
void aom_mbpost_proc_down_c(unsigned char *dst, int pitch, int rows, int cols,
                            int flimit) {
  const int16_t *const offsets_base = &aom_rv[63 & rand()];
  int col;

  for (col = 0; col < cols; ++col) {
    unsigned char *px = dst + col;
    const int16_t *const offsets = offsets_base + ((col * 17) & 127);
    unsigned char delayed[16]; /* results held back by 8 rows */
    unsigned char bottom;
    int sum = 0;
    int sumsq = 0;
    int k;
    int row;

    /* Replicate the top pixel into the 8 rows above it. */
    for (k = 1; k <= 8; ++k) px[-k * pitch] = px[0];

    /* 17 avoids valgrind warning - we buffer values in c in d
     * and only write them when we've read 8 ahead...
     */
    bottom = px[(rows - 1) * pitch];
    for (k = 0; k < 17; ++k) px[(rows + k) * pitch] = bottom;

    /* Prime the window with rows -8..6; the first loop step slides it to
     * rows -7..7. */
    for (k = -8; k <= 6; ++k) {
      const int sample = px[k * pitch];
      sum += sample;
      sumsq += sample * sample;
    }

    for (row = 0; row < rows + 8; ++row) {
      const int entering = px[7 * pitch];
      const int leaving = px[-8 * pitch];
      const int current = px[0];

      sumsq += entering * entering - leaving * leaving;
      sum += entering - leaving;

      delayed[row & 15] =
          (sumsq * 15 - sum * sum < flimit)
              ? (unsigned char)((offsets[row & 127] + sum + current) >> 4)
              : (unsigned char)current;

      /* The row that just left the window can now take its final value. */
      if (row >= 8) px[-8 * pitch] = delayed[(row - 8) & 15];

      px += pitch;
    }
  }
}
aom_dsp/mips/deblock_msa.c
deleted
100644 → 0
View file @
dbfec2a8
This diff is collapsed.
Click to expand it.
aom_dsp/x86/add_noise_sse2.asm
deleted
100644 → 0
View file @
dbfec2a8
;
; Copyright (c) 2016, Alliance for Open Media. All rights reserved
;
; This source code is subject to the terms of the BSD 2 Clause License and
; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
; was not distributed with this source code in the LICENSE file, you can
; obtain it at www.aomedia.org/license/software. If the Alliance for Open
; Media Patent License 1.0 was not distributed with this source code in the
; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
;
%include "aom_ports/x86_abi_support.asm"
;-----------------------------------------------------------------------
;void aom_plane_add_noise_sse2(unsigned char *start, unsigned char *noise,
;                              unsigned char blackclamp[16],
;                              unsigned char whiteclamp[16],
;                              unsigned char bothclamp[16],
;                              unsigned int width, unsigned int height,
;                              int pitch)
;
; For each of `height` rows, 16 pixels at a time:
;   pixel = sat_sub(sat_add(sat_sub(pixel, blackclamp), bothclamp), whiteclamp)
;   pixel += noise[offset + x]        (wrapping byte add)
; where `offset` is (LIBAOM_RAND() & 0xff), drawn once per row.
;
; Arguments are read through arg(n) after SHADOW_ARGS_TO_STACK; arg(0) and
; arg(6) are updated in place as the row pointer and the row counter.
; Each row is processed in whole 16-byte groups, so up to 15 bytes past
; `width` may be read and written on every row.
;-----------------------------------------------------------------------
global sym(aom_plane_add_noise_sse2) PRIVATE
sym(aom_plane_add_noise_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 8
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

.addnoise_loop:
    call        sym(LIBAOM_RAND) WRT_PLT

    ; (Re)load the clamps after the call: xmm3-xmm5 are caller-saved in both
    ; the System V and Microsoft x64 ABIs, so the callee may have changed
    ; them. rdx is free here; rax still holds the random value.
    mov         rdx, arg(2)             ; blackclamp
    movdqu      xmm3, [rdx]
    mov         rdx, arg(3)             ; whiteclamp
    movdqu      xmm4, [rdx]
    mov         rdx, arg(4)             ; bothclamp
    movdqu      xmm5, [rdx]

    mov         rcx, arg(1)             ; noise
    and         rax, 0xff               ; random start offset 0..255
    add         rcx, rax

    mov         rdi, rcx                ; rdi = noise + offset
    movsxd      rcx, dword arg(5)       ; rcx = width
    mov         rsi, arg(0)             ; rsi = current row
    xor         rax, rax                ; rax = x

.addnoise_nextset:
    movdqu      xmm1, [rsi+rax]         ; get the source

    psubusb     xmm1, xmm3              ; subtract black clamp
    paddusb     xmm1, xmm5              ; add both clamp
    psubusb     xmm1, xmm4              ; subtract whiteclamp

    movdqu      xmm2, [rdi+rax]         ; get the noise for this line
    paddb       xmm1, xmm2              ; add it in
    movdqu      [rsi+rax], xmm1         ; store the result

    add         rax, 16                 ; move to the next 16 pixels

    cmp         rax, rcx
    jl          .addnoise_nextset

    movsxd      rax, dword arg(7)       ; Pitch
    add         arg(0), rax             ; Start += Pitch
    sub         dword arg(6), 1         ; Height -= 1
    jg          .addnoise_loop

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
; Read-only constants. SECTION_RODATA is a single macro name (it comes from
; the %include'd ABI support file); written as "SECTION _RODATA" it would
; instead open a section literally called "_RODATA".
; Neither label is referenced by aom_plane_add_noise_sse2 above.
SECTION_RODATA
align 16
rd42:
    times 8 dw 0x04                     ; eight 16-bit words, each 4
four8s:
    times 4 dd 8                        ; four 32-bit dwords, each 8
aom_dsp/x86/deblock_sse2.asm
deleted
100644 → 0
View file @
dbfec2a8
;
; Copyright (c) 2016, Alliance for Open Media. All rights reserved
;
; This source code is subject to the terms of the BSD 2 Clause License and
; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
; was not distributed with this source code in the LICENSE file, you can
; obtain it at www.aomedia.org/license/software. If the Alliance for Open
; Media Patent License 1.0 was not distributed with this source code in the
; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
;
%include "aom_ports/x86_abi_support.asm"
;macro in deblock functions
;
; FIRST_2_ROWS - first half of the 5-tap deblock filter, 16 pixels at once.
;   In:   xmm0 = centre pixels
;         xmm1, xmm3 = two neighbours on one side
;         flimit = 16 per-pixel byte thresholds (memory operand)
;   Out:  xmm5 = pavgb(xmm1, xmm3)
;         xmm7 = 0xff in every byte where |centre - neighbour| >= flimit for
;                at least one of the two neighbours (pixel must not be
;                filtered), 0x00 elsewhere
;   Keeps xmm0. Clobbers xmm1, xmm2, xmm3, xmm4, xmm6.
%macro FIRST_2_ROWS 0
        movdqa      xmm4, xmm0
        movdqa      xmm6, xmm0
        movdqa      xmm5, xmm1
        pavgb       xmm5, xmm3

        ;calculate absolute value
        ; |a - b| = sat(a - b) + sat(b - a) for unsigned bytes
        psubusb     xmm4, xmm1
        psubusb     xmm1, xmm0
        psubusb     xmm6, xmm3
        psubusb     xmm3, xmm0
        paddusb     xmm4, xmm1
        paddusb     xmm6, xmm3

        ;get threshold
        movdqa      xmm2, flimit
        pxor        xmm1, xmm1
        movdqa      xmm7, xmm2

        ;get mask
        ; sat(flimit - diff) == 0  <=>  diff >= flimit
        psubusb     xmm2, xmm4
        psubusb     xmm7, xmm6
        pcmpeqb     xmm2, xmm1
        pcmpeqb     xmm7, xmm1
        por         xmm7, xmm2
%endmacro
; SECOND_2_ROWS - second half of the 5-tap deblock filter; must follow
; FIRST_2_ROWS.
;   In:   xmm0 = centre pixels
;         xmm1, xmm3 = the two neighbours on the other side
;         xmm5 = neighbour average from FIRST_2_ROWS
;         xmm7 = reject mask from FIRST_2_ROWS
;         flimit = 16 per-pixel byte thresholds (memory operand)
;   Out:  xmm0 = filtered pixels where all four neighbours are within the
;                threshold, original pixels elsewhere
;   Clobbers xmm1-xmm7.
%macro SECOND_2_ROWS 0
        movdqa      xmm6, xmm0
        movdqa      xmm4, xmm0
        movdqa      xmm2, xmm1
        pavgb       xmm1, xmm3          ; average of this side's neighbours

        ;calculate absolute value
        psubusb     xmm6, xmm2
        psubusb     xmm2, xmm0
        psubusb     xmm4, xmm3
        psubusb     xmm3, xmm0
        paddusb     xmm6, xmm2
        paddusb     xmm4, xmm3

        pavgb       xmm5, xmm1          ; average of the two side averages

        ;get threshold
        movdqa      xmm2, flimit
        pxor        xmm1, xmm1
        movdqa      xmm3, xmm2

        ;get mask
        psubusb     xmm2, xmm6
        psubusb     xmm3, xmm4
        pcmpeqb     xmm2, xmm1
        pcmpeqb     xmm3, xmm1

        por         xmm7, xmm2          ; fold both new rejects into the mask
        por         xmm7, xmm3

        pavgb       xmm5, xmm0          ; blend with the centre pixel

        ;decide if or not to use filtered value
        pand        xmm0, xmm7          ; original where rejected
        pandn       xmm7, xmm5          ; filtered where accepted
        paddusb     xmm0, xmm7          ; the two halves never overlap
%endmacro
; UPDATE_FLIMIT - copy the next 16 threshold bytes to the stack slot.
;   In:   rbx = pointer to the thresholds to load; movdqa is used, so the
;               address must be 16-byte aligned
;         rsp = 16-byte aligned scratch slot (read back as `flimit`)
;   Out:  [rsp] = the 16 bytes loaded, rbx advanced by 16
;   Clobbers xmm2.
%macro UPDATE_FLIMIT 0
        movdqa      xmm2, XMMWORD PTR [rbx]
        movdqa      [rsp], xmm2
        add         rbx, 16
%endmacro
;-----------------------------------------------------------------------
;void aom_post_proc_down_and_across_mb_row_sse2
;(
;    unsigned char *src_ptr,
;    unsigned char *dst_ptr,
;    int src_pixels_per_line,
;    int dst_pixels_per_line,
;    int cols,
;    unsigned char *flimits,
;    int size
;)
;
; For each of `size` rows: filter vertically from src_ptr into dst_ptr, then
; filter the destination row horizontally in place (same 5-tap filter in
; both directions, built from FIRST_2_ROWS / SECOND_2_ROWS).
;
; flimits is consumed as bytes, one threshold per column: UPDATE_FLIMIT
; loads 16 of them per 16-pixel group with movdqa, so the array must be
; 16-byte aligned.
;
; Register roles inside the loops:
;   rsi = source cursor           rdi = destination cursor / row start
;   rax = src_pixels_per_line     rcx = rows remaining
;   rdx = column offset           rbx = flimits cursor
;   mm0/mm1 = previous 16 filtered bytes of the across pass, held back so
;             that unfiltered neighbours are still available
;
; Columns are processed in groups of 16. The across pass writes 8 bytes to
; the left of each destination row and 8 bytes past its end, and reads back
; 16 bytes to the left of the row.
;
; NOTE(review): MMX registers are used and no emms is executed in this
; function; presumably callers clear the x87/MMX state afterwards - confirm.
;-----------------------------------------------------------------------
global sym(aom_post_proc_down_and_across_mb_row_sse2) PRIVATE
sym(aom_post_proc_down_and_across_mb_row_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 7
    push        rbx
    push        rsi
    push        rdi
    ; end prolog
    ALIGN_STACK 16, rax
    sub         rsp, 16

    ; put flimit on stack
    mov         rbx, arg(5)             ;flimits ptr
    UPDATE_FLIMIT

%define flimit [rsp]

    mov         rsi, arg(0)             ;src_ptr
    mov         rdi, arg(1)             ;dst_ptr

    movsxd      rax, DWORD PTR arg(2)   ;src_pixels_per_line
    movsxd      rcx, DWORD PTR arg(6)   ;rows in a macroblock
.nextrow:
    xor         rdx, rdx                ;col
.nextcol:
    ;load current and next 2 rows
    movdqu      xmm0, XMMWORD PTR [rsi]
    movdqu      xmm1, XMMWORD PTR [rsi + rax]
    movdqu      xmm3, XMMWORD PTR [rsi + 2*rax]

    FIRST_2_ROWS

    ;load above 2 rows
    neg         rax
    movdqu      xmm1, XMMWORD PTR [rsi + 2*rax]
    movdqu      xmm3, XMMWORD PTR [rsi + rax]

    SECOND_2_ROWS

    movdqu      XMMWORD PTR [rdi], xmm0

    neg         rax                     ; positive stride
    add         rsi, 16
    add         rdi, 16

    add         rdx, 16
    cmp         edx, dword arg(4)       ;cols
    jge         .downdone
    UPDATE_FLIMIT
    jmp         .nextcol

.downdone:
    ; done with the all cols, start the across filtering in place
    sub         rsi, rdx                ; rewind both cursors to the row start
    sub         rdi, rdx

    mov         rbx, arg(5)             ; flimits
    UPDATE_FLIMIT

    ; dup the first byte into the left border 8 times
    movq        mm1, [rdi]
    punpcklbw   mm1, mm1
    punpcklwd   mm1, mm1
    punpckldq   mm1, mm1
    mov         rdx, -8
    movq        [rdi+rdx], mm1

    ; dup the last byte into the right border
    movsxd      rdx, dword arg(4)
    movq        mm1, [rdi + rdx + -1]
    punpcklbw   mm1, mm1
    punpcklwd   mm1, mm1
    punpckldq   mm1, mm1
    movq        [rdi+rdx], mm1

    xor         rdx, rdx
    ; Preload the 16 bytes left of the row so that the first "store previous"
    ; below writes back what is already there.
    movq        mm0, QWORD PTR [rdi-16];
    movq        mm1, QWORD PTR [rdi-8];

.acrossnextcol:
    movdqu      xmm0, XMMWORD PTR [rdi + rdx]
    movdqu      xmm1, XMMWORD PTR [rdi + rdx -2]
    movdqu      xmm3, XMMWORD PTR [rdi + rdx -1]

    FIRST_2_ROWS

    movdqu      xmm1, XMMWORD PTR [rdi + rdx +1]
    movdqu      xmm3, XMMWORD PTR [rdi + rdx +2]

    SECOND_2_ROWS

    movq        QWORD PTR [rdi+rdx-16], mm0  ; store previous 8 bytes
    movq        QWORD PTR [rdi+rdx-8], mm1   ; store previous 8 bytes
    movdq2q     mm0, xmm0               ; hold back this group's low half
    psrldq      xmm0, 8
    movdq2q     mm1, xmm0               ; ... and its high half

    add         rdx, 16
    cmp         edx, dword arg(4)       ;cols
    jge         .acrossdone
    UPDATE_FLIMIT
    jmp         .acrossnextcol

.acrossdone:
    ; last 16 pixels
    movq        QWORD PTR [rdi+rdx-16], mm0

    ; The high 8 bytes are stored only when rdx stopped exactly at cols.
    cmp         edx, dword arg(4)
    jne         .throw_last_8
    movq        QWORD PTR [rdi+rdx-8], mm1
.throw_last_8:
    ; done with this row
    add         rsi, rax                ;next src line
    ; Sign-extend both strides, matching the initial movsxd load above, so
    ; that a negative stride still steps correctly on 64-bit builds.
    movsxd      rax, dword arg(3)       ;dst_pixels_per_line
    add         rdi, rax                ;next destination
    movsxd      rax, dword arg(2)       ;src_pixels_per_line

    mov         rbx, arg(5)             ;flimits
    UPDATE_FLIMIT

    dec         rcx                     ;decrement count
    jnz         .nextrow                ;next row

    add         rsp, 16
    pop         rsp                     ; undo ALIGN_STACK
    ; begin epilog
    pop         rdi
    pop         rsi
    pop         rbx
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
%undef flimit
;void aom_mbpost_proc_down_xmm(unsigned char *dst,
; int pitch, int rows, int cols,int flimit)
extern
sym
(
aom_rv
)
global
sym
(
aom_mbpost_proc_down_xmm
)
PRIVATE
sym
(
aom_mbpost_proc_down_xmm
):
push
rbp
mov
rbp
,
rsp
SHADOW_ARGS_TO_STACK
5
SAVE_XMM
7
GET_GOT
rbx
push
rsi
push
rdi
; end prolog
ALIGN
_STACK
16
,
rax
sub
rsp
,
128
+
16
; unsigned char d[16][8] at [rsp]
; create flimit2 at [rsp+128]
mov
eax
,
dword
ptr
arg
(
4
)
;flimit
mov
[
rsp
+
128
],
eax
mov
[
rsp
+
128
+
4
],
eax
mov
[
rsp
+
128
+
8
],
eax
mov
[
rsp
+
128
+
12
],
eax
%define flimit4 [rsp+128]
%if ABI_IS_32BIT=0
lea
r8
,
[
GLOBAL
(
sym
(
aom_rv
))]
%endif
;rows +=8;
add
dword
arg
(
2
),
8
;for(c=0; c<cols; c+=8)
.loop_col:
mov
rsi
,
arg
(
0
)
; s
pxor
xmm0
,
xmm0
;
movsxd
rax
,
dword
ptr
arg
(
1
)
;pitch ;
; this copies the last row down into the border 8 rows
mov
rdi
,
rsi
mov
rdx
,
arg
(
2
)
sub
rdx
,
9
imul
rdx
,
rax
lea
rdi
,
[
rdi
+
rdx
]
movq
xmm1
,
QWORD
ptr
[
rdi
]
; first row
mov
rcx
,
8
.init_borderd:
; initialize borders
lea
rdi
,
[
rdi
+
rax
]
movq
[
rdi
],
xmm1
dec
rcx
jne
.init_borderd
neg
rax
; rax = -pitch
; this copies the first row up into the border 8 rows
mov
rdi
,
rsi
movq
xmm1
,
QWORD
ptr
[
rdi
]
; first row
mov
rcx
,
8
.init_border:
; initialize borders
lea
rdi
,
[
rdi
+
rax
]
movq
[
rdi
],
xmm1
dec
rcx
jne
.init_border
lea
rsi
,
[
rsi
+
rax
*
8
]
; ; rdi = s[-pitch*8]
neg
rax
pxor
xmm5
,
xmm5
pxor
xmm6
,
xmm6
;
pxor
xmm7
,
xmm7
;
mov
rdi
,
rsi