Skip to content
GitLab
Projects
Groups
Snippets
Help
Loading...
Help
What's new
7
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Open sidebar
Xiph.Org
aom-rav1e
Commits
ab73dba6
Commit
ab73dba6
authored
Sep 03, 2014
by
Dmitry Kovalev
Committed by
Gerrit Code Review
Sep 03, 2014
Browse files
Options
Browse Files
Download
Plain Diff
Merge "Replacing asm 16x16 variance calculation with intrinsics."
parents
406404af
6f6bd282
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
44 additions
and
147 deletions
+44
-147
vp9/encoder/x86/vp9_variance_impl_sse2.asm
vp9/encoder/x86/vp9_variance_impl_sse2.asm
+0
-142
vp9/encoder/x86/vp9_variance_sse2.c
vp9/encoder/x86/vp9_variance_sse2.c
+44
-5
No files found.
vp9/encoder/x86/vp9_variance_impl_sse2.asm
View file @
ab73dba6
...
...
@@ -67,145 +67,3 @@ sym(vp9_get_mb_ss_sse2):
UNSHADOW_ARGS
pop
rbp
ret
;-----------------------------------------------------------------------
;unsigned int vp9_get16x16var_sse2
;(
;    unsigned char   *  src_ptr,
;    int             source_stride,
;    unsigned char   *  ref_ptr,
;    int             recon_stride,
;    unsigned int    *  SSE,
;    int             *  Sum
;)
;
; Walks 16 rows of 16 bytes from src_ptr and ref_ptr and stores
;   *Sum = sum of (src - ref) over all 256 byte pairs
;   *SSE = sum of (src - ref)^2 over all 256 byte pairs
; Returns 0 in eax.
;
; Register roles inside the row loop:
;   rsi  = current source row          rax = source stride
;   rdi  = current reference row       rdx = reference stride
;   rcx  = rows remaining
;   xmm0 = zero (for byte->word and dword unpacking)
;   xmm7 = eight signed 16-bit partial sums of differences
;   xmm6 = four 32-bit partial sums of squared differences
;
; Arguments are read through the arg() macro and registers are saved /
; restored by the ABI helper macros, which are defined outside this
; section (NOTE(review): presumably x86_abi_support.asm - confirm).
;-----------------------------------------------------------------------
global sym(vp9_get16x16var_sse2) PRIVATE
sym(vp9_get16x16var_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rbx
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi, arg(0)                     ;[src_ptr]
    mov         rdi, arg(2)                     ;[ref_ptr]

    movsxd      rax, DWORD PTR arg(1)           ;[source_stride]
    movsxd      rdx, DWORD PTR arg(3)           ;[recon_stride]

    ; Prefetch data: the first eight rows of the source block ...
    lea         rcx, [rax+rax*2]                ; rcx = 3 * source_stride

    prefetcht0  [rsi]
    prefetcht0  [rsi+rax]
    prefetcht0  [rsi+rax*2]
    prefetcht0  [rsi+rcx]
    lea         rbx, [rsi+rax*4]                ; rbx = source row 4
    prefetcht0  [rbx]
    prefetcht0  [rbx+rax]
    prefetcht0  [rbx+rax*2]
    prefetcht0  [rbx+rcx]

    ; ... and the first eight rows of the reference block.
    lea         rcx, [rdx+rdx*2]                ; rcx = 3 * recon_stride

    prefetcht0  [rdi]
    prefetcht0  [rdi+rdx]
    prefetcht0  [rdi+rdx*2]
    prefetcht0  [rdi+rcx]
    lea         rbx, [rdi+rdx*4]                ; rbx = reference row 4
    prefetcht0  [rbx]
    prefetcht0  [rbx+rdx]
    prefetcht0  [rbx+rdx*2]
    prefetcht0  [rbx+rcx]

    pxor        xmm0, xmm0                      ; clear xmm0 for unpack
    pxor        xmm7, xmm7                      ; clear xmm7 for accumulating diffs

    pxor        xmm6, xmm6                      ; clear xmm6 for accumulating sse
    mov         rcx, 16                         ; rows remaining

.var16loop:
    movdqu      xmm1, XMMWORD PTR [rsi]         ; 16 source bytes
    movdqu      xmm2, XMMWORD PTR [rdi]         ; 16 reference bytes

    prefetcht0  [rsi+rax*8]                     ; hint the row eight strides ahead
    prefetcht0  [rdi+rdx*8]

    movdqa      xmm3, xmm1
    movdqa      xmm4, xmm2

    ; Zero-extend bytes to words: xmm1/xmm2 = low 8, xmm3/xmm4 = high 8.
    punpcklbw   xmm1, xmm0
    punpckhbw   xmm3, xmm0

    punpcklbw   xmm2, xmm0
    punpckhbw   xmm4, xmm0

    psubw       xmm1, xmm2                      ; low  8 differences
    psubw       xmm3, xmm4                      ; high 8 differences

    paddw       xmm7, xmm1
    pmaddwd     xmm1, xmm1                      ; square and add adjacent pairs

    paddw       xmm7, xmm3
    pmaddwd     xmm3, xmm3

    paddd       xmm6, xmm1
    paddd       xmm6, xmm3

    add         rsi, rax
    add         rdi, rdx

    sub         rcx, 1
    jnz         .var16loop

    movdqa      xmm1, xmm6                      ; xmm1 = sse partials
    pxor        xmm6, xmm6

    ; Sign-extend the eight 16-bit sums to 32 bits: place each word in the
    ; upper half of a dword, then arithmetic-shift right by 16.
    pxor        xmm5, xmm5
    punpcklwd   xmm6, xmm7

    punpckhwd   xmm5, xmm7
    psrad       xmm5, 16

    psrad       xmm6, 16
    paddd       xmm6, xmm5                      ; xmm6 = four 32-bit sum partials

    ; Fold four dwords to two (interleaved with zeros) for sse and sum.
    movdqa      xmm2, xmm1
    punpckldq   xmm1, xmm0

    punpckhdq   xmm2, xmm0
    movdqa      xmm7, xmm6

    paddd       xmm1, xmm2
    punpckldq   xmm6, xmm0

    punpckhdq   xmm7, xmm0
    paddd       xmm6, xmm7

    ; Fold the remaining two dwords to one: totals land in the low dword.
    movdqa      xmm2, xmm1
    movdqa      xmm7, xmm6

    psrldq      xmm1, 8
    psrldq      xmm6, 8

    paddd       xmm7, xmm6                      ; low dword = total sum
    paddd       xmm1, xmm2                      ; low dword = total sse

    mov         rax, arg(5)                     ;[Sum]
    mov         rdi, arg(4)                     ;[SSE]

    movd        DWORD PTR [rax], xmm7
    movd        DWORD PTR [rdi], xmm1

    ; begin epilog
    pop         rdi
    pop         rsi
    pop         rbx
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    xor         eax, eax                        ; return 0 rather than the stale Sum pointer
    ret
vp9/encoder/x86/vp9_variance_sse2.c
View file @
ab73dba6
...
...
@@ -92,9 +92,49 @@ unsigned int vp9_get8x8var_sse2(const uint8_t *src, int src_stride,
return
0
;
}
unsigned int vp9_get16x16var_sse2(const unsigned char *src, int src_stride,
                                  const unsigned char *ref, int ref_stride,
                                  unsigned int *sse, int *sum);

// Computes, over a 16x16 block of bytes, the sum of (src - ref) and the sum
// of (src - ref)^2.
//
//   src, src_stride : first source row and the byte step between rows
//   ref, ref_stride : first reference row and the byte step between rows
//   sse             : receives the sum of squared differences
//   sum             : receives the signed sum of differences
//
// Rows are read with unaligned loads, so neither pointer needs any
// particular alignment. Always returns 0; results come back via sse/sum.
unsigned int vp9_get16x16var_sse2(const uint8_t *src, int src_stride,
                                  const uint8_t *ref, int ref_stride,
                                  unsigned int *sse, int *sum) {
  const __m128i kZero = _mm_setzero_si128();
  __m128i diff_acc = kZero;  // eight signed 16-bit partial sums
  __m128i sq_acc = kZero;    // four 32-bit partial sums of squares
  __m128i folded;
  int rows_left = 16;

  while (rows_left-- > 0) {
    const __m128i src_row = _mm_loadu_si128((const __m128i *)src);
    const __m128i ref_row = _mm_loadu_si128((const __m128i *)ref);

    // Widen bytes to 16-bit lanes so the subtraction can go negative.
    const __m128i diff_lo = _mm_sub_epi16(_mm_unpacklo_epi8(src_row, kZero),
                                          _mm_unpacklo_epi8(ref_row, kZero));
    const __m128i diff_hi = _mm_sub_epi16(_mm_unpackhi_epi8(src_row, kZero),
                                          _mm_unpackhi_epi8(ref_row, kZero));

    diff_acc = _mm_add_epi16(diff_acc, _mm_add_epi16(diff_lo, diff_hi));

    // madd squares each lane and adds neighbouring pairs into 32 bits.
    sq_acc = _mm_add_epi32(sq_acc,
                           _mm_add_epi32(_mm_madd_epi16(diff_lo, diff_lo),
                                         _mm_madd_epi16(diff_hi, diff_hi)));

    src += src_stride;
    ref += ref_stride;
  }

  // Fold the eight 16-bit lanes down to two, then finish in scalar code
  // where the result is widened to int.
  folded = _mm_add_epi16(diff_acc, _mm_srli_si128(diff_acc, 8));
  folded = _mm_add_epi16(folded, _mm_srli_si128(folded, 4));
  *sum = (int16_t)_mm_extract_epi16(folded, 0) +
         (int16_t)_mm_extract_epi16(folded, 1);

  // Fold the four 32-bit lanes down to one.
  folded = _mm_add_epi32(sq_acc, _mm_srli_si128(sq_acc, 8));
  folded = _mm_add_epi32(folded, _mm_srli_si128(folded, 4));
  *sse = _mm_cvtsi128_si32(folded);

  return 0;
}
static
void
variance_sse2
(
const
unsigned
char
*
src
,
int
src_stride
,
const
unsigned
char
*
ref
,
int
ref_stride
,
...
...
@@ -173,8 +213,7 @@ unsigned int vp9_variance16x16_sse2(const unsigned char *src, int src_stride,
const
unsigned
char
*
ref
,
int
ref_stride
,
unsigned
int
*
sse
)
{
int
sum
;
variance_sse2
(
src
,
src_stride
,
ref
,
ref_stride
,
16
,
16
,
sse
,
&
sum
,
vp9_get16x16var_sse2
,
16
);
vp9_get16x16var_sse2
(
src
,
src_stride
,
ref
,
ref_stride
,
sse
,
&
sum
);
return
*
sse
-
(((
unsigned
int
)
sum
*
sum
)
>>
8
);
}
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment