Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
Xiph.Org
aom-rav1e
Commits
52141c91
Commit
52141c91
authored
Jun 21, 2016
by
Geza Lore
Browse files
Add 1D version of vpx_sum_squares_i16
Change-Id: I1829f931749a26aec38c896b609c5a2640d6dfaf
parent
f1a50db2
Changes
3
Hide whitespace changes
Inline
Side-by-side
vpx_dsp/sum_squares.c
View file @
52141c91
...
...
@@ -27,3 +27,13 @@ uint64_t vpx_sum_squares_2d_i16_c(const int16_t *src, int src_stride,
return
ss
;
}
// Returns the sum of the squares of the n 16-bit values starting at src.
//
//   src  first of n contiguous int16_t samples; not dereferenced when n == 0
//   n    number of samples to accumulate
//
// The loop is tested before each iteration, so an empty input (n == 0)
// returns 0 without touching src and without letting the counter wrap.
uint64_t vpx_sum_squares_i16_c(const int16_t *src, uint32_t n) {
  uint64_t ss = 0;
  uint32_t i;

  for (i = 0; i < n; ++i) {
    // |v| <= 32768, so v * v is at most 2^30 and cannot overflow 32 bits.
    const int32_t v = src[i];
    ss += (uint64_t)(v * v);
  }

  return ss;
}
vpx_dsp/vpx_dsp_rtcd_defs.pl
View file @
52141c91
...
...
@@ -972,6 +972,9 @@ if (vpx_config("CONFIG_VP10_ENCODER") eq "yes") {
#
# 2-D sum of squares over int16_t samples: declares the prototype and lists
# sse2 as a specialization.
# NOTE(review): add_proto/specialize are defined outside this view; presumably
# they drive generation of the vpx_dsp_rtcd.h dispatch -- confirm.
add_proto
qw/uint64_t vpx_sum_squares_2d_i16/
,
"
const int16_t *src, int stride, int size
";
specialize
qw/vpx_sum_squares_2d_i16 sse2/
;
# 1-D variant: sum of squares of N int16_t samples starting at src, also with
# an sse2 specialization.
add_proto
qw/uint64_t vpx_sum_squares_i16/
,
"
const int16_t *src, uint32_t N
";
specialize
qw/vpx_sum_squares_i16 sse2/
;
}
if
((
vpx_config
("
CONFIG_VP9_ENCODER
")
eq
"
yes
")
||
(
vpx_config
("
CONFIG_VP10_ENCODER
")
eq
"
yes
"))
{
...
...
vpx_dsp/x86/sum_squares_sse2.c
View file @
52141c91
...
...
@@ -12,6 +12,8 @@
#include <emmintrin.h>
#include <stdio.h>
#include "vpx_dsp/x86/synonyms.h"
#include "./vpx_dsp_rtcd.h"
static
uint64_t
vpx_sum_squares_2d_i16_4x4_sse2
(
const
int16_t
*
src
,
...
...
@@ -117,3 +119,77 @@ uint64_t vpx_sum_squares_2d_i16_sse2(const int16_t *src, int stride,
return
vpx_sum_squares_2d_i16_nxn_sse2
(
src
,
stride
,
size
);
}
}
//////////////////////////////////////////////////////////////////////////////
// 1D version
//////////////////////////////////////////////////////////////////////////////
// Sum of the squares of n int16_t values starting at src, where n must be a
// multiple of 64 (asserted). Returns 0 when n is 0.
//
// Every outer iteration consumes 64 samples. _mm_madd_epi16 squares the
// samples and adds neighbouring pairs into 32-bit lanes; the eight resulting
// vectors are summed with 32-bit adds, and the four lane totals are then
// zero-extended and folded into two 64-bit accumulators.
//
// NOTE(review): a 32-bit lane total covers 16 squares before it is widened,
// so it is exact only while that total stays below 2^32 (always true when
// |sample| < 16384) -- confirm that callers stay inside this range.
static uint64_t vpx_sum_squares_i16_64n_sse2(const int16_t *src, uint32_t n) {
  // Keeps the low 32 bits of each 64-bit lane (zero-extends the even lanes).
  const __m128i v_lo32_mask_q = _mm_set_epi32(0, 0xffffffff, 0, 0xffffffff);
  const int16_t *const stop = src + n;
  __m128i v_even_q = _mm_setzero_si128();
  __m128i v_odd_q = _mm_setzero_si128();
  __m128i v_total_q;

  assert(n % 64 == 0);

  for (; src < stop; src += 64) {
    __m128i v_sum_d = _mm_setzero_si128();
    int i;

    // Eight vectors of eight samples each; the trip count is a small
    // constant so the compiler is free to unroll it.
    for (i = 0; i < 64; i += 8) {
      const __m128i v_val_w = xx_load_128(src + i);
      v_sum_d = _mm_add_epi32(v_sum_d, _mm_madd_epi16(v_val_w, v_val_w));
    }

    // Widen to 64 bits: even 32-bit lanes via the mask, odd lanes via shift.
    v_even_q = _mm_add_epi64(v_even_q, _mm_and_si128(v_sum_d, v_lo32_mask_q));
    v_odd_q = _mm_add_epi64(v_odd_q, _mm_srli_epi64(v_sum_d, 32));
  }

  // Horizontal reduction: merge both accumulators, then add the upper
  // 64-bit half onto the lower one.
  v_total_q = _mm_add_epi64(v_even_q, v_odd_q);
  v_total_q = _mm_add_epi64(v_total_q, _mm_srli_si128(v_total_q, 8));

#if ARCH_X86_64
  return (uint64_t)_mm_cvtsi128_si64(v_total_q);
#else
  {
    // No 64-bit register move on this path; store the low half to memory.
    uint64_t result;
    _mm_storel_epi64((__m128i *)&result, v_total_q);
    return result;
  }
#endif
}
// Sum of the squares of the n int16_t values starting at src.
//
//   src  first of n contiguous int16_t samples
//   n    number of samples; any value, including 0, is accepted
//
// The largest leading run whose length is a multiple of 64 goes through the
// SSE2 kernel; whatever remains (1..63 samples) goes through the C version.
// The split point is kept in a uint32_t so that it can never turn into a
// negative pointer offset for very large n, and the C version is only ever
// called with a non-zero count.
uint64_t vpx_sum_squares_i16_sse2(const int16_t *src, uint32_t n) {
  const uint32_t k = n & ~(uint32_t)(64 - 1);  // n rounded down to 64
  uint64_t ss = 0;

  if (k != 0) {
    ss = vpx_sum_squares_i16_64n_sse2(src, k);
  }

  if (k != n) {
    ss += vpx_sum_squares_i16_c(src + k, n - k);
  }

  return ss;
}
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment