Skip to content
GitLab
Projects
Groups
Snippets
Help
Loading...
Help
What's new
7
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Open sidebar
Xiph.Org
aom-rav1e
Commits
05bd964a
Commit
05bd964a
authored
Jun 14, 2016
by
James Zern
Committed by
Gerrit Code Review
Jun 14, 2016
Browse files
Options
Browse Files
Download
Plain Diff
Merge "Revert "Add 1D version of vpx_sum_squares_i16"" into nextgenv2
parents
d2ca083c
5e831c54
Changes
5
Hide whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
93 additions
and
245 deletions
+93
-245
test/array_utils.h
test/array_utils.h
+0
-39
test/sum_squares_test.cc
test/sum_squares_test.cc
+89
-108
vpx_dsp/sum_squares.c
vpx_dsp/sum_squares.c
+1
-11
vpx_dsp/vpx_dsp_rtcd_defs.pl
vpx_dsp/vpx_dsp_rtcd_defs.pl
+1
-4
vpx_dsp/x86/sum_squares_sse2.c
vpx_dsp/x86/sum_squares_sse2.c
+2
-83
No files found.
test/array_utils.h
deleted
100644 → 0
View file @
d2ca083c
/*
* Copyright (c) 2016 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef TEST_ARRAY_UTILS_H_
#define TEST_ARRAY_UTILS_H_
#include "third_party/googletest/src/include/gtest/gtest.h"
namespace
libvpx_test
{
namespace
array_utils
{
// Set every element of a fixed-size 1-D array to the given value.
// The array length is deduced from the reference parameter, so callers
// cannot pass a mismatched size.
template<typename elem_t, size_t kCount, typename value_t>
void arraySet(elem_t (&dst)[kCount], const value_t &value) {
  for (size_t idx = 0; idx < kCount; ++idx) dst[idx] = value;
}
// Set every element of a fixed-size 2-D array to the given value.
// Both dimensions are deduced from the reference parameter.
template<typename elem_t, size_t kRows, size_t kCols, typename value_t>
void arraySet(elem_t (&dst)[kRows][kCols], const value_t &value) {
  for (size_t row = 0; row < kRows; ++row) {
    for (size_t col = 0; col < kCols; ++col) {
      dst[row][col] = value;
    }
  }
}
}
// namespace array_utils
}
// namespace libvpx_test
#endif // TEST_ARRAY_UTILS_H_
test/sum_squares_test.cc
View file @
05bd964a
...
...
@@ -8,144 +8,125 @@
* be found in the AUTHORS file in the root of the source tree.
*/
#include <cmath>
#include <cstdlib>
#include <string>
#include "third_party/googletest/src/include/gtest/gtest.h"
#include "./vpx_config.h"
#include "./vpx_dsp_rtcd.h"
#include "vpx_ports/mem.h"
#include "test/array_utils.h"
#include "test/assertion_helpers.h"
#include "test/function_equivalence_test.h"
#include "test/randomise.h"
#include "test/acm_random.h"
#include "test/clear_system_state.h"
#include "test/register_state_check.h"
#include "test/snapshot.h"
using
libvpx_test
::
FunctionEquivalenceTest
;
using
libvpx_test
::
Snapshot
;
using
libvpx_test
::
Randomise
;
using
libvpx_test
::
array_utils
::
arraySet
;
using
libvpx_test
::
assertion_helpers
::
ArraysEq
;
namespace
{
static
const
int16_t
int13_max
=
(
1
<<
12
)
-
1
;
//////////////////////////////////////////////////////////////////////////////
// 2D version
//////////////////////////////////////////////////////////////////////////////
typedef
uint64_t
(
*
F2D
)(
const
int16_t
*
src
,
int
stride
,
uint32_t
size
);
class
SumSquares2DTest
:
public
FunctionEquivalenceTest
<
F2D
>
{
protected:
void
Common
()
{
const
int
sizelog2
=
randomise
.
uniform
<
int
>
(
2
,
8
);
const
uint32_t
size
=
1
<<
sizelog2
;
const
int
stride
=
1
<<
randomise
.
uniform
<
int
>
(
sizelog2
,
9
);
snapshot
(
src
);
#include "test/util.h"
uint64_t
ref_res
,
tst_res
;
ref_res
=
ref_func_
(
src
,
stride
,
size
);
ASM_REGISTER_STATE_CHECK
(
tst_res
=
tst_func_
(
src
,
stride
,
size
));
using
libvpx_test
::
ACMRandom
;
ASSERT_EQ
(
ref_res
,
tst_res
);
ASSERT_TRUE
(
ArraysEq
(
snapshot
.
get
(
src
),
src
));
}
Snapshot
snapshot
;
Randomise
randomise
;
DECLARE_ALIGNED
(
16
,
int16_t
,
src
[
256
*
256
]);
};
namespace
{
const
int
kNumIterations
=
10000
;
TEST_P
(
SumSquares2DTest
,
RandomValues
)
{
for
(
int
i
=
0
;
i
<
10000
&&
!
HasFatalFailure
();
i
++
)
{
randomise
(
src
,
-
int13_max
,
int13_max
+
1
);
typedef
uint64_t
(
*
SSI16Func
)(
const
int16_t
*
src
,
int
stride
,
int
size
);
Common
();
}
}
typedef
std
::
tr1
::
tuple
<
SSI16Func
,
SSI16Func
>
SumSquaresParam
;
TEST_P
(
SumSquares2DTest
,
ExtremeValues
)
{
for
(
int
i
=
0
;
i
<
10000
&&
!
HasFatalFailure
();
i
++
)
{
if
(
randomise
.
uniform
<
bool
>
())
arraySet
(
src
,
int13_max
);
else
arraySet
(
src
,
-
int13_max
);
Common
();
class
SumSquaresTest
:
public
::
testing
::
TestWithParam
<
SumSquaresParam
>
{
public:
virtual
~
SumSquaresTest
()
{}
virtual
void
SetUp
()
{
ref_func_
=
GET_PARAM
(
0
);
tst_func_
=
GET_PARAM
(
1
);
}
}
using
std
::
tr1
::
make_tuple
;
#if HAVE_SSE2
INSTANTIATE_TEST_CASE_P
(
SSE2
,
SumSquares2DTest
,
::
testing
::
Values
(
make_tuple
(
&
vpx_sum_squares_2d_i16_c
,
&
vpx_sum_squares_2d_i16_sse2
)
)
);
#endif // HAVE_SSE2
//////////////////////////////////////////////////////////////////////////////
// 1D version
//////////////////////////////////////////////////////////////////////////////
virtual
void
TearDown
()
{
libvpx_test
::
ClearSystemState
();
}
typedef
uint64_t
(
*
F1D
)(
const
int16_t
*
src
,
uint32_t
N
);
class
SumSquares1DTest
:
public
FunctionEquivalenceTest
<
F1D
>
{
protected:
void
Common
()
{
const
int
N
=
randomise
.
uniform
<
int
>
(
1
,
256
*
256
-
1
);
snapshot
(
src
);
uint64_t
ref_res
,
tst_res
;
ref_res
=
ref_func_
(
src
,
N
);
ASM_REGISTER_STATE_CHECK
(
tst_res
=
tst_func_
(
src
,
N
));
ASSERT_EQ
(
ref_res
,
tst_res
);
ASSERT_TRUE
(
ArraysEq
(
snapshot
.
get
(
src
),
src
));
}
Snapshot
snapshot
;
Randomise
randomise
;
DECLARE_ALIGNED
(
16
,
int16_t
,
src
[
256
*
256
]);
SSI16Func
ref_func_
;
SSI16Func
tst_func_
;
};
TEST_P
(
SumSquares
1D
Test
,
RandomValues
)
{
for
(
int
i
=
0
;
i
<
10000
&&
!
HasFatalFailure
();
i
++
)
{
randomise
(
src
,
-
int1
3_max
,
int13_max
+
1
);
TEST_P
(
SumSquaresTest
,
OperationCheck
)
{
ACMRandom
rnd
(
ACMRandom
::
DeterministicSeed
());
DECLARE_ALIGNED
(
16
,
int1
6_t
,
src
[
256
*
256
]
);
Common
();
int
failed
=
0
;
const
int
msb
=
11
;
// Up to 12 bit input
const
int
limit
=
1
<<
(
msb
+
1
);
for
(
int
k
=
0
;
k
<
kNumIterations
;
k
++
)
{
int
size
=
4
<<
rnd
(
6
);
// Up to 128x128
int
stride
=
4
<<
rnd
(
7
);
// Up to 256 stride
while
(
stride
<
size
)
{
// Make sure it's valid
stride
=
4
<<
rnd
(
7
);
}
for
(
int
ii
=
0
;
ii
<
size
;
ii
++
)
{
for
(
int
jj
=
0
;
jj
<
size
;
jj
++
)
{
src
[
ii
*
stride
+
jj
]
=
rnd
(
2
)
?
rnd
(
limit
)
:
-
rnd
(
limit
);
}
}
uint64_t
res_ref
=
ref_func_
(
src
,
stride
,
size
);
uint64_t
res_tst
;
ASM_REGISTER_STATE_CHECK
(
res_tst
=
tst_func_
(
src
,
stride
,
size
));
if
(
!
failed
)
{
failed
=
res_ref
!=
res_tst
;
EXPECT_EQ
(
res_ref
,
res_tst
)
<<
"Error: Sum Squares Test"
<<
" C output does not match optimized output."
;
}
}
}
TEST_P
(
SumSquares1DTest
,
ExtremeValues
)
{
for
(
int
i
=
0
;
i
<
10000
&&
!
HasFatalFailure
();
i
++
)
{
if
(
randomise
.
uniform
<
bool
>
())
arraySet
(
src
,
int13_max
);
else
arraySet
(
src
,
-
int13_max
);
TEST_P
(
SumSquaresTest
,
ExtremeValues
)
{
ACMRandom
rnd
(
ACMRandom
::
DeterministicSeed
());
DECLARE_ALIGNED
(
16
,
int16_t
,
src
[
256
*
256
]);
Common
();
int
failed
=
0
;
const
int
msb
=
11
;
// Up to 12 bit input
const
int
limit
=
1
<<
(
msb
+
1
);
for
(
int
k
=
0
;
k
<
kNumIterations
;
k
++
)
{
int
size
=
4
<<
rnd
(
6
);
// Up to 128x128
int
stride
=
4
<<
rnd
(
7
);
// Up to 256 stride
while
(
stride
<
size
)
{
// Make sure it's valid
stride
=
4
<<
rnd
(
7
);
}
int
val
=
rnd
(
2
)
?
limit
-
1
:
-
(
limit
-
1
);
for
(
int
ii
=
0
;
ii
<
size
;
ii
++
)
{
for
(
int
jj
=
0
;
jj
<
size
;
jj
++
)
{
src
[
ii
*
stride
+
jj
]
=
val
;
}
}
uint64_t
res_ref
=
ref_func_
(
src
,
stride
,
size
);
uint64_t
res_tst
;
ASM_REGISTER_STATE_CHECK
(
res_tst
=
tst_func_
(
src
,
stride
,
size
));
if
(
!
failed
)
{
failed
=
res_ref
!=
res_tst
;
EXPECT_EQ
(
res_ref
,
res_tst
)
<<
"Error: Sum Squares Test"
<<
" C output does not match optimized output."
;
}
}
}
using
std
::
tr1
::
make_tuple
;
#if HAVE_SSE2
INSTANTIATE_TEST_CASE_P
(
SSE2
,
SumSquares
1D
Test
,
SSE2
,
SumSquaresTest
,
::
testing
::
Values
(
make_tuple
(
&
vpx_sum_squares_i16_c
,
&
vpx_sum_squares_i16_sse2
)
make_tuple
(
&
vpx_sum_squares_
2d_
i16_c
,
&
vpx_sum_squares_
2d_
i16_sse2
)
)
);
#endif // HAVE_SSE2
...
...
vpx_dsp/sum_squares.c
View file @
05bd964a
...
...
@@ -13,7 +13,7 @@
#include "./vpx_dsp_rtcd.h"
uint64_t
vpx_sum_squares_2d_i16_c
(
const
int16_t
*
src
,
int
src_stride
,
u
int
32_t
size
)
{
int
size
)
{
int
r
,
c
;
uint64_t
ss
=
0
;
...
...
@@ -27,13 +27,3 @@ uint64_t vpx_sum_squares_2d_i16_c(const int16_t *src, int src_stride,
return
ss
;
}
// C reference implementation: sum of squares of a 1-D block of n int16
// samples.
//
// src: pointer to the first sample.
// n:   number of samples; n == 0 is allowed and yields 0.
//
// Returns sum(src[i]^2) for i in [0, n).
//
// Note: the original used `do { ... } while (--n);`, which for n == 0 ran
// the body once and then wrapped n to UINT32_MAX, reading ~2^32 elements
// out of bounds.  A pre-tested loop makes n == 0 safe without changing the
// result for any n > 0.
uint64_t vpx_sum_squares_i16_c(const int16_t *src, uint32_t n) {
  uint64_t ss = 0;
  while (n--) {
    const int16_t v = *src++;
    // v*v <= 32768^2 fits comfortably in int; accumulated in 64 bits.
    ss += v * v;
  }
  return ss;
}
vpx_dsp/vpx_dsp_rtcd_defs.pl
View file @
05bd964a
...
...
@@ -970,11 +970,8 @@ if (vpx_config("CONFIG_VP10_ENCODER") eq "yes") {
#
# Sum of Squares
#
add_proto
qw/uint64_t vpx_sum_squares_2d_i16/
,
"
const int16_t *src, int stride,
u
int
32_t
size
";
add_proto
qw/uint64_t vpx_sum_squares_2d_i16/
,
"
const int16_t *src, int stride, int size
";
specialize
qw/vpx_sum_squares_2d_i16 sse2/
;
add_proto
qw/uint64_t vpx_sum_squares_i16/
,
"
const int16_t *src, uint32_t N
";
specialize
qw/vpx_sum_squares_i16 sse2/
;
}
if
((
vpx_config
("
CONFIG_VP9_ENCODER
")
eq
"
yes
")
||
(
vpx_config
("
CONFIG_VP10_ENCODER
")
eq
"
yes
"))
{
...
...
vpx_dsp/x86/sum_squares_sse2.c
View file @
05bd964a
...
...
@@ -12,14 +12,8 @@
#include <emmintrin.h>
#include <stdio.h>
#include "vpx_dsp/x86/synonyms.h"
#include "./vpx_dsp_rtcd.h"
//////////////////////////////////////////////////////////////////////////////
// 2D version
//////////////////////////////////////////////////////////////////////////////
static
uint64_t
vpx_sum_squares_2d_i16_4x4_sse2
(
const
int16_t
*
src
,
int
stride
)
{
const
__m128i
v_val_0_w
=
_mm_loadl_epi64
((
const
__m128i
*
)(
src
+
0
*
stride
));
...
...
@@ -50,7 +44,7 @@ __attribute__((noinline))
#endif
static
uint64_t
vpx_sum_squares_2d_i16_nxn_sse2
(
const
int16_t
*
src
,
int
stride
,
u
int
32_t
size
)
{
int
size
)
{
int
r
,
c
;
const
__m128i
v_zext_mask_q
=
_mm_set_epi32
(
0
,
0xffffffff
,
0
,
0xffffffff
);
...
...
@@ -112,7 +106,7 @@ static uint64_t vpx_sum_squares_2d_i16_nxn_sse2(const int16_t *src,
}
uint64_t
vpx_sum_squares_2d_i16_sse2
(
const
int16_t
*
src
,
int
stride
,
u
int
32_t
size
)
{
int
size
)
{
// 4 elements per row only requires half an XMM register, so this
// must be a special case, but also note that over 75% of all calls
// are with size == 4, so it is also the common case.
...
...
@@ -123,78 +117,3 @@ uint64_t vpx_sum_squares_2d_i16_sse2(const int16_t *src, int stride,
return
vpx_sum_squares_2d_i16_nxn_sse2
(
src
,
stride
,
size
);
}
}
//////////////////////////////////////////////////////////////////////////////
// 1D version
//////////////////////////////////////////////////////////////////////////////
// SSE2 kernel: sum of squares of n int16 samples, n a multiple of 64.
//
// Each 64-sample chunk is squared-and-paired with pmaddwd into four 32-bit
// lanes; those lanes are widened to 64 bits before being folded into the
// running total so the final sum cannot wrap.  (Per-chunk totals fit in
// 32 bits for the up-to-13-bit inputs this is used with.)
static uint64_t vpx_sum_squares_i16_64n_sse2(const int16_t *src, uint32_t n) {
  // AND with this keeps the even 32-bit lanes, zero-extended to 64 bits.
  const __m128i lo32_mask = _mm_set_epi32(0, 0xffffffff, 0, 0xffffffff);
  __m128i acc_even = _mm_setzero_si128();
  __m128i acc_odd = _mm_setzero_si128();
  const int16_t *const end = src + n;

  assert(n % 64 == 0);

  while (src < end) {
    // Sum of squares of one 64-sample chunk, held in four 32-bit lanes.
    __m128i chunk_sum = _mm_setzero_si128();
    int i;
    for (i = 0; i < 64; i += 8) {
      const __m128i vals = xx_load_128(src + i);
      chunk_sum = _mm_add_epi32(chunk_sum, _mm_madd_epi16(vals, vals));
    }
    // Split the four 32-bit lanes into two pairs of 64-bit lanes and
    // accumulate each pair separately.
    acc_even = _mm_add_epi64(acc_even, _mm_and_si128(chunk_sum, lo32_mask));
    acc_odd = _mm_add_epi64(acc_odd, _mm_srli_epi64(chunk_sum, 32));
    src += 64;
  }

  // Horizontal reduction: fold the two accumulators, then the two 64-bit
  // halves of the result.
  acc_even = _mm_add_epi64(acc_even, acc_odd);
  acc_even = _mm_add_epi64(acc_even, _mm_srli_si128(acc_even, 8));
#if ARCH_X86_64
  return (uint64_t)_mm_cvtsi128_si64(acc_even);
#else
  {
    uint64_t result;
    _mm_storel_epi64((__m128i *)&result, acc_even);
    return result;
  }
#endif
}
// SSE2 entry point: sum of squares of n int16 samples, any n.
//
// Dispatch: the vector kernel handles whole multiples of 64 samples; any
// remainder (and any n < 64) falls back to the C reference implementation.
//
// Fix: the rounded-down count was previously stored in a plain `int`,
// which overflows (implementation-defined / UB on conversion) for
// n >= 2^31.  Using uint32_t matches the parameter type and is correct
// for the full range of n.
uint64_t vpx_sum_squares_i16_sse2(const int16_t *src, uint32_t n) {
  if (n % 64 == 0) {
    return vpx_sum_squares_i16_64n_sse2(src, n);
  } else if (n > 64) {
    // Largest multiple of 64 not exceeding n.
    const uint32_t k = n & ~(uint32_t)(64 - 1);
    return vpx_sum_squares_i16_64n_sse2(src, k) +
           vpx_sum_squares_i16_c(src + k, n - k);
  } else {
    return vpx_sum_squares_i16_c(src, n);
  }
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment