Skip to content
GitLab
Projects
Groups
Snippets
Help
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
A
aom-rav1e
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Issues
0
Issues
0
List
Boards
Labels
Service Desk
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Operations
Operations
Incidents
Environments
Packages & Registries
Packages & Registries
Container Registry
Analytics
Analytics
CI / CD
Repository
Value Stream
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Xiph.Org
aom-rav1e
Commits
f0f00251
Commit
f0f00251
authored
Jul 22, 2015
by
Jingning Han
Committed by
Gerrit Code Review
Jul 22, 2015
Browse files
Options
Browse Files
Download
Plain Diff
Merge "Factor forward 2D-DCT transforms into vpx_dsp"
parents
34c4e1d6
b67821f3
Changes
18
Hide whitespace changes
Inline
Side-by-side
Showing
18 changed files
with
867 additions
and
717 deletions
+867
-717
test/dct16x16_test.cc
test/dct16x16_test.cc
+3
-1
test/fdct4x4_test.cc
test/fdct4x4_test.cc
+3
-1
test/fdct8x8_test.cc
test/fdct8x8_test.cc
+3
-1
test/partial_idct_test.cc
test/partial_idct_test.cc
+1
-0
vp9/common/vp9_rtcd_defs.pl
vp9/common/vp9_rtcd_defs.pl
+0
-27
vp9/encoder/arm/neon/vp9_dct_neon.c
vp9/encoder/arm/neon/vp9_dct_neon.c
+1
-190
vp9/encoder/vp9_dct.c
vp9/encoder/vp9_dct.c
+3
-342
vp9/encoder/x86/vp9_dct_sse2.c
vp9/encoder/x86/vp9_dct_sse2.c
+0
-16
vp9/encoder/x86/vp9_dct_ssse3_x86_64.asm
vp9/encoder/x86/vp9_dct_ssse3_x86_64.asm
+0
-137
vp9/vp9cx.mk
vp9/vp9cx.mk
+0
-1
vpx_dsp/arm/fwd_txfm_neon.c
vpx_dsp/arm/fwd_txfm_neon.c
+202
-0
vpx_dsp/fwd_txfm.c
vpx_dsp/fwd_txfm.c
+361
-0
vpx_dsp/fwd_txfm.h
vpx_dsp/fwd_txfm.h
+19
-0
vpx_dsp/vpx_dsp.mk
vpx_dsp/vpx_dsp.mk
+13
-0
vpx_dsp/vpx_dsp_rtcd_defs.pl
vpx_dsp/vpx_dsp_rtcd_defs.pl
+38
-0
vpx_dsp/x86/fwd_txfm_impl_sse2.h
vpx_dsp/x86/fwd_txfm_impl_sse2.h
+4
-1
vpx_dsp/x86/fwd_txfm_sse2.c
vpx_dsp/x86/fwd_txfm_sse2.c
+34
-0
vpx_dsp/x86/fwd_txfm_ssse3.asm
vpx_dsp/x86/fwd_txfm_ssse3.asm
+182
-0
No files found.
test/dct16x16_test.cc
View file @
f0f00251
...
...
@@ -19,6 +19,7 @@
#include "test/util.h"
#include "./vp9_rtcd.h"
#include "./vpx_dsp_rtcd.h"
#include "vp9/common/vp9_entropy.h"
#include "vp9/common/vp9_scan.h"
#include "vpx/vpx_codec.h"
...
...
@@ -921,7 +922,8 @@ INSTANTIATE_TEST_CASE_P(
&
idct16x16_256_add_12_sse2
,
3167
,
VPX_BITS_12
)));
#endif // HAVE_SSE2 && CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
#if HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
// TODO(jingning) Re-enable the mips/msa unit test.
#if HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE && 0
INSTANTIATE_TEST_CASE_P
(
MSA
,
Trans16x16DCT
,
::
testing
::
Values
(
...
...
test/fdct4x4_test.cc
View file @
f0f00251
...
...
@@ -19,6 +19,7 @@
#include "test/util.h"
#include "./vp9_rtcd.h"
#include "./vpx_dsp_rtcd.h"
#include "vp9/common/vp9_entropy.h"
#include "vpx/vpx_codec.h"
#include "vpx/vpx_integer.h"
...
...
@@ -537,7 +538,8 @@ INSTANTIATE_TEST_CASE_P(
make_tuple
(
&
vp9_fht4x4_sse2
,
&
vp9_iht4x4_16_add_c
,
3
,
VPX_BITS_8
)));
#endif // HAVE_SSE2 && CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
#if HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
// TODO(jingning) Re-enable the mips/msa unit test.
#if HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE && 0
INSTANTIATE_TEST_CASE_P
(
MSA
,
Trans4x4DCT
,
::
testing
::
Values
(
...
...
test/fdct8x8_test.cc
View file @
f0f00251
...
...
@@ -19,6 +19,7 @@
#include "test/util.h"
#include "./vp9_rtcd.h"
#include "./vpx_dsp_rtcd.h"
#include "vp9/common/vp9_entropy.h"
#include "vp9/common/vp9_scan.h"
#include "vpx/vpx_codec.h"
...
...
@@ -772,7 +773,8 @@ INSTANTIATE_TEST_CASE_P(
VPX_BITS_8
)));
#endif
#if HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
// TODO(jingning) Re-enable the mips/msa unit test.
#if HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE && 0
INSTANTIATE_TEST_CASE_P
(
MSA
,
FwdTrans8x8DCT
,
::
testing
::
Values
(
...
...
test/partial_idct_test.cc
View file @
f0f00251
...
...
@@ -19,6 +19,7 @@
#include "test/util.h"
#include "./vp9_rtcd.h"
#include "./vpx_dsp_rtcd.h"
#include "vp9/common/vp9_blockd.h"
#include "vp9/common/vp9_scan.h"
#include "vpx/vpx_integer.h"
...
...
vp9/common/vp9_rtcd_defs.pl
View file @
f0f00251
...
...
@@ -829,21 +829,12 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
add_proto
qw/void vp9_fdct4x4_1/
,
"
const int16_t *input, tran_low_t *output, int stride
";
specialize
qw/vp9_fdct4x4_1 sse2/
;
add_proto
qw/void vp9_fdct4x4/
,
"
const int16_t *input, tran_low_t *output, int stride
";
specialize
qw/vp9_fdct4x4 sse2/
;
add_proto
qw/void vp9_fdct8x8_1/
,
"
const int16_t *input, tran_low_t *output, int stride
";
specialize
qw/vp9_fdct8x8_1 sse2/
;
add_proto
qw/void vp9_fdct8x8/
,
"
const int16_t *input, tran_low_t *output, int stride
";
specialize
qw/vp9_fdct8x8 sse2/
;
add_proto
qw/void vp9_fdct16x16_1/
,
"
const int16_t *input, tran_low_t *output, int stride
";
specialize
qw/vp9_fdct16x16_1 sse2/
;
add_proto
qw/void vp9_fdct16x16/
,
"
const int16_t *input, tran_low_t *output, int stride
";
specialize
qw/vp9_fdct16x16 sse2/
;
add_proto
qw/void vp9_fdct32x32_1/
,
"
const int16_t *input, tran_low_t *output, int stride
";
specialize
qw/vp9_fdct32x32_1 sse2/
;
...
...
@@ -868,21 +859,12 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
add_proto
qw/void vp9_fdct4x4_1/
,
"
const int16_t *input, tran_low_t *output, int stride
";
specialize
qw/vp9_fdct4x4_1 sse2/
;
add_proto
qw/void vp9_fdct4x4/
,
"
const int16_t *input, tran_low_t *output, int stride
";
specialize
qw/vp9_fdct4x4 sse2 msa/
;
add_proto
qw/void vp9_fdct8x8_1/
,
"
const int16_t *input, tran_low_t *output, int stride
";
specialize
qw/vp9_fdct8x8_1 sse2 neon msa/
;
add_proto
qw/void vp9_fdct8x8/
,
"
const int16_t *input, tran_low_t *output, int stride
";
specialize
qw/vp9_fdct8x8 sse2 neon msa/
,
"
$ssse3_x86_64_x86inc
";
add_proto
qw/void vp9_fdct16x16_1/
,
"
const int16_t *input, tran_low_t *output, int stride
";
specialize
qw/vp9_fdct16x16_1 sse2 msa/
;
add_proto
qw/void vp9_fdct16x16/
,
"
const int16_t *input, tran_low_t *output, int stride
";
specialize
qw/vp9_fdct16x16 sse2 msa/
;
add_proto
qw/void vp9_fdct32x32_1/
,
"
const int16_t *input, tran_low_t *output, int stride
";
specialize
qw/vp9_fdct32x32_1 sse2 msa/
;
...
...
@@ -944,21 +926,12 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
add_proto
qw/void vp9_highbd_fwht4x4/
,
"
const int16_t *input, tran_low_t *output, int stride
";
specialize
qw/vp9_highbd_fwht4x4/
;
add_proto
qw/void vp9_highbd_fdct4x4/
,
"
const int16_t *input, tran_low_t *output, int stride
";
specialize
qw/vp9_highbd_fdct4x4 sse2/
;
add_proto
qw/void vp9_highbd_fdct8x8_1/
,
"
const int16_t *input, tran_low_t *output, int stride
";
specialize
qw/vp9_highbd_fdct8x8_1/
;
add_proto
qw/void vp9_highbd_fdct8x8/
,
"
const int16_t *input, tran_low_t *output, int stride
";
specialize
qw/vp9_highbd_fdct8x8 sse2/
;
add_proto
qw/void vp9_highbd_fdct16x16_1/
,
"
const int16_t *input, tran_low_t *output, int stride
";
specialize
qw/vp9_highbd_fdct16x16_1/
;
add_proto
qw/void vp9_highbd_fdct16x16/
,
"
const int16_t *input, tran_low_t *output, int stride
";
specialize
qw/vp9_highbd_fdct16x16 sse2/
;
add_proto
qw/void vp9_highbd_fdct32x32_1/
,
"
const int16_t *input, tran_low_t *output, int stride
";
specialize
qw/vp9_highbd_fdct32x32_1/
;
...
...
vp9/encoder/arm/neon/vp9_dct_neon.c
View file @
f0f00251
...
...
@@ -10,6 +10,7 @@
#include <arm_neon.h>
#include "./vp9_rtcd.h"
#include "./vpx_dsp_rtcd.h"
#include "./vpx_config.h"
#include "vp9/common/vp9_blockd.h"
...
...
@@ -49,193 +50,3 @@ void vp9_fdct8x8_quant_neon(const int16_t *input, int stride,
quant_ptr
,
quant_shift_ptr
,
qcoeff_ptr
,
dqcoeff_ptr
,
dequant_ptr
,
eob_ptr
,
scan_ptr
,
iscan_ptr
);
}
void
vp9_fdct8x8_neon
(
const
int16_t
*
input
,
int16_t
*
final_output
,
int
stride
)
{
int
i
;
// stage 1
int16x8_t
input_0
=
vshlq_n_s16
(
vld1q_s16
(
&
input
[
0
*
stride
]),
2
);
int16x8_t
input_1
=
vshlq_n_s16
(
vld1q_s16
(
&
input
[
1
*
stride
]),
2
);
int16x8_t
input_2
=
vshlq_n_s16
(
vld1q_s16
(
&
input
[
2
*
stride
]),
2
);
int16x8_t
input_3
=
vshlq_n_s16
(
vld1q_s16
(
&
input
[
3
*
stride
]),
2
);
int16x8_t
input_4
=
vshlq_n_s16
(
vld1q_s16
(
&
input
[
4
*
stride
]),
2
);
int16x8_t
input_5
=
vshlq_n_s16
(
vld1q_s16
(
&
input
[
5
*
stride
]),
2
);
int16x8_t
input_6
=
vshlq_n_s16
(
vld1q_s16
(
&
input
[
6
*
stride
]),
2
);
int16x8_t
input_7
=
vshlq_n_s16
(
vld1q_s16
(
&
input
[
7
*
stride
]),
2
);
for
(
i
=
0
;
i
<
2
;
++
i
)
{
int16x8_t
out_0
,
out_1
,
out_2
,
out_3
,
out_4
,
out_5
,
out_6
,
out_7
;
const
int16x8_t
v_s0
=
vaddq_s16
(
input_0
,
input_7
);
const
int16x8_t
v_s1
=
vaddq_s16
(
input_1
,
input_6
);
const
int16x8_t
v_s2
=
vaddq_s16
(
input_2
,
input_5
);
const
int16x8_t
v_s3
=
vaddq_s16
(
input_3
,
input_4
);
const
int16x8_t
v_s4
=
vsubq_s16
(
input_3
,
input_4
);
const
int16x8_t
v_s5
=
vsubq_s16
(
input_2
,
input_5
);
const
int16x8_t
v_s6
=
vsubq_s16
(
input_1
,
input_6
);
const
int16x8_t
v_s7
=
vsubq_s16
(
input_0
,
input_7
);
// fdct4(step, step);
int16x8_t
v_x0
=
vaddq_s16
(
v_s0
,
v_s3
);
int16x8_t
v_x1
=
vaddq_s16
(
v_s1
,
v_s2
);
int16x8_t
v_x2
=
vsubq_s16
(
v_s1
,
v_s2
);
int16x8_t
v_x3
=
vsubq_s16
(
v_s0
,
v_s3
);
// fdct4(step, step);
int32x4_t
v_t0_lo
=
vaddl_s16
(
vget_low_s16
(
v_x0
),
vget_low_s16
(
v_x1
));
int32x4_t
v_t0_hi
=
vaddl_s16
(
vget_high_s16
(
v_x0
),
vget_high_s16
(
v_x1
));
int32x4_t
v_t1_lo
=
vsubl_s16
(
vget_low_s16
(
v_x0
),
vget_low_s16
(
v_x1
));
int32x4_t
v_t1_hi
=
vsubl_s16
(
vget_high_s16
(
v_x0
),
vget_high_s16
(
v_x1
));
int32x4_t
v_t2_lo
=
vmull_n_s16
(
vget_low_s16
(
v_x2
),
(
int16_t
)
cospi_24_64
);
int32x4_t
v_t2_hi
=
vmull_n_s16
(
vget_high_s16
(
v_x2
),
(
int16_t
)
cospi_24_64
);
int32x4_t
v_t3_lo
=
vmull_n_s16
(
vget_low_s16
(
v_x3
),
(
int16_t
)
cospi_24_64
);
int32x4_t
v_t3_hi
=
vmull_n_s16
(
vget_high_s16
(
v_x3
),
(
int16_t
)
cospi_24_64
);
v_t2_lo
=
vmlal_n_s16
(
v_t2_lo
,
vget_low_s16
(
v_x3
),
(
int16_t
)
cospi_8_64
);
v_t2_hi
=
vmlal_n_s16
(
v_t2_hi
,
vget_high_s16
(
v_x3
),
(
int16_t
)
cospi_8_64
);
v_t3_lo
=
vmlsl_n_s16
(
v_t3_lo
,
vget_low_s16
(
v_x2
),
(
int16_t
)
cospi_8_64
);
v_t3_hi
=
vmlsl_n_s16
(
v_t3_hi
,
vget_high_s16
(
v_x2
),
(
int16_t
)
cospi_8_64
);
v_t0_lo
=
vmulq_n_s32
(
v_t0_lo
,
cospi_16_64
);
v_t0_hi
=
vmulq_n_s32
(
v_t0_hi
,
cospi_16_64
);
v_t1_lo
=
vmulq_n_s32
(
v_t1_lo
,
cospi_16_64
);
v_t1_hi
=
vmulq_n_s32
(
v_t1_hi
,
cospi_16_64
);
{
const
int16x4_t
a
=
vrshrn_n_s32
(
v_t0_lo
,
DCT_CONST_BITS
);
const
int16x4_t
b
=
vrshrn_n_s32
(
v_t0_hi
,
DCT_CONST_BITS
);
const
int16x4_t
c
=
vrshrn_n_s32
(
v_t1_lo
,
DCT_CONST_BITS
);
const
int16x4_t
d
=
vrshrn_n_s32
(
v_t1_hi
,
DCT_CONST_BITS
);
const
int16x4_t
e
=
vrshrn_n_s32
(
v_t2_lo
,
DCT_CONST_BITS
);
const
int16x4_t
f
=
vrshrn_n_s32
(
v_t2_hi
,
DCT_CONST_BITS
);
const
int16x4_t
g
=
vrshrn_n_s32
(
v_t3_lo
,
DCT_CONST_BITS
);
const
int16x4_t
h
=
vrshrn_n_s32
(
v_t3_hi
,
DCT_CONST_BITS
);
out_0
=
vcombine_s16
(
a
,
c
);
// 00 01 02 03 40 41 42 43
out_2
=
vcombine_s16
(
e
,
g
);
// 20 21 22 23 60 61 62 63
out_4
=
vcombine_s16
(
b
,
d
);
// 04 05 06 07 44 45 46 47
out_6
=
vcombine_s16
(
f
,
h
);
// 24 25 26 27 64 65 66 67
}
// Stage 2
v_x0
=
vsubq_s16
(
v_s6
,
v_s5
);
v_x1
=
vaddq_s16
(
v_s6
,
v_s5
);
v_t0_lo
=
vmull_n_s16
(
vget_low_s16
(
v_x0
),
(
int16_t
)
cospi_16_64
);
v_t0_hi
=
vmull_n_s16
(
vget_high_s16
(
v_x0
),
(
int16_t
)
cospi_16_64
);
v_t1_lo
=
vmull_n_s16
(
vget_low_s16
(
v_x1
),
(
int16_t
)
cospi_16_64
);
v_t1_hi
=
vmull_n_s16
(
vget_high_s16
(
v_x1
),
(
int16_t
)
cospi_16_64
);
{
const
int16x4_t
a
=
vrshrn_n_s32
(
v_t0_lo
,
DCT_CONST_BITS
);
const
int16x4_t
b
=
vrshrn_n_s32
(
v_t0_hi
,
DCT_CONST_BITS
);
const
int16x4_t
c
=
vrshrn_n_s32
(
v_t1_lo
,
DCT_CONST_BITS
);
const
int16x4_t
d
=
vrshrn_n_s32
(
v_t1_hi
,
DCT_CONST_BITS
);
const
int16x8_t
ab
=
vcombine_s16
(
a
,
b
);
const
int16x8_t
cd
=
vcombine_s16
(
c
,
d
);
// Stage 3
v_x0
=
vaddq_s16
(
v_s4
,
ab
);
v_x1
=
vsubq_s16
(
v_s4
,
ab
);
v_x2
=
vsubq_s16
(
v_s7
,
cd
);
v_x3
=
vaddq_s16
(
v_s7
,
cd
);
}
// Stage 4
v_t0_lo
=
vmull_n_s16
(
vget_low_s16
(
v_x3
),
(
int16_t
)
cospi_4_64
);
v_t0_hi
=
vmull_n_s16
(
vget_high_s16
(
v_x3
),
(
int16_t
)
cospi_4_64
);
v_t0_lo
=
vmlal_n_s16
(
v_t0_lo
,
vget_low_s16
(
v_x0
),
(
int16_t
)
cospi_28_64
);
v_t0_hi
=
vmlal_n_s16
(
v_t0_hi
,
vget_high_s16
(
v_x0
),
(
int16_t
)
cospi_28_64
);
v_t1_lo
=
vmull_n_s16
(
vget_low_s16
(
v_x1
),
(
int16_t
)
cospi_12_64
);
v_t1_hi
=
vmull_n_s16
(
vget_high_s16
(
v_x1
),
(
int16_t
)
cospi_12_64
);
v_t1_lo
=
vmlal_n_s16
(
v_t1_lo
,
vget_low_s16
(
v_x2
),
(
int16_t
)
cospi_20_64
);
v_t1_hi
=
vmlal_n_s16
(
v_t1_hi
,
vget_high_s16
(
v_x2
),
(
int16_t
)
cospi_20_64
);
v_t2_lo
=
vmull_n_s16
(
vget_low_s16
(
v_x2
),
(
int16_t
)
cospi_12_64
);
v_t2_hi
=
vmull_n_s16
(
vget_high_s16
(
v_x2
),
(
int16_t
)
cospi_12_64
);
v_t2_lo
=
vmlsl_n_s16
(
v_t2_lo
,
vget_low_s16
(
v_x1
),
(
int16_t
)
cospi_20_64
);
v_t2_hi
=
vmlsl_n_s16
(
v_t2_hi
,
vget_high_s16
(
v_x1
),
(
int16_t
)
cospi_20_64
);
v_t3_lo
=
vmull_n_s16
(
vget_low_s16
(
v_x3
),
(
int16_t
)
cospi_28_64
);
v_t3_hi
=
vmull_n_s16
(
vget_high_s16
(
v_x3
),
(
int16_t
)
cospi_28_64
);
v_t3_lo
=
vmlsl_n_s16
(
v_t3_lo
,
vget_low_s16
(
v_x0
),
(
int16_t
)
cospi_4_64
);
v_t3_hi
=
vmlsl_n_s16
(
v_t3_hi
,
vget_high_s16
(
v_x0
),
(
int16_t
)
cospi_4_64
);
{
const
int16x4_t
a
=
vrshrn_n_s32
(
v_t0_lo
,
DCT_CONST_BITS
);
const
int16x4_t
b
=
vrshrn_n_s32
(
v_t0_hi
,
DCT_CONST_BITS
);
const
int16x4_t
c
=
vrshrn_n_s32
(
v_t1_lo
,
DCT_CONST_BITS
);
const
int16x4_t
d
=
vrshrn_n_s32
(
v_t1_hi
,
DCT_CONST_BITS
);
const
int16x4_t
e
=
vrshrn_n_s32
(
v_t2_lo
,
DCT_CONST_BITS
);
const
int16x4_t
f
=
vrshrn_n_s32
(
v_t2_hi
,
DCT_CONST_BITS
);
const
int16x4_t
g
=
vrshrn_n_s32
(
v_t3_lo
,
DCT_CONST_BITS
);
const
int16x4_t
h
=
vrshrn_n_s32
(
v_t3_hi
,
DCT_CONST_BITS
);
out_1
=
vcombine_s16
(
a
,
c
);
// 10 11 12 13 50 51 52 53
out_3
=
vcombine_s16
(
e
,
g
);
// 30 31 32 33 70 71 72 73
out_5
=
vcombine_s16
(
b
,
d
);
// 14 15 16 17 54 55 56 57
out_7
=
vcombine_s16
(
f
,
h
);
// 34 35 36 37 74 75 76 77
}
// transpose 8x8
{
// 00 01 02 03 40 41 42 43
// 10 11 12 13 50 51 52 53
// 20 21 22 23 60 61 62 63
// 30 31 32 33 70 71 72 73
// 04 05 06 07 44 45 46 47
// 14 15 16 17 54 55 56 57
// 24 25 26 27 64 65 66 67
// 34 35 36 37 74 75 76 77
const
int32x4x2_t
r02_s32
=
vtrnq_s32
(
vreinterpretq_s32_s16
(
out_0
),
vreinterpretq_s32_s16
(
out_2
));
const
int32x4x2_t
r13_s32
=
vtrnq_s32
(
vreinterpretq_s32_s16
(
out_1
),
vreinterpretq_s32_s16
(
out_3
));
const
int32x4x2_t
r46_s32
=
vtrnq_s32
(
vreinterpretq_s32_s16
(
out_4
),
vreinterpretq_s32_s16
(
out_6
));
const
int32x4x2_t
r57_s32
=
vtrnq_s32
(
vreinterpretq_s32_s16
(
out_5
),
vreinterpretq_s32_s16
(
out_7
));
const
int16x8x2_t
r01_s16
=
vtrnq_s16
(
vreinterpretq_s16_s32
(
r02_s32
.
val
[
0
]),
vreinterpretq_s16_s32
(
r13_s32
.
val
[
0
]));
const
int16x8x2_t
r23_s16
=
vtrnq_s16
(
vreinterpretq_s16_s32
(
r02_s32
.
val
[
1
]),
vreinterpretq_s16_s32
(
r13_s32
.
val
[
1
]));
const
int16x8x2_t
r45_s16
=
vtrnq_s16
(
vreinterpretq_s16_s32
(
r46_s32
.
val
[
0
]),
vreinterpretq_s16_s32
(
r57_s32
.
val
[
0
]));
const
int16x8x2_t
r67_s16
=
vtrnq_s16
(
vreinterpretq_s16_s32
(
r46_s32
.
val
[
1
]),
vreinterpretq_s16_s32
(
r57_s32
.
val
[
1
]));
input_0
=
r01_s16
.
val
[
0
];
input_1
=
r01_s16
.
val
[
1
];
input_2
=
r23_s16
.
val
[
0
];
input_3
=
r23_s16
.
val
[
1
];
input_4
=
r45_s16
.
val
[
0
];
input_5
=
r45_s16
.
val
[
1
];
input_6
=
r67_s16
.
val
[
0
];
input_7
=
r67_s16
.
val
[
1
];
// 00 10 20 30 40 50 60 70
// 01 11 21 31 41 51 61 71
// 02 12 22 32 42 52 62 72
// 03 13 23 33 43 53 63 73
// 04 14 24 34 44 54 64 74
// 05 15 25 35 45 55 65 75
// 06 16 26 36 46 56 66 76
// 07 17 27 37 47 57 67 77
}
}
// for
{
// from vp9_dct_sse2.c
// Post-condition (division by two)
// division of two 16 bits signed numbers using shifts
// n / 2 = (n - (n >> 15)) >> 1
const
int16x8_t
sign_in0
=
vshrq_n_s16
(
input_0
,
15
);
const
int16x8_t
sign_in1
=
vshrq_n_s16
(
input_1
,
15
);
const
int16x8_t
sign_in2
=
vshrq_n_s16
(
input_2
,
15
);
const
int16x8_t
sign_in3
=
vshrq_n_s16
(
input_3
,
15
);
const
int16x8_t
sign_in4
=
vshrq_n_s16
(
input_4
,
15
);
const
int16x8_t
sign_in5
=
vshrq_n_s16
(
input_5
,
15
);
const
int16x8_t
sign_in6
=
vshrq_n_s16
(
input_6
,
15
);
const
int16x8_t
sign_in7
=
vshrq_n_s16
(
input_7
,
15
);
input_0
=
vhsubq_s16
(
input_0
,
sign_in0
);
input_1
=
vhsubq_s16
(
input_1
,
sign_in1
);
input_2
=
vhsubq_s16
(
input_2
,
sign_in2
);
input_3
=
vhsubq_s16
(
input_3
,
sign_in3
);
input_4
=
vhsubq_s16
(
input_4
,
sign_in4
);
input_5
=
vhsubq_s16
(
input_5
,
sign_in5
);
input_6
=
vhsubq_s16
(
input_6
,
sign_in6
);
input_7
=
vhsubq_s16
(
input_7
,
sign_in7
);
// store results
vst1q_s16
(
&
final_output
[
0
*
8
],
input_0
);
vst1q_s16
(
&
final_output
[
1
*
8
],
input_1
);
vst1q_s16
(
&
final_output
[
2
*
8
],
input_2
);
vst1q_s16
(
&
final_output
[
3
*
8
],
input_3
);
vst1q_s16
(
&
final_output
[
4
*
8
],
input_4
);
vst1q_s16
(
&
final_output
[
5
*
8
],
input_5
);
vst1q_s16
(
&
final_output
[
6
*
8
],
input_6
);
vst1q_s16
(
&
final_output
[
7
*
8
],
input_7
);
}
}
vp9/encoder/vp9_dct.c
View file @
f0f00251
...
...
@@ -13,20 +13,14 @@
#include "./vpx_config.h"
#include "./vp9_rtcd.h"
#include "./vpx_dsp_rtcd.h"
#include "vpx_ports/mem.h"
#include "vp9/common/vp9_blockd.h"
#include "vp9/common/vp9_idct.h"
#include "vp9/common/vp9_systemdependent.h"
#include "vp9/encoder/vp9_dct.h"
// Applies ROUND_POWER_OF_TWO to |input| with a shift of DCT_CONST_BITS and
// returns the result, still as a tran_high_t.
static INLINE tran_high_t fdct_round_shift(tran_high_t input) {
  // TODO(debargha, peter.derivaz): Find new bounds for this assert
  // and make the bounds consts.
  // assert(INT16_MIN <= result && result <= INT16_MAX);
  return ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
}
#include "vpx_ports/mem.h"
#include "vpx_dsp/fwd_txfm.h"
static
void
fdct4
(
const
tran_low_t
*
input
,
tran_low_t
*
output
)
{
tran_high_t
step
[
4
];
...
...
@@ -546,73 +540,6 @@ void vp9_fdct4x4_1_c(const int16_t *input, tran_low_t *output, int stride) {
output
[
1
]
=
0
;
}
// Forward 4x4 2D DCT (C reference implementation).
//
// input:  source samples; element (row r, column c) is input[c + r * stride].
// output: receives the 16 coefficients as 4 rows of 4 values.
// stride: distance, in int16_t elements, between input rows.
//
// Two near-identical passes are run. The first transforms each column of
// |input| and stores the result transposed in a scratch buffer; the second
// does the same on the scratch buffer, so its transposed result lands in
// |output| in normal row order. A last sweep applies (x + 1) >> 2 to every
// coefficient.
void vp9_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride) {
  // Scratch buffer carrying the transposed first-pass result.
  tran_low_t transposed[4 * 4];
  int stage;
  int idx;

  for (stage = 0; stage < 2; ++stage) {
    // Pass 0 writes the scratch buffer, pass 1 writes the caller's buffer.
    tran_low_t *const dst = (stage == 0) ? transposed : output;
    int col;
    for (col = 0; col < 4; ++col) {
      tran_high_t v[4];            // canbe16
      tran_high_t s[4];            // canbe16
      tran_high_t prod_a, prod_b;  // needs32

      // Load one column.
      if (stage != 0) {
        v[0] = transposed[col + 0 * 4];
        v[1] = transposed[col + 1 * 4];
        v[2] = transposed[col + 2 * 4];
        v[3] = transposed[col + 3 * 4];
      } else {
        // First pass reads the source samples scaled by 16.
        v[0] = input[col + 0 * stride] * 16;
        v[1] = input[col + 1 * stride] * 16;
        v[2] = input[col + 2 * stride] * 16;
        v[3] = input[col + 3 * stride] * 16;
        // Only for the very first column: a non-zero top sample gets +1.
        if (col == 0 && v[0]) {
          v[0] += 1;
        }
      }

      // 4-point butterfly.
      s[0] = v[0] + v[3];
      s[1] = v[1] + v[2];
      s[2] = v[1] - v[2];
      s[3] = v[0] - v[3];

      // Even outputs.
      prod_a = (s[0] + s[1]) * cospi_16_64;
      prod_b = (s[0] - s[1]) * cospi_16_64;
      dst[col * 4 + 0] = (tran_low_t)fdct_round_shift(prod_a);
      dst[col * 4 + 2] = (tran_low_t)fdct_round_shift(prod_b);

      // Odd outputs.
      prod_a = s[2] * cospi_24_64 + s[3] * cospi_8_64;
      prod_b = -s[2] * cospi_8_64 + s[3] * cospi_24_64;
      dst[col * 4 + 1] = (tran_low_t)fdct_round_shift(prod_a);
      dst[col * 4 + 3] = (tran_low_t)fdct_round_shift(prod_b);
    }
  }

  // Final scaling of all 16 coefficients.
  for (idx = 0; idx < 4 * 4; ++idx) {
    output[idx] = (output[idx] + 1) >> 2;
  }
}
void
vp9_fht4x4_c
(
const
int16_t
*
input
,
tran_low_t
*
output
,
int
stride
,
int
tx_type
)
{
if
(
tx_type
==
DCT_DCT
)
{
...
...
@@ -656,77 +583,6 @@ void vp9_fdct8x8_1_c(const int16_t *input, tran_low_t *output, int stride) {
output
[
1
]
=
0
;
}
// Forward 8x8 2D DCT (C reference implementation).
//
// input:        source samples; element (row r, column c) is
//               input[c + r * stride].
// final_output: receives 64 coefficients as 8 rows of 8 values.
// stride:       distance, in int16_t elements, between input rows.
//
// The columns are transformed first (with the samples pre-scaled by 4) into
// a scratch buffer. Each scratch row is then passed through fdct8() and the
// resulting coefficients are divided by 2.
void vp9_fdct8x8_c(const int16_t *input, tran_low_t *final_output, int stride) {
  tran_low_t intermediate[64];
  int col;
  int row;
  int k;

  // Transform columns
  for (col = 0; col < 8; ++col) {
    const int16_t *const src = input + col;
    tran_low_t *const dst = intermediate + col;
    tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;  // canbe16
    tran_high_t t0, t1, t2, t3;                  // needs32
    tran_high_t x0, x1, x2, x3;                  // canbe16

    // stage 1: sums and differences of mirrored samples, scaled by 4.
    s0 = (src[0 * stride] + src[7 * stride]) * 4;
    s1 = (src[1 * stride] + src[6 * stride]) * 4;
    s2 = (src[2 * stride] + src[5 * stride]) * 4;
    s3 = (src[3 * stride] + src[4 * stride]) * 4;
    s4 = (src[3 * stride] - src[4 * stride]) * 4;
    s5 = (src[2 * stride] - src[5 * stride]) * 4;
    s6 = (src[1 * stride] - src[6 * stride]) * 4;
    s7 = (src[0 * stride] - src[7 * stride]) * 4;

    // Even half: 4-point transform of the sums.
    x0 = s0 + s3;
    x1 = s1 + s2;
    x2 = s1 - s2;
    x3 = s0 - s3;
    t0 = (x0 + x1) * cospi_16_64;
    t1 = (x0 - x1) * cospi_16_64;
    t2 = x2 * cospi_24_64 + x3 * cospi_8_64;
    t3 = -x2 * cospi_8_64 + x3 * cospi_24_64;
    dst[0 * 8] = (tran_low_t)fdct_round_shift(t0);
    dst[2 * 8] = (tran_low_t)fdct_round_shift(t2);
    dst[4 * 8] = (tran_low_t)fdct_round_shift(t1);
    dst[6 * 8] = (tran_low_t)fdct_round_shift(t3);

    // Stage 2
    t0 = (s6 - s5) * cospi_16_64;
    t1 = (s6 + s5) * cospi_16_64;
    t2 = fdct_round_shift(t0);
    t3 = fdct_round_shift(t1);

    // Stage 3
    x0 = s4 + t2;
    x1 = s4 - t2;
    x2 = s7 - t3;
    x3 = s7 + t3;

    // Stage 4: odd outputs.
    t0 = x0 * cospi_28_64 + x3 * cospi_4_64;
    t1 = x1 * cospi_12_64 + x2 * cospi_20_64;
    t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
    t3 = x3 * cospi_28_64 + x0 * -cospi_4_64;
    dst[1 * 8] = (tran_low_t)fdct_round_shift(t0);
    dst[3 * 8] = (tran_low_t)fdct_round_shift(t2);
    dst[5 * 8] = (tran_low_t)fdct_round_shift(t1);
    dst[7 * 8] = (tran_low_t)fdct_round_shift(t3);
  }

  // Rows
  for (row = 0; row < 8; ++row) {
    fdct8(&intermediate[row * 8], &final_output[row * 8]);
    for (k = 0; k < 8; ++k) {
      final_output[k + row * 8] /= 2;
    }
  }
}
void
vp9_fdct8x8_quant_c
(
const
int16_t
*
input
,
int
stride
,
tran_low_t
*
coeff_ptr
,
intptr_t
n_coeffs
,
int
skip_block
,
...
...
@@ -850,186 +706,6 @@ void vp9_fdct16x16_1_c(const int16_t *input, tran_low_t *output, int stride) {
output
[
1
]
=
0
;
}
void
vp9_fdct16x16_c
(
const
int16_t
*
input
,
tran_low_t
*
output
,
int
stride
)
{
// The 2D transform is done with two passes which are actually pretty
// similar. In the first one, we transform the columns and transpose
// the results. In the second one, we transform the rows. To achieve that,
// as the first pass results are transposed, we transpose the columns (that
// is the transposed rows) and transpose the results (so that it goes back
// in normal/row positions).
int
pass
;
// We need an intermediate buffer between passes.
tran_low_t
intermediate
[
256
];
const
int16_t
*
in_pass0
=
input
;
const
tran_low_t
*
in
=
NULL
;
tran_low_t
*
out
=
intermediate
;
// Do the two transform/transpose passes
for
(
pass
=
0
;
pass
<
2
;
++
pass
)
{
tran_high_t
step1
[
8
];
// canbe16
tran_high_t
step2
[
8
];
// canbe16
tran_high_t
step3
[
8
];
// canbe16
tran_high_t
input
[
8
];
// canbe16
tran_high_t
temp1
,
temp2
;
// needs32
int
i
;
for
(
i
=
0
;
i
<
16
;
i
++
)
{
if
(
0
==
pass
)
{
// Calculate input for the first 8 results.
input
[
0
]
=
(
in_pass0
[
0
*
stride
]
+
in_pass0
[
15
*
stride
])
*
4
;
input
[
1
]
=
(
in_pass0
[
1
*
stride
]
+
in_pass0
[
14
*
stride
])
*
4
;
input
[
2
]
=
(
in_pass0
[
2
*
stride
]
+
in_pass0
[
13
*
stride
])
*
4
;
input
[
3
]
=
(
in_pass0
[
3
*
stride
]
+
in_pass0
[
12
*
stride
])
*
4
;
input
[
4
]
=
(
in_pass0
[
4
*
stride
]
+
in_pass0
[
11
*
stride
])
*
4
;
input
[
5
]
=
(
in_pass0
[
5
*
stride
]
+
in_pass0
[
10
*
stride
])
*
4
;
input
[
6
]
=
(
in_pass0
[
6
*
stride
]
+
in_pass0
[
9
*
stride
])
*
4
;
input
[
7
]
=
(
in_pass0
[
7
*
stride
]
+
in_pass0
[
8
*
stride
])
*
4
;
// Calculate input for the next 8 results.
step1
[
0
]
=
(
in_pass0
[
7
*
stride
]
-
in_pass0
[
8
*
stride
])
*
4
;
step1
[
1
]
=
(
in_pass0
[
6
*
stride
]
-
in_pass0
[
9
*
stride
])
*
4
;
step1
[
2
]
=
(
in_pass0
[
5
*
stride
]
-
in_pass0
[
10
*
stride
])
*
4
;
step1
[
3
]
=
(
in_pass0
[
4
*
stride
]
-
in_pass0
[
11
*
stride
])
*
4
;
step1
[
4
]
=
(
in_pass0
[
3
*
stride
]
-
in_pass0
[
12
*
stride
])
*
4
;
step1
[
5
]
=
(
in_pass0
[
2
*
stride
]
-
in_pass0
[
13
*
stride
])
*
4
;
step1
[
6
]
=
(
in_pass0
[
1
*
stride
]
-
in_pass0
[
14
*
stride
])
*
4
;
step1
[
7
]
=
(
in_pass0
[
0
*
stride
]
-
in_pass0
[
15
*
stride
])
*
4
;
}
else
{
// Calculate input for the first 8 results.
input
[
0
]
=
((
in
[
0
*
16
]
+
1
)
>>
2
)
+
((
in
[
15
*
16
]
+
1
)
>>
2
);
input
[
1
]
=
((
in
[
1
*
16
]
+
1
)
>>
2
)
+
((
in
[
14
*
16
]
+
1
)
>>
2
);
input
[
2
]
=
((
in
[
2
*
16
]
+
1
)
>>
2
)
+
((
in
[
13
*
16
]
+
1
)
>>
2
);
input
[
3
]
=
((
in
[
3
*
16
]
+
1
)
>>
2
)
+
((
in
[
12
*
16
]
+
1
)
>>
2
);
input
[
4
]
=
((
in
[
4
*
16
]
+
1
)
>>
2
)
+
((
in
[
11
*
16
]
+
1
)
>>
2
);
input
[
5
]
=
((
in
[
5
*
16
]
+
1
)
>>
2
)
+
((
in
[
10
*
16
]
+
1
)
>>
2
);
input
[
6
]
=
((
in
[
6
*
16
]
+
1
)
>>
2
)
+
((
in
[
9
*
16
]
+
1
)
>>
2
);
input
[
7
]
=
((
in
[
7
*
16
]
+
1
)
>>
2
)
+
((
in
[
8
*
16
]
+
1
)
>>
2
);
// Calculate input for the next 8 results.
step1
[
0
]
=
((
in
[
7
*
16
]
+
1
)
>>
2
)
-
((
in
[
8
*
16
]
+
1
)
>>
2
);
step1
[
1
]
=
((
in
[
6
*
16
]
+
1
)
>>
2
)
-
((
in
[
9
*
16
]
+
1
)
>>
2
);
step1
[
2
]
=
((
in
[
5
*
16
]
+
1
)
>>
2
)
-
((
in
[
10
*
16
]
+
1
)
>>
2
);
step1
[
3
]
=
((
in
[
4
*
16
]
+
1
)
>>
2
)
-
((
in
[
11
*
16
]
+
1
)
>>
2
);
step1
[
4
]
=
((
in
[
3
*
16
]
+
1
)
>>
2
)
-
((
in
[
12
*
16
]
+
1
)
>>
2
);
step1
[
5
]
=
((
in
[
2
*
16
]
+
1
)
>>
2
)
-
((
in
[
13
*
16
]
+
1
)
>>
2
);
step1
[
6
]
=
((
in
[
1
*
16
]
+
1
)
>>
2
)
-
((
in
[
14
*
16
]
+
1
)
>>
2
);
step1
[
7
]
=
((
in
[
0
*
16
]
+
1
)
>>
2
)
-
((
in
[
15
*
16
]
+
1
)
>>
2
);
}
// Work on the first eight values; fdct8(input, even_results);
{
tran_high_t
s0
,
s1
,
s2
,
s3
,
s4
,
s5
,
s6
,
s7
;
// canbe16
tran_high_t
t0
,
t1
,
t2
,
t3
;
// needs32
tran_high_t
x0
,
x1
,
x2
,
x3
;
// canbe16
// stage 1
s0
=
input
[
0
]
+
input
[
7
];
s1
=
input
[
1
]
+
input
[
6
];
s2
=
input
[
2
]
+
input
[
5
];
s3
=
input
[
3
]
+
input
[
4
];
s4
=
input
[
3
]
-
input
[
4
];
s5
=
input
[
2
]
-
input
[
5
];
s6
=
input
[
1
]
-
input
[
6
];
s7
=
input
[
0
]
-
input
[
7
];
// fdct4(step, step);
x0
=
s0
+
s3
;
x1
=
s1
+
s2
;
x2
=
s1
-
s2
;
x3
=
s0
-
s3
;
t0
=
(
x0
+
x1
)
*
cospi_16_64
;
t1
=
(
x0
-
x1
)
*
cospi_16_64
;
t2
=
x3
*
cospi_8_64
+
x2
*
cospi_24_64
;
t3
=
x3
*
cospi_24_64
-
x2
*
cospi_8_64
;
out
[
0
]
=
(
tran_low_t
)
fdct_round_shift
(
t0
);
out
[
4
]
=
(
tran_low_t
)
fdct_round_shift
(
t2
);
out
[
8
]
=
(
tran_low_t
)
fdct_round_shift
(
t1
);
out
[
12
]
=
(
tran_low_t
)
fdct_round_shift
(
t3
);
// Stage 2
t0
=
(
s6
-
s5
)
*
cospi_16_64
;
t1
=
(
s6
+
s5
)
*
cospi_16_64
;