Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
Xiph.Org
aom-rav1e
Commits
7b71cdb0
Commit
7b71cdb0
authored
Jun 23, 2015
by
Parag Salasakar
Committed by
Gerrit Code Review
Jun 23, 2015
Browse files
Merge "mips msa vp9 fdct 4x4 optimization"
parents
fb2a89b1
bc949991
Changes
6
Hide whitespace changes
Inline
Side-by-side
test/fdct4x4_test.cc
View file @
7b71cdb0
...
...
@@ -541,13 +541,13 @@ INSTANTIATE_TEST_CASE_P(
INSTANTIATE_TEST_CASE_P
(
MSA
,
Trans4x4DCT
,
::
testing
::
Values
(
make_tuple
(
&
vp9_fdct4x4_
c
,
&
vp9_idct4x4_16_add_msa
,
1
,
VPX_BITS_8
)));
make_tuple
(
&
vp9_fdct4x4_
msa
,
&
vp9_idct4x4_16_add_msa
,
0
,
VPX_BITS_8
)));
INSTANTIATE_TEST_CASE_P
(
MSA
,
Trans4x4HT
,
::
testing
::
Values
(
make_tuple
(
&
vp9_fht4x4_
c
,
&
vp9_iht4x4_16_add_msa
,
0
,
VPX_BITS_8
),
make_tuple
(
&
vp9_fht4x4_
c
,
&
vp9_iht4x4_16_add_msa
,
1
,
VPX_BITS_8
),
make_tuple
(
&
vp9_fht4x4_
c
,
&
vp9_iht4x4_16_add_msa
,
2
,
VPX_BITS_8
),
make_tuple
(
&
vp9_fht4x4_
c
,
&
vp9_iht4x4_16_add_msa
,
3
,
VPX_BITS_8
)));
make_tuple
(
&
vp9_fht4x4_
msa
,
&
vp9_iht4x4_16_add_msa
,
0
,
VPX_BITS_8
),
make_tuple
(
&
vp9_fht4x4_
msa
,
&
vp9_iht4x4_16_add_msa
,
1
,
VPX_BITS_8
),
make_tuple
(
&
vp9_fht4x4_
msa
,
&
vp9_iht4x4_16_add_msa
,
2
,
VPX_BITS_8
),
make_tuple
(
&
vp9_fht4x4_
msa
,
&
vp9_iht4x4_16_add_msa
,
3
,
VPX_BITS_8
)));
#endif // HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
}
// namespace
vp9/common/mips/msa/vp9_macros_msa.h
View file @
7b71cdb0
...
...
@@ -467,6 +467,24 @@
SH(out3_m, pblk_2x4_m + 3 * stride); \
}
/* Description : Store 4x2 byte block to destination memory from input vector
   Arguments   : Inputs - in, pdst, stride
   Details     : Index 0 word element from 'in' vector is copied to a GP
                 register and stored to (pdst)
                 Index 1 word element from 'in' vector is copied to a GP
                 register and stored to (pdst + stride)
                 'pdst' is converted to 'uint8_t *' before the offset is
                 applied, so 'stride' is a byte offset.
                 'in' and 'stride' are parenthesized in the expansion so that
                 compound expressions can be passed safely.
*/
#define ST4x2_UB(in, pdst, stride) {          \
  uint32_t out0_m, out1_m;                    \
  uint8_t *pblk_4x2_m = (uint8_t *)(pdst);    \
                                              \
  out0_m = __msa_copy_u_w((v4i32)(in), 0);    \
  out1_m = __msa_copy_u_w((v4i32)(in), 1);    \
                                              \
  SW(out0_m, pblk_4x2_m);                     \
  SW(out1_m, pblk_4x2_m + (stride));          \
}
/* Description : Store as 4x4 byte block to destination memory from input vector
Arguments : Inputs - in0, in1, pdst, stride
Return Type - unsigned byte
...
...
@@ -1472,6 +1490,22 @@
}
#define SRLI_H4_SH(...) SRLI_H4(v8i16, __VA_ARGS__)
/* Description : Multiplication of pairs of vectors
Arguments : Inputs - in0, in1, in2, in3
Outputs - out0, out1
Details : Each element from 'in0' is multiplied with elements from 'in1'
and the result is written to 'out0'
*/
#define MUL2(in0, in1, in2, in3, out0, out1) { \
out0 = in0 * in1; \
out1 = in2 * in3; \
}
#define MUL4(in0, in1, in2, in3, in4, in5, in6, in7, \
out0, out1, out2, out3) { \
MUL2(in0, in1, in2, in3, out0, out1); \
MUL2(in4, in5, in6, in7, out2, out3); \
}
/* Description : Addition of 2 pairs of vectors
Arguments : Inputs - in0, in1, in2, in3
Outputs - out0, out1
...
...
vp9/common/vp9_rtcd_defs.pl
View file @
7b71cdb0
...
...
@@ -1023,7 +1023,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
specialize
qw/vp9_fdct32x32_rd sse2/
;
}
else
{
add_proto
qw/void vp9_fht4x4/
,
"
const int16_t *input, tran_low_t *output, int stride, int tx_type
";
specialize
qw/vp9_fht4x4 sse2/
;
specialize
qw/vp9_fht4x4 sse2
msa
/
;
add_proto
qw/void vp9_fht8x8/
,
"
const int16_t *input, tran_low_t *output, int stride, int tx_type
";
specialize
qw/vp9_fht8x8 sse2 msa/
;
...
...
@@ -1032,13 +1032,13 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
specialize
qw/vp9_fht16x16 sse2 msa/
;
add_proto
qw/void vp9_fwht4x4/
,
"
const int16_t *input, tran_low_t *output, int stride
";
specialize
qw/vp9_fwht4x4/
,
"
$mmx_x86inc
";
specialize
qw/vp9_fwht4x4
msa
/
,
"
$mmx_x86inc
";
add_proto
qw/void vp9_fdct4x4_1/
,
"
const int16_t *input, tran_low_t *output, int stride
";
specialize
qw/vp9_fdct4x4_1 sse2/
;
add_proto
qw/void vp9_fdct4x4/
,
"
const int16_t *input, tran_low_t *output, int stride
";
specialize
qw/vp9_fdct4x4 sse2/
;
specialize
qw/vp9_fdct4x4 sse2
msa
/
;
add_proto
qw/void vp9_fdct8x8_1/
,
"
const int16_t *input, tran_low_t *output, int stride
";
specialize
qw/vp9_fdct8x8_1 sse2 neon msa/
;
...
...
vp9/encoder/mips/msa/vp9_fdct4x4_msa.c
0 → 100644
View file @
7b71cdb0
/*
* Copyright (c) 2015 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include <assert.h>
#include "./vp9_rtcd.h"
#include "vp9/encoder/mips/msa/vp9_fdct_msa.h"
/* Forward 4x4 Walsh-Hadamard transform, MSA version.
 *
 * input      - source int16_t samples, read through LD_SH4 as four vectors
 *              (presumably one per row, 'src_stride' elements apart -
 *              confirm against the LD_SH4 definition).
 * output     - destination; 16 int16_t values are written (four ST4x2_UB
 *              calls, each storing two 32-bit words).
 * src_stride - stride handed to LD_SH4.
 *
 * The same add/subtract lifting pass is run twice with a transpose in
 * between, followed by SLLI_4V(..., 2) and a final transpose before the store.
 */
void vp9_fwht4x4_msa(const int16_t *input, int16_t *output,
                     int32_t src_stride) {
  v8i16 in0, in1, in2, in3, in4;

  LD_SH4(input, src_stride, in0, in1, in2, in3);

  /* First lifting pass; in4 holds the halved difference used by SUB2. */
  in3 = in3 - in2;
  in0 = in0 + in1;
  in4 = (in0 - in3) >> 1;
  SUB2(in4, in1, in4, in2, in1, in2);
  in3 = in3 + in1;
  in0 = in0 - in2;

  /* Note the permuted register order going into the transpose. */
  TRANSPOSE4x4_SH_SH(in0, in2, in3, in1, in0, in2, in3, in1);

  /* Second lifting pass on the transposed data. */
  in1 = in1 - in3;
  in0 = in0 + in2;
  in4 = (in0 - in1) >> 1;
  SUB2(in4, in2, in4, in3, in2, in3);
  in1 = in1 + in2;
  in0 = in0 - in3;

  SLLI_4V(in0, in1, in2, in3, 2);

  TRANSPOSE4x4_SH_SH(in0, in3, in1, in2, in0, in3, in1, in2);

  /* ST4x2_UB takes a byte stride: words 0 and 1 of each vector land at
   * pdst and pdst + 4 bytes, i.e. four consecutive int16_t values. */
  ST4x2_UB(in0, output, 4);
  ST4x2_UB(in3, output + 4, 4);
  ST4x2_UB(in1, output + 8, 4);
  ST4x2_UB(in2, output + 12, 4);
}
void
vp9_fdct4x4_msa
(
const
int16_t
*
input
,
int16_t
*
output
,
int32_t
src_stride
)
{
v8i16
in0
,
in1
,
in2
,
in3
;
LD_SH4
(
input
,
src_stride
,
in0
,
in1
,
in2
,
in3
);
/* fdct4 pre-process */
{
v8i16
vec
,
mask
;
v16i8
zero
=
{
0
};
v16i8
one
=
__msa_ldi_b
(
1
);
mask
=
(
v8i16
)
__msa_sldi_b
(
zero
,
one
,
15
);
SLLI_4V
(
in0
,
in1
,
in2
,
in3
,
4
);
vec
=
__msa_ceqi_h
(
in0
,
0
);
vec
=
vec
^
255
;
vec
=
mask
&
vec
;
in0
+=
vec
;
}
VP9_FDCT4
(
in0
,
in1
,
in2
,
in3
,
in0
,
in1
,
in2
,
in3
);
TRANSPOSE4x4_SH_SH
(
in0
,
in1
,
in2
,
in3
,
in0
,
in1
,
in2
,
in3
);
VP9_FDCT4
(
in0
,
in1
,
in2
,
in3
,
in0
,
in1
,
in2
,
in3
);
TRANSPOSE4x4_SH_SH
(
in0
,
in1
,
in2
,
in3
,
in0
,
in1
,
in2
,
in3
);
ADD4
(
in0
,
1
,
in1
,
1
,
in2
,
1
,
in3
,
1
,
in0
,
in1
,
in2
,
in3
);
SRA_4V
(
in0
,
in1
,
in2
,
in3
,
2
);
PCKEV_D2_SH
(
in1
,
in0
,
in3
,
in2
,
in0
,
in2
);
ST_SH2
(
in0
,
in2
,
output
,
8
);
}
void
vp9_fht4x4_msa
(
const
int16_t
*
input
,
int16_t
*
output
,
int32_t
stride
,
int32_t
tx_type
)
{
v8i16
in0
,
in1
,
in2
,
in3
;
LD_SH4
(
input
,
stride
,
in0
,
in1
,
in2
,
in3
);
/* fdct4 pre-process */
{
v8i16
temp
,
mask
;
v16i8
zero
=
{
0
};
v16i8
one
=
__msa_ldi_b
(
1
);
mask
=
(
v8i16
)
__msa_sldi_b
(
zero
,
one
,
15
);
SLLI_4V
(
in0
,
in1
,
in2
,
in3
,
4
);
temp
=
__msa_ceqi_h
(
in0
,
0
);
temp
=
(
v8i16
)
__msa_xori_b
((
v16u8
)
temp
,
255
);
temp
=
mask
&
temp
;
in0
+=
temp
;
}
switch
(
tx_type
)
{
case
DCT_DCT
:
VP9_FDCT4
(
in0
,
in1
,
in2
,
in3
,
in0
,
in1
,
in2
,
in3
);
TRANSPOSE4x4_SH_SH
(
in0
,
in1
,
in2
,
in3
,
in0
,
in1
,
in2
,
in3
);
VP9_FDCT4
(
in0
,
in1
,
in2
,
in3
,
in0
,
in1
,
in2
,
in3
);
break
;
case
ADST_DCT
:
VP9_FADST4
(
in0
,
in1
,
in2
,
in3
,
in0
,
in1
,
in2
,
in3
);
TRANSPOSE4x4_SH_SH
(
in0
,
in1
,
in2
,
in3
,
in0
,
in1
,
in2
,
in3
);
VP9_FDCT4
(
in0
,
in1
,
in2
,
in3
,
in0
,
in1
,
in2
,
in3
);
break
;
case
DCT_ADST
:
VP9_FDCT4
(
in0
,
in1
,
in2
,
in3
,
in0
,
in1
,
in2
,
in3
);
TRANSPOSE4x4_SH_SH
(
in0
,
in1
,
in2
,
in3
,
in0
,
in1
,
in2
,
in3
);
VP9_FADST4
(
in0
,
in1
,
in2
,
in3
,
in0
,
in1
,
in2
,
in3
);
break
;
case
ADST_ADST
:
VP9_FADST4
(
in0
,
in1
,
in2
,
in3
,
in0
,
in1
,
in2
,
in3
);
TRANSPOSE4x4_SH_SH
(
in0
,
in1
,
in2
,
in3
,
in0
,
in1
,
in2
,
in3
);
VP9_FADST4
(
in0
,
in1
,
in2
,
in3
,
in0
,
in1
,
in2
,
in3
);
break
;
default:
assert
(
0
);
break
;
}
TRANSPOSE4x4_SH_SH
(
in0
,
in1
,
in2
,
in3
,
in0
,
in1
,
in2
,
in3
);
ADD4
(
in0
,
1
,
in1
,
1
,
in2
,
1
,
in3
,
1
,
in0
,
in1
,
in2
,
in3
);
SRA_4V
(
in0
,
in1
,
in2
,
in3
,
2
);
PCKEV_D2_SH
(
in1
,
in0
,
in3
,
in2
,
in0
,
in2
);
ST_SH2
(
in0
,
in2
,
output
,
8
);
}
vp9/encoder/mips/msa/vp9_fdct_msa.h
View file @
7b71cdb0
...
...
@@ -190,6 +190,67 @@
vec1 >>= 2; \
}
#define VP9_FDCT4(in0, in1, in2, in3, out0, out1, out2, out3) { \
v8i16 cnst0_m, cnst1_m, cnst2_m, cnst3_m; \
v8i16 vec0_m, vec1_m, vec2_m, vec3_m; \
v4i32 vec4_m, vec5_m, vec6_m, vec7_m; \
v8i16 coeff_m = { cospi_16_64, -cospi_16_64, cospi_8_64, \
cospi_24_64, -cospi_8_64, 0, 0, 0 }; \
\
BUTTERFLY_4(in0, in1, in2, in3, vec0_m, vec1_m, vec2_m, vec3_m); \
ILVR_H2_SH(vec1_m, vec0_m, vec3_m, vec2_m, vec0_m, vec2_m); \
SPLATI_H2_SH(coeff_m, 0, 1, cnst0_m, cnst1_m); \
cnst1_m = __msa_ilvev_h(cnst1_m, cnst0_m); \
vec5_m = __msa_dotp_s_w(vec0_m, cnst1_m); \
\
SPLATI_H2_SH(coeff_m, 4, 3, cnst2_m, cnst3_m); \
cnst2_m = __msa_ilvev_h(cnst3_m, cnst2_m); \
vec7_m = __msa_dotp_s_w(vec2_m, cnst2_m); \
\
vec4_m = __msa_dotp_s_w(vec0_m, cnst0_m); \
cnst2_m = __msa_splati_h(coeff_m, 2); \
cnst2_m = __msa_ilvev_h(cnst2_m, cnst3_m); \
vec6_m = __msa_dotp_s_w(vec2_m, cnst2_m); \
\
SRARI_W4_SW(vec4_m, vec5_m, vec6_m, vec7_m, DCT_CONST_BITS); \
PCKEV_H4_SH(vec4_m, vec4_m, vec5_m, vec5_m, vec6_m, vec6_m, \
vec7_m, vec7_m, out0, out2, out1, out3); \
}
#define VP9_FADST4(in0, in1, in2, in3, out0, out1, out2, out3) { \
v4i32 s0_m, s1_m, s2_m, s3_m, constant_m; \
v4i32 in0_r_m, in1_r_m, in2_r_m, in3_r_m; \
\
UNPCK_R_SH_SW(in0, in0_r_m); \
UNPCK_R_SH_SW(in1, in1_r_m); \
UNPCK_R_SH_SW(in2, in2_r_m); \
UNPCK_R_SH_SW(in3, in3_r_m); \
\
constant_m = __msa_fill_w(sinpi_4_9); \
MUL2(in0_r_m, constant_m, in3_r_m, constant_m, s1_m, s0_m); \
\
constant_m = __msa_fill_w(sinpi_1_9); \
s0_m += in0_r_m * constant_m; \
s1_m -= in1_r_m * constant_m; \
\
constant_m = __msa_fill_w(sinpi_2_9); \
s0_m += in1_r_m * constant_m; \
s1_m += in3_r_m * constant_m; \
\
s2_m = in0_r_m + in1_r_m - in3_r_m; \
\
constant_m = __msa_fill_w(sinpi_3_9); \
MUL2(in2_r_m, constant_m, s2_m, constant_m, s3_m, in1_r_m); \
\
in0_r_m = s0_m + s3_m; \
s2_m = s1_m - s3_m; \
s3_m = s1_m - s0_m + s3_m; \
\
SRARI_W4_SW(in0_r_m, in1_r_m, s2_m, s3_m, DCT_CONST_BITS); \
PCKEV_H4_SH(in0_r_m, in0_r_m, in1_r_m, in1_r_m, s2_m, s2_m, \
s3_m, s3_m, out0, out1, out2, out3); \
}
#define VP9_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7, \
out0, out1, out2, out3, out4, out5, out6, out7) { \
v8i16 s0_m, s1_m, s2_m, s3_m, s4_m, s5_m, s6_m; \
...
...
vp9/vp9cx.mk
View file @
7b71cdb0
...
...
@@ -152,6 +152,7 @@ VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_quantize_neon.c
VP9_CX_SRCS-$(HAVE_NEON)
+=
encoder/arm/neon/vp9_subtract_neon.c
VP9_CX_SRCS-$(HAVE_NEON)
+=
encoder/arm/neon/vp9_variance_neon.c
VP9_CX_SRCS-$(HAVE_MSA)
+=
encoder/mips/msa/vp9_fdct4x4_msa.c
VP9_CX_SRCS-$(HAVE_MSA)
+=
encoder/mips/msa/vp9_fdct8x8_msa.c
VP9_CX_SRCS-$(HAVE_MSA)
+=
encoder/mips/msa/vp9_fdct16x16_msa.c
VP9_CX_SRCS-$(HAVE_MSA)
+=
encoder/mips/msa/vp9_fdct32x32_msa.c
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment