Skip to content
GitLab
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
Xiph.Org
aom-rav1e
Commits
0e3f494b
Commit
0e3f494b
authored
Jul 31, 2015
by
Parag Salasakar
Browse files
mips msa vp8 block subtract optimization
average improvement ~2x-3x Change-Id: I30abf4c92cddcc9e87b7a40d4106076e1ec701c2
parent
e3ee8c29
Changes
4
Hide whitespace changes
Inline
Side-by-side
vp8/common/mips/msa/vp8_macros_msa.h
View file @
0e3f494b
...
...
@@ -643,6 +643,23 @@
}
#define DOTP_SH4_SW(...) DOTP_SH4(v4i32, __VA_ARGS__)
/* Description : Dot product of signed word vector elements
   Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Signed word elements from 'mult0' are multiplied with
                 signed word elements from 'cnst0' producing a result
                 twice the size of input i.e. signed double word.
                 The multiplication results of adjacent odd-even elements
                 are added together and written to the 'out0' vector.
                 'mult1' and 'cnst1' are processed the same way and the
                 result is written to the 'out1' vector.
   Note        : Expands to a brace-enclosed block that assigns to 'out0'
                 and 'out1'; every input argument is parenthesized before
                 it is cast, so expression arguments are cast as a whole.
*/
#define DOTP_SW2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1)     \
  {                                                                 \
    out0 = (RTYPE)__msa_dotp_s_d((v4i32)(mult0), (v4i32)(cnst0));   \
    out1 = (RTYPE)__msa_dotp_s_d((v4i32)(mult1), (v4i32)(cnst1));   \
  }
#define DOTP_SW2_SD(...) DOTP_SW2(v2i64, __VA_ARGS__)
/* Description : Dot product & addition of byte vector elements
Arguments : Inputs - mult0, mult1, cnst0, cnst1
Outputs - out0, out1
...
...
@@ -693,6 +710,23 @@
}
#define DPADD_SH4_SW(...) DPADD_SH4(v4i32, __VA_ARGS__)
/* Description : Dot product of signed word vector elements with themselves,
                 accumulated into double word vector elements
   Arguments   : Inputs  - mult0, mult1
                 Outputs - out0, out1 (also read as the running accumulators)
                 Return Type - as per RTYPE
   Details     : Each signed word element from 'mult0' is multiplied with
                 itself producing an intermediate result twice the size of
                 it i.e. signed double word.
                 The multiplication results of adjacent odd-even elements
                 are added together and accumulated into the 'out0' vector.
                 'mult1' is processed the same way and accumulated into the
                 'out1' vector.
   Note        : 'out0' and 'out1' must hold valid values before the macro
                 is used because they are read as well as written. Every
                 input argument is parenthesized before it is cast, so
                 expression arguments are cast as a whole.
*/
#define DPADD_SD2(RTYPE, mult0, mult1, out0, out1)                              \
  {                                                                             \
    out0 = (RTYPE)__msa_dpadd_s_d((v2i64)(out0), (v4i32)(mult0),                \
                                  (v4i32)(mult0));                              \
    out1 = (RTYPE)__msa_dpadd_s_d((v2i64)(out1), (v4i32)(mult1),                \
                                  (v4i32)(mult1));                              \
  }
#define DPADD_SD2_SD(...) DPADD_SD2(v2i64, __VA_ARGS__)
/* Description : Clips all signed halfword elements of input vector
between 0 & 255
Arguments : Input - in
...
...
@@ -805,6 +839,21 @@
}
#define HSUB_UB2_SH(...) HSUB_UB2(v8i16, __VA_ARGS__)
/* Description : Horizontal subtraction of signed halfword vector elements
   Arguments   : Inputs  - in0, in1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Each even signed halfword element of 'in0' is subtracted
                 from the adjacent odd signed halfword element of 'in0'
                 (pairwise), i.e. out0[i] = in0[2i + 1] - in0[2i], and the
                 word result is written to 'out0'.
                 'in1' is processed the same way and the result is written
                 to 'out1'.
   Note        : Despite the 'UH' in the macro name, the signed instruction
                 (hsub_s_w) is used, so the inputs are treated as signed
                 halfwords. Every input argument is parenthesized before it
                 is cast, so expression arguments are cast as a whole.
*/
#define HSUB_UH2(RTYPE, in0, in1, out0, out1)                     \
  {                                                               \
    out0 = (RTYPE)__msa_hsub_s_w((v8i16)(in0), (v8i16)(in0));     \
    out1 = (RTYPE)__msa_hsub_s_w((v8i16)(in1), (v8i16)(in1));     \
  }
#define HSUB_UH2_SW(...) HSUB_UH2(v4i32, __VA_ARGS__)
/* Description : Set element n input vector to GPR value
Arguments : Inputs - in0, in1, in2, in3
Output - out
...
...
vp8/common/rtcd_defs.pl
View file @
0e3f494b
...
...
@@ -295,15 +295,15 @@ specialize qw/vp8_fast_quantize_b sse2 ssse3 neon msa/;
# Block subtraction
#
add_proto
qw/int vp8_block_error/
,
"
short *coeff, short *dqcoeff
";
specialize
qw/vp8_block_error mmx sse2/
;
specialize
qw/vp8_block_error mmx sse2
msa
/
;
$vp8_block_error_sse2
=
vp8_block_error_xmm
;
add_proto
qw/int vp8_mbblock_error/
,
"
struct macroblock *mb, int dc
";
specialize
qw/vp8_mbblock_error mmx sse2/
;
specialize
qw/vp8_mbblock_error mmx sse2
msa
/
;
$vp8_mbblock_error_sse2
=
vp8_mbblock_error_xmm
;
add_proto
qw/int vp8_mbuverror/
,
"
struct macroblock *mb
";
specialize
qw/vp8_mbuverror mmx sse2/
;
specialize
qw/vp8_mbuverror mmx sse2
msa
/
;
$vp8_mbuverror_sse2
=
vp8_mbuverror_xmm
;
#
...
...
vp8/encoder/mips/msa/encodeopt_msa.c
0 → 100644
View file @
0e3f494b
/*
* Copyright (c) 2015 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include
"./vp8_rtcd.h"
#include
"vp8/common/mips/msa/vp8_macros_msa.h"
#include
"vp8/encoder/block.h"
/* Sum of squared differences over one block of 16 coefficients.
 *
 * coeff_ptr    - 16 int16_t coefficients, read as two 8-halfword vectors
 * dq_coeff_ptr - 16 int16_t dequantized coefficients, read the same way
 *
 * Returns the accumulated squared difference. The sum is gathered in
 * 64-bit vector lanes and narrowed to int32_t when it is copied out.
 */
int32_t vp8_block_error_msa(int16_t *coeff_ptr, int16_t *dq_coeff_ptr) {
  int32_t total;
  uint32_t half;
  v8i16 src, recon, pairs_r, pairs_l;
  v4i32 delta_r, delta_l;
  /* Two independent accumulators, one per interleaved half. */
  v2i64 sq_sum_r = { 0 };
  v2i64 sq_sum_l = { 0 };

  /* Two passes of 8 coefficients each cover the 16 values read here. */
  for (half = 0; half < 2; ++half) {
    src = LD_SH(coeff_ptr);
    recon = LD_SH(dq_coeff_ptr);
    coeff_ptr += 8;
    dq_coeff_ptr += 8;

    /* Interleave the two inputs (presumably pairing each coefficient with
     * its dequantized counterpart), take the pairwise difference as words,
     * then square and accumulate into the double word sums. */
    ILVRL_H2_SH(src, recon, pairs_r, pairs_l);
    HSUB_UH2_SW(pairs_r, pairs_l, delta_r, delta_l);
    DPADD_SD2_SD(delta_r, delta_l, sq_sum_r, sq_sum_l);
  }

  /* Fold lane 1 into lane 0 of each accumulator, then add both lane 0s. */
  sq_sum_r += __msa_splati_d(sq_sum_r, 1);
  sq_sum_l += __msa_splati_d(sq_sum_l, 1);
  total = __msa_copy_s_d(sq_sum_r, 0);
  total += __msa_copy_s_d(sq_sum_l, 0);

  return total;
}
/* Sum of squared differences between 'coeff' and 'dqcoeff' over blocks
 * 0..15 of a macroblock.
 *
 * mb - macroblock; mb->block[i].coeff and mb->e_mbd.block[i].dqcoeff are
 *      each read as 16 int16_t values for i in 0..15
 * dc - when equal to 1, word 0 of the first difference vector of every
 *      block is cleared before squaring (presumably this drops the DC
 *      coefficient from the sum); any other value leaves all 16
 *      differences in place
 *
 * Returns the accumulated squared difference, narrowed to int32_t.
 */
int32_t vp8_mbblock_error_msa(MACROBLOCK *mb, int32_t dc) {
  int32_t total = 0;
  uint32_t blk;
  int16_t *src_ptr, *dq_ptr;
  v8i16 src_a0, src_a1, src_b0, src_b1;
  v8i16 dq_a0, dq_a1, dq_b0, dq_b1;
  v8i16 pairs_r, pairs_l;
  v4i32 delta_r, delta_l;
  v2i64 sq_r, sq_l;
  v16u8 all_zero = { 0 };
  v16u8 keep_mask = (v16u8)__msa_ldi_b(255);

  /* With dc == 1, knock out word 0 of the otherwise all-ones mask. */
  if (1 == dc) {
    keep_mask = (v16u8)__msa_insve_w((v4i32)keep_mask, 0, (v4i32)all_zero);
  }

  /* Two blocks are handled per iteration: all loads first, then math. */
  for (blk = 0; blk < 16; blk += 2) {
    /* Block 'blk': both halves of coeff and dqcoeff. */
    src_ptr = mb->block[blk].coeff;
    dq_ptr = mb->e_mbd.block[blk].dqcoeff;
    src_a0 = LD_SH(src_ptr);
    dq_a0 = LD_SH(dq_ptr);
    src_ptr += 8;
    dq_ptr += 8;
    src_a1 = LD_SH(src_ptr);
    dq_a1 = LD_SH(dq_ptr);

    /* Block 'blk + 1': both halves of coeff and dqcoeff. */
    src_ptr = mb->block[blk + 1].coeff;
    dq_ptr = mb->e_mbd.block[blk + 1].dqcoeff;
    src_b0 = LD_SH(src_ptr);
    dq_b0 = LD_SH(dq_ptr);
    src_ptr += 8;
    dq_ptr += 8;
    src_b1 = LD_SH(src_ptr);
    dq_b1 = LD_SH(dq_ptr);

    /* Block 'blk', first half: the mask is applied to the first
     * difference vector, then the squares start fresh sums. */
    ILVRL_H2_SH(src_a0, dq_a0, pairs_r, pairs_l);
    HSUB_UH2_SW(pairs_r, pairs_l, delta_r, delta_l);
    delta_r = (v4i32)__msa_bmnz_v(all_zero, (v16u8)delta_r, keep_mask);
    DOTP_SW2_SD(delta_r, delta_l, delta_r, delta_l, sq_r, sq_l);

    /* Block 'blk', second half: squares are added to the same sums. */
    ILVRL_H2_SH(src_a1, dq_a1, pairs_r, pairs_l);
    HSUB_UH2_SW(pairs_r, pairs_l, delta_r, delta_l);
    DPADD_SD2_SD(delta_r, delta_l, sq_r, sq_l);

    /* Fold lane 1 into lane 0 and add both sums to the running total. */
    sq_r += __msa_splati_d(sq_r, 1);
    sq_l += __msa_splati_d(sq_l, 1);
    total += __msa_copy_s_d(sq_r, 0);
    total += __msa_copy_s_d(sq_l, 0);

    /* Block 'blk + 1', first half (masked as above). */
    ILVRL_H2_SH(src_b0, dq_b0, pairs_r, pairs_l);
    HSUB_UH2_SW(pairs_r, pairs_l, delta_r, delta_l);
    delta_r = (v4i32)__msa_bmnz_v(all_zero, (v16u8)delta_r, keep_mask);
    DOTP_SW2_SD(delta_r, delta_l, delta_r, delta_l, sq_r, sq_l);

    /* Block 'blk + 1', second half. */
    ILVRL_H2_SH(src_b1, dq_b1, pairs_r, pairs_l);
    HSUB_UH2_SW(pairs_r, pairs_l, delta_r, delta_l);
    DPADD_SD2_SD(delta_r, delta_l, sq_r, sq_l);

    sq_r += __msa_splati_d(sq_r, 1);
    sq_l += __msa_splati_d(sq_l, 1);
    total += __msa_copy_s_d(sq_r, 0);
    total += __msa_copy_s_d(sq_l, 0);
  }

  return total;
}
/* Sum of squared differences between 'coeff' and 'dqcoeff' over blocks
 * 16..23 of a macroblock.
 *
 * mb - macroblock; mb->block[i].coeff and mb->e_mbd.block[i].dqcoeff are
 *      each read as 16 int16_t values for i in 16..23
 *
 * Returns the accumulated squared difference, narrowed to int32_t.
 */
int32_t vp8_mbuverror_msa(MACROBLOCK *mb) {
  int32_t total = 0;
  uint32_t blk;
  int16_t *src_ptr, *dq_ptr;
  v8i16 src_a0, src_a1, src_b0, src_b1;
  v8i16 dq_a0, dq_a1, dq_b0, dq_b1;
  v8i16 pairs_r, pairs_l;
  v4i32 delta_r, delta_l;
  v2i64 sq_r, sq_l;

  /* Two blocks are handled per iteration: all loads first, then math. */
  for (blk = 16; blk < 24; blk += 2) {
    /* Block 'blk': both halves of coeff and dqcoeff. */
    src_ptr = mb->block[blk].coeff;
    dq_ptr = mb->e_mbd.block[blk].dqcoeff;
    src_a0 = LD_SH(src_ptr);
    dq_a0 = LD_SH(dq_ptr);
    src_ptr += 8;
    dq_ptr += 8;
    src_a1 = LD_SH(src_ptr);
    dq_a1 = LD_SH(dq_ptr);

    /* Block 'blk + 1': both halves of coeff and dqcoeff. */
    src_ptr = mb->block[blk + 1].coeff;
    dq_ptr = mb->e_mbd.block[blk + 1].dqcoeff;
    src_b0 = LD_SH(src_ptr);
    dq_b0 = LD_SH(dq_ptr);
    src_ptr += 8;
    dq_ptr += 8;
    src_b1 = LD_SH(src_ptr);
    dq_b1 = LD_SH(dq_ptr);

    /* Block 'blk', first half: squares start fresh sums. */
    ILVRL_H2_SH(src_a0, dq_a0, pairs_r, pairs_l);
    HSUB_UH2_SW(pairs_r, pairs_l, delta_r, delta_l);
    DOTP_SW2_SD(delta_r, delta_l, delta_r, delta_l, sq_r, sq_l);

    /* Block 'blk', second half: squares are added to the same sums. */
    ILVRL_H2_SH(src_a1, dq_a1, pairs_r, pairs_l);
    HSUB_UH2_SW(pairs_r, pairs_l, delta_r, delta_l);
    DPADD_SD2_SD(delta_r, delta_l, sq_r, sq_l);

    /* Fold lane 1 into lane 0 and add both sums to the running total. */
    sq_r += __msa_splati_d(sq_r, 1);
    sq_l += __msa_splati_d(sq_l, 1);
    total += __msa_copy_s_d(sq_r, 0);
    total += __msa_copy_s_d(sq_l, 0);

    /* Block 'blk + 1', first half. */
    ILVRL_H2_SH(src_b0, dq_b0, pairs_r, pairs_l);
    HSUB_UH2_SW(pairs_r, pairs_l, delta_r, delta_l);
    DOTP_SW2_SD(delta_r, delta_l, delta_r, delta_l, sq_r, sq_l);

    /* Block 'blk + 1', second half. */
    ILVRL_H2_SH(src_b1, dq_b1, pairs_r, pairs_l);
    HSUB_UH2_SW(pairs_r, pairs_l, delta_r, delta_l);
    DPADD_SD2_SD(delta_r, delta_l, sq_r, sq_l);

    sq_r += __msa_splati_d(sq_r, 1);
    sq_l += __msa_splati_d(sq_l, 1);
    total += __msa_copy_s_d(sq_r, 0);
    total += __msa_copy_s_d(sq_l, 0);
  }

  return total;
}
vp8/vp8cx.mk
View file @
0e3f494b
...
...
@@ -104,6 +104,7 @@ VP8_CX_SRCS_REMOVE-$(HAVE_SSE2) += encoder/x86/temporal_filter_apply_sse2.asm
endif
VP8_CX_SRCS-$(HAVE_MSA)
+=
encoder/mips/msa/dct_msa.c
VP8_CX_SRCS-$(HAVE_MSA)
+=
encoder/mips/msa/encodeopt_msa.c
VP8_CX_SRCS-$(HAVE_MSA)
+=
encoder/mips/msa/quantize_msa.c
VP8_CX_SRCS-yes
:=
$(
filter-out
$
(
VP8_CX_SRCS_REMOVE-yes
)
,
$
(
VP8_CX_SRCS-yes
))
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment