Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
Raphael Zumer
aom-rav1e
Commits
1543f2b6
Commit
1543f2b6
authored
Jun 23, 2015
by
Parag Salasakar
Browse files
mips msa vp9 block error optimization
average improvement ~3x-4x Change-Id: If0fdcc34b17437a7e3e7fb4caaf1067bc175f291
parent
7555e2b8
Changes
4
Hide whitespace changes
Inline
Side-by-side
vp9/common/mips/msa/vp9_macros_msa.h
View file @
1543f2b6
...
...
@@ -440,6 +440,17 @@
}
#define ST_SH8(...) ST_H8(v8i16, __VA_ARGS__)
/* Description : Store vectors of word elements with stride
Arguments : Inputs - in0, in1, stride
- pdst (destination pointer to store to)
Details : Store 4 word elements from 'in0' to (pdst)
Store 4 word elements from 'in1' to (pdst + stride)
*/
#define ST_SW2(in0, in1, pdst, stride) { \
ST_SW(in0, (pdst)); \
ST_SW(in1, (pdst) + stride); \
}
/* Description : Store as 2x4 byte block to destination memory from input vector
Arguments : Inputs - in, stidx, pdst, stride
Return Type - unsigned byte
...
...
@@ -781,6 +792,39 @@
}
#define DPADD_SB4_SH(...) DPADD_SB4(v8i16, __VA_ARGS__)
/* Description : Dot product & addition of halfword vector elements
Arguments : Inputs - mult0, mult1
cnst0, cnst1
Outputs - out0, out1
Return Type - as per RTYPE
Details : Signed halfword elements from 'mult0' are multiplied with
signed halfword elements from 'cnst0' producing a result
twice the size of input i.e. signed word.
The multiplication result of adjacent odd-even elements
are added to the 'out0' vector
*/
#define DPADD_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) { \
out0 = (RTYPE)__msa_dpadd_s_w((v4i32)out0, (v8i16)mult0, (v8i16)cnst0); \
out1 = (RTYPE)__msa_dpadd_s_w((v4i32)out1, (v8i16)mult1, (v8i16)cnst1); \
}
#define DPADD_SH2_SW(...) DPADD_SH2(v4i32, __VA_ARGS__)
/* Description : Dot product & addition of double word vector elements
Arguments : Inputs - mult0, mult1
Outputs - out0, out1
Return Type - as per RTYPE
Details : Each signed word element from 'mult0' is multiplied with itself
producing an intermediate result twice the size of input
i.e. signed double word
The multiplication result of adjacent odd-even elements
are added to the 'out0' vector
*/
#define DPADD_SD2(RTYPE, mult0, mult1, out0, out1) { \
out0 = (RTYPE)__msa_dpadd_s_d((v2i64)out0, (v4i32)mult0, (v4i32)mult0); \
out1 = (RTYPE)__msa_dpadd_s_d((v2i64)out1, (v4i32)mult1, (v4i32)mult1); \
}
#define DPADD_SD2_SD(...) DPADD_SD2(v2i64, __VA_ARGS__)
/* Description : Minimum values between unsigned elements of
either vector are copied to the output vector
Arguments : Inputs - in0, in1, min_vec
...
...
@@ -862,6 +906,34 @@
}
#define HADD_UB4_UH(...) HADD_UB4(v8u16, __VA_ARGS__)
/* Description : Horizontal subtraction of unsigned byte vector elements
Arguments : Inputs - in0, in1
Outputs - out0, out1
Return Type - as per RTYPE
Details : Each unsigned odd byte element from 'in0' is subtracted from
even unsigned byte element from 'in0' (pairwise) and the
halfword result is written to 'out0'
*/
#define HSUB_UB2(RTYPE, in0, in1, out0, out1) { \
out0 = (RTYPE)__msa_hsub_u_h((v16u8)in0, (v16u8)in0); \
out1 = (RTYPE)__msa_hsub_u_h((v16u8)in1, (v16u8)in1); \
}
#define HSUB_UB2_SH(...) HSUB_UB2(v8i16, __VA_ARGS__)
/* Description : Horizontal subtraction of signed halfword vector elements
Arguments : Inputs - in0, in1
Outputs - out0, out1
Return Type - as per RTYPE
Details : Each signed odd halfword element from 'in0' is subtracted from
even signed halfword element from 'in0' (pairwise) and the
word result is written to 'out0'
*/
#define HSUB_UH2(RTYPE, in0, in1, out0, out1) { \
out0 = (RTYPE)__msa_hsub_s_w((v8i16)in0, (v8i16)in0); \
out1 = (RTYPE)__msa_hsub_s_w((v8i16)in1, (v8i16)in1); \
}
#define HSUB_UH2_SW(...) HSUB_UH2(v4i32, __VA_ARGS__)
/* Description : Insert specified word elements from input vectors to 1
destination vector
Arguments : Inputs - in0, in1, in2, in3 (4 input vectors)
...
...
vp9/common/vp9_rtcd_defs.pl
View file @
1543f2b6
...
...
@@ -948,7 +948,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
specialize
qw/vp9_fdct8x8_quant/
;
}
else
{
add_proto
qw/int64_t vp9_block_error/
,
"
const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz
";
specialize
qw/vp9_block_error avx2/
,
"
$sse2_x86inc
";
specialize
qw/vp9_block_error avx2
msa
/
,
"
$sse2_x86inc
";
add_proto
qw/int64_t vp9_block_error_fp/
,
"
const int16_t *coeff, const int16_t *dqcoeff, int block_size
";
specialize
qw/vp9_block_error_fp sse2/
;
...
...
vp9/encoder/mips/msa/vp9_error_msa.c
0 → 100644
View file @
1543f2b6
/*
* Copyright (c) 2015 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include
"./vp9_rtcd.h"
#include
"vp9/common/mips/msa/vp9_macros_msa.h"
#define BLOCK_ERROR_BLOCKSIZE_MSA(BSize) \
static int64_t block_error_##BSize##size_msa(const int16_t *coeff_ptr, \
const int16_t *dq_coeff_ptr, \
int64_t *ssz) { \
int64_t err = 0; \
uint32_t loop_cnt; \
v8i16 coeff, dq_coeff, coeff_r_h, coeff_l_h; \
v4i32 diff_r, diff_l, coeff_r_w, coeff_l_w; \
v2i64 sq_coeff_r, sq_coeff_l; \
v2i64 err0, err_dup0, err1, err_dup1; \
\
coeff = LD_SH(coeff_ptr); \
dq_coeff = LD_SH(dq_coeff_ptr); \
UNPCK_SH_SW(coeff, coeff_r_w, coeff_l_w); \
ILVRL_H2_SH(coeff, dq_coeff, coeff_r_h, coeff_l_h); \
HSUB_UH2_SW(coeff_r_h, coeff_l_h, diff_r, diff_l); \
DOTP_SW2_SD(coeff_r_w, coeff_l_w, coeff_r_w, coeff_l_w, \
sq_coeff_r, sq_coeff_l); \
DOTP_SW2_SD(diff_r, diff_l, diff_r, diff_l, err0, err1); \
\
coeff = LD_SH(coeff_ptr + 8); \
dq_coeff = LD_SH(dq_coeff_ptr + 8); \
UNPCK_SH_SW(coeff, coeff_r_w, coeff_l_w); \
ILVRL_H2_SH(coeff, dq_coeff, coeff_r_h, coeff_l_h); \
HSUB_UH2_SW(coeff_r_h, coeff_l_h, diff_r, diff_l); \
DPADD_SD2_SD(coeff_r_w, coeff_l_w, sq_coeff_r, sq_coeff_l); \
DPADD_SD2_SD(diff_r, diff_l, err0, err1); \
\
coeff_ptr += 16; \
dq_coeff_ptr += 16; \
\
for (loop_cnt = ((BSize >> 4) - 1); loop_cnt--;) { \
coeff = LD_SH(coeff_ptr); \
dq_coeff = LD_SH(dq_coeff_ptr); \
UNPCK_SH_SW(coeff, coeff_r_w, coeff_l_w); \
ILVRL_H2_SH(coeff, dq_coeff, coeff_r_h, coeff_l_h); \
HSUB_UH2_SW(coeff_r_h, coeff_l_h, diff_r, diff_l); \
DPADD_SD2_SD(coeff_r_w, coeff_l_w, sq_coeff_r, sq_coeff_l); \
DPADD_SD2_SD(diff_r, diff_l, err0, err1); \
\
coeff = LD_SH(coeff_ptr + 8); \
dq_coeff = LD_SH(dq_coeff_ptr + 8); \
UNPCK_SH_SW(coeff, coeff_r_w, coeff_l_w); \
ILVRL_H2_SH(coeff, dq_coeff, coeff_r_h, coeff_l_h); \
HSUB_UH2_SW(coeff_r_h, coeff_l_h, diff_r, diff_l); \
DPADD_SD2_SD(coeff_r_w, coeff_l_w, sq_coeff_r, sq_coeff_l); \
DPADD_SD2_SD(diff_r, diff_l, err0, err1); \
\
coeff_ptr += 16; \
dq_coeff_ptr += 16; \
} \
\
err_dup0 = __msa_splati_d(sq_coeff_r, 1); \
err_dup1 = __msa_splati_d(sq_coeff_l, 1); \
sq_coeff_r += err_dup0; \
sq_coeff_l += err_dup1; \
*ssz = __msa_copy_s_d(sq_coeff_r, 0); \
*ssz += __msa_copy_s_d(sq_coeff_l, 0); \
\
err_dup0 = __msa_splati_d(err0, 1); \
err_dup1 = __msa_splati_d(err1, 1); \
err0 += err_dup0; \
err1 += err_dup1; \
err = __msa_copy_s_d(err0, 0); \
err += __msa_copy_s_d(err1, 0); \
\
return err; \
}
BLOCK_ERROR_BLOCKSIZE_MSA
(
16
);
BLOCK_ERROR_BLOCKSIZE_MSA
(
64
);
BLOCK_ERROR_BLOCKSIZE_MSA
(
256
);
BLOCK_ERROR_BLOCKSIZE_MSA
(
1024
);
int64_t
vp9_block_error_msa
(
const
tran_low_t
*
coeff_ptr
,
const
tran_low_t
*
dq_coeff_ptr
,
intptr_t
blk_size
,
int64_t
*
ssz
)
{
int64_t
err
;
const
int16_t
*
coeff
=
(
const
int16_t
*
)
coeff_ptr
;
const
int16_t
*
dq_coeff
=
(
const
int16_t
*
)
dq_coeff_ptr
;
switch
(
blk_size
)
{
case
16
:
err
=
block_error_16size_msa
(
coeff
,
dq_coeff
,
ssz
);
break
;
case
64
:
err
=
block_error_64size_msa
(
coeff
,
dq_coeff
,
ssz
);
break
;
case
256
:
err
=
block_error_256size_msa
(
coeff
,
dq_coeff
,
ssz
);
break
;
case
1024
:
err
=
block_error_1024size_msa
(
coeff
,
dq_coeff
,
ssz
);
break
;
default:
err
=
vp9_block_error_c
(
coeff_ptr
,
dq_coeff_ptr
,
blk_size
,
ssz
);
break
;
}
return
err
;
}
vp9/vp9cx.mk
View file @
1543f2b6
...
...
@@ -152,11 +152,12 @@ VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_quantize_neon.c
VP9_CX_SRCS-$(HAVE_NEON)
+=
encoder/arm/neon/vp9_subtract_neon.c
VP9_CX_SRCS-$(HAVE_NEON)
+=
encoder/arm/neon/vp9_variance_neon.c
VP9_CX_SRCS-$(HAVE_MSA)
+=
encoder/mips/msa/vp9_avg_msa.c
VP9_CX_SRCS-$(HAVE_MSA)
+=
encoder/mips/msa/vp9_error_msa.c
VP9_CX_SRCS-$(HAVE_MSA)
+=
encoder/mips/msa/vp9_fdct4x4_msa.c
VP9_CX_SRCS-$(HAVE_MSA)
+=
encoder/mips/msa/vp9_fdct8x8_msa.c
VP9_CX_SRCS-$(HAVE_MSA)
+=
encoder/mips/msa/vp9_fdct16x16_msa.c
VP9_CX_SRCS-$(HAVE_MSA)
+=
encoder/mips/msa/vp9_fdct32x32_msa.c
VP9_CX_SRCS-$(HAVE_MSA)
+=
encoder/mips/msa/vp9_fdct_msa.h
VP9_CX_SRCS-$(HAVE_MSA)
+=
encoder/mips/msa/vp9_avg_msa.c
VP9_CX_SRCS-yes
:=
$(
filter-out
$
(
VP9_CX_SRCS_REMOVE-yes
)
,
$
(
VP9_CX_SRCS-yes
))
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment