Commit d9fedf78 authored by Parag Salasakar

mips msa vp9 fdct 32x32 optimization

average improvement ~4x-6x

Change-Id: Ibcac3ef8ed5e207cf8c121e696570e6b63d3c0f4
parent fa53008f
......@@ -386,7 +386,9 @@ INSTANTIATE_TEST_CASE_P(
INSTANTIATE_TEST_CASE_P(
MSA, Trans32x32Test,
::testing::Values(
make_tuple(&vp9_fdct32x32_c,
&vp9_idct32x32_1024_add_msa, 0, VPX_BITS_8)));
make_tuple(&vp9_fdct32x32_msa,
&vp9_idct32x32_1024_add_msa, 0, VPX_BITS_8),
make_tuple(&vp9_fdct32x32_rd_msa,
&vp9_idct32x32_1024_add_msa, 1, VPX_BITS_8)));
#endif // HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
} // namespace
......@@ -720,6 +720,24 @@
}
#define DOTP_SH4_SW(...) DOTP_SH4(v4i32, __VA_ARGS__)
/* Description : Dot product of word vector elements
Arguments : Inputs - mult0, mult1
cnst0, cnst1
Outputs - out0, out1
Return Type - signed word
Details : Signed word elements from mult0 are multiplied with
signed word elements from cnst0 producing a result
twice the size of input i.e. signed double word.
Then this multiplication results of adjacent odd-even elements
are added together and stored to the out vector
(2 signed double word results)
*/
#define DOTP_SW2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) { \
out0 = (RTYPE)__msa_dotp_s_d((v4i32)mult0, (v4i32)cnst0); \
out1 = (RTYPE)__msa_dotp_s_d((v4i32)mult1, (v4i32)cnst1); \
}
#define DOTP_SW2_SD(...) DOTP_SW2(v2i64, __VA_ARGS__)
/* Description : Dot product & addition of byte vector elements
Arguments : Inputs - mult0, mult1
cnst0, cnst1
......@@ -1103,7 +1121,7 @@
Return Type - unsigned halfword
Details : Each unsigned halfword element from 'in0' is saturated to the
value generated with (sat_val+1) bit range.
The results are in placed to original vectors
The results are stored in place
*/
#define SAT_UH2(RTYPE, in0, in1, sat_val) { \
in0 = (RTYPE)__msa_sat_u_h((v8u16)in0, sat_val); \
......@@ -1125,7 +1143,7 @@
Return Type - signed halfword
Details : Each signed halfword element from 'in0' is saturated to the
value generated with (sat_val+1) bit range
The results are in placed to original vectors
The results are stored in place
*/
#define SAT_SH2(RTYPE, in0, in1, sat_val) { \
in0 = (RTYPE)__msa_sat_s_h((v8i16)in0, sat_val); \
......@@ -1440,6 +1458,24 @@
ILVRL_B2_SH(zero_m, in, out0, out1); \
}
/* Description : Sign extend halfword elements from input vector and return
                 the result in a pair of vectors
   Arguments   : Input   - in          (1 input halfword vector)
                 Outputs - out0, out1  (2 sign extended word vectors)
                 Return Type - signed word
   Details     : Every signed halfword element of 'in' is compared against
                 zero (__msa_clti_s_h); the compare result supplies the
                 sign extension halfwords.
                 That result is interleaved right with 'in' to generate
                 4 signed word elements in 'out0', then interleaved left
                 with 'in' to generate 4 signed word elements in 'out1'.
                 'in' is expanded more than once, so it must be free of
                 side effects; it is parenthesized so that an expression
                 argument is cast as a whole.
*/
#define UNPCK_SH_SW(in, out0, out1) {        \
  v8i16 tmp_m;                               \
                                             \
  tmp_m = __msa_clti_s_h((v8i16)(in), 0);    \
  ILVRL_H2_SW(tmp_m, (in), out0, out1);      \
}
/* Description : Butterfly of 4 input vectors
Arguments : Inputs - in0, in1, in2, in3
Outputs - out0, out1, out2, out3
......
......@@ -1053,13 +1053,13 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
specialize qw/vp9_fdct16x16 sse2 msa/;
add_proto qw/void vp9_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vp9_fdct32x32_1 sse2/;
specialize qw/vp9_fdct32x32_1 sse2 msa/;
add_proto qw/void vp9_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vp9_fdct32x32 sse2 avx2/;
specialize qw/vp9_fdct32x32 sse2 avx2 msa/;
add_proto qw/void vp9_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vp9_fdct32x32_rd sse2 avx2/;
specialize qw/vp9_fdct32x32_rd sse2 avx2 msa/;
}
#
......
This diff is collapsed.
......@@ -15,6 +15,24 @@
#include "vp9/common/vp9_idct.h"
#include "vp9/common/mips/msa/vp9_macros_msa.h"
/* Combines the halfword vectors reg0 and reg1 with the constant pair
   (cnst0, cnst1) and writes two halfword vectors:
     out0 - built from the interleave of (-reg1) with reg0
     out1 - built from the interleave of reg0 with reg1
   Each interleaved vector is dot-multiplied with a coefficient vector whose
   even halfword lanes come from cnst0 and odd lanes from cnst1, rounded and
   shifted right by DCT_CONST_BITS, then packed back to halfwords.
   NOTE(review): presumably out0 = round(reg0 * cnst0 - reg1 * cnst1) and
   out1 = round(reg1 * cnst0 + reg0 * cnst1) per element; confirm against
   the ILVRL_H2_SW / DOTP_SH2_SW definitions.
   Both interleaves are done before out0 is written, so out0/out1 may alias
   reg0/reg1. */
#define VP9_DOTP_CONST_PAIR(reg0, reg1, cnst0, cnst1, out0, out1) {    \
  v8i16 coef_m = __msa_fill_h(cnst0);                                  \
  v8i16 coef1_m = __msa_fill_h(cnst1);                                 \
  v4i32 dif_r_m, dif_l_m, sum_r_m, sum_l_m;                            \
                                                                       \
  coef_m = __msa_ilvev_h(coef1_m, coef_m);                             \
                                                                       \
  ILVRL_H2_SW((-reg1), reg0, dif_r_m, dif_l_m);                        \
  ILVRL_H2_SW(reg0, reg1, sum_r_m, sum_l_m);                           \
                                                                       \
  DOTP_SH2_SW(dif_r_m, dif_l_m, coef_m, coef_m, dif_r_m, dif_l_m);     \
  SRARI_W2_SW(dif_r_m, dif_l_m, DCT_CONST_BITS);                       \
  out0 = __msa_pckev_h((v8i16)dif_l_m, (v8i16)dif_r_m);                \
                                                                       \
  DOTP_SH2_SW(sum_r_m, sum_l_m, coef_m, coef_m, sum_r_m, sum_l_m);     \
  SRARI_W2_SW(sum_r_m, sum_l_m, DCT_CONST_BITS);                       \
  out1 = __msa_pckev_h((v8i16)sum_l_m, (v8i16)sum_r_m);                \
}
#define VP9_DOT_SHIFT_RIGHT_PCK_H(in0, in1, in2) ({ \
v8i16 dst_m; \
v4i32 tp0_m, tp1_m; \
......@@ -148,7 +166,6 @@
out3 = VP9_DOT_SHIFT_RIGHT_PCK_H(s6_m, s7_m, x2_m); \
}
/* FDCT16x16 specific */
#define VP9_FDCT8x16_ODD(input0, input1, input2, input3, \
input4, input5, input6, input7, \
out1, out3, out5, out7, \
......@@ -250,4 +267,67 @@
cnst0_m = __msa_ilvev_h(cnst0_m, cnst1_m); \
out3 = VP9_DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m); \
}
#define VP9_FDCT32_POSTPROC_NEG_W(vec) { \
v4i32 temp_m; \
v4i32 one_m = __msa_ldi_w(1); \
\
temp_m = __msa_clti_s_w(vec, 0); \
vec += 1; \
temp_m = one_m & temp_m; \
vec += temp_m; \
vec >>= 2; \
}
#define VP9_FDCT32_POSTPROC_2V_POS_H(vec0, vec1) { \
v8i16 tp0_m, tp1_m; \
v8i16 one = __msa_ldi_h(1); \
\
tp0_m = __msa_clei_s_h(vec0, 0); \
tp1_m = __msa_clei_s_h(vec1, 0); \
tp0_m = (v8i16)__msa_xori_b((v16u8)tp0_m, 255); \
tp1_m = (v8i16)__msa_xori_b((v16u8)tp1_m, 255); \
vec0 += 1; \
vec1 += 1; \
tp0_m = one & tp0_m; \
tp1_m = one & tp1_m; \
vec0 += tp0_m; \
vec1 += tp1_m; \
vec0 >>= 2; \
vec1 >>= 2; \
}
/* Word-element variant of the constant-pair dot product.
   Inputs  : reg0_left, reg1_left, reg0_right, reg1_right - word vectors
             const0, const1 - scalars, converted to int32_t and replicated
   Outputs : out0, out1 - from the interleave of (-reg1) with reg0, for the
                          left and right inputs respectively
             out2, out3 - from the interleave of reg0 with reg1, for the
                          left and right inputs respectively
   Each interleaved vector is dot-multiplied (DOTP_SW2_SD, double word
   results) with a coefficient vector whose even word lanes come from const0
   and odd lanes from const1, rounded and shifted right by DCT_CONST_BITS,
   then packed back to words.
   NOTE(review): presumably out0/out1 = round(reg0 * const0 - reg1 * const1)
   and out2/out3 = round(reg1 * const0 + reg0 * const1) per element; confirm
   against the ILVRL_W2_SW definition.
   All interleaves are done before any output is written, so the outputs may
   alias the inputs. Negated and cast arguments are parenthesized so that
   expression arguments are handled as a whole. */
#define VP9_DOTP_CONST_PAIR_W(reg0_left, reg1_left, reg0_right,      \
                              reg1_right, const0, const1,            \
                              out0, out1, out2, out3) {              \
  v4i32 s0_m, s1_m, s2_m, s3_m, s4_m, s5_m, s6_m, s7_m;              \
  v2i64 tp0_m, tp1_m, tp2_m, tp3_m;                                  \
  v4i32 k0_m = __msa_fill_w((int32_t)(const0));                      \
                                                                     \
  s0_m = __msa_fill_w((int32_t)(const1));                            \
  k0_m = __msa_ilvev_w(s0_m, k0_m);                                  \
                                                                     \
  ILVRL_W2_SW((-(reg1_left)), reg0_left, s1_m, s0_m);                \
  ILVRL_W2_SW(reg0_left, reg1_left, s3_m, s2_m);                     \
  ILVRL_W2_SW((-(reg1_right)), reg0_right, s5_m, s4_m);              \
  ILVRL_W2_SW(reg0_right, reg1_right, s7_m, s6_m);                   \
                                                                     \
  DOTP_SW2_SD(s0_m, s1_m, k0_m, k0_m, tp0_m, tp1_m);                 \
  DOTP_SW2_SD(s4_m, s5_m, k0_m, k0_m, tp2_m, tp3_m);                 \
  tp0_m = __msa_srari_d(tp0_m, DCT_CONST_BITS);                      \
  tp1_m = __msa_srari_d(tp1_m, DCT_CONST_BITS);                      \
  tp2_m = __msa_srari_d(tp2_m, DCT_CONST_BITS);                      \
  tp3_m = __msa_srari_d(tp3_m, DCT_CONST_BITS);                      \
  out0 = __msa_pckev_w((v4i32)tp0_m, (v4i32)tp1_m);                  \
  out1 = __msa_pckev_w((v4i32)tp2_m, (v4i32)tp3_m);                  \
                                                                     \
  DOTP_SW2_SD(s2_m, s3_m, k0_m, k0_m, tp0_m, tp1_m);                 \
  DOTP_SW2_SD(s6_m, s7_m, k0_m, k0_m, tp2_m, tp3_m);                 \
  tp0_m = __msa_srari_d(tp0_m, DCT_CONST_BITS);                      \
  tp1_m = __msa_srari_d(tp1_m, DCT_CONST_BITS);                      \
  tp2_m = __msa_srari_d(tp2_m, DCT_CONST_BITS);                      \
  tp3_m = __msa_srari_d(tp3_m, DCT_CONST_BITS);                      \
  out2 = __msa_pckev_w((v4i32)tp0_m, (v4i32)tp1_m);                  \
  out3 = __msa_pckev_w((v4i32)tp2_m, (v4i32)tp3_m);                  \
}
#endif /* VP9_ENCODER_MIPS_MSA_VP9_FDCT_MSA_H_ */
......@@ -153,6 +153,7 @@ VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_subtract_neon.c
VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_variance_neon.c
VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_fdct16x16_msa.c
VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_fdct32x32_msa.c
VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_fdct_msa.h
VP9_CX_SRCS-yes := $(filter-out $(VP9_CX_SRCS_REMOVE-yes),$(VP9_CX_SRCS-yes))
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment