Skip to content
GitLab
Menu
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
Xiph.Org
aom-rav1e
Commits
2e36149c
Commit
2e36149c
authored
Apr 18, 2015
by
Parag Salasakar
Committed by
Gerrit Code Review
Apr 18, 2015
Browse files
Merge "mips msa vp9 convolve8 vert optimization"
parents
03829f2f
27d083c1
Changes
5
Expand all
Hide whitespace changes
Inline
Side-by-side
test/convolve_test.cc
View file @
2e36149c
...
...
@@ -1814,4 +1814,27 @@ INSTANTIATE_TEST_CASE_P(DSPR2, ConvolveTest, ::testing::Values(
make_tuple
(
32
,
64
,
&
convolve8_dspr2
),
make_tuple
(
64
,
64
,
&
convolve8_dspr2
)));
#endif
#if HAVE_MSA
// Convolve function table for the MIPS MSA build. Only the vertical 8-tap
// entry (vp9_convolve8_vert_msa) is an MSA routine here; every other slot is
// filled with the corresponding *_c function. The trailing 0 is forwarded to
// the ConvolveFunctions constructor as-is (its meaning is defined by that
// class, which is outside this excerpt).
const ConvolveFunctions convolve8_msa(
    vp9_convolve_copy_c, vp9_convolve_avg_c,
    vp9_convolve8_horiz_c, vp9_convolve8_avg_horiz_c,
    vp9_convolve8_vert_msa, vp9_convolve8_avg_vert_c,
    vp9_convolve8_c, vp9_convolve8_avg_c, 0);

// Instantiate ConvolveTest once per (first, second) size pair listed below,
// each paired with the MSA function table.
INSTANTIATE_TEST_CASE_P(MSA, ConvolveTest, ::testing::Values(
    make_tuple(4, 4, &convolve8_msa),
    make_tuple(8, 4, &convolve8_msa),
    make_tuple(4, 8, &convolve8_msa),
    make_tuple(8, 8, &convolve8_msa),
    make_tuple(16, 8, &convolve8_msa),
    make_tuple(8, 16, &convolve8_msa),
    make_tuple(16, 16, &convolve8_msa),
    make_tuple(32, 16, &convolve8_msa),
    make_tuple(16, 32, &convolve8_msa),
    make_tuple(32, 32, &convolve8_msa),
    make_tuple(64, 32, &convolve8_msa),
    make_tuple(32, 64, &convolve8_msa),
    make_tuple(64, 64, &convolve8_msa)));
#endif  // HAVE_MSA
}
// namespace
vp9/common/mips/msa/vp9_convolve8_vert_msa.c
0 → 100644
View file @
2e36149c
This diff is collapsed.
Click to expand it.
vp9/common/mips/msa/vp9_convolve_msa.h
0 → 100644
View file @
2e36149c
/*
* Copyright (c) 2015 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef VP9_COMMON_MIPS_MSA_VP9_CONVOLVE_MSA_H_
#define VP9_COMMON_MIPS_MSA_VP9_CONVOLVE_MSA_H_
#include
"vp9/common/vp9_filter.h"
#include
"vp9/common/mips/msa/vp9_macros_msa.h"
/* Byte table of 16 * 3 = 48 entries, defined in another translation unit.
 * NOTE(review): presumably three 16-byte shuffle masks for __msa_vshf_b --
 * confirm against the definition. */
extern uint8_t mc_filt_mask_arr[16 * 3];
/*
 * HORIZ_8TAP_FILT: horizontal 8-tap filter over a single source vector.
 *
 *   src                source vector; shuffled against itself with each mask.
 *   mask0..mask3       shuffle-control vectors handed to __msa_vshf_b.
 *   filt_h0..filt_h3   filter coefficient vectors, one per mask.
 *
 * The filt_h0/filt_h1 products and the filt_h2/filt_h3 products are
 * accumulated separately (__msa_dotp_s_h followed by __msa_dpadd_s_h), joined
 * with a saturating add (__msa_adds_s_h) and then passed through
 * SRARI_SATURATE_SIGNED_H(sum, FILTER_BITS, 7). The macro is a GNU statement
 * expression that evaluates to the resulting v8i16.
 *
 * Temporaries carry an _m suffix so that a caller argument spelled e.g.
 * `vec0` cannot be captured by the macro's own locals. Every argument is
 * parenthesized before it is cast.
 */
#define HORIZ_8TAP_FILT(src, mask0, mask1, mask2, mask3,                      \
                        filt_h0, filt_h1, filt_h2, filt_h3) ({                \
  v8i16 vec0_m, vec1_m, vec2_m, vec3_m, horiz_out_m;                          \
                                                                              \
  vec0_m = (v8i16)__msa_vshf_b((v16i8)(mask0), (v16i8)(src), (v16i8)(src));   \
  vec0_m = __msa_dotp_s_h((v16i8)vec0_m, (v16i8)(filt_h0));                   \
  vec1_m = (v8i16)__msa_vshf_b((v16i8)(mask1), (v16i8)(src), (v16i8)(src));   \
  vec0_m = __msa_dpadd_s_h(vec0_m, (v16i8)(filt_h1), (v16i8)vec1_m);          \
  vec2_m = (v8i16)__msa_vshf_b((v16i8)(mask2), (v16i8)(src), (v16i8)(src));   \
  vec2_m = __msa_dotp_s_h((v16i8)vec2_m, (v16i8)(filt_h2));                   \
  vec3_m = (v8i16)__msa_vshf_b((v16i8)(mask3), (v16i8)(src), (v16i8)(src));   \
  vec2_m = __msa_dpadd_s_h(vec2_m, (v16i8)(filt_h3), (v16i8)vec3_m);          \
  vec0_m = __msa_adds_s_h(vec0_m, vec2_m);                                    \
  horiz_out_m = (v8i16)SRARI_SATURATE_SIGNED_H(vec0_m, FILTER_BITS, 7);       \
                                                                              \
  horiz_out_m;                                                                \
})
/*
 * HORIZ_8TAP_FILT_2VECS: horizontal 8-tap filter whose input bytes are drawn
 * from two source vectors.
 *
 *   src0, src1         source vectors; each shuffle is
 *                      __msa_vshf_b(maskN, src1, src0).
 *   mask0..mask3       shuffle-control vectors.
 *   filt_h0..filt_h3   filter coefficient vectors, one per mask.
 *
 * The filt_h0/filt_h1 products and the filt_h2/filt_h3 products are
 * accumulated separately, joined with a saturating add (__msa_adds_s_h) and
 * then passed through SRARI_SATURATE_SIGNED_H(sum, FILTER_BITS, 7). The macro
 * is a GNU statement expression that evaluates to the resulting v8i16.
 *
 * Every macro argument is parenthesized before the cast is applied, so an
 * expression argument is cast as a whole. Temporaries carry an _m suffix so
 * that caller variables with the same spelling cannot be captured.
 */
#define HORIZ_8TAP_FILT_2VECS(src0, src1, mask0, mask1, mask2, mask3,          \
                              filt_h0, filt_h1, filt_h2, filt_h3) ({           \
  v8i16 vec0_m, vec1_m, vec2_m, vec3_m, horiz_out_m;                           \
                                                                               \
  vec0_m = (v8i16)__msa_vshf_b((v16i8)(mask0), (v16i8)(src1), (v16i8)(src0));  \
  vec0_m = __msa_dotp_s_h((v16i8)vec0_m, (v16i8)(filt_h0));                    \
  vec1_m = (v8i16)__msa_vshf_b((v16i8)(mask1), (v16i8)(src1), (v16i8)(src0));  \
  vec0_m = __msa_dpadd_s_h(vec0_m, (v16i8)(filt_h1), (v16i8)vec1_m);           \
  vec2_m = (v8i16)__msa_vshf_b((v16i8)(mask2), (v16i8)(src1), (v16i8)(src0));  \
  vec2_m = __msa_dotp_s_h((v16i8)vec2_m, (v16i8)(filt_h2));                    \
  vec3_m = (v8i16)__msa_vshf_b((v16i8)(mask3), (v16i8)(src1), (v16i8)(src0));  \
  vec2_m = __msa_dpadd_s_h(vec2_m, (v16i8)(filt_h3), (v16i8)vec3_m);           \
  vec0_m = __msa_adds_s_h(vec0_m, vec2_m);                                     \
  horiz_out_m = (v8i16)SRARI_SATURATE_SIGNED_H(vec0_m, FILTER_BITS, 7);        \
                                                                               \
  horiz_out_m;                                                                 \
})
/*
 * FILT_8TAP_DPADD_S_H: combine four already-arranged input vectors with four
 * filter coefficient vectors.
 *
 *   vec0..vec3     input vectors (cast to v16i8 for the byte dot products).
 *   filt0..filt3   filter coefficient vectors, paired index-for-index with
 *                  vec0..vec3.
 *
 * Computes dotp(vec0, filt0) + dpadd(vec1, filt1) into one accumulator and
 * dotp(vec2, filt2) + dpadd(vec3, filt3) into a second, then joins the two
 * with a saturating add (__msa_adds_s_h). No rounding or shift is applied
 * here. The macro is a GNU statement expression that evaluates to the v8i16
 * sum.
 *
 * Every macro argument is parenthesized before the cast is applied.
 * Temporaries carry an _m suffix so a caller variable named tmp0/tmp1 passed
 * as an argument is not captured by the macro's own locals.
 */
#define FILT_8TAP_DPADD_S_H(vec0, vec1, vec2, vec3,                     \
                            filt0, filt1, filt2, filt3) ({              \
  v8i16 tmp0_m, tmp1_m;                                                 \
                                                                        \
  tmp0_m = __msa_dotp_s_h((v16i8)(vec0), (v16i8)(filt0));               \
  tmp0_m = __msa_dpadd_s_h(tmp0_m, (v16i8)(vec1), (v16i8)(filt1));      \
  tmp1_m = __msa_dotp_s_h((v16i8)(vec2), (v16i8)(filt2));               \
  tmp1_m = __msa_dpadd_s_h(tmp1_m, (v16i8)(vec3), (v16i8)(filt3));      \
  tmp0_m = __msa_adds_s_h(tmp0_m, tmp1_m);                              \
                                                                        \
  tmp0_m;                                                               \
})
/*
 * HORIZ_8TAP_4WID_4VECS_FILT: horizontal 8-tap filter over four source
 * vectors, processed as two pairs.
 *
 *   src0..src3     source vectors; shuffles draw from the pair (src1, src0)
 *                  and from the pair (src3, src2).
 *   mask0..mask3   shuffle-control vectors handed to __msa_vshf_b.
 *   filt0..filt3   filter coefficient vectors, one per mask.
 *   out0, out1     assignable v8i16 lvalues receiving the result for the
 *                  (src0, src1) pair and the (src2, src3) pair respectively.
 *
 * For each pair, the filt0/filt1 products and the filt2/filt3 products are
 * accumulated separately and joined with a saturating add (__msa_adds_s_h).
 * No rounding or shift is applied here.
 *
 * This is a braced statement block, not an expression. filt1 is cast to
 * v16i8 like every other filter operand, so the macro accepts the same
 * argument types for all four filters.
 * NOTE(review): the "4WID" in the name suggests 4-pixel-wide blocks --
 * confirm against the callers.
 */
#define HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3,                     \
                                   mask0, mask1, mask2, mask3,                 \
                                   filt0, filt1, filt2, filt3,                 \
                                   out0, out1) {                               \
  v8i16 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m;        \
  v8i16 res0_m, res1_m, res2_m, res3_m;                                        \
                                                                               \
  vec0_m = (v8i16)__msa_vshf_b((v16i8)(mask0), (v16i8)(src1), (v16i8)(src0));  \
  vec1_m = (v8i16)__msa_vshf_b((v16i8)(mask0), (v16i8)(src3), (v16i8)(src2));  \
                                                                               \
  res0_m = __msa_dotp_s_h((v16i8)vec0_m, (v16i8)(filt0));                      \
  res1_m = __msa_dotp_s_h((v16i8)vec1_m, (v16i8)(filt0));                      \
                                                                               \
  vec2_m = (v8i16)__msa_vshf_b((v16i8)(mask1), (v16i8)(src1), (v16i8)(src0));  \
  vec3_m = (v8i16)__msa_vshf_b((v16i8)(mask1), (v16i8)(src3), (v16i8)(src2));  \
                                                                               \
  res0_m = __msa_dpadd_s_h(res0_m, (v16i8)(filt1), (v16i8)vec2_m);             \
  res1_m = __msa_dpadd_s_h(res1_m, (v16i8)(filt1), (v16i8)vec3_m);             \
                                                                               \
  vec4_m = (v8i16)__msa_vshf_b((v16i8)(mask2), (v16i8)(src1), (v16i8)(src0));  \
  vec5_m = (v8i16)__msa_vshf_b((v16i8)(mask2), (v16i8)(src3), (v16i8)(src2));  \
                                                                               \
  res2_m = __msa_dotp_s_h((v16i8)(filt2), (v16i8)vec4_m);                      \
  res3_m = __msa_dotp_s_h((v16i8)(filt2), (v16i8)vec5_m);                      \
                                                                               \
  vec6_m = (v8i16)__msa_vshf_b((v16i8)(mask3), (v16i8)(src1), (v16i8)(src0));  \
  vec7_m = (v8i16)__msa_vshf_b((v16i8)(mask3), (v16i8)(src3), (v16i8)(src2));  \
                                                                               \
  res2_m = __msa_dpadd_s_h(res2_m, (v16i8)(filt3), (v16i8)vec6_m);             \
  res3_m = __msa_dpadd_s_h(res3_m, (v16i8)(filt3), (v16i8)vec7_m);             \
                                                                               \
  out0 = __msa_adds_s_h(res0_m, res2_m);                                       \
  out1 = __msa_adds_s_h(res1_m, res3_m);                                       \
}
/*
 * HORIZ_8TAP_8WID_4VECS_FILT: horizontal 8-tap filter applied independently
 * to four source vectors.
 *
 *   src0..src3     source vectors; each one is shuffled against itself
 *                  (__msa_vshf_b(maskN, srcK, srcK)).
 *   mask0..mask3   shuffle-control vectors.
 *   filt0..filt3   filter coefficient vectors, one per mask.
 *   out0..out3     assignable lvalues receiving the result for src0..src3.
 *
 * Per source vector K:
 *   resK     = dotp(shuffle(mask0), filt0) then dpadd(filt1, shuffle(mask1))
 *   resK + 4 = dotp(shuffle(mask2), filt2) then dpadd(filt3, shuffle(mask3))
 *   outK     = saturating add (__msa_adds_s_h) of the two accumulators.
 * No rounding or shift is applied here.
 *
 * vec0_m..vec3_m hold the mask0 shuffles and are then reused for the mask2
 * shuffles; vec4_m..vec7_m hold the mask1 shuffles and are then reused for
 * the mask3 shuffles. The statement order below therefore matters.
 *
 * This is a braced statement block, not an expression.
 * NOTE(review): the "8WID" in the name suggests 8-pixel-wide blocks --
 * confirm against the callers.
 */
#define HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, \
mask0, mask1, mask2, mask3, \
filt0, filt1, filt2, filt3, \
out0, out1, out2, out3) { \
v8i16 vec0_m, vec1_m, vec2_m, vec3_m; \
v8i16 vec4_m, vec5_m, vec6_m, vec7_m; \
v8i16 res0_m, res1_m, res2_m, res3_m; \
v8i16 res4_m, res5_m, res6_m, res7_m; \
\
vec0_m = (v8i16)__msa_vshf_b((v16i8)(mask0), (v16i8)(src0), (v16i8)(src0)); \
vec1_m = (v8i16)__msa_vshf_b((v16i8)(mask0), (v16i8)(src1), (v16i8)(src1)); \
vec2_m = (v8i16)__msa_vshf_b((v16i8)(mask0), (v16i8)(src2), (v16i8)(src2)); \
vec3_m = (v8i16)__msa_vshf_b((v16i8)(mask0), (v16i8)(src3), (v16i8)(src3)); \
\
res0_m = __msa_dotp_s_h((v16i8)vec0_m, (v16i8)(filt0)); \
res1_m = __msa_dotp_s_h((v16i8)vec1_m, (v16i8)(filt0)); \
res2_m = __msa_dotp_s_h((v16i8)vec2_m, (v16i8)(filt0)); \
res3_m = __msa_dotp_s_h((v16i8)vec3_m, (v16i8)(filt0)); \
\
vec0_m = (v8i16)__msa_vshf_b((v16i8)(mask2), (v16i8)(src0), (v16i8)(src0)); \
vec1_m = (v8i16)__msa_vshf_b((v16i8)(mask2), (v16i8)(src1), (v16i8)(src1)); \
vec2_m = (v8i16)__msa_vshf_b((v16i8)(mask2), (v16i8)(src2), (v16i8)(src2)); \
vec3_m = (v8i16)__msa_vshf_b((v16i8)(mask2), (v16i8)(src3), (v16i8)(src3)); \
\
res4_m = __msa_dotp_s_h((v16i8)vec0_m, (v16i8)(filt2)); \
res5_m = __msa_dotp_s_h((v16i8)vec1_m, (v16i8)(filt2)); \
res6_m = __msa_dotp_s_h((v16i8)vec2_m, (v16i8)(filt2)); \
res7_m = __msa_dotp_s_h((v16i8)vec3_m, (v16i8)(filt2)); \
\
vec4_m = (v8i16)__msa_vshf_b((v16i8)(mask1), (v16i8)(src0), (v16i8)(src0)); \
vec5_m = (v8i16)__msa_vshf_b((v16i8)(mask1), (v16i8)(src1), (v16i8)(src1)); \
vec6_m = (v8i16)__msa_vshf_b((v16i8)(mask1), (v16i8)(src2), (v16i8)(src2)); \
vec7_m = (v8i16)__msa_vshf_b((v16i8)(mask1), (v16i8)(src3), (v16i8)(src3)); \
\
res0_m = __msa_dpadd_s_h(res0_m, (v16i8)(filt1), (v16i8)vec4_m); \
res1_m = __msa_dpadd_s_h(res1_m, (v16i8)(filt1), (v16i8)vec5_m); \
res2_m = __msa_dpadd_s_h(res2_m, (v16i8)(filt1), (v16i8)vec6_m); \
res3_m = __msa_dpadd_s_h(res3_m, (v16i8)(filt1), (v16i8)vec7_m); \
\
vec4_m = (v8i16)__msa_vshf_b((v16i8)(mask3), (v16i8)(src0), (v16i8)(src0)); \
vec5_m = (v8i16)__msa_vshf_b((v16i8)(mask3), (v16i8)(src1), (v16i8)(src1)); \
vec6_m = (v8i16)__msa_vshf_b((v16i8)(mask3), (v16i8)(src2), (v16i8)(src2)); \
vec7_m = (v8i16)__msa_vshf_b((v16i8)(mask3), (v16i8)(src3), (v16i8)(src3)); \
\
res4_m = __msa_dpadd_s_h(res4_m, (v16i8)(filt3), (v16i8)vec4_m); \
res5_m = __msa_dpadd_s_h(res5_m, (v16i8)(filt3), (v16i8)vec5_m); \
res6_m = __msa_dpadd_s_h(res6_m, (v16i8)(filt3), (v16i8)vec6_m); \
res7_m = __msa_dpadd_s_h(res7_m, (v16i8)(filt3), (v16i8)vec7_m); \
\
out0 = __msa_adds_s_h(res0_m, res4_m); \
out1 = __msa_adds_s_h(res1_m, res5_m); \
out2 = __msa_adds_s_h(res2_m, res6_m); \
out3 = __msa_adds_s_h(res3_m, res7_m); \
}
#endif
/* VP9_COMMON_MIPS_MSA_VP9_CONVOLVE_MSA_H_ */
vp9/common/vp9_rtcd_defs.pl
View file @
2e36149c
...
...
@@ -298,7 +298,7 @@ add_proto qw/void vp9_convolve8_horiz/, "const uint8_t *src, ptrdiff_t src_strid
specialize
qw/vp9_convolve8_horiz sse2 ssse3 neon dspr2/
,
"
$avx2_ssse3
";
add_proto
qw/void vp9_convolve8_vert/
,
"
const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h
";
specialize
qw/vp9_convolve8_vert sse2 ssse3 neon dspr2/
,
"
$avx2_ssse3
";
specialize
qw/vp9_convolve8_vert sse2 ssse3 neon dspr2
msa
/
,
"
$avx2_ssse3
";
add_proto
qw/void vp9_convolve8_avg/
,
"
const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h
";
specialize
qw/vp9_convolve8_avg sse2 ssse3 neon dspr2/
;
...
...
vp9/vp9_common.mk
View file @
2e36149c
...
...
@@ -131,6 +131,8 @@ VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_mblpf_vert_loopfilter_ds
# common (msa)
VP9_COMMON_SRCS-$(HAVE_MSA)
+=
common/mips/msa/vp9_macros_msa.h
VP9_COMMON_SRCS-$(HAVE_MSA)
+=
common/mips/msa/vp9_convolve8_vert_msa.c
VP9_COMMON_SRCS-$(HAVE_MSA)
+=
common/mips/msa/vp9_convolve_msa.h
VP9_COMMON_SRCS-$(HAVE_SSE2)
+=
common/x86/vp9_idct_intrin_sse2.c
VP9_COMMON_SRCS-$(HAVE_SSE2)
+=
common/x86/vp9_idct_intrin_sse2.h
...
...
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment