Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
Xiph.Org
aom-rav1e
Commits
d43fd998
Commit
d43fd998
authored
Jun 05, 2015
by
Parag Salasakar
Browse files
mips msa vp9 loopfilter 4, 8 optimization
average improvement ~3x-4x Change-Id: I59279293ce4b2a1e99bd10579ac97740e943643f
parent
dc07cc6f
Changes
5
Hide whitespace changes
Inline
Side-by-side
test/lpf_8_test.cc
View file @
d43fd998
...
...
@@ -694,9 +694,23 @@ INSTANTIATE_TEST_CASE_P(
// Instantiates Loop8Test6Param for the MSA single-edge loop filters.
// Each tuple names an *_msa function, the matching *_c function, and two
// integer parameters.
// NOTE(review): the *_c entry is presumably the reference the MSA output is
// compared against, and the integers look like bit depth and the filter's
// `count` argument -- confirm against the Loop8Test6Param fixture.
INSTANTIATE_TEST_CASE_P(
    MSA, Loop8Test6Param,
    ::testing::Values(
        make_tuple(&vp9_lpf_horizontal_8_msa, &vp9_lpf_horizontal_8_c, 8, 1),
        make_tuple(&vp9_lpf_horizontal_16_msa, &vp9_lpf_horizontal_16_c, 8, 1),
        make_tuple(&vp9_lpf_horizontal_16_msa, &vp9_lpf_horizontal_16_c, 8, 2),
        make_tuple(&vp9_lpf_vertical_8_msa, &vp9_lpf_vertical_8_c, 8, 1),
        make_tuple(&wrapper_vertical_16_msa, &wrapper_vertical_16_c, 8, 1)));
// Instantiates Loop8Test9Param for the MSA dual-edge loop filters.
// Each tuple names a *_dual_msa function, the matching *_dual_c function, and
// one integer parameter.
// NOTE(review): the *_c entry is presumably the reference implementation and
// the integer looks like the bit depth -- confirm against the
// Loop8Test9Param fixture.
INSTANTIATE_TEST_CASE_P(
    MSA, Loop8Test9Param,
    ::testing::Values(
        make_tuple(&vp9_lpf_horizontal_4_dual_msa,
                   &vp9_lpf_horizontal_4_dual_c, 8),
        make_tuple(&vp9_lpf_horizontal_8_dual_msa,
                   &vp9_lpf_horizontal_8_dual_c, 8),
        make_tuple(&vp9_lpf_vertical_4_dual_msa,
                   &vp9_lpf_vertical_4_dual_c, 8),
        make_tuple(&vp9_lpf_vertical_8_dual_msa,
                   &vp9_lpf_vertical_8_dual_c, 8)));
#endif // HAVE_MSA && (!CONFIG_VP9_HIGHBITDEPTH)
}
// namespace
vp9/common/mips/msa/vp9_loopfilter_4_msa.c
0 → 100644
View file @
d43fd998
/*
* Copyright (c) 2015 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "vp9/common/mips/msa/vp9_loopfilter_msa.h"
/*
 * MSA 4-tap loop filter across a horizontal edge.
 *
 * src points at the first row below the edge (q0); rows are pitch bytes
 * apart. b_limit_ptr, limit_ptr and thresh_ptr each point at a single byte
 * that is broadcast to every vector lane. The four filtered rows p1, p0, q0
 * and q1 are written back as one 64-bit store each, starting at
 * src - 2 * pitch.
 *
 * count is ignored.
 * NOTE(review): only one 64-bit group per row is stored, so callers
 * presumably never rely on count > 1 -- confirm.
 */
void vp9_lpf_horizontal_4_msa(uint8_t *src, int32_t pitch,
                              const uint8_t *b_limit_ptr,
                              const uint8_t *limit_ptr,
                              const uint8_t *thresh_ptr,
                              int32_t count) {
  v16u8 p3, p2, p1, p0, q0, q1, q2, q3;
  v16u8 p1_out, p0_out, q0_out, q1_out;
  v16u8 thresh, b_limit, limit;
  v16u8 mask, hev, flat;
  uint64_t p1_lo, p0_lo, q0_lo, q1_lo;
  uint8_t *dst = src - 2 * pitch;

  (void)count;

  /* splat the scalar filter parameters */
  thresh = (v16u8)__msa_fill_b(*thresh_ptr);
  b_limit = (v16u8)__msa_fill_b(*b_limit_ptr);
  limit = (v16u8)__msa_fill_b(*limit_ptr);

  /* rows p3..q3 surrounding the edge */
  LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);

  /* flat is produced by the macro but not needed by the 4-tap filter */
  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
               hev, mask, flat);
  VP9_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev,
                     p1_out, p0_out, q0_out, q1_out);

  /* doubleword element 0 of each result is what gets stored */
  p1_lo = __msa_copy_u_d((v2i64)p1_out, 0);
  p0_lo = __msa_copy_u_d((v2i64)p0_out, 0);
  q0_lo = __msa_copy_u_d((v2i64)q0_out, 0);
  q1_lo = __msa_copy_u_d((v2i64)q1_out, 0);
  SD4(p1_lo, p0_lo, q0_lo, q1_lo, dst, pitch);
}
/*
 * MSA 4-tap loop filter across a horizontal edge, two segments at once.
 *
 * The *0 parameters apply to the low 8 byte lanes and the *1 parameters to
 * the high 8 byte lanes: __msa_ilvr_d(a, b) yields the low doubleword of b
 * followed by the low doubleword of a. Each *_ptr argument points at a
 * single byte. Rows p1, p0, q0 and q1 are filtered in place and written
 * back as full vectors starting at src - 2 * pitch.
 */
void vp9_lpf_horizontal_4_dual_msa(uint8_t *src, int32_t pitch,
                                   const uint8_t *b_limit0_ptr,
                                   const uint8_t *limit0_ptr,
                                   const uint8_t *thresh0_ptr,
                                   const uint8_t *b_limit1_ptr,
                                   const uint8_t *limit1_ptr,
                                   const uint8_t *thresh1_ptr) {
  v16u8 p3, p2, p1, p0, q0, q1, q2, q3;
  v16u8 thresh, b_limit, limit;
  v16u8 mask, hev, flat;

  /* rows p3..q3 surrounding the edge */
  LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);

  /* per-segment parameters: segment 0 in the low half, segment 1 in the
   * high half */
  thresh = (v16u8)__msa_ilvr_d((v2i64)__msa_fill_b(*thresh1_ptr),
                               (v2i64)__msa_fill_b(*thresh0_ptr));
  b_limit = (v16u8)__msa_ilvr_d((v2i64)__msa_fill_b(*b_limit1_ptr),
                                (v2i64)__msa_fill_b(*b_limit0_ptr));
  limit = (v16u8)__msa_ilvr_d((v2i64)__msa_fill_b(*limit1_ptr),
                              (v2i64)__msa_fill_b(*limit0_ptr));

  /* flat is produced by the macro but not needed by the 4-tap filter */
  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
               hev, mask, flat);
  VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1, p0, q0, q1);

  ST_UB4(p1, p0, q0, q1, (src - 2 * pitch), pitch);
}
/*
 * MSA 4-tap loop filter across a vertical edge.
 *
 * Eight rows are loaded starting 4 bytes left of src and transposed so the
 * columns p3..q3 become vectors. After filtering, p1/p0/q0/q1 are
 * re-interleaved and stored as two groups of four rows, 4 bytes per row,
 * starting at src - 2.
 *
 * b_limit_ptr, limit_ptr and thresh_ptr each point at a single byte that
 * is broadcast to every lane. count is ignored.
 * NOTE(review): eight rows are always processed regardless of count --
 * confirm callers never pass count > 1.
 */
void vp9_lpf_vertical_4_msa(uint8_t *src, int32_t pitch,
                            const uint8_t *b_limit_ptr,
                            const uint8_t *limit_ptr,
                            const uint8_t *thresh_ptr,
                            int32_t count) {
  v16u8 p3, p2, p1, p0, q0, q1, q2, q3;
  v16u8 thresh, b_limit, limit;
  v16u8 mask, hev, flat;
  v8i16 p1p0, q0q1, rows_lo, rows_hi;
  uint8_t *dst = src - 2;

  (void)count;

  /* splat the scalar filter parameters */
  thresh = (v16u8)__msa_fill_b(*thresh_ptr);
  b_limit = (v16u8)__msa_fill_b(*b_limit_ptr);
  limit = (v16u8)__msa_fill_b(*limit_ptr);

  /* load the rows, then transpose in place so p3..q3 hold columns */
  LD_UB8((src - 4), pitch, p3, p2, p1, p0, q0, q1, q2, q3);
  TRANSPOSE8x8_UB_UB(p3, p2, p1, p0, q0, q1, q2, q3,
                     p3, p2, p1, p0, q0, q1, q2, q3);

  /* flat is produced by the macro but not needed by the 4-tap filter */
  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
               hev, mask, flat);
  VP9_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev, p1, p0, q0, q1);

  /* interleave bytes, then halfwords, to rebuild p1 p0 q0 q1 per row */
  ILVR_B2_SH(p0, p1, q1, q0, p1p0, q0q1);
  ILVRL_H2_SH(q0q1, p1p0, rows_lo, rows_hi);

  ST4x4_UB(rows_lo, rows_lo, 0, 1, 2, 3, dst, pitch);
  ST4x4_UB(rows_hi, rows_hi, 0, 1, 2, 3, (dst + 4 * pitch), pitch);
}
/*
 * MSA 4-tap loop filter across a vertical edge, two 8-row segments at once.
 *
 * Sixteen rows are loaded starting 4 bytes left of src and transposed into
 * the eight column vectors p3..q3. The *0 parameters apply to the low 8
 * lanes and the *1 parameters to the high 8 lanes: __msa_ilvr_d(a, b)
 * yields the low doubleword of b followed by the low doubleword of a. The
 * filtered p1/p0/q0/q1 columns are re-interleaved and stored 4 bytes per
 * row, starting at src - 2, in two groups of eight rows.
 */
void vp9_lpf_vertical_4_dual_msa(uint8_t *src, int32_t pitch,
                                 const uint8_t *b_limit0_ptr,
                                 const uint8_t *limit0_ptr,
                                 const uint8_t *thresh0_ptr,
                                 const uint8_t *b_limit1_ptr,
                                 const uint8_t *limit1_ptr,
                                 const uint8_t *thresh1_ptr) {
  v16u8 row0, row1, row2, row3, row4, row5, row6, row7;
  v16u8 row8, row9, row10, row11, row12, row13, row14, row15;
  v16u8 p3, p2, p1, p0, q0, q1, q2, q3;
  v16u8 thresh, b_limit, limit;
  v16u8 mask, hev, flat;
  v8i16 p1p0, q0q1, top_a, top_b, bot_a, bot_b;
  uint8_t *dst = src - 2;

  /* sixteen rows, starting four pixels left of the edge */
  LD_UB8((src - 4), pitch,
         row0, row1, row2, row3, row4, row5, row6, row7);
  LD_UB8((src - 4 + (8 * pitch)), pitch,
         row8, row9, row10, row11, row12, row13, row14, row15);

  TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7,
                      row8, row9, row10, row11, row12, row13, row14, row15,
                      p3, p2, p1, p0, q0, q1, q2, q3);

  /* per-segment parameters: segment 0 in the low half, segment 1 in the
   * high half */
  thresh = (v16u8)__msa_ilvr_d((v2i64)__msa_fill_b(*thresh1_ptr),
                               (v2i64)__msa_fill_b(*thresh0_ptr));
  b_limit = (v16u8)__msa_ilvr_d((v2i64)__msa_fill_b(*b_limit1_ptr),
                                (v2i64)__msa_fill_b(*b_limit0_ptr));
  limit = (v16u8)__msa_ilvr_d((v2i64)__msa_fill_b(*limit1_ptr),
                              (v2i64)__msa_fill_b(*limit0_ptr));

  /* flat is produced by the macro but not needed by the 4-tap filter */
  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
               hev, mask, flat);
  VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1, p0, q0, q1);

  /* right halves feed the first eight rows */
  ILVR_B2_SH(p0, p1, q1, q0, p1p0, q0q1);
  ILVRL_H2_SH(q0q1, p1p0, top_a, top_b);
  /* left halves feed the last eight rows */
  ILVL_B2_SH(p0, p1, q1, q0, p1p0, q0q1);
  ILVRL_H2_SH(q0q1, p1p0, bot_a, bot_b);

  ST4x8_UB(top_a, top_b, dst, pitch);
  ST4x8_UB(bot_a, bot_b, (dst + (8 * pitch)), pitch);
}
vp9/common/mips/msa/vp9_loopfilter_8_msa.c
0 → 100644
View file @
d43fd998
/*
* Copyright (c) 2015 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "vp9/common/mips/msa/vp9_loopfilter_msa.h"
/*
 * MSA 8-tap loop filter across a horizontal edge.
 *
 * src points at the first row below the edge (q0); rows are pitch bytes
 * apart. b_limit_ptr, limit_ptr and thresh_ptr each point at a single byte
 * that is broadcast to every lane.
 *
 * The 4-tap result is always computed. If the flat mask is zero in its low
 * 8 lanes, only p1..q1 are stored. Otherwise the filter8 result is blended
 * in wherever flat is set, and p2..q2 are stored. Each row is written as
 * one 64-bit store.
 *
 * count is ignored.
 * NOTE(review): only one 64-bit group per row is stored, so callers
 * presumably never rely on count > 1 -- confirm.
 */
void vp9_lpf_horizontal_8_msa(uint8_t *src, int32_t pitch,
                              const uint8_t *b_limit_ptr,
                              const uint8_t *limit_ptr,
                              const uint8_t *thresh_ptr,
                              int32_t count) {
  v16u8 p3, p2, p1, p0, q0, q1, q2, q3;
  v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
  v16u8 thresh, b_limit, limit;
  v16u8 mask, hev, flat;
  v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
  v8i16 p2_filter8, p1_filter8, p0_filter8;
  v8i16 q0_filter8, q1_filter8, q2_filter8;
  uint64_t p2_d, p1_d, p0_d, q0_d, q1_d, q2_d;
  v16i8 zero = { 0 };

  (void)count;

  /* splat the scalar filter parameters */
  thresh = (v16u8)__msa_fill_b(*thresh_ptr);
  b_limit = (v16u8)__msa_fill_b(*b_limit_ptr);
  limit = (v16u8)__msa_fill_b(*limit_ptr);

  /* rows p3..q3 surrounding the edge */
  LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);

  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
               hev, mask, flat);
  VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
  VP9_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev,
                     p1_out, p0_out, q0_out, q1_out);

  /* zero the high doubleword of flat so only the low 8 lanes are tested */
  flat = (v16u8)__msa_ilvr_d((v2i64)zero, (v2i64)flat);

  if (__msa_test_bz_v(flat)) {
    /* no lane is flat: the 4-tap result is final */
    p1_d = __msa_copy_u_d((v2i64)p1_out, 0);
    p0_d = __msa_copy_u_d((v2i64)p0_out, 0);
    q0_d = __msa_copy_u_d((v2i64)q0_out, 0);
    q1_d = __msa_copy_u_d((v2i64)q1_out, 0);
    SD4(p1_d, p0_d, q0_d, q1_d, (src - 2 * pitch), pitch);
    return;
  }

  /* zero-extend the low 8 bytes of every row to 16 bits */
  ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0,
             zero, q0, zero, q1, zero, q2, zero, q3,
             p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r);
  VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r,
              p2_filter8, p1_filter8, p0_filter8,
              q0_filter8, q1_filter8, q2_filter8);

  /* convert 16 bit output data into 8 bit */
  PCKEV_B4_SH(zero, p2_filter8, zero, p1_filter8,
              zero, p0_filter8, zero, q0_filter8,
              p2_filter8, p1_filter8, p0_filter8, q0_filter8);
  PCKEV_B2_SH(zero, q1_filter8, zero, q2_filter8, q1_filter8, q2_filter8);

  /* take the filter8 value where flat is set, else the filter4 value
   * (or the unfiltered pixel for p2/q2) */
  p2_out = __msa_bmnz_v(p2, (v16u8)p2_filter8, flat);
  p1_out = __msa_bmnz_v(p1_out, (v16u8)p1_filter8, flat);
  p0_out = __msa_bmnz_v(p0_out, (v16u8)p0_filter8, flat);
  q0_out = __msa_bmnz_v(q0_out, (v16u8)q0_filter8, flat);
  q1_out = __msa_bmnz_v(q1_out, (v16u8)q1_filter8, flat);
  q2_out = __msa_bmnz_v(q2, (v16u8)q2_filter8, flat);

  p2_d = __msa_copy_u_d((v2i64)p2_out, 0);
  p1_d = __msa_copy_u_d((v2i64)p1_out, 0);
  p0_d = __msa_copy_u_d((v2i64)p0_out, 0);
  q0_d = __msa_copy_u_d((v2i64)q0_out, 0);
  q1_d = __msa_copy_u_d((v2i64)q1_out, 0);
  q2_d = __msa_copy_u_d((v2i64)q2_out, 0);

  /* store the six rows p2..q2 */
  src -= 3 * pitch;
  SD4(p2_d, p1_d, p0_d, q0_d, src, pitch);
  src += (4 * pitch);
  SD(q1_d, src);
  src += pitch;
  SD(q2_d, src);
}
/*
 * MSA 8-tap loop filter across a horizontal edge, two segments at once.
 *
 * src points at the first row below the edge (q0); rows are pitch bytes
 * apart. The *0 parameters apply to the low 8 byte lanes and the *1
 * parameters to the high 8 lanes: __msa_ilvr_d(a, b) yields the low
 * doubleword of b followed by the low doubleword of a. Each parameter
 * pointer refers to a single byte.
 *
 * The 4-tap result is always computed. If the flat mask is zero in all 16
 * lanes, only p1..q1 are stored. Otherwise the filter8 result is blended in
 * wherever flat is set, and p2..q2 are stored as full vectors.
 */
void vp9_lpf_horizontal_8_dual_msa(uint8_t *src, int32_t pitch,
                                   const uint8_t *b_limit0,
                                   const uint8_t *limit0,
                                   const uint8_t *thresh0,
                                   const uint8_t *b_limit1,
                                   const uint8_t *limit1,
                                   const uint8_t *thresh1) {
  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
  v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
  v16u8 flat, mask, hev, thresh, b_limit, limit;
  v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
  v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
  v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r;
  v8i16 q0_filt8_r, q1_filt8_r, q2_filt8_r;
  v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l;
  v8i16 q0_filt8_l, q1_filt8_l, q2_filt8_l;
  v16u8 zero = { 0 };

  /* load vector elements */
  LD_UB8(src - (4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);

  /* per-segment parameters: segment 0 in the low half, segment 1 in the
   * high half */
  thresh = (v16u8)__msa_ilvr_d((v2i64)__msa_fill_b(*thresh1),
                               (v2i64)__msa_fill_b(*thresh0));
  b_limit = (v16u8)__msa_ilvr_d((v2i64)__msa_fill_b(*b_limit1),
                                (v2i64)__msa_fill_b(*b_limit0));
  limit = (v16u8)__msa_ilvr_d((v2i64)__msa_fill_b(*limit1),
                              (v2i64)__msa_fill_b(*limit0));

  /* mask and hev */
  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
               hev, mask, flat);
  VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
  VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev,
                     p1_out, p0_out, q0_out, q1_out);

  if (__msa_test_bz_v(flat)) {
    /* no lane is flat: the 4-tap result is final */
    ST_UB4(p1_out, p0_out, q0_out, q1_out, (src - 2 * pitch), pitch);
    return;
  }

  /* zero-extend the right (low) 8 bytes of every row and filter them */
  ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0,
             zero, q0, zero, q1, zero, q2, zero, q3,
             p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r);
  VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r,
              p2_filt8_r, p1_filt8_r, p0_filt8_r,
              q0_filt8_r, q1_filt8_r, q2_filt8_r);

  /* same for the left (high) 8 bytes */
  ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0,
             p3_l, p2_l, p1_l, p0_l);
  ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3,
             q0_l, q1_l, q2_l, q3_l);
  VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l,
              p2_filt8_l, p1_filt8_l, p0_filt8_l,
              q0_filt8_l, q1_filt8_l, q2_filt8_l);

  /* convert 16 bit output data into 8 bit */
  PCKEV_B4_SH(p2_filt8_l, p2_filt8_r, p1_filt8_l, p1_filt8_r,
              p0_filt8_l, p0_filt8_r, q0_filt8_l, q0_filt8_r,
              p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r);
  PCKEV_B2_SH(q1_filt8_l, q1_filt8_r, q2_filt8_l, q2_filt8_r,
              q1_filt8_r, q2_filt8_r);

  /* take the filter8 value where flat is set, else the filter4 value
   * (or the unfiltered pixel for p2/q2) */
  p2_out = __msa_bmnz_v(p2, (v16u8)p2_filt8_r, flat);
  p1_out = __msa_bmnz_v(p1_out, (v16u8)p1_filt8_r, flat);
  p0_out = __msa_bmnz_v(p0_out, (v16u8)p0_filt8_r, flat);
  q0_out = __msa_bmnz_v(q0_out, (v16u8)q0_filt8_r, flat);
  q1_out = __msa_bmnz_v(q1_out, (v16u8)q1_filt8_r, flat);
  q2_out = __msa_bmnz_v(q2, (v16u8)q2_filt8_r, flat);

  /* store the six rows p2..q2; the original's trailing pointer bump after
   * the last store had no effect and has been dropped */
  src -= 3 * pitch;
  ST_UB4(p2_out, p1_out, p0_out, q0_out, src, pitch);
  src += (4 * pitch);
  ST_UB2(q1_out, q2_out, src, pitch);
}
/*
 * MSA 8-tap loop filter across a vertical edge.
 *
 * Eight rows are loaded starting 4 bytes left of src and transposed so the
 * columns p3..q3 become vectors. b_limit_ptr, limit_ptr and thresh_ptr each
 * point at a single byte that is broadcast to every lane.
 *
 * The 4-tap result is always computed. If the flat mask is zero in its low
 * 8 lanes, the four columns p1..q1 are written back (4 bytes per row from
 * src - 2). Otherwise the filter8 result is blended in wherever flat is
 * set, and the six columns p2..q2 are written back (4 + 2 bytes per row
 * from src - 3).
 *
 * count is ignored.
 * NOTE(review): eight rows are always processed regardless of count --
 * confirm callers never pass count > 1.
 */
void vp9_lpf_vertical_8_msa(uint8_t *src, int32_t pitch,
                            const uint8_t *b_limit_ptr,
                            const uint8_t *limit_ptr,
                            const uint8_t *thresh_ptr,
                            int32_t count) {
  v16u8 p3, p2, p1, p0, q0, q1, q2, q3;
  v16u8 p1_out, p0_out, q0_out, q1_out;
  v16u8 thresh, b_limit, limit;
  v16u8 mask, hev, flat;
  v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
  v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r;
  v8i16 q0_filt8_r, q1_filt8_r, q2_filt8_r;
  v8i16 vec0, vec1, vec2, vec3, vec4;
  v16u8 zero = { 0 };

  (void)count;

  /* splat the scalar filter parameters */
  thresh = (v16u8)__msa_fill_b(*thresh_ptr);
  b_limit = (v16u8)__msa_fill_b(*b_limit_ptr);
  limit = (v16u8)__msa_fill_b(*limit_ptr);

  /* load the rows, then transpose in place so p3..q3 hold columns */
  LD_UB8(src - 4, pitch, p3, p2, p1, p0, q0, q1, q2, q3);
  TRANSPOSE8x8_UB_UB(p3, p2, p1, p0, q0, q1, q2, q3,
                     p3, p2, p1, p0, q0, q1, q2, q3);

  /* mask and hev */
  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
               hev, mask, flat);
  /* flat4 */
  VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
  /* filter4 */
  VP9_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev,
                     p1_out, p0_out, q0_out, q1_out);

  /* zero the high doubleword of flat so only the low 8 lanes are tested */
  flat = (v16u8)__msa_ilvr_d((v2i64)zero, (v2i64)flat);

  if (__msa_test_bz_v(flat)) {
    /* no lane is flat: store the 4 pixels p1..q1 of every row */
    ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
    ILVRL_H2_SH(vec1, vec0, vec2, vec3);

    src -= 2;
    ST4x4_UB(vec2, vec2, 0, 1, 2, 3, src, pitch);
    src += 4 * pitch;
    ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src, pitch);
    return;
  }

  /* zero-extend the low 8 bytes of every column to 16 bits */
  ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0,
             zero, q0, zero, q1, zero, q2, zero, q3,
             p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r);
  VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r,
              p2_filt8_r, p1_filt8_r, p0_filt8_r,
              q0_filt8_r, q1_filt8_r, q2_filt8_r);

  /* convert 16 bit output data into 8 bit */
  PCKEV_B4_SH(p2_filt8_r, p2_filt8_r, p1_filt8_r, p1_filt8_r,
              p0_filt8_r, p0_filt8_r, q0_filt8_r, q0_filt8_r,
              p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r);
  PCKEV_B2_SH(q1_filt8_r, q1_filt8_r, q2_filt8_r, q2_filt8_r,
              q1_filt8_r, q2_filt8_r);

  /* take the filter8 value where flat is set, else the filter4 value
   * (or the unfiltered pixel for p2/q2) */
  p2 = __msa_bmnz_v(p2, (v16u8)p2_filt8_r, flat);
  p1 = __msa_bmnz_v(p1_out, (v16u8)p1_filt8_r, flat);
  p0 = __msa_bmnz_v(p0_out, (v16u8)p0_filt8_r, flat);
  q0 = __msa_bmnz_v(q0_out, (v16u8)q0_filt8_r, flat);
  q1 = __msa_bmnz_v(q1_out, (v16u8)q1_filt8_r, flat);
  q2 = __msa_bmnz_v(q2, (v16u8)q2_filt8_r, flat);

  /* store the 6 pixels p2..q2 of every row: p2 p1 p0 q0 as a 4-byte group,
   * q1 q2 as a 2-byte group */
  ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1);
  ILVRL_H2_SH(vec1, vec0, vec2, vec3);
  vec4 = (v8i16)__msa_ilvr_b((v16i8)q2, (v16i8)q1);

  src -= 3;
  ST4x4_UB(vec2, vec2, 0, 1, 2, 3, src, pitch);
  ST2x4_UB(vec4, 0, src + 4, pitch);
  src += (4 * pitch);
  ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src, pitch);
  ST2x4_UB(vec4, 4, src + 4, pitch);
}
/*
 * MSA 8-tap loop filter across a vertical edge, two 8-row segments at once.
 *
 * Sixteen rows are loaded starting 4 bytes left of src and transposed into
 * the eight column vectors p3..q3. The *0 parameters apply to the low 8
 * lanes and the *1 parameters to the high 8 lanes: __msa_ilvr_d(a, b)
 * yields the low doubleword of b followed by the low doubleword of a. Each
 * parameter pointer refers to a single byte.
 *
 * The 4-tap result is always computed. If the flat mask is zero in all 16
 * lanes, the four columns p1..q1 are written back (4 bytes per row from
 * src - 2). Otherwise the filter8 result is blended in wherever flat is
 * set, and the six columns p2..q2 are written back (4 + 2 bytes per row
 * from src - 3).
 */
void vp9_lpf_vertical_8_dual_msa(uint8_t *src, int32_t pitch,
                                 const uint8_t *b_limit0,
                                 const uint8_t *limit0,
                                 const uint8_t *thresh0,
                                 const uint8_t *b_limit1,
                                 const uint8_t *limit1,
                                 const uint8_t *thresh1) {
  uint8_t *load_ptr = src - 4;
  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
  v16u8 p1_out, p0_out, q0_out, q1_out;
  v16u8 flat, mask, hev, thresh, b_limit, limit;
  v16u8 row4, row5, row6, row7, row12, row13, row14, row15;
  v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
  v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
  v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r;
  v8i16 q0_filt8_r, q1_filt8_r, q2_filt8_r;
  v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l;
  v8i16 q0_filt8_l, q1_filt8_l, q2_filt8_l;
  v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
  v16u8 zero = { 0 };

  /* p0..p3 and q3..q0 double as storage for rows 0-3 and 8-11 until the
   * transpose overwrites them with column data */
  LD_UB8(load_ptr, pitch, p0, p1, p2, p3, row4, row5, row6, row7);
  load_ptr += (8 * pitch);
  LD_UB8(load_ptr, pitch, q3, q2, q1, q0, row12, row13, row14, row15);

  /* transpose 16x8 matrix into 8x16 */
  TRANSPOSE16x8_UB_UB(p0, p1, p2, p3, row4, row5, row6, row7,
                      q3, q2, q1, q0, row12, row13, row14, row15,
                      p3, p2, p1, p0, q0, q1, q2, q3);

  /* per-segment parameters: segment 0 in the low half, segment 1 in the
   * high half */
  thresh = (v16u8)__msa_ilvr_d((v2i64)__msa_fill_b(*thresh1),
                               (v2i64)__msa_fill_b(*thresh0));
  b_limit = (v16u8)__msa_ilvr_d((v2i64)__msa_fill_b(*b_limit1),
                                (v2i64)__msa_fill_b(*b_limit0));
  limit = (v16u8)__msa_ilvr_d((v2i64)__msa_fill_b(*limit1),
                              (v2i64)__msa_fill_b(*limit0));

  /* mask and hev */
  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
               hev, mask, flat);
  /* flat4 */
  VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
  /* filter4 */
  VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev,
                     p1_out, p0_out, q0_out, q1_out);

  if (__msa_test_bz_v(flat)) {
    /* no lane is flat: store the 4 pixels p1..q1 of all sixteen rows */
    ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
    ILVRL_H2_SH(vec1, vec0, vec2, vec3);
    ILVL_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
    ILVRL_H2_SH(vec1, vec0, vec4, vec5);

    src -= 2;
    ST4x8_UB(vec2, vec3, src, pitch);
    src += 8 * pitch;
    ST4x8_UB(vec4, vec5, src, pitch);
    return;
  }

  /* zero-extend the right (low) 8 bytes of every column and filter them */
  ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0,
             zero, q0, zero, q1, zero, q2, zero, q3,
             p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r);
  VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r,
              p2_filt8_r, p1_filt8_r, p0_filt8_r,
              q0_filt8_r, q1_filt8_r, q2_filt8_r);

  /* same for the left (high) 8 bytes */
  ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0,
             p3_l, p2_l, p1_l, p0_l);
  ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3,
             q0_l, q1_l, q2_l, q3_l);
  /* filter8 */
  VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l,
              p2_filt8_l, p1_filt8_l, p0_filt8_l,
              q0_filt8_l, q1_filt8_l, q2_filt8_l);

  /* convert 16 bit output data into 8 bit */
  PCKEV_B4_SH(p2_filt8_l, p2_filt8_r, p1_filt8_l, p1_filt8_r,
              p0_filt8_l, p0_filt8_r, q0_filt8_l, q0_filt8_r,
              p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r);
  PCKEV_B2_SH(q1_filt8_l, q1_filt8_r, q2_filt8_l, q2_filt8_r,
              q1_filt8_r, q2_filt8_r);

  /* take the filter8 value where flat is set, else the filter4 value
   * (or the unfiltered pixel for p2/q2) */
  p2 = __msa_bmnz_v(p2, (v16u8)p2_filt8_r, flat);
  p1 = __msa_bmnz_v(p1_out, (v16u8)p1_filt8_r, flat);
  p0 = __msa_bmnz_v(p0_out, (v16u8)p0_filt8_r, flat);
  q0 = __msa_bmnz_v(q0_out, (v16u8)q0_filt8_r, flat);
  q1 = __msa_bmnz_v(q1_out, (v16u8)q1_filt8_r, flat);
  q2 = __msa_bmnz_v(q2, (v16u8)q2_filt8_r, flat);

  /* rebuild rows: vec3/vec4/vec6/vec7 carry p2 p1 p0 q0 (4 bytes per row),
   * vec2/vec5 carry q1 q2 (2 bytes per row) */
  ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1);
  ILVRL_H2_SH(vec1, vec0, vec3, vec4);
  ILVL_B2_SH(p1, p2, q0, p0, vec0, vec1);
  ILVRL_H2_SH(vec1, vec0, vec6, vec7);
  ILVRL_B2_SH(q2, q1, vec2, vec5);

  /* store sixteen rows, four at a time */
  src -= 3;
  ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src, pitch);
  ST2x4_UB(vec2, 0, src + 4, pitch);
  src += (4 * pitch);
  ST4x4_UB(vec4, vec4, 0, 1, 2, 3, src, pitch);
  ST2x4_UB(vec2, 4, src + 4, pitch);
  src += (4 * pitch);
  ST4x4_UB(vec6, vec6, 0, 1, 2, 3, src, pitch);
  ST2x4_UB(vec5, 0, src + 4, pitch);
  src += (4 * pitch);
  ST4x4_UB(vec7, vec7, 0, 1, 2, 3, src, pitch);
  ST2x4_UB(vec5, 4, src + 4, pitch);
}
vp9/common/vp9_rtcd_defs.pl
View file @
d43fd998
...
...
@@ -224,36 +224,36 @@ specialize qw/vp9_lpf_vertical_16_dual sse2 neon_asm dspr2 msa/;
$vp9_lpf_vertical_16_dual_neon_asm
=
vp9_lpf_vertical_16_dual_neon
;
add_proto
qw/void vp9_lpf_vertical_8/
,
"
uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count
";
specialize
qw/vp9_lpf_vertical_8 sse2 neon_asm dspr2/
;
specialize
qw/vp9_lpf_vertical_8 sse2 neon_asm dspr2
msa
/
;
$vp9_lpf_vertical_8_neon_asm
=
vp9_lpf_vertical_8_neon
;
add_proto
qw/void vp9_lpf_vertical_8_dual/
,
"
uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1
";
specialize
qw/vp9_lpf_vertical_8_dual sse2 neon_asm dspr2/
;
specialize
qw/vp9_lpf_vertical_8_dual sse2 neon_asm dspr2
msa
/
;
$vp9_lpf_vertical_8_dual_neon_asm
=
vp9_lpf_vertical_8_dual_neon
;
add_proto
qw/void vp9_lpf_vertical_4/
,
"
uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count
";
specialize
qw/vp9_lpf_vertical_4 mmx neon dspr2/
;
specialize
qw/vp9_lpf_vertical_4 mmx neon dspr2
msa
/
;
add_proto
qw/void vp9_lpf_vertical_4_dual/
,
"
uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1
";
specialize
qw/vp9_lpf_vertical_4_dual sse2 neon dspr2/
;
specialize
qw/vp9_lpf_vertical_4_dual sse2 neon dspr2
msa
/
;
add_proto
qw/void vp9_lpf_horizontal_16/
,
"
uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count
";
specialize
qw/vp9_lpf_horizontal_16 sse2 avx2 neon_asm dspr2 msa/
;
$vp9_lpf_horizontal_16_neon_asm
=
vp9_lpf_horizontal_16_neon
;
add_proto
qw/void vp9_lpf_horizontal_8/
,
"
uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count
";
specialize
qw/vp9_lpf_horizontal_8 sse2 neon_asm dspr2/
;
specialize
qw/vp9_lpf_horizontal_8 sse2 neon_asm dspr2
msa
/
;
$vp9_lpf_horizontal_8_neon_asm
=
vp9_lpf_horizontal_8_neon
;
add_proto
qw/void vp9_lpf_horizontal_8_dual/
,
"
uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1
";
specialize
qw/vp9_lpf_horizontal_8_dual sse2 neon_asm dspr2/
;
specialize
qw/vp9_lpf_horizontal_8_dual sse2 neon_asm dspr2
msa
/
;
$vp9_lpf_horizontal_8_dual_neon_asm
=
vp9_lpf_horizontal_8_dual_neon
;
add_proto
qw/void vp9_lpf_horizontal_4/
,
"
uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count
";
specialize
qw/vp9_lpf_horizontal_4 mmx neon dspr2/
;
specialize
qw/vp9_lpf_horizontal_4 mmx neon dspr2
msa
/
;
add_proto
qw/void vp9_lpf_horizontal_4_dual/
,
"
uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1
";