Xiph.Org / aom-rav1e · commit b92cc27b

Authored Jun 26, 2015 by Parag Salasakar

    mips msa vp9 temporal filter optimization

    average improvement ~4x-5x

    Change-Id: Iad9c0a296dbc2ea96d000bd009077999ed58a3c5

Parent: c040f96e
Changes: 3 files
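The commit adds MIPS MSA (SIMD) kernels for VP9's temporal filter. For orientation, the per-pixel operation being vectorized is essentially the scalar reference below — a condensed sketch based on vp9_temporal_filter_apply_c in vp9/encoder/vp9_temporal_filter.c, not part of this commit; parameter names follow the prototype in the vp9_rtcd_defs.pl hunk further down:

#include <stdint.h>

void vp9_temporal_filter_apply_c(uint8_t *frame1, unsigned int stride,
                                 uint8_t *frame2, unsigned int block_width,
                                 unsigned int block_height, int strength,
                                 int filter_weight, unsigned int *accumulator,
                                 uint16_t *count) {
  unsigned int i, j, k = 0;

  for (i = 0; i < block_height; ++i) {
    for (j = 0; j < block_width; ++j, ++k) {
      /* frame2 holds the predictor block packed contiguously. */
      const int pixel = frame2[k];
      const int diff = frame1[i * stride + j] - pixel;

      /* modifier = round((diff^2 * 3) >> strength), clamped to [0, 16],
         then inverted so small differences get large weights. */
      int mod = diff * diff * 3;
      mod += 1 << (strength - 1); /* rounding bias, strength >= 1 */
      mod >>= strength;
      if (mod > 16) mod = 16;
      mod = (16 - mod) * filter_weight;

      count[k] += mod;               /* per-pixel sum of applied weights */
      accumulator[k] += mod * pixel; /* per-pixel weighted pixel sum     */
    }
  }
}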
vp9/common/vp9_rtcd_defs.pl

@@ -1077,7 +1077,7 @@ add_proto qw/int vp9_full_range_search/, "const struct macroblock *x, const stru
 specialize qw/vp9_full_range_search/;
 
 add_proto qw/void vp9_temporal_filter_apply/, "uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count";
-specialize qw/vp9_temporal_filter_apply sse2/;
+specialize qw/vp9_temporal_filter_apply sse2 msa/;
 
 if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
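Adding msa to the specialize line registers the new kernel with libvpx-style run-time CPU detection (RTCD): the build runs vp9_rtcd_defs.pl to generate vp9_rtcd.h, which maps the generic name onto the best available variant. A shape-only sketch of what the generated header plausibly contains for a MIPS build with MSA — hedged, since the real header is machine-generated and target-specific (on MIPS, MSA support is decided at compile time, so the mapping is typically a plain #define rather than a run-time function pointer):

#include <stdint.h>

/* Sketch of generated vp9_rtcd.h content (illustrative only; the real
   file is emitted by the build system from vp9_rtcd_defs.pl). */
void vp9_temporal_filter_apply_c(uint8_t *frame1, unsigned int stride,
                                 uint8_t *frame2, unsigned int block_width,
                                 unsigned int block_height, int strength,
                                 int filter_weight, unsigned int *accumulator,
                                 uint16_t *count);

void vp9_temporal_filter_apply_msa(uint8_t *frame1, unsigned int stride,
                                   uint8_t *frame2, unsigned int block_width,
                                   unsigned int block_height, int strength,
                                   int filter_weight, unsigned int *accumulator,
                                   uint16_t *count);

/* Callers use the generic name; with MSA enabled it resolves to the MSA
   kernel, otherwise to the C reference. */
#define vp9_temporal_filter_apply vp9_temporal_filter_apply_msa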
vp9/encoder/mips/msa/vp9_temporal_filter_msa.c (new file, mode 100644)
/*
* Copyright (c) 2015 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "./vp9_rtcd.h"
#include "vp9/common/mips/msa/vp9_macros_msa.h"
static void temporal_filter_apply_8size_msa(uint8_t *frm1_ptr, uint32_t stride,
                                            uint8_t *frm2_ptr, int32_t filt_sth,
                                            int32_t filt_wgt, uint32_t *acc,
                                            uint16_t *cnt) {
  uint32_t row;
  uint64_t f0, f1, f2, f3;
  v16i8 frm2, frm1 = { 0 };
  v16i8 frm4, frm3 = { 0 };
  v16u8 frm_r, frm_l;
  v8i16 frm2_r, frm2_l;
  v8i16 diff0, diff1, mod0_h, mod1_h;
  v4i32 cnst3, cnst16, filt_wt, strength;
  v4i32 mod0_w, mod1_w, mod2_w, mod3_w;
  v4i32 diff0_r, diff0_l, diff1_r, diff1_l;
  v4i32 frm2_rr, frm2_rl, frm2_lr, frm2_ll;
  v4i32 acc0, acc1, acc2, acc3;
  v8i16 cnt0, cnt1;

  filt_wt = __msa_fill_w(filt_wgt);
  strength = __msa_fill_w(filt_sth);
  cnst3 = __msa_ldi_w(3);
  cnst16 = __msa_ldi_w(16);

  /* Two iterations of four 8-pixel rows each cover the 8x8 block. */
  for (row = 2; row--;) {
    /* Load 4 rows of frame1 (strided) and frame2 (packed). */
    LD4(frm1_ptr, stride, f0, f1, f2, f3);
    frm1_ptr += (4 * stride);

    LD_SB2(frm2_ptr, 16, frm2, frm4);
    frm2_ptr += 32;

    LD_SW2(acc, 4, acc0, acc1);
    LD_SW2(acc + 8, 4, acc2, acc3);
    LD_SH2(cnt, 8, cnt0, cnt1);

    INSERT_D2_SB(f0, f1, frm1);
    INSERT_D2_SB(f2, f3, frm3);

    /* diff = frame1 - frame2 (16-bit), then widened to 32 bits. */
    ILVRL_B2_UB(frm1, frm2, frm_r, frm_l);
    HSUB_UB2_SH(frm_r, frm_l, diff0, diff1);
    UNPCK_SH_SW(diff0, diff0_r, diff0_l);
    UNPCK_SH_SW(diff1, diff1_r, diff1_l);

    /* modifier = (diff * diff * 3) >> strength, with rounding. */
    MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l,
         diff1_l, mod0_w, mod1_w, mod2_w, mod3_w);
    MUL4(mod0_w, cnst3, mod1_w, cnst3, mod2_w, cnst3, mod3_w, cnst3, mod0_w,
         mod1_w, mod2_w, mod3_w);
    SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength);

    /* Branch-free clamp: modifier = (16 - modifier) where modifier < 16,
       else 0 (the compare yields an all-ones lane mask). */
    diff0_r = (mod0_w < cnst16);
    diff0_l = (mod1_w < cnst16);
    diff1_r = (mod2_w < cnst16);
    diff1_l = (mod3_w < cnst16);
    SUB4(cnst16, mod0_w, cnst16, mod1_w, cnst16, mod2_w, cnst16, mod3_w,
         mod0_w, mod1_w, mod2_w, mod3_w);
    mod0_w = diff0_r & mod0_w;
    mod1_w = diff0_l & mod1_w;
    mod2_w = diff1_r & mod2_w;
    mod3_w = diff1_l & mod3_w;

    /* modifier *= filter_weight; count += modifier. */
    MUL4(mod0_w, filt_wt, mod1_w, filt_wt, mod2_w, filt_wt, mod3_w, filt_wt,
         mod0_w, mod1_w, mod2_w, mod3_w);
    PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h);
    ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h);
    ST_SH2(mod0_h, mod1_h, cnt, 8);
    cnt += 16;

    /* accumulator += modifier * frame2. */
    UNPCK_UB_SH(frm2, frm2_r, frm2_l);
    UNPCK_SH_SW(frm2_r, frm2_rr, frm2_rl);
    UNPCK_SH_SW(frm2_l, frm2_lr, frm2_ll);
    MUL4(mod0_w, frm2_rr, mod1_w, frm2_rl, mod2_w, frm2_lr, mod3_w, frm2_ll,
         mod0_w, mod1_w, mod2_w, mod3_w);
    ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3, mod0_w,
         mod1_w, mod2_w, mod3_w);
    ST_SW2(mod0_w, mod1_w, acc, 4);
    acc += 8;
    ST_SW2(mod2_w, mod3_w, acc, 4);
    acc += 8;

    /* Same processing for the second set of rows (frm3 vs. frm4). */
    LD_SW2(acc, 4, acc0, acc1);
    LD_SW2(acc + 8, 4, acc2, acc3);
    LD_SH2(cnt, 8, cnt0, cnt1);

    ILVRL_B2_UB(frm3, frm4, frm_r, frm_l);
    HSUB_UB2_SH(frm_r, frm_l, diff0, diff1);
    UNPCK_SH_SW(diff0, diff0_r, diff0_l);
    UNPCK_SH_SW(diff1, diff1_r, diff1_l);
    MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l,
         diff1_l, mod0_w, mod1_w, mod2_w, mod3_w);
    MUL4(mod0_w, cnst3, mod1_w, cnst3, mod2_w, cnst3, mod3_w, cnst3, mod0_w,
         mod1_w, mod2_w, mod3_w);
    SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength);
    diff0_r = (mod0_w < cnst16);
    diff0_l = (mod1_w < cnst16);
    diff1_r = (mod2_w < cnst16);
    diff1_l = (mod3_w < cnst16);
    SUB4(cnst16, mod0_w, cnst16, mod1_w, cnst16, mod2_w, cnst16, mod3_w,
         mod0_w, mod1_w, mod2_w, mod3_w);
    mod0_w = diff0_r & mod0_w;
    mod1_w = diff0_l & mod1_w;
    mod2_w = diff1_r & mod2_w;
    mod3_w = diff1_l & mod3_w;
    MUL4(mod0_w, filt_wt, mod1_w, filt_wt, mod2_w, filt_wt, mod3_w, filt_wt,
         mod0_w, mod1_w, mod2_w, mod3_w);
    PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h);
    ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h);
    ST_SH2(mod0_h, mod1_h, cnt, 8);
    cnt += 16;

    UNPCK_UB_SH(frm4, frm2_r, frm2_l);
    UNPCK_SH_SW(frm2_r, frm2_rr, frm2_rl);
    UNPCK_SH_SW(frm2_l, frm2_lr, frm2_ll);
    MUL4(mod0_w, frm2_rr, mod1_w, frm2_rl, mod2_w, frm2_lr, mod3_w, frm2_ll,
         mod0_w, mod1_w, mod2_w, mod3_w);
    ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3, mod0_w,
         mod1_w, mod2_w, mod3_w);
    ST_SW2(mod0_w, mod1_w, acc, 4);
    acc += 8;
    ST_SW2(mod2_w, mod3_w, acc, 4);
    acc += 8;
  }
}
static void temporal_filter_apply_16size_msa(uint8_t *frm1_ptr,
                                             uint32_t stride,
                                             uint8_t *frm2_ptr,
                                             int32_t filt_sth, int32_t filt_wgt,
                                             uint32_t *acc, uint16_t *cnt) {
  uint32_t row;
  v16i8 frm1, frm2, frm3, frm4;
  v16u8 frm_r, frm_l;
  v16i8 zero = { 0 };
  v8u16 frm2_r, frm2_l;
  v8i16 diff0, diff1, mod0_h, mod1_h;
  v4i32 cnst3, cnst16, filt_wt, strength;
  v4i32 mod0_w, mod1_w, mod2_w, mod3_w;
  v4i32 diff0_r, diff0_l, diff1_r, diff1_l;
  v4i32 frm2_rr, frm2_rl, frm2_lr, frm2_ll;
  v4i32 acc0, acc1, acc2, acc3;
  v8i16 cnt0, cnt1;

  filt_wt = __msa_fill_w(filt_wgt);
  strength = __msa_fill_w(filt_sth);
  cnst3 = __msa_ldi_w(3);
  cnst16 = __msa_ldi_w(16);

  /* Eight iterations of two 16-pixel rows each cover the 16x16 block. */
  for (row = 8; row--;) {
    LD_SB2(frm1_ptr, stride, frm1, frm3);
    frm1_ptr += stride;

    LD_SB2(frm2_ptr, 16, frm2, frm4);
    frm2_ptr += 16;

    LD_SW2(acc, 4, acc0, acc1);
    LD_SW2(acc + 8, 4, acc2, acc3);
    LD_SH2(cnt, 8, cnt0, cnt1);

    /* First row: same modifier computation as the 8-wide kernel. */
    ILVRL_B2_UB(frm1, frm2, frm_r, frm_l);
    HSUB_UB2_SH(frm_r, frm_l, diff0, diff1);
    UNPCK_SH_SW(diff0, diff0_r, diff0_l);
    UNPCK_SH_SW(diff1, diff1_r, diff1_l);
    MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l,
         diff1_l, mod0_w, mod1_w, mod2_w, mod3_w);
    MUL4(mod0_w, cnst3, mod1_w, cnst3, mod2_w, cnst3, mod3_w, cnst3, mod0_w,
         mod1_w, mod2_w, mod3_w);
    SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength);
    diff0_r = (mod0_w < cnst16);
    diff0_l = (mod1_w < cnst16);
    diff1_r = (mod2_w < cnst16);
    diff1_l = (mod3_w < cnst16);
    SUB4(cnst16, mod0_w, cnst16, mod1_w, cnst16, mod2_w, cnst16, mod3_w,
         mod0_w, mod1_w, mod2_w, mod3_w);
    mod0_w = diff0_r & mod0_w;
    mod1_w = diff0_l & mod1_w;
    mod2_w = diff1_r & mod2_w;
    mod3_w = diff1_l & mod3_w;
    MUL4(mod0_w, filt_wt, mod1_w, filt_wt, mod2_w, filt_wt, mod3_w, filt_wt,
         mod0_w, mod1_w, mod2_w, mod3_w);
    PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h);
    ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h);
    ST_SH2(mod0_h, mod1_h, cnt, 8);
    cnt += 16;

    ILVRL_B2_UH(zero, frm2, frm2_r, frm2_l);
    UNPCK_SH_SW(frm2_r, frm2_rr, frm2_rl);
    UNPCK_SH_SW(frm2_l, frm2_lr, frm2_ll);
    MUL4(mod0_w, frm2_rr, mod1_w, frm2_rl, mod2_w, frm2_lr, mod3_w, frm2_ll,
         mod0_w, mod1_w, mod2_w, mod3_w);
    ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3, mod0_w,
         mod1_w, mod2_w, mod3_w);
    ST_SW2(mod0_w, mod1_w, acc, 4);
    acc += 8;
    ST_SW2(mod2_w, mod3_w, acc, 4);
    acc += 8;

    /* Second row (frm3 vs. frm4). */
    LD_SW2(acc, 4, acc0, acc1);
    LD_SW2(acc + 8, 4, acc2, acc3);
    LD_SH2(cnt, 8, cnt0, cnt1);

    ILVRL_B2_UB(frm3, frm4, frm_r, frm_l);
    HSUB_UB2_SH(frm_r, frm_l, diff0, diff1);
    UNPCK_SH_SW(diff0, diff0_r, diff0_l);
    UNPCK_SH_SW(diff1, diff1_r, diff1_l);
    MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l,
         diff1_l, mod0_w, mod1_w, mod2_w, mod3_w);
    MUL4(mod0_w, cnst3, mod1_w, cnst3, mod2_w, cnst3, mod3_w, cnst3, mod0_w,
         mod1_w, mod2_w, mod3_w);
    SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength);
    diff0_r = (mod0_w < cnst16);
    diff0_l = (mod1_w < cnst16);
    diff1_r = (mod2_w < cnst16);
    diff1_l = (mod3_w < cnst16);
    SUB4(cnst16, mod0_w, cnst16, mod1_w, cnst16, mod2_w, cnst16, mod3_w,
         mod0_w, mod1_w, mod2_w, mod3_w);
    mod0_w = diff0_r & mod0_w;
    mod1_w = diff0_l & mod1_w;
    mod2_w = diff1_r & mod2_w;
    mod3_w = diff1_l & mod3_w;
    MUL4(mod0_w, filt_wt, mod1_w, filt_wt, mod2_w, filt_wt, mod3_w, filt_wt,
         mod0_w, mod1_w, mod2_w, mod3_w);
    PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h);
    ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h);
    ST_SH2(mod0_h, mod1_h, cnt, 8);
    cnt += 16;

    ILVRL_B2_UH(zero, frm4, frm2_r, frm2_l);
    UNPCK_SH_SW(frm2_r, frm2_rr, frm2_rl);
    UNPCK_SH_SW(frm2_l, frm2_lr, frm2_ll);
    MUL4(mod0_w, frm2_rr, mod1_w, frm2_rl, mod2_w, frm2_lr, mod3_w, frm2_ll,
         mod0_w, mod1_w, mod2_w, mod3_w);
    ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3, mod0_w,
         mod1_w, mod2_w, mod3_w);
    ST_SW2(mod0_w, mod1_w, acc, 4);
    acc += 8;
    ST_SW2(mod2_w, mod3_w, acc, 4);
    acc += 8;

    frm1_ptr += stride;
    frm2_ptr += 16;
  }
}
void vp9_temporal_filter_apply_msa(uint8_t *frame1_ptr, uint32_t stride,
                                   uint8_t *frame2_ptr, uint32_t blk_w,
                                   uint32_t blk_h, int32_t strength,
                                   int32_t filt_wgt, uint32_t *accu,
                                   uint16_t *cnt) {
  if (8 == (blk_w * blk_h)) {
    temporal_filter_apply_8size_msa(frame1_ptr, stride, frame2_ptr, strength,
                                    filt_wgt, accu, cnt);
  } else if (16 == (blk_w * blk_h)) {
    temporal_filter_apply_16size_msa(frame1_ptr, stride, frame2_ptr, strength,
                                     filt_wgt, accu, cnt);
  } else {
    vp9_temporal_filter_apply_c(frame1_ptr, stride, frame2_ptr, blk_w, blk_h,
                                strength, filt_wgt, accu, cnt);
  }
}
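One detail worth calling out in both kernels: the scalar clamp "modifier = min(modifier, 16); modifier = 16 - modifier" is realized branch-free as a compare mask ((mod < cnst16), all-ones per lane where true) ANDed into the subtraction (16 - mod). The small standalone program below — an editorial sketch, not part of the commit — checks that the two formulations agree for every possible 8-bit pixel difference at the filter strengths 1 through 6:

#include <assert.h>

/* Scalar form: clamp to 16, then invert. */
static int modifier_scalar(int diff, int strength) {
  int mod = diff * diff * 3;
  mod += 1 << (strength - 1); /* rounding bias, strength >= 1 */
  mod >>= strength;
  if (mod > 16) mod = 16;
  return 16 - mod;
}

/* Branch-free form used by the MSA kernels; -(mod < 16) stands in for
   the all-ones/all-zeros lane mask the vector compare produces. */
static int modifier_branchless(int diff, int strength) {
  int mod = diff * diff * 3;
  mod += 1 << (strength - 1);
  mod >>= strength;
  return (16 - mod) & -(mod < 16);
}

int main(void) {
  int diff, strength;
  for (strength = 1; strength <= 6; ++strength) {
    for (diff = -255; diff <= 255; ++diff) {
      assert(modifier_scalar(diff, strength) ==
             modifier_branchless(diff, strength));
    }
  }
  return 0;
}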
vp9/vp9cx.mk

@@ -160,5 +160,6 @@ VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_fdct16x16_msa.c
 VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_fdct32x32_msa.c
 VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_fdct_msa.h
 VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_subtract_msa.c
+VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_temporal_filter_msa.c
 
 VP9_CX_SRCS-yes := $(filter-out $(VP9_CX_SRCS_REMOVE-yes),$(VP9_CX_SRCS-yes))