Commit fbac961b authored by Parag Salasakar's avatar Parag Salasakar
Browse files

mips msa vp9 filter by weight optimization

filter by weight - average improvement ~2x-3x

Change-Id: I4832033335d339cdafdce697f07ce3e643920057
parent c7489f48
......@@ -244,6 +244,22 @@
out3 = LW((psrc) + 3 * stride); \
}
/* Description : Load double words with stride
Arguments : Inputs - psrc (source pointer to load from)
- stride
Outputs - out0, out1
Details : Loads double word in 'out0' from (psrc)
Loads double word in 'out1' from (psrc + stride)
*/
#define LD2(psrc, stride, out0, out1) { \
out0 = LD((psrc)); \
out1 = LD((psrc) + stride); \
}
#define LD4(psrc, stride, out0, out1, out2, out3) { \
LD2((psrc), stride, out0, out1); \
LD2((psrc) + 2 * stride, stride, out2, out3); \
}
/* Description : Store 4 words with stride
Arguments : Inputs - in0, in1, in2, in3, pdst, stride
Details : Stores word from 'in0' to (pdst)
......@@ -482,6 +498,24 @@
SD(out0_m, pdst); \
}
/* Description : Store as 8x2 byte block to destination memory from input vector
Arguments : Inputs - in, pdst, stride
Details : Index 0 double word element from input vector 'in' is copied
and stored to destination memory at (pdst)
Index 1 double word element from input vector 'in' is copied
and stored to destination memory at (pdst + stride)
*/
#define ST8x2_UB(in, pdst, stride) { \
uint64_t out0_m, out1_m; \
uint8_t *pblk_8x2_m = (uint8_t *)(pdst); \
\
out0_m = __msa_copy_u_d((v2i64)in, 0); \
out1_m = __msa_copy_u_d((v2i64)in, 1); \
\
SD(out0_m, pblk_8x2_m); \
SD(out1_m, pblk_8x2_m + stride); \
}
/* Description : Store as 8x4 byte block to destination memory from input
vectors
Arguments : Inputs - in0, in1, pdst, stride
......
/*
* Copyright (c) 2015 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "./vp9_rtcd.h"
#include "vp9/common/vp9_onyxc_int.h"
#include "vp9/common/mips/msa/vp9_macros_msa.h"
static void filter_by_weight8x8_msa(const uint8_t *src_ptr, int32_t src_stride,
uint8_t *dst_ptr, int32_t dst_stride,
int32_t src_weight) {
int32_t dst_weight = (1 << MFQE_PRECISION) - src_weight;
int32_t row;
uint64_t src0_d, src1_d, dst0_d, dst1_d;
v16i8 src0 = { 0 };
v16i8 src1 = { 0 };
v16i8 dst0 = { 0 };
v16i8 dst1 = { 0 };
v8i16 src_wt, dst_wt, res_h_r, res_h_l, src_r, src_l, dst_r, dst_l;
src_wt = __msa_fill_h(src_weight);
dst_wt = __msa_fill_h(dst_weight);
for (row = 2; row--;) {
LD2(src_ptr, src_stride, src0_d, src1_d);
src_ptr += (2 * src_stride);
LD2(dst_ptr, dst_stride, dst0_d, dst1_d);
INSERT_D2_SB(src0_d, src1_d, src0);
INSERT_D2_SB(dst0_d, dst1_d, dst0);
LD2(src_ptr, src_stride, src0_d, src1_d);
src_ptr += (2 * src_stride);
LD2((dst_ptr + 2 * dst_stride), dst_stride, dst0_d, dst1_d);
INSERT_D2_SB(src0_d, src1_d, src1);
INSERT_D2_SB(dst0_d, dst1_d, dst1);
UNPCK_UB_SH(src0, src_r, src_l);
UNPCK_UB_SH(dst0, dst_r, dst_l);
res_h_r = (src_r * src_wt);
res_h_r += (dst_r * dst_wt);
res_h_l = (src_l * src_wt);
res_h_l += (dst_l * dst_wt);
SRARI_H2_SH(res_h_r, res_h_l, MFQE_PRECISION);
dst0 = (v16i8)__msa_pckev_b((v16i8)res_h_l, (v16i8)res_h_r);
ST8x2_UB(dst0, dst_ptr, dst_stride);
dst_ptr += (2 * dst_stride);
UNPCK_UB_SH(src1, src_r, src_l);
UNPCK_UB_SH(dst1, dst_r, dst_l);
res_h_r = (src_r * src_wt);
res_h_r += (dst_r * dst_wt);
res_h_l = (src_l * src_wt);
res_h_l += (dst_l * dst_wt);
SRARI_H2_SH(res_h_r, res_h_l, MFQE_PRECISION);
dst1 = (v16i8)__msa_pckev_b((v16i8)res_h_l, (v16i8)res_h_r);
ST8x2_UB(dst1, dst_ptr, dst_stride);
dst_ptr += (2 * dst_stride);
}
}
static void filter_by_weight16x16_msa(const uint8_t *src_ptr,
int32_t src_stride,
uint8_t *dst_ptr,
int32_t dst_stride,
int32_t src_weight) {
int32_t dst_weight = (1 << MFQE_PRECISION) - src_weight;
int32_t row;
v16i8 src0, src1, src2, src3, dst0, dst1, dst2, dst3;
v8i16 src_wt, dst_wt, res_h_r, res_h_l, src_r, src_l, dst_r, dst_l;
src_wt = __msa_fill_h(src_weight);
dst_wt = __msa_fill_h(dst_weight);
for (row = 4; row--;) {
LD_SB4(src_ptr, src_stride, src0, src1, src2, src3);
src_ptr += (4 * src_stride);
LD_SB4(dst_ptr, dst_stride, dst0, dst1, dst2, dst3);
UNPCK_UB_SH(src0, src_r, src_l);
UNPCK_UB_SH(dst0, dst_r, dst_l);
res_h_r = (src_r * src_wt);
res_h_r += (dst_r * dst_wt);
res_h_l = (src_l * src_wt);
res_h_l += (dst_l * dst_wt);
SRARI_H2_SH(res_h_r, res_h_l, MFQE_PRECISION);
PCKEV_ST_SB(res_h_r, res_h_l, dst_ptr);
dst_ptr += dst_stride;
UNPCK_UB_SH(src1, src_r, src_l);
UNPCK_UB_SH(dst1, dst_r, dst_l);
res_h_r = (src_r * src_wt);
res_h_r += (dst_r * dst_wt);
res_h_l = (src_l * src_wt);
res_h_l += (dst_l * dst_wt);
SRARI_H2_SH(res_h_r, res_h_l, MFQE_PRECISION);
PCKEV_ST_SB(res_h_r, res_h_l, dst_ptr);
dst_ptr += dst_stride;
UNPCK_UB_SH(src2, src_r, src_l);
UNPCK_UB_SH(dst2, dst_r, dst_l);
res_h_r = (src_r * src_wt);
res_h_r += (dst_r * dst_wt);
res_h_l = (src_l * src_wt);
res_h_l += (dst_l * dst_wt);
SRARI_H2_SH(res_h_r, res_h_l, MFQE_PRECISION);
PCKEV_ST_SB(res_h_r, res_h_l, dst_ptr);
dst_ptr += dst_stride;
UNPCK_UB_SH(src3, src_r, src_l);
UNPCK_UB_SH(dst3, dst_r, dst_l);
res_h_r = (src_r * src_wt);
res_h_r += (dst_r * dst_wt);
res_h_l = (src_l * src_wt);
res_h_l += (dst_l * dst_wt);
SRARI_H2_SH(res_h_r, res_h_l, MFQE_PRECISION);
PCKEV_ST_SB(res_h_r, res_h_l, dst_ptr);
dst_ptr += dst_stride;
}
}
void vp9_filter_by_weight8x8_msa(const uint8_t *src, int src_stride,
uint8_t *dst, int dst_stride,
int src_weight) {
filter_by_weight8x8_msa(src, src_stride, dst, dst_stride, src_weight);
}
void vp9_filter_by_weight16x16_msa(const uint8_t *src, int src_stride,
uint8_t *dst, int dst_stride,
int src_weight) {
filter_by_weight16x16_msa(src, src_stride, dst, dst_stride, src_weight);
}
......@@ -276,10 +276,10 @@ specialize qw/vp9_plane_add_noise sse2/;
$vp9_plane_add_noise_sse2=vp9_plane_add_noise_wmt;
add_proto qw/void vp9_filter_by_weight16x16/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int src_weight";
specialize qw/vp9_filter_by_weight16x16 sse2/;
specialize qw/vp9_filter_by_weight16x16 sse2 msa/;
add_proto qw/void vp9_filter_by_weight8x8/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int src_weight";
specialize qw/vp9_filter_by_weight8x8 sse2/;
specialize qw/vp9_filter_by_weight8x8 sse2 msa/;
}
#
......
......@@ -152,6 +152,10 @@ VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_loopfilter_8_msa.c
VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_loopfilter_16_msa.c
VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_loopfilter_msa.h
ifeq ($(CONFIG_VP9_POSTPROC),yes)
VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_mfqe_msa.c
endif
VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_idct_intrin_sse2.c
VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_idct_intrin_sse2.h
ifeq ($(ARCH_X86_64), yes)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment