Commit b89861a4 authored by Yaowu Xu's avatar Yaowu Xu

Remove vp9-postproc from configure

Change-Id: I601464f0b74183daa80730856dfbf33ddfce2cfe
parent 3246fc04
......@@ -41,7 +41,6 @@ Advanced options:
${toggle_vp10} VP10 codec support
${toggle_internal_stats} output of encoder internal stats for debug, if supported (encoders)
${toggle_postproc} postprocessing
${toggle_vp9_postproc} vp9 specific postprocessing
${toggle_multithread} multithreaded encoding and decoding
${toggle_spatial_resampling} spatial sampling (scaling) support
${toggle_realtime_only} enable this option while building for real-time encoding
......@@ -283,7 +282,6 @@ CONFIG_LIST="
dc_recon
runtime_cpu_detect
postproc
vp9_postproc
multithread
internal_stats
${CODECS}
......@@ -346,7 +344,6 @@ CMDLINE_SELECT="
dequant_tokens
dc_recon
postproc
vp9_postproc
multithread
internal_stats
${CODECS}
......@@ -442,7 +439,7 @@ process_targets() {
done
enabled debug_libs && DIST_DIR="${DIST_DIR}-debug"
enabled codec_srcs && DIST_DIR="${DIST_DIR}-src"
! enabled postproc && ! enabled vp9_postproc && DIST_DIR="${DIST_DIR}-nopost"
! enabled postproc && DIST_DIR="${DIST_DIR}-nopost"
! enabled multithread && DIST_DIR="${DIST_DIR}-nomt"
! enabled install_docs && DIST_DIR="${DIST_DIR}-nodocs"
DIST_DIR="${DIST_DIR}-${tgt_isa}-${tgt_os}"
......@@ -626,10 +623,6 @@ process_toolchain() {
enable_feature dc_recon
fi
if enabled internal_stats; then
enable_feature vp9_postproc
fi
# Enable the postbuild target if building for visual studio.
case "$tgt_cc" in
vs*) enable_feature msvs
......
......@@ -81,15 +81,6 @@ void vp10_free_ref_frame_buffers(BufferPool *pool) {
}
}
void vp10_free_postproc_buffers(VP10_COMMON *cm) {
#if CONFIG_VP9_POSTPROC
vpx_free_frame_buffer(&cm->post_proc_buffer);
vpx_free_frame_buffer(&cm->post_proc_buffer_int);
#else
(void)cm;
#endif
}
void vp10_free_context_buffers(VP10_COMMON *cm) {
cm->free_mi(cm);
free_seg_map(cm);
......
/*
* Copyright (c) 2014 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "./vpx_config.h"
#include "./vp10_rtcd.h"
#include "./vpx_dsp_rtcd.h"
#include "./vpx_scale_rtcd.h"
#include "vp10/common/onyxc_int.h"
#include "vp10/common/postproc.h"
// TODO(jackychen): Replace this function with SSE2 code. There is
// one SSE2 implementation in vp8, so will consider how to share it
// between vp8 and vp9.
static void filter_by_weight(const uint8_t *src, int src_stride,
uint8_t *dst, int dst_stride,
int block_size, int src_weight) {
const int dst_weight = (1 << MFQE_PRECISION) - src_weight;
const int rounding_bit = 1 << (MFQE_PRECISION - 1);
int r, c;
for (r = 0; r < block_size; r++) {
for (c = 0; c < block_size; c++) {
dst[c] = (src[c] * src_weight + dst[c] * dst_weight + rounding_bit)
>> MFQE_PRECISION;
}
src += src_stride;
dst += dst_stride;
}
}
void vp10_filter_by_weight8x8_c(const uint8_t *src, int src_stride,
uint8_t *dst, int dst_stride, int src_weight) {
filter_by_weight(src, src_stride, dst, dst_stride, 8, src_weight);
}
void vp10_filter_by_weight16x16_c(const uint8_t *src, int src_stride,
uint8_t *dst, int dst_stride,
int src_weight) {
filter_by_weight(src, src_stride, dst, dst_stride, 16, src_weight);
}
static void filter_by_weight32x32(const uint8_t *src, int src_stride,
uint8_t *dst, int dst_stride, int weight) {
vp10_filter_by_weight16x16(src, src_stride, dst, dst_stride, weight);
vp10_filter_by_weight16x16(src + 16, src_stride, dst + 16, dst_stride,
weight);
vp10_filter_by_weight16x16(src + src_stride * 16, src_stride,
dst + dst_stride * 16, dst_stride, weight);
vp10_filter_by_weight16x16(src + src_stride * 16 + 16, src_stride,
dst + dst_stride * 16 + 16, dst_stride, weight);
}
static void filter_by_weight64x64(const uint8_t *src, int src_stride,
uint8_t *dst, int dst_stride, int weight) {
filter_by_weight32x32(src, src_stride, dst, dst_stride, weight);
filter_by_weight32x32(src + 32, src_stride, dst + 32,
dst_stride, weight);
filter_by_weight32x32(src + src_stride * 32, src_stride,
dst + dst_stride * 32, dst_stride, weight);
filter_by_weight32x32(src + src_stride * 32 + 32, src_stride,
dst + dst_stride * 32 + 32, dst_stride, weight);
}
static void apply_ifactor(const uint8_t *y, int y_stride, uint8_t *yd,
int yd_stride, const uint8_t *u, const uint8_t *v,
int uv_stride, uint8_t *ud, uint8_t *vd,
int uvd_stride, BLOCK_SIZE block_size,
int weight) {
if (block_size == BLOCK_16X16) {
vp10_filter_by_weight16x16(y, y_stride, yd, yd_stride, weight);
vp10_filter_by_weight8x8(u, uv_stride, ud, uvd_stride, weight);
vp10_filter_by_weight8x8(v, uv_stride, vd, uvd_stride, weight);
} else if (block_size == BLOCK_32X32) {
filter_by_weight32x32(y, y_stride, yd, yd_stride, weight);
vp10_filter_by_weight16x16(u, uv_stride, ud, uvd_stride, weight);
vp10_filter_by_weight16x16(v, uv_stride, vd, uvd_stride, weight);
} else if (block_size == BLOCK_64X64) {
filter_by_weight64x64(y, y_stride, yd, yd_stride, weight);
filter_by_weight32x32(u, uv_stride, ud, uvd_stride, weight);
filter_by_weight32x32(v, uv_stride, vd, uvd_stride, weight);
}
}
// TODO(jackychen): Determine whether replace it with assembly code.
static void copy_mem8x8(const uint8_t *src, int src_stride,
uint8_t *dst, int dst_stride) {
int r;
for (r = 0; r < 8; r++) {
memcpy(dst, src, 8);
src += src_stride;
dst += dst_stride;
}
}
static void copy_mem16x16(const uint8_t *src, int src_stride,
uint8_t *dst, int dst_stride) {
int r;
for (r = 0; r < 16; r++) {
memcpy(dst, src, 16);
src += src_stride;
dst += dst_stride;
}
}
static void copy_mem32x32(const uint8_t *src, int src_stride,
uint8_t *dst, int dst_stride) {
copy_mem16x16(src, src_stride, dst, dst_stride);
copy_mem16x16(src + 16, src_stride, dst + 16, dst_stride);
copy_mem16x16(src + src_stride * 16, src_stride,
dst + dst_stride * 16, dst_stride);
copy_mem16x16(src + src_stride * 16 + 16, src_stride,
dst + dst_stride * 16 + 16, dst_stride);
}
void copy_mem64x64(const uint8_t *src, int src_stride,
uint8_t *dst, int dst_stride) {
copy_mem32x32(src, src_stride, dst, dst_stride);
copy_mem32x32(src + 32, src_stride, dst + 32, dst_stride);
copy_mem32x32(src + src_stride * 32, src_stride,
dst + src_stride * 32, dst_stride);
copy_mem32x32(src + src_stride * 32 + 32, src_stride,
dst + src_stride * 32 + 32, dst_stride);
}
static void copy_block(const uint8_t *y, const uint8_t *u, const uint8_t *v,
int y_stride, int uv_stride, uint8_t *yd, uint8_t *ud,
uint8_t *vd, int yd_stride, int uvd_stride,
BLOCK_SIZE bs) {
if (bs == BLOCK_16X16) {
copy_mem16x16(y, y_stride, yd, yd_stride);
copy_mem8x8(u, uv_stride, ud, uvd_stride);
copy_mem8x8(v, uv_stride, vd, uvd_stride);
} else if (bs == BLOCK_32X32) {
copy_mem32x32(y, y_stride, yd, yd_stride);
copy_mem16x16(u, uv_stride, ud, uvd_stride);
copy_mem16x16(v, uv_stride, vd, uvd_stride);
} else {
copy_mem64x64(y, y_stride, yd, yd_stride);
copy_mem32x32(u, uv_stride, ud, uvd_stride);
copy_mem32x32(v, uv_stride, vd, uvd_stride);
}
}
static void get_thr(BLOCK_SIZE bs, int qdiff, int *sad_thr, int *vdiff_thr) {
const int adj = qdiff >> MFQE_PRECISION;
if (bs == BLOCK_16X16) {
*sad_thr = 7 + adj;
} else if (bs == BLOCK_32X32) {
*sad_thr = 6 + adj;
} else { // BLOCK_64X64
*sad_thr = 5 + adj;
}
*vdiff_thr = 125 + qdiff;
}
static void mfqe_block(BLOCK_SIZE bs, const uint8_t *y, const uint8_t *u,
const uint8_t *v, int y_stride, int uv_stride,
uint8_t *yd, uint8_t *ud, uint8_t *vd, int yd_stride,
int uvd_stride, int qdiff) {
int sad, sad_thr, vdiff, vdiff_thr;
uint32_t sse;
get_thr(bs, qdiff, &sad_thr, &vdiff_thr);
if (bs == BLOCK_16X16) {
vdiff = (vpx_variance16x16(y, y_stride, yd, yd_stride, &sse) + 128) >> 8;
sad = (vpx_sad16x16(y, y_stride, yd, yd_stride) + 128) >> 8;
} else if (bs == BLOCK_32X32) {
vdiff = (vpx_variance32x32(y, y_stride, yd, yd_stride, &sse) + 512) >> 10;
sad = (vpx_sad32x32(y, y_stride, yd, yd_stride) + 512) >> 10;
} else /* if (bs == BLOCK_64X64) */ {
vdiff = (vpx_variance64x64(y, y_stride, yd, yd_stride, &sse) + 2048) >> 12;
sad = (vpx_sad64x64(y, y_stride, yd, yd_stride) + 2048) >> 12;
}
// vdiff > sad * 3 means vdiff should not be too small, otherwise,
// it might be a lighting change in smooth area. When there is a
// lighting change in smooth area, it is dangerous to do MFQE.
if (sad > 1 && vdiff > sad * 3) {
const int weight = 1 << MFQE_PRECISION;
int ifactor = weight * sad * vdiff / (sad_thr * vdiff_thr);
// When ifactor equals weight, no MFQE is done.
if (ifactor > weight) {
ifactor = weight;
}
apply_ifactor(y, y_stride, yd, yd_stride, u, v, uv_stride, ud, vd,
uvd_stride, bs, ifactor);
} else {
// Copy the block from current frame (i.e., no mfqe is done).
copy_block(y, u, v, y_stride, uv_stride, yd, ud, vd,
yd_stride, uvd_stride, bs);
}
}
static int mfqe_decision(MODE_INFO *mi, BLOCK_SIZE cur_bs) {
// Check the motion in current block(for inter frame),
// or check the motion in the correlated block in last frame (for keyframe).
const int mv_len_square = mi->mbmi.mv[0].as_mv.row *
mi->mbmi.mv[0].as_mv.row +
mi->mbmi.mv[0].as_mv.col *
mi->mbmi.mv[0].as_mv.col;
const int mv_threshold = 100;
return mi->mbmi.mode >= NEARESTMV && // Not an intra block
cur_bs >= BLOCK_16X16 &&
mv_len_square <= mv_threshold;
}
// Process each partiton in a super block, recursively.
static void mfqe_partition(VP10_COMMON *cm, MODE_INFO *mi, BLOCK_SIZE bs,
const uint8_t *y, const uint8_t *u,
const uint8_t *v, int y_stride, int uv_stride,
uint8_t *yd, uint8_t *ud, uint8_t *vd,
int yd_stride, int uvd_stride) {
int mi_offset, y_offset, uv_offset;
const BLOCK_SIZE cur_bs = mi->mbmi.sb_type;
const int qdiff = cm->base_qindex - cm->postproc_state.last_base_qindex;
const int bsl = b_width_log2_lookup[bs];
PARTITION_TYPE partition = partition_lookup[bsl][cur_bs];
const BLOCK_SIZE subsize = get_subsize(bs, partition);
if (cur_bs < BLOCK_8X8) {
// If there are blocks smaller than 8x8, it must be on the boundary.
return;
}
// No MFQE on blocks smaller than 16x16
if (bs == BLOCK_16X16) {
partition = PARTITION_NONE;
}
if (bs == BLOCK_64X64) {
mi_offset = 4;
y_offset = 32;
uv_offset = 16;
} else {
mi_offset = 2;
y_offset = 16;
uv_offset = 8;
}
switch (partition) {
BLOCK_SIZE mfqe_bs, bs_tmp;
case PARTITION_HORZ:
if (bs == BLOCK_64X64) {
mfqe_bs = BLOCK_64X32;
bs_tmp = BLOCK_32X32;
} else {
mfqe_bs = BLOCK_32X16;
bs_tmp = BLOCK_16X16;
}
if (mfqe_decision(mi, mfqe_bs)) {
// Do mfqe on the first square partition.
mfqe_block(bs_tmp, y, u, v, y_stride, uv_stride,
yd, ud, vd, yd_stride, uvd_stride, qdiff);
// Do mfqe on the second square partition.
mfqe_block(bs_tmp, y + y_offset, u + uv_offset, v + uv_offset,
y_stride, uv_stride, yd + y_offset, ud + uv_offset,
vd + uv_offset, yd_stride, uvd_stride, qdiff);
}
if (mfqe_decision(mi + mi_offset * cm->mi_stride, mfqe_bs)) {
// Do mfqe on the first square partition.
mfqe_block(bs_tmp, y + y_offset * y_stride, u + uv_offset * uv_stride,
v + uv_offset * uv_stride, y_stride, uv_stride,
yd + y_offset * yd_stride, ud + uv_offset * uvd_stride,
vd + uv_offset * uvd_stride, yd_stride, uvd_stride, qdiff);
// Do mfqe on the second square partition.
mfqe_block(bs_tmp, y + y_offset * y_stride + y_offset,
u + uv_offset * uv_stride + uv_offset,
v + uv_offset * uv_stride + uv_offset, y_stride,
uv_stride, yd + y_offset * yd_stride + y_offset,
ud + uv_offset * uvd_stride + uv_offset,
vd + uv_offset * uvd_stride + uv_offset,
yd_stride, uvd_stride, qdiff);
}
break;
case PARTITION_VERT:
if (bs == BLOCK_64X64) {
mfqe_bs = BLOCK_32X64;
bs_tmp = BLOCK_32X32;
} else {
mfqe_bs = BLOCK_16X32;
bs_tmp = BLOCK_16X16;
}
if (mfqe_decision(mi, mfqe_bs)) {
// Do mfqe on the first square partition.
mfqe_block(bs_tmp, y, u, v, y_stride, uv_stride,
yd, ud, vd, yd_stride, uvd_stride, qdiff);
// Do mfqe on the second square partition.
mfqe_block(bs_tmp, y + y_offset * y_stride, u + uv_offset * uv_stride,
v + uv_offset * uv_stride, y_stride, uv_stride,
yd + y_offset * yd_stride, ud + uv_offset * uvd_stride,
vd + uv_offset * uvd_stride, yd_stride, uvd_stride, qdiff);
}
if (mfqe_decision(mi + mi_offset, mfqe_bs)) {
// Do mfqe on the first square partition.
mfqe_block(bs_tmp, y + y_offset, u + uv_offset, v + uv_offset,
y_stride, uv_stride, yd + y_offset, ud + uv_offset,
vd + uv_offset, yd_stride, uvd_stride, qdiff);
// Do mfqe on the second square partition.
mfqe_block(bs_tmp, y + y_offset * y_stride + y_offset,
u + uv_offset * uv_stride + uv_offset,
v + uv_offset * uv_stride + uv_offset, y_stride,
uv_stride, yd + y_offset * yd_stride + y_offset,
ud + uv_offset * uvd_stride + uv_offset,
vd + uv_offset * uvd_stride + uv_offset,
yd_stride, uvd_stride, qdiff);
}
break;
case PARTITION_NONE:
if (mfqe_decision(mi, cur_bs)) {
// Do mfqe on this partition.
mfqe_block(cur_bs, y, u, v, y_stride, uv_stride,
yd, ud, vd, yd_stride, uvd_stride, qdiff);
} else {
// Copy the block from current frame(i.e., no mfqe is done).
copy_block(y, u, v, y_stride, uv_stride, yd, ud, vd,
yd_stride, uvd_stride, bs);
}
break;
case PARTITION_SPLIT:
// Recursion on four square partitions, e.g. if bs is 64X64,
// then look into four 32X32 blocks in it.
mfqe_partition(cm, mi, subsize, y, u, v, y_stride, uv_stride, yd, ud, vd,
yd_stride, uvd_stride);
mfqe_partition(cm, mi + mi_offset, subsize, y + y_offset, u + uv_offset,
v + uv_offset, y_stride, uv_stride, yd + y_offset,
ud + uv_offset, vd + uv_offset, yd_stride, uvd_stride);
mfqe_partition(cm, mi + mi_offset * cm->mi_stride, subsize,
y + y_offset * y_stride, u + uv_offset * uv_stride,
v + uv_offset * uv_stride, y_stride, uv_stride,
yd + y_offset * yd_stride, ud + uv_offset * uvd_stride,
vd + uv_offset * uvd_stride, yd_stride, uvd_stride);
mfqe_partition(cm, mi + mi_offset * cm->mi_stride + mi_offset,
subsize, y + y_offset * y_stride + y_offset,
u + uv_offset * uv_stride + uv_offset,
v + uv_offset * uv_stride + uv_offset, y_stride,
uv_stride, yd + y_offset * yd_stride + y_offset,
ud + uv_offset * uvd_stride + uv_offset,
vd + uv_offset * uvd_stride + uv_offset,
yd_stride, uvd_stride);
break;
default:
assert(0);
}
}
void vp10_mfqe(VP10_COMMON *cm) {
int mi_row, mi_col;
// Current decoded frame.
const YV12_BUFFER_CONFIG *show = cm->frame_to_show;
// Last decoded frame and will store the MFQE result.
YV12_BUFFER_CONFIG *dest = &cm->post_proc_buffer;
// Loop through each super block.
for (mi_row = 0; mi_row < cm->mi_rows; mi_row += MI_BLOCK_SIZE) {
for (mi_col = 0; mi_col < cm->mi_cols; mi_col += MI_BLOCK_SIZE) {
MODE_INFO *mi;
MODE_INFO *mi_local = cm->mi + (mi_row * cm->mi_stride + mi_col);
// Motion Info in last frame.
MODE_INFO *mi_prev = cm->postproc_state.prev_mi +
(mi_row * cm->mi_stride + mi_col);
const uint32_t y_stride = show->y_stride;
const uint32_t uv_stride = show->uv_stride;
const uint32_t yd_stride = dest->y_stride;
const uint32_t uvd_stride = dest->uv_stride;
const uint32_t row_offset_y = mi_row << 3;
const uint32_t row_offset_uv = mi_row << 2;
const uint32_t col_offset_y = mi_col << 3;
const uint32_t col_offset_uv = mi_col << 2;
const uint8_t *y = show->y_buffer + row_offset_y * y_stride +
col_offset_y;
const uint8_t *u = show->u_buffer + row_offset_uv * uv_stride +
col_offset_uv;
const uint8_t *v = show->v_buffer + row_offset_uv * uv_stride +
col_offset_uv;
uint8_t *yd = dest->y_buffer + row_offset_y * yd_stride + col_offset_y;
uint8_t *ud = dest->u_buffer + row_offset_uv * uvd_stride +
col_offset_uv;
uint8_t *vd = dest->v_buffer + row_offset_uv * uvd_stride +
col_offset_uv;
if (frame_is_intra_only(cm)) {
mi = mi_prev;
} else {
mi = mi_local;
}
mfqe_partition(cm, mi, BLOCK_64X64, y, u, v, y_stride, uv_stride, yd, ud,
vd, yd_stride, uvd_stride);
}
}
}
/*
* Copyright (c) 2014 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef VP10_COMMON_MFQE_H_
#define VP10_COMMON_MFQE_H_
#ifdef __cplusplus
extern "C" {
#endif
// Multiframe Quality Enhancement.
// The aim for MFQE is to replace pixel blocks in the current frame with
// the correlated pixel blocks (with higher quality) in the last frame.
// The replacement can only be taken in stationary blocks by checking
// the motion of the blocks and other conditions such as the SAD of
// the current block and correlated block, the variance of the block
// difference, etc.
void vp10_mfqe(struct VP10Common *cm);
#ifdef __cplusplus
} // extern "C"
#endif
#endif // VP10_COMMON_MFQE_H_
/*
* Copyright (c) 2015 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "./vp10_rtcd.h"
#include "vp10/common/onyxc_int.h"
#include "vpx_dsp/mips/macros_msa.h"
static void filter_by_weight8x8_msa(const uint8_t *src_ptr, int32_t src_stride,
uint8_t *dst_ptr, int32_t dst_stride,
int32_t src_weight) {
int32_t dst_weight = (1 << MFQE_PRECISION) - src_weight;
int32_t row;
uint64_t src0_d, src1_d, dst0_d, dst1_d;
v16i8 src0 = { 0 };
v16i8 src1 = { 0 };
v16i8 dst0 = { 0 };
v16i8 dst1 = { 0 };
v8i16 src_wt, dst_wt, res_h_r, res_h_l, src_r, src_l, dst_r, dst_l;
src_wt = __msa_fill_h(src_weight);
dst_wt = __msa_fill_h(dst_weight);
for (row = 2; row--;) {
LD2(src_ptr, src_stride, src0_d, src1_d);
src_ptr += (2 * src_stride);
LD2(dst_ptr, dst_stride, dst0_d, dst1_d);
INSERT_D2_SB(src0_d, src1_d, src0);
INSERT_D2_SB(dst0_d, dst1_d, dst0);
LD2(src_ptr, src_stride, src0_d, src1_d);
src_ptr += (2 * src_stride);
LD2((dst_ptr + 2 * dst_stride), dst_stride, dst0_d, dst1_d);
INSERT_D2_SB(src0_d, src1_d, src1);
INSERT_D2_SB(dst0_d, dst1_d, dst1);
UNPCK_UB_SH(src0, src_r, src_l);
UNPCK_UB_SH(dst0, dst_r, dst_l);
res_h_r = (src_r * src_wt);
res_h_r += (dst_r * dst_wt);
res_h_l = (src_l * src_wt);
res_h_l += (dst_l * dst_wt);
SRARI_H2_SH(res_h_r, res_h_l, MFQE_PRECISION);
dst0 = (v16i8)__msa_pckev_b((v16i8)res_h_l, (v16i8)res_h_r);
ST8x2_UB(dst0, dst_ptr, dst_stride);
dst_ptr += (2 * dst_stride);
UNPCK_UB_SH(src1, src_r, src_l);
UNPCK_UB_SH(dst1, dst_r, dst_l);
res_h_r = (src_r * src_wt);
res_h_r += (dst_r * dst_wt);
res_h_l = (src_l * src_wt);
res_h_l += (dst_l * dst_wt);
SRARI_H2_SH(res_h_r, res_h_l, MFQE_PRECISION);
dst1 = (v16i8)__msa_pckev_b((v16i8)res_h_l, (v16i8)res_h_r);
ST8x2_UB(dst1, dst_ptr, dst_stride);
dst_ptr += (2 * dst_stride);
}
}
static void filter_by_weight16x16_msa(const uint8_t *src_ptr,
int32_t src_stride,
uint8_t *dst_ptr,
int32_t dst_stride,
int32_t src_weight) {
int32_t dst_weight = (1 << MFQE_PRECISION) - src_weight;
int32_t row;
v16i8 src0, src1, src2, src3, dst0, dst1, dst2, dst3;
v8i16 src_wt, dst_wt, res_h_r, res_h_l, src_r, src_l, dst_r, dst_l;
src_wt = __msa_fill_h(src_weight);
dst_wt = __msa_fill_h(dst_weight);
for (row = 4; row--;) {
LD_SB4(src_ptr, src_stride, src0, src1, src2, src3);
src_ptr += (4 * src_stride);
LD_SB4(dst_ptr, dst_stride, dst0, dst1, dst2, dst3);
UNPCK_UB_SH(src0, src_r, src_l);
UNPCK_UB_SH(dst0, dst_r, dst_l);
res_h_r = (src_r * src_wt);
res_h_r += (dst_r * dst_wt);
res_h_l = (src_l * src_wt);
res_h_l += (dst_l * dst_wt);
SRARI_H2_SH(res_h_r, res_h_l, MFQE_PRECISION);
PCKEV_ST_SB(res_h_r, res_h_l, dst_ptr);
dst_ptr += dst_stride;
UNPCK_UB_SH(src1, src_r, src_l);
UNPCK_UB_SH(dst1, dst_r, dst_l);
res_h_r = (src_r * src_wt);
res_h_r += (dst_r * dst_wt);
res_h_l = (src_l * src_wt);
res_h_l += (dst_l * dst_wt);
SRARI_H2_SH(res_h_r, res_h_l, MFQE_PRECISION);
PCKEV_ST_SB(res_h_r, res_h_l, dst_ptr);
dst_ptr += dst_stride;
UNPCK_UB_SH(src2, src_r, src_l);
UNPCK_UB_SH(dst2, dst_r, dst_l);
res_h_r = (src_r * src_wt);
res_h_r += (dst_r * dst_wt);
res_h_l = (src_l * src_wt);
res_h_l +=