Commit 814e1346, authored by Parag Salasakar, committed by Gerrit Code Review

Merge "mips msa vpx convolve optimization"

parents cc4c5de2 1579bb88
......@@ -323,7 +323,7 @@ static void common_hz_2t_and_aver_dst_4x4_msa(const uint8_t *src,
int8_t *filter) {
v16i8 src0, src1, src2, src3, mask;
v16u8 filt0, dst0, dst1, dst2, dst3, vec0, vec1, res0, res1;
v8u16 vec2, vec3, const255, filt;
v8u16 vec2, vec3, filt;
mask = LD_SB(&mc_filt_mask_arr[16]);
......@@ -331,14 +331,11 @@ static void common_hz_2t_and_aver_dst_4x4_msa(const uint8_t *src,
filt = LD_UH(filter);
filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
const255 = (v8u16)__msa_ldi_h(255);
LD_SB4(src, src_stride, src0, src1, src2, src3);
LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1);
DOTP_UB2_UH(vec0, vec1, filt0, filt0, vec2, vec3);
SRARI_H2_UH(vec2, vec3, FILTER_BITS);
MIN_UH2_UH(vec2, vec3, const255);
PCKEV_B2_UB(vec2, vec2, vec3, vec3, res0, res1);
ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst2);
AVER_UB2_UB(res0, dst0, res1, dst2, res0, res1);
......@@ -353,7 +350,7 @@ static void common_hz_2t_and_aver_dst_4x8_msa(const uint8_t *src,
v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
v16u8 filt0, vec0, vec1, vec2, vec3, res0, res1, res2, res3;
v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
v8u16 vec4, vec5, vec6, vec7, const255, filt;
v8u16 vec4, vec5, vec6, vec7, filt;
mask = LD_SB(&mc_filt_mask_arr[16]);
......@@ -361,8 +358,6 @@ static void common_hz_2t_and_aver_dst_4x8_msa(const uint8_t *src,
filt = LD_UH(filter);
filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
const255 = (v8u16)__msa_ldi_h(255);
LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1);
......@@ -370,7 +365,6 @@ static void common_hz_2t_and_aver_dst_4x8_msa(const uint8_t *src,
DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec4, vec5,
vec6, vec7);
SRARI_H4_UH(vec4, vec5, vec6, vec7, FILTER_BITS);
MIN_UH4_UH(vec4, vec5, vec6, vec7, const255);
PCKEV_B4_UB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7, res0, res1, res2,
res3);
ILVR_W4_UB(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, dst0, dst2, dst4,
......@@ -402,7 +396,7 @@ static void common_hz_2t_and_aver_dst_8x4_msa(const uint8_t *src,
int8_t *filter) {
v16i8 src0, src1, src2, src3, mask;
v16u8 filt0, dst0, dst1, dst2, dst3;
v8u16 vec0, vec1, vec2, vec3, const255, filt;
v8u16 vec0, vec1, vec2, vec3, filt;
mask = LD_SB(&mc_filt_mask_arr[0]);
......@@ -410,8 +404,6 @@ static void common_hz_2t_and_aver_dst_8x4_msa(const uint8_t *src,
filt = LD_UH(filter);
filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
const255 = (v8u16)__msa_ldi_h(255);
LD_SB4(src, src_stride, src0, src1, src2, src3);
VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
......@@ -419,7 +411,6 @@ static void common_hz_2t_and_aver_dst_8x4_msa(const uint8_t *src,
vec2, vec3);
SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
MIN_UH4_UH(vec0, vec1, vec2, vec3, const255);
PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3,
dst, dst_stride);
}
......@@ -432,7 +423,7 @@ static void common_hz_2t_and_aver_dst_8x8mult_msa(const uint8_t *src,
int32_t height) {
v16i8 src0, src1, src2, src3, mask;
v16u8 filt0, dst0, dst1, dst2, dst3;
v8u16 vec0, vec1, vec2, vec3, const255, filt;
v8u16 vec0, vec1, vec2, vec3, filt;
mask = LD_SB(&mc_filt_mask_arr[0]);
......@@ -440,8 +431,6 @@ static void common_hz_2t_and_aver_dst_8x8mult_msa(const uint8_t *src,
filt = LD_UH(filter);
filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
const255 = (v8u16)__msa_ldi_h(255);
LD_SB4(src, src_stride, src0, src1, src2, src3);
src += (4 * src_stride);
VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
......@@ -450,7 +439,6 @@ static void common_hz_2t_and_aver_dst_8x8mult_msa(const uint8_t *src,
vec2, vec3);
SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
MIN_UH4_UH(vec0, vec1, vec2, vec3, const255);
LD_SB4(src, src_stride, src0, src1, src2, src3);
src += (4 * src_stride);
PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3,
......@@ -463,7 +451,6 @@ static void common_hz_2t_and_aver_dst_8x8mult_msa(const uint8_t *src,
vec2, vec3);
SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
MIN_UH4_UH(vec0, vec1, vec2, vec3, const255);
PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3,
dst, dst_stride);
dst += (4 * dst_stride);
......@@ -478,7 +465,6 @@ static void common_hz_2t_and_aver_dst_8x8mult_msa(const uint8_t *src,
vec2, vec3);
SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
MIN_UH4_UH(vec0, vec1, vec2, vec3, const255);
LD_SB4(src, src_stride, src0, src1, src2, src3);
PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3,
dst, dst_stride);
......@@ -490,7 +476,6 @@ static void common_hz_2t_and_aver_dst_8x8mult_msa(const uint8_t *src,
vec2, vec3);
SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
MIN_UH4_UH(vec0, vec1, vec2, vec3, const255);
PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3,
dst, dst_stride);
}
......@@ -520,7 +505,7 @@ static void common_hz_2t_and_aver_dst_16w_msa(const uint8_t *src,
v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
v16u8 filt0, dst0, dst1, dst2, dst3;
v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
v8u16 res0, res1, res2, res3, res4, res5, res6, res7, const255, filt;
v8u16 res0, res1, res2, res3, res4, res5, res6, res7, filt;
mask = LD_SB(&mc_filt_mask_arr[0]);
......@@ -528,8 +513,6 @@ static void common_hz_2t_and_aver_dst_16w_msa(const uint8_t *src,
filt = LD_UH(filter);
filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
const255 = (v8u16)__msa_ldi_h(255);
LD_SB4(src, src_stride, src0, src2, src4, src6);
LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
src += (4 * src_stride);
......@@ -545,8 +528,6 @@ static void common_hz_2t_and_aver_dst_16w_msa(const uint8_t *src,
SRARI_H4_UH(res0, res1, res2, res3, FILTER_BITS);
SRARI_H4_UH(res4, res5, res6, res7, FILTER_BITS);
LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
MIN_UH4_UH(res0, res1, res2, res3, const255);
MIN_UH4_UH(res4, res5, res6, res7, const255);
PCKEV_AVG_ST_UB(res1, res0, dst0, dst);
dst += dst_stride;
PCKEV_AVG_ST_UB(res3, res2, dst1, dst);
......@@ -572,8 +553,6 @@ static void common_hz_2t_and_aver_dst_16w_msa(const uint8_t *src,
SRARI_H4_UH(res0, res1, res2, res3, FILTER_BITS);
SRARI_H4_UH(res4, res5, res6, res7, FILTER_BITS);
LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
MIN_UH4_UH(res0, res1, res2, res3, const255);
MIN_UH4_UH(res4, res5, res6, res7, const255);
PCKEV_AVG_ST_UB(res1, res0, dst0, dst);
dst += dst_stride;
PCKEV_AVG_ST_UB(res3, res2, dst1, dst);
......@@ -595,7 +574,7 @@ static void common_hz_2t_and_aver_dst_32w_msa(const uint8_t *src,
v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
v16u8 filt0, dst0, dst1, dst2, dst3;
v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
v8u16 res0, res1, res2, res3, res4, res5, res6, res7, const255, filt;
v8u16 res0, res1, res2, res3, res4, res5, res6, res7, filt;
mask = LD_SB(&mc_filt_mask_arr[0]);
......@@ -603,8 +582,6 @@ static void common_hz_2t_and_aver_dst_32w_msa(const uint8_t *src,
filt = LD_UH(filter);
filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
const255 = (v8u16)__msa_ldi_h(255);
for (loop_cnt = (height >> 1); loop_cnt--;) {
src0 = LD_SB(src);
src2 = LD_SB(src + 16);
......@@ -627,8 +604,6 @@ static void common_hz_2t_and_aver_dst_32w_msa(const uint8_t *src,
res6, res7);
SRARI_H4_UH(res0, res1, res2, res3, FILTER_BITS);
SRARI_H4_UH(res4, res5, res6, res7, FILTER_BITS);
MIN_UH4_UH(res0, res1, res2, res3, const255);
MIN_UH4_UH(res4, res5, res6, res7, const255);
LD_UB2(dst, 16, dst0, dst1);
PCKEV_AVG_ST_UB(res1, res0, dst0, dst);
PCKEV_AVG_ST_UB(res3, res2, dst1, (dst + 16));
......@@ -650,7 +625,7 @@ static void common_hz_2t_and_aver_dst_64w_msa(const uint8_t *src,
v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
v16u8 filt0, dst0, dst1, dst2, dst3;
v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
v8u16 out0, out1, out2, out3, out4, out5, out6, out7, const255, filt;
v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt;
mask = LD_SB(&mc_filt_mask_arr[0]);
......@@ -658,8 +633,6 @@ static void common_hz_2t_and_aver_dst_64w_msa(const uint8_t *src,
filt = LD_UH(filter);
filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
const255 = (v8u16)__msa_ldi_h(255);
for (loop_cnt = height; loop_cnt--;) {
LD_SB4(src, 16, src0, src2, src4, src6);
src7 = LD_SB(src + 56);
......@@ -677,8 +650,6 @@ static void common_hz_2t_and_aver_dst_64w_msa(const uint8_t *src,
SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS);
SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS);
LD_UB4(dst, 16, dst0, dst1, dst2, dst3);
MIN_UH4_UH(out0, out1, out2, out3, const255);
MIN_UH4_UH(out4, out5, out6, out7, const255);
PCKEV_AVG_ST_UB(out1, out0, dst0, dst);
PCKEV_AVG_ST_UB(out3, out2, dst1, dst + 16);
PCKEV_AVG_ST_UB(out5, out4, dst2, dst + 32);
......
......@@ -274,7 +274,6 @@ static void common_hv_2ht_2vt_and_aver_dst_4x4_msa(const uint8_t *src,
ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst2);
DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
SAT_UH2_UH(tmp0, tmp1, 7);
PCKEV_B2_UB(tmp0, tmp0, tmp1, tmp1, res0, res1);
AVER_UB2_UB(res0, dst0, res1, dst2, res0, res1);
ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
......@@ -323,7 +322,6 @@ static void common_hv_2ht_2vt_and_aver_dst_4x8_msa(const uint8_t *src,
DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt_vt, filt_vt, filt_vt, filt_vt,
tmp0, tmp1, tmp2, tmp3);
SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
PCKEV_B4_UB(tmp0, tmp0, tmp1, tmp1, tmp2, tmp2, tmp3, tmp3, res0, res1,
res2, res3);
AVER_UB4_UB(res0, dst0, res1, dst2, res2, dst4, res3, dst6, res0, res1,
......@@ -391,7 +389,6 @@ static void common_hv_2ht_2vt_and_aver_dst_8x4_msa(const uint8_t *src,
tmp3 = __msa_dotp_u_h(vec3, filt_vt);
SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
PCKEV_AVG_ST8x4_UB(tmp0, dst0, tmp1, dst1, tmp2, dst2, tmp3, dst3,
dst, dst_stride);
}
......@@ -436,7 +433,6 @@ static void common_hv_2ht_2vt_and_aver_dst_8x8mult_msa(const uint8_t *src,
tmp1 = __msa_dotp_u_h(vec0, filt_vt);
SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
SAT_UH2_UH(tmp0, tmp1, 7);
hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
......@@ -447,7 +443,6 @@ static void common_hv_2ht_2vt_and_aver_dst_8x8mult_msa(const uint8_t *src,
tmp3 = __msa_dotp_u_h(vec0, filt_vt);
SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
SAT_UH2_UH(tmp2, tmp3, 7);
LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
PCKEV_AVG_ST8x4_UB(tmp0, dst0, tmp1, dst1, tmp2, dst2, tmp3, dst3,
dst, dst_stride);
......@@ -511,7 +506,6 @@ static void common_hv_2ht_2vt_and_aver_dst_16w_msa(const uint8_t *src,
ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
SAT_UH2_UH(tmp0, tmp1, 7);
PCKEV_AVG_ST_UB(tmp1, tmp0, dst0, dst);
dst += dst_stride;
......@@ -520,7 +514,6 @@ static void common_hv_2ht_2vt_and_aver_dst_16w_msa(const uint8_t *src,
ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
SAT_UH2_UH(tmp0, tmp1, 7);
PCKEV_AVG_ST_UB(tmp1, tmp0, dst1, dst);
dst += dst_stride;
......@@ -529,7 +522,6 @@ static void common_hv_2ht_2vt_and_aver_dst_16w_msa(const uint8_t *src,
ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
SAT_UH2_UH(tmp0, tmp1, 7);
PCKEV_AVG_ST_UB(tmp1, tmp0, dst2, dst);
dst += dst_stride;
......@@ -538,7 +530,6 @@ static void common_hv_2ht_2vt_and_aver_dst_16w_msa(const uint8_t *src,
ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
SAT_UH2_UH(tmp0, tmp1, 7);
PCKEV_AVG_ST_UB(tmp1, tmp0, dst3, dst);
dst += dst_stride;
}
......
......@@ -283,7 +283,6 @@ static void common_vt_2t_and_aver_dst_4x4_msa(const uint8_t *src,
ILVR_D2_UB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1);
SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
SAT_UH2_UH(tmp0, tmp1, 7);
out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
out = __msa_aver_u_b(out, dst0);
......@@ -323,7 +322,6 @@ static void common_vt_2t_and_aver_dst_4x8_msa(const uint8_t *src,
DOTP_UB4_UH(src2110, src4332, src6554, src8776, filt0, filt0, filt0, filt0,
tmp0, tmp1, tmp2, tmp3);
SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, src2110, src4332);
AVER_UB2_UB(src2110, dst0, src4332, dst1, src2110, src4332);
ST4x4_UB(src2110, src2110, 0, 1, 2, 3, dst, dst_stride);
......@@ -365,7 +363,6 @@ static void common_vt_2t_and_aver_dst_8x4_msa(const uint8_t *src,
DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1,
tmp2, tmp3);
SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
PCKEV_AVG_ST8x4_UB(tmp0, dst0, tmp1, dst1, tmp2, dst2, tmp3, dst3,
dst, dst_stride);
}
......@@ -402,7 +399,6 @@ static void common_vt_2t_and_aver_dst_8x8mult_msa(const uint8_t *src,
DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1,
tmp2, tmp3);
SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
PCKEV_AVG_ST8x4_UB(tmp0, dst1, tmp1, dst2, tmp2, dst3, tmp3, dst4,
dst, dst_stride);
dst += (4 * dst_stride);
......@@ -410,7 +406,6 @@ static void common_vt_2t_and_aver_dst_8x8mult_msa(const uint8_t *src,
DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, tmp0, tmp1,
tmp2, tmp3);
SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
PCKEV_AVG_ST8x4_UB(tmp0, dst5, tmp1, dst6, tmp2, dst7, tmp3, dst8,
dst, dst_stride);
dst += (4 * dst_stride);
......@@ -460,7 +455,6 @@ static void common_vt_2t_and_aver_dst_16w_msa(const uint8_t *src,
ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
SAT_UH2_UH(tmp0, tmp1, 7);
PCKEV_AVG_ST_UB(tmp1, tmp0, dst0, dst);
dst += dst_stride;
......@@ -468,19 +462,16 @@ static void common_vt_2t_and_aver_dst_16w_msa(const uint8_t *src,
ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7);
DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
SAT_UH2_UH(tmp2, tmp3, 7);
PCKEV_AVG_ST_UB(tmp3, tmp2, dst1, dst);
dst += dst_stride;
DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
SAT_UH2_UH(tmp0, tmp1, 7);
PCKEV_AVG_ST_UB(tmp1, tmp0, dst2, dst);
dst += dst_stride;
DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
SAT_UH2_UH(tmp2, tmp3, 7);
PCKEV_AVG_ST_UB(tmp3, tmp2, dst3, dst);
dst += dst_stride;
......@@ -519,48 +510,40 @@ static void common_vt_2t_and_aver_dst_32w_msa(const uint8_t *src,
DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
SAT_UH2_UH(tmp0, tmp1, 7);
PCKEV_AVG_ST_UB(tmp1, tmp0, dst0, dst);
DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
SAT_UH2_UH(tmp2, tmp3, 7);
PCKEV_AVG_ST_UB(tmp3, tmp2, dst1, dst + dst_stride);
ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6);
ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7);
DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
SAT_UH2_UH(tmp0, tmp1, 7);
PCKEV_AVG_ST_UB(tmp1, tmp0, dst2, dst + 2 * dst_stride);
DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
SAT_UH2_UH(tmp2, tmp3, 7);
PCKEV_AVG_ST_UB(tmp3, tmp2, dst3, dst + 3 * dst_stride);
ILVR_B2_UB(src6, src5, src7, src6, vec0, vec2);
ILVL_B2_UB(src6, src5, src7, src6, vec1, vec3);
DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
SAT_UH2_UH(tmp0, tmp1, 7);
PCKEV_AVG_ST_UB(tmp1, tmp0, dst4, dst + 16);
DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
SAT_UH2_UH(tmp2, tmp3, 7);
PCKEV_AVG_ST_UB(tmp3, tmp2, dst5, dst + 16 + dst_stride);
ILVR_B2_UB(src8, src7, src9, src8, vec4, vec6);
ILVL_B2_UB(src8, src7, src9, src8, vec5, vec7);
DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
SAT_UH2_UH(tmp0, tmp1, 7);
PCKEV_AVG_ST_UB(tmp1, tmp0, dst6, dst + 16 + 2 * dst_stride);
DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
SAT_UH2_UH(tmp2, tmp3, 7);
PCKEV_AVG_ST_UB(tmp3, tmp2, dst7, dst + 16 + 3 * dst_stride);
dst += (4 * dst_stride);
......@@ -605,48 +588,40 @@ static void common_vt_2t_and_aver_dst_64w_msa(const uint8_t *src,
ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
SAT_UH2_UH(tmp0, tmp1, 7);
PCKEV_AVG_ST_UB(tmp1, tmp0, dst0, dst);
DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
SAT_UH2_UH(tmp2, tmp3, 7);
PCKEV_AVG_ST_UB(tmp3, tmp2, dst1, dst + dst_stride);
ILVR_B2_UB(src4, src3, src5, src4, vec4, vec6);
ILVL_B2_UB(src4, src3, src5, src4, vec5, vec7);
DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp4, tmp5);
SRARI_H2_UH(tmp4, tmp5, FILTER_BITS);
SAT_UH2_UH(tmp4, tmp5, 7);
PCKEV_AVG_ST_UB(tmp5, tmp4, dst2, dst + 16);
DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp6, tmp7);
SRARI_H2_UH(tmp6, tmp7, FILTER_BITS);
SAT_UH2_UH(tmp6, tmp7, 7);
PCKEV_AVG_ST_UB(tmp7, tmp6, dst3, dst + 16 + dst_stride);
ILVR_B2_UB(src7, src6, src8, src7, vec0, vec2);
ILVL_B2_UB(src7, src6, src8, src7, vec1, vec3);
DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
SAT_UH2_UH(tmp0, tmp1, 7);
PCKEV_AVG_ST_UB(tmp1, tmp0, dst4, dst + 32);
DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
SAT_UH2_UH(tmp2, tmp3, 7);
PCKEV_AVG_ST_UB(tmp3, tmp2, dst5, dst + 32 + dst_stride);
ILVR_B2_UB(src10, src9, src11, src10, vec4, vec6);
ILVL_B2_UB(src10, src9, src11, src10, vec5, vec7);
DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp4, tmp5);
SRARI_H2_UH(tmp4, tmp5, FILTER_BITS);
SAT_UH2_UH(tmp4, tmp5, 7);
PCKEV_AVG_ST_UB(tmp5, tmp4, dst6, (dst + 48));
DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp6, tmp7);
SRARI_H2_UH(tmp6, tmp7, FILTER_BITS);
SAT_UH2_UH(tmp6, tmp7, 7);
PCKEV_AVG_ST_UB(tmp7, tmp6, dst7, dst + 48 + dst_stride);
dst += (2 * dst_stride);
......
......@@ -318,7 +318,7 @@ static void common_hz_2t_4x4_msa(const uint8_t *src, int32_t src_stride,
int8_t *filter) {
v16i8 src0, src1, src2, src3, mask;
v16u8 filt0, vec0, vec1, res0, res1;
v8u16 vec2, vec3, filt, const255;
v8u16 vec2, vec3, filt;
mask = LD_SB(&mc_filt_mask_arr[16]);
......@@ -326,13 +326,10 @@ static void common_hz_2t_4x4_msa(const uint8_t *src, int32_t src_stride,
filt = LD_UH(filter);
filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
const255 = (v8u16) __msa_ldi_h(255);
LD_SB4(src, src_stride, src0, src1, src2, src3);
VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1);
DOTP_UB2_UH(vec0, vec1, filt0, filt0, vec2, vec3);
SRARI_H2_UH(vec2, vec3, FILTER_BITS);
MIN_UH2_UH(vec2, vec3, const255);
PCKEV_B2_UB(vec2, vec2, vec3, vec3, res0, res1);
ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
}
......@@ -343,7 +340,7 @@ static void common_hz_2t_4x8_msa(const uint8_t *src, int32_t src_stride,
v16u8 vec0, vec1, vec2, vec3, filt0;
v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
v16i8 res0, res1, res2, res3;
v8u16 vec4, vec5, vec6, vec7, filt, const255;
v8u16 vec4, vec5, vec6, vec7, filt;
mask = LD_SB(&mc_filt_mask_arr[16]);
......@@ -351,15 +348,12 @@ static void common_hz_2t_4x8_msa(const uint8_t *src, int32_t src_stride,
filt = LD_UH(filter);
filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
const255 = (v8u16) __msa_ldi_h(255);
LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1);
VSHF_B2_UB(src4, src5, src6, src7, mask, mask, vec2, vec3);
DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec4, vec5,
vec6, vec7);
SRARI_H4_UH(vec4, vec5, vec6, vec7, FILTER_BITS);
MIN_UH4_UH(vec4, vec5, vec6, vec7, const255);
PCKEV_B4_SB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7, res0, res1,
res2, res3);
ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
......@@ -382,7 +376,7 @@ static void common_hz_2t_8x4_msa(const uint8_t *src, int32_t src_stride,
int8_t *filter) {
v16u8 filt0;
v16i8 src0, src1, src2, src3, mask;
v8u16 vec0, vec1, vec2, vec3, const255, filt;
v8u16 vec0, vec1, vec2, vec3, filt;
mask = LD_SB(&mc_filt_mask_arr[0]);
......@@ -390,15 +384,12 @@ static void common_hz_2t_8x4_msa(const uint8_t *src, int32_t src_stride,
filt = LD_UH(filter);
filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
const255 = (v8u16) __msa_ldi_h(255);
LD_SB4(src, src_stride, src0, src1, src2, src3);
VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
vec2, vec3);
SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
MIN_UH4_UH(vec0, vec1, vec2, vec3, const255);
PCKEV_B2_SB(vec1, vec0, vec3, vec2, src0, src1);
ST8x4_UB(src0, src1, dst, dst_stride);
}
......@@ -408,7 +399,7 @@ static void common_hz_2t_8x8mult_msa(const uint8_t *src, int32_t src_stride,
int8_t *filter, int32_t height) {
v16u8 filt0;
v16i8 src0, src1, src2, src3, mask, out0, out1;
v8u16 vec0, vec1, vec2, vec3, filt, const255;
v8u16 vec0, vec1, vec2, vec3, filt;
mask = LD_SB(&mc_filt_mask_arr[0]);
......@@ -416,8 +407,6 @@ static void common_hz_2t_8x8mult_msa(const uint8_t *src, int32_t src_stride,
filt = LD_UH(filter);
filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
const255 = (v8u16) __msa_ldi_h(255);
LD_SB4(src, src_stride, src0, src1, src2, src3);
src += (4 * src_stride);
......@@ -426,7 +415,6 @@ static void common_hz_2t_8x8mult_msa(const uint8_t *src, int32_t src_stride,
DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
vec2, vec3);
SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
MIN_UH4_UH(vec0, vec1, vec2, vec3, const255);
LD_SB4(src, src_stride, src0, src1, src2, src3);
src += (4 * src_stride);
......@@ -440,7 +428,6 @@ static void common_hz_2t_8x8mult_msa(const uint8_t *src, int32_t src_stride,
DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
vec2, vec3);
SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
MIN_UH4_UH(vec0, vec1, vec2, vec3, const255);
PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
ST8x4_UB(out0, out1, dst, dst_stride);
dst += (4 * dst_stride);
......@@ -454,7 +441,6 @@ static void common_hz_2t_8x8mult_msa(const uint8_t *src, int32_t src_stride,
DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
vec2, vec3);
SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
MIN_UH4_UH(vec0, vec1, vec2, vec3, const255);
LD_SB4(src, src_stride, src0, src1, src2, src3);
src += (4 * src_stride);
......@@ -466,7 +452,6 @@ static void common_hz_2t_8x8mult_msa(const uint8_t *src, int32_t src_stride,
DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
vec2, vec3);
SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
MIN_UH4_UH(vec0, vec1, vec2, vec3, const255);
PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
ST8x4_UB(out0, out1, dst + 4 * dst_stride, dst_stride);
}
......@@ -488,7 +473,7 @@ static void common_hz_2t_16w_msa(const uint8_t *src, int32_t src_stride,
uint32_t loop_cnt;
v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
v16u8 filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt, const255;
v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt;
mask = LD_SB(&mc_filt_mask_arr[0]);
......@@ -498,8 +483,6 @@ static void common_hz_2t_16w_msa(const uint8_t *src, int32_t src_stride,
filt = LD_UH(filter);
filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
const255 = (v8u16) __msa_ldi_h(255);
LD_SB4(src, src_stride, src0, src2, src4, src6);
LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
src += (4 * src_stride);
......@@ -514,8 +497,6 @@ static void common_hz_2t_16w_msa(const uint8_t *src, int32_t src_stride,
out6, out7);
SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS);
SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS);
MIN_UH4_UH(out0, out1, out2, out3, const255);
MIN_UH4_UH(out4, out5, out6, out7, const255);
PCKEV_ST_SB(out0, out1, dst);
dst += dst_stride;
PCKEV_ST_SB(out2, out3, dst);
......@@ -540,8 +521,6 @@ static void common_hz_2t_16w_msa(const uint8_t *src, int32_t src_stride,
out6, out7);