Commit d341f843 authored by Jingning Han
Browse files

Refactor forward/inverse transform msa implementations

This commit factors out common macro definitions from the forward
and inverse transform implementations into vpx_dsp. It removes
the duplicate macro definitions from encoder and decoder folders.

Change-Id: I92301acbd3317075e9c5f03328a25abb123bca78
parent 33a9d53c
......@@ -25,12 +25,12 @@ void vp9_idct16_1d_rows_msa(const int16_t *input, int16_t *output) {
reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7);
TRANSPOSE8x8_SH_SH(reg8, reg9, reg10, reg11, reg12, reg13, reg14, reg15,
reg8, reg9, reg10, reg11, reg12, reg13, reg14, reg15);
VP9_DOTP_CONST_PAIR(reg2, reg14, cospi_28_64, cospi_4_64, reg2, reg14);
VP9_DOTP_CONST_PAIR(reg10, reg6, cospi_12_64, cospi_20_64, reg10, reg6);
DOTP_CONST_PAIR(reg2, reg14, cospi_28_64, cospi_4_64, reg2, reg14);
DOTP_CONST_PAIR(reg10, reg6, cospi_12_64, cospi_20_64, reg10, reg6);
BUTTERFLY_4(reg2, reg14, reg6, reg10, loc0, loc1, reg14, reg2);
VP9_DOTP_CONST_PAIR(reg14, reg2, cospi_16_64, cospi_16_64, loc2, loc3);
VP9_DOTP_CONST_PAIR(reg0, reg8, cospi_16_64, cospi_16_64, reg0, reg8);
VP9_DOTP_CONST_PAIR(reg4, reg12, cospi_24_64, cospi_8_64, reg4, reg12);
DOTP_CONST_PAIR(reg14, reg2, cospi_16_64, cospi_16_64, loc2, loc3);
DOTP_CONST_PAIR(reg0, reg8, cospi_16_64, cospi_16_64, reg0, reg8);
DOTP_CONST_PAIR(reg4, reg12, cospi_24_64, cospi_8_64, reg4, reg12);
BUTTERFLY_4(reg8, reg0, reg4, reg12, reg2, reg6, reg10, reg14);
SUB4(reg2, loc1, reg14, loc0, reg6, loc3, reg10, loc2, reg0, reg12, reg4,
reg8);
......@@ -38,16 +38,16 @@ void vp9_idct16_1d_rows_msa(const int16_t *input, int16_t *output) {
reg10);
/* stage 2 */
VP9_DOTP_CONST_PAIR(reg1, reg15, cospi_30_64, cospi_2_64, reg1, reg15);
VP9_DOTP_CONST_PAIR(reg9, reg7, cospi_14_64, cospi_18_64, loc2, loc3);
DOTP_CONST_PAIR(reg1, reg15, cospi_30_64, cospi_2_64, reg1, reg15);
DOTP_CONST_PAIR(reg9, reg7, cospi_14_64, cospi_18_64, loc2, loc3);
reg9 = reg1 - loc2;
reg1 = reg1 + loc2;
reg7 = reg15 - loc3;
reg15 = reg15 + loc3;
VP9_DOTP_CONST_PAIR(reg5, reg11, cospi_22_64, cospi_10_64, reg5, reg11);
VP9_DOTP_CONST_PAIR(reg13, reg3, cospi_6_64, cospi_26_64, loc0, loc1);
DOTP_CONST_PAIR(reg5, reg11, cospi_22_64, cospi_10_64, reg5, reg11);
DOTP_CONST_PAIR(reg13, reg3, cospi_6_64, cospi_26_64, loc0, loc1);
BUTTERFLY_4(loc0, loc1, reg11, reg5, reg13, reg3, reg11, reg5);
loc1 = reg15 + reg3;
......@@ -63,8 +63,8 @@ void vp9_idct16_1d_rows_msa(const int16_t *input, int16_t *output) {
tmp7 = loc1;
reg0 = loc2;
VP9_DOTP_CONST_PAIR(reg7, reg9, cospi_24_64, cospi_8_64, reg7, reg9);
VP9_DOTP_CONST_PAIR((-reg5), (-reg11), cospi_8_64, cospi_24_64, reg5, reg11);
DOTP_CONST_PAIR(reg7, reg9, cospi_24_64, cospi_8_64, reg7, reg9);
DOTP_CONST_PAIR((-reg5), (-reg11), cospi_8_64, cospi_24_64, reg5, reg11);
loc0 = reg9 + reg5;
reg5 = reg9 - reg5;
......@@ -77,13 +77,13 @@ void vp9_idct16_1d_rows_msa(const int16_t *input, int16_t *output) {
loc2 = reg4 - loc0;
tmp5 = loc1;
VP9_DOTP_CONST_PAIR(reg5, reg11, cospi_16_64, cospi_16_64, reg5, reg11);
DOTP_CONST_PAIR(reg5, reg11, cospi_16_64, cospi_16_64, reg5, reg11);
BUTTERFLY_4(reg8, reg10, reg11, reg5, loc0, reg4, reg9, loc1);
reg10 = loc0;
reg11 = loc1;
VP9_DOTP_CONST_PAIR(reg3, reg13, cospi_16_64, cospi_16_64, reg3, reg13);
DOTP_CONST_PAIR(reg3, reg13, cospi_16_64, cospi_16_64, reg3, reg13);
BUTTERFLY_4(reg12, reg14, reg13, reg3, reg8, reg6, reg7, reg5);
reg13 = loc2;
......@@ -117,12 +117,12 @@ void vp9_idct16_1d_columns_addblk_msa(int16_t *input, uint8_t *dst,
/* load bottom 8x8 */
LD_SH8(input, 16, reg8, reg9, reg10, reg11, reg12, reg13, reg14, reg15);
VP9_DOTP_CONST_PAIR(reg2, reg14, cospi_28_64, cospi_4_64, reg2, reg14);
VP9_DOTP_CONST_PAIR(reg10, reg6, cospi_12_64, cospi_20_64, reg10, reg6);
DOTP_CONST_PAIR(reg2, reg14, cospi_28_64, cospi_4_64, reg2, reg14);
DOTP_CONST_PAIR(reg10, reg6, cospi_12_64, cospi_20_64, reg10, reg6);
BUTTERFLY_4(reg2, reg14, reg6, reg10, loc0, loc1, reg14, reg2);
VP9_DOTP_CONST_PAIR(reg14, reg2, cospi_16_64, cospi_16_64, loc2, loc3);
VP9_DOTP_CONST_PAIR(reg0, reg8, cospi_16_64, cospi_16_64, reg0, reg8);
VP9_DOTP_CONST_PAIR(reg4, reg12, cospi_24_64, cospi_8_64, reg4, reg12);
DOTP_CONST_PAIR(reg14, reg2, cospi_16_64, cospi_16_64, loc2, loc3);
DOTP_CONST_PAIR(reg0, reg8, cospi_16_64, cospi_16_64, reg0, reg8);
DOTP_CONST_PAIR(reg4, reg12, cospi_24_64, cospi_8_64, reg4, reg12);
BUTTERFLY_4(reg8, reg0, reg4, reg12, reg2, reg6, reg10, reg14);
reg0 = reg2 - loc1;
......@@ -135,16 +135,16 @@ void vp9_idct16_1d_columns_addblk_msa(int16_t *input, uint8_t *dst,
reg10 = reg10 + loc2;
/* stage 2 */
VP9_DOTP_CONST_PAIR(reg1, reg15, cospi_30_64, cospi_2_64, reg1, reg15);
VP9_DOTP_CONST_PAIR(reg9, reg7, cospi_14_64, cospi_18_64, loc2, loc3);
DOTP_CONST_PAIR(reg1, reg15, cospi_30_64, cospi_2_64, reg1, reg15);
DOTP_CONST_PAIR(reg9, reg7, cospi_14_64, cospi_18_64, loc2, loc3);
reg9 = reg1 - loc2;
reg1 = reg1 + loc2;
reg7 = reg15 - loc3;
reg15 = reg15 + loc3;
VP9_DOTP_CONST_PAIR(reg5, reg11, cospi_22_64, cospi_10_64, reg5, reg11);
VP9_DOTP_CONST_PAIR(reg13, reg3, cospi_6_64, cospi_26_64, loc0, loc1);
DOTP_CONST_PAIR(reg5, reg11, cospi_22_64, cospi_10_64, reg5, reg11);
DOTP_CONST_PAIR(reg13, reg3, cospi_6_64, cospi_26_64, loc0, loc1);
BUTTERFLY_4(loc0, loc1, reg11, reg5, reg13, reg3, reg11, reg5);
loc1 = reg15 + reg3;
......@@ -160,8 +160,8 @@ void vp9_idct16_1d_columns_addblk_msa(int16_t *input, uint8_t *dst,
tmp7 = loc1;
reg0 = loc2;
VP9_DOTP_CONST_PAIR(reg7, reg9, cospi_24_64, cospi_8_64, reg7, reg9);
VP9_DOTP_CONST_PAIR((-reg5), (-reg11), cospi_8_64, cospi_24_64, reg5, reg11);
DOTP_CONST_PAIR(reg7, reg9, cospi_24_64, cospi_8_64, reg7, reg9);
DOTP_CONST_PAIR((-reg5), (-reg11), cospi_8_64, cospi_24_64, reg5, reg11);
loc0 = reg9 + reg5;
reg5 = reg9 - reg5;
......@@ -174,13 +174,13 @@ void vp9_idct16_1d_columns_addblk_msa(int16_t *input, uint8_t *dst,
loc2 = reg4 - loc0;
tmp5 = loc1;
VP9_DOTP_CONST_PAIR(reg5, reg11, cospi_16_64, cospi_16_64, reg5, reg11);
DOTP_CONST_PAIR(reg5, reg11, cospi_16_64, cospi_16_64, reg5, reg11);
BUTTERFLY_4(reg8, reg10, reg11, reg5, loc0, reg4, reg9, loc1);
reg10 = loc0;
reg11 = loc1;
VP9_DOTP_CONST_PAIR(reg3, reg13, cospi_16_64, cospi_16_64, reg3, reg13);
DOTP_CONST_PAIR(reg3, reg13, cospi_16_64, cospi_16_64, reg3, reg13);
BUTTERFLY_4(reg12, reg14, reg13, reg3, reg8, reg6, reg7, reg5);
reg13 = loc2;
......@@ -350,17 +350,17 @@ static void vp9_iadst16_1d_columns_addblk_msa(int16_t *input, uint8_t *dst,
k1 = VP9_SET_COSPI_PAIR(cospi_31_64, -cospi_1_64);
k2 = VP9_SET_COSPI_PAIR(cospi_17_64, cospi_15_64);
k3 = VP9_SET_COSPI_PAIR(cospi_15_64, -cospi_17_64);
VP9_MADD_BF(r15, r0, r7, r8, k0, k1, k2, k3, g0, g1, g2, g3);
MADD_BF(r15, r0, r7, r8, k0, k1, k2, k3, g0, g1, g2, g3);
k0 = VP9_SET_COSPI_PAIR(cospi_9_64, cospi_23_64);
k1 = VP9_SET_COSPI_PAIR(cospi_23_64, -cospi_9_64);
k2 = VP9_SET_COSPI_PAIR(cospi_25_64, cospi_7_64);
k3 = VP9_SET_COSPI_PAIR(cospi_7_64, -cospi_25_64);
VP9_MADD_BF(r11, r4, r3, r12, k0, k1, k2, k3, g8, g9, g10, g11);
MADD_BF(r11, r4, r3, r12, k0, k1, k2, k3, g8, g9, g10, g11);
BUTTERFLY_4(g0, g2, g10, g8, h8, h9, v2, v0);
k0 = VP9_SET_COSPI_PAIR(cospi_4_64, cospi_28_64);
k1 = VP9_SET_COSPI_PAIR(cospi_28_64, -cospi_4_64);
k2 = VP9_SET_COSPI_PAIR(-cospi_28_64, cospi_4_64);
VP9_MADD_BF(g1, g3, g9, g11, k0, k1, k2, k0, h0, h1, h2, h3);
MADD_BF(g1, g3, g9, g11, k0, k1, k2, k0, h0, h1, h2, h3);
r1 = LD_SH(input + 1 * 16);
r2 = LD_SH(input + 2 * 16);
......@@ -375,12 +375,12 @@ static void vp9_iadst16_1d_columns_addblk_msa(int16_t *input, uint8_t *dst,
k1 = VP9_SET_COSPI_PAIR(cospi_27_64, -cospi_5_64);
k2 = VP9_SET_COSPI_PAIR(cospi_21_64, cospi_11_64);
k3 = VP9_SET_COSPI_PAIR(cospi_11_64, -cospi_21_64);
VP9_MADD_BF(r13, r2, r5, r10, k0, k1, k2, k3, g4, g5, g6, g7);
MADD_BF(r13, r2, r5, r10, k0, k1, k2, k3, g4, g5, g6, g7);
k0 = VP9_SET_COSPI_PAIR(cospi_13_64, cospi_19_64);
k1 = VP9_SET_COSPI_PAIR(cospi_19_64, -cospi_13_64);
k2 = VP9_SET_COSPI_PAIR(cospi_29_64, cospi_3_64);
k3 = VP9_SET_COSPI_PAIR(cospi_3_64, -cospi_29_64);
VP9_MADD_BF(r9, r6, r1, r14, k0, k1, k2, k3, g12, g13, g14, g15);
MADD_BF(r9, r6, r1, r14, k0, k1, k2, k3, g12, g13, g14, g15);
BUTTERFLY_4(g4, g6, g14, g12, h10, h11, v6, v4);
BUTTERFLY_4(h8, h9, h11, h10, out0, out1, h11, h10);
out1 = -out1;
......@@ -397,7 +397,7 @@ static void vp9_iadst16_1d_columns_addblk_msa(int16_t *input, uint8_t *dst,
k0 = VP9_SET_COSPI_PAIR(cospi_12_64, cospi_20_64);
k1 = VP9_SET_COSPI_PAIR(-cospi_20_64, cospi_12_64);
k2 = VP9_SET_COSPI_PAIR(cospi_20_64, -cospi_12_64);
VP9_MADD_BF(g7, g5, g15, g13, k0, k1, k2, k0, h4, h5, h6, h7);
MADD_BF(g7, g5, g15, g13, k0, k1, k2, k0, h4, h5, h6, h7);
BUTTERFLY_4(h0, h2, h6, h4, out8, out9, out11, out10);
out8 = -out8;
......@@ -414,7 +414,7 @@ static void vp9_iadst16_1d_columns_addblk_msa(int16_t *input, uint8_t *dst,
k0 = VP9_SET_COSPI_PAIR(cospi_8_64, cospi_24_64);
k1 = VP9_SET_COSPI_PAIR(cospi_24_64, -cospi_8_64);
k2 = VP9_SET_COSPI_PAIR(-cospi_24_64, cospi_8_64);
VP9_MADD_BF(v0, v2, v4, v6, k0, k1, k2, k0, out4, out6, out5, out7);
MADD_BF(v0, v2, v4, v6, k0, k1, k2, k0, out4, out6, out5, out7);
out4 = -out4;
SRARI_H2_SH(out4, out5, 6);
dst4 = LD_UB(dst + 3 * dst_stride);
......@@ -426,7 +426,7 @@ static void vp9_iadst16_1d_columns_addblk_msa(int16_t *input, uint8_t *dst,
ST8x1_UB(res4, dst + 3 * dst_stride);
ST8x1_UB(res5, dst + 12 * dst_stride);
VP9_MADD_BF(h1, h3, h5, h7, k0, k1, k2, k0, out12, out14, out13, out15);
MADD_BF(h1, h3, h5, h7, k0, k1, k2, k0, out12, out14, out13, out15);
out13 = -out13;
SRARI_H2_SH(out12, out13, 6);
dst12 = LD_UB(dst + 2 * dst_stride);
......@@ -440,7 +440,7 @@ static void vp9_iadst16_1d_columns_addblk_msa(int16_t *input, uint8_t *dst,
k0 = VP9_SET_COSPI_PAIR(cospi_16_64, cospi_16_64);
k3 = VP9_SET_COSPI_PAIR(-cospi_16_64, cospi_16_64);
VP9_MADD_SHORT(out6, out7, k0, k3, out6, out7);
MADD_SHORT(out6, out7, k0, k3, out6, out7);
SRARI_H2_SH(out6, out7, 6);
dst6 = LD_UB(dst + 4 * dst_stride);
dst7 = LD_UB(dst + 11 * dst_stride);
......@@ -451,7 +451,7 @@ static void vp9_iadst16_1d_columns_addblk_msa(int16_t *input, uint8_t *dst,
ST8x1_UB(res6, dst + 4 * dst_stride);
ST8x1_UB(res7, dst + 11 * dst_stride);
VP9_MADD_SHORT(out10, out11, k0, k3, out10, out11);
MADD_SHORT(out10, out11, k0, k3, out10, out11);
SRARI_H2_SH(out10, out11, 6);
dst10 = LD_UB(dst + 6 * dst_stride);
dst11 = LD_UB(dst + 9 * dst_stride);
......@@ -464,7 +464,7 @@ static void vp9_iadst16_1d_columns_addblk_msa(int16_t *input, uint8_t *dst,
k1 = VP9_SET_COSPI_PAIR(-cospi_16_64, -cospi_16_64);
k2 = VP9_SET_COSPI_PAIR(cospi_16_64, -cospi_16_64);
VP9_MADD_SHORT(h10, h11, k1, k2, out2, out3);
MADD_SHORT(h10, h11, k1, k2, out2, out3);
SRARI_H2_SH(out2, out3, 6);
dst2 = LD_UB(dst + 7 * dst_stride);
dst3 = LD_UB(dst + 8 * dst_stride);
......@@ -475,7 +475,7 @@ static void vp9_iadst16_1d_columns_addblk_msa(int16_t *input, uint8_t *dst,
ST8x1_UB(res2, dst + 7 * dst_stride);
ST8x1_UB(res3, dst + 8 * dst_stride);
VP9_MADD_SHORT(out14, out15, k1, k2, out14, out15);
MADD_SHORT(out14, out15, k1, k2, out14, out15);
SRARI_H2_SH(out14, out15, 6);
dst14 = LD_UB(dst + 5 * dst_stride);
dst15 = LD_UB(dst + 10 * dst_stride);
......
......@@ -47,26 +47,26 @@ static void vp9_idct32x8_row_even_process_store(int16_t *tmp_buf,
/* Even stage 1 */
LD_SH8(tmp_buf, 32, reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7);
VP9_DOTP_CONST_PAIR(reg1, reg7, cospi_28_64, cospi_4_64, reg1, reg7);
VP9_DOTP_CONST_PAIR(reg5, reg3, cospi_12_64, cospi_20_64, reg5, reg3);
DOTP_CONST_PAIR(reg1, reg7, cospi_28_64, cospi_4_64, reg1, reg7);
DOTP_CONST_PAIR(reg5, reg3, cospi_12_64, cospi_20_64, reg5, reg3);
BUTTERFLY_4(reg1, reg7, reg3, reg5, vec1, vec3, vec2, vec0);
VP9_DOTP_CONST_PAIR(vec2, vec0, cospi_16_64, cospi_16_64, loc2, loc3);
DOTP_CONST_PAIR(vec2, vec0, cospi_16_64, cospi_16_64, loc2, loc3);
loc1 = vec3;
loc0 = vec1;
VP9_DOTP_CONST_PAIR(reg0, reg4, cospi_16_64, cospi_16_64, reg0, reg4);
VP9_DOTP_CONST_PAIR(reg2, reg6, cospi_24_64, cospi_8_64, reg2, reg6);
DOTP_CONST_PAIR(reg0, reg4, cospi_16_64, cospi_16_64, reg0, reg4);
DOTP_CONST_PAIR(reg2, reg6, cospi_24_64, cospi_8_64, reg2, reg6);
BUTTERFLY_4(reg4, reg0, reg2, reg6, vec1, vec3, vec2, vec0);
BUTTERFLY_4(vec0, vec1, loc1, loc0, stp3, stp0, stp7, stp4);
BUTTERFLY_4(vec2, vec3, loc3, loc2, stp2, stp1, stp6, stp5);
/* Even stage 2 */
LD_SH8((tmp_buf + 16), 32, reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7);
VP9_DOTP_CONST_PAIR(reg0, reg7, cospi_30_64, cospi_2_64, reg0, reg7);
VP9_DOTP_CONST_PAIR(reg4, reg3, cospi_14_64, cospi_18_64, reg4, reg3);
VP9_DOTP_CONST_PAIR(reg2, reg5, cospi_22_64, cospi_10_64, reg2, reg5);
VP9_DOTP_CONST_PAIR(reg6, reg1, cospi_6_64, cospi_26_64, reg6, reg1);
DOTP_CONST_PAIR(reg0, reg7, cospi_30_64, cospi_2_64, reg0, reg7);
DOTP_CONST_PAIR(reg4, reg3, cospi_14_64, cospi_18_64, reg4, reg3);
DOTP_CONST_PAIR(reg2, reg5, cospi_22_64, cospi_10_64, reg2, reg5);
DOTP_CONST_PAIR(reg6, reg1, cospi_6_64, cospi_26_64, reg6, reg1);
vec0 = reg0 + reg4;
reg0 = reg0 - reg4;
......@@ -84,16 +84,16 @@ static void vp9_idct32x8_row_even_process_store(int16_t *tmp_buf,
reg4 = reg5 - vec1;
reg5 = reg5 + vec1;
VP9_DOTP_CONST_PAIR(reg7, reg0, cospi_24_64, cospi_8_64, reg0, reg7);
VP9_DOTP_CONST_PAIR((-reg6), reg1, cospi_24_64, cospi_8_64, reg6, reg1);
DOTP_CONST_PAIR(reg7, reg0, cospi_24_64, cospi_8_64, reg0, reg7);
DOTP_CONST_PAIR((-reg6), reg1, cospi_24_64, cospi_8_64, reg6, reg1);
vec0 = reg0 - reg6;
reg0 = reg0 + reg6;
vec1 = reg7 - reg1;
reg7 = reg7 + reg1;
VP9_DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, reg6, reg1);
VP9_DOTP_CONST_PAIR(reg4, reg3, cospi_16_64, cospi_16_64, reg3, reg4);
DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, reg6, reg1);
DOTP_CONST_PAIR(reg4, reg3, cospi_16_64, cospi_16_64, reg3, reg4);
/* Even stage 3 : Dependency on Even stage 1 & Even stage 2 */
BUTTERFLY_4(stp0, stp1, reg7, reg5, loc1, loc3, loc2, loc0);
......@@ -137,10 +137,10 @@ static void vp9_idct32x8_row_odd_process_store(int16_t *tmp_buf,
reg6 = LD_SH(tmp_buf + 25 * 8);
reg7 = LD_SH(tmp_buf + 31 * 8);
VP9_DOTP_CONST_PAIR(reg0, reg7, cospi_31_64, cospi_1_64, reg0, reg7);
VP9_DOTP_CONST_PAIR(reg4, reg3, cospi_15_64, cospi_17_64, reg3, reg4);
VP9_DOTP_CONST_PAIR(reg2, reg5, cospi_23_64, cospi_9_64, reg2, reg5);
VP9_DOTP_CONST_PAIR(reg6, reg1, cospi_7_64, cospi_25_64, reg1, reg6);
DOTP_CONST_PAIR(reg0, reg7, cospi_31_64, cospi_1_64, reg0, reg7);
DOTP_CONST_PAIR(reg4, reg3, cospi_15_64, cospi_17_64, reg3, reg4);
DOTP_CONST_PAIR(reg2, reg5, cospi_23_64, cospi_9_64, reg2, reg5);
DOTP_CONST_PAIR(reg6, reg1, cospi_7_64, cospi_25_64, reg1, reg6);
vec0 = reg0 + reg3;
reg0 = reg0 - reg3;
......@@ -157,16 +157,16 @@ static void vp9_idct32x8_row_odd_process_store(int16_t *tmp_buf,
ST_SH2(vec0, vec1, (tmp_odd_buf + 4 * 8), 8);
SUB2(reg5, reg4, reg3, reg2, vec0, vec1);
VP9_DOTP_CONST_PAIR(vec1, vec0, cospi_24_64, cospi_8_64, vec0, vec1);
DOTP_CONST_PAIR(vec1, vec0, cospi_24_64, cospi_8_64, vec0, vec1);
ST_SH2(vec0, vec1, (tmp_odd_buf), 8);
/* 4 Stores */
VP9_DOTP_CONST_PAIR(reg7, reg0, cospi_28_64, cospi_4_64, reg0, reg7);
VP9_DOTP_CONST_PAIR(reg6, reg1, -cospi_4_64, cospi_28_64, reg1, reg6);
DOTP_CONST_PAIR(reg7, reg0, cospi_28_64, cospi_4_64, reg0, reg7);
DOTP_CONST_PAIR(reg6, reg1, -cospi_4_64, cospi_28_64, reg1, reg6);
BUTTERFLY_4(reg0, reg7, reg6, reg1, vec0, vec1, vec2, vec3);
ST_SH2(vec0, vec1, (tmp_odd_buf + 6 * 8), 8);
VP9_DOTP_CONST_PAIR(vec2, vec3, cospi_24_64, cospi_8_64, vec2, vec3);
DOTP_CONST_PAIR(vec2, vec3, cospi_24_64, cospi_8_64, vec2, vec3);
ST_SH2(vec2, vec3, (tmp_odd_buf + 2 * 8), 8);
/* Odd stage 2 */
......@@ -180,21 +180,21 @@ static void vp9_idct32x8_row_odd_process_store(int16_t *tmp_buf,
reg6 = LD_SH(tmp_buf + 27 * 8);
reg7 = LD_SH(tmp_buf + 29 * 8);
VP9_DOTP_CONST_PAIR(reg1, reg6, cospi_27_64, cospi_5_64, reg1, reg6);
VP9_DOTP_CONST_PAIR(reg5, reg2, cospi_11_64, cospi_21_64, reg2, reg5);
VP9_DOTP_CONST_PAIR(reg3, reg4, cospi_19_64, cospi_13_64, reg3, reg4);
VP9_DOTP_CONST_PAIR(reg7, reg0, cospi_3_64, cospi_29_64, reg0, reg7);
DOTP_CONST_PAIR(reg1, reg6, cospi_27_64, cospi_5_64, reg1, reg6);
DOTP_CONST_PAIR(reg5, reg2, cospi_11_64, cospi_21_64, reg2, reg5);
DOTP_CONST_PAIR(reg3, reg4, cospi_19_64, cospi_13_64, reg3, reg4);
DOTP_CONST_PAIR(reg7, reg0, cospi_3_64, cospi_29_64, reg0, reg7);
/* 4 Stores */
SUB4(reg1, reg2, reg6, reg5, reg0, reg3, reg7, reg4,
vec0, vec1, vec2, vec3);
VP9_DOTP_CONST_PAIR(vec1, vec0, cospi_12_64, cospi_20_64, loc0, loc1);
VP9_DOTP_CONST_PAIR(vec3, vec2, -cospi_20_64, cospi_12_64, loc2, loc3);
DOTP_CONST_PAIR(vec1, vec0, cospi_12_64, cospi_20_64, loc0, loc1);
DOTP_CONST_PAIR(vec3, vec2, -cospi_20_64, cospi_12_64, loc2, loc3);
BUTTERFLY_4(loc3, loc2, loc0, loc1, vec1, vec0, vec2, vec3);
ST_SH2(vec0, vec1, (tmp_odd_buf + 12 * 8), 3 * 8);
VP9_DOTP_CONST_PAIR(vec3, vec2, -cospi_8_64, cospi_24_64, vec0, vec1);
DOTP_CONST_PAIR(vec3, vec2, -cospi_8_64, cospi_24_64, vec0, vec1);
ST_SH2(vec0, vec1, (tmp_odd_buf + 10 * 8), 8);
/* 4 Stores */
......@@ -204,7 +204,7 @@ static void vp9_idct32x8_row_odd_process_store(int16_t *tmp_buf,
ST_SH(reg0, (tmp_odd_buf + 13 * 8));
ST_SH(reg1, (tmp_odd_buf + 14 * 8));
VP9_DOTP_CONST_PAIR(reg3, reg2, -cospi_8_64, cospi_24_64, reg0, reg1);
DOTP_CONST_PAIR(reg3, reg2, -cospi_8_64, cospi_24_64, reg0, reg1);
ST_SH2(reg0, reg1, (tmp_odd_buf + 8 * 8), 8);
/* Odd stage 3 : Dependency on Odd stage 1 & Odd stage 2 */
......@@ -218,10 +218,10 @@ static void vp9_idct32x8_row_odd_process_store(int16_t *tmp_buf,
ST_SH4(loc0, loc1, loc2, loc3, tmp_odd_buf, 8);
SUB2(reg0, reg4, reg1, reg5, vec0, vec1);
VP9_DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc0, loc1);
DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc0, loc1);
SUB2(reg2, reg6, reg3, reg7, vec0, vec1);
VP9_DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc2, loc3);
DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc2, loc3);
ST_SH4(loc0, loc1, loc2, loc3, (tmp_odd_buf + 8 * 8), 8);
/* Load 8 & Store 8 */
......@@ -233,10 +233,10 @@ static void vp9_idct32x8_row_odd_process_store(int16_t *tmp_buf,
ST_SH4(loc0, loc1, loc2, loc3, (tmp_odd_buf + 4 * 8), 8);
SUB2(reg0, reg4, reg3, reg7, vec0, vec1);
VP9_DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc0, loc1);
DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc0, loc1);
SUB2(reg1, reg5, reg2, reg6, vec0, vec1);
VP9_DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc2, loc3);
DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc2, loc3);
ST_SH4(loc0, loc1, loc2, loc3, (tmp_odd_buf + 12 * 8), 8);
}
......@@ -363,16 +363,16 @@ static void vp9_idct8x32_column_even_process_store(int16_t *tmp_buf,
LD_SH8(tmp_buf, (4 * 32), reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7);
tmp_buf += (2 * 32);
VP9_DOTP_CONST_PAIR(reg1, reg7, cospi_28_64, cospi_4_64, reg1, reg7);
VP9_DOTP_CONST_PAIR(reg5, reg3, cospi_12_64, cospi_20_64, reg5, reg3);
DOTP_CONST_PAIR(reg1, reg7, cospi_28_64, cospi_4_64, reg1, reg7);
DOTP_CONST_PAIR(reg5, reg3, cospi_12_64, cospi_20_64, reg5, reg3);
BUTTERFLY_4(reg1, reg7, reg3, reg5, vec1, vec3, vec2, vec0);
VP9_DOTP_CONST_PAIR(vec2, vec0, cospi_16_64, cospi_16_64, loc2, loc3);
DOTP_CONST_PAIR(vec2, vec0, cospi_16_64, cospi_16_64, loc2, loc3);
loc1 = vec3;
loc0 = vec1;
VP9_DOTP_CONST_PAIR(reg0, reg4, cospi_16_64, cospi_16_64, reg0, reg4);
VP9_DOTP_CONST_PAIR(reg2, reg6, cospi_24_64, cospi_8_64, reg2, reg6);
DOTP_CONST_PAIR(reg0, reg4, cospi_16_64, cospi_16_64, reg0, reg4);
DOTP_CONST_PAIR(reg2, reg6, cospi_24_64, cospi_8_64, reg2, reg6);
BUTTERFLY_4(reg4, reg0, reg2, reg6, vec1, vec3, vec2, vec0);
BUTTERFLY_4(vec0, vec1, loc1, loc0, stp3, stp0, stp7, stp4);
BUTTERFLY_4(vec2, vec3, loc3, loc2, stp2, stp1, stp6, stp5);
......@@ -381,10 +381,10 @@ static void vp9_idct8x32_column_even_process_store(int16_t *tmp_buf,
/* Load 8 */
LD_SH8(tmp_buf, (4 * 32), reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7);
VP9_DOTP_CONST_PAIR(reg0, reg7, cospi_30_64, cospi_2_64, reg0, reg7);
VP9_DOTP_CONST_PAIR(reg4, reg3, cospi_14_64, cospi_18_64, reg4, reg3);
VP9_DOTP_CONST_PAIR(reg2, reg5, cospi_22_64, cospi_10_64, reg2, reg5);
VP9_DOTP_CONST_PAIR(reg6, reg1, cospi_6_64, cospi_26_64, reg6, reg1);
DOTP_CONST_PAIR(reg0, reg7, cospi_30_64, cospi_2_64, reg0, reg7);
DOTP_CONST_PAIR(reg4, reg3, cospi_14_64, cospi_18_64, reg4, reg3);
DOTP_CONST_PAIR(reg2, reg5, cospi_22_64, cospi_10_64, reg2, reg5);
DOTP_CONST_PAIR(reg6, reg1, cospi_6_64, cospi_26_64, reg6, reg1);
vec0 = reg0 + reg4;
reg0 = reg0 - reg4;
......@@ -402,16 +402,16 @@ static void vp9_idct8x32_column_even_process_store(int16_t *tmp_buf,
reg4 = reg5 - vec1;
reg5 = reg5 + vec1;
VP9_DOTP_CONST_PAIR(reg7, reg0, cospi_24_64, cospi_8_64, reg0, reg7);
VP9_DOTP_CONST_PAIR((-reg6), reg1, cospi_24_64, cospi_8_64, reg6, reg1);
DOTP_CONST_PAIR(reg7, reg0, cospi_24_64, cospi_8_64, reg0, reg7);
DOTP_CONST_PAIR((-reg6), reg1, cospi_24_64, cospi_8_64, reg6, reg1);
vec0 = reg0 - reg6;
reg0 = reg0 + reg6;
vec1 = reg7 - reg1;
reg7 = reg7 + reg1;
VP9_DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, reg6, reg1);
VP9_DOTP_CONST_PAIR(reg4, reg3, cospi_16_64, cospi_16_64, reg3, reg4);
DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, reg6, reg1);
DOTP_CONST_PAIR(reg4, reg3, cospi_16_64, cospi_16_64, reg3, reg4);
/* Even stage 3 : Dependency on Even stage 1 & Even stage 2 */
/* Store 8 */
......@@ -448,10 +448,10 @@ static void vp9_idct8x32_column_odd_process_store(int16_t *tmp_buf,
reg6 = LD_SH(tmp_buf + 25 * 32);
reg7 = LD_SH(tmp_buf + 31 * 32);
VP9_DOTP_CONST_PAIR(reg0, reg7, cospi_31_64, cospi_1_64, reg0, reg7);
VP9_DOTP_CONST_PAIR(reg4, reg3, cospi_15_64, cospi_17_64, reg3, reg4);
VP9_DOTP_CONST_PAIR(reg2, reg5, cospi_23_64, cospi_9_64, reg2, reg5);
VP9_DOTP_CONST_PAIR(reg6, reg1, cospi_7_64, cospi_25_64, reg1, reg6);
DOTP_CONST_PAIR(reg0, reg7, cospi_31_64, cospi_1_64, reg0, reg7);
DOTP_CONST_PAIR(reg4, reg3, cospi_15_64, cospi_17_64, reg3, reg4);
DOTP_CONST_PAIR(reg2, reg5, cospi_23_64, cospi_9_64, reg2, reg5);
DOTP_CONST_PAIR(reg6, reg1, cospi_7_64, cospi_25_64, reg1, reg6);
vec0 = reg0 + reg3;
reg0 = reg0 - reg3;
......@@ -467,15 +467,15 @@ static void vp9_idct8x32_column_odd_process_store(int16_t *tmp_buf,
ADD2(reg5, reg4, reg3, reg2, vec0, vec1);
ST_SH2(vec0, vec1, (tmp_odd_buf + 4 * 8), 8);
SUB2(reg5, reg4, reg3, reg2, vec0, vec1);
VP9_DOTP_CONST_PAIR(vec1, vec0, cospi_24_64, cospi_8_64, vec0, vec1);
DOTP_CONST_PAIR(vec1, vec0, cospi_24_64, cospi_8_64, vec0, vec1);
ST_SH2(vec0, vec1, tmp_odd_buf, 8);
/* 4 Stores */
VP9_DOTP_CONST_PAIR(reg7, reg0, cospi_28_64, cospi_4_64, reg0, reg7);
VP9_DOTP_CONST_PAIR(reg6, reg1, -cospi_4_64, cospi_28_64, reg1, reg6);
DOTP_CONST_PAIR(reg7, reg0, cospi_28_64, cospi_4_64, reg0, reg7);
DOTP_CONST_PAIR(reg6, reg1, -cospi_4_64, cospi_28_64, reg1, reg6);
BUTTERFLY_4(reg0, reg7, reg6, reg1, vec0, vec1, vec2, vec3);
ST_SH2(vec0, vec1, (tmp_odd_buf + 6 * 8), 8);
VP9_DOTP_CONST_PAIR(vec2, vec3, cospi_24_64, cospi_8_64, vec2, vec3);
DOTP_CONST_PAIR(vec2, vec3, cospi_24_64, cospi_8_64, vec2, vec3);
ST_SH2(vec2, vec3, (tmp_odd_buf + 2 * 8), 8);
/* Odd stage 2 */
......@@ -489,25 +489,25 @@ static void vp9_idct8x32_column_odd_process_store(int16_t *tmp_buf,
reg6 = LD_SH(tmp_buf + 27 * 32);
reg7 = LD_SH(tmp_buf + 29 * 32);
VP9_DOTP_CONST_PAIR(reg1, reg6, cospi_27_64, cospi_5_64, reg1, reg6);
VP9_DOTP_CONST_PAIR(reg5, reg2, cospi_11_64, cospi_21_64, reg2, reg5);
VP9_DOTP_CONST_PAIR(reg3, reg4, cospi_19_64, cospi_13_64, reg3, reg4);
VP9_DOTP_CONST_PAIR(reg7, reg0, cospi_3_64, cospi_29_64, reg0, reg7);
DOTP_CONST_PAIR(reg1, reg6, cospi_27_64, cospi_5_64, reg1, reg6);
DOTP_CONST_PAIR(reg5, reg2, cospi_11_64, cospi_21_64, reg2, reg5);
DOTP_CONST_PAIR(reg3, reg4, cospi_19_64, cospi_13_64, reg3, reg4);
DOTP_CONST_PAIR(reg7, reg0, cospi_3_64, cospi_29_64, reg0, reg7);
/* 4 Stores */
SUB4(reg1, reg2, reg6, reg5, reg0, reg3, reg7, reg4, vec0, vec1, vec2, vec3);
VP9_DOTP_CONST_PAIR(vec1, vec0, cospi_12_64, cospi_20_64, loc0, loc1);
VP9_DOTP_CONST_PAIR(vec3, vec2, -cospi_20_64, cospi_12_64, loc2, loc3);
DOTP_CONST_PAIR(vec1, vec0, cospi_12_64, cospi_20_64, loc0, loc1);
DOTP_CONST_PAIR(vec3, vec2, -cospi_20_64, cospi_12_64, loc2, loc3);
BUTTERFLY_4(loc2, loc3, loc1, loc0, vec0, vec1, vec3, vec2);
ST_SH2(vec0, vec1, (tmp_odd_buf + 12 * 8), 3 * 8);
VP9_DOTP_CONST_PAIR(vec3, vec2, -cospi_8_64, cospi_24_64, vec0, vec1);
DOTP_CONST_PAIR(vec3, vec2, -cospi_8_64, cospi_24_64, vec0, vec1);
ST_SH2(vec0, vec1, (tmp_odd_buf + 10 * 8), 8);
/* 4 Stores */
ADD4(reg0, reg3, reg1, reg2, reg5, reg6, reg4, reg7, vec0, vec1, vec2, vec3);
BUTTERFLY_4(vec0, vec3, vec2, vec1, reg0, reg1, reg3, reg2);
ST_SH2(reg0, reg1, (tmp_odd_buf + 13 * 8), 8);
VP9_DOTP_CONST_PAIR(reg3, reg2, -cospi_8_64, cospi_24_64, reg0, reg1);
DOTP_CONST_PAIR(reg3, reg2, -cospi_8_64, cospi_24_64, reg0, reg1);
ST_SH2(reg0, reg1, (tmp_odd_buf + 8 * 8), 8);
/* Odd stage 3 : Dependency on Odd stage 1 & Odd stage 2 */
......@@ -519,10 +519,10 @@ static void vp9_idct8x32_column_odd_process_store(int16_t *tmp_buf,
ST_SH4(loc0, loc1, loc2, loc3, tmp_odd_buf, 8);
SUB2(reg0, reg4, reg1, reg5, vec0, vec1);
VP9_DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc0, loc1);
DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc0, loc1);
SUB2(reg2, reg6, reg3, reg7, vec0, vec1);
VP9_DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc2, loc3);
DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc2, loc3);
ST_SH4(loc0, loc1, loc2, loc3, (tmp_odd_buf + 8 * 8), 8);
/* Load 8 & Store 8 */
......@@ -533,10 +533,10 @@ static void vp9_idct8x32_column_odd_process_store(int16_t *tmp_buf,
ST_SH4(loc0, loc1, loc2, loc3, (tmp_odd_buf + 4 * 8), 8);
SUB2(reg0, reg4, reg3, reg7, vec0, vec1);
VP9_DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc0, loc1);
DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc0, loc1);
SUB2(reg1, reg5, reg2, reg6, vec0, vec1);
VP9_DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc2, loc3);
DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc2, loc3);
ST_SH4(loc0, loc1, loc2, loc3, (tmp_odd_buf + 12 * 8), 8);
}
......
......@@ -14,52 +14,7 @@
#include "vpx_ports/mem.h"
#include "vp9/common/vp9_idct.h"
#include "vpx_dsp/mips/macros_msa.h"
#define VP9_DOTP_CONST_PAIR(reg0, reg1, cnst0, cnst1, out0, out1) { \
v8i16 k0_m = __msa_fill_h(cnst0); \
v4i32 s0_m, s1_m, s2_m, s3_m; \
\
s0_m = (v4i32)__msa_fill_h(cnst1); \
k0_m = __msa_ilvev_h((v8i16)s0_m, k0_m); \
\
ILVRL_H2_SW((-reg1), reg0, s1_m, s0_m); \
ILVRL_H2_SW(reg0, reg1, s3_m, s2_m); \
DOTP_SH2_SW(s1_m, s0_m, k0_m, k0_m, s1_m, s0_m); \
SRARI_W2_SW(s1_m, s0_m, DCT_CONST_BITS); \
out0 = __msa_pckev_h((v8i16)s0_m, (v8i16)s1_m); \
\
DOTP_SH2_SW(s3_m, s2_m, k0_m, k0_m, s1_m, s0_m); \
SRARI_W2_SW(s1_m, s0_m, DCT_CONST_BITS); \
out1 = __msa_pckev_h((v8i16)s0_m, (v8i16)s1_m); \
}
#define VP9_DOT_ADD_SUB_SRARI_PCK(in0, in1, in2, in3, in4, in5, in6, in7, \
dst0, dst1, dst2, dst3) { \
v4i32 tp0_m, tp1_m, tp2_m, tp3_m, tp4_m; \
v4i32 tp5_m, tp6_m, tp7_m, tp8_m, tp9_m; \
\
DOTP_SH4_SW(in0, in1, in0, in1, in4, in4, in5, in5, \
tp0_m, tp2_m, tp3_m, tp4_m); \
DOTP_SH4_SW(in2, in3, in2, in3, in6, in6, in7, in7, \
tp5_m, tp6_m, tp7_m, tp8_m); \
BUTTERFLY_4(tp0_m, tp3_m, tp7_m, tp5_m, tp1_m, tp9_m, tp7_m, tp5_m); \
BUTTERFLY_4(tp2_m, tp4_m, tp8_m, tp6_m, tp3_m, tp0_m, tp4_m, tp2_m); \
SRARI_W4_SW(tp1_m, tp9_m, tp7_m, tp5_m, DCT_CONST_BITS); \
SRARI_W4_SW(tp3_m, tp0_m, tp4_m, tp2_m, DCT_CONST_BITS); \
PCKEV_H4_SH(tp1_m, tp3_m, tp9_m, tp0_m, tp7_m, tp4_m, tp5_m, tp2_m, \
dst0, dst1, dst2, dst3); \
}