Commit 13d2aee7 authored by Yi Luo
Browse files

Add the missing IDTX type optimization to hybrid txfm

Change-Id: I99b15e5270bfefe2eb3e982aeba06ed564540d73
parent 72e2e982
...@@ -470,6 +470,10 @@ void av1_iht16x16_256_add_avx2(const tran_low_t *input, uint8_t *dest, ...@@ -470,6 +470,10 @@ void av1_iht16x16_256_add_avx2(const tran_low_t *input, uint8_t *dest,
iadst16(in); iadst16(in);
flip_col(&dest, &stride, 16); flip_col(&dest, &stride, 16);
break; break;
case IDTX:
iidtx16(in);
iidtx16(in);
break;
case V_DCT: case V_DCT:
iidtx16(in); iidtx16(in);
idct16(in); idct16(in);
......
...@@ -494,6 +494,10 @@ void av1_iht16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, ...@@ -494,6 +494,10 @@ void av1_iht16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest,
aom_iadst16_sse2(in0, in1); aom_iadst16_sse2(in0, in1);
FLIPUD_PTR(dest, stride, 16); FLIPUD_PTR(dest, stride, 16);
break; break;
case IDTX:
iidtx16_sse2(in0, in1);
iidtx16_sse2(in0, in1);
break;
case V_DCT: case V_DCT:
iidtx16_sse2(in0, in1); iidtx16_sse2(in0, in1);
aom_idct16_sse2(in0, in1); aom_idct16_sse2(in0, in1);
......
...@@ -52,12 +52,7 @@ static void fwd_txfm_4x4(const int16_t *src_diff, tran_low_t *coeff, ...@@ -52,12 +52,7 @@ static void fwd_txfm_4x4(const int16_t *src_diff, tran_low_t *coeff,
return; return;
} }
#if CONFIG_EXT_TX av1_fht4x4(src_diff, coeff, diff_stride, tx_type);
if (tx_type == IDTX)
av1_fwd_idtx_c(src_diff, coeff, diff_stride, 4, tx_type);
else
#endif
av1_fht4x4(src_diff, coeff, diff_stride, tx_type);
} }
static void fwd_txfm_4x8(const int16_t *src_diff, tran_low_t *coeff, static void fwd_txfm_4x8(const int16_t *src_diff, tran_low_t *coeff,
...@@ -106,36 +101,21 @@ static void fwd_txfm_8x8(const int16_t *src_diff, tran_low_t *coeff, ...@@ -106,36 +101,21 @@ static void fwd_txfm_8x8(const int16_t *src_diff, tran_low_t *coeff,
int diff_stride, TX_TYPE tx_type, int diff_stride, TX_TYPE tx_type,
FWD_TXFM_OPT fwd_txfm_opt) { FWD_TXFM_OPT fwd_txfm_opt) {
(void)fwd_txfm_opt; (void)fwd_txfm_opt;
#if CONFIG_EXT_TX av1_fht8x8(src_diff, coeff, diff_stride, tx_type);
if (tx_type == IDTX)
av1_fwd_idtx_c(src_diff, coeff, diff_stride, 8, tx_type);
else
#endif
av1_fht8x8(src_diff, coeff, diff_stride, tx_type);
} }
static void fwd_txfm_16x16(const int16_t *src_diff, tran_low_t *coeff, static void fwd_txfm_16x16(const int16_t *src_diff, tran_low_t *coeff,
int diff_stride, TX_TYPE tx_type, int diff_stride, TX_TYPE tx_type,
FWD_TXFM_OPT fwd_txfm_opt) { FWD_TXFM_OPT fwd_txfm_opt) {
(void)fwd_txfm_opt; (void)fwd_txfm_opt;
#if CONFIG_EXT_TX av1_fht16x16(src_diff, coeff, diff_stride, tx_type);
if (tx_type == IDTX)
av1_fwd_idtx_c(src_diff, coeff, diff_stride, 16, tx_type);
else
#endif
av1_fht16x16(src_diff, coeff, diff_stride, tx_type);
} }
static void fwd_txfm_32x32(const int16_t *src_diff, tran_low_t *coeff, static void fwd_txfm_32x32(const int16_t *src_diff, tran_low_t *coeff,
int diff_stride, TX_TYPE tx_type, int diff_stride, TX_TYPE tx_type,
FWD_TXFM_OPT fwd_txfm_opt) { FWD_TXFM_OPT fwd_txfm_opt) {
(void)fwd_txfm_opt; (void)fwd_txfm_opt;
#if CONFIG_EXT_TX av1_fht32x32(src_diff, coeff, diff_stride, tx_type);
if (tx_type == IDTX)
av1_fwd_idtx_c(src_diff, coeff, diff_stride, 32, tx_type);
else
#endif
av1_fht32x32(src_diff, coeff, diff_stride, tx_type);
} }
#if CONFIG_TX64X64 #if CONFIG_TX64X64
......
...@@ -257,6 +257,12 @@ void av1_fht4x4_sse2(const int16_t *input, tran_low_t *output, int stride, ...@@ -257,6 +257,12 @@ void av1_fht4x4_sse2(const int16_t *input, tran_low_t *output, int stride,
fadst4_sse2(in); fadst4_sse2(in);
write_buffer_4x4(output, in); write_buffer_4x4(output, in);
break; break;
case IDTX:
load_buffer_4x4(input, in, stride, 0, 0);
fidtx4_sse2(in);
fidtx4_sse2(in);
write_buffer_4x4(output, in);
break;
case V_DCT: case V_DCT:
load_buffer_4x4(input, in, stride, 0, 0); load_buffer_4x4(input, in, stride, 0, 0);
fdct4_sse2(in); fdct4_sse2(in);
...@@ -1357,6 +1363,13 @@ void av1_fht8x8_sse2(const int16_t *input, tran_low_t *output, int stride, ...@@ -1357,6 +1363,13 @@ void av1_fht8x8_sse2(const int16_t *input, tran_low_t *output, int stride,
right_shift_8x8(in, 1); right_shift_8x8(in, 1);
write_buffer_8x8(output, in, 8); write_buffer_8x8(output, in, 8);
break; break;
case IDTX:
load_buffer_8x8(input, in, stride, 0, 0);
fidtx8_sse2(in);
fidtx8_sse2(in);
right_shift_8x8(in, 1);
write_buffer_8x8(output, in, 8);
break;
case V_DCT: case V_DCT:
load_buffer_8x8(input, in, stride, 0, 0); load_buffer_8x8(input, in, stride, 0, 0);
fdct8_sse2(in); fdct8_sse2(in);
...@@ -2579,6 +2592,13 @@ void av1_fht16x16_sse2(const int16_t *input, tran_low_t *output, int stride, ...@@ -2579,6 +2592,13 @@ void av1_fht16x16_sse2(const int16_t *input, tran_low_t *output, int stride,
fadst16_sse2(in0, in1); fadst16_sse2(in0, in1);
write_buffer_16x16(output, in0, in1, 16); write_buffer_16x16(output, in0, in1, 16);
break; break;
case IDTX:
load_buffer_16x16(input, in0, in1, stride, 0, 0);
fidtx16_sse2(in0, in1);
right_shift_16x16(in0, in1);
fidtx16_sse2(in0, in1);
write_buffer_16x16(output, in0, in1, 16);
break;
case V_DCT: case V_DCT:
load_buffer_16x16(input, in0, in1, stride, 0, 0); load_buffer_16x16(input, in0, in1, stride, 0, 0);
fdct16_sse2(in0, in1); fdct16_sse2(in0, in1);
......
...@@ -1025,6 +1025,13 @@ void av1_fht16x16_avx2(const int16_t *input, tran_low_t *output, int stride, ...@@ -1025,6 +1025,13 @@ void av1_fht16x16_avx2(const int16_t *input, tran_low_t *output, int stride,
right_shift_16x16(in); right_shift_16x16(in);
fadst16_avx2(in); fadst16_avx2(in);
break; break;
case IDTX:
load_buffer_16x16(input, stride, 0, 0, in);
fidtx16_avx2(in);
mm256_transpose_16x16(in);
right_shift_16x16(in);
fidtx16_avx2(in);
break;
case V_DCT: case V_DCT:
load_buffer_16x16(input, stride, 0, 0, in); load_buffer_16x16(input, stride, 0, 0, in);
fdct16_avx2(in); fdct16_avx2(in);
...@@ -1621,6 +1628,12 @@ void av1_fht32x32_avx2(const int16_t *input, tran_low_t *output, int stride, ...@@ -1621,6 +1628,12 @@ void av1_fht32x32_avx2(const int16_t *input, tran_low_t *output, int stride,
right_shift_32x32(in0, in1); right_shift_32x32(in0, in1);
fhalfright32_avx2(in0, in1); fhalfright32_avx2(in0, in1);
break; break;
case IDTX:
load_buffer_32x32(input, stride, 0, 0, in0, in1);
fidtx32_avx2(in0, in1);
right_shift_32x32(in0, in1);
fidtx32_avx2(in0, in1);
break;
case V_DCT: case V_DCT:
load_buffer_32x32(input, stride, 0, 0, in0, in1); load_buffer_32x32(input, stride, 0, 0, in0, in1);
fdct32_avx2(in0, in1); fdct32_avx2(in0, in1);
......
...@@ -184,6 +184,8 @@ const Ht16x16Param kArrayHt16x16Param_sse2[] = { ...@@ -184,6 +184,8 @@ const Ht16x16Param kArrayHt16x16Param_sse2[] = {
256), 256),
make_tuple(&av1_fht16x16_sse2, &av1_iht16x16_256_add_sse2, 8, AOM_BITS_8, make_tuple(&av1_fht16x16_sse2, &av1_iht16x16_256_add_sse2, 8, AOM_BITS_8,
256), 256),
make_tuple(&av1_fht16x16_sse2, &av1_iht16x16_256_add_sse2, 9, AOM_BITS_8,
256),
make_tuple(&av1_fht16x16_sse2, &av1_iht16x16_256_add_sse2, 10, AOM_BITS_8, make_tuple(&av1_fht16x16_sse2, &av1_iht16x16_256_add_sse2, 10, AOM_BITS_8,
256), 256),
make_tuple(&av1_fht16x16_sse2, &av1_iht16x16_256_add_sse2, 11, AOM_BITS_8, make_tuple(&av1_fht16x16_sse2, &av1_iht16x16_256_add_sse2, 11, AOM_BITS_8,
...@@ -223,6 +225,8 @@ const Ht16x16Param kArrayHt16x16Param_avx2[] = { ...@@ -223,6 +225,8 @@ const Ht16x16Param kArrayHt16x16Param_avx2[] = {
256), 256),
make_tuple(&av1_fht16x16_avx2, &av1_iht16x16_256_add_avx2, 8, AOM_BITS_8, make_tuple(&av1_fht16x16_avx2, &av1_iht16x16_256_add_avx2, 8, AOM_BITS_8,
256), 256),
make_tuple(&av1_fht16x16_avx2, &av1_iht16x16_256_add_avx2, 9, AOM_BITS_8,
256),
make_tuple(&av1_fht16x16_avx2, &av1_iht16x16_256_add_avx2, 10, AOM_BITS_8, make_tuple(&av1_fht16x16_avx2, &av1_iht16x16_256_add_avx2, 10, AOM_BITS_8,
256), 256),
make_tuple(&av1_fht16x16_avx2, &av1_iht16x16_256_add_avx2, 11, AOM_BITS_8, make_tuple(&av1_fht16x16_avx2, &av1_iht16x16_256_add_avx2, 11, AOM_BITS_8,
......
...@@ -177,6 +177,7 @@ const Ht4x4Param kArrayHt4x4Param_sse2[] = { ...@@ -177,6 +177,7 @@ const Ht4x4Param kArrayHt4x4Param_sse2[] = {
make_tuple(&av1_fht4x4_sse2, &av1_iht4x4_16_add_sse2, 6, AOM_BITS_8, 16), make_tuple(&av1_fht4x4_sse2, &av1_iht4x4_16_add_sse2, 6, AOM_BITS_8, 16),
make_tuple(&av1_fht4x4_sse2, &av1_iht4x4_16_add_sse2, 7, AOM_BITS_8, 16), make_tuple(&av1_fht4x4_sse2, &av1_iht4x4_16_add_sse2, 7, AOM_BITS_8, 16),
make_tuple(&av1_fht4x4_sse2, &av1_iht4x4_16_add_sse2, 8, AOM_BITS_8, 16), make_tuple(&av1_fht4x4_sse2, &av1_iht4x4_16_add_sse2, 8, AOM_BITS_8, 16),
make_tuple(&av1_fht4x4_sse2, &av1_iht4x4_16_add_sse2, 9, AOM_BITS_8, 16),
make_tuple(&av1_fht4x4_sse2, &av1_iht4x4_16_add_sse2, 10, AOM_BITS_8, 16), make_tuple(&av1_fht4x4_sse2, &av1_iht4x4_16_add_sse2, 10, AOM_BITS_8, 16),
make_tuple(&av1_fht4x4_sse2, &av1_iht4x4_16_add_sse2, 11, AOM_BITS_8, 16), make_tuple(&av1_fht4x4_sse2, &av1_iht4x4_16_add_sse2, 11, AOM_BITS_8, 16),
make_tuple(&av1_fht4x4_sse2, &av1_iht4x4_16_add_sse2, 12, AOM_BITS_8, 16), make_tuple(&av1_fht4x4_sse2, &av1_iht4x4_16_add_sse2, 12, AOM_BITS_8, 16),
......
...@@ -177,6 +177,7 @@ const Ht8x8Param kArrayHt8x8Param_sse2[] = { ...@@ -177,6 +177,7 @@ const Ht8x8Param kArrayHt8x8Param_sse2[] = {
make_tuple(&av1_fht8x8_sse2, &av1_iht8x8_64_add_sse2, 6, AOM_BITS_8, 64), make_tuple(&av1_fht8x8_sse2, &av1_iht8x8_64_add_sse2, 6, AOM_BITS_8, 64),
make_tuple(&av1_fht8x8_sse2, &av1_iht8x8_64_add_sse2, 7, AOM_BITS_8, 64), make_tuple(&av1_fht8x8_sse2, &av1_iht8x8_64_add_sse2, 7, AOM_BITS_8, 64),
make_tuple(&av1_fht8x8_sse2, &av1_iht8x8_64_add_sse2, 8, AOM_BITS_8, 64), make_tuple(&av1_fht8x8_sse2, &av1_iht8x8_64_add_sse2, 8, AOM_BITS_8, 64),
make_tuple(&av1_fht8x8_sse2, &av1_iht8x8_64_add_sse2, 9, AOM_BITS_8, 64),
make_tuple(&av1_fht8x8_sse2, &av1_iht8x8_64_add_sse2, 10, AOM_BITS_8, 64), make_tuple(&av1_fht8x8_sse2, &av1_iht8x8_64_add_sse2, 10, AOM_BITS_8, 64),
make_tuple(&av1_fht8x8_sse2, &av1_iht8x8_64_add_sse2, 11, AOM_BITS_8, 64), make_tuple(&av1_fht8x8_sse2, &av1_iht8x8_64_add_sse2, 11, AOM_BITS_8, 64),
make_tuple(&av1_fht8x8_sse2, &av1_iht8x8_64_add_sse2, 12, AOM_BITS_8, 64), make_tuple(&av1_fht8x8_sse2, &av1_iht8x8_64_add_sse2, 12, AOM_BITS_8, 64),
......
...@@ -201,6 +201,7 @@ const Ht32x32Param kArrayHt32x32Param_avx2[] = { ...@@ -201,6 +201,7 @@ const Ht32x32Param kArrayHt32x32Param_avx2[] = {
make_tuple(&av1_fht32x32_avx2, &dummy_inv_txfm, 6, AOM_BITS_8, 1024), make_tuple(&av1_fht32x32_avx2, &dummy_inv_txfm, 6, AOM_BITS_8, 1024),
make_tuple(&av1_fht32x32_avx2, &dummy_inv_txfm, 7, AOM_BITS_8, 1024), make_tuple(&av1_fht32x32_avx2, &dummy_inv_txfm, 7, AOM_BITS_8, 1024),
make_tuple(&av1_fht32x32_avx2, &dummy_inv_txfm, 8, AOM_BITS_8, 1024), make_tuple(&av1_fht32x32_avx2, &dummy_inv_txfm, 8, AOM_BITS_8, 1024),
make_tuple(&av1_fht32x32_avx2, &dummy_inv_txfm, 9, AOM_BITS_8, 1024),
make_tuple(&av1_fht32x32_avx2, &dummy_inv_txfm, 10, AOM_BITS_8, 1024), make_tuple(&av1_fht32x32_avx2, &dummy_inv_txfm, 10, AOM_BITS_8, 1024),
make_tuple(&av1_fht32x32_avx2, &dummy_inv_txfm, 11, AOM_BITS_8, 1024), make_tuple(&av1_fht32x32_avx2, &dummy_inv_txfm, 11, AOM_BITS_8, 1024),
make_tuple(&av1_fht32x32_avx2, &dummy_inv_txfm, 12, AOM_BITS_8, 1024), make_tuple(&av1_fht32x32_avx2, &dummy_inv_txfm, 12, AOM_BITS_8, 1024),
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment