diff --git a/aom_dsp/aom_dsp_rtcd_defs.pl b/aom_dsp/aom_dsp_rtcd_defs.pl index 71edf88fdd6d50ed8f8ff34b8c795a18e670ea4f..2bf4c3e926d70ba99450bb6225a7b07f583839b5 100755 --- a/aom_dsp/aom_dsp_rtcd_defs.pl +++ b/aom_dsp/aom_dsp_rtcd_defs.pl @@ -395,18 +395,6 @@ if (aom_config("CONFIG_AV1") eq "yes") { add_proto qw/void aom_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; specialize qw/aom_iwht4x4_16_add sse2/; - add_proto qw/void aom_highbd_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd"; - - add_proto qw/void aom_highbd_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd"; - - add_proto qw/void aom_highbd_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd"; - - add_proto qw/void aom_highbd_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd"; - - add_proto qw/void aom_highbd_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd"; - - add_proto qw/void aom_highbd_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd"; - add_proto qw/void aom_highbd_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd"; add_proto qw/void aom_highbd_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd"; @@ -452,9 +440,6 @@ if (aom_config("CONFIG_AV1") eq "yes") { add_proto qw/void aom_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; specialize qw/aom_idct32x32_1_add sse2 avx2/; - - add_proto qw/void aom_highbd_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd"; - specialize qw/aom_highbd_idct4x4_16_add sse2/; } if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") { } else { diff --git a/aom_dsp/inv_txfm.c b/aom_dsp/inv_txfm.c index 8b2c6a3b78cd07c38b586c3ec83f791a77978af3..39c9cf5775bb90a72e215418038b54764e38f982 100644 --- a/aom_dsp/inv_txfm.c +++ b/aom_dsp/inv_txfm.c @@ -1404,72 +1404,3 @@ void aom_highbd_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest8, dest++; } } - -// TODO(sarahparker) this one still needs to be removed but will be done in -// a followup because of its use in encoder/encodemb.c -void aom_highbd_idct4_c(const tran_low_t *input, tran_low_t *output, int bd) { - tran_low_t step[4]; - tran_high_t temp1, temp2; - (void)bd; - // stage 1 - temp1 = (input[0] + input[2]) * cospi_16_64; - temp2 = (input[0] - input[2]) * cospi_16_64; - step[0] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); - step[1] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); - temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64; - temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64; - step[2] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); - step[3] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); - - // stage 2 - output[0] = HIGHBD_WRAPLOW(step[0] + step[3], bd); - output[1] = HIGHBD_WRAPLOW(step[1] + step[2], bd); - output[2] = HIGHBD_WRAPLOW(step[1] - step[2], bd); - output[3] = HIGHBD_WRAPLOW(step[0] - step[3], bd); -} - -void aom_highbd_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest8, - int stride, int bd) { - tran_low_t out[4 * 4]; - tran_low_t *outptr = out; - int i, j; - tran_low_t temp_in[4], temp_out[4]; - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); - - // Rows - for (i = 0; i < 4; ++i) { - aom_highbd_idct4_c(input, outptr, bd); - input += 4; - outptr += 4; - } - - // Columns - for (i = 0; i < 4; ++i) { - for (j = 0; j < 4; ++j) temp_in[j] = out[j * 4 + i]; - aom_highbd_idct4_c(temp_in, temp_out, bd); - for (j = 0; j < 4; ++j) { - dest[j * stride + i] = highbd_clip_pixel_add( - dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 4), bd); - } - } -} - -void aom_highbd_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest8, - int dest_stride, int bd) { - int i; - tran_high_t a1; - tran_low_t out = - HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd); - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); - - out = HIGHBD_WRAPLOW(dct_const_round_shift(out * cospi_16_64), bd); - a1 = ROUND_POWER_OF_TWO(out, 4); - - for (i = 0; i < 4; i++) { - dest[0] = highbd_clip_pixel_add(dest[0], a1, bd); - dest[1] = highbd_clip_pixel_add(dest[1], a1, bd); - dest[2] = highbd_clip_pixel_add(dest[2], a1, bd); - dest[3] = highbd_clip_pixel_add(dest[3], a1, bd); - dest += dest_stride; - } -} diff --git a/aom_dsp/x86/inv_txfm_sse2.c b/aom_dsp/x86/inv_txfm_sse2.c index 6973fbdd576cf2365d3a05f2a035a6add019772d..86ce928b7ac50d052a78686b2d0032846777c7d4 100644 --- a/aom_dsp/x86/inv_txfm_sse2.c +++ b/aom_dsp/x86/inv_txfm_sse2.c @@ -3498,131 +3498,3 @@ void idct32_8col(__m128i *in0, __m128i *in1) { in1[14] = _mm_sub_epi16(stp1_1, stp1_30); in1[15] = _mm_sub_epi16(stp1_0, stp1_31); } - -static INLINE __m128i clamp_high_sse2(__m128i value, int bd) { - __m128i ubounded, retval; - const __m128i zero = _mm_set1_epi16(0); - const __m128i one = _mm_set1_epi16(1); - const __m128i max = _mm_subs_epi16(_mm_slli_epi16(one, bd), one); - ubounded = _mm_cmpgt_epi16(value, max); - retval = _mm_andnot_si128(ubounded, value); - ubounded = _mm_and_si128(ubounded, max); - retval = _mm_or_si128(retval, ubounded); - retval = _mm_and_si128(retval, _mm_cmpgt_epi16(retval, zero)); - return retval; -} - -void aom_highbd_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest8, - int stride, int bd) { - tran_low_t out[4 * 4]; - tran_low_t *outptr = out; - int i, j; - __m128i inptr[4]; - __m128i sign_bits[2]; - __m128i temp_mm, min_input, max_input; - int test; - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); - int optimised_cols = 0; - const __m128i zero = _mm_set1_epi16(0); - const __m128i eight = _mm_set1_epi16(8); - const __m128i max = _mm_set1_epi16(12043); - const __m128i min = _mm_set1_epi16(-12043); - // Load input into __m128i - inptr[0] = _mm_loadu_si128((const __m128i *)input); - inptr[1] = _mm_loadu_si128((const __m128i *)(input + 4)); - inptr[2] = _mm_loadu_si128((const __m128i *)(input + 8)); - inptr[3] = _mm_loadu_si128((const __m128i *)(input + 12)); - - // Pack to 16 bits - inptr[0] = _mm_packs_epi32(inptr[0], inptr[1]); - inptr[1] = _mm_packs_epi32(inptr[2], inptr[3]); - - max_input = _mm_max_epi16(inptr[0], inptr[1]); - min_input = _mm_min_epi16(inptr[0], inptr[1]); - max_input = _mm_cmpgt_epi16(max_input, max); - min_input = _mm_cmplt_epi16(min_input, min); - temp_mm = _mm_or_si128(max_input, min_input); - test = _mm_movemask_epi8(temp_mm); - - if (!test) { - // Do the row transform - aom_idct4_sse2(inptr); - - // Check the min & max values - max_input = _mm_max_epi16(inptr[0], inptr[1]); - min_input = _mm_min_epi16(inptr[0], inptr[1]); - max_input = _mm_cmpgt_epi16(max_input, max); - min_input = _mm_cmplt_epi16(min_input, min); - temp_mm = _mm_or_si128(max_input, min_input); - test = _mm_movemask_epi8(temp_mm); - - if (test) { - array_transpose_4x4(inptr); - sign_bits[0] = _mm_cmplt_epi16(inptr[0], zero); - sign_bits[1] = _mm_cmplt_epi16(inptr[1], zero); - inptr[3] = _mm_unpackhi_epi16(inptr[1], sign_bits[1]); - inptr[2] = _mm_unpacklo_epi16(inptr[1], sign_bits[1]); - inptr[1] = _mm_unpackhi_epi16(inptr[0], sign_bits[0]); - inptr[0] = _mm_unpacklo_epi16(inptr[0], sign_bits[0]); - _mm_storeu_si128((__m128i *)outptr, inptr[0]); - _mm_storeu_si128((__m128i *)(outptr + 4), inptr[1]); - _mm_storeu_si128((__m128i *)(outptr + 8), inptr[2]); - _mm_storeu_si128((__m128i *)(outptr + 12), inptr[3]); - } else { - // Set to use the optimised transform for the column - optimised_cols = 1; - } - } else { - // Run the un-optimised row transform - for (i = 0; i < 4; ++i) { - aom_highbd_idct4_c(input, outptr, bd); - input += 4; - outptr += 4; - } - } - - if (optimised_cols) { - aom_idct4_sse2(inptr); - - // Final round and shift - inptr[0] = _mm_add_epi16(inptr[0], eight); - inptr[1] = _mm_add_epi16(inptr[1], eight); - - inptr[0] = _mm_srai_epi16(inptr[0], 4); - inptr[1] = _mm_srai_epi16(inptr[1], 4); - - // Reconstruction and Store - { - __m128i d0 = _mm_loadl_epi64((const __m128i *)dest); - __m128i d2 = _mm_loadl_epi64((const __m128i *)(dest + stride * 2)); - d0 = _mm_unpacklo_epi64( - d0, _mm_loadl_epi64((const __m128i *)(dest + stride))); - d2 = _mm_unpacklo_epi64( - d2, _mm_loadl_epi64((const __m128i *)(dest + stride * 3))); - d0 = clamp_high_sse2(_mm_adds_epi16(d0, inptr[0]), bd); - d2 = clamp_high_sse2(_mm_adds_epi16(d2, inptr[1]), bd); - // store input0 - _mm_storel_epi64((__m128i *)dest, d0); - // store input1 - d0 = _mm_srli_si128(d0, 8); - _mm_storel_epi64((__m128i *)(dest + stride), d0); - // store input2 - _mm_storel_epi64((__m128i *)(dest + stride * 2), d2); - // store input3 - d2 = _mm_srli_si128(d2, 8); - _mm_storel_epi64((__m128i *)(dest + stride * 3), d2); - } - } else { - // Run the un-optimised column transform - tran_low_t temp_in[4], temp_out[4]; - // Columns - for (i = 0; i < 4; ++i) { - for (j = 0; j < 4; ++j) temp_in[j] = out[j * 4 + i]; - aom_highbd_idct4_c(temp_in, temp_out, bd); - for (j = 0; j < 4; ++j) { - dest[j * stride + i] = highbd_clip_pixel_add( - dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 4), bd); - } - } - } -} diff --git a/av1/common/idct.c b/av1/common/idct.c index 260df3461329f3f7599c4139d9d966c20e59d0fc..86892f0e4c41b9fcc2112598c242402a99e13b11 100644 --- a/av1/common/idct.c +++ b/av1/common/idct.c @@ -1456,15 +1456,6 @@ static void inv_txfm_add_64x64(const tran_low_t *input, uint8_t *dest, } #endif // CONFIG_TX64X64 -// idct -void av1_highbd_idct4x4_add(const tran_low_t *input, uint8_t *dest, int stride, - int eob, int bd) { - if (eob > 1) - aom_highbd_idct4x4_16_add(input, dest, stride, bd); - else - aom_highbd_idct4x4_1_add(input, dest, stride, bd); -} - void av1_highbd_iwht4x4_add(const tran_low_t *input, uint8_t *dest, int stride, int eob, int bd) { if (eob > 1) diff --git a/av1/common/idct.h b/av1/common/idct.h index e1bda4b53c54b45470f002797ca336c94dcb97b1..c3b3876b5aea2095499196903ccf537035f08587 100644 --- a/av1/common/idct.h +++ b/av1/common/idct.h @@ -70,8 +70,6 @@ void av1_inverse_transform_block_facade(MACROBLOCKD *xd, int plane, int block, void av1_highbd_iwht4x4_add(const tran_low_t *input, uint8_t *dest, int stride, int eob, int bd); -void av1_highbd_idct4x4_add(const tran_low_t *input, uint8_t *dest, int stride, - int eob, int bd); void av1_highbd_inv_txfm_add_4x4(const tran_low_t *input, uint8_t *dest, int stride, int eob, int bd, TX_TYPE tx_type, int lossless); diff --git a/test/fdct4x4_test.cc b/test/fdct4x4_test.cc index a2b180a56f31753c50ef35d44d9964e973767f42..c6e8dbcc1dfd5be5bd81469e9ba009fcc032f8a2 100644 --- a/test/fdct4x4_test.cc +++ b/test/fdct4x4_test.cc @@ -63,14 +63,6 @@ void fht4x4_12(const int16_t *in, tran_low_t *out, int stride, int tx_type) { av1_fwd_txfm2d_4x4_c(in, out, stride, tx_type, 12); } -void idct4x4_10(const tran_low_t *in, uint8_t *out, int stride) { - aom_highbd_idct4x4_16_add_c(in, out, stride, 10); -} - -void idct4x4_12(const tran_low_t *in, uint8_t *out, int stride) { - aom_highbd_idct4x4_16_add_c(in, out, stride, 12); -} - void iht4x4_10(const tran_low_t *in, uint8_t *out, int stride, int tx_type) { av1_inv_txfm2d_add_4x4_c(in, CONVERT_TO_SHORTPTR(out), stride, tx_type, 10); } @@ -86,16 +78,6 @@ void iwht4x4_10(const tran_low_t *in, uint8_t *out, int stride) { void iwht4x4_12(const tran_low_t *in, uint8_t *out, int stride) { aom_highbd_iwht4x4_16_add_c(in, out, stride, 12); } - -#if HAVE_SSE2 -void idct4x4_10_sse2(const tran_low_t *in, uint8_t *out, int stride) { - aom_highbd_idct4x4_16_add_sse2(in, out, stride, 10); -} - -void idct4x4_12_sse2(const tran_low_t *in, uint8_t *out, int stride) { - aom_highbd_idct4x4_16_add_sse2(in, out, stride, 12); -} -#endif // HAVE_SSE2 #endif // CONFIG_HIGHBITDEPTH class Trans4x4DCT : public libaom_test::TransformTestBase, @@ -221,19 +203,10 @@ TEST_P(Trans4x4WHT, MemCheck) { RunMemCheck(); } TEST_P(Trans4x4WHT, InvAccuracyCheck) { RunInvAccuracyCheck(0); } using std::tr1::make_tuple; -#if CONFIG_HIGHBITDEPTH -INSTANTIATE_TEST_CASE_P( - C, Trans4x4DCT, - ::testing::Values( - make_tuple(&aom_highbd_fdct4x4_c, &idct4x4_10, 0, AOM_BITS_10, 16), - make_tuple(&aom_highbd_fdct4x4_c, &idct4x4_12, 0, AOM_BITS_12, 16), - make_tuple(&aom_fdct4x4_c, &aom_idct4x4_16_add_c, 0, AOM_BITS_8, 16))); -#else INSTANTIATE_TEST_CASE_P(C, Trans4x4DCT, ::testing::Values(make_tuple(&aom_fdct4x4_c, &aom_idct4x4_16_add_c, 0, AOM_BITS_8, 16))); -#endif // CONFIG_HIGHBITDEPTH #if CONFIG_HIGHBITDEPTH INSTANTIATE_TEST_CASE_P( @@ -322,18 +295,6 @@ INSTANTIATE_TEST_CASE_P( #endif // HAVE_SSE2 && !CONFIG_HIGHBITDEPTH #if HAVE_SSE2 && CONFIG_HIGHBITDEPTH -INSTANTIATE_TEST_CASE_P( - SSE2, Trans4x4DCT, - ::testing::Values( - make_tuple(&aom_highbd_fdct4x4_c, &idct4x4_10_sse2, 0, AOM_BITS_10, 16), - make_tuple(&aom_highbd_fdct4x4_sse2, &idct4x4_10_sse2, 0, AOM_BITS_10, - 16), - make_tuple(&aom_highbd_fdct4x4_c, &idct4x4_12_sse2, 0, AOM_BITS_12, 16), - make_tuple(&aom_highbd_fdct4x4_sse2, &idct4x4_12_sse2, 0, AOM_BITS_12, - 16), - make_tuple(&aom_fdct4x4_sse2, &aom_idct4x4_16_add_c, 0, AOM_BITS_8, - 16))); - INSTANTIATE_TEST_CASE_P( SSE2, Trans4x4HT, ::testing::Values( diff --git a/test/partial_idct_test.cc b/test/partial_idct_test.cc index 033f182940a94fedb4fce2f33ca33b91fbb2fbbc..b2ea176e867f65f77f4235af72578a652d71578b 100644 --- a/test/partial_idct_test.cc +++ b/test/partial_idct_test.cc @@ -41,13 +41,6 @@ void wrapper(const tran_low_t *in, uint8_t *out, int stride, int bd) { fn(in, out, stride); } -#if CONFIG_HIGHBITDEPTH -template -void highbd_wrapper(const tran_low_t *in, uint8_t *out, int stride, int bd) { - fn(in, CONVERT_TO_BYTEPTR(out), stride, bd); -} -#endif - typedef std::tr1::tuple PartialInvTxfmParam; @@ -285,26 +278,6 @@ TEST_P(PartialIDctTest, DISABLED_Speed) { using std::tr1::make_tuple; const PartialInvTxfmParam c_partial_idct_tests[] = { -#if CONFIG_HIGHBITDEPTH - make_tuple(&aom_highbd_fdct4x4_c, - &highbd_wrapper, - &highbd_wrapper, TX_4X4, 16, 8, 2), - make_tuple(&aom_highbd_fdct4x4_c, - &highbd_wrapper, - &highbd_wrapper, TX_4X4, 16, 10, 2), - make_tuple(&aom_highbd_fdct4x4_c, - &highbd_wrapper, - &highbd_wrapper, TX_4X4, 16, 12, 2), - make_tuple(&aom_highbd_fdct4x4_c, - &highbd_wrapper, - &highbd_wrapper, TX_4X4, 1, 8, 2), - make_tuple(&aom_highbd_fdct4x4_c, - &highbd_wrapper, - &highbd_wrapper, TX_4X4, 1, 10, 2), - make_tuple(&aom_highbd_fdct4x4_c, - &highbd_wrapper, - &highbd_wrapper, TX_4X4, 1, 12, 2), -#endif // CONFIG_HIGHBITDEPTH make_tuple(&aom_fdct32x32_c, &wrapper, &wrapper, TX_32X32, 1024, 8, 1), make_tuple(&aom_fdct32x32_c, &wrapper, @@ -358,17 +331,6 @@ INSTANTIATE_TEST_CASE_P(NEON, PartialIDctTest, #if HAVE_SSE2 const PartialInvTxfmParam sse2_partial_idct_tests[] = { -#if CONFIG_HIGHBITDEPTH - make_tuple(&aom_highbd_fdct4x4_c, - &highbd_wrapper, - &highbd_wrapper, TX_4X4, 16, 8, 2), - make_tuple( - &aom_highbd_fdct4x4_c, &highbd_wrapper, - &highbd_wrapper, TX_4X4, 16, 10, 2), - make_tuple( - &aom_highbd_fdct4x4_c, &highbd_wrapper, - &highbd_wrapper, TX_4X4, 16, 12, 2), -#endif // CONFIG_HIGHBITDEPTH make_tuple(&aom_fdct32x32_c, &wrapper, &wrapper, TX_32X32, 1024, 8, 1), make_tuple(&aom_fdct32x32_c, &wrapper,