From 18976fa5f1ce9777276d969190a1ccb51afbcefe Mon Sep 17 00:00:00 2001
From: Peng Bin
Date: Sun, 11 Feb 2018 16:34:19 +0800
Subject: [PATCH] Add inv txfm2d sse2 for sizes with 4

Implement av1_lowbd_inv_txfm2d_add_4x4_sse2
Implement av1_lowbd_inv_txfm2d_add_4x8_sse2
Implement av1_lowbd_inv_txfm2d_add_8x4_sse2
Implement av1_lowbd_inv_txfm2d_add_4x16_sse2
Implement av1_lowbd_inv_txfm2d_add_16x4_sse2

A brief speed test shows that, with the SSE2 functions added by this CL,
the speed 1 lowbitdepth encoder speeds up by >9% and the lowbitdepth
decoder speeds up by >25%, compared to the highbitdepth implementation
in the baseline.

Change-Id: I0576a2a146c0b1a7b483c9d35c3d21d979e263cd
---
 aom_dsp/x86/transpose_sse2.h       |  58 +++++-
 av1/common/x86/av1_inv_txfm_sse2.c | 324 +++++++++++++++++++++++++++--
 av1/common/x86/av1_txfm_sse2.h     |  27 +++
 test/av1_inv_txfm2d_test.cc        |  10 +-
 4 files changed, 399 insertions(+), 20 deletions(-)

diff --git a/aom_dsp/x86/transpose_sse2.h b/aom_dsp/x86/transpose_sse2.h
index 445eb0153..5edfa7184 100644
--- a/aom_dsp/x86/transpose_sse2.h
+++ b/aom_dsp/x86/transpose_sse2.h
@@ -107,10 +107,14 @@ static INLINE void transpose_16bit_4x4(const __m128i *const in,
   const __m128i a1 = _mm_unpacklo_epi16(in[2], in[3]);
 
   // Unpack 32 bit elements resulting in:
-  // out[0]: 00 10 20 30 01 11 21 31
-  // out[1]: 02 12 22 32 03 13 23 33
+  // out[0]: 00 10 20 30
+  // out[1]: 01 11 21 31
+  // out[2]: 02 12 22 32
+  // out[3]: 03 13 23 33
   out[0] = _mm_unpacklo_epi32(a0, a1);
-  out[1] = _mm_unpackhi_epi32(a0, a1);
+  out[1] = _mm_srli_si128(out[0], 8);
+  out[2] = _mm_unpackhi_epi32(a0, a1);
+  out[3] = _mm_srli_si128(out[2], 8);
 }
 
 static INLINE void transpose_16bit_4x8(const __m128i *const in,
@@ -155,6 +159,54 @@ static INLINE void transpose_16bit_4x8(const __m128i *const in,
   out[3] = _mm_unpackhi_epi64(b2, b3);
 }
 
+static INLINE void transpose_16bit_8x4(const __m128i *const in,
+                                       __m128i *const out) {
+  // Unpack 16 bit elements.
Goes from: + // in[0]: 00 01 02 03 04 05 06 07 + // in[1]: 10 11 12 13 14 15 16 17 + // in[2]: 20 21 22 23 24 25 26 27 + // in[3]: 30 31 32 33 34 35 36 37 + + // to: + // a0: 00 10 01 11 02 12 03 13 + // a1: 20 30 21 31 22 32 23 33 + // a4: 04 14 05 15 06 16 07 17 + // a5: 24 34 25 35 26 36 27 37 + const __m128i a0 = _mm_unpacklo_epi16(in[0], in[1]); + const __m128i a1 = _mm_unpacklo_epi16(in[2], in[3]); + const __m128i a4 = _mm_unpackhi_epi16(in[0], in[1]); + const __m128i a5 = _mm_unpackhi_epi16(in[2], in[3]); + + // Unpack 32 bit elements resulting in: + // b0: 00 10 20 30 01 11 21 31 + // b2: 04 14 24 34 05 15 25 35 + // b4: 02 12 22 32 03 13 23 33 + // b6: 06 16 26 36 07 17 27 37 + const __m128i b0 = _mm_unpacklo_epi32(a0, a1); + const __m128i b2 = _mm_unpacklo_epi32(a4, a5); + const __m128i b4 = _mm_unpackhi_epi32(a0, a1); + const __m128i b6 = _mm_unpackhi_epi32(a4, a5); + + // Unpack 64 bit elements resulting in: + // out[0]: 00 10 20 30 XX XX XX XX + // out[1]: 01 11 21 31 XX XX XX XX + // out[2]: 02 12 22 32 XX XX XX XX + // out[3]: 03 13 23 33 XX XX XX XX + // out[4]: 04 14 24 34 XX XX XX XX + // out[5]: 05 15 25 35 XX XX XX XX + // out[6]: 06 16 26 36 XX XX XX XX + // out[7]: 07 17 27 37 XX XX XX XX + const __m128i zeros = _mm_setzero_si128(); + out[0] = _mm_unpacklo_epi64(b0, zeros); + out[1] = _mm_unpackhi_epi64(b0, zeros); + out[2] = _mm_unpacklo_epi64(b4, zeros); + out[3] = _mm_unpackhi_epi64(b4, zeros); + out[4] = _mm_unpacklo_epi64(b2, zeros); + out[5] = _mm_unpackhi_epi64(b2, zeros); + out[6] = _mm_unpacklo_epi64(b6, zeros); + out[7] = _mm_unpackhi_epi64(b6, zeros); +} + static INLINE void transpose_16bit_8x8(const __m128i *const in, __m128i *const out) { // Unpack 16 bit elements. Goes from: diff --git a/av1/common/x86/av1_inv_txfm_sse2.c b/av1/common/x86/av1_inv_txfm_sse2.c index 50e0c4b5a..f09f2690a 100644 --- a/av1/common/x86/av1_inv_txfm_sse2.c +++ b/av1/common/x86/av1_inv_txfm_sse2.c @@ -1327,6 +1327,72 @@ void idct64_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) { output[32] = _mm_subs_epi16(x10[31], x10[32]); } +void iadst4_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) { + const int32_t *sinpi = sinpi_arr(cos_bit); + const __m128i sinpi_p01_p04 = pair_set_epi16(sinpi[1], sinpi[4]); + const __m128i sinpi_p02_m01 = pair_set_epi16(sinpi[2], -sinpi[1]); + const __m128i sinpi_p03_p02 = pair_set_epi16(sinpi[3], sinpi[2]); + const __m128i sinpi_p03_m04 = pair_set_epi16(sinpi[3], -sinpi[4]); + const __m128i sinpi_p03_m03 = pair_set_epi16(sinpi[3], -sinpi[3]); + const __m128i sinpi_0_p03 = pair_set_epi16(0, sinpi[3]); + const __m128i sinpi_0_p02 = pair_set_epi16(0, sinpi[2]); + const __m128i sinpi_p03_p04 = pair_set_epi16(sinpi[3], sinpi[4]); + __m128i x0[4]; + x0[0] = input[0]; + x0[1] = input[1]; + x0[2] = input[2]; + x0[3] = input[3]; + + __m128i u[4]; + u[0] = _mm_unpacklo_epi16(x0[0], x0[2]); + u[1] = _mm_unpackhi_epi16(x0[0], x0[2]); + u[2] = _mm_unpacklo_epi16(x0[1], x0[3]); + u[3] = _mm_unpackhi_epi16(x0[1], x0[3]); + + __m128i x1[16]; + x1[0] = _mm_madd_epi16(u[0], sinpi_p01_p04); // x0*sin1 + x2*sin4 + x1[1] = _mm_madd_epi16(u[1], sinpi_p01_p04); + x1[2] = _mm_madd_epi16(u[0], sinpi_p02_m01); // x0*sin2 - x2*sin1 + x1[3] = _mm_madd_epi16(u[1], sinpi_p02_m01); + x1[4] = _mm_madd_epi16(u[2], sinpi_p03_p02); // x1*sin3 + x3*sin2 + x1[5] = _mm_madd_epi16(u[3], sinpi_p03_p02); + x1[6] = _mm_madd_epi16(u[2], sinpi_p03_m04); // x1*sin3 - x3*sin4 + x1[7] = _mm_madd_epi16(u[3], sinpi_p03_m04); + x1[8] = _mm_madd_epi16(u[0], 
sinpi_p03_m03);  // x0*sin3 - x2*sin3
+  x1[9] = _mm_madd_epi16(u[1], sinpi_p03_m03);
+  x1[10] = _mm_madd_epi16(u[2], sinpi_0_p03);  // x3*sin3
+  x1[11] = _mm_madd_epi16(u[3], sinpi_0_p03);
+  x1[12] = _mm_madd_epi16(u[2], sinpi_0_p02);  // x3*sin2
+  x1[13] = _mm_madd_epi16(u[3], sinpi_0_p02);
+  x1[14] = _mm_madd_epi16(u[2], sinpi_p03_p04);  // x1*sin3 + x3*sin4
+  x1[15] = _mm_madd_epi16(u[3], sinpi_p03_p04);
+
+  __m128i x2[8];
+  x2[0] = _mm_add_epi32(x1[0], x1[4]);  // x0*sin1 + x2*sin4 + x1*sin3 + x3*sin2
+  x2[1] = _mm_add_epi32(x1[1], x1[5]);
+  x2[2] = _mm_add_epi32(x1[2], x1[6]);  // x0*sin2 - x2*sin1 + x1*sin3 - x3*sin4
+  x2[3] = _mm_add_epi32(x1[3], x1[7]);
+  x2[4] = _mm_add_epi32(x1[8], x1[10]);  // x0*sin3 - x2*sin3 + x3*sin3
+  x2[5] = _mm_add_epi32(x1[9], x1[11]);
+  x2[6] = _mm_add_epi32(x1[0], x1[2]);  // x0*sin1 + x2*sin4 + x0*sin2 - x2*sin1
+  x2[7] = _mm_add_epi32(x1[1], x1[3]);
+  x2[6] = _mm_add_epi32(
+      x2[6], x1[12]);  // x0*sin1 + x2*sin4 + x3*sin2 + x0*sin2 - x2*sin1
+  x2[7] = _mm_add_epi32(x2[7], x1[13]);
+  x2[6] = _mm_sub_epi32(
+      x2[6], x1[14]);  // x0*sin1 + x2*sin4 + x3*sin2 + x0*sin2 - x2*sin1 - x1*sin3 - x3*sin4
+  x2[7] = _mm_sub_epi32(x2[7], x1[15]);
+
+  const __m128i rounding = _mm_set1_epi32(1 << (cos_bit - 1));
+  for (int i = 0; i < 4; ++i) {
+    __m128i out0 = _mm_add_epi32(x2[2 * i], rounding);
+    __m128i out1 = _mm_add_epi32(x2[2 * i + 1], rounding);
+    out0 = _mm_srai_epi32(out0, cos_bit);
+    out1 = _mm_srai_epi32(out1, cos_bit);
+    output[i] = _mm_packs_epi32(out0, out1);
+  }
+}
+
 void iadst8_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) {
   const int32_t *cospi = cospi_arr(cos_bit);
   const __m128i __zero = _mm_setzero_si128();
@@ -1599,6 +1665,24 @@ void iadst16_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) {
   output[15] = _mm_subs_epi16(__zero, x8[1]);
 }
 
+static void iidentity4_new_sse2(const __m128i *input, __m128i *output,
+                                int8_t cos_bit) {
+  (void)cos_bit;
+  const __m128i scale = _mm_set1_epi16(NewSqrt2);
+  const __m128i rounding = _mm_set1_epi16(1 << (NewSqrt2Bits - 1));
+  const __m128i one = _mm_set1_epi16(1);
+  const __m128i scale_rounding = _mm_unpacklo_epi16(scale, rounding);
+  for (int i = 0; i < 4; ++i) {
+    __m128i a_lo = _mm_unpacklo_epi16(input[i], one);
+    __m128i a_hi = _mm_unpackhi_epi16(input[i], one);
+    __m128i b_lo = _mm_madd_epi16(a_lo, scale_rounding);
+    __m128i b_hi = _mm_madd_epi16(a_hi, scale_rounding);
+    __m128i c_lo = _mm_srai_epi32(b_lo, NewSqrt2Bits);
+    __m128i c_hi = _mm_srai_epi32(b_hi, NewSqrt2Bits);
+    output[i] = _mm_packs_epi32(c_lo, c_hi);
+  }
+}
+
 static void iidentity8_new_sse2(const __m128i *input, __m128i *output,
                                 int8_t cos_bit) {
   (void)cos_bit;
@@ -1661,6 +1745,20 @@ static INLINE __m128i lowbd_get_recon_8x8_sse2(const __m128i pred,
   return _mm_packus_epi16(x0, x0);
 }
 
+static INLINE void lowbd_write_buffer_4xn_sse2(__m128i *in, uint8_t *output,
+                                               int stride, int flipud,
+                                               const int height) {
+  int j = flipud ? (height - 1) : 0;
+  const int step = flipud ?
-1 : 1; + const __m128i zero = _mm_setzero_si128(); + for (int i = 0; i < height; ++i, j += step) { + const __m128i v = _mm_cvtsi32_si128(*((uint32_t *)(output + i * stride))); + __m128i u = _mm_adds_epi16(in[j], _mm_unpacklo_epi8(v, zero)); + u = _mm_packus_epi16(u, zero); + *((uint32_t *)(output + i * stride)) = _mm_cvtsi128_si32(u); + } +} + static INLINE void lowbd_write_buffer_8xn_sse2(__m128i *in, uint8_t *output, int stride, int flipud, const int height) { @@ -1674,7 +1772,7 @@ static INLINE void lowbd_write_buffer_8xn_sse2(__m128i *in, uint8_t *output, } static const transform_1d_sse2 lowbd_txfm_all_1d_arr[TX_SIZES][TX_TYPES_1D] = { - { NULL, NULL, NULL, NULL }, + { idct4_new_sse2, iadst4_new_sse2, iadst4_new_sse2, iidentity4_new_sse2 }, { idct8_new_sse2, iadst8_new_sse2, iadst8_new_sse2, iidentity8_new_sse2 }, { idct16_new_sse2, iadst16_new_sse2, iadst16_new_sse2, iidentity16_new_sse2 }, { idct32_new_sse2, NULL, NULL, iidentity32_new_sse2 }, @@ -1683,16 +1781,53 @@ static const transform_1d_sse2 lowbd_txfm_all_1d_arr[TX_SIZES][TX_TYPES_1D] = { #endif }; +void av1_lowbd_inv_txfm2d_add_4x4_sse2(const int32_t *input, uint8_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + __m128i buf[4]; + const TX_SIZE tx_size = TX_4X4; + const int8_t *shift = inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx]; + const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx]; + const int txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + + const transform_1d_sse2 row_txfm = + lowbd_txfm_all_1d_arr[txw_idx][htx_tab[tx_type]]; + const transform_1d_sse2 col_txfm = + lowbd_txfm_all_1d_arr[txh_idx][vtx_tab[tx_type]]; + + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + load_buffer_32bit_to_16bit_w4(input, txfm_size_col, buf, txfm_size_row); + transpose_16bit_4x4(buf, buf); + row_txfm(buf, buf, cos_bit_row); + if (lr_flip) { + __m128i temp[4]; + flip_buf_sse2(buf, temp, txfm_size_col); + transpose_16bit_4x4(temp, buf); + } else { + transpose_16bit_4x4(buf, buf); + } + col_txfm(buf, buf, cos_bit_col); + round_shift_16bit(buf, txfm_size_row, shift[1]); + lowbd_write_buffer_4xn_sse2(buf, output, stride, ud_flip, txfm_size_row); +} + void av1_lowbd_inv_txfm2d_add_8x8_sse2(const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type, int bd) { (void)bd; __m128i buf[8]; - const int8_t *shift = inv_txfm_shift_ls[TX_8X8]; - const int txw_idx = get_txw_idx(TX_8X8); - const int txh_idx = get_txh_idx(TX_8X8); + const TX_SIZE tx_size = TX_8X8; + const int8_t *shift = inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx]; const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx]; - const int buf_size = 8; + const int txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; const transform_1d_sse2 row_txfm = lowbd_txfm_all_1d_arr[txw_idx][htx_tab[tx_type]]; @@ -1701,7 +1836,7 @@ void av1_lowbd_inv_txfm2d_add_8x8_sse2(const int32_t *input, uint8_t *output, int ud_flip, lr_flip; get_flip_cfg(tx_type, &ud_flip, &lr_flip); - load_buffer_32bit_to_16bit(input, 8, buf, buf_size); + load_buffer_32bit_to_16bit(input, txfm_size_col, buf, txfm_size_row); transpose_16bit_8x8(buf, buf); row_txfm(buf, buf, cos_bit_row); round_shift_16bit(buf, 8, shift[0]); @@ -1858,6 +1993,80 @@ void 
av1_lowbd_inv_txfm2d_add_64x64_sse2(const int32_t *input, uint8_t *output, } #endif +void av1_lowbd_inv_txfm2d_add_4x8_sse2(const int32_t *input, uint8_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + __m128i buf[8]; + const TX_SIZE tx_size = TX_4X8; + const int8_t *shift = inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx]; + const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx]; + const int txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + + const transform_1d_sse2 row_txfm = + lowbd_txfm_all_1d_arr[txw_idx][htx_tab[tx_type]]; + const transform_1d_sse2 col_txfm = + lowbd_txfm_all_1d_arr[txh_idx][vtx_tab[tx_type]]; + + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + load_buffer_32bit_to_16bit_w4(input, txfm_size_col, buf, txfm_size_row); + transpose_16bit_4x8(buf, buf); + round_shift_sse2(buf, buf, txfm_size_col); // rect special code + row_txfm(buf, buf, cos_bit_row); + // round_shift_16bit(buf, txfm_size_col, shift[0]);// shift[0] is 0 + if (lr_flip) { + __m128i temp[4]; + flip_buf_sse2(buf, temp, txfm_size_col); + transpose_16bit_8x4(temp, buf); + } else { + transpose_16bit_8x4(buf, buf); + } + col_txfm(buf, buf, cos_bit_col); + round_shift_16bit(buf, txfm_size_row, shift[1]); + lowbd_write_buffer_4xn_sse2(buf, output, stride, ud_flip, txfm_size_row); +} + +void av1_lowbd_inv_txfm2d_add_8x4_sse2(const int32_t *input, uint8_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + __m128i buf[8]; + const TX_SIZE tx_size = TX_8X4; + const int8_t *shift = inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx]; + const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx]; + const int txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + + const transform_1d_sse2 row_txfm = + lowbd_txfm_all_1d_arr[txw_idx][htx_tab[tx_type]]; + const transform_1d_sse2 col_txfm = + lowbd_txfm_all_1d_arr[txh_idx][vtx_tab[tx_type]]; + + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + load_buffer_32bit_to_16bit(input, txfm_size_col, buf, txfm_size_row); + transpose_16bit_8x4(buf, buf); + round_shift_sse2(buf, buf, txfm_size_col); // rect special code + row_txfm(buf, buf, cos_bit_row); + // round_shift_16bit(buf, txfm_size_col, shift[0]); // shift[0] is 0 + if (lr_flip) { + __m128i temp[8]; + flip_buf_sse2(buf, temp, txfm_size_col); + transpose_16bit_4x8(temp, buf); + } else { + transpose_16bit_4x8(buf, buf); + } + col_txfm(buf, buf, cos_bit_col); + round_shift_16bit(buf, txfm_size_row, shift[1]); + lowbd_write_buffer_8xn_sse2(buf, output, stride, ud_flip, txfm_size_row); +} + void av1_lowbd_inv_txfm2d_add_8x16_sse2(const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type, int bd) { (void)bd; @@ -1912,6 +2121,97 @@ void av1_lowbd_inv_txfm2d_add_64x32_sse2(const int32_t *input, uint8_t *output, } #endif +void av1_lowbd_inv_txfm2d_add_4x16_sse2(const int32_t *input, uint8_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + __m128i buf[16]; + const TX_SIZE tx_size = TX_4X16; + const int8_t *shift = inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx]; + const int cos_bit_col = 
inv_cos_bit_col[txw_idx][txh_idx]; + const int txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + + const transform_1d_sse2 row_txfm = + lowbd_txfm_all_1d_arr[txw_idx][htx_tab[tx_type]]; + const transform_1d_sse2 col_txfm = + lowbd_txfm_all_1d_arr[txh_idx][vtx_tab[tx_type]]; + + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + + const int row_one_loop = 8; + for (int i = 0; i < 2; ++i) { + const int32_t *input_cur = input + i * txfm_size_col * row_one_loop; + __m128i *buf_cur = buf + i * row_one_loop; + load_buffer_32bit_to_16bit_w4(input_cur, txfm_size_col, buf_cur, + row_one_loop); + transpose_16bit_4x8(buf_cur, buf_cur); + row_txfm(buf_cur, buf_cur, cos_bit_row); + round_shift_16bit(buf_cur, row_one_loop, shift[0]); + if (lr_flip) { + __m128i temp[8]; + flip_buf_sse2(buf_cur, temp, txfm_size_col); + transpose_16bit_8x4(temp, buf_cur); + } else { + transpose_16bit_8x4(buf_cur, buf_cur); + } + } + col_txfm(buf, buf, cos_bit_col); + round_shift_16bit(buf, txfm_size_row, shift[1]); + lowbd_write_buffer_4xn_sse2(buf, output, stride, ud_flip, txfm_size_row); +} + +void av1_lowbd_inv_txfm2d_add_16x4_sse2(const int32_t *input, uint8_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + __m128i buf[16]; + const TX_SIZE tx_size = TX_16X4; + const int8_t *shift = inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx]; + const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx]; + const int txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + const int buf_size_w_div8 = txfm_size_col >> 3; + + const transform_1d_sse2 row_txfm = + lowbd_txfm_all_1d_arr[txw_idx][htx_tab[tx_type]]; + const transform_1d_sse2 col_txfm = + lowbd_txfm_all_1d_arr[txh_idx][vtx_tab[tx_type]]; + + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + const int row_one_loop = 8; + for (int i = 0; i < buf_size_w_div8; ++i) { + const int32_t *input_cur = input + i * row_one_loop; + __m128i *buf_cur = buf + i * row_one_loop; + load_buffer_32bit_to_16bit(input_cur, txfm_size_col, buf_cur, + txfm_size_row); + transpose_16bit_8x4(buf_cur, buf_cur); + } + row_txfm(buf, buf, cos_bit_row); + round_shift_16bit(buf, txfm_size_col, shift[0]); + if (lr_flip) { + __m128i temp[16]; + flip_buf_sse2(buf, temp, 16); + transpose_16bit_4x8(temp, buf); + transpose_16bit_4x8(temp + 8, buf + 8); + } else { + transpose_16bit_4x8(buf, buf); + transpose_16bit_4x8(buf + row_one_loop, buf + row_one_loop); + } + for (int i = 0; i < buf_size_w_div8; i++) { + col_txfm(buf + i * row_one_loop, buf + i * row_one_loop, cos_bit_col); + round_shift_16bit(buf + i * row_one_loop, txfm_size_row, shift[1]); + } + lowbd_write_buffer_8xn_sse2(buf, output, stride, ud_flip, 4); + lowbd_write_buffer_8xn_sse2(buf + 8, output + 8, stride, ud_flip, 4); +} + void av1_lowbd_inv_txfm2d_add_8x32_sse2(const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type, int bd) { (void)bd; @@ -1957,15 +2257,15 @@ typedef void (*inv_txfm_func)(const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type, int bd); static inv_txfm_func inv_txfm_func_ls[TX_SIZES_ALL] = { - NULL, // 4x4 + av1_lowbd_inv_txfm2d_add_4x4_sse2, // 4x4 av1_lowbd_inv_txfm2d_add_8x8_sse2, // 8x8 av1_lowbd_inv_txfm2d_add_16x16_sse2, // 16x16 av1_lowbd_inv_txfm2d_add_32x32_sse2, // 32x32 #if CONFIG_TX64X64 av1_lowbd_inv_txfm2d_add_64x64_sse2, // 64x64 #endif // 
CONFIG_TX64X64 - NULL, // 4x8 - NULL, // 8x4 + av1_lowbd_inv_txfm2d_add_4x8_sse2, // 4x8 + av1_lowbd_inv_txfm2d_add_8x4_sse2, // 8x4 av1_lowbd_inv_txfm2d_add_8x16_sse2, // 8x16 av1_lowbd_inv_txfm2d_add_16x8_sse2, // 16x8 av1_lowbd_inv_txfm2d_add_16x32_sse2, // 16x32 @@ -1974,8 +2274,8 @@ static inv_txfm_func inv_txfm_func_ls[TX_SIZES_ALL] = { av1_lowbd_inv_txfm2d_add_32x64_sse2, // 32x64 av1_lowbd_inv_txfm2d_add_64x32_sse2, // 64x32 #endif // CONFIG_TX64X64 - NULL, // 4x16 - NULL, // 16x4 + av1_lowbd_inv_txfm2d_add_4x16_sse2, // 4x16 + av1_lowbd_inv_txfm2d_add_16x4_sse2, // 16x4 av1_lowbd_inv_txfm2d_add_8x32_sse2, // 8x32 av1_lowbd_inv_txfm2d_add_32x8_sse2, // 32x8 #if CONFIG_TX64X64 @@ -1988,7 +2288,7 @@ void av1_inv_txfm_add_sse2(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param) { const TX_TYPE tx_type = txfm_param->tx_type; const inv_txfm_func inv_func = inv_txfm_func_ls[txfm_param->tx_size]; - if (inv_func != NULL) { + if (inv_func != NULL && (!txfm_param->lossless)) { inv_func(dqcoeff, dst, stride, tx_type, txfm_param->bd); } else { av1_inv_txfm_add_c(dqcoeff, dst, stride, txfm_param); diff --git a/av1/common/x86/av1_txfm_sse2.h b/av1/common/x86/av1_txfm_sse2.h index a924003cd..efbcae7b2 100644 --- a/av1/common/x86/av1_txfm_sse2.h +++ b/av1/common/x86/av1_txfm_sse2.h @@ -59,6 +59,11 @@ static INLINE __m128i load_32bit_to_16bit(const int32_t *a) { return _mm_packs_epi32(a_low, *(const __m128i *)(a + 4)); } +static INLINE __m128i load_32bit_to_16bit_w4(const int32_t *a) { + const __m128i a_low = _mm_load_si128((const __m128i *)a); + return _mm_packs_epi32(a_low, a_low); +} + // Store 8 16 bit values. Sign extend the values. static INLINE void store_16bit_to_32bit(__m128i a, int32_t *b) { const __m128i a_lo = _mm_unpacklo_epi16(a, a); @@ -107,6 +112,13 @@ static INLINE void load_buffer_32bit_to_16bit(const int32_t *in, int stride, } } +static INLINE void load_buffer_32bit_to_16bit_w4(const int32_t *in, int stride, + __m128i *out, int out_size) { + for (int i = 0; i < out_size; ++i) { + out[i] = load_32bit_to_16bit_w4(in + i * stride); + } +} + static INLINE void load_buffer_32bit_to_16bit_flip(const int32_t *in, int stride, __m128i *out, int out_size) { @@ -194,6 +206,9 @@ typedef struct { transform_1d_sse2 col, row; // vertical and horizontal } transform_2d_sse2; +void av1_lowbd_inv_txfm2d_add_4x4_sse2(const int32_t *input, uint8_t *output, + int stride, TX_TYPE tx_type, int bd); + void av1_lowbd_inv_txfm2d_add_8x8_sse2(const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type, int bd); @@ -208,6 +223,12 @@ void av1_lowbd_inv_txfm2d_add_64x64_sse2(const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type, int bd); #endif +void av1_lowbd_inv_txfm2d_add_4x8_sse2(const int32_t *input, uint8_t *output, + int stride, TX_TYPE tx_type, int bd); + +void av1_lowbd_inv_txfm2d_add_8x4_sse2(const int32_t *input, uint8_t *output, + int stride, TX_TYPE tx_type, int bd); + void av1_lowbd_inv_txfm2d_add_8x16_sse2(const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type, int bd); @@ -228,6 +249,12 @@ void av1_lowbd_inv_txfm2d_add_64x32_sse2(const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type, int bd); #endif +void av1_lowbd_inv_txfm2d_add_4x16_sse2(const int32_t *input, uint8_t *output, + int stride, TX_TYPE tx_type, int bd); + +void av1_lowbd_inv_txfm2d_add_16x4_sse2(const int32_t *input, uint8_t *output, + int stride, TX_TYPE tx_type, int bd); + void av1_lowbd_inv_txfm2d_add_8x32_sse2(const int32_t *input, uint8_t *output, int 
stride, TX_TYPE tx_type, int bd); diff --git a/test/av1_inv_txfm2d_test.cc b/test/av1_inv_txfm2d_test.cc index 5d56cdce2..6e74534de 100644 --- a/test/av1_inv_txfm2d_test.cc +++ b/test/av1_inv_txfm2d_test.cc @@ -329,15 +329,15 @@ TEST_P(AV1LbdInvTxfm2d, DISABLED_Speed) { #include "av1/common/x86/av1_txfm_sse2.h" const LbdInvTxfm2dFunc kLbdInvFuncSSE2List[TX_SIZES_ALL] = { - NULL, // TX_4X4 + av1_lowbd_inv_txfm2d_add_4x4_sse2, // TX_4X4 av1_lowbd_inv_txfm2d_add_8x8_sse2, // TX_8X8 av1_lowbd_inv_txfm2d_add_16x16_sse2, // TX_16X16 av1_lowbd_inv_txfm2d_add_32x32_sse2, // TX_32X32 #if CONFIG_TX64X64 av1_lowbd_inv_txfm2d_add_64x64_sse2, // 64x64 #endif // CONFIG_TX64X64 - NULL, // TX_4X8 - NULL, // TX_8X4 + av1_lowbd_inv_txfm2d_add_4x8_sse2, // TX_4X8 + av1_lowbd_inv_txfm2d_add_8x4_sse2, // TX_8X4 av1_lowbd_inv_txfm2d_add_8x16_sse2, // TX_8X16 av1_lowbd_inv_txfm2d_add_16x8_sse2, // TX_16X8 av1_lowbd_inv_txfm2d_add_16x32_sse2, // TX_16X32 @@ -346,8 +346,8 @@ const LbdInvTxfm2dFunc kLbdInvFuncSSE2List[TX_SIZES_ALL] = { av1_lowbd_inv_txfm2d_add_32x64_sse2, // TX_32X64 av1_lowbd_inv_txfm2d_add_64x32_sse2, // TX_64X32 #endif // CONFIG_TX64X64 - NULL, // TX_4X16 - NULL, // TX_16X4 + av1_lowbd_inv_txfm2d_add_4x16_sse2, // TX_4X16 + av1_lowbd_inv_txfm2d_add_16x4_sse2, // TX_16X4 av1_lowbd_inv_txfm2d_add_8x32_sse2, // 8x32 av1_lowbd_inv_txfm2d_add_32x8_sse2, // 32x8 #if CONFIG_TX64X64 -- GitLab
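
Note on the iadst4_new_sse2 kernel added above: it evaluates the 4-point inverse ADST as four
fixed-point linear combinations of the four input coefficients, eight columns at a time, using
_mm_madd_epi16 on interleaved 16-bit pairs. The scalar sketch below spells out the per-column
arithmetic implied by the in-line comments; it is a hypothetical rendering for readers (the helper
name and standalone form are not part of libaom), and it assumes sinpi points at the table returned
by sinpi_arr(cos_bit) as used in the patch.

/* Hypothetical scalar sketch, derived from the comments in iadst4_new_sse2;
 * not libaom's reference iadst4 implementation. */
#include <stdint.h>

static void iadst4_column_sketch(const int32_t *sinpi, int8_t cos_bit,
                                 const int32_t x[4], int32_t out[4]) {
  const int64_t rnd = (int64_t)1 << (cos_bit - 1);
  int64_t s[4];
  // x2[0]/x2[1] above: x0*sin1 + x2*sin4 + x1*sin3 + x3*sin2
  s[0] = (int64_t)sinpi[1] * x[0] + (int64_t)sinpi[3] * x[1] +
         (int64_t)sinpi[4] * x[2] + (int64_t)sinpi[2] * x[3];
  // x2[2]/x2[3] above: x0*sin2 - x2*sin1 + x1*sin3 - x3*sin4
  s[1] = (int64_t)sinpi[2] * x[0] + (int64_t)sinpi[3] * x[1] -
         (int64_t)sinpi[1] * x[2] - (int64_t)sinpi[4] * x[3];
  // x2[4]/x2[5] above: sin3 * (x0 - x2 + x3)
  s[2] = (int64_t)sinpi[3] * (x[0] - x[2] + x[3]);
  // x2[6]/x2[7] above: x1[0] + x1[2] + x1[12] - x1[14], i.e.
  // (sin1 + sin2)*x0 - sin3*x1 + (sin4 - sin1)*x2 + (sin2 - sin4)*x3
  s[3] = (int64_t)(sinpi[1] + sinpi[2]) * x[0] - (int64_t)sinpi[3] * x[1] +
         (int64_t)(sinpi[4] - sinpi[1]) * x[2] +
         (int64_t)(sinpi[2] - sinpi[4]) * x[3];
  for (int i = 0; i < 4; ++i) {
    // Same rounding and shift as the _mm_add_epi32/_mm_srai_epi32 pair above.
    out[i] = (int32_t)((s[i] + rnd) >> cos_bit);
  }
}

The SSE2 kernel keeps these sums in 32-bit lanes and packs the rounded results back to 16 bits with
_mm_packs_epi32, which is what lets the whole 2D inverse transform stay in the 16-bit lowbitdepth
pipeline.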